23 #include "kernels_trace.h"
27 #ifndef DOXYGEN_SHOULD_SKIP_THIS
28 static float msone = -1.0; /* Constant -1: passed as the scalar preceding the A operand (alpha slot) of cblas_sgemm in this file. */
29 static float sone = 1.0; /* Constant +1: passed as the scalar preceding the C operand (beta slot) of cblas_sgemm, so the product is accumulated into C. */
30 static float szero = 0.0; /* Constant 0. NOTE(review): no use is visible in the reviewed excerpt - confirm it is still referenced. */
119 pastix_int_t stride, stridef, indblok;
120 pastix_int_t M, N, K, m;
124 assert(!(cblk->
cblktype & CBLK_LAYOUT_2D));
125 assert(!(fcblk->
cblktype & CBLK_LAYOUT_2D));
127 assert( work != NULL );
139 M = stride - indblok - (shift * N);
142 A = A + indblok + (shift * N);
149 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
150 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
155 kernel_trace_stop_lvl2( FLOPS_SGEMM( M, N, K ) );
170 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
176 assert( fblok < fcblk[1].fblokptr );
182 pastix_cblk_lock( fcblk );
185 1.0, tmpC, stridef );
186 pastix_cblk_unlock( fcblk );
276 pastix_int_t stride, stridef;
277 pastix_int_t M, N, K;
281 assert(!(cblk->
cblktype & CBLK_LAYOUT_2D));
282 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
302 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
308 assert( fblok < fcblk[1].fblokptr );
321 pastix_cblk_lock( fcblk );
322 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
323 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
325 (msone), blokA, stride,
327 (sone), blokC, stridef );
328 kernel_trace_stop_lvl2( FLOPS_SGEMM( M, N, K ) );
329 pastix_cblk_unlock( fcblk );
415 pastix_int_t M, N, K, lda, ldb, ldc;
419 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
420 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
439 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
445 assert( fblok < fcblk[1].fblokptr );
459 pastix_cblk_lock( fcblk );
460 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
461 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
465 (sone), blokC, ldc );
466 kernel_trace_stop_lvl2( FLOPS_SGEMM( M, N, K ) );
467 pastix_cblk_unlock( fcblk );
539 static inline pastix_fixdbl_t
541 pastix_int_t blok_mk,
542 pastix_int_t blok_kn,
543 pastix_int_t blok_mn,
555 const float *Aptr, *Bptr;
557 pastix_int_t M, N, K, lda, ldb, ldc, cblk_n, cblk_m;
558 pastix_int_t full_m, full_n;
559 size_t offsetA, offsetB, offsetC;
561 pastix_fixdbl_t flops = 0.0;
562 pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
565 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
566 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
574 blokB = fblokK + blok_kn;
578 blokA = fblokK + blok_mk;
588 blokC = fblokN + blok_mn;
590 assert( blokC->
fcblknm == cblk_m );
596 for (bA = blokA; (bA < lblokK) && (bA->
fcblknm == cblk_m); bA++) {
598 Aptr = A + bA->
coefind - offsetA;
605 assert( bC < lblokN );
608 Cptr = C + bC->
coefind - offsetC;
612 for (bB = blokB; (bB < lblokK) && (bB->
fcblknm == cblk_n); bB++) {
615 Bptr = B + bB->
coefind - offsetB;
618 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
625 flops += FLOPS_SGEMM( M, N, K );
629 kernel_trace_stop( blokB->
inlast, PastixKernelGEMMBlok2d2d,
705 static inline pastix_fixdbl_t
707 pastix_int_t blok_mk,
708 pastix_int_t blok_kn,
709 pastix_int_t blok_mn,
725 pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
726 size_t offsetA, offsetB;
728 pastix_fixdbl_t flops = 0.0;
729 pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
732 pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
735 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
736 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
739 assert(!(cblk->
cblktype & CBLK_COMPRESSED) );
740 assert( fcblk->
cblktype & CBLK_COMPRESSED );
748 blokB = fblokK + blok_kn;
753 blokA = fblokK + blok_mk;
764 blokC = fblokN + blok_mn;
765 assert( blokC->
fcblknm == cblk_m );
779 params.
lock = &(lock);
782 for (bA = blokA; (bA < lblokK) && (bA->
fcblknm == cblk_m); bA++) {
788 lrA.
u = (
float*)A + bA->
coefind - offsetA;
795 assert( bC < lblokN );
803 for (bB = blokB; (bB < lblokK) && (bB->
fcblknm == cblk_n); bB++) {
811 lrB.
u = (
float*)B + bB->
coefind - offsetB;
820 kernel_trace_stop( blokB->
inlast, PastixKernelGEMMBlokLRLR,
893 static inline pastix_fixdbl_t
895 pastix_int_t blok_mk,
896 pastix_int_t blok_kn,
897 pastix_int_t blok_mn,
913 pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
915 pastix_fixdbl_t flops = 0.0;
916 pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
919 pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
922 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
923 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
926 assert( cblk->
cblktype & CBLK_COMPRESSED );
927 assert( fcblk->
cblktype & CBLK_COMPRESSED );
935 blokB = fblokK + blok_kn;
938 blokA = fblokK + blok_mk;
947 blokC = fblokN + blok_mn;
948 assert( blokC->
fcblknm == cblk_m );
962 params.
lock = &(lock);
965 for (bA = blokA; (bA < lblokK) && (bA->
fcblknm == cblk_m); bA++, lrA++) {
974 assert( bC < lblokN );
982 for (bB = blokB, blrB = lrB; (bB < lblokK) && (bB->
fcblknm == cblk_n); bB++, blrB++) {
995 kernel_trace_stop( blokB->
inlast, PastixKernelGEMMBlokLRLR,
1062 static inline pastix_fixdbl_t
1081 pastix_int_t stride, shift;
1084 pastix_fixdbl_t flops = 0.0;
1087 assert(!(cblk->
cblktype & CBLK_COMPRESSED));
1088 assert( fcblk->
cblktype & CBLK_COMPRESSED );
1089 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
1091 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1118 params.
alpha = -1.0;
1121 params.
lwork = lwork;
1125 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
1132 assert( fblok < fcblk[1].fblokptr );
1139 lrA.
u = (
float*)A + iterblok->
coefind;
1213 static inline pastix_fixdbl_t
1230 pastix_int_t N, K, shift;
1233 pastix_fixdbl_t flops = 0.0;
1236 assert( cblk->
cblktype & CBLK_COMPRESSED );
1237 assert( fcblk->
cblktype & CBLK_COMPRESSED );
1238 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
1239 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
1241 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1263 params.
alpha = -1.0;
1266 params.
lwork = lwork;
1269 params.
B = lrB + (blok - cblk->
fblokptr);
1272 lrA = lrA + (blok - cblk->
fblokptr) + shift;
1273 for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1280 assert( fblok < fcblk[1].fblokptr );
1360 static inline pastix_fixdbl_t
1377 pastix_int_t N, K, shift;
1381 pastix_fixdbl_t flops = 0.0;
1384 assert( cblk->
cblktype & CBLK_COMPRESSED );
1385 assert( !(fcblk->
cblktype & CBLK_COMPRESSED) );
1386 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
1387 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
1389 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1411 params.
alpha = -1.0;
1414 params.
lwork = lwork;
1417 params.
B = lrB + (blok - cblk->
fblokptr);
1424 lrA = lrA + (blok - cblk->
fblokptr) + shift;
1425 for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1431 assert( fblok < fcblk[1].fblokptr );
1524 pastix_ktype_t ktype;
1525 pastix_fixdbl_t time, flops = 0.0;
1526 pastix_int_t m = cblk->
stride;
1533 if ( fcblk->
cblktype & CBLK_COMPRESSED ) {
1534 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
1535 ktype = PastixKernelGEMMCblkLRLR;
1536 time = kernel_trace_start( ktype );
1540 A, B, C, work, lwork,
1544 ktype = PastixKernelGEMMCblkFRLR;
1545 time = kernel_trace_start( ktype );
1549 A, B, C, work, lwork,
1553 else if ( fcblk->
cblktype & CBLK_LAYOUT_2D ) {
1554 if ( cblk->
cblktype & CBLK_COMPRESSED) {
1555 ktype = PastixKernelGEMMCblk2d2d;
1556 time = kernel_trace_start( ktype );
1559 A, B, C, work, lwork,
1562 else if ( cblk->
cblktype & CBLK_LAYOUT_2D ) {
1563 ktype = PastixKernelGEMMCblk2d2d;
1564 time = kernel_trace_start( ktype );
1571 ktype = PastixKernelGEMMCblk1d2d;
1572 time = kernel_trace_start( ktype );
1578 flops = FLOPS_SGEMM( m, n, k );
1581 assert( !(cblk->
cblktype & CBLK_COMPRESSED) );
1582 ktype = PastixKernelGEMMCblk1d1d;
1583 time = kernel_trace_start( ktype );
1589 flops = FLOPS_SGEMM( m, n, k );
1592 kernel_trace_stop( blok->
inlast, ktype, m, n, k, flops, time );
1658 pastix_int_t blok_mk,
1659 pastix_int_t blok_nk,
1660 pastix_int_t blok_mn,
1666 if ( fcblk->
cblktype & CBLK_COMPRESSED ) {
1667 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
1669 blok_mk, blok_nk, blok_mn,
1675 blok_mk, blok_nk, blok_mn,
1681 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
1687 blok_mk, blok_nk, blok_mn,