23 #include "kernels_trace.h"
27 #ifndef DOXYGEN_SHOULD_SKIP_THIS
/*
 * Complex constant equal to 1.0. Its address is taken with CBLAS_SADDR() and
 * handed to every cblas_ztrsm() call visible in this file, in the argument
 * slot immediately preceding the A/lda pair.
 * NOTE(review): in the CBLAS TRSM interface that slot is the alpha scaling
 * factor, so the solves are presumably unscaled -- confirm against the full
 * call sites, which are only partially visible here.
 */
28 static pastix_complex64_t zone = 1.0;
77 const pastix_complex64_t *A,
78 pastix_complex64_t *C )
81 pastix_int_t M, N, lda;
91 assert( fblok + 1 < cblk[1].fblokptr );
93 assert(!(cblk->
cblktype & CBLK_LAYOUT_2D));
98 kernel_trace_start_lvl2( PastixKernelLvl2_FR_TRSM );
99 cblas_ztrsm(CblasColMajor,
100 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
102 CBLAS_SADDR(zone), A, lda,
104 kernel_trace_stop_lvl2( FLOPS_ZTRSM( side, M, N ) );
154 const pastix_complex64_t *A,
155 pastix_complex64_t *C )
158 pastix_int_t M, N, lda, ldc;
159 pastix_complex64_t *blokC;
167 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
169 for (blok=fblok+1; blok<lblok; blok++) {
175 kernel_trace_start_lvl2( PastixKernelLvl2_FR_TRSM );
176 cblas_ztrsm(CblasColMajor,
177 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
179 CBLAS_SADDR(zone), A, lda,
181 kernel_trace_stop_lvl2( FLOPS_ZTRSM( side, M, N ) );
232 static inline pastix_fixdbl_t
243 pastix_int_t M, N, lda;
244 pastix_complex64_t *A;
246 pastix_fixdbl_t flops = 0.0;
247 pastix_fixdbl_t flops_lr, flops_c;
256 assert( lrA->
rk == -1 );
258 assert( cblk->
cblktype & CBLK_COMPRESSED );
259 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
262 for (blok=fblok+1; blok<lblok; blok++, lrC++) {
285 if ( lrC->
rk != 0 ) {
286 if ( lrC->
rk != -1 ) {
287 kernel_trace_start_lvl2( PastixKernelLvl2_LR_TRSM );
288 cblas_ztrsm(CblasColMajor,
289 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
291 CBLAS_SADDR(zone), A, lda,
293 flops_c = FLOPS_ZTRSM( side, lrC->
rk, N );
294 kernel_trace_stop_lvl2( flops_c );
297 kernel_trace_start_lvl2( PastixKernelLvl2_FR_TRSM );
298 cblas_ztrsm(CblasColMajor,
299 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
301 CBLAS_SADDR(zone), A, lda,
303 flops_c = FLOPS_ZTRSM( side, M, N );
304 kernel_trace_stop_lvl2( flops_c );
308 flops += flops_lr + flops_c;
365 if ( cblk[0].fblokptr + 1 < cblk[1].fblokptr )
367 pastix_ktype_t ktype = PastixKernelLvl1Nbr;
368 pastix_fixdbl_t time, flops = 0.0;
370 pastix_int_t m = cblk->
stride - n;
372 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
373 ktype = PastixKernelTRSMCblkLR;
374 time = kernel_trace_start( ktype );
377 cblk, A, C, lowrank );
380 if ( cblk->
cblktype & CBLK_LAYOUT_2D ) {
381 ktype = PastixKernelTRSMCblk2d;
382 time = kernel_trace_start( ktype );
388 ktype = PastixKernelTRSMCblk1d;
389 time = kernel_trace_start( ktype );
397 kernel_trace_stop( cblk->
fblokptr->
inlast, ktype, m, n, 0, flops, time );
447 static inline pastix_fixdbl_t
454 const pastix_complex64_t *A,
455 pastix_complex64_t *C )
458 pastix_int_t M, N, lda, ldc, offset, cblk_m, full_m;
459 pastix_complex64_t *Cptr;
460 pastix_fixdbl_t flops = 0.0;
461 pastix_fixdbl_t time = kernel_trace_start( PastixKernelTRSMBlok2d );
469 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
471 blok = fblok + blok_m;
476 for (; (blok < lblok) && (blok->
fcblknm == cblk_m); blok++) {
478 Cptr = C + blok->
coefind - offset;
482 cblas_ztrsm( CblasColMajor,
483 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
485 CBLAS_SADDR(zone), A, lda,
488 flops += FLOPS_ZTRSM( side, M, N );
492 kernel_trace_stop( cblk->
fblokptr->
inlast, PastixKernelTRSMBlok2d,
493 full_m, N, 0, flops, time );
545 static inline pastix_fixdbl_t
557 pastix_int_t M, N, lda, cblk_m, full_m, full_n;
558 pastix_complex64_t *A;
559 pastix_fixdbl_t flops = 0.0;
560 pastix_fixdbl_t time = kernel_trace_start( PastixKernelTRSMBlokLR );
569 assert( cblk->
cblktype & CBLK_COMPRESSED );
570 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
573 assert( lrA->
rk == -1 );
575 blok = fblok + blok_m;
580 for (; (blok < lblok) && (blok->
fcblknm == cblk_m); blok++, lrC++) {
600 if ( lrC->
rk != 0 ) {
601 if ( lrC->
rk != -1 ) {
602 cblas_ztrsm(CblasColMajor,
603 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
605 CBLAS_SADDR(zone), A, lda,
608 flops += FLOPS_ZTRSM( side, lrC->
rk, N );
612 cblas_ztrsm(CblasColMajor,
613 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
615 CBLAS_SADDR(zone), A, lda,
618 flops += FLOPS_ZTRSM( side, M, N );
625 kernel_trace_stop( cblk->
fblokptr->
inlast, PastixKernelTRSMBlokLR,
626 full_m, N, full_n, flops, time );
688 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
690 cblk, blok_m, A, C, lowrank );
694 cblk, blok_m, A, C );