23 #include "kernels_trace.h"
27 #ifndef DOXYGEN_SHOULD_SKIP_THIS
28 static pastix_complex32_t cone = 1.0;
77 const pastix_complex32_t *A,
78 pastix_complex32_t *C )
81 pastix_int_t M, N, lda;
91 assert( fblok + 1 < cblk[1].fblokptr );
93 assert(!(cblk->
cblktype & CBLK_LAYOUT_2D));
98 kernel_trace_start_lvl2( PastixKernelLvl2_FR_TRSM );
99 cblas_ctrsm(CblasColMajor,
100 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
102 CBLAS_SADDR(cone), A, lda,
104 kernel_trace_stop_lvl2( FLOPS_CTRSM( side, M, N ) );
154 const pastix_complex32_t *A,
155 pastix_complex32_t *C )
158 pastix_int_t M, N, lda, ldc;
159 pastix_complex32_t *blokC;
167 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
169 for (blok=fblok+1; blok<lblok; blok++) {
175 kernel_trace_start_lvl2( PastixKernelLvl2_FR_TRSM );
176 cblas_ctrsm(CblasColMajor,
177 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
179 CBLAS_SADDR(cone), A, lda,
181 kernel_trace_stop_lvl2( FLOPS_CTRSM( side, M, N ) );
232 static inline pastix_fixdbl_t
243 pastix_int_t M, N, lda;
244 pastix_complex32_t *A;
246 pastix_fixdbl_t flops = 0.0;
247 pastix_fixdbl_t flops_lr, flops_c;
256 assert( lrA->
rk == -1 );
258 assert( cblk->
cblktype & CBLK_COMPRESSED );
259 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
262 for (blok=fblok+1; blok<lblok; blok++, lrC++) {
285 if ( lrC->
rk != 0 ) {
286 if ( lrC->
rk != -1 ) {
287 kernel_trace_start_lvl2( PastixKernelLvl2_LR_TRSM );
288 cblas_ctrsm(CblasColMajor,
289 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
291 CBLAS_SADDR(cone), A, lda,
293 flops_c = FLOPS_CTRSM( side, lrC->
rk, N );
294 kernel_trace_stop_lvl2( flops_c );
297 kernel_trace_start_lvl2( PastixKernelLvl2_FR_TRSM );
298 cblas_ctrsm(CblasColMajor,
299 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
301 CBLAS_SADDR(cone), A, lda,
303 flops_c = FLOPS_CTRSM( side, M, N );
304 kernel_trace_stop_lvl2( flops_c );
308 flops += flops_lr + flops_c;
365 if ( cblk[0].fblokptr + 1 < cblk[1].fblokptr )
367 pastix_ktype_t ktype = PastixKernelLvl1Nbr;
368 pastix_fixdbl_t time, flops = 0.0;
370 pastix_int_t m = cblk->
stride - n;
372 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
373 ktype = PastixKernelTRSMCblkLR;
374 time = kernel_trace_start( ktype );
377 cblk, A, C, lowrank );
380 if ( cblk->
cblktype & CBLK_LAYOUT_2D ) {
381 ktype = PastixKernelTRSMCblk2d;
382 time = kernel_trace_start( ktype );
388 ktype = PastixKernelTRSMCblk1d;
389 time = kernel_trace_start( ktype );
397 kernel_trace_stop( cblk->
fblokptr->
inlast, ktype, m, n, 0, flops, time );
451 static inline pastix_fixdbl_t
458 const pastix_complex32_t *A,
459 pastix_complex32_t *C )
462 pastix_int_t M, N, lda, ldc, offset, cblk_m, full_m;
463 pastix_complex32_t *Cptr;
464 pastix_fixdbl_t flops = 0.0;
465 pastix_fixdbl_t time = kernel_trace_start( PastixKernelTRSMBlok2d );
473 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
475 blok = fblok + blok_m;
480 for (; (blok < lblok) && (blok->
fcblknm == cblk_m); blok++) {
482 Cptr = C + blok->
coefind - offset;
486 cblas_ctrsm( CblasColMajor,
487 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
489 CBLAS_SADDR(cone), A, lda,
492 flops += FLOPS_CTRSM( side, M, N );
496 kernel_trace_stop( cblk->
fblokptr->
inlast, PastixKernelTRSMBlok2d,
497 full_m, N, 0, flops, time );
553 static inline pastix_fixdbl_t
565 pastix_int_t M, N, lda, cblk_m, full_m, full_n;
566 pastix_complex32_t *A;
567 pastix_fixdbl_t flops = 0.0;
568 pastix_fixdbl_t time = kernel_trace_start( PastixKernelTRSMBlokLR );
577 assert( cblk->
cblktype & CBLK_COMPRESSED );
578 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
581 assert( lrA->
rk == -1 );
583 blok = fblok + blok_m;
588 for (; (blok < lblok) && (blok->
fcblknm == cblk_m); blok++, lrC++) {
608 if ( lrC->
rk != 0 ) {
609 if ( lrC->
rk != -1 ) {
610 cblas_ctrsm(CblasColMajor,
611 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
613 CBLAS_SADDR(cone), A, lda,
616 flops += FLOPS_CTRSM( side, lrC->
rk, N );
620 cblas_ctrsm(CblasColMajor,
621 (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag,
623 CBLAS_SADDR(cone), A, lda,
626 flops += FLOPS_CTRSM( side, M, N );
633 kernel_trace_stop( cblk->
fblokptr->
inlast, PastixKernelTRSMBlokLR,
634 full_m, N, full_n, flops, time );
700 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
702 cblk, blok_m, A, C, lowrank );
706 cblk, blok_m, A, C );
static pastix_fixdbl_t core_ctrsmsp_lr(pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, const pastix_lrblock_t *lrA, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Computes the updates associated to one off-diagonal block between two cblk stored in low-rank format.
static void core_ctrsmsp_2d(pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, const pastix_complex32_t *A, pastix_complex32_t *C)
Compute the updates associated to one off-diagonal block between two cblk stored in 2D.
static pastix_fixdbl_t core_ctrsmsp_2dsub(pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, pastix_int_t blok_m, const pastix_complex32_t *A, pastix_complex32_t *C)
Compute the updates associated to one off-diagonal block between two cblk stored in 2D.
static pastix_fixdbl_t core_ctrsmsp_lrsub(pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, pastix_int_t blok_m, const pastix_lrblock_t *lrA, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block between two cblk stored in low-rank format.
static void core_ctrsmsp_1d(pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, const pastix_complex32_t *A, pastix_complex32_t *C)
Apply all the trsm updates on a panel stored in 1D layout.
pastix_fixdbl_t cpublok_ctrsmsp(pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, pastix_int_t blok_m, const void *A, void *C, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
void cpucblk_ctrsmsp(pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, const void *A, void *C, const pastix_lr_t *lowrank)
Compute the updates associated to a column of off-diagonal blocks.
pastix_fixdbl_t cpublok_ccompress(const pastix_lr_t *lowrank, pastix_int_t M, pastix_int_t N, pastix_lrblock_t *lrA)
Compress a single block from full-rank to low-rank format.
pastix_int_t compress_min_width
pastix_int_t compress_min_height
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
enum pastix_diag_e pastix_diag_t
Diagonal.
enum pastix_uplo_e pastix_uplo_t
Upper/Lower part.
enum pastix_side_e pastix_side_t
Side of the operation.
enum pastix_trans_e pastix_trans_t
Transposition.
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Solver column block structure.