23 #include "kernels_trace.h"
27 #ifndef DOXYGEN_SHOULD_SKIP_THIS
28 static double mdone = -1.0;
29 static double done = 1.0;
30 static double dzero = 0.0;
120 pastix_int_t stride, stridef, indblok;
121 pastix_int_t M, N, K, m;
125 assert(!(cblk->
cblktype & CBLK_LAYOUT_2D));
126 assert(!(fcblk->
cblktype & CBLK_LAYOUT_2D));
128 assert( work != NULL );
140 M = stride - indblok - (shift * N);
143 A = A + indblok + (shift * N);
150 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
151 cblas_dgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
156 kernel_trace_stop_lvl2( FLOPS_DGEMM( M, N, K ) );
171 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
177 assert( fblok < fcblk[1].fblokptr );
183 pastix_cblk_lock( fcblk );
186 1.0, tmpC, stridef );
187 pastix_cblk_unlock( fcblk );
278 pastix_int_t stride, stridef;
279 pastix_int_t M, N, K;
283 assert(!(cblk->
cblktype & CBLK_LAYOUT_2D));
284 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
304 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
310 assert( fblok < fcblk[1].fblokptr );
323 pastix_cblk_lock( fcblk );
324 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
325 cblas_dgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
327 (mdone), blokA, stride,
329 (done), blokC, stridef );
330 kernel_trace_stop_lvl2( FLOPS_DGEMM( M, N, K ) );
331 pastix_cblk_unlock( fcblk );
418 pastix_int_t M, N, K, lda, ldb, ldc;
422 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
423 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
442 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
448 assert( fblok < fcblk[1].fblokptr );
462 pastix_cblk_lock( fcblk );
463 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
464 cblas_dgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
468 (done), blokC, ldc );
469 kernel_trace_stop_lvl2( FLOPS_DGEMM( M, N, K ) );
470 pastix_cblk_unlock( fcblk );
542 static inline pastix_fixdbl_t
544 pastix_int_t blok_mk,
545 pastix_int_t blok_kn,
546 pastix_int_t blok_mn,
558 const double *Aptr, *Bptr;
560 pastix_int_t M, N, K, lda, ldb, ldc, cblk_n, cblk_m;
561 pastix_int_t full_m, full_n;
562 size_t offsetA, offsetB, offsetC;
564 pastix_fixdbl_t flops = 0.0;
565 pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
568 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
569 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
577 blokB = fblokK + blok_kn;
581 blokA = fblokK + blok_mk;
591 blokC = fblokN + blok_mn;
593 assert( blokC->
fcblknm == cblk_m );
599 for (bA = blokA; (bA < lblokK) && (bA->
fcblknm == cblk_m); bA++) {
601 Aptr = A + bA->
coefind - offsetA;
608 assert( bC < lblokN );
611 Cptr = C + bC->
coefind - offsetC;
615 for (bB = blokB; (bB < lblokK) && (bB->
fcblknm == cblk_n); bB++) {
618 Bptr = B + bB->
coefind - offsetB;
621 cblas_dgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
628 flops += FLOPS_DGEMM( M, N, K );
632 kernel_trace_stop( blokB->
inlast, PastixKernelGEMMBlok2d2d,
708 static inline pastix_fixdbl_t
710 pastix_int_t blok_mk,
711 pastix_int_t blok_kn,
712 pastix_int_t blok_mn,
728 pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
729 size_t offsetA, offsetB;
731 pastix_fixdbl_t flops = 0.0;
732 pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
735 pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
738 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
739 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
742 assert(!(cblk->
cblktype & CBLK_COMPRESSED) );
743 assert( fcblk->
cblktype & CBLK_COMPRESSED );
751 blokB = fblokK + blok_kn;
756 blokA = fblokK + blok_mk;
767 blokC = fblokN + blok_mn;
768 assert( blokC->
fcblknm == cblk_m );
782 params.
lock = &(lock);
785 for (bA = blokA; (bA < lblokK) && (bA->
fcblknm == cblk_m); bA++) {
791 lrA.
u = (
double*)A + bA->
coefind - offsetA;
798 assert( bC < lblokN );
806 for (bB = blokB; (bB < lblokK) && (bB->
fcblknm == cblk_n); bB++) {
814 lrB.
u = (
double*)B + bB->
coefind - offsetB;
823 kernel_trace_stop( blokB->
inlast, PastixKernelGEMMBlokLRLR,
896 static inline pastix_fixdbl_t
898 pastix_int_t blok_mk,
899 pastix_int_t blok_kn,
900 pastix_int_t blok_mn,
916 pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
918 pastix_fixdbl_t flops = 0.0;
919 pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
922 pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
925 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
926 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
929 assert( cblk->
cblktype & CBLK_COMPRESSED );
930 assert( fcblk->
cblktype & CBLK_COMPRESSED );
938 blokB = fblokK + blok_kn;
941 blokA = fblokK + blok_mk;
950 blokC = fblokN + blok_mn;
951 assert( blokC->
fcblknm == cblk_m );
965 params.
lock = &(lock);
968 for (bA = blokA; (bA < lblokK) && (bA->
fcblknm == cblk_m); bA++, lrA++) {
977 assert( bC < lblokN );
985 for (bB = blokB, blrB = lrB; (bB < lblokK) && (bB->
fcblknm == cblk_n); bB++, blrB++) {
998 kernel_trace_stop( blokB->
inlast, PastixKernelGEMMBlokLRLR,
1068 static inline pastix_fixdbl_t
1087 pastix_int_t stride, shift;
1090 pastix_fixdbl_t flops = 0.0;
1093 assert(!(cblk->
cblktype & CBLK_COMPRESSED));
1094 assert( fcblk->
cblktype & CBLK_COMPRESSED );
1095 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
1097 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1107 lrB.
u = (
double*)B + blok->
coefind;
1124 params.
alpha = -1.0;
1127 params.
lwork = lwork;
1131 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
1138 assert( fblok < fcblk[1].fblokptr );
1145 lrA.
u = (
double*)A + iterblok->
coefind;
1222 static inline pastix_fixdbl_t
1239 pastix_int_t N, K, shift;
1242 pastix_fixdbl_t flops = 0.0;
1245 assert( cblk->
cblktype & CBLK_COMPRESSED );
1246 assert( fcblk->
cblktype & CBLK_COMPRESSED );
1247 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
1248 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
1250 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1272 params.
alpha = -1.0;
1275 params.
lwork = lwork;
1278 params.
B = lrB + (blok - cblk->
fblokptr);
1281 lrA = lrA + (blok - cblk->
fblokptr) + shift;
1282 for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1289 assert( fblok < fcblk[1].fblokptr );
1369 static inline pastix_fixdbl_t
1386 pastix_int_t N, K, shift;
1390 pastix_fixdbl_t flops = 0.0;
1393 assert( cblk->
cblktype & CBLK_COMPRESSED );
1394 assert( !(fcblk->
cblktype & CBLK_COMPRESSED) );
1395 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
1396 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
1398 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1420 params.
alpha = -1.0;
1423 params.
lwork = lwork;
1426 params.
B = lrB + (blok - cblk->
fblokptr);
1433 lrA = lrA + (blok - cblk->
fblokptr) + shift;
1434 for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1440 assert( fblok < fcblk[1].fblokptr );
1536 pastix_ktype_t ktype;
1537 pastix_fixdbl_t time, flops = 0.0;
1538 pastix_int_t m = cblk->
stride;
1545 if ( fcblk->
cblktype & CBLK_COMPRESSED ) {
1546 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
1547 ktype = PastixKernelGEMMCblkLRLR;
1548 time = kernel_trace_start( ktype );
1552 A, B, C, work, lwork,
1556 ktype = PastixKernelGEMMCblkFRLR;
1557 time = kernel_trace_start( ktype );
1561 A, B, C, work, lwork,
1565 else if ( fcblk->
cblktype & CBLK_LAYOUT_2D ) {
1566 if ( cblk->
cblktype & CBLK_COMPRESSED) {
1567 ktype = PastixKernelGEMMCblk2d2d;
1568 time = kernel_trace_start( ktype );
1571 A, B, C, work, lwork,
1574 else if ( cblk->
cblktype & CBLK_LAYOUT_2D ) {
1575 ktype = PastixKernelGEMMCblk2d2d;
1576 time = kernel_trace_start( ktype );
1583 ktype = PastixKernelGEMMCblk1d2d;
1584 time = kernel_trace_start( ktype );
1590 flops = FLOPS_DGEMM( m, n, k );
1593 assert( !(cblk->
cblktype & CBLK_COMPRESSED) );
1594 ktype = PastixKernelGEMMCblk1d1d;
1595 time = kernel_trace_start( ktype );
1601 flops = FLOPS_DGEMM( m, n, k );
1604 kernel_trace_stop( blok->
inlast, ktype, m, n, k, flops, time );
1674 pastix_int_t blok_mk,
1675 pastix_int_t blok_nk,
1676 pastix_int_t blok_mn,
1682 if ( fcblk->
cblktype & CBLK_COMPRESSED ) {
1683 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
1685 blok_mk, blok_nk, blok_mn,
1691 blok_mk, blok_nk, blok_mn,
1697 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
1703 blok_mk, blok_nk, blok_mn,
int core_dgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, double alpha, const double *A, pastix_int_t LDA, double beta, double *B, pastix_int_t LDB)
Add two matrices together.
static pastix_fixdbl_t core_dgemmsp_lrfr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, double *C, double *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static void core_dgemmsp_2d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const double *A, const double *B, double *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
static pastix_fixdbl_t core_dgemmsp_block_frlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const double *A, const double *B, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagonal block.
static void core_dgemmsp_1d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const double *A, const double *B, double *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
static pastix_fixdbl_t core_dgemmsp_block_lrlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagonal block.
static pastix_fixdbl_t core_dgemmsp_block_frfr(pastix_trans_t trans, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const double *A, const double *B, double *C)
Compute the updates that are generated by the transposition of all the blocks facing a common diagonal block.
static void core_dgemmsp_1d1d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const double *A, const double *B, double *C, double *work)
Compute the updates that are generated by the transposition of one single off-diagonal block.
static pastix_fixdbl_t core_dgemmsp_lr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, double *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static pastix_fixdbl_t core_dgemmsp_fulllr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const double *A, const double *B, pastix_lrblock_t *lrC, double *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
pastix_fixdbl_t cpublok_dgemmsp(pastix_trans_t transB, const SolverCblk *cblk, SolverCblk *fcblk, pastix_int_t blok_mk, pastix_int_t blok_nk, pastix_int_t blok_mn, const void *A, const void *B, void *C, const pastix_lr_t *lowrank)
Compute the CPU gemm associated to a couple of off-diagonal blocks.
pastix_fixdbl_t cpucblk_dgemmsp(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const void *A, const void *B, void *C, double *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
pastix_atomic_lock_t * lock
const pastix_lrblock_t * B
const pastix_lr_t * lowrank
const pastix_lrblock_t * A
pastix_fixdbl_t core_dlrmm(core_dlrmm_t *params)
Compute the matrix matrix product when involved matrices are stored in a low-rank structure.
Structure to store all the parameters of the core_dlrmm family functions.
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
enum pastix_trans_e pastix_trans_t
Transposition.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
static int is_block_inside_fblock(const SolverBlok *blok, const SolverBlok *fblok)
Check if a block is included inside another one.
pastix_atomic_lock_t lock
Solver column block structure.