29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
127 assert(!(cblk->
cblktype & CBLK_LAYOUT_2D));
128 assert(!(fcblk->
cblktype & CBLK_LAYOUT_2D));
130 assert( work != NULL );
142 M = stride - indblok - (shift * N);
145 A = A + indblok + (shift * N);
152 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
153 cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
155 CBLAS_SADDR(cone), A, stride,
157 CBLAS_SADDR(czero), wtmp, M );
158 kernel_trace_stop_lvl2( FLOPS_CGEMM( M, N, K ) );
173 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
179 assert( fblok < fcblk[1].fblokptr );
185 pastix_cblk_lock( fcblk );
188 1.0, tmpC, stridef );
189 pastix_cblk_unlock( fcblk );
285 assert(!(cblk->
cblktype & CBLK_LAYOUT_2D));
286 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
306 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
312 assert( fblok < fcblk[1].fblokptr );
325 pastix_cblk_lock( fcblk );
326 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
327 cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
329 CBLAS_SADDR(mcone), blokA, stride,
331 CBLAS_SADDR(cone), blokC, stridef );
332 kernel_trace_stop_lvl2( FLOPS_CGEMM( M, N, K ) );
333 pastix_cblk_unlock( fcblk );
424 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
425 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
444 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
450 assert( fblok < fcblk[1].fblokptr );
464 pastix_cblk_lock( fcblk );
465 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
466 cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
468 CBLAS_SADDR(mcone), blokA, lda,
470 CBLAS_SADDR(cone), blokC, ldc );
471 kernel_trace_stop_lvl2( FLOPS_CGEMM( M, N, K ) );
472 pastix_cblk_unlock( fcblk );
564 size_t offsetA, offsetB, offsetC;
570 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
571 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
579 blokB = fblokK + blok_kn;
583 blokA = fblokK + blok_mk;
593 blokC = fblokN + blok_mn;
595 assert( blokC->
lcblknm == cblk_n );
596 assert( blokC->
fcblknm == cblk_m );
602 for (bA = blokA; (bA < lblokK) && (bA->
fcblknm == cblk_m); bA++) {
604 Aptr = A + bA->
coefind - offsetA;
611 assert( bC < lblokN );
614 Cptr = C + bC->
coefind - offsetC;
618 for (bB = blokB; (bB < lblokK) && (bB->
fcblknm == cblk_n); bB++) {
621 Bptr = B + bB->
coefind - offsetB;
624 cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
626 CBLAS_SADDR(mcone), Aptr, lda,
631 flops += FLOPS_CGEMM( M, N, K );
732 size_t offsetA, offsetB;
738 pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
741 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
742 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
745 assert(!(cblk->
cblktype & CBLK_COMPRESSED) );
746 assert( fcblk->
cblktype & CBLK_COMPRESSED );
754 blokB = fblokK + blok_kn;
759 blokA = fblokK + blok_mk;
770 blokC = fblokN + blok_mn;
771 assert( blokC->
lcblknm == cblk_n );
772 assert( blokC->
fcblknm == cblk_m );
786 params.
lock = &(lock);
789 for (bA = blokA; (bA < lblokK) && (bA->
fcblknm == cblk_m); bA++) {
802 assert( bC < lblokN );
810 for (bB = blokB; (bB < lblokK) && (bB->
fcblknm == cblk_n); bB++) {
926 pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
929 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
930 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
933 assert( cblk->
cblktype & CBLK_COMPRESSED );
934 assert( fcblk->
cblktype & CBLK_COMPRESSED );
942 blokB = fblokK + blok_kn;
945 blokA = fblokK + blok_mk;
954 blokC = fblokN + blok_mn;
955 assert( blokC->
lcblknm == cblk_n );
956 assert( blokC->
fcblknm == cblk_m );
970 params.
lock = &(lock);
973 for (bA = blokA; (bA < lblokK) && (bA->
fcblknm == cblk_m); bA++, lrA++) {
982 assert( bC < lblokN );
990 for (bB = blokB, blrB = lrB; (bB < lblokK) && (bB->
fcblknm == cblk_n); bB++, blrB++) {
1098 assert(!(cblk->
cblktype & CBLK_COMPRESSED));
1099 assert( fcblk->
cblktype & CBLK_COMPRESSED );
1100 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
1102 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1129 params.
alpha = -1.0;
1132 params.
lwork = lwork;
1136 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
1143 assert( fblok < fcblk[1].fblokptr );
1250 assert( cblk->
cblktype & CBLK_COMPRESSED );
1251 assert( fcblk->
cblktype & CBLK_COMPRESSED );
1252 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
1253 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
1255 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1277 params.
alpha = -1.0;
1280 params.
lwork = lwork;
1283 params.
B = lrB + (blok - cblk->
fblokptr);
1286 lrA = lrA + (blok - cblk->
fblokptr) + shift;
1287 for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1294 assert( fblok < fcblk[1].fblokptr );
1398 assert( cblk->
cblktype & CBLK_COMPRESSED );
1399 assert( !(fcblk->
cblktype & CBLK_COMPRESSED) );
1400 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
1401 assert( fcblk->
cblktype & CBLK_LAYOUT_2D );
1403 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1425 params.
alpha = -1.0;
1428 params.
lwork = lwork;
1431 params.
B = lrB + (blok - cblk->
fblokptr);
1438 lrA = lrA + (blok - cblk->
fblokptr) + shift;
1439 for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1445 assert( fblok < fcblk[1].fblokptr );
1550 if ( fcblk->
cblktype & CBLK_COMPRESSED ) {
1551 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
1557 A, B, C, work, lwork,
1566 A, B, C, work, lwork,
1570 else if ( fcblk->
cblktype & CBLK_LAYOUT_2D ) {
1571 if ( cblk->
cblktype & CBLK_COMPRESSED) {
1576 A, B, C, work, lwork,
1579 else if ( cblk->
cblktype & CBLK_LAYOUT_2D ) {
1595 flops = FLOPS_CGEMM( m, n, k );
1598 assert( !(cblk->
cblktype & CBLK_COMPRESSED) );
1606 flops = FLOPS_CGEMM( m, n, k );
1687 if ( fcblk->
cblktype & CBLK_COMPRESSED ) {
1688 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
1690 blok_mk, blok_nk, blok_mn,
1696 blok_mk, blok_nk, blok_mn,
1702 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
1708 blok_mk, blok_nk, blok_mn,
BEGIN_C_DECLS typedef int pastix_int_t
float _Complex pastix_complex32_t
enum pastix_ktype_e pastix_ktype_t
List of the Level 1 events that may be traced in PaStiX.
static void kernel_trace_stop(int8_t inlast, pastix_ktype_t ktype, int m, int n, int k, double flops, double starttime)
Stop the trace of a single kernel.
static double kernel_trace_start(pastix_ktype_t ktype)
Start the trace of a single kernel.
@ PastixKernelGEMMCblkFRLR
@ PastixKernelGEMMBlokLRLR
@ PastixKernelGEMMCblk1d2d
@ PastixKernelGEMMCblkLRLR
@ PastixKernelGEMMCblk1d1d
@ PastixKernelGEMMCblk2d2d
@ PastixKernelGEMMBlok2d2d
int core_cgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, pastix_complex32_t alpha, const pastix_complex32_t *A, pastix_int_t LDA, pastix_complex32_t beta, pastix_complex32_t *B, pastix_int_t LDB)
Add two matrices together.
static void core_cgemmsp_1d1d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C, pastix_complex32_t *work)
Compute the updates that are generated by the transposition of one single off-diagonal block.
static pastix_fixdbl_t core_cgemmsp_block_frlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagonal block...
static void core_cgemmsp_1d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
static void core_cgemmsp_2d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
static pastix_fixdbl_t core_cgemmsp_block_frfr(pastix_trans_t trans, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C)
Compute the updates that are generated by the transposition of all the blocks facing a common diagonal block...
static pastix_fixdbl_t core_cgemmsp_lr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static pastix_fixdbl_t core_cgemmsp_lrfr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_complex32_t *C, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static pastix_fixdbl_t core_cgemmsp_block_lrlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagonal block...
static pastix_fixdbl_t core_cgemmsp_fulllr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_lrblock_t *lrC, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
pastix_fixdbl_t cpucblk_cgemmsp(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const void *A, const void *B, void *C, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
pastix_fixdbl_t cpublok_cgemmsp(pastix_trans_t transB, const SolverCblk *cblk, SolverCblk *fcblk, pastix_int_t blok_mk, pastix_int_t blok_nk, pastix_int_t blok_mn, const void *A, const void *B, void *C, const pastix_lr_t *lowrank)
Compute the CPU gemm associated to a couple of off-diagonal blocks.
pastix_atomic_lock_t * lock
const pastix_lr_t * lowrank
const pastix_lrblock_t * B
const pastix_lrblock_t * A
pastix_complex32_t * work
pastix_fixdbl_t core_clrmm(core_clrmm_t *params)
Compute the matrix matrix product when involved matrices are stored in a low-rank structure.
Structure to store all the parameters of the core_clrmm family functions.
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
enum pastix_trans_e pastix_trans_t
Transposition.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
static int is_block_inside_fblock(const SolverBlok *blok, const SolverBlok *fblok)
Check if a block is included inside another one.
pastix_atomic_lock_t lock
Solver column block structure.