25 #ifndef DOXYGEN_SHOULD_SKIP_THIS
26 static float sone = 1.0;
27 static float szero = 0.0;
97 work = malloc( M * N *
sizeof(
float) );
105 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
110 flops = FLOPS_SGEMM( M, N, K );
163 ldbv = ( B->rk == -1 ) ? -1 : B->rkmax;
168 if ( B->rk > Brkmin ) {
173 pastix_fixdbl_t flops1 = FLOPS_SGEMM( M, B->rk, K ) + FLOPS_SGEMM( M, N, B->rk );
174 pastix_fixdbl_t flops2 = FLOPS_SGEMM( K, N, B->rk ) + FLOPS_SGEMM( M, N, K );
181 if ( flops1 <= flops2 ) {
183 work = malloc( (M * B->rk + M * N) *
sizeof(
float) );
194 cblas_sgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
200 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
210 work = malloc( (K * N + M * N) *
sizeof(
float) );
221 cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
227 cblas_sgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
241 AB->
rkmax = B->rkmax;
246 work = malloc( M * B->rk *
sizeof(
float) );
251 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
256 flops = FLOPS_SGEMM( M, B->rk, K );
308 ldav = ( A->rk == -1 ) ? -1 : A->rkmax;
314 if ( A->rk > Arkmin ) {
319 pastix_fixdbl_t flops1 = FLOPS_SGEMM( A->rk, N, K ) + FLOPS_SGEMM( M, N, A->rk );
320 pastix_fixdbl_t flops2 = FLOPS_SGEMM( M, K, A->rk ) + FLOPS_SGEMM( M, N, K );
327 if ( flops1 <= flops2 ) {
329 work = malloc( (A->rk * N + M * N) *
sizeof(
float) );
340 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
344 (szero), tmp, A->rk );
346 cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
356 work = malloc( (M * K + M * N) *
sizeof(
float) );
367 cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
373 cblas_sgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
392 work = malloc( A->rk * N *
sizeof(
float) );
397 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
401 (szero), AB->
v, AB->
rkmax );
403 flops = FLOPS_SGEMM( A->rk, N, K );
453 assert( A->rk <= A->rkmax && A->rk > 0 );
454 assert( B->rk <= B->rkmax && B->rk > 0 );
459 ldau = (A->rk == -1) ? A->rkmax : M;
461 ldbu = (B->rk == -1) ? B->rkmax : N;
465 work2 = malloc( A->rk * B->rk *
sizeof(
float) );
472 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
476 (szero), work2, A->rk );
477 flops = FLOPS_SGEMM( A->rk, B->rk, K );
482 flops += lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, -1, A->rk, B->rk, work2, A->rk, &rArB );
487 if ( rArB.
rk == -1 ) {
488 if ( A->rk <= B->rk ) {
494 work = malloc( A->rk * N *
sizeof(
float) );
504 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
506 (sone), work2, A->rk,
508 (szero), AB->
v, AB->
rkmax );
509 flops += FLOPS_SGEMM( A->rk, N, B->rk );
517 work = malloc( B->rk * M *
sizeof(
float) );
526 cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
531 flops += FLOPS_SGEMM( M, B->rk, A->rk );
536 else if ( rArB.
rk == 0 ) {
548 work = malloc( (M + N) * rArB.
rk *
sizeof(
float) );
555 AB->
v = work + M * rArB.
rk;
558 cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
564 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
566 (sone), rArB.
v, rArB.
rkmax,
568 (szero), AB->
v, rArB.
rk );
570 flops += FLOPS_SGEMM( M, rArB.
rk, A->rk ) + FLOPS_SGEMM( rArB.
rk, N, B->rk );
BEGIN_C_DECLS typedef int pastix_int_t
#define PASTE_CORE_SLRMM_PARAMS(_a_)
Initialize all the parameters of the core_slrmm family functions to ease the access.
static float * core_slrmm_getws(core_slrmm_t *params, ssize_t newsize)
Function to get a workspace pointer if space is available in the one provided.
pastix_fixdbl_t core_slrlr2lr(core_slrmm_t *params, pastix_lrblock_t *AB, int *infomask)
Perform the operation AB = op(A) * op(B), with A, B, and AB low-rank.
#define PASTE_CORE_SLRMM_VOID
Void all the parameters of the core_slrmm family functions to silence warnings.
pastix_fixdbl_t core_sfrlr2lr(core_slrmm_t *params, pastix_lrblock_t *AB, int *infomask, pastix_int_t Brkmin)
Perform the operation AB = op(A) * op(B), with A full-rank and B and AB low-rank.
pastix_fixdbl_t core_slrfr2lr(core_slrmm_t *params, pastix_lrblock_t *AB, int *infomask, pastix_int_t Arkmin)
Perform the operation AB = op(A) * op(B), with B full-rank and A and AB low-rank.
pastix_fixdbl_t core_sfrfr2lr(core_slrmm_t *params, pastix_lrblock_t *AB, int *infomask, pastix_int_t Kmax)
Perform the operation AB = op(A) * op(B), with A and B full-rank and AB low-rank.
Structure to store all the parameters of the core_slrmm family functions.
#define PASTIX_LRM3_ALLOCV
Macro to specify if the V part of a low-rank matrix has been allocated and needs to be freed or not (U...
#define PASTIX_LRM3_TRANSB
Macro to specify if the operator on B still needs to be applied to the V part of the low-rank ma...
#define PASTIX_LRM3_ALLOCU
Macro to specify if the U part of a low-rank matrix has been allocated and needs to be freed or not (U...
#define PASTIX_LRM3_ORTHOU
Macro to specify if the U part of a low-rank matrix is orthogonal or not (Used in LRMM functions).
The block low-rank structure to hold a matrix in low-rank form.
void core_slrfree(pastix_lrblock_t *A)
Free a low-rank matrix.