29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
71 else if ( rkmax == 0 ) {
79 rkmax = pastix_imin( rkmax, rk );
81 #if defined(PASTIX_DEBUG_LR)
124 #if defined(PASTIX_DEBUG_LR)
185 newrkmax = (newrkmax == -1) ? newrk : newrkmax;
186 newrkmax = pastix_imax( newrkmax, newrk );
191 if ( (newrk > rklimit) || (newrk == -1) )
194 #if defined(PASTIX_DEBUG_LR)
205 else if (newrkmax == 0)
211 #if defined(PASTIX_DEBUG_LR)
227 if ( ( A->
rk == -1 ) ||
228 (( A->
rk != -1 ) && (newrkmax != A->
rkmax)) )
230 #if defined(PASTIX_DEBUG_LR)
235 v = u + M * newrkmax;
238 assert( A->
rk != -1 );
239 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', M, newrk,
242 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', newrk, N,
243 A->
v, A->
rkmax, v, newrkmax );
247 #if defined(PASTIX_DEBUG_LR)
318 if (Alr == NULL || Alr->
rk > Alr->
rkmax) {
326 if ( Alr->
rk == -1 ) {
327 if (Alr->
u == NULL || Alr->
v != NULL || (Alr->
rkmax < m))
332 else if ( Alr->
rk != 0){
333 if (Alr->
u == NULL || Alr->
v == NULL) {
340 if ( Alr->
rk == -1 ) {
341 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', m, n,
342 Alr->
u, Alr->
rkmax, A, lda );
345 else if ( Alr->
rk == 0 ) {
346 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'A', m, n,
351 cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
353 CBLAS_SADDR(cone), Alr->
u, m,
355 CBLAS_SADDR(czero), A, lda);
359 if ( Alr->
rk == -1 ) {
362 else if ( Alr->
rk == 0 ) {
363 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'A', n, m,
368 cblas_cgemm(CblasColMajor, CblasTrans, CblasTrans,
370 CBLAS_SADDR(cone), Alr->
v, Alr->
rkmax,
372 CBLAS_SADDR(czero), A, lda);
439 assert( (M1 + offx) <= M2 );
440 assert( (N1 + offy) <= N2 );
442 ldau = (A->
rk == -1) ? A->
rkmax : M1;
453 if ( (M1 != M2) || (N1 != N2) ) {
454 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'A', M2, N2,
460 0.0, u + M2 * offy + offx, M2 );
465 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'A', M2, B->
rk,
469 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', M1, A->
rk,
475 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'A', B->
rk, N2,
476 0.0, 0.0, v, B->
rkmax );
548 rank = (A->
rk == -1) ? pastix_imin(M1, N1) : A->
rk;
551 ldau = (A->
rk == -1) ? A->
rkmax : M1;
554 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', M2, B->
rk,
555 B->
u, ldbu, u1u2, M2 );
558 tmp = u1u2 + B->
rk * M2;
570 for (i=0; i<M1; i++, tmp += M2+1) {
576 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'A', M2, M1,
588 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', M1, N1,
589 A->
u, ldau, tmp + offx, M2 );
600 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', M1, A->
rk,
601 A->
u, ldau, tmp + offx, M2 );
661 rank = (A->
rk == -1) ? pastix_imin(M1, N1) : A->
rk;
664 ldau = (A->
rk == -1) ? A->
rkmax : M1;
668 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', B->
rk, N2,
669 B->
v, ldbv, v1v2, rank );
680 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'A', M1, N2,
681 0.0, 0.0, tmp, rank );
686 0.0, tmp + offy * rank, rank );
694 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'A', N1, N2,
695 0.0, 0.0, tmp, rank );
700 for (i=0; i<N1; i++, tmp += rank+1) {
706 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'A', N1, N2,
707 0.0, alpha, tmp + offy * rank, rank );
717 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'A', A->
rk, N2,
718 0.0, 0.0, tmp, rank );
723 0.0, tmp + offy * rank, rank );
800 float norm = LAPACKE_clange_work( LAPACK_COL_MAJOR,
'f', m, n,
803 if ( (norm == 0.) && (tol >= 0.)) {
813 else if ( use_reltol ) {
817 ret = rrqrfct( tol, rklimit, 0, nb,
831 #if defined(PASTIX_DEBUG_LR)
836 rwork = malloc( rsize *
sizeof(
float) );
842 rwork = (
float*)(work + lwork);
850 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', m, n,
854 newrk = rrqrfct( tol, rklimit, 0, nb,
857 work, lwork, rwork );
859 flops = FLOPS_CGEQRF( m, n );
862 flops = FLOPS_CGEQRF( m, newrk ) + FLOPS_CUNMQR( m, n-newrk, newrk,
PastixLeft );
872 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', m, n,
873 A, lda, Alr->
u, Alr->
rkmax );
876 else if ( newrk > 0 ) {
887 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', m, Alr->
rk,
891 ret = LAPACKE_cungqr_work( LAPACK_COL_MAJOR, m, Alr->
rk, Alr->
rk,
892 U, m, tau, work, lwork );
894 flops += FLOPS_CUNGQR( m, Alr->
rk, Alr->
rk );
897 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'L', Alr->
rk-1, Alr->
rk-1,
898 0.0, 0.0, Acpy + 1, m );
901 memcpy( V + jpvt[i] * Alr->
rk,
907 #if defined(PASTIX_DEBUG_LR)
911 fprintf(stderr,
"Failed to compress a matrix and generate an orthogonal u\n" );
918 #if defined(PASTIX_DEBUG_LR)
999 #if defined(PRECISION_c) || defined(PRECISION_z)
1005 float norm = LAPACKE_clange_work( LAPACK_COL_MAJOR,
'f', m, n,
1008 if ( (norm == 0.) && (tol >= 0.)) {
1018 else if ( use_reltol ) {
1022 ret = rrqrfct( tol, rklimit, nb,
1026 &zzsize, -1, norm );
1030 bsize = n * rklimit;
1038 #if defined(PASTIX_DEBUG_LR)
1059 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', m, n,
1063 newrk = rrqrfct( tol, rklimit, nb,
1067 work, lwork, norm );
1069 flops = FLOPS_CGEQRF( m, n );
1072 flops = FLOPS_CGEQRF( m, newrk ) + FLOPS_CUNMQR( m, n-newrk, newrk,
PastixLeft );
1081 if ( newrk == -1 ) {
1082 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', m, n,
1083 A, lda, Alr->
u, Alr->
rkmax );
1086 else if ( newrk > 0 ) {
1097 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', m, Alr->
rk,
1101 ret = LAPACKE_cungqr_work( LAPACK_COL_MAJOR, m, Alr->
rk, Alr->
rk,
1102 U, m, tau, work, lwork );
1104 flops += FLOPS_CUNGQR( m, Alr->
rk, Alr->
rk );
1107 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'U', Alr->
rk, n,
1108 Acpy, m, V, Alr->
rk );
1110 ret = LAPACKE_claset_work( LAPACK_COL_MAJOR,
'L', Alr->
rk-1, Alr->
rk-1,
1111 0.0, 0.0, V + 1, Alr->
rk );
1120 rk = (Alr->
rk / nb) * nb;
1122 d = pastix_imin( nb, Alr->
rk - rk );
1123 ret = LAPACKE_cunmqr_work( LAPACK_COL_MAJOR,
'R', trans,
1124 Alr->
rk - rk, n - rk, d,
1125 B + rk * n + rk, n, tau_b + rk,
1126 V + rk * Alr->
rk + rk, Alr->
rk,
1134 #if defined(PASTIX_DEBUG_LR)
1135 if ( Alr->
rk > 0 ) {
1138 fprintf(stderr,
"Failed to compress a matrix and generate an orthogonal u\n" );
1145 #if defined(PASTIX_DEBUG_LR)
1221 const void *alphaptr,
1248 #if defined(PASTIX_DEBUG_LR)
1252 fprintf(stderr,
"Failed to have B->u orthogonal in entry of rradd\n" );
1257 rankA = (A->
rk == -1) ? pastix_imin(M1, N1) : A->
rk;
1258 rank = rankA + B->
rk;
1259 M = pastix_imax(M2, M1);
1260 N = pastix_imax(N2, N1);
1262 minV = pastix_imin(N, rank);
1264 assert(M2 == M && N2 == N);
1265 assert(B->
rk != -1);
1270 if ( ((M1 + offx) > M2) ||
1271 ((N1 + offy) > N2) )
1273 pastix_print_error(
"Dimensions are not correct" );
1291 M1, N1, A, M2, N2, B,
1299 if ( rank > pastix_imin( M, N ) ) {
1306 ldau = (A->
rk == -1) ? A->
rkmax : M1;
1317 wzsize = (M+N) * rank;
1323 rrqrfct( tol, rklimit, 1, nb,
1326 &zzsize, -1, NULL );
1330 #if defined(PASTIX_DEBUG_LR)
1337 rwork = malloc( 2 * pastix_imax( rank, N ) *
sizeof(
float) );
1339 zbuf = malloc( wzsize *
sizeof(
pastix_complex32_t) + 2 * pastix_imax(rank, N) *
sizeof(
float) );
1342 v1v2 = u1u2 + ldu * rank;
1343 tauV = v1v2 + ldv * N;
1344 zwork = tauV + rank;
1346 rwork = (
float*)(zwork + lwork);
1383 u1u2, ldu, v1v2, ldv );
1388 u1u2, ldu, v1v2, ldv );
1394 u1u2, ldu, v1v2, ldv );
1396 kernel_trace_stop_lvl2( flops );
1398 total_flops += flops;
1401 rank = B->
rk + rankA;
1408 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', M, B->
rk, u1u2, ldu, B->
u, ldbu );
1410 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', B->
rk, N, v1v2, ldv, B->
v, ldbv );
1414 #if defined(PASTIX_DEBUG_LR)
1424 MALLOC_INTERN( jpvt, pastix_imax(rank, N),
pastix_int_t );
1452 tol = tol * ( cabsf(alpha) * normA + normB );
1459 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_recompression );
1460 rklimit = pastix_imin( rklimit, rank );
1461 new_rank = rrqrfct( tol, rklimit, 1, nb,
1464 zwork, lwork, rwork );
1465 flops = (new_rank == -1) ? FLOPS_CGEQRF( rank, N )
1466 : (FLOPS_CGEQRF( rank, new_rank ) +
1467 FLOPS_CUNMQR( rank, N-new_rank, new_rank,
PastixLeft ));
1468 kernel_trace_stop_lvl2_rank( flops, new_rank );
1469 total_flops += flops;
1474 if ( (new_rank > rklimit) ||
1483 flops = FLOPS_CGEMM( M, N, Bbackup.
rk );
1484 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
1485 cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
1487 CBLAS_SADDR(cone), Bbackup.
u, ldbu,
1489 CBLAS_SADDR(czero), u, M );
1490 kernel_trace_stop_lvl2( flops );
1491 total_flops += flops;
1494 if ( A->
rk == -1 ) {
1495 flops = 2 * M1 * N1;
1496 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
1499 cone, u + offy * M + offx, M);
1500 kernel_trace_stop_lvl2( flops );
1503 flops = FLOPS_CGEMM( M1, N1, A->
rk );
1504 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
1505 cblas_cgemm(CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transA1,
1507 CBLAS_SADDR(alpha), A->
u, ldau,
1509 CBLAS_SADDR(cone), u + offy * M + offx, M);
1510 kernel_trace_stop_lvl2( flops );
1512 total_flops += flops;
1516 #if defined(PASTIX_DEBUG_LR)
1525 else if ( new_rank == 0 ) {
1529 #if defined(PASTIX_DEBUG_LR)
1543 ret =
core_clrsze( 0, M, N, B, new_rank, -1, -1 );
1544 assert( ret != -1 );
1545 assert( B->
rkmax >= new_rank );
1557 for (i=0; i<N; i++){
1558 lm = pastix_imin( new_rank, i+1 );
1559 memcpy(tmpV + jpvt[i] * ldbv,
1567 flops = FLOPS_CUNGQR( rank, new_rank, new_rank )
1568 + FLOPS_CGEMM( M, new_rank, rank );
1570 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_computeNewU );
1571 ret = LAPACKE_cungqr_work( LAPACK_COL_MAJOR, rank, new_rank, new_rank,
1572 v1v2, ldv, tauV, zwork, lwork );
1575 cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
1577 CBLAS_SADDR(cone), u1u2, ldu,
1579 CBLAS_SADDR(czero), B->
u, ldbu);
1580 kernel_trace_stop_lvl2( flops );
1581 total_flops += flops;
1587 #if defined(PASTIX_DEBUG_LR)
1625 if ( A->
rk != -1 ) {
1626 return A->
rk * ( M + N );
1664 int rkmax = A->
rkmax;
1670 memcpy( buffer, &rk,
sizeof(
int ) );
1671 buffer +=
sizeof( int );
1679 if ( rk == rkmax ) {
1684 ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR,
'A', rk, N, v, rkmax,
1731 memcpy( &rk, buffer,
sizeof(
int ) );
1732 buffer +=
sizeof( int );
1789 char *output = *outptr;
1793 rk = *((
int *)input);
1794 input +=
sizeof( int );
1804 memcpy( A->
u, input, size );
1812 memcpy( A->
v, input, size );
1825 memcpy( A->
u, input, size );
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...
BEGIN_C_DECLS typedef int pastix_int_t
float _Complex pastix_complex32_t
@ PastixKernelLvl2_LR_add2C_rradd_orthogonalize
pastix_fixdbl_t core_clrorthu_cgs(pastix_int_t M1, pastix_int_t N1, pastix_int_t M2, pastix_int_t N2, pastix_int_t r1, pastix_int_t *r2ptr, pastix_int_t offx, pastix_int_t offy, pastix_complex32_t *U, pastix_int_t ldu, pastix_complex32_t *V, pastix_int_t ldv)
Try to orthognalize the U part of the low-rank form, and update the V part accordingly using CGS.
int core_cgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, pastix_complex32_t alpha, const pastix_complex32_t *A, pastix_int_t LDA, pastix_complex32_t beta, pastix_complex32_t *B, pastix_int_t LDB)
Add two matrices together.
pastix_fixdbl_t core_clrorthu_fullqr(pastix_int_t M, pastix_int_t N, pastix_int_t rank, pastix_complex32_t *U, pastix_int_t ldu, pastix_complex32_t *V, pastix_int_t ldv)
Try to orthognalize the u part of the low-rank form, and update the v part accordingly using full QR.
pastix_fixdbl_t core_clrorthu_partialqr(pastix_int_t M, pastix_int_t N, pastix_int_t r1, pastix_int_t *r2ptr, pastix_int_t offx, pastix_int_t offy, pastix_complex32_t *U, pastix_int_t ldu, pastix_complex32_t *V, pastix_int_t ldv)
Try to orthognalize the U part of the low-rank form, and update the V part accordingly using partial ...
void core_cgetmo(int m, int n, const pastix_complex32_t *A, int lda, pastix_complex32_t *B, int ldb)
Transposes a m-by-n matrix out of place using an extra workspace of size m-by-n.
int core_clrdbg_check_orthogonality(pastix_int_t M, pastix_int_t N, const pastix_complex32_t *A, pastix_int_t lda)
Check the orthogonality of the matrix A.
pastix_int_t pastix_lr_ortho
Define the orthogonalization method.
pastix_int_t(* core_get_rklimit)(pastix_int_t, pastix_int_t)
Compute the maximal rank accepted for a given matrix size. The pointer is set according to the low-ra...
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
pastix_fixdbl_t core_cge2lr_qrrt(core_crrqr_rt_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
pastix_fixdbl_t core_crradd_qr(core_crrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
int(* core_crrqr_rt_t)(float tol, pastix_int_t maxrank, pastix_int_t nb, pastix_int_t m, pastix_int_t n, pastix_complex32_t *A, pastix_int_t lda, pastix_complex32_t *tau, pastix_complex32_t *B, pastix_int_t ldb, pastix_complex32_t *tau_b, pastix_complex32_t *work, pastix_int_t lwork, float normA)
TODO.
pastix_fixdbl_t core_cge2lr_qrcp(core_crrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
int(* core_crrqr_cp_t)(float tol, pastix_int_t maxrank, int refine, pastix_int_t nb, pastix_int_t m, pastix_int_t n, pastix_complex32_t *A, pastix_int_t lda, pastix_int_t *jpvt, pastix_complex32_t *tau, pastix_complex32_t *work, pastix_int_t lwork, float *rwork)
TODO.
char * core_clrunpack(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, char *buffer)
Unpack low rank data and fill the cblk concerned by the computation.
int core_clr2ge(pastix_trans_t trans, pastix_int_t m, pastix_int_t n, const pastix_lrblock_t *Alr, pastix_complex32_t *A, pastix_int_t lda)
Convert a low rank matrix into a dense matrix.
char * core_clrpack(pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A, char *buffer)
Pack low-rank data by side.
float core_clrnrm(pastix_normtype_t ntype, int transV, pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A)
Compute the norm of a low-rank matrix.
int core_clrsze(int copy, pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, pastix_int_t newrk, pastix_int_t newrkmax, pastix_int_t rklimit)
Resize a low-rank matrix.
const char * core_clrunpack2(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, const char *input, char **outptr)
Unpack low rank data and fill the cblk concerned by the computation.
void core_clrconcatenate_u(pastix_complex32_t alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_lrblock_t *B, pastix_int_t offx, pastix_complex32_t *u1u2)
Concatenate left parts of two low-rank matrices.
void core_clralloc(pastix_int_t M, pastix_int_t N, pastix_int_t rkmax, pastix_lrblock_t *A)
Allocate a low-rank matrix.
void core_clrcpy(const pastix_lr_t *lowrank, pastix_trans_t transAv, pastix_complex32_t alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Copy a small low-rank structure into a large one.
size_t core_clrgetsize(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A)
Compute the size of a block to send in LR.
void core_clrconcatenate_v(pastix_trans_t transA1, pastix_complex32_t alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offy, pastix_complex32_t *v1v2)
Concatenate right parts of two low-rank matrices.
void core_clrfree(pastix_lrblock_t *A)
Free a low-rank matrix.
enum pastix_trans_e pastix_trans_t
Transpostion.
@ PastixCompressOrthoPartialQR