29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
30 #define PASTIX_SVD_2NORM 1
33 #if !defined(PASTIX_SVD_2NORM)
34 #include "common/frobeniusupdate.h"
72 for( i=0; i<n; i++, x+=incx ) {
73 frobenius_update( 1, &scale, &sumsq, x );
76 return scale * sqrt( sumsq );
80 #ifndef DOXYGEN_SHOULD_SKIP_THIS
81 static pastix_complex64_t zone = 1.0;
82 static pastix_complex64_t zzero = 0.0;
137 const pastix_complex64_t *A = (
const pastix_complex64_t*)Avoid;
139 pastix_complex64_t *u, *v, *zwork, *Acpy, ws;
159 norm = LAPACKE_zlange_work( LAPACK_COL_MAJOR,
'f', m, n,
163 if ( (norm == 0.) && (tol >= 0.) ) {
172 else if ( use_reltol ) {
177 minMN = pastix_imin(m, n);
178 rklimit = pastix_imin( minMN, rklimit );
185 if ( rklimit == 0 ) {
186 if ( (tol < 0.) || (norm < tol) ) {
193 ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR,
'A', m, n,
194 A, lda, Alr->
u, Alr->
rkmax );
211 #if defined(PASTIX_DEBUG_LR_NANCHECK)
220 ret = MYLAPACKE_zgesvd_work( LAPACK_COL_MAJOR,
'S',
'S',
222 NULL, NULL, ldu, NULL, ldv,
223 &ws, lwork, &rwork );
232 #if defined(PRECISION_z) || defined(PRECISION_c)
236 zwork = malloc( zsize *
sizeof(pastix_complex64_t) + rsize *
sizeof(
double) );
237 rwork = (
double*)(zwork + zsize);
239 Acpy = zwork + lwork;
245 ret = LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,
'A', m, n,
249 ret = MYLAPACKE_zgesvd_work( LAPACK_COL_MAJOR,
'S',
'S',
252 zwork, lwork, rwork + minMN );
254 pastix_print_error(
"SVD Failed\n" );
258 imax = pastix_imin( minMN, rklimit+1 );
259 for (i=0; i<imax; i++, v+=1) {
272 #if defined(PASTIX_SVD_2NORM)
275 frob_norm =
core_dlassq( minMN-i, s + minMN - 1, -1 );
278 if (frob_norm < tol) {
281 cblas_zdscal(n, s[i], v, ldv);
293 ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR,
'A', m, n,
294 A, lda, Alr->
u, Alr->
rkmax );
358 const void *alphaptr,
371 pastix_complex64_t *u1u2, *v1v2, *R, *u, *v;
372 pastix_complex64_t *tmp, *zbuf, *tauU, *tauV;
373 pastix_complex64_t querysize;
374 pastix_complex64_t alpha = *((pastix_complex64_t*)alphaptr);
376 size_t wzsize, wdsize;
380 rank = (A->
rk == -1) ? pastix_imin(M1, N1) : A->
rk;
382 M = pastix_imax(M2, M1);
383 N = pastix_imax(N2, N1);
384 minU = pastix_imin(M, rank);
385 minV = pastix_imin(N, rank);
387 assert(M2 == M && N2 == N);
393 if ( ((M1 + offx) > M2) ||
396 pastix_print_error(
"Dimensions are not correct" );
408 ldau = (A->
rk == -1) ? A->
rkmax : M1;
419 M1, N1, A, M2, N2, B,
427 if ( rank > pastix_imin( M, N ) ) {
435 wzsize = (M+N) * rank;
437 wzsize += minU + minV;
440 wzsize += 3 * rank * rank;
443 lwork = pastix_imax( M, N ) * 32;
446 #if defined(PASTIX_DEBUG_LR_NANCHECK)
455 ret = MYLAPACKE_zgesvd_work( LAPACK_COL_MAJOR,
'S',
'S',
456 rank, rank, NULL, rank,
457 NULL, NULL, rank, NULL, rank,
458 &querysize, -1, &rwork );
462 lwork = pastix_imax( lwork, querysize );
466 #if defined(PRECISION_z) || defined(PRECISION_c)
470 zbuf = malloc( wzsize *
sizeof(pastix_complex64_t) + wdsize *
sizeof(
double) );
471 s = (
double*)(zbuf + wzsize);
474 tauU = u1u2 + M * rank;
476 tauV = v1v2 + N * rank;
493 ret = LAPACKE_zgeqrf_work( LAPACK_COL_MAJOR, M, rank,
494 u1u2, M, tauU, zbuf, lwork );
496 total_flops += FLOPS_ZGEQRF( M, rank );
511 ret = LAPACKE_zgelqf_work( LAPACK_COL_MAJOR, rank, N,
512 v1v2, rank, tauV, zbuf, lwork );
514 total_flops += FLOPS_ZGELQF( rank, N );
522 memset(R, 0, rank * rank *
sizeof(pastix_complex64_t));
524 ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR,
'U', rank, rank,
528 cblas_ztrmm(CblasColMajor,
529 CblasRight, CblasLower,
530 CblasNoTrans, CblasNonUnit,
531 rank, rank, CBLAS_SADDR(zone),
532 v1v2, rank, R, rank);
533 total_flops += FLOPS_ZTRMM(
PastixRight, rank, rank );
561 tol = tol * ( cabs(alpha) * normA + normB );
568 flops = FLOPS_ZGEQRF( rank, rank ) + FLOPS_ZGELQF( rank, (rank-1) );
570 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_recompression );
571 ret = MYLAPACKE_zgesvd_work( LAPACK_COL_MAJOR,
'S',
'S',
574 zbuf, lwork, s + rank );
576 pastix_print_error(
"LAPACKE_zgesvd FAILED" );
584 for (i=0; i<rank; i++, tmp+=1){
597 #if defined(PASTIX_SVD_2NORM)
600 frob_norm =
core_dlassq( rank-i, s + rank - 1, -1 );
603 if (frob_norm < tol) {
606 cblas_zdscal(rank, s[i], tmp, rank);
609 kernel_trace_stop_lvl2_rank( flops, new_rank );
610 total_flops += flops;
622 flops = FLOPS_ZGEMM( M, N, Bbackup.
rk );
623 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
624 cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
626 CBLAS_SADDR(zone), Bbackup.
u, ldbu,
628 CBLAS_SADDR(zzero), u, M );
629 kernel_trace_stop_lvl2( flops );
630 total_flops += flops;
635 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
638 zone, u + offy * M + offx, M);
639 kernel_trace_stop_lvl2( flops );
642 flops = FLOPS_ZGEMM( M1, N1, A->
rk );
643 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
644 cblas_zgemm(CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transA1,
646 CBLAS_SADDR(alpha), A->
u, ldau,
648 CBLAS_SADDR(zone), u + offy * M + offx, M);
649 kernel_trace_stop_lvl2( flops );
651 total_flops += flops;
656 else if ( new_rank == 0 ) {
668 assert( B->
rkmax >= new_rank );
677 #if defined(PASTIX_DEBUG_LR_NANCHECK)
678 ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR,
'A', M, new_rank,
679 0.0, 0.0, B->
u, ldbu );
681 ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR,
'A', rank, new_rank,
682 u, rank, B->
u, ldbu );
686 for (i=0; i<new_rank; i++, tmp+=ldbu, u+=rank) {
687 memcpy(tmp, u, rank *
sizeof(pastix_complex64_t));
688 memset(tmp + rank, 0, (M - rank) *
sizeof(pastix_complex64_t));
692 flops = FLOPS_ZUNMQR( M, new_rank, minU,
PastixLeft )
694 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_computeNewU );
695 ret = LAPACKE_zunmqr_work(LAPACK_COL_MAJOR,
'L',
'N',
706 ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR,
'A', new_rank, rank,
707 v, rank, B->
v, ldbv );
710 ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR,
'A', new_rank, N-rank,
711 0.0, 0.0, tmp + ldbv * rank, ldbv );
714 ret = LAPACKE_zunmlq_work(LAPACK_COL_MAJOR,
'R',
'N',
720 kernel_trace_stop_lvl2( flops );
721 total_flops += flops;
static double core_dlassq(int n, const double *x, int incx)
Compute the frobenius norm of a vector.
BEGIN_C_DECLS typedef int pastix_int_t
int core_zgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, pastix_complex64_t alpha, const pastix_complex64_t *A, pastix_int_t LDA, pastix_complex64_t beta, pastix_complex64_t *B, pastix_int_t LDB)
Add two matrices together.
pastix_int_t(* core_get_rklimit)(pastix_int_t, pastix_int_t)
Compute the maximal rank accepted for a given matrix size. The pointer is set according to the low-ra...
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
pastix_fixdbl_t core_zrradd_svd(const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T.
pastix_fixdbl_t core_zge2lr_svd(int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Convert a full rank matrix in a low rank matrix, using SVD.
void core_zlrcpy(const pastix_lr_t *lowrank, pastix_trans_t transAv, pastix_complex64_t alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Copy a small low-rank structure into a large one.
void core_zlrconcatenate_v(pastix_trans_t transA1, pastix_complex64_t alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offy, pastix_complex64_t *v1v2)
Concatenate right parts of two low-rank matrices.
int core_zlrsze(int copy, pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, pastix_int_t newrk, pastix_int_t newrkmax, pastix_int_t rklimit)
Resize a low-rank matrix.
void core_zlrconcatenate_u(pastix_complex64_t alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_lrblock_t *B, pastix_int_t offx, pastix_complex64_t *u1u2)
Concatenate left parts of two low-rank matrices.
double core_zlrnrm(pastix_normtype_t ntype, int transV, pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A)
Compute the norm of a low-rank matrix.
void core_zlrfree(pastix_lrblock_t *A)
Free a low-rank matrix.
void core_zlralloc(pastix_int_t M, pastix_int_t N, pastix_int_t rkmax, pastix_lrblock_t *A)
Allocate a low-rank matrix.
enum pastix_trans_e pastix_trans_t
Transpostion.
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...