137 const float *A = (
const float*)Avoid;
139 float *u, *v, *zwork, *Acpy, ws;
159 norm = LAPACKE_slange_work( LAPACK_COL_MAJOR,
'f', m, n,
163 if ( (norm == 0.) && (tol >= 0.) ) {
172 else if ( use_reltol ) {
177 minMN = pastix_imin(m, n);
178 rklimit = pastix_imin( minMN, rklimit );
185 if ( rklimit == 0 ) {
186 if ( (tol < 0.) || (norm < tol) ) {
193 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR,
'A', m, n,
194 A, lda, Alr->
u, Alr->
rkmax );
211#if defined(PASTIX_DEBUG_LR_NANCHECK)
220 ret = MYLAPACKE_sgesvd_work( LAPACK_COL_MAJOR,
'S',
'S',
222 NULL, NULL, ldu, NULL, ldv,
223 &ws, lwork, &rwork );
232#if defined(PRECISION_z) || defined(PRECISION_c)
236 zwork = malloc( zsize *
sizeof(
float) + rsize *
sizeof(
float) );
237 rwork = (
float*)(zwork + zsize);
239 Acpy = zwork + lwork;
245 ret = LAPACKE_slacpy_work(LAPACK_COL_MAJOR,
'A', m, n,
249 ret = MYLAPACKE_sgesvd_work( LAPACK_COL_MAJOR,
'S',
'S',
252 zwork, lwork, rwork + minMN );
254 pastix_print_error(
"SVD Failed\n" );
258 imax = pastix_imin( minMN, rklimit+1 );
259 for (i=0; i<imax; i++, v+=1) {
272#if defined(PASTIX_SVD_2NORM)
275 frob_norm =
core_slassq( minMN-i, s + minMN - 1, -1 );
278 if (frob_norm < tol) {
281 cblas_sscal(n, s[i], v, ldv);
293 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR,
'A', m, n,
294 A, lda, Alr->
u, Alr->
rkmax );
358 const void *alphaptr,
371 float *u1u2, *v1v2, *R, *u, *v;
372 float *tmp, *zbuf, *tauU, *tauV;
374 float alpha = *((
float*)alphaptr);
376 size_t wzsize, wdsize;
380 rank = (A->
rk == -1) ? pastix_imin(M1, N1) : A->
rk;
382 M = pastix_imax(M2, M1);
383 N = pastix_imax(N2, N1);
384 minU = pastix_imin(M, rank);
385 minV = pastix_imin(N, rank);
387 assert(M2 == M && N2 == N);
393 if ( ((M1 + offx) > M2) ||
396 pastix_print_error(
"Dimensions are not correct" );
408 ldau = (A->
rk == -1) ? A->
rkmax : M1;
419 M1, N1, A, M2, N2, B,
427 if ( rank > pastix_imin( M, N ) ) {
435 wzsize = (M+N) * rank;
437 wzsize += minU + minV;
440 wzsize += 3 * rank * rank;
443 lwork = pastix_imax( M, N ) * 32;
446#if defined(PASTIX_DEBUG_LR_NANCHECK)
455 ret = MYLAPACKE_sgesvd_work( LAPACK_COL_MAJOR,
'S',
'S',
456 rank, rank, NULL, rank,
457 NULL, NULL, rank, NULL, rank,
458 &querysize, -1, &rwork );
462 lwork = pastix_imax( lwork, querysize );
466#if defined(PRECISION_z) || defined(PRECISION_c)
470 zbuf = malloc( wzsize *
sizeof(
float) + wdsize *
sizeof(
float) );
471 s = (
float*)(zbuf + wzsize);
474 tauU = u1u2 + M * rank;
476 tauV = v1v2 + N * rank;
493 ret = LAPACKE_sgeqrf_work( LAPACK_COL_MAJOR, M, rank,
494 u1u2, M, tauU, zbuf, lwork );
496 total_flops += FLOPS_SGEQRF( M, rank );
511 ret = LAPACKE_sgelqf_work( LAPACK_COL_MAJOR, rank, N,
512 v1v2, rank, tauV, zbuf, lwork );
514 total_flops += FLOPS_SGELQF( rank, N );
522 memset(R, 0, rank * rank *
sizeof(
float));
524 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR,
'U', rank, rank,
528 cblas_strmm(CblasColMajor,
529 CblasRight, CblasLower,
530 CblasNoTrans, CblasNonUnit,
532 v1v2, rank, R, rank);
533 total_flops += FLOPS_STRMM(
PastixRight, rank, rank );
561 tol = tol * ( fabsf(alpha) * normA + normB );
568 flops = FLOPS_SGEQRF( rank, rank ) + FLOPS_SGELQF( rank, (rank-1) );
570 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_recompression );
571 ret = MYLAPACKE_sgesvd_work( LAPACK_COL_MAJOR,
'S',
'S',
574 zbuf, lwork, s + rank );
576 pastix_print_error(
"LAPACKE_sgesvd FAILED" );
584 for (i=0; i<rank; i++, tmp+=1){
597#if defined(PASTIX_SVD_2NORM)
600 frob_norm =
core_slassq( rank-i, s + rank - 1, -1 );
603 if (frob_norm < tol) {
606 cblas_sscal(rank, s[i], tmp, rank);
609 kernel_trace_stop_lvl2_rank( flops, new_rank );
610 total_flops += flops;
622 flops = FLOPS_SGEMM( M, N, Bbackup.
rk );
623 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
624 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
626 (sone), Bbackup.
u, ldbu,
629 kernel_trace_stop_lvl2( flops );
630 total_flops += flops;
635 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
638 sone, u + offy * M + offx, M);
639 kernel_trace_stop_lvl2( flops );
642 flops = FLOPS_SGEMM( M1, N1, A->
rk );
643 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
644 cblas_sgemm(CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transA1,
648 (sone), u + offy * M + offx, M);
649 kernel_trace_stop_lvl2( flops );
651 total_flops += flops;
656 else if ( new_rank == 0 ) {
668 assert( B->
rkmax >= new_rank );
677#if defined(PASTIX_DEBUG_LR_NANCHECK)
678 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR,
'A', M, new_rank,
679 0.0, 0.0, B->
u, ldbu );
681 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR,
'A', rank, new_rank,
682 u, rank, B->
u, ldbu );
686 for (i=0; i<new_rank; i++, tmp+=ldbu, u+=rank) {
687 memcpy(tmp, u, rank *
sizeof(
float));
688 memset(tmp + rank, 0, (M - rank) *
sizeof(
float));
692 flops = FLOPS_SORMQR( M, new_rank, minU,
PastixLeft )
694 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_computeNewU );
695 ret = LAPACKE_sormqr_work(LAPACK_COL_MAJOR,
'L',
'N',
706 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR,
'A', new_rank, rank,
707 v, rank, B->
v, ldbv );
710 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR,
'A', new_rank, N-rank,
711 0.0, 0.0, tmp + ldbv * rank, ldbv );
714 ret = LAPACKE_sormlq_work(LAPACK_COL_MAJOR,
'R',
'N',
720 kernel_trace_stop_lvl2( flops );
721 total_flops += flops;