PaStiX Handbook 6.4.0
core_sgelrops_svd.c
1/**
2 *
3 * @file core_sgelrops_svd.c
4 *
5 * PaStiX low-rank kernel routines using SVD based on Lapack SGESVD.
6 *
7 * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8 * Univ. Bordeaux. All rights reserved.
9 *
10 * @version 6.4.0
11 * @author Gregoire Pichon
12 * @author Esragul Korkmaz
13 * @author Mathieu Faverge
14 * @author Nolan Bredel
15 * @date 2024-07-05
16 * @generated from /builds/2mk6rsew/0/solverstack/pastix/kernels/core_zgelrops_svd.c, normal z -> s, Tue Feb 25 14:34:55 2025
17 *
18 **/
19#include "common.h"
20#include <cblas.h>
21#include <lapacke.h>
22#include "flops.h"
23#include "kernels_trace.h"
24#include "blend/solver.h"
25#include "pastix_scores.h"
26#include "pastix_slrcores.h"
27#include "s_nan_check.h"
28
29#ifndef DOXYGEN_SHOULD_SKIP_THIS
30#define PASTIX_SVD_2NORM 1
31#endif /* DOXYGEN_SHOULD_SKIP_THIS */
32
33#if !defined(PASTIX_SVD_2NORM)
34#include "common/frobeniusupdate.h"
35
36/**
37 *******************************************************************************
38 *
39 * @ingroup kernel_lr_svd_null
40 *
41 * @brief Compute the Frobenius norm of a vector
42 *
43 * This routine is inspired by the LAPACK slassq function, and accumulates
44 * the contributions backward for better accuracy, as opposed to snrm2
45 * which only allows a positive increment.
46 *
47 *******************************************************************************
48 *
49 * @param[in] n
50 * The number of elements in the vector
51 *
52 * @param[in] x
53 * The vector of size n * incx
54 *
55 * @param[in] incx
56 * The increment between two elements in the vector x.
57 *
58 *******************************************************************************
59 *
60 * @return The Frobenius norm of the vector x.
61 *
62 *******************************************************************************/
63static inline float
64core_slassq( int n,
65 const float *x,
66 int incx )
67{
68 float scale = 1.;
69 float sumsq = 0.;
70 int i;
71
72 for( i=0; i<n; i++, x+=incx ) {
73 frobenius_update( 1, &scale, &sumsq, x );
74 }
75
76 return scale * sqrtf( sumsq );
77}
78#endif /* !defined(PASTIX_SVD_2NORM) */
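/*
 * A minimal usage sketch (illustrative only, mirroring the calls made further
 * down in this file): to sum the k trailing entries of a singular value
 * vector s of length n, start from the last element and walk backward with a
 * negative increment so that the smallest values are accumulated first:
 *
 *     float tail = core_slassq( k, s + n - 1, -1 );
 *
 * tail is then the Frobenius norm of ( s[n-k], ..., s[n-1] ).
 */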
79
80#ifndef DOXYGEN_SHOULD_SKIP_THIS
81static float sone = 1.0;
82static float szero = 0.0;
83#endif /* DOXYGEN_SHOULD_SKIP_THIS */
84
85/**
86 *******************************************************************************
87 *
88 * @brief Convert a full rank matrix into a low rank matrix, using SVD.
89 *
90 *******************************************************************************
91 *
92 * @param[in] use_reltol
93 * Defines whether the tolerance is absolute (0) or relative to the
94 * Frobenius norm of A (non-zero).
94 *
95 * @param[in] tol
96 * The tolerance used as a criterion to eliminate information from the
97 * full rank matrix.
98 * If tol < 0, then we compress up to rklimit. So if rklimit is set to
99 * min(m,n), and tol < 0., we get a full representation of the matrix
100 * under the form U * V^t.
101 *
102 * @param[in] rklimit
103 * The maximum rank to store the matrix in low-rank format. If
104 * -1, set to min(M, N) / PASTIX_LR_MINRATIO.
105 *
106 * @param[in] m
107 * Number of rows of the matrix A, and of the low rank matrix Alr.
108 *
109 * @param[in] n
110 * Number of columns of the matrix A, and of the low rank matrix Alr.
111 *
112 * @param[in] Avoid
113 * The matrix of dimension lda-by-n that needs to be compressed
114 *
115 * @param[in] lda
116 * The leading dimension of the matrix A. lda >= max(1, m)
117 *
118 * @param[out] Alr
119 * The low rank matrix structure that will store the low rank
120 * representation of A
121 *
122 *******************************************************************************
123 *
124 * @return TODO
125 *
126 *******************************************************************************/
127pastix_fixdbl_t
128core_sge2lr_svd( int use_reltol,
129 pastix_fixdbl_t tol,
130 pastix_int_t rklimit,
131 pastix_int_t m,
132 pastix_int_t n,
133 const void *Avoid,
134 pastix_int_t lda,
135 pastix_lrblock_t *Alr )
136{
137 const float *A = (const float*)Avoid;
138 pastix_fixdbl_t flops = 0.0;
139 float *u, *v, *zwork, *Acpy, ws;
140 float *rwork, *s;
141 pastix_int_t i, ret, ldu, ldv;
142 pastix_int_t minMN, imax;
143 pastix_int_t lwork = -1;
144 pastix_int_t zsize, rsize;
145 float norm;
146
147#if !defined(NDEBUG)
148 if ( m < 0 ) {
149 return -2;
150 }
151 if ( n < 0 ) {
152 return -3;
153 }
154 if ( lda < m ) {
155 return -5;
156 }
157#endif
158
159 norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m, n,
160 A, lda, NULL );
161
162 /* Quick return on norm */
163 if ( (norm == 0.) && (tol >= 0.) ) {
164 core_slralloc( m, n, 0, Alr );
165 return 0. ;
166 }
167
168 rklimit = ( rklimit < 0 ) ? core_get_rklimit( m, n ) : rklimit;
169 if ( tol < 0. ) {
170 tol = -1.;
171 }
172 else if ( use_reltol ) {
173 tol = tol * norm;
174 }
175
176 /* Quick return on max rank */
177 minMN = pastix_imin(m, n);
178 rklimit = pastix_imin( minMN, rklimit );
179
180 /**
181 * If maximum rank is 0, then either the matrix norm is below the tolerance,
182 * and we can return a null rank matrix, or it is not and we need to return
183 * a full rank matrix.
184 */
185 if ( rklimit == 0 ) {
186 if ( (tol < 0.) || (norm < tol) ) {
187 core_slralloc( m, n, 0, Alr );
188 return 0.;
189 }
190
191 /* Return full rank */
192 core_slralloc( m, n, -1, Alr );
193 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
194 A, lda, Alr->u, Alr->rkmax );
195 assert(ret == 0);
196 return 0.;
197 }
198
199 /*
200 * Allocate a temporary Low rank matrix to store the full U and V
201 */
202 core_slralloc( m, n, pastix_imin( m, n ), Alr );
203 u = Alr->u;
204 v = Alr->v;
205 ldu = m;
206 ldv = Alr->rkmax;
207
208 /*
209 * Query the workspace needed for the gesvd
210 */
211#if defined(PASTIX_DEBUG_LR_NANCHECK)
212 ws = minMN;
213#else
214 {
215 /*
216 * The extra rwork argument works around an internal MKL bug,
217 * see: https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/MKL-2021-4-CoreDump-with-LAPACKE-cgesvd-work-or-LAPACKE-sgesvd/m-p/1341228
218 */
219 float rwork;
220 ret = MYLAPACKE_sgesvd_work( LAPACK_COL_MAJOR, 'S', 'S',
221 m, n, NULL, m,
222 NULL, NULL, ldu, NULL, ldv,
223 &ws, lwork, &rwork );
224 (void)rwork;
225 }
226#endif
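/*
 * Sketch of the workspace-query convention used above (assuming the standard
 * LAPACKE interface; MYLAPACKE_sgesvd_work is presumably the wrapper declared
 * in s_nan_check.h): calling gesvd with lwork = -1 performs no computation
 * and returns the optimal workspace size in the first entry of the work
 * array:
 *
 *     float ws;
 *     LAPACKE_sgesvd_work( LAPACK_COL_MAJOR, 'S', 'S', m, n, NULL, m,
 *                          NULL, NULL, ldu, NULL, ldv, &ws, -1 );
 *     lwork = (pastix_int_t)ws;
 */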
227 lwork = ws;
228 zsize = ws;
229 zsize += m * n; /* Copy of the matrix A */
230
231 rsize = minMN;
232#if defined(PRECISION_z) || defined(PRECISION_c)
233 rsize += 5 * minMN;
234#endif
235
236 zwork = malloc( zsize * sizeof(float) + rsize * sizeof(float) );
237 rwork = (float*)(zwork + zsize);
238
239 Acpy = zwork + lwork;
240 s = rwork;
241
242 /*
243 * Backup the original matrix before overwriting it with the SVD
244 */
245 ret = LAPACKE_slacpy_work(LAPACK_COL_MAJOR, 'A', m, n,
246 A, lda, Acpy, m );
247 assert( ret == 0 );
248
249 ret = MYLAPACKE_sgesvd_work( LAPACK_COL_MAJOR, 'S', 'S',
250 m, n, Acpy, m,
251 s, u, ldu, v, ldv,
252 zwork, lwork, rwork + minMN );
253 if ( ret != 0 ) {
254 pastix_print_error( "SVD Failed\n" );
255 }
256
257 /* Let's stop i before going too far */
258 imax = pastix_imin( minMN, rklimit+1 );
259 for (i=0; i<imax; i++, v+=1) {
260 float frob_norm;
261
262 /*
263 * There are two different stopping criteria for SVD to decide the
264 * compression rank:
265 * 1) The 2-norm:
266 * Compare the singular values to the threshold
267 * 2) The Frobenius norm:
268 * Compare the Frobenius norm of the trailing singular values to
269 * the threshold. Note that we use a reverse accumulation of the
270 * singular values to avoid accuracy issues.
271 */
272#if defined(PASTIX_SVD_2NORM)
273 frob_norm = s[i];
274#else
275 frob_norm = core_slassq( minMN-i, s + minMN - 1, -1 );
276#endif
277
278 if (frob_norm < tol) {
279 break;
280 }
281 cblas_sscal(n, s[i], v, ldv);
282 }
283
284 /*
285 * Resize the space used by the low rank matrix
286 */
287 core_slrsze( 1, m, n, Alr, i, -1, rklimit );
288
289 /*
290 * Compression was not worthwhile, so we restore the dense version in Alr
291 */
292 if (Alr->rk == -1) {
293 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
294 A, lda, Alr->u, Alr->rkmax );
295 assert(ret == 0);
296 }
297
298 (void)ret;
299 memFree_null(zwork);
300 return flops;
301}
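/*
 * A minimal calling sketch for the routine above (hypothetical sizes and
 * buffers, based only on the documented interface): compress a dense m-by-n
 * block A with leading dimension lda into the low-rank structure Alr, using
 * an absolute tolerance and the default rank limit. The retained rank is the
 * smallest i for which the stopping criterion (s[i] with PASTIX_SVD_2NORM,
 * otherwise the norm of the trailing singular values) falls below tol.
 *
 *     pastix_lrblock_t Alr;
 *     core_sge2lr_svd( 0, 1e-8, -1, m, n, A, lda, &Alr );
 *     if ( Alr.rk == -1 ) {
 *         // Compression did not pay off: Alr.u holds a dense copy of A.
 *     }
 *     else {
 *         // A is approximated by the product of Alr.u (m x rk) and Alr.v.
 *     }
 *     core_slrfree( &Alr );
 */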
302
303/**
304 *******************************************************************************
305 *
306 * @brief Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
307 *
308 * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
309 * Compute QR decomposition of (u2 u1) = Q1 R1
310 * Compute QR decomposition of (v2 v1) = Q2 R2
311 * Compute SVD of R1 R2^T = u sigma v^T
312 * Final solution is (Q1 u sigma^[1/2]) (Q2 v sigma^[1/2])^T
313 *
314 *******************************************************************************
315 *
316 * @param[in] lowrank
317 * The structure with low-rank parameters.
318 *
319 * @param[in] transA1
320 * @arg PastixNoTrans: No transpose, op( A ) = A;
321 * @arg PastixTrans: Transpose, op( A ) = A';
322 *
323 * @param[in] alphaptr
324 * alpha * A is added to B
325 *
326 * @param[in] M1
327 * The number of rows of the matrix A.
328 *
329 * @param[in] N1
330 * The number of columns of the matrix A.
331 *
332 * @param[in] A
333 * The low-rank representation of the matrix A.
334 *
335 * @param[in] M2
336 * The number of rows of the matrix B.
337 *
338 * @param[in] N2
339 * The number of columns of the matrix B.
340 *
341 * @param[in] B
342 * The low-rank representation of the matrix B.
343 *
344 * @param[in] offx
345 * The horizontal offset of A with respect to B.
346 *
347 * @param[in] offy
348 * The vertical offset of A with respect to B.
349 *
350 *******************************************************************************
351 *
352 * @return The new rank of u2 v2^T or -1 if ranks are too large for recompression
353 *
354 *******************************************************************************/
355pastix_fixdbl_t
356core_srradd_svd( const pastix_lr_t *lowrank,
357 pastix_trans_t transA1,
358 const void *alphaptr,
359 pastix_int_t M1,
360 pastix_int_t N1,
361 const pastix_lrblock_t *A,
362 pastix_int_t M2,
363 pastix_int_t N2,
364 pastix_lrblock_t *B,
365 pastix_int_t offx,
366 pastix_int_t offy)
367{
368 pastix_int_t rank, M, N, minU, minV;
369 pastix_int_t i, ret, lwork, new_rank;
370 pastix_int_t ldau, ldav, ldbu, ldbv;
371 float *u1u2, *v1v2, *R, *u, *v;
372 float *tmp, *zbuf, *tauU, *tauV;
373 float querysize;
374 float alpha = *((float*)alphaptr);
375 float *s;
376 size_t wzsize, wdsize;
377 float tol = lowrank->tolerance;
378 pastix_fixdbl_t flops, total_flops = 0.0;
379
380 rank = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
381 rank += B->rk;
382 M = pastix_imax(M2, M1);
383 N = pastix_imax(N2, N1);
384 minU = pastix_imin(M, rank);
385 minV = pastix_imin(N, rank);
386
387 assert(M2 == M && N2 == N);
388 assert(B->rk != -1);
389
390 assert( A->rk <= A->rkmax);
391 assert( B->rk <= B->rkmax);
392
393 if ( ((M1 + offx) > M2) ||
394 ((N1 + offy) > N2) )
395 {
396 pastix_print_error( "Dimensions are not correct" );
397 assert(0 /* Incorrect dimensions */);
398 return total_flops;
399 }
400
401 /*
402 * A is rank null, nothing to do
403 */
404 if (A->rk == 0) {
405 return total_flops;
406 }
407
408 ldau = (A->rk == -1) ? A->rkmax : M1;
409 ldav = (transA1 == PastixNoTrans) ? A->rkmax : N1;
410 ldbu = M;
411 ldbv = B->rkmax;
412
413 /*
414 * Let's handle the case where B is a null matrix
415 * B = alpha A
416 */
417 if (B->rk == 0) {
418 core_slrcpy( lowrank, transA1, alpha,
419 M1, N1, A, M2, N2, B,
420 offx, offy );
421 return total_flops;
422 }
423
424 /*
425 * The rank is too big, let's try to compress
426 */
427 if ( rank > pastix_imin( M, N ) ) {
428 assert(0);
429 }
430
431 /*
432 * Let's compute the size of the workspace
433 */
434 /* u1u2 and v1v2 */
435 wzsize = (M+N) * rank;
436 /* tauU and tauV */
437 wzsize += minU + minV;
438
439 /* Storage of R, u and v */
440 wzsize += 3 * rank * rank;
441
442 /* QR/LQ workspace */
443 lwork = pastix_imax( M, N ) * 32;
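/*
 * Note (assumption): max(M, N) * 32 mirrors the usual geqrf/gelqf optimal
 * workspace of (number of columns or rows) times the block size, with a
 * block size of at most 32 assumed here.
 */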
444
445 /* Workspace needed for the gesvd */
446#if defined(PASTIX_DEBUG_LR_NANCHECK)
447 querysize = rank;
448#else
449 {
450 /*
451 * The extra rwork argument works around an internal MKL bug,
452 * see: https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/MKL-2021-4-CoreDump-with-LAPACKE-cgesvd-work-or-LAPACKE-sgesvd/m-p/1341228
453 */
454 float rwork;
455 ret = MYLAPACKE_sgesvd_work( LAPACK_COL_MAJOR, 'S', 'S',
456 rank, rank, NULL, rank,
457 NULL, NULL, rank, NULL, rank,
458 &querysize, -1, &rwork );
459 (void)rwork;
460 }
461#endif
462 lwork = pastix_imax( lwork, querysize );
463 wzsize += lwork;
464
465 wdsize = rank;
466#if defined(PRECISION_z) || defined(PRECISION_c)
467 wdsize += 5 * rank;
468#endif
469
470 zbuf = malloc( wzsize * sizeof(float) + wdsize * sizeof(float) );
471 s = (float*)(zbuf + wzsize);
472
473 u1u2 = zbuf + lwork;
474 tauU = u1u2 + M * rank;
475 v1v2 = tauU + minU;
476 tauV = v1v2 + N * rank;
477 R = tauV + minV;
478
479 /*
480 * Concatenate U2 and U1 in u1u2
481 * [ u2 0 ]
482 * [ u2 u1 ]
483 * [ u2 0 ]
484 */
485 core_slrconcatenate_u( alpha,
486 M1, N1, A,
487 M2, B,
488 offx, u1u2 );
489
490 /*
491 * Perform QR factorization on u1u2 = (Q1 R1)
492 */
493 ret = LAPACKE_sgeqrf_work( LAPACK_COL_MAJOR, M, rank,
494 u1u2, M, tauU, zbuf, lwork );
495 assert( ret == 0 );
496 total_flops += FLOPS_SGEQRF( M, rank );
497
498 /*
499 * Concatenate V2 and V1 in v1v2
500 * [ v2^h v2^h v2^h ]
501 * [ 0 v1^h 0 ]
502 */
503 core_slrconcatenate_v( transA1, alpha,
504 M1, N1, A,
505 N2, B,
506 offy, v1v2 );
507
508 /*
509 * Perform LQ factorization on v1v2 = (L2 Q2)
510 */
511 ret = LAPACKE_sgelqf_work( LAPACK_COL_MAJOR, rank, N,
512 v1v2, rank, tauV, zbuf, lwork );
513 assert(ret == 0);
514 total_flops += FLOPS_SGELQF( rank, N );
515
516 /*
517 * Compute R = alpha R1 L2
518 */
519 u = R + rank * rank;
520 v = u + rank * rank;
521
522 memset(R, 0, rank * rank * sizeof(float));
523
524 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'U', rank, rank,
525 u1u2, M, R, rank );
526 assert(ret == 0);
527
528 cblas_strmm(CblasColMajor,
529 CblasRight, CblasLower,
530 CblasNoTrans, CblasNonUnit,
531 rank, rank, (sone),
532 v1v2, rank, R, rank);
533 total_flops += FLOPS_STRMM( PastixRight, rank, rank );
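/*
 * Recap of the algebra at this point (a sketch of the method described in the
 * header of this routine): with u1u2 = Q1 R1 from the QR factorization and
 * v1v2 = L2 Q2 from the LQ factorization, the matrix to recompress is
 *
 *     u1u2 * v1v2 = Q1 * (R1 * L2) * Q2
 *
 * so only the small rank-by-rank product R = R1 * L2, formed by the trmm
 * above (with the scaling by alpha handled in the concatenation step), needs
 * to go through the SVD below.
 */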
534
535 if ( lowrank->use_reltol ) {
536 /**
537 * With a relative tolerance, we can choose between two solutions:
538 * 1) The first one, more conservative, is to compress relative to
539 * the norm of the final matrix \f$ \alpha A + B \f$. In this kernel, we
540 * exploit the fact that the previously computed product contains all the
541 * information of the final matrix, so it can be done as follows:
542 *
543 * float norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', rank, rank,
544 * R, rank, NULL );
545 * tol = tol * norm;
546 *
547 * 2) The second solution, less conservative, allows the rank to be
548 * reduced more efficiently. Since A and B have been compressed relative
549 * to their respective norms, there is no reason to compress the sum
550 * relative to its own norm; it is more reasonable to compress it
551 * relative to the norms of A and B. For example, A-B would be full
552 * with the first criterion, and rank null with the second.
553 * Note that here we can only use an estimate, which further
554 * reduces the conservatism of the criterion.
555 * \f[ || \alpha A + B || <= |\alpha| ||A|| + ||B|| <= |\alpha| ||U_aV_a|| + ||U_bV_b|| \f]
556 *
557 */
558 float normA, normB;
559 normA = core_slrnrm( PastixFrobeniusNorm, transA1, M1, N1, A );
560 normB = core_slrnrm( PastixFrobeniusNorm, PastixNoTrans, M2, N2, B );
561 tol = tol * ( fabsf(alpha) * normA + normB );
562 }
563
564 /*
565 * Compute svd(R) = u sigma v^t
566 */
567 /* Missing the flops of the u and v generation */
568 flops = FLOPS_SGEQRF( rank, rank ) + FLOPS_SGELQF( rank, (rank-1) );
569
570 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_recompression );
571 ret = MYLAPACKE_sgesvd_work( LAPACK_COL_MAJOR, 'S', 'S',
572 rank, rank, R, rank,
573 s, u, rank, v, rank,
574 zbuf, lwork, s + rank );
575 if ( ret != 0 ) {
576 pastix_print_error( "LAPACKE_sgesvd FAILED" );
577 }
578
579 /*
580 * Let's compute the new rank of the result
581 */
582 tmp = v;
583
584 for (i=0; i<rank; i++, tmp+=1){
585 float frob_norm;
586
587 /*
588 * There are two different stopping criteria for SVD to decide the
589 * compression rank:
590 * 1) The 2-norm:
591 * Compare the singular values to the threshold
592 * 2) The Frobenius norm:
593 * Compare the Frobenius norm of the trailing singular values to
594 * the threshold. Note that we use a reverse accumulation of the
595 * singular values to avoid accuracy issues.
596 */
597#if defined(PASTIX_SVD_2NORM)
598 frob_norm = s[i];
599#else
600 frob_norm = core_slassq( rank-i, s + rank - 1, -1 );
601#endif
602
603 if (frob_norm < tol) {
604 break;
605 }
606 cblas_sscal(rank, s[i], tmp, rank);
607 }
608 new_rank = i;
609 kernel_trace_stop_lvl2_rank( flops, new_rank );
610 total_flops += flops;
611
612 /*
613 * First case: The rank is too big, so we decide to uncompress the result
614 */
615 if ( new_rank > core_get_rklimit( M, N ) ) {
616 pastix_lrblock_t Bbackup = *B;
617
618 core_slralloc( M, N, -1, B );
619 u = B->u;
620
621 /* Uncompress B */
622 flops = FLOPS_SGEMM( M, N, Bbackup.rk );
623 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
624 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
625 M, N, Bbackup.rk,
626 (sone), Bbackup.u, ldbu,
627 Bbackup.v, ldbv,
628 (szero), u, M );
629 kernel_trace_stop_lvl2( flops );
630 total_flops += flops;
631
632 /* Add A into it */
633 if ( A->rk == -1 ) {
634 flops = 2 * M1 * N1;
635 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
636 core_sgeadd( transA1, M1, N1,
637 alpha, A->u, ldau,
638 sone, u + offy * M + offx, M);
639 kernel_trace_stop_lvl2( flops );
640 }
641 else {
642 flops = FLOPS_SGEMM( M1, N1, A->rk );
643 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
644 cblas_sgemm(CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transA1,
645 M1, N1, A->rk,
646 (alpha), A->u, ldau,
647 A->v, ldav,
648 (sone), u + offy * M + offx, M);
649 kernel_trace_stop_lvl2( flops );
650 }
651 total_flops += flops;
652 core_slrfree(&Bbackup);
653 memFree_null(zbuf);
654 return total_flops;
655 }
656 else if ( new_rank == 0 ) {
657 core_slrfree(B);
658 memFree_null(zbuf);
659 return total_flops;
660 }
661
662 /*
663 * We need to reallocate the buffer to store the new compressed version of B
664 * because it wasn't big enough
665 */
666 ret = core_slrsze( 0, M, N, B, new_rank, -1, -1 );
667 assert( ret != -1 );
668 assert( B->rkmax >= new_rank );
669 assert( B->rkmax >= B->rk );
670
671 ldbv = B->rkmax;
672
673 /*
674 * Let's now compute the final U = Q1 ([u] sigma)
675 * [0]
676 */
677#if defined(PASTIX_DEBUG_LR_NANCHECK)
678 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', M, new_rank,
679 0.0, 0.0, B->u, ldbu );
680 assert(ret == 0);
681 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', rank, new_rank,
682 u, rank, B->u, ldbu );
683 assert(ret == 0);
684#else
685 tmp = B->u;
686 for (i=0; i<new_rank; i++, tmp+=ldbu, u+=rank) {
687 memcpy(tmp, u, rank * sizeof(float));
688 memset(tmp + rank, 0, (M - rank) * sizeof(float));
689 }
690#endif
691
692 flops = FLOPS_SORMQR( M, new_rank, minU, PastixLeft )
693 + FLOPS_SORMLQ( new_rank, N, minV, PastixRight );
694 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_computeNewU );
695 ret = LAPACKE_sormqr_work(LAPACK_COL_MAJOR, 'L', 'N',
696 M, new_rank, minU,
697 u1u2, M, tauU,
698 B->u, ldbu,
699 zbuf, lwork);
700 assert( ret == 0 );
701
702 /*
703 * And the final V^T = [v^t 0 ] Q2
704 */
705 tmp = B->v;
706 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', new_rank, rank,
707 v, rank, B->v, ldbv );
708 assert( ret == 0 );
709
710 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', new_rank, N-rank,
711 0.0, 0.0, tmp + ldbv * rank, ldbv );
712 assert( ret == 0 );
713
714 ret = LAPACKE_sormlq_work(LAPACK_COL_MAJOR, 'R', 'N',
715 new_rank, N, minV,
716 v1v2, rank, tauV,
717 B->v, ldbv,
718 zbuf, lwork);
719 assert( ret == 0 );
720 kernel_trace_stop_lvl2( flops );
721 total_flops += flops;
722
723 (void)ret;
724 memFree_null(zbuf);
725 return total_flops;
726}
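/*
 * A minimal calling sketch for the routine above (hypothetical blocks and
 * sizes, based only on the documented interface): accumulate alpha * A into
 * an existing low-rank block B, where A is an M1-by-N1 contribution located
 * at offset (offx, offy) inside the M2-by-N2 block B, and lowrank is the
 * pastix_lr_t structure holding the tolerance settings.
 *
 *     float alpha = -1.0;
 *     pastix_fixdbl_t flops;
 *     flops = core_srradd_svd( &lowrank, PastixNoTrans, &alpha,
 *                              M1, N1, &A,
 *                              M2, N2, &B,
 *                              offx, offy );
 *
 * On exit, B holds the recompressed sum, or an uncompressed (rk = -1) version
 * when the new rank exceeds the limit returned by core_get_rklimit( M2, N2 ).
 */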
static float core_slassq(int n, const float *x, int incx)
Compute the Frobenius norm of a vector.
BEGIN_C_DECLS typedef int pastix_int_t
Definition datatypes.h:51
double pastix_fixdbl_t
Definition datatypes.h:65
int core_sgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, float alpha, const float *A, pastix_int_t LDA, float beta, float *B, pastix_int_t LDB)
Add two matrices together.
Definition core_sgeadd.c:78
pastix_int_t(* core_get_rklimit)(pastix_int_t, pastix_int_t)
Compute the maximal rank accepted for a given matrix size. The pointer is set according to the low-ra...
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
pastix_fixdbl_t core_sge2lr_svd(int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Convert a full rank matrix into a low rank matrix, using SVD.
pastix_fixdbl_t core_srradd_svd(const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T.
int core_slrsze(int copy, pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, pastix_int_t newrk, pastix_int_t newrkmax, pastix_int_t rklimit)
Resize a low-rank matrix.
void core_slralloc(pastix_int_t M, pastix_int_t N, pastix_int_t rkmax, pastix_lrblock_t *A)
Allocate a low-rank matrix.
void core_slrconcatenate_v(pastix_trans_t transA1, float alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offy, float *v1v2)
Concatenate right parts of two low-rank matrices.
void core_slrcpy(const pastix_lr_t *lowrank, pastix_trans_t transAv, float alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Copy a small low-rank structure into a large one.
void core_slrconcatenate_u(float alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_lrblock_t *B, pastix_int_t offx, float *u1u2)
Concatenate left parts of two low-rank matrices.
void core_slrfree(pastix_lrblock_t *A)
Free a low-rank matrix.
float core_slrnrm(pastix_normtype_t ntype, int transV, pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A)
Compute the norm of a low-rank matrix.
Definition core_slrnrm.c:48
enum pastix_trans_e pastix_trans_t
Transposition.
@ PastixFrobeniusNorm
Definition api.h:504
@ PastixRight
Definition api.h:496
@ PastixLeft
Definition api.h:495
@ PastixNoTrans
Definition api.h:445
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...