PaStiX Handbook 6.4.0
Loading...
Searching...
No Matches
core_sgelrops.c
Go to the documentation of this file.
1/**
2 *
3 * @file core_sgelrops.c
4 *
5 * PaStiX low-rank kernel routines
6 *
7 * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8 * Univ. Bordeaux. All rights reserved.
9 *
10 * @version 6.4.0
11 * @author Gregoire Pichon
12 * @author Esragul Korkmaz
13 * @author Mathieu Faverge
14 * @author Pierre Ramet
15 * @author Nolan Bredel
16 * @date 2024-07-05
17 * @generated from /builds/2mk6rsew/0/solverstack/pastix/kernels/core_zgelrops.c, normal z -> s, Tue Feb 25 14:34:55 2025
18 *
19 **/
20#include "common.h"
21#include <cblas.h>
22#include <lapacke.h>
23#include "blend/solver.h"
24#include "pastix_scores.h"
25#include "pastix_slrcores.h"
26#include "s_nan_check.h"
27#include "kernels_trace.h"
28
29#ifndef DOXYGEN_SHOULD_SKIP_THIS
30static float sone = 1.0;
31static float szero = 0.0;
32#endif /* DOXYGEN_SHOULD_SKIP_THIS */
33
34/**
35 *******************************************************************************
36 *
37 * @brief Allocate a low-rank matrix.
38 *
39 *******************************************************************************
40 *
41 * @param[in] M
42 * Number of rows of the matrix A.
43 *
44 * @param[in] N
45 * Number of columns of the matrix A.
46 *
47 * @param[in] rkmax
48 * @arg -1: the matrix is allocated tight to its rank.
49 * @arg >0: the matrix is allocated to the minimum of rkmax and its maximum rank.
50 *
51 * @param[out] A
52 * The allocated low-rank matrix
53 *
54 *******************************************************************************/
55void
58 pastix_int_t rkmax,
60{
61 float *u, *v;
62
63 if ( rkmax == -1 ) {
64 u = malloc( M * N * sizeof(float) );
65 memset( u, 0, M * N * sizeof(float) );
66 A->rk = -1;
67 A->rkmax = M;
68 A->u = u;
69 A->v = NULL;
70 }
71 else if ( rkmax == 0 ) {
72 A->rk = 0;
73 A->rkmax = 0;
74 A->u = NULL;
75 A->v = NULL;
76 }
77 else {
78 pastix_int_t rk = pastix_imin( M, N );
79 rkmax = pastix_imin( rkmax, rk );
80
81#if defined(PASTIX_DEBUG_LR)
82 u = malloc( M * rkmax * sizeof(float) );
83 v = malloc( N * rkmax * sizeof(float) );
84
85 /* To avoid uninitialised values in valgrind. Lapacke doc (xgesvd) is not correct */
86 memset(u, 0, M * rkmax * sizeof(float));
87 memset(v, 0, N * rkmax * sizeof(float));
88#else
89 u = malloc( (M+N) * rkmax * sizeof(float));
90
91 /* To avoid uninitialised values in valgrind. Lapacke doc (xgesvd) is not correct */
92 memset(u, 0, (M+N) * rkmax * sizeof(float));
93
94 v = u + M * rkmax;
95#endif
96
97 A->rk = 0;
98 A->rkmax = rkmax;
99 A->u = u;
100 A->v = v;
101 }
102}
103
104/**
105 *******************************************************************************
106 *
107 * @brief Free a low-rank matrix.
108 *
109 *******************************************************************************
110 *
111 * @param[inout] A
112 * The low-rank matrix that will be desallocated.
113 *
114 *******************************************************************************/
115void
117{
118 if ( A->rk == -1 ) {
119 free(A->u);
120 A->u = NULL;
121 }
122 else {
123 free(A->u);
124#if defined(PASTIX_DEBUG_LR)
125 free(A->v);
126#endif
127 A->u = NULL;
128 A->v = NULL;
129 }
130 A->rk = 0;
131 A->rkmax = 0;
132}
133
134/**
135 *******************************************************************************
136 *
137 * @brief Resize a low-rank matrix
138 *
139 *******************************************************************************
140 *
141 * @param[in] copy
142 * Enable/disable the copy of the data from A->u and A->v into the new
143 * low-rank representation.
144 *
145 * @param[in] M
146 * The number of rows of the matrix A.
147 *
148 * @param[in] N
149 * The number of columns of the matrix A.
150 *
151 * @param[inout] A
152 * The low-rank representation of the matrix. At exit, this structure
153 * is modified with the new low-rank representation of A, is the rank
154 * is small enough
155 *
156 * @param[in] newrk
157 * The new rank of the matrix A.
158 *
159 * @param[in] newrkmax
160 * The new maximum rank of the matrix A. Useful if the low-rank
161 * structure was allocated with more data than the rank.
162 *
163 * @param[in] rklimit
164 * The maximum rank to store the matrix in low-rank format. If
165 * -1, set to core_get_rklimit(M, N)
166 *
167 *******************************************************************************
168 *
169 * @return The new rank of A
170 *
171 *******************************************************************************/
172int
173core_slrsze( int copy,
174 pastix_int_t M,
175 pastix_int_t N,
177 pastix_int_t newrk,
178 pastix_int_t newrkmax,
179 pastix_int_t rklimit )
180{
181 /* If no limit on the rank is given, let's take min(M, N) */
182 rklimit = (rklimit == -1) ? core_get_rklimit( M, N ) : rklimit;
183
184 /* If no extra memory allocated, let's fix rkmax to rk */
185 newrkmax = (newrkmax == -1) ? newrk : newrkmax;
186 newrkmax = pastix_imax( newrkmax, newrk );
187
188 /*
189 * It is not interesting to compress, so we alloc space to store the full matrix
190 */
191 if ( (newrk > rklimit) || (newrk == -1) )
192 {
193 A->u = realloc( A->u, M * N * sizeof(float) );
194#if defined(PASTIX_DEBUG_LR)
195 free(A->v);
196#endif
197 A->v = NULL;
198 A->rk = -1;
199 A->rkmax = M;
200 return -1;
201 }
202 /*
203 * The rank is null, we free everything
204 */
205 else if (newrkmax == 0)
206 {
207 /*
208 * The rank is null, we free everything
209 */
210 free(A->u);
211#if defined(PASTIX_DEBUG_LR)
212 free(A->v);
213#endif
214 A->u = NULL;
215 A->v = NULL;
216 A->rkmax = newrkmax;
217 A->rk = newrk;
218 }
219 /*
220 * The rank is non null, we allocate the correct amount of space, and
221 * compress the stored information if necessary
222 */
223 else {
224 float *u, *v;
225 int ret;
226
227 if ( ( A->rk == -1 ) ||
228 (( A->rk != -1 ) && (newrkmax != A->rkmax)) )
229 {
230#if defined(PASTIX_DEBUG_LR)
231 u = malloc( M * newrkmax * sizeof(float) );
232 v = malloc( N * newrkmax * sizeof(float) );
233#else
234 u = malloc( (M+N) * newrkmax * sizeof(float) );
235 v = u + M * newrkmax;
236#endif
237 if ( copy ) {
238 assert( A->rk != -1 );
239 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M, newrk,
240 A->u, M, u, M );
241 assert(ret == 0);
242 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', newrk, N,
243 A->v, A->rkmax, v, newrkmax );
244 assert(ret == 0);
245 }
246 free(A->u);
247#if defined(PASTIX_DEBUG_LR)
248 free(A->v);
249#endif
250 A->u = u;
251 A->v = v;
252 }
253
254 /* Update rk and rkmax */
255 A->rkmax = newrkmax;
256 A->rk = newrk;
257
258 (void)ret;
259 }
260 assert( A->rk <= A->rkmax);
261 return 0;
262}
263
264/**
265 *******************************************************************************
266 *
267 * @brief Convert a low rank matrix into a dense matrix.
268 *
269 * Convert a low-rank matrix of size m-by-n into a full rank matrix.
270 * A = op( u * v^t ) with op(A) = A or A^t
271 *
272 *******************************************************************************
273 *
274 * @param[in] trans
275 * @arg PastixNoTrans: returns A = u * v^t
276 * @arg PastixTrans: returns A = v * u^t
277 *
278 * @param[in] m
279 * Number of rows of the low-rank matrix Alr.
280 *
281 * @param[in] n
282 * Number of columns of the low-rank matrix Alr.
283 *
284 * @param[in] Alr
285 * The low rank matrix to be converted into a full rank matrix
286 *
287 * @param[inout] A
288 * The matrix of dimension lda-by-k in which to store the uncompressed
289 * version of Alr. k = n if trans == PastixNoTrans, m otherwise.
290 *
291 * @param[in] lda
292 * The leading dimension of the matrix A. lda >= max(1, m) if trans ==
293 * PastixNoTrans, lda >= max(1,n) otherwise.
294 *
295 *******************************************************************************
296 *
297 * @retval 0 in case of success.
298 * @retval -i if the ith parameter is incorrect.
299 *
300 *******************************************************************************/
301int
303 pastix_int_t m,
304 pastix_int_t n,
305 const pastix_lrblock_t *Alr,
306 float *A,
307 pastix_int_t lda )
308{
309 int ret = 0;
310
311#if !defined(NDEBUG)
312 if ( m < 0 ) {
313 return -1;
314 }
315 if ( n < 0 ) {
316 return -2;
317 }
318 if (Alr == NULL || Alr->rk > Alr->rkmax) {
319 return -3;
320 }
321 if ( (trans == PastixNoTrans && lda < m) ||
322 (trans != PastixNoTrans && lda < n) )
323 {
324 return -5;
325 }
326 if ( Alr->rk == -1 ) {
327 if (Alr->u == NULL || Alr->v != NULL || (Alr->rkmax < m))
328 {
329 return -6;
330 }
331 }
332 else if ( Alr->rk != 0){
333 if (Alr->u == NULL || Alr->v == NULL) {
334 return -6;
335 }
336 }
337#endif
338
339 if ( trans == PastixNoTrans ) {
340 if ( Alr->rk == -1 ) {
341 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
342 Alr->u, Alr->rkmax, A, lda );
343 assert( ret == 0 );
344 }
345 else if ( Alr->rk == 0 ) {
346 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', m, n,
347 0.0, 0.0, A, lda );
348 assert( ret == 0 );
349 }
350 else {
351 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
352 m, n, Alr->rk,
353 (sone), Alr->u, m,
354 Alr->v, Alr->rkmax,
355 (szero), A, lda);
356 }
357 }
358 else {
359 if ( Alr->rk == -1 ) {
360 core_sgetmo( m, n, Alr->u, Alr->rkmax, A, lda );
361 }
362 else if ( Alr->rk == 0 ) {
363 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', n, m,
364 0.0, 0.0, A, lda );
365 assert( ret == 0 );
366 }
367 else {
368 cblas_sgemm(CblasColMajor, CblasTrans, CblasTrans,
369 n, m, Alr->rk,
370 (sone), Alr->v, Alr->rkmax,
371 Alr->u, m,
372 (szero), A, lda);
373 }
374 }
375
376 return ret;
377}
378
379/**
380 *******************************************************************************
381 *
382 * @brief Copy a small low-rank structure into a large one
383 *
384 *******************************************************************************
385 *
386 * @param[in] lowrank
387 * The structure with low-rank parameters.
388 *
389 * @param[in] transAv
390 * @arg PastixNoTrans: (A.v)' is stored transposed as usual
391 * @arg PastixTrans: A.v is stored
392 * @arg PastixTrans: A.v is stored
393 *
394 * @param[in] alpha
395 * The multiplier parameter: B = B + alpha * A
396 *
397 * @param[in] M1
398 * The number of rows of the matrix A.
399 *
400 * @param[in] N1
401 * The number of columns of the matrix A.
402 *
403 * @param[in] A
404 * The low-rank representation of the matrix A.
405 *
406 * @param[in] M2
407 * The number of rows of the matrix B.
408 *
409 * @param[in] N2
410 * The number of columns of the matrix B.
411 *
412 * @param[inout] B
413 * The low-rank representation of the matrix B.
414 *
415 * @param[in] offx
416 * The horizontal offset of A with respect to B.
417 *
418 * @param[in] offy
419 * The vertical offset of A with respect to B.
420 *
421 *******************************************************************************/
422void
423core_slrcpy( const pastix_lr_t *lowrank,
424 pastix_trans_t transAv,
425 float alpha,
426 pastix_int_t M1,
427 pastix_int_t N1,
428 const pastix_lrblock_t *A,
429 pastix_int_t M2,
430 pastix_int_t N2,
432 pastix_int_t offx,
433 pastix_int_t offy )
434{
435 float *u, *v;
436 pastix_int_t ldau, ldav;
437 int ret;
438
439 assert( (M1 + offx) <= M2 );
440 assert( (N1 + offy) <= N2 );
441
442 ldau = (A->rk == -1) ? A->rkmax : M1;
443 ldav = (transAv == PastixNoTrans) ? A->rkmax : N1;
444
445 core_slrfree( B );
446 core_slralloc( M2, N2, A->rk, B );
447 u = B->u;
448 v = B->v;
449
450 B->rk = A->rk;
451
452 if ( A->rk == -1 ) {
453 if ( (M1 != M2) || (N1 != N2) ) {
454 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', M2, N2,
455 0.0, 0.0, u, M2 );
456 assert( ret == 0 );
457 }
458 ret = core_sgeadd( PastixNoTrans, M1, N1,
459 alpha, A->u, ldau,
460 0.0, u + M2 * offy + offx, M2 );
461 assert(ret == 0);
462 }
463 else {
464 if ( M1 != M2 ) {
465 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', M2, B->rk,
466 0.0, 0.0, u, M2 );
467 assert( ret == 0 );
468 }
469 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M1, A->rk,
470 A->u, ldau,
471 u + offx, M2 );
472 assert(ret == 0);
473
474 if ( N1 != N2 ) {
475 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', B->rk, N2,
476 0.0, 0.0, v, B->rkmax );
477 assert( ret == 0 );
478 }
479 ret = core_sgeadd( transAv, A->rk, N1,
480 alpha, A->v, ldav,
481 0.0, v + B->rkmax * offy, B->rkmax );
482 assert(ret == 0);
483 }
484
485#if 0
486 {
487 float *work = malloc( M2 * N2 * sizeof(float) );
488
489 core_slr2ge( PastixNoTrans, M2, N2, B, work, M2 );
490
491 lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, -1, M2, N2, work, M2, B );
492
493 free(work);
494 }
495#endif
496
497 (void)lowrank;
498 (void)ret;
499}
500
501
502/**
503 *******************************************************************************
504 *
505 * @brief Concatenate left parts of two low-rank matrices
506 *
507 *******************************************************************************
508 *
509 * @param[in] alpha
510 * alpha * A is add to B
511 *
512 * @param[in] M1
513 * The number of rows of the matrix A.
514 *
515 * @param[in] N1
516 * The number of columns of the matrix A.
517 *
518 * @param[in] A
519 * The low-rank representation of the matrix A.
520 *
521 * @param[in] M2
522 * The number of rows of the matrix B.
523 *
524 * @param[in] B
525 * The low-rank representation of the matrix B.
526 *
527 * @param[in] offx
528 * The horizontal offset of A with respect to B.
529 *
530 * @param[inout] u1u2
531 * The workspace where matrices are concatenated
532 *
533 *******************************************************************************/
534void
536 pastix_int_t M1,
537 pastix_int_t N1,
538 const pastix_lrblock_t *A,
539 pastix_int_t M2,
541 pastix_int_t offx,
542 float *u1u2 )
543{
544 float *tmp;
545 pastix_int_t i, ret, rank;
546 pastix_int_t ldau, ldbu;
547
548 rank = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
549 rank += B->rk;
550
551 ldau = (A->rk == -1) ? A->rkmax : M1;
552 ldbu = M2;
553
554 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M2, B->rk,
555 B->u, ldbu, u1u2, M2 );
556 assert(ret == 0);
557
558 tmp = u1u2 + B->rk * M2;
559 if ( A->rk == -1 ) {
560 /*
561 * A is full of rank M1, so A will be integrated into v1v2
562 */
563 if ( M1 < N1 ) {
564 if ( M1 != M2 ) {
565 /* Set to 0 */
566 memset(tmp, 0, M2 * M1 * sizeof(float));
567
568 /* Set diagonal */
569 tmp += offx;
570 for (i=0; i<M1; i++, tmp += M2+1) {
571 *tmp = 1.0;
572 }
573 }
574 else {
575 assert( offx == 0 );
576 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', M2, M1,
577 0.0, 1.0, tmp, M2 );
578 assert( ret == 0 );
579 }
580 }
581 else {
582 /*
583 * A is full of rank N1, so A is integrated into u1u2
584 */
585 if ( M1 != M2 ) {
586 memset(tmp, 0, M2 * N1 * sizeof(float));
587 }
588 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M1, N1,
589 A->u, ldau, tmp + offx, M2 );
590 assert(ret == 0);
591 }
592 }
593 /*
594 * A is low rank of rank A->rk
595 */
596 else {
597 if ( M1 != M2 ) {
598 memset(tmp, 0, M2 * A->rk * sizeof(float));
599 }
600 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M1, A->rk,
601 A->u, ldau, tmp + offx, M2 );
602 assert(ret == 0);
603 }
604 (void)ret;
605 (void)alpha;
606 (void)rank;
607}
608
609
610/**
611 *******************************************************************************
612 *
613 * @brief Concatenate right parts of two low-rank matrices
614 *
615 *******************************************************************************
616 *
617 * @param[in] transA1
618 * @arg PastixNoTrans: No transpose, op( A ) = A;
619 * @arg PastixTrans: Transpose, op( A ) = A';
620 *
621 * @param[in] alpha
622 * alpha * A is add to B
623 *
624 * @param[in] M1
625 * The number of rows of the matrix A.
626 *
627 * @param[in] N1
628 * The number of columns of the matrix A.
629 *
630 * @param[in] A
631 * The low-rank representation of the matrix A.
632 *
633 * @param[in] N2
634 * The number of columns of the matrix B.
635 *
636 * @param[in] B
637 * The low-rank representation of the matrix B.
638 *
639 * @param[in] offy
640 * The vertical offset of A with respect to B.
641 *
642 * @param[inout] v1v2
643 * The workspace where matrices are concatenated
644 *
645 *******************************************************************************/
646void
648 float alpha,
649 pastix_int_t M1,
650 pastix_int_t N1,
651 const pastix_lrblock_t *A,
652 pastix_int_t N2,
654 pastix_int_t offy,
655 float *v1v2 )
656{
657 float *tmp;
658 pastix_int_t i, ret, rank;
659 pastix_int_t ldau, ldav, ldbv;
660
661 rank = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
662 rank += B->rk;
663
664 ldau = (A->rk == -1) ? A->rkmax : M1;
665 ldav = (transA1 == PastixNoTrans) ? A->rkmax : N1;
666 ldbv = B->rkmax;
667
668 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', B->rk, N2,
669 B->v, ldbv, v1v2, rank );
670 assert(ret == 0);
671
672 tmp = v1v2 + B->rk;
673 if ( A->rk == -1 ) {
674 assert( transA1 == PastixNoTrans );
675 /*
676 * A is full of rank M1, so it is integrated into v1v2
677 */
678 if ( M1 < N1 ) {
679 if (N1 != N2) {
680 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', M1, N2,
681 0.0, 0.0, tmp, rank );
682 assert( ret == 0 );
683 }
684 core_sgeadd( PastixNoTrans, M1, N1,
685 alpha, A->u, ldau,
686 0.0, tmp + offy * rank, rank );
687 }
688 /*
689 * A is full of rank N1, so it has been integrated into u1u2
690 */
691 else {
692 if (N1 != N2) {
693 /* Set to 0 */
694 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', N1, N2,
695 0.0, 0.0, tmp, rank );
696 assert(ret == 0);
697
698 /* Set diagonal */
699 tmp += offy * rank;
700 for (i=0; i<N1; i++, tmp += rank+1) {
701 *tmp = alpha;
702 }
703 }
704 else {
705 assert( offy == 0 );
706 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', N1, N2,
707 0.0, alpha, tmp + offy * rank, rank );
708 assert( ret == 0 );
709 }
710 }
711 }
712 /*
713 * A is low rank of rank A->rk
714 */
715 else {
716 if (N1 != N2) {
717 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', A->rk, N2,
718 0.0, 0.0, tmp, rank );
719 assert(ret == 0);
720 }
721 core_sgeadd( transA1, A->rk, N1,
722 alpha, A->v, ldav,
723 0.0, tmp + offy * rank, rank );
724 }
725 (void)ret;
726 (void)alpha;
727}
728
729/**
730 *******************************************************************************
731 *
732 * @brief Template to convert a full rank matrix into a low rank matrix through
733 * QR decompositions
734 *
735 * This version is only used when permutation method is used. Only difference
736 * from core_sge2lr_qrrt is the V calculation part, That is: Instead of applying
737 * inverse rotation on V, here inverse permutation is applied
738 *
739 *******************************************************************************
740 *
741 * @param[in] rrqrfct
742 * QR decomposition function used to compute the rank revealing
743 * factorization and create the low-rank form of A.
744 *
745 * @param[in] use_reltol
746 * TODO
747 *
748 * @param[in] tol
749 * The tolerance used as a criterion to eliminate information from the
750 * full rank matrix
751 *
752 * @param[in] rklimit
753 * The maximum rank to store the matrix in low-rank format. If
754 * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
755 *
756 * @param[in] m
757 * Number of rows of the matrix A, and of the low rank matrix Alr.
758 *
759 * @param[in] n
760 * Number of columns of the matrix A, and of the low rank matrix Alr.
761 *
762 * @param[in] Avoid
763 * The matrix of dimension lda-by-n that needs to be compressed
764 *
765 * @param[in] lda
766 * The leading dimension of the matrix A. lda >= max(1, m)
767 *
768 * @param[out] Alr
769 * The low rank matrix structure that will store the low rank
770 * representation of A
771 *
772 *******************************************************************************
773 *
774 * @return TODO
775 *
776 *******************************************************************************/
779 int use_reltol,
780 pastix_fixdbl_t tol,
781 pastix_int_t rklimit,
782 pastix_int_t m,
783 pastix_int_t n,
784 const void *Avoid,
785 pastix_int_t lda,
786 pastix_lrblock_t *Alr )
787{
788 int ret, newrk;
789 pastix_int_t nb = 32;
790 float *A = (float*)Avoid;
791 float *Acpy;
792 pastix_int_t lwork;
793 float *work, *tau, zzsize;
794 float *rwork;
795 pastix_int_t *jpvt;
796 pastix_int_t zsize, rsize;
797 float *zwork;
798 pastix_fixdbl_t flops;
799
800 float norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m, n,
801 A, lda, NULL );
802
803 if ( (norm == 0.) && (tol >= 0.)) {
804 core_slralloc( m, n, 0, Alr );
805 return 0. ;
806 }
807
808 /* work */
809 rklimit = ( rklimit < 0 ) ? core_get_rklimit( m, n ) : rklimit;
810 if ( tol < 0. ) {
811 tol = -1.;
812 }
813 else if ( use_reltol ) {
814 tol = tol * norm;
815 }
816
817 ret = rrqrfct( tol, rklimit, 0, nb,
818 m, n, NULL, m,
819 NULL, NULL,
820 &zzsize, -1, NULL );
821
822 lwork = (pastix_int_t)zzsize;
823 zsize = lwork;
824 /* Acpy */
825 zsize += m * n;
826 /* tau */
827 zsize += n;
828 /* rwork */
829 rsize = 2 * n;
830
831#if defined(PASTIX_DEBUG_LR)
832 zwork = NULL;
833 Acpy = malloc( m * n * sizeof(float) );
834 tau = malloc( n * sizeof(float) );
835 work = malloc( lwork * sizeof(float) );
836 rwork = malloc( rsize * sizeof(float) );
837#else
838 zwork = malloc( zsize * sizeof(float) + rsize * sizeof(float) );
839 Acpy = zwork;
840 tau = Acpy + m * n;
841 work = tau + n;
842 rwork = (float*)(work + lwork);
843#endif
844
845 jpvt = malloc( n * sizeof(pastix_int_t) );
846
847 /**
848 * Backup A into Acpy to try to compress
849 */
850 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
851 A, lda, Acpy, m );
852 assert(ret == 0);
853
854 newrk = rrqrfct( tol, rklimit, 0, nb,
855 m, n, Acpy, m,
856 jpvt, tau,
857 work, lwork, rwork );
858 if (newrk == -1) {
859 flops = FLOPS_SGEQRF( m, n );
860 }
861 else {
862 flops = FLOPS_SGEQRF( m, newrk ) + FLOPS_SORMQR( m, n-newrk, newrk, PastixLeft );
863 }
864
865 /**
866 * It was not interesting to compress, so we restore the dense version in Alr
867 */
868 core_slralloc( m, n, newrk, Alr );
869 Alr->rk = newrk;
870
871 if ( newrk == -1 ) {
872 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
873 A, lda, Alr->u, Alr->rkmax );
874 assert(ret == 0);
875 }
876 else if ( newrk > 0 ) {
877 /**
878 * We compute U and V
879 */
880 pastix_int_t i;
881 float *U, *V;
882
883 U = Alr->u;
884 V = Alr->v;
885
886 /* Compute the final U form */
887 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, Alr->rk,
888 Acpy, m, U, m );
889 assert(ret == 0);
890
891 ret = LAPACKE_sorgqr_work( LAPACK_COL_MAJOR, m, Alr->rk, Alr->rk,
892 U, m, tau, work, lwork );
893 assert(ret == 0);
894 flops += FLOPS_SORGQR( m, Alr->rk, Alr->rk );
895
896 /* Compute the final V form */
897 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'L', Alr->rk-1, Alr->rk-1,
898 0.0, 0.0, Acpy + 1, m );
899
900 for (i=0; i<n; i++){
901 memcpy( V + jpvt[i] * Alr->rk,
902 Acpy + i * m,
903 Alr->rk * sizeof(float) );
904 }
905 }
906
907#if defined(PASTIX_DEBUG_LR)
908 if ( Alr->rk > 0 ) {
909 int rc = core_slrdbg_check_orthogonality( m, Alr->rk, Alr->u, m );
910 if (rc == 1) {
911 fprintf(stderr, "Failed to compress a matrix and generate an orthogonal u\n" );
912 }
913 }
914#endif
915
916 free( zwork );
917 free( jpvt );
918#if defined(PASTIX_DEBUG_LR)
919 free( Acpy );
920 free( tau );
921 free( work );
922 free( rwork );
923#endif
924 (void)ret;
925 return flops;
926}
927
928/**
929 *******************************************************************************
930 *
931 * @brief Template to convert a full rank matrix into a low rank matrix through
932 * QR decompositions
933 *
934 * This version is only used when rotational method is used. Only difference
935 * from core_sge2lr_qr is the V calculation part, That is: Instead of applying
936 * inverse permutation on V, here inverse rotation is applied
937 *
938 *******************************************************************************
939 *
940 * @param[in] rrqrfct
941 * QR decomposition function used to compute the rank revealing
942 * factorization and create the low-rank form of A.
943 *
944 * @param[in] use_reltol
945 * TODO
946 *
947 * @param[in] tol
948 * The tolerance used as a criterion to eliminate information from the
949 * full rank matrix
950 *
951 * @param[in] rklimit
952 * The maximum rank to store the matrix in low-rank format. If
953 * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
954 *
955 * @param[in] m
956 * Number of rows of the matrix A, and of the low rank matrix Alr.
957 *
958 * @param[in] n
959 * Number of columns of the matrix A, and of the low rank matrix Alr.
960 *
961 * @param[in] Avoid
962 * The matrix of dimension lda-by-n that needs to be compressed
963 *
964 * @param[in] lda
965 * The leading dimension of the matrix A. lda >= max(1, m)
966 *
967 * @param[out] Alr
968 * The low rank matrix structure that will store the low rank
969 * representation of A
970 *
971 *******************************************************************************
972 *
973 * @return TODO
974 *
975 *******************************************************************************/
978 int use_reltol,
979 pastix_fixdbl_t tol,
980 pastix_int_t rklimit,
981 pastix_int_t m,
982 pastix_int_t n,
983 const void *Avoid,
984 pastix_int_t lda,
985 pastix_lrblock_t *Alr )
986{
987 int ret, newrk;
988 pastix_int_t nb = 32;
989 float *A = (float*)Avoid;
990 float *Acpy;
991 pastix_int_t lwork;
992 float *work, *tau, *B, *tau_b, zzsize;
993 pastix_int_t *jpvt;
994 pastix_int_t zsize, bsize;
995 float *zwork;
996 pastix_fixdbl_t flops;
997
998 char trans;
999#if defined(PRECISION_c) || defined(PRECISION_z)
1000 trans = 'C';
1001#else
1002 trans = 'T';
1003#endif
1004
1005 float norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m, n,
1006 A, lda, NULL );
1007
1008 if ( (norm == 0.) && (tol >= 0.)) {
1009 core_slralloc( m, n, 0, Alr );
1010 return 0. ;
1011 }
1012
1013 /* work */
1014 rklimit = ( rklimit < 0 ) ? core_get_rklimit( m, n ) : rklimit;
1015 if ( tol < 0. ) {
1016 tol = -1.;
1017 }
1018 else if ( use_reltol ) {
1019 tol = tol * norm;
1020 }
1021
1022 ret = rrqrfct( tol, rklimit, nb,
1023 m, n,
1024 NULL, m, NULL,
1025 NULL, n, NULL,
1026 &zzsize, -1, norm );
1027
1028 lwork = (pastix_int_t)zzsize;
1029 zsize = lwork;
1030 bsize = n * rklimit;
1031 /* Acpy */
1032 zsize += m * n;
1033 /* tau */
1034 zsize += n;
1035 /* B and tau_b */
1036 zsize += bsize + n;
1037
1038#if defined(PASTIX_DEBUG_LR)
1039 zwork = NULL;
1040 Acpy = malloc( m * n * sizeof(float) );
1041 tau = malloc( n * sizeof(float) );
1042 B = malloc( bsize * sizeof(float) );
1043 tau_b = malloc( n * sizeof(float) );
1044 work = malloc( lwork * sizeof(float) );
1045#else
1046 zwork = malloc( zsize * sizeof(float) );
1047 Acpy = zwork;
1048 tau = Acpy + m * n;
1049 B = tau + n;
1050 tau_b = B + bsize;
1051 work = tau_b + n;
1052#endif
1053
1054 jpvt = malloc( n * sizeof(pastix_int_t) );
1055
1056 /**
1057 * Backup A into Acpy to try to compress
1058 */
1059 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
1060 A, lda, Acpy, m );
1061 assert(ret == 0);
1062
1063 newrk = rrqrfct( tol, rklimit, nb,
1064 m, n,
1065 Acpy, m, tau,
1066 B, n, tau_b,
1067 work, lwork, norm );
1068 if (newrk == -1) {
1069 flops = FLOPS_SGEQRF( m, n );
1070 }
1071 else {
1072 flops = FLOPS_SGEQRF( m, newrk ) + FLOPS_SORMQR( m, n-newrk, newrk, PastixLeft );
1073 }
1074
1075 /**
1076 * It was not interesting to compress, so we restore the dense version in Alr
1077 */
1078 core_slralloc( m, n, newrk, Alr );
1079 Alr->rk = newrk;
1080
1081 if ( newrk == -1 ) {
1082 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
1083 A, lda, Alr->u, Alr->rkmax );
1084 assert(ret == 0);
1085 }
1086 else if ( newrk > 0 ) {
1087 /**
1088 * We compute U and V
1089 */
1090 float *U, *V;
1091 pastix_int_t d, rk = 0;
1092
1093 U = Alr->u;
1094 V = Alr->v;
1095
1096 /* Compute the final U form */
1097 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, Alr->rk,
1098 Acpy, m, U, m );
1099 assert(ret == 0);
1100
1101 ret = LAPACKE_sorgqr_work( LAPACK_COL_MAJOR, m, Alr->rk, Alr->rk,
1102 U, m, tau, work, lwork );
1103 assert(ret == 0);
1104 flops += FLOPS_SORGQR( m, Alr->rk, Alr->rk );
1105
1106 /* Compute the final V form */
1107 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'U', Alr->rk, n,
1108 Acpy, m, V, Alr->rk );
1109 assert(ret == 0);
1110 ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'L', Alr->rk-1, Alr->rk-1,
1111 0.0, 0.0, V + 1, Alr->rk );
1112 assert(ret == 0);
1113 /*
1114 * Apply inverse rotations to V^T
1115 */
1116 {
1117 /*
1118 * Householders are applied in the reverse order of before
1119 */
1120 rk = (Alr->rk / nb) * nb;
1121 while( rk >= 0 ) {
1122 d = pastix_imin( nb, Alr->rk - rk );
1123 ret = LAPACKE_sormqr_work( LAPACK_COL_MAJOR, 'R', trans,
1124 Alr->rk - rk, n - rk, d,
1125 B + rk * n + rk, n, tau_b + rk,
1126 V + rk * Alr->rk + rk, Alr->rk,
1127 work, lwork );
1128 assert(ret == 0);
1129 rk -= nb;
1130 }
1131 }
1132 }
1133
1134#if defined(PASTIX_DEBUG_LR)
1135 if ( Alr->rk > 0 ) {
1136 int rc = core_slrdbg_check_orthogonality( m, Alr->rk, Alr->u, m );
1137 if (rc == 1) {
1138 fprintf(stderr, "Failed to compress a matrix and generate an orthogonal u\n" );
1139 }
1140 }
1141#endif
1142
1143 free( zwork );
1144 free( jpvt );
1145#if defined(PASTIX_DEBUG_LR)
1146 free( Acpy );
1147 free( tau );
1148 free( B );
1149 free( tau_b );
1150 free( work );
1151#endif
1152 (void)ret;
1153 return flops;
1154}
1155
1156/**
1157 *******************************************************************************
1158 *
1159 * @brief Template to perform the addition of two low-rank structures with
1160 * compression kernel based on QR decomposition.
1161 *
1162 * Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
1163 *
1164 * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
1165 * Orthogonalize (u2 u1) = (u2, u1 - u2(u2^T u1)) * (I u2^T u1)
1166 * (0 I )
1167 * Compute Rank Revealing QR decomposition of (I u2^T u1) * (v2 v1)^T
1168 * (0 I )
1169 * Any QR rank revealing kernel can be used for the recompression of the V part.
1170 *
1171 *******************************************************************************
1172 *
1173 * @param[in] rrqrfct
1174 * QR decomposition function used to compute the rank revealing
1175 * factorization of the sum of the two low-rank matrices.
1176 *
1177 * @param[in] lowrank
1178 * The structure with low-rank parameters.
1179 *
1180 * @param[in] transA1
1181 * @arg PastixNoTrans: No transpose, op( A ) = A;
1182 * @arg PastixTrans: Transpose, op( A ) = A';
1183 *
1184 * @param[in] alphaptr
1185 * alpha * A is add to B
1186 *
1187 * @param[in] M1
1188 * The number of rows of the matrix A.
1189 *
1190 * @param[in] N1
1191 * The number of columns of the matrix A.
1192 *
1193 * @param[in] A
1194 * The low-rank representation of the matrix A.
1195 *
1196 * @param[in] M2
1197 * The number of rows of the matrix B.
1198 *
1199 * @param[in] N2
1200 * The number of columns of the matrix B.
1201 *
1202 * @param[in] B
1203 * The low-rank representation of the matrix B.
1204 *
1205 * @param[in] offx
1206 * The horizontal offset of A with respect to B.
1207 *
1208 * @param[in] offy
1209 * The vertical offset of A with respect to B.
1210 *
1211 *******************************************************************************
1212 *
1213 * @return The new rank of u2 v2^T or -1 if ranks are too large for
1214 * recompression
1215 *
1216 *******************************************************************************/
1219 const pastix_lr_t *lowrank,
1220 pastix_trans_t transA1,
1221 const void *alphaptr,
1222 pastix_int_t M1,
1223 pastix_int_t N1,
1224 const pastix_lrblock_t *A,
1225 pastix_int_t M2,
1226 pastix_int_t N2,
1228 pastix_int_t offx,
1229 pastix_int_t offy )
1230{
1231 pastix_int_t rankA, rank, M, N, minV;
1232 pastix_int_t i, ret, new_rank, rklimit;
1233 pastix_int_t ldau, ldav, ldbu, ldbv, ldu, ldv;
1234 float *u1u2, *v1v2, *u;
1235 float *zbuf, *tauV;
1236 size_t wzsize;
1237 float tol = lowrank->tolerance;
1238
1239 /* PQRCP parameters / workspace */
1240 pastix_int_t nb = 32;
1241 pastix_int_t lwork;
1242 pastix_int_t *jpvt;
1243 float *zwork, zzsize;
1244 float *rwork;
1245 float alpha = *((float*)alphaptr);
1246 pastix_fixdbl_t flops, total_flops = 0.;
1247
1248#if defined(PASTIX_DEBUG_LR)
1249 if ( B->rk > 0 ) {
1250 int rc = core_slrdbg_check_orthogonality( M2, B->rk, B->u, M2 );
1251 if (rc == 1) {
1252 fprintf(stderr, "Failed to have B->u orthogonal in entry of rradd\n" );
1253 }
1254 }
1255#endif
1256
1257 rankA = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
1258 rank = rankA + B->rk;
1259 M = pastix_imax(M2, M1);
1260 N = pastix_imax(N2, N1);
1261
1262 minV = pastix_imin(N, rank);
1263
1264 assert(M2 == M && N2 == N);
1265 assert(B->rk != -1);
1266
1267 assert( A->rk <= A->rkmax);
1268 assert( B->rk <= B->rkmax);
1269
1270 if ( ((M1 + offx) > M2) ||
1271 ((N1 + offy) > N2) )
1272 {
1273 pastix_print_error( "Dimensions are not correct" );
1274 assert(0 /* Incorrect dimensions */);
1275 return total_flops;
1276 }
1277
1278 /*
1279 * A is rank null, nothing to do
1280 */
1281 if ( A->rk == 0 ) {
1282 return total_flops;
1283 }
1284
1285 /*
1286 * Let's handle case where B is a null matrix
1287 * B = alpha A
1288 */
1289 if ( B->rk == 0 ) {
1290 core_slrcpy( lowrank, transA1, alpha,
1291 M1, N1, A, M2, N2, B,
1292 offx, offy );
1293 return total_flops;
1294 }
1295
1296 /*
1297 * The rank is too big, let's try to compress
1298 */
1299 if ( rank > pastix_imin( M, N ) ) {
1300 assert(0);
1301 }
1302
1303 /*
1304 * Let's define leading dimensions
1305 */
1306 ldau = (A->rk == -1) ? A->rkmax : M1;
1307 ldav = (transA1 == PastixNoTrans) ? A->rkmax : N1;
1308 ldbu = M;
1309 ldbv = B->rkmax;
1310 ldu = M;
1311 ldv = rank;
1312
1313 /*
1314 * Let's compute the size of the workspace
1315 */
1316 /* u1u2 and v1v2 */
1317 wzsize = (M+N) * rank;
1318 /* tauV */
1319 wzsize += minV;
1320
1321 /* RRQR workspaces */
1322 rklimit = pastix_imin( rank, core_get_rklimit( M, N ) );
1323 rrqrfct( tol, rklimit, 1, nb,
1324 rank, N, NULL, ldv,
1325 NULL, NULL,
1326 &zzsize, -1, NULL );
1327 lwork = (pastix_int_t)(zzsize);
1328 wzsize += lwork;
1329
1330#if defined(PASTIX_DEBUG_LR)
1331 zbuf = NULL;
1332 u1u2 = malloc( ldu * rank * sizeof(float) );
1333 v1v2 = malloc( ldv * N * sizeof(float) );
1334 tauV = malloc( rank * sizeof(float) );
1335 zwork = malloc( lwork * sizeof(float) );
1336
1337 rwork = malloc( 2 * pastix_imax( rank, N ) * sizeof(float) );
1338#else
1339 zbuf = malloc( wzsize * sizeof(float) + 2 * pastix_imax(rank, N) * sizeof(float) );
1340
1341 u1u2 = zbuf;
1342 v1v2 = u1u2 + ldu * rank;
1343 tauV = v1v2 + ldv * N;
1344 zwork = tauV + rank;
1345
1346 rwork = (float*)(zwork + lwork);
1347#endif
1348
1349 /*
1350 * Concatenate U2 and U1 in u1u2
1351 * [ u2 0 ]
1352 * [ u2 u1 ]
1353 * [ u2 0 ]
1354 */
1355 core_slrconcatenate_u( alpha,
1356 M1, N1, A,
1357 M2, B,
1358 offx, u1u2 );
1359
1360 /*
1361 * Concatenate V2 and V1 in v1v2
1362 * [ v2^h v2^h v2^h ]
1363 * [ 0 v1^h 0 ]
1364 */
1365 core_slrconcatenate_v( transA1, alpha,
1366 M1, N1, A,
1367 N2, B,
1368 offy, v1v2 );
1369
1370 /*
1371 * Orthogonalize [u2, u1]
1372 * [u2, u1] = [u2, u1 - u2(u2Tu1)] * (I u2Tu1)
1373 * (0 I )
1374 */
1375
1376 /* We do not care is A was integrated into v1v2 */
1377 if (rankA != 0) {
1378
1379 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_orthogonalize );
1380 switch ( pastix_lr_ortho ) {
1382 flops = core_slrorthu_fullqr( M, N, B->rk + rankA,
1383 u1u2, ldu, v1v2, ldv );
1384 break;
1385
1387 flops = core_slrorthu_partialqr( M, N, B->rk, &rankA, offx, offy,
1388 u1u2, ldu, v1v2, ldv );
1389 break;
1390
1392 default:
1393 flops = core_slrorthu_cgs( M2, N2, M1, N1, B->rk, &rankA, offx, offy,
1394 u1u2, ldu, v1v2, ldv );
1395 }
1396 kernel_trace_stop_lvl2( flops );
1397
1398 total_flops += flops;
1399 }
1400
1401 rank = B->rk + rankA;
1402
1403 if (rankA == 0) {
1404 /*
1405 * The final B += A fit in B
1406 * Lets copy and return
1407 */
1408 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M, B->rk, u1u2, ldu, B->u, ldbu );
1409 assert( ret == 0 );
1410 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', B->rk, N, v1v2, ldv, B->v, ldbv );
1411 assert( ret == 0 );
1412
1413 free(zbuf);
1414#if defined(PASTIX_DEBUG_LR)
1415 free( u1u2 );
1416 free( v1v2 );
1417 free( tauV );
1418 free( zwork );
1419 free( rwork );
1420#endif
1421 return total_flops;
1422 }
1423
1424 MALLOC_INTERN( jpvt, pastix_imax(rank, N), pastix_int_t );
1425
1426 if ( lowrank->use_reltol ) {
1427 /**
1428 * In relative tolerance, we can choose two solutions:
1429 * 1) The first one, more conservative, is to compress relatively to
1430 * the norm of the final matrix \f$ \alpha A + B \f$. In this kernel, we
1431 * exploit the fact that the V part contains all the information while
1432 * the U part is orthonormal, and compute it as follow:
1433 *
1434 * float norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', rank, N,
1435 * v1v2, ldv, NULL );
1436 * tol = tol * norm;
1437 *
1438 * 2) The second solution, less conservative, will allow to reduce the
1439 * rank more efficiently. Since A and B have been compressed relatively
1440 * to their respective norms, there is no reason to compress the sum
1441 * relatively to its own norm, but it is more reasonable to compress it
1442 * relatively to the norm of A and B. For example, A-B would be full
1443 * with the first criterion, and rank null with the second.
1444 * Note that here, we can only have an estimation that once again
1445 * reduces the conservation of the criterion.
1446 * \f[ || \alpha A + B || <= |\alpha| ||A|| + ||B|| <= |\alpha| ||U_aV_a|| + ||U_bV_b|| \f]
1447 *
1448 */
1449 float normA, normB;
1450 normA = core_slrnrm( PastixFrobeniusNorm, transA1, M1, N1, A );
1451 normB = core_slrnrm( PastixFrobeniusNorm, PastixNoTrans, M2, N2, B );
1452 tol = tol * ( fabsf(alpha) * normA + normB );
1453 }
1454
1455 /*
1456 * Perform RRQR factorization on (I u2Tu1) v1v2 = (Q2 R2)
1457 * (0 I )
1458 */
1459 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_recompression );
1460 rklimit = pastix_imin( rklimit, rank );
1461 new_rank = rrqrfct( tol, rklimit, 1, nb,
1462 rank, N, v1v2, ldv,
1463 jpvt, tauV,
1464 zwork, lwork, rwork );
1465 flops = (new_rank == -1) ? FLOPS_SGEQRF( rank, N )
1466 : (FLOPS_SGEQRF( rank, new_rank ) +
1467 FLOPS_SORMQR( rank, N-new_rank, new_rank, PastixLeft ));
1468 kernel_trace_stop_lvl2_rank( flops, new_rank );
1469 total_flops += flops;
1470
1471 /*
1472 * First case: The rank is too big, so we decide to uncompress the result
1473 */
1474 if ( (new_rank > rklimit) ||
1475 (new_rank == -1) )
1476 {
1477 pastix_lrblock_t Bbackup = *B;
1478
1479 core_slralloc( M, N, -1, B );
1480 u = B->u;
1481
1482 /* Uncompress B */
1483 flops = FLOPS_SGEMM( M, N, Bbackup.rk );
1484 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
1485 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
1486 M, N, Bbackup.rk,
1487 (sone), Bbackup.u, ldbu,
1488 Bbackup.v, ldbv,
1489 (szero), u, M );
1490 kernel_trace_stop_lvl2( flops );
1491 total_flops += flops;
1492
1493 /* Add A into it */
1494 if ( A->rk == -1 ) {
1495 flops = 2 * M1 * N1;
1496 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
1497 core_sgeadd( transA1, M1, N1,
1498 alpha, A->u, ldau,
1499 sone, u + offy * M + offx, M);
1500 kernel_trace_stop_lvl2( flops );
1501 }
1502 else {
1503 flops = FLOPS_SGEMM( M1, N1, A->rk );
1504 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
1505 cblas_sgemm(CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transA1,
1506 M1, N1, A->rk,
1507 (alpha), A->u, ldau,
1508 A->v, ldav,
1509 (sone), u + offy * M + offx, M);
1510 kernel_trace_stop_lvl2( flops );
1511 }
1512 total_flops += flops;
1513 core_slrfree(&Bbackup);
1514 free( zbuf );
1515 free( jpvt );
1516#if defined(PASTIX_DEBUG_LR)
1517 free( u1u2 );
1518 free( v1v2 );
1519 free( tauV );
1520 free( zwork );
1521 free( rwork );
1522#endif
1523 return total_flops;
1524 }
1525 else if ( new_rank == 0 ) {
1526 core_slrfree(B);
1527 free( zbuf );
1528 free( jpvt );
1529#if defined(PASTIX_DEBUG_LR)
1530 free( u1u2 );
1531 free( v1v2 );
1532 free( tauV );
1533 free( zwork );
1534 free( rwork );
1535#endif
1536 return total_flops;
1537 }
1538
1539 /*
1540 * We need to reallocate the buffer to store the new compressed version of B
1541 * because it wasn't big enough
1542 */
1543 ret = core_slrsze( 0, M, N, B, new_rank, -1, -1 );
1544 assert( ret != -1 );
1545 assert( B->rkmax >= new_rank );
1546 assert( B->rkmax >= B->rk );
1547
1548 ldbv = B->rkmax;
1549
1550 /* B->v = P v1v2 */
1551 {
1552 float *tmpV;
1553 pastix_int_t lm;
1554
1555 memset(B->v, 0, N * ldbv * sizeof(float));
1556 tmpV = B->v;
1557 for (i=0; i<N; i++){
1558 lm = pastix_imin( new_rank, i+1 );
1559 memcpy(tmpV + jpvt[i] * ldbv,
1560 v1v2 + i * ldv,
1561 lm * sizeof(float));
1562 }
1563 }
1564
1565 /* Compute Q2 factor */
1566 {
1567 flops = FLOPS_SORGQR( rank, new_rank, new_rank )
1568 + FLOPS_SGEMM( M, new_rank, rank );
1569
1570 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_computeNewU );
1571 ret = LAPACKE_sorgqr_work( LAPACK_COL_MAJOR, rank, new_rank, new_rank,
1572 v1v2, ldv, tauV, zwork, lwork );
1573 assert(ret == 0);
1574
1575 cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
1576 M, new_rank, rank,
1577 (sone), u1u2, ldu,
1578 v1v2, ldv,
1579 (szero), B->u, ldbu);
1580 kernel_trace_stop_lvl2( flops );
1581 total_flops += flops;
1582 }
1583
1584 free( zbuf );
1585 free( jpvt );
1586
1587#if defined(PASTIX_DEBUG_LR)
1588 free( u1u2 );
1589 free( v1v2 );
1590 free( tauV );
1591 free( zwork );
1592 free( rwork );
1593#endif
1594
1595 (void)ret;
1596 return total_flops;
1597}
1598
1599/**
1600 *******************************************************************************
1601 *
1602 * @brief Compute the size of a block to send in LR
1603 *
1604 *******************************************************************************
1605 *
1606 * @param[in] M
1607 * The number of rows of the matrix A.
1608 *
1609 * @param[in] N
1610 * The number of columns of the matrix A.
1611 *
1612 * @param[in] A
1613 * The low-rank representation of the matrix A.
1614 *
1615 *******************************************************************************
1616 *
1617 * @return Size of a block to send in LR
1618 *
1619 *******************************************************************************/
1620size_t
1622 pastix_int_t N,
1623 pastix_lrblock_t *A )
1624{
1625 if ( A->rk != -1 ) {
1626 return A->rk * ( M + N );
1627 }
1628 else {
1629 return M * N;
1630 }
1631}
1632
1633/**
1634 *******************************************************************************
1635 *
1636 * @brief Pack low-rank data by side
1637 *
1638 *******************************************************************************
1639 *
1640 * @param[in] M
1641 * The number of rows of the matrix A.
1642 *
1643 * @param[in] N
1644 * The number of columns of the matrix A.
1645 *
1646 * @param[in] A
1647 * The low-rank representation of the matrix A.
1648 *
1649 * @param[inout] buffer
1650 * Pointer on packed data
1651 *
1652 *******************************************************************************
1653 *
1654 * @return Pointer on packed data shifted to the next block
1655 *
1656 *******************************************************************************/
1657char *
1659 pastix_int_t N,
1660 const pastix_lrblock_t *A,
1661 char *buffer )
1662{
1663 int rk = A->rk;
1664 int rkmax = A->rkmax;
1665 void *u = A->u;
1666 void *v = A->v;
1667 int ret;
1668
1669 /* Store the rank */
1670 memcpy( buffer, &rk, sizeof( int ) );
1671 buffer += sizeof( int );
1672
1673 if ( rk != -1 ) {
1674 /* Pack the u part */
1675 memcpy( buffer, u, rk * M * sizeof( float ) );
1676 buffer += rk * M * sizeof( float );
1677
1678 /* Pack the v part */
1679 if ( rk == rkmax ) {
1680 memcpy( buffer, v, rk * N * sizeof( float ) );
1681 buffer += rk * N * sizeof( float );
1682 }
1683 else {
1684 ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', rk, N, v, rkmax,
1685 (float *)buffer, rk );
1686 assert( ret == 0 );
1687 buffer += rk * N * sizeof( float );
1688 }
1689 }
1690 else {
1691 memcpy( buffer, u, M * N * sizeof( float ) );
1692 buffer += M * N * sizeof( float );
1693 }
1694
1695 (void)ret;
1696
1697 return buffer;
1698}
1699
1700/**
1701 *******************************************************************************
1702 *
1703 * @brief Unpack low rank data and fill the cblk concerned by the computation
1704 *
1705 *******************************************************************************
1706 *
1707 * @param[in] M
1708 * The number of rows of the matrix A.
1709 *
1710 * @param[in] N
1711 * The number of columns of the matrix A.
1712 *
1713 * @param[in] A
1714 * The low-rank representation of the matrix A.
1715 *
1716 * @param[inout] buffer
1717 * Pointer on packed data
1718 *
1719 *******************************************************************************
1720 *
1721 * @return Pointer on packed data shifted to the next block
1722 *
1723 *******************************************************************************/
1724char *
1726 pastix_int_t N,
1728 char *buffer )
1729{
1730 int rk;
1731 memcpy( &rk, buffer, sizeof( int ) );
1732 buffer += sizeof( int );
1733
1734 /* Make sure A can store the unpacked values */
1735 core_slrsze( 0, M, N, A, rk, rk, rk );
1736
1737 if ( rk != -1 ) {
1738 /* Unpack U */
1739 memcpy( A->u, buffer, M * rk * sizeof( float ) );
1740 buffer += M * rk * sizeof( float );
1741
1742 /* Unpack V */
1743 memcpy( A->v, buffer, N * rk * sizeof( float ) );
1744 buffer += N * rk * sizeof( float );
1745 }
1746 else {
1747 /* Unpack the full block */
1748 memcpy( A->u, buffer, M * N * sizeof( float ) );
1749 buffer += M * N * sizeof( float );
1750 }
1751
1752 return buffer;
1753}
1754
1755/**
1756 *******************************************************************************
1757 *
1758 * @brief Unpack low rank data and fill the cblk concerned by the computation
1759 *
1760 *******************************************************************************
1761 *
1762 * @param[in] M
1763 * The number of rows of the matrix A.
1764 *
1765 * @param[in] N
1766 * The number of columns of the matrix A.
1767 *
1768 * @param[in] A
1769 * The low-rank representation of the matrix A.
1770 *
1771 * @param[inout] input
1772 * TODO
1773 *
1774 * @param[inout] outptr
1775 * TODO
1776 *
1777 *******************************************************************************
1778 *
1779 * @return Pointer on packed data shifted to the next block
1780 *
1781 *******************************************************************************/
1782const char *
1784 pastix_int_t N,
1786 const char *input,
1787 char **outptr )
1788{
1789 char *output = *outptr;
1790 size_t size;
1791 int rk;
1792
1793 rk = *((int *)input);
1794 input += sizeof( int );
1795
1796 if ( rk != -1 ) {
1797 A->rk = rk;
1798 A->rkmax = rk;
1799
1800 /* Unpack U */
1801 size = M * rk * sizeof( float );
1802 A->u = output;
1803
1804 memcpy( A->u, input, size );
1805 input += size;
1806 output += size;
1807
1808 /* Unpack V */
1809 size = N * rk * sizeof( float );
1810 A->v = output;
1811
1812 memcpy( A->v, input, size );
1813 input += size;
1814 output += size;
1815 }
1816 else {
1817 A->rk = -1;
1818 A->rkmax = M;
1819 A->v = NULL;
1820
1821 /* Unpack the full block */
1822 size = M * N * sizeof( float );
1823 A->u = output;
1824
1825 memcpy( A->u, input, size );
1826 input += size;
1827 output += size;
1828 }
1829
1830 *outptr = output;
1831 return input;
1832}
BEGIN_C_DECLS typedef int pastix_int_t
Definition datatypes.h:51
double pastix_fixdbl_t
Definition datatypes.h:65
@ PastixKernelLvl2_LR_add2C_rradd_orthogonalize
pastix_fixdbl_t core_slrorthu_fullqr(pastix_int_t M, pastix_int_t N, pastix_int_t rank, float *U, pastix_int_t ldu, float *V, pastix_int_t ldv)
Try to orthognalize the u part of the low-rank form, and update the v part accordingly using full QR.
pastix_fixdbl_t core_slrorthu_cgs(pastix_int_t M1, pastix_int_t N1, pastix_int_t M2, pastix_int_t N2, pastix_int_t r1, pastix_int_t *r2ptr, pastix_int_t offx, pastix_int_t offy, float *U, pastix_int_t ldu, float *V, pastix_int_t ldv)
Try to orthognalize the U part of the low-rank form, and update the V part accordingly using CGS.
int core_sgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, float alpha, const float *A, pastix_int_t LDA, float beta, float *B, pastix_int_t LDB)
Add two matrices together.
Definition core_sgeadd.c:78
void core_sgetmo(int m, int n, const float *A, int lda, float *B, int ldb)
Transposes a m-by-n matrix out of place using an extra workspace of size m-by-n.
Definition core_sgetmo.c:49
pastix_fixdbl_t core_slrorthu_partialqr(pastix_int_t M, pastix_int_t N, pastix_int_t r1, pastix_int_t *r2ptr, pastix_int_t offx, pastix_int_t offy, float *U, pastix_int_t ldu, float *V, pastix_int_t ldv)
Try to orthognalize the U part of the low-rank form, and update the V part accordingly using partial ...
int core_slrdbg_check_orthogonality(pastix_int_t M, pastix_int_t N, const float *A, pastix_int_t lda)
Check the orthogonality of the matrix A.
fct_ge2lr_t core_ge2lr
pastix_int_t pastix_lr_ortho
Define the orthogonalization method.
Definition lowrank.c:25
pastix_int_t(* core_get_rklimit)(pastix_int_t, pastix_int_t)
Compute the maximal rank accepted for a given matrix size. The pointer is set according to the low-ra...
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
pastix_fixdbl_t core_sge2lr_qrrt(core_srrqr_rt_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
pastix_fixdbl_t core_sge2lr_qrcp(core_srrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
int(* core_srrqr_rt_t)(float tol, pastix_int_t maxrank, pastix_int_t nb, pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, float *tau, float *B, pastix_int_t ldb, float *tau_b, float *work, pastix_int_t lwork, float normA)
TODO.
int(* core_srrqr_cp_t)(float tol, pastix_int_t maxrank, int refine, pastix_int_t nb, pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, pastix_int_t *jpvt, float *tau, float *work, pastix_int_t lwork, float *rwork)
TODO.
pastix_fixdbl_t core_srradd_qr(core_srrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
int core_slr2ge(pastix_trans_t trans, pastix_int_t m, pastix_int_t n, const pastix_lrblock_t *Alr, float *A, pastix_int_t lda)
Convert a low rank matrix into a dense matrix.
int core_slrsze(int copy, pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, pastix_int_t newrk, pastix_int_t newrkmax, pastix_int_t rklimit)
Resize a low-rank matrix.
void core_slralloc(pastix_int_t M, pastix_int_t N, pastix_int_t rkmax, pastix_lrblock_t *A)
Allocate a low-rank matrix.
void core_slrconcatenate_v(pastix_trans_t transA1, float alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offy, float *v1v2)
Concatenate right parts of two low-rank matrices.
void core_slrcpy(const pastix_lr_t *lowrank, pastix_trans_t transAv, float alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Copy a small low-rank structure into a large one.
char * core_slrpack(pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A, char *buffer)
Pack low-rank data by side.
size_t core_slrgetsize(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A)
Compute the size of a block to send in LR.
char * core_slrunpack(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, char *buffer)
Unpack low rank data and fill the cblk concerned by the computation.
void core_slrconcatenate_u(float alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_lrblock_t *B, pastix_int_t offx, float *u1u2)
Concatenate left parts of two low-rank matrices.
void core_slrfree(pastix_lrblock_t *A)
Free a low-rank matrix.
float core_slrnrm(pastix_normtype_t ntype, int transV, pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A)
Compute the norm of a low-rank matrix.
Definition core_slrnrm.c:48
const char * core_slrunpack2(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, const char *input, char **outptr)
Unpack low rank data and fill the cblk concerned by the computation.
enum pastix_trans_e pastix_trans_t
Transpostion.
@ PastixFrobeniusNorm
Definition api.h:504
@ PastixCompressOrthoQR
Definition api.h:408
@ PastixCompressOrthoPartialQR
Definition api.h:409
@ PastixCompressOrthoCGS
Definition api.h:407
@ PastixLeft
Definition api.h:495
@ PastixNoTrans
Definition api.h:445
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...