PaStiX Handbook  6.4.0
core_sgelrops.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_sgelrops.c
4  *
5  * PaStiX low-rank kernel routines
6  *
7  * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Gregoire Pichon
12  * @author Esragul Korkmaz
13  * @author Mathieu Faverge
14  * @author Pierre Ramet
15  * @author Nolan Bredel
16  * @date 2024-07-05
17  * @generated from /builds/solverstack/pastix/kernels/core_zgelrops.c, normal z -> s, Tue Oct 8 14:17:21 2024
18  *
19  **/
20 #include "common.h"
21 #include <cblas.h>
22 #include <lapacke.h>
23 #include "blend/solver.h"
24 #include "pastix_scores.h"
25 #include "pastix_slrcores.h"
26 #include "s_nan_check.h"
27 #include "kernels_trace.h"
28 
29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
30 static float sone = 1.0;
31 static float szero = 0.0;
32 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
33 
34 /**
35  *******************************************************************************
36  *
37  * @brief Allocate a low-rank matrix.
38  *
39  *******************************************************************************
40  *
41  * @param[in] M
42  * Number of rows of the matrix A.
43  *
44  * @param[in] N
45  * Number of columns of the matrix A.
46  *
47  * @param[in] rkmax
48  * @arg -1: the matrix is allocated tight to its rank.
49  * @arg >0: the matrix is allocated to the minimum of rkmax and its maximum rank.
50  *
51  * @param[out] A
52  * The allocated low-rank matrix
53  *
54  *******************************************************************************/
55 void
57  pastix_int_t N,
58  pastix_int_t rkmax,
59  pastix_lrblock_t *A )
60 {
61  float *u, *v;
62 
63  if ( rkmax == -1 ) {
64  u = malloc( M * N * sizeof(float) );
65  memset( u, 0, M * N * sizeof(float) );
66  A->rk = -1;
67  A->rkmax = M;
68  A->u = u;
69  A->v = NULL;
70  }
71  else if ( rkmax == 0 ) {
72  A->rk = 0;
73  A->rkmax = 0;
74  A->u = NULL;
75  A->v = NULL;
76  }
77  else {
78  pastix_int_t rk = pastix_imin( M, N );
79  rkmax = pastix_imin( rkmax, rk );
80 
81 #if defined(PASTIX_DEBUG_LR)
82  u = malloc( M * rkmax * sizeof(float) );
83  v = malloc( N * rkmax * sizeof(float) );
84 
85  /* To avoid uninitialised values in valgrind. Lapacke doc (xgesvd) is not correct */
86  memset(u, 0, M * rkmax * sizeof(float));
87  memset(v, 0, N * rkmax * sizeof(float));
88 #else
89  u = malloc( (M+N) * rkmax * sizeof(float));
90 
91  /* To avoid uninitialised values in valgrind. Lapacke doc (xgesvd) is not correct */
92  memset(u, 0, (M+N) * rkmax * sizeof(float));
93 
94  v = u + M * rkmax;
95 #endif
96 
97  A->rk = 0;
98  A->rkmax = rkmax;
99  A->u = u;
100  A->v = v;
101  }
102 }
103 
104 /**
105  *******************************************************************************
106  *
107  * @brief Free a low-rank matrix.
108  *
109  *******************************************************************************
110  *
111  * @param[inout] A
112  * The low-rank matrix that will be desallocated.
113  *
114  *******************************************************************************/
115 void
117 {
118  if ( A->rk == -1 ) {
119  free(A->u);
120  A->u = NULL;
121  }
122  else {
123  free(A->u);
124 #if defined(PASTIX_DEBUG_LR)
125  free(A->v);
126 #endif
127  A->u = NULL;
128  A->v = NULL;
129  }
130  A->rk = 0;
131  A->rkmax = 0;
132 }
133 
134 /**
135  *******************************************************************************
136  *
137  * @brief Resize a low-rank matrix
138  *
139  *******************************************************************************
140  *
141  * @param[in] copy
142  * Enable/disable the copy of the data from A->u and A->v into the new
143  * low-rank representation.
144  *
145  * @param[in] M
146  * The number of rows of the matrix A.
147  *
148  * @param[in] N
149  * The number of columns of the matrix A.
150  *
151  * @param[inout] A
152  * The low-rank representation of the matrix. At exit, this structure
153  * is modified with the new low-rank representation of A, is the rank
154  * is small enough
155  *
156  * @param[in] newrk
157  * The new rank of the matrix A.
158  *
159  * @param[in] newrkmax
160  * The new maximum rank of the matrix A. Useful if the low-rank
161  * structure was allocated with more data than the rank.
162  *
163  * @param[in] rklimit
164  * The maximum rank to store the matrix in low-rank format. If
165  * -1, set to core_get_rklimit(M, N)
166  *
167  *******************************************************************************
168  *
169  * @return The new rank of A
170  *
171  *******************************************************************************/
172 int
173 core_slrsze( int copy,
174  pastix_int_t M,
175  pastix_int_t N,
176  pastix_lrblock_t *A,
177  pastix_int_t newrk,
178  pastix_int_t newrkmax,
179  pastix_int_t rklimit )
180 {
181  /* If no limit on the rank is given, let's take min(M, N) */
182  rklimit = (rklimit == -1) ? core_get_rklimit( M, N ) : rklimit;
183 
184  /* If no extra memory allocated, let's fix rkmax to rk */
185  newrkmax = (newrkmax == -1) ? newrk : newrkmax;
186  newrkmax = pastix_imax( newrkmax, newrk );
187 
188  /*
189  * It is not interesting to compress, so we alloc space to store the full matrix
190  */
191  if ( (newrk > rklimit) || (newrk == -1) )
192  {
193  A->u = realloc( A->u, M * N * sizeof(float) );
194 #if defined(PASTIX_DEBUG_LR)
195  free(A->v);
196 #endif
197  A->v = NULL;
198  A->rk = -1;
199  A->rkmax = M;
200  return -1;
201  }
202  /*
203  * The rank is null, we free everything
204  */
205  else if (newrkmax == 0)
206  {
207  /*
208  * The rank is null, we free everything
209  */
210  free(A->u);
211 #if defined(PASTIX_DEBUG_LR)
212  free(A->v);
213 #endif
214  A->u = NULL;
215  A->v = NULL;
216  A->rkmax = newrkmax;
217  A->rk = newrk;
218  }
219  /*
220  * The rank is non null, we allocate the correct amount of space, and
221  * compress the stored information if necessary
222  */
223  else {
224  float *u, *v;
225  int ret;
226 
227  if ( ( A->rk == -1 ) ||
228  (( A->rk != -1 ) && (newrkmax != A->rkmax)) )
229  {
230 #if defined(PASTIX_DEBUG_LR)
231  u = malloc( M * newrkmax * sizeof(float) );
232  v = malloc( N * newrkmax * sizeof(float) );
233 #else
234  u = malloc( (M+N) * newrkmax * sizeof(float) );
235  v = u + M * newrkmax;
236 #endif
237  if ( copy ) {
238  assert( A->rk != -1 );
239  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M, newrk,
240  A->u, M, u, M );
241  assert(ret == 0);
242  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', newrk, N,
243  A->v, A->rkmax, v, newrkmax );
244  assert(ret == 0);
245  }
246  free(A->u);
247 #if defined(PASTIX_DEBUG_LR)
248  free(A->v);
249 #endif
250  A->u = u;
251  A->v = v;
252  }
253 
254  /* Update rk and rkmax */
255  A->rkmax = newrkmax;
256  A->rk = newrk;
257 
258  (void)ret;
259  }
260  assert( A->rk <= A->rkmax);
261  return 0;
262 }
263 
264 /**
265  *******************************************************************************
266  *
267  * @brief Convert a low rank matrix into a dense matrix.
268  *
269  * Convert a low-rank matrix of size m-by-n into a full rank matrix.
270  * A = op( u * v^t ) with op(A) = A or A^t
271  *
272  *******************************************************************************
273  *
274  * @param[in] trans
275  * @arg PastixNoTrans: returns A = u * v^t
276  * @arg PastixTrans: returns A = v * u^t
277  *
278  * @param[in] m
279  * Number of rows of the low-rank matrix Alr.
280  *
281  * @param[in] n
282  * Number of columns of the low-rank matrix Alr.
283  *
284  * @param[in] Alr
285  * The low rank matrix to be converted into a full rank matrix
286  *
287  * @param[inout] A
288  * The matrix of dimension lda-by-k in which to store the uncompressed
289  * version of Alr. k = n if trans == PastixNoTrans, m otherwise.
290  *
291  * @param[in] lda
292  * The leading dimension of the matrix A. lda >= max(1, m) if trans ==
293  * PastixNoTrans, lda >= max(1,n) otherwise.
294  *
295  *******************************************************************************
296  *
297  * @retval 0 in case of success.
298  * @retval -i if the ith parameter is incorrect.
299  *
300  *******************************************************************************/
301 int
303  pastix_int_t m,
304  pastix_int_t n,
305  const pastix_lrblock_t *Alr,
306  float *A,
307  pastix_int_t lda )
308 {
309  int ret = 0;
310 
311 #if !defined(NDEBUG)
312  if ( m < 0 ) {
313  return -1;
314  }
315  if ( n < 0 ) {
316  return -2;
317  }
318  if (Alr == NULL || Alr->rk > Alr->rkmax) {
319  return -3;
320  }
321  if ( (trans == PastixNoTrans && lda < m) ||
322  (trans != PastixNoTrans && lda < n) )
323  {
324  return -5;
325  }
326  if ( Alr->rk == -1 ) {
327  if (Alr->u == NULL || Alr->v != NULL || (Alr->rkmax < m))
328  {
329  return -6;
330  }
331  }
332  else if ( Alr->rk != 0){
333  if (Alr->u == NULL || Alr->v == NULL) {
334  return -6;
335  }
336  }
337 #endif
338 
339  if ( trans == PastixNoTrans ) {
340  if ( Alr->rk == -1 ) {
341  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
342  Alr->u, Alr->rkmax, A, lda );
343  assert( ret == 0 );
344  }
345  else if ( Alr->rk == 0 ) {
346  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', m, n,
347  0.0, 0.0, A, lda );
348  assert( ret == 0 );
349  }
350  else {
351  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
352  m, n, Alr->rk,
353  (sone), Alr->u, m,
354  Alr->v, Alr->rkmax,
355  (szero), A, lda);
356  }
357  }
358  else {
359  if ( Alr->rk == -1 ) {
360  core_sgetmo( m, n, Alr->u, Alr->rkmax, A, lda );
361  }
362  else if ( Alr->rk == 0 ) {
363  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', n, m,
364  0.0, 0.0, A, lda );
365  assert( ret == 0 );
366  }
367  else {
368  cblas_sgemm(CblasColMajor, CblasTrans, CblasTrans,
369  n, m, Alr->rk,
370  (sone), Alr->v, Alr->rkmax,
371  Alr->u, m,
372  (szero), A, lda);
373  }
374  }
375 
376  return ret;
377 }
378 
379 /**
380  *******************************************************************************
381  *
382  * @brief Copy a small low-rank structure into a large one
383  *
384  *******************************************************************************
385  *
386  * @param[in] lowrank
387  * The structure with low-rank parameters.
388  *
389  * @param[in] transAv
390  * @arg PastixNoTrans: (A.v)' is stored transposed as usual
391  * @arg PastixTrans: A.v is stored
392  * @arg PastixTrans: A.v is stored
393  *
394  * @param[in] alpha
395  * The multiplier parameter: B = B + alpha * A
396  *
397  * @param[in] M1
398  * The number of rows of the matrix A.
399  *
400  * @param[in] N1
401  * The number of columns of the matrix A.
402  *
403  * @param[in] A
404  * The low-rank representation of the matrix A.
405  *
406  * @param[in] M2
407  * The number of rows of the matrix B.
408  *
409  * @param[in] N2
410  * The number of columns of the matrix B.
411  *
412  * @param[inout] B
413  * The low-rank representation of the matrix B.
414  *
415  * @param[in] offx
416  * The horizontal offset of A with respect to B.
417  *
418  * @param[in] offy
419  * The vertical offset of A with respect to B.
420  *
421  *******************************************************************************/
422 void
423 core_slrcpy( const pastix_lr_t *lowrank,
424  pastix_trans_t transAv,
425  float alpha,
426  pastix_int_t M1,
427  pastix_int_t N1,
428  const pastix_lrblock_t *A,
429  pastix_int_t M2,
430  pastix_int_t N2,
431  pastix_lrblock_t *B,
432  pastix_int_t offx,
433  pastix_int_t offy )
434 {
435  float *u, *v;
436  pastix_int_t ldau, ldav;
437  int ret;
438 
439  assert( (M1 + offx) <= M2 );
440  assert( (N1 + offy) <= N2 );
441 
442  ldau = (A->rk == -1) ? A->rkmax : M1;
443  ldav = (transAv == PastixNoTrans) ? A->rkmax : N1;
444 
445  core_slrfree( B );
446  core_slralloc( M2, N2, A->rk, B );
447  u = B->u;
448  v = B->v;
449 
450  B->rk = A->rk;
451 
452  if ( A->rk == -1 ) {
453  if ( (M1 != M2) || (N1 != N2) ) {
454  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', M2, N2,
455  0.0, 0.0, u, M2 );
456  assert( ret == 0 );
457  }
458  ret = core_sgeadd( PastixNoTrans, M1, N1,
459  alpha, A->u, ldau,
460  0.0, u + M2 * offy + offx, M2 );
461  assert(ret == 0);
462  }
463  else {
464  if ( M1 != M2 ) {
465  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', M2, B->rk,
466  0.0, 0.0, u, M2 );
467  assert( ret == 0 );
468  }
469  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M1, A->rk,
470  A->u, ldau,
471  u + offx, M2 );
472  assert(ret == 0);
473 
474  if ( N1 != N2 ) {
475  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', B->rk, N2,
476  0.0, 0.0, v, B->rkmax );
477  assert( ret == 0 );
478  }
479  ret = core_sgeadd( transAv, A->rk, N1,
480  alpha, A->v, ldav,
481  0.0, v + B->rkmax * offy, B->rkmax );
482  assert(ret == 0);
483  }
484 
485 #if 0
486  {
487  float *work = malloc( M2 * N2 * sizeof(float) );
488 
489  core_slr2ge( PastixNoTrans, M2, N2, B, work, M2 );
490 
491  lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, -1, M2, N2, work, M2, B );
492 
493  free(work);
494  }
495 #endif
496 
497  (void)lowrank;
498  (void)ret;
499 }
500 
501 
502 /**
503  *******************************************************************************
504  *
505  * @brief Concatenate left parts of two low-rank matrices
506  *
507  *******************************************************************************
508  *
509  * @param[in] alpha
510  * alpha * A is add to B
511  *
512  * @param[in] M1
513  * The number of rows of the matrix A.
514  *
515  * @param[in] N1
516  * The number of columns of the matrix A.
517  *
518  * @param[in] A
519  * The low-rank representation of the matrix A.
520  *
521  * @param[in] M2
522  * The number of rows of the matrix B.
523  *
524  * @param[in] B
525  * The low-rank representation of the matrix B.
526  *
527  * @param[in] offx
528  * The horizontal offset of A with respect to B.
529  *
530  * @param[inout] u1u2
531  * The workspace where matrices are concatenated
532  *
533  *******************************************************************************/
534 void
536  pastix_int_t M1,
537  pastix_int_t N1,
538  const pastix_lrblock_t *A,
539  pastix_int_t M2,
540  pastix_lrblock_t *B,
541  pastix_int_t offx,
542  float *u1u2 )
543 {
544  float *tmp;
545  pastix_int_t i, ret, rank;
546  pastix_int_t ldau, ldbu;
547 
548  rank = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
549  rank += B->rk;
550 
551  ldau = (A->rk == -1) ? A->rkmax : M1;
552  ldbu = M2;
553 
554  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M2, B->rk,
555  B->u, ldbu, u1u2, M2 );
556  assert(ret == 0);
557 
558  tmp = u1u2 + B->rk * M2;
559  if ( A->rk == -1 ) {
560  /*
561  * A is full of rank M1, so A will be integrated into v1v2
562  */
563  if ( M1 < N1 ) {
564  if ( M1 != M2 ) {
565  /* Set to 0 */
566  memset(tmp, 0, M2 * M1 * sizeof(float));
567 
568  /* Set diagonal */
569  tmp += offx;
570  for (i=0; i<M1; i++, tmp += M2+1) {
571  *tmp = 1.0;
572  }
573  }
574  else {
575  assert( offx == 0 );
576  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', M2, M1,
577  0.0, 1.0, tmp, M2 );
578  assert( ret == 0 );
579  }
580  }
581  else {
582  /*
583  * A is full of rank N1, so A is integrated into u1u2
584  */
585  if ( M1 != M2 ) {
586  memset(tmp, 0, M2 * N1 * sizeof(float));
587  }
588  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M1, N1,
589  A->u, ldau, tmp + offx, M2 );
590  assert(ret == 0);
591  }
592  }
593  /*
594  * A is low rank of rank A->rk
595  */
596  else {
597  if ( M1 != M2 ) {
598  memset(tmp, 0, M2 * A->rk * sizeof(float));
599  }
600  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M1, A->rk,
601  A->u, ldau, tmp + offx, M2 );
602  assert(ret == 0);
603  }
604  (void)ret;
605  (void)alpha;
606  (void)rank;
607 }
608 
609 
610 /**
611  *******************************************************************************
612  *
613  * @brief Concatenate right parts of two low-rank matrices
614  *
615  *******************************************************************************
616  *
617  * @param[in] transA1
618  * @arg PastixNoTrans: No transpose, op( A ) = A;
619  * @arg PastixTrans: Transpose, op( A ) = A';
620  *
621  * @param[in] alpha
622  * alpha * A is add to B
623  *
624  * @param[in] M1
625  * The number of rows of the matrix A.
626  *
627  * @param[in] N1
628  * The number of columns of the matrix A.
629  *
630  * @param[in] A
631  * The low-rank representation of the matrix A.
632  *
633  * @param[in] N2
634  * The number of columns of the matrix B.
635  *
636  * @param[in] B
637  * The low-rank representation of the matrix B.
638  *
639  * @param[in] offy
640  * The vertical offset of A with respect to B.
641  *
642  * @param[inout] v1v2
643  * The workspace where matrices are concatenated
644  *
645  *******************************************************************************/
646 void
648  float alpha,
649  pastix_int_t M1,
650  pastix_int_t N1,
651  const pastix_lrblock_t *A,
652  pastix_int_t N2,
653  pastix_lrblock_t *B,
654  pastix_int_t offy,
655  float *v1v2 )
656 {
657  float *tmp;
658  pastix_int_t i, ret, rank;
659  pastix_int_t ldau, ldav, ldbv;
660 
661  rank = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
662  rank += B->rk;
663 
664  ldau = (A->rk == -1) ? A->rkmax : M1;
665  ldav = (transA1 == PastixNoTrans) ? A->rkmax : N1;
666  ldbv = B->rkmax;
667 
668  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', B->rk, N2,
669  B->v, ldbv, v1v2, rank );
670  assert(ret == 0);
671 
672  tmp = v1v2 + B->rk;
673  if ( A->rk == -1 ) {
674  assert( transA1 == PastixNoTrans );
675  /*
676  * A is full of rank M1, so it is integrated into v1v2
677  */
678  if ( M1 < N1 ) {
679  if (N1 != N2) {
680  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', M1, N2,
681  0.0, 0.0, tmp, rank );
682  assert( ret == 0 );
683  }
684  core_sgeadd( PastixNoTrans, M1, N1,
685  alpha, A->u, ldau,
686  0.0, tmp + offy * rank, rank );
687  }
688  /*
689  * A is full of rank N1, so it has been integrated into u1u2
690  */
691  else {
692  if (N1 != N2) {
693  /* Set to 0 */
694  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', N1, N2,
695  0.0, 0.0, tmp, rank );
696  assert(ret == 0);
697 
698  /* Set diagonal */
699  tmp += offy * rank;
700  for (i=0; i<N1; i++, tmp += rank+1) {
701  *tmp = alpha;
702  }
703  }
704  else {
705  assert( offy == 0 );
706  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', N1, N2,
707  0.0, alpha, tmp + offy * rank, rank );
708  assert( ret == 0 );
709  }
710  }
711  }
712  /*
713  * A is low rank of rank A->rk
714  */
715  else {
716  if (N1 != N2) {
717  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', A->rk, N2,
718  0.0, 0.0, tmp, rank );
719  assert(ret == 0);
720  }
721  core_sgeadd( transA1, A->rk, N1,
722  alpha, A->v, ldav,
723  0.0, tmp + offy * rank, rank );
724  }
725  (void)ret;
726  (void)alpha;
727 }
728 
729 /**
730  *******************************************************************************
731  *
732  * @brief Template to convert a full rank matrix into a low rank matrix through
733  * QR decompositions
734  *
735  * This version is only used when permutation method is used. Only difference
736  * from core_sge2lr_qrrt is the V calculation part, That is: Instead of applying
737  * inverse rotation on V, here inverse permutation is applied
738  *
739  *******************************************************************************
740  *
741  * @param[in] rrqrfct
742  * QR decomposition function used to compute the rank revealing
743  * factorization and create the low-rank form of A.
744  *
745  * @param[in] use_reltol
746  * TODO
747  *
748  * @param[in] tol
749  * The tolerance used as a criterion to eliminate information from the
750  * full rank matrix
751  *
752  * @param[in] rklimit
753  * The maximum rank to store the matrix in low-rank format. If
754  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
755  *
756  * @param[in] m
757  * Number of rows of the matrix A, and of the low rank matrix Alr.
758  *
759  * @param[in] n
760  * Number of columns of the matrix A, and of the low rank matrix Alr.
761  *
762  * @param[in] Avoid
763  * The matrix of dimension lda-by-n that needs to be compressed
764  *
765  * @param[in] lda
766  * The leading dimension of the matrix A. lda >= max(1, m)
767  *
768  * @param[out] Alr
769  * The low rank matrix structure that will store the low rank
770  * representation of A
771  *
772  *******************************************************************************
773  *
774  * @return TODO
775  *
776  *******************************************************************************/
779  int use_reltol,
780  pastix_fixdbl_t tol,
781  pastix_int_t rklimit,
782  pastix_int_t m,
783  pastix_int_t n,
784  const void *Avoid,
785  pastix_int_t lda,
786  pastix_lrblock_t *Alr )
787 {
788  int ret, newrk;
789  pastix_int_t nb = 32;
790  float *A = (float*)Avoid;
791  float *Acpy;
792  pastix_int_t lwork;
793  float *work, *tau, zzsize;
794  float *rwork;
795  pastix_int_t *jpvt;
796  pastix_int_t zsize, rsize;
797  float *zwork;
798  pastix_fixdbl_t flops;
799 
800  float norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m, n,
801  A, lda, NULL );
802 
803  if ( (norm == 0.) && (tol >= 0.)) {
804  core_slralloc( m, n, 0, Alr );
805  return 0. ;
806  }
807 
808  /* work */
809  rklimit = ( rklimit < 0 ) ? core_get_rklimit( m, n ) : rklimit;
810  if ( tol < 0. ) {
811  tol = -1.;
812  }
813  else if ( use_reltol ) {
814  tol = tol * norm;
815  }
816 
817  ret = rrqrfct( tol, rklimit, 0, nb,
818  m, n, NULL, m,
819  NULL, NULL,
820  &zzsize, -1, NULL );
821 
822  lwork = (pastix_int_t)zzsize;
823  zsize = lwork;
824  /* Acpy */
825  zsize += m * n;
826  /* tau */
827  zsize += n;
828  /* rwork */
829  rsize = 2 * n;
830 
831 #if defined(PASTIX_DEBUG_LR)
832  zwork = NULL;
833  Acpy = malloc( m * n * sizeof(float) );
834  tau = malloc( n * sizeof(float) );
835  work = malloc( lwork * sizeof(float) );
836  rwork = malloc( rsize * sizeof(float) );
837 #else
838  zwork = malloc( zsize * sizeof(float) + rsize * sizeof(float) );
839  Acpy = zwork;
840  tau = Acpy + m * n;
841  work = tau + n;
842  rwork = (float*)(work + lwork);
843 #endif
844 
845  jpvt = malloc( n * sizeof(pastix_int_t) );
846 
847  /**
848  * Backup A into Acpy to try to compress
849  */
850  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
851  A, lda, Acpy, m );
852  assert(ret == 0);
853 
854  newrk = rrqrfct( tol, rklimit, 0, nb,
855  m, n, Acpy, m,
856  jpvt, tau,
857  work, lwork, rwork );
858  if (newrk == -1) {
859  flops = FLOPS_SGEQRF( m, n );
860  }
861  else {
862  flops = FLOPS_SGEQRF( m, newrk ) + FLOPS_SORMQR( m, n-newrk, newrk, PastixLeft );
863  }
864 
865  /**
866  * It was not interesting to compress, so we restore the dense version in Alr
867  */
868  core_slralloc( m, n, newrk, Alr );
869  Alr->rk = newrk;
870 
871  if ( newrk == -1 ) {
872  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
873  A, lda, Alr->u, Alr->rkmax );
874  assert(ret == 0);
875  }
876  else if ( newrk > 0 ) {
877  /**
878  * We compute U and V
879  */
880  pastix_int_t i;
881  float *U, *V;
882 
883  U = Alr->u;
884  V = Alr->v;
885 
886  /* Compute the final U form */
887  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, Alr->rk,
888  Acpy, m, U, m );
889  assert(ret == 0);
890 
891  ret = LAPACKE_sorgqr_work( LAPACK_COL_MAJOR, m, Alr->rk, Alr->rk,
892  U, m, tau, work, lwork );
893  assert(ret == 0);
894  flops += FLOPS_SORGQR( m, Alr->rk, Alr->rk );
895 
896  /* Compute the final V form */
897  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'L', Alr->rk-1, Alr->rk-1,
898  0.0, 0.0, Acpy + 1, m );
899 
900  for (i=0; i<n; i++){
901  memcpy( V + jpvt[i] * Alr->rk,
902  Acpy + i * m,
903  Alr->rk * sizeof(float) );
904  }
905  }
906 
907 #if defined(PASTIX_DEBUG_LR)
908  if ( Alr->rk > 0 ) {
909  int rc = core_slrdbg_check_orthogonality( m, Alr->rk, Alr->u, m );
910  if (rc == 1) {
911  fprintf(stderr, "Failed to compress a matrix and generate an orthogonal u\n" );
912  }
913  }
914 #endif
915 
916  free( zwork );
917  free( jpvt );
918 #if defined(PASTIX_DEBUG_LR)
919  free( Acpy );
920  free( tau );
921  free( work );
922  free( rwork );
923 #endif
924  (void)ret;
925  return flops;
926 }
927 
928 /**
929  *******************************************************************************
930  *
931  * @brief Template to convert a full rank matrix into a low rank matrix through
932  * QR decompositions
933  *
934  * This version is only used when rotational method is used. Only difference
935  * from core_sge2lr_qr is the V calculation part, That is: Instead of applying
936  * inverse permutation on V, here inverse rotation is applied
937  *
938  *******************************************************************************
939  *
940  * @param[in] rrqrfct
941  * QR decomposition function used to compute the rank revealing
942  * factorization and create the low-rank form of A.
943  *
944  * @param[in] use_reltol
945  * TODO
946  *
947  * @param[in] tol
948  * The tolerance used as a criterion to eliminate information from the
949  * full rank matrix
950  *
951  * @param[in] rklimit
952  * The maximum rank to store the matrix in low-rank format. If
953  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
954  *
955  * @param[in] m
956  * Number of rows of the matrix A, and of the low rank matrix Alr.
957  *
958  * @param[in] n
959  * Number of columns of the matrix A, and of the low rank matrix Alr.
960  *
961  * @param[in] Avoid
962  * The matrix of dimension lda-by-n that needs to be compressed
963  *
964  * @param[in] lda
965  * The leading dimension of the matrix A. lda >= max(1, m)
966  *
967  * @param[out] Alr
968  * The low rank matrix structure that will store the low rank
969  * representation of A
970  *
971  *******************************************************************************
972  *
973  * @return TODO
974  *
975  *******************************************************************************/
978  int use_reltol,
979  pastix_fixdbl_t tol,
980  pastix_int_t rklimit,
981  pastix_int_t m,
982  pastix_int_t n,
983  const void *Avoid,
984  pastix_int_t lda,
985  pastix_lrblock_t *Alr )
986 {
987  int ret, newrk;
988  pastix_int_t nb = 32;
989  float *A = (float*)Avoid;
990  float *Acpy;
991  pastix_int_t lwork;
992  float *work, *tau, *B, *tau_b, zzsize;
993  pastix_int_t *jpvt;
994  pastix_int_t zsize, bsize;
995  float *zwork;
996  pastix_fixdbl_t flops;
997 
998  char trans;
999 #if defined(PRECISION_c) || defined(PRECISION_z)
1000  trans = 'C';
1001 #else
1002  trans = 'T';
1003 #endif
1004 
1005  float norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m, n,
1006  A, lda, NULL );
1007 
1008  if ( (norm == 0.) && (tol >= 0.)) {
1009  core_slralloc( m, n, 0, Alr );
1010  return 0. ;
1011  }
1012 
1013  /* work */
1014  rklimit = ( rklimit < 0 ) ? core_get_rklimit( m, n ) : rklimit;
1015  if ( tol < 0. ) {
1016  tol = -1.;
1017  }
1018  else if ( use_reltol ) {
1019  tol = tol * norm;
1020  }
1021 
1022  ret = rrqrfct( tol, rklimit, nb,
1023  m, n,
1024  NULL, m, NULL,
1025  NULL, n, NULL,
1026  &zzsize, -1, norm );
1027 
1028  lwork = (pastix_int_t)zzsize;
1029  zsize = lwork;
1030  bsize = n * rklimit;
1031  /* Acpy */
1032  zsize += m * n;
1033  /* tau */
1034  zsize += n;
1035  /* B and tau_b */
1036  zsize += bsize + n;
1037 
1038 #if defined(PASTIX_DEBUG_LR)
1039  zwork = NULL;
1040  Acpy = malloc( m * n * sizeof(float) );
1041  tau = malloc( n * sizeof(float) );
1042  B = malloc( bsize * sizeof(float) );
1043  tau_b = malloc( n * sizeof(float) );
1044  work = malloc( lwork * sizeof(float) );
1045 #else
1046  zwork = malloc( zsize * sizeof(float) );
1047  Acpy = zwork;
1048  tau = Acpy + m * n;
1049  B = tau + n;
1050  tau_b = B + bsize;
1051  work = tau_b + n;
1052 #endif
1053 
1054  jpvt = malloc( n * sizeof(pastix_int_t) );
1055 
1056  /**
1057  * Backup A into Acpy to try to compress
1058  */
1059  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
1060  A, lda, Acpy, m );
1061  assert(ret == 0);
1062 
1063  newrk = rrqrfct( tol, rklimit, nb,
1064  m, n,
1065  Acpy, m, tau,
1066  B, n, tau_b,
1067  work, lwork, norm );
1068  if (newrk == -1) {
1069  flops = FLOPS_SGEQRF( m, n );
1070  }
1071  else {
1072  flops = FLOPS_SGEQRF( m, newrk ) + FLOPS_SORMQR( m, n-newrk, newrk, PastixLeft );
1073  }
1074 
1075  /**
1076  * It was not interesting to compress, so we restore the dense version in Alr
1077  */
1078  core_slralloc( m, n, newrk, Alr );
1079  Alr->rk = newrk;
1080 
1081  if ( newrk == -1 ) {
1082  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
1083  A, lda, Alr->u, Alr->rkmax );
1084  assert(ret == 0);
1085  }
1086  else if ( newrk > 0 ) {
1087  /**
1088  * We compute U and V
1089  */
1090  float *U, *V;
1091  pastix_int_t d, rk = 0;
1092 
1093  U = Alr->u;
1094  V = Alr->v;
1095 
1096  /* Compute the final U form */
1097  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, Alr->rk,
1098  Acpy, m, U, m );
1099  assert(ret == 0);
1100 
1101  ret = LAPACKE_sorgqr_work( LAPACK_COL_MAJOR, m, Alr->rk, Alr->rk,
1102  U, m, tau, work, lwork );
1103  assert(ret == 0);
1104  flops += FLOPS_SORGQR( m, Alr->rk, Alr->rk );
1105 
1106  /* Compute the final V form */
1107  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'U', Alr->rk, n,
1108  Acpy, m, V, Alr->rk );
1109  assert(ret == 0);
1110  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'L', Alr->rk-1, Alr->rk-1,
1111  0.0, 0.0, V + 1, Alr->rk );
1112  assert(ret == 0);
1113  /*
1114  * Apply inverse rotations to V^T
1115  */
1116  {
1117  /*
1118  * Householders are applied in the reverse order of before
1119  */
1120  rk = (Alr->rk / nb) * nb;
1121  while( rk >= 0 ) {
1122  d = pastix_imin( nb, Alr->rk - rk );
1123  ret = LAPACKE_sormqr_work( LAPACK_COL_MAJOR, 'R', trans,
1124  Alr->rk - rk, n - rk, d,
1125  B + rk * n + rk, n, tau_b + rk,
1126  V + rk * Alr->rk + rk, Alr->rk,
1127  work, lwork );
1128  assert(ret == 0);
1129  rk -= nb;
1130  }
1131  }
1132  }
1133 
1134 #if defined(PASTIX_DEBUG_LR)
1135  if ( Alr->rk > 0 ) {
1136  int rc = core_slrdbg_check_orthogonality( m, Alr->rk, Alr->u, m );
1137  if (rc == 1) {
1138  fprintf(stderr, "Failed to compress a matrix and generate an orthogonal u\n" );
1139  }
1140  }
1141 #endif
1142 
1143  free( zwork );
1144  free( jpvt );
1145 #if defined(PASTIX_DEBUG_LR)
1146  free( Acpy );
1147  free( tau );
1148  free( B );
1149  free( tau_b );
1150  free( work );
1151 #endif
1152  (void)ret;
1153  return flops;
1154 }
1155 
1156 /**
1157  *******************************************************************************
1158  *
1159  * @brief Template to perform the addition of two low-rank structures with
1160  * compression kernel based on QR decomposition.
1161  *
1162  * Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
1163  *
1164  * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
1165  * Orthogonalize (u2 u1) = (u2, u1 - u2(u2^T u1)) * (I u2^T u1)
1166  * (0 I )
1167  * Compute Rank Revealing QR decomposition of (I u2^T u1) * (v2 v1)^T
1168  * (0 I )
1169  * Any QR rank revealing kernel can be used for the recompression of the V part.
1170  *
1171  *******************************************************************************
1172  *
1173  * @param[in] rrqrfct
1174  * QR decomposition function used to compute the rank revealing
1175  * factorization of the sum of the two low-rank matrices.
1176  *
1177  * @param[in] lowrank
1178  * The structure with low-rank parameters.
1179  *
1180  * @param[in] transA1
1181  * @arg PastixNoTrans: No transpose, op( A ) = A;
1182  * @arg PastixTrans: Transpose, op( A ) = A';
1183  *
1184  * @param[in] alphaptr
1185  * alpha * A is add to B
1186  *
1187  * @param[in] M1
1188  * The number of rows of the matrix A.
1189  *
1190  * @param[in] N1
1191  * The number of columns of the matrix A.
1192  *
1193  * @param[in] A
1194  * The low-rank representation of the matrix A.
1195  *
1196  * @param[in] M2
1197  * The number of rows of the matrix B.
1198  *
1199  * @param[in] N2
1200  * The number of columns of the matrix B.
1201  *
1202  * @param[in] B
1203  * The low-rank representation of the matrix B.
1204  *
1205  * @param[in] offx
1206  * The horizontal offset of A with respect to B.
1207  *
1208  * @param[in] offy
1209  * The vertical offset of A with respect to B.
1210  *
1211  *******************************************************************************
1212  *
1213  * @return The new rank of u2 v2^T or -1 if ranks are too large for
1214  * recompression
1215  *
1216  *******************************************************************************/
1219  const pastix_lr_t *lowrank,
1220  pastix_trans_t transA1,
1221  const void *alphaptr,
1222  pastix_int_t M1,
1223  pastix_int_t N1,
1224  const pastix_lrblock_t *A,
1225  pastix_int_t M2,
1226  pastix_int_t N2,
1227  pastix_lrblock_t *B,
1228  pastix_int_t offx,
1229  pastix_int_t offy )
1230 {
1231  pastix_int_t rankA, rank, M, N, minV;
1232  pastix_int_t i, ret, new_rank, rklimit;
1233  pastix_int_t ldau, ldav, ldbu, ldbv, ldu, ldv;
1234  float *u1u2, *v1v2, *u;
1235  float *zbuf, *tauV;
1236  size_t wzsize;
1237  float tol = lowrank->tolerance;
1238 
1239  /* PQRCP parameters / workspace */
1240  pastix_int_t nb = 32;
1241  pastix_int_t lwork;
1242  pastix_int_t *jpvt;
1243  float *zwork, zzsize;
1244  float *rwork;
1245  float alpha = *((float*)alphaptr);
1246  pastix_fixdbl_t flops, total_flops = 0.;
1247 
1248 #if defined(PASTIX_DEBUG_LR)
1249  if ( B->rk > 0 ) {
1250  int rc = core_slrdbg_check_orthogonality( M2, B->rk, B->u, M2 );
1251  if (rc == 1) {
1252  fprintf(stderr, "Failed to have B->u orthogonal in entry of rradd\n" );
1253  }
1254  }
1255 #endif
1256 
1257  rankA = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
1258  rank = rankA + B->rk;
1259  M = pastix_imax(M2, M1);
1260  N = pastix_imax(N2, N1);
1261 
1262  minV = pastix_imin(N, rank);
1263 
1264  assert(M2 == M && N2 == N);
1265  assert(B->rk != -1);
1266 
1267  assert( A->rk <= A->rkmax);
1268  assert( B->rk <= B->rkmax);
1269 
1270  if ( ((M1 + offx) > M2) ||
1271  ((N1 + offy) > N2) )
1272  {
1273  pastix_print_error( "Dimensions are not correct" );
1274  assert(0 /* Incorrect dimensions */);
1275  return total_flops;
1276  }
1277 
1278  /*
1279  * A is rank null, nothing to do
1280  */
1281  if ( A->rk == 0 ) {
1282  return total_flops;
1283  }
1284 
1285  /*
1286  * Let's handle case where B is a null matrix
1287  * B = alpha A
1288  */
1289  if ( B->rk == 0 ) {
1290  core_slrcpy( lowrank, transA1, alpha,
1291  M1, N1, A, M2, N2, B,
1292  offx, offy );
1293  return total_flops;
1294  }
1295 
1296  /*
1297  * The rank is too big, let's try to compress
1298  */
1299  if ( rank > pastix_imin( M, N ) ) {
1300  assert(0);
1301  }
1302 
1303  /*
1304  * Let's define leading dimensions
1305  */
1306  ldau = (A->rk == -1) ? A->rkmax : M1;
1307  ldav = (transA1 == PastixNoTrans) ? A->rkmax : N1;
1308  ldbu = M;
1309  ldbv = B->rkmax;
1310  ldu = M;
1311  ldv = rank;
1312 
1313  /*
1314  * Let's compute the size of the workspace
1315  */
1316  /* u1u2 and v1v2 */
1317  wzsize = (M+N) * rank;
1318  /* tauV */
1319  wzsize += minV;
1320 
1321  /* RRQR workspaces */
1322  rklimit = pastix_imin( rank, core_get_rklimit( M, N ) );
1323  rrqrfct( tol, rklimit, 1, nb,
1324  rank, N, NULL, ldv,
1325  NULL, NULL,
1326  &zzsize, -1, NULL );
1327  lwork = (pastix_int_t)(zzsize);
1328  wzsize += lwork;
1329 
1330 #if defined(PASTIX_DEBUG_LR)
1331  zbuf = NULL;
1332  u1u2 = malloc( ldu * rank * sizeof(float) );
1333  v1v2 = malloc( ldv * N * sizeof(float) );
1334  tauV = malloc( rank * sizeof(float) );
1335  zwork = malloc( lwork * sizeof(float) );
1336 
1337  rwork = malloc( 2 * pastix_imax( rank, N ) * sizeof(float) );
1338 #else
1339  zbuf = malloc( wzsize * sizeof(float) + 2 * pastix_imax(rank, N) * sizeof(float) );
1340 
1341  u1u2 = zbuf;
1342  v1v2 = u1u2 + ldu * rank;
1343  tauV = v1v2 + ldv * N;
1344  zwork = tauV + rank;
1345 
1346  rwork = (float*)(zwork + lwork);
1347 #endif
1348 
1349  /*
1350  * Concatenate U2 and U1 in u1u2
1351  * [ u2 0 ]
1352  * [ u2 u1 ]
1353  * [ u2 0 ]
1354  */
1355  core_slrconcatenate_u( alpha,
1356  M1, N1, A,
1357  M2, B,
1358  offx, u1u2 );
1359 
1360  /*
1361  * Concatenate V2 and V1 in v1v2
1362  * [ v2^h v2^h v2^h ]
1363  * [ 0 v1^h 0 ]
1364  */
1365  core_slrconcatenate_v( transA1, alpha,
1366  M1, N1, A,
1367  N2, B,
1368  offy, v1v2 );
1369 
1370  /*
1371  * Orthogonalize [u2, u1]
1372  * [u2, u1] = [u2, u1 - u2(u2Tu1)] * (I u2Tu1)
1373  * (0 I )
1374  */
1375 
1376  /* We do not care is A was integrated into v1v2 */
1377  if (rankA != 0) {
1378 
1379  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_orthogonalize );
1380  switch ( pastix_lr_ortho ) {
1381  case PastixCompressOrthoQR:
1382  flops = core_slrorthu_fullqr( M, N, B->rk + rankA,
1383  u1u2, ldu, v1v2, ldv );
1384  break;
1385 
1387  flops = core_slrorthu_partialqr( M, N, B->rk, &rankA, offx, offy,
1388  u1u2, ldu, v1v2, ldv );
1389  break;
1390 
1392  default:
1393  flops = core_slrorthu_cgs( M2, N2, M1, N1, B->rk, &rankA, offx, offy,
1394  u1u2, ldu, v1v2, ldv );
1395  }
1396  kernel_trace_stop_lvl2( flops );
1397 
1398  total_flops += flops;
1399  }
1400 
1401  rank = B->rk + rankA;
1402 
1403  if (rankA == 0) {
1404  /*
1405  * The final B += A fit in B
1406  * Lets copy and return
1407  */
1408  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', M, B->rk, u1u2, ldu, B->u, ldbu );
1409  assert( ret == 0 );
1410  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', B->rk, N, v1v2, ldv, B->v, ldbv );
1411  assert( ret == 0 );
1412 
1413  free(zbuf);
1414 #if defined(PASTIX_DEBUG_LR)
1415  free( u1u2 );
1416  free( v1v2 );
1417  free( tauV );
1418  free( zwork );
1419  free( rwork );
1420 #endif
1421  return total_flops;
1422  }
1423 
1424  MALLOC_INTERN( jpvt, pastix_imax(rank, N), pastix_int_t );
1425 
1426  if ( lowrank->use_reltol ) {
1427  /**
1428  * In relative tolerance, we can choose two solutions:
1429  * 1) The first one, more conservative, is to compress relatively to
1430  * the norm of the final matrix \f$ \alpha A + B \f$. In this kernel, we
1431  * exploit the fact that the V part contains all the information while
1432  * the U part is orthonormal, and compute it as follow:
1433  *
1434  * float norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', rank, N,
1435  * v1v2, ldv, NULL );
1436  * tol = tol * norm;
1437  *
1438  * 2) The second solution, less conservative, will allow to reduce the
1439  * rank more efficiently. Since A and B have been compressed relatively
1440  * to their respective norms, there is no reason to compress the sum
1441  * relatively to its own norm, but it is more reasonable to compress it
1442  * relatively to the norm of A and B. For example, A-B would be full
1443  * with the first criterion, and rank null with the second.
1444  * Note that here, we can only have an estimation that once again
1445  * reduces the conservation of the criterion.
1446  * \f[ || \alpha A + B || <= |\alpha| ||A|| + ||B|| <= |\alpha| ||U_aV_a|| + ||U_bV_b|| \f]
1447  *
1448  */
1449  float normA, normB;
1450  normA = core_slrnrm( PastixFrobeniusNorm, transA1, M1, N1, A );
1451  normB = core_slrnrm( PastixFrobeniusNorm, PastixNoTrans, M2, N2, B );
1452  tol = tol * ( fabsf(alpha) * normA + normB );
1453  }
1454 
1455  /*
1456  * Perform RRQR factorization on (I u2Tu1) v1v2 = (Q2 R2)
1457  * (0 I )
1458  */
1459  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_recompression );
1460  rklimit = pastix_imin( rklimit, rank );
1461  new_rank = rrqrfct( tol, rklimit, 1, nb,
1462  rank, N, v1v2, ldv,
1463  jpvt, tauV,
1464  zwork, lwork, rwork );
1465  flops = (new_rank == -1) ? FLOPS_SGEQRF( rank, N )
1466  : (FLOPS_SGEQRF( rank, new_rank ) +
1467  FLOPS_SORMQR( rank, N-new_rank, new_rank, PastixLeft ));
1468  kernel_trace_stop_lvl2_rank( flops, new_rank );
1469  total_flops += flops;
1470 
1471  /*
1472  * First case: The rank is too big, so we decide to uncompress the result
1473  */
1474  if ( (new_rank > rklimit) ||
1475  (new_rank == -1) )
1476  {
1477  pastix_lrblock_t Bbackup = *B;
1478 
1479  core_slralloc( M, N, -1, B );
1480  u = B->u;
1481 
1482  /* Uncompress B */
1483  flops = FLOPS_SGEMM( M, N, Bbackup.rk );
1484  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
1485  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
1486  M, N, Bbackup.rk,
1487  (sone), Bbackup.u, ldbu,
1488  Bbackup.v, ldbv,
1489  (szero), u, M );
1490  kernel_trace_stop_lvl2( flops );
1491  total_flops += flops;
1492 
1493  /* Add A into it */
1494  if ( A->rk == -1 ) {
1495  flops = 2 * M1 * N1;
1496  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
1497  core_sgeadd( transA1, M1, N1,
1498  alpha, A->u, ldau,
1499  sone, u + offy * M + offx, M);
1500  kernel_trace_stop_lvl2( flops );
1501  }
1502  else {
1503  flops = FLOPS_SGEMM( M1, N1, A->rk );
1504  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
1505  cblas_sgemm(CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transA1,
1506  M1, N1, A->rk,
1507  (alpha), A->u, ldau,
1508  A->v, ldav,
1509  (sone), u + offy * M + offx, M);
1510  kernel_trace_stop_lvl2( flops );
1511  }
1512  total_flops += flops;
1513  core_slrfree(&Bbackup);
1514  free( zbuf );
1515  free( jpvt );
1516 #if defined(PASTIX_DEBUG_LR)
1517  free( u1u2 );
1518  free( v1v2 );
1519  free( tauV );
1520  free( zwork );
1521  free( rwork );
1522 #endif
1523  return total_flops;
1524  }
1525  else if ( new_rank == 0 ) {
1526  core_slrfree(B);
1527  free( zbuf );
1528  free( jpvt );
1529 #if defined(PASTIX_DEBUG_LR)
1530  free( u1u2 );
1531  free( v1v2 );
1532  free( tauV );
1533  free( zwork );
1534  free( rwork );
1535 #endif
1536  return total_flops;
1537  }
1538 
1539  /*
1540  * We need to reallocate the buffer to store the new compressed version of B
1541  * because it wasn't big enough
1542  */
1543  ret = core_slrsze( 0, M, N, B, new_rank, -1, -1 );
1544  assert( ret != -1 );
1545  assert( B->rkmax >= new_rank );
1546  assert( B->rkmax >= B->rk );
1547 
1548  ldbv = B->rkmax;
1549 
1550  /* B->v = P v1v2 */
1551  {
1552  float *tmpV;
1553  pastix_int_t lm;
1554 
1555  memset(B->v, 0, N * ldbv * sizeof(float));
1556  tmpV = B->v;
1557  for (i=0; i<N; i++){
1558  lm = pastix_imin( new_rank, i+1 );
1559  memcpy(tmpV + jpvt[i] * ldbv,
1560  v1v2 + i * ldv,
1561  lm * sizeof(float));
1562  }
1563  }
1564 
1565  /* Compute Q2 factor */
1566  {
1567  flops = FLOPS_SORGQR( rank, new_rank, new_rank )
1568  + FLOPS_SGEMM( M, new_rank, rank );
1569 
1570  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_computeNewU );
1571  ret = LAPACKE_sorgqr_work( LAPACK_COL_MAJOR, rank, new_rank, new_rank,
1572  v1v2, ldv, tauV, zwork, lwork );
1573  assert(ret == 0);
1574 
1575  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
1576  M, new_rank, rank,
1577  (sone), u1u2, ldu,
1578  v1v2, ldv,
1579  (szero), B->u, ldbu);
1580  kernel_trace_stop_lvl2( flops );
1581  total_flops += flops;
1582  }
1583 
1584  free( zbuf );
1585  free( jpvt );
1586 
1587 #if defined(PASTIX_DEBUG_LR)
1588  free( u1u2 );
1589  free( v1v2 );
1590  free( tauV );
1591  free( zwork );
1592  free( rwork );
1593 #endif
1594 
1595  (void)ret;
1596  return total_flops;
1597 }
1598 
1599 /**
1600  *******************************************************************************
1601  *
1602  * @brief Compute the size of a block to send in LR
1603  *
1604  *******************************************************************************
1605  *
1606  * @param[in] M
1607  * The number of rows of the matrix A.
1608  *
1609  * @param[in] N
1610  * The number of columns of the matrix A.
1611  *
1612  * @param[in] A
1613  * The low-rank representation of the matrix A.
1614  *
1615  *******************************************************************************
1616  *
1617  * @return Size of a block to send in LR
1618  *
1619  *******************************************************************************/
1620 size_t
1622  pastix_int_t N,
1623  pastix_lrblock_t *A )
1624 {
1625  if ( A->rk != -1 ) {
1626  return A->rk * ( M + N );
1627  }
1628  else {
1629  return M * N;
1630  }
1631 }
1632 
1633 /**
1634  *******************************************************************************
1635  *
1636  * @brief Pack low-rank data by side
1637  *
1638  *******************************************************************************
1639  *
1640  * @param[in] M
1641  * The number of rows of the matrix A.
1642  *
1643  * @param[in] N
1644  * The number of columns of the matrix A.
1645  *
1646  * @param[in] A
1647  * The low-rank representation of the matrix A.
1648  *
1649  * @param[inout] buffer
1650  * Pointer on packed data
1651  *
1652  *******************************************************************************
1653  *
1654  * @return Pointer on packed data shifted to the next block
1655  *
1656  *******************************************************************************/
1657 char *
1659  pastix_int_t N,
1660  const pastix_lrblock_t *A,
1661  char *buffer )
1662 {
1663  int rk = A->rk;
1664  int rkmax = A->rkmax;
1665  void *u = A->u;
1666  void *v = A->v;
1667  int ret;
1668 
1669  /* Store the rank */
1670  memcpy( buffer, &rk, sizeof( int ) );
1671  buffer += sizeof( int );
1672 
1673  if ( rk != -1 ) {
1674  /* Pack the u part */
1675  memcpy( buffer, u, rk * M * sizeof( float ) );
1676  buffer += rk * M * sizeof( float );
1677 
1678  /* Pack the v part */
1679  if ( rk == rkmax ) {
1680  memcpy( buffer, v, rk * N * sizeof( float ) );
1681  buffer += rk * N * sizeof( float );
1682  }
1683  else {
1684  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', rk, N, v, rkmax,
1685  (float *)buffer, rk );
1686  assert( ret == 0 );
1687  buffer += rk * N * sizeof( float );
1688  }
1689  }
1690  else {
1691  memcpy( buffer, u, M * N * sizeof( float ) );
1692  buffer += M * N * sizeof( float );
1693  }
1694 
1695  (void)ret;
1696 
1697  return buffer;
1698 }
1699 
1700 /**
1701  *******************************************************************************
1702  *
1703  * @brief Unpack low rank data and fill the cblk concerned by the computation
1704  *
1705  *******************************************************************************
1706  *
1707  * @param[in] M
1708  * The number of rows of the matrix A.
1709  *
1710  * @param[in] N
1711  * The number of columns of the matrix A.
1712  *
1713  * @param[in] A
1714  * The low-rank representation of the matrix A.
1715  *
1716  * @param[inout] buffer
1717  * Pointer on packed data
1718  *
1719  *******************************************************************************
1720  *
1721  * @return Pointer on packed data shifted to the next block
1722  *
1723  *******************************************************************************/
1724 char *
1726  pastix_int_t N,
1727  pastix_lrblock_t *A,
1728  char *buffer )
1729 {
1730  int rk;
1731  memcpy( &rk, buffer, sizeof( int ) );
1732  buffer += sizeof( int );
1733 
1734  /* Make sure A can store the unpacked values */
1735  core_slrsze( 0, M, N, A, rk, rk, rk );
1736 
1737  if ( rk != -1 ) {
1738  /* Unpack U */
1739  memcpy( A->u, buffer, M * rk * sizeof( float ) );
1740  buffer += M * rk * sizeof( float );
1741 
1742  /* Unpack V */
1743  memcpy( A->v, buffer, N * rk * sizeof( float ) );
1744  buffer += N * rk * sizeof( float );
1745  }
1746  else {
1747  /* Unpack the full block */
1748  memcpy( A->u, buffer, M * N * sizeof( float ) );
1749  buffer += M * N * sizeof( float );
1750  }
1751 
1752  return buffer;
1753 }
1754 
1755 /**
1756  *******************************************************************************
1757  *
1758  * @brief Unpack low rank data and fill the cblk concerned by the computation
1759  *
1760  *******************************************************************************
1761  *
1762  * @param[in] M
1763  * The number of rows of the matrix A.
1764  *
1765  * @param[in] N
1766  * The number of columns of the matrix A.
1767  *
1768  * @param[in] A
1769  * The low-rank representation of the matrix A.
1770  *
1771  * @param[inout] input
1772  * TODO
1773  *
1774  * @param[inout] outptr
1775  * TODO
1776  *
1777  *******************************************************************************
1778  *
1779  * @return Pointer on packed data shifted to the next block
1780  *
1781  *******************************************************************************/
1782 const char *
1784  pastix_int_t N,
1785  pastix_lrblock_t *A,
1786  const char *input,
1787  char **outptr )
1788 {
1789  char *output = *outptr;
1790  size_t size;
1791  int rk;
1792 
1793  rk = *((int *)input);
1794  input += sizeof( int );
1795 
1796  if ( rk != -1 ) {
1797  A->rk = rk;
1798  A->rkmax = rk;
1799 
1800  /* Unpack U */
1801  size = M * rk * sizeof( float );
1802  A->u = output;
1803 
1804  memcpy( A->u, input, size );
1805  input += size;
1806  output += size;
1807 
1808  /* Unpack V */
1809  size = N * rk * sizeof( float );
1810  A->v = output;
1811 
1812  memcpy( A->v, input, size );
1813  input += size;
1814  output += size;
1815  }
1816  else {
1817  A->rk = -1;
1818  A->rkmax = M;
1819  A->v = NULL;
1820 
1821  /* Unpack the full block */
1822  size = M * N * sizeof( float );
1823  A->u = output;
1824 
1825  memcpy( A->u, input, size );
1826  input += size;
1827  output += size;
1828  }
1829 
1830  *outptr = output;
1831  return input;
1832 }
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
@ PastixKernelLvl2_LR_add2C_rradd_orthogonalize
pastix_fixdbl_t core_slrorthu_fullqr(pastix_int_t M, pastix_int_t N, pastix_int_t rank, float *U, pastix_int_t ldu, float *V, pastix_int_t ldv)
Try to orthognalize the u part of the low-rank form, and update the v part accordingly using full QR.
Definition: core_slrothu.c:83
pastix_fixdbl_t core_slrorthu_cgs(pastix_int_t M1, pastix_int_t N1, pastix_int_t M2, pastix_int_t N2, pastix_int_t r1, pastix_int_t *r2ptr, pastix_int_t offx, pastix_int_t offy, float *U, pastix_int_t ldu, float *V, pastix_int_t ldv)
Try to orthognalize the U part of the low-rank form, and update the V part accordingly using CGS.
Definition: core_slrothu.c:430
int core_sgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, float alpha, const float *A, pastix_int_t LDA, float beta, float *B, pastix_int_t LDB)
Add two matrices together.
Definition: core_sgeadd.c:78
void core_sgetmo(int m, int n, const float *A, int lda, float *B, int ldb)
Transposes a m-by-n matrix out of place using an extra workspace of size m-by-n.
Definition: core_sgetmo.c:49
pastix_fixdbl_t core_slrorthu_partialqr(pastix_int_t M, pastix_int_t N, pastix_int_t r1, pastix_int_t *r2ptr, pastix_int_t offx, pastix_int_t offy, float *U, pastix_int_t ldu, float *V, pastix_int_t ldv)
Try to orthognalize the U part of the low-rank form, and update the V part accordingly using partial ...
Definition: core_slrothu.c:197
int core_slrdbg_check_orthogonality(pastix_int_t M, pastix_int_t N, const float *A, pastix_int_t lda)
Check the orthogonality of the matrix A.
Definition: core_slrdbg.c:101
double tolerance
fct_ge2lr_t core_ge2lr
pastix_int_t pastix_lr_ortho
Define the orthogonalization method.
Definition: lowrank.c:25
pastix_int_t(* core_get_rklimit)(pastix_int_t, pastix_int_t)
Compute the maximal rank accepted for a given matrix size. The pointer is set according to the low-ra...
Definition: kernels_trace.c:46
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
pastix_fixdbl_t core_sge2lr_qrrt(core_srrqr_rt_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
pastix_fixdbl_t core_sge2lr_qrcp(core_srrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
int(* core_srrqr_rt_t)(float tol, pastix_int_t maxrank, pastix_int_t nb, pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, float *tau, float *B, pastix_int_t ldb, float *tau_b, float *work, pastix_int_t lwork, float normA)
TODO.
int(* core_srrqr_cp_t)(float tol, pastix_int_t maxrank, int refine, pastix_int_t nb, pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, pastix_int_t *jpvt, float *tau, float *work, pastix_int_t lwork, float *rwork)
TODO.
pastix_fixdbl_t core_srradd_qr(core_srrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
int core_slr2ge(pastix_trans_t trans, pastix_int_t m, pastix_int_t n, const pastix_lrblock_t *Alr, float *A, pastix_int_t lda)
Convert a low rank matrix into a dense matrix.
int core_slrsze(int copy, pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, pastix_int_t newrk, pastix_int_t newrkmax, pastix_int_t rklimit)
Resize a low-rank matrix.
const char * core_slrunpack2(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, const char *input, char **outptr)
Unpack low rank data and fill the cblk concerned by the computation.
char * core_slrpack(pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A, char *buffer)
Pack low-rank data by side.
void core_slralloc(pastix_int_t M, pastix_int_t N, pastix_int_t rkmax, pastix_lrblock_t *A)
Allocate a low-rank matrix.
Definition: core_sgelrops.c:56
void core_slrconcatenate_v(pastix_trans_t transA1, float alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offy, float *v1v2)
Concatenate right parts of two low-rank matrices.
void core_slrcpy(const pastix_lr_t *lowrank, pastix_trans_t transAv, float alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Copy a small low-rank structure into a large one.
size_t core_slrgetsize(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A)
Compute the size of a block to send in LR.
void core_slrconcatenate_u(float alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_lrblock_t *B, pastix_int_t offx, float *u1u2)
Concatenate left parts of two low-rank matrices.
void core_slrfree(pastix_lrblock_t *A)
Free a low-rank matrix.
float core_slrnrm(pastix_normtype_t ntype, int transV, pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A)
Compute the norm of a low-rank matrix.
Definition: core_slrnrm.c:48
char * core_slrunpack(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, char *buffer)
Unpack low rank data and fill the cblk concerned by the computation.
enum pastix_trans_e pastix_trans_t
Transpostion.
@ PastixFrobeniusNorm
Definition: api.h:504
@ PastixCompressOrthoQR
Definition: api.h:408
@ PastixCompressOrthoPartialQR
Definition: api.h:409
@ PastixCompressOrthoCGS
Definition: api.h:407
@ PastixLeft
Definition: api.h:495
@ PastixNoTrans
Definition: api.h:445
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...