PaStiX Handbook  6.3.2
core_dgelrops.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_dgelrops.c
4  *
5  * PaStiX low-rank kernel routines
6  *
7  * @copyright 2016-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.3.2
11  * @author Gregoire Pichon
12  * @author Esragul Korkmaz
13  * @author Mathieu Faverge
14  * @author Pierre Ramet
15  * @author Nolan Bredel
16  * @date 2023-07-21
17  * @generated from /builds/solverstack/pastix/kernels/core_zgelrops.c, normal z -> d, Wed Dec 13 12:09:13 2023
18  *
19  **/
20 #include "common.h"
21 #include <cblas.h>
22 #include <lapacke.h>
23 #include "blend/solver.h"
24 #include "pastix_dcores.h"
25 #include "pastix_dlrcores.h"
26 #include "d_nan_check.h"
27 #include "kernels_trace.h"
28 
29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
30 static double done = 1.0;
31 static double dzero = 0.0;
32 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
33 
34 /**
35  *******************************************************************************
36  *
37  * @brief Allocate a low-rank matrix.
38  *
39  *******************************************************************************
40  *
41  * @param[in] M
42  * Number of rows of the matrix A.
43  *
44  * @param[in] N
45  * Number of columns of the matrix A.
46  *
47  * @param[in] rkmax
48  * @arg -1: the matrix is allocated tight to its rank.
49  * @arg >0: the matrix is allocated to the minimum of rkmax and its maximum rank.
50  *
51  * @param[out] A
52  * The allocated low-rank matrix
53  *
54  *******************************************************************************/
55 void
57  pastix_int_t N,
58  pastix_int_t rkmax,
59  pastix_lrblock_t *A )
60 {
61  double *u, *v;
62 
63  if ( rkmax == -1 ) {
64  u = malloc( M * N * sizeof(double) );
65  memset( u, 0, M * N * sizeof(double) );
66  A->rk = -1;
67  A->rkmax = M;
68  A->u = u;
69  A->v = NULL;
70  }
71  else if ( rkmax == 0 ) {
72  A->rk = 0;
73  A->rkmax = 0;
74  A->u = NULL;
75  A->v = NULL;
76  }
77  else {
78  pastix_int_t rk = pastix_imin( M, N );
79  rkmax = pastix_imin( rkmax, rk );
80 
81 #if defined(PASTIX_DEBUG_LR)
82  u = malloc( M * rkmax * sizeof(double) );
83  v = malloc( N * rkmax * sizeof(double) );
84 
85  /* To avoid uninitialised values in valgrind. Lapacke doc (xgesvd) is not correct */
86  memset(u, 0, M * rkmax * sizeof(double));
87  memset(v, 0, N * rkmax * sizeof(double));
88 #else
89  u = malloc( (M+N) * rkmax * sizeof(double));
90 
91  /* To avoid uninitialised values in valgrind. Lapacke doc (xgesvd) is not correct */
92  memset(u, 0, (M+N) * rkmax * sizeof(double));
93 
94  v = u + M * rkmax;
95 #endif
96 
97  A->rk = 0;
98  A->rkmax = rkmax;
99  A->u = u;
100  A->v = v;
101  }
102 }
103 
104 /**
105  *******************************************************************************
106  *
107  * @brief Free a low-rank matrix.
108  *
109  *******************************************************************************
110  *
111  * @param[inout] A
112  * The low-rank matrix that will be desallocated.
113  *
114  *******************************************************************************/
115 void
117 {
118  if ( A->rk == -1 ) {
119  free(A->u);
120  A->u = NULL;
121  }
122  else {
123  free(A->u);
124 #if defined(PASTIX_DEBUG_LR)
125  free(A->v);
126 #endif
127  A->u = NULL;
128  A->v = NULL;
129  }
130  A->rk = 0;
131  A->rkmax = 0;
132 }
133 
134 /**
135  *******************************************************************************
136  *
137  * @brief Resize a low-rank matrix
138  *
139  *******************************************************************************
140  *
141  * @param[in] copy
142  * Enable/disable the copy of the data from A->u and A->v into the new
143  * low-rank representation.
144  *
145  * @param[in] M
146  * The number of rows of the matrix A.
147  *
148  * @param[in] N
149  * The number of columns of the matrix A.
150  *
151  * @param[inout] A
152  * The low-rank representation of the matrix. At exit, this structure
153  * is modified with the new low-rank representation of A, is the rank
154  * is small enough
155  *
156  * @param[in] newrk
157  * The new rank of the matrix A.
158  *
159  * @param[in] newrkmax
160  * The new maximum rank of the matrix A. Useful if the low-rank
161  * structure was allocated with more data than the rank.
162  *
163  * @param[in] rklimit
164  * The maximum rank to store the matrix in low-rank format. If
165  * -1, set to core_get_rklimit(M, N)
166  *
167  *******************************************************************************
168  *
169  * @return The new rank of A
170  *
171  *******************************************************************************/
172 int
173 core_dlrsze( int copy,
174  pastix_int_t M,
175  pastix_int_t N,
176  pastix_lrblock_t *A,
177  pastix_int_t newrk,
178  pastix_int_t newrkmax,
179  pastix_int_t rklimit )
180 {
181  /* If no limit on the rank is given, let's take min(M, N) */
182  rklimit = (rklimit == -1) ? core_get_rklimit( M, N ) : rklimit;
183 
184  /* If no extra memory allocated, let's fix rkmax to rk */
185  newrkmax = (newrkmax == -1) ? newrk : newrkmax;
186  newrkmax = pastix_imax( newrkmax, newrk );
187 
188  /*
189  * It is not interesting to compress, so we alloc space to store the full matrix
190  */
191  if ( (newrk > rklimit) || (newrk == -1) )
192  {
193  A->u = realloc( A->u, M * N * sizeof(double) );
194 #if defined(PASTIX_DEBUG_LR)
195  free(A->v);
196 #endif
197  A->v = NULL;
198  A->rk = -1;
199  A->rkmax = M;
200  return -1;
201  }
202  /*
203  * The rank is null, we free everything
204  */
205  else if (newrkmax == 0)
206  {
207  /*
208  * The rank is null, we free everything
209  */
210  free(A->u);
211 #if defined(PASTIX_DEBUG_LR)
212  free(A->v);
213 #endif
214  A->u = NULL;
215  A->v = NULL;
216  A->rkmax = newrkmax;
217  A->rk = newrk;
218  }
219  /*
220  * The rank is non null, we allocate the correct amount of space, and
221  * compress the stored information if necessary
222  */
223  else {
224  double *u, *v;
225  int ret;
226 
227  if ( ( A->rk == -1 ) ||
228  (( A->rk != -1 ) && (newrkmax != A->rkmax)) )
229  {
230 #if defined(PASTIX_DEBUG_LR)
231  u = malloc( M * newrkmax * sizeof(double) );
232  v = malloc( N * newrkmax * sizeof(double) );
233 #else
234  u = malloc( (M+N) * newrkmax * sizeof(double) );
235  v = u + M * newrkmax;
236 #endif
237  if ( copy ) {
238  assert( A->rk != -1 );
239  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', M, newrk,
240  A->u, M, u, M );
241  assert(ret == 0);
242  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', newrk, N,
243  A->v, A->rkmax, v, newrkmax );
244  assert(ret == 0);
245  }
246  free(A->u);
247 #if defined(PASTIX_DEBUG_LR)
248  free(A->v);
249 #endif
250  A->u = u;
251  A->v = v;
252  }
253 
254  /* Update rk and rkmax */
255  A->rkmax = newrkmax;
256  A->rk = newrk;
257 
258  (void)ret;
259  }
260  assert( A->rk <= A->rkmax);
261  return 0;
262 }
263 
264 /**
265  *******************************************************************************
266  *
267  * @brief Convert a low rank matrix into a dense matrix.
268  *
269  * Convert a low-rank matrix of size m-by-n into a full rank matrix.
270  * A = op( u * v^t ) with op(A) = A or A^t
271  *
272  *******************************************************************************
273  *
274  * @param[in] trans
275  * @arg PastixNoTrans: returns A = u * v^t
276  * @arg PastixTrans: returns A = v * u^t
277  *
278  * @param[in] m
279  * Number of rows of the low-rank matrix Alr.
280  *
281  * @param[in] n
282  * Number of columns of the low-rank matrix Alr.
283  *
284  * @param[in] Alr
285  * The low rank matrix to be converted into a full rank matrix
286  *
287  * @param[inout] A
288  * The matrix of dimension lda-by-k in which to store the uncompressed
289  * version of Alr. k = n if trans == PastixNoTrans, m otherwise.
290  *
291  * @param[in] lda
292  * The leading dimension of the matrix A. lda >= max(1, m) if trans ==
293  * PastixNoTrans, lda >= max(1,n) otherwise.
294  *
295  *******************************************************************************
296  *
297  * @retval 0 in case of success.
298  * @retval -i if the ith parameter is incorrect.
299  *
300  *******************************************************************************/
301 int
303  pastix_int_t m,
304  pastix_int_t n,
305  const pastix_lrblock_t *Alr,
306  double *A,
307  pastix_int_t lda )
308 {
309  int ret = 0;
310 
311 #if !defined(NDEBUG)
312  if ( m < 0 ) {
313  return -1;
314  }
315  if ( n < 0 ) {
316  return -2;
317  }
318  if (Alr == NULL || Alr->rk > Alr->rkmax) {
319  return -3;
320  }
321  if ( (trans == PastixNoTrans && lda < m) ||
322  (trans != PastixNoTrans && lda < n) )
323  {
324  return -5;
325  }
326  if ( Alr->rk == -1 ) {
327  if (Alr->u == NULL || Alr->v != NULL || (Alr->rkmax < m))
328  {
329  return -6;
330  }
331  }
332  else if ( Alr->rk != 0){
333  if (Alr->u == NULL || Alr->v == NULL) {
334  return -6;
335  }
336  }
337 #endif
338 
339  if ( trans == PastixNoTrans ) {
340  if ( Alr->rk == -1 ) {
341  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
342  Alr->u, Alr->rkmax, A, lda );
343  assert( ret == 0 );
344  }
345  else if ( Alr->rk == 0 ) {
346  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'A', m, n,
347  0.0, 0.0, A, lda );
348  assert( ret == 0 );
349  }
350  else {
351  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
352  m, n, Alr->rk,
353  (done), Alr->u, m,
354  Alr->v, Alr->rkmax,
355  (dzero), A, lda);
356  }
357  }
358  else {
359  if ( Alr->rk == -1 ) {
360  core_dgetmo( m, n, Alr->u, Alr->rkmax, A, lda );
361  }
362  else if ( Alr->rk == 0 ) {
363  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'A', n, m,
364  0.0, 0.0, A, lda );
365  assert( ret == 0 );
366  }
367  else {
368  cblas_dgemm(CblasColMajor, CblasTrans, CblasTrans,
369  n, m, Alr->rk,
370  (done), Alr->v, Alr->rkmax,
371  Alr->u, m,
372  (dzero), A, lda);
373  }
374  }
375 
376  return ret;
377 }
378 
379 /**
380  *******************************************************************************
381  *
382  * @brief Copy a small low-rank structure into a large one
383  *
384  *******************************************************************************
385  *
386  * @param[in] lowrank
387  * The structure with low-rank parameters.
388  *
389  * @param[in] transAv
390  * @arg PastixNoTrans: (A.v)' is stored transposed as usual
391  * @arg PastixTrans: A.v is stored
392  * @arg PastixTrans: A.v is stored
393  *
394  * @param[in] alpha
395  * The multiplier parameter: B = B + alpha * A
396  *
397  * @param[in] M1
398  * The number of rows of the matrix A.
399  *
400  * @param[in] N1
401  * The number of columns of the matrix A.
402  *
403  * @param[in] A
404  * The low-rank representation of the matrix A.
405  *
406  * @param[in] M2
407  * The number of rows of the matrix B.
408  *
409  * @param[in] N2
410  * The number of columns of the matrix B.
411  *
412  * @param[inout] B
413  * The low-rank representation of the matrix B.
414  *
415  * @param[in] offx
416  * The horizontal offset of A with respect to B.
417  *
418  * @param[in] offy
419  * The vertical offset of A with respect to B.
420  *
421  *******************************************************************************/
422 void
423 core_dlrcpy( const pastix_lr_t *lowrank,
424  pastix_trans_t transAv,
425  double alpha,
426  pastix_int_t M1,
427  pastix_int_t N1,
428  const pastix_lrblock_t *A,
429  pastix_int_t M2,
430  pastix_int_t N2,
431  pastix_lrblock_t *B,
432  pastix_int_t offx,
433  pastix_int_t offy )
434 {
435  double *u, *v;
436  pastix_int_t ldau, ldav;
437  int ret;
438 
439  assert( (M1 + offx) <= M2 );
440  assert( (N1 + offy) <= N2 );
441 
442  ldau = (A->rk == -1) ? A->rkmax : M1;
443  ldav = (transAv == PastixNoTrans) ? A->rkmax : N1;
444 
445  core_dlrfree( B );
446  core_dlralloc( M2, N2, A->rk, B );
447  u = B->u;
448  v = B->v;
449 
450  B->rk = A->rk;
451 
452  if ( A->rk == -1 ) {
453  if ( (M1 != M2) || (N1 != N2) ) {
454  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'A', M2, N2,
455  0.0, 0.0, u, M2 );
456  assert( ret == 0 );
457  }
458  ret = core_dgeadd( PastixNoTrans, M1, N1,
459  alpha, A->u, ldau,
460  0.0, u + M2 * offy + offx, M2 );
461  assert(ret == 0);
462  }
463  else {
464  if ( M1 != M2 ) {
465  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'A', M2, B->rk,
466  0.0, 0.0, u, M2 );
467  assert( ret == 0 );
468  }
469  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', M1, A->rk,
470  A->u, ldau,
471  u + offx, M2 );
472  assert(ret == 0);
473 
474  if ( N1 != N2 ) {
475  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'A', B->rk, N2,
476  0.0, 0.0, v, B->rkmax );
477  assert( ret == 0 );
478  }
479  ret = core_dgeadd( transAv, A->rk, N1,
480  alpha, A->v, ldav,
481  0.0, v + B->rkmax * offy, B->rkmax );
482  assert(ret == 0);
483  }
484 
485 #if 0
486  {
487  double *work = malloc( M2 * N2 * sizeof(double) );
488 
489  core_dlr2ge( PastixNoTrans, M2, N2, B, work, M2 );
490 
491  lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, -1, M2, N2, work, M2, B );
492 
493  free(work);
494  }
495 #endif
496 
497  (void)lowrank;
498  (void)ret;
499 }
500 
501 
502 /**
503  *******************************************************************************
504  *
505  * @brief Concatenate left parts of two low-rank matrices
506  *
507  *******************************************************************************
508  *
509  * @param[in] alpha
510  * alpha * A is add to B
511  *
512  * @param[in] M1
513  * The number of rows of the matrix A.
514  *
515  * @param[in] N1
516  * The number of columns of the matrix A.
517  *
518  * @param[in] A
519  * The low-rank representation of the matrix A.
520  *
521  * @param[in] M2
522  * The number of rows of the matrix B.
523  *
524  * @param[in] B
525  * The low-rank representation of the matrix B.
526  *
527  * @param[in] offx
528  * The horizontal offset of A with respect to B.
529  *
530  * @param[inout] u1u2
531  * The workspace where matrices are concatenated
532  *
533  *******************************************************************************/
534 void
535 core_dlrconcatenate_u( double alpha,
536  pastix_int_t M1,
537  pastix_int_t N1,
538  const pastix_lrblock_t *A,
539  pastix_int_t M2,
540  pastix_lrblock_t *B,
541  pastix_int_t offx,
542  double *u1u2 )
543 {
544  double *tmp;
545  pastix_int_t i, ret, rank;
546  pastix_int_t ldau, ldbu;
547 
548  rank = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
549  rank += B->rk;
550 
551  ldau = (A->rk == -1) ? A->rkmax : M1;
552  ldbu = M2;
553 
554  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', M2, B->rk,
555  B->u, ldbu, u1u2, M2 );
556  assert(ret == 0);
557 
558  tmp = u1u2 + B->rk * M2;
559  if ( A->rk == -1 ) {
560  /*
561  * A is full of rank M1, so A will be integrated into v1v2
562  */
563  if ( M1 < N1 ) {
564  if ( M1 != M2 ) {
565  /* Set to 0 */
566  memset(tmp, 0, M2 * M1 * sizeof(double));
567 
568  /* Set diagonal */
569  tmp += offx;
570  for (i=0; i<M1; i++, tmp += M2+1) {
571  *tmp = 1.0;
572  }
573  }
574  else {
575  assert( offx == 0 );
576  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'A', M2, M1,
577  0.0, 1.0, tmp, M2 );
578  assert( ret == 0 );
579  }
580  }
581  else {
582  /*
583  * A is full of rank N1, so A is integrated into u1u2
584  */
585  if ( M1 != M2 ) {
586  memset(tmp, 0, M2 * N1 * sizeof(double));
587  }
588  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', M1, N1,
589  A->u, ldau, tmp + offx, M2 );
590  assert(ret == 0);
591  }
592  }
593  /*
594  * A is low rank of rank A->rk
595  */
596  else {
597  if ( M1 != M2 ) {
598  memset(tmp, 0, M2 * A->rk * sizeof(double));
599  }
600  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', M1, A->rk,
601  A->u, ldau, tmp + offx, M2 );
602  assert(ret == 0);
603  }
604  (void)ret;
605  (void)alpha;
606  (void)rank;
607 }
608 
609 
610 /**
611  *******************************************************************************
612  *
613  * @brief Concatenate right parts of two low-rank matrices
614  *
615  *******************************************************************************
616  *
617  * @param[in] transA1
618  * @arg PastixNoTrans: No transpose, op( A ) = A;
619  * @arg PastixTrans: Transpose, op( A ) = A';
620  *
621  * @param[in] alpha
622  * alpha * A is add to B
623  *
624  * @param[in] M1
625  * The number of rows of the matrix A.
626  *
627  * @param[in] N1
628  * The number of columns of the matrix A.
629  *
630  * @param[in] A
631  * The low-rank representation of the matrix A.
632  *
633  * @param[in] N2
634  * The number of columns of the matrix B.
635  *
636  * @param[in] B
637  * The low-rank representation of the matrix B.
638  *
639  * @param[in] offy
640  * The vertical offset of A with respect to B.
641  *
642  * @param[inout] v1v2
643  * The workspace where matrices are concatenated
644  *
645  *******************************************************************************/
646 void
648  double alpha,
649  pastix_int_t M1,
650  pastix_int_t N1,
651  const pastix_lrblock_t *A,
652  pastix_int_t N2,
653  pastix_lrblock_t *B,
654  pastix_int_t offy,
655  double *v1v2 )
656 {
657  double *tmp;
658  pastix_int_t i, ret, rank;
659  pastix_int_t ldau, ldav, ldbv;
660 
661  rank = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
662  rank += B->rk;
663 
664  ldau = (A->rk == -1) ? A->rkmax : M1;
665  ldav = (transA1 == PastixNoTrans) ? A->rkmax : N1;
666  ldbv = B->rkmax;
667 
668  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', B->rk, N2,
669  B->v, ldbv, v1v2, rank );
670  assert(ret == 0);
671 
672  tmp = v1v2 + B->rk;
673  if ( A->rk == -1 ) {
674  assert( transA1 == PastixNoTrans );
675  /*
676  * A is full of rank M1, so it is integrated into v1v2
677  */
678  if ( M1 < N1 ) {
679  if (N1 != N2) {
680  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'A', M1, N2,
681  0.0, 0.0, tmp, rank );
682  assert( ret == 0 );
683  }
684  core_dgeadd( PastixNoTrans, M1, N1,
685  alpha, A->u, ldau,
686  0.0, tmp + offy * rank, rank );
687  }
688  /*
689  * A is full of rank N1, so it has been integrated into u1u2
690  */
691  else {
692  if (N1 != N2) {
693  /* Set to 0 */
694  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'A', N1, N2,
695  0.0, 0.0, tmp, rank );
696  assert(ret == 0);
697 
698  /* Set diagonal */
699  tmp += offy * rank;
700  for (i=0; i<N1; i++, tmp += rank+1) {
701  *tmp = alpha;
702  }
703  }
704  else {
705  assert( offy == 0 );
706  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'A', N1, N2,
707  0.0, alpha, tmp + offy * rank, rank );
708  assert( ret == 0 );
709  }
710  }
711  }
712  /*
713  * A is low rank of rank A->rk
714  */
715  else {
716  if (N1 != N2) {
717  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'A', A->rk, N2,
718  0.0, 0.0, tmp, rank );
719  assert(ret == 0);
720  }
721  core_dgeadd( transA1, A->rk, N1,
722  alpha, A->v, ldav,
723  0.0, tmp + offy * rank, rank );
724  }
725  (void)ret;
726  (void)alpha;
727 }
728 
729 /**
730  *******************************************************************************
731  *
732  * @brief Template to convert a full rank matrix into a low rank matrix through
733  * QR decompositions
734  *
735  * This version is only used when permutation method is used. Only difference
736  * from core_dge2lr_qrrt is the V calculation part, That is: Instead of applying
737  * inverse rotation on V, here inverse permutation is applied
738  *
739  *******************************************************************************
740  *
741  * @param[in] rrqrfct
742  * QR decomposition function used to compute the rank revealing
743  * factorization and create the low-rank form of A.
744  *
745  * @param[in] use_reltol
746  * TODO
747  *
748  * @param[in] tol
749  * The tolerance used as a criterion to eliminate information from the
750  * full rank matrix
751  *
752  * @param[in] rklimit
753  * The maximum rank to store the matrix in low-rank format. If
754  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
755  *
756  * @param[in] m
757  * Number of rows of the matrix A, and of the low rank matrix Alr.
758  *
759  * @param[in] n
760  * Number of columns of the matrix A, and of the low rank matrix Alr.
761  *
762  * @param[in] Avoid
763  * The matrix of dimension lda-by-n that needs to be compressed
764  *
765  * @param[in] lda
766  * The leading dimension of the matrix A. lda >= max(1, m)
767  *
768  * @param[out] Alr
769  * The low rank matrix structure that will store the low rank
770  * representation of A
771  *
772  *******************************************************************************
773  *
774  * @return TODO
775  *
776  *******************************************************************************/
779  int use_reltol,
780  pastix_fixdbl_t tol,
781  pastix_int_t rklimit,
782  pastix_int_t m,
783  pastix_int_t n,
784  const void *Avoid,
785  pastix_int_t lda,
786  pastix_lrblock_t *Alr )
787 {
788  int ret, newrk;
789  pastix_int_t nb = 32;
790  double *A = (double*)Avoid;
791  double *Acpy;
792  pastix_int_t lwork;
793  double *work, *tau, zzsize;
794  double *rwork;
795  pastix_int_t *jpvt;
796  pastix_int_t zsize, rsize;
797  double *zwork;
798  pastix_fixdbl_t flops;
799 
800  double norm = LAPACKE_dlange_work( LAPACK_COL_MAJOR, 'f', m, n,
801  A, lda, NULL );
802 
803  if ( (norm == 0.) && (tol >= 0.)) {
804  core_dlralloc( m, n, 0, Alr );
805  return 0. ;
806  }
807 
808  /* work */
809  rklimit = ( rklimit < 0 ) ? core_get_rklimit( m, n ) : rklimit;
810  if ( tol < 0. ) {
811  tol = -1.;
812  }
813  else if ( use_reltol ) {
814  tol = tol * norm;
815  }
816 
817  ret = rrqrfct( tol, rklimit, 0, nb,
818  m, n, NULL, m,
819  NULL, NULL,
820  &zzsize, -1, NULL );
821 
822  lwork = (pastix_int_t)zzsize;
823  zsize = lwork;
824  /* Acpy */
825  zsize += m * n;
826  /* tau */
827  zsize += n;
828  /* rwork */
829  rsize = 2 * n;
830 
831 #if defined(PASTIX_DEBUG_LR)
832  zwork = NULL;
833  Acpy = malloc( m * n * sizeof(double) );
834  tau = malloc( n * sizeof(double) );
835  work = malloc( lwork * sizeof(double) );
836  rwork = malloc( rsize * sizeof(double) );
837 #else
838  zwork = malloc( zsize * sizeof(double) + rsize * sizeof(double) );
839  Acpy = zwork;
840  tau = Acpy + m * n;
841  work = tau + n;
842  rwork = (double*)(work + lwork);
843 #endif
844 
845  jpvt = malloc( n * sizeof(pastix_int_t) );
846 
847  /**
848  * Backup A into Acpy to try to compress
849  */
850  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
851  A, lda, Acpy, m );
852  assert(ret == 0);
853 
854  newrk = rrqrfct( tol, rklimit, 0, nb,
855  m, n, Acpy, m,
856  jpvt, tau,
857  work, lwork, rwork );
858  if (newrk == -1) {
859  flops = FLOPS_DGEQRF( m, n );
860  }
861  else {
862  flops = FLOPS_DGEQRF( m, newrk ) + FLOPS_DORMQR( m, n-newrk, newrk, PastixLeft );
863  }
864 
865  /**
866  * It was not interesting to compress, so we restore the dense version in Alr
867  */
868  core_dlralloc( m, n, newrk, Alr );
869  Alr->rk = newrk;
870 
871  if ( newrk == -1 ) {
872  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
873  A, lda, Alr->u, Alr->rkmax );
874  assert(ret == 0);
875  }
876  else if ( newrk > 0 ) {
877  /**
878  * We compute U and V
879  */
880  pastix_int_t i;
881  double *U, *V;
882 
883  U = Alr->u;
884  V = Alr->v;
885 
886  /* Compute the final U form */
887  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', m, Alr->rk,
888  Acpy, m, U, m );
889  assert(ret == 0);
890 
891  ret = LAPACKE_dorgqr_work( LAPACK_COL_MAJOR, m, Alr->rk, Alr->rk,
892  U, m, tau, work, lwork );
893  assert(ret == 0);
894  flops += FLOPS_DORGQR( m, Alr->rk, Alr->rk );
895 
896  /* Compute the final V form */
897  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'L', Alr->rk-1, Alr->rk-1,
898  0.0, 0.0, Acpy + 1, m );
899 
900  for (i=0; i<n; i++){
901  memcpy( V + jpvt[i] * Alr->rk,
902  Acpy + i * m,
903  Alr->rk * sizeof(double) );
904  }
905  }
906 
907 #if defined(PASTIX_DEBUG_LR)
908  if ( Alr->rk > 0 ) {
909  int rc = core_dlrdbg_check_orthogonality( m, Alr->rk, Alr->u, m );
910  if (rc == 1) {
911  fprintf(stderr, "Failed to compress a matrix and generate an orthogonal u\n" );
912  }
913  }
914 #endif
915 
916  free( zwork );
917  free( jpvt );
918 #if defined(PASTIX_DEBUG_LR)
919  free( Acpy );
920  free( tau );
921  free( work );
922  free( rwork );
923 #endif
924  (void)ret;
925  return flops;
926 }
927 
928 /**
929  *******************************************************************************
930  *
931  * @brief Template to convert a full rank matrix into a low rank matrix through
932  * QR decompositions
933  *
934  * This version is only used when rotational method is used. Only difference
935  * from core_dge2lr_qr is the V calculation part, That is: Instead of applying
936  * inverse permutation on V, here inverse rotation is applied
937  *
938  *******************************************************************************
939  *
940  * @param[in] rrqrfct
941  * QR decomposition function used to compute the rank revealing
942  * factorization and create the low-rank form of A.
943  *
944  * @param[in] use_reltol
945  * TODO
946  *
947  * @param[in] tol
948  * The tolerance used as a criterion to eliminate information from the
949  * full rank matrix
950  *
951  * @param[in] rklimit
952  * The maximum rank to store the matrix in low-rank format. If
953  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
954  *
955  * @param[in] m
956  * Number of rows of the matrix A, and of the low rank matrix Alr.
957  *
958  * @param[in] n
959  * Number of columns of the matrix A, and of the low rank matrix Alr.
960  *
961  * @param[in] Avoid
962  * The matrix of dimension lda-by-n that needs to be compressed
963  *
964  * @param[in] lda
965  * The leading dimension of the matrix A. lda >= max(1, m)
966  *
967  * @param[out] Alr
968  * The low rank matrix structure that will store the low rank
969  * representation of A
970  *
971  *******************************************************************************
972  *
973  * @return TODO
974  *
975  *******************************************************************************/
978  int use_reltol,
979  pastix_fixdbl_t tol,
980  pastix_int_t rklimit,
981  pastix_int_t m,
982  pastix_int_t n,
983  const void *Avoid,
984  pastix_int_t lda,
985  pastix_lrblock_t *Alr )
986 {
987  int ret, newrk;
988  pastix_int_t nb = 32;
989  double *A = (double*)Avoid;
990  double *Acpy;
991  pastix_int_t lwork;
992  double *work, *tau, *B, *tau_b, zzsize;
993  pastix_int_t *jpvt;
994  pastix_int_t zsize, bsize;
995  double *zwork;
996  pastix_fixdbl_t flops;
997 
998  char trans;
999 #if defined(PRECISION_c) || defined(PRECISION_z)
1000  trans = 'C';
1001 #else
1002  trans = 'T';
1003 #endif
1004 
1005  double norm = LAPACKE_dlange_work( LAPACK_COL_MAJOR, 'f', m, n,
1006  A, lda, NULL );
1007 
1008  if ( (norm == 0.) && (tol >= 0.)) {
1009  core_dlralloc( m, n, 0, Alr );
1010  return 0. ;
1011  }
1012 
1013  /* work */
1014  rklimit = ( rklimit < 0 ) ? core_get_rklimit( m, n ) : rklimit;
1015  if ( tol < 0. ) {
1016  tol = -1.;
1017  }
1018  else if ( use_reltol ) {
1019  tol = tol * norm;
1020  }
1021 
1022  ret = rrqrfct( tol, rklimit, nb,
1023  m, n,
1024  NULL, m, NULL,
1025  NULL, n, NULL,
1026  &zzsize, -1, norm );
1027 
1028  lwork = (pastix_int_t)zzsize;
1029  zsize = lwork;
1030  bsize = n * rklimit;
1031  /* Acpy */
1032  zsize += m * n;
1033  /* tau */
1034  zsize += n;
1035  /* B and tau_b */
1036  zsize += bsize + n;
1037 
1038 #if defined(PASTIX_DEBUG_LR)
1039  zwork = NULL;
1040  Acpy = malloc( m * n * sizeof(double) );
1041  tau = malloc( n * sizeof(double) );
1042  B = malloc( bsize * sizeof(double) );
1043  tau_b = malloc( n * sizeof(double) );
1044  work = malloc( lwork * sizeof(double) );
1045 #else
1046  zwork = malloc( zsize * sizeof(double) );
1047  Acpy = zwork;
1048  tau = Acpy + m * n;
1049  B = tau + n;
1050  tau_b = B + bsize;
1051  work = tau_b + n;
1052 #endif
1053 
1054  jpvt = malloc( n * sizeof(pastix_int_t) );
1055 
1056  /**
1057  * Backup A into Acpy to try to compress
1058  */
1059  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
1060  A, lda, Acpy, m );
1061  assert(ret == 0);
1062 
1063  newrk = rrqrfct( tol, rklimit, nb,
1064  m, n,
1065  Acpy, m, tau,
1066  B, n, tau_b,
1067  work, lwork, norm );
1068  if (newrk == -1) {
1069  flops = FLOPS_DGEQRF( m, n );
1070  }
1071  else {
1072  flops = FLOPS_DGEQRF( m, newrk ) + FLOPS_DORMQR( m, n-newrk, newrk, PastixLeft );
1073  }
1074 
1075  /**
1076  * It was not interesting to compress, so we restore the dense version in Alr
1077  */
1078  core_dlralloc( m, n, newrk, Alr );
1079  Alr->rk = newrk;
1080 
1081  if ( newrk == -1 ) {
1082  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
1083  A, lda, Alr->u, Alr->rkmax );
1084  assert(ret == 0);
1085  }
1086  else if ( newrk > 0 ) {
1087  /**
1088  * We compute U and V
1089  */
1090  double *U, *V;
1091  pastix_int_t d, rk = 0;
1092 
1093  U = Alr->u;
1094  V = Alr->v;
1095 
1096  /* Compute the final U form */
1097  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', m, Alr->rk,
1098  Acpy, m, U, m );
1099  assert(ret == 0);
1100 
1101  ret = LAPACKE_dorgqr_work( LAPACK_COL_MAJOR, m, Alr->rk, Alr->rk,
1102  U, m, tau, work, lwork );
1103  assert(ret == 0);
1104  flops += FLOPS_DORGQR( m, Alr->rk, Alr->rk );
1105 
1106  /* Compute the final V form */
1107  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'U', Alr->rk, n,
1108  Acpy, m, V, Alr->rk );
1109  assert(ret == 0);
1110  ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'L', Alr->rk-1, Alr->rk-1,
1111  0.0, 0.0, V + 1, Alr->rk );
1112  assert(ret == 0);
1113  /*
1114  * Apply inverse rotations to V^T
1115  */
1116  {
1117  /*
1118  * Householders are applied in the reverse order of before
1119  */
1120  rk = (Alr->rk / nb) * nb;
1121  while( rk >= 0 ) {
1122  d = pastix_imin( nb, Alr->rk - rk );
1123  ret = LAPACKE_dormqr_work( LAPACK_COL_MAJOR, 'R', trans,
1124  Alr->rk - rk, n - rk, d,
1125  B + rk * n + rk, n, tau_b + rk,
1126  V + rk * Alr->rk + rk, Alr->rk,
1127  work, lwork );
1128  assert(ret == 0);
1129  rk -= nb;
1130  }
1131  }
1132  }
1133 
1134 #if defined(PASTIX_DEBUG_LR)
1135  if ( Alr->rk > 0 ) {
1136  int rc = core_dlrdbg_check_orthogonality( m, Alr->rk, Alr->u, m );
1137  if (rc == 1) {
1138  fprintf(stderr, "Failed to compress a matrix and generate an orthogonal u\n" );
1139  }
1140  }
1141 #endif
1142 
1143  free( zwork );
1144  free( jpvt );
1145 #if defined(PASTIX_DEBUG_LR)
1146  free( Acpy );
1147  free( tau );
1148  free( B );
1149  free( tau_b );
1150  free( work );
1151 #endif
1152  (void)ret;
1153  return flops;
1154 }
1155 
1156 /**
1157  *******************************************************************************
1158  *
1159  * @brief Template to perform the addition of two low-rank structures with
1160  * compression kernel based on QR decomposition.
1161  *
1162  * Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
1163  *
1164  * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
1165  * Orthogonalize (u2 u1) = (u2, u1 - u2(u2^T u1)) * (I u2^T u1)
1166  * (0 I )
1167  * Compute Rank Revealing QR decomposition of (I u2^T u1) * (v2 v1)^T
1168  * (0 I )
1169  * Any QR rank revealing kernel can be used for the recompression of the V part.
1170  *
1171  *******************************************************************************
1172  *
1173  * @param[in] rrqrfct
1174  * QR decomposition function used to compute the rank revealing
1175  * factorization of the sum of the two low-rank matrices.
1176  *
1177  * @param[in] lowrank
1178  * The structure with low-rank parameters.
1179  *
1180  * @param[in] transA1
1181  * @arg PastixNoTrans: No transpose, op( A ) = A;
1182  * @arg PastixTrans: Transpose, op( A ) = A';
1183  *
1184  * @param[in] alphaptr
1185  * alpha * A is add to B
1186  *
1187  * @param[in] M1
1188  * The number of rows of the matrix A.
1189  *
1190  * @param[in] N1
1191  * The number of columns of the matrix A.
1192  *
1193  * @param[in] A
1194  * The low-rank representation of the matrix A.
1195  *
1196  * @param[in] M2
1197  * The number of rows of the matrix B.
1198  *
1199  * @param[in] N2
1200  * The number of columns of the matrix B.
1201  *
1202  * @param[in] B
1203  * The low-rank representation of the matrix B.
1204  *
1205  * @param[in] offx
1206  * The horizontal offset of A with respect to B.
1207  *
1208  * @param[in] offy
1209  * The vertical offset of A with respect to B.
1210  *
1211  *******************************************************************************
1212  *
1213  * @return The new rank of u2 v2^T or -1 if ranks are too large for
1214  * recompression
1215  *
1216  *******************************************************************************/
1219  const pastix_lr_t *lowrank,
1220  pastix_trans_t transA1,
1221  const void *alphaptr,
1222  pastix_int_t M1,
1223  pastix_int_t N1,
1224  const pastix_lrblock_t *A,
1225  pastix_int_t M2,
1226  pastix_int_t N2,
1227  pastix_lrblock_t *B,
1228  pastix_int_t offx,
1229  pastix_int_t offy )
1230 {
1231  pastix_int_t rankA, rank, M, N, minV;
1232  pastix_int_t i, ret, new_rank, rklimit;
1233  pastix_int_t ldau, ldav, ldbu, ldbv, ldu, ldv;
1234  double *u1u2, *v1v2, *u;
1235  double *zbuf, *tauV;
1236  size_t wzsize;
1237  double tol = lowrank->tolerance;
1238 
1239  /* PQRCP parameters / workspace */
1240  pastix_int_t nb = 32;
1241  pastix_int_t lwork;
1242  pastix_int_t *jpvt;
1243  double *zwork, zzsize;
1244  double *rwork;
1245  double alpha = *((double*)alphaptr);
1246  pastix_fixdbl_t flops, total_flops = 0.;
1247 
1248 #if defined(PASTIX_DEBUG_LR)
1249  if ( B->rk > 0 ) {
1250  int rc = core_dlrdbg_check_orthogonality( M2, B->rk, B->u, M2 );
1251  if (rc == 1) {
1252  fprintf(stderr, "Failed to have B->u orthogonal in entry of rradd\n" );
1253  }
1254  }
1255 #endif
1256 
1257  rankA = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
1258  rank = rankA + B->rk;
1259  M = pastix_imax(M2, M1);
1260  N = pastix_imax(N2, N1);
1261 
1262  minV = pastix_imin(N, rank);
1263 
1264  assert(M2 == M && N2 == N);
1265  assert(B->rk != -1);
1266 
1267  assert( A->rk <= A->rkmax);
1268  assert( B->rk <= B->rkmax);
1269 
1270  if ( ((M1 + offx) > M2) ||
1271  ((N1 + offy) > N2) )
1272  {
1273  pastix_print_error( "Dimensions are not correct" );
1274  assert(0 /* Incorrect dimensions */);
1275  return total_flops;
1276  }
1277 
1278  /*
1279  * A is rank null, nothing to do
1280  */
1281  if ( A->rk == 0 ) {
1282  return total_flops;
1283  }
1284 
1285  /*
1286  * Let's handle case where B is a null matrix
1287  * B = alpha A
1288  */
1289  if ( B->rk == 0 ) {
1290  core_dlrcpy( lowrank, transA1, alpha,
1291  M1, N1, A, M2, N2, B,
1292  offx, offy );
1293  return total_flops;
1294  }
1295 
1296  /*
1297  * The rank is too big, let's try to compress
1298  */
1299  if ( rank > pastix_imin( M, N ) ) {
1300  assert(0);
1301  }
1302 
1303  /*
1304  * Let's define leading dimensions
1305  */
1306  ldau = (A->rk == -1) ? A->rkmax : M1;
1307  ldav = (transA1 == PastixNoTrans) ? A->rkmax : N1;
1308  ldbu = M;
1309  ldbv = B->rkmax;
1310  ldu = M;
1311  ldv = rank;
1312 
1313  /*
1314  * Let's compute the size of the workspace
1315  */
1316  /* u1u2 and v1v2 */
1317  wzsize = (M+N) * rank;
1318  /* tauV */
1319  wzsize += minV;
1320 
1321  /* RRQR workspaces */
1322  rklimit = pastix_imin( rank, core_get_rklimit( M, N ) );
1323  rrqrfct( tol, rklimit, 1, nb,
1324  rank, N, NULL, ldv,
1325  NULL, NULL,
1326  &zzsize, -1, NULL );
1327  lwork = (pastix_int_t)(zzsize);
1328  wzsize += lwork;
1329 
1330 #if defined(PASTIX_DEBUG_LR)
1331  zbuf = NULL;
1332  u1u2 = malloc( ldu * rank * sizeof(double) );
1333  v1v2 = malloc( ldv * N * sizeof(double) );
1334  tauV = malloc( rank * sizeof(double) );
1335  zwork = malloc( lwork * sizeof(double) );
1336 
1337  rwork = malloc( 2 * pastix_imax( rank, N ) * sizeof(double) );
1338 #else
1339  zbuf = malloc( wzsize * sizeof(double) + 2 * pastix_imax(rank, N) * sizeof(double) );
1340 
1341  u1u2 = zbuf;
1342  v1v2 = u1u2 + ldu * rank;
1343  tauV = v1v2 + ldv * N;
1344  zwork = tauV + rank;
1345 
1346  rwork = (double*)(zwork + lwork);
1347 #endif
1348 
1349  /*
1350  * Concatenate U2 and U1 in u1u2
1351  * [ u2 0 ]
1352  * [ u2 u1 ]
1353  * [ u2 0 ]
1354  */
1355  core_dlrconcatenate_u( alpha,
1356  M1, N1, A,
1357  M2, B,
1358  offx, u1u2 );
1359 
1360  /*
1361  * Concatenate V2 and V1 in v1v2
1362  * [ v2^h v2^h v2^h ]
1363  * [ 0 v1^h 0 ]
1364  */
1365  core_dlrconcatenate_v( transA1, alpha,
1366  M1, N1, A,
1367  N2, B,
1368  offy, v1v2 );
1369 
1370  /*
1371  * Orthogonalize [u2, u1]
1372  * [u2, u1] = [u2, u1 - u2(u2Tu1)] * (I u2Tu1)
1373  * (0 I )
1374  */
1375 
1376  /* We do not care is A was integrated into v1v2 */
1377  if (rankA != 0) {
1378 
1379  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_orthogonalize );
1380  switch ( pastix_lr_ortho ) {
1381  case PastixCompressOrthoQR:
1382  flops = core_dlrorthu_fullqr( M, N, B->rk + rankA,
1383  u1u2, ldu, v1v2, ldv );
1384  break;
1385 
1387  flops = core_dlrorthu_partialqr( M, N, B->rk, &rankA, offx, offy,
1388  u1u2, ldu, v1v2, ldv );
1389  break;
1390 
1392  pastix_attr_fallthrough;
1393 
1394  default:
1395  flops = core_dlrorthu_cgs( M2, N2, M1, N1, B->rk, &rankA, offx, offy,
1396  u1u2, ldu, v1v2, ldv );
1397  }
1398  kernel_trace_stop_lvl2( flops );
1399 
1400  total_flops += flops;
1401  }
1402 
1403  rank = B->rk + rankA;
1404 
1405  if (rankA == 0) {
1406  /*
1407  * The final B += A fit in B
1408  * Lets copy and return
1409  */
1410  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', M, B->rk, u1u2, ldu, B->u, ldbu );
1411  assert( ret == 0 );
1412  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', B->rk, N, v1v2, ldv, B->v, ldbv );
1413  assert( ret == 0 );
1414 
1415  free(zbuf);
1416 #if defined(PASTIX_DEBUG_LR)
1417  free( u1u2 );
1418  free( v1v2 );
1419  free( tauV );
1420  free( zwork );
1421  free( rwork );
1422 #endif
1423  return total_flops;
1424  }
1425 
1426  MALLOC_INTERN( jpvt, pastix_imax(rank, N), pastix_int_t );
1427 
1428  if ( lowrank->use_reltol ) {
1429  /**
1430  * In relative tolerance, we can choose two solutions:
1431  * 1) The first one, more conservative, is to compress relatively to
1432  * the norm of the final matrix \f$ \alpha A + B \f$. In this kernel, we
1433  * exploit the fact that the V part contains all the information while
1434  * the U part is orthonormal, and compute it as follow:
1435  *
1436  * double norm = LAPACKE_dlange_work( LAPACK_COL_MAJOR, 'f', rank, N,
1437  * v1v2, ldv, NULL );
1438  * tol = tol * norm;
1439  *
1440  * 2) The second solution, less conservative, will allow to reduce the
1441  * rank more efficiently. Since A and B have been compressed relatively
1442  * to their respective norms, there is no reason to compress the sum
1443  * relatively to its own norm, but it is more reasonable to compress it
1444  * relatively to the norm of A and B. For example, A-B would be full
1445  * with the first criterion, and rank null with the second.
1446  * Note that here, we can only have an estimation that once again
1447  * reduces the conservation of the criterion.
1448  * \f[ || \alpha A + B || <= |\alpha| ||A|| + ||B|| <= |\alpha| ||U_aV_a|| + ||U_bV_b|| \f]
1449  *
1450  */
1451  double normA, normB;
1452  normA = core_dlrnrm( PastixFrobeniusNorm, transA1, M1, N1, A );
1453  normB = core_dlrnrm( PastixFrobeniusNorm, PastixNoTrans, M2, N2, B );
1454  tol = tol * ( fabs(alpha) * normA + normB );
1455  }
1456 
1457  /*
1458  * Perform RRQR factorization on (I u2Tu1) v1v2 = (Q2 R2)
1459  * (0 I )
1460  */
1461  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_recompression );
1462  rklimit = pastix_imin( rklimit, rank );
1463  new_rank = rrqrfct( tol, rklimit, 1, nb,
1464  rank, N, v1v2, ldv,
1465  jpvt, tauV,
1466  zwork, lwork, rwork );
1467  flops = (new_rank == -1) ? FLOPS_DGEQRF( rank, N )
1468  : (FLOPS_DGEQRF( rank, new_rank ) +
1469  FLOPS_DORMQR( rank, N-new_rank, new_rank, PastixLeft ));
1470  kernel_trace_stop_lvl2_rank( flops, new_rank );
1471  total_flops += flops;
1472 
1473  /*
1474  * First case: The rank is too big, so we decide to uncompress the result
1475  */
1476  if ( (new_rank > rklimit) ||
1477  (new_rank == -1) )
1478  {
1479  pastix_lrblock_t Bbackup = *B;
1480 
1481  core_dlralloc( M, N, -1, B );
1482  u = B->u;
1483 
1484  /* Uncompress B */
1485  flops = FLOPS_DGEMM( M, N, Bbackup.rk );
1486  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
1487  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
1488  M, N, Bbackup.rk,
1489  (done), Bbackup.u, ldbu,
1490  Bbackup.v, ldbv,
1491  (dzero), u, M );
1492  kernel_trace_stop_lvl2( flops );
1493  total_flops += flops;
1494 
1495  /* Add A into it */
1496  if ( A->rk == -1 ) {
1497  flops = 2 * M1 * N1;
1498  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
1499  core_dgeadd( transA1, M1, N1,
1500  alpha, A->u, ldau,
1501  done, u + offy * M + offx, M);
1502  kernel_trace_stop_lvl2( flops );
1503  }
1504  else {
1505  flops = FLOPS_DGEMM( M1, N1, A->rk );
1506  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
1507  cblas_dgemm(CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transA1,
1508  M1, N1, A->rk,
1509  (alpha), A->u, ldau,
1510  A->v, ldav,
1511  (done), u + offy * M + offx, M);
1512  kernel_trace_stop_lvl2( flops );
1513  }
1514  total_flops += flops;
1515  core_dlrfree(&Bbackup);
1516  free( zbuf );
1517  free( jpvt );
1518 #if defined(PASTIX_DEBUG_LR)
1519  free( u1u2 );
1520  free( v1v2 );
1521  free( tauV );
1522  free( zwork );
1523  free( rwork );
1524 #endif
1525  return total_flops;
1526  }
1527  else if ( new_rank == 0 ) {
1528  core_dlrfree(B);
1529  free( zbuf );
1530  free( jpvt );
1531 #if defined(PASTIX_DEBUG_LR)
1532  free( u1u2 );
1533  free( v1v2 );
1534  free( tauV );
1535  free( zwork );
1536  free( rwork );
1537 #endif
1538  return total_flops;
1539  }
1540 
1541  /*
1542  * We need to reallocate the buffer to store the new compressed version of B
1543  * because it wasn't big enough
1544  */
1545  ret = core_dlrsze( 0, M, N, B, new_rank, -1, -1 );
1546  assert( ret != -1 );
1547  assert( B->rkmax >= new_rank );
1548  assert( B->rkmax >= B->rk );
1549 
1550  ldbv = B->rkmax;
1551 
1552  /* B->v = P v1v2 */
1553  {
1554  double *tmpV;
1555  pastix_int_t lm;
1556 
1557  memset(B->v, 0, N * ldbv * sizeof(double));
1558  tmpV = B->v;
1559  for (i=0; i<N; i++){
1560  lm = pastix_imin( new_rank, i+1 );
1561  memcpy(tmpV + jpvt[i] * ldbv,
1562  v1v2 + i * ldv,
1563  lm * sizeof(double));
1564  }
1565  }
1566 
1567  /* Compute Q2 factor */
1568  {
1569  flops = FLOPS_DORGQR( rank, new_rank, new_rank )
1570  + FLOPS_DGEMM( M, new_rank, rank );
1571 
1572  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_computeNewU );
1573  ret = LAPACKE_dorgqr_work( LAPACK_COL_MAJOR, rank, new_rank, new_rank,
1574  v1v2, ldv, tauV, zwork, lwork );
1575  assert(ret == 0);
1576 
1577  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
1578  M, new_rank, rank,
1579  (done), u1u2, ldu,
1580  v1v2, ldv,
1581  (dzero), B->u, ldbu);
1582  kernel_trace_stop_lvl2( flops );
1583  total_flops += flops;
1584  }
1585 
1586  free( zbuf );
1587  free( jpvt );
1588 
1589 #if defined(PASTIX_DEBUG_LR)
1590  free( u1u2 );
1591  free( v1v2 );
1592  free( tauV );
1593  free( zwork );
1594  free( rwork );
1595 #endif
1596 
1597  (void)ret;
1598  return total_flops;
1599 }
1600 
1601 /**
1602  *******************************************************************************
1603  *
1604  * @brief Compute the size of a block to send in LR
1605  *
1606  *******************************************************************************
1607  *
1608  * @param[in] M
1609  * The number of rows of the matrix A.
1610  *
1611  * @param[in] N
1612  * The number of columns of the matrix A.
1613  *
1614  * @param[in] A
1615  * The low-rank representation of the matrix A.
1616  *
1617  *******************************************************************************
1618  *
1619  * @return Size of a block to send in LR
1620  *
1621  *******************************************************************************/
1622 size_t
1624  pastix_int_t N,
1625  pastix_lrblock_t *A )
1626 {
1627  if ( A->rk != -1 ) {
1628  return A->rk * ( M + N );
1629  }
1630  else {
1631  return M * N;
1632  }
1633 }
1634 
1635 /**
1636  *******************************************************************************
1637  *
1638  * @brief Pack low-rank data by side
1639  *
1640  *******************************************************************************
1641  *
1642  * @param[in] M
1643  * The number of rows of the matrix A.
1644  *
1645  * @param[in] N
1646  * The number of columns of the matrix A.
1647  *
1648  * @param[in] A
1649  * The low-rank representation of the matrix A.
1650  *
1651  * @param[inout] buffer
1652  * Pointer on packed data
1653  *
1654  *******************************************************************************
1655  *
1656  * @return Pointer on packed data shifted to the next block
1657  *
1658  *******************************************************************************/
1659 char *
1661  pastix_int_t N,
1662  const pastix_lrblock_t *A,
1663  char *buffer )
1664 {
1665  int rk = A->rk;
1666  int rkmax = A->rkmax;
1667  void *u = A->u;
1668  void *v = A->v;
1669  int ret;
1670 
1671  /* Store the rank */
1672  memcpy( buffer, &rk, sizeof( int ) );
1673  buffer += sizeof( int );
1674 
1675  if ( rk != -1 ) {
1676  /* Pack the u part */
1677  memcpy( buffer, u, rk * M * sizeof( double ) );
1678  buffer += rk * M * sizeof( double );
1679 
1680  /* Pack the v part */
1681  if ( rk == rkmax ) {
1682  memcpy( buffer, v, rk * N * sizeof( double ) );
1683  buffer += rk * N * sizeof( double );
1684  }
1685  else {
1686  ret = LAPACKE_dlacpy_work( LAPACK_COL_MAJOR, 'A', rk, N, v, rkmax,
1687  (double *)buffer, rk );
1688  assert( ret == 0 );
1689  buffer += rk * N * sizeof( double );
1690  }
1691  }
1692  else {
1693  memcpy( buffer, u, M * N * sizeof( double ) );
1694  buffer += M * N * sizeof( double );
1695  }
1696 
1697  (void)ret;
1698 
1699  return buffer;
1700 }
1701 
1702 /**
1703  *******************************************************************************
1704  *
1705  * @brief Unpack low rank data and fill the cblk concerned by the computation
1706  *
1707  *******************************************************************************
1708  *
1709  * @param[in] M
1710  * The number of rows of the matrix A.
1711  *
1712  * @param[in] N
1713  * The number of columns of the matrix A.
1714  *
1715  * @param[in] A
1716  * The low-rank representation of the matrix A.
1717  *
1718  * @param[inout] buffer
1719  * Pointer on packed data
1720  *
1721  *******************************************************************************
1722  *
1723  * @return Pointer on packed data shifted to the next block
1724  *
1725  *******************************************************************************/
1726 char *
1728  pastix_int_t N,
1729  pastix_lrblock_t *A,
1730  char *buffer )
1731 {
1732  int rk;
1733  memcpy( &rk, buffer, sizeof( int ) );
1734  buffer += sizeof( int );
1735 
1736  /* Make sure A can store the unpacked values */
1737  core_dlrsze( 0, M, N, A, rk, rk, rk );
1738 
1739  if ( rk != -1 ) {
1740  /* Unpack U */
1741  memcpy( A->u, buffer, M * rk * sizeof( double ) );
1742  buffer += M * rk * sizeof( double );
1743 
1744  /* Unpack V */
1745  memcpy( A->v, buffer, N * rk * sizeof( double ) );
1746  buffer += N * rk * sizeof( double );
1747  }
1748  else {
1749  /* Unpack the full block */
1750  memcpy( A->u, buffer, M * N * sizeof( double ) );
1751  buffer += M * N * sizeof( double );
1752  }
1753 
1754  return buffer;
1755 }
1756 
1757 /**
1758  *******************************************************************************
1759  *
1760  * @brief Unpack low rank data and fill the cblk concerned by the computation
1761  *
1762  *******************************************************************************
1763  *
1764  * @param[in] M
1765  * The number of rows of the matrix A.
1766  *
1767  * @param[in] N
1768  * The number of columns of the matrix A.
1769  *
1770  * @param[in] A
1771  * The low-rank representation of the matrix A.
1772  *
1773  * @param[inout] input
1774  * TODO
1775  *
1776  * @param[inout] outptr
1777  * TODO
1778  *
1779  *******************************************************************************
1780  *
1781  * @return Pointer on packed data shifted to the next block
1782  *
1783  *******************************************************************************/
1784 const char *
1786  pastix_int_t N,
1787  pastix_lrblock_t *A,
1788  const char *input,
1789  char **outptr )
1790 {
1791  char *output = *outptr;
1792  size_t size;
1793  int rk;
1794 
1795  rk = *((int *)input);
1796  input += sizeof( int );
1797 
1798  if ( rk != -1 ) {
1799  A->rk = rk;
1800  A->rkmax = rk;
1801 
1802  /* Unpack U */
1803  size = M * rk * sizeof( double );
1804  A->u = output;
1805 
1806  memcpy( A->u, input, size );
1807  input += size;
1808  output += size;
1809 
1810  /* Unpack V */
1811  size = N * rk * sizeof( double );
1812  A->v = output;
1813 
1814  memcpy( A->v, input, size );
1815  input += size;
1816  output += size;
1817  }
1818  else {
1819  A->rk = -1;
1820  A->rkmax = M;
1821  A->v = NULL;
1822 
1823  /* Unpack the full block */
1824  size = M * N * sizeof( double );
1825  A->u = output;
1826 
1827  memcpy( A->u, input, size );
1828  input += size;
1829  output += size;
1830  }
1831 
1832  *outptr = output;
1833  return input;
1834 }
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
@ PastixKernelLvl2_LR_add2C_rradd_orthogonalize
void core_dgetmo(int m, int n, const double *A, int lda, double *B, int ldb)
Transposes a m-by-n matrix out of place using an extra workspace of size m-by-n.
Definition: core_dgetmo.c:49
pastix_fixdbl_t core_dlrorthu_fullqr(pastix_int_t M, pastix_int_t N, pastix_int_t rank, double *U, pastix_int_t ldu, double *V, pastix_int_t ldv)
Try to orthognalize the u part of the low-rank form, and update the v part accordingly using full QR.
Definition: core_dlrothu.c:83
int core_dgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, double alpha, const double *A, pastix_int_t LDA, double beta, double *B, pastix_int_t LDB)
Add two matrices together.
Definition: core_dgeadd.c:78
pastix_fixdbl_t core_dlrorthu_partialqr(pastix_int_t M, pastix_int_t N, pastix_int_t r1, pastix_int_t *r2ptr, pastix_int_t offx, pastix_int_t offy, double *U, pastix_int_t ldu, double *V, pastix_int_t ldv)
Try to orthognalize the U part of the low-rank form, and update the V part accordingly using partial ...
Definition: core_dlrothu.c:197
pastix_fixdbl_t core_dlrorthu_cgs(pastix_int_t M1, pastix_int_t N1, pastix_int_t M2, pastix_int_t N2, pastix_int_t r1, pastix_int_t *r2ptr, pastix_int_t offx, pastix_int_t offy, double *U, pastix_int_t ldu, double *V, pastix_int_t ldv)
Try to orthognalize the U part of the low-rank form, and update the V part accordingly using CGS.
Definition: core_dlrothu.c:430
int core_dlrdbg_check_orthogonality(pastix_int_t M, pastix_int_t N, const double *A, pastix_int_t lda)
Check the orthogonality of the matrix A.
Definition: core_dlrdbg.c:101
double tolerance
fct_ge2lr_t core_ge2lr
pastix_int_t pastix_lr_ortho
Define the orthogonalization method.
Definition: lowrank.c:25
pastix_int_t(* core_get_rklimit)(pastix_int_t, pastix_int_t)
Compute the maximal rank accepted for a given matrix size. The pointer is set according to the low-ra...
Definition: kernels_trace.c:46
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
pastix_fixdbl_t core_dge2lr_qrrt(core_drrqr_rt_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
int(* core_drrqr_cp_t)(double tol, pastix_int_t maxrank, int refine, pastix_int_t nb, pastix_int_t m, pastix_int_t n, double *A, pastix_int_t lda, pastix_int_t *jpvt, double *tau, double *work, pastix_int_t lwork, double *rwork)
TODO.
pastix_fixdbl_t core_dge2lr_qrcp(core_drrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
int(* core_drrqr_rt_t)(double tol, pastix_int_t maxrank, pastix_int_t nb, pastix_int_t m, pastix_int_t n, double *A, pastix_int_t lda, double *tau, double *B, pastix_int_t ldb, double *tau_b, double *work, pastix_int_t lwork, double normA)
TODO.
pastix_fixdbl_t core_drradd_qr(core_drrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
void core_dlrcpy(const pastix_lr_t *lowrank, pastix_trans_t transAv, double alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Copy a small low-rank structure into a large one.
int core_dlr2ge(pastix_trans_t trans, pastix_int_t m, pastix_int_t n, const pastix_lrblock_t *Alr, double *A, pastix_int_t lda)
Convert a low rank matrix into a dense matrix.
void core_dlrconcatenate_u(double alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_lrblock_t *B, pastix_int_t offx, double *u1u2)
Concatenate left parts of two low-rank matrices.
char * core_dlrpack(pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A, char *buffer)
Pack low-rank data by side.
void core_dlralloc(pastix_int_t M, pastix_int_t N, pastix_int_t rkmax, pastix_lrblock_t *A)
Allocate a low-rank matrix.
Definition: core_dgelrops.c:56
const char * core_dlrunpack2(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, const char *input, char **outptr)
Unpack low rank data and fill the cblk concerned by the computation.
void core_dlrconcatenate_v(pastix_trans_t transA1, double alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offy, double *v1v2)
Concatenate right parts of two low-rank matrices.
double core_dlrnrm(pastix_normtype_t ntype, int transV, pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A)
Compute the norm of a low-rank matrix.
Definition: core_dlrnrm.c:48
size_t core_dlrgetsize(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A)
Compute the size of a block to send in LR.
void core_dlrfree(pastix_lrblock_t *A)
Free a low-rank matrix.
int core_dlrsze(int copy, pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, pastix_int_t newrk, pastix_int_t newrkmax, pastix_int_t rklimit)
Resize a low-rank matrix.
char * core_dlrunpack(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, char *buffer)
Unpack low rank data and fill the cblk concerned by the computation.
enum pastix_trans_e pastix_trans_t
Transpostion.
@ PastixFrobeniusNorm
Definition: api.h:504
@ PastixCompressOrthoQR
Definition: api.h:408
@ PastixCompressOrthoPartialQR
Definition: api.h:409
@ PastixCompressOrthoCGS
Definition: api.h:407
@ PastixLeft
Definition: api.h:495
@ PastixNoTrans
Definition: api.h:445