PaStiX Handbook  6.2.1
core_zgelrops.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_zgelrops.c
4  *
5  * PaStiX low-rank kernel routines
6  *
7  * @copyright 2016-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.2.1
11  * @author Gregoire Pichon
12  * @author Esragul Korkmaz
13  * @author Mathieu Faverge
14  * @author Pierre Ramet
15  * @author Nolan Bredel
16  * @date 2021-06-29
17  * @generated from /builds/solverstack/pastix/kernels/core_zgelrops.c, normal z -> z, Tue Apr 12 09:38:40 2022
18  *
19  **/
20 #include "common.h"
21 #include <cblas.h>
22 #include <lapacke.h>
23 #include "blend/solver.h"
24 #include "pastix_zcores.h"
25 #include "pastix_zlrcores.h"
26 #include "z_nan_check.h"
27 #include "kernels_trace.h"
28 
29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
30 static pastix_complex64_t zone = 1.0;
31 static pastix_complex64_t zzero = 0.0;
32 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
33 
34 /**
35  *******************************************************************************
36  *
37  * @brief Allocate a low-rank matrix.
38  *
39  *******************************************************************************
40  *
41  * @param[in] M
42  * Number of rows of the matrix A.
43  *
44  * @param[in] N
45  * Number of columns of the matrix A.
46  *
47  * @param[in] rkmax
48  * @arg -1: the matrix is allocated tight to its rank.
49  * @arg >0: the matrix is allocated to the minimum of rkmax and its maximum rank.
50  *
51  * @param[out] A
52  * The allocated low-rank matrix
53  *
54  *******************************************************************************/
55 void
56 core_zlralloc( pastix_int_t M,
57  pastix_int_t N,
58  pastix_int_t rkmax,
59  pastix_lrblock_t *A )
60 {
61  pastix_complex64_t *u, *v;
62 
63  if ( rkmax == -1 ) {
64  u = malloc( M * N * sizeof(pastix_complex64_t) );
65  memset( u, 0, M * N * sizeof(pastix_complex64_t) );
66  A->rk = -1;
67  A->rkmax = M;
68  A->u = u;
69  A->v = NULL;
70  }
71  else if ( rkmax == 0 ) {
72  A->rk = 0;
73  A->rkmax = 0;
74  A->u = NULL;
75  A->v = NULL;
76  }
77  else {
78  pastix_int_t rk = pastix_imin( M, N );
79  rkmax = pastix_imin( rkmax, rk );
80 
81 #if defined(PASTIX_DEBUG_LR)
82  u = malloc( M * rkmax * sizeof(pastix_complex64_t) );
83  v = malloc( N * rkmax * sizeof(pastix_complex64_t) );
84 
85  /* To avoid uninitialised values in valgrind. Lapacke doc (xgesvd) is not correct */
86  memset(u, 0, M * rkmax * sizeof(pastix_complex64_t));
87  memset(v, 0, N * rkmax * sizeof(pastix_complex64_t));
88 #else
89  u = malloc( (M+N) * rkmax * sizeof(pastix_complex64_t));
90 
91  /* To avoid uninitialised values in valgrind. Lapacke doc (xgesvd) is not correct */
92  memset(u, 0, (M+N) * rkmax * sizeof(pastix_complex64_t));
93 
94  v = u + M * rkmax;
95 #endif
96 
97  A->rk = 0;
98  A->rkmax = rkmax;
99  A->u = u;
100  A->v = v;
101  }
102 }
103 
104 /**
105  *******************************************************************************
106  *
107  * @brief Free a low-rank matrix.
108  *
109  *******************************************************************************
110  *
111  * @param[inout] A
112  * The low-rank matrix that will be desallocated.
113  *
114  *******************************************************************************/
115 void
117 {
118  if ( A->rk == -1 ) {
119  free(A->u);
120  A->u = NULL;
121  }
122  else {
123  free(A->u);
124 #if defined(PASTIX_DEBUG_LR)
125  free(A->v);
126 #endif
127  A->u = NULL;
128  A->v = NULL;
129  }
130  A->rk = 0;
131  A->rkmax = 0;
132 }
133 
134 /**
135  *******************************************************************************
136  *
137  * @brief Resize a low-rank matrix
138  *
139  *******************************************************************************
140  *
141  * @param[in] copy
142  * Enable/disable the copy of the data from A->u and A->v into the new
143  * low-rank representation.
144  *
145  * @param[in] M
146  * The number of rows of the matrix A.
147  *
148  * @param[in] N
149  * The number of columns of the matrix A.
150  *
151  * @param[inout] A
152  * The low-rank representation of the matrix. At exit, this structure
153  * is modified with the new low-rank representation of A, is the rank
154  * is small enough
155  *
156  * @param[in] newrk
157  * The new rank of the matrix A.
158  *
159  * @param[in] newrkmax
160  * The new maximum rank of the matrix A. Useful if the low-rank
161  * structure was allocated with more data than the rank.
162  *
163  * @param[in] rklimit
164  * The maximum rank to store the matrix in low-rank format. If
165  * -1, set to core_get_rklimit(M, N)
166  *
167  *******************************************************************************
168  *
169  * @return The new rank of A
170  *
171  *******************************************************************************/
172 int
173 core_zlrsze( int copy, pastix_int_t M, pastix_int_t N,
174  pastix_lrblock_t *A,
175  pastix_int_t newrk,
176  pastix_int_t newrkmax,
177  pastix_int_t rklimit )
178 {
179  /* If no limit on the rank is given, let's take min(M, N) */
180  rklimit = (rklimit == -1) ? core_get_rklimit( M, N ) : rklimit;
181 
182  /* If no extra memory allocated, let's fix rkmax to rk */
183  newrkmax = (newrkmax == -1) ? newrk : newrkmax;
184  newrkmax = pastix_imax( newrkmax, newrk );
185 
186  /*
187  * It is not interesting to compress, so we alloc space to store the full matrix
188  */
189  if ( (newrk > rklimit) || (newrk == -1) )
190  {
191  A->u = realloc( A->u, M * N * sizeof(pastix_complex64_t) );
192 #if defined(PASTIX_DEBUG_LR)
193  free(A->v);
194 #endif
195  A->v = NULL;
196  A->rk = -1;
197  A->rkmax = M;
198  return -1;
199  }
200  /*
201  * The rank is null, we free everything
202  */
203  else if (newrkmax == 0)
204  {
205  /*
206  * The rank is null, we free everything
207  */
208  free(A->u);
209 #if defined(PASTIX_DEBUG_LR)
210  free(A->v);
211 #endif
212  A->u = NULL;
213  A->v = NULL;
214  A->rkmax = newrkmax;
215  A->rk = newrk;
216  }
217  /*
218  * The rank is non null, we allocate the correct amount of space, and
219  * compress the stored information if necessary
220  */
221  else {
222  pastix_complex64_t *u, *v;
223  int ret;
224 
225  if ( ( A->rk == -1 ) ||
226  (( A->rk != -1 ) && (newrkmax != A->rkmax)) )
227  {
228 #if defined(PASTIX_DEBUG_LR)
229  u = malloc( M * newrkmax * sizeof(pastix_complex64_t) );
230  v = malloc( N * newrkmax * sizeof(pastix_complex64_t) );
231 #else
232  u = malloc( (M+N) * newrkmax * sizeof(pastix_complex64_t) );
233  v = u + M * newrkmax;
234 #endif
235  if ( copy ) {
236  assert( A->rk != -1 );
237  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', M, newrk,
238  A->u, M, u, M );
239  assert(ret == 0);
240  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', newrk, N,
241  A->v, A->rkmax, v, newrkmax );
242  assert(ret == 0);
243  }
244  free(A->u);
245 #if defined(PASTIX_DEBUG_LR)
246  free(A->v);
247 #endif
248  A->u = u;
249  A->v = v;
250  }
251 
252  /* Update rk and rkmax */
253  A->rkmax = newrkmax;
254  A->rk = newrk;
255 
256  (void)ret;
257  }
258  assert( A->rk <= A->rkmax);
259  return 0;
260 }
261 
262 /**
263  *******************************************************************************
264  *
265  * @brief Convert a low rank matrix into a dense matrix.
266  *
267  * Convert a low-rank matrix of size m-by-n into a full rank matrix.
268  * A = op( u * v^t ) with op(A) = A or A^t
269  *
270  *******************************************************************************
271  *
272  * @param[in] trans
273  * @arg PastixNoTrans: returns A = u * v^t
274  * @arg PastixTrans: returns A = v * u^t
275  *
276  * @param[in] m
277  * Number of rows of the low-rank matrix Alr.
278  *
279  * @param[in] n
280  * Number of columns of the low-rank matrix Alr.
281  *
282  * @param[in] Alr
283  * The low rank matrix to be converted into a full rank matrix
284  *
285  * @param[inout] A
286  * The matrix of dimension lda-by-k in which to store the uncompressed
287  * version of Alr. k = n if trans == PastixNoTrans, m otherwise.
288  *
289  * @param[in] lda
290  * The leading dimension of the matrix A. lda >= max(1, m) if trans ==
291  * PastixNoTrans, lda >= max(1,n) otherwise.
292  *
293  *******************************************************************************
294  *
295  * @retval 0 in case of success.
296  * @retval -i if the ith parameter is incorrect.
297  *
298  *******************************************************************************/
299 int
300 core_zlr2ge( pastix_trans_t trans, pastix_int_t m, pastix_int_t n,
301  const pastix_lrblock_t *Alr,
302  pastix_complex64_t *A, pastix_int_t lda )
303 {
304  int ret = 0;
305 
306 #if !defined(NDEBUG)
307  if ( m < 0 ) {
308  return -1;
309  }
310  if ( n < 0 ) {
311  return -2;
312  }
313  if (Alr == NULL || Alr->rk > Alr->rkmax) {
314  return -3;
315  }
316  if ( (trans == PastixNoTrans && lda < m) ||
317  (trans != PastixNoTrans && lda < n) )
318  {
319  return -5;
320  }
321  if ( Alr->rk == -1 ) {
322  if (Alr->u == NULL || Alr->v != NULL || (Alr->rkmax < m))
323  {
324  return -6;
325  }
326  }
327  else if ( Alr->rk != 0){
328  if (Alr->u == NULL || Alr->v == NULL) {
329  return -6;
330  }
331  }
332 #endif
333 
334  if ( trans == PastixNoTrans ) {
335  if ( Alr->rk == -1 ) {
336  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
337  Alr->u, Alr->rkmax, A, lda );
338  assert( ret == 0 );
339  }
340  else if ( Alr->rk == 0 ) {
341  ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A', m, n,
342  0.0, 0.0, A, lda );
343  assert( ret == 0 );
344  }
345  else {
346  cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
347  m, n, Alr->rk,
348  CBLAS_SADDR(zone), Alr->u, m,
349  Alr->v, Alr->rkmax,
350  CBLAS_SADDR(zzero), A, lda);
351  }
352  }
353  else {
354  if ( Alr->rk == -1 ) {
355  core_zgetmo( m, n, Alr->u, Alr->rkmax, A, lda );
356  }
357  else if ( Alr->rk == 0 ) {
358  ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A', n, m,
359  0.0, 0.0, A, lda );
360  assert( ret == 0 );
361  }
362  else {
363  cblas_zgemm(CblasColMajor, CblasTrans, CblasTrans,
364  n, m, Alr->rk,
365  CBLAS_SADDR(zone), Alr->v, Alr->rkmax,
366  Alr->u, m,
367  CBLAS_SADDR(zzero), A, lda);
368  }
369  }
370 
371  return ret;
372 }
373 
374 /**
375  *******************************************************************************
376  *
377  * @brief Copy a small low-rank structure into a large one
378  *
379  *******************************************************************************
380  *
381  * @param[in] lowrank
382  * The structure with low-rank parameters.
383  *
384  * @param[in] transAv
385  * @arg PastixNoTrans: (A.v)' is stored transposed as usual
386  * @arg PastixTrans: A.v is stored
387  * @arg PastixConjTrans: A.v is stored
388  *
389  * @param[in] alpha
390  * The multiplier parameter: B = B + alpha * A
391  *
392  * @param[in] M1
393  * The number of rows of the matrix A.
394  *
395  * @param[in] N1
396  * The number of columns of the matrix A.
397  *
398  * @param[in] A
399  * The low-rank representation of the matrix A.
400  *
401  * @param[in] M2
402  * The number of rows of the matrix B.
403  *
404  * @param[in] N2
405  * The number of columns of the matrix B.
406  *
407  * @param[inout] B
408  * The low-rank representation of the matrix B.
409  *
410  * @param[in] offx
411  * The horizontal offset of A with respect to B.
412  *
413  * @param[in] offy
414  * The vertical offset of A with respect to B.
415  *
416  *******************************************************************************/
417 void
418 core_zlrcpy( const pastix_lr_t *lowrank,
419  pastix_trans_t transAv, pastix_complex64_t alpha,
420  pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A,
421  pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B,
422  pastix_int_t offx, pastix_int_t offy )
423 {
424  pastix_complex64_t *u, *v;
425  pastix_int_t ldau, ldav;
426  int ret;
427 
428  assert( (M1 + offx) <= M2 );
429  assert( (N1 + offy) <= N2 );
430 
431  ldau = (A->rk == -1) ? A->rkmax : M1;
432  ldav = (transAv == PastixNoTrans) ? A->rkmax : N1;
433 
434  core_zlrfree( B );
435  core_zlralloc( M2, N2, A->rk, B );
436  u = B->u;
437  v = B->v;
438 
439  B->rk = A->rk;
440 
441  if ( A->rk == -1 ) {
442  if ( (M1 != M2) || (N1 != N2) ) {
443  LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A', M2, N2,
444  0.0, 0.0, u, M2 );
445  }
446  ret = core_zgeadd( PastixNoTrans, M1, N1,
447  alpha, A->u, ldau,
448  0.0, u + M2 * offy + offx, M2 );
449  assert(ret == 0);
450  }
451  else {
452  if ( M1 != M2 ) {
453  LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A', M2, B->rk,
454  0.0, 0.0, u, M2 );
455  }
456  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', M1, A->rk,
457  A->u, ldau,
458  u + offx, M2 );
459  assert(ret == 0);
460 
461  if ( N1 != N2 ) {
462  LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A', B->rk, N2,
463  0.0, 0.0, v, B->rkmax );
464  }
465  ret = core_zgeadd( transAv, A->rk, N1,
466  alpha, A->v, ldav,
467  0.0, v + B->rkmax * offy, B->rkmax );
468  assert(ret == 0);
469  }
470 
471 #if 0
472  {
473  pastix_complex64_t *work = malloc( M2 * N2 * sizeof(pastix_complex64_t) );
474 
475  core_zlr2ge( PastixNoTrans, M2, N2, B, work, M2 );
476 
477  lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, -1, M2, N2, work, M2, B );
478 
479  free(work);
480  }
481 #endif
482 
483  (void) lowrank;
484  (void) ret;
485 }
486 
487 
488 /**
489  *******************************************************************************
490  *
491  * @brief Concatenate left parts of two low-rank matrices
492  *
493  *******************************************************************************
494  *
495  * @param[in] alpha
496  * alpha * A is add to B
497  *
498  * @param[in] M1
499  * The number of rows of the matrix A.
500  *
501  * @param[in] N1
502  * The number of columns of the matrix A.
503  *
504  * @param[in] A
505  * The low-rank representation of the matrix A.
506  *
507  * @param[in] M2
508  * The number of rows of the matrix B.
509  *
510  * @param[in] B
511  * The low-rank representation of the matrix B.
512  *
513  * @param[in] offx
514  * The horizontal offset of A with respect to B.
515  *
516  * @param[inout] u1u2
517  * The workspace where matrices are concatenated
518  *
519  *******************************************************************************/
520 void
521 core_zlrconcatenate_u( pastix_complex64_t alpha,
522  pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A,
523  pastix_int_t M2, pastix_lrblock_t *B,
524  pastix_int_t offx,
525  pastix_complex64_t *u1u2 )
526 {
527  pastix_complex64_t *tmp;
528  pastix_int_t i, ret, rank;
529  pastix_int_t ldau, ldbu;
530 
531  rank = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
532  rank += B->rk;
533 
534  ldau = (A->rk == -1) ? A->rkmax : M1;
535  ldbu = M2;
536 
537  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', M2, B->rk,
538  B->u, ldbu, u1u2, M2 );
539  assert(ret == 0);
540 
541  tmp = u1u2 + B->rk * M2;
542  if ( A->rk == -1 ) {
543  /*
544  * A is full of rank M1, so A will be integrated into v1v2
545  */
546  if ( M1 < N1 ) {
547  if ( M1 != M2 ) {
548  /* Set to 0 */
549  memset(tmp, 0, M2 * M1 * sizeof(pastix_complex64_t));
550 
551  /* Set diagonal */
552  tmp += offx;
553  for (i=0; i<M1; i++, tmp += M2+1) {
554  *tmp = 1.0;
555  }
556  }
557  else {
558  assert( offx == 0 );
559  ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A', M2, M1,
560  0.0, 1.0, tmp, M2 );
561  assert( ret == 0 );
562  }
563  }
564  else {
565  /*
566  * A is full of rank N1, so A is integrated into u1u2
567  */
568  if ( M1 != M2 ) {
569  memset(tmp, 0, M2 * N1 * sizeof(pastix_complex64_t));
570  }
571  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', M1, N1,
572  A->u, ldau, tmp + offx, M2 );
573  assert(ret == 0);
574  }
575  }
576  /*
577  * A is low rank of rank A->rk
578  */
579  else {
580  if ( M1 != M2 ) {
581  memset(tmp, 0, M2 * A->rk * sizeof(pastix_complex64_t));
582  }
583  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', M1, A->rk,
584  A->u, ldau, tmp + offx, M2 );
585  assert(ret == 0);
586  }
587  (void) ret;
588  (void) alpha;
589 }
590 
591 
592 /**
593  *******************************************************************************
594  *
595  * @brief Concatenate right parts of two low-rank matrices
596  *
597  *******************************************************************************
598  *
599  * @param[in] transA1
600  * @arg PastixNoTrans: No transpose, op( A ) = A;
601  * @arg PastixTrans: Transpose, op( A ) = A';
602  *
603  * @param[in] alpha
604  * alpha * A is add to B
605  *
606  * @param[in] M1
607  * The number of rows of the matrix A.
608  *
609  * @param[in] N1
610  * The number of columns of the matrix A.
611  *
612  * @param[in] A
613  * The low-rank representation of the matrix A.
614  *
615  * @param[in] N2
616  * The number of columns of the matrix B.
617  *
618  * @param[in] B
619  * The low-rank representation of the matrix B.
620  *
621  * @param[in] offy
622  * The vertical offset of A with respect to B.
623  *
624  * @param[inout] v1v2
625  * The workspace where matrices are concatenated
626  *
627  *******************************************************************************/
628 void
629 core_zlrconcatenate_v( pastix_trans_t transA1, pastix_complex64_t alpha,
630  pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A,
631  pastix_int_t N2, pastix_lrblock_t *B,
632  pastix_int_t offy,
633  pastix_complex64_t *v1v2 )
634 {
635  pastix_complex64_t *tmp;
636  pastix_int_t i, ret, rank;
637  pastix_int_t ldau, ldav, ldbv;
638 
639  rank = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
640  rank += B->rk;
641 
642  ldau = (A->rk == -1) ? A->rkmax : M1;
643  ldav = (transA1 == PastixNoTrans) ? A->rkmax : N1;
644  ldbv = B->rkmax;
645 
646  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', B->rk, N2,
647  B->v, ldbv, v1v2, rank );
648  assert(ret == 0);
649 
650  tmp = v1v2 + B->rk;
651  if ( A->rk == -1 ) {
652  assert( transA1 == PastixNoTrans );
653  /*
654  * A is full of rank M1, so it is integrated into v1v2
655  */
656  if ( M1 < N1 ) {
657  if (N1 != N2) {
658  ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A', M1, N2,
659  0.0, 0.0, tmp, rank );
660  assert( ret == 0 );
661  }
662  core_zgeadd( PastixNoTrans, M1, N1,
663  alpha, A->u, ldau,
664  0.0, tmp + offy * rank, rank );
665  }
666  /*
667  * A is full of rank N1, so it has been integrated into u1u2
668  */
669  else {
670  if (N1 != N2) {
671  /* Set to 0 */
672  ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A', N1, N2,
673  0.0, 0.0, tmp, rank );
674  assert(ret == 0);
675 
676  /* Set diagonal */
677  tmp += offy * rank;
678  for (i=0; i<N1; i++, tmp += rank+1) {
679  *tmp = alpha;
680  }
681  }
682  else {
683  assert( offy == 0 );
684  ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A', N1, N2,
685  0.0, alpha, tmp + offy * rank, rank );
686  assert( ret == 0 );
687  }
688  }
689  }
690  /*
691  * A is low rank of rank A->rk
692  */
693  else {
694  if (N1 != N2) {
695  ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A', A->rk, N2,
696  0.0, 0.0, tmp, rank );
697  assert(ret == 0);
698  }
699  core_zgeadd( transA1, A->rk, N1,
700  alpha, A->v, ldav,
701  0.0, tmp + offy * rank, rank );
702  }
703  (void) ret;
704  (void) alpha;
705 }
706 
707 /**
708  *******************************************************************************
709  *
710  * @brief Template to convert a full rank matrix into a low rank matrix through
711  * QR decompositions
712  *
713  * This version is only used when permutation method is used. Only difference
714  * from core_zge2lr_qrrt is the V calculation part, That is: Instead of applying
715  * inverse rotation on V, here inverse permutation is applied
716  *
717  *******************************************************************************
718  *
719  * @param[in] rrqrfct
720  * QR decomposition function used to compute the rank revealing
721  * factorization and create the low-rank form of A.
722  *
723  * @param[in] tol
724  * The tolerance used as a criterion to eliminate information from the
725  * full rank matrix
726  *
727  * @param[in] rklimit
728  * The maximum rank to store the matrix in low-rank format. If
729  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
730  *
731  * @param[in] m
732  * Number of rows of the matrix A, and of the low rank matrix Alr.
733  *
734  * @param[in] n
735  * Number of columns of the matrix A, and of the low rank matrix Alr.
736  *
737  * @param[in] A
738  * The matrix of dimension lda-by-n that needs to be compressed
739  *
740  * @param[in] lda
741  * The leading dimension of the matrix A. lda >= max(1, m)
742  *
743  * @param[out] Alr
744  * The low rank matrix structure that will store the low rank
745  * representation of A
746  *
747  *******************************************************************************/
748 pastix_fixdbl_t
749 core_zge2lr_qrcp( core_zrrqr_cp_t rrqrfct,
750  int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit,
751  pastix_int_t m, pastix_int_t n,
752  const void *Avoid, pastix_int_t lda,
753  pastix_lrblock_t *Alr )
754 {
755  int ret, newrk;
756  pastix_int_t nb = 32;
757  pastix_complex64_t *A = (pastix_complex64_t*)Avoid;
758  pastix_complex64_t *Acpy;
759  pastix_int_t lwork;
760  pastix_complex64_t *work, *tau, zzsize;
761  double *rwork;
762  pastix_int_t *jpvt;
763  pastix_int_t zsize, rsize;
764  pastix_complex64_t *zwork;
765  pastix_fixdbl_t flops;
766 
767  double norm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'f', m, n,
768  A, lda, NULL );
769 
770  if ( (norm == 0.) && (tol >= 0.)) {
771  core_zlralloc( m, n, 0, Alr );
772  return 0. ;
773  }
774 
775  /* work */
776  rklimit = ( rklimit < 0 ) ? core_get_rklimit( m, n ) : rklimit;
777  if ( tol < 0. ) {
778  tol = -1.;
779  }
780  else if ( use_reltol ) {
781  tol = tol * norm;
782  }
783 
784  ret = rrqrfct( tol, rklimit, 0, nb,
785  m, n, NULL, m,
786  NULL, NULL,
787  &zzsize, -1, NULL );
788 
789  lwork = (pastix_int_t)zzsize;
790  zsize = lwork;
791  /* Acpy */
792  zsize += m * n;
793  /* tau */
794  zsize += n;
795  /* rwork */
796  rsize = 2 * n;
797 
798 #if defined(PASTIX_DEBUG_LR)
799  zwork = NULL;
800  Acpy = malloc( m * n * sizeof(pastix_complex64_t) );
801  tau = malloc( n * sizeof(pastix_complex64_t) );
802  work = malloc( lwork * sizeof(pastix_complex64_t) );
803  rwork = malloc( rsize * sizeof(double) );
804 #else
805  zwork = malloc( zsize * sizeof(pastix_complex64_t) + rsize * sizeof(double) );
806  Acpy = zwork;
807  tau = Acpy + m * n;
808  work = tau + n;
809  rwork = (double*)(work + lwork);
810 #endif
811 
812  jpvt = malloc( n * sizeof(pastix_int_t) );
813 
814  /**
815  * Backup A into Acpy to try to compress
816  */
817  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
818  A, lda, Acpy, m );
819  assert(ret == 0);
820 
821  newrk = rrqrfct( tol, rklimit, 0, nb,
822  m, n, Acpy, m,
823  jpvt, tau,
824  work, lwork, rwork );
825  if (newrk == -1) {
826  flops = FLOPS_ZGEQRF( m, n );
827  }
828  else {
829  flops = FLOPS_ZGEQRF( m, newrk ) + FLOPS_ZUNMQR( m, n-newrk, newrk, PastixLeft );
830  }
831 
832  /**
833  * It was not interesting to compress, so we restore the dense version in Alr
834  */
835  core_zlralloc( m, n, newrk, Alr );
836  Alr->rk = newrk;
837 
838  if ( newrk == -1 ) {
839  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
840  A, lda, Alr->u, Alr->rkmax );
841  assert(ret == 0);
842  }
843  else if ( newrk > 0 ) {
844  /**
845  * We compute U and V
846  */
847  pastix_int_t i;
848  pastix_complex64_t *U, *V;
849 
850  U = Alr->u;
851  V = Alr->v;
852 
853  /* Compute the final U form */
854  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', m, Alr->rk,
855  Acpy, m, U, m );
856  assert(ret == 0);
857 
858  ret = LAPACKE_zungqr_work( LAPACK_COL_MAJOR, m, Alr->rk, Alr->rk,
859  U, m, tau, work, lwork );
860  assert(ret == 0);
861  flops += FLOPS_ZUNGQR( m, Alr->rk, Alr->rk );
862 
863  /* Compute the final V form */
864  ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'L', Alr->rk-1, Alr->rk-1,
865  0.0, 0.0, Acpy + 1, m );
866 
867  for (i=0; i<n; i++){
868  memcpy( V + jpvt[i] * Alr->rk,
869  Acpy + i * m,
870  Alr->rk * sizeof(pastix_complex64_t) );
871  }
872  }
873 
874 #if defined(PASTIX_DEBUG_LR)
875  if ( Alr->rk > 0 ) {
876  int rc = core_zlrdbg_check_orthogonality( m, Alr->rk, Alr->u, m );
877  if (rc == 1) {
878  fprintf(stderr, "Failed to compress a matrix and generate an orthogonal u\n" );
879  }
880  }
881 #endif
882 
883  free( zwork );
884  free( jpvt );
885 #if defined(PASTIX_DEBUG_LR)
886  free( Acpy );
887  free( tau );
888  free( work );
889  free( rwork );
890 #endif
891  (void)ret;
892  return flops;
893 }
894 
895 /**
896  *******************************************************************************
897  *
898  * @brief Template to convert a full rank matrix into a low rank matrix through
899  * QR decompositions
900  *
901  * This version is only used when rotational method is used. Only difference
902  * from core_zge2lr_qr is the V calculation part, That is: Instead of applying
903  * inverse permutation on V, here inverse rotation is applied
904  *
905  *******************************************************************************
906  *
907  * @param[in] rrqrfct
908  * QR decomposition function used to compute the rank revealing
909  * factorization and create the low-rank form of A.
910  *
911  * @param[in] tol
912  * The tolerance used as a criterion to eliminate information from the
913  * full rank matrix
914  *
915  * @param[in] rklimit
916  * The maximum rank to store the matrix in low-rank format. If
917  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
918  *
919  * @param[in] m
920  * Number of rows of the matrix A, and of the low rank matrix Alr.
921  *
922  * @param[in] n
923  * Number of columns of the matrix A, and of the low rank matrix Alr.
924  *
925  * @param[in] A
926  * The matrix of dimension lda-by-n that needs to be compressed
927  *
928  * @param[in] lda
929  * The leading dimension of the matrix A. lda >= max(1, m)
930  *
931  * @param[out] Alr
932  * The low rank matrix structure that will store the low rank
933  * representation of A
934  *
935  *******************************************************************************/
936 pastix_fixdbl_t
937 core_zge2lr_qrrt( core_zrrqr_rt_t rrqrfct,
938  int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit,
939  pastix_int_t m, pastix_int_t n,
940  const void *Avoid, pastix_int_t lda,
941  pastix_lrblock_t *Alr )
942 {
943  int ret, newrk;
944  pastix_int_t nb = 32;
945  pastix_complex64_t *A = (pastix_complex64_t*)Avoid;
946  pastix_complex64_t *Acpy;
947  pastix_int_t lwork;
948  pastix_complex64_t *work, *tau, *B, *tau_b, zzsize;
949  pastix_int_t *jpvt;
950  pastix_int_t zsize, bsize;
951  pastix_complex64_t *zwork;
952  pastix_fixdbl_t flops;
953 
954  char trans;
955 #if defined(PRECISION_c) || defined(PRECISION_z)
956  trans = 'C';
957 #else
958  trans = 'T';
959 #endif
960 
961  double norm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'f', m, n,
962  A, lda, NULL );
963 
964  if ( (norm == 0.) && (tol >= 0.)) {
965  core_zlralloc( m, n, 0, Alr );
966  return 0. ;
967  }
968 
969  /* work */
970  rklimit = ( rklimit < 0 ) ? core_get_rklimit( m, n ) : rklimit;
971  if ( tol < 0. ) {
972  tol = -1.;
973  }
974  else if ( use_reltol ) {
975  tol = tol * norm;
976  }
977 
978  ret = rrqrfct( tol, rklimit, nb,
979  m, n,
980  NULL, m, NULL,
981  NULL, n, NULL,
982  &zzsize, -1, norm );
983 
984  lwork = (pastix_int_t)zzsize;
985  zsize = lwork;
986  bsize = n * rklimit;
987  /* Acpy */
988  zsize += m * n;
989  /* tau */
990  zsize += n;
991  /* B and tau_b */
992  zsize += bsize + n;
993 
994 #if defined(PASTIX_DEBUG_LR)
995  zwork = NULL;
996  Acpy = malloc( m * n * sizeof(pastix_complex64_t) );
997  tau = malloc( n * sizeof(pastix_complex64_t) );
998  B = malloc( bsize * sizeof(pastix_complex64_t) );
999  tau_b = malloc( n * sizeof(pastix_complex64_t) );
1000  work = malloc( lwork * sizeof(pastix_complex64_t) );
1001 #else
1002  zwork = malloc( zsize * sizeof(pastix_complex64_t) );
1003  Acpy = zwork;
1004  tau = Acpy + m * n;
1005  B = tau + n;
1006  tau_b = B + bsize;
1007  work = tau_b + n;
1008 #endif
1009 
1010  jpvt = malloc( n * sizeof(pastix_int_t) );
1011 
1012  /**
1013  * Backup A into Acpy to try to compress
1014  */
1015  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
1016  A, lda, Acpy, m );
1017  assert(ret == 0);
1018 
1019  newrk = rrqrfct( tol, rklimit, nb,
1020  m, n,
1021  Acpy, m, tau,
1022  B, n, tau_b,
1023  work, lwork, norm );
1024  if (newrk == -1) {
1025  flops = FLOPS_ZGEQRF( m, n );
1026  }
1027  else {
1028  flops = FLOPS_ZGEQRF( m, newrk ) + FLOPS_ZUNMQR( m, n-newrk, newrk, PastixLeft );
1029  }
1030 
1031  /**
1032  * It was not interesting to compress, so we restore the dense version in Alr
1033  */
1034  core_zlralloc( m, n, newrk, Alr );
1035  Alr->rk = newrk;
1036 
1037  if ( newrk == -1 ) {
1038  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
1039  A, lda, Alr->u, Alr->rkmax );
1040  assert(ret == 0);
1041  }
1042  else if ( newrk > 0 ) {
1043  /**
1044  * We compute U and V
1045  */
1046  pastix_complex64_t *U, *V;
1047  pastix_int_t d, rk = 0;
1048 
1049  U = Alr->u;
1050  V = Alr->v;
1051 
1052  /* Compute the final U form */
1053  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', m, Alr->rk,
1054  Acpy, m, U, m );
1055  assert(ret == 0);
1056 
1057  ret = LAPACKE_zungqr_work( LAPACK_COL_MAJOR, m, Alr->rk, Alr->rk,
1058  U, m, tau, work, lwork );
1059  assert(ret == 0);
1060  flops += FLOPS_ZUNGQR( m, Alr->rk, Alr->rk );
1061 
1062  /* Compute the final V form */
1063  ret = LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'U', Alr->rk, n,
1064  Acpy, m, V, Alr->rk );
1065  assert(ret == 0);
1066  ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'L', Alr->rk-1, Alr->rk-1,
1067  0.0, 0.0, V + 1, Alr->rk );
1068  assert(ret == 0);
1069  /*
1070  * Apply inverse rotations to V^T
1071  */
1072  {
1073  /*
1074  * Householders are applied in the reverse order of before
1075  */
1076  rk = (Alr->rk / nb) * nb;
1077  while( rk >= 0 ) {
1078  d = pastix_imin( nb, Alr->rk - rk );
1079  ret = LAPACKE_zunmqr_work( LAPACK_COL_MAJOR, 'R', trans,
1080  Alr->rk - rk, n - rk, d,
1081  B + rk * n + rk, n, tau_b + rk,
1082  V + rk * Alr->rk + rk, Alr->rk,
1083  work, lwork );
1084  assert(ret == 0);
1085  rk -= nb;
1086  }
1087  }
1088  }
1089 
1090 #if defined(PASTIX_DEBUG_LR)
1091  if ( Alr->rk > 0 ) {
1092  int rc = core_zlrdbg_check_orthogonality( m, Alr->rk, Alr->u, m );
1093  if (rc == 1) {
1094  fprintf(stderr, "Failed to compress a matrix and generate an orthogonal u\n" );
1095  }
1096  }
1097 #endif
1098 
1099  free( zwork );
1100  free( jpvt );
1101 #if defined(PASTIX_DEBUG_LR)
1102  free( Acpy );
1103  free( tau );
1104  free( B );
1105  free( tau_b );
1106  free( work );
1107 #endif
1108  (void)ret;
1109  return flops;
1110 }
1111 
1112 /**
1113  *******************************************************************************
1114  *
1115  * @brief Template to perform the addition of two low-rank structures with
1116  * compression kernel based on QR decomposition.
1117  *
1118  * Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
1119  *
1120  * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
1121  * Orthogonalize (u2 u1) = (u2, u1 - u2(u2^T u1)) * (I u2^T u1)
1122  * (0 I )
1123  * Compute Rank Revealing QR decomposition of (I u2^T u1) * (v2 v1)^T
1124  * (0 I )
1125  * Any QR rank revealing kernel can be used for the recompression of the V part.
1126  *
1127  *******************************************************************************
1128  *
1129  * @param[in] rrqrfct
1130  * QR decomposition function used to compute the rank revealing
1131  * factorization of the sum of the two low-rank matrices.
1132  *
1133  * @param[in] lowrank
1134  * The structure with low-rank parameters.
1135  *
1136  * @param[in] transA1
1137  * @arg PastixNoTrans: No transpose, op( A ) = A;
1138  * @arg PastixTrans: Transpose, op( A ) = A';
1139  *
1140  * @param[in] alpha
1141  * alpha * A is add to B
1142  *
1143  * @param[in] M1
1144  * The number of rows of the matrix A.
1145  *
1146  * @param[in] N1
1147  * The number of columns of the matrix A.
1148  *
1149  * @param[in] A
1150  * The low-rank representation of the matrix A.
1151  *
1152  * @param[in] M2
1153  * The number of rows of the matrix B.
1154  *
1155  * @param[in] N2
1156  * The number of columns of the matrix B.
1157  *
1158  * @param[in] B
1159  * The low-rank representation of the matrix B.
1160  *
1161  * @param[in] offx
1162  * The horizontal offset of A with respect to B.
1163  *
1164  * @param[in] offy
1165  * The vertical offset of A with respect to B.
1166  *
1167  *******************************************************************************
1168  *
1169  * @return The new rank of u2 v2^T or -1 if ranks are too large for
1170  * recompression
1171  *
1172  *******************************************************************************/
1173 pastix_fixdbl_t
1174 core_zrradd_qr( core_zrrqr_cp_t rrqrfct,
1175  const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr,
1176  pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A,
1177  pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B,
1178  pastix_int_t offx, pastix_int_t offy )
1179 {
1180  pastix_int_t rankA, rank, M, N, minV;
1181  pastix_int_t i, ret, new_rank, rklimit;
1182  pastix_int_t ldau, ldav, ldbu, ldbv, ldu, ldv;
1183  pastix_complex64_t *u1u2, *v1v2, *u;
1184  pastix_complex64_t *zbuf, *tauV;
1185  size_t wzsize;
1186  double tol = lowrank->tolerance;
1187 
1188  /* PQRCP parameters / workspace */
1189  pastix_int_t nb = 32;
1190  pastix_int_t lwork;
1191  pastix_int_t *jpvt;
1192  pastix_complex64_t *zwork, zzsize;
1193  double *rwork;
1194  pastix_complex64_t alpha = *((pastix_complex64_t*)alphaptr);
1195  pastix_fixdbl_t flops, total_flops = 0.;
1196 
1197 #if defined(PASTIX_DEBUG_LR)
1198  if ( B->rk > 0 ) {
1199  int rc = core_zlrdbg_check_orthogonality( M2, B->rk, B->u, M2 );
1200  if (rc == 1) {
1201  fprintf(stderr, "Failed to have B->u orthogonal in entry of rradd\n" );
1202  }
1203  }
1204 #endif
1205 
1206  rankA = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
1207  rank = rankA + B->rk;
1208  M = pastix_imax(M2, M1);
1209  N = pastix_imax(N2, N1);
1210 
1211  minV = pastix_imin(N, rank);
1212 
1213  assert(M2 == M && N2 == N);
1214  assert(B->rk != -1);
1215 
1216  assert( A->rk <= A->rkmax);
1217  assert( B->rk <= B->rkmax);
1218 
1219  if ( ((M1 + offx) > M2) ||
1220  ((N1 + offy) > N2) )
1221  {
1222  pastix_print_error( "Dimensions are not correct" );
1223  assert(0 /* Incorrect dimensions */);
1224  return total_flops;
1225  }
1226 
1227  /*
1228  * A is rank null, nothing to do
1229  */
1230  if ( A->rk == 0 ) {
1231  return total_flops;
1232  }
1233 
1234  /*
1235  * Let's handle case where B is a null matrix
1236  * B = alpha A
1237  */
1238  if ( B->rk == 0 ) {
1239  core_zlrcpy( lowrank, transA1, alpha,
1240  M1, N1, A, M2, N2, B,
1241  offx, offy );
1242  return total_flops;
1243  }
1244 
1245  /*
1246  * The rank is too big, let's try to compress
1247  */
1248  if ( rank > pastix_imin( M, N ) ) {
1249  assert(0);
1250  }
1251 
1252  /*
1253  * Let's define leading dimensions
1254  */
1255  ldau = (A->rk == -1) ? A->rkmax : M1;
1256  ldav = (transA1 == PastixNoTrans) ? A->rkmax : N1;
1257  ldbu = M;
1258  ldbv = B->rkmax;
1259  ldu = M;
1260  ldv = rank;
1261 
1262  /*
1263  * Let's compute the size of the workspace
1264  */
1265  /* u1u2 and v1v2 */
1266  wzsize = (M+N) * rank;
1267  /* tauV */
1268  wzsize += minV;
1269 
1270  /* RRQR workspaces */
1271  rklimit = pastix_imin( rank, core_get_rklimit( M, N ) );
1272  rrqrfct( tol, rklimit, 1, nb,
1273  rank, N, NULL, ldv,
1274  NULL, NULL,
1275  &zzsize, -1, NULL );
1276  lwork = (pastix_int_t)(zzsize);
1277  wzsize += lwork;
1278 
1279 #if defined(PASTIX_DEBUG_LR)
1280  zbuf = NULL;
1281  u1u2 = malloc( ldu * rank * sizeof(pastix_complex64_t) );
1282  v1v2 = malloc( ldv * N * sizeof(pastix_complex64_t) );
1283  tauV = malloc( rank * sizeof(pastix_complex64_t) );
1284  zwork = malloc( lwork * sizeof(pastix_complex64_t) );
1285 
1286  rwork = malloc( 2 * pastix_imax( rank, N ) * sizeof(double) );
1287 #else
1288  zbuf = malloc( wzsize * sizeof(pastix_complex64_t) + 2 * pastix_imax(rank, N) * sizeof(double) );
1289 
1290  u1u2 = zbuf;
1291  v1v2 = u1u2 + ldu * rank;
1292  tauV = v1v2 + ldv * N;
1293  zwork = tauV + rank;
1294 
1295  rwork = (double*)(zwork + lwork);
1296 #endif
1297 
1298  /*
1299  * Concatenate U2 and U1 in u1u2
1300  * [ u2 0 ]
1301  * [ u2 u1 ]
1302  * [ u2 0 ]
1303  */
1304  core_zlrconcatenate_u( alpha,
1305  M1, N1, A,
1306  M2, B,
1307  offx, u1u2 );
1308 
1309  /*
1310  * Concatenate V2 and V1 in v1v2
1311  * [ v2^h v2^h v2^h ]
1312  * [ 0 v1^h 0 ]
1313  */
1314  core_zlrconcatenate_v( transA1, alpha,
1315  M1, N1, A,
1316  N2, B,
1317  offy, v1v2 );
1318 
1319  /*
1320  * Orthogonalize [u2, u1]
1321  * [u2, u1] = [u2, u1 - u2(u2Tu1)] * (I u2Tu1)
1322  * (0 I )
1323  */
1324 
1325  /* We do not care is A was integrated into v1v2 */
1326  if (rankA != 0) {
1327 
1328  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_orthogonalize );
1329  switch ( pastix_lr_ortho ) {
1330  case PastixCompressOrthoQR:
1331  flops = core_zlrorthu_fullqr( M, N, B->rk + rankA,
1332  u1u2, ldu, v1v2, ldv );
1333  break;
1334 
1336  flops = core_zlrorthu_partialqr( M, N, B->rk, &rankA, offx, offy,
1337  u1u2, ldu, v1v2, ldv );
1338  break;
1339 
1341  pastix_attr_fallthrough;
1342 
1343  default:
1344  flops = core_zlrorthu_cgs( M2, N2, M1, N1, B->rk, &rankA, offx, offy,
1345  u1u2, ldu, v1v2, ldv );
1346  }
1347  kernel_trace_stop_lvl2( flops );
1348 
1349  total_flops += flops;
1350  }
1351 
1352  rank = B->rk + rankA;
1353 
1354  if (rankA == 0) {
1355  /*
1356  * The final B += A fit in B
1357  * Lets copy and return
1358  */
1359  LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', M, B->rk, u1u2, ldu, B->u, ldbu );
1360  LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, 'A', B->rk, N, v1v2, ldv, B->v, ldbv );
1361 
1362  free(zbuf);
1363 #if defined(PASTIX_DEBUG_LR)
1364  free( u1u2 );
1365  free( v1v2 );
1366  free( tauV );
1367  free( zwork );
1368  free( rwork );
1369 #endif
1370  return total_flops;
1371  }
1372 
1373  MALLOC_INTERN( jpvt, pastix_imax(rank, N), pastix_int_t );
1374 
1375  if ( lowrank->use_reltol ) {
1376  /**
1377  * In relative tolerance, we can choose two solutions:
1378  * 1) The first one, more conservative, is to compress relatively to
1379  * the norm of the final matrix \[ \alpha A + B \]. In this kernel, we
1380  * exploit the fact that the V part contains all the information while
1381  * the U part is orthonormal, and compute it as follow:
1382  *
1383  * double norm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'f', rank, N,
1384  * v1v2, ldv, NULL );
1385  * tol = tol * norm;
1386  *
1387  * 2) The second solution, less conservative, will allow to reduce the
1388  * rank more efficiently. Since A and B have been compressed relatively
1389  * to their respective norms, there is no reason to compress the sum
1390  * relatively to its own norm, but it is more reasonable to compress it
1391  * relatively to the norm of A and B. For example, A-B would be full
1392  * with the first criterion, and rank null with the second.
1393  * Note that here, we can only have an estimation that once again
1394  * reduces the conservation of the criterion.
1395  * || \alpha A + B || <= |\alpha| ||A|| + ||B|| <= |\alpha| ||U_aV_a|| + ||U_bV_b||
1396  *
1397  */
1398  double normA, normB;
1399  normA = core_zlrnrm( PastixFrobeniusNorm, transA1, M1, N1, A );
1400  normB = core_zlrnrm( PastixFrobeniusNorm, PastixNoTrans, M2, N2, B );
1401  tol = tol * ( cabs(alpha) * normA + normB );
1402  }
1403 
1404  /*
1405  * Perform RRQR factorization on (I u2Tu1) v1v2 = (Q2 R2)
1406  * (0 I )
1407  */
1408  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_recompression );
1409  rklimit = pastix_imin( rklimit, rank );
1410  new_rank = rrqrfct( tol, rklimit, 1, nb,
1411  rank, N, v1v2, ldv,
1412  jpvt, tauV,
1413  zwork, lwork, rwork );
1414  flops = (new_rank == -1) ? FLOPS_ZGEQRF( rank, N )
1415  : (FLOPS_ZGEQRF( rank, new_rank ) +
1416  FLOPS_ZUNMQR( rank, N-new_rank, new_rank, PastixLeft ));
1417  kernel_trace_stop_lvl2_rank( flops, new_rank );
1418  total_flops += flops;
1419 
1420  /*
1421  * First case: The rank is too big, so we decide to uncompress the result
1422  */
1423  if ( (new_rank > rklimit) ||
1424  (new_rank == -1) )
1425  {
1426  pastix_lrblock_t Bbackup = *B;
1427 
1428  core_zlralloc( M, N, -1, B );
1429  u = B->u;
1430 
1431  /* Uncompress B */
1432  flops = FLOPS_ZGEMM( M, N, Bbackup.rk );
1433  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
1434  cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
1435  M, N, Bbackup.rk,
1436  CBLAS_SADDR(zone), Bbackup.u, ldbu,
1437  Bbackup.v, ldbv,
1438  CBLAS_SADDR(zzero), u, M );
1439  kernel_trace_stop_lvl2( flops );
1440  total_flops += flops;
1441 
1442  /* Add A into it */
1443  if ( A->rk == -1 ) {
1444  flops = 2 * M1 * N1;
1445  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
1446  core_zgeadd( transA1, M1, N1,
1447  alpha, A->u, ldau,
1448  zone, u + offy * M + offx, M);
1449  kernel_trace_stop_lvl2( flops );
1450  }
1451  else {
1452  flops = FLOPS_ZGEMM( M1, N1, A->rk );
1453  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
1454  cblas_zgemm(CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transA1,
1455  M1, N1, A->rk,
1456  CBLAS_SADDR(alpha), A->u, ldau,
1457  A->v, ldav,
1458  CBLAS_SADDR(zone), u + offy * M + offx, M);
1459  kernel_trace_stop_lvl2( flops );
1460  }
1461  total_flops += flops;
1462  core_zlrfree(&Bbackup);
1463  free( zbuf );
1464  free( jpvt );
1465 #if defined(PASTIX_DEBUG_LR)
1466  free( u1u2 );
1467  free( v1v2 );
1468  free( tauV );
1469  free( zwork );
1470  free( rwork );
1471 #endif
1472  return total_flops;
1473  }
1474  else if ( new_rank == 0 ) {
1475  core_zlrfree(B);
1476  free( zbuf );
1477  free( jpvt );
1478 #if defined(PASTIX_DEBUG_LR)
1479  free( u1u2 );
1480  free( v1v2 );
1481  free( tauV );
1482  free( zwork );
1483  free( rwork );
1484 #endif
1485  return total_flops;
1486  }
1487 
1488  /*
1489  * We need to reallocate the buffer to store the new compressed version of B
1490  * because it wasn't big enough
1491  */
1492  ret = core_zlrsze( 0, M, N, B, new_rank, -1, -1 );
1493  assert( ret != -1 );
1494  assert( B->rkmax >= new_rank );
1495  assert( B->rkmax >= B->rk );
1496 
1497  ldbv = B->rkmax;
1498 
1499  /* B->v = P v1v2 */
1500  {
1501  pastix_complex64_t *tmpV;
1502  pastix_int_t lm;
1503 
1504  memset(B->v, 0, N * ldbv * sizeof(pastix_complex64_t));
1505  tmpV = B->v;
1506  for (i=0; i<N; i++){
1507  lm = pastix_imin( new_rank, i+1 );
1508  memcpy(tmpV + jpvt[i] * ldbv,
1509  v1v2 + i * ldv,
1510  lm * sizeof(pastix_complex64_t));
1511  }
1512  }
1513 
1514  /* Compute Q2 factor */
1515  {
1516  flops = FLOPS_ZUNGQR( rank, new_rank, new_rank )
1517  + FLOPS_ZGEMM( M, new_rank, rank );
1518 
1519  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_computeNewU );
1520  ret = LAPACKE_zungqr_work( LAPACK_COL_MAJOR, rank, new_rank, new_rank,
1521  v1v2, ldv, tauV, zwork, lwork );
1522  assert(ret == 0);
1523 
1524  cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
1525  M, new_rank, rank,
1526  CBLAS_SADDR(zone), u1u2, ldu,
1527  v1v2, ldv,
1528  CBLAS_SADDR(zzero), B->u, ldbu);
1529  kernel_trace_stop_lvl2( flops );
1530  total_flops += flops;
1531  }
1532 
1533  free( zbuf );
1534  free( jpvt );
1535 
1536 #if defined(PASTIX_DEBUG_LR)
1537  free( u1u2 );
1538  free( v1v2 );
1539  free( tauV );
1540  free( zwork );
1541  free( rwork );
1542 #endif
1543 
1544  (void)ret;
1545  return total_flops;
1546 }
1547 
1548 /**
1549  *******************************************************************************
1550  *
1551  * @brief Compute the size of a block to send in LR
1552  *
1553  *******************************************************************************
1554  *
1555  * @param[in] M
1556  * The number of rows of the matrix A.
1557  *
1558  * @param[in] N
1559  * The number of columns of the matrix A.
1560  *
1561  * @param[in] A
1562  * The low-rank representation of the matrix A.
1563  *
1564  *******************************************************************************
1565  *
1566  * @return Size of a block to send in LR
1567  *
1568  *******************************************************************************/
1569 size_t
1570 core_zlrgetsize( pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A )
1571 {
1572  if ( A->rk != -1 ) {
1573  return A->rk * ( M + N );
1574  }
1575  else {
1576  return M * N;
1577  }
1578 }
1579 
1580 /**
1581  *******************************************************************************
1582  *
1583  * @brief Pack low-rank data by side
1584  *
1585  *******************************************************************************
1586  *
1587  * @param[in] M
1588  * The number of rows of the matrix A.
1589  *
1590  * @param[in] N
1591  * The number of columns of the matrix A.
1592  *
1593  * @param[in] A
1594  * The low-rank representation of the matrix A.
1595  *
1596  * @param[inout] buffer
1597  * Pointer on packed data
1598  *
1599  *******************************************************************************
1600  *
1601  * @return Pointer on packed data shifted to the next block
1602  *
1603  *******************************************************************************/
1604 char *
1605 core_zlrpack( pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A, char *buffer )
1606 {
1607  int rk = A->rk;
1608  int rkmax = A->rkmax;
1609  void *u = A->u;
1610  void *v = A->v;
1611 
1612  /* Store the rank */
1613  memcpy( buffer, &rk, sizeof( int ) );
1614  buffer += sizeof( int );
1615 
1616  if ( rk != -1 ) {
1617  /* Pack the u part */
1618  memcpy( buffer, u, rk * M * sizeof( pastix_complex64_t ) );
1619  buffer += rk * M * sizeof( pastix_complex64_t );
1620 
1621  /* Pack the v part */
1622  if ( rk == rkmax ) {
1623  memcpy( buffer, v, rk * N * sizeof( pastix_complex64_t ) );
1624  buffer += rk * N * sizeof( pastix_complex64_t );
1625  }
1626  else {
1627  LAPACKE_zlacpy_work(
1628  LAPACK_COL_MAJOR, 'A', rk, N, v, rkmax, (pastix_complex64_t *)buffer, rk );
1629  buffer += rk * N * sizeof( pastix_complex64_t );
1630  }
1631  }
1632  else {
1633  memcpy( buffer, u, M * N * sizeof( pastix_complex64_t ) );
1634  buffer += M * N * sizeof( pastix_complex64_t );
1635  }
1636 
1637  return buffer;
1638 }
1639 
1640 /**
1641  *******************************************************************************
1642  *
1643  * @brief Unpack low rank data and fill the cblk concerned by the computation
1644  *
1645  *******************************************************************************
1646  *
1647  * @param[in] M
1648  * The number of rows of the matrix A.
1649  *
1650  * @param[in] N
1651  * The number of columns of the matrix A.
1652  *
1653  * @param[in] A
1654  * The low-rank representation of the matrix A.
1655  *
1656  * @param[inout] buffer
1657  * Pointer on packed data
1658  *
1659  *******************************************************************************
1660  *
1661  * @return Pointer on packed data shifted to the next block
1662  *
1663  *******************************************************************************/
1664 char *
1665 core_zlrunpack( pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, char *buffer )
1666 {
1667  int rk;
1668  memcpy( &rk, buffer, sizeof( int ) );
1669  buffer += sizeof( int );
1670 
1671  /* Make sure A can store the unpacked values */
1672  core_zlrsze( 0, M, N, A, rk, rk, rk );
1673 
1674  if ( rk != -1 ) {
1675  /* Unpack U */
1676  memcpy( A->u, buffer, M * rk * sizeof( pastix_complex64_t ) );
1677  buffer += M * rk * sizeof( pastix_complex64_t );
1678 
1679  /* Unpack V */
1680  memcpy( A->v, buffer, N * rk * sizeof( pastix_complex64_t ) );
1681  buffer += N * rk * sizeof( pastix_complex64_t );
1682  }
1683  else {
1684  /* Unpack the full block */
1685  memcpy( A->u, buffer, M * N * sizeof( pastix_complex64_t ) );
1686  buffer += M * N * sizeof( pastix_complex64_t );
1687  }
1688 
1689  return buffer;
1690 }
1691 
1692 /**
1693  *******************************************************************************
1694  *
1695  * @brief Unpack low rank data and fill the cblk concerned by the computation
1696  *
1697  *******************************************************************************
1698  *
1699  * @param[in] M
1700  * The number of rows of the matrix A.
1701  *
1702  * @param[in] N
1703  * The number of columns of the matrix A.
1704  *
1705  * @param[in] A
1706  * The low-rank representation of the matrix A.
1707  *
1708  * @param[inout] buffer
1709  * Pointer on packed data
1710  *
1711  *******************************************************************************
1712  *
1713  * @return Pointer on packed data shifted to the next block
1714  *
1715  *******************************************************************************/
1716 const char *
1717 core_zlrunpack2( pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A,
1718  const char *input, char **outptr )
1719 {
1720  char *output = *outptr;
1721  size_t size;
1722  int rk;
1723 
1724  rk = *((int *)input);
1725  input += sizeof( int );
1726 
1727  if ( rk != -1 ) {
1728  A->rk = rk;
1729  A->rkmax = rk;
1730 
1731  /* Unpack U */
1732  size = M * rk * sizeof( pastix_complex64_t );
1733  A->u = output;
1734 
1735  memcpy( A->u, input, size );
1736  input += size;
1737  output += size;
1738 
1739  /* Unpack V */
1740  size = N * rk * sizeof( pastix_complex64_t );
1741  A->v = output;
1742 
1743  memcpy( A->v, input, size );
1744  input += size;
1745  output += size;
1746  }
1747  else {
1748  A->rk = -1;
1749  A->rkmax = M;
1750  A->v = NULL;
1751 
1752  /* Unpack the full block */
1753  size = M * N * sizeof( pastix_complex64_t );
1754  A->u = output;
1755 
1756  memcpy( A->u, input, size );
1757  input += size;
1758  output += size;
1759  }
1760 
1761  *outptr = output;
1762  return input;
1763 }
solver.h
PastixFrobeniusNorm
@ PastixFrobeniusNorm
Definition: api.h:482
core_zrradd_qr
pastix_fixdbl_t core_zrradd_qr(core_zrrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
Definition: core_zgelrops.c:1174
core_get_rklimit
pastix_int_t(* core_get_rklimit)(pastix_int_t, pastix_int_t)
Compute the maximal rank accepted for a given matrix size. The pointer is set according to the low-ra...
Definition: kernels_trace.c:30
pastix_lr_s
Structure to define the type of function to use for the low-rank kernels and their parameters.
Definition: pastix_lowrank.h:147
pastix_lrblock_s::v
void * v
Definition: pastix_lowrank.h:116
PastixCompressOrthoQR
@ PastixCompressOrthoQR
Definition: api.h:387
pastix_lrblock_s::u
void * u
Definition: pastix_lowrank.h:115
pastix_lr_s::use_reltol
int use_reltol
Definition: pastix_lowrank.h:153
pastix_trans_t
enum pastix_trans_e pastix_trans_t
Transpostion.
core_zlrdbg_check_orthogonality
int core_zlrdbg_check_orthogonality(pastix_int_t M, pastix_int_t N, const pastix_complex64_t *A, pastix_int_t lda)
Check the orthogonality of the matrix A.
Definition: core_zlrdbg.c:98
pastix_lr_ortho
pastix_int_t pastix_lr_ortho
Define the orthogonalization method.
Definition: lowrank.c:25
pastix_zcores.h
pastix_lrblock_s
The block low-rank structure to hold a matrix in low-rank form.
Definition: pastix_lowrank.h:112
PastixCompressOrthoCGS
@ PastixCompressOrthoCGS
Definition: api.h:386
core_zlrorthu_partialqr
pastix_fixdbl_t core_zlrorthu_partialqr(pastix_int_t M, pastix_int_t N, pastix_int_t r1, pastix_int_t *r2ptr, pastix_int_t offx, pastix_int_t offy, pastix_complex64_t *U, pastix_int_t ldu, pastix_complex64_t *V, pastix_int_t ldv)
Try to orthognalize the U part of the low-rank form, and update the V part accordingly using partial ...
Definition: core_zlrothu.c:193
core_zgeadd
int core_zgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, pastix_complex64_t alpha, const pastix_complex64_t *A, pastix_int_t LDA, pastix_complex64_t beta, pastix_complex64_t *B, pastix_int_t LDB)
Add two matrices together.
Definition: core_zgeadd.c:78
PastixNoTrans
@ PastixNoTrans
Definition: api.h:424
core_zlrnrm
double core_zlrnrm(pastix_normtype_t ntype, int transV, pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A)
Compute the norm of a low-rank matrix.
Definition: core_zlrnrm.c:39
core_zlrfree
void core_zlrfree(pastix_lrblock_t *A)
Free a low-rank matrix.
Definition: core_zgelrops.c:116
core_zge2lr_qrcp
pastix_fixdbl_t core_zge2lr_qrcp(core_zrrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
Definition: core_zgelrops.c:749
core_zgetmo
void core_zgetmo(int m, int n, const pastix_complex64_t *A, int lda, pastix_complex64_t *B, int ldb)
Transposes a m-by-n matrix out of place using an extra workspace of size m-by-n.
Definition: core_zgetmo.c:49
core_zge2lr_qrrt
pastix_fixdbl_t core_zge2lr_qrrt(core_zrrqr_rt_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
Definition: core_zgelrops.c:937
pastix_zlrcores.h
pastix_lr_s::core_ge2lr
fct_ge2lr_t core_ge2lr
Definition: pastix_lowrank.h:157
core_zlr2ge
int core_zlr2ge(pastix_trans_t trans, pastix_int_t m, pastix_int_t n, const pastix_lrblock_t *Alr, pastix_complex64_t *A, pastix_int_t lda)
Convert a low rank matrix into a dense matrix.
Definition: core_zgelrops.c:300
core_zlrorthu_fullqr
pastix_fixdbl_t core_zlrorthu_fullqr(pastix_int_t M, pastix_int_t N, pastix_int_t rank, pastix_complex64_t *U, pastix_int_t ldu, pastix_complex64_t *V, pastix_int_t ldv)
Try to orthognalize the u part of the low-rank form, and update the v part accordingly using full QR.
Definition: core_zlrothu.c:83
z_nan_check.h
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...
pastix_lrblock_s::rk
int rk
Definition: pastix_lowrank.h:113
core_zlralloc
void core_zlralloc(pastix_int_t M, pastix_int_t N, pastix_int_t rkmax, pastix_lrblock_t *A)
Allocate a low-rank matrix.
Definition: core_zgelrops.c:56
core_zlrsze
int core_zlrsze(int copy, pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, pastix_int_t newrk, pastix_int_t newrkmax, pastix_int_t rklimit)
Resize a low-rank matrix.
Definition: core_zgelrops.c:173
core_zlrconcatenate_u
void core_zlrconcatenate_u(pastix_complex64_t alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_lrblock_t *B, pastix_int_t offx, pastix_complex64_t *u1u2)
Concatenate left parts of two low-rank matrices.
Definition: core_zgelrops.c:521
core_zlrcpy
void core_zlrcpy(const pastix_lr_t *lowrank, pastix_trans_t transAv, pastix_complex64_t alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Copy a small low-rank structure into a large one.
Definition: core_zgelrops.c:418
pastix_lr_s::tolerance
double tolerance
Definition: pastix_lowrank.h:155
core_zlrconcatenate_v
void core_zlrconcatenate_v(pastix_trans_t transA1, pastix_complex64_t alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offy, pastix_complex64_t *v1v2)
Concatenate right parts of two low-rank matrices.
Definition: core_zgelrops.c:629
core_zlrunpack2
const char * core_zlrunpack2(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, const char *input, char **outptr)
Unpack low rank data and fill the cblk concerned by the computation.
Definition: core_zgelrops.c:1717
PastixCompressOrthoPartialQR
@ PastixCompressOrthoPartialQR
Definition: api.h:388
core_zlrgetsize
size_t core_zlrgetsize(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A)
Compute the size of a block to send in LR.
Definition: core_zgelrops.c:1570
pastix_lrblock_s::rkmax
int rkmax
Definition: pastix_lowrank.h:114
core_zlrunpack
char * core_zlrunpack(pastix_int_t M, pastix_int_t N, pastix_lrblock_t *A, char *buffer)
Unpack low rank data and fill the cblk concerned by the computation.
Definition: core_zgelrops.c:1665
core_zlrorthu_cgs
pastix_fixdbl_t core_zlrorthu_cgs(pastix_int_t M1, pastix_int_t N1, pastix_int_t M2, pastix_int_t N2, pastix_int_t r1, pastix_int_t *r2ptr, pastix_int_t offx, pastix_int_t offy, pastix_complex64_t *U, pastix_int_t ldu, pastix_complex64_t *V, pastix_int_t ldv)
Try to orthognalize the U part of the low-rank form, and update the V part accordingly using CGS.
Definition: core_zlrothu.c:419
core_zlrpack
char * core_zlrpack(pastix_int_t M, pastix_int_t N, const pastix_lrblock_t *A, char *buffer)
Pack low-rank data by side.
Definition: core_zgelrops.c:1605
PastixLeft
@ PastixLeft
Definition: api.h:473