PaStiX Handbook  6.4.0
core_cgelrops_svd.c
1 /**
2  *
3  * @file core_cgelrops_svd.c
4  *
5  * PaStiX low-rank kernel routines using SVD based on Lapack CGESVD.
6  *
7  * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Gregoire Pichon
12  * @author Esragul Korkmaz
13  * @author Mathieu Faverge
14  * @author Nolan Bredel
15  * @date 2024-07-05
16  * @generated from /builds/solverstack/pastix/kernels/core_zgelrops_svd.c, normal z -> c, Tue Oct 8 14:17:17 2024
17  *
18  **/
19 #include "common.h"
20 #include <cblas.h>
21 #include <lapacke.h>
22 #include "flops.h"
23 #include "kernels_trace.h"
24 #include "blend/solver.h"
25 #include "pastix_ccores.h"
26 #include "pastix_clrcores.h"
27 #include "c_nan_check.h"
28 
29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
30 #define PASTIX_SVD_2NORM 1
31 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
32 
33 #if !defined(PASTIX_SVD_2NORM)
34 #include "common/frobeniusupdate.h"
35 
36 /**
37  *******************************************************************************
38  *
39  * @ingroup kernel_lr_svd_null
40  *
41  * @brief Compute the Frobenius norm of a vector
42  *
43  * This routine is inspired by the LAPACK classq function, and accumulates the
44  * contributions backward for better accuracy, as opposed to snrm2, which
45  * allows only a positive increment.
46  *
47  *******************************************************************************
48  *
49  * @param[in] n
50  * The number of elements in the vector
51  *
52  * @param[in] x
53  * The vector of size n * incx
54  *
55  * @param[in] incx
56  * The increment between two elements in the vector x.
57  *
58  *******************************************************************************
59  *
60  * @return The Frobenius norm of the vector x.
61  *
62  *******************************************************************************/
63 static inline float
64 core_slassq( int n,
65  const float *x,
66  int incx )
67 {
68  float scale = 1.;
69  float sumsq = 0.;
70  int i;
71 
72  for( i=0; i<n; i++, x+=incx ) {
73  frobenius_update( 1, &scale, &sumsq, x );
74  }
75 
76  return scale * sqrtf( sumsq );
77 }
78 #endif /* !defined(PASTIX_SVD_2NORM) */
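
For reference, here is a minimal sketch of the scaled sum-of-squares update that core_slassq relies on, assuming frobenius_update follows the LAPACK classq convention of maintaining a (scale, sumsq) pair such that the accumulated norm is scale * sqrt(sumsq); the helper name scaled_update is hypothetical and only illustrates the idea.

#include <math.h>

/* Hypothetical classq-style update: keeps (scale, sumsq) such that
 * scale^2 * sumsq equals the sum of squares accumulated so far, which
 * avoids overflow/underflow when entries differ widely in magnitude. */
static inline void
scaled_update( float *scale, float *sumsq, float x )
{
    float absx = fabsf( x );
    if ( absx == 0.f ) {
        return;
    }
    if ( *scale < absx ) {
        float ratio = *scale / absx;
        *sumsq = 1.f + (*sumsq) * ratio * ratio;
        *scale = absx;
    }
    else {
        float ratio = absx / *scale;
        *sumsq += ratio * ratio;
    }
}
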
79 
80 #ifndef DOXYGEN_SHOULD_SKIP_THIS
81 static pastix_complex32_t cone = 1.0;
82 static pastix_complex32_t czero = 0.0;
83 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
84 
85 /**
86  *******************************************************************************
87  *
88  * @brief Convert a full-rank matrix into a low-rank matrix using SVD.
89  *
90  *******************************************************************************
91  *
92  * @param[in] use_reltol
93  * If non zero, the tolerance is relative and is scaled internally by the norm of A. Otherwise, tol is used as an absolute threshold.
94  *
95  * @param[in] tol
96  * The tolerance used as a criterion to eliminate information from the
97  * full rank matrix.
98  * If tol < 0, then we compress up to rklimit. Thus, if rklimit is set to
99  * min(m,n) and tol < 0., we get a full representation of the matrix
100  * in the form U * V^t.
101  *
102  * @param[in] rklimit
103  * The maximum rank to store the matrix in low-rank format. If
104  * -1, set to min(M, N) / PASTIX_LR_MINRATIO.
105  *
106  * @param[in] m
107  * Number of rows of the matrix A, and of the low rank matrix Alr.
108  *
109  * @param[in] n
110  * Number of columns of the matrix A, and of the low rank matrix Alr.
111  *
112  * @param[in] Avoid
113  * The matrix of dimension lda-by-n that needs to be compressed
114  *
115  * @param[in] lda
116  * The leading dimension of the matrix A. lda >= max(1, m)
117  *
118  * @param[out] Alr
119  * The low rank matrix structure that will store the low rank
120  * representation of A
121  *
122  *******************************************************************************
123  *
124  * @return The number of flops performed to compress the matrix.
125  *
126  *******************************************************************************/
127 pastix_fixdbl_t
128 core_cge2lr_svd( int use_reltol,
129  pastix_fixdbl_t tol,
130  pastix_int_t rklimit,
131  pastix_int_t m,
132  pastix_int_t n,
133  const void *Avoid,
134  pastix_int_t lda,
135  pastix_lrblock_t *Alr )
136 {
137  const pastix_complex32_t *A = (const pastix_complex32_t*)Avoid;
138  pastix_fixdbl_t flops = 0.0;
139  pastix_complex32_t *u, *v, *zwork, *Acpy, ws;
140  float *rwork, *s;
141  pastix_int_t i, ret, ldu, ldv;
142  pastix_int_t minMN, imax;
143  pastix_int_t lwork = -1;
144  pastix_int_t zsize, rsize;
145  float norm;
146 
147 #if !defined(NDEBUG)
148  if ( m < 0 ) {
149  return -2;
150  }
151  if ( n < 0 ) {
152  return -3;
153  }
154  if ( lda < m ) {
155  return -5;
156  }
157 #endif
158 
159  norm = LAPACKE_clange_work( LAPACK_COL_MAJOR, 'f', m, n,
160  A, lda, NULL );
161 
162  /* Quick return on norm */
163  if ( (norm == 0.) && (tol >= 0.) ) {
164  core_clralloc( m, n, 0, Alr );
165  return 0. ;
166  }
167 
168  rklimit = ( rklimit < 0 ) ? core_get_rklimit( m, n ) : rklimit;
169  if ( tol < 0. ) {
170  tol = -1.;
171  }
172  else if ( use_reltol ) {
173  tol = tol * norm;
174  }
175 
176  /* Quick return on max rank */
177  minMN = pastix_imin(m, n);
178  rklimit = pastix_imin( minMN, rklimit );
179 
180  /**
181  * If maximum rank is 0, then either the matrix norm is below the tolerance,
182  * and we can return a null rank matrix, or it is not and we need to return
183  * a full rank matrix.
184  */
185  if ( rklimit == 0 ) {
186  if ( (tol < 0.) || (norm < tol) ) {
187  core_clralloc( m, n, 0, Alr );
188  return 0.;
189  }
190 
191  /* Return full rank */
192  core_clralloc( m, n, -1, Alr );
193  ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
194  A, lda, Alr->u, Alr->rkmax );
195  assert(ret == 0);
196  return 0.;
197  }
198 
199  /*
200  * Allocate a temporary Low rank matrix to store the full U and V
201  */
202  core_clralloc( m, n, pastix_imin( m, n ), Alr );
203  u = Alr->u;
204  v = Alr->v;
205  ldu = m;
206  ldv = Alr->rkmax;
207 
208  /*
209  * Query the workspace needed for the gesvd
210  */
211 #if defined(PASTIX_DEBUG_LR_NANCHECK)
212  ws = minMN;
213 #else
214  {
215  /*
216  * rworkfix is a fix to an internal mkl bug
217  * see: https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/MKL-2021-4-CoreDump-with-LAPACKE-cgesvd-work-or-LAPACKE-cgesvd/m-p/1341228
218  */
219  float rwork;
220  ret = MYLAPACKE_cgesvd_work( LAPACK_COL_MAJOR, 'S', 'S',
221  m, n, NULL, m,
222  NULL, NULL, ldu, NULL, ldv,
223  &ws, lwork, &rwork );
224  (void)rwork;
225  }
226 #endif
227  lwork = ws;
228  zsize = ws;
229  zsize += m * n; /* Copy of the matrix A */
230 
231  rsize = minMN;
232 #if defined(PRECISION_z) || defined(PRECISION_c)
233  rsize += 5 * minMN;
234 #endif
235 
236  zwork = malloc( zsize * sizeof(pastix_complex32_t) + rsize * sizeof(float) );
237  rwork = (float*)(zwork + zsize);
238 
239  Acpy = zwork + lwork;
240  s = rwork;
241 
242  /*
243  * Backup the original matrix before overwriting it with the SVD
244  */
245  ret = LAPACKE_clacpy_work(LAPACK_COL_MAJOR, 'A', m, n,
246  A, lda, Acpy, m );
247  assert( ret == 0 );
248 
249  ret = MYLAPACKE_cgesvd_work( LAPACK_COL_MAJOR, 'S', 'S',
250  m, n, Acpy, m,
251  s, u, ldu, v, ldv,
252  zwork, lwork, rwork + minMN );
253  if ( ret != 0 ) {
254  pastix_print_error( "SVD Failed\n" );
255  }
256 
257  /* Let's stop i before going too far */
258  imax = pastix_imin( minMN, rklimit+1 );
259  for (i=0; i<imax; i++, v+=1) {
260  float frob_norm;
261 
262  /*
263  * There are two different stopping criteria for SVD to decide the
264  * compression rank:
265  * 1) The 2-norm:
266  * Compare the singular values to the threshold
267  * 2) The Frobenius norm:
268  * Compare the Frobenius norm of the trailing singular values to
269  * the threshold. Note that we use a reverse accumulation of the
270  * singular values to avoid accuracy issues.
271  */
272 #if defined(PASTIX_SVD_2NORM)
273  frob_norm = s[i];
274 #else
275  frob_norm = core_slassq( minMN-i, s + minMN - 1, -1 );
276 #endif
277 
278  if (frob_norm < tol) {
279  break;
280  }
281  cblas_csscal(n, s[i], v, ldv);
282  }
283 
284  /*
285  * Resize the space used by the low rank matrix
286  */
287  core_clrsze( 1, m, n, Alr, i, -1, rklimit );
288 
289  /*
290  * Compression was not worthwhile, so we restore the dense version in Alr
291  */
292  if (Alr->rk == -1) {
293  ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
294  A, lda, Alr->u, Alr->rkmax );
295  assert(ret == 0);
296  }
297 
298  (void)ret;
299  memFree_null(zwork);
300  return flops;
301 }
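
As an illustration, here is a minimal usage sketch of this kernel, compressing a dense column-major block into a pastix_lrblock_t; the wrapper name, block contents and the 1e-8 tolerance are placeholder assumptions, and error handling is omitted.

#include "common.h"
#include "pastix_ccores.h"
#include "pastix_clrcores.h"

/* Hypothetical example: compress the m-by-n dense block A (leading dimension
 * lda) with an absolute tolerance, then release the low-rank structure. */
static void
compress_block_example( pastix_int_t m, pastix_int_t n,
                        const pastix_complex32_t *A, pastix_int_t lda )
{
    pastix_lrblock_t Alr;
    pastix_fixdbl_t  flops;

    flops = core_cge2lr_svd( 0,     /* use_reltol: absolute tolerance      */
                             1e-8,  /* tol: compression threshold          */
                             -1,    /* rklimit: use the default rank limit */
                             m, n, A, lda, &Alr );
    (void)flops;

    /* On exit, Alr.rk >= 0 means A is stored as Alr.u (m-by-rk, leading
     * dimension m) times Alr.v (rk-by-n, leading dimension Alr.rkmax);
     * Alr.rk == -1 means the block was kept dense in Alr.u. */
    core_clrfree( &Alr );
}
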
302 
303 /**
304  *******************************************************************************
305  *
306  * @brief Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
307  *
308  * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
309  * Compute QR decomposition of (u2 u1) = Q1 R1
310  * Compute QR decomposition of (v2 v1) = Q2 R2
311  * Compute SVD of R1 R2^T = u sigma v^T
312  * Final solution is (Q1 u sigma^[1/2]) (Q2 v sigma^[1/2])^T
313  *
314  *******************************************************************************
315  *
316  * @param[in] lowrank
317  * The structure with low-rank parameters.
318  *
319  * @param[in] transA1
320  * @arg PastixNoTrans: No transpose, op( A ) = A;
321  * @arg PastixTrans: Transpose, op( A ) = A';
322  *
323  * @param[in] alphaptr
324  * alpha * A is added to B
325  *
326  * @param[in] M1
327  * The number of rows of the matrix A.
328  *
329  * @param[in] N1
330  * The number of columns of the matrix A.
331  *
332  * @param[in] A
333  * The low-rank representation of the matrix A.
334  *
335  * @param[in] M2
336  * The number of rows of the matrix B.
337  *
338  * @param[in] N2
339  * The number of columns of the matrix B.
340  *
341  * @param[in] B
342  * The low-rank representation of the matrix B.
343  *
344  * @param[in] offx
345  * The horizontal offset of A with respect to B.
346  *
347  * @param[in] offy
348  * The vertical offset of A with respect to B.
349  *
350  *******************************************************************************
351  *
352  * @return The total number of flops required to perform the operation.
353  *
354  *******************************************************************************/
355 pastix_fixdbl_t
356 core_crradd_svd( const pastix_lr_t *lowrank,
357  pastix_trans_t transA1,
358  const void *alphaptr,
359  pastix_int_t M1,
360  pastix_int_t N1,
361  const pastix_lrblock_t *A,
362  pastix_int_t M2,
363  pastix_int_t N2,
364  pastix_lrblock_t *B,
365  pastix_int_t offx,
366  pastix_int_t offy)
367 {
368  pastix_int_t rank, M, N, minU, minV;
369  pastix_int_t i, ret, lwork, new_rank;
370  pastix_int_t ldau, ldav, ldbu, ldbv;
371  pastix_complex32_t *u1u2, *v1v2, *R, *u, *v;
372  pastix_complex32_t *tmp, *zbuf, *tauU, *tauV;
373  pastix_complex32_t querysize;
374  pastix_complex32_t alpha = *((pastix_complex32_t*)alphaptr);
375  float *s;
376  size_t wzsize, wdsize;
377  float tol = lowrank->tolerance;
378  pastix_fixdbl_t flops, total_flops = 0.0;
379 
380  rank = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
381  rank += B->rk;
382  M = pastix_imax(M2, M1);
383  N = pastix_imax(N2, N1);
384  minU = pastix_imin(M, rank);
385  minV = pastix_imin(N, rank);
386 
387  assert(M2 == M && N2 == N);
388  assert(B->rk != -1);
389 
390  assert( A->rk <= A->rkmax);
391  assert( B->rk <= B->rkmax);
392 
393  if ( ((M1 + offx) > M2) ||
394  ((N1 + offy) > N2) )
395  {
396  pastix_print_error( "Dimensions are not correct" );
397  assert(0 /* Incorrect dimensions */);
398  return total_flops;
399  }
400 
401  /*
402  * A is rank null, nothing to do
403  */
404  if (A->rk == 0) {
405  return total_flops;
406  }
407 
408  ldau = (A->rk == -1) ? A->rkmax : M1;
409  ldav = (transA1 == PastixNoTrans) ? A->rkmax : N1;
410  ldbu = M;
411  ldbv = B->rkmax;
412 
413  /*
414  * Let's handle the case where B is a null matrix
415  * B = alpha A
416  */
417  if (B->rk == 0) {
418  core_clrcpy( lowrank, transA1, alpha,
419  M1, N1, A, M2, N2, B,
420  offx, offy );
421  return total_flops;
422  }
423 
424  /*
425  * The rank is too big, let's try to compress
426  */
427  if ( rank > pastix_imin( M, N ) ) {
428  assert(0);
429  }
430 
431  /*
432  * Let's compute the size of the workspace
433  */
434  /* u1u2 and v1v2 */
435  wzsize = (M+N) * rank;
436  /* tauU and tauV */
437  wzsize += minU + minV;
438 
439  /* Storage of R, u and v */
440  wzsize += 3 * rank * rank;
441 
442  /* QR/LQ workspace */
443  lwork = pastix_imax( M, N ) * 32;
444 
445  /* Workspace needed for the gesvd */
446 #if defined(PASTIX_DEBUG_LR_NANCHECK)
447  querysize = rank;
448 #else
449  {
450  /*
451  * rworkfix is a fix to an internal mkl bug
452  * see: https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/MKL-2021-4-CoreDump-with-LAPACKE-cgesvd-work-or-LAPACKE-cgesvd/m-p/1341228
453  */
454  float rwork;
455  ret = MYLAPACKE_cgesvd_work( LAPACK_COL_MAJOR, 'S', 'S',
456  rank, rank, NULL, rank,
457  NULL, NULL, rank, NULL, rank,
458  &querysize, -1, &rwork );
459  (void)rwork;
460  }
461 #endif
462  lwork = pastix_imax( lwork, querysize );
463  wzsize += lwork;
464 
465  wdsize = rank;
466 #if defined(PRECISION_z) || defined(PRECISION_c)
467  wdsize += 5 * rank;
468 #endif
469 
470  zbuf = malloc( wzsize * sizeof(pastix_complex32_t) + wdsize * sizeof(float) );
471  s = (float*)(zbuf + wzsize);
472 
473  u1u2 = zbuf + lwork;
474  tauU = u1u2 + M * rank;
475  v1v2 = tauU + minU;
476  tauV = v1v2 + N * rank;
477  R = tauV + minV;
478 
479  /*
480  * Concatenate U2 and U1 in u1u2
481  * [ u2 0 ]
482  * [ u2 u1 ]
483  * [ u2 0 ]
484  */
485  core_clrconcatenate_u( alpha,
486  M1, N1, A,
487  M2, B,
488  offx, u1u2 );
489 
490  /*
491  * Perform QR factorization on u1u2 = (Q1 R1)
492  */
493  ret = LAPACKE_cgeqrf_work( LAPACK_COL_MAJOR, M, rank,
494  u1u2, M, tauU, zbuf, lwork );
495  assert( ret == 0 );
496  total_flops += FLOPS_CGEQRF( M, rank );
497 
498  /*
499  * Concatenate V2 and V1 in v1v2
500  * [ v2^h v2^h v2^h ]
501  * [ 0 v1^h 0 ]
502  */
503  core_clrconcatenate_v( transA1, alpha,
504  M1, N1, A,
505  N2, B,
506  offy, v1v2 );
507 
508  /*
509  * Perform LQ factorization on v1v2 = (L2 Q2)
510  */
511  ret = LAPACKE_cgelqf_work( LAPACK_COL_MAJOR, rank, N,
512  v1v2, rank, tauV, zbuf, lwork );
513  assert(ret == 0);
514  total_flops += FLOPS_CGELQF( rank, N );
515 
516  /*
517  * Compute R = alpha R1 L2
518  */
519  u = R + rank * rank;
520  v = u + rank * rank;
521 
522  memset(R, 0, rank * rank * sizeof(pastix_complex32_t));
523 
524  ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR, 'U', rank, rank,
525  u1u2, M, R, rank );
526  assert(ret == 0);
527 
528  cblas_ctrmm(CblasColMajor,
529  CblasRight, CblasLower,
530  CblasNoTrans, CblasNonUnit,
531  rank, rank, CBLAS_SADDR(cone),
532  v1v2, rank, R, rank);
533  total_flops += FLOPS_CTRMM( PastixRight, rank, rank );
534 
535  if ( lowrank->use_reltol ) {
536  /**
537  * When using a relative tolerance, we can choose between two solutions:
538  * 1) The first one, more conservative, is to compress relative to
539  * the norm of the final matrix \f$ \alpha A + B \f$. In this kernel, we
540  * exploit the fact that the previously computed product contains all the
541  * information of the final matrix, and do it as follows:
542  *
543  * float norm = LAPACKE_clange_work( LAPACK_COL_MAJOR, 'f', rank, rank,
544  * R, rank, NULL );
545  * tol = tol * norm;
546  *
547  * 2) The second solution, less conservative, allows the rank to be
548  * reduced more efficiently. Since A and B have been compressed relative
549  * to their respective norms, there is no reason to compress the sum
550  * relative to its own norm; it is more reasonable to compress it
551  * relative to the norms of A and B. For example, A-B would be full
552  * with the first criterion, and rank null with the second.
553  * Note that here, we can only use an estimate, which once again
554  * reduces the conservativeness of the criterion.
555  * \f[ || \alpha A + B || \leq |\alpha| ||A|| + ||B|| \leq |\alpha| ||U_aV_a|| + ||U_bV_b|| \f]
556  *
557  */
558  float normA, normB;
559  normA = core_clrnrm( PastixFrobeniusNorm, transA1, M1, N1, A );
560  normB = core_clrnrm( PastixFrobeniusNorm, PastixNoTrans, M2, N2, B );
561  tol = tol * ( cabsf(alpha) * normA + normB );
562  }
563 
564  /*
565  * Compute svd(R) = u sigma v^t
566  */
567  /* Missing the flops of the u and v generation */
568  flops = FLOPS_CGEQRF( rank, rank ) + FLOPS_CGELQF( rank, (rank-1) );
569 
570  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_recompression );
571  ret = MYLAPACKE_cgesvd_work( LAPACK_COL_MAJOR, 'S', 'S',
572  rank, rank, R, rank,
573  s, u, rank, v, rank,
574  zbuf, lwork, s + rank );
575  if ( ret != 0 ) {
576  pastix_print_error( "LAPACKE_cgesvd FAILED" );
577  }
578 
579  /*
580  * Let's compute the new rank of the result
581  */
582  tmp = v;
583 
584  for (i=0; i<rank; i++, tmp+=1){
585  float frob_norm;
586 
587  /*
588  * There are two different stopping criteria for SVD to decide the
589  * compression rank:
590  * 1) The 2-norm:
591  * Compare the singular values to the threshold
592  * 2) The Frobenius norm:
593  * Compare the Frobenius norm of the trailing singular values to
594  * the threshold. Note that we use a reverse accumulation of the
595  * singular values to avoid accuracy issues.
596  */
597 #if defined(PASTIX_SVD_2NORM)
598  frob_norm = s[i];
599 #else
600  frob_norm = core_slassq( rank-i, s + rank - 1, -1 );
601 #endif
602 
603  if (frob_norm < tol) {
604  break;
605  }
606  cblas_csscal(rank, s[i], tmp, rank);
607  }
608  new_rank = i;
609  kernel_trace_stop_lvl2_rank( flops, new_rank );
610  total_flops += flops;
611 
612  /*
613  * First case: The rank is too big, so we decide to uncompress the result
614  */
615  if ( new_rank > core_get_rklimit( M, N ) ) {
616  pastix_lrblock_t Bbackup = *B;
617 
618  core_clralloc( M, N, -1, B );
619  u = B->u;
620 
621  /* Uncompress B */
622  flops = FLOPS_CGEMM( M, N, Bbackup.rk );
623  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
624  cblas_cgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
625  M, N, Bbackup.rk,
626  CBLAS_SADDR(cone), Bbackup.u, ldbu,
627  Bbackup.v, ldbv,
628  CBLAS_SADDR(czero), u, M );
629  kernel_trace_stop_lvl2( flops );
630  total_flops += flops;
631 
632  /* Add A into it */
633  if ( A->rk == -1 ) {
634  flops = 2 * M1 * N1;
635  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
636  core_cgeadd( transA1, M1, N1,
637  alpha, A->u, ldau,
638  cone, u + offy * M + offx, M);
639  kernel_trace_stop_lvl2( flops );
640  }
641  else {
642  flops = FLOPS_CGEMM( M1, N1, A->rk );
643  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
644  cblas_cgemm(CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transA1,
645  M1, N1, A->rk,
646  CBLAS_SADDR(alpha), A->u, ldau,
647  A->v, ldav,
648  CBLAS_SADDR(cone), u + offy * M + offx, M);
649  kernel_trace_stop_lvl2( flops );
650  }
651  total_flops += flops;
652  core_clrfree(&Bbackup);
653  memFree_null(zbuf);
654  return total_flops;
655  }
656  else if ( new_rank == 0 ) {
657  core_clrfree(B);
658  memFree_null(zbuf);
659  return total_flops;
660  }
661 
662  /*
663  * We need to reallocate the buffer to store the new compressed version of B
664  * because it wasn't big enough
665  */
666  ret = core_clrsze( 0, M, N, B, new_rank, -1, -1 );
667  assert( ret != -1 );
668  assert( B->rkmax >= new_rank );
669  assert( B->rkmax >= B->rk );
670 
671  ldbv = B->rkmax;
672 
673  /*
674  * Let's now compute the final U = Q1 ([u] sigma)
675  * [0]
676  */
677 #if defined(PASTIX_DEBUG_LR_NANCHECK)
678  ret = LAPACKE_claset_work( LAPACK_COL_MAJOR, 'A', M, new_rank,
679  0.0, 0.0, B->u, ldbu );
680  assert(ret == 0);
681  ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR, 'A', rank, new_rank,
682  u, rank, B->u, ldbu );
683  assert(ret == 0);
684 #else
685  tmp = B->u;
686  for (i=0; i<new_rank; i++, tmp+=ldbu, u+=rank) {
687  memcpy(tmp, u, rank * sizeof(pastix_complex32_t));
688  memset(tmp + rank, 0, (M - rank) * sizeof(pastix_complex32_t));
689  }
690 #endif
691 
692  flops = FLOPS_CUNMQR( M, new_rank, minU, PastixLeft )
693  + FLOPS_CUNMLQ( new_rank, N, minV, PastixRight );
694  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_computeNewU );
695  ret = LAPACKE_cunmqr_work(LAPACK_COL_MAJOR, 'L', 'N',
696  M, new_rank, minU,
697  u1u2, M, tauU,
698  B->u, ldbu,
699  zbuf, lwork);
700  assert( ret == 0 );
701 
702  /*
703  * And the final V^T = [v^t 0 ] Q2
704  */
705  tmp = B->v;
706  ret = LAPACKE_clacpy_work( LAPACK_COL_MAJOR, 'A', new_rank, rank,
707  v, rank, B->v, ldbv );
708  assert( ret == 0 );
709 
710  ret = LAPACKE_claset_work( LAPACK_COL_MAJOR, 'A', new_rank, N-rank,
711  0.0, 0.0, tmp + ldbv * rank, ldbv );
712  assert( ret == 0 );
713 
714  ret = LAPACKE_cunmlq_work(LAPACK_COL_MAJOR, 'R', 'N',
715  new_rank, N, minV,
716  v1v2, rank, tauV,
717  B->v, ldbv,
718  zbuf, lwork);
719  assert( ret == 0 );
720  kernel_trace_stop_lvl2( flops );
721  total_flops += flops;
722 
723  (void)ret;
724  memFree_null(zbuf);
725  return total_flops;
726 }
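
Similarly, a minimal sketch of how this recompression kernel might be invoked to accumulate a low-rank update into a larger low-rank block; the dimensions, offsets and the value of alpha are hypothetical placeholders.

/* Hypothetical example: accumulate B <- B + alpha * op(A), where A is an
 * M1-by-N1 low-rank block located at offsets (offx, offy) inside the
 * M2-by-N2 low-rank block B. */
static pastix_fixdbl_t
accumulate_update_example( const pastix_lr_t *lowrank,
                           const pastix_lrblock_t *A,
                           pastix_lrblock_t *B )
{
    pastix_complex32_t alpha = -1.0;  /* Contribution sign of the update */
    pastix_int_t M1 = 40,  N1 = 30;   /* Dimensions of A (placeholders)  */
    pastix_int_t M2 = 100, N2 = 80;   /* Dimensions of B (placeholders)  */
    pastix_int_t offx = 10, offy = 5; /* Position of A within B          */

    /* On return, B holds the recompressed sum, or its dense form if the new
     * rank exceeded the limit given by core_get_rklimit(). */
    return core_crradd_svd( lowrank, PastixNoTrans, &alpha,
                            M1, N1, A, M2, N2, B, offx, offy );
}
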