PaStiX Handbook  6.2.1
core_sgelrops_svd.c
1 /**
2  *
3  * @file core_sgelrops_svd.c
4  *
5  * PaStiX low-rank kernel routines using SVD based on Lapack SGESVD.
6  *
7  * @copyright 2016-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.2.1
11  * @author Gregoire Pichon
12  * @author Esragul Korkmaz
13  * @author Mathieu Faverge
14  * @author Nolan Bredel
15  * @date 2021-06-16
16  * @generated from /builds/solverstack/pastix/kernels/core_zgelrops_svd.c, normal z -> s, Tue Apr 12 09:38:40 2022
17  *
18  **/
19 #include "common.h"
20 #include <cblas.h>
21 #include <lapacke.h>
22 #include "flops.h"
23 #include "kernels_trace.h"
24 #include "blend/solver.h"
25 #include "pastix_scores.h"
26 #include "pastix_slrcores.h"
27 #include "s_nan_check.h"
28 
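/*
 * When PASTIX_SVD_2NORM is defined, the compression rank is selected with the
 * 2-norm criterion (each singular value is compared to the threshold);
 * otherwise the Frobenius norm of the trailing singular values, computed by
 * core_slassq() below, is used.
 */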
29 #define PASTIX_SVD_2NORM 1
30 #if !defined(PASTIX_SVD_2NORM)
31 #include "common/frobeniusupdate.h"
32 
33 /**
34  *******************************************************************************
35  *
36  * @ingroup kernel_lr_svd_null
37  *
38  * @brief Compute the Frobenius norm of a vector
39  *
40  * This routine is inspired by the LAPACK slassq function, and allows
41  * accumulating the contributions backward for better accuracy, as opposed to
42  * snrm2 which only allows a positive increment.
43  *
44  *******************************************************************************
45  *
46  * @param[in] n
47  * The number of elements in the vector
48  *
49  * @param[in] x
50  * The vector of size n * incx
51  *
52  * @param[in] incx
53  * The increment between two elements in the vector x.
54  *
55  *******************************************************************************
56  *
57  * @return The Frobenius norm of the vector x.
58  *
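 * A usage sketch (illustrative): accumulate the trailing values of an array s
 * of k singular values, walking backward from the last one:
 *
 * @code
 * frob_norm = core_slassq( k - i, s + k - 1, -1 );
 * @endcode
 *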
59  *******************************************************************************/
60 static inline float
61 core_slassq( int n, const float *x, int incx )
62 {
63  float scale = 1.;
64  float sumsq = 0.;
65  int i;
66 
67  for( i=0; i<n; i++, x+=incx ) {
68  frobenius_update( 1, &scale, &sumsq, x );
69  }
70 
71  return scale * sqrtf( sumsq );
72 }
73 #endif /* !defined(PASTIX_SVD_2NORM) */
74 
75 #ifndef DOXYGEN_SHOULD_SKIP_THIS
76 static float sone = 1.0;
77 static float szero = 0.0;
78 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
79 
80 /**
81  *******************************************************************************
82  *
83  * @brief Convert a full rank matrix into a low rank matrix, using SVD.
84  *
85  *******************************************************************************
86  *
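 * @param[in] use_reltol
 * Defines if the kernel should use the relative tolerance (tol * ||A||)
 * or the absolute tolerance (tol) as the compression threshold.
 *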
87  * @param[in] tol
88  * The tolerance used as a criterion to eliminate information from the
89  * full rank matrix.
90  * If tol < 0, then we compress up to rklimit. So if rklimit is set to
91  * min(m,n) and tol < 0., we get a full representation of the matrix
92  * in the form U * V^t.
93  *
94  * @param[in] rklimit
95  * The maximum rank to store the matrix in low-rank format. If
96  * -1, set to min(M, N) / PASTIX_LR_MINRATIO.
97  *
98  * @param[in] m
99  * Number of rows of the matrix A, and of the low rank matrix Alr.
100  *
101  * @param[in] n
102  * Number of columns of the matrix A, and of the low rank matrix Alr.
103  *
104  * @param[in] A
105  * The matrix of dimension lda-by-n that needs to be compressed
106  *
107  * @param[in] lda
108  * The leading dimension of the matrix A. lda >= max(1, m)
109  *
110  * @param[out] Alr
111  * The low rank matrix structure that will store the low rank
112  * representation of A
113  *
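 *
 * A minimal calling sketch (buffer names are illustrative): compress a dense
 * m-by-n block Adense with an absolute tolerance, letting the kernel pick the
 * default rank limit:
 *
 * @code
 * pastix_lrblock_t Alr;
 * core_sge2lr_svd( 0, 1e-8, -1, m, n, Adense, lda, &Alr );
 * // ... use Alr.rk, Alr.u and Alr.v ...
 * core_slrfree( &Alr );
 * @endcode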
114  *******************************************************************************/
115 pastix_fixdbl_t
116 core_sge2lr_svd( int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit,
117  pastix_int_t m, pastix_int_t n,
118  const void *Avoid, pastix_int_t lda,
119  pastix_lrblock_t *Alr )
120 {
121  const float *A = (const float*)Avoid;
122  pastix_fixdbl_t flops = 0.0;
123  float *u, *v, *zwork, *Acpy, ws;
124  float *rwork, *s;
125  pastix_int_t i, ret, ldu, ldv;
126  pastix_int_t minMN, imax;
127  pastix_int_t lwork = -1;
128  pastix_int_t zsize, rsize;
129  float norm;
130 
131 #if !defined(NDEBUG)
132  if ( m < 0 ) {
133  return -2;
134  }
135  if ( n < 0 ) {
136  return -3;
137  }
138  if ( lda < m ) {
139  return -5;
140  }
141 #endif
142 
143  norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m, n,
144  A, lda, NULL );
145 
146  /* Quick return on norm */
147  if ( (norm == 0.) && (tol >= 0.) ) {
148  core_slralloc( m, n, 0, Alr );
149  return 0. ;
150  }
151 
152  rklimit = ( rklimit < 0 ) ? core_get_rklimit( m, n ) : rklimit;
153  if ( tol < 0. ) {
154  tol = -1.;
155  }
156  else if ( use_reltol ) {
157  tol = tol * norm;
158  }
159 
160  /* Quick return on max rank */
161  minMN = pastix_imin(m, n);
162  rklimit = pastix_imin( minMN, rklimit );
163 
164  /**
165  * If maximum rank is 0, then either the matrix norm is below the tolerance,
166  * and we can return a null rank matrix, or it is not and we need to return
167  * a full rank matrix.
168  */
169  if ( rklimit == 0 ) {
170  if ( (tol < 0.) || (norm < tol) ) {
171  core_slralloc( m, n, 0, Alr );
172  return 0.;
173  }
174 
175  /* Return full rank */
176  core_slralloc( m, n, -1, Alr );
177  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
178  A, lda, Alr->u, Alr->rkmax );
179  assert(ret == 0);
180  return 0.;
181  }
182 
183  /*
184  * Allocate a temporary Low rank matrix to store the full U and V
185  */
186  core_slralloc( m, n, pastix_imin( m, n ), Alr );
187  u = Alr->u;
188  v = Alr->v;
189  ldu = m;
190  ldv = Alr->rkmax;
191 
192  /*
193  * Query the workspace needed for the gesvd
194  */
195 #if defined(PASTIX_DEBUG_LR_NANCHECK)
196  ws = minMN;
197 #else
198  ret = MYLAPACKE_sgesvd_work( LAPACK_COL_MAJOR, 'S', 'S',
199  m, n, NULL, m,
200  NULL, NULL, ldu, NULL, ldv,
201  &ws, lwork, NULL );
202 #endif
203  lwork = ws;
204  zsize = ws;
205  zsize += m * n; /* Copy of the matrix A */
206 
207  rsize = minMN;
208 #if defined(PRECISION_z) || defined(PRECISION_c)
209  rsize += 5 * minMN;
210 #endif
211 
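 /* The single allocation below is laid out as: [ gesvd workspace | copy of A | singular values ] */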
212  zwork = malloc( zsize * sizeof(float) + rsize * sizeof(float) );
213  rwork = (float*)(zwork + zsize);
214 
215  Acpy = zwork + lwork;
216  s = rwork;
217 
218  /*
219  * Back up the original matrix before it is overwritten by the SVD
220  */
221  ret = LAPACKE_slacpy_work(LAPACK_COL_MAJOR, 'A', m, n,
222  A, lda, Acpy, m );
223  assert( ret == 0 );
224 
225  ret = MYLAPACKE_sgesvd_work( LAPACK_COL_MAJOR, 'S', 'S',
226  m, n, Acpy, m,
227  s, u, ldu, v, ldv,
228  zwork, lwork, rwork + minMN );
229  if ( ret != 0 ) {
230  pastix_print_error( "SVD Failed\n" );
231  }
232 
233  /* Stop i before going too far: one extra value is allowed to detect that rklimit is exceeded */
234  imax = pastix_imin( minMN, rklimit+1 );
235  for (i=0; i<imax; i++, v+=1) {
236  float frob_norm;
237 
238  /*
239  * There are two different stopping criteria for SVD to decide the
240  * compression rank:
241  * 1) The 2-norm:
242  * Compare the singular values to the threshold
243  * 2) The Frobenius norm:
244  * Compare the Frobenius norm of the trailing singular values to
245  * the threshold. Note that we use a reverse accumulation of the
246  * singular values to avoid accuracy issues.
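 * In formulas, criterion 1) stops at the first i such that s[i] < tol, while
 * criterion 2) stops as soon as sqrt( s[i]^2 + ... + s[minMN-1]^2 ) < tol.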
247  */
248 #if defined(PASTIX_SVD_2NORM)
249  frob_norm = s[i];
250 #else
251  frob_norm = core_slassq( minMN-i, s + minMN - 1, -1 );
252 #endif
253 
254  if (frob_norm < tol) {
255  break;
256  }
257  cblas_sscal(n, s[i], v, ldv);
258  }
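 /* On exit, i is the selected numerical rank, and the first i rows of v have been scaled by their singular values */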
259 
260  /*
261  * Resize the space used by the low rank matrix
262  */
263  core_slrsze( 1, m, n, Alr, i, -1, rklimit );
264 
265  /*
266  * Compression was not worthwhile, so we restore the dense version in Alr
267  */
268  if (Alr->rk == -1) {
269  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
270  A, lda, Alr->u, Alr->rkmax );
271  assert(ret == 0);
272  }
273 
274  memFree_null(zwork);
275  return flops;
276 }
277 
278 /**
279  *******************************************************************************
280  *
281  * @brief Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
282  *
283  * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
284  * Compute QR decomposition of (u2 u1) = Q1 R1
285  * Compute QR decomposition of (v2 v1) = Q2 R2
286  * Compute SVD of R1 R2^T = u sigma v^T
287  * Final solution is (Q1 u sigma^[1/2]) (Q2 v sigma^[1/2])^T
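 *
 * In this implementation, v1v2 holds (v2 v1)^T, so an LQ factorization
 * (v2 v1)^T = L2 Q2 is computed instead of the QR of (v2 v1), and the SVD is
 * applied to the small rank-by-rank matrix R = R1 * L2.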
288  *
289  *******************************************************************************
290  *
291  * @param[in] lowrank
292  * The structure with low-rank parameters.
293  *
294  * @param[in] transA1
295  * @arg PastixNoTrans: No transpose, op( A ) = A;
296  * @arg PastixTrans: Transpose, op( A ) = A';
297  *
298  * @param[in] alpha
299  * alpha * A is added to B
300  *
301  * @param[in] M1
302  * The number of rows of the matrix A.
303  *
304  * @param[in] N1
305  * The number of columns of the matrix A.
306  *
307  * @param[in] A
308  * The low-rank representation of the matrix A.
309  *
310  * @param[in] M2
311  * The number of rows of the matrix B.
312  *
313  * @param[in] N2
314  * The number of columns of the matrix B.
315  *
316  * @param[in] B
317  * The low-rank representation of the matrix B.
318  *
319  * @param[in] offx
320  * The horizontal offset of A with respect to B.
321  *
322  * @param[in] offy
323  * The vertical offset of A with respect to B.
324  *
325  *******************************************************************************
326  *
327  * @return The number of flops required to add A to B
328  *
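 * A calling sketch (variable names are illustrative); in PaStiX this kernel is
 * normally reached through the low-rank parameter structure rather than being
 * called directly:
 *
 * @code
 * float mone = -1.0;
 * core_srradd_svd( lowrank, PastixNoTrans, &mone,
 *                  M1, N1, A, M2, N2, B, offx, offy );
 * @endcode
 *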
329  *******************************************************************************/
330 pastix_fixdbl_t
331 core_srradd_svd( const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr,
332  pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A,
333  pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B,
334  pastix_int_t offx, pastix_int_t offy)
335 {
336  pastix_int_t rank, M, N, minU, minV;
337  pastix_int_t i, ret, lwork, new_rank;
338  pastix_int_t ldau, ldav, ldbu, ldbv;
339  float *u1u2, *v1v2, *R, *u, *v;
340  float *tmp, *zbuf, *tauU, *tauV;
341  float querysize;
342  float alpha = *((float*)alphaptr);
343  float *s;
344  size_t wzsize, wdsize;
345  float tol = lowrank->tolerance;
346  pastix_fixdbl_t flops, total_flops = 0.0;
347 
348  rank = (A->rk == -1) ? pastix_imin(M1, N1) : A->rk;
349  rank += B->rk;
350  M = pastix_imax(M2, M1);
351  N = pastix_imax(N2, N1);
352  minU = pastix_imin(M, rank);
353  minV = pastix_imin(N, rank);
354 
355  assert(M2 == M && N2 == N);
356  assert(B->rk != -1);
357 
358  assert( A->rk <= A->rkmax);
359  assert( B->rk <= B->rkmax);
360 
361  if ( ((M1 + offx) > M2) ||
362  ((N1 + offy) > N2) )
363  {
364  pastix_print_error( "Dimensions are not correct" );
365  assert(0 /* Incorrect dimensions */);
366  return total_flops;
367  }
368 
369  /*
370  * A is rank null, nothing to do
371  */
372  if (A->rk == 0) {
373  return total_flops;
374  }
375 
376  ldau = (A->rk == -1) ? A->rkmax : M1;
377  ldav = (transA1 == PastixNoTrans) ? A->rkmax : N1;
378  ldbu = M;
379  ldbv = B->rkmax;
380 
381  /*
382  * Let's handle the case where B is a null matrix:
383  * B = alpha A
384  */
385  if (B->rk == 0) {
386  core_slrcpy( lowrank, transA1, alpha,
387  M1, N1, A, M2, N2, B,
388  offx, offy );
389  return total_flops;
390  }
391 
392  /*
393  * The rank is too big, let's try to compress
394  */
395  if ( rank > pastix_imin( M, N ) ) {
396  assert(0);
397  }
398 
399  /*
400  * Let's compute the size of the workspace
401  */
402  /* u1u2 and v1v2 */
403  wzsize = (M+N) * rank;
404  /* tauU and tauV */
405  wzsize += minU + minV;
406 
407  /* Storage of R, u and v */
408  wzsize += 3 * rank * rank;
409 
410  /* QR/LQ workspace */
411  lwork = pastix_imax( M, N ) * 32;
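 /* The factor 32 is assumed to be an upper bound on the blocking size used by geqrf/gelqf */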
412 
413  /* Workspace needed for the gesvd */
414 #if defined(PASTIX_DEBUG_LR_NANCHECK)
415  querysize = rank;
416 #else
417  ret = MYLAPACKE_sgesvd_work( LAPACK_COL_MAJOR, 'S', 'S',
418  rank, rank, NULL, rank,
419  NULL, NULL, rank, NULL, rank,
420  &querysize, -1, NULL);
421 #endif
422  lwork = pastix_imax( lwork, querysize );
423  wzsize += lwork;
424 
425  wdsize = rank;
426 #if defined(PRECISION_z) || defined(PRECISION_c)
427  wdsize += 5 * rank;
428 #endif
429 
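 /* The single allocation below is laid out as: [ workspace | u1u2 | tauU | v1v2 | tauV | R | u | v | s ] */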
430  zbuf = malloc( wzsize * sizeof(float) + wdsize * sizeof(float) );
431  s = (float*)(zbuf + wzsize);
432 
433  u1u2 = zbuf + lwork;
434  tauU = u1u2 + M * rank;
435  v1v2 = tauU + minU;
436  tauV = v1v2 + N * rank;
437  R = tauV + minV;
438 
439  /*
440  * Concatenate U2 and U1 in u1u2
441  * [ u2 0 ]
442  * [ u2 u1 ]
443  * [ u2 0 ]
444  */
445  core_slrconcatenate_u( alpha,
446  M1, N1, A,
447  M2, B,
448  offx, u1u2 );
449 
450  /*
451  * Perform QR factorization on u1u2 = (Q1 R1)
452  */
453  ret = LAPACKE_sgeqrf_work( LAPACK_COL_MAJOR, M, rank,
454  u1u2, M, tauU, zbuf, lwork );
455  assert( ret == 0 );
456  total_flops += FLOPS_SGEQRF( M, rank );
457 
458  /*
459  * Concatenate V2 and V1 in v1v2
460  * [ v2^h v2^h v2^h ]
461  * [ 0 v1^h 0 ]
462  */
463  core_slrconcatenate_v( transA1, alpha,
464  M1, N1, A,
465  N2, B,
466  offy, v1v2 );
467 
468  /*
469  * Perform LQ factorization on v1v2 = (L2 Q2)
470  */
471  ret = LAPACKE_sgelqf_work( LAPACK_COL_MAJOR, rank, N,
472  v1v2, rank, tauV, zbuf, lwork );
473  assert(ret == 0);
474  total_flops += FLOPS_SGELQF( rank, N );
475 
476  /*
477  * Compute R = R1 L2 (alpha is already taken into account in the concatenated factors)
478  */
479  u = R + rank * rank;
480  v = u + rank * rank;
481 
482  memset(R, 0, rank * rank * sizeof(float));
483 
484  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'U', rank, rank,
485  u1u2, M, R, rank );
486  assert(ret == 0);
487 
488  cblas_strmm(CblasColMajor,
489  CblasRight, CblasLower,
490  CblasNoTrans, CblasNonUnit,
491  rank, rank, (sone),
492  v1v2, rank, R, rank);
493  total_flops += FLOPS_STRMM( PastixRight, rank, rank );
494 
495  if ( lowrank->use_reltol ) {
496  /**
497  * With a relative tolerance, two solutions are possible:
498  * 1) The first one, more conservative, is to compress relatively to
499  * the norm of the final matrix \[ \alpha A + B \]. In this kernel, we
500  * exploit the fact that the previously computed product contains all the
501  * information of the final matrix, and do it as follows:
502  *
503  * float norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', rank, rank,
504  * R, rank, NULL );
505  * tol = tol * norm;
506  *
507  * 2) The second solution, less conservative, allows reducing the
508  * rank more efficiently. Since A and B have been compressed relatively
509  * to their respective norms, there is no reason to compress the sum
510  * relatively to its own norm; it is more reasonable to compress it
511  * relatively to the norms of A and B. For example, A-B would be full
512  * rank with the first criterion, and rank null with the second.
513  * Note that here, we only have an upper bound, which once again makes
514  * the criterion less conservative:
515  * || \alpha A + B || <= |\alpha| ||A|| + ||B|| <= |\alpha| ||U_aV_a|| + ||U_bV_b||
516  *
517  */
518  float normA, normB;
519  normA = core_slrnrm( PastixFrobeniusNorm, transA1, M1, N1, A );
520  normB = core_slrnrm( PastixFrobeniusNorm, PastixNoTrans, M2, N2, B );
521  tol = tol * ( fabsf(alpha) * normA + normB );
522  }
523 
524  /*
525  * Compute svd(R) = u sigma v^t
526  */
527  /* Missing the flops of the u and v generation */
528  flops = FLOPS_SGEQRF( rank, rank ) + FLOPS_SGELQF( rank, (rank-1) );
529 
530  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_recompression );
531  ret = MYLAPACKE_sgesvd_work( CblasColMajor, 'S', 'S',
532  rank, rank, R, rank,
533  s, u, rank, v, rank,
534  zbuf, lwork, s + rank );
535  if ( ret != 0 ) {
536  pastix_print_error( "LAPACKE_sgesvd FAILED" );
537  }
538 
539  /*
540  * Let's compute the new rank of the result
541  */
542  tmp = v;
543 
544  for (i=0; i<rank; i++, tmp+=1){
545  float frob_norm;
546 
547  /*
548  * There are two different stopping criteria for SVD to decide the
549  * compression rank:
550  * 1) The 2-norm:
551  * Compare the singular values to the threshold
552  * 2) The Frobenius norm:
553  * Compare the Frobenius norm of the trailing singular values to
554  * the threshold. Note that we use a reverse accumulation of the
555  * singular values to avoid accuracy issues.
556  */
557 #if defined(PASTIX_SVD_2NORM)
558  frob_norm = s[i];
559 #else
560  frob_norm = core_slassq( rank-i, s + rank - 1, -1 );
561 #endif
562 
563  if (frob_norm < tol) {
564  break;
565  }
566  cblas_sscal(rank, s[i], tmp, rank);
567  }
568  new_rank = i;
569  kernel_trace_stop_lvl2_rank( flops, new_rank );
570  total_flops += flops;
571 
572  /*
573  * First case: The rank is too big, so we decide to uncompress the result
574  */
575  if ( new_rank > core_get_rklimit( M, N ) ) {
576  pastix_lrblock_t Bbackup = *B;
577 
578  core_slralloc( M, N, -1, B );
579  u = B->u;
580 
581  /* Uncompress B */
582  flops = FLOPS_SGEMM( M, N, Bbackup.rk );
583  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
584  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
585  M, N, Bbackup.rk,
586  (sone), Bbackup.u, ldbu,
587  Bbackup.v, ldbv,
588  (szero), u, M );
589  kernel_trace_stop_lvl2( flops );
590  total_flops += flops;
591 
592  /* Add A into it */
593  if ( A->rk == -1 ) {
594  flops = 2 * M1 * N1;
595  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
596  core_sgeadd( transA1, M1, N1,
597  alpha, A->u, ldau,
598  sone, u + offy * M + offx, M);
599  kernel_trace_stop_lvl2( flops );
600  }
601  else {
602  flops = FLOPS_SGEMM( M1, N1, A->rk );
603  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
604  cblas_sgemm(CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transA1,
605  M1, N1, A->rk,
606  (alpha), A->u, ldau,
607  A->v, ldav,
608  (sone), u + offy * M + offx, M);
609  kernel_trace_stop_lvl2( flops );
610  }
611  total_flops += flops;
612  core_slrfree(&Bbackup);
613  memFree_null(zbuf);
614  return total_flops;
615  }
616  else if ( new_rank == 0 ) {
617  core_slrfree(B);
618  memFree_null(zbuf);
619  return total_flops;
620  }
621 
622  /*
623  * We need to reallocate the buffer to store the new compressed version of B
624  * because it wasn't big enough
625  */
626  ret = core_slrsze( 0, M, N, B, new_rank, -1, -1 );
627  assert( ret != -1 );
628  assert( B->rkmax >= new_rank );
629  assert( B->rkmax >= B->rk );
630 
631  ldbv = B->rkmax;
632 
633  /*
634  * Let's now compute the final U = Q1 ([u] sigma)
635  * [0]
636  */
637 #if defined(PASTIX_DEBUG_LR_NANCHECK)
638  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', M, new_rank,
639  0.0, 0.0, B->u, ldbu );
640  assert(ret == 0);
641  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', rank, new_rank,
642  u, rank, B->u, ldbu );
643  assert(ret == 0);
644 #else
645  tmp = B->u;
646  for (i=0; i<new_rank; i++, tmp+=ldbu, u+=rank) {
647  memcpy(tmp, u, rank * sizeof(float));
648  memset(tmp + rank, 0, (M - rank) * sizeof(float));
649  }
650 #endif
651 
652  flops = FLOPS_SORMQR( M, new_rank, minU, PastixLeft )
653  + FLOPS_SORMLQ( new_rank, N, minV, PastixRight );
654  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_rradd_computeNewU );
655  ret = LAPACKE_sormqr_work(LAPACK_COL_MAJOR, 'L', 'N',
656  M, new_rank, minU,
657  u1u2, M, tauU,
658  B->u, ldbu,
659  zbuf, lwork);
660  assert( ret == 0 );
661 
662  /*
663  * And the final V^T = [v^t 0 ] Q2
664  */
665  tmp = B->v;
666  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', new_rank, rank,
667  v, rank, B->v, ldbv );
668  assert( ret == 0 );
669 
670  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', new_rank, N-rank,
671  0.0, 0.0, tmp + ldbv * rank, ldbv );
672  assert( ret == 0 );
673 
674  ret = LAPACKE_sormlq_work(LAPACK_COL_MAJOR, 'R', 'N',
675  new_rank, N, minV,
676  v1v2, rank, tauV,
677  B->v, ldbv,
678  zbuf, lwork);
679  assert( ret == 0 );
680  kernel_trace_stop_lvl2( flops );
681  total_flops += flops;
682 
683  memFree_null(zbuf);
684  return total_flops;
685 }