PaStiX Handbook  6.4.0
core_srqrcp.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_srqrcp.c
4  *
5  * PaStiX Rank-revealing QR kernel beased on randomization technique and partial
6  * QR with column pivoting.
7  *
8  * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
9  * Univ. Bordeaux. All rights reserved.
10  *
11  * @version 6.4.0
12  * @author Claire Soyez-Martin
13  * @author Esragul Korkmaz
14  * @author Mathieu Faverge
15  * @date 2024-07-05
16  * @generated from /builds/solverstack/pastix/kernels/core_zrqrcp.c, normal z -> s, Tue Oct 8 14:17:22 2024
17  *
18  **/
19 #include "common.h"
20 #include <cblas.h>
21 #include <lapacke.h>
22 #include "blend/solver.h"
23 #include "pastix_scores.h"
24 #include "pastix_slrcores.h"
25 #include "s_nan_check.h"
26 
27 #ifndef DOXYGEN_SHOULD_SKIP_THIS
28 static float msone = -1.0;
29 static float sone = 1.0;
30 static float szero = 0.0;
31 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
32 
33 /**
34  *******************************************************************************
35  *
36  * @brief Compute a randomized QR factorization.
37  *
38  * This kernel implements the algorithm described in:
39  * Fast Parallel Randomized QR with Column Pivoting Algorithms for Reliable
40  * Low-rank Matrix Approximations. Jianwei Xiao, Ming Gu, and Julien Langou
41  *
42  * The main difference in this algorithm relies in two points:
43  * 1) First, we stop the iterative porcess based on a tolerance criterion
44  * forwarded to the QR with column pivoting kernel, while they have a fixed
45  * number of iterations defined by parameter.
46  * 2) Second, we perform an extra PQRCP call on the trailing submatrix to
47  * finalize the computations, while in the paper above they use a spectrum
48  * revealing algorithm to refine the solution.
49  *
50  * More information can also be found in Finding Structure with Randomness:
51  * Probabilistic Algorithms for Constructing Approximate Matrix
52  * Decompositions. N. Halko, P. G. Martinsson, and J. A. Tropp
53  *
54  * Based on this paper, we set the p parameter to 5, as it seems sufficient, and
55  * because we iterate multiple times to get the final rank.
56  *
57  *******************************************************************************
58  *
59  * @param[in] tol
60  * The relative tolerance criterion. Computations are stopped when the
61  * frobenius norm of the residual matrix is lower than tol.
62  * If tol < 0, then maxrank reflectors are computed.
63  *
64  * @param[in] maxrank
65  * Maximum number of reflectors computed. Computations are stopped when
66  * the rank exceeds maxrank. If maxrank < 0, all reflectors are computed
67  * or up to the tolerance criterion.
68  *
69  * @param[in] refine
70  * Enable/disable the extra refinement step that performs an additional
71  * PQRCP on the trailing submatrix.
72  *
73  * @param[in] nb
74  * Tuning parameter for the GEMM blocking size. if nb < 0, nb is set to
75  * 32.
76  *
77  * @param[in] m
78  * Number of rows of the matrix A.
79  *
80  * @param[in] n
81  * Number of columns of the matrix A.
82  *
83  * @param[in] A
84  * The matrix of dimension lda-by-n that needs to be compressed.
85  *
86  * @param[in] lda
87  * The leading dimension of the matrix A. lda >= max(1, m).
88  *
89  * @param[out] jpvt
90  * The array that describes the permutation of A.
91  *
92  * @param[out] tau
93  * Contains scalar factors of the elementary reflectors for the matrix
94  * Q.
95  *
96  * @param[in] work
97  * Workspace array of size lwork.
98  *
99  * @param[in] lwork
100  * The dimension of the work area. lwork >= (nb * n + max(n, m) )
101  * If lwork == -1, the functions returns immediately and work[0]
102  * contains the optimal size of work.
103  *
104  * @param[in] rwork
105  * Workspace array used to store partial and exact column norms (2-by-n)
106  *
107  *******************************************************************************
108  *
109  * @return This routine will return the rank of A (>=0) or -1 if it didn't
110  * manage to compress within the margins of tolerance and maximum rank.
111  *
112  *******************************************************************************/
113 int
114 core_srqrcp( float tol,
115  pastix_int_t maxrank,
116  int refine,
117  pastix_int_t nb,
118  pastix_int_t m,
119  pastix_int_t n,
120  float *A,
121  pastix_int_t lda,
122  pastix_int_t *jpvt,
123  float *tau,
124  float *work,
125  pastix_int_t lwork,
126  float *rwork )
127 {
128  int SEED[4] = {26, 67, 52, 197};
129  pastix_int_t j, k, in, itmp, d, ib, loop = 1;
130  int ret;
131  pastix_int_t p = 5;
132  pastix_int_t bp = ( nb < p ) ? 32 : nb;
133  pastix_int_t b = bp - p;
134  pastix_int_t ldb = bp;
135  pastix_int_t ldo = bp;
136  pastix_int_t size_O = ldo * m;
137  pastix_int_t size_B = ldb * n;
138  pastix_int_t *jpvt_b;
139  pastix_int_t rk, minMN, lwkopt;
140  float tolB = sqrtf( (float)(bp) ) * tol;
141 
142  float *B = work;
143  float *tau_b = B + size_B;
144  float *omega = tau_b + n;
145  float *subw = tau_b + n;
146  pastix_int_t sublw = n * bp + pastix_imax( bp, n );
147  sublw = pastix_imax( sublw, size_O );
148 
149  char trans;
150 #if defined(PRECISION_c) || defined(PRECISION_z)
151  trans = 'C';
152 #else
153  trans = 'T';
154 #endif
155 
156  lwkopt = size_B + n + sublw;
157  if ( lwork == -1 ) {
158  work[0] = (float)lwkopt;
159  return 0;
160  }
161 #if !defined(NDEBUG)
162  if (m < 0) {
163  return -1;
164  }
165  if (n < 0) {
166  return -2;
167  }
168  if (lda < pastix_imax(1, m)) {
169  return -4;
170  }
171  if( lwork < lwkopt ) {
172  return -8;
173  }
174 #endif
175 
176  minMN = pastix_imin(m, n);
177  if ( maxrank < 0 ) {
178  maxrank = minMN;
179  }
180  maxrank = pastix_imin( maxrank, minMN );
181 
182  /**
183  * If maximum rank is 0, then either the matrix norm is below the tolerance,
184  * and we can return a null rank matrix, or it is not and we need to return
185  * a full rank matrix.
186  */
187  if ( maxrank == 0 ) {
188  float norm;
189  if ( tol < 0. ) {
190  return 0;
191  }
192  norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m, n,
193  A, lda, NULL );
194  if ( norm < tol ) {
195  return 0;
196  }
197  return -1;
198  }
199 
200 #if defined(PASTIX_DEBUG_LR)
201  B = malloc( size_B * sizeof(float) );
202  tau_b = malloc( n * sizeof(float) );
203  omega = malloc( size_O * sizeof(float) );
204  subw = malloc( sublw * sizeof(float) );
205 #endif
206 
207  jpvt_b = malloc( n * sizeof(pastix_int_t) );
208  for (j=0; j<n; j++) jpvt[j] = j;
209 
210  /* Computation of the Gaussian matrix */
211  ret = LAPACKE_slarnv_work(3, SEED, size_O, omega);
212  assert( ret == 0 );
213 
214  /* Project with the gaussian matrix: B = Omega * A */
215  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
216  bp, n, m,
217  (sone), omega, ldo,
218  A, lda,
219  (szero), B, ldb );
220 
221  rk = 0;
222  while ( loop )
223  {
224  ib = pastix_imin( b, minMN-rk );
225  d = core_spqrcp( tolB, ib, 1, bp,
226  bp, n-rk,
227  B + rk*ldb, ldb,
228  jpvt_b + rk, tau_b,
229  subw, sublw, /* >= (n*bp)+max(bp, n) */
230  rwork ); /* >= 2*n */
231 
232  /* If fails to reach the tolerance before maxrank, let's restore the max value */
233  if ( d == -1 ) {
234  d = ib;
235  }
236  /* If smaller than ib, we reached the threshold */
237  if ( d < ib ) {
238  loop = 0;
239  }
240  if ( d == 0 ) {
241  loop = 0;
242  break;
243  }
244  /* If we exceeded the max rank, let's stop now */
245  if ( (rk + d) > maxrank ) {
246  rk = -1;
247  break;
248  }
249 
250  /* Updating jpvt and A */
251  for (j = rk; j < rk + d; j++) {
252  if (jpvt_b[j] >= 0) {
253  k = j;
254  in = jpvt_b[k] + rk;
255 
256  /* Mark as done */
257  jpvt_b[k] = - jpvt_b[k] - 1;
258 
259  while( jpvt_b[in] >= 0 ) {
260 
261  if (k != in) {
262  cblas_sswap( m, A + k * lda, 1,
263  A + in * lda, 1 );
264 
265  itmp = jpvt[k];
266  jpvt[k] = jpvt[in];
267  jpvt[in] = itmp;
268  }
269  itmp = jpvt_b[in];
270  jpvt_b[in] = - jpvt_b[in] - 1;
271  k = in;
272  in = itmp + rk;
273  }
274  }
275  }
276 
277  /*
278  * Factorize d columns of A without pivoting
279  */
280  ret = LAPACKE_sgeqrf_work( LAPACK_COL_MAJOR, m-rk, d,
281  A + rk*lda + rk, lda, tau + rk,
282  subw, sublw );
283  assert(ret == 0);
284 
285  if ( rk+d < n ) {
286 
287  /*
288  * Update trailing submatrix: A <- Q^h A
289  */
290  ret = LAPACKE_sormqr_work( LAPACK_COL_MAJOR, 'L', trans,
291  m-rk, n-rk-d, d,
292  A + rk *lda + rk, lda, tau + rk,
293  A + (rk+d)*lda + rk, lda,
294  subw, sublw );
295  assert(ret == 0);
296 
297  /*
298  * The Q from partial QRCP is stored in the lower part of the matrix,
299  * we need to remove it
300  */
301  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'L', d-1, d-1,
302  0, 0, B + rk*ldb + 1, ldb );
303  assert( ret == 0 );
304 
305  /*
306  * Updating B
307  */
308  cblas_strsm( CblasColMajor, CblasRight, CblasUpper,
309  CblasNoTrans, CblasNonUnit,
310  d, d,
311  (sone), A + rk*lda + rk, lda,
312  B + rk*ldb, ldb );
313 
314  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
315  d, n - (rk+d), d,
316  (msone), B + rk *ldb, ldb,
317  A + (rk+d)*lda + rk, lda,
318  (sone), B + (rk+d)*ldb, ldb );
319  }
320  rk += d;
321  }
322 
323 #if 0
324  d = 0;
325  if ( refine && !loop && (rk < maxrank) ) {
326  /*
327  * Apply a Rank-revealing QR on the trailing submatrix to get the last
328  * columns
329  */
330  d = core_spqrcp( tol, maxrank-rk, 0, bp,
331  m-rk, n-rk,
332  A + rk * lda + rk, lda,
333  jpvt_b, tau + rk,
334  work, lwork, rwork );
335 
336  /* Updating jpvt and A */
337  for (j=0; j<d; j++) {
338  if (jpvt_b[j] >= 0) {
339  k = j;
340  in = jpvt_b[k];
341 
342  /* Mark as done */
343  jpvt_b[k] = - jpvt_b[k] - 1;
344 
345  while( jpvt_b[in] >= 0 ) {
346  if (k != in) {
347  /* Swap columns in first rows */
348  cblas_sswap( rk, A + (rk + k ) * lda, 1,
349  A + (rk + in) * lda, 1 );
350 
351  itmp = jpvt[rk + k];
352  jpvt[rk + k] = jpvt[rk + in];
353  jpvt[rk + in] = itmp;
354  }
355  itmp = jpvt_b[in];
356  jpvt_b[in] = - jpvt_b[in] - 1;
357  k = in;
358  in = itmp;
359  }
360  }
361  }
362 
363  rk += d;
364  if ( rk > maxrank ) {
365  rk = -1;
366  }
367  }
368 #endif
369  free( jpvt_b );
370 
371 #if defined(PASTIX_DEBUG_LR)
372  free( B );
373  free( tau_b );
374  free( omega );
375  free( subw );
376 #endif
377 
378  (void)ret;
379  (void)refine;
380  return rk;
381 }
382 
383 /**
384  *******************************************************************************
385  *
386  * @brief Convert a full rank matrix in a low rank matrix, using RQRCP.
387  *
388  *******************************************************************************
389  *
390  * @param[in] use_reltol
391  * Defines if the kernel should use relative tolerance (tol *||A||), or
392  * absolute tolerance (tol).
393  *
394  * @param[in] tol
395  * The tolerance used as a criterion to eliminate information from the
396  * full rank matrix
397  *
398  * @param[in] rklimit
399  * The maximum rank to store the matrix in low-rank format. If
400  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
401  *
402  * @param[in] m
403  * Number of rows of the matrix A, and of the low rank matrix Alr.
404  *
405  * @param[in] n
406  * Number of columns of the matrix A, and of the low rank matrix Alr.
407  *
408  * @param[in] A
409  * The matrix of dimension lda-by-n that needs to be compressed
410  *
411  * @param[in] lda
412  * The leading dimension of the matrix A. lda >= max(1, m)
413  *
414  * @param[out] Alr
415  * The low rank matrix structure that will store the low rank
416  * representation of A
417  *
418  *******************************************************************************
419  *
420  * @return TODO
421  *
422  *******************************************************************************/
424 core_sge2lr_rqrcp( int use_reltol,
425  pastix_fixdbl_t tol,
426  pastix_int_t rklimit,
427  pastix_int_t m,
428  pastix_int_t n,
429  const void *A,
430  pastix_int_t lda,
431  pastix_lrblock_t *Alr )
432 {
433  return core_sge2lr_qrcp( core_srqrcp, use_reltol, tol, rklimit,
434  m, n, A, lda, Alr );
435 }
436 
437 /**
438  *******************************************************************************
439  *
440  * @brief Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
441  *
442  * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
443  * Orthogonalize (u2 u1) = (u2, u1 - u2(u2^T u1)) * (I u2^T u1)
444  * (0 I )
445  * Compute RQRCP decomposition of (I u2^T u1) * (v2 v1)^T
446  * (0 I )
447  *
448  *******************************************************************************
449  *
450  * @param[in] lowrank
451  * The structure with low-rank parameters.
452  *
453  * @param[in] transA1
454  * @arg PastixNoTrans: No transpose, op( A ) = A;
455  * @arg PastixTrans: Transpose, op( A ) = A';
456  *
457  * @param[in] alphaptr
458  * alpha * A is add to B
459  *
460  * @param[in] M1
461  * The number of rows of the matrix A.
462  *
463  * @param[in] N1
464  * The number of columns of the matrix A.
465  *
466  * @param[in] A
467  * The low-rank representation of the matrix A.
468  *
469  * @param[in] M2
470  * The number of rows of the matrix B.
471  *
472  * @param[in] N2
473  * The number of columns of the matrix B.
474  *
475  * @param[in] B
476  * The low-rank representation of the matrix B.
477  *
478  * @param[in] offx
479  * The horizontal offset of A with respect to B.
480  *
481  * @param[in] offy
482  * The vertical offset of A with respect to B.
483  *
484  *******************************************************************************
485  *
486  * @return The new rank of u2 v2^T or -1 if ranks are too large for
487  * recompression
488  *
489  *******************************************************************************/
492  pastix_trans_t transA1,
493  const void *alphaptr,
494  pastix_int_t M1,
495  pastix_int_t N1,
496  const pastix_lrblock_t *A,
497  pastix_int_t M2,
498  pastix_int_t N2,
499  pastix_lrblock_t *B,
500  pastix_int_t offx,
501  pastix_int_t offy)
502 {
503  return core_srradd_qr( core_srqrcp, lowrank, transA1, alphaptr,
504  M1, N1, A, M2, N2, B, offx, offy );
505 }
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
int core_spqrcp(float tol, pastix_int_t maxrank, int full_update, pastix_int_t nb, pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, pastix_int_t *jpvt, float *tau, float *work, pastix_int_t lwork, float *rwork)
Compute a rank-reavealing QR factorization.
Definition: core_spqrcp.c:105
int core_srqrcp(float tol, pastix_int_t maxrank, int refine, pastix_int_t nb, pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, pastix_int_t *jpvt, float *tau, float *work, pastix_int_t lwork, float *rwork)
Compute a randomized QR factorization.
Definition: core_srqrcp.c:114
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
pastix_fixdbl_t core_sge2lr_rqrcp(int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *A, pastix_int_t lda, pastix_lrblock_t *Alr)
Convert a full rank matrix in a low rank matrix, using RQRCP.
Definition: core_srqrcp.c:424
pastix_fixdbl_t core_sge2lr_qrcp(core_srrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
pastix_fixdbl_t core_srradd_qr(core_srrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
pastix_fixdbl_t core_srradd_rqrcp(const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T.
Definition: core_srqrcp.c:491
enum pastix_trans_e pastix_trans_t
Transpostion.
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...