PaStiX Handbook  6.4.0
core_srqrrt.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_srqrrt.c
4  *
5  * PaStiX Rank-revealing QR kernel based on randomization technique with rotation.
6  *
7  * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Esragul Korkmaz
12  * @author Mathieu Faverge
13  * @date 2024-07-05
14  * @generated from /builds/solverstack/pastix/kernels/core_zrqrrt.c, normal z -> s, Thu Aug 29 14:20:20 2024
15  *
16  **/
17 #include "common.h"
18 #include <cblas.h>
19 #include <lapacke.h>
20 #include "common/frobeniusupdate.h"
21 #include "blend/solver.h"
22 #include "pastix_scores.h"
23 #include "pastix_slrcores.h"
24 #include "s_nan_check.h"
25 
26 #ifndef DOXYGEN_SHOULD_SKIP_THIS
27 static float sone = 1.0;
28 static float szero = 0.0;
29 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
30 
31 /**
32  *******************************************************************************
33  *
34  * @brief Compute a randomized QR factorization with rotation technique.
35  *
36  * This kernel implements the second method (he did not write a pseudocode for
37  * the second method) described in :
38  *
39  * Blocked rank-revealing QR factorizations: How randomized sampling can be used
40  * to avoid single-vector pivoting. P. G. Martinsson
41  *
42  * Note that we only use the trailing matrix for resampling. We don't use power
43  * of it for getting better results, since it would be too costly.
44  *
45  * Difference from randomized QRCP is :
46  * 1) resampling is used instead of sample update formula
47  * 2) Instead of pivoting A, rotation is applied on it
48  * 3) Instead of working with B and omega, directly their transposes are handled
49  * for simplicity
50  *
51  * The main difference in this algorithm compared to figure 5 in the Martinsson's
52  * paper:
53  * 1) First, we stop the iterative process based on a tolerance criterion
54  * 2) Second, we do not apply SVD for gathering the mass on the diagonal
55  * blocks, so our method is less costly but we expect it to be less
56  * close to SVD result
57  * 3) Third, we do not apply the power iteration for resampling for a closer
58  * result to SVD, since it is too costly
59  *
60  *******************************************************************************
61  *
62  * @param[in] tol
63  * The relative tolerance criterion. Computations are stopped when the
64  * frobenius norm of the residual matrix is lower than tol.
65  * If tol < 0, then maxrank reflectors are computed.
66  *
67  * @param[in] maxrank
68  * Maximum number of reflectors computed. Computations are stopped when
69  * the rank exceeds maxrank. If maxrank < 0, all reflectors are computed
70  * or up to the tolerance criterion.
71  *
72  * @param[in] nb
73  * Tuning parameter for the GEMM blocking size. if nb < 0, nb is set to
74  * 32.
75  *
76  * @param[in] m
77  * Number of rows of the matrix A.
78  *
79  * @param[in] n
80  * Number of columns of the matrix A.
81  *
82  * @param[inout] A
83  * The matrix of dimension lda-by-n that needs to be compressed.
84  * On output, the A matrix is computed up to the founded
85  * rank k (k first columns and rows). Everything else, should be dropped.
86  *
87  * @param[in] lda
88  * The leading dimension of the matrix A. lda >= max(1, m).
89  *
90  * @param[out] tau
91  * Contains scalar factors of the elementary reflectors for the matrix
92  * A.
93  *
94  * @param[out] B
95  * The matrix of dimension ldb-by-maxrank that will holds the partial
96  * projection of the matrix A.
97  * On output, each block of 32 columns can be used to computed the
98  * reverse rotation of the R part of A.
99  *
100  * @param[in] ldb
101  * The leading dimension of the matrix B. ldb >= max(1, n).
102  *
103  * @param[out] tau_b
104  * Contains scalar factors of the elementary reflectors for the matrix
105  * B.
106  *
107  * @param[in] work
108  * Workspace array of size lwork.
109  *
110  * @param[in] lwork
111  * The dimension of the work area. lwork >= (nb * max(n,n))
112  * If lwork == -1, the function returns immediately and work[0]
113  * contains the optimal size of work.
114  *
115  * @param[in] normA
116  * The norm of the input matrixA. If negative, the norm is computed by
117  * the kernel.
118  *
119  *******************************************************************************
120  *
121  * @return This routine will return the rank of A (>=0) or -1 if it didn't
122  * manage to compress within the margins of tolerance and maximum rank.
123  *
124  *******************************************************************************/
125 int
126 core_srqrrt( float tol,
127  pastix_int_t maxrank,
128  pastix_int_t nb,
129  pastix_int_t m,
130  pastix_int_t n,
131  float *A,
132  pastix_int_t lda,
133  float *tau,
134  float *B,
135  pastix_int_t ldb,
136  float *tau_b,
137  float *work,
138  pastix_int_t lwork,
139  float normA )
140 {
141  int SEED[4] = {26, 67, 52, 197};
142  int ret, i;
143  pastix_int_t bp = ( nb < 0 ) ? 32 : nb;
144  pastix_int_t d, ib, loop = 1;
145  pastix_int_t ldo = m;
146  pastix_int_t size_O = ldo * bp;
147  pastix_int_t rk, minMN, lwkopt;
148  pastix_int_t sublw = n * bp;
149  float *omega = work;
150  float *subw = work;
151  float normR;
152 
153  sublw = pastix_imax( sublw, size_O );
154 
155  char trans;
156 #if defined(PRECISION_c) || defined(PRECISION_z)
157  trans = 'C';
158 #else
159  trans = 'T';
160 #endif
161 
162  lwkopt = sublw;
163  if ( lwork == -1 ) {
164  work[0] = (float)lwkopt;
165  return 0;
166  }
167 #if !defined(NDEBUG)
168  if (m < 0) {
169  return -1;
170  }
171  if (n < 0) {
172  return -2;
173  }
174  if (lda < pastix_imax(1, m)) {
175  return -4;
176  }
177  if( lwork < lwkopt ) {
178  return -8;
179  }
180 #endif
181 
182  minMN = pastix_imin(m, n);
183  if ( maxrank < 0 ) {
184  maxrank = minMN;
185  }
186  maxrank = pastix_imin( maxrank, minMN );
187 
188  /* Compute the norm of A if not provided by the user */
189  if ( normA < 0 ) {
190  normA = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m, n,
191  A, lda, NULL );
192  }
193 
194  /**
195  * If maximum rank is 0, then either the matrix norm is below the tolerance,
196  * and we can return a null rank matrix, or it is not and we need to return
197  * a full rank matrix.
198  */
199  if ( maxrank == 0 ) {
200  if ( tol < 0. ) {
201  return 0;
202  }
203  if ( normA < tol ) {
204  return 0;
205  }
206  return -1;
207  }
208 
209  /* Quick exit if A is null rank for the given tolerance */
210  if ( normA < tol ) {
211  return 0;
212  }
213 
214 #if defined(PASTIX_DEBUG_LR)
215  omega = malloc( size_O * sizeof(float) );
216  subw = malloc( sublw * sizeof(float) );
217 #endif
218 
219  /* Computation of the Gaussian matrix */
220  ret = LAPACKE_slarnv_work(3, SEED, size_O, omega);
221  assert( ret == 0 );
222 
223  rk = 0;
224  while ( (rk < maxrank) && loop )
225  {
226  /*
227  * Note that we can use maxrank instead of minMN to compute ib, as it is
228  * useless to compute extra columns with rotation. The residual will
229  * tell us if we managed to compress or not
230  */
231  ib = pastix_imin( bp, maxrank-rk );
232  d = ib;
233 
234  /* Computation of the projection matrix B = A_{0:m,k:n}^T * omega */
235  cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans,
236  n-rk, d, m-rk,
237  (sone), A + rk*lda + rk, lda,
238  omega, ldo,
239  (szero), B + rk*ldb + rk, ldb );
240 
241  /* Try to do some power iteration to refine the projection */
242  if (0)
243  {
244  int l;
245  for(l=0; l<2; l++)
246  {
247  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
248  m-rk, d, n-rk,
249  (sone), A + rk*lda + rk, lda,
250  B + rk*ldb + rk, ldb,
251  (szero), omega, ldo );
252 
253  cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans,
254  n-rk, d, m-rk,
255  (sone), A + rk*lda + rk, lda,
256  omega, ldo,
257  (szero), B + rk*ldb + rk, ldb );
258  }
259 
260  /* Computation of the Gaussian matrix */
261  ret = LAPACKE_slarnv_work(3, SEED, size_O, omega);
262  assert( ret == 0 );
263  }
264 
265  /*
266  * QR factorization of the sample matrix B = Q_{B} R_{B}.
267  * At the end, the householders will be stored at the lower part of the matrix
268  */
269  ret = LAPACKE_sgeqrf_work( LAPACK_COL_MAJOR, n-rk, d,
270  B + rk*ldb + rk, ldb, tau_b+rk,
271  subw, sublw );
272  assert(ret == 0);
273 
274  /*
275  * A_{0:m,k:n} = A_{0:m,k:n} Q_{B} for rotational version
276  */
277  ret = LAPACKE_sormqr_work( LAPACK_COL_MAJOR, 'R', 'N',
278  m - rk, n - rk, d,
279  B + rk*ldb + rk, ldb, tau_b+rk,
280  A + rk*lda + rk, lda,
281  subw, sublw );
282  assert(ret == 0);
283 
284  /*
285  * Factorize d columns of A_{k:m,k:k+d} without pivoting
286  */
287  ret = LAPACKE_sgeqrf_work( LAPACK_COL_MAJOR, m-rk, d,
288  A + rk*lda + rk, lda, tau + rk,
289  subw, sublw );
290  assert(ret == 0);
291 
292  if ( rk+d < n ) {
293  /*
294  * Update trailing submatrix: A_{k:m,k+d:n} <- Q^h A_{k:m,k+d:n}
295  */
296  ret = LAPACKE_sormqr_work( LAPACK_COL_MAJOR, 'L', trans,
297  m-rk, n-rk-d, d,
298  A + rk *lda + rk, lda, tau + rk,
299  A + (rk+d)*lda + rk, lda,
300  subw, sublw );
301  assert(ret == 0);
302  }
303 
304  /* Let's update the residual norm */
305  normR = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m-rk-d, n-rk-d, A + (rk+d) * (lda+1), lda, NULL );
306  if ( normR < tol ) {
307  /* Let's refine the rank: r <= rk+d */
308  float ssq = 1.;
309  float scl = normR;
310 
311  loop = 0;
312 
313  for( i=d-1; i>=0; i-- ) {
314  float normRk = cblas_snrm2( n-rk-i, A + (rk+i) * lda + (rk+i), lda );
315 
316  frobenius_update( 1., &scl, &ssq, &normRk );
317  normRk = scl * sqrtf( ssq );
318 
319  if ( normRk > tol ) {
320  /*
321  * The actual rank is i (the i^th column has just been
322  * removed from the selection), and we need to be below the
323  * threshold tol, so we need the i from the previous
324  * iteration (+1)
325  */
326  d = i+1;
327  break;
328  }
329  }
330  }
331  rk += d;
332  }
333 
334 #if defined(PASTIX_DEBUG_LR)
335  free( omega );
336  free( subw );
337 #endif
338 
339  (void)ret;
340  if ( (loop && (rk < minMN)) || (rk > maxrank) ) {
341  return -1;
342  }
343  else {
344  return rk;
345  }
346 }
347 
348 /**
349  *******************************************************************************
350  *
351  * @brief Convert a full rank matrix in a low rank matrix, using RQRRT.
352  *
353  *******************************************************************************
354  *
355  * @param[in] use_reltol
356  * Defines if the kernel should use relative tolerance (tol *||A||), or
357  * absolute tolerance (tol).
358  *
359  * @param[in] tol
360  * The tolerance used as a criterion to eliminate information from the
361  * full rank matrix
362  *
363  * @param[in] rklimit
364  * The maximum rank to store the matrix in low-rank format. If
365  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
366  *
367  * @param[in] m
368  * Number of rows of the matrix A, and of the low rank matrix Alr.
369  *
370  * @param[in] n
371  * Number of columns of the matrix A, and of the low rank matrix Alr.
372  *
373  * @param[in] A
374  * The matrix of dimension lda-by-n that needs to be compressed
375  *
376  * @param[in] lda
377  * The leading dimension of the matrix A. lda >= max(1, m)
378  *
379  * @param[out] Alr
380  * The low rank matrix structure that will store the low rank
381  * representation of A
382  *
383  *******************************************************************************
384  *
385  * @return TODO
386  *
387  *******************************************************************************/
389 core_sge2lr_rqrrt( int use_reltol,
390  pastix_fixdbl_t tol,
391  pastix_int_t rklimit,
392  pastix_int_t m,
393  pastix_int_t n,
394  const void *A,
395  pastix_int_t lda,
396  pastix_lrblock_t *Alr )
397 {
398  return core_sge2lr_qrrt( core_srqrrt, use_reltol, tol, rklimit,
399  m, n, A, lda, Alr );
400 }
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
int core_srqrrt(float tol, pastix_int_t maxrank, pastix_int_t nb, pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, float *tau, float *B, pastix_int_t ldb, float *tau_b, float *work, pastix_int_t lwork, float normA)
Compute a randomized QR factorization with rotation technique.
Definition: core_srqrrt.c:126
The block low-rank structure to hold a matrix in low-rank form.
pastix_fixdbl_t core_sge2lr_qrrt(core_srrqr_rt_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
pastix_fixdbl_t core_sge2lr_rqrrt(int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *A, pastix_int_t lda, pastix_lrblock_t *Alr)
Convert a full rank matrix in a low rank matrix, using RQRRT.
Definition: core_srqrrt.c:389
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...