PaStiX Handbook  6.2.1
core_crqrrt.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_crqrrt.c
4  *
5  * PaStiX Rank-revealing QR kernel based on randomization technique with rotation.
6  *
7  * @copyright 2016-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.0.3
11  * @author Esragul Korkmaz
12  * @author Mathieu Faverge
13  * @date 2019-11-12
14  * @generated from /builds/solverstack/pastix/kernels/core_zrqrrt.c, normal z -> c, Tue Apr 12 09:38:39 2022
15  *
16  **/
17 #include "common.h"
18 #include <cblas.h>
19 #include <lapacke.h>
20 #include "common/frobeniusupdate.h"
21 #include "blend/solver.h"
22 #include "pastix_ccores.h"
23 #include "pastix_clrcores.h"
24 #include "c_nan_check.h"
25 
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/* Complex scalar constants passed by address (via CBLAS_SADDR) to the cblas_cgemm calls below. */
static pastix_complex32_t cone = 1.0;
static pastix_complex32_t czero = 0.0;
#endif /* DOXYGEN_SHOULD_SKIP_THIS */
30 
31 /**
32  *******************************************************************************
33  *
34  * @brief Compute a randomized QR factorization with rotation technique.
35  *
36  * This kernel implements the second method (for which no pseudocode is
37  * given) described in:
38  *
39  * Blocked rank-revealing QR factorizations: How randomized sampling can be used
40  * to avoid single-vector pivoting. P. G. Martinsson
41  *
42  * Note that we only use the trailing matrix for resampling. We don't use power
43  * of it for getting better results, since it would be too costly.
44  *
45  * Difference from randomized QRCP is :
46  * 1) resampling is used instead of sample update formula
47  * 2) Instead of pivoting A, rotation is applied on it
48  * 3) Instead of working with B and omega, directly their transposes are handled
49  * for simplicity
50  *
51  * The main difference in this algorithm compared to figure 5 in the Martinsson's
52  * paper:
53  * 1) First, we stop the iterative process based on a tolerance criterion
54  * 2) Second, we do not apply SVD for gathering the mass on the diagonal
55  * blocks, so our method is less costly but we expect it to be less
56  * close to SVD result
57  * 3) Third, we do not apply the power iteration for resampling for a closer
58  * result to SVD, since it is too costly
59  *
60  *******************************************************************************
61  *
62  * @param[in] tol
63  * The relative tolerance criterion. Computations are stopped when the
64  * frobenius norm of the residual matrix is lower than tol.
65  * If tol < 0, then maxrank reflectors are computed.
66  *
67  * @param[in] maxrank
68  * Maximum number of reflectors computed. Computations are stopped when
69  * the rank exceeds maxrank. If maxrank < 0, all reflectors are computed
70  * or up to the tolerance criterion.
71  *
72  * @param[in] nb
73  * Tuning parameter for the GEMM blocking size. if nb < 0, nb is set to
74  * 32.
75  *
76  * @param[in] m
77  * Number of rows of the matrix A.
78  *
79  * @param[in] n
80  * Number of columns of the matrix A.
81  *
82  * @param[inout] A
83  * The matrix of dimension lda-by-n that needs to be compressed.
84  * On output, the A matrix is computed up to the found
85  * rank k (k first columns and rows). Everything else should be dropped.
86  *
87  * @param[in] lda
88  * The leading dimension of the matrix A. lda >= max(1, m).
89  *
90  * @param[out] tau
91  * Contains scalar factors of the elementary reflectors for the matrix
92  * A.
93  *
94  * @param[out] B
95  * The matrix of dimension ldb-by-maxrank that will hold the partial
96  * projection of the matrix A.
97  * On output, each block of 32 columns can be used to compute the
98  * reverse rotation of the R part of A.
99  *
100  * @param[in] ldb
101  * The leading dimension of the matrix B. ldb >= max(1, n).
102  *
103  * @param[out] tau_b
104  * Contains scalar factors of the elementary reflectors for the matrix
105  * B.
106  *
107  * @param[in] work
108  * Workspace array of size lwork.
109  *
110  * @param[in] lwork
111  * The dimension of the work area. lwork >= (nb * max(m, n))
112  * If lwork == -1, the function returns immediately and work[0]
113  * contains the optimal size of work.
114  *
115  * @param[in] normA
116  * The norm of the input matrix A. If negative, the norm is computed by
117  * the kernel.
118  *
119  *******************************************************************************
120  *
121  * @return This routine will return the rank of A (>=0) or -1 if it didn't
122  * manage to compress within the margins of tolerance and maximum rank.
123  *
124  *******************************************************************************/
int
core_crqrrt( float tol, pastix_int_t maxrank, pastix_int_t nb,
             pastix_int_t m, pastix_int_t n,
             pastix_complex32_t *A, pastix_int_t lda, pastix_complex32_t *tau,
             pastix_complex32_t *B, pastix_int_t ldb, pastix_complex32_t *tau_b,
             pastix_complex32_t *work, pastix_int_t lwork, float normA )
{
    /* Fixed seed so that the random projection is reproducible across runs */
    int SEED[4] = {26, 67, 52, 197};
    int ret, i;
    /* Blocking size: number of reflectors computed per outer iteration */
    pastix_int_t bp = ( nb < 0 ) ? 32 : nb;
    pastix_int_t d, ib, loop = 1;
    pastix_int_t ldo = m;
    /* Size of the Gaussian sampling matrix omega (ldo-by-bp) */
    pastix_int_t size_O = ldo * bp;
    pastix_int_t rk, minMN, lwkopt;
    pastix_int_t sublw = n * bp;
    /* omega and subw alias the user-provided workspace
     * (replaced by dedicated buffers when PASTIX_DEBUG_LR is set, see below) */
    pastix_complex32_t *omega = work;
    pastix_complex32_t *subw = work;
    float normR;

    /* The workspace must be able to hold both omega (m*bp) and the
     * LAPACK sub-workspace (n*bp), hence the max */
    sublw = pastix_imax( sublw, size_O );

    char trans;
#if defined(PRECISION_c) || defined(PRECISION_z)
    trans = 'C';  /* Conjugate transpose in complex arithmetic */
#else
    trans = 'T';
#endif

    lwkopt = sublw;
    /* LAPACK-style workspace query: return the optimal lwork in work[0] */
    if ( lwork == -1 ) {
        work[0] = (pastix_complex32_t)lwkopt;
        return 0;
    }
#if !defined(NDEBUG)
    /* Parameter checks, compiled out in release builds */
    if (m < 0) {
        return -1;
    }
    if (n < 0) {
        return -2;
    }
    if (lda < pastix_imax(1, m)) {
        return -4;
    }
    if( lwork < lwkopt ) {
        return -8;
    }
#endif

    minMN = pastix_imin(m, n);
    if ( maxrank < 0 ) {
        maxrank = minMN;
    }
    maxrank = pastix_imin( maxrank, minMN );

    /* Compute the norm of A if not provided by the user */
    if ( normA < 0 ) {
        normA = LAPACKE_clange_work( LAPACK_COL_MAJOR, 'f', m, n,
                                     A, lda, NULL );
    }

    /**
     * If maximum rank is 0, then either the matrix norm is below the tolerance,
     * and we can return a null rank matrix, or it is not and we need to return
     * a full rank matrix.
     */
    if ( maxrank == 0 ) {
        if ( tol < 0. ) {
            return 0;
        }
        if ( normA < tol ) {
            return 0;
        }
        return -1;
    }

    /* Quick exit if A is null rank for the given tolerance */
    if ( normA < tol ) {
        return 0;
    }

#if defined(PASTIX_DEBUG_LR)
    /* In debug mode, use dedicated buffers instead of aliasing the workspace */
    omega = malloc( size_O * sizeof(pastix_complex32_t) );
    subw = malloc( sublw * sizeof(pastix_complex32_t) );
#endif

    /* Computation of the Gaussian matrix (larnv idist=3: normal distribution) */
    LAPACKE_clarnv_work(3, SEED, size_O, omega);

    rk = 0;
    while ( (rk < maxrank) && loop )
    {
        /*
         * Note that we can use maxrank instead of minMN to compute ib, as it is
         * useless to compute extra columns with rotation. The residual will
         * tell us if we managed to compress or not
         */
        ib = pastix_imin( bp, maxrank-rk );
        d = ib;

        /* Computation of the projection matrix B = A_{0:m,k:n}^H * omega */
        cblas_cgemm( CblasColMajor, CblasConjTrans, CblasNoTrans,
                     n-rk, d, m-rk,
                     CBLAS_SADDR(cone), A + rk*lda + rk, lda,
                                        omega, ldo,
                     CBLAS_SADDR(czero), B + rk*ldb + rk, ldb );

        /* Try to do some power iteration to refine the projection.
         * Intentionally disabled (if (0)): considered too costly, kept for reference. */
        if (0)
        {
            int l;
            for(l=0; l<2; l++)
            {
                cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                             m-rk, d, n-rk,
                             CBLAS_SADDR(cone), A + rk*lda + rk, lda,
                                                B + rk*ldb + rk, ldb,
                             CBLAS_SADDR(czero), omega, ldo );

                cblas_cgemm( CblasColMajor, CblasConjTrans, CblasNoTrans,
                             n-rk, d, m-rk,
                             CBLAS_SADDR(cone), A + rk*lda + rk, lda,
                                                omega, ldo,
                             CBLAS_SADDR(czero), B + rk*ldb + rk, ldb );
            }

            /* Computation of the Gaussian matrix */
            LAPACKE_clarnv_work(3, SEED, size_O, omega);
        }

        /*
         * QR factorization of the sample matrix B = Q_{B} R_{B}.
         * At the end, the householders will be stored at the lower part of the matrix
         */
        ret = LAPACKE_cgeqrf_work( LAPACK_COL_MAJOR, n-rk, d,
                                   B + rk*ldb + rk, ldb, tau_b+rk,
                                   subw, sublw );
        assert(ret == 0);

        /*
         * A_{0:m,k:n} = A_{0:m,k:n} Q_{B} for rotational version
         */
        ret = LAPACKE_cunmqr_work( LAPACK_COL_MAJOR, 'R', 'N',
                                   m - rk, n - rk, d,
                                   B + rk*ldb + rk, ldb, tau_b+rk,
                                   A + rk*lda + rk, lda,
                                   subw, sublw );
        assert(ret == 0);

        /*
         * Factorize d columns of A_{k:m,k:k+d} without pivoting
         */
        ret = LAPACKE_cgeqrf_work( LAPACK_COL_MAJOR, m-rk, d,
                                   A + rk*lda + rk, lda, tau + rk,
                                   subw, sublw );
        assert(ret == 0);

        if ( rk+d < n ) {
            /*
             * Update trailing submatrix: A_{k:m,k+d:n} <- Q^h A_{k:m,k+d:n}
             */
            ret = LAPACKE_cunmqr_work( LAPACK_COL_MAJOR, 'L', trans,
                                       m-rk, n-rk-d, d,
                                       A + rk *lda + rk, lda, tau + rk,
                                       A + (rk+d)*lda + rk, lda,
                                       subw, sublw );
            assert(ret == 0);
        }

        /* Let's update the residual norm (Frobenius norm of the trailing submatrix) */
        normR = LAPACKE_clange_work( LAPACK_COL_MAJOR, 'f', m-rk-d, n-rk-d, A + (rk+d) * (lda+1), lda, NULL );
        if ( normR < tol ) {
            /* Let's refine the rank: r <= rk+d */
            float ssq = 1.;
            float scl = normR;

            loop = 0;

            /*
             * Walk the d freshly-computed rows of R backwards (stride lda picks
             * one row), folding each row norm into the residual, to drop the
             * trailing reflectors that are not needed to stay below tol.
             */
            for( i=d-1; i>=0; i-- ) {
                float normRk = cblas_scnrm2( n-rk-i, A + (rk+i) * lda + (rk+i), lda );

                frobenius_update( 1., &scl, &ssq, &normRk );
                normRk = scl * sqrtf( ssq );

                if ( normRk > tol ) {
                    /*
                     * The actual rank is i (the i^th column has just been
                     * removed from the selection), and we need to be below the
                     * threshold tol, so we need the i from the previous
                     * iteration (+1)
                     */
                    d = i+1;
                    break;
                }
            }
        }
        rk += d;
    }

#if defined(PASTIX_DEBUG_LR)
    free( omega );
    free( subw );
#endif

    (void)ret; /* ret is only consumed by assert() in debug builds */
    /* Failure (-1) when the loop exhausted the rank budget without reaching tol */
    if ( (loop && (rk < minMN)) || (rk > maxrank) ) {
        return -1;
    }
    else {
        return rk;
    }
}
336 
337 /**
338  *******************************************************************************
339  *
340  * @brief Convert a full rank matrix in a low rank matrix, using RQRRT.
341  *
342  *******************************************************************************
343  *
344  * @param[in] use_reltol
345  * Defines if the kernel should use relative tolerance (tol *||A||), or
346  * absolute tolerance (tol).
347  *
348  * @param[in] tol
349  * The tolerance used as a criterion to eliminate information from the
350  * full rank matrix
351  *
352  * @param[in] rklimit
353  * The maximum rank to store the matrix in low-rank format. If
354  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
355  *
356  * @param[in] m
357  * Number of rows of the matrix A, and of the low rank matrix Alr.
358  *
359  * @param[in] n
360  * Number of columns of the matrix A, and of the low rank matrix Alr.
361  *
362  * @param[in] A
363  * The matrix of dimension lda-by-n that needs to be compressed
364  *
365  * @param[in] lda
366  * The leading dimension of the matrix A. lda >= max(1, m)
367  *
368  * @param[out] Alr
369  * The low rank matrix structure that will store the low rank
370  * representation of A
371  *
372  *******************************************************************************/
373 pastix_fixdbl_t
374 core_cge2lr_rqrrt( int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit,
375  pastix_int_t m, pastix_int_t n,
376  const void *A, pastix_int_t lda,
377  pastix_lrblock_t *Alr )
378 {
379  return core_cge2lr_qrrt( core_crqrrt, use_reltol, tol, rklimit,
380  m, n, A, lda, Alr );
381 }
solver.h
core_crqrrt
int core_crqrrt(float tol, pastix_int_t maxrank, pastix_int_t nb, pastix_int_t m, pastix_int_t n, pastix_complex32_t *A, pastix_int_t lda, pastix_complex32_t *tau, pastix_complex32_t *B, pastix_int_t ldb, pastix_complex32_t *tau_b, pastix_complex32_t *work, pastix_int_t lwork, float normA)
Compute a randomized QR factorization with rotation technique.
Definition: core_crqrrt.c:126
pastix_lrblock_s
The block low-rank structure to hold a matrix in low-rank form.
Definition: pastix_lowrank.h:112
core_cge2lr_rqrrt
pastix_fixdbl_t core_cge2lr_rqrrt(int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *A, pastix_int_t lda, pastix_lrblock_t *Alr)
Convert a full rank matrix in a low rank matrix, using RQRRT.
Definition: core_crqrrt.c:374
c_nan_check.h
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...
pastix_ccores.h
core_cge2lr_qrrt
pastix_fixdbl_t core_cge2lr_qrrt(core_crrqr_rt_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
Definition: core_cgelrops.c:937
pastix_clrcores.h