PaStiX Handbook 6.4.0
core_crqrrt.c
/**
 *
 * @file core_crqrrt.c
 *
 * PaStiX Rank-revealing QR kernel based on randomization technique with rotation.
 *
 * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
 *                      Univ. Bordeaux. All rights reserved.
 *
 * @version 6.4.0
 * @author Esragul Korkmaz
 * @author Mathieu Faverge
 * @date 2024-07-05
 * @generated from /builds/2mk6rsew/0/solverstack/pastix/kernels/core_zrqrrt.c, normal z -> c, Tue Feb 25 14:34:51 2025
 *
 **/
#include "common.h"
#include <cblas.h>
#include <lapacke.h>
#include "common/frobeniusupdate.h"
#include "blend/solver.h"
#include "pastix_ccores.h"
#include "pastix_clrcores.h"
#include "c_nan_check.h"

#ifndef DOXYGEN_SHOULD_SKIP_THIS
static pastix_complex32_t cone  = 1.0;
static pastix_complex32_t czero = 0.0;
#endif /* DOXYGEN_SHOULD_SKIP_THIS */

/**
 *******************************************************************************
 *
 * @brief Compute a randomized QR factorization with rotation technique.
 *
 * This kernel implements the second method (for which the paper gives no
 * pseudocode) described in:
 *
 * Blocked rank-revealing QR factorizations: How randomized sampling can be used
 * to avoid single-vector pivoting. P. G. Martinsson
 *
 * Note that we use only the trailing matrix for the resampling. We do not
 * apply power iterations to it to improve the results, since it would be too
 * costly.
 *
 * The differences from randomized QRCP are:
 * 1) Resampling is used instead of the sample update formula
 * 2) Instead of pivoting A, a rotation is applied to it
 * 3) Instead of working with B and omega, their transposes are handled
 *    directly for simplicity
 *
 * The main differences of this algorithm with respect to figure 5 in
 * Martinsson's paper are:
 * 1) First, we stop the iterative process based on a tolerance criterion
 * 2) Second, we do not apply an SVD to gather the mass on the diagonal
 *    blocks, so our method is less costly, but we expect its result to be
 *    further away from the SVD one
 * 3) Third, we do not apply power iterations to the resampling to get a
 *    result closer to the SVD one, since it is too costly
 *
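 * One iteration on the trailing submatrix, as implemented below, can be
 * sketched as follows, with \f$\Omega\f$ a Gaussian random matrix:
 *
 * \f[ B = A^H \Omega, \qquad B = Q_B R_B, \qquad
 *     A \leftarrow A \, Q_B, \qquad A = Q_A R_A \f]
 *
 * The iterations stop as soon as the Frobenius norm of the trailing residual
 * falls below the tolerance.
 *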
 *******************************************************************************
 *
 * @param[in] tol
 *          The relative tolerance criterion. Computations are stopped when the
 *          Frobenius norm of the residual matrix is lower than tol.
 *          If tol < 0, then maxrank reflectors are computed.
 *
 * @param[in] maxrank
 *          Maximum number of reflectors computed. Computations are stopped when
 *          the rank exceeds maxrank. If maxrank < 0, reflectors are computed
 *          until the tolerance criterion is satisfied, or min(m, n) is reached.
 *
 * @param[in] nb
 *          Tuning parameter for the GEMM blocking size. If nb < 0, nb is set
 *          to 32.
 *
 * @param[in] m
 *          Number of rows of the matrix A.
 *
 * @param[in] n
 *          Number of columns of the matrix A.
 *
 * @param[inout] A
 *          The matrix of dimension lda-by-n that needs to be compressed.
 *          On exit, the factorization of A is computed up to the found rank k
 *          (the first k columns and rows). Everything else should be dropped.
 *
 * @param[in] lda
 *          The leading dimension of the matrix A. lda >= max(1, m).
 *
 * @param[out] tau
 *          Contains the scalar factors of the elementary reflectors for the
 *          matrix A.
 *
 * @param[out] B
 *          The matrix of dimension ldb-by-maxrank that holds the partial
 *          projection of the matrix A.
 *          On output, each block of 32 columns can be used to compute the
 *          reverse rotation of the R part of A.
 *
 * @param[in] ldb
 *          The leading dimension of the matrix B. ldb >= max(1, n).
 *
 * @param[out] tau_b
 *          Contains the scalar factors of the elementary reflectors for the
 *          matrix B.
 *
 * @param[in] work
 *          Workspace array of size lwork.
 *
 * @param[in] lwork
 *          The dimension of the work area. lwork >= (nb * max(m, n)).
 *          If lwork == -1, the function returns immediately and work[0]
 *          contains the optimal size of work.
 *
 * @param[in] normA
 *          The norm of the input matrix A. If negative, the norm is computed
 *          by the kernel.
 *
 *******************************************************************************
 *
 * @return This routine returns the rank of A (>= 0), or -1 if it did not
 *         manage to compress within the given tolerance and maximum rank.
 *
 *******************************************************************************/
int
core_crqrrt( float               tol,
             pastix_int_t        maxrank,
             pastix_int_t        nb,
             pastix_int_t        m,
             pastix_int_t        n,
             pastix_complex32_t *A,
             pastix_int_t        lda,
             pastix_complex32_t *tau,
             pastix_complex32_t *B,
             pastix_int_t        ldb,
             pastix_complex32_t *tau_b,
             pastix_complex32_t *work,
             pastix_int_t        lwork,
             float               normA )
{
    int SEED[4] = {26, 67, 52, 197};
    int ret, i;
    pastix_int_t bp = ( nb < 0 ) ? 32 : nb;
    pastix_int_t d, ib, loop = 1;
    pastix_int_t ldo = m;
    pastix_int_t size_O = ldo * bp;
    pastix_int_t rk, minMN, lwkopt;
    pastix_int_t sublw = n * bp;
    pastix_complex32_t *omega = work;
    pastix_complex32_t *subw  = work;
    float normR;

    sublw = pastix_imax( sublw, size_O );

    char trans;
#if defined(PRECISION_c) || defined(PRECISION_z)
    trans = 'C';
#else
    trans = 'T';
#endif

    lwkopt = sublw;
    if ( lwork == -1 ) {
        work[0] = (pastix_complex32_t)lwkopt;
        return 0;
    }
#if !defined(NDEBUG)
    if (m < 0) {
        return -1;
    }
    if (n < 0) {
        return -2;
    }
    if (lda < pastix_imax(1, m)) {
        return -4;
    }
    if( lwork < lwkopt ) {
        return -8;
    }
#endif

    minMN = pastix_imin(m, n);
    if ( maxrank < 0 ) {
        maxrank = minMN;
    }
    maxrank = pastix_imin( maxrank, minMN );

    /* Compute the norm of A if not provided by the user */
    if ( normA < 0 ) {
        normA = LAPACKE_clange_work( LAPACK_COL_MAJOR, 'f', m, n,
                                     A, lda, NULL );
    }

    /**
     * If maximum rank is 0, then either the matrix norm is below the tolerance,
     * and we can return a null rank matrix, or it is not and we need to return
     * a full rank matrix.
     */
    if ( maxrank == 0 ) {
        if ( tol < 0. ) {
            return 0;
        }
        if ( normA < tol ) {
            return 0;
        }
        return -1;
    }

    /* Quick exit if A is null rank for the given tolerance */
    if ( normA < tol ) {
        return 0;
    }

#if defined(PASTIX_DEBUG_LR)
    omega = malloc( size_O * sizeof(pastix_complex32_t) );
    subw  = malloc( sublw  * sizeof(pastix_complex32_t) );
#endif

    /* Computation of the Gaussian matrix */
    ret = LAPACKE_clarnv_work(3, SEED, size_O, omega);
    assert( ret == 0 );

    rk = 0;
    while ( (rk < maxrank) && loop )
    {
        /*
         * Note that we can use maxrank instead of minMN to compute ib, as it is
         * useless to compute extra columns with rotation. The residual will
         * tell us if we managed to compress or not
         */
        ib = pastix_imin( bp, maxrank-rk );
        d = ib;

        /* Computation of the projection matrix B = A_{0:m,k:n}^H * omega */
        cblas_cgemm( CblasColMajor, CblasConjTrans, CblasNoTrans,
                     n-rk, d, m-rk,
                     CBLAS_SADDR(cone),  A + rk*lda + rk, lda,
                                         omega,           ldo,
                     CBLAS_SADDR(czero), B + rk*ldb + rk, ldb );

        /* Try to do some power iterations to refine the projection (disabled) */
        if (0)
        {
            int l;
            for(l=0; l<2; l++)
            {
                cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                             m-rk, d, n-rk,
                             CBLAS_SADDR(cone),  A + rk*lda + rk, lda,
                                                 B + rk*ldb + rk, ldb,
                             CBLAS_SADDR(czero), omega,           ldo );

                cblas_cgemm( CblasColMajor, CblasConjTrans, CblasNoTrans,
                             n-rk, d, m-rk,
                             CBLAS_SADDR(cone),  A + rk*lda + rk, lda,
                                                 omega,           ldo,
                             CBLAS_SADDR(czero), B + rk*ldb + rk, ldb );
            }

            /* Computation of the Gaussian matrix */
            ret = LAPACKE_clarnv_work(3, SEED, size_O, omega);
            assert( ret == 0 );
        }

        /*
         * QR factorization of the sample matrix B = Q_{B} R_{B}.
         * At the end, the Householder reflectors are stored in the lower part
         * of the matrix
         */
        ret = LAPACKE_cgeqrf_work( LAPACK_COL_MAJOR, n-rk, d,
                                   B + rk*ldb + rk, ldb, tau_b+rk,
                                   subw, sublw );
        assert(ret == 0);

        /*
         * A_{0:m,k:n} = A_{0:m,k:n} Q_{B} for rotational version
         */
        ret = LAPACKE_cunmqr_work( LAPACK_COL_MAJOR, 'R', 'N',
                                   m - rk, n - rk, d,
                                   B + rk*ldb + rk, ldb, tau_b+rk,
                                   A + rk*lda + rk, lda,
                                   subw, sublw );
        assert(ret == 0);

        /*
         * Factorize d columns of A_{k:m,k:k+d} without pivoting
         */
        ret = LAPACKE_cgeqrf_work( LAPACK_COL_MAJOR, m-rk, d,
                                   A + rk*lda + rk, lda, tau + rk,
                                   subw, sublw );
        assert(ret == 0);

        if ( rk+d < n ) {
            /*
             * Update the trailing submatrix: A_{k:m,k+d:n} <- Q^H A_{k:m,k+d:n}
             */
            ret = LAPACKE_cunmqr_work( LAPACK_COL_MAJOR, 'L', trans,
                                       m-rk, n-rk-d, d,
                                       A + rk    *lda + rk, lda, tau + rk,
                                       A + (rk+d)*lda + rk, lda,
                                       subw, sublw );
            assert(ret == 0);
        }

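        /*
         * normR below is the Frobenius norm of the trailing residual once the
         * d new columns are eliminated. When it already satisfies the
         * tolerance, the backward scan re-accumulates the norms of the last
         * rows of R one at a time through frobenius_update(), i.e.
         *   normRk = sqrt( normR^2 + sum_{j=i}^{d-1} ||R_{rk+j, rk+j:n}||^2 ),
         * to keep only the smallest number of columns d for which the
         * residual stays below tol.
         */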
        /* Let's update the residual norm */
        normR = LAPACKE_clange_work( LAPACK_COL_MAJOR, 'f', m-rk-d, n-rk-d,
                                     A + (rk+d) * (lda+1), lda, NULL );
        if ( normR < tol ) {
            /* Let's refine the rank: r <= rk+d */
            float ssq = 1.;
            float scl = normR;

            loop = 0;

            for( i=d-1; i>=0; i-- ) {
                float normRk = cblas_scnrm2( n-rk-i, A + (rk+i) * lda + (rk+i), lda );

                frobenius_update( 1., &scl, &ssq, &normRk );
                normRk = scl * sqrtf( ssq );

                if ( normRk > tol ) {
                    /*
                     * The actual rank is i (the i^th column has just been
                     * removed from the selection), and we need to be below the
                     * threshold tol, so we need the i from the previous
                     * iteration (+1)
                     */
                    d = i+1;
                    break;
                }
            }
        }
        rk += d;
    }

#if defined(PASTIX_DEBUG_LR)
    free( omega );
    free( subw );
#endif

    (void)ret;
    if ( (loop && (rk < minMN)) || (rk > maxrank) ) {
        return -1;
    }
    else {
        return rk;
    }
}
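
/*
 * Illustrative usage sketch, not part of the original file: how one might call
 * core_crqrrt() with a workspace query. The helper name example_compress and
 * the tolerance value are hypothetical; <stdlib.h> and <complex.h> are assumed
 * to be available through common.h, and error checks are omitted for brevity.
 */
static int
example_compress( pastix_int_t m, pastix_int_t n,
                  pastix_complex32_t *A, pastix_int_t lda )
{
    pastix_int_t        minMN   = pastix_imin( m, n );
    pastix_int_t        maxrank = minMN / 2;
    pastix_int_t        lwork;
    pastix_complex32_t  wquery;
    pastix_complex32_t *tau, *B, *tau_b, *work;
    int                 rk;

    /* Workspace query: with lwork == -1, only work[0] is set (optimal size) */
    core_crqrrt( 1e-4, maxrank, -1, m, n, A, lda,
                 NULL, NULL, n, NULL, &wquery, -1, -1. );
    lwork = (pastix_int_t)crealf( wquery );

    tau   = malloc( minMN       * sizeof(pastix_complex32_t) );
    B     = malloc( n * maxrank * sizeof(pastix_complex32_t) );
    tau_b = malloc( maxrank     * sizeof(pastix_complex32_t) );
    work  = malloc( lwork       * sizeof(pastix_complex32_t) );

    /* Compress A: rk >= 0 is the computed rank, -1 means compression failed */
    rk = core_crqrrt( 1e-4, maxrank, -1, m, n, A, lda,
                      tau, B, n, tau_b, work, lwork, -1. );

    free( tau );
    free( B );
    free( tau_b );
    free( work );
    return rk;
}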

/**
 *******************************************************************************
 *
 * @brief Convert a full-rank matrix into a low-rank matrix, using RQRRT.
 *
 *******************************************************************************
 *
 * @param[in] use_reltol
 *          Defines if the kernel should use the relative tolerance
 *          (tol * ||A||), or the absolute tolerance (tol).
 *
 * @param[in] tol
 *          The tolerance used as a criterion to eliminate information from the
 *          full-rank matrix.
 *
 * @param[in] rklimit
 *          The maximum rank to store the matrix in low-rank format. If
 *          -1, set to min(m, n) / PASTIX_LR_MINRATIO.
 *
 * @param[in] m
 *          Number of rows of the matrix A, and of the low-rank matrix Alr.
 *
 * @param[in] n
 *          Number of columns of the matrix A, and of the low-rank matrix Alr.
 *
 * @param[in] A
 *          The matrix of dimension lda-by-n that needs to be compressed.
 *
 * @param[in] lda
 *          The leading dimension of the matrix A. lda >= max(1, m).
 *
 * @param[out] Alr
 *          The low-rank matrix structure that will store the low-rank
 *          representation of A.
 *
 *******************************************************************************
 *
 * @return TODO
 *
 *******************************************************************************/
pastix_fixdbl_t
core_cge2lr_rqrrt( int               use_reltol,
                   pastix_fixdbl_t   tol,
                   pastix_int_t      rklimit,
                   pastix_int_t      m,
                   pastix_int_t      n,
                   const void       *A,
                   pastix_int_t      lda,
                   pastix_lrblock_t *Alr )
{
    return core_cge2lr_qrrt( core_crqrrt, use_reltol, tol, rklimit,
                             m, n, A, lda, Alr );
}
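
/*
 * Illustrative usage sketch, not part of the original file: compressing a
 * dense block into a pastix_lrblock_t. The helper name example_ge2lr and the
 * tolerance value are hypothetical; core_clrfree() is assumed to be the
 * matching low-rank destructor from pastix_clrcores.h.
 */
static void
example_ge2lr( pastix_int_t m, pastix_int_t n,
               const pastix_complex32_t *A, pastix_int_t lda )
{
    pastix_lrblock_t lrA;

    /* Compress A with a relative tolerance of 1e-8. rklimit == -1 uses the
     * default limit min(m, n) / PASTIX_LR_MINRATIO. */
    core_cge2lr_rqrrt( 1, 1e-8, -1, m, n, A, lda, &lrA );

    /* ... use lrA (lrA.rk holds the compressed rank) ... */

    core_clrfree( &lrA );
}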