PaStiX Handbook 6.4.0
Loading...
Searching...
No Matches
core_drqrcp.c
Go to the documentation of this file.
1/**
2 *
3 * @file core_drqrcp.c
4 *
5 * PaStiX Rank-revealing QR kernel beased on randomization technique and partial
6 * QR with column pivoting.
7 *
8 * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
9 * Univ. Bordeaux. All rights reserved.
10 *
11 * @version 6.4.0
12 * @author Claire Soyez-Martin
13 * @author Esragul Korkmaz
14 * @author Mathieu Faverge
15 * @date 2024-07-05
16 * @generated from /builds/2mk6rsew/0/solverstack/pastix/kernels/core_zrqrcp.c, normal z -> d, Tue Feb 25 14:34:54 2025
17 *
18 **/
19#include "common.h"
20#include <cblas.h>
21#include <lapacke.h>
22#include "blend/solver.h"
23#include "pastix_dcores.h"
24#include "pastix_dlrcores.h"
25#include "d_nan_check.h"
26
27#ifndef DOXYGEN_SHOULD_SKIP_THIS
28static double mdone = -1.0;
29static double done = 1.0;
30static double dzero = 0.0;
31#endif /* DOXYGEN_SHOULD_SKIP_THIS */
32
33/**
34 *******************************************************************************
35 *
36 * @brief Compute a randomized QR factorization.
37 *
38 * This kernel implements the algorithm described in:
39 * Fast Parallel Randomized QR with Column Pivoting Algorithms for Reliable
40 * Low-rank Matrix Approximations. Jianwei Xiao, Ming Gu, and Julien Langou
41 *
42 * The main difference in this algorithm relies in two points:
43 * 1) First, we stop the iterative porcess based on a tolerance criterion
44 * forwarded to the QR with column pivoting kernel, while they have a fixed
45 * number of iterations defined by parameter.
46 * 2) Second, we perform an extra PQRCP call on the trailing submatrix to
47 * finalize the computations, while in the paper above they use a spectrum
48 * revealing algorithm to refine the solution.
49 *
50 * More information can also be found in Finding Structure with Randomness:
51 * Probabilistic Algorithms for Constructing Approximate Matrix
52 * Decompositions. N. Halko, P. G. Martinsson, and J. A. Tropp
53 *
54 * Based on this paper, we set the p parameter to 5, as it seems sufficient, and
55 * because we iterate multiple times to get the final rank.
56 *
57 *******************************************************************************
58 *
59 * @param[in] tol
60 * The relative tolerance criterion. Computations are stopped when the
61 * frobenius norm of the residual matrix is lower than tol.
62 * If tol < 0, then maxrank reflectors are computed.
63 *
64 * @param[in] maxrank
65 * Maximum number of reflectors computed. Computations are stopped when
66 * the rank exceeds maxrank. If maxrank < 0, all reflectors are computed
67 * or up to the tolerance criterion.
68 *
69 * @param[in] refine
70 * Enable/disable the extra refinement step that performs an additional
71 * PQRCP on the trailing submatrix.
72 *
73 * @param[in] nb
74 * Tuning parameter for the GEMM blocking size. if nb < 0, nb is set to
75 * 32.
76 *
77 * @param[in] m
78 * Number of rows of the matrix A.
79 *
80 * @param[in] n
81 * Number of columns of the matrix A.
82 *
83 * @param[in] A
84 * The matrix of dimension lda-by-n that needs to be compressed.
85 *
86 * @param[in] lda
87 * The leading dimension of the matrix A. lda >= max(1, m).
88 *
89 * @param[out] jpvt
90 * The array that describes the permutation of A.
91 *
92 * @param[out] tau
93 * Contains scalar factors of the elementary reflectors for the matrix
94 * Q.
95 *
96 * @param[in] work
97 * Workspace array of size lwork.
98 *
99 * @param[in] lwork
100 * The dimension of the work area. lwork >= (nb * n + max(n, m) )
101 * If lwork == -1, the functions returns immediately and work[0]
102 * contains the optimal size of work.
103 *
104 * @param[in] rwork
105 * Workspace array used to store partial and exact column norms (2-by-n)
106 *
107 *******************************************************************************
108 *
109 * @return This routine will return the rank of A (>=0) or -1 if it didn't
110 * manage to compress within the margins of tolerance and maximum rank.
111 *
112 *******************************************************************************/
113int
114core_drqrcp( double tol,
115 pastix_int_t maxrank,
116 int refine,
117 pastix_int_t nb,
118 pastix_int_t m,
119 pastix_int_t n,
120 double *A,
121 pastix_int_t lda,
122 pastix_int_t *jpvt,
123 double *tau,
124 double *work,
125 pastix_int_t lwork,
126 double *rwork )
127{
128 int SEED[4] = {26, 67, 52, 197};
129 pastix_int_t j, k, in, itmp, d, ib, loop = 1;
130 int ret;
131 pastix_int_t p = 5;
132 pastix_int_t bp = ( nb < p ) ? 32 : nb;
133 pastix_int_t b = bp - p;
134 pastix_int_t ldb = bp;
135 pastix_int_t ldo = bp;
136 pastix_int_t size_O = ldo * m;
137 pastix_int_t size_B = ldb * n;
138 pastix_int_t *jpvt_b;
139 pastix_int_t rk, minMN, lwkopt;
140 double tolB = sqrt( (double)(bp) ) * tol;
141
142 double *B = work;
143 double *tau_b = B + size_B;
144 double *omega = tau_b + n;
145 double *subw = tau_b + n;
146 pastix_int_t sublw = n * bp + pastix_imax( bp, n );
147 sublw = pastix_imax( sublw, size_O );
148
149 char trans;
150#if defined(PRECISION_c) || defined(PRECISION_z)
151 trans = 'C';
152#else
153 trans = 'T';
154#endif
155
156 lwkopt = size_B + n + sublw;
157 if ( lwork == -1 ) {
158 work[0] = (double)lwkopt;
159 return 0;
160 }
161#if !defined(NDEBUG)
162 if (m < 0) {
163 return -1;
164 }
165 if (n < 0) {
166 return -2;
167 }
168 if (lda < pastix_imax(1, m)) {
169 return -4;
170 }
171 if( lwork < lwkopt ) {
172 return -8;
173 }
174#endif
175
176 minMN = pastix_imin(m, n);
177 if ( maxrank < 0 ) {
178 maxrank = minMN;
179 }
180 maxrank = pastix_imin( maxrank, minMN );
181
182 /**
183 * If maximum rank is 0, then either the matrix norm is below the tolerance,
184 * and we can return a null rank matrix, or it is not and we need to return
185 * a full rank matrix.
186 */
187 if ( maxrank == 0 ) {
188 double norm;
189 if ( tol < 0. ) {
190 return 0;
191 }
192 norm = LAPACKE_dlange_work( LAPACK_COL_MAJOR, 'f', m, n,
193 A, lda, NULL );
194 if ( norm < tol ) {
195 return 0;
196 }
197 return -1;
198 }
199
200#if defined(PASTIX_DEBUG_LR)
201 B = malloc( size_B * sizeof(double) );
202 tau_b = malloc( n * sizeof(double) );
203 omega = malloc( size_O * sizeof(double) );
204 subw = malloc( sublw * sizeof(double) );
205#endif
206
207 jpvt_b = malloc( n * sizeof(pastix_int_t) );
208 for (j=0; j<n; j++) jpvt[j] = j;
209
210 /* Computation of the Gaussian matrix */
211 ret = LAPACKE_dlarnv_work(3, SEED, size_O, omega);
212 assert( ret == 0 );
213
214 /* Project with the gaussian matrix: B = Omega * A */
215 cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
216 bp, n, m,
217 (done), omega, ldo,
218 A, lda,
219 (dzero), B, ldb );
220
221 rk = 0;
222 while ( loop )
223 {
224 ib = pastix_imin( b, minMN-rk );
225 d = core_dpqrcp( tolB, ib, 1, bp,
226 bp, n-rk,
227 B + rk*ldb, ldb,
228 jpvt_b + rk, tau_b,
229 subw, sublw, /* >= (n*bp)+max(bp, n) */
230 rwork ); /* >= 2*n */
231
232 /* If fails to reach the tolerance before maxrank, let's restore the max value */
233 if ( d == -1 ) {
234 d = ib;
235 }
236 /* If smaller than ib, we reached the threshold */
237 if ( d < ib ) {
238 loop = 0;
239 }
240 if ( d == 0 ) {
241 loop = 0;
242 break;
243 }
244 /* If we exceeded the max rank, let's stop now */
245 if ( (rk + d) > maxrank ) {
246 rk = -1;
247 break;
248 }
249
250 /* Updating jpvt and A */
251 for (j = rk; j < rk + d; j++) {
252 if (jpvt_b[j] >= 0) {
253 k = j;
254 in = jpvt_b[k] + rk;
255
256 /* Mark as done */
257 jpvt_b[k] = - jpvt_b[k] - 1;
258
259 while( jpvt_b[in] >= 0 ) {
260
261 if (k != in) {
262 cblas_dswap( m, A + k * lda, 1,
263 A + in * lda, 1 );
264
265 itmp = jpvt[k];
266 jpvt[k] = jpvt[in];
267 jpvt[in] = itmp;
268 }
269 itmp = jpvt_b[in];
270 jpvt_b[in] = - jpvt_b[in] - 1;
271 k = in;
272 in = itmp + rk;
273 }
274 }
275 }
276
277 /*
278 * Factorize d columns of A without pivoting
279 */
280 ret = LAPACKE_dgeqrf_work( LAPACK_COL_MAJOR, m-rk, d,
281 A + rk*lda + rk, lda, tau + rk,
282 subw, sublw );
283 assert(ret == 0);
284
285 if ( rk+d < n ) {
286
287 /*
288 * Update trailing submatrix: A <- Q^h A
289 */
290 ret = LAPACKE_dormqr_work( LAPACK_COL_MAJOR, 'L', trans,
291 m-rk, n-rk-d, d,
292 A + rk *lda + rk, lda, tau + rk,
293 A + (rk+d)*lda + rk, lda,
294 subw, sublw );
295 assert(ret == 0);
296
297 /*
298 * The Q from partial QRCP is stored in the lower part of the matrix,
299 * we need to remove it
300 */
301 ret = LAPACKE_dlaset_work( LAPACK_COL_MAJOR, 'L', d-1, d-1,
302 0, 0, B + rk*ldb + 1, ldb );
303 assert( ret == 0 );
304
305 /*
306 * Updating B
307 */
308 cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper,
309 CblasNoTrans, CblasNonUnit,
310 d, d,
311 (done), A + rk*lda + rk, lda,
312 B + rk*ldb, ldb );
313
314 cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
315 d, n - (rk+d), d,
316 (mdone), B + rk *ldb, ldb,
317 A + (rk+d)*lda + rk, lda,
318 (done), B + (rk+d)*ldb, ldb );
319 }
320 rk += d;
321 }
322
323#if 0
324 d = 0;
325 if ( refine && !loop && (rk < maxrank) ) {
326 /*
327 * Apply a Rank-revealing QR on the trailing submatrix to get the last
328 * columns
329 */
330 d = core_dpqrcp( tol, maxrank-rk, 0, bp,
331 m-rk, n-rk,
332 A + rk * lda + rk, lda,
333 jpvt_b, tau + rk,
334 work, lwork, rwork );
335
336 /* Updating jpvt and A */
337 for (j=0; j<d; j++) {
338 if (jpvt_b[j] >= 0) {
339 k = j;
340 in = jpvt_b[k];
341
342 /* Mark as done */
343 jpvt_b[k] = - jpvt_b[k] - 1;
344
345 while( jpvt_b[in] >= 0 ) {
346 if (k != in) {
347 /* Swap columns in first rows */
348 cblas_dswap( rk, A + (rk + k ) * lda, 1,
349 A + (rk + in) * lda, 1 );
350
351 itmp = jpvt[rk + k];
352 jpvt[rk + k] = jpvt[rk + in];
353 jpvt[rk + in] = itmp;
354 }
355 itmp = jpvt_b[in];
356 jpvt_b[in] = - jpvt_b[in] - 1;
357 k = in;
358 in = itmp;
359 }
360 }
361 }
362
363 rk += d;
364 if ( rk > maxrank ) {
365 rk = -1;
366 }
367 }
368#endif
369 free( jpvt_b );
370
371#if defined(PASTIX_DEBUG_LR)
372 free( B );
373 free( tau_b );
374 free( omega );
375 free( subw );
376#endif
377
378 (void)ret;
379 (void)refine;
380 return rk;
381}
382
383/**
384 *******************************************************************************
385 *
386 * @brief Convert a full rank matrix in a low rank matrix, using RQRCP.
387 *
388 *******************************************************************************
389 *
390 * @param[in] use_reltol
391 * Defines if the kernel should use relative tolerance (tol *||A||), or
392 * absolute tolerance (tol).
393 *
394 * @param[in] tol
395 * The tolerance used as a criterion to eliminate information from the
396 * full rank matrix
397 *
398 * @param[in] rklimit
399 * The maximum rank to store the matrix in low-rank format. If
400 * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
401 *
402 * @param[in] m
403 * Number of rows of the matrix A, and of the low rank matrix Alr.
404 *
405 * @param[in] n
406 * Number of columns of the matrix A, and of the low rank matrix Alr.
407 *
408 * @param[in] A
409 * The matrix of dimension lda-by-n that needs to be compressed
410 *
411 * @param[in] lda
412 * The leading dimension of the matrix A. lda >= max(1, m)
413 *
414 * @param[out] Alr
415 * The low rank matrix structure that will store the low rank
416 * representation of A
417 *
418 *******************************************************************************
419 *
420 * @return TODO
421 *
422 *******************************************************************************/
424core_dge2lr_rqrcp( int use_reltol,
425 pastix_fixdbl_t tol,
426 pastix_int_t rklimit,
427 pastix_int_t m,
428 pastix_int_t n,
429 const void *A,
430 pastix_int_t lda,
431 pastix_lrblock_t *Alr )
432{
433 return core_dge2lr_qrcp( core_drqrcp, use_reltol, tol, rklimit,
434 m, n, A, lda, Alr );
435}
436
437/**
438 *******************************************************************************
439 *
440 * @brief Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
441 *
442 * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
443 * Orthogonalize (u2 u1) = (u2, u1 - u2(u2^T u1)) * (I u2^T u1)
444 * (0 I )
445 * Compute RQRCP decomposition of (I u2^T u1) * (v2 v1)^T
446 * (0 I )
447 *
448 *******************************************************************************
449 *
450 * @param[in] lowrank
451 * The structure with low-rank parameters.
452 *
453 * @param[in] transA1
454 * @arg PastixNoTrans: No transpose, op( A ) = A;
455 * @arg PastixTrans: Transpose, op( A ) = A';
456 *
457 * @param[in] alphaptr
458 * alpha * A is add to B
459 *
460 * @param[in] M1
461 * The number of rows of the matrix A.
462 *
463 * @param[in] N1
464 * The number of columns of the matrix A.
465 *
466 * @param[in] A
467 * The low-rank representation of the matrix A.
468 *
469 * @param[in] M2
470 * The number of rows of the matrix B.
471 *
472 * @param[in] N2
473 * The number of columns of the matrix B.
474 *
475 * @param[in] B
476 * The low-rank representation of the matrix B.
477 *
478 * @param[in] offx
479 * The horizontal offset of A with respect to B.
480 *
481 * @param[in] offy
482 * The vertical offset of A with respect to B.
483 *
484 *******************************************************************************
485 *
486 * @return The new rank of u2 v2^T or -1 if ranks are too large for
487 * recompression
488 *
489 *******************************************************************************/
492 pastix_trans_t transA1,
493 const void *alphaptr,
494 pastix_int_t M1,
495 pastix_int_t N1,
496 const pastix_lrblock_t *A,
497 pastix_int_t M2,
498 pastix_int_t N2,
500 pastix_int_t offx,
501 pastix_int_t offy)
502{
503 return core_drradd_qr( core_drqrcp, lowrank, transA1, alphaptr,
504 M1, N1, A, M2, N2, B, offx, offy );
505}
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...
BEGIN_C_DECLS typedef int pastix_int_t
Definition datatypes.h:51
double pastix_fixdbl_t
Definition datatypes.h:65
int core_dpqrcp(double tol, pastix_int_t maxrank, int full_update, pastix_int_t nb, pastix_int_t m, pastix_int_t n, double *A, pastix_int_t lda, pastix_int_t *jpvt, double *tau, double *work, pastix_int_t lwork, double *rwork)
Compute a rank-reavealing QR factorization.
int core_drqrcp(double tol, pastix_int_t maxrank, int refine, pastix_int_t nb, pastix_int_t m, pastix_int_t n, double *A, pastix_int_t lda, pastix_int_t *jpvt, double *tau, double *work, pastix_int_t lwork, double *rwork)
Compute a randomized QR factorization.
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
pastix_fixdbl_t core_dge2lr_rqrcp(int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *A, pastix_int_t lda, pastix_lrblock_t *Alr)
Convert a full rank matrix in a low rank matrix, using RQRCP.
pastix_fixdbl_t core_dge2lr_qrcp(core_drrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
pastix_fixdbl_t core_drradd_qr(core_drrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
pastix_fixdbl_t core_drradd_rqrcp(const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T.
enum pastix_trans_e pastix_trans_t
Transpostion.