PaStiX Handbook 6.4.0
Loading...
Searching...
No Matches
core_zpqrcp.c
Go to the documentation of this file.
1/**
2 *
3 * @file core_zpqrcp.c
4 *
5 * PaStiX implementation of the partial rank-revealing QR with column pivoting
6 * based on Lapack GEQP3.
7 *
8 * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
9 * Univ. Bordeaux. All rights reserved.
10 *
11 * @version 6.4.0
12 * @author Alfredo Buttari
13 * @author Gregoire Pichon
14 * @author Esragul Korkmaz
15 * @author Mathieu Faverge
16 * @date 2024-07-05
17 * @generated from /builds/2mk6rsew/0/solverstack/pastix/kernels/core_zpqrcp.c, normal z -> z, Tue Feb 25 14:34:58 2025
18 *
19 **/
20#include "common.h"
21#include <cblas.h>
22#include <lapacke.h>
23#include "blend/solver.h"
24#include "pastix_zcores.h"
25#include "pastix_zlrcores.h"
26#include "z_nan_check.h"
27
28#ifndef DOXYGEN_SHOULD_SKIP_THIS
29static pastix_complex64_t mzone = -1.0;
30static pastix_complex64_t zone = 1.0;
31static pastix_complex64_t zzero = 0.0;
32#endif /* DOXYGEN_SHOULD_SKIP_THIS */
33
34/**
35 *******************************************************************************
36 *
37 * @brief Compute a rank-reavealing QR factorization.
38 *
39 * This routine is originated from the LAPACK kernels zgeqp3/zlaqps and was
40 * modified by A. Buttari for MUMPS-BLR.
41 * In this version the stopping criterion is based on the frobenius norm of the
42 * residual, and not on the estimate of the two-norm making it more
43 * restrictive. Thus, the returned ranks are larger, but this gives a better
44 * accuracy.
45 *
46 *******************************************************************************
47 *
48 * @param[in] tol
49 * The relative tolerance criterion. Computations are stopped when the
50 * frobenius norm of the residual matrix is lower than tol.
51 * If tol < 0, then maxrank reflectors are computed.
52 *
53 * @param[in] maxrank
54 * Maximum number of reflectors computed. Computations are stopped when
55 * the rank exceeds maxrank. If maxrank < 0, all reflectors are computed
56 * or up to the tolerance criterion.
57 *
58 * @param[in] full_update
59 * If true, all the trailing submatrix is updated, even if maxrank is
60 * reached.
61 * If false, the trailing submatrix is not updated as soon as it is not
62 * worth it. (Unused for now but kept to match API of RQRCP and TQRCP)
63 *
64 * @param[in] nb
65 * Tuning parameter for the GEMM blocking size. if nb < 0, nb is set to
66 * 32.
67 *
68 * @param[in] m
69 * Number of rows of the matrix A.
70 *
71 * @param[in] n
72 * Number of columns of the matrix A.
73 *
74 * @param[in] A
75 * The matrix of dimension lda-by-n that needs to be compressed.
76 *
77 * @param[in] lda
78 * The leading dimension of the matrix A. lda >= max(1, m).
79 *
80 * @param[out] jpvt
81 * The array that describes the permutation of A.
82 *
83 * @param[out] tau
84 * Contains scalar factors of the elementary reflectors for the matrix
85 * Q.
86 *
87 * @param[in] work
88 * Workspace array of size lwork.
89 *
90 * @param[in] lwork
91 * The dimension of the work area. lwork >= (nb * n + max(n, m) )
92 * If lwork == -1, the functions returns immediately and work[0]
93 * contains the optimal size of work.
94 *
95 * @param[in] rwork
96 * Workspace array used to store partial and exact column norms (2-by-n)
97 *
98 *******************************************************************************
99 *
100 * @return This routine will return the rank of A (>=0) or -1 if it didn't
101 * manage to compress within the margins of tolerance and maximum rank.
102 *
103 *******************************************************************************/
104int
105core_zpqrcp( double tol,
106 pastix_int_t maxrank,
107 int full_update,
108 pastix_int_t nb,
109 pastix_int_t m,
110 pastix_int_t n,
111 pastix_complex64_t *A,
112 pastix_int_t lda,
113 pastix_int_t *jpvt,
114 pastix_complex64_t *tau,
115 pastix_complex64_t *work,
116 pastix_int_t lwork,
117 double *rwork )
118{
119 pastix_int_t minMN, ldf, lwkopt;
120 pastix_int_t j, k, jb, itemp, lsticc, pvt, ret;
121 double temp, temp2, machine_prec, residual;
122 pastix_complex64_t akk, *auxv, *f;
123
124 /* Partial (VN1) and exact (VN2) column norms */
125 double *VN1, *VN2;
126
127 /* Number or rows of A that have been factorized */
128 pastix_int_t offset = 0;
129
130 /* Rank */
131 pastix_int_t rk = 0;
132
133 if (nb < 0) {
134 nb = 32;
135 }
136
137 lwkopt = n * nb + pastix_imax(m, n);
138 if ( lwork == -1 ) {
139 work[0] = (pastix_complex64_t)lwkopt;
140 return 0;
141 }
142#if !defined(NDEBUG)
143 if (m < 0) {
144 return -1;
145 }
146 if (n < 0) {
147 return -2;
148 }
149 if (lda < pastix_imax(1, m)) {
150 return -4;
151 }
152 if( lwork < lwkopt ) {
153 return -8;
154 }
155#endif
156
157 minMN = pastix_imin(m, n);
158 if ( maxrank < 0 ) {
159 maxrank = minMN;
160 }
161 maxrank = pastix_imin( minMN, maxrank );
162
163 /**
164 * If maximum rank is 0, then either the matrix norm is below the tolerance,
165 * and we can return a null rank matrix, or it is not and we need to return
166 * a full rank matrix.
167 */
168 if ( maxrank == 0 ) {
169 double norm;
170 if ( tol < 0. ) {
171 return 0;
172 }
173 norm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'f', m, n,
174 A, lda, NULL );
175 if ( norm < tol ) {
176 return 0;
177 }
178 return -1;
179 }
180
181 VN1 = rwork;
182 VN2 = rwork + n;
183
184 auxv = work;
185 f = work + pastix_imax(m, n);
186 ldf = n;
187
188 /*
189 * Initialize partial column norms. The first N elements of work store the
190 * exact column norms.
191 */
192 for (j=0; j<n; j++){
193 VN1[j] = cblas_dznrm2(m, A + j * lda, 1);
194 VN2[j] = VN1[j];
195 jpvt[j] = j;
196 }
197
198 machine_prec = sqrt(LAPACKE_dlamch_work('e'));
199 rk = 0;
200
201 while ( rk < maxrank ) {
202 /* jb equivalent to kb in LAPACK xLAQPS: maximum number of columns to factorize */
203 jb = pastix_imin(nb, maxrank-offset);
204 lsticc = 0;
205
206 /* Factorize as many columns as possible */
207 for ( k=0; k<jb; k++ ) {
208
209 rk = offset + k;
210
211 assert( rk < maxrank );
212
213 pvt = rk + cblas_idamax( n-rk, VN1 + rk, 1 );
214
215 /*
216 * The selected pivot is below the threshold, we check if we exit
217 * now or we still need to compute it to refine the precision.
218 */
219 if ( (VN1[pvt] == 0.) || (VN1[pvt] < tol) ) {
220 residual = cblas_dnrm2( n-rk, VN1 + rk, 1 );
221 if ( (residual == 0.) || (residual < tol) ) {
222 assert( rk < maxrank );
223 return rk;
224 }
225 }
226
227 /*
228 * Pivot is not within the current column: we swap
229 */
230 if ( pvt != rk ) {
231 assert( pvt < n );
232 cblas_zswap( m, A + pvt * lda, 1,
233 A + rk * lda, 1 );
234 cblas_zswap( k, f + (pvt-offset), ldf,
235 f + k, ldf );
236
237 itemp = jpvt[pvt];
238 jpvt[pvt] = jpvt[rk];
239 jpvt[rk] = itemp;
240 VN1[pvt] = VN1[rk];
241 VN2[pvt] = VN2[rk];
242 }
243
244 /*
245 * Apply previous Householder reflectors to the column K
246 * A(RK:M,RK) := A(RK:M,RK) - A(RK:M,OFFSET+1:RK-1)*F(K,1:K-1)^H
247 */
248 if ( k > 0 ) {
249 assert( (rk < n) && (rk < m) );
250
251#if defined(PRECISION_c) || defined(PRECISION_z)
252 cblas_zgemm( CblasColMajor, CblasNoTrans, CblasConjTrans, m-rk, 1, k,
253 CBLAS_SADDR(mzone), A + offset * lda + rk, lda,
254 f + k, ldf,
255 CBLAS_SADDR(zone), A + rk * lda + rk, lda );
256#else
257 cblas_zgemv( CblasColMajor, CblasNoTrans, m-rk, k,
258 CBLAS_SADDR(mzone), A + offset * lda + rk, lda,
259 f + k, ldf,
260 CBLAS_SADDR(zone), A + rk * lda + rk, 1 );
261#endif
262 }
263
264 /*
265 * Generate elementary reflector H(k).
266 */
267 if ((rk+1) < m) {
268 ret = LAPACKE_zlarfg_work(m-rk, A + rk * lda + rk, A + rk * lda + (rk+1), 1, tau + rk);
269 assert( ret == 0 );
270 }
271 else{
272 ret = LAPACKE_zlarfg_work(1, A + rk * lda + rk, A + rk * lda + rk, 1, tau + rk);
273 assert( ret == 0 );
274 }
275
276 akk = A[rk * lda + rk];
277 A[rk * lda + rk] = zone;
278
279 /*
280 * Compute Kth column of F:
281 * F(RK+1:N,RK) := tau(RK)*A(RK:M,RK+1:N)^H*A(RK:M,RK).
282 */
283 if ((rk+1) < n) {
284 pastix_complex64_t alpha = tau[rk];
285 cblas_zgemv( CblasColMajor, CblasConjTrans, m-rk, n-rk-1,
286 CBLAS_SADDR(alpha), A + (rk+1) * lda + rk, lda,
287 A + rk * lda + rk, 1,
288 CBLAS_SADDR(zzero), f + k * ldf + k + 1, 1 );
289 }
290
291 /*
292 * Padding F(1:K,K) with zeros.
293 */
294 memset( f + k * ldf, 0, k * sizeof( pastix_complex64_t ) );
295
296 /*
297 * Incremental updating of F:
298 * F(1:N-OFFSET,K) := F(1:N-OFFSET,K) - tau(RK)*F(1:N,1:K-1)*A(RK:M,OFFSET+1:RK-1)^H*A(RK:M,RK).
299 */
300 if (k > 0) {
301 pastix_complex64_t alpha = -tau[rk];
302 cblas_zgemv( CblasColMajor, CblasConjTrans, m-rk, k,
303 CBLAS_SADDR(alpha), A + offset * lda + rk, lda,
304 A + rk * lda + rk, 1,
305 CBLAS_SADDR(zzero), auxv, 1 );
306
307 cblas_zgemv( CblasColMajor, CblasNoTrans, n-offset, k,
308 CBLAS_SADDR(zone), f, ldf,
309 auxv, 1,
310 CBLAS_SADDR(zone), f + k * ldf, 1);
311 }
312
313 /*
314 * Update the current row of A:
315 * A(RK,RK+1:N) := A(RK,RK+1:N) - A(RK,OFFSET+1:RK)*F(K+1:N,1:K)^H.
316 */
317 if ((rk+1) < n) {
318#if defined(PRECISION_c) || defined(PRECISION_z)
319 cblas_zgemm( CblasColMajor, CblasNoTrans, CblasConjTrans,
320 1, n-rk-1, k+1,
321 CBLAS_SADDR(mzone), A + (offset) * lda + rk, lda,
322 f + (k+1), ldf,
323 CBLAS_SADDR(zone), A + (rk + 1) * lda + rk, lda );
324#else
325 cblas_zgemv( CblasColMajor, CblasNoTrans, n-rk-1, k+1,
326 CBLAS_SADDR(mzone), f + (k+1), ldf,
327 A + (offset) * lda + rk, lda,
328 CBLAS_SADDR(zone), A + (rk + 1) * lda + rk, lda );
329#endif
330 }
331
332 /*
333 * Update partial column norms.
334 */
335 for (j=rk+1; j<n; j++) {
336 if (VN1[j] != 0.0) {
337 /*
338 * NOTE: The following 4 lines follow from the analysis in
339 * Lapack Working Note 176.
340 */
341 temp = cabs( A[j * lda + rk] ) / VN1[j];
342 temp2 = (1.0 + temp) * (1.0 - temp);
343 temp = (temp2 > 0.0) ? temp2 : 0.0;
344
345 temp2 = temp * ((VN1[j] / VN2[j]) * ( VN1[j] / VN2[j]));
346 if (temp2 < machine_prec){
347 VN2[j] = (double)lsticc;
348 lsticc = j;
349 }
350 else{
351 VN1[j] = VN1[j] * sqrt(temp);
352 }
353 }
354 }
355
356 A[rk * lda + rk] = akk;
357
358 if (lsticc != 0) {
359 k++;
360 break;
361 }
362 }
363
364 /* One additional reflector has been computed */
365 rk++;
366
367 /*
368 * Apply the block reflector to the rest of the matrix:
369 * A(RK+1:M,RK+1:N) := A(RK+1:M,RK+1:N) -
370 * A(RK+1:M,OFFSET+1:RK)*F(K+1:N-OFFSET,1:K)^H.
371 */
372 if ( rk < n )
373 {
374 cblas_zgemm( CblasColMajor, CblasNoTrans, CblasConjTrans,
375 m-rk, n-rk, k,
376 CBLAS_SADDR(mzone), A + offset * lda + rk, lda,
377 f + k, ldf,
378 CBLAS_SADDR(zone), A + rk * lda + rk, lda );
379 }
380
381 /* Recomputation of difficult columns. */
382 while (lsticc > 0) {
383 assert(lsticc < n);
384 itemp = (pastix_int_t) (VN2[lsticc]);
385
386 VN1[lsticc] = cblas_dznrm2(m-rk, A + lsticc * lda + rk, 1 );
387
388 /*
389 * NOTE: The computation of VN1( LSTICC ) relies on the fact that
390 * SNRM2 does not fail on vectors with norm below the value of
391 * SQRT(DLAMCH('S'))
392 */
393 VN2[lsticc] = VN1[lsticc];
394 lsticc = itemp;
395 }
396
397 offset = rk;
398 }
399
400 (void)full_update;
401 (void)ret;
402
403 /* We reached maxrank, so we check if the threshold is met or not */
404 residual = cblas_dnrm2( n-rk, VN1 + rk, 1 );
405 if ( (tol < 0.) || ( (residual == 0.) || (residual < tol) ) ) {
406 assert( rk == maxrank );
407 return rk;
408 }
409 else {
410 return -1;
411 }
412}
413
414/**
415 *******************************************************************************
416 *
417 * @brief Convert a full rank matrix in a low rank matrix, using PQRCP.
418 *
419 *******************************************************************************
420 *
421 * @param[in] use_reltol
422 * Defines if the kernel should use relative tolerance (tol *||A||), or
423 * absolute tolerance (tol).
424 *
425 * @param[in] tol
426 * The tolerance used as a criterion to eliminate information from the
427 * full rank matrix
428 *
429 * @param[in] rklimit
430 * The maximum rank to store the matrix in low-rank format. If
431 * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
432 *
433 * @param[in] m
434 * Number of rows of the matrix A, and of the low rank matrix Alr.
435 *
436 * @param[in] n
437 * Number of columns of the matrix A, and of the low rank matrix Alr.
438 *
439 * @param[in] A
440 * The matrix of dimension lda-by-n that needs to be compressed
441 *
442 * @param[in] lda
443 * The leading dimension of the matrix A. lda >= max(1, m)
444 *
445 * @param[out] Alr
446 * The low rank matrix structure that will store the low rank
447 * representation of A
448 *
449 *******************************************************************************
450 *
451 * @return TODO
452 *
453 *******************************************************************************/
455core_zge2lr_pqrcp( int use_reltol,
456 pastix_fixdbl_t tol,
457 pastix_int_t rklimit,
458 pastix_int_t m,
459 pastix_int_t n,
460 const void *A,
461 pastix_int_t lda,
462 pastix_lrblock_t *Alr )
463{
464 return core_zge2lr_qrcp( core_zpqrcp, use_reltol, tol, rklimit,
465 m, n, A, lda, Alr );
466}
467
468
469/**
470 *******************************************************************************
471 *
472 * @brief Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
473 *
474 * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
475 * Orthogonalize (u2 u1) = (u2, u1 - u2(u2^T u1)) * (I u2^T u1)
476 * (0 I )
477 * Compute PQRCP decomposition of (I u2^T u1) * (v2 v1)^T
478 * (0 I )
479 *
480 *******************************************************************************
481 *
482 * @param[in] lowrank
483 * The structure with low-rank parameters.
484 *
485 * @param[in] transA1
486 * @arg PastixNoTrans: No transpose, op( A ) = A;
487 * @arg PastixTrans: Transpose, op( A ) = A';
488 *
489 * @param[in] alphaptr
490 * alpha * A is add to B
491 *
492 * @param[in] M1
493 * The number of rows of the matrix A.
494 *
495 * @param[in] N1
496 * The number of columns of the matrix A.
497 *
498 * @param[in] A
499 * The low-rank representation of the matrix A.
500 *
501 * @param[in] M2
502 * The number of rows of the matrix B.
503 *
504 * @param[in] N2
505 * The number of columns of the matrix B.
506 *
507 * @param[in] B
508 * The low-rank representation of the matrix B.
509 *
510 * @param[in] offx
511 * The horizontal offset of A with respect to B.
512 *
513 * @param[in] offy
514 * The vertical offset of A with respect to B.
515 *
516 *******************************************************************************
517 *
518 * @return The new rank of u2 v2^T or -1 if ranks are too large for
519 * recompression
520 *
521 *******************************************************************************/
524 pastix_trans_t transA1,
525 const void *alphaptr,
526 pastix_int_t M1,
527 pastix_int_t N1,
528 const pastix_lrblock_t *A,
529 pastix_int_t M2,
530 pastix_int_t N2,
532 pastix_int_t offx,
533 pastix_int_t offy)
534{
535 return core_zrradd_qr( core_zpqrcp, lowrank, transA1, alphaptr,
536 M1, N1, A, M2, N2, B, offx, offy );
537}
BEGIN_C_DECLS typedef int pastix_int_t
Definition datatypes.h:51
double pastix_fixdbl_t
Definition datatypes.h:65
int core_zpqrcp(double tol, pastix_int_t maxrank, int full_update, pastix_int_t nb, pastix_int_t m, pastix_int_t n, pastix_complex64_t *A, pastix_int_t lda, pastix_int_t *jpvt, pastix_complex64_t *tau, pastix_complex64_t *work, pastix_int_t lwork, double *rwork)
Compute a rank-reavealing QR factorization.
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
pastix_fixdbl_t core_zrradd_pqrcp(const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T.
pastix_fixdbl_t core_zge2lr_pqrcp(int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *A, pastix_int_t lda, pastix_lrblock_t *Alr)
Convert a full rank matrix in a low rank matrix, using PQRCP.
pastix_fixdbl_t core_zge2lr_qrcp(core_zrrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
pastix_fixdbl_t core_zrradd_qr(core_zrrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
enum pastix_trans_e pastix_trans_t
Transpostion.
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...