PaStiX Handbook  6.4.0
core_spqrcp.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_spqrcp.c
4  *
5  * PaStiX implementation of the partial rank-revealing QR with column pivoting
6  * based on Lapack GEQP3.
7  *
8  * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
9  * Univ. Bordeaux. All rights reserved.
10  *
11  * @version 6.4.0
12  * @author Alfredo Buttari
13  * @author Gregoire Pichon
14  * @author Esragul Korkmaz
15  * @author Mathieu Faverge
16  * @date 2024-07-05
17  * @generated from /builds/solverstack/pastix/kernels/core_zpqrcp.c, normal z -> s, Tue Oct 8 14:17:22 2024
18  *
19  **/
20 #include "common.h"
21 #include <cblas.h>
22 #include <lapacke.h>
23 #include "blend/solver.h"
24 #include "pastix_scores.h"
25 #include "pastix_slrcores.h"
26 #include "s_nan_check.h"
27 
28 #ifndef DOXYGEN_SHOULD_SKIP_THIS
29 static float msone = -1.0;
30 static float sone = 1.0;
31 static float szero = 0.0;
32 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
33 
34 /**
35  *******************************************************************************
36  *
37  * @brief Compute a rank-reavealing QR factorization.
38  *
39  * This routine is originated from the LAPACK kernels sgeqp3/slaqps and was
40  * modified by A. Buttari for MUMPS-BLR.
41  * In this version the stopping criterion is based on the frobenius norm of the
42  * residual, and not on the estimate of the two-norm making it more
43  * restrictive. Thus, the returned ranks are larger, but this gives a better
44  * accuracy.
45  *
46  *******************************************************************************
47  *
48  * @param[in] tol
49  * The relative tolerance criterion. Computations are stopped when the
50  * frobenius norm of the residual matrix is lower than tol.
51  * If tol < 0, then maxrank reflectors are computed.
52  *
53  * @param[in] maxrank
54  * Maximum number of reflectors computed. Computations are stopped when
55  * the rank exceeds maxrank. If maxrank < 0, all reflectors are computed
56  * or up to the tolerance criterion.
57  *
58  * @param[in] full_update
59  * If true, all the trailing submatrix is updated, even if maxrank is
60  * reached.
61  * If false, the trailing submatrix is not updated as soon as it is not
62  * worth it. (Unused for now but kept to match API of RQRCP and TQRCP)
63  *
64  * @param[in] nb
65  * Tuning parameter for the GEMM blocking size. if nb < 0, nb is set to
66  * 32.
67  *
68  * @param[in] m
69  * Number of rows of the matrix A.
70  *
71  * @param[in] n
72  * Number of columns of the matrix A.
73  *
74  * @param[in] A
75  * The matrix of dimension lda-by-n that needs to be compressed.
76  *
77  * @param[in] lda
78  * The leading dimension of the matrix A. lda >= max(1, m).
79  *
80  * @param[out] jpvt
81  * The array that describes the permutation of A.
82  *
83  * @param[out] tau
84  * Contains scalar factors of the elementary reflectors for the matrix
85  * Q.
86  *
87  * @param[in] work
88  * Workspace array of size lwork.
89  *
90  * @param[in] lwork
91  * The dimension of the work area. lwork >= (nb * n + max(n, m) )
92  * If lwork == -1, the functions returns immediately and work[0]
93  * contains the optimal size of work.
94  *
95  * @param[in] rwork
96  * Workspace array used to store partial and exact column norms (2-by-n)
97  *
98  *******************************************************************************
99  *
100  * @return This routine will return the rank of A (>=0) or -1 if it didn't
101  * manage to compress within the margins of tolerance and maximum rank.
102  *
103  *******************************************************************************/
104 int
105 core_spqrcp( float tol,
106  pastix_int_t maxrank,
107  int full_update,
108  pastix_int_t nb,
109  pastix_int_t m,
110  pastix_int_t n,
111  float *A,
112  pastix_int_t lda,
113  pastix_int_t *jpvt,
114  float *tau,
115  float *work,
116  pastix_int_t lwork,
117  float *rwork )
118 {
119  pastix_int_t minMN, ldf, lwkopt;
120  pastix_int_t j, k, jb, itemp, lsticc, pvt, ret;
121  float temp, temp2, machine_prec, residual;
122  float akk, *auxv, *f;
123 
124  /* Partial (VN1) and exact (VN2) column norms */
125  float *VN1, *VN2;
126 
127  /* Number or rows of A that have been factorized */
128  pastix_int_t offset = 0;
129 
130  /* Rank */
131  pastix_int_t rk = 0;
132 
133  if (nb < 0) {
134  nb = 32;
135  }
136 
137  lwkopt = n * nb + pastix_imax(m, n);
138  if ( lwork == -1 ) {
139  work[0] = (float)lwkopt;
140  return 0;
141  }
142 #if !defined(NDEBUG)
143  if (m < 0) {
144  return -1;
145  }
146  if (n < 0) {
147  return -2;
148  }
149  if (lda < pastix_imax(1, m)) {
150  return -4;
151  }
152  if( lwork < lwkopt ) {
153  return -8;
154  }
155 #endif
156 
157  minMN = pastix_imin(m, n);
158  if ( maxrank < 0 ) {
159  maxrank = minMN;
160  }
161  maxrank = pastix_imin( minMN, maxrank );
162 
163  /**
164  * If maximum rank is 0, then either the matrix norm is below the tolerance,
165  * and we can return a null rank matrix, or it is not and we need to return
166  * a full rank matrix.
167  */
168  if ( maxrank == 0 ) {
169  float norm;
170  if ( tol < 0. ) {
171  return 0;
172  }
173  norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m, n,
174  A, lda, NULL );
175  if ( norm < tol ) {
176  return 0;
177  }
178  return -1;
179  }
180 
181  VN1 = rwork;
182  VN2 = rwork + n;
183 
184  auxv = work;
185  f = work + pastix_imax(m, n);
186  ldf = n;
187 
188  /*
189  * Initialize partial column norms. The first N elements of work store the
190  * exact column norms.
191  */
192  for (j=0; j<n; j++){
193  VN1[j] = cblas_snrm2(m, A + j * lda, 1);
194  VN2[j] = VN1[j];
195  jpvt[j] = j;
196  }
197 
198  machine_prec = sqrtf(LAPACKE_slamch_work('e'));
199  rk = 0;
200 
201  while ( rk < maxrank ) {
202  /* jb equivalent to kb in LAPACK xLAQPS: maximum number of columns to factorize */
203  jb = pastix_imin(nb, maxrank-offset);
204  lsticc = 0;
205 
206  /* Factorize as many columns as possible */
207  for ( k=0; k<jb; k++ ) {
208 
209  rk = offset + k;
210 
211  assert( rk < maxrank );
212 
213  pvt = rk + cblas_isamax( n-rk, VN1 + rk, 1 );
214 
215  /*
216  * The selected pivot is below the threshold, we check if we exit
217  * now or we still need to compute it to refine the precision.
218  */
219  if ( (VN1[pvt] == 0.) || (VN1[pvt] < tol) ) {
220  residual = cblas_snrm2( n-rk, VN1 + rk, 1 );
221  if ( (residual == 0.) || (residual < tol) ) {
222  assert( rk < maxrank );
223  return rk;
224  }
225  }
226 
227  /*
228  * Pivot is not within the current column: we swap
229  */
230  if ( pvt != rk ) {
231  assert( pvt < n );
232  cblas_sswap( m, A + pvt * lda, 1,
233  A + rk * lda, 1 );
234  cblas_sswap( k, f + (pvt-offset), ldf,
235  f + k, ldf );
236 
237  itemp = jpvt[pvt];
238  jpvt[pvt] = jpvt[rk];
239  jpvt[rk] = itemp;
240  VN1[pvt] = VN1[rk];
241  VN2[pvt] = VN2[rk];
242  }
243 
244  /*
245  * Apply previous Householder reflectors to the column K
246  * A(RK:M,RK) := A(RK:M,RK) - A(RK:M,OFFSET+1:RK-1)*F(K,1:K-1)^T
247  */
248  if ( k > 0 ) {
249  assert( (rk < n) && (rk < m) );
250 
251 #if defined(PRECISION_c) || defined(PRECISION_z)
252  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, m-rk, 1, k,
253  (msone), A + offset * lda + rk, lda,
254  f + k, ldf,
255  (sone), A + rk * lda + rk, lda );
256 #else
257  cblas_sgemv( CblasColMajor, CblasNoTrans, m-rk, k,
258  (msone), A + offset * lda + rk, lda,
259  f + k, ldf,
260  (sone), A + rk * lda + rk, 1 );
261 #endif
262  }
263 
264  /*
265  * Generate elementary reflector H(k).
266  */
267  if ((rk+1) < m) {
268  ret = LAPACKE_slarfg_work(m-rk, A + rk * lda + rk, A + rk * lda + (rk+1), 1, tau + rk);
269  assert( ret == 0 );
270  }
271  else{
272  ret = LAPACKE_slarfg_work(1, A + rk * lda + rk, A + rk * lda + rk, 1, tau + rk);
273  assert( ret == 0 );
274  }
275 
276  akk = A[rk * lda + rk];
277  A[rk * lda + rk] = sone;
278 
279  /*
280  * Compute Kth column of F:
281  * F(RK+1:N,RK) := tau(RK)*A(RK:M,RK+1:N)^T*A(RK:M,RK).
282  */
283  if ((rk+1) < n) {
284  float alpha = tau[rk];
285  cblas_sgemv( CblasColMajor, CblasTrans, m-rk, n-rk-1,
286  (alpha), A + (rk+1) * lda + rk, lda,
287  A + rk * lda + rk, 1,
288  (szero), f + k * ldf + k + 1, 1 );
289  }
290 
291  /*
292  * Padding F(1:K,K) with zeros.
293  */
294  memset( f + k * ldf, 0, k * sizeof( float ) );
295 
296  /*
297  * Incremental updating of F:
298  * F(1:N-OFFSET,K) := F(1:N-OFFSET,K) - tau(RK)*F(1:N,1:K-1)*A(RK:M,OFFSET+1:RK-1)^T*A(RK:M,RK).
299  */
300  if (k > 0) {
301  float alpha = -tau[rk];
302  cblas_sgemv( CblasColMajor, CblasTrans, m-rk, k,
303  (alpha), A + offset * lda + rk, lda,
304  A + rk * lda + rk, 1,
305  (szero), auxv, 1 );
306 
307  cblas_sgemv( CblasColMajor, CblasNoTrans, n-offset, k,
308  (sone), f, ldf,
309  auxv, 1,
310  (sone), f + k * ldf, 1);
311  }
312 
313  /*
314  * Update the current row of A:
315  * A(RK,RK+1:N) := A(RK,RK+1:N) - A(RK,OFFSET+1:RK)*F(K+1:N,1:K)^T.
316  */
317  if ((rk+1) < n) {
318 #if defined(PRECISION_c) || defined(PRECISION_z)
319  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
320  1, n-rk-1, k+1,
321  (msone), A + (offset) * lda + rk, lda,
322  f + (k+1), ldf,
323  (sone), A + (rk + 1) * lda + rk, lda );
324 #else
325  cblas_sgemv( CblasColMajor, CblasNoTrans, n-rk-1, k+1,
326  (msone), f + (k+1), ldf,
327  A + (offset) * lda + rk, lda,
328  (sone), A + (rk + 1) * lda + rk, lda );
329 #endif
330  }
331 
332  /*
333  * Update partial column norms.
334  */
335  for (j=rk+1; j<n; j++) {
336  if (VN1[j] != 0.0) {
337  /*
338  * NOTE: The following 4 lines follow from the analysis in
339  * Lapack Working Note 176.
340  */
341  temp = fabsf( A[j * lda + rk] ) / VN1[j];
342  temp2 = (1.0 + temp) * (1.0 - temp);
343  temp = (temp2 > 0.0) ? temp2 : 0.0;
344 
345  temp2 = temp * ((VN1[j] / VN2[j]) * ( VN1[j] / VN2[j]));
346  if (temp2 < machine_prec){
347  VN2[j] = (float)lsticc;
348  lsticc = j;
349  }
350  else{
351  VN1[j] = VN1[j] * sqrtf(temp);
352  }
353  }
354  }
355 
356  A[rk * lda + rk] = akk;
357 
358  if (lsticc != 0) {
359  k++;
360  break;
361  }
362  }
363 
364  /* One additional reflector has been computed */
365  rk++;
366 
367  /*
368  * Apply the block reflector to the rest of the matrix:
369  * A(RK+1:M,RK+1:N) := A(RK+1:M,RK+1:N) -
370  * A(RK+1:M,OFFSET+1:RK)*F(K+1:N-OFFSET,1:K)^T.
371  */
372  if ( rk < n )
373  {
374  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
375  m-rk, n-rk, k,
376  (msone), A + offset * lda + rk, lda,
377  f + k, ldf,
378  (sone), A + rk * lda + rk, lda );
379  }
380 
381  /* Recomputation of difficult columns. */
382  while (lsticc > 0) {
383  assert(lsticc < n);
384  itemp = (pastix_int_t) (VN2[lsticc]);
385 
386  VN1[lsticc] = cblas_snrm2(m-rk, A + lsticc * lda + rk, 1 );
387 
388  /*
389  * NOTE: The computation of VN1( LSTICC ) relies on the fact that
390  * SNRM2 does not fail on vectors with norm below the value of
391  * SQRT(SLAMCH('S'))
392  */
393  VN2[lsticc] = VN1[lsticc];
394  lsticc = itemp;
395  }
396 
397  offset = rk;
398  }
399 
400  (void)full_update;
401  (void)ret;
402 
403  /* We reached maxrank, so we check if the threshold is met or not */
404  residual = cblas_snrm2( n-rk, VN1 + rk, 1 );
405  if ( (tol < 0.) || ( (residual == 0.) || (residual < tol) ) ) {
406  assert( rk == maxrank );
407  return rk;
408  }
409  else {
410  return -1;
411  }
412 }
413 
414 /**
415  *******************************************************************************
416  *
417  * @brief Convert a full rank matrix in a low rank matrix, using PQRCP.
418  *
419  *******************************************************************************
420  *
421  * @param[in] use_reltol
422  * Defines if the kernel should use relative tolerance (tol *||A||), or
423  * absolute tolerance (tol).
424  *
425  * @param[in] tol
426  * The tolerance used as a criterion to eliminate information from the
427  * full rank matrix
428  *
429  * @param[in] rklimit
430  * The maximum rank to store the matrix in low-rank format. If
431  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
432  *
433  * @param[in] m
434  * Number of rows of the matrix A, and of the low rank matrix Alr.
435  *
436  * @param[in] n
437  * Number of columns of the matrix A, and of the low rank matrix Alr.
438  *
439  * @param[in] A
440  * The matrix of dimension lda-by-n that needs to be compressed
441  *
442  * @param[in] lda
443  * The leading dimension of the matrix A. lda >= max(1, m)
444  *
445  * @param[out] Alr
446  * The low rank matrix structure that will store the low rank
447  * representation of A
448  *
449  *******************************************************************************
450  *
451  * @return TODO
452  *
453  *******************************************************************************/
455 core_sge2lr_pqrcp( int use_reltol,
456  pastix_fixdbl_t tol,
457  pastix_int_t rklimit,
458  pastix_int_t m,
459  pastix_int_t n,
460  const void *A,
461  pastix_int_t lda,
462  pastix_lrblock_t *Alr )
463 {
464  return core_sge2lr_qrcp( core_spqrcp, use_reltol, tol, rklimit,
465  m, n, A, lda, Alr );
466 }
467 
468 
469 /**
470  *******************************************************************************
471  *
472  * @brief Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
473  *
474  * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
475  * Orthogonalize (u2 u1) = (u2, u1 - u2(u2^T u1)) * (I u2^T u1)
476  * (0 I )
477  * Compute PQRCP decomposition of (I u2^T u1) * (v2 v1)^T
478  * (0 I )
479  *
480  *******************************************************************************
481  *
482  * @param[in] lowrank
483  * The structure with low-rank parameters.
484  *
485  * @param[in] transA1
486  * @arg PastixNoTrans: No transpose, op( A ) = A;
487  * @arg PastixTrans: Transpose, op( A ) = A';
488  *
489  * @param[in] alphaptr
490  * alpha * A is add to B
491  *
492  * @param[in] M1
493  * The number of rows of the matrix A.
494  *
495  * @param[in] N1
496  * The number of columns of the matrix A.
497  *
498  * @param[in] A
499  * The low-rank representation of the matrix A.
500  *
501  * @param[in] M2
502  * The number of rows of the matrix B.
503  *
504  * @param[in] N2
505  * The number of columns of the matrix B.
506  *
507  * @param[in] B
508  * The low-rank representation of the matrix B.
509  *
510  * @param[in] offx
511  * The horizontal offset of A with respect to B.
512  *
513  * @param[in] offy
514  * The vertical offset of A with respect to B.
515  *
516  *******************************************************************************
517  *
518  * @return The new rank of u2 v2^T or -1 if ranks are too large for
519  * recompression
520  *
521  *******************************************************************************/
524  pastix_trans_t transA1,
525  const void *alphaptr,
526  pastix_int_t M1,
527  pastix_int_t N1,
528  const pastix_lrblock_t *A,
529  pastix_int_t M2,
530  pastix_int_t N2,
531  pastix_lrblock_t *B,
532  pastix_int_t offx,
533  pastix_int_t offy)
534 {
535  return core_srradd_qr( core_spqrcp, lowrank, transA1, alphaptr,
536  M1, N1, A, M2, N2, B, offx, offy );
537 }
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
int core_spqrcp(float tol, pastix_int_t maxrank, int full_update, pastix_int_t nb, pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, pastix_int_t *jpvt, float *tau, float *work, pastix_int_t lwork, float *rwork)
Compute a rank-reavealing QR factorization.
Definition: core_spqrcp.c:105
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
pastix_fixdbl_t core_sge2lr_pqrcp(int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *A, pastix_int_t lda, pastix_lrblock_t *Alr)
Convert a full rank matrix in a low rank matrix, using PQRCP.
Definition: core_spqrcp.c:455
pastix_fixdbl_t core_srradd_pqrcp(const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T.
Definition: core_spqrcp.c:523
pastix_fixdbl_t core_sge2lr_qrcp(core_srrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
pastix_fixdbl_t core_srradd_qr(core_srrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
enum pastix_trans_e pastix_trans_t
Transpostion.
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...