PaStiX Handbook  6.2.1
core_spqrcp.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_spqrcp.c
4  *
5  * PaStiX implementation of the partial rank-revealing QR with column pivoting
6  * based on Lapack GEQP3.
7  *
8  * @copyright 2016-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
9  * Univ. Bordeaux. All rights reserved.
10  *
11  * @version 6.2.0
12  * @author Alfredo Buttari
13  * @author Gregoire Pichon
14  * @author Esragul Korkmaz
15  * @author Mathieu Faverge
16  * @date 2020-03-02
17  * @generated from /builds/solverstack/pastix/kernels/core_zpqrcp.c, normal z -> s, Tue Apr 12 09:38:38 2022
18  *
19  **/
20 #include "common.h"
21 #include <cblas.h>
22 #include <lapacke.h>
23 #include "blend/solver.h"
24 #include "pastix_scores.h"
25 #include "pastix_slrcores.h"
26 #include "s_nan_check.h"
27 
28 #ifndef DOXYGEN_SHOULD_SKIP_THIS
29 static float msone = -1.0;
30 static float sone = 1.0;
31 static float szero = 0.0;
32 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
33 
34 /**
35  *******************************************************************************
36  *
37  * @brief Compute a rank-reavealing QR factorization.
38  *
39  * This routine is originated from the LAPACK kernels sgeqp3/slaqps and was
40  * modified by A. Buttari for MUMPS-BLR.
41  * In this version the stopping criterion is based on the frobenius norm of the
42  * residual, and not on the estimate of the two-norm making it more
43  * restrictive. Thus, the returned ranks are larger, but this gives a better
44  * accuracy.
45  *
46  *******************************************************************************
47  *
48  * @param[in] tol
49  * The relative tolerance criterion. Computations are stopped when the
50  * frobenius norm of the residual matrix is lower than tol.
51  * If tol < 0, then maxrank reflectors are computed.
52  *
53  * @param[in] maxrank
54  * Maximum number of reflectors computed. Computations are stopped when
55  * the rank exceeds maxrank. If maxrank < 0, all reflectors are computed
56  * or up to the tolerance criterion.
57  *
58  * @param[in] full_update
59  * If true, all the trailing submatrix is updated, even if maxrank is
60  * reached.
61  * If false, the trailing submatrix is not updated as soon as it is not
62  * worth it. (Unused for now but kept to match API of RQRCP and TQRCP)
63  *
64  * @param[in] nb
65  * Tuning parameter for the GEMM blocking size. if nb < 0, nb is set to
66  * 32.
67  *
68  * @param[in] m
69  * Number of rows of the matrix A.
70  *
71  * @param[in] n
72  * Number of columns of the matrix A.
73  *
74  * @param[in] A
75  * The matrix of dimension lda-by-n that needs to be compressed.
76  *
77  * @param[in] lda
78  * The leading dimension of the matrix A. lda >= max(1, m).
79  *
80  * @param[out] jpvt
81  * The array that describes the permutation of A.
82  *
83  * @param[out] tau
84  * Contains scalar factors of the elementary reflectors for the matrix
85  * Q.
86  *
87  * @param[in] work
88  * Workspace array of size lwork.
89  *
90  * @param[in] lwork
91  * The dimension of the work area. lwork >= (nb * n + max(n, m) )
92  * If lwork == -1, the functions returns immediately and work[0]
93  * contains the optimal size of work.
94  *
95  * @param[in] rwork
96  * Workspace array used to store partial and exact column norms (2-by-n)
97  *
98  *******************************************************************************
99  *
100  * @return This routine will return the rank of A (>=0) or -1 if it didn't
101  * manage to compress within the margins of tolerance and maximum rank.
102  *
103  *******************************************************************************/
104 int
105 core_spqrcp( float tol, pastix_int_t maxrank, int full_update, pastix_int_t nb,
106  pastix_int_t m, pastix_int_t n,
107  float *A, pastix_int_t lda,
108  pastix_int_t *jpvt, float *tau,
109  float *work, pastix_int_t lwork, float *rwork )
110 {
111  pastix_int_t minMN, ldf, lwkopt;
112  pastix_int_t j, k, jb, itemp, lsticc, pvt;
113  float temp, temp2, machine_prec, residual;
114  float akk, *auxv, *f;
115 
116  /* Partial (VN1) and exact (VN2) column norms */
117  float *VN1, *VN2;
118 
119  /* Number or rows of A that have been factorized */
120  pastix_int_t offset = 0;
121 
122  /* Rank */
123  pastix_int_t rk = 0;
124 
125  if (nb < 0) {
126  nb = 32;
127  }
128 
129  lwkopt = n * nb + pastix_imax(m, n);
130  if ( lwork == -1 ) {
131  work[0] = (float)lwkopt;
132  return 0;
133  }
134 #if !defined(NDEBUG)
135  if (m < 0) {
136  return -1;
137  }
138  if (n < 0) {
139  return -2;
140  }
141  if (lda < pastix_imax(1, m)) {
142  return -4;
143  }
144  if( lwork < lwkopt ) {
145  return -8;
146  }
147 #endif
148 
149  minMN = pastix_imin(m, n);
150  if ( maxrank < 0 ) {
151  maxrank = minMN;
152  }
153  maxrank = pastix_imin( minMN, maxrank );
154 
155  /**
156  * If maximum rank is 0, then either the matrix norm is below the tolerance,
157  * and we can return a null rank matrix, or it is not and we need to return
158  * a full rank matrix.
159  */
160  if ( maxrank == 0 ) {
161  float norm;
162  if ( tol < 0. ) {
163  return 0;
164  }
165  norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m, n,
166  A, lda, NULL );
167  if ( norm < tol ) {
168  return 0;
169  }
170  return -1;
171  }
172 
173  VN1 = rwork;
174  VN2 = rwork + n;
175 
176  auxv = work;
177  f = work + pastix_imax(m, n);
178  ldf = n;
179 
180  /*
181  * Initialize partial column norms. The first N elements of work store the
182  * exact column norms.
183  */
184  for (j=0; j<n; j++){
185  VN1[j] = cblas_snrm2(m, A + j * lda, 1);
186  VN2[j] = VN1[j];
187  jpvt[j] = j;
188  }
189 
190  machine_prec = sqrtf(LAPACKE_slamch_work('e'));
191  rk = 0;
192 
193  while ( rk < maxrank ) {
194  /* jb equivalent to kb in LAPACK xLAQPS: maximum number of columns to factorize */
195  jb = pastix_imin(nb, maxrank-offset);
196  lsticc = 0;
197 
198  /* Factorize as many columns as possible */
199  for ( k=0; k<jb; k++ ) {
200 
201  rk = offset + k;
202 
203  assert( rk < maxrank );
204 
205  pvt = rk + cblas_isamax( n-rk, VN1 + rk, 1 );
206 
207  /*
208  * The selected pivot is below the threshold, we check if we exit
209  * now or we still need to compute it to refine the precision.
210  */
211  if ( (VN1[pvt] == 0.) || (VN1[pvt] < tol) ) {
212  residual = cblas_snrm2( n-rk, VN1 + rk, 1 );
213  if ( (residual == 0.) || (residual < tol) ) {
214  assert( rk < maxrank );
215  return rk;
216  }
217  }
218 
219  /*
220  * Pivot is not within the current column: we swap
221  */
222  if ( pvt != rk ) {
223  assert( pvt < n );
224  cblas_sswap( m, A + pvt * lda, 1,
225  A + rk * lda, 1 );
226  cblas_sswap( k, f + (pvt-offset), ldf,
227  f + k, ldf );
228 
229  itemp = jpvt[pvt];
230  jpvt[pvt] = jpvt[rk];
231  jpvt[rk] = itemp;
232  VN1[pvt] = VN1[rk];
233  VN2[pvt] = VN2[rk];
234  }
235 
236  /*
237  * Apply previous Householder reflectors to the column K
238  * A(RK:M,RK) := A(RK:M,RK) - A(RK:M,OFFSET+1:RK-1)*F(K,1:K-1)^T
239  */
240  if ( k > 0 ) {
241  assert( (rk < n) && (rk < m) );
242 
243 #if defined(PRECISION_c) || defined(PRECISION_z)
244  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, m-rk, 1, k,
245  (msone), A + offset * lda + rk, lda,
246  f + k, ldf,
247  (sone), A + rk * lda + rk, lda );
248 #else
249  cblas_sgemv( CblasColMajor, CblasNoTrans, m-rk, k,
250  (msone), A + offset * lda + rk, lda,
251  f + k, ldf,
252  (sone), A + rk * lda + rk, 1 );
253 #endif
254  }
255 
256  /*
257  * Generate elementary reflector H(k).
258  */
259  if ((rk+1) < m) {
260  LAPACKE_slarfg_work(m-rk, A + rk * lda + rk, A + rk * lda + (rk+1), 1, tau + rk);
261  }
262  else{
263  LAPACKE_slarfg_work(1, A + rk * lda + rk, A + rk * lda + rk, 1, tau + rk);
264  }
265 
266  akk = A[rk * lda + rk];
267  A[rk * lda + rk] = sone;
268 
269  /*
270  * Compute Kth column of F:
271  * F(RK+1:N,RK) := tau(RK)*A(RK:M,RK+1:N)^T*A(RK:M,RK).
272  */
273  if ((rk+1) < n) {
274  float alpha = tau[rk];
275  cblas_sgemv( CblasColMajor, CblasTrans, m-rk, n-rk-1,
276  (alpha), A + (rk+1) * lda + rk, lda,
277  A + rk * lda + rk, 1,
278  (szero), f + k * ldf + k + 1, 1 );
279  }
280 
281  /*
282  * Padding F(1:K,K) with zeros.
283  */
284  memset( f + k * ldf, 0, k * sizeof( float ) );
285 
286  /*
287  * Incremental updating of F:
288  * F(1:N-OFFSET,K) := F(1:N-OFFSET,K) - tau(RK)*F(1:N,1:K-1)*A(RK:M,OFFSET+1:RK-1)^T*A(RK:M,RK).
289  */
290  if (k > 0) {
291  float alpha = -tau[rk];
292  cblas_sgemv( CblasColMajor, CblasTrans, m-rk, k,
293  (alpha), A + offset * lda + rk, lda,
294  A + rk * lda + rk, 1,
295  (szero), auxv, 1 );
296 
297  cblas_sgemv( CblasColMajor, CblasNoTrans, n-offset, k,
298  (sone), f, ldf,
299  auxv, 1,
300  (sone), f + k * ldf, 1);
301  }
302 
303  /*
304  * Update the current row of A:
305  * A(RK,RK+1:N) := A(RK,RK+1:N) - A(RK,OFFSET+1:RK)*F(K+1:N,1:K)^T.
306  */
307  if ((rk+1) < n) {
308 #if defined(PRECISION_c) || defined(PRECISION_z)
309  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
310  1, n-rk-1, k+1,
311  (msone), A + (offset) * lda + rk, lda,
312  f + (k+1), ldf,
313  (sone), A + (rk + 1) * lda + rk, lda );
314 #else
315  cblas_sgemv( CblasColMajor, CblasNoTrans, n-rk-1, k+1,
316  (msone), f + (k+1), ldf,
317  A + (offset) * lda + rk, lda,
318  (sone), A + (rk + 1) * lda + rk, lda );
319 #endif
320  }
321 
322  /*
323  * Update partial column norms.
324  */
325  for (j=rk+1; j<n; j++) {
326  if (VN1[j] != 0.0) {
327  /*
328  * NOTE: The following 4 lines follow from the analysis in
329  * Lapack Working Note 176.
330  */
331  temp = fabsf( A[j * lda + rk] ) / VN1[j];
332  temp2 = (1.0 + temp) * (1.0 - temp);
333  temp = (temp2 > 0.0) ? temp2 : 0.0;
334 
335  temp2 = temp * ((VN1[j] / VN2[j]) * ( VN1[j] / VN2[j]));
336  if (temp2 < machine_prec){
337  VN2[j] = (float)lsticc;
338  lsticc = j;
339  }
340  else{
341  VN1[j] = VN1[j] * sqrtf(temp);
342  }
343  }
344  }
345 
346  A[rk * lda + rk] = akk;
347 
348  if (lsticc != 0) {
349  k++;
350  break;
351  }
352  }
353 
354  /* One additional reflector has been computed */
355  rk++;
356 
357  /*
358  * Apply the block reflector to the rest of the matrix:
359  * A(RK+1:M,RK+1:N) := A(RK+1:M,RK+1:N) -
360  * A(RK+1:M,OFFSET+1:RK)*F(K+1:N-OFFSET,1:K)^T.
361  */
362  if ( rk < n )
363  {
364  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
365  m-rk, n-rk, k,
366  (msone), A + offset * lda + rk, lda,
367  f + k, ldf,
368  (sone), A + rk * lda + rk, lda );
369  }
370 
371  /* Recomputation of difficult columns. */
372  while (lsticc > 0) {
373  assert(lsticc < n);
374  itemp = (pastix_int_t) (VN2[lsticc]);
375 
376  VN1[lsticc] = cblas_snrm2(m-rk, A + lsticc * lda + rk, 1 );
377 
378  /*
379  * NOTE: The computation of VN1( LSTICC ) relies on the fact that
380  * SNRM2 does not fail on vectors with norm below the value of
381  * SQRT(SLAMCH('S'))
382  */
383  VN2[lsticc] = VN1[lsticc];
384  lsticc = itemp;
385  }
386 
387  offset = rk;
388  }
389 
390  (void)full_update;
391 
392  /* We reached maxrank, so we check if the threshold is met or not */
393  residual = cblas_snrm2( n-rk, VN1 + rk, 1 );
394  if ( (tol < 0.) || ( (residual == 0.) || (residual < tol) ) ) {
395  assert( rk == maxrank );
396  return rk;
397  }
398  else {
399  return -1;
400  }
401 }
402 
403 /**
404  *******************************************************************************
405  *
406  * @brief Convert a full rank matrix in a low rank matrix, using PQRCP.
407  *
408  *******************************************************************************
409  *
410  * @param[in] use_reltol
411  * Defines if the kernel should use relative tolerance (tol *||A||), or
412  * absolute tolerance (tol).
413  *
414  * @param[in] tol
415  * The tolerance used as a criterion to eliminate information from the
416  * full rank matrix
417  *
418  * @param[in] rklimit
419  * The maximum rank to store the matrix in low-rank format. If
420  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
421  *
422  * @param[in] m
423  * Number of rows of the matrix A, and of the low rank matrix Alr.
424  *
425  * @param[in] n
426  * Number of columns of the matrix A, and of the low rank matrix Alr.
427  *
428  * @param[in] A
429  * The matrix of dimension lda-by-n that needs to be compressed
430  *
431  * @param[in] lda
432  * The leading dimension of the matrix A. lda >= max(1, m)
433  *
434  * @param[out] Alr
435  * The low rank matrix structure that will store the low rank
436  * representation of A
437  *
438  *******************************************************************************/
439 pastix_fixdbl_t
440 core_sge2lr_pqrcp( int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit,
441  pastix_int_t m, pastix_int_t n,
442  const void *A, pastix_int_t lda,
443  pastix_lrblock_t *Alr )
444 {
445  return core_sge2lr_qrcp( core_spqrcp, use_reltol, tol, rklimit,
446  m, n, A, lda, Alr );
447 }
448 
449 
450 /**
451  *******************************************************************************
452  *
453  * @brief Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
454  *
455  * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
456  * Orthogonalize (u2 u1) = (u2, u1 - u2(u2^T u1)) * (I u2^T u1)
457  * (0 I )
458  * Compute PQRCP decomposition of (I u2^T u1) * (v2 v1)^T
459  * (0 I )
460  *
461  *******************************************************************************
462  *
463  * @param[in] lowrank
464  * The structure with low-rank parameters.
465  *
466  * @param[in] transA1
467  * @arg PastixNoTrans: No transpose, op( A ) = A;
468  * @arg PastixTrans: Transpose, op( A ) = A';
469  *
470  * @param[in] alpha
471  * alpha * A is add to B
472  *
473  * @param[in] M1
474  * The number of rows of the matrix A.
475  *
476  * @param[in] N1
477  * The number of columns of the matrix A.
478  *
479  * @param[in] A
480  * The low-rank representation of the matrix A.
481  *
482  * @param[in] M2
483  * The number of rows of the matrix B.
484  *
485  * @param[in] N2
486  * The number of columns of the matrix B.
487  *
488  * @param[in] B
489  * The low-rank representation of the matrix B.
490  *
491  * @param[in] offx
492  * The horizontal offset of A with respect to B.
493  *
494  * @param[in] offy
495  * The vertical offset of A with respect to B.
496  *
497  *******************************************************************************
498  *
499  * @return The new rank of u2 v2^T or -1 if ranks are too large for
500  * recompression
501  *
502  *******************************************************************************/
503 pastix_fixdbl_t
504 core_srradd_pqrcp( const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr,
505  pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A,
506  pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B,
507  pastix_int_t offx, pastix_int_t offy)
508 {
509  return core_srradd_qr( core_spqrcp, lowrank, transA1, alphaptr,
510  M1, N1, A, M2, N2, B, offx, offy );
511 }
solver.h
pastix_lr_s
Structure to define the type of function to use for the low-rank kernels and their parameters.
Definition: pastix_lowrank.h:147
pastix_trans_t
enum pastix_trans_e pastix_trans_t
Transpostion.
s_nan_check.h
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...
pastix_lrblock_s
The block low-rank structure to hold a matrix in low-rank form.
Definition: pastix_lowrank.h:112
core_sge2lr_pqrcp
pastix_fixdbl_t core_sge2lr_pqrcp(int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *A, pastix_int_t lda, pastix_lrblock_t *Alr)
Convert a full rank matrix in a low rank matrix, using PQRCP.
Definition: core_spqrcp.c:440
pastix_scores.h
pastix_slrcores.h
core_spqrcp
int core_spqrcp(float tol, pastix_int_t maxrank, int full_update, pastix_int_t nb, pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, pastix_int_t *jpvt, float *tau, float *work, pastix_int_t lwork, float *rwork)
Compute a rank-reavealing QR factorization.
Definition: core_spqrcp.c:105
core_srradd_pqrcp
pastix_fixdbl_t core_srradd_pqrcp(const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T.
Definition: core_spqrcp.c:504
core_sge2lr_qrcp
pastix_fixdbl_t core_sge2lr_qrcp(core_srrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
Definition: core_sgelrops.c:749
core_srradd_qr
pastix_fixdbl_t core_srradd_qr(core_srrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
Definition: core_sgelrops.c:1174