PaStiX Handbook  6.2.1
core_stqrcp.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_stqrcp.c
4  *
5  * PaStiX implementation of the truncated rank-revealing QR with column pivoting
6  * based on Lapack GEQP3.
7  *
8  * @copyright 2016-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
9  * Univ. Bordeaux. All rights reserved.
10  *
11  * @version 6.2.0
12  * @author Alfredo Buttari
13  * @author Gregoire Pichon
14  * @author Esragul Korkmaz
15  * @author Mathieu Faverge
16  * @date 2021-04-07
17  * @generated from /builds/solverstack/pastix/kernels/core_ztqrcp.c, normal z -> s, Tue Apr 12 09:38:39 2022
18  *
19  **/
20 #include "common.h"
21 #include <cblas.h>
22 #include <lapacke.h>
23 #include "blend/solver.h"
24 #include "pastix_scores.h"
25 #include "pastix_slrcores.h"
26 #include "s_nan_check.h"
27 
28 #ifndef DOXYGEN_SHOULD_SKIP_THIS
29 static float msone = -1.0;
30 static float sone = 1.0;
31 static float szero = 0.0;
32 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
33 
34 /**
35  *******************************************************************************
36  *
37  * @brief Compute a randomized QR factorization with truncated updates.
38  *
39  * This routine is derivated from "Randomized QR with Column Pivoting",
40  * J. A. Duersch and M. Gu, SIAM Journal on Scientific Computing, vol. 39,
41  * no. 4, pp. C263-C291, 2017.
42  *
43  *******************************************************************************
44  *
45  * @param[in] tol
46  * The relative tolerance criterion. Computations are stopped when the
47  * frobenius norm of the residual matrix is lower than tol.
48  * If tol < 0, then maxrank reflectors are computed.
49  *
50  * @param[in] maxrank
51  * Maximum number of reflectors computed. Computations are stopped when
52  * the rank exceeds maxrank. If maxrank < 0, all reflectors are computed
53  * or up to the tolerance criterion.
54  *
55  * @param[in] unused
56  * Unused parameter in this kernel added to match API of RQRCP and PQRCP.
57  *
58  * @param[in] nb
59  * Tuning parameter for the GEMM blocking size. if nb < 0, nb is set to
60  * 32.
61  *
62  * @param[in] m
63  * Number of rows of the matrix A.
64  *
65  * @param[in] n
66  * Number of columns of the matrix A.
67  *
68  * @param[in] A
69  * The matrix of dimension lda-by-n that needs to be compressed.
70  *
71  * @param[in] lda
72  * The leading dimension of the matrix A. lda >= max(1, m).
73  *
74  * @param[out] jpvt
75  * The array that describes the permutation of A.
76  *
77  * @param[out] tau
78  * Contains scalar factors of the elementary reflectors for the matrix
79  * Q.
80  *
81  * @param[in] work
82  * Workspace array of size lwork.
83  *
84  * @param[in] lwork
85  * The dimension of the work area. lwork >= (nb * n + max(n, m) )
86  * If lwork == -1, the functions returns immediately and work[0]
87  * contains the optimal size of work.
88  *
89  * @param[in] rwork
90  * Workspace array used to store partial and exact column norms (2-by-n)
91  *
92  *******************************************************************************
93  *
94  * @return This routine will return the rank of A (>=0) or -1 if it didn't
95  * manage to compress within the margins of tolerance and maximum rank.
96  *
97  *******************************************************************************/
98 int
99 core_stqrcp( float tol, pastix_int_t maxrank, int refine, pastix_int_t nb,
100  pastix_int_t m, pastix_int_t n,
101  float *A, pastix_int_t lda,
102  pastix_int_t *jpvt, float *tau,
103  float *work, pastix_int_t lwork, float *rwork )
104 {
105  int SEED[4] = {26, 67, 52, 197};
106  pastix_int_t j, k, in, itmp, d, ib, loop = 1;
107  int ret;
108  pastix_int_t minMN, lwkopt;
109  pastix_int_t p = 5;
110  pastix_int_t bp = ( nb < p ) ? 32 : nb;
111  pastix_int_t b = bp - p;
112  pastix_int_t size_B, size_O, size_W, size_Y, size_A, size_T, sublw;
113  pastix_int_t ldb, ldw, ldy;
114  pastix_int_t *jpvt_b;
115  pastix_int_t rk;
116  float tolB = sqrtf( (float)(bp) ) * tol;
117  float *AP, *Y, *WT, *T, *B, *tau_b, *omega, *subw;
118 
119  minMN = pastix_imin(m, n);
120  if ( maxrank < 0 ) {
121  maxrank = minMN;
122  }
123  maxrank = pastix_imin( maxrank, minMN );
124 
125  ldb = bp;
126  ldw = maxrank;
127 
128  size_B = ldb * n;
129  size_O = ldb * m;
130  size_W = n * maxrank;
131  size_Y = b * b;
132  ldy = b;
133  size_A = m * n;
134  size_T = b * b;
135 
136  sublw = n * bp + pastix_imax( bp, n ); /* pqrcp */
137  sublw = pastix_imax( sublw, size_O ); /* Omega */
138  sublw = pastix_imax( sublw, b * maxrank ); /* update */
139 
140  lwkopt = size_A + size_Y + size_W
141  + size_T + size_B + n + sublw;
142 
143  if ( lwork == -1 ) {
144  work[0] = (float)lwkopt;
145  return 0;
146  }
147 #if !defined(NDEBUG)
148  if (m < 0) {
149  return -1;
150  }
151  if (n < 0) {
152  return -2;
153  }
154  if (lda < pastix_imax(1, m)) {
155  return -4;
156  }
157  if( lwork < lwkopt ) {
158  return -8;
159  }
160 #endif
161 
162  /**
163  * If maximum rank is 0, then either the matrix norm is below the tolerance,
164  * and we can return a null rank matrix, or it is not and we need to return
165  * a full rank matrix.
166  */
167  if ( maxrank == 0 ) {
168  float norm;
169  if ( tol < 0. ) {
170  return 0;
171  }
172  norm = LAPACKE_slange_work( LAPACK_COL_MAJOR, 'f', m, n,
173  A, lda, NULL );
174  if ( norm < tol ) {
175  return 0;
176  }
177  return -1;
178  }
179 
180  jpvt_b = malloc( n * sizeof(pastix_int_t) );
181 
182  AP = work;
183  Y = AP + size_A;
184  WT = Y + size_Y;
185  T = WT + size_W;
186  B = T + size_T;
187  tau_b = B + size_B;
188  omega = tau_b + n;
189  subw = tau_b + n;
190 
191  /* Initialize diagonal block of Housholders reflectors */
192  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'A', b, b,
193  0., 1., Y, ldy );
194  assert( ret == 0 );
195 
196  /* Initialize T */
197  memset(T, 0, size_T * sizeof(float));
198 
199  /* Backup A */
200  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'A', m, n,
201  A, lda, AP, m );
202  assert( ret == 0 );
203 
204  /* Initialize pivots */
205  for (j=0; j<n; j++) jpvt[j] = j;
206 
207  /*
208  * Computation of the Gaussian matrix
209  */
210  LAPACKE_slarnv_work(3, SEED, size_O, omega);
211  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
212  bp, n, m,
213  (sone), omega, bp,
214  A, lda,
215  (szero), B, ldb );
216 
217  rk = 0;
218  d = 0;
219  while ( loop )
220  {
221  ib = pastix_imin( b, minMN-rk );
222  d = core_spqrcp( tolB, ib, 1, bp,
223  bp, n-rk,
224  B + rk * ldb, ldb,
225  jpvt_b + rk, tau_b,
226  subw, sublw, rwork );
227 
228  /* If fails to reach the tolerance before maxrank, let's restore the max value */
229  if ( d == -1 ) {
230  d = ib;
231  }
232  /* If smaller than ib, we reached the threshold */
233  if ( d < ib ) {
234  loop = 0;
235  }
236  if ( d == 0 ) {
237  break;
238  }
239  /* If we exceeded the max rank, let's stop now */
240  if ( (rk + d) > maxrank ) {
241  rk = -1;
242  break;
243  }
244 
245  /* Updating jpvt, A, and AP */
246  for (j = rk; j < rk + d; j++) {
247  if (jpvt_b[j] >= 0) {
248  k = j;
249  in = jpvt_b[k] + rk;
250 
251  /* Mark as done */
252  jpvt_b[k] = - jpvt_b[k] - 1;
253 
254  while( jpvt_b[in] >= 0 ) {
255 
256  if (k != in) {
257  cblas_sswap( m, A + k * lda, 1,
258  A + in * lda, 1 );
259  cblas_sswap( m, AP + k * m, 1,
260  AP + in * m, 1 );
261 
262  itmp = jpvt[k];
263  jpvt[k] = jpvt[in];
264  jpvt[in] = itmp;
265 
266  if (rk > 0) {
267  cblas_sswap( rk, WT + k * ldw, 1,
268  WT + in * ldw, 1 );
269  }
270  }
271  itmp = jpvt_b[in];
272  jpvt_b[in] = - jpvt_b[in] - 1;
273  k = in;
274  in = itmp + rk;
275  }
276  }
277  }
278 
279  if (rk > 0) {
280  /* Update the selected columns before factorization */
281  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
282  m-rk, d, rk,
283  (msone), A + rk, lda,
284  WT + rk * ldw, ldw,
285  (sone), A + rk * lda + rk, lda );
286  }
287 
288  /*
289  * Factorize the d selected columns of A without pivoting
290  */
291  ret = LAPACKE_sgeqrf_work( LAPACK_COL_MAJOR, m-rk, d,
292  A + rk * lda + rk, lda, tau + rk,
293  work, lwork );
294  assert( ret == 0 );
295 
296  ret = LAPACKE_slarft_work( LAPACK_COL_MAJOR, 'F', 'C', m-rk, d,
297  A + rk * lda + rk, lda, tau + rk, T, b );
298  assert( ret == 0 );
299 
300  /*
301  * Compute the update line 11 of algorithm 6 in "Randomized QR with
302  * Column pivoting" from Duersch and Gu
303  *
304  * W_2^h = T^h ( Y_2^h * A - (Y_2^h * Y) * W_1^h )
305  *
306  * Step 1: Y_2^h * A
307  * a) W[rk:rk+d] <- A
308  * b) W[rk:rk+d] <- Y_2^h * A, split in triangular part + rectangular part
309  */
310  ret = LAPACKE_slacpy_work( LAPACK_COL_MAJOR, 'L', d-1, d-1,
311  A + lda * rk + rk + 1, lda,
312  Y + 1, ldy );
313  assert( ret == 0 );
314 
315  /* Triangular part */
316  cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans,
317  d, n, d,
318  (sone), Y, ldy,
319  AP + rk, m,
320  (szero), WT + rk, ldw );
321 
322  /* Rectangular part */
323  if ( rk + d < m ) {
324  cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans,
325  d, n, m-rk-d,
326  (sone), A + rk * lda + rk + d, lda,
327  AP + rk + d, m,
328  (sone), WT + rk, ldw );
329  }
330 
331  /*
332  * Step 2: (Y_2^h * A) - (Y_2^h * Y) * W_1^h
333  * a) work = (Y_2^h * Y)
334  * b) (Y_2^h * A) - work * W_1^h
335  */
336  if ( rk > 0 ) {
337  /* Triangular part */
338  cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans,
339  d, rk, d,
340  (sone), Y, ldy,
341  A + rk, lda,
342  (szero), subw, d );
343 
344  /* Rectangular part */
345  if ( rk + d < m ) {
346  cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans,
347  d, rk, m-rk-d,
348  (sone), A + rk * lda + rk + d, lda,
349  A + rk + d, lda,
350  (sone), subw, d );
351  }
352 
353  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
354  d, n, rk,
355  (msone), subw, d,
356  WT, ldw,
357  (sone), WT + rk, ldw );
358  }
359 
360  /*
361  * Step 3: W_2^h = T^h ( Y_2^h * A - (Y_2^h * Y) * W_1^h )
362  * W_2^h = T^h W_2^h
363  */
364  cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, CblasNonUnit,
365  d, n, (sone),
366  T, b,
367  WT + rk, ldw );
368 
369  /* Update current d rows of R */
370  if ( rk+d < n ) {
371  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
372  d, n-rk-d, rk,
373  (msone), A + rk, lda,
374  WT + (rk+d)*ldw, ldw,
375  (sone), A + rk + (rk+d)*lda, lda );
376 
377  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
378  d, n-rk-d, d,
379  (msone), Y, ldy,
380  WT + rk + (rk+d)*ldw, ldw,
381  (sone), A + rk + (rk+d)*lda, lda );
382  }
383 
384  if ( loop && (rk+d < maxrank) ) {
385  /*
386  * The Q from partial QRCP is stored in the lower part of the matrix,
387  * we need to remove it
388  */
389  ret = LAPACKE_slaset_work( LAPACK_COL_MAJOR, 'L', d-1, d-1,
390  0, 0, B + rk*ldb + 1, ldb );
391  assert( ret == 0 );
392 
393  /* Updating B */
394  /* Solving S_11 * R_11^{-1} */
395  cblas_strsm( CblasColMajor, CblasRight, CblasUpper,
396  CblasNoTrans, CblasNonUnit,
397  d, d,
398  (sone), A + rk*lda + rk, lda,
399  B + rk*ldb, ldb );
400 
401  /* Updating S_12 = S_12 - (S_11 * R_11^{-1}) * R_12 */
402  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
403  d, n - (rk+d), d,
404  (msone), B + rk *ldb, ldb,
405  A + (rk+d)*lda + rk, lda,
406  (sone), B + (rk+d)*ldb, ldb );
407  }
408  rk += d;
409  }
410  free( jpvt_b );
411 
412  (void)ret;
413  (void)refine;
414  return rk;
415 }
416 
417 /**
418  *******************************************************************************
419  *
420  * @brief Convert a full rank matrix in a low rank matrix, using TQRCP.
421  *
422  *******************************************************************************
423  *
424  * @param[in] use_reltol
425  * Defines if the kernel should use relative tolerance (tol *||A||), or
426  * absolute tolerance (tol).
427  *
428  * @param[in] tol
429  * The tolerance used as a criterion to eliminate information from the
430  * full rank matrix
431  *
432  * @param[in] rklimit
433  * The maximum rank to store the matrix in low-rank format. If
434  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
435  *
436  * @param[in] m
437  * Number of rows of the matrix A, and of the low rank matrix Alr.
438  *
439  * @param[in] n
440  * Number of columns of the matrix A, and of the low rank matrix Alr.
441  *
442  * @param[in] A
443  * The matrix of dimension lda-by-n that needs to be compressed
444  *
445  * @param[in] lda
446  * The leading dimension of the matrix A. lda >= max(1, m)
447  *
448  * @param[out] Alr
449  * The low rank matrix structure that will store the low rank
450  * representation of A
451  *
452  *******************************************************************************/
453 pastix_fixdbl_t
454 core_sge2lr_tqrcp( int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit,
455  pastix_int_t m, pastix_int_t n,
456  const void *A, pastix_int_t lda,
457  pastix_lrblock_t *Alr )
458 {
459  return core_sge2lr_qrcp( core_stqrcp, use_reltol, tol, rklimit,
460  m, n, A, lda, Alr );
461 }
462 
463 
464 /**
465  *******************************************************************************
466  *
467  * @brief Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
468  *
469  * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
470  * Orthogonalize (u2 u1) = (u2, u1 - u2(u2^T u1)) * (I u2^T u1)
471  * (0 I )
472  * Compute TQRCP decomposition of (I u2^T u1) * (v2 v1)^T
473  * (0 I )
474  *
475  *******************************************************************************
476  *
477  * @param[in] lowrank
478  * The structure with low-rank parameters.
479  *
480  * @param[in] transA1
481  * @arg PastixNoTrans: No transpose, op( A ) = A;
482  * @arg PastixTrans: Transpose, op( A ) = A';
483  *
484  * @param[in] alpha
485  * alpha * A is add to B
486  *
487  * @param[in] M1
488  * The number of rows of the matrix A.
489  *
490  * @param[in] N1
491  * The number of columns of the matrix A.
492  *
493  * @param[in] A
494  * The low-rank representation of the matrix A.
495  *
496  * @param[in] M2
497  * The number of rows of the matrix B.
498  *
499  * @param[in] N2
500  * The number of columns of the matrix B.
501  *
502  * @param[in] B
503  * The low-rank representation of the matrix B.
504  *
505  * @param[in] offx
506  * The horizontal offset of A with respect to B.
507  *
508  * @param[in] offy
509  * The vertical offset of A with respect to B.
510  *
511  *******************************************************************************
512  *
513  * @return The new rank of u2 v2^T or -1 if ranks are too large for
514  * recompression
515  *
516  *******************************************************************************/
517 pastix_fixdbl_t
518 core_srradd_tqrcp( const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr,
519  pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A,
520  pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B,
521  pastix_int_t offx, pastix_int_t offy)
522 {
523  return core_srradd_qr( core_stqrcp, lowrank, transA1, alphaptr,
524  M1, N1, A, M2, N2, B, offx, offy );
525 }
solver.h
pastix_lr_s
Structure to define the type of function to use for the low-rank kernels and their parameters.
Definition: pastix_lowrank.h:147
core_sge2lr_tqrcp
pastix_fixdbl_t core_sge2lr_tqrcp(int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *A, pastix_int_t lda, pastix_lrblock_t *Alr)
Convert a full rank matrix in a low rank matrix, using TQRCP.
Definition: core_stqrcp.c:454
pastix_trans_t
enum pastix_trans_e pastix_trans_t
Transpostion.
core_srradd_tqrcp
pastix_fixdbl_t core_srradd_tqrcp(const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T.
Definition: core_stqrcp.c:518
s_nan_check.h
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...
pastix_lrblock_s
The block low-rank structure to hold a matrix in low-rank form.
Definition: pastix_lowrank.h:112
core_stqrcp
int core_stqrcp(float tol, pastix_int_t maxrank, int refine, pastix_int_t nb, pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, pastix_int_t *jpvt, float *tau, float *work, pastix_int_t lwork, float *rwork)
Compute a randomized QR factorization with truncated updates.
Definition: core_stqrcp.c:99
pastix_scores.h
pastix_slrcores.h
core_spqrcp
int core_spqrcp(float tol, pastix_int_t maxrank, int full_update, pastix_int_t nb, pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, pastix_int_t *jpvt, float *tau, float *work, pastix_int_t lwork, float *rwork)
Compute a rank-reavealing QR factorization.
Definition: core_spqrcp.c:105
core_sge2lr_qrcp
pastix_fixdbl_t core_sge2lr_qrcp(core_srrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
Definition: core_sgelrops.c:749
core_srradd_qr
pastix_fixdbl_t core_srradd_qr(core_srrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
Definition: core_sgelrops.c:1174