PaStiX Handbook  6.2.1
core_zrqrcp.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_zrqrcp.c
4  *
5  * PaStiX Rank-revealing QR kernel beased on randomization technique and partial
6  * QR with column pivoting.
7  *
8  * @copyright 2016-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
9  * Univ. Bordeaux. All rights reserved.
10  *
11  * @version 6.2.0
12  * @author Claire Soyez-Martin
13  * @author Esragul Korkmaz
14  * @author Mathieu Faverge
15  * @date 2020-03-02
16  * @generated from /builds/solverstack/pastix/kernels/core_zrqrcp.c, normal z -> z, Tue Apr 12 09:38:38 2022
17  *
18  **/
19 #include "common.h"
20 #include <cblas.h>
21 #include <lapacke.h>
22 #include "blend/solver.h"
23 #include "pastix_zcores.h"
24 #include "pastix_zlrcores.h"
25 #include "z_nan_check.h"
26 
27 #ifndef DOXYGEN_SHOULD_SKIP_THIS
28 static pastix_complex64_t mzone = -1.0;
29 static pastix_complex64_t zone = 1.0;
30 static pastix_complex64_t zzero = 0.0;
31 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
32 
33 /**
34  *******************************************************************************
35  *
36  * @brief Compute a randomized QR factorization.
37  *
38  * This kernel implements the algorithm described in:
39  * Fast Parallel Randomized QR with Column Pivoting Algorithms for Reliable
40  * Low-rank Matrix Approximations. Jianwei Xiao, Ming Gu, and Julien Langou
41  *
42  * The main difference in this algorithm relies in two points:
43  * 1) First, we stop the iterative porcess based on a tolerance criterion
44  * forwarded to the QR with column pivoting kernel, while they have a fixed
45  * number of iterations defined by parameter.
46  * 2) Second, we perform an extra PQRCP call on the trailing submatrix to
47  * finalize the computations, while in the paper above they use a spectrum
48  * revealing algorithm to refine the solution.
49  *
50  * More information can also be found in Finding Structure with Randomness:
51  * Probabilistic Algorithms for Constructing Approximate Matrix
52  * Decompositions. N. Halko, P. G. Martinsson, and J. A. Tropp
53  *
54  * Based on this paper, we set the p parameter to 5, as it seems sufficient, and
55  * because we iterate multiple times to get the final rank.
56  *
57  *******************************************************************************
58  *
59  * @param[in] tol
60  * The relative tolerance criterion. Computations are stopped when the
61  * frobenius norm of the residual matrix is lower than tol.
62  * If tol < 0, then maxrank reflectors are computed.
63  *
64  * @param[in] maxrank
65  * Maximum number of reflectors computed. Computations are stopped when
66  * the rank exceeds maxrank. If maxrank < 0, all reflectors are computed
67  * or up to the tolerance criterion.
68  *
69  * @param[in] refine
70  * Enable/disable the extra refinement step that performs an additional
71  * PQRCP on the trailing submatrix.
72  *
73  * @param[in] nb
74  * Tuning parameter for the GEMM blocking size. if nb < 0, nb is set to
75  * 32.
76  *
77  * @param[in] m
78  * Number of rows of the matrix A.
79  *
80  * @param[in] n
81  * Number of columns of the matrix A.
82  *
83  * @param[in] A
84  * The matrix of dimension lda-by-n that needs to be compressed.
85  *
86  * @param[in] lda
87  * The leading dimension of the matrix A. lda >= max(1, m).
88  *
89  * @param[out] jpvt
90  * The array that describes the permutation of A.
91  *
92  * @param[out] tau
93  * Contains scalar factors of the elementary reflectors for the matrix
94  * Q.
95  *
96  * @param[in] work
97  * Workspace array of size lwork.
98  *
99  * @param[in] lwork
100  * The dimension of the work area. lwork >= (nb * n + max(n, m) )
101  * If lwork == -1, the functions returns immediately and work[0]
102  * contains the optimal size of work.
103  *
104  * @param[in] rwork
105  * Workspace array used to store partial and exact column norms (2-by-n)
106  *
107  *******************************************************************************
108  *
109  * @return This routine will return the rank of A (>=0) or -1 if it didn't
110  * manage to compress within the margins of tolerance and maximum rank.
111  *
112  *******************************************************************************/
113 int
114 core_zrqrcp( double tol, pastix_int_t maxrank, int refine, pastix_int_t nb,
115  pastix_int_t m, pastix_int_t n,
116  pastix_complex64_t *A, pastix_int_t lda,
117  pastix_int_t *jpvt, pastix_complex64_t *tau,
118  pastix_complex64_t *work, pastix_int_t lwork, double *rwork )
119 {
120  int SEED[4] = {26, 67, 52, 197};
121  pastix_int_t j, k, in, itmp, d, ib, loop = 1;
122  int ret;
123  pastix_int_t p = 5;
124  pastix_int_t bp = ( nb < p ) ? 32 : nb;
125  pastix_int_t b = bp - p;
126  pastix_int_t ldb = bp;
127  pastix_int_t ldo = bp;
128  pastix_int_t size_O = ldo * m;
129  pastix_int_t size_B = ldb * n;
130  pastix_int_t *jpvt_b;
131  pastix_int_t rk, minMN, lwkopt;
132  double tolB = sqrt( (double)(bp) ) * tol;
133 
134  pastix_complex64_t *B = work;
135  pastix_complex64_t *tau_b = B + size_B;
136  pastix_complex64_t *omega = tau_b + n;
137  pastix_complex64_t *subw = tau_b + n;
138  pastix_int_t sublw = n * bp + pastix_imax( bp, n );
139  sublw = pastix_imax( sublw, size_O );
140 
141  char trans;
142 #if defined(PRECISION_c) || defined(PRECISION_z)
143  trans = 'C';
144 #else
145  trans = 'T';
146 #endif
147 
148  lwkopt = size_B + n + sublw;
149  if ( lwork == -1 ) {
150  work[0] = (pastix_complex64_t)lwkopt;
151  return 0;
152  }
153 #if !defined(NDEBUG)
154  if (m < 0) {
155  return -1;
156  }
157  if (n < 0) {
158  return -2;
159  }
160  if (lda < pastix_imax(1, m)) {
161  return -4;
162  }
163  if( lwork < lwkopt ) {
164  return -8;
165  }
166 #endif
167 
168  minMN = pastix_imin(m, n);
169  if ( maxrank < 0 ) {
170  maxrank = minMN;
171  }
172  maxrank = pastix_imin( maxrank, minMN );
173 
174  /**
175  * If maximum rank is 0, then either the matrix norm is below the tolerance,
176  * and we can return a null rank matrix, or it is not and we need to return
177  * a full rank matrix.
178  */
179  if ( maxrank == 0 ) {
180  double norm;
181  if ( tol < 0. ) {
182  return 0;
183  }
184  norm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'f', m, n,
185  A, lda, NULL );
186  if ( norm < tol ) {
187  return 0;
188  }
189  return -1;
190  }
191 
192 #if defined(PASTIX_DEBUG_LR)
193  B = malloc( size_B * sizeof(pastix_complex64_t) );
194  tau_b = malloc( n * sizeof(pastix_complex64_t) );
195  omega = malloc( size_O * sizeof(pastix_complex64_t) );
196  subw = malloc( sublw * sizeof(pastix_complex64_t) );
197 #endif
198 
199  jpvt_b = malloc( n * sizeof(pastix_int_t) );
200  for (j=0; j<n; j++) jpvt[j] = j;
201 
202  /* Computation of the Gaussian matrix */
203  LAPACKE_zlarnv_work(3, SEED, size_O, omega);
204 
205  /* Project with the gaussian matrix: B = Omega * A */
206  cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
207  bp, n, m,
208  CBLAS_SADDR(zone), omega, ldo,
209  A, lda,
210  CBLAS_SADDR(zzero), B, ldb );
211 
212  rk = 0;
213  while ( loop )
214  {
215  ib = pastix_imin( b, minMN-rk );
216  d = core_zpqrcp( tolB, ib, 1, bp,
217  bp, n-rk,
218  B + rk*ldb, ldb,
219  jpvt_b + rk, tau_b,
220  subw, sublw, /* >= (n*bp)+max(bp, n) */
221  rwork ); /* >= 2*n */
222 
223  /* If fails to reach the tolerance before maxrank, let's restore the max value */
224  if ( d == -1 ) {
225  d = ib;
226  }
227  /* If smaller than ib, we reached the threshold */
228  if ( d < ib ) {
229  loop = 0;
230  }
231  if ( d == 0 ) {
232  loop = 0;
233  break;
234  }
235  /* If we exceeded the max rank, let's stop now */
236  if ( (rk + d) > maxrank ) {
237  rk = -1;
238  break;
239  }
240 
241  /* Updating jpvt and A */
242  for (j = rk; j < rk + d; j++) {
243  if (jpvt_b[j] >= 0) {
244  k = j;
245  in = jpvt_b[k] + rk;
246 
247  /* Mark as done */
248  jpvt_b[k] = - jpvt_b[k] - 1;
249 
250  while( jpvt_b[in] >= 0 ) {
251 
252  if (k != in) {
253  cblas_zswap( m, A + k * lda, 1,
254  A + in * lda, 1 );
255 
256  itmp = jpvt[k];
257  jpvt[k] = jpvt[in];
258  jpvt[in] = itmp;
259  }
260  itmp = jpvt_b[in];
261  jpvt_b[in] = - jpvt_b[in] - 1;
262  k = in;
263  in = itmp + rk;
264  }
265  }
266  }
267 
268  /*
269  * Factorize d columns of A without pivoting
270  */
271  ret = LAPACKE_zgeqrf_work( LAPACK_COL_MAJOR, m-rk, d,
272  A + rk*lda + rk, lda, tau + rk,
273  subw, sublw );
274  assert(ret == 0);
275 
276  if ( rk+d < n ) {
277 
278  /*
279  * Update trailing submatrix: A <- Q^h A
280  */
281  ret = LAPACKE_zunmqr_work( LAPACK_COL_MAJOR, 'L', trans,
282  m-rk, n-rk-d, d,
283  A + rk *lda + rk, lda, tau + rk,
284  A + (rk+d)*lda + rk, lda,
285  subw, sublw );
286  assert(ret == 0);
287 
288  /*
289  * The Q from partial QRCP is stored in the lower part of the matrix,
290  * we need to remove it
291  */
292  ret = LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'L', d-1, d-1,
293  0, 0, B + rk*ldb + 1, ldb );
294  assert( ret == 0 );
295 
296  /*
297  * Updating B
298  */
299  cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper,
300  CblasNoTrans, CblasNonUnit,
301  d, d,
302  CBLAS_SADDR(zone), A + rk*lda + rk, lda,
303  B + rk*ldb, ldb );
304 
305  cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
306  d, n - (rk+d), d,
307  CBLAS_SADDR(mzone), B + rk *ldb, ldb,
308  A + (rk+d)*lda + rk, lda,
309  CBLAS_SADDR(zone), B + (rk+d)*ldb, ldb );
310  }
311  rk += d;
312  }
313 
314 #if 0
315  d = 0;
316  if ( refine && !loop && (rk < maxrank) ) {
317  /*
318  * Apply a Rank-revealing QR on the trailing submatrix to get the last
319  * columns
320  */
321  d = core_zpqrcp( tol, maxrank-rk, 0, bp,
322  m-rk, n-rk,
323  A + rk * lda + rk, lda,
324  jpvt_b, tau + rk,
325  work, lwork, rwork );
326 
327  /* Updating jpvt and A */
328  for (j=0; j<d; j++) {
329  if (jpvt_b[j] >= 0) {
330  k = j;
331  in = jpvt_b[k];
332 
333  /* Mark as done */
334  jpvt_b[k] = - jpvt_b[k] - 1;
335 
336  while( jpvt_b[in] >= 0 ) {
337  if (k != in) {
338  /* Swap columns in first rows */
339  cblas_zswap( rk, A + (rk + k ) * lda, 1,
340  A + (rk + in) * lda, 1 );
341 
342  itmp = jpvt[rk + k];
343  jpvt[rk + k] = jpvt[rk + in];
344  jpvt[rk + in] = itmp;
345  }
346  itmp = jpvt_b[in];
347  jpvt_b[in] = - jpvt_b[in] - 1;
348  k = in;
349  in = itmp;
350  }
351  }
352  }
353 
354  rk += d;
355  if ( rk > maxrank ) {
356  rk = -1;
357  }
358  }
359 #endif
360  free( jpvt_b );
361 
362 #if defined(PASTIX_DEBUG_LR)
363  free( B );
364  free( tau_b );
365  free( omega );
366  free( subw );
367 #endif
368 
369  (void)ret;
370  (void)refine;
371  return rk;
372 }
373 
374 /**
375  *******************************************************************************
376  *
377  * @brief Convert a full rank matrix in a low rank matrix, using RQRCP.
378  *
379  *******************************************************************************
380  *
381  * @param[in] use_reltol
382  * Defines if the kernel should use relative tolerance (tol *||A||), or
383  * absolute tolerance (tol).
384  *
385  * @param[in] tol
386  * The tolerance used as a criterion to eliminate information from the
387  * full rank matrix
388  *
389  * @param[in] rklimit
390  * The maximum rank to store the matrix in low-rank format. If
391  * -1, set to min(m, n) / PASTIX_LR_MINRATIO.
392  *
393  * @param[in] m
394  * Number of rows of the matrix A, and of the low rank matrix Alr.
395  *
396  * @param[in] n
397  * Number of columns of the matrix A, and of the low rank matrix Alr.
398  *
399  * @param[in] A
400  * The matrix of dimension lda-by-n that needs to be compressed
401  *
402  * @param[in] lda
403  * The leading dimension of the matrix A. lda >= max(1, m)
404  *
405  * @param[out] Alr
406  * The low rank matrix structure that will store the low rank
407  * representation of A
408  *
409  *******************************************************************************/
410 pastix_fixdbl_t
411 core_zge2lr_rqrcp( int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit,
412  pastix_int_t m, pastix_int_t n,
413  const void *A, pastix_int_t lda,
414  pastix_lrblock_t *Alr )
415 {
416  return core_zge2lr_qrcp( core_zrqrcp, use_reltol, tol, rklimit,
417  m, n, A, lda, Alr );
418 }
419 
420 /**
421  *******************************************************************************
422  *
423  * @brief Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T
424  *
425  * u2v2^T - u1v1^T = (u2 u1) (v2 v1)^T
426  * Orthogonalize (u2 u1) = (u2, u1 - u2(u2^T u1)) * (I u2^T u1)
427  * (0 I )
428  * Compute RQRCP decomposition of (I u2^T u1) * (v2 v1)^T
429  * (0 I )
430  *
431  *******************************************************************************
432  *
433  * @param[in] lowrank
434  * The structure with low-rank parameters.
435  *
436  * @param[in] transA1
437  * @arg PastixNoTrans: No transpose, op( A ) = A;
438  * @arg PastixTrans: Transpose, op( A ) = A';
439  *
440  * @param[in] alpha
441  * alpha * A is add to B
442  *
443  * @param[in] M1
444  * The number of rows of the matrix A.
445  *
446  * @param[in] N1
447  * The number of columns of the matrix A.
448  *
449  * @param[in] A
450  * The low-rank representation of the matrix A.
451  *
452  * @param[in] M2
453  * The number of rows of the matrix B.
454  *
455  * @param[in] N2
456  * The number of columns of the matrix B.
457  *
458  * @param[in] B
459  * The low-rank representation of the matrix B.
460  *
461  * @param[in] offx
462  * The horizontal offset of A with respect to B.
463  *
464  * @param[in] offy
465  * The vertical offset of A with respect to B.
466  *
467  *******************************************************************************
468  *
469  * @return The new rank of u2 v2^T or -1 if ranks are too large for
470  * recompression
471  *
472  *******************************************************************************/
473 pastix_fixdbl_t
474 core_zrradd_rqrcp( const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr,
475  pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A,
476  pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B,
477  pastix_int_t offx, pastix_int_t offy)
478 {
479  return core_zrradd_qr( core_zrqrcp, lowrank, transA1, alphaptr,
480  M1, N1, A, M2, N2, B, offx, offy );
481 }
solver.h
core_zrradd_qr
pastix_fixdbl_t core_zrradd_qr(core_zrrqr_cp_t rrqrfct, const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Template to perform the addition of two low-rank structures with compression kernel based on QR decom...
Definition: core_zgelrops.c:1174
pastix_lr_s
Structure to define the type of function to use for the low-rank kernels and their parameters.
Definition: pastix_lowrank.h:147
core_zrqrcp
int core_zrqrcp(double tol, pastix_int_t maxrank, int refine, pastix_int_t nb, pastix_int_t m, pastix_int_t n, pastix_complex64_t *A, pastix_int_t lda, pastix_int_t *jpvt, pastix_complex64_t *tau, pastix_complex64_t *work, pastix_int_t lwork, double *rwork)
Compute a randomized QR factorization.
Definition: core_zrqrcp.c:114
pastix_trans_t
enum pastix_trans_e pastix_trans_t
Transpostion.
pastix_zcores.h
pastix_lrblock_s
The block low-rank structure to hold a matrix in low-rank form.
Definition: pastix_lowrank.h:112
core_zge2lr_qrcp
pastix_fixdbl_t core_zge2lr_qrcp(core_zrrqr_cp_t rrqrfct, int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *Avoid, pastix_int_t lda, pastix_lrblock_t *Alr)
Template to convert a full rank matrix into a low rank matrix through QR decompositions.
Definition: core_zgelrops.c:749
pastix_zlrcores.h
core_zge2lr_rqrcp
pastix_fixdbl_t core_zge2lr_rqrcp(int use_reltol, pastix_fixdbl_t tol, pastix_int_t rklimit, pastix_int_t m, pastix_int_t n, const void *A, pastix_int_t lda, pastix_lrblock_t *Alr)
Convert a full rank matrix in a low rank matrix, using RQRCP.
Definition: core_zrqrcp.c:411
core_zpqrcp
int core_zpqrcp(double tol, pastix_int_t maxrank, int full_update, pastix_int_t nb, pastix_int_t m, pastix_int_t n, pastix_complex64_t *A, pastix_int_t lda, pastix_int_t *jpvt, pastix_complex64_t *tau, pastix_complex64_t *work, pastix_int_t lwork, double *rwork)
Compute a rank-reavealing QR factorization.
Definition: core_zpqrcp.c:105
z_nan_check.h
Manage nancheck for lowrank kernels. This header describes all the LAPACKE functions used for low-ran...
core_zrradd_rqrcp
pastix_fixdbl_t core_zrradd_rqrcp(const pastix_lr_t *lowrank, pastix_trans_t transA1, const void *alphaptr, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Add two LR structures A=(-u1) v1^T and B=u2 v2^T into u2 v2^T.
Definition: core_zrqrcp.c:474