PaStiX Handbook  6.4.0
core_sgetrfsp.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_sgetrfsp.c
4  *
5  * PaStiX kernel routines for LU factorization.
6  *
7  * @copyright 2011-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Mathieu Faverge
12  * @author Pierre Ramet
13  * @author Xavier Lacoste
14  * @author Gregoire Pichon
15  * @date 2024-07-05
16  * @generated from /builds/solverstack/pastix/kernels/core_zgetrfsp.c, normal z -> s, Thu Aug 29 14:20:19 2024
17  *
18  **/
19 #include "common.h"
20 #include "cblas.h"
21 #include "blend/solver.h"
22 #include "pastix_scores.h"
23 #include "kernels_trace.h"
24 
25 #ifndef DOXYGEN_SHOULD_SKIP_THIS
26 #define MAXSIZEOFBLOCKS 64
27 static float sone = 1.0;
28 static float msone = -1.0;
29 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
30 
31 /**
32  *******************************************************************************
33  *
34  * @ingroup kernel_blas_lapack_null
35  *
36  * @brief Compute the sequential static pivoting LU factorization of the matrix
37  * m-by-n A = L * U.
38  *
39  *******************************************************************************
40  *
41  * @param[in] m
42  * The number of rows and columns of the matrix A.
43  *
44  * @param[in] n
45  * The number of rows and columns of the matrix A.
46  *
47  * @param[inout] A
48  * The matrix A to factorize with LU factorization. The matrix
49  * is of size lda -by- n.
50  *
51  * @param[in] lda
52  * The leading dimension of the matrix A.
53  *
54  * @param[inout] nbpivots
55  * Pointer to the number of piovting operations made during
56  * factorization. It is updated during this call
57  *
58  * @param[in] criterion
59  * Threshold use for static pivoting. If diagonal value is under this
60  * threshold, its value is replaced by the threshold and the number of
61  * pivots is incremented.
62  *
63  *******************************************************************************/
64 static inline void
66  pastix_int_t n,
67  float *A,
68  pastix_int_t lda,
69  pastix_int_t *nbpivots,
70  float criterion )
71 {
72  pastix_int_t k, minMN;
73  float *Akk, *Aik, alpha;
74 
75  minMN = pastix_imin( m, n );
76 
77  Akk = A;
78  for (k=0; k<minMN; k++) {
79  Aik = Akk + 1;
80 
81  if ( fabsf(*Akk) < criterion ) {
82  if ( (*Akk) < 0. ) {
83  *Akk = (float)(-criterion);
84  }
85  else {
86  *Akk = (float)criterion;
87  }
88  (*nbpivots)++;
89  }
90 
91  /* A_ik = A_ik / A_kk, i = k+1 .. n */
92  alpha = 1.0 / (*Akk);
93  cblas_sscal(m-k-1, ( alpha ), Aik, 1 );
94 
95  if ( k+1 < minMN ) {
96 
97  /* A_ij = A_ij - A_ik * A_kj, i,j = k+1..n */
98  cblas_sger(CblasColMajor, m-k-1, n-k-1,
99  (msone),
100  Aik, 1,
101  Akk+lda, lda,
102  Aik+lda, lda);
103  }
104 
105  Akk += lda+1;
106  }
107 }
108 
109 /**
110  *******************************************************************************
111  *
112  * @brief Compute the block static pivoting LU factorization of the matrix
113  * m-by-n A = L * U.
114  *
115  *******************************************************************************
116  *
117  * @param[in] n
118  * The number of rows and columns of the matrix A.
119  *
120  * @param[inout] A
121  * The matrix A to factorize with LU factorization. The matrix
122  * is of size lda -by- n.
123  *
124  * @param[in] lda
125  * The leading dimension of the matrix A.
126  *
127  * @param[inout] nbpivots
128  * Pointer to the number of piovting operations made during
129  * factorization. It is updated during this call
130  *
131  * @param[in] criterion
132  * Threshold use for static pivoting. If diagonal value is under this
133  * threshold, its value is replaced by the threshold and the number of
134  * pivots is incremented.
135  *
136  *******************************************************************************/
137 void
139  float *A,
140  pastix_int_t lda,
141  pastix_int_t *nbpivots,
142  float criterion )
143 {
144  pastix_int_t k, blocknbr, blocksize, matrixsize, tempm;
145  float *Akk, *Lik, *Ukj, *Aij;
146 
147  blocknbr = pastix_iceil( n, MAXSIZEOFBLOCKS );
148 
149  Akk = A; /* Lk,k */
150 
151  for (k=0; k<blocknbr; k++) {
152 
153  tempm = n - k * MAXSIZEOFBLOCKS;
154  blocksize = pastix_imin(MAXSIZEOFBLOCKS, tempm);
155  Lik = Akk + blocksize;
156  Ukj = Akk + blocksize*lda;
157  Aij = Ukj + blocksize;
158 
159  /* Factorize the diagonal block Akk*/
160  core_sgetf2sp( tempm, blocksize, Akk, lda, nbpivots, criterion );
161 
162  matrixsize = tempm - blocksize;
163  if ( matrixsize > 0 ) {
164 
165  /* Compute the column Ukk+1 */
166  cblas_strsm(CblasColMajor,
167  CblasLeft, CblasLower,
168  CblasNoTrans, CblasUnit,
169  blocksize, matrixsize,
170  (sone), Akk, lda,
171  Ukj, lda);
172 
173  /* Update Ak+1,k+1 = Ak+1,k+1 - Lk+1,k*Uk,k+1 */
174  cblas_sgemm(CblasColMajor,
175  CblasNoTrans, CblasNoTrans,
176  matrixsize, matrixsize, blocksize,
177  (msone), Lik, lda,
178  Ukj, lda,
179  (sone), Aij, lda);
180  }
181 
182  Akk += blocksize * (lda+1);
183  }
184 }
185 
186 /**
187  *******************************************************************************
188  *
189  * @brief Compute the LU factorization of the diagonal block in a panel.
190  *
191  *******************************************************************************
192  *
193  * @param[in] solvmtx
194  * Solver Matrix structure of the problem
195  *
196  * @param[in] cblk
197  * Pointer to the structure representing the panel to factorize in the
198  * cblktab array. Next column blok must be accessible through cblk[1].
199  *
200  * @param[inout] dataL
201  * The pointer to the correct representation of the lower part of the data.
202  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
203  * - pastix_lr_block if the block is compressed.
204  *
205  * @param[inout] dataU
206  * The pointer to the correct representation of the upper part of the data.
207  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width
208  * - pastix_lr_block if the block is compressed.
209  *
210  *******************************************************************************
211  *
212  * @return The number of static pivoting performed during the diagonal block
213  * factorization.
214  *
215  *******************************************************************************/
216 int
218  SolverCblk *cblk,
219  void *dataL,
220  void *dataU )
221 {
222  pastix_int_t ncols, stride;
223  pastix_int_t nbpivots = 0;
224  pastix_fixdbl_t time, flops;
225  pastix_lrblock_t *lrL, *lrU;
226  float *L, *U;
227 
228  float criterion = solvmtx->diagthreshold;
229 
231 
232  ncols = cblk->lcolnum - cblk->fcolnum + 1;
233  stride = (cblk->cblktype & CBLK_LAYOUT_2D) ? ncols : cblk->stride;
234 
235  if ( cblk->cblktype & CBLK_COMPRESSED ) {
236  /* dataL and dataU are LRblock */
237 
238  lrL = (pastix_lrblock_t *)dataL;
239  lrU = (pastix_lrblock_t *)dataU;
240  assert( (lrL->rk == -1) && (lrU->rk == -1) );
241 
242  L = lrL->u;
243  U = lrU->u;
244  stride = ncols;
245 
246  assert( stride == lrL->rkmax );
247  assert( stride == lrU->rkmax );
248  } else {
249  L = (float *)dataL;
250  U = (float *)dataU;
251  }
252 
253  core_sgeadd( PastixTrans, ncols, ncols,
254  1.0, U, stride,
255  1.0, L, stride );
256 
257  /* Factorize diagonal block */
258  flops = FLOPS_SGETRF( ncols, ncols );
259  kernel_trace_start_lvl2( PastixKernelLvl2GETRF );
260  core_sgetrfsp(ncols, L, stride, &nbpivots, criterion);
261  kernel_trace_stop_lvl2( flops );
262 
263  /* Transpose Akk in ucoeftab */
264  core_sgetmo( ncols, ncols, L, stride, U, stride );
265 
266  kernel_trace_stop( cblk->fblokptr->inlast, PastixKernelGETRF, ncols, 0, 0, flops, time );
267 
268  if ( nbpivots ) {
269  pastix_atomic_add_32b( &(solvmtx->nbpivots), nbpivots );
270  }
271  return nbpivots;
272 }
273 
274 /**
275  *******************************************************************************
276  *
277  * @brief Compute the LU factorization of one panel.
278  *
279  *******************************************************************************
280  *
281  * @param[in] solvmtx
282  * Solver Matrix structure of the problem
283  *
284  * @param[in] cblk
285  * Pointer to the structure representing the panel to factorize in the
286  * cblktab array. Next column blok must be accessible through cblk[1].
287  *
288  * @param[inout] L
289  * The pointer to the correct representation of the lower part of the data.
290  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
291  * - pastix_lr_block if the block is compressed.
292  *
293  * @param[inout] U
294  * The pointer to the correct representation of the upper part of the data.
295  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
296  * - pastix_lr_block if the block is compressed.
297  *
298  *******************************************************************************
299  *
300  * @return The number of static pivoting performed during the diagonal block
301  * factorization.
302  *
303  *******************************************************************************/
304 int
306  SolverCblk *cblk,
307  void *L,
308  void *U )
309 {
310  pastix_int_t nbpivots;
311  nbpivots = cpucblk_sgetrfsp1d_getrf( solvmtx, cblk, L, U );
312 
313  /*
314  * We exploit the fact that the upper triangle is stored at the top of the L
315  * column, and by transposition the L part of the diagonal block is
316  * similarly stored in the U panel
317  */
320  cblk, L, L, &(solvmtx->lowrank) );
323  cblk, U, U, &(solvmtx->lowrank) );
324  return nbpivots;
325 }
326 
327 /**
328  *******************************************************************************
329  *
330  * @brief Perform the LU factorization of a given panel and apply all its
331  * updates.
332  *
333  *******************************************************************************
334  *
335  * @param[in] solvmtx
336  * Solver Matrix structure of the problem
337  *
338  * @param[in] cblk
339  * Pointer to the structure representing the panel to factorize in the
340  * cblktab array. Next column blok must be accessible through cblk[1].
341  *
342  * @param[in] work
343  * Temporary memory buffer.
344  *
345  * @param[in] lwork
346  * Temporary workspace dimension.
347  *
348  *******************************************************************************
349  *
350  * @return The number of static pivoting during factorization of the diagonal
351  * block.
352  *
353  *******************************************************************************/
354 int
356  SolverCblk *cblk,
357  float *work,
358  pastix_int_t lwork )
359 {
360  void *L = cblk_getdataL( cblk );
361  void *U = cblk_getdataU( cblk );
362  SolverCblk *fcblk;
363  SolverBlok *blok, *lblk;
364  pastix_int_t nbpivots;
365 
366  nbpivots = cpucblk_sgetrfsp1d_panel( solvmtx, cblk, L, U );
367 
368  blok = cblk->fblokptr + 1; /* this diagonal block */
369  lblk = cblk[1].fblokptr; /* the next diagonal block */
370 
371  /* if there are off-diagonal supernodes in the column */
372  for( ; blok < lblk; blok++ )
373  {
374  fcblk = solvmtx->cblktab + blok->fcblknm;
375 
376  if ( fcblk->cblktype & CBLK_FANIN ) {
377  cpucblk_salloc( PastixLUCoef, fcblk );
378  }
379 
380  /* Update on L */
382  cblk, blok, fcblk,
383  L, U, cblk_getdataL( fcblk ),
384  work, lwork, &(solvmtx->lowrank) );
385 
386  /* Update on U */
387  if ( blok+1 < lblk ) {
389  cblk, blok, fcblk,
390  U, L, cblk_getdataU( fcblk ),
391  work, lwork, &(solvmtx->lowrank) );
392  }
393  cpucblk_srelease_deps( PastixLUCoef, solvmtx, cblk, fcblk );
394  }
395 
396  return nbpivots;
397 }
398 
399 /**
400  *******************************************************************************
401  *
402  * @brief Perform the LU factorization of a given panel and submit tasks for the
403  * subsequent updates.
404  *
405  *******************************************************************************
406  *
407  * @param[in] solvmtx
408  * Solver Matrix structure of the problem
409  *
410  * @param[in] cblk
411  * Pointer to the structure representing the panel to factorize in the
412  * cblktab array. Next column blok must be accessible through cblk[1].
413  *
414  *******************************************************************************
415  *
416  * @return The number of static pivoting during factorization of the diagonal
417  * block.
418  *
419  *******************************************************************************/
420 int
422  SolverCblk *cblk )
423 {
424  void *L = cblk_getdataL( cblk );
425  void *U = cblk_getdataU( cblk );
426  SolverBlok *blok, *lblk;
427  pastix_int_t i, nbpivots;
428  pastix_queue_t *queue = solvmtx->computeQueue[ cblk->threadid ];
429 
430  assert( cblk->cblktype & CBLK_TASKS_2D );
431  nbpivots = cpucblk_sgetrfsp1d_panel( solvmtx, cblk, L, U );
432 
433  blok = cblk->fblokptr + 1; /* this diagonal block */
434  lblk = cblk[1].fblokptr; /* the next diagonal block */
435 
436  /* if there are off-diagonal supernodes in the column */
437  for( i=0; blok < lblk; i++, blok++ )
438  {
439  assert( !((solvmtx->cblktab + blok->fcblknm)->cblktype & CBLK_RECV) );
440  pqueuePush1( queue, - (blok - solvmtx->bloktab) - 1, cblk->priority + i );
441 
442  /* Skip blocks facing the same cblk */
443  while ( ( blok < lblk ) &&
444  ( blok[0].fcblknm == blok[1].fcblknm ) &&
445  ( blok[0].lcblknm == blok[1].lcblknm ) )
446  {
447  blok++;
448  }
449  }
450 
451  return nbpivots;
452 }
453 
454 /**
455  *******************************************************************************
456  *
457  * @brief Apply the updates of the LU factorisation of a given panel.
458  *
459  *******************************************************************************
460  *
461  * @param[in] solvmtx
462  * Solver Matrix structure of the problem
463  *
464  * @param[in] blok
465  * Pointer to the blok where the update start.
466  *
467  * @param[in] work
468  * Temporary memory buffer.
469  *
470  * @param[in] lwork
471  * Temporary workspace dimension.
472  *
473  *******************************************************************************/
474 void
476  SolverBlok *blok,
477  float *work,
478  pastix_int_t lwork )
479 {
480  SolverCblk *cblk = solvmtx->cblktab + blok->lcblknm;
481  SolverCblk *fcbk = solvmtx->cblktab + blok->fcblknm;
482  SolverBlok *lblk = cblk[1].fblokptr; /* the next diagonal block */
483  void *L = cblk_getdataL( cblk );
484  void *U = cblk_getdataU( cblk );
485 
486  if ( fcbk->cblktype & CBLK_FANIN ) {
487  cpucblk_salloc( PastixLUCoef, fcbk );
488  }
489 
490  do
491  {
492  /* Update on L */
494  cblk, blok, fcbk,
495  L, U, cblk_getdataL( fcbk ),
496  work, lwork, &(solvmtx->lowrank) );
497 
498  /* Update on U */
499  if ( blok+1 < lblk ) {
501  cblk, blok, fcbk,
502  U, L, cblk_getdataU( fcbk ),
503  work, lwork, &(solvmtx->lowrank) );
504  }
505  cpucblk_srelease_deps( PastixLUCoef, solvmtx, cblk, fcbk );
506  blok++;
507  }
508  while ( ( blok < lblk ) &&
509  ( blok[-1].fcblknm == blok[0].fcblknm ) &&
510  ( blok[-1].lcblknm == blok[0].lcblknm ) );
511 }
static void core_sgetf2sp(pastix_int_t m, pastix_int_t n, float *A, pastix_int_t lda, pastix_int_t *nbpivots, float criterion)
Compute the sequential static pivoting LU factorization of the matrix m-by-n A = L * U.
Definition: core_sgetrfsp.c:65
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
static void pqueuePush1(pastix_queue_t *q, pastix_int_t elt, double key1)
Push an element with a single key.
Definition: queue.h:64
Queue structure.
Definition: queue.h:38
static void kernel_trace_stop(int8_t inlast, pastix_ktype_t ktype, int m, int n, int k, double flops, double starttime)
Stop the trace of a single kernel.
static double kernel_trace_start(pastix_ktype_t ktype)
Start the trace of a single kernel.
Definition: kernels_trace.h:87
@ PastixKernelLvl2GETRF
Definition: kernels_enums.h:87
@ PastixKernelGETRF
Definition: kernels_enums.h:48
int core_sgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, float alpha, const float *A, pastix_int_t LDA, float beta, float *B, pastix_int_t LDB)
Add two matrices together.
Definition: core_sgeadd.c:78
void core_sgetmo(int m, int n, const float *A, int lda, float *B, int ldb)
Transposes a m-by-n matrix out of place using an extra workspace of size m-by-n.
Definition: core_sgetmo.c:49
void core_sgetrfsp(pastix_int_t n, float *A, pastix_int_t lda, pastix_int_t *nbpivots, float criterion)
Compute the block static pivoting LU factorization of the matrix m-by-n A = L * U.
void cpucblk_sgetrfsp1dplus_update(SolverMatrix *solvmtx, SolverBlok *blok, float *work, pastix_int_t lwork)
Apply the updates of the LU factorisation of a given panel.
int cpucblk_sgetrfsp1dplus(SolverMatrix *solvmtx, SolverCblk *cblk)
Perform the LU factorization of a given panel and submit tasks for the subsequent updates.
pastix_fixdbl_t cpucblk_sgemmsp(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const void *A, const void *B, void *C, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
int cpucblk_sgetrfsp1d(SolverMatrix *solvmtx, SolverCblk *cblk, float *work, pastix_int_t lwork)
Perform the LU factorization of a given panel and apply all its updates.
void cpucblk_strsmsp(pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, const void *A, void *C, const pastix_lr_t *lowrank)
Compute the updates associated to a column of off-diagonal blocks.
Definition: core_strsmsp.c:356
void cpucblk_salloc(pastix_coefside_t side, SolverCblk *cblk)
Allocate the cblk structure to store the coefficient.
int cpucblk_sgetrfsp1d_panel(SolverMatrix *solvmtx, SolverCblk *cblk, void *L, void *U)
Compute the LU factorization of one panel.
int cpucblk_sgetrfsp1d_getrf(SolverMatrix *solvmtx, SolverCblk *cblk, void *dataL, void *dataU)
Compute the LU factorization of the diagonal block in a panel.
void cpucblk_srelease_deps(pastix_coefside_t side, SolverMatrix *solvmtx, const SolverCblk *cblk, SolverCblk *fcbk)
Release the dependencies of the given cblk after an update.
The block low-rank structure to hold a matrix in low-rank form.
@ PastixLCoef
Definition: api.h:478
@ PastixLUCoef
Definition: api.h:480
@ PastixUCoef
Definition: api.h:479
@ PastixUpper
Definition: api.h:466
@ PastixRight
Definition: api.h:496
@ PastixUnit
Definition: api.h:488
@ PastixNonUnit
Definition: api.h:487
@ PastixNoTrans
Definition: api.h:445
@ PastixTrans
Definition: api.h:446
pastix_lr_t lowrank
Definition: solver.h:236
pastix_int_t priority
Definition: solver.h:183
pastix_int_t fcblknm
Definition: solver.h:144
SolverBlok *restrict bloktab
Definition: solver.h:229
static void * cblk_getdataU(const SolverCblk *cblk)
Get the pointer to the data associated to the upper part of the cblk.
Definition: solver.h:354
SolverBlok * fblokptr
Definition: solver.h:168
static void * cblk_getdataL(const SolverCblk *cblk)
Get the pointer to the data associated to the lower part of the cblk.
Definition: solver.h:342
pastix_int_t lcblknm
Definition: solver.h:143
int threadid
Definition: solver.h:182
int8_t inlast
Definition: solver.h:151
SolverCblk *restrict cblktab
Definition: solver.h:228
pastix_int_t stride
Definition: solver.h:169
int8_t cblktype
Definition: solver.h:164
pastix_int_t lcolnum
Definition: solver.h:167
double diagthreshold
Definition: solver.h:238
volatile int32_t nbpivots
Definition: solver.h:239
pastix_int_t fcolnum
Definition: solver.h:166
Solver block structure.
Definition: solver.h:141
Solver column block structure.
Definition: solver.h:161
Solver column block structure.
Definition: solver.h:203