PaStiX Handbook  6.3.2
core_slr2xx.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_slr2xx.c
4  *
5  * PaStiX low-rank kernel routines that perform the addition of AB into C.
6  *
7  * @copyright 2016-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.3.2
11  * @author Mathieu Faverge
12  * @author Gregoire Pichon
13  * @author Pierre Ramet
14  * @date 2023-07-21
15  * @generated from /builds/solverstack/pastix/kernels/core_zlr2xx.c, normal z -> s, Wed Dec 13 12:09:15 2023
16  *
17  **/
18 #include "common.h"
19 #include <cblas.h>
20 #include "kernels_trace.h"
21 #include "blend/solver.h"
22 #include "pastix_scores.h"
23 #include "pastix_slrcores.h"
24 
25 #ifndef DOXYGEN_SHOULD_SKIP_THIS
26 static float sone = 1.0;
27 static float szero = 0.0;
28 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
29 
30 /**
31  *******************************************************************************
32  *
33  * @brief Perform the addition of the low-rank matrix AB and the full-rank
34  * matrix C.
35  *
36  *******************************************************************************
37  *
38  * @param[inout] params
39  * The LRMM structure that stores all the parameters used in the LRMM
40  * functions family.
41  * On exit, the C matrix is udpated with the addition of AB.
42  * @sa core_slrmm_t
43  *
44  * @param[in] AB
45  * The low-rank structure of the AB matrix to apply to C.
46  *
47  * @param[in] transV
48  * Specify if AB->v is stored normally or transposed.
49  * - If PastixNoTrans, AB->v is stored normally for low-rank format.
50  * - If PastixTrans, AB->v is stored transposed.
51  * - If PastixTrans, AB->v is stored transposed, and () must be
52  * applied to the matrix.
53  *
54  *******************************************************************************
55  *
56  * @return The number of flops required to perform the operation.
57  *
58  *******************************************************************************/
59 static inline pastix_fixdbl_t
61  const pastix_lrblock_t *AB,
62  pastix_trans_t transV )
63 {
64  PASTE_CORE_SLRMM_PARAMS( params );
65  pastix_int_t ldabu = M;
66  pastix_int_t ldabv = (transV == PastixNoTrans) ? AB->rkmax : N;
67  pastix_fixdbl_t flops = 0.;
68  float *Cfr = C->u;
69  Cfr += Cm * offy + offx;
70 
71  assert( C->rk == -1 );
72 
73  /* TODO: find a suitable name to trace this kind of kernel. */
74  if ( AB->rk == -1 ) {
75  flops = 2 * M * N;
76  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_updateCfr );
78  alpha, AB->u, ldabu,
79  beta, Cfr, Cm );
80  kernel_trace_stop_lvl2( flops );
81  }
82  else {
83  flops = FLOPS_SGEMM( M, N, AB->rk );
84  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_updateCfr );
85  cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
86  M, N, AB->rk,
87  (alpha), AB->u, ldabu,
88  AB->v, ldabv,
89  (beta), Cfr, Cm );
90  kernel_trace_stop_lvl2( flops );
91  }
92 
94  return flops;
95 }
96 
97 /**
98  *******************************************************************************
99  *
100  * @brief Perform the addition of the low-rank matrix AB and the low-rank
101  * matrix C.
102  *
103  *******************************************************************************
104  *
105  * @param[inout] params
106  * The LRMM structure that stores all the parameters used in the LRMM
107  * functions family.
108  * On exit, the C matrix is udpated with the addition of AB.
109  * @sa core_slrmm_t
110  *
111  * @param[in] AB
112  * The low-rank structure of the AB matrix to apply to C.
113  *
114  * @param[in] transV
115  * Specify if AB->v is stored normally or transposed.
116  * - If PastixNoTrans, AB->v is stored normally for low-rank format.
117  * - If PastixTrans, AB->v is stored transposed.
118  * - If PastixTrans, AB->v is stored transposed, and () must be
119  * applied to the matrix.
120  *
121  *******************************************************************************
122  *
123  * @return The number of flops required to perform the operation.
124  *
125  *******************************************************************************/
126 static inline pastix_fixdbl_t
128  const pastix_lrblock_t *AB,
129  pastix_trans_t transV )
130 {
131  PASTE_CORE_SLRMM_PARAMS( params );
132  pastix_int_t rklimit = core_get_rklimit( Cm, Cn );
133  pastix_int_t rAB = ( AB->rk == -1 ) ? pastix_imin( M, N ) : AB->rk;
134  pastix_int_t ldabu = M;
135  pastix_int_t ldabv = (transV == PastixNoTrans) ? AB->rkmax : N;
136  pastix_fixdbl_t total_flops = 0.;
137  pastix_fixdbl_t flops = 0.;
138 
139  assert( (C->rk >= 0) && (C->rk <= C->rkmax) );
140 
141  /*
142  * The rank is too big, we need to uncompress/compress C
143  */
144  if ( (C->rk + rAB) > rklimit )
145  {
146  float *Cfr, *Coff;
147  int allocated = 0;
148  if ( (Cfr = core_slrmm_getws( params, Cm * Cn )) == NULL ) {
149  Cfr = malloc( Cm * Cn * sizeof(float) );
150  allocated = 1;
151  }
152  Coff = Cfr + Cm * offy + offx;
153 
154  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
155  cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
156  Cm, Cn, C->rk,
157  (sone), C->u, Cm,
158  C->v, C->rkmax,
159  (szero), Cfr, Cm );
160  flops = FLOPS_SGEMM( Cm, Cn, C->rk );
161 
162  /* Add A*B */
163  if ( AB->rk == -1 ) {
164  core_sgeadd( PastixNoTrans, M, N,
165  alpha, AB->u, M,
166  beta, Coff, Cm );
167  flops += (2. * M * N);
168  }
169  else {
170  cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
171  M, N, AB->rk,
172  (alpha), AB->u, ldabu,
173  AB->v, ldabv,
174  (beta), Coff, Cm );
175  flops += FLOPS_SGEMM( M, N, AB->rk );
176  }
177  kernel_trace_stop_lvl2( flops );
178  total_flops += flops;
179 
180  /* Try to recompress */
181  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_recompress );
182  core_slrfree(C); // TODO: Can we give it directly to ge2lr as this
183  flops = lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, -1, Cm, Cn, Cfr, Cm, C );
184  kernel_trace_stop_lvl2_rank( flops, C->rk );
185  total_flops += flops;
186 
187  if (allocated) {
188  free(Cfr);
189  }
190  }
191  /*
192  * The rank is not too large, we perform a low-rank update
193  */
194  else {
195  total_flops += lowrank->core_rradd( lowrank, transV, &alpha,
196  M, N, AB,
197  Cm, Cn, C,
198  offx, offy );
199  }
200 
202  return total_flops;
203 }
204 
205 /**
206  *******************************************************************************
207  *
208  * @brief Perform the addition of the low-rank matrix AB into the null matrix C.
209  *
210  *******************************************************************************
211  *
212  * @param[inout] params
213  * The LRMM structure that stores all the parameters used in the LRMM
214  * functions family.
215  * On exit, the C matrix contains the product AB aligned with its own
216  * dimensions.
217  * @sa core_slrmm_t
218  *
219  * @param[in] AB
220  * The low-rank structure of the AB matrix to apply to C.
221  *
222  * @param[in] transV
223  * Specify if AB->v is stored normally or transposed.
224  * - If PastixNoTrans, AB->v is stored normally for low-rank format.
225  * - If PastixTrans, AB->v is stored transposed.
226  * - If PastixTrans, AB->v is stored transposed, and () must be
227  * applied to the matrix.
228  *
229  * @param[in] infomask
230  * Mask of informations returned by the core_sxx2lr() functions.
231  * If CORE_LRMM_ORTHOU is set, then AB.u is orthogonal, otherwise an
232  * orthogonalization step is added before adding it to C.
233  *
234  *******************************************************************************
235  *
236  * @return The number of flops required to perform the operation.
237  *
238  *******************************************************************************/
239 static inline pastix_fixdbl_t
241  const pastix_lrblock_t *AB,
242  pastix_trans_t transV,
243  int infomask )
244 {
245  PASTE_CORE_SLRMM_PARAMS( params );
246  pastix_int_t rklimit = core_get_rklimit( Cm, Cn );
247  pastix_int_t ldabu = M;
248  pastix_int_t ldabv = (transV == PastixNoTrans) ? AB->rkmax : N;
249  pastix_fixdbl_t total_flops = 0.;
250  pastix_fixdbl_t flops;
251  int allocated = 0;
252 
253  assert( C->rk == 0 );
254 
255  if ( AB->rk > rklimit ) {
256  float *Cfr, *Coff;
257  if ( (Cfr = core_slrmm_getws( params, Cm * Cn )) == NULL ) {
258  Cfr = malloc( Cm * Cn * sizeof(float) );
259  allocated = 1;
260  }
261  Coff = Cfr + Cm * offy + offx;
262 
263  /* Set to 0 if contribution smaller than C */
264  if ( (M != Cm) || (N != Cn) ) {
265  memset( Cfr, 0, Cm * Cn * sizeof(float) );
266  }
267 
268  /* Uncompress the AB product into C */
269  flops = FLOPS_SGEMM( M, N, AB->rk );
270  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
271  cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
272  M, N, AB->rk,
273  (alpha), AB->u, ldabu,
274  AB->v, ldabv,
275  (beta), Coff, Cm );
276  kernel_trace_stop_lvl2( flops );
277  total_flops += flops;
278 
279  /* Try to recompress C */
280  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_recompress );
281  flops = lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, -1, Cm, Cn, Cfr, Cm, C );
282  kernel_trace_stop_lvl2_rank( flops, C->rk );
283  total_flops += flops;
284 
285  if ( allocated ) {
286  free( work );
287  }
288  }
289  else {
290  /*
291  * Let's chech that AB->u is orthogonal before copying it to C.u
292  */
293  int orthou = infomask & PASTIX_LRM3_ORTHOU;
294  if ( !orthou ) {
295  float *ABfr;
296  pastix_lrblock_t backup;
297  int allocated = 0;
298 
299  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_orthou );
300 
301  if ( AB->rk > 0 ) {
302  if ( (ABfr = core_slrmm_getws( params, M * N )) == NULL ) {
303  ABfr = malloc( M * N * sizeof(float) );
304  allocated = 1;
305  }
306 
307  cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
308  M, N, AB->rk,
309  (sone), AB->u, ldabu,
310  AB->v, ldabv,
311  (szero), ABfr, M );
312  flops = FLOPS_SGEMM( M, N, AB->rk );
313  }
314  else {
315  ABfr = AB->u;
316  flops = 0.0;
317  }
318 
319  flops += lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, rklimit,
320  M, N, ABfr, M, &backup );
321 
322  core_slrcpy( lowrank, PastixNoTrans, alpha,
323  M, N, &backup, Cm, Cn, C,
324  offx, offy );
325 
326  kernel_trace_stop_lvl2( flops );
327  core_slrfree( &backup );
328  total_flops += flops;
329 
330  if ( allocated ) {
331  free( ABfr );
332  }
333  }
334  /*
335  * AB->u is orthogonal, we directly copy AB->u into C
336  */
337  else {
338  core_slrcpy( lowrank, transV, alpha,
339  M, N, AB, Cm, Cn, C,
340  offx, offy );
341  }
342  }
343 
345  return total_flops;
346 }
347 
348 /**
349  *******************************************************************************
350  *
351  * @brief Perform the addition of two low-rank matrices.
352  *
353  *******************************************************************************
354  *
355  * @param[inout] params
356  * The LRMM structure that stores all the parameters used in the LRMM
357  * functions family.
358  * On exit, the C matrix contains the addition of C and A aligned with its own
359  * dimensions.
360  * @sa core_slrmm_t
361  *
362  * @param[in] A
363  * The low-rank structure of the A matrix to add to C.
364  *
365  * @param[in] transV
366  * Specify if A->v is stored normally or transposed.
367  * - If PastixNoTrans, AB->v is stored normally for low-rank format.
368  * - If PastixTrans, AB->v is stored transposed.
369  * - If PastixTrans, AB->v is stored transposed, and () must be
370  * applied to the matrix.
371  *
372  * @param[in] infomask
373  * Mask of informations returned by the core_sxx2lr() functions.
374  * If CORE_LRMM_ORTHOU is set, then A.u is orthogonal, otherwise an
375  * orthogonalization step is added before adding it to C.
376  *
377  *******************************************************************************
378  *
379  * @return The number of flops required to perform the operation.
380  *
381  *******************************************************************************/
384  const pastix_lrblock_t *A,
385  pastix_trans_t transV,
386  int infomask )
387 {
388  pastix_lrblock_t *C = params->C;
389  pastix_fixdbl_t flops = 0.;
390 
391  if ( A->rk != 0 ) {
392  pastix_atomic_lock( params->lock );
393  switch ( C->rk ) {
394  case -1:
395  /*
396  * C became full rank
397  */
398  flops = core_slr2fr( params, A, transV );
399  break;
400 
401  case 0:
402  /*
403  * C is still null
404  */
405  flops = core_slr2null( params, A, transV, infomask );
406  break;
407 
408  default:
409  /*
410  * C is low-rank of rank k
411  */
412  flops = core_slr2lr( params, A, transV );
413  }
414  assert( C->rk <= C->rkmax);
415  pastix_atomic_unlock( params->lock );
416  }
417 
418  return flops;
419 }
static pastix_fixdbl_t core_slr2fr(core_slrmm_t *params, const pastix_lrblock_t *AB, pastix_trans_t transV)
Perform the addition of the low-rank matrix AB and the full-rank matrix C.
Definition: core_slr2xx.c:60
static pastix_fixdbl_t core_slr2lr(core_slrmm_t *params, const pastix_lrblock_t *AB, pastix_trans_t transV)
Perform the addition of the low-rank matrix AB and the low-rank matrix C.
Definition: core_slr2xx.c:127
static pastix_fixdbl_t core_slr2null(core_slrmm_t *params, const pastix_lrblock_t *AB, pastix_trans_t transV, int infomask)
Perform the addition of the low-rank matrix AB into the null matrix C.
Definition: core_slr2xx.c:240
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
int core_sgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, float alpha, const float *A, pastix_int_t LDA, float beta, float *B, pastix_int_t LDB)
Add two matrices together.
Definition: core_sgeadd.c:78
pastix_lrblock_t * C
pastix_atomic_lock_t * lock
#define PASTE_CORE_SLRMM_PARAMS(_a_)
Initialize all the parameters of the core_slrmm family functions to ease the access.
static float * core_slrmm_getws(core_slrmm_t *params, ssize_t newsize)
Function to get a workspace pointer if space is available in the one provided.
#define PASTE_CORE_SLRMM_VOID
Void all the parameters of the core_slrmm family functions to silent warnings.
pastix_fixdbl_t core_slradd(core_slrmm_t *params, const pastix_lrblock_t *A, pastix_trans_t transV, int infomask)
Perform the addition of two low-rank matrices.
Definition: core_slr2xx.c:383
Structure to store all the parameters of the core_slrmm family functions.
pastix_int_t(* core_get_rklimit)(pastix_int_t, pastix_int_t)
Compute the maximal rank accepted for a given matrix size. The pointer is set according to the low-ra...
Definition: kernels_trace.c:46
#define PASTIX_LRM3_ORTHOU
Macro to specify if the U part of a low-rank matrix is orthogonal or not (Used in LRMM functions).
The block low-rank structure to hold a matrix in low-rank form.
void core_slrcpy(const pastix_lr_t *lowrank, pastix_trans_t transAv, float alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Copy a small low-rank structure into a large one.
void core_slrfree(pastix_lrblock_t *A)
Free a low-rank matrix.
enum pastix_trans_e pastix_trans_t
Transpostion.
@ PastixNoTrans
Definition: api.h:445