PaStiX Handbook  6.4.0
core_zlr2xx.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_zlr2xx.c
4  *
5  * PaStiX low-rank kernel routines that perform the addition of AB into C.
6  *
7  * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Mathieu Faverge
12  * @author Gregoire Pichon
13  * @author Pierre Ramet
14  * @date 2024-07-05
15  * @generated from /builds/solverstack/pastix/kernels/core_zlr2xx.c, normal z -> z, Thu Aug 29 14:20:21 2024
16  *
17  **/
18 #include "common.h"
19 #include <cblas.h>
20 #include "kernels_trace.h"
21 #include "blend/solver.h"
22 #include "pastix_zcores.h"
23 #include "pastix_zlrcores.h"
24 
25 #ifndef DOXYGEN_SHOULD_SKIP_THIS
26 static pastix_complex64_t zone = 1.0;
27 static pastix_complex64_t zzero = 0.0;
28 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
29 
30 /**
31  *******************************************************************************
32  *
33  * @brief Perform the addition of the low-rank matrix AB and the full-rank
34  * matrix C.
35  *
36  *******************************************************************************
37  *
38  * @param[inout] params
39  * The LRMM structure that stores all the parameters used in the LRMM
40  * functions family.
41  * On exit, the C matrix is udpated with the addition of AB.
42  * @sa core_zlrmm_t
43  *
44  * @param[in] AB
45  * The low-rank structure of the AB matrix to apply to C.
46  *
47  * @param[in] transV
48  * Specify if AB->v is stored normally or transposed.
49  * - If PastixNoTrans, AB->v is stored normally for low-rank format.
50  * - If PastixTrans, AB->v is stored transposed.
51  * - If PastixConjTrans, AB->v is stored transposed, and conj() must be
52  * applied to the matrix.
53  *
54  *******************************************************************************
55  *
56  * @return The number of flops required to perform the operation.
57  *
58  *******************************************************************************/
59 static inline pastix_fixdbl_t
61  const pastix_lrblock_t *AB,
62  pastix_trans_t transV )
63 {
64  PASTE_CORE_ZLRMM_PARAMS( params );
65  pastix_int_t ldabu = M;
66  pastix_int_t ldabv = (transV == PastixNoTrans) ? AB->rkmax : N;
67  pastix_fixdbl_t flops = 0.;
68  pastix_complex64_t *Cfr = C->u;
69  Cfr += Cm * offy + offx;
70 
71  assert( C->rk == -1 );
72 
73  /* TODO: find a suitable name to trace this kind of kernel. */
74  if ( AB->rk == -1 ) {
75  flops = 2 * M * N;
76  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_updateCfr );
78  alpha, AB->u, ldabu,
79  beta, Cfr, Cm );
80  kernel_trace_stop_lvl2( flops );
81  }
82  else {
83  flops = FLOPS_ZGEMM( M, N, AB->rk );
84  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_updateCfr );
85  cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
86  M, N, AB->rk,
87  CBLAS_SADDR(alpha), AB->u, ldabu,
88  AB->v, ldabv,
89  CBLAS_SADDR(beta), Cfr, Cm );
90  kernel_trace_stop_lvl2( flops );
91  }
92 
94  return flops;
95 }
96 
97 /**
98  *******************************************************************************
99  *
100  * @brief Perform the addition of the low-rank matrix AB and the low-rank
101  * matrix C.
102  *
103  *******************************************************************************
104  *
105  * @param[inout] params
106  * The LRMM structure that stores all the parameters used in the LRMM
107  * functions family.
108  * On exit, the C matrix is udpated with the addition of AB.
109  * @sa core_zlrmm_t
110  *
111  * @param[in] AB
112  * The low-rank structure of the AB matrix to apply to C.
113  *
114  * @param[in] transV
115  * Specify if AB->v is stored normally or transposed.
116  * - If PastixNoTrans, AB->v is stored normally for low-rank format.
117  * - If PastixTrans, AB->v is stored transposed.
118  * - If PastixConjTrans, AB->v is stored transposed, and conj() must be
119  * applied to the matrix.
120  *
121  *******************************************************************************
122  *
123  * @return The number of flops required to perform the operation.
124  *
125  *******************************************************************************/
126 static inline pastix_fixdbl_t
128  const pastix_lrblock_t *AB,
129  pastix_trans_t transV )
130 {
131  PASTE_CORE_ZLRMM_PARAMS( params );
132  pastix_int_t rklimit = core_get_rklimit( Cm, Cn );
133  pastix_int_t rAB = ( AB->rk == -1 ) ? pastix_imin( M, N ) : AB->rk;
134  pastix_int_t ldabu = M;
135  pastix_int_t ldabv = (transV == PastixNoTrans) ? AB->rkmax : N;
136  pastix_fixdbl_t total_flops = 0.;
137  pastix_fixdbl_t flops = 0.;
138 
139  assert( (C->rk >= 0) && (C->rk <= C->rkmax) );
140 
141  /*
142  * The rank is too big, we need to uncompress/compress C
143  */
144  if ( (C->rk + rAB) > rklimit )
145  {
146  pastix_complex64_t *Cfr, *Coff;
147  int allocated = 0;
148  if ( (Cfr = core_zlrmm_getws( params, Cm * Cn )) == NULL ) {
149  Cfr = malloc( Cm * Cn * sizeof(pastix_complex64_t) );
150  allocated = 1;
151  }
152  Coff = Cfr + Cm * offy + offx;
153 
154  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
155  cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
156  Cm, Cn, C->rk,
157  CBLAS_SADDR(zone), C->u, Cm,
158  C->v, C->rkmax,
159  CBLAS_SADDR(zzero), Cfr, Cm );
160  flops = FLOPS_ZGEMM( Cm, Cn, C->rk );
161 
162  /* Add A*B */
163  if ( AB->rk == -1 ) {
164  core_zgeadd( PastixNoTrans, M, N,
165  alpha, AB->u, M,
166  beta, Coff, Cm );
167  flops += (2. * M * N);
168  }
169  else {
170  cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
171  M, N, AB->rk,
172  CBLAS_SADDR(alpha), AB->u, ldabu,
173  AB->v, ldabv,
174  CBLAS_SADDR(beta), Coff, Cm );
175  flops += FLOPS_ZGEMM( M, N, AB->rk );
176  }
177  kernel_trace_stop_lvl2( flops );
178  total_flops += flops;
179 
180  /* Try to recompress */
181  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_recompress );
182  core_zlrfree(C); // TODO: Can we give it directly to ge2lr as this
183  flops = lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, -1, Cm, Cn, Cfr, Cm, C );
184  kernel_trace_stop_lvl2_rank( flops, C->rk );
185  total_flops += flops;
186 
187  if (allocated) {
188  free(Cfr);
189  }
190  }
191  /*
192  * The rank is not too large, we perform a low-rank update
193  */
194  else {
195  total_flops += lowrank->core_rradd( lowrank, transV, &alpha,
196  M, N, AB,
197  Cm, Cn, C,
198  offx, offy );
199  }
200 
202  return total_flops;
203 }
204 
205 /**
206  *******************************************************************************
207  *
208  * @brief Perform the addition of the low-rank matrix AB into the null matrix C.
209  *
210  *******************************************************************************
211  *
212  * @param[inout] params
213  * The LRMM structure that stores all the parameters used in the LRMM
214  * functions family.
215  * On exit, the C matrix contains the product AB aligned with its own
216  * dimensions.
217  * @sa core_zlrmm_t
218  *
219  * @param[in] AB
220  * The low-rank structure of the AB matrix to apply to C.
221  *
222  * @param[in] transV
223  * Specify if AB->v is stored normally or transposed.
224  * - If PastixNoTrans, AB->v is stored normally for low-rank format.
225  * - If PastixTrans, AB->v is stored transposed.
226  * - If PastixConjTrans, AB->v is stored transposed, and conj() must be
227  * applied to the matrix.
228  *
229  * @param[in] infomask
230  * Mask of informations returned by the core_zxx2lr() functions.
231  * If CORE_LRMM_ORTHOU is set, then AB.u is orthogonal, otherwise an
232  * orthogonalization step is added before adding it to C.
233  *
234  *******************************************************************************
235  *
236  * @return The number of flops required to perform the operation.
237  *
238  *******************************************************************************/
239 static inline pastix_fixdbl_t
241  const pastix_lrblock_t *AB,
242  pastix_trans_t transV,
243  int infomask )
244 {
245  PASTE_CORE_ZLRMM_PARAMS( params );
246  pastix_int_t rklimit = core_get_rklimit( Cm, Cn );
247  pastix_int_t ldabu = M;
248  pastix_int_t ldabv = (transV == PastixNoTrans) ? AB->rkmax : N;
249  pastix_fixdbl_t total_flops = 0.;
250  pastix_fixdbl_t flops;
251  int allocated = 0;
253 
254  assert( C->rk == 0 );
255 
256  if ( AB->rk > rklimit ) {
257  pastix_complex64_t *Cfr, *Coff;
258  if ( (Cfr = core_zlrmm_getws( params, Cm * Cn )) == NULL ) {
259  Cfr = malloc( Cm * Cn * sizeof(pastix_complex64_t) );
260  allocated = 1;
261  }
262  Coff = Cfr + Cm * offy + offx;
263 
264  /* Set to 0 if contribution smaller than C */
265  if ( (M != Cm) || (N != Cn) ) {
266  memset( Cfr, 0, Cm * Cn * sizeof(pastix_complex64_t) );
267  }
268 
269  /* Uncompress the AB product into C */
270  flops = FLOPS_ZGEMM( M, N, AB->rk );
271  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
272  cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
273  M, N, AB->rk,
274  CBLAS_SADDR(alpha), AB->u, ldabu,
275  AB->v, ldabv,
276  CBLAS_SADDR(beta), Coff, Cm );
277  kernel_trace_stop_lvl2( flops );
278  total_flops += flops;
279 
280  /* Try to recompress C */
281  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_recompress );
282  flops = lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, -1, Cm, Cn, Cfr, Cm, C );
283  kernel_trace_stop_lvl2_rank( flops, C->rk );
284  total_flops += flops;
285 
286  if ( allocated ) {
287  free( work );
288  }
289  }
290  else {
291  /*
292  * Let's chech that AB->u is orthogonal before copying it to C.u
293  */
294  int orthou = infomask & PASTIX_LRM3_ORTHOU;
295  if ( !orthou ) {
296  pastix_complex64_t *ABfr;
297  pastix_lrblock_t backup;
298  int allocated = 0;
299 
300  kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_orthou );
301 
302  if ( AB->rk > 0 ) {
303  if ( (ABfr = core_zlrmm_getws( params, M * N )) == NULL ) {
304  ABfr = malloc( M * N * sizeof(pastix_complex64_t) );
305  allocated = 1;
306  }
307 
308  cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
309  M, N, AB->rk,
310  CBLAS_SADDR(zone), AB->u, ldabu,
311  AB->v, ldabv,
312  CBLAS_SADDR(zzero), ABfr, M );
313  flops = FLOPS_ZGEMM( M, N, AB->rk );
314  }
315  else {
316  ABfr = AB->u;
317  flops = 0.0;
318  }
319 
320  flops += lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, rklimit,
321  M, N, ABfr, M, &backup );
322 
323  core_zlrcpy( lowrank, PastixNoTrans, alpha,
324  M, N, &backup, Cm, Cn, C,
325  offx, offy );
326 
327  kernel_trace_stop_lvl2( flops );
328  core_zlrfree( &backup );
329  total_flops += flops;
330 
331  if ( allocated ) {
332  free( ABfr );
333  }
334  }
335  /*
336  * AB->u is orthogonal, we directly copy AB->u into C
337  */
338  else {
339  core_zlrcpy( lowrank, transV, alpha,
340  M, N, AB, Cm, Cn, C,
341  offx, offy );
342  }
343  }
344 
345  return total_flops;
346 }
347 
348 /**
349  *******************************************************************************
350  *
351  * @brief Perform the addition of two low-rank matrices.
352  *
353  *******************************************************************************
354  *
355  * @param[inout] params
356  * The LRMM structure that stores all the parameters used in the LRMM
357  * functions family.
358  * On exit, the C matrix contains the addition of C and A aligned with its own
359  * dimensions.
360  * @sa core_zlrmm_t
361  *
362  * @param[in] A
363  * The low-rank structure of the A matrix to add to C.
364  *
365  * @param[in] transV
366  * Specify if A->v is stored normally or transposed.
367  * - If PastixNoTrans, AB->v is stored normally for low-rank format.
368  * - If PastixTrans, AB->v is stored transposed.
369  * - If PastixConjTrans, AB->v is stored transposed, and conj() must be
370  * applied to the matrix.
371  *
372  * @param[in] infomask
373  * Mask of informations returned by the core_zxx2lr() functions.
374  * If CORE_LRMM_ORTHOU is set, then A.u is orthogonal, otherwise an
375  * orthogonalization step is added before adding it to C.
376  *
377  *******************************************************************************
378  *
379  * @return The number of flops required to perform the operation.
380  *
381  *******************************************************************************/
384  const pastix_lrblock_t *A,
385  pastix_trans_t transV,
386  int infomask )
387 {
388  pastix_lrblock_t *C = params->C;
389  pastix_fixdbl_t flops = 0.;
390 
391  if ( A->rk != 0 ) {
392  pastix_atomic_lock( params->lock );
393  switch ( C->rk ) {
394  case -1:
395  /*
396  * C became full rank
397  */
398  flops = core_zlr2fr( params, A, transV );
399  break;
400 
401  case 0:
402  /*
403  * C is still null
404  */
405  flops = core_zlr2null( params, A, transV, infomask );
406  break;
407 
408  default:
409  /*
410  * C is low-rank of rank k
411  */
412  flops = core_zlr2lr( params, A, transV );
413  }
414  assert( C->rk <= C->rkmax);
415  pastix_atomic_unlock( params->lock );
416  }
417 
418  return flops;
419 }
static pastix_fixdbl_t core_zlr2fr(core_zlrmm_t *params, const pastix_lrblock_t *AB, pastix_trans_t transV)
Perform the addition of the low-rank matrix AB and the full-rank matrix C.
Definition: core_zlr2xx.c:60
static pastix_fixdbl_t core_zlr2null(core_zlrmm_t *params, const pastix_lrblock_t *AB, pastix_trans_t transV, int infomask)
Perform the addition of the low-rank matrix AB into the null matrix C.
Definition: core_zlr2xx.c:240
static pastix_fixdbl_t core_zlr2lr(core_zlrmm_t *params, const pastix_lrblock_t *AB, pastix_trans_t transV)
Perform the addition of the low-rank matrix AB and the low-rank matrix C.
Definition: core_zlr2xx.c:127
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
int core_zgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, pastix_complex64_t alpha, const pastix_complex64_t *A, pastix_int_t LDA, pastix_complex64_t beta, pastix_complex64_t *B, pastix_int_t LDB)
Add two matrices together.
Definition: core_zgeadd.c:78
pastix_atomic_lock_t * lock
pastix_lrblock_t * C
#define PASTE_CORE_ZLRMM_PARAMS(_a_)
Initialize all the parameters of the core_zlrmm family functions to ease the access.
static pastix_complex64_t * core_zlrmm_getws(core_zlrmm_t *params, ssize_t newsize)
Function to get a workspace pointer if space is available in the one provided.
#define PASTE_CORE_ZLRMM_VOID
Void all the parameters of the core_zlrmm family functions to silent warnings.
pastix_fixdbl_t core_zlradd(core_zlrmm_t *params, const pastix_lrblock_t *A, pastix_trans_t transV, int infomask)
Perform the addition of two low-rank matrices.
Definition: core_zlr2xx.c:383
Structure to store all the parameters of the core_zlrmm family functions.
pastix_int_t(* core_get_rklimit)(pastix_int_t, pastix_int_t)
Compute the maximal rank accepted for a given matrix size. The pointer is set according to the low-ra...
Definition: kernels_trace.c:46
#define PASTIX_LRM3_ORTHOU
Macro to specify if the U part of a low-rank matrix is orthogonal or not (Used in LRMM functions).
The block low-rank structure to hold a matrix in low-rank form.
void core_zlrcpy(const pastix_lr_t *lowrank, pastix_trans_t transAv, pastix_complex64_t alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Copy a small low-rank structure into a large one.
void core_zlrfree(pastix_lrblock_t *A)
Free a low-rank matrix.
enum pastix_trans_e pastix_trans_t
Transpostion.
@ PastixNoTrans
Definition: api.h:445