PaStiX Handbook  6.4.0
cpublok_sadd.c
Go to the documentation of this file.
1 /**
2  *
3  * @file cpublok_sadd.c
4  *
5  * Precision dependent routines to add different bloks.
6  *
7  * @copyright 2015-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Mathieu Faverge
12  * @author Alycia Lisito
13  * @date 2024-07-05
14  *
15  * @generated from /builds/solverstack/pastix/kernels/cpublok_zadd.c, normal z -> s, Thu Aug 29 14:20:21 2024
16  *
17  **/
18 #include "common/common.h"
19 #include "blend/solver.h"
20 #include "kernels_trace.h"
21 #include "pastix_scores.h"
22 #include "pastix_slrcores.h"
23 
24 /**
25  *******************************************************************************
26  *
27  * @brief Add a blok in full rank format to a blok in low rank format.
28  *
29  * The second cblk is overwritten by the sum of the two blocks.
30  * B <- alpha * A + B
31  *
32  *******************************************************************************
33  *
34  * @param[in] alpha
35  * The scalar alpha
36  *
37  * @param[in] cblkA
38  * The column block of the A matrix.
39  *
40  * @param[inout] cblkB
41  * The column block of the B matrix
42  * On exit, cblkB coefficient arrays are overwritten by the result of
43  * alpha * A + B.
44  *
45  * @param[in] blokA_m
46  * Index of the first off-diagonal block in cblk, that is used for A.
47  *
48  * @param[in] blokB_m
49  * Index of the first off-diagonal block in cblk, that is used for B.
50  *
51  * @param[inout] A
52  * The pointer to the coeftab of the blok.lcoeftab matrix storing the
53  * coefficients of the panel when the Lower part is computed,
54  * blok.ucoeftab otherwise. Must be of size blok.stride -by- blok.width
55  *
56  * @param[in] lrB
57  * Pointer to the low-rank representation of the column block B.
58  * Must be followed by the low-rank representation of the following blocks.
59  *
60  * @param[in] work
61  * Temporary memory buffer.
62  *
63  * @param[in] lwork
64  * Temporary workspace dimension.
65  *
66  * @param[in] lowrank
67  * The structure with low-rank parameters.
68  *
69  *******************************************************************************
70  *
71  * @return The number of flops of the operation.
72  *
73  *******************************************************************************/
74 static inline pastix_fixdbl_t
75 cpublok_sadd_frlr( float alpha,
76  const SolverCblk *cblkA,
77  SolverCblk *cblkB,
78  pastix_int_t blokA_m,
79  pastix_int_t blokB_m,
80  const float *A,
81  pastix_lrblock_t *lrB,
82  float *work,
83  pastix_int_t lwork,
84  const pastix_lr_t *lowrank )
85 {
86  /*
87  * We can start both blocks at the m^th in the cblk, as the cblkB always
88  * holds at least as many blocks as A
89  */
90  const SolverBlok *blokA = cblkA->fblokptr + blokA_m;
91  const SolverBlok *blokB = cblkB->fblokptr + blokB_m;
92  const SolverBlok *lblokA = cblkA[1].fblokptr;
93  const SolverBlok *lblokB = cblkB[1].fblokptr;
94  pastix_fixdbl_t flops = 0.;
95  core_slrmm_t params;
96  pastix_lrblock_t lrA;
97  size_t offsetA;
98 
99  assert( !(cblkA->cblktype & CBLK_COMPRESSED) );
100  assert( cblkB->cblktype & CBLK_COMPRESSED );
101  assert( cblkA->cblktype & CBLK_LAYOUT_2D );
102 
103  assert( A != NULL );
104 
105  params.lowrank = lowrank;
106  params.transA = PastixNoTrans; /* Unused */
107  params.transB = PastixNoTrans; /* Unused */
108  params.K = -1; /* Unused */
109  params.alpha = alpha;
110  params.A = NULL; /* Unused */
111  params.B = NULL; /* Unused */
112  params.beta = 1.0;
113  params.work = work;
114  params.lwork = lwork;
115  params.lwused = 0;
116  params.lock = &(cblkB->lock);
117 
118  /* Dimensions on N */
119  params.N = cblk_colnbr( cblkA );
120  params.Cn = cblk_colnbr( cblkB );
121  params.offy = cblkA->fcolnum - cblkB->fcolnum;
122 
123  lrA.rk = -1;
124  lrA.v = NULL;
125 
126  offsetA = blokA->coefind;
127  do {
128  /* Find facing bloknum */
129  while ( !is_block_inside_fblock( blokA, blokB ) && (blokB < lblokB) ) {
130  blokB++; lrB++;
131  }
132  assert( blokA->fcblknm == blokB->fcblknm );
133  assert( is_block_inside_fblock( blokA, blokB ) );
134  assert( blokB <= lblokB );
135 
136  lrA.u = (float*)A + blokA->coefind - offsetA;
137  lrA.rkmax = blok_rownbr( blokA );
138 
139  /* Dimensions on M */
140  params.M = blok_rownbr( blokA );
141  params.Cm = blok_rownbr( blokB );
142  params.offx = blokA->frownum - blokB->frownum;
143  params.C = lrB;
144 
145  flops += core_slradd( &params, &lrA,
146  PastixNoTrans, 0 );
147  blokA++;
148  }
149  while ( ( blokA < lblokA ) &&
150  ( blokA[-1].fcblknm == blokA[0].fcblknm ) &&
151  ( blokA[-1].lcblknm == blokA[0].lcblknm ) );
152 
153  return flops;
154 }
155 
156 /**
157  *******************************************************************************
158  *
159  * @brief Add two column bloks in low rank format.
160  *
161  * The second cblk is overwritten by the sum of the two blocks.
162  * B <- alpha * A + B
163  *
164  *******************************************************************************
165  *
166  * @param[in] alpha
167  * The scalar alpha
168  *
169  * @param[in] cblkA
170  * The column block of the A matrix.
171  *
172  * @param[inout] cblkB
173  * The column block of the B matrix
174  * On exit, cblkB coefficient arrays are overwritten by the result of
175  * alpha * A + B.
176  *
177  * @param[in] blokA_m
178  * Index of the first off-diagonal block in cblk, that is used for A.
179  *
180  * @param[in] blokB_m
181  * Index of the first off-diagonal block in cblk, that is used for B.
182  *
183  * @param[in] lrA
184  * Pointer to the low-rank representation of the column block A.
185  * Must be followed by the low-rank representation of the following blocks.
186  *
187  * @param[in] lrB
188  * Pointer to the low-rank representation of the column block B.
189  * Must be followed by the low-rank representation of the following blocks.
190  *
191  * @param[in] work
192  * Temporary memory buffer.
193  *
194  * @param[in] lwork
195  * Temporary workspace dimension.
196  *
197  * @param[in] lowrank
198  * The structure with low-rank parameters.
199  *
200  *******************************************************************************
201  *
202  * @return The number of flops of the operation.
203  *
204  *******************************************************************************/
205 static inline pastix_fixdbl_t
206 cpublok_sadd_lrlr( float alpha,
207  const SolverCblk *cblkA,
208  SolverCblk *cblkB,
209  pastix_int_t blokA_m,
210  pastix_int_t blokB_m,
211  const pastix_lrblock_t *lrA,
212  pastix_lrblock_t *lrB,
213  float *work,
214  pastix_int_t lwork,
215  const pastix_lr_t *lowrank )
216 {
217  const SolverBlok *blokA = cblkA->fblokptr + blokA_m;
218  const SolverBlok *blokB = cblkB->fblokptr + blokB_m;
219  const SolverBlok *lblokA = cblkA[1].fblokptr;
220  const SolverBlok *lblokB = cblkB[1].fblokptr;
221  pastix_fixdbl_t flops = 0.;
222  core_slrmm_t params;
223 
224  assert( (cblkA->cblktype & CBLK_COMPRESSED) );
225  assert( (cblkB->cblktype & CBLK_COMPRESSED) );
226 
227  params.lowrank = lowrank;
228  params.transA = PastixNoTrans; /* Unused */
229  params.transB = PastixNoTrans; /* Unused */
230  params.K = -1; /* Unused */
231  params.alpha = alpha;
232  params.A = NULL; /* Unused */
233  params.B = NULL; /* Unused */
234  params.beta = 1.0;
235  params.work = work;
236  params.lwork = lwork;
237  params.lwused = 0;
238  params.lock = &(cblkB->lock);
239 
240  /* Dimensions on N */
241  params.N = cblk_colnbr( cblkA );
242  params.Cn = cblk_colnbr( cblkB );
243  params.offy = cblkA->fcolnum - cblkB->fcolnum;
244 
245  do {
246  /* Find facing bloknum */
247  while ( !is_block_inside_fblock( blokA, blokB ) && (blokB < lblokB) ) {
248  blokB++; lrB++;
249  }
250  assert( blokA->fcblknm == blokB->fcblknm );
251  assert( is_block_inside_fblock( blokA, blokB ) );
252  assert( blokB <= lblokB );
253 
254  /* Dimensions on M */
255  params.M = blok_rownbr( blokA );
256  params.Cm = blok_rownbr( blokB );
257  params.offx = blokA->frownum - blokB->frownum;
258  params.C = lrB;
259  flops += core_slradd( &params, lrA, PastixNoTrans, PASTIX_LRM3_ORTHOU );
260  blokA++;
261  lrA++;
262  }
263  while ( ( blokA < lblokA ) &&
264  ( blokA[-1].fcblknm == blokA[0].fcblknm ) &&
265  ( blokA[-1].lcblknm == blokA[0].lcblknm ) );
266 
267  return flops;
268 }
269 
270 /**
271  *******************************************************************************
272  *
273  * @brief Add two bloks in full rank format.
274  *
275  * The second cblk is overwritten by the sum of the two blocks.
276  * B <- alpha * A + B
277  *
278  *******************************************************************************
279  *
280  * @param[in] alpha
281  * The scalar alpha
282  *
283  * @param[in] cblkA
284  * The column block of the A matrix.
285  *
286  * @param[inout] cblkB
287  * The column block of the B matrix
288  * On exit, cblkB coefficient arrays are overwritten by the result of
289  * alpha * A + B.
290  *
291  * @param[in] blokA_m
292  * Index of the first off-diagonal block in cblk, that is used for A.
293  *
294  * @param[in] blokB_m
295  * Index of the first off-diagonal block in cblk, that is used for B.
296  *
297  * @param[inout] A
298  * The pointer to the coeftab of the blok.lcoeftab matrix storing the
299  * coefficients of the panel when the Lower part is computed,
300  * blok.ucoeftab otherwise. Must be of size blok.stride -by- blok.width
301  *
302  * @param[inout] B
303  * The pointer to the coeftab of the blok.lcoeftab matrix storing
304  * the coefficients of the panel, if Symmetric/Symmetric cases or if
305  * upper part is computed; blok.ucoeftab otherwise. Must be of size
306  * blok.stride -by- blok.width
307  *
308  *******************************************************************************
309  *
310  * @return The number of flops of the operation.
311  *
312  *******************************************************************************/
313 static inline pastix_fixdbl_t
314 cpublok_sadd_frfr( float alpha,
315  const SolverCblk *cblkA,
316  SolverCblk *cblkB,
317  pastix_int_t blokA_m,
318  pastix_int_t blokB_m,
319  const float *A,
320  float *B )
321 {
322  const SolverBlok *blokA = cblkA->fblokptr + blokA_m;
323  const SolverBlok *blokB = cblkB->fblokptr + blokB_m;
324  const SolverBlok *lblokA = cblkA[1].fblokptr;
325  const SolverBlok *lblokB = cblkB[1].fblokptr;
326  const float *bA = A;
327  float *bB = B;
328  pastix_int_t lda;
329  pastix_int_t ldb;
330  pastix_int_t m;
331  pastix_int_t n = cblk_colnbr( cblkA );
332  pastix_fixdbl_t flops = 0.;
333  size_t offsetA, offsetB;
334 
335  assert( !(cblkA->cblktype & CBLK_COMPRESSED) );
336  assert( !(cblkB->cblktype & CBLK_COMPRESSED) );
337 
338  /* Both cblk A and B must be stored in 2D */
339  assert( cblkA->cblktype & CBLK_LAYOUT_2D );
340  assert( cblkB->cblktype & CBLK_LAYOUT_2D );
341 
342  offsetA = blokA->coefind;
343  offsetB = blokB->coefind;
344 
345  /* Adds blocks facing the same cblk */
346  do {
347  /* Find facing bloknum */
348  while ( !is_block_inside_fblock( blokA, blokB ) && (blokB < lblokB) ) {
349  blokB++;
350  }
351  assert( blokA->fcblknm == blokB->fcblknm );
352  assert( is_block_inside_fblock( blokA, blokB ) );
353  assert( blokB <= lblokB );
354 
355  bA = A + blokA->coefind - offsetA;
356  bB = B + blokB->coefind - offsetB;
357  lda = blok_rownbr( blokA );
358  ldb = blok_rownbr( blokB );
359 
360  bB = bB + ldb * ( cblkA->fcolnum - cblkB->fcolnum ) + ( blokA->frownum - blokB->frownum );
361  m = lda;
362 
363  pastix_cblk_lock( cblkB );
364  core_sgeadd( PastixNoTrans, m, n,
365  alpha, bA, lda,
366  1., bB, ldb );
367  pastix_cblk_unlock( cblkB );
368  flops += 2. * m * n;
369  blokA++;
370  }
371  while ( ( blokA < lblokA ) &&
372  ( blokA[-1].fcblknm == blokA[0].fcblknm ) &&
373  ( blokA[-1].lcblknm == blokA[0].lcblknm ) );
374 
375  return flops;
376 }
377 
378 /**
379  *******************************************************************************
380  *
381  * @brief Add two bloks.
382  *
383  * The second cblk is overwritten by the sum of the two blocks.
384  * B <- alpha * A + B
385  *
386  *******************************************************************************
387  *
388  * @param[in] alpha
389  * The scalar alpha
390  *
391  * @param[in] cblkA
392  * The column block of the A matrix.
393  *
394  * @param[inout] cblkB
395  * The column block of the B matrix
396  * On exit, cblkB coefficient arrays are overwritten by the result of
397  * alpha * A + B.
398  *
399  * @param[in] blokA_m
400  * Index of the first off-diagonal block in cblk, that is used for A.
401  *
402  * @param[in] blokB_m
403  * Index of the first off-diagonal block in cblk, that is used for B.
404  *
405  * @param[inout] A
406  * The pointer to the coeftab of the blok.lcoeftab matrix storing the
407  * coefficients of the panel when the Lower part is computed,
408  * blok.ucoeftab otherwise. Must be of size blok.stride -by- blok.width
409  *
410  * @param[inout] B
411  * The pointer to the coeftab of the blok.lcoeftab matrix storing
412  * the coefficients of the panel, if Symmetric/Symmetric cases or if
413  * upper part is computed; blok.ucoeftab otherwise. Must be of size
414  * blok.stride -by- blok.width
415  *
416  * @param[in] work
417  * Temporary memory buffer.
418  *
419  * @param[in] lwork
420  * Temporary workspace dimension.
421  *
422  * @param[in] lowrank
423  * The structure with low-rank parameters.
424  *
425  *******************************************************************************
426  *
427  * @return The number of flops of the operation.
428  *
429  *******************************************************************************/
431 cpublok_sadd( float alpha,
432  const SolverCblk *cblkA,
433  SolverCblk *cblkB,
434  pastix_int_t blokA_m,
435  pastix_int_t blokB_m,
436  const void *A,
437  void *B,
438  float *work,
439  pastix_int_t lwork,
440  const pastix_lr_t *lowrank )
441 {
443  pastix_fixdbl_t time, flops = 0.0;
444  pastix_int_t m = cblkA->stride;
445  pastix_int_t n = cblk_colnbr( cblkA );
446 
447  if ( cblkB->cblktype & CBLK_COMPRESSED ) {
448  if ( cblkA->cblktype & CBLK_COMPRESSED ) {
450  time = kernel_trace_start( ktype );
451  flops = cpublok_sadd_lrlr( alpha, cblkA, cblkB, blokA_m, blokB_m,
452  A, B, work, lwork, lowrank );
453  }
454  else {
456  time = kernel_trace_start( ktype );
457  flops = cpublok_sadd_frlr( alpha, cblkA, cblkB, blokA_m, blokB_m,
458  A, B, work, lwork, lowrank );
459  }
460  }
461  else {
462  if ( cblkA->cblktype & CBLK_COMPRESSED ) {
463  assert(0); /* We do not add a compressed cblk to a non compressed cblk */
464  return 0.; /* Avoids compilation and coverity warning */
465  }
466  else {
468  time = kernel_trace_start( ktype );
469  flops = cpublok_sadd_frfr( alpha, cblkA, cblkB, blokA_m, blokB_m, A, B );
470  }
471  }
472 
473  kernel_trace_stop( cblkB->fblokptr->inlast, ktype, m, n, 0, flops, time );
474  return flops;
475 }
static pastix_fixdbl_t cpublok_sadd_lrlr(float alpha, const SolverCblk *cblkA, SolverCblk *cblkB, pastix_int_t blokA_m, pastix_int_t blokB_m, const pastix_lrblock_t *lrA, pastix_lrblock_t *lrB, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add two column bloks in low rank format.
Definition: cpublok_sadd.c:206
static pastix_fixdbl_t cpublok_sadd_frfr(float alpha, const SolverCblk *cblkA, SolverCblk *cblkB, pastix_int_t blokA_m, pastix_int_t blokB_m, const float *A, float *B)
Add two bloks in full rank format.
Definition: cpublok_sadd.c:314
static pastix_fixdbl_t cpublok_sadd_frlr(float alpha, const SolverCblk *cblkA, SolverCblk *cblkB, pastix_int_t blokA_m, pastix_int_t blokB_m, const float *A, pastix_lrblock_t *lrB, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add a blok in full rank format to a blok in low rank format.
Definition: cpublok_sadd.c:75
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
enum pastix_ktype_e pastix_ktype_t
List of the Level 1 events that may be traced in PaStiX.
static void kernel_trace_stop(int8_t inlast, pastix_ktype_t ktype, int m, int n, int k, double flops, double starttime)
Stop the trace of a single kernel.
static double kernel_trace_start(pastix_ktype_t ktype)
Start the trace of a single kernel.
Definition: kernels_trace.h:87
@ PastixKernelGEADDCblkFRFR
Definition: kernels_enums.h:69
@ PastixKernelGEADDCblkLRLR
Definition: kernels_enums.h:71
@ PastixKernelGEADDCblkFRLR
Definition: kernels_enums.h:70
int core_sgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, float alpha, const float *A, pastix_int_t LDA, float beta, float *B, pastix_int_t LDB)
Add two matrices together.
Definition: core_sgeadd.c:78
pastix_fixdbl_t cpublok_sadd(float alpha, const SolverCblk *cblkA, SolverCblk *cblkB, pastix_int_t blokA_m, pastix_int_t blokB_m, const void *A, void *B, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add two bloks.
Definition: cpublok_sadd.c:431
pastix_int_t K
const pastix_lrblock_t * B
pastix_int_t lwork
pastix_int_t N
pastix_int_t M
pastix_int_t lwused
const pastix_lrblock_t * A
pastix_trans_t transB
pastix_lrblock_t * C
pastix_int_t offy
pastix_int_t offx
const pastix_lr_t * lowrank
pastix_int_t Cn
pastix_int_t Cm
pastix_trans_t transA
pastix_atomic_lock_t * lock
pastix_fixdbl_t core_slradd(core_slrmm_t *params, const pastix_lrblock_t *A, pastix_trans_t transV, int infomask)
Perform the addition of two low-rank matrices.
Definition: core_slr2xx.c:383
Structure to store all the parameters of the core_slrmm family functions.
#define PASTIX_LRM3_ORTHOU
Macro to specify if the U part of a low-rank matrix is orthogonal or not (Used in LRMM functions).
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
@ PastixNoTrans
Definition: api.h:445
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition: solver.h:395
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition: solver.h:329
pastix_int_t fcblknm
Definition: solver.h:144
static int is_block_inside_fblock(const SolverBlok *blok, const SolverBlok *fblok)
Check if a block is included inside another one.
Definition: solver.h:504
pastix_int_t frownum
Definition: solver.h:147
pastix_atomic_lock_t lock
Definition: solver.h:162
pastix_int_t coefind
Definition: solver.h:149
SolverBlok * fblokptr
Definition: solver.h:168
int8_t inlast
Definition: solver.h:151
pastix_int_t stride
Definition: solver.h:169
int8_t cblktype
Definition: solver.h:164
pastix_int_t fcolnum
Definition: solver.h:166
Solver block structure.
Definition: solver.h:141
Solver column block structure.
Definition: solver.h:161