PaStiX Handbook  6.4.0
cpucblk_zadd.c
Go to the documentation of this file.
1 /**
2  *
3  * @file cpucblk_zadd.c
4  *
5  * Precision dependent routines to add different cblks.
6  *
7  * @copyright 2015-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Pierre Ramet
12  * @author Mathieu Faverge
13  * @author Tony Delarue
14  * @author Alycia Lisito
15  * @author Nolan Bredel
16  * @date 2024-07-05
17  *
18  * @generated from /builds/solverstack/pastix/kernels/cpucblk_zadd.c, normal z -> z, Thu Aug 29 14:20:22 2024
19  *
20  **/
21 #include "common/common.h"
22 #include "blend/solver.h"
23 #include "kernels_trace.h"
24 #include "pastix_zcores.h"
25 #include "pastix_zlrcores.h"
26 
27 /**
28  *******************************************************************************
29  *
30  * @brief Add a column blok in full rank format to a column blok in low rank
31  * format.
32  *
33  * The second cblk is overwritten by the sum of the two column blocks.
34  * B <- alpha * A + B
35  *
36  *******************************************************************************
37  *
38  * @param[in] alpha
39  * The scalar alpha
40  *
41  * @param[in] cblkA
42  * The column block of the A matrix.
43  *
44  * @param[inout] cblkB
45  * The column block of the B matrix
46  * On exit, cblkB coefficient arrays are overwritten by the result of
47  * alpha * A + B.
48  *
49  * @param[inout] A
50  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
51  * coefficients of the panel when the Lower part is computed,
52  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
53  *
54  * @param[in] lrB
55  * Pointer to the low-rank representation of the column block B.
56  * Must be followed by the low-rank representation of the following blocks.
57  *
58  * @param[in] work
59  * Temporary memory buffer.
60  *
61  * @param[in] lwork
62  * Temporary workspace dimension.
63  *
64  * @param[in] lowrank
65  * The structure with low-rank parameters.
66  *
67  *******************************************************************************
68  *
69  * @return The number of flops of the operation.
70  *
71  *******************************************************************************/
72 static inline pastix_fixdbl_t
73 cpucblk_zadd_frlr( pastix_complex64_t alpha,
74  const SolverCblk *cblkA,
75  SolverCblk *cblkB,
76  const pastix_complex64_t *A,
77  pastix_lrblock_t *lrB,
78  pastix_complex64_t *work,
79  pastix_int_t lwork,
80  const pastix_lr_t *lowrank )
81 {
82  const SolverBlok *blokA = cblkA->fblokptr;
83  const SolverBlok *blokB = cblkB->fblokptr;
84  const SolverBlok *lblokA = cblkA[1].fblokptr;
85  const SolverBlok *lblokB = cblkB[1].fblokptr;
86  pastix_fixdbl_t flops = 0.;
87  core_zlrmm_t params;
88  pastix_lrblock_t lrA;
89 
90  assert( !(cblkA->cblktype & CBLK_COMPRESSED) );
91  assert( cblkB->cblktype & CBLK_COMPRESSED );
92  assert( cblkA->cblktype & CBLK_LAYOUT_2D );
93 
94  assert( A != NULL );
95 
96  params.lowrank = lowrank;
97  params.transA = PastixNoTrans; /* Unused */
98  params.transB = PastixNoTrans; /* Unused */
99  params.K = -1; /* Unused */
100  params.alpha = alpha;
101  params.A = NULL; /* Unused */
102  params.B = NULL; /* Unused */
103  params.beta = 1.0;
104  params.work = work;
105  params.lwork = lwork;
106  params.lwused = 0;
107  params.lock = &(cblkB->lock);
108 
109  /* Dimensions on N */
110  params.N = cblk_colnbr( cblkA );
111  params.Cn = cblk_colnbr( cblkB );
112  params.offy = cblkA->fcolnum - cblkB->fcolnum;
113 
114  lrA.rk = -1;
115  lrA.v = NULL;
116 
117  for (; blokA < lblokA; blokA++) {
118 
119  /* Find facing bloknum */
120  while ( !is_block_inside_fblock( blokA, blokB ) && (blokB < lblokB) ) {
121  blokB++; lrB++;
122  }
123 
124  assert( is_block_inside_fblock( blokA, blokB ) && (blokB <= lblokB) );
125 
126  lrA.u = (pastix_complex64_t*)A + blokA->coefind;
127  lrA.rkmax = blok_rownbr( blokA );
128 
129  /* Dimensions on M */
130  params.M = blok_rownbr( blokA );
131  params.Cm = blok_rownbr( blokB );
132  params.offx = blokA->frownum - blokB->frownum;
133  params.C = lrB;
134 
135  flops += core_zlradd( &params, &lrA,
136  PastixNoTrans, 0 );
137  }
138  return flops;
139 }
140 
141 /**
142  *******************************************************************************
143  *
144  * @brief Add two column bloks in low rank format.
145  *
146  * The second cblk is overwritten by the sum of the two column blocks.
147  * B <- alpha * A + B
148  *
149  *******************************************************************************
150  *
151  * @param[in] alpha
152  * The scalar alpha
153  *
154  * @param[in] cblkA
155  * The column block of the A matrix.
156  *
157  * @param[inout] cblkB
158  * The column block of the B matrix
159  * On exit, cblkB coefficient arrays are overwritten by the result of
160  * alpha * A + B.
161  *
162  * @param[in] lrA
163  * Pointer to the low-rank representation of the column block A.
164  * Must be followed by the low-rank representation of the following blocks.
165  *
166  * @param[in] lrB
167  * Pointer to the low-rank representation of the column block B.
168  * Must be followed by the low-rank representation of the following blocks.
169  *
170  * @param[in] work
171  * Temporary memory buffer.
172  *
173  * @param[in] lwork
174  * Temporary workspace dimension.
175  *
176  * @param[in] lowrank
177  * The structure with low-rank parameters.
178  *
179  *******************************************************************************
180  *
181  * @return The number of flops of the operation.
182  *
183  *******************************************************************************/
184 static inline pastix_fixdbl_t
185 cpucblk_zadd_lrlr( pastix_complex64_t alpha,
186  const SolverCblk *cblkA,
187  SolverCblk *cblkB,
188  const pastix_lrblock_t *lrA,
189  pastix_lrblock_t *lrB,
190  pastix_complex64_t *work,
191  pastix_int_t lwork,
192  const pastix_lr_t *lowrank )
193 {
194  const SolverBlok *blokA = cblkA->fblokptr;
195  const SolverBlok *blokB = cblkB->fblokptr;
196  const SolverBlok *lblokA = cblkA[1].fblokptr;
197  const SolverBlok *lblokB = cblkB[1].fblokptr;
198  pastix_fixdbl_t flops = 0.;
199  core_zlrmm_t params;
200 
201  assert( (cblkA->cblktype & CBLK_COMPRESSED) );
202  assert( (cblkB->cblktype & CBLK_COMPRESSED) );
203 
204  params.lowrank = lowrank;
205  params.transA = PastixNoTrans; /* Unused */
206  params.transB = PastixNoTrans; /* Unused */
207  params.K = -1; /* Unused */
208  params.alpha = alpha;
209  params.A = NULL; /* Unused */
210  params.B = NULL; /* Unused */
211  params.beta = 1.0;
212  params.work = work;
213  params.lwork = lwork;
214  params.lwused = 0;
215  params.lock = &(cblkB->lock);
216 
217  /* Dimensions on N */
218  params.N = cblk_colnbr( cblkA );
219  params.Cn = cblk_colnbr( cblkB );
220  params.offy = cblkA->fcolnum - cblkB->fcolnum;
221 
222  for (; blokA < lblokA; blokA++, lrA++) {
223 
224  /* Find facing bloknum */
225  while ( !is_block_inside_fblock( blokA, blokB ) && (blokB < lblokB) ) {
226  blokB++; lrB++;
227  }
228 
229  assert( is_block_inside_fblock( blokA, blokB ) && (blokB <= lblokB) );
230 
231  /* Dimensions on M */
232  params.M = blok_rownbr( blokA );
233  params.Cm = blok_rownbr( blokB );
234  params.offx = blokA->frownum - blokB->frownum;
235  params.C = lrB;
236  flops += core_zlradd( &params, lrA, PastixNoTrans, PASTIX_LRM3_ORTHOU );
237  }
238  return flops;
239 }
240 
241 /**
242  *******************************************************************************
243  *
244  * @brief Add two column bloks in full rank format.
245  *
246  * The second cblk is overwritten by the sum of the two column blocks.
247  * B <- alpha * A + B
248  *
249  *******************************************************************************
250  *
251  * @param[in] alpha
252  * The scalar alpha
253  *
254  * @param[in] cblkA
255  * The column block of the A matrix.
256  *
257  * @param[inout] cblkB
258  * The column block of the B matrix
259  * On exit, cblkB coefficient arrays are overwritten by the result of
260  * alpha * A + B.
261  *
262  * @param[inout] A
263  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
264  * coefficients of the panel when the Lower part is computed,
265  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
266  *
267  * @param[inout] B
268  * The pointer to the coeftab of the cblk.lcoeftab matrix storing
269  * the coefficients of the panel, if Symmetric/Hermitian cases or if
270  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
271  * cblk.stride -by- cblk.width
272  *
273  *******************************************************************************
274  *
275  * @return The number of flops of the operation.
276  *
277  *******************************************************************************/
278 static inline pastix_fixdbl_t
279 cpucblk_zadd_frfr( pastix_complex64_t alpha,
280  const SolverCblk *cblkA,
281  SolverCblk *cblkB,
282  const pastix_complex64_t *A,
283  pastix_complex64_t *B )
284 {
285  pastix_int_t n = cblk_colnbr( cblkA );
286  pastix_int_t m = cblkA->stride;
287  pastix_fixdbl_t flops = m * n;
288 
289  assert( !(cblkA->cblktype & CBLK_COMPRESSED) );
290  assert( !(cblkB->cblktype & CBLK_COMPRESSED) );
291 
292  assert( (A != NULL) && (B != NULL) );
293 
294  /* If the cblk matches */
295  if ( (n == cblk_colnbr( cblkB )) &&
296  (m == cblkB->stride) ) {
297 
298  pastix_cblk_lock( cblkB );
299  core_zgeadd( PastixNoTrans, m, n,
300  alpha, A, m,
301  1., B, m );
302  pastix_cblk_unlock( cblkB );
303  }
304  else {
305  const pastix_complex64_t *bA;
306  pastix_complex64_t *bB;
307  const SolverBlok *blokA = cblkA->fblokptr;
308  const SolverBlok *blokB = cblkB->fblokptr;
309  const SolverBlok *lblokA = cblkA[1].fblokptr;
310  const SolverBlok *lblokB = cblkB[1].fblokptr;
311  pastix_int_t lda, ldb;
312 
313  /* Both cblk A and B must be stored in 2D */
314  assert( cblkA->cblktype & CBLK_LAYOUT_2D );
315  assert( cblkB->cblktype & CBLK_LAYOUT_2D );
316 
317  for (; blokA < lblokA; blokA++) {
318 
319  /* Find facing bloknum */
320  while ( !is_block_inside_fblock( blokA, blokB ) && (blokB < lblokB) ) {
321  blokB++;
322  }
323 
324  assert( is_block_inside_fblock( blokA, blokB ) && (blokB <= lblokB) );
325 
326  bA = A + blokA->coefind;
327  bB = B + blokB->coefind;
328  lda = blok_rownbr( blokA );
329  ldb = blok_rownbr( blokB );
330 
331  bB = bB + ldb * ( cblkA->fcolnum - cblkB->fcolnum ) + ( blokA->frownum - blokB->frownum );
332  m = lda;
333 
334  pastix_cblk_lock( cblkB );
335  core_zgeadd( PastixNoTrans, m, n,
336  alpha, bA, lda,
337  1., bB, ldb );
338  pastix_cblk_unlock( cblkB );
339  }
340  }
341  return flops;
342 }
343 
344 /**
345  *******************************************************************************
346  *
347  * @brief Add two column bloks in full rank format.
348  *
349  * The second cblk is overwritten by the sum of the two column blocks.
350  * B <- alpha * A + B
351  *
352  *******************************************************************************
353  *
354  * @param[in] alpha
355  * The scalar alpha
356  *
357  * @param[in] cblkA
358  * The column block of the A matrix.
359  *
360  * @param[inout] cblkB
361  * The column block of the B matrix
362  * On exit, cblkB coefficient arrays are overwritten by the result of
363  * alpha * A + B.
364  *
365  * @param[inout] A
366  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
367  * coefficients of the panel when the Lower part is computed,
368  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
369  *
370  * @param[in] B
371  * The pointer to the coeftab of the cblk.lcoeftab matrix storing
372  * the coefficients of the panel, if Symmetric/Hermitian cases or if
373  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
374  * cblk.stride -by- cblk.width
375  *
376  * @param[in] work
377  * Temporary memory buffer.
378  *
379  * @param[in] lwork
380  * Temporary workspace dimension.
381  *
382  * @param[in] lowrank
383  * The structure with low-rank parameters.
384  *
385  *******************************************************************************
386  *
387  * @return The number of flops of the operation.
388  *
389  *******************************************************************************/
391 cpucblk_zadd( pastix_complex64_t alpha,
392  const SolverCblk *cblkA,
393  SolverCblk *cblkB,
394  const void *A,
395  void *B,
396  pastix_complex64_t *work,
397  pastix_int_t lwork,
398  const pastix_lr_t *lowrank )
399 {
401  pastix_fixdbl_t time, flops = 0.0;
402  pastix_int_t m = cblkA->stride;
403  pastix_int_t n = cblk_colnbr( cblkA );
404 
405  if ( cblkB->cblktype & CBLK_COMPRESSED ) {
406  if ( cblkA->cblktype & CBLK_COMPRESSED ) {
408  time = kernel_trace_start( ktype );
409  flops = cpucblk_zadd_lrlr( alpha, cblkA, cblkB,
410  A, B, work, lwork, lowrank );
411  }
412  else {
414  time = kernel_trace_start( ktype );
415  flops = cpucblk_zadd_frlr( alpha, cblkA, cblkB,
416  A, B, work, lwork, lowrank );
417  }
418  }
419  else {
420  if ( cblkA->cblktype & CBLK_COMPRESSED ) {
421  assert(0); /* We do not add a compressed cblk to a non compressed cblk */
422  return 0.; /* Avoids compilation and coverity warning */
423  }
424  else {
426  time = kernel_trace_start( ktype );
427  flops = cpucblk_zadd_frfr( alpha, cblkA, cblkB, A, B );
428  }
429  }
430 
431  kernel_trace_stop( cblkB->fblokptr->inlast, ktype, m, n, 0, flops, time );
432  return flops;
433 }
434 
static pastix_fixdbl_t cpucblk_zadd_frfr(pastix_complex64_t alpha, const SolverCblk *cblkA, SolverCblk *cblkB, const pastix_complex64_t *A, pastix_complex64_t *B)
Add two column bloks in full rank format.
Definition: cpucblk_zadd.c:279
static pastix_fixdbl_t cpucblk_zadd_lrlr(pastix_complex64_t alpha, const SolverCblk *cblkA, SolverCblk *cblkB, const pastix_lrblock_t *lrA, pastix_lrblock_t *lrB, pastix_complex64_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add two column bloks in low rank format.
Definition: cpucblk_zadd.c:185
static pastix_fixdbl_t cpucblk_zadd_frlr(pastix_complex64_t alpha, const SolverCblk *cblkA, SolverCblk *cblkB, const pastix_complex64_t *A, pastix_lrblock_t *lrB, pastix_complex64_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add a column blok in full rank format to a column blok in low rank format.
Definition: cpucblk_zadd.c:73
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
enum pastix_ktype_e pastix_ktype_t
List of the Level 1 events that may be traced in PaStiX.
static void kernel_trace_stop(int8_t inlast, pastix_ktype_t ktype, int m, int n, int k, double flops, double starttime)
Stop the trace of a single kernel.
static double kernel_trace_start(pastix_ktype_t ktype)
Start the trace of a single kernel.
Definition: kernels_trace.h:87
@ PastixKernelGEADDCblkFRFR
Definition: kernels_enums.h:69
@ PastixKernelGEADDCblkLRLR
Definition: kernels_enums.h:71
@ PastixKernelGEADDCblkFRLR
Definition: kernels_enums.h:70
int core_zgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, pastix_complex64_t alpha, const pastix_complex64_t *A, pastix_int_t LDA, pastix_complex64_t beta, pastix_complex64_t *B, pastix_int_t LDB)
Add two matrices together.
Definition: core_zgeadd.c:78
pastix_fixdbl_t cpucblk_zadd(pastix_complex64_t alpha, const SolverCblk *cblkA, SolverCblk *cblkB, const void *A, void *B, pastix_complex64_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add two column bloks in full rank format.
Definition: cpucblk_zadd.c:391
const pastix_lrblock_t * A
pastix_complex64_t alpha
pastix_trans_t transB
pastix_atomic_lock_t * lock
const pastix_lrblock_t * B
const pastix_lr_t * lowrank
pastix_int_t M
pastix_int_t offy
pastix_int_t lwused
pastix_trans_t transA
pastix_int_t Cm
pastix_int_t lwork
pastix_int_t N
pastix_lrblock_t * C
pastix_complex64_t beta
pastix_int_t K
pastix_int_t offx
pastix_complex64_t * work
pastix_int_t Cn
pastix_fixdbl_t core_zlradd(core_zlrmm_t *params, const pastix_lrblock_t *A, pastix_trans_t transV, int infomask)
Perform the addition of two low-rank matrices.
Definition: core_zlr2xx.c:383
Structure to store all the parameters of the core_zlrmm family functions.
#define PASTIX_LRM3_ORTHOU
Macro to specify if the U part of a low-rank matrix is orthogonal or not (Used in LRMM functions).
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
@ PastixNoTrans
Definition: api.h:445
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition: solver.h:395
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition: solver.h:329
static int is_block_inside_fblock(const SolverBlok *blok, const SolverBlok *fblok)
Check if a block is included inside another one.
Definition: solver.h:504
pastix_int_t frownum
Definition: solver.h:147
pastix_atomic_lock_t lock
Definition: solver.h:162
pastix_int_t coefind
Definition: solver.h:149
SolverBlok * fblokptr
Definition: solver.h:168
int8_t inlast
Definition: solver.h:151
pastix_int_t stride
Definition: solver.h:169
int8_t cblktype
Definition: solver.h:164
pastix_int_t fcolnum
Definition: solver.h:166
Solver block structure.
Definition: solver.h:141
Solver column block structure.
Definition: solver.h:161