PaStiX Handbook 6.4.0
Loading...
Searching...
No Matches
cpublok_sadd.c
Go to the documentation of this file.
1/**
2 *
3 * @file cpublok_sadd.c
4 *
5 * Precision dependent routines to add different bloks.
6 *
7 * @copyright 2015-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8 * Univ. Bordeaux. All rights reserved.
9 *
10 * @version 6.4.0
11 * @author Mathieu Faverge
12 * @author Alycia Lisito
13 * @date 2024-07-05
14 *
15 * @generated from /builds/2mk6rsew/0/solverstack/pastix/kernels/cpublok_zadd.c, normal z -> s, Tue Feb 25 14:34:58 2025
16 *
17 **/
18#include "common/common.h"
19#include "blend/solver.h"
20#include "kernels_trace.h"
21#include "pastix_scores.h"
22#include "pastix_slrcores.h"
23
24/**
25 *******************************************************************************
26 *
27 * @brief Add a blok in full rank format to a blok in low rank format.
28 *
29 * The second cblk is overwritten by the sum of the two blocks.
30 * B <- alpha * A + B
31 *
32 *******************************************************************************
33 *
34 * @param[in] alpha
35 * The scalar alpha
36 *
37 * @param[in] cblkA
38 * The column block of the A matrix.
39 *
40 * @param[inout] cblkB
41 * The column block of the B matrix
42 * On exit, cblkB coefficient arrays are overwritten by the result of
43 * alpha * A + B.
44 *
45 * @param[in] blokA_m
46 * Index of the first off-diagonal block in cblk, that is used for A.
47 *
48 * @param[in] blokB_m
49 * Index of the first off-diagonal block in cblk, that is used for B.
50 *
51 * @param[inout] A
52 * The pointer to the coeftab of the blok.lcoeftab matrix storing the
53 * coefficients of the panel when the Lower part is computed,
54 * blok.ucoeftab otherwise. Must be of size blok.stride -by- blok.width
55 *
56 * @param[in] lrB
57 * Pointer to the low-rank representation of the column block B.
58 * Must be followed by the low-rank representation of the following blocks.
59 *
60 * @param[in] work
61 * Temporary memory buffer.
62 *
63 * @param[in] lwork
64 * Temporary workspace dimension.
65 *
66 * @param[in] lowrank
67 * The structure with low-rank parameters.
68 *
69 *******************************************************************************
70 *
71 * @return The number of flops of the operation.
72 *
73 *******************************************************************************/
74static inline pastix_fixdbl_t
75cpublok_sadd_frlr( float alpha,
76 const SolverCblk *cblkA,
77 SolverCblk *cblkB,
78 pastix_int_t blokA_m,
79 pastix_int_t blokB_m,
80 const float *A,
82 float *work,
83 pastix_int_t lwork,
84 const pastix_lr_t *lowrank )
85{
86 /*
87 * We can start both blocks at the m^th in the cblk, as the cblkB always
88 * holds at least as many blocks as A
89 */
90 const SolverBlok *blokA = cblkA->fblokptr + blokA_m;
91 const SolverBlok *blokB = cblkB->fblokptr + blokB_m;
92 const SolverBlok *lblokA = cblkA[1].fblokptr;
93 const SolverBlok *lblokB = cblkB[1].fblokptr;
94 pastix_fixdbl_t flops = 0.;
95 core_slrmm_t params;
97 size_t offsetA;
98
99 assert( !(cblkA->cblktype & CBLK_COMPRESSED) );
100 assert( cblkB->cblktype & CBLK_COMPRESSED );
101 assert( cblkA->cblktype & CBLK_LAYOUT_2D );
102
103 assert( A != NULL );
104
105 params.lowrank = lowrank;
106 params.transA = PastixNoTrans; /* Unused */
107 params.transB = PastixNoTrans; /* Unused */
108 params.K = -1; /* Unused */
109 params.alpha = alpha;
110 params.A = NULL; /* Unused */
111 params.B = NULL; /* Unused */
112 params.beta = 1.0;
113 params.work = work;
114 params.lwork = lwork;
115 params.lwused = 0;
116 params.lock = &(cblkB->lock);
117
118 /* Dimensions on N */
119 params.N = cblk_colnbr( cblkA );
120 params.Cn = cblk_colnbr( cblkB );
121 params.offy = cblkA->fcolnum - cblkB->fcolnum;
122
123 lrA.rk = -1;
124 lrA.v = NULL;
125
126 offsetA = blokA->coefind;
127 do {
128 /* Find facing bloknum */
129 while ( !is_block_inside_fblock( blokA, blokB ) && (blokB < lblokB) ) {
130 blokB++; lrB++;
131 }
132 assert( blokA->fcblknm == blokB->fcblknm );
133 assert( is_block_inside_fblock( blokA, blokB ) );
134 assert( blokB <= lblokB );
135
136 lrA.u = (float*)A + blokA->coefind - offsetA;
137 lrA.rkmax = blok_rownbr( blokA );
138
139 /* Dimensions on M */
140 params.M = blok_rownbr( blokA );
141 params.Cm = blok_rownbr( blokB );
142 params.offx = blokA->frownum - blokB->frownum;
143 params.C = lrB;
144
145 flops += core_slradd( &params, &lrA,
146 PastixNoTrans, 0 );
147 blokA++;
148 }
149 while ( ( blokA < lblokA ) &&
150 ( blokA[-1].fcblknm == blokA[0].fcblknm ) &&
151 ( blokA[-1].lcblknm == blokA[0].lcblknm ) );
152
153 return flops;
154}
155
156/**
157 *******************************************************************************
158 *
159 * @brief Add two column bloks in low rank format.
160 *
161 * The second cblk is overwritten by the sum of the two blocks.
162 * B <- alpha * A + B
163 *
164 *******************************************************************************
165 *
166 * @param[in] alpha
167 * The scalar alpha
168 *
169 * @param[in] cblkA
170 * The column block of the A matrix.
171 *
172 * @param[inout] cblkB
173 * The column block of the B matrix
174 * On exit, cblkB coefficient arrays are overwritten by the result of
175 * alpha * A + B.
176 *
177 * @param[in] blokA_m
178 * Index of the first off-diagonal block in cblk, that is used for A.
179 *
180 * @param[in] blokB_m
181 * Index of the first off-diagonal block in cblk, that is used for B.
182 *
183 * @param[in] lrA
184 * Pointer to the low-rank representation of the column block A.
185 * Must be followed by the low-rank representation of the following blocks.
186 *
187 * @param[in] lrB
188 * Pointer to the low-rank representation of the column block B.
189 * Must be followed by the low-rank representation of the following blocks.
190 *
191 * @param[in] work
192 * Temporary memory buffer.
193 *
194 * @param[in] lwork
195 * Temporary workspace dimension.
196 *
197 * @param[in] lowrank
198 * The structure with low-rank parameters.
199 *
200 *******************************************************************************
201 *
202 * @return The number of flops of the operation.
203 *
204 *******************************************************************************/
205static inline pastix_fixdbl_t
206cpublok_sadd_lrlr( float alpha,
207 const SolverCblk *cblkA,
208 SolverCblk *cblkB,
209 pastix_int_t blokA_m,
210 pastix_int_t blokB_m,
211 const pastix_lrblock_t *lrA,
212 pastix_lrblock_t *lrB,
213 float *work,
214 pastix_int_t lwork,
215 const pastix_lr_t *lowrank )
216{
217 const SolverBlok *blokA = cblkA->fblokptr + blokA_m;
218 const SolverBlok *blokB = cblkB->fblokptr + blokB_m;
219 const SolverBlok *lblokA = cblkA[1].fblokptr;
220 const SolverBlok *lblokB = cblkB[1].fblokptr;
221 pastix_fixdbl_t flops = 0.;
222 core_slrmm_t params;
223
224 assert( (cblkA->cblktype & CBLK_COMPRESSED) );
225 assert( (cblkB->cblktype & CBLK_COMPRESSED) );
226
227 params.lowrank = lowrank;
228 params.transA = PastixNoTrans; /* Unused */
229 params.transB = PastixNoTrans; /* Unused */
230 params.K = -1; /* Unused */
231 params.alpha = alpha;
232 params.A = NULL; /* Unused */
233 params.B = NULL; /* Unused */
234 params.beta = 1.0;
235 params.work = work;
236 params.lwork = lwork;
237 params.lwused = 0;
238 params.lock = &(cblkB->lock);
239
240 /* Dimensions on N */
241 params.N = cblk_colnbr( cblkA );
242 params.Cn = cblk_colnbr( cblkB );
243 params.offy = cblkA->fcolnum - cblkB->fcolnum;
244
245 do {
246 /* Find facing bloknum */
247 while ( !is_block_inside_fblock( blokA, blokB ) && (blokB < lblokB) ) {
248 blokB++; lrB++;
249 }
250 assert( blokA->fcblknm == blokB->fcblknm );
251 assert( is_block_inside_fblock( blokA, blokB ) );
252 assert( blokB <= lblokB );
253
254 /* Dimensions on M */
255 params.M = blok_rownbr( blokA );
256 params.Cm = blok_rownbr( blokB );
257 params.offx = blokA->frownum - blokB->frownum;
258 params.C = lrB;
259 flops += core_slradd( &params, lrA, PastixNoTrans, PASTIX_LRM3_ORTHOU );
260 blokA++;
261 lrA++;
262 }
263 while ( ( blokA < lblokA ) &&
264 ( blokA[-1].fcblknm == blokA[0].fcblknm ) &&
265 ( blokA[-1].lcblknm == blokA[0].lcblknm ) );
266
267 return flops;
268}
269
270/**
271 *******************************************************************************
272 *
273 * @brief Add two bloks in full rank format.
274 *
275 * The second cblk is overwritten by the sum of the two blocks.
276 * B <- alpha * A + B
277 *
278 *******************************************************************************
279 *
280 * @param[in] alpha
281 * The scalar alpha
282 *
283 * @param[in] cblkA
284 * The column block of the A matrix.
285 *
286 * @param[inout] cblkB
287 * The column block of the B matrix
288 * On exit, cblkB coefficient arrays are overwritten by the result of
289 * alpha * A + B.
290 *
291 * @param[in] blokA_m
292 * Index of the first off-diagonal block in cblk, that is used for A.
293 *
294 * @param[in] blokB_m
295 * Index of the first off-diagonal block in cblk, that is used for B.
296 *
297 * @param[inout] A
298 * The pointer to the coeftab of the blok.lcoeftab matrix storing the
299 * coefficients of the panel when the Lower part is computed,
300 * blok.ucoeftab otherwise. Must be of size blok.stride -by- blok.width
301 *
302 * @param[inout] B
303 * The pointer to the coeftab of the blok.lcoeftab matrix storing
304 * the coefficients of the panel, if Symmetric/Symmetric cases or if
305 * upper part is computed; blok.ucoeftab otherwise. Must be of size
306 * blok.stride -by- blok.width
307 *
308 *******************************************************************************
309 *
310 * @return The number of flops of the operation.
311 *
312 *******************************************************************************/
313static inline pastix_fixdbl_t
314cpublok_sadd_frfr( float alpha,
315 const SolverCblk *cblkA,
316 SolverCblk *cblkB,
317 pastix_int_t blokA_m,
318 pastix_int_t blokB_m,
319 const float *A,
320 float *B )
321{
322 const SolverBlok *blokA = cblkA->fblokptr + blokA_m;
323 const SolverBlok *blokB = cblkB->fblokptr + blokB_m;
324 const SolverBlok *lblokA = cblkA[1].fblokptr;
325 const SolverBlok *lblokB = cblkB[1].fblokptr;
326 const float *bA = A;
327 float *bB = B;
328 pastix_int_t lda;
329 pastix_int_t ldb;
330 pastix_int_t m;
331 pastix_int_t n = cblk_colnbr( cblkA );
332 pastix_fixdbl_t flops = 0.;
333 size_t offsetA, offsetB;
334
335 assert( !(cblkA->cblktype & CBLK_COMPRESSED) );
336 assert( !(cblkB->cblktype & CBLK_COMPRESSED) );
337
338 /* Both cblk A and B must be stored in 2D */
339 assert( cblkA->cblktype & CBLK_LAYOUT_2D );
340 assert( cblkB->cblktype & CBLK_LAYOUT_2D );
341
342 offsetA = blokA->coefind;
343 offsetB = blokB->coefind;
344
345 /* Adds blocks facing the same cblk */
346 do {
347 /* Find facing bloknum */
348 while ( !is_block_inside_fblock( blokA, blokB ) && (blokB < lblokB) ) {
349 blokB++;
350 }
351 assert( blokA->fcblknm == blokB->fcblknm );
352 assert( is_block_inside_fblock( blokA, blokB ) );
353 assert( blokB <= lblokB );
354
355 bA = A + blokA->coefind - offsetA;
356 bB = B + blokB->coefind - offsetB;
357 lda = blok_rownbr( blokA );
358 ldb = blok_rownbr( blokB );
359
360 bB = bB + ldb * ( cblkA->fcolnum - cblkB->fcolnum ) + ( blokA->frownum - blokB->frownum );
361 m = lda;
362
363 pastix_cblk_lock( cblkB );
365 alpha, bA, lda,
366 1., bB, ldb );
367 pastix_cblk_unlock( cblkB );
368 flops += 2. * m * n;
369 blokA++;
370 }
371 while ( ( blokA < lblokA ) &&
372 ( blokA[-1].fcblknm == blokA[0].fcblknm ) &&
373 ( blokA[-1].lcblknm == blokA[0].lcblknm ) );
374
375 return flops;
376}
377
378/**
379 *******************************************************************************
380 *
381 * @brief Add two bloks.
382 *
383 * The second cblk is overwritten by the sum of the two blocks.
384 * B <- alpha * A + B
385 *
386 *******************************************************************************
387 *
388 * @param[in] alpha
389 * The scalar alpha
390 *
391 * @param[in] cblkA
392 * The column block of the A matrix.
393 *
394 * @param[inout] cblkB
395 * The column block of the B matrix
396 * On exit, cblkB coefficient arrays are overwritten by the result of
397 * alpha * A + B.
398 *
399 * @param[in] blokA_m
400 * Index of the first off-diagonal block in cblk, that is used for A.
401 *
402 * @param[in] blokB_m
403 * Index of the first off-diagonal block in cblk, that is used for B.
404 *
405 * @param[inout] A
406 * The pointer to the coeftab of the blok.lcoeftab matrix storing the
407 * coefficients of the panel when the Lower part is computed,
408 * blok.ucoeftab otherwise. Must be of size blok.stride -by- blok.width
409 *
410 * @param[inout] B
411 * The pointer to the coeftab of the blok.lcoeftab matrix storing
412 * the coefficients of the panel, if Symmetric/Symmetric cases or if
413 * upper part is computed; blok.ucoeftab otherwise. Must be of size
414 * blok.stride -by- blok.width
415 *
416 * @param[in] work
417 * Temporary memory buffer.
418 *
419 * @param[in] lwork
420 * Temporary workspace dimension.
421 *
422 * @param[in] lowrank
423 * The structure with low-rank parameters.
424 *
425 *******************************************************************************
426 *
427 * @return The number of flops of the operation.
428 *
429 *******************************************************************************/
431cpublok_sadd( float alpha,
432 const SolverCblk *cblkA,
433 SolverCblk *cblkB,
434 pastix_int_t blokA_m,
435 pastix_int_t blokB_m,
436 const void *A,
437 void *B,
438 float *work,
439 pastix_int_t lwork,
440 const pastix_lr_t *lowrank )
441{
443 pastix_fixdbl_t time, flops = 0.0;
444 pastix_int_t m = cblkA->stride;
445 pastix_int_t n = cblk_colnbr( cblkA );
446
447 if ( cblkB->cblktype & CBLK_COMPRESSED ) {
448 if ( cblkA->cblktype & CBLK_COMPRESSED ) {
450 time = kernel_trace_start( ktype );
451 flops = cpublok_sadd_lrlr( alpha, cblkA, cblkB, blokA_m, blokB_m,
452 A, B, work, lwork, lowrank );
453 }
454 else {
456 time = kernel_trace_start( ktype );
457 flops = cpublok_sadd_frlr( alpha, cblkA, cblkB, blokA_m, blokB_m,
458 A, B, work, lwork, lowrank );
459 }
460 }
461 else {
462 if ( cblkA->cblktype & CBLK_COMPRESSED ) {
463 assert(0); /* We do not add a compressed cblk to a non compressed cblk */
464 return 0.; /* Avoids compilation and coverity warning */
465 }
466 else {
468 time = kernel_trace_start( ktype );
469 flops = cpublok_sadd_frfr( alpha, cblkA, cblkB, blokA_m, blokB_m, A, B );
470 }
471 }
472
473 kernel_trace_stop( cblkB->fblokptr->inlast, ktype, m, n, 0, flops, time );
474 return flops;
475}
static pastix_fixdbl_t cpublok_sadd_lrlr(float alpha, const SolverCblk *cblkA, SolverCblk *cblkB, pastix_int_t blokA_m, pastix_int_t blokB_m, const pastix_lrblock_t *lrA, pastix_lrblock_t *lrB, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add two column bloks in low rank format.
static pastix_fixdbl_t cpublok_sadd_frfr(float alpha, const SolverCblk *cblkA, SolverCblk *cblkB, pastix_int_t blokA_m, pastix_int_t blokB_m, const float *A, float *B)
Add two bloks in full rank format.
static pastix_fixdbl_t cpublok_sadd_frlr(float alpha, const SolverCblk *cblkA, SolverCblk *cblkB, pastix_int_t blokA_m, pastix_int_t blokB_m, const float *A, pastix_lrblock_t *lrB, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add a blok in full rank format to a blok in low rank format.
BEGIN_C_DECLS typedef int pastix_int_t
Definition datatypes.h:51
double pastix_fixdbl_t
Definition datatypes.h:65
enum pastix_ktype_e pastix_ktype_t
List of the Level 1 events that may be traced in PaStiX.
static void kernel_trace_stop(int8_t inlast, pastix_ktype_t ktype, int m, int n, int k, double flops, double starttime)
Stop the trace of a single kernel.
static double kernel_trace_start(pastix_ktype_t ktype)
Start the trace of a single kernel.
@ PastixKernelGEADDCblkFRFR
@ PastixKernelGEADDCblkLRLR
@ PastixKernelGEADDCblkFRLR
int core_sgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, float alpha, const float *A, pastix_int_t LDA, float beta, float *B, pastix_int_t LDB)
Add two matrices together.
Definition core_sgeadd.c:78
pastix_fixdbl_t cpublok_sadd(float alpha, const SolverCblk *cblkA, SolverCblk *cblkB, pastix_int_t blokA_m, pastix_int_t blokB_m, const void *A, void *B, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add two bloks.
pastix_int_t K
const pastix_lrblock_t * B
pastix_int_t lwork
pastix_int_t N
pastix_int_t M
pastix_int_t lwused
const pastix_lrblock_t * A
pastix_trans_t transB
pastix_lrblock_t * C
pastix_int_t offy
pastix_int_t offx
const pastix_lr_t * lowrank
pastix_int_t Cn
pastix_int_t Cm
pastix_trans_t transA
pastix_atomic_lock_t * lock
pastix_fixdbl_t core_slradd(core_slrmm_t *params, const pastix_lrblock_t *A, pastix_trans_t transV, int infomask)
Perform the addition of two low-rank matrices.
Structure to store all the parameters of the core_slrmm family functions.
#define PASTIX_LRM3_ORTHOU
Macro to specify if the U part of a low-rank matrix is orthogonal or not (Used in LRMM functions).
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
@ PastixNoTrans
Definition api.h:445
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition solver.h:395
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition solver.h:329
pastix_int_t fcblknm
Definition solver.h:144
static int is_block_inside_fblock(const SolverBlok *blok, const SolverBlok *fblok)
Check if a block is included inside another one.
Definition solver.h:504
pastix_int_t frownum
Definition solver.h:147
pastix_atomic_lock_t lock
Definition solver.h:162
pastix_int_t coefind
Definition solver.h:149
SolverBlok * fblokptr
Definition solver.h:168
int8_t inlast
Definition solver.h:151
pastix_int_t stride
Definition solver.h:169
int8_t cblktype
Definition solver.h:164
pastix_int_t fcolnum
Definition solver.h:166
Solver block structure.
Definition solver.h:141
Solver column block structure.
Definition solver.h:161