PaStiX Handbook 6.4.0
Loading...
Searching...
No Matches
core_slr2xx.c
Go to the documentation of this file.
1/**
2 *
3 * @file core_slr2xx.c
4 *
5 * PaStiX low-rank kernel routines that perform the addition of AB into C.
6 *
7 * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8 * Univ. Bordeaux. All rights reserved.
9 *
10 * @version 6.4.0
11 * @author Mathieu Faverge
12 * @author Gregoire Pichon
13 * @author Pierre Ramet
14 * @date 2024-07-05
15 * @generated from /builds/2mk6rsew/0/solverstack/pastix/kernels/core_zlr2xx.c, normal z -> s, Tue Feb 25 14:34:56 2025
16 *
17 **/
18#include "common.h"
19#include <cblas.h>
20#include "kernels_trace.h"
21#include "blend/solver.h"
22#include "pastix_scores.h"
23#include "pastix_slrcores.h"
24
25#ifndef DOXYGEN_SHOULD_SKIP_THIS
26static float sone = 1.0;
27static float szero = 0.0;
28#endif /* DOXYGEN_SHOULD_SKIP_THIS */
29
30/**
31 *******************************************************************************
32 *
33 * @brief Perform the addition of the low-rank matrix AB and the full-rank
34 * matrix C.
35 *
36 *******************************************************************************
37 *
38 * @param[inout] params
39 * The LRMM structure that stores all the parameters used in the LRMM
40 * functions family.
41 * On exit, the C matrix is udpated with the addition of AB.
42 * @sa core_slrmm_t
43 *
44 * @param[in] AB
45 * The low-rank structure of the AB matrix to apply to C.
46 *
47 * @param[in] transV
48 * Specify if AB->v is stored normally or transposed.
49 * - If PastixNoTrans, AB->v is stored normally for low-rank format.
50 * - If PastixTrans, AB->v is stored transposed.
51 * - If PastixTrans, AB->v is stored transposed, and () must be
52 * applied to the matrix.
53 *
54 *******************************************************************************
55 *
56 * @return The number of flops required to perform the operation.
57 *
58 *******************************************************************************/
59static inline pastix_fixdbl_t
61 const pastix_lrblock_t *AB,
62 pastix_trans_t transV )
63{
65 pastix_int_t ldabu = M;
66 pastix_int_t ldabv = (transV == PastixNoTrans) ? AB->rkmax : N;
67 pastix_fixdbl_t flops = 0.;
68 float *Cfr = C->u;
69 Cfr += Cm * offy + offx;
70
71 assert( C->rk == -1 );
72
73 /* TODO: find a suitable name to trace this kind of kernel. */
74 if ( AB->rk == -1 ) {
75 flops = 2 * M * N;
76 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_updateCfr );
78 alpha, AB->u, ldabu,
79 beta, Cfr, Cm );
80 kernel_trace_stop_lvl2( flops );
81 }
82 else {
83 flops = FLOPS_SGEMM( M, N, AB->rk );
84 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_updateCfr );
85 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
86 M, N, AB->rk,
87 (alpha), AB->u, ldabu,
88 AB->v, ldabv,
89 (beta), Cfr, Cm );
90 kernel_trace_stop_lvl2( flops );
91 }
92
94 return flops;
95}
96
97/**
98 *******************************************************************************
99 *
100 * @brief Perform the addition of the low-rank matrix AB and the low-rank
101 * matrix C.
102 *
103 *******************************************************************************
104 *
105 * @param[inout] params
106 * The LRMM structure that stores all the parameters used in the LRMM
107 * functions family.
108 * On exit, the C matrix is udpated with the addition of AB.
109 * @sa core_slrmm_t
110 *
111 * @param[in] AB
112 * The low-rank structure of the AB matrix to apply to C.
113 *
114 * @param[in] transV
115 * Specify if AB->v is stored normally or transposed.
116 * - If PastixNoTrans, AB->v is stored normally for low-rank format.
117 * - If PastixTrans, AB->v is stored transposed.
118 * - If PastixTrans, AB->v is stored transposed, and () must be
119 * applied to the matrix.
120 *
121 *******************************************************************************
122 *
123 * @return The number of flops required to perform the operation.
124 *
125 *******************************************************************************/
126static inline pastix_fixdbl_t
128 const pastix_lrblock_t *AB,
129 pastix_trans_t transV )
130{
131 PASTE_CORE_SLRMM_PARAMS( params );
132 pastix_int_t rklimit = core_get_rklimit( Cm, Cn );
133 pastix_int_t rAB = ( AB->rk == -1 ) ? pastix_imin( M, N ) : AB->rk;
134 pastix_int_t ldabu = M;
135 pastix_int_t ldabv = (transV == PastixNoTrans) ? AB->rkmax : N;
136 pastix_fixdbl_t total_flops = 0.;
137 pastix_fixdbl_t flops = 0.;
138
139 assert( (C->rk >= 0) && (C->rk <= C->rkmax) );
140
141 /*
142 * The rank is too big, we need to uncompress/compress C
143 */
144 if ( (C->rk + rAB) > rklimit )
145 {
146 float *Cfr, *Coff;
147 int allocated = 0;
148 if ( (Cfr = core_slrmm_getws( params, Cm * Cn )) == NULL ) {
149 Cfr = malloc( Cm * Cn * sizeof(float) );
150 allocated = 1;
151 }
152 Coff = Cfr + Cm * offy + offx;
153
154 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
155 cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
156 Cm, Cn, C->rk,
157 (sone), C->u, Cm,
158 C->v, C->rkmax,
159 (szero), Cfr, Cm );
160 flops = FLOPS_SGEMM( Cm, Cn, C->rk );
161
162 /* Add A*B */
163 if ( AB->rk == -1 ) {
165 alpha, AB->u, M,
166 beta, Coff, Cm );
167 flops += (2. * M * N);
168 }
169 else {
170 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
171 M, N, AB->rk,
172 (alpha), AB->u, ldabu,
173 AB->v, ldabv,
174 (beta), Coff, Cm );
175 flops += FLOPS_SGEMM( M, N, AB->rk );
176 }
177 kernel_trace_stop_lvl2( flops );
178 total_flops += flops;
179
180 /* Try to recompress */
181 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_recompress );
182 core_slrfree(C); // TODO: Can we give it directly to ge2lr as this
183 flops = lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, -1, Cm, Cn, Cfr, Cm, C );
184 kernel_trace_stop_lvl2_rank( flops, C->rk );
185 total_flops += flops;
186
187 if (allocated) {
188 free(Cfr);
189 }
190 }
191 /*
192 * The rank is not too large, we perform a low-rank update
193 */
194 else {
195 total_flops += lowrank->core_rradd( lowrank, transV, &alpha,
196 M, N, AB,
197 Cm, Cn, C,
198 offx, offy );
199 }
200
202 return total_flops;
203}
204
205/**
206 *******************************************************************************
207 *
208 * @brief Perform the addition of the low-rank matrix AB into the null matrix C.
209 *
210 *******************************************************************************
211 *
212 * @param[inout] params
213 * The LRMM structure that stores all the parameters used in the LRMM
214 * functions family.
215 * On exit, the C matrix contains the product AB aligned with its own
216 * dimensions.
217 * @sa core_slrmm_t
218 *
219 * @param[in] AB
220 * The low-rank structure of the AB matrix to apply to C.
221 *
222 * @param[in] transV
223 * Specify if AB->v is stored normally or transposed.
224 * - If PastixNoTrans, AB->v is stored normally for low-rank format.
225 * - If PastixTrans, AB->v is stored transposed.
226 * - If PastixTrans, AB->v is stored transposed, and () must be
227 * applied to the matrix.
228 *
229 * @param[in] infomask
230 * Mask of informations returned by the core_sxx2lr() functions.
231 * If CORE_LRMM_ORTHOU is set, then AB.u is orthogonal, otherwise an
232 * orthogonalization step is added before adding it to C.
233 *
234 *******************************************************************************
235 *
236 * @return The number of flops required to perform the operation.
237 *
238 *******************************************************************************/
239static inline pastix_fixdbl_t
241 const pastix_lrblock_t *AB,
242 pastix_trans_t transV,
243 int infomask )
244{
245 PASTE_CORE_SLRMM_PARAMS( params );
246 pastix_int_t rklimit = core_get_rklimit( Cm, Cn );
247 pastix_int_t ldabu = M;
248 pastix_int_t ldabv = (transV == PastixNoTrans) ? AB->rkmax : N;
249 pastix_fixdbl_t total_flops = 0.;
250 pastix_fixdbl_t flops;
251 int allocated = 0;
253
254 assert( C->rk == 0 );
255
256 if ( AB->rk > rklimit ) {
257 float *Cfr, *Coff;
258 if ( (Cfr = core_slrmm_getws( params, Cm * Cn )) == NULL ) {
259 Cfr = malloc( Cm * Cn * sizeof(float) );
260 allocated = 1;
261 }
262 Coff = Cfr + Cm * offy + offx;
263
264 /* Set to 0 if contribution smaller than C */
265 if ( (M != Cm) || (N != Cn) ) {
266 memset( Cfr, 0, Cm * Cn * sizeof(float) );
267 }
268
269 /* Uncompress the AB product into C */
270 flops = FLOPS_SGEMM( M, N, AB->rk );
271 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_uncompress );
272 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
273 M, N, AB->rk,
274 (alpha), AB->u, ldabu,
275 AB->v, ldabv,
276 (beta), Coff, Cm );
277 kernel_trace_stop_lvl2( flops );
278 total_flops += flops;
279
280 /* Try to recompress C */
281 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_recompress );
282 flops = lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, -1, Cm, Cn, Cfr, Cm, C );
283 kernel_trace_stop_lvl2_rank( flops, C->rk );
284 total_flops += flops;
285
286 if ( allocated ) {
287 free( work );
288 }
289 }
290 else {
291 /*
292 * Let's chech that AB->u is orthogonal before copying it to C.u
293 */
294 int orthou = infomask & PASTIX_LRM3_ORTHOU;
295 if ( !orthou ) {
296 float *ABfr;
297 pastix_lrblock_t backup;
298 int allocated = 0;
299
300 kernel_trace_start_lvl2( PastixKernelLvl2_LR_add2C_orthou );
301
302 if ( AB->rk > 0 ) {
303 if ( (ABfr = core_slrmm_getws( params, M * N )) == NULL ) {
304 ABfr = malloc( M * N * sizeof(float) );
305 allocated = 1;
306 }
307
308 cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transV,
309 M, N, AB->rk,
310 (sone), AB->u, ldabu,
311 AB->v, ldabv,
312 (szero), ABfr, M );
313 flops = FLOPS_SGEMM( M, N, AB->rk );
314 }
315 else {
316 ABfr = AB->u;
317 flops = 0.0;
318 }
319
320 flops += lowrank->core_ge2lr( lowrank->use_reltol, lowrank->tolerance, rklimit,
321 M, N, ABfr, M, &backup );
322
323 core_slrcpy( lowrank, PastixNoTrans, alpha,
324 M, N, &backup, Cm, Cn, C,
325 offx, offy );
326
327 kernel_trace_stop_lvl2( flops );
328 core_slrfree( &backup );
329 total_flops += flops;
330
331 if ( allocated ) {
332 free( ABfr );
333 }
334 }
335 /*
336 * AB->u is orthogonal, we directly copy AB->u into C
337 */
338 else {
339 core_slrcpy( lowrank, transV, alpha,
340 M, N, AB, Cm, Cn, C,
341 offx, offy );
342 }
343 }
344
345 return total_flops;
346}
347
348/**
349 *******************************************************************************
350 *
351 * @brief Perform the addition of two low-rank matrices.
352 *
353 *******************************************************************************
354 *
355 * @param[inout] params
356 * The LRMM structure that stores all the parameters used in the LRMM
357 * functions family.
358 * On exit, the C matrix contains the addition of C and A aligned with its own
359 * dimensions.
360 * @sa core_slrmm_t
361 *
362 * @param[in] A
363 * The low-rank structure of the A matrix to add to C.
364 *
365 * @param[in] transV
366 * Specify if A->v is stored normally or transposed.
367 * - If PastixNoTrans, AB->v is stored normally for low-rank format.
368 * - If PastixTrans, AB->v is stored transposed.
369 * - If PastixTrans, AB->v is stored transposed, and () must be
370 * applied to the matrix.
371 *
372 * @param[in] infomask
373 * Mask of informations returned by the core_sxx2lr() functions.
374 * If CORE_LRMM_ORTHOU is set, then A.u is orthogonal, otherwise an
375 * orthogonalization step is added before adding it to C.
376 *
377 *******************************************************************************
378 *
379 * @return The number of flops required to perform the operation.
380 *
381 *******************************************************************************/
384 const pastix_lrblock_t *A,
385 pastix_trans_t transV,
386 int infomask )
387{
388 pastix_lrblock_t *C = params->C;
389 pastix_fixdbl_t flops = 0.;
390
391 if ( A->rk != 0 ) {
392 pastix_atomic_lock( params->lock );
393 switch ( C->rk ) {
394 case -1:
395 /*
396 * C became full rank
397 */
398 flops = core_slr2fr( params, A, transV );
399 break;
400
401 case 0:
402 /*
403 * C is still null
404 */
405 flops = core_slr2null( params, A, transV, infomask );
406 break;
407
408 default:
409 /*
410 * C is low-rank of rank k
411 */
412 flops = core_slr2lr( params, A, transV );
413 }
414 assert( C->rk <= C->rkmax);
415 pastix_atomic_unlock( params->lock );
416 }
417
418 return flops;
419}
static pastix_fixdbl_t core_slr2fr(core_slrmm_t *params, const pastix_lrblock_t *AB, pastix_trans_t transV)
Perform the addition of the low-rank matrix AB and the full-rank matrix C.
Definition core_slr2xx.c:60
static pastix_fixdbl_t core_slr2lr(core_slrmm_t *params, const pastix_lrblock_t *AB, pastix_trans_t transV)
Perform the addition of the low-rank matrix AB and the low-rank matrix C.
static pastix_fixdbl_t core_slr2null(core_slrmm_t *params, const pastix_lrblock_t *AB, pastix_trans_t transV, int infomask)
Perform the addition of the low-rank matrix AB into the null matrix C.
BEGIN_C_DECLS typedef int pastix_int_t
Definition datatypes.h:51
double pastix_fixdbl_t
Definition datatypes.h:65
int core_sgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, float alpha, const float *A, pastix_int_t LDA, float beta, float *B, pastix_int_t LDB)
Add two matrices together.
Definition core_sgeadd.c:78
pastix_lrblock_t * C
pastix_atomic_lock_t * lock
static float * core_slrmm_getws(core_slrmm_t *params, ssize_t newsize)
Function to get a workspace pointer if space is available in the one provided.
#define PASTE_CORE_SLRMM_PARAMS(_a_)
Initialize all the parameters of the core_slrmm family functions to ease the access.
#define PASTE_CORE_SLRMM_VOID
Void all the parameters of the core_slrmm family functions to silent warnings.
pastix_fixdbl_t core_slradd(core_slrmm_t *params, const pastix_lrblock_t *A, pastix_trans_t transV, int infomask)
Perform the addition of two low-rank matrices.
Structure to store all the parameters of the core_slrmm family functions.
pastix_int_t(* core_get_rklimit)(pastix_int_t, pastix_int_t)
Compute the maximal rank accepted for a given matrix size. The pointer is set according to the low-ra...
#define PASTIX_LRM3_ORTHOU
Macro to specify if the U part of a low-rank matrix is orthogonal or not (Used in LRMM functions).
The block low-rank structure to hold a matrix in low-rank form.
void core_slrcpy(const pastix_lr_t *lowrank, pastix_trans_t transAv, float alpha, pastix_int_t M1, pastix_int_t N1, const pastix_lrblock_t *A, pastix_int_t M2, pastix_int_t N2, pastix_lrblock_t *B, pastix_int_t offx, pastix_int_t offy)
Copy a small low-rank structure into a large one.
void core_slrfree(pastix_lrblock_t *A)
Free a low-rank matrix.
enum pastix_trans_e pastix_trans_t
Transpostion.
@ PastixNoTrans
Definition api.h:445