PaStiX Handbook 6.4.0
Loading...
Searching...
No Matches
core_zxx2fr.c
Go to the documentation of this file.
1/**
2 *
3 * @file core_zxx2fr.c
4 *
5 * PaStiX low-rank kernel routines that form the product of two matrices A and B
6 * into a low-rank form for an update on a full rank matrix.
7 *
8 * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
9 * Univ. Bordeaux. All rights reserved.
10 *
11 * @version 6.4.0
12 * @author Mathieu Faverge
13 * @author Gregoire Pichon
14 * @author Pierre Ramet
15 * @date 2024-07-05
16 * @generated from /builds/2mk6rsew/0/solverstack/pastix/kernels/core_zxx2fr.c, normal z -> z, Tue Feb 25 14:34:58 2025
17 *
18 **/
19#include "common.h"
20#include <cblas.h>
21#include "pastix_zlrcores.h"
22#include "kernels_trace.h"
23#ifndef DOXYGEN_SHOULD_SKIP_THIS
24static pastix_complex64_t zone = 1.0;
25static pastix_complex64_t zzero = 0.0;
26#endif /* DOXYGEN_SHOULD_SKIP_THIS */
27
28/**
29 *******************************************************************************
30 *
31 * @brief Perform the full-rank operation C = alpha * op(A) * op(B) + beta C
32 *
33 *******************************************************************************
34 *
35 * @param[inout] params
36 * The LRMM structure that stores all the parameters used in the LRMM
37 * functions family.
38 * On exit, the C matrix contains the product AB aligned with its own
39 * dimensions.
40 * @sa core_zlrmm_t
41 *
42 *******************************************************************************
43 *
44 * @return The number of flops required to perform the operation.
45 *
46 *******************************************************************************/
49{
50 pastix_int_t ldau, ldbu, ldcu;
51 pastix_complex64_t *Cptr;
52 pastix_fixdbl_t flops;
54 ldau = (transA == PastixNoTrans) ? M : K;
55 ldbu = (transB == PastixNoTrans) ? K : N;
56 ldcu = Cm;
57
58 Cptr = C->u;
59 Cptr += ldcu * offy + offx;
60
61 pastix_atomic_lock( lock );
62 assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
63
64 /*
65 * Everything is full rank we apply directly a GEMM
66 */
67 cblas_zgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
68 M, N, K,
69 CBLAS_SADDR(alpha), A->u, ldau,
70 B->u, ldbu,
71 CBLAS_SADDR(beta), Cptr, ldcu );
72 flops = FLOPS_ZGEMM( M, N, K );
73
74 pastix_atomic_unlock( lock );
75
77 return flops;
78}
79
80/**
81 *******************************************************************************
82 *
83 * @brief Perform the operation C = alpha * op(A) * op(B) + beta C, with A and C
84 * full-rank and B low-rank.
85 *
86 *******************************************************************************
87 *
88 * @param[inout] params
89 * The LRMM structure that stores all the parameters used in the LRMM
90 * functions family.
91 * On exit, the C matrix contains the product AB aligned with its own
92 * dimensions.
93 * @sa core_zlrmm_t
94 *
95 *******************************************************************************
96 *
97 * @return The number of flops required to perform the operation.
98 *
99 *******************************************************************************/
102{
103 PASTE_CORE_ZLRMM_PARAMS( params );
104 pastix_complex64_t *Cptr;
105 pastix_int_t ldau, ldbu, ldbv, ldcu;
106 pastix_fixdbl_t flops1 = FLOPS_ZGEMM( M, B->rk, K ) + FLOPS_ZGEMM( M, N, B->rk );
107 pastix_fixdbl_t flops2 = FLOPS_ZGEMM( K, N, B->rk ) + FLOPS_ZGEMM( M, N, K );
108 pastix_fixdbl_t flops;
109 int allocated = 0;
111
112 ldau = (transA == PastixNoTrans) ? M : K;
113 ldbu = (transB == PastixNoTrans) ? K : N;
114 ldbv = ( B->rk == -1 ) ? -1 : B->rkmax;
115
116 ldcu = Cm;
117 Cptr = C->u;
118 Cptr += ldcu * offy + offx;
119
120 /*
121 * A(M-by-K) * B( N-by-rb x rb-by-K )^t
122 */
123 if ( flops1 <= flops2 ) {
124 if ( (work = core_zlrmm_getws( params, M * B->rk )) == NULL ) {
125 work = malloc( M * B->rk * sizeof(pastix_complex64_t) );
126 allocated = 1;
127 }
128
129 /*
130 * (A * Bv) * Bu^t
131 */
132 cblas_zgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
133 M, B->rk, K,
134 CBLAS_SADDR(zone), A->u, ldau,
135 B->v, ldbv,
136 CBLAS_SADDR(zzero), work, M );
137
138 pastix_atomic_lock( lock );
139 assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
140 cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
141 M, N, B->rk,
142 CBLAS_SADDR(alpha), work, M,
143 B->u, ldbu,
144 CBLAS_SADDR(beta), Cptr, ldcu );
145 flops = flops1;
146 pastix_atomic_unlock( lock );
147 }
148 else {
149 if ( (work = core_zlrmm_getws( params, K * N )) == NULL ) {
150 work = malloc( K * N * sizeof(pastix_complex64_t) );
151 allocated = 1;
152 }
153
154 /*
155 * A * (Bu * Bv^t)^t
156 */
157 cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
158 K, N, B->rk,
159 CBLAS_SADDR(zone), B->u, ldbu,
160 B->v, ldbv,
161 CBLAS_SADDR(zzero), work, K );
162
163 pastix_atomic_lock( lock );
164 assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
165 cblas_zgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
166 M, N, K,
167 CBLAS_SADDR(alpha), A->u, ldau,
168 work, K,
169 CBLAS_SADDR(beta), Cptr, ldcu );
170
171 flops = flops2;
172 pastix_atomic_unlock( lock );
173 }
174
175 if ( allocated ) {
176 free( work );
177 }
178 return flops;
179}
180
181/**
182 *******************************************************************************
183 *
184 * @brief Perform the operation C = alpha * op(A) * op(B) + beta C, with B and C
185 * full-rank and A low-rank.
186 *
187 *******************************************************************************
188 *
189 * @param[inout] params
190 * The LRMM structure that stores all the parameters used in the LRMM
191 * functions family.
192 * On exit, the C matrix contains the product AB aligned with its own
193 * dimensions.
194 * @sa core_zlrmm_t
195 *
196 *******************************************************************************
197 *
198 * @return The number of flops required to perform the operation.
199 *
200 *******************************************************************************/
203{
204 PASTE_CORE_ZLRMM_PARAMS( params );
205 pastix_complex64_t *Cptr;
206 pastix_int_t ldau, ldav, ldbu, ldcu;
207 pastix_fixdbl_t flops1 = FLOPS_ZGEMM( A->rk, N, K ) + FLOPS_ZGEMM( M, N, A->rk );
208 pastix_fixdbl_t flops2 = FLOPS_ZGEMM( M, K, A->rk ) + FLOPS_ZGEMM( M, N, K );
209 pastix_fixdbl_t flops;
210 int allocated = 0;
212
213 ldau = (transA == PastixNoTrans) ? M : K;
214 ldav = ( A->rk == -1 ) ? -1 : A->rkmax;
215 ldbu = (transB == PastixNoTrans) ? K : N;
216
217 ldcu = Cm;
218 Cptr = C->u;
219 Cptr += ldcu * offy + offx;
220
221 /*
222 * A( M-by-ra x ra-by-K ) * B(N-by-K)^t
223 */
224 if ( flops1 <= flops2 ) {
225 if ( (work = core_zlrmm_getws( params, A->rk * N )) == NULL ) {
226 work = malloc( A->rk * N * sizeof(pastix_complex64_t) );
227 allocated = 1;
228 }
229
230 /*
231 * Au * (Av^t * B^t)
232 */
233 cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
234 A->rk, N, K,
235 CBLAS_SADDR(zone), A->v, ldav,
236 B->u, ldbu,
237 CBLAS_SADDR(zzero), work, A->rk );
238
239 pastix_atomic_lock( lock );
240 assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
241 cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
242 M, N, A->rk,
243 CBLAS_SADDR(alpha), A->u, ldau,
244 work, A->rk,
245 CBLAS_SADDR(beta), Cptr, ldcu );
246
247 flops = flops1;
248 pastix_atomic_unlock( lock );
249 }
250 else {
251 if ( (work = core_zlrmm_getws( params, M * K )) == NULL ) {
252 work = malloc( M * K * sizeof(pastix_complex64_t) );
253 allocated = 1;
254 }
255
256 /*
257 * (Au * Av^t) * B^t
258 */
259 cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
260 M, K, A->rk,
261 CBLAS_SADDR(zone), A->u, ldau,
262 A->v, ldav,
263 CBLAS_SADDR(zzero), work, M );
264
265 pastix_atomic_lock( lock );
266 assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
267 cblas_zgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
268 M, N, K,
269 CBLAS_SADDR(alpha), work, M,
270 B->u, ldbu,
271 CBLAS_SADDR(beta), Cptr, ldcu );
272
273 flops = flops2;
274 pastix_atomic_unlock( lock );
275 }
276
277 if ( allocated ) {
278 free( work );
279 }
280 return flops;
281}
282
283/**
284 *******************************************************************************
285 *
286 * @brief Perform the operation C = alpha * op(A) * op(B) + beta C, with A and B
287 * low-rank and C full-rank.
288 *
289 *******************************************************************************
290 *
291 * @param[inout] params
292 * The LRMM structure that stores all the parameters used in the LRMM
293 * functions family.
294 * On exit, the C matrix contains the product AB aligned with its own
295 * dimensions.
296 * @sa core_zlrmm_t
297 *
298 *******************************************************************************
299 *
300 * @return The number of flops required to perform the operation.
301 *
302 *******************************************************************************/
305{
306 PASTE_CORE_ZLRMM_PARAMS( params );
307 pastix_complex64_t *Cptr;
308 pastix_int_t ldcu;
311 int infomask = 0;
312 pastix_fixdbl_t flops;
313
314 ldcu = Cm;
315 Cptr = C->u;
316 Cptr += ldcu * offy + offx;
317
318 flops = core_zlrlr2lr( params, &AB, &infomask );
319 assert( AB.rk != -1 );
320 assert( AB.rkmax != -1 );
321
322 if ( infomask & PASTIX_LRM3_TRANSB ) {
323 trans = transB;
324 }
325
326 if ( AB.rk > 0 ) {
327 pastix_int_t ldabv = (trans == PastixNoTrans) ? AB.rkmax : N;
328
329 pastix_atomic_lock( lock );
330 assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
331
332 cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
333 M, N, AB.rk,
334 CBLAS_SADDR(alpha), AB.u, M,
335 AB.v, ldabv,
336 CBLAS_SADDR(beta), Cptr, ldcu );
337 flops = FLOPS_ZGEMM( M, N, AB.rk );
338 pastix_atomic_unlock( lock );
339 }
340
341 /* Free memory from zlrm3 */
342 if ( infomask & PASTIX_LRM3_ALLOCU ) {
343 free(AB.u);
344 }
345 if ( infomask & PASTIX_LRM3_ALLOCV ) {
346 free(AB.v);
347 }
348
350 return flops;
351}
BEGIN_C_DECLS typedef int pastix_int_t
Definition datatypes.h:51
double pastix_fixdbl_t
Definition datatypes.h:65
#define PASTE_CORE_ZLRMM_PARAMS(_a_)
Initialize all the parameters of the core_zlrmm family functions to ease the access.
static pastix_complex64_t * core_zlrmm_getws(core_zlrmm_t *params, ssize_t newsize)
Function to get a workspace pointer if space is available in the one provided.
pastix_fixdbl_t core_zfrlr2fr(core_zlrmm_t *params)
Perform the operation C = alpha * op(A) * op(B) + beta C, with A and C full-rank and B low-rank.
pastix_fixdbl_t core_zlrlr2fr(core_zlrmm_t *params)
Perform the operation C = alpha * op(A) * op(B) + beta C, with A and B low-rank and C full-rank.
#define PASTE_CORE_ZLRMM_VOID
Void all the parameters of the core_zlrmm family functions to silent warnings.
pastix_fixdbl_t core_zfrfr2fr(core_zlrmm_t *params)
Perform the full-rank operation C = alpha * op(A) * op(B) + beta C.
Definition core_zxx2fr.c:48
pastix_fixdbl_t core_zlrfr2fr(core_zlrmm_t *params)
Perform the operation C = alpha * op(A) * op(B) + beta C, with B and C full-rank and A low-rank.
pastix_fixdbl_t core_zlrlr2lr(core_zlrmm_t *params, pastix_lrblock_t *AB, int *infomask)
Perform the operation AB = op(A) * op(B), with A, B, and AB low-rank.
Structure to store all the parameters of the core_zlrmm family functions.
#define PASTIX_LRM3_ALLOCV
Macro to specify if the V part of a low-rank matrix has been allocated and need to be freed or not (U...
#define PASTIX_LRM3_TRANSB
Macro to specify if the the operator on B, still needs to be applied to the V part of the low-rank ma...
#define PASTIX_LRM3_ALLOCU
Macro to specify if the U part of a low-rank matrix has been allocated and need to be freed or not (U...
The block low-rank structure to hold a matrix in low-rank form.
enum pastix_trans_e pastix_trans_t
Transpostion.
@ PastixNoTrans
Definition api.h:445