PaStiX Handbook  6.2.1
core_dxx2fr.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_dxx2fr.c
4  *
5  * PaStiX low-rank kernel routines that form the product of two matrices A and B
6  * into a low-rank form for an update on a full rank matrix.
7  *
8  * @copyright 2016-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
9  * Univ. Bordeaux. All rights reserved.
10  *
11  * @version 6.1.0
12  * @author Mathieu Faverge
13  * @author Gregoire Pichon
14  * @author Pierre Ramet
15  * @date 2020-02-05
16  * @generated from /builds/solverstack/pastix/kernels/core_zxx2fr.c, normal z -> d, Tue Apr 12 09:38:38 2022
17  *
18  **/
19 #include "common.h"
20 #include <cblas.h>
21 #include "pastix_dlrcores.h"
22 #include "kernels_trace.h"
23 #ifndef DOXYGEN_SHOULD_SKIP_THIS
24 static double done = 1.0;
25 static double dzero = 0.0;
26 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
27 
28 /**
29  *******************************************************************************
30  *
31  * @brief Perform the full-rank operation C = alpha * op(A) * op(B) + beta C
32  *
33  *******************************************************************************
34  *
35  * @param[inout] params
36  * The LRMM structure that stores all the parameters used in the LRMM
37  * functions family.
38  * On exit, the C matrix contains the product AB aligned with its own
39  * dimensions.
40  * @sa core_dlrmm_t
41  *
42  *******************************************************************************
43  *
44  * @return The number of flops required to perform the operation.
45  *
46  *******************************************************************************/
47 pastix_fixdbl_t
49 {
50  pastix_int_t ldau, ldbu, ldcu;
51  double *Cptr;
52  pastix_fixdbl_t flops;
53  PASTE_CORE_DLRMM_PARAMS( params );
54  ldau = (transA == PastixNoTrans) ? M : K;
55  ldbu = (transB == PastixNoTrans) ? K : N;
56  ldcu = Cm;
57 
58  Cptr = C->u;
59  Cptr += ldcu * offy + offx;
60 
61  pastix_atomic_lock( lock );
62  assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
63 
64  /*
65  * Everything is full rank we apply directly a GEMM
66  */
67  cblas_dgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
68  M, N, K,
69  (alpha), A->u, ldau,
70  B->u, ldbu,
71  (beta), Cptr, ldcu );
72  flops = FLOPS_DGEMM( M, N, K );
73 
74  pastix_atomic_unlock( lock );
75 
77  return flops;
78 }
79 
80 /**
81  *******************************************************************************
82  *
83  * @brief Perform the operation C = alpha * op(A) * op(B) + beta C, with A and C
84  * full-rank and B low-rank.
85  *
86  *******************************************************************************
87  *
88  * @param[inout] params
89  * The LRMM structure that stores all the parameters used in the LRMM
90  * functions family.
91  * On exit, the C matrix contains the product AB aligned with its own
92  * dimensions.
93  * @sa core_dlrmm_t
94  *
95  *******************************************************************************
96  *
97  * @return The number of flops required to perform the operation.
98  *
99  *******************************************************************************/
100 pastix_fixdbl_t
102 {
103  PASTE_CORE_DLRMM_PARAMS( params );
104  double *Cptr;
105  pastix_int_t ldau, ldbu, ldbv, ldcu;
106  pastix_fixdbl_t flops1 = FLOPS_DGEMM( M, B->rk, K ) + FLOPS_DGEMM( M, N, B->rk );
107  pastix_fixdbl_t flops2 = FLOPS_DGEMM( K, N, B->rk ) + FLOPS_DGEMM( M, N, K );
108  pastix_fixdbl_t flops;
109  int allocated = 0;
110 
111  ldau = (transA == PastixNoTrans) ? M : K;
112  ldbu = (transB == PastixNoTrans) ? K : N;
113  ldbv = ( B->rk == -1 ) ? -1 : B->rkmax;
114 
115  ldcu = Cm;
116  Cptr = C->u;
117  Cptr += ldcu * offy + offx;
118 
119  /*
120  * A(M-by-K) * B( N-by-rb x rb-by-K )^t
121  */
122  if ( flops1 <= flops2 ) {
123  if ( (work = core_dlrmm_getws( params, M * B->rk )) == NULL ) {
124  work = malloc( M * B->rk * sizeof(double) );
125  allocated = 1;
126  }
127 
128  /*
129  * (A * Bv) * Bu^t
130  */
131  cblas_dgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
132  M, B->rk, K,
133  (done), A->u, ldau,
134  B->v, ldbv,
135  (dzero), work, M );
136 
137  pastix_atomic_lock( lock );
138  assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
139  cblas_dgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
140  M, N, B->rk,
141  (alpha), work, M,
142  B->u, ldbu,
143  (beta), Cptr, ldcu );
144  flops = flops1;
145  pastix_atomic_unlock( lock );
146  }
147  else {
148  if ( (work = core_dlrmm_getws( params, K * N )) == NULL ) {
149  work = malloc( K * N * sizeof(double) );
150  allocated = 1;
151  }
152 
153  /*
154  * A * (Bu * Bv^t)^t
155  */
156  cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
157  K, N, B->rk,
158  (done), B->u, ldbu,
159  B->v, ldbv,
160  (dzero), work, K );
161 
162  pastix_atomic_lock( lock );
163  assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
164  cblas_dgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
165  M, N, K,
166  (alpha), A->u, ldau,
167  work, K,
168  (beta), Cptr, ldcu );
169 
170  flops = flops2;
171  pastix_atomic_unlock( lock );
172  }
173 
174  if ( allocated ) {
175  free( work );
176  }
178  return flops;
179 }
180 
181 /**
182  *******************************************************************************
183  *
184  * @brief Perform the operation C = alpha * op(A) * op(B) + beta C, with B and C
185  * full-rank and A low-rank.
186  *
187  *******************************************************************************
188  *
189  * @param[inout] params
190  * The LRMM structure that stores all the parameters used in the LRMM
191  * functions family.
192  * On exit, the C matrix contains the product AB aligned with its own
193  * dimensions.
194  * @sa core_dlrmm_t
195  *
196  *******************************************************************************
197  *
198  * @return The number of flops required to perform the operation.
199  *
200  *******************************************************************************/
201 pastix_fixdbl_t
203 {
204  PASTE_CORE_DLRMM_PARAMS( params );
205  double *Cptr;
206  pastix_int_t ldau, ldav, ldbu, ldcu;
207  pastix_fixdbl_t flops1 = FLOPS_DGEMM( A->rk, N, K ) + FLOPS_DGEMM( M, N, A->rk );
208  pastix_fixdbl_t flops2 = FLOPS_DGEMM( M, K, A->rk ) + FLOPS_DGEMM( M, N, K );
209  pastix_fixdbl_t flops;
210  int allocated = 0;
211 
212  ldau = (transA == PastixNoTrans) ? M : K;
213  ldav = ( A->rk == -1 ) ? -1 : A->rkmax;
214  ldbu = (transB == PastixNoTrans) ? K : N;
215 
216  ldcu = Cm;
217  Cptr = C->u;
218  Cptr += ldcu * offy + offx;
219 
220  /*
221  * A( M-by-ra x ra-by-K ) * B(N-by-K)^t
222  */
223  if ( flops1 <= flops2 ) {
224  if ( (work = core_dlrmm_getws( params, A->rk * N )) == NULL ) {
225  work = malloc( A->rk * N * sizeof(double) );
226  allocated = 1;
227  }
228 
229  /*
230  * Au * (Av^t * B^t)
231  */
232  cblas_dgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
233  A->rk, N, K,
234  (done), A->v, ldav,
235  B->u, ldbu,
236  (dzero), work, A->rk );
237 
238  pastix_atomic_lock( lock );
239  assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
240  cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
241  M, N, A->rk,
242  (alpha), A->u, ldau,
243  work, A->rk,
244  (beta), Cptr, ldcu );
245 
246  flops = flops1;
247  pastix_atomic_unlock( lock );
248  }
249  else {
250  if ( (work = core_dlrmm_getws( params, M * K )) == NULL ) {
251  work = malloc( M * K * sizeof(double) );
252  allocated = 1;
253  }
254 
255  /*
256  * (Au * Av^t) * B^t
257  */
258  cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
259  M, K, A->rk,
260  (done), A->u, ldau,
261  A->v, ldav,
262  (dzero), work, M );
263 
264  pastix_atomic_lock( lock );
265  assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
266  cblas_dgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
267  M, N, K,
268  (alpha), work, M,
269  B->u, ldbu,
270  (beta), Cptr, ldcu );
271 
272  flops = flops2;
273  pastix_atomic_unlock( lock );
274  }
275 
276  if ( allocated ) {
277  free( work );
278  }
280  return flops;
281 }
282 
283 /**
284  *******************************************************************************
285  *
286  * @brief Perform the operation C = alpha * op(A) * op(B) + beta C, with A and B
287  * low-rank and C full-rank.
288  *
289  *******************************************************************************
290  *
291  * @param[inout] params
292  * The LRMM structure that stores all the parameters used in the LRMM
293  * functions family.
294  * On exit, the C matrix contains the product AB aligned with its own
295  * dimensions.
296  * @sa core_dlrmm_t
297  *
298  *******************************************************************************
299  *
300  * @return The number of flops required to perform the operation.
301  *
302  *******************************************************************************/
303 pastix_fixdbl_t
305 {
306  PASTE_CORE_DLRMM_PARAMS( params );
307  double *Cptr;
308  pastix_int_t ldcu;
309  pastix_lrblock_t AB;
311  int infomask = 0;
312  pastix_fixdbl_t flops;
313 
314  ldcu = Cm;
315  Cptr = C->u;
316  Cptr += ldcu * offy + offx;
317 
318  flops = core_dlrlr2lr( params, &AB, &infomask );
319  assert( AB.rk != -1 );
320  assert( AB.rkmax != -1 );
321 
322  if ( infomask & PASTIX_LRM3_TRANSB ) {
323  trans = transB;
324  }
325 
326  if ( AB.rk > 0 ) {
327  pastix_int_t ldabv = (trans == PastixNoTrans) ? AB.rkmax : N;
328 
329  pastix_atomic_lock( lock );
330  assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
331 
332  cblas_dgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
333  M, N, AB.rk,
334  (alpha), AB.u, M,
335  AB.v, ldabv,
336  (beta), Cptr, ldcu );
337  flops = FLOPS_DGEMM( M, N, AB.rk );
338  pastix_atomic_unlock( lock );
339  }
340 
341  /* Free memory from zlrm3 */
342  if ( infomask & PASTIX_LRM3_ALLOCU ) {
343  free(AB.u);
344  }
345  if ( infomask & PASTIX_LRM3_ALLOCV ) {
346  free(AB.v);
347  }
348 
350  return flops;
351 }
PASTE_CORE_DLRMM_PARAMS
#define PASTE_CORE_DLRMM_PARAMS(_a_)
Initialize all the parameters of the core_dlrmm family functions to ease the access.
Definition: pastix_dlrcores.h:105
PASTIX_LRM3_ALLOCU
#define PASTIX_LRM3_ALLOCU
Macro to specify if the U part of a low-rank matrix has been allocated and need to be freed or not (U...
Definition: pastix_lowrank.h:45
pastix_lrblock_s::v
void * v
Definition: pastix_lowrank.h:116
pastix_lrblock_s::u
void * u
Definition: pastix_lowrank.h:115
core_dfrlr2fr
pastix_fixdbl_t core_dfrlr2fr(core_dlrmm_t *params)
Perform the operation C = alpha * op(A) * op(B) + beta C, with A and C full-rank and B low-rank.
Definition: core_dxx2fr.c:101
pastix_trans_t
enum pastix_trans_e pastix_trans_t
Transpostion.
pastix_lrblock_s
The block low-rank structure to hold a matrix in low-rank form.
Definition: pastix_lowrank.h:112
PastixNoTrans
@ PastixNoTrans
Definition: api.h:424
core_dfrfr2fr
pastix_fixdbl_t core_dfrfr2fr(core_dlrmm_t *params)
Perform the full-rank operation C = alpha * op(A) * op(B) + beta C.
Definition: core_dxx2fr.c:48
PASTE_CORE_DLRMM_VOID
#define PASTE_CORE_DLRMM_VOID
Void all the parameters of the core_dlrmm family functions to silent warnings.
Definition: pastix_dlrcores.h:128
core_dlrmm_s
Structure to store all the parameters of the core_dlrmm family functions.
Definition: pastix_dlrcores.h:80
core_dlrlr2lr
pastix_fixdbl_t core_dlrlr2lr(core_dlrmm_t *params, pastix_lrblock_t *AB, int *infomask)
Perform the operation AB = op(A) * op(B), with A, B, and AB low-rank.
Definition: core_dxx2lr.c:442
PASTIX_LRM3_TRANSB
#define PASTIX_LRM3_TRANSB
Macro to specify if the the operator on B, still needs to be applied to the V part of the low-rank ma...
Definition: pastix_lowrank.h:53
core_dlrfr2fr
pastix_fixdbl_t core_dlrfr2fr(core_dlrmm_t *params)
Perform the operation C = alpha * op(A) * op(B) + beta C, with B and C full-rank and A low-rank.
Definition: core_dxx2fr.c:202
pastix_lrblock_s::rk
int rk
Definition: pastix_lowrank.h:113
core_dlrlr2fr
pastix_fixdbl_t core_dlrlr2fr(core_dlrmm_t *params)
Perform the operation C = alpha * op(A) * op(B) + beta C, with A and B low-rank and C full-rank.
Definition: core_dxx2fr.c:304
pastix_lrblock_s::rkmax
int rkmax
Definition: pastix_lowrank.h:114
pastix_dlrcores.h
core_dlrmm_getws
static double * core_dlrmm_getws(core_dlrmm_t *params, ssize_t newsize)
Function to get a workspace pointer if space is available in the one provided.
Definition: pastix_dlrcores.h:155
PASTIX_LRM3_ALLOCV
#define PASTIX_LRM3_ALLOCV
Macro to specify if the V part of a low-rank matrix has been allocated and need to be freed or not (U...
Definition: pastix_lowrank.h:49