PaStiX Handbook  6.4.0
core_cscalo.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_cscalo.c
4  *
5  * PaStiX kernel routines
6  *
7  * @copyright 2010-2015 Univ. of Tennessee, Univ. of California Berkeley and
8  * Univ. of Colorado Denver. All rights reserved.
9  * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
10  * Univ. Bordeaux. All rights reserved.
11  *
12  * @version 6.4.0
13  * @author Mathieu Faverge
14  * @author Nolan Bredel
15  * @date 2024-07-05
16  * @generated from /builds/solverstack/pastix/kernels/core_zscalo.c, normal z -> c, Thu Aug 29 14:20:17 2024
17  *
18  **/
19 #include "common.h"
20 #include "blend/solver.h"
21 #include "pastix_ccores.h"
22 #include "cblas.h"
23 #include "kernels_trace.h"
24 
25 /**
26  ******************************************************************************
27  *
28  * @brief Scale a matrix by a diagonal out of place
29  *
30  * Perform the operation: B <- op(A) * D, where A is a general matrix, and D a
31  * diagonal matrix.
32  *
33  *******************************************************************************
34  *
35  * @param[in] trans
36  * @arg PastixNoTrans: No transpose, op( A ) = A;
37  * @arg PastixTrans: Transpose, op( A ) = A;
38  * @arg PastixConjTrans: Conjugate Transpose, op( A ) = conjf(A).
39  *
40  * @param[in] M
41  * Number of rows of the matrix B.
42  * Number of rows of the matrix A.
43  *
44  * @param[in] N
45  * Number of columns of the matrix B.
46  * Number of columns of the matrix A.
47  *
48  * @param[in] A
49  * Matrix of size lda-by-N.
50  *
51  * @param[in] lda
52  * Leading dimension of the array A. lda >= max(1,M).
53  *
54  * @param[in] D
55  * Diagonal matrix of size ldd-by-N.
56  *
57  * @param[in] ldd
58  * Leading dimension of the array D. ldd >= 1.
59  *
60  * @param[inout] B
61  * Matrix of size LDB-by-N.
62  *
63  * @param[in] ldb
64  * Leading dimension of the array B. ldb >= max(1,M)
65  *
66  *******************************************************************************
67  *
68  * @retval PASTIX_SUCCESS successful exit
69  * @retval <0 if -i, the i-th argument had an illegal value
70  * @retval 1, not yet implemented
71  *
72  ******************************************************************************/
73 int
75  pastix_int_t M,
76  pastix_int_t N,
77  const pastix_complex32_t *A,
78  pastix_int_t lda,
79  const pastix_complex32_t *D,
80  pastix_int_t ldd,
82  pastix_int_t ldb )
83 {
84  pastix_complex32_t alpha;
85  pastix_int_t i, j;
86 
87 #if !defined(NDEBUG)
88  if ((trans < PastixNoTrans) ||
89  (trans > PastixConjTrans))
90  {
91  return -1;
92  }
93 
94  if (M < 0) {
95  return -2;
96  }
97  if (N < 0) {
98  return -3;
99  }
100  if ( lda < pastix_imax(1,M) )
101  {
102  return -5;
103  }
104  if ( ldd < 1 )
105  {
106  return -7;
107  }
108  if ( ldb < pastix_imax(1,M) ) {
109  return -9;
110  }
111 #endif
112 
113 #if defined(PRECISION_z) || defined(PRECISION_c)
114  if (trans == PastixConjTrans) {
115  for( j=0; j<N; j++, D += ldd ) {
116  alpha = *D;
117  for( i=0; i<M; i++, B++, A++ ) {
118  *B = conjf(*A) * alpha;
119  }
120  A += lda - M;
121  B += ldb - M;
122  }
123  }
124  else
125 #endif
126  {
127  for( j=0; j<N; j++, D += ldd ) {
128  alpha = *D;
129  for( i=0; i<M; i++, B++, A++ ) {
130  *B = (*A) * alpha;
131  }
132  A += lda - M;
133  B += ldb - M;
134  }
135  }
136 
137  (void)trans;
138  return PASTIX_SUCCESS;
139 }
140 
141 /**
142  *******************************************************************************
143  *
144  * @brief Copy the L term with scaling for the two-terms algorithm
145  *
146  * Performs LD = op(L) * D
147  *
148  *******************************************************************************
149  *
150  * @param[in] trans
151  * @arg PastixNoTrans: No transpose, op( L ) = L;
152  * @arg PastixTrans: Transpose, op( L ) = L;
153  * @arg PastixConjTrans: Conjugate Transpose, op( L ) = conjf(L).
154  *
155  * @param[in] cblk
156  * Pointer to the structure representing the panel to factorize in the
157  * cblktab array. Next column blok must be accessible through cblk[1].
158  *
159  * @param[inout] dataL
160  * The pointer to the correct representation of lower part of the data.
161  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
162  * - pastix_lr_block if the block is compressed.
163  *
164  * @param[inout] dataLD
165  * The pointer to the correct representation of LD.
166  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
167  * - pastix_lr_block if the block is compressed.
168  *
169  *******************************************************************************/
170 void
172  const SolverCblk *cblk,
173  void *dataL,
174  void *dataLD )
175 {
176  const SolverBlok *blok, *lblk;
177  pastix_int_t M, N;
178  pastix_lrblock_t *lrL, *lrLD;
179  pastix_fixdbl_t time;
180  pastix_complex32_t *LD;
181 
183 
184  N = cblk_colnbr( cblk );
185 
186  blok = cblk->fblokptr + 1; /* Firt off-diagonal block */
187  lblk = cblk[1].fblokptr; /* Next diagonal block */
188 
189  /* if there are off-diagonal supernodes in the column */
190  if ( blok < lblk )
191  {
192  const pastix_complex32_t *L;
193  const pastix_complex32_t *D;
194  pastix_int_t ldl, ldd, ldld;
195 
196  if ( cblk->cblktype & CBLK_COMPRESSED ) {
197  lrL = (pastix_lrblock_t *)dataL;
198  lrLD = (pastix_lrblock_t *)dataLD;
199  D = lrL->u;
200  ldd = N+1;
201 
202  lrL++; lrLD++;
203  for(; blok < lblk; blok++, lrL++, lrLD++) {
204  M = blok_rownbr( blok );
205 
206  assert( lrLD->rk == -1 );
207 
208  /* Copy L in LD */
209  lrLD->rk = lrL->rk;
210  lrLD->rkmax = lrL->rkmax;
211 
212  if ( lrL->rk == -1 ) {
213  assert( M == lrL->rkmax );
214 
215  /* Initialize the workspace */
216  memcpy( lrLD->u, lrL->u, lrL->rkmax * N * sizeof(pastix_complex32_t) );
217  lrLD->v = NULL;
218 
219  L = lrL->u;
220  LD = lrLD->u;
221  }
222  else {
223  /*
224  * Initialize the workspace
225  */
226  memcpy( lrLD->u, lrL->u, M * lrL->rk * sizeof(pastix_complex32_t) );
227  lrLD->v = ((pastix_complex32_t *)lrLD->u) + M * lrL->rk;
228  memcpy( lrLD->v, lrL->v, N * lrL->rkmax * sizeof(pastix_complex32_t) );
229 
230  L = lrL->v;
231  LD = lrLD->v;
232  M = lrLD->rkmax;
233  }
234 
235  ldl = M;
236  ldld = M;
237 
238  /* Compute LD = L * D */
239  core_cscalo( trans, M, N,
240  L, ldl, D, ldd,
241  LD, ldld );
242  }
243  }
244  else if ( cblk->cblktype & CBLK_LAYOUT_2D ) {
245  L = D = (pastix_complex32_t *)dataL;
246  LD = (pastix_complex32_t *)dataLD;
247  ldd = N+1;
248 
249  for(; blok < lblk; blok++) {
250  M = blok_rownbr( blok );
251 
252  /* Compute LD = L * D */
253  core_cscalo( trans, M, N,
254  L + blok->coefind, M, D, ldd,
255  LD + blok->coefind, M );
256  }
257  }
258  else {
259  L = D = (pastix_complex32_t *)dataL;
260  LD = (pastix_complex32_t *)dataLD;
261  ldl = cblk->stride;
262  ldd = cblk->stride+1;
263 
264  M = cblk->stride - N;
265  LD = LD + blok->coefind;
266  ldld = cblk->stride;
267 
268  core_cscalo( trans, M, N, L + blok->coefind, ldl, D, ldd, LD, ldld );
269  }
270  }
271 
272  M = cblk->stride - N;
273  kernel_trace_stop( cblk->fblokptr->inlast, PastixKernelSCALOCblk, M, N, 0, (pastix_fixdbl_t)(M*N), time );
274 }
275 
276 /**
277  *******************************************************************************
278  *
279  * @brief Copy the lower terms of the block with scaling for the two-terms
280  * algorithm.
281  *
282  * Performs B = op(A) * D
283  *
284  *******************************************************************************
285  *
286  * @param[in] trans
287  * @arg PastixNoTrans: No transpose, op( A ) = A;
288  * @arg PastixTrans: Transpose, op( A ) = A;
289  * @arg PastixConjTrans: Conjugate Transpose, op( A ) = conjf(A).
290  *
291  * @param[in] cblk
292  * Pointer to the structure representing the panel to factorize in the
293  * cblktab array. Next column blok must be accessible through cblk[1].
294  *
295  * @param[in] blok_m
296  * Index of the off-diagonal block to be solved in the cblk. All blocks
297  * facing the same cblk, in the current column block will be solved.
298  *
299  * @param[in] dataA
300  * The pointer to the correct representation of data of A.
301  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
302  * - pastix_lr_block if the block is compressed.
303  *
304  * @param[in] dataD
305  * The pointer to the correct representation of data of D.
306  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
307  * - pastix_lr_block if the block is compressed.
308  *
309  * @param[inout] dataB
310  * The pointer to the correct representation of data of B.
311  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
312  * - pastix_lr_block if the block is compressed.
313  *
314  *******************************************************************************/
315 void
317  const SolverCblk *cblk,
318  pastix_int_t blok_m,
319  const void *dataA,
320  const void *dataD,
321  void *dataB )
322 {
323  const SolverBlok *fblok, *lblok, *blok;
324  pastix_int_t M, N, ldd, offset, cblk_m;
325  const pastix_complex32_t *lA;
326  pastix_lrblock_t *lrD, *lrB, *lrA;
327  pastix_complex32_t *D, *B, *A;
328  pastix_complex32_t *lB;
329 
330  N = cblk_colnbr( cblk );
331  fblok = cblk[0].fblokptr; /* The diagonal block */
332  lblok = cblk[1].fblokptr; /* The diagonal block of the next cblk */
333  ldd = blok_rownbr( fblok ) + 1;
334 
335  assert( blok_rownbr(fblok) == N );
336  assert( cblk->cblktype & CBLK_LAYOUT_2D );
337 
338  blok = fblok + blok_m;
339  offset = blok->coefind;
340  cblk_m = blok->fcblknm;
341 
342  if ( cblk->cblktype & CBLK_COMPRESSED ) {
343  lrA = (pastix_lrblock_t *)dataA;
344  lrD = (pastix_lrblock_t *)dataD;
345  lrB = (pastix_lrblock_t *)dataB;
346  D = lrD->u;
347  for (; (blok < lblok) && (blok->fcblknm == cblk_m); blok++, lrA++, lrB++) {
348  M = blok_rownbr( blok );
349 
350  /* Copy A in B */
351  lrB->rk = lrA->rk;
352  lrB->rkmax = lrA->rkmax;
353 
354  if ( lrB->rk == -1 ) {
355  assert( M == lrA->rkmax );
356  assert( NULL == lrA->v );
357 
358  /* Initialize the workspace */
359  memcpy( lrB->u, lrA->u, lrA->rkmax * N * sizeof(pastix_complex32_t) );
360  lrB->v = NULL;
361 
362  lA = lrA->u;
363  lB = lrB->u;
364  }
365  else {
366  /*
367  * Initialize the workspace
368  */
369  memcpy( lrB->u, lrA->u, M * lrA->rk * sizeof(pastix_complex32_t) );
370  lrB->v = ((pastix_complex32_t *)lrB->u) + M * lrA->rk;
371  memcpy( lrB->v, lrA->v, N * lrA->rkmax * sizeof(pastix_complex32_t) );
372 
373  lA = lrA->v;
374  lB = lrB->v;
375  M = lrA->rkmax;
376  }
377 
378  /* Compute B = op(A) * D */
379  core_cscalo( trans, M, N,
380  lA, M, D, ldd, lB, M );
381  }
382  }
383  else {
384  A = (pastix_complex32_t *)dataA;
385  D = (pastix_complex32_t *)dataD;
386  B = (pastix_complex32_t *)dataB;
387 
388  for (; (blok < lblok) && (blok->fcblknm == cblk_m); blok++) {
389  lA = A + blok->coefind - offset;
390  lB = B + blok->coefind - offset;
391  M = blok_rownbr(blok);
392 
393  /* Compute B = op(A) * D */
394  core_cscalo( trans, M, N,
395  lA, M, D, ldd, lB, M );
396  }
397  }
398 }
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
float _Complex pastix_complex32_t
Definition: datatypes.h:76
double pastix_fixdbl_t
Definition: datatypes.h:65
static void kernel_trace_stop(int8_t inlast, pastix_ktype_t ktype, int m, int n, int k, double flops, double starttime)
Stop the trace of a single kernel.
static double kernel_trace_start(pastix_ktype_t ktype)
Start the trace of a single kernel.
Definition: kernels_trace.h:87
@ PastixKernelSCALOCblk
Definition: kernels_enums.h:53
int core_cscalo(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, const pastix_complex32_t *A, pastix_int_t lda, const pastix_complex32_t *D, pastix_int_t ldd, pastix_complex32_t *B, pastix_int_t ldb)
Scale a matrix by a diagonal out of place.
Definition: core_cscalo.c:74
void cpucblk_cscalo(pastix_trans_t trans, const SolverCblk *cblk, void *dataL, void *dataLD)
Copy the L term with scaling for the two-terms algorithm.
Definition: core_cscalo.c:171
void cpublok_cscalo(pastix_trans_t trans, const SolverCblk *cblk, pastix_int_t blok_m, const void *dataA, const void *dataD, void *dataB)
Copy the lower terms of the block with scaling for the two-terms algorithm.
Definition: core_cscalo.c:316
The block low-rank structure to hold a matrix in low-rank form.
enum pastix_trans_e pastix_trans_t
Transpostion.
@ PastixConjTrans
Definition: api.h:447
@ PastixNoTrans
Definition: api.h:445
@ PASTIX_SUCCESS
Definition: api.h:367
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition: solver.h:395
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition: solver.h:329
pastix_int_t fcblknm
Definition: solver.h:144
pastix_int_t coefind
Definition: solver.h:149
SolverBlok * fblokptr
Definition: solver.h:168
int8_t inlast
Definition: solver.h:151
pastix_int_t stride
Definition: solver.h:169
int8_t cblktype
Definition: solver.h:164
Solver block structure.
Definition: solver.h:141
Solver column block structure.
Definition: solver.h:161