PaStiX Handbook  6.2.1
core_cgemmsp.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_cgemmsp.c
4  *
5  * PaStiX kernel routines operating on the solver structure.
6  *
7  * @copyright 2011-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.2.1
11  * @author Mathieu Faverge
12  * @author Pierre Ramet
13  * @author Xavier Lacoste
14  * @author Gregoire Pichon
15  * @author Nolan Bredel
16  * @date 2021-06-18
17  * @generated from /builds/solverstack/pastix/kernels/core_zgemmsp.c, normal z -> c, Tue Apr 12 09:38:36 2022
18  *
19  **/
20 #include "common.h"
21 #include "cblas.h"
22 #include "blend/solver.h"
23 #include "kernels_trace.h"
24 #include "pastix_ccores.h"
25 #include "pastix_clrcores.h"
26 
27 #ifndef DOXYGEN_SHOULD_SKIP_THIS
28 static pastix_complex32_t mcone = -1.0;
29 static pastix_complex32_t cone = 1.0;
30 static pastix_complex32_t czero = 0.0;
31 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
32 
33 /**
34  *******************************************************************************
35  *
36  * @ingroup kernel_fact_null
37  *
38  * @brief Compute the updates that are generated by the transposition of one
39  * single off-diagonal block.
40  *
41  * Both cblk involved in the computation are stored with the 1D storage: Column
42  * Major Layout with blocks interleaved.
43  *
44  * All the off-diagonal block below block are multiplied by the selected block
45  * and added to the facing cblk.
46  *
47  *******************************************************************************
48  *
49  * @param[in] sideA
50  * Specify if A and C belong to the lower part, or to the upper part.
51  * If sideA == PastixLCoef, the contribution of:
52  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
53  * C, otherwise the contribution:
54  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
55  * to C.
56  * The pointer to the data structure that describes the panel from
57  * which we compute the contributions. Next column blok must be
58  * accessible through cblk[1].
59  *
60  * @param[in] trans
61  * Specify the transposition used for the B matrix. It has to be either
62  * PastixTrans or PastixConjTrans.
63  *
64  * @param[in] cblk
65  * The cblk structure to which block belongs to. The A and B pointers
66  * must be the coeftab of this column block.
67  * Next column blok must be accessible through cblk[1].
68  *
69  * @param[in] blok
70  * The block from which we compute the contributions.
71  *
72  * @param[inout] fcblk
73  * The pointer to the data structure that describes the panel on which
74  * we compute the contributions. The C pointer must be one of the
75  * coeftab from this fcblk. Next column blok must be accessible through
76  * fcblk[1].
77  *
78  * @param[in] A
79  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
80  * coefficients of the panel when the Lower part is computed,
81  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
82  *
83  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
84  * the coefficients of the panel, if Symmetric/Hermitian cases or if
85  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
86  * cblk.stride -by- cblk.width
87  *
88  * @param[inout] C
89  * The pointer to the fcblk.lcoeftab if the lower part is computed,
90  * fcblk.ucoeftab otherwise.
91  *
92  * @param[in] work
93  * Temporary memory buffer that is at least equal to the height of the
94  * block B by the sum of the height of all the blocks below the block
95  * B.
96  *
97  *******************************************************************************
98  *
99  * @sa core_cgemmsp_1d2d
100  * @sa core_cgemmsp_2d2d
101  *
102  *******************************************************************************/
103 static inline void
105  const SolverCblk *cblk,
106  const SolverBlok *blok,
107  SolverCblk *fcblk,
108  const pastix_complex32_t *A,
109  const pastix_complex32_t *B,
110  pastix_complex32_t *C,
111  pastix_complex32_t *work )
112 {
113  const SolverBlok *iterblok;
114  const SolverBlok *fblok;
115  const SolverBlok *lblok;
116 
117  pastix_complex32_t *tmpC;
118  pastix_complex32_t *wtmp;
119  pastix_int_t stride, stridef, indblok;
120  pastix_int_t M, N, K, m;
121  int shift;
122 
123  /* Both cblk and fcblk are stored in 1D */
124  assert(!(cblk->cblktype & CBLK_LAYOUT_2D));
125  assert(!(fcblk->cblktype & CBLK_LAYOUT_2D));
126 
127  assert( work != NULL );
128 
129  shift = (sideA == PastixUCoef) ? 1 : 0;
130 
131  stride = cblk->stride;
132  stridef = fcblk->stride;
133  K = cblk_colnbr( cblk );
134 
135  /* First blok */
136  indblok = blok->coefind;
137 
138  N = blok_rownbr( blok );
139  M = stride - indblok - (shift * N);
140 
141  /* Matrix A = Aik */
142  A = A + indblok + (shift * N);
143  B = B + indblok;
144 
145  /*
146  * Compute update A * B'
147  */
148  wtmp = work;
149  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
150  cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
151  M, N, K,
152  CBLAS_SADDR(cone), A, stride,
153  B, stride,
154  CBLAS_SADDR(czero), wtmp, M );
155  kernel_trace_stop_lvl2( FLOPS_CGEMM( M, N, K ) );
156 
157  /*
158  * Add contribution to C in fcblk
159  */
160 
161  /* Get the first block of the distant panel */
162  fblok = fcblk->fblokptr;
163 
164  /* Move the pointer to the top of the right column */
165  C = C + (blok->frownum - fcblk->fcolnum) * stridef;
166 
167  lblok = cblk[1].fblokptr;
168 
169  /* for all following blocks in block column */
170  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
171 
172  /* Find facing blok */
173  while (!is_block_inside_fblock( iterblok, fblok ))
174  {
175  fblok++;
176  assert( fblok < fcblk[1].fblokptr );
177  }
178 
179  tmpC = C + fblok->coefind + iterblok->frownum - fblok->frownum;
180  m = blok_rownbr( iterblok );
181 
182  pastix_cblk_lock( fcblk );
183  core_cgeadd( PastixNoTrans, m, N,
184  -1.0, wtmp, M,
185  1.0, tmpC, stridef );
186  pastix_cblk_unlock( fcblk );
187 
188  /* Displacement to next block */
189  wtmp += m;
190  }
191 }
192 
193 /**
194  *******************************************************************************
195  *
196  * @ingroup kernel_fact_null
197  *
198  * @brief Compute the updates that are generated by the transposition of one
199  * single off-diagonal block.
200  *
201  * The cblk involved in the matrices A and B are stored with the 1D storage:
202  * Column Major Layout with blocks interleaved. The facing cblk of the atrix C,
203  * is stored with the 2D storage where each block is stored continuously one
204  * after another. (Similar to dense tile storage with variant tile size)
205  *
206  * All the off-diagonal block below block are multiplied by the selected block
207  * and added to the facing cblk.
208  *
209  *******************************************************************************
210  *
211  * @param[in] sideA
212  * Specify if A and C belong to the lower part, or to the upper part.
213  * If sideA == PastixLCoef, the contribution of:
214  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
215  * C, otherwise the contribution:
216  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
217  * to C.
218  * The pointer to the data structure that describes the panel from
219  * which we compute the contributions. Next column blok must be
220  * accessible through cblk[1].
221  *
222  * @param[in] trans
223  * Specify the transposition used for the B matrix. It has to be either
224  * PastixTrans or PastixConjTrans.
225  *
226  * @param[in] cblk
227  * The cblk structure to which block belongs to. The A and B pointers
228  * must be the coeftab of this column block.
229  * Next column blok must be accessible through cblk[1].
230  *
231  * @param[in] blok
232  * The block from which we compute the contributions.
233  *
234  * @param[inout] fcblk
235  * The pointer to the data structure that describes the panel on which
236  * we compute the contributions. The C pointer must be one of the
237  * coeftab from this fcblk. Next column blok must be accessible through
238  * fcblk[1].
239  *
240  * @param[in] A
241  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
242  * coefficients of the panel when the Lower part is computed,
243  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
244  *
245  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
246  * the coefficients of the panel, if Symmetric/Hermitian cases or if
247  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
248  * cblk.stride -by- cblk.width
249  *
250  * @param[inout] C
251  * The pointer to the fcblk.lcoeftab if the lower part is computed,
252  * fcblk.ucoeftab otherwise.
253  *
254  *******************************************************************************
255  *
256  * @sa core_cgemmsp_1d1d
257  * @sa core_cgemmsp_2d2d
258  *
259  *******************************************************************************/
260 static inline void
262  const SolverCblk *cblk,
263  const SolverBlok *blok,
264  SolverCblk *fcblk,
265  const pastix_complex32_t *A,
266  const pastix_complex32_t *B,
267  pastix_complex32_t *C )
268 {
269  const SolverBlok *iterblok;
270  const SolverBlok *fblok;
271  const SolverBlok *lblok;
272  const pastix_complex32_t *blokA;
273  const pastix_complex32_t *blokB;
274  pastix_complex32_t *blokC;
275 
276  pastix_int_t stride, stridef;
277  pastix_int_t M, N, K;
278  int shift;
279 
280  /* cblk is stored in 1D and fcblk in 2D */
281  assert(!(cblk->cblktype & CBLK_LAYOUT_2D));
282  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
283 
284  shift = (sideA == PastixUCoef) ? 1 : 0;
285  stride = cblk->stride;
286 
287  /* Get the B block and its dimensions */
288  blokB = B + blok->coefind;
289 
290  stride = cblk->stride;
291  K = cblk_colnbr( cblk );
292  N = blok_rownbr( blok );
293 
294  /**
295  * Add contribution to C in fcblk:
296  * Get the first facing block of the distant panel, and the last block of
297  * the current cblk
298  */
299  fblok = fcblk->fblokptr;
300  lblok = cblk[1].fblokptr;
301 
302  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
303 
304  /* Find facing blok */
305  while (!is_block_inside_fblock( iterblok, fblok ))
306  {
307  fblok++;
308  assert( fblok < fcblk[1].fblokptr );
309  }
310 
311  stridef = blok_rownbr(fblok);
312 
313  /* Get the A block and its dimensions */
314  blokA = A + iterblok->coefind;
315  M = blok_rownbr( iterblok );
316 
317  blokC = C + fblok->coefind
318  + iterblok->frownum - fblok->frownum
319  + (blok->frownum - fcblk->fcolnum) * stridef;
320 
321  pastix_cblk_lock( fcblk );
322  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
323  cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
324  M, N, K,
325  CBLAS_SADDR(mcone), blokA, stride,
326  blokB, stride,
327  CBLAS_SADDR(cone), blokC, stridef );
328  kernel_trace_stop_lvl2( FLOPS_CGEMM( M, N, K ) );
329  pastix_cblk_unlock( fcblk );
330  }
331 }
332 
333 /**
334  *******************************************************************************
335  *
336  * @ingroup kernel_fact_null
337  *
338  * @brief Compute the updates that are generated by the transposition of one
339  * single off-diagonal block.
340  *
341  * Both cblk involved in the matrices A, B and C are stored with the 2D storage
342  * where each block is stored continuously one after another. (Similar to dense
343  * tile storage with variant tile size)
344  *
345  * All the off-diagonal block below block are multiplied by the selected block
346  * and added to the facing cblk.
347  *
348  *******************************************************************************
349  *
350  * @param[in] sideA
351  * Specify if A and C belong to the lower part, or to the upper part.
352  * If sideA == PastixLCoef, the contribution of:
353  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
354  * C, otherwise the contribution:
355  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
356  * to C.
357  * The pointer to the data structure that describes the panel from
358  * which we compute the contributions. Next column blok must be
359  * accessible through cblk[1].
360  *
361  * @param[in] trans
362  * Specify the transposition used for the B matrix. It has to be either
363  * PastixTrans or PastixConjTrans.
364  *
365  * @param[in] cblk
366  * The cblk structure to which block belongs to. The A and B pointers
367  * must be the coeftab of this column block.
368  * Next column blok must be accessible through cblk[1].
369  *
370  * @param[in] blok
371  * The block from which we compute the contributions.
372  *
373  * @param[inout] fcblk
374  * The pointer to the data structure that describes the panel on which
375  * we compute the contributions. The C pointer must be one of the
376  * coeftab from this fcblk. Next column blok must be accessible through
377  * fcblk[1].
378  *
379  * @param[in] A
380  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
381  * coefficients of the panel when the Lower part is computed,
382  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
383  *
384  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
385  * the coefficients of the panel, if Symmetric/Hermitian cases or if
386  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
387  * cblk.stride -by- cblk.width
388  *
389  * @param[inout] C
390  * The pointer to the fcblk.lcoeftab if the lower part is computed,
391  * fcblk.ucoeftab otherwise.
392  *
393  *******************************************************************************
394  *
395  * @sa core_cgemmsp_1d1d
396  * @sa core_cgemmsp_1d2d
397  *
398  *******************************************************************************/
399 static inline void
401  const SolverCblk *cblk,
402  const SolverBlok *blok,
403  SolverCblk *fcblk,
404  const pastix_complex32_t *A,
405  const pastix_complex32_t *B,
406  pastix_complex32_t *C )
407 {
408  const SolverBlok *iterblok;
409  const SolverBlok *fblok;
410  const SolverBlok *lblok;
411  const pastix_complex32_t *blokA;
412  const pastix_complex32_t *blokB;
413  pastix_complex32_t *blokC;
414 
415  pastix_int_t M, N, K, lda, ldb, ldc;
416  int shift;
417 
418  /* Both cblk and fcblk must be stored in 2D */
419  assert( cblk->cblktype & CBLK_LAYOUT_2D );
420  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
421 
422  shift = (sideA == PastixUCoef) ? 1 : 0;
423 
424  /* Get the B block and its dimensions */
425  blokB = B + blok->coefind;
426 
427  ldb = blok_rownbr( blok );
428  K = cblk_colnbr( cblk );
429  N = blok_rownbr( blok );
430 
431  /*
432  * Add contribution to C in fcblk:
433  * Get the first facing block of the distant panel, and the last block of
434  * the current cblk
435  */
436  fblok = fcblk->fblokptr;
437  lblok = cblk[1].fblokptr;
438 
439  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
440 
441  /* Find facing blok */
442  while (!is_block_inside_fblock( iterblok, fblok ))
443  {
444  fblok++;
445  assert( fblok < fcblk[1].fblokptr );
446  }
447 
448  ldc = blok_rownbr(fblok);
449 
450  /* Get the A block and its dimensions */
451  blokA = A + iterblok->coefind;
452  M = blok_rownbr( iterblok );
453  lda = M;
454 
455  blokC = C + fblok->coefind
456  + iterblok->frownum - fblok->frownum
457  + (blok->frownum - fcblk->fcolnum) * ldc;
458 
459  pastix_cblk_lock( fcblk );
460  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
461  cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
462  M, N, K,
463  CBLAS_SADDR(mcone), blokA, lda,
464  blokB, ldb,
465  CBLAS_SADDR(cone), blokC, ldc );
466  kernel_trace_stop_lvl2( FLOPS_CGEMM( M, N, K ) );
467  pastix_cblk_unlock( fcblk );
468  }
469 }
470 
471 /**
472  *******************************************************************************
473  *
474  * @ingroup kernel_fact_null
475  *
476  * @brief Compute the updates that are generated by the transposition of all the
477  * blocks facing a common diagonal block, by another similar set of blocks.
478  *
479  * This is used to performed an update:
480  *
481  * C_mn = C_mn - A_mk * op(B_kn)
482  *
483  * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
484  * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
485  * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
486  * belongs to the set of block of the cblk n facing the diagonal block of the
487  * cblk m.
488  *
489  *******************************************************************************
490  *
491  * @param[in] trans
492  * Specify the transposition used for the B matrices. It has to be either
493  * PastixTrans or PastixConjTrans.
494  *
495  * @param[in] blok_mk
496  * Index of the first off-diagonal block in cblk, that is used for A.
497  *
498  * @param[in] blok_kn
499  * Index of the first off-diagonal block in cblk, that is used for B.
500  *
501  * @param[in] blok_mn
502  * Index of the first off-diagonal block in fcblk, that is used for C.
503  *
504  * @param[in] cblk
505  * The cblk structure to which block belongs to. The A and B pointers
506  * must be the coeftab of this column block.
507  * Next column blok must be accessible through cblk[1].
508  *
509  * @param[inout] fcblk
510  * The pointer to the data structure that describes the panel on which
511  * we compute the contributions. The C pointer must be one of the
512  * coeftab from this fcblk. Next column blok must be accessible through
513  * fcblk[1].
514  *
515  * @param[in] A
516  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
517  * coefficients of the panel when the Lower part is computed,
518  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
519  *
520  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
521  * the coefficients of the panel, if Symmetric/Hermitian cases or if
522  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
523  * cblk.stride -by- cblk.width
524  *
525  * @param[inout] C
526  * The pointer to the fcblk.lcoeftab if the lower part is computed,
527  * fcblk.ucoeftab otherwise.
528  *
529  *******************************************************************************
530  *
531  * @return The number of flops performed
532  *
533  *******************************************************************************
534  *
535  * @sa core_cgemmsp_block_frlr
536  * @sa core_cgemmsp_block_lrlr
537  *
538  *******************************************************************************/
539 static inline pastix_fixdbl_t
541  pastix_int_t blok_mk,
542  pastix_int_t blok_kn,
543  pastix_int_t blok_mn,
544  const SolverCblk *cblk,
545  SolverCblk *fcblk,
546  const pastix_complex32_t *A,
547  const pastix_complex32_t *B,
548  pastix_complex32_t *C )
549 {
550  const SolverBlok *blokA, *blokB, *blokC;
551  const SolverBlok *bA, *bB, *bC;
552  const SolverBlok *fblokK, *lblokK;
553  const SolverBlok *fblokN, *lblokN;
554 
555  const pastix_complex32_t *Aptr, *Bptr;
556  pastix_complex32_t *Cptr;
557  pastix_int_t M, N, K, lda, ldb, ldc, cblk_n, cblk_m;
558  pastix_int_t full_m, full_n;
559  size_t offsetA, offsetB, offsetC;
560 
561  pastix_fixdbl_t flops = 0.0;
562  pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
563 
564  /* Both cblk and fcblk must be stored in 2D */
565  assert( cblk->cblktype & CBLK_LAYOUT_2D );
566  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
567 
568  /*
569  * Blocs on column K
570  */
571  fblokK = cblk[0].fblokptr;
572  lblokK = cblk[1].fblokptr;
573 
574  blokB = fblokK + blok_kn;
575  offsetB = blokB->coefind;
576  cblk_n = blokB->fcblknm;
577 
578  blokA = fblokK + blok_mk;
579  offsetA = blokA->coefind;
580  cblk_m = blokA->fcblknm;
581 
582  /*
583  * Blocs on column N
584  */
585  fblokN = fcblk[0].fblokptr;
586  lblokN = fcblk[1].fblokptr;
587 
588  blokC = fblokN + blok_mn;
589  offsetC = blokC->coefind;
590  assert( blokC->fcblknm == cblk_m );
591 
592  K = cblk_colnbr( cblk );
593  full_m = 0;
594 
595  bC = blokC;
596  for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++) {
597  M = blok_rownbr(bA);
598  Aptr = A + bA->coefind - offsetA;
599  lda = M;
600  full_m += M;
601 
602  /* Find facing C blok */
603  while (!is_block_inside_fblock( bA, bC )) {
604  bC++;
605  assert( bC < lblokN );
606  }
607 
608  Cptr = C + bC->coefind - offsetC;
609  ldc = blok_rownbr(bC);
610 
611  full_n = 0;
612  for (bB = blokB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++) {
613  N = blok_rownbr( bB );
614  full_n += N;
615  Bptr = B + bB->coefind - offsetB;
616  ldb = N;
617 
618  cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
619  M, N, K,
620  CBLAS_SADDR(mcone), Aptr, lda,
621  Bptr, ldb,
622  CBLAS_SADDR(cone), Cptr + (bA->frownum - bC->frownum)
623  + (bB->frownum - fcblk->fcolnum) * ldc , ldc );
624 
625  flops += FLOPS_CGEMM( M, N, K );
626  }
627  }
628 
629  kernel_trace_stop( blokB->inlast, PastixKernelGEMMBlok2d2d,
630  full_m, full_n, K,
631  flops, time );
632 
633  (void)lblokN;
634  return flops;
635 }
636 
637 /**
638  *******************************************************************************
639  *
640  * @ingroup kernel_fact_null
641  *
642  * @brief Compute the updates that are generated by the transposition of all the
643  * blocks facing a common diagonal block, by another similar set of blocks.
644  *
645  * This is used to perform an update:
646  *
647  * C_mn = C_mn - A_mk * op(B_kn)
648  *
649  * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
650  * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
651  * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
652  * belongs to the set of block of the cblk n facing the diagonal block of the
653  * cblk m.
654  *
655  * In this routine, all the matrices are low-rank
656  *
657  *******************************************************************************
658  *
659  * @param[in] transB
660  * Specify the transposition used for the B matrices. It has to be either
661  * PastixTrans or PastixConjTrans.
662  *
663  * @param[in] blok_mk
664  * Index of the first off-diagonal block in cblk, that is used for A.
665  *
666  * @param[in] blok_kn
667  * Index of the first off-diagonal block in cblk, that is used for B.
668  *
669  * @param[in] blok_mn
670  * Index of the first off-diagonal block in fcblk, that is used for C.
671  *
672  * @param[in] cblk
673  * The cblk structure to which block belongs to. The A and B pointers
674  * must be the coeftab of this column block.
675  * Next column blok must be accessible through cblk[1].
676  *
677  * @param[inout] fcblk
678  * The pointer to the data structure that describes the panel on which
679  * we compute the contributions. The C pointer must be one of the
680  * coeftab from this fcblk. Next column blok must be accessible through
681  * fcblk[1].
682  *
683  * @param[in] A
684  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
685  * coefficients of the panel when the Lower part is computed,
686  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
687  *
688  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
689  * the coefficients of the panel, if Symmetric/Hermitian cases or if
690  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
691  * cblk.stride -by- cblk.width
692  *
693  * @param[in] lrC
694  * Pointer to the low-rank representation of the block C.
695  * Must be followed by the low-rank representation of the following blocks.
696  *
697  * @param[in] lowrank
698  * The structure with low-rank parameters.
699  *
700  *******************************************************************************
701  *
702  * @return The number of flops performed
703  *
704  *******************************************************************************/
705 static inline pastix_fixdbl_t
707  pastix_int_t blok_mk,
708  pastix_int_t blok_kn,
709  pastix_int_t blok_mn,
710  const SolverCblk *cblk,
711  SolverCblk *fcblk,
712  const pastix_complex32_t *A,
713  const pastix_complex32_t *B,
714  pastix_lrblock_t *lrC,
715  const pastix_lr_t *lowrank )
716 {
717  const SolverBlok *blokA, *blokB, *blokC;
718  const SolverBlok *bA, *bB, *bC;
719  const SolverBlok *fblokK, *lblokK;
720  const SolverBlok *fblokN, *lblokN;
721 
722  pastix_lrblock_t lrA, lrB;
723  core_clrmm_t params;
724 
725  pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
726  size_t offsetA, offsetB;
727 
728  pastix_fixdbl_t flops = 0.0;
729  pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
730 
731  /* Provide an internal lock as this function is already protected by the runtime dependency system */
732  pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
733 
734  /* Both cblk and fcblk must be stored in 2D */
735  assert( cblk->cblktype & CBLK_LAYOUT_2D );
736  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
737 
738  /* Both cblk and fcblk must be compressed */
739  assert(!(cblk->cblktype & CBLK_COMPRESSED) );
740  assert( fcblk->cblktype & CBLK_COMPRESSED );
741 
742  /*
743  * Blocs on column K
744  */
745  fblokK = cblk[0].fblokptr;
746  lblokK = cblk[1].fblokptr;
747 
748  blokB = fblokK + blok_kn;
749  offsetB = blokB->coefind;
750  cblk_n = blokB->fcblknm;
751  lrB.rk = -1;
752 
753  blokA = fblokK + blok_mk;
754  offsetA = blokA->coefind;
755  cblk_m = blokA->fcblknm;
756  lrA.rk = -1;
757 
758  /*
759  * Blocs on column N
760  */
761  fblokN = fcblk[0].fblokptr;
762  lblokN = fcblk[1].fblokptr;
763 
764  blokC = fblokN + blok_mn;
765  assert( blokC->fcblknm == cblk_m );
766 
767  full_m = 0;
768  K = cblk_colnbr( cblk );
769 
770  params.lowrank = lowrank;
771  params.transA = PastixNoTrans;
772  params.transB = transB;
773  params.K = K;
774  params.Cn = cblk_colnbr( fcblk );
775  params.alpha = -1.0;
776  params.beta = 1.0;
777  params.work = NULL;
778  params.lwork = -1;
779  params.lock = &(lock);
780 
781  bC = blokC;
782  for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++) {
783  M = blok_rownbr(bA);
784  params.M = M;
785  full_m += M;
786 
787  lrA.rkmax = M;
788  lrA.u = (pastix_complex32_t*)A + bA->coefind - offsetA;
789  lrA.v = NULL;
790 
791  /* Find facing C blok */
792  while (!is_block_inside_fblock( bA, bC )) {
793  bC++;
794  lrC++;
795  assert( bC < lblokN );
796  }
797 
798  params.A = &lrA;
799  params.C = lrC;
800  params.Cm = blok_rownbr( bC );
801 
802  full_n = 0;
803  for (bB = blokB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++) {
804 
805  params.N = blok_rownbr( bB );
806  full_n += params.N;
807  params.offx = bA->frownum - bC->frownum;
808  params.offy = bB->frownum - fcblk->fcolnum;
809 
810  lrB.rkmax = M;
811  lrB.u = (pastix_complex32_t*)B + bB->coefind - offsetB;
812  lrB.v = NULL;
813 
814  params.B = &lrB;
815 
816  flops += core_clrmm( &params );
817  }
818  }
819 
820  kernel_trace_stop( blokB->inlast, PastixKernelGEMMBlokLRLR,
821  full_m, full_n, K,
822  flops, time );
823  (void)lblokN;
824  return flops;
825 }
826 
827 /**
828  *******************************************************************************
829  *
830  * @ingroup kernel_fact_null
831  *
832  * @brief Compute the updates that are generated by the transposition of all the
833  * blocks facing a common diagonal block, by another similar set of blocks.
834  *
835  * This is used to perform an update:
836  *
837  * C_mn = C_mn - A_mk * op(B_kn)
838  *
839  * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
840  * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
841  * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
842  * belongs to the set of block of the cblk n facing the diagonal block of the
843  * cblk m.
844  *
845  * In this routine, all the matrices are low-rank
846  *
847  *******************************************************************************
848  *
849  * @param[in] transB
850  * Specify the transposition used for the B matrices. It has to be either
851  * PastixTrans or PastixConjTrans.
852  *
853  * @param[in] blok_mk
854  * Index of the first off-diagonal block in cblk, that is used for A.
855  *
856  * @param[in] blok_kn
857  * Index of the first off-diagonal block in cblk, that is used for B.
858  *
859  * @param[in] blok_mn
860  * Index of the first off-diagonal block in fcblk, that is used for C.
861  *
862  * @param[in] cblk
863  * The cblk structure to which block belongs to. The A and B pointers
864  * must be the coeftab of this column block.
865  * Next column blok must be accessible through cblk[1].
866  *
867  * @param[inout] fcblk
868  * The pointer to the data structure that describes the panel on which
869  * we compute the contributions. The C pointer must be one of the
870  * coeftab from this fcblk. Next column blok must be accessible through
871  * fcblk[1].
872  *
873  * @param[inout] lrA
874  * Pointer to the low-rank representation of the block A.
875  * Must be followed by the low-rank representation of the following blocks.
876  *
877  * @param[inout] lrB
878  * Pointer to the low-rank representation of the block B.
879  * Must be followed by the low-rank representation of the following blocks.
880  *
881  * @param[inout] lrC
882  * Pointer to the low-rank representation of the block C.
883  * Must be followed by the low-rank representation of the following blocks.
884  *
885  * @param[in] lowrank
886  * The structure with low-rank parameters.
887  *
888  *******************************************************************************
889  *
890  * @return The number of flops performed
891  *
892  *******************************************************************************/
893 static inline pastix_fixdbl_t
895  pastix_int_t blok_mk,
896  pastix_int_t blok_kn,
897  pastix_int_t blok_mn,
898  const SolverCblk *cblk,
899  SolverCblk *fcblk,
900  const pastix_lrblock_t *lrA,
901  const pastix_lrblock_t *lrB,
902  pastix_lrblock_t *lrC,
903  const pastix_lr_t *lowrank )
904 {
905  const pastix_lrblock_t *blrB;
906  const SolverBlok *blokA, *blokB, *blokC;
907  const SolverBlok *bA, *bB, *bC;
908  const SolverBlok *fblokK, *lblokK;
909  const SolverBlok *fblokN, *lblokN;
910 
911  core_clrmm_t params;
912 
913  pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
914 
915  pastix_fixdbl_t flops = 0.0;
916  pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
917 
918  /* Provide an internal lock as this function is already protected by the runtime dependency system */
919  pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
920 
921  /* Both cblk and fcblk must be stored in 2D */
922  assert( cblk->cblktype & CBLK_LAYOUT_2D );
923  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
924 
925  /* Both cblk and fcblk must be compressed */
926  assert( cblk->cblktype & CBLK_COMPRESSED );
927  assert( fcblk->cblktype & CBLK_COMPRESSED );
928 
929  /*
930  * Blocs on column K
931  */
932  fblokK = cblk[0].fblokptr;
933  lblokK = cblk[1].fblokptr;
934 
935  blokB = fblokK + blok_kn;
936  cblk_n = blokB->fcblknm;
937 
938  blokA = fblokK + blok_mk;
939  cblk_m = blokA->fcblknm;
940 
941  /*
942  * Blocs on column N
943  */
944  fblokN = fcblk[0].fblokptr;
945  lblokN = fcblk[1].fblokptr;
946 
947  blokC = fblokN + blok_mn;
948  assert( blokC->fcblknm == cblk_m );
949 
950  full_m = 0;
951  K = cblk_colnbr( cblk );
952 
953  params.lowrank = lowrank;
954  params.transA = PastixNoTrans;
955  params.transB = transB;
956  params.K = K;
957  params.Cn = cblk_colnbr( fcblk );
958  params.alpha = -1.0;
959  params.beta = 1.0;
960  params.work = NULL;
961  params.lwork = -1;
962  params.lock = &(lock);
963 
964  bC = blokC;
965  for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++, lrA++) {
966  M = blok_rownbr(bA);
967  params.M = M;
968  full_m += M;
969 
970  /* Find facing C blok */
971  while (!is_block_inside_fblock( bA, bC )) {
972  bC++;
973  lrC++;
974  assert( bC < lblokN );
975  }
976 
977  params.A = lrA;
978  params.C = lrC;
979  params.Cm = blok_rownbr( bC );
980 
981  full_n = 0;
982  for (bB = blokB, blrB = lrB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++, blrB++) {
983 
984  params.N = blok_rownbr( bB );
985  full_n += params.N;
986 
987  params.offx = bA->frownum - bC->frownum;
988  params.offy = bB->frownum - fcblk->fcolnum;
989  params.B = blrB;
990 
991  flops += core_clrmm( &params );
992  }
993  }
994 
995  kernel_trace_stop( blokB->inlast, PastixKernelGEMMBlokLRLR,
996  full_m, full_n, K,
997  flops, time );
998  (void)lblokN;
999  return flops;
1000 }
1001 
1002 /**
1003  *******************************************************************************
1004  *
1005  * @ingroup kernel_fact_null
1006  *
1007  * @brief Compute the updates associated to one off-diagonal block.
1008  *
1009  *******************************************************************************
1010  *
1011  * @param[in] sideA
1012  * Specify if A and C belong to the lower part, or to the upper part.
1013  * If sideA == PastixLCoef, the contribution of:
1014  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1015  * C, otherwise the contribution:
1016  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1017  * to C.
1018  *
1019  * @param[in] trans
1020  * Specify the transposition used for the B matrix. It has to be either
1021  * PastixTrans or PastixConjTrans.
1022  *
1023  * @param[in] cblk
1024  * The cblk structure to which block belongs to. The A and B pointers
1025  * must be the coeftab of this column block.
1026  * Next column blok must be accessible through cblk[1].
1027  *
1028  * @param[in] blok
1029  * The block from which we compute the contributions.
1030  *
1031  * @param[inout] fcblk
1032  * The pointer to the data structure that describes the panel on which
1033  * we compute the contributions. The C pointer must be one of the
1034  * coeftab from this fcblk. Next column blok must be accessible through
1035  * fcblk[1].
1036  *
1037  * @param[in] A
1038  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
1039  * coefficients of the panel when the Lower part is computed,
1040  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
1041  *
1042  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
1043  * the coefficients of the panel, if Symmetric/Hermitian cases or if
1044  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
1045  * cblk.stride -by- cblk.width
1046  *
1047  * @param[in] lrC
1048  * Pointer to the low-rank representation of the block C.
1049  * Must be followed by the low-rank representation of the following blocks.
1050  *
1051  * @param[in] work
1052  * Temporary memory buffer.
1053  *
1054  * @param[in] lowrank
1055  * The structure with low-rank parameters.
1056  *
1057  *******************************************************************************
1058  *
1059  * @return The number of flops performed
1060  *
1061  *******************************************************************************/
1062 static inline pastix_fixdbl_t
1064  pastix_trans_t trans,
1065  const SolverCblk *cblk,
1066  const SolverBlok *blok,
1067  SolverCblk *fcblk,
1068  const pastix_complex32_t *A,
1069  const pastix_complex32_t *B,
1070  pastix_lrblock_t *lrC,
1071  pastix_complex32_t *work,
1072  pastix_int_t lwork,
1073  const pastix_lr_t *lowrank )
1074 {
1075  const SolverBlok *iterblok;
1076  const SolverBlok *fblok;
1077  const SolverBlok *lblok;
1078  pastix_lrblock_t lrA, lrB;
1079  core_clrmm_t params;
1080 
1081  pastix_int_t stride, shift;
1082  pastix_int_t M, N;
1083 
1084  pastix_fixdbl_t flops = 0.0;
1085 
1086  /* Update from a dense block to a low rank block */
1087  assert(!(cblk->cblktype & CBLK_COMPRESSED));
1088  assert( fcblk->cblktype & CBLK_COMPRESSED );
1089  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1090 
1091  assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1092 
1093  shift = (sideA == PastixUCoef) ? 1 : 0;
1094  stride = cblk->stride;
1095 
1096  N = blok_rownbr( blok );
1097 
1098  /* Get the B block and its dimensions */
1099  lrB.rk = -1;
1100  lrB.rkmax = (cblk->cblktype & CBLK_LAYOUT_2D) ? N : stride;
1101  lrB.u = (pastix_complex32_t*)B + blok->coefind; /* lrB is const, we can cast the B pointer */
1102  lrB.v = NULL;
1103 
1104  /**
1105  * Add contribution to C in fcblk:
1106  * Get the first facing block of the distant panel, and the last block of
1107  * the current cblk
1108  */
1109  fblok = fcblk->fblokptr;
1110  lblok = cblk[1].fblokptr;
1111 
1112  params.lowrank = lowrank;
1113  params.transA = PastixNoTrans;
1114  params.transB = trans;
1115  params.N = blok_rownbr( blok );
1116  params.K = cblk_colnbr( cblk );
1117  params.Cn = cblk_colnbr( fcblk );
1118  params.alpha = -1.0;
1119  params.beta = 1.0;
1120  params.work = work;
1121  params.lwork = lwork;
1122  params.lock = &(fcblk->lock);
1123  params.B = &lrB;
1124 
1125  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
1126 
1127  /* Find facing blok */
1128  while (!is_block_inside_fblock( iterblok, fblok ))
1129  {
1130  fblok++;
1131  lrC++;
1132  assert( fblok < fcblk[1].fblokptr );
1133  }
1134 
1135  /* Get the A block and its dimensions */
1136  M = blok_rownbr( iterblok );
1137  lrA.rk = -1;
1138  lrA.rkmax = (cblk->cblktype & CBLK_LAYOUT_2D) ? M : stride;
1139  lrA.u = (pastix_complex32_t*)A + iterblok->coefind; /* Same as for B */
1140  lrA.v = NULL;
1141 
1142  params.M = M;
1143  params.A = &lrA;
1144  params.C = lrC;
1145  params.Cm = blok_rownbr( fblok );
1146 
1147  params.offx = iterblok->frownum - fblok->frownum;
1148  params.offy = blok->frownum - fcblk->fcolnum;
1149 
1150  flops += core_clrmm( &params );
1151  }
1152  return flops;
1153 }
1154 
1155 /**
1156  *******************************************************************************
1157  *
1158  * @ingroup kernel_fact_null
1159  *
1160  * @brief Compute the updates associated to one off-diagonal block.
1161  *
1162  *******************************************************************************
1163  *
1164  * @param[in] sideA
1165  * Specify if A and C belong to the lower part, or to the upper part.
1166  * If sideA == PastixLCoef, the contribution of:
1167  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1168  * C, otherwise the contribution:
1169  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1170  * to C.
1171  *
1172  * @param[in] trans
1173  * Specify the transposition used for the B matrix. It has to be either
1174  * PastixTrans or PastixConjTrans.
1175  *
1176  * @param[in] cblk
1177  * The cblk structure to which block belongs to. The A and B pointers
1178  * must be the coeftab of this column block.
1179  * Next column blok must be accessible through cblk[1].
1180  *
1181  * @param[in] blok
1182  * The block from which we compute the contributions.
1183  *
1184  * @param[inout] fcblk
1185  * The pointer to the data structure that describes the panel on which
1186  * we compute the contributions. The C pointer must be one of the
1187  * coeftab from this fcblk. Next column blok must be accessible through
1188  * fcblk[1].
1189  *
1190  * @param[in] lrA
1191  * Pointer to the low-rank representation of the block A.
1192  * Must be followed by the low-rank representation of the following blocks.
1193  *
1194  * @param[in] lrB
1195  * Pointer to the low-rank representation of the block B.
1196  * Must be followed by the low-rank representation of the following blocks.
1197  *
1198  * @param[in] lrC
1199  * Pointer to the low-rank representation of the block B.
1200  * Must be followed by the low-rank representation of the following blocks.
1201  *
1202  * @param[in] work
1203  * Temporary memory buffer.
1204  *
1205  * @param[in] lowrank
1206  * The structure with low-rank parameters.
1207  *
1208  *******************************************************************************
1209  *
1210  * @return The number of flops performed
1211  *
1212  *******************************************************************************/
1213 static inline pastix_fixdbl_t
1215  pastix_trans_t trans,
1216  const SolverCblk *cblk,
1217  const SolverBlok *blok,
1218  SolverCblk *fcblk,
1219  const pastix_lrblock_t *lrA,
1220  const pastix_lrblock_t *lrB,
1221  pastix_lrblock_t *lrC,
1222  pastix_complex32_t *work,
1223  pastix_int_t lwork,
1224  const pastix_lr_t *lowrank )
1225 {
1226  const SolverBlok *iterblok;
1227  const SolverBlok *fblok;
1228  const SolverBlok *lblok;
1229 
1230  pastix_int_t N, K, shift;
1231  core_clrmm_t params;
1232 
1233  pastix_fixdbl_t flops = 0.0;
1234 
1235  /* Update from a low-rank cblk to a low-rank cblk */
1236  assert( cblk->cblktype & CBLK_COMPRESSED );
1237  assert( fcblk->cblktype & CBLK_COMPRESSED );
1238  assert( cblk->cblktype & CBLK_LAYOUT_2D );
1239  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1240 
1241  assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1242 
1243  shift = (sideA == PastixUCoef) ? 1 : 0;
1244 
1245  /* Get the B block and its dimensions */
1246  K = cblk_colnbr( cblk );
1247  N = blok_rownbr( blok );
1248 
1249  /*
1250  * Add contribution to C in fcblk:
1251  * Get the first facing block of the distant panel, and the last block of
1252  * the current cblk
1253  */
1254  fblok = fcblk->fblokptr;
1255  lblok = cblk[1].fblokptr;
1256 
1257  params.lowrank = lowrank;
1258  params.transA = PastixNoTrans;
1259  params.transB = trans;
1260  params.N = N;
1261  params.K = K;
1262  params.Cn = cblk_colnbr( fcblk );
1263  params.alpha = -1.0;
1264  params.beta = 1.0;
1265  params.work = work;
1266  params.lwork = lwork;
1267  params.lwused = 0;
1268  params.lock = &(fcblk->lock);
1269  params.B = lrB + (blok - cblk->fblokptr);
1270 
1271  /* for all following blocks in block column */
1272  lrA = lrA + (blok - cblk->fblokptr) + shift;
1273  for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1274 
1275  /* Find facing blok */
1276  while (!is_block_inside_fblock( iterblok, fblok ))
1277  {
1278  fblok++;
1279  lrC++;
1280  assert( fblok < fcblk[1].fblokptr );
1281  }
1282 
1283  params.M = blok_rownbr( iterblok );
1284  params.A = lrA;
1285  params.C = lrC;
1286  params.Cm = blok_rownbr( fblok );
1287 
1288  params.offx = iterblok->frownum - fblok->frownum;
1289  params.offy = blok->frownum - fcblk->fcolnum;
1290 
1291  flops += core_clrmm( &params );
1292  }
1293  return flops;
1294 }
1295 
1296 /**
1297  *******************************************************************************
1298  *
1299  * @ingroup kernel_fact_null
1300  *
1301  * @brief Compute the updates associated to one off-diagonal block.
1302  *
1303  *******************************************************************************
1304  *
1305  * @param[in] sideA
1306  * Specify if A and C belong to the lower part, or to the upper part.
1307  * If sideA == PastixLCoef, the contribution of:
1308  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1309  * C, otherwise the contribution:
1310  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1311  * to C.
1312  *
1313  *
1314  * @param[in] trans
1315  * Specify the transposition used for the B matrix. It has to be either
1316  * PastixTrans or PastixConjTrans.
1317  *
1318  * @param[in] cblk
1319  * The cblk structure to which block belongs to. The A and B pointers
1320  * must be the coeftab of this column block.
1321  * Next column blok must be accessible through cblk[1].
1322  *
1323  * @param[in] blok
1324  * The block from which we compute the contributions.
1325  *
1326  * @param[inout] fcblk
1327  * The pointer to the data structure that describes the panel on which
1328  * we compute the contributions. The C pointer must be one of the
1329  * coeftab from this fcblk. Next column blok must be accessible through
1330  * fcblk[1].
1331  *
1332  * @param[in] lrA
1333  * Pointer to the low-rank representation of the block A.
1334  * Must be followed by the low-rank representation of the following blocks.
1335  *
1336  * @param[in] lrB
1337  * Pointer to the low-rank representation of the block B.
1338  * Must be followed by the low-rank representation of the following blocks.
1339  *
1340  * @param[inout] C
1341  * The pointer to the fcblk.lcoeftab if the lower part is computed,
1342  * fcblk.ucoeftab otherwise.
1343  *
1344  * @param[in] work
1345  * Temporary memory buffer.
1346  *
1347  * @param[in] lwork
1348  * The length of work.
1349  * On entry, if trans = PastixTrans
1350  * lwork >= max(1, K*N). Otherwise lwork >= max(1, M*K).
1351  *
1352  * @param[in] lowrank
1353  * The structure with low-rank parameters.
1354  *
1355  *******************************************************************************
1356  *
1357  * @return The number of flops performed
1358  *
1359  *******************************************************************************/
1360 static inline pastix_fixdbl_t
1362  pastix_trans_t trans,
1363  const SolverCblk *cblk,
1364  const SolverBlok *blok,
1365  SolverCblk *fcblk,
1366  const pastix_lrblock_t *lrA,
1367  const pastix_lrblock_t *lrB,
1368  pastix_complex32_t *C,
1369  pastix_complex32_t *work,
1370  pastix_int_t lwork,
1371  const pastix_lr_t *lowrank )
1372 {
1373  const SolverBlok *iterblok;
1374  const SolverBlok *fblok;
1375  const SolverBlok *lblok;
1376 
1377  pastix_int_t N, K, shift;
1378  pastix_lrblock_t lrC;
1379  core_clrmm_t params;
1380 
1381  pastix_fixdbl_t flops = 0.0;
1382 
1383  /* Update from a low-rank cblk to a low-rank cblk */
1384  assert( cblk->cblktype & CBLK_COMPRESSED );
1385  assert( !(fcblk->cblktype & CBLK_COMPRESSED) );
1386  assert( cblk->cblktype & CBLK_LAYOUT_2D );
1387  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1388 
1389  assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1390 
1391  shift = (sideA == PastixUCoef) ? 1 : 0;
1392 
1393  /* Get the B block and its dimensions */
1394  K = cblk_colnbr( cblk );
1395  N = blok_rownbr( blok );
1396 
1397  /*
1398  * Add contribution to C in fcblk:
1399  * Get the first facing block of the distant panel, and the last block of
1400  * the current cblk
1401  */
1402  fblok = fcblk->fblokptr;
1403  lblok = cblk[1].fblokptr;
1404 
1405  params.lowrank = lowrank;
1406  params.transA = PastixNoTrans;
1407  params.transB = trans;
1408  params.N = N;
1409  params.K = K;
1410  params.Cn = cblk_colnbr( fcblk );
1411  params.alpha = -1.0;
1412  params.beta = 1.0;
1413  params.work = work;
1414  params.lwork = lwork;
1415  params.lwused = 0;
1416  params.lock = &(fcblk->lock);
1417  params.B = lrB + (blok - cblk->fblokptr);
1418  params.C = &lrC;
1419 
1420  lrC.rk = -1;
1421  lrC.v = NULL;
1422 
1423  /* for all following blocks in block column */
1424  lrA = lrA + (blok - cblk->fblokptr) + shift;
1425  for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1426 
1427  /* Find facing blok */
1428  while (!is_block_inside_fblock( iterblok, fblok ))
1429  {
1430  fblok++;
1431  assert( fblok < fcblk[1].fblokptr );
1432  }
1433 
1434  params.M = blok_rownbr( iterblok );
1435  params.A = lrA;
1436  params.Cm = blok_rownbr( fblok );
1437  params.offx = iterblok->frownum - fblok->frownum;
1438  params.offy = blok->frownum - fcblk->fcolnum;
1439 
1440  lrC.rkmax = params.Cm;
1441  lrC.u = C + fblok->coefind;
1442 
1443  flops += core_clrmm( &params );
1444  }
1445  return flops;
1446 }
1447 
1448 /**
1449  *******************************************************************************
1450  *
1451  * @brief Compute the updates associated to one off-diagonal block.
1452  *
1453  *******************************************************************************
1454  *
1455  * @param[in] sideA
1456  * Specify if A and C belong to the lower part, or to the upper part.
1457  * If sideA == PastixLCoef, the contribution of:
1458  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1459  * C, otherwise the contribution:
1460  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1461  * to C.
1462  * The pointer to the data structure that describes the panel from
1463  * which we compute the contributions. Next column blok must be
1464  * accessible through cblk[1].
1465  *
1466  * @param[in] trans
1467  * Specify the transposition used for the B matrix. It has to be either
1468  * PastixTrans or PastixConjTrans.
1469  *
1470  * @param[in] cblk
1471  * The cblk structure to which block belongs to. The A and B pointers
1472  * must be the coeftab of this column block.
1473  * Next column blok must be accessible through cblk[1].
1474  *
1475  * @param[in] blok
1476  * The block from which we compute the contributions.
1477  *
1478  * @param[inout] fcblk
1479  * The pointer to the data structure that describes the panel on which
1480  * we compute the contributions. The C pointer must be one of the
1481  * coeftab from this fcblk. Next column blok must be accessible through
1482  * fcblk[1].
1483  *
1484  * @param[in] A
1485  * The pointer to the correct representation of A.
1486  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1487  * - pastix_lr_block if the block is compressed.
1488  *
1489  * @param[in] B
1490  * The pointer to the correct representation of B.
1491  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1492  * - pastix_lr_block if the block is compressed.
1493  *
1494  * @param[inout] C
1495  * The pointer to the correct representation of C.
1496  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1497  * - pastix_lr_block if the block is compressed.
1498  *
1499  * @param[in] work
1500  * Temporary memory buffer.
1501  *
1502  * @param[in] lowrank
1503  * The structure with low-rank parameters.
1504  *
1505  *******************************************************************************
1506  *
1507  * @return The number of static pivoting during factorization of the diagonal
1508  * block.
1509  *
1510  *******************************************************************************/
1511 pastix_fixdbl_t
1513  pastix_trans_t trans,
1514  const SolverCblk *cblk,
1515  const SolverBlok *blok,
1516  SolverCblk *fcblk,
1517  const void *A,
1518  const void *B,
1519  void *C,
1520  pastix_complex32_t *work,
1521  pastix_int_t lwork,
1522  const pastix_lr_t *lowrank )
1523 {
1524  pastix_ktype_t ktype;
1525  pastix_fixdbl_t time, flops = 0.0;
1526  pastix_int_t m = cblk->stride;
1527  pastix_int_t n = blok_rownbr( blok );
1528  pastix_int_t k = cblk_colnbr( cblk );
1529 
1530  m -= (cblk->cblktype & CBLK_LAYOUT_2D) ? blok->coefind / k : blok->coefind;
1531  m -= (sideA == PastixUCoef) ? blok_rownbr( blok ) : 0;
1532 
1533  if ( fcblk->cblktype & CBLK_COMPRESSED ) {
1534  if ( cblk->cblktype & CBLK_COMPRESSED ) {
1535  ktype = PastixKernelGEMMCblkLRLR;
1536  time = kernel_trace_start( ktype );
1537 
1538  flops = core_cgemmsp_lr( sideA, trans,
1539  cblk, blok, fcblk,
1540  A, B, C, work, lwork,
1541  lowrank );
1542  }
1543  else {
1544  ktype = PastixKernelGEMMCblkFRLR;
1545  time = kernel_trace_start( ktype );
1546 
1547  flops = core_cgemmsp_fulllr( sideA, trans,
1548  cblk, blok, fcblk,
1549  A, B, C, work, lwork,
1550  lowrank );
1551  }
1552  }
1553  else if ( fcblk->cblktype & CBLK_LAYOUT_2D ) {
1554  if ( cblk->cblktype & CBLK_COMPRESSED) {
1555  ktype = PastixKernelGEMMCblk2d2d;
1556  time = kernel_trace_start( ktype );
1557  core_cgemmsp_lrfr( sideA, trans,
1558  cblk, blok, fcblk,
1559  A, B, C, work, lwork,
1560  lowrank );
1561  }
1562  else if ( cblk->cblktype & CBLK_LAYOUT_2D ) {
1563  ktype = PastixKernelGEMMCblk2d2d;
1564  time = kernel_trace_start( ktype );
1565 
1566  core_cgemmsp_2d2d( sideA, trans,
1567  cblk, blok, fcblk,
1568  A, B, C );
1569  }
1570  else {
1571  ktype = PastixKernelGEMMCblk1d2d;
1572  time = kernel_trace_start( ktype );
1573 
1574  core_cgemmsp_1d2d( sideA, trans,
1575  cblk, blok, fcblk,
1576  A, B, C );
1577  }
1578  flops = FLOPS_CGEMM( m, n, k );
1579  }
1580  else {
1581  assert( !(cblk->cblktype & CBLK_COMPRESSED) );
1582  ktype = PastixKernelGEMMCblk1d1d;
1583  time = kernel_trace_start( ktype );
1584 
1585  core_cgemmsp_1d1d( sideA, trans,
1586  cblk, blok, fcblk,
1587  A, B, C, work );
1588 
1589  flops = FLOPS_CGEMM( m, n, k );
1590  }
1591 
1592  kernel_trace_stop( blok->inlast, ktype, m, n, k, flops, time );
1593 
1594  return flops;
1595 }
1596 
1597 /**
1598  *******************************************************************************
1599  *
1600  * @brief Compute the CPU gemm associated to a couple of off-diagonal blocks.
1601  *
1602  * C_l = C_l - A_l * op(B_s), with B_s = B_l, or B_u
1603  * or
1604  * C_u = C_u - A_u * op(B_s), with B_s = B_l, or B_u
1605  *
1606  *******************************************************************************
1607  *
1608  * @param[in] transB
1609  * Specify wheter B should be used as PastixNoTrans, PastixTrans, or
1610  * PastixConjTrans in the computations.
1611  *
1612  * @param[in] cblk
1613  * The cblk structure to which block A and B belong to. The A and B
1614  * pointers must be one of the [lu]coeftab of this column block.
1615  * Next column blok must be accessible through cblk[1].
1616  *
1617  * @param[inout] fcblk
1618  * The pointer to the data structure that describes the panel on which
1619  * we compute the contributions. The C pointer must be one of the
1620  * [lu]coeftab from this fcblk.
1621  * Next column blok must be accessible through fcblk[1].
1622  *
1623  * @param[in] blok_mk
1624  * Specify the index of the A block in the cblk column. This index is
1625  * 0-based for the diagonal block.
1626  *
1627  * @param[in] blok_nk
1628  * Specify the index of the B block in the cblk column. This index is
1629  * 0-based for the diagonal block.
1630  *
1631  * @param[in] blok_mn
1632  * Specify the index of the C block in the fcblk column. This index is
1633  * 0-based for the diagonal block.
1634  *
1635  * @param[in] A
1636  * The pointer to the correct representation of A.
1637  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1638  * - pastix_lr_block if the block is compressed.
1639  *
1640  * @param[in] B
1641  * The pointer to the correct representation of B.
1642  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1643  * - pastix_lr_block if the block is compressed.
1644  *
1645  * @param[in] C
1646  * The pointer to the correct representation of C.
1647  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1648  * - pastix_lr_block if the block is compressed.
1649  *
1650  * @param[in] lowrank
1651  * The structure with the low-rank parameters.
1652  *
1653  *******************************************************************************/
1654 pastix_fixdbl_t
1656  const SolverCblk *cblk,
1657  SolverCblk *fcblk,
1658  pastix_int_t blok_mk,
1659  pastix_int_t blok_nk,
1660  pastix_int_t blok_mn,
1661  const void *A,
1662  const void *B,
1663  void *C,
1664  const pastix_lr_t *lowrank )
1665 {
1666  if ( fcblk->cblktype & CBLK_COMPRESSED ) {
1667  if ( cblk->cblktype & CBLK_COMPRESSED ) {
1668  return core_cgemmsp_block_lrlr( transB,
1669  blok_mk, blok_nk, blok_mn,
1670  cblk, fcblk,
1671  A, B, C, lowrank );
1672  }
1673  else {
1674  return core_cgemmsp_block_frlr( transB,
1675  blok_mk, blok_nk, blok_mn,
1676  cblk, fcblk,
1677  A, B, C, lowrank );
1678  }
1679  }
1680  else {
1681  if ( cblk->cblktype & CBLK_COMPRESSED ) {
1682  assert(0);
1683  return 0.; /* Avoids compilation and coverity warning */
1684  }
1685  else {
1686  return core_cgemmsp_block_frfr( transB,
1687  blok_mk, blok_nk, blok_mn,
1688  cblk, fcblk,
1689  A, B, C );
1690  }
1691  }
1692 }
solver_blok_s::frownum
pastix_int_t frownum
Definition: solver.h:112
core_cgemmsp_1d1d
static void core_cgemmsp_1d1d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C, pastix_complex32_t *work)
Compute the updates that are generated by the transposition of one single off-diagonal block.
Definition: core_cgemmsp.c:104
solver.h
blok_rownbr
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition: solver.h:313
cblk_colnbr
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition: solver.h:247
core_clrmm_s::lowrank
const pastix_lr_t * lowrank
Definition: pastix_clrcores.h:81
pastix_lr_s
Structure to define the type of function to use for the low-rank kernels and their parameters.
Definition: pastix_lowrank.h:147
solver_cblk_s::fblokptr
SolverBlok * fblokptr
Definition: solver.h:134
pastix_lrblock_s::v
void * v
Definition: pastix_lowrank.h:116
cpucblk_cgemmsp
pastix_fixdbl_t cpucblk_cgemmsp(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const void *A, const void *B, void *C, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
Definition: core_cgemmsp.c:1512
solver_cblk_s::stride
pastix_int_t stride
Definition: solver.h:135
pastix_lrblock_s::u
void * u
Definition: pastix_lowrank.h:115
core_clrmm_s::Cm
pastix_int_t Cm
Definition: pastix_clrcores.h:87
core_clrmm
pastix_fixdbl_t core_clrmm(core_clrmm_t *params)
Compute the matrix matrix product when involved matrices are stored in a low-rank structure.
Definition: core_clrmm.c:278
core_cgemmsp_block_lrlr
static pastix_fixdbl_t core_cgemmsp_block_lrlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
Definition: core_cgemmsp.c:894
core_clrmm_s::lwork
pastix_int_t lwork
Definition: pastix_clrcores.h:97
solver_cblk_s
Solver column block structure.
Definition: solver.h:127
cpublok_cgemmsp
pastix_fixdbl_t cpublok_cgemmsp(pastix_trans_t transB, const SolverCblk *cblk, SolverCblk *fcblk, pastix_int_t blok_mk, pastix_int_t blok_nk, pastix_int_t blok_mn, const void *A, const void *B, void *C, const pastix_lr_t *lowrank)
Compute the CPU gemm associated to a couple of off-diagonal blocks.
Definition: core_cgemmsp.c:1655
pastix_coefside_t
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
core_clrmm_s::alpha
pastix_complex32_t alpha
Definition: pastix_clrcores.h:91
pastix_trans_t
enum pastix_trans_e pastix_trans_t
Transpostion.
core_clrmm_s::A
const pastix_lrblock_t * A
Definition: pastix_clrcores.h:92
solver_blok_s
Solver block structure.
Definition: solver.h:107
pastix_lrblock_s
The block low-rank structure to hold a matrix in low-rank form.
Definition: pastix_lowrank.h:112
core_clrmm_s::offx
pastix_int_t offx
Definition: pastix_clrcores.h:89
PastixNoTrans
@ PastixNoTrans
Definition: api.h:424
core_clrmm_s::Cn
pastix_int_t Cn
Definition: pastix_clrcores.h:88
is_block_inside_fblock
static int is_block_inside_fblock(const SolverBlok *blok, const SolverBlok *fblok)
Check if a block is included inside another one.
Definition: solver.h:426
core_clrmm_s::lock
pastix_atomic_lock_t * lock
Definition: pastix_clrcores.h:99
core_clrmm_s::K
pastix_int_t K
Definition: pastix_clrcores.h:86
core_clrmm_s::offy
pastix_int_t offy
Definition: pastix_clrcores.h:90
core_clrmm_s
Structure to store all the parameters of the core_clrmm family functions.
Definition: pastix_clrcores.h:80
pastix_ccores.h
core_clrmm_s::lwused
pastix_int_t lwused
Definition: pastix_clrcores.h:98
core_clrmm_s::work
pastix_complex32_t * work
Definition: pastix_clrcores.h:96
PastixUCoef
@ PastixUCoef
Definition: api.h:457
core_cgemmsp_fulllr
static pastix_fixdbl_t core_cgemmsp_fulllr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_lrblock_t *lrC, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
Definition: core_cgemmsp.c:1063
core_cgemmsp_block_frfr
static pastix_fixdbl_t core_cgemmsp_block_frfr(pastix_trans_t trans, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
Definition: core_cgemmsp.c:540
core_cgemmsp_lr
static pastix_fixdbl_t core_cgemmsp_lr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
Definition: core_cgemmsp.c:1214
core_clrmm_s::transA
pastix_trans_t transA
Definition: pastix_clrcores.h:82
core_cgemmsp_1d2d
static void core_cgemmsp_1d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
Definition: core_cgemmsp.c:261
core_clrmm_s::M
pastix_int_t M
Definition: pastix_clrcores.h:84
core_cgemmsp_block_frlr
static pastix_fixdbl_t core_cgemmsp_block_frlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
Definition: core_cgemmsp.c:706
solver_blok_s::coefind
pastix_int_t coefind
Definition: solver.h:114
pastix_lrblock_s::rk
int rk
Definition: pastix_lowrank.h:113
core_cgemmsp_2d2d
static void core_cgemmsp_2d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
Definition: core_cgemmsp.c:400
core_clrmm_s::C
pastix_lrblock_t * C
Definition: pastix_clrcores.h:95
solver_cblk_s::cblktype
int8_t cblktype
Definition: solver.h:130
solver_blok_s::inlast
int8_t inlast
Definition: solver.h:117
solver_cblk_s::lock
pastix_atomic_lock_t lock
Definition: solver.h:128
solver_cblk_s::fcolnum
pastix_int_t fcolnum
Definition: solver.h:132
core_cgemmsp_lrfr
static pastix_fixdbl_t core_cgemmsp_lrfr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_complex32_t *C, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
Definition: core_cgemmsp.c:1361
pastix_lrblock_s::rkmax
int rkmax
Definition: pastix_lowrank.h:114
core_clrmm_s::B
const pastix_lrblock_t * B
Definition: pastix_clrcores.h:93
pastix_clrcores.h
core_clrmm_s::beta
pastix_complex32_t beta
Definition: pastix_clrcores.h:94
core_clrmm_s::transB
pastix_trans_t transB
Definition: pastix_clrcores.h:83
core_cgeadd
int core_cgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, pastix_complex32_t alpha, const pastix_complex32_t *A, pastix_int_t LDA, pastix_complex32_t beta, pastix_complex32_t *B, pastix_int_t LDB)
Add two matrices together.
Definition: core_cgeadd.c:78
core_clrmm_s::N
pastix_int_t N
Definition: pastix_clrcores.h:85
solver_blok_s::fcblknm
pastix_int_t fcblknm
Definition: solver.h:110