PaStiX Handbook  6.3.0
core_zgemmsp.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_zgemmsp.c
4  *
5  * PaStiX kernel routines operating on the solver structure.
6  *
7  * @copyright 2011-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.3.0
11  * @author Mathieu Faverge
12  * @author Pierre Ramet
13  * @author Xavier Lacoste
14  * @author Gregoire Pichon
15  * @author Nolan Bredel
16  * @date 2023-01-16
17  * @generated from /builds/solverstack/pastix/kernels/core_zgemmsp.c, normal z -> z, Mon Aug 28 13:40:36 2023
18  *
19  **/
20 #include "common.h"
21 #include "cblas.h"
22 #include "blend/solver.h"
23 #include "kernels_trace.h"
24 #include "pastix_zcores.h"
25 #include "pastix_zlrcores.h"
26 
27 #ifndef DOXYGEN_SHOULD_SKIP_THIS
28 static pastix_complex64_t mzone = -1.0;
29 static pastix_complex64_t zone = 1.0;
30 static pastix_complex64_t zzero = 0.0;
31 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
32 
33 /**
34  *******************************************************************************
35  *
36  * @ingroup kernel_fact_null
37  *
38  * @brief Compute the updates that are generated by the transposition of one
39  * single off-diagonal block.
40  *
41  * Both cblk involved in the computation are stored with the 1D storage: Column
42  * Major Layout with blocks interleaved.
43  *
44  * All the off-diagonal block below block are multiplied by the selected block
45  * and added to the facing cblk.
46  *
47  *******************************************************************************
48  *
49  * @param[in] sideA
50  * Specify if A and C belong to the lower part, or to the upper part.
51  * If sideA == PastixLCoef, the contribution of:
52  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
53  * C, otherwise the contribution:
54  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
55  * to C.
56  * The pointer to the data structure that describes the panel from
57  * which we compute the contributions. Next column blok must be
58  * accessible through cblk[1].
59  *
60  * @param[in] trans
61  * Specify the transposition used for the B matrix. It has to be either
62  * PastixTrans or PastixConjTrans.
63  *
64  * @param[in] cblk
65  * The cblk structure to which block belongs to. The A and B pointers
66  * must be the coeftab of this column block.
67  * Next column blok must be accessible through cblk[1].
68  *
69  * @param[in] blok
70  * The block from which we compute the contributions.
71  *
72  * @param[inout] fcblk
73  * The pointer to the data structure that describes the panel on which
74  * we compute the contributions. The C pointer must be one of the
75  * coeftab from this fcblk. Next column blok must be accessible through
76  * fcblk[1].
77  *
78  * @param[in] A
79  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
80  * coefficients of the panel when the Lower part is computed,
81  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
82  *
83  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
84  * the coefficients of the panel, if Symmetric/Hermitian cases or if
85  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
86  * cblk.stride -by- cblk.width
87  *
88  * @param[inout] C
89  * The pointer to the fcblk.lcoeftab if the lower part is computed,
90  * fcblk.ucoeftab otherwise.
91  *
92  * @param[in] work
93  * Temporary memory buffer that is at least equal to the height of the
94  * block B by the sum of the height of all the blocks below the block
95  * B.
96  *
97  *******************************************************************************
98  *
99  * @sa core_zgemmsp_1d2d
100  * @sa core_zgemmsp_2d2d
101  *
102  *******************************************************************************/
103 static inline void
105  pastix_trans_t trans,
106  const SolverCblk *cblk,
107  const SolverBlok *blok,
108  SolverCblk *fcblk,
109  const pastix_complex64_t *A,
110  const pastix_complex64_t *B,
111  pastix_complex64_t *C,
112  pastix_complex64_t *work )
113 {
114  const SolverBlok *iterblok;
115  const SolverBlok *fblok;
116  const SolverBlok *lblok;
117 
118  pastix_complex64_t *tmpC;
119  pastix_complex64_t *wtmp;
120  pastix_int_t stride, stridef, indblok;
121  pastix_int_t M, N, K, m;
122  int shift;
123 
124  /* Both cblk and fcblk are stored in 1D */
125  assert(!(cblk->cblktype & CBLK_LAYOUT_2D));
126  assert(!(fcblk->cblktype & CBLK_LAYOUT_2D));
127 
128  assert( work != NULL );
129 
130  shift = (sideA == PastixUCoef) ? 1 : 0;
131 
132  stride = cblk->stride;
133  stridef = fcblk->stride;
134  K = cblk_colnbr( cblk );
135 
136  /* First blok */
137  indblok = blok->coefind;
138 
139  N = blok_rownbr( blok );
140  M = stride - indblok - (shift * N);
141 
142  /* Matrix A = Aik */
143  A = A + indblok + (shift * N);
144  B = B + indblok;
145 
146  /*
147  * Compute update A * B'
148  */
149  wtmp = work;
150  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
151  cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
152  M, N, K,
153  CBLAS_SADDR(zone), A, stride,
154  B, stride,
155  CBLAS_SADDR(zzero), wtmp, M );
156  kernel_trace_stop_lvl2( FLOPS_ZGEMM( M, N, K ) );
157 
158  /*
159  * Add contribution to C in fcblk
160  */
161 
162  /* Get the first block of the distant panel */
163  fblok = fcblk->fblokptr;
164 
165  /* Move the pointer to the top of the right column */
166  C = C + (blok->frownum - fcblk->fcolnum) * stridef;
167 
168  lblok = cblk[1].fblokptr;
169 
170  /* for all following blocks in block column */
171  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
172 
173  /* Find facing blok */
174  while (!is_block_inside_fblock( iterblok, fblok ))
175  {
176  fblok++;
177  assert( fblok < fcblk[1].fblokptr );
178  }
179 
180  tmpC = C + fblok->coefind + iterblok->frownum - fblok->frownum;
181  m = blok_rownbr( iterblok );
182 
183  pastix_cblk_lock( fcblk );
184  core_zgeadd( PastixNoTrans, m, N,
185  -1.0, wtmp, M,
186  1.0, tmpC, stridef );
187  pastix_cblk_unlock( fcblk );
188 
189  /* Displacement to next block */
190  wtmp += m;
191  }
192 }
193 
194 /**
195  *******************************************************************************
196  *
197  * @ingroup kernel_fact_null
198  *
199  * @brief Compute the updates that are generated by the transposition of one
200  * single off-diagonal block.
201  *
202  * The cblk involved in the matrices A and B are stored with the 1D storage:
203  * Column Major Layout with blocks interleaved. The facing cblk of the atrix C,
204  * is stored with the 2D storage where each block is stored continuously one
205  * after another. (Similar to dense tile storage with variant tile size)
206  *
207  * All the off-diagonal block below block are multiplied by the selected block
208  * and added to the facing cblk.
209  *
210  *******************************************************************************
211  *
212  * @param[in] sideA
213  * Specify if A and C belong to the lower part, or to the upper part.
214  * If sideA == PastixLCoef, the contribution of:
215  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
216  * C, otherwise the contribution:
217  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
218  * to C.
219  * The pointer to the data structure that describes the panel from
220  * which we compute the contributions. Next column blok must be
221  * accessible through cblk[1].
222  *
223  * @param[in] trans
224  * Specify the transposition used for the B matrix. It has to be either
225  * PastixTrans or PastixConjTrans.
226  *
227  * @param[in] cblk
228  * The cblk structure to which block belongs to. The A and B pointers
229  * must be the coeftab of this column block.
230  * Next column blok must be accessible through cblk[1].
231  *
232  * @param[in] blok
233  * The block from which we compute the contributions.
234  *
235  * @param[inout] fcblk
236  * The pointer to the data structure that describes the panel on which
237  * we compute the contributions. The C pointer must be one of the
238  * coeftab from this fcblk. Next column blok must be accessible through
239  * fcblk[1].
240  *
241  * @param[in] A
242  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
243  * coefficients of the panel when the Lower part is computed,
244  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
245  *
246  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
247  * the coefficients of the panel, if Symmetric/Hermitian cases or if
248  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
249  * cblk.stride -by- cblk.width
250  *
251  * @param[inout] C
252  * The pointer to the fcblk.lcoeftab if the lower part is computed,
253  * fcblk.ucoeftab otherwise.
254  *
255  *******************************************************************************
256  *
257  * @sa core_zgemmsp_1d1d
258  * @sa core_zgemmsp_2d2d
259  *
260  *******************************************************************************/
261 static inline void
263  pastix_trans_t trans,
264  const SolverCblk *cblk,
265  const SolverBlok *blok,
266  SolverCblk *fcblk,
267  const pastix_complex64_t *A,
268  const pastix_complex64_t *B,
269  pastix_complex64_t *C )
270 {
271  const SolverBlok *iterblok;
272  const SolverBlok *fblok;
273  const SolverBlok *lblok;
274  const pastix_complex64_t *blokA;
275  const pastix_complex64_t *blokB;
276  pastix_complex64_t *blokC;
277 
278  pastix_int_t stride, stridef;
279  pastix_int_t M, N, K;
280  int shift;
281 
282  /* cblk is stored in 1D and fcblk in 2D */
283  assert(!(cblk->cblktype & CBLK_LAYOUT_2D));
284  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
285 
286  shift = (sideA == PastixUCoef) ? 1 : 0;
287  stride = cblk->stride;
288 
289  /* Get the B block and its dimensions */
290  blokB = B + blok->coefind;
291 
292  stride = cblk->stride;
293  K = cblk_colnbr( cblk );
294  N = blok_rownbr( blok );
295 
296  /**
297  * Add contribution to C in fcblk:
298  * Get the first facing block of the distant panel, and the last block of
299  * the current cblk
300  */
301  fblok = fcblk->fblokptr;
302  lblok = cblk[1].fblokptr;
303 
304  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
305 
306  /* Find facing blok */
307  while (!is_block_inside_fblock( iterblok, fblok ))
308  {
309  fblok++;
310  assert( fblok < fcblk[1].fblokptr );
311  }
312 
313  stridef = blok_rownbr(fblok);
314 
315  /* Get the A block and its dimensions */
316  blokA = A + iterblok->coefind;
317  M = blok_rownbr( iterblok );
318 
319  blokC = C + fblok->coefind
320  + iterblok->frownum - fblok->frownum
321  + (blok->frownum - fcblk->fcolnum) * stridef;
322 
323  pastix_cblk_lock( fcblk );
324  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
325  cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
326  M, N, K,
327  CBLAS_SADDR(mzone), blokA, stride,
328  blokB, stride,
329  CBLAS_SADDR(zone), blokC, stridef );
330  kernel_trace_stop_lvl2( FLOPS_ZGEMM( M, N, K ) );
331  pastix_cblk_unlock( fcblk );
332  }
333 }
334 
335 /**
336  *******************************************************************************
337  *
338  * @ingroup kernel_fact_null
339  *
340  * @brief Compute the updates that are generated by the transposition of one
341  * single off-diagonal block.
342  *
343  * Both cblk involved in the matrices A, B and C are stored with the 2D storage
344  * where each block is stored continuously one after another. (Similar to dense
345  * tile storage with variant tile size)
346  *
347  * All the off-diagonal block below block are multiplied by the selected block
348  * and added to the facing cblk.
349  *
350  *******************************************************************************
351  *
352  * @param[in] sideA
353  * Specify if A and C belong to the lower part, or to the upper part.
354  * If sideA == PastixLCoef, the contribution of:
355  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
356  * C, otherwise the contribution:
357  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
358  * to C.
359  * The pointer to the data structure that describes the panel from
360  * which we compute the contributions. Next column blok must be
361  * accessible through cblk[1].
362  *
363  * @param[in] trans
364  * Specify the transposition used for the B matrix. It has to be either
365  * PastixTrans or PastixConjTrans.
366  *
367  * @param[in] cblk
368  * The cblk structure to which block belongs to. The A and B pointers
369  * must be the coeftab of this column block.
370  * Next column blok must be accessible through cblk[1].
371  *
372  * @param[in] blok
373  * The block from which we compute the contributions.
374  *
375  * @param[inout] fcblk
376  * The pointer to the data structure that describes the panel on which
377  * we compute the contributions. The C pointer must be one of the
378  * coeftab from this fcblk. Next column blok must be accessible through
379  * fcblk[1].
380  *
381  * @param[in] A
382  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
383  * coefficients of the panel when the Lower part is computed,
384  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
385  *
386  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
387  * the coefficients of the panel, if Symmetric/Hermitian cases or if
388  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
389  * cblk.stride -by- cblk.width
390  *
391  * @param[inout] C
392  * The pointer to the fcblk.lcoeftab if the lower part is computed,
393  * fcblk.ucoeftab otherwise.
394  *
395  *******************************************************************************
396  *
397  * @sa core_zgemmsp_1d1d
398  * @sa core_zgemmsp_1d2d
399  *
400  *******************************************************************************/
401 static inline void
403  pastix_trans_t trans,
404  const SolverCblk *cblk,
405  const SolverBlok *blok,
406  SolverCblk *fcblk,
407  const pastix_complex64_t *A,
408  const pastix_complex64_t *B,
409  pastix_complex64_t *C )
410 {
411  const SolverBlok *iterblok;
412  const SolverBlok *fblok;
413  const SolverBlok *lblok;
414  const pastix_complex64_t *blokA;
415  const pastix_complex64_t *blokB;
416  pastix_complex64_t *blokC;
417 
418  pastix_int_t M, N, K, lda, ldb, ldc;
419  int shift;
420 
421  /* Both cblk and fcblk must be stored in 2D */
422  assert( cblk->cblktype & CBLK_LAYOUT_2D );
423  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
424 
425  shift = (sideA == PastixUCoef) ? 1 : 0;
426 
427  /* Get the B block and its dimensions */
428  blokB = B + blok->coefind;
429 
430  ldb = blok_rownbr( blok );
431  K = cblk_colnbr( cblk );
432  N = blok_rownbr( blok );
433 
434  /*
435  * Add contribution to C in fcblk:
436  * Get the first facing block of the distant panel, and the last block of
437  * the current cblk
438  */
439  fblok = fcblk->fblokptr;
440  lblok = cblk[1].fblokptr;
441 
442  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
443 
444  /* Find facing blok */
445  while (!is_block_inside_fblock( iterblok, fblok ))
446  {
447  fblok++;
448  assert( fblok < fcblk[1].fblokptr );
449  }
450 
451  ldc = blok_rownbr(fblok);
452 
453  /* Get the A block and its dimensions */
454  blokA = A + iterblok->coefind;
455  M = blok_rownbr( iterblok );
456  lda = M;
457 
458  blokC = C + fblok->coefind
459  + iterblok->frownum - fblok->frownum
460  + (blok->frownum - fcblk->fcolnum) * ldc;
461 
462  pastix_cblk_lock( fcblk );
463  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
464  cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
465  M, N, K,
466  CBLAS_SADDR(mzone), blokA, lda,
467  blokB, ldb,
468  CBLAS_SADDR(zone), blokC, ldc );
469  kernel_trace_stop_lvl2( FLOPS_ZGEMM( M, N, K ) );
470  pastix_cblk_unlock( fcblk );
471  }
472 }
473 
474 /**
475  *******************************************************************************
476  *
477  * @ingroup kernel_fact_null
478  *
479  * @brief Compute the updates that are generated by the transposition of all the
480  * blocks facing a common diagonal block, by another similar set of blocks.
481  *
482  * This is used to performed an update:
483  *
484  * C_mn = C_mn - A_mk * op(B_kn)
485  *
486  * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
487  * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
488  * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
489  * belongs to the set of block of the cblk n facing the diagonal block of the
490  * cblk m.
491  *
492  *******************************************************************************
493  *
494  * @param[in] trans
495  * Specify the transposition used for the B matrices. It has to be either
496  * PastixTrans or PastixConjTrans.
497  *
498  * @param[in] blok_mk
499  * Index of the first off-diagonal block in cblk, that is used for A.
500  *
501  * @param[in] blok_kn
502  * Index of the first off-diagonal block in cblk, that is used for B.
503  *
504  * @param[in] blok_mn
505  * Index of the first off-diagonal block in fcblk, that is used for C.
506  *
507  * @param[in] cblk
508  * The cblk structure to which block belongs to. The A and B pointers
509  * must be the coeftab of this column block.
510  * Next column blok must be accessible through cblk[1].
511  *
512  * @param[inout] fcblk
513  * The pointer to the data structure that describes the panel on which
514  * we compute the contributions. The C pointer must be one of the
515  * coeftab from this fcblk. Next column blok must be accessible through
516  * fcblk[1].
517  *
518  * @param[in] A
519  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
520  * coefficients of the panel when the Lower part is computed,
521  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
522  *
523  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
524  * the coefficients of the panel, if Symmetric/Hermitian cases or if
525  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
526  * cblk.stride -by- cblk.width
527  *
528  * @param[inout] C
529  * The pointer to the fcblk.lcoeftab if the lower part is computed,
530  * fcblk.ucoeftab otherwise.
531  *
532  *******************************************************************************
533  *
534  * @return The number of flops performed
535  *
536  *******************************************************************************
537  *
538  * @sa core_zgemmsp_block_frlr
539  * @sa core_zgemmsp_block_lrlr
540  *
541  *******************************************************************************/
542 static inline pastix_fixdbl_t
544  pastix_int_t blok_mk,
545  pastix_int_t blok_kn,
546  pastix_int_t blok_mn,
547  const SolverCblk *cblk,
548  SolverCblk *fcblk,
549  const pastix_complex64_t *A,
550  const pastix_complex64_t *B,
551  pastix_complex64_t *C )
552 {
553  const SolverBlok *blokA, *blokB, *blokC;
554  const SolverBlok *bA, *bB, *bC;
555  const SolverBlok *fblokK, *lblokK;
556  const SolverBlok *fblokN, *lblokN;
557 
558  const pastix_complex64_t *Aptr, *Bptr;
559  pastix_complex64_t *Cptr;
560  pastix_int_t M, N, K, lda, ldb, ldc, cblk_n, cblk_m;
561  pastix_int_t full_m, full_n;
562  size_t offsetA, offsetB, offsetC;
563 
564  pastix_fixdbl_t flops = 0.0;
565  pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
566 
567  /* Both cblk and fcblk must be stored in 2D */
568  assert( cblk->cblktype & CBLK_LAYOUT_2D );
569  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
570 
571  /*
572  * Blocs on column K
573  */
574  fblokK = cblk[0].fblokptr;
575  lblokK = cblk[1].fblokptr;
576 
577  blokB = fblokK + blok_kn;
578  offsetB = blokB->coefind;
579  cblk_n = blokB->fcblknm;
580 
581  blokA = fblokK + blok_mk;
582  offsetA = blokA->coefind;
583  cblk_m = blokA->fcblknm;
584 
585  /*
586  * Blocs on column N
587  */
588  fblokN = fcblk[0].fblokptr;
589  lblokN = fcblk[1].fblokptr;
590 
591  blokC = fblokN + blok_mn;
592  offsetC = blokC->coefind;
593  assert( blokC->fcblknm == cblk_m );
594 
595  K = cblk_colnbr( cblk );
596  full_m = 0;
597 
598  bC = blokC;
599  for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++) {
600  M = blok_rownbr(bA);
601  Aptr = A + bA->coefind - offsetA;
602  lda = M;
603  full_m += M;
604 
605  /* Find facing C blok */
606  while (!is_block_inside_fblock( bA, bC )) {
607  bC++;
608  assert( bC < lblokN );
609  }
610 
611  Cptr = C + bC->coefind - offsetC;
612  ldc = blok_rownbr(bC);
613 
614  full_n = 0;
615  for (bB = blokB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++) {
616  N = blok_rownbr( bB );
617  full_n += N;
618  Bptr = B + bB->coefind - offsetB;
619  ldb = N;
620 
621  cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
622  M, N, K,
623  CBLAS_SADDR(mzone), Aptr, lda,
624  Bptr, ldb,
625  CBLAS_SADDR(zone), Cptr + (bA->frownum - bC->frownum)
626  + (bB->frownum - fcblk->fcolnum) * ldc , ldc );
627 
628  flops += FLOPS_ZGEMM( M, N, K );
629  }
630  }
631 
632  kernel_trace_stop( blokB->inlast, PastixKernelGEMMBlok2d2d,
633  full_m, full_n, K,
634  flops, time );
635 
636  (void)lblokN;
637  return flops;
638 }
639 
640 /**
641  *******************************************************************************
642  *
643  * @ingroup kernel_fact_null
644  *
645  * @brief Compute the updates that are generated by the transposition of all the
646  * blocks facing a common diagonal block, by another similar set of blocks.
647  *
648  * This is used to perform an update:
649  *
650  * C_mn = C_mn - A_mk * op(B_kn)
651  *
652  * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
653  * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
654  * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
655  * belongs to the set of block of the cblk n facing the diagonal block of the
656  * cblk m.
657  *
658  * In this routine, all the matrices are low-rank
659  *
660  *******************************************************************************
661  *
662  * @param[in] transB
663  * Specify the transposition used for the B matrices. It has to be either
664  * PastixTrans or PastixConjTrans.
665  *
666  * @param[in] blok_mk
667  * Index of the first off-diagonal block in cblk, that is used for A.
668  *
669  * @param[in] blok_kn
670  * Index of the first off-diagonal block in cblk, that is used for B.
671  *
672  * @param[in] blok_mn
673  * Index of the first off-diagonal block in fcblk, that is used for C.
674  *
675  * @param[in] cblk
676  * The cblk structure to which block belongs to. The A and B pointers
677  * must be the coeftab of this column block.
678  * Next column blok must be accessible through cblk[1].
679  *
680  * @param[inout] fcblk
681  * The pointer to the data structure that describes the panel on which
682  * we compute the contributions. The C pointer must be one of the
683  * coeftab from this fcblk. Next column blok must be accessible through
684  * fcblk[1].
685  *
686  * @param[in] A
687  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
688  * coefficients of the panel when the Lower part is computed,
689  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
690  *
691  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
692  * the coefficients of the panel, if Symmetric/Hermitian cases or if
693  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
694  * cblk.stride -by- cblk.width
695  *
696  * @param[in] lrC
697  * Pointer to the low-rank representation of the block C.
698  * Must be followed by the low-rank representation of the following blocks.
699  *
700  * @param[in] lowrank
701  * The structure with low-rank parameters.
702  *
703  *******************************************************************************
704  *
705  * @return The number of flops performed
706  *
707  *******************************************************************************/
708 static inline pastix_fixdbl_t
710  pastix_int_t blok_mk,
711  pastix_int_t blok_kn,
712  pastix_int_t blok_mn,
713  const SolverCblk *cblk,
714  SolverCblk *fcblk,
715  const pastix_complex64_t *A,
716  const pastix_complex64_t *B,
717  pastix_lrblock_t *lrC,
718  const pastix_lr_t *lowrank )
719 {
720  const SolverBlok *blokA, *blokB, *blokC;
721  const SolverBlok *bA, *bB, *bC;
722  const SolverBlok *fblokK, *lblokK;
723  const SolverBlok *fblokN, *lblokN;
724 
725  pastix_lrblock_t lrA, lrB;
726  core_zlrmm_t params;
727 
728  pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
729  size_t offsetA, offsetB;
730 
731  pastix_fixdbl_t flops = 0.0;
732  pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
733 
734  /* Provide an internal lock as this function is already protected by the runtime dependency system */
735  pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
736 
737  /* Both cblk and fcblk must be stored in 2D */
738  assert( cblk->cblktype & CBLK_LAYOUT_2D );
739  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
740 
741  /* Both cblk and fcblk must be compressed */
742  assert(!(cblk->cblktype & CBLK_COMPRESSED) );
743  assert( fcblk->cblktype & CBLK_COMPRESSED );
744 
745  /*
746  * Blocs on column K
747  */
748  fblokK = cblk[0].fblokptr;
749  lblokK = cblk[1].fblokptr;
750 
751  blokB = fblokK + blok_kn;
752  offsetB = blokB->coefind;
753  cblk_n = blokB->fcblknm;
754  lrB.rk = -1;
755 
756  blokA = fblokK + blok_mk;
757  offsetA = blokA->coefind;
758  cblk_m = blokA->fcblknm;
759  lrA.rk = -1;
760 
761  /*
762  * Blocs on column N
763  */
764  fblokN = fcblk[0].fblokptr;
765  lblokN = fcblk[1].fblokptr;
766 
767  blokC = fblokN + blok_mn;
768  assert( blokC->fcblknm == cblk_m );
769 
770  full_m = 0;
771  K = cblk_colnbr( cblk );
772 
773  params.lowrank = lowrank;
774  params.transA = PastixNoTrans;
775  params.transB = transB;
776  params.K = K;
777  params.Cn = cblk_colnbr( fcblk );
778  params.alpha = -1.0;
779  params.beta = 1.0;
780  params.work = NULL;
781  params.lwork = -1;
782  params.lock = &(lock);
783 
784  bC = blokC;
785  for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++) {
786  M = blok_rownbr(bA);
787  params.M = M;
788  full_m += M;
789 
790  lrA.rkmax = M;
791  lrA.u = (pastix_complex64_t*)A + bA->coefind - offsetA;
792  lrA.v = NULL;
793 
794  /* Find facing C blok */
795  while (!is_block_inside_fblock( bA, bC )) {
796  bC++;
797  lrC++;
798  assert( bC < lblokN );
799  }
800 
801  params.A = &lrA;
802  params.C = lrC;
803  params.Cm = blok_rownbr( bC );
804 
805  full_n = 0;
806  for (bB = blokB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++) {
807 
808  params.N = blok_rownbr( bB );
809  full_n += params.N;
810  params.offx = bA->frownum - bC->frownum;
811  params.offy = bB->frownum - fcblk->fcolnum;
812 
813  lrB.rkmax = M;
814  lrB.u = (pastix_complex64_t*)B + bB->coefind - offsetB;
815  lrB.v = NULL;
816 
817  params.B = &lrB;
818 
819  flops += core_zlrmm( &params );
820  }
821  }
822 
823  kernel_trace_stop( blokB->inlast, PastixKernelGEMMBlokLRLR,
824  full_m, full_n, K,
825  flops, time );
826  (void)lblokN;
827  return flops;
828 }
829 
830 /**
831  *******************************************************************************
832  *
833  * @ingroup kernel_fact_null
834  *
835  * @brief Compute the updates that are generated by the transposition of all the
836  * blocks facing a common diagonal block, by another similar set of blocks.
837  *
838  * This is used to perform an update:
839  *
840  * C_mn = C_mn - A_mk * op(B_kn)
841  *
842  * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
843  * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
844  * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
845  * belongs to the set of block of the cblk n facing the diagonal block of the
846  * cblk m.
847  *
848  * In this routine, all the matrices are low-rank
849  *
850  *******************************************************************************
851  *
852  * @param[in] transB
853  * Specify the transposition used for the B matrices. It has to be either
854  * PastixTrans or PastixConjTrans.
855  *
856  * @param[in] blok_mk
857  * Index of the first off-diagonal block in cblk, that is used for A.
858  *
859  * @param[in] blok_kn
860  * Index of the first off-diagonal block in cblk, that is used for B.
861  *
862  * @param[in] blok_mn
863  * Index of the first off-diagonal block in fcblk, that is used for C.
864  *
865  * @param[in] cblk
866  * The cblk structure to which block belongs to. The A and B pointers
867  * must be the coeftab of this column block.
868  * Next column blok must be accessible through cblk[1].
869  *
870  * @param[inout] fcblk
871  * The pointer to the data structure that describes the panel on which
872  * we compute the contributions. The C pointer must be one of the
873  * coeftab from this fcblk. Next column blok must be accessible through
874  * fcblk[1].
875  *
876  * @param[inout] lrA
877  * Pointer to the low-rank representation of the block A.
878  * Must be followed by the low-rank representation of the following blocks.
879  *
880  * @param[inout] lrB
881  * Pointer to the low-rank representation of the block B.
882  * Must be followed by the low-rank representation of the following blocks.
883  *
884  * @param[inout] lrC
885  * Pointer to the low-rank representation of the block C.
886  * Must be followed by the low-rank representation of the following blocks.
887  *
888  * @param[in] lowrank
889  * The structure with low-rank parameters.
890  *
891  *******************************************************************************
892  *
893  * @return The number of flops performed
894  *
895  *******************************************************************************/
896 static inline pastix_fixdbl_t
898  pastix_int_t blok_mk,
899  pastix_int_t blok_kn,
900  pastix_int_t blok_mn,
901  const SolverCblk *cblk,
902  SolverCblk *fcblk,
903  const pastix_lrblock_t *lrA,
904  const pastix_lrblock_t *lrB,
905  pastix_lrblock_t *lrC,
906  const pastix_lr_t *lowrank )
907 {
908  const pastix_lrblock_t *blrB;
909  const SolverBlok *blokA, *blokB, *blokC;
910  const SolverBlok *bA, *bB, *bC;
911  const SolverBlok *fblokK, *lblokK;
912  const SolverBlok *fblokN, *lblokN;
913 
914  core_zlrmm_t params;
915 
916  pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
917 
918  pastix_fixdbl_t flops = 0.0;
919  pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );
920 
921  /* Provide an internal lock as this function is already protected by the runtime dependency system */
922  pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
923 
924  /* Both cblk and fcblk must be stored in 2D */
925  assert( cblk->cblktype & CBLK_LAYOUT_2D );
926  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
927 
928  /* Both cblk and fcblk must be compressed */
929  assert( cblk->cblktype & CBLK_COMPRESSED );
930  assert( fcblk->cblktype & CBLK_COMPRESSED );
931 
932  /*
933  * Blocs on column K
934  */
935  fblokK = cblk[0].fblokptr;
936  lblokK = cblk[1].fblokptr;
937 
938  blokB = fblokK + blok_kn;
939  cblk_n = blokB->fcblknm;
940 
941  blokA = fblokK + blok_mk;
942  cblk_m = blokA->fcblknm;
943 
944  /*
945  * Blocs on column N
946  */
947  fblokN = fcblk[0].fblokptr;
948  lblokN = fcblk[1].fblokptr;
949 
950  blokC = fblokN + blok_mn;
951  assert( blokC->fcblknm == cblk_m );
952 
953  full_m = 0;
954  K = cblk_colnbr( cblk );
955 
956  params.lowrank = lowrank;
957  params.transA = PastixNoTrans;
958  params.transB = transB;
959  params.K = K;
960  params.Cn = cblk_colnbr( fcblk );
961  params.alpha = -1.0;
962  params.beta = 1.0;
963  params.work = NULL;
964  params.lwork = -1;
965  params.lock = &(lock);
966 
967  bC = blokC;
968  for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++, lrA++) {
969  M = blok_rownbr(bA);
970  params.M = M;
971  full_m += M;
972 
973  /* Find facing C blok */
974  while (!is_block_inside_fblock( bA, bC )) {
975  bC++;
976  lrC++;
977  assert( bC < lblokN );
978  }
979 
980  params.A = lrA;
981  params.C = lrC;
982  params.Cm = blok_rownbr( bC );
983 
984  full_n = 0;
985  for (bB = blokB, blrB = lrB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++, blrB++) {
986 
987  params.N = blok_rownbr( bB );
988  full_n += params.N;
989 
990  params.offx = bA->frownum - bC->frownum;
991  params.offy = bB->frownum - fcblk->fcolnum;
992  params.B = blrB;
993 
994  flops += core_zlrmm( &params );
995  }
996  }
997 
998  kernel_trace_stop( blokB->inlast, PastixKernelGEMMBlokLRLR,
999  full_m, full_n, K,
1000  flops, time );
1001  (void)lblokN;
1002  return flops;
1003 }
1004 
1005 /**
1006  *******************************************************************************
1007  *
1008  * @ingroup kernel_fact_null
1009  *
1010  * @brief Compute the updates associated to one off-diagonal block.
1011  *
1012  *******************************************************************************
1013  *
1014  * @param[in] sideA
1015  * Specify if A and C belong to the lower part, or to the upper part.
1016  * If sideA == PastixLCoef, the contribution of:
1017  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1018  * C, otherwise the contribution:
1019  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1020  * to C.
1021  *
1022  * @param[in] trans
1023  * Specify the transposition used for the B matrix. It has to be either
1024  * PastixTrans or PastixConjTrans.
1025  *
1026  * @param[in] cblk
1027  * The cblk structure to which block belongs to. The A and B pointers
1028  * must be the coeftab of this column block.
1029  * Next column blok must be accessible through cblk[1].
1030  *
1031  * @param[in] blok
1032  * The block from which we compute the contributions.
1033  *
1034  * @param[inout] fcblk
1035  * The pointer to the data structure that describes the panel on which
1036  * we compute the contributions. The C pointer must be one of the
1037  * coeftab from this fcblk. Next column blok must be accessible through
1038  * fcblk[1].
1039  *
1040  * @param[in] A
1041  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
1042  * coefficients of the panel when the Lower part is computed,
1043  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
1044  *
1045  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
1046  * the coefficients of the panel, if Symmetric/Hermitian cases or if
1047  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
1048  * cblk.stride -by- cblk.width
1049  *
1050  * @param[in] lrC
1051  * Pointer to the low-rank representation of the block C.
1052  * Must be followed by the low-rank representation of the following blocks.
1053  *
1054  * @param[in] work
1055  * Temporary memory buffer.
1056  *
1057  * @param[in] lwork
1058  * TODO
1059  *
1060  * @param[in] lowrank
1061  * The structure with low-rank parameters.
1062  *
1063  *******************************************************************************
1064  *
1065  * @return The number of flops performed
1066  *
1067  *******************************************************************************/
1068 static inline pastix_fixdbl_t
1070  pastix_trans_t trans,
1071  const SolverCblk *cblk,
1072  const SolverBlok *blok,
1073  SolverCblk *fcblk,
1074  const pastix_complex64_t *A,
1075  const pastix_complex64_t *B,
1076  pastix_lrblock_t *lrC,
1077  pastix_complex64_t *work,
1078  pastix_int_t lwork,
1079  const pastix_lr_t *lowrank )
1080 {
1081  const SolverBlok *iterblok;
1082  const SolverBlok *fblok;
1083  const SolverBlok *lblok;
1084  pastix_lrblock_t lrA, lrB;
1085  core_zlrmm_t params;
1086 
1087  pastix_int_t stride, shift;
1088  pastix_int_t M, N;
1089 
1090  pastix_fixdbl_t flops = 0.0;
1091 
1092  /* Update from a dense block to a low rank block */
1093  assert(!(cblk->cblktype & CBLK_COMPRESSED));
1094  assert( fcblk->cblktype & CBLK_COMPRESSED );
1095  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1096 
1097  assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1098 
1099  shift = (sideA == PastixUCoef) ? 1 : 0;
1100  stride = cblk->stride;
1101 
1102  N = blok_rownbr( blok );
1103 
1104  /* Get the B block and its dimensions */
1105  lrB.rk = -1;
1106  lrB.rkmax = (cblk->cblktype & CBLK_LAYOUT_2D) ? N : stride;
1107  lrB.u = (pastix_complex64_t*)B + blok->coefind; /* lrB is const, we can cast the B pointer */
1108  lrB.v = NULL;
1109 
1110  /**
1111  * Add contribution to C in fcblk:
1112  * Get the first facing block of the distant panel, and the last block of
1113  * the current cblk
1114  */
1115  fblok = fcblk->fblokptr;
1116  lblok = cblk[1].fblokptr;
1117 
1118  params.lowrank = lowrank;
1119  params.transA = PastixNoTrans;
1120  params.transB = trans;
1121  params.N = blok_rownbr( blok );
1122  params.K = cblk_colnbr( cblk );
1123  params.Cn = cblk_colnbr( fcblk );
1124  params.alpha = -1.0;
1125  params.beta = 1.0;
1126  params.work = work;
1127  params.lwork = lwork;
1128  params.lock = &(fcblk->lock);
1129  params.B = &lrB;
1130 
1131  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
1132 
1133  /* Find facing blok */
1134  while (!is_block_inside_fblock( iterblok, fblok ))
1135  {
1136  fblok++;
1137  lrC++;
1138  assert( fblok < fcblk[1].fblokptr );
1139  }
1140 
1141  /* Get the A block and its dimensions */
1142  M = blok_rownbr( iterblok );
1143  lrA.rk = -1;
1144  lrA.rkmax = (cblk->cblktype & CBLK_LAYOUT_2D) ? M : stride;
1145  lrA.u = (pastix_complex64_t*)A + iterblok->coefind; /* Same as for B */
1146  lrA.v = NULL;
1147 
1148  params.M = M;
1149  params.A = &lrA;
1150  params.C = lrC;
1151  params.Cm = blok_rownbr( fblok );
1152 
1153  params.offx = iterblok->frownum - fblok->frownum;
1154  params.offy = blok->frownum - fcblk->fcolnum;
1155 
1156  flops += core_zlrmm( &params );
1157  }
1158  return flops;
1159 }
1160 
1161 /**
1162  *******************************************************************************
1163  *
1164  * @ingroup kernel_fact_null
1165  *
1166  * @brief Compute the updates associated to one off-diagonal block.
1167  *
1168  *******************************************************************************
1169  *
1170  * @param[in] sideA
1171  * Specify if A and C belong to the lower part, or to the upper part.
1172  * If sideA == PastixLCoef, the contribution of:
1173  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1174  * C, otherwise the contribution:
1175  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1176  * to C.
1177  *
1178  * @param[in] trans
1179  * Specify the transposition used for the B matrix. It has to be either
1180  * PastixTrans or PastixConjTrans.
1181  *
1182  * @param[in] cblk
1183  * The cblk structure to which block belongs to. The A and B pointers
1184  * must be the coeftab of this column block.
1185  * Next column blok must be accessible through cblk[1].
1186  *
1187  * @param[in] blok
1188  * The block from which we compute the contributions.
1189  *
1190  * @param[inout] fcblk
1191  * The pointer to the data structure that describes the panel on which
1192  * we compute the contributions. The C pointer must be one of the
1193  * coeftab from this fcblk. Next column blok must be accessible through
1194  * fcblk[1].
1195  *
1196  * @param[in] lrA
1197  * Pointer to the low-rank representation of the block A.
1198  * Must be followed by the low-rank representation of the following blocks.
1199  *
1200  * @param[in] lrB
1201  * Pointer to the low-rank representation of the block B.
1202  * Must be followed by the low-rank representation of the following blocks.
1203  *
1204  * @param[in] lrC
1205  * Pointer to the low-rank representation of the block B.
1206  * Must be followed by the low-rank representation of the following blocks.
1207  *
1208  * @param[in] work
1209  * Temporary memory buffer.
1210  *
1211  * @param[in] lwork
1212  * TODO
1213  *
1214  * @param[in] lowrank
1215  * The structure with low-rank parameters.
1216  *
1217  *******************************************************************************
1218  *
1219  * @return The number of flops performed
1220  *
1221  *******************************************************************************/
1222 static inline pastix_fixdbl_t
1224  pastix_trans_t trans,
1225  const SolverCblk *cblk,
1226  const SolverBlok *blok,
1227  SolverCblk *fcblk,
1228  const pastix_lrblock_t *lrA,
1229  const pastix_lrblock_t *lrB,
1230  pastix_lrblock_t *lrC,
1231  pastix_complex64_t *work,
1232  pastix_int_t lwork,
1233  const pastix_lr_t *lowrank )
1234 {
1235  const SolverBlok *iterblok;
1236  const SolverBlok *fblok;
1237  const SolverBlok *lblok;
1238 
1239  pastix_int_t N, K, shift;
1240  core_zlrmm_t params;
1241 
1242  pastix_fixdbl_t flops = 0.0;
1243 
1244  /* Update from a low-rank cblk to a low-rank cblk */
1245  assert( cblk->cblktype & CBLK_COMPRESSED );
1246  assert( fcblk->cblktype & CBLK_COMPRESSED );
1247  assert( cblk->cblktype & CBLK_LAYOUT_2D );
1248  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1249 
1250  assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1251 
1252  shift = (sideA == PastixUCoef) ? 1 : 0;
1253 
1254  /* Get the B block and its dimensions */
1255  K = cblk_colnbr( cblk );
1256  N = blok_rownbr( blok );
1257 
1258  /*
1259  * Add contribution to C in fcblk:
1260  * Get the first facing block of the distant panel, and the last block of
1261  * the current cblk
1262  */
1263  fblok = fcblk->fblokptr;
1264  lblok = cblk[1].fblokptr;
1265 
1266  params.lowrank = lowrank;
1267  params.transA = PastixNoTrans;
1268  params.transB = trans;
1269  params.N = N;
1270  params.K = K;
1271  params.Cn = cblk_colnbr( fcblk );
1272  params.alpha = -1.0;
1273  params.beta = 1.0;
1274  params.work = work;
1275  params.lwork = lwork;
1276  params.lwused = 0;
1277  params.lock = &(fcblk->lock);
1278  params.B = lrB + (blok - cblk->fblokptr);
1279 
1280  /* for all following blocks in block column */
1281  lrA = lrA + (blok - cblk->fblokptr) + shift;
1282  for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1283 
1284  /* Find facing blok */
1285  while (!is_block_inside_fblock( iterblok, fblok ))
1286  {
1287  fblok++;
1288  lrC++;
1289  assert( fblok < fcblk[1].fblokptr );
1290  }
1291 
1292  params.M = blok_rownbr( iterblok );
1293  params.A = lrA;
1294  params.C = lrC;
1295  params.Cm = blok_rownbr( fblok );
1296 
1297  params.offx = iterblok->frownum - fblok->frownum;
1298  params.offy = blok->frownum - fcblk->fcolnum;
1299 
1300  flops += core_zlrmm( &params );
1301  }
1302  return flops;
1303 }
1304 
1305 /**
1306  *******************************************************************************
1307  *
1308  * @ingroup kernel_fact_null
1309  *
1310  * @brief Compute the updates associated to one off-diagonal block.
1311  *
1312  *******************************************************************************
1313  *
1314  * @param[in] sideA
1315  * Specify if A and C belong to the lower part, or to the upper part.
1316  * If sideA == PastixLCoef, the contribution of:
1317  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1318  * C, otherwise the contribution:
1319  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1320  * to C.
1321  *
1322  *
1323  * @param[in] trans
1324  * Specify the transposition used for the B matrix. It has to be either
1325  * PastixTrans or PastixConjTrans.
1326  *
1327  * @param[in] cblk
1328  * The cblk structure to which block belongs to. The A and B pointers
1329  * must be the coeftab of this column block.
1330  * Next column blok must be accessible through cblk[1].
1331  *
1332  * @param[in] blok
1333  * The block from which we compute the contributions.
1334  *
1335  * @param[inout] fcblk
1336  * The pointer to the data structure that describes the panel on which
1337  * we compute the contributions. The C pointer must be one of the
1338  * coeftab from this fcblk. Next column blok must be accessible through
1339  * fcblk[1].
1340  *
1341  * @param[in] lrA
1342  * Pointer to the low-rank representation of the block A.
1343  * Must be followed by the low-rank representation of the following blocks.
1344  *
1345  * @param[in] lrB
1346  * Pointer to the low-rank representation of the block B.
1347  * Must be followed by the low-rank representation of the following blocks.
1348  *
1349  * @param[inout] C
1350  * The pointer to the fcblk.lcoeftab if the lower part is computed,
1351  * fcblk.ucoeftab otherwise.
1352  *
1353  * @param[in] work
1354  * Temporary memory buffer.
1355  *
1356  * @param[in] lwork
1357  * The length of work.
1358  * On entry, if trans = PastixTrans
1359  * lwork >= max(1, K*N). Otherwise lwork >= max(1, M*K).
1360  *
1361  * @param[in] lowrank
1362  * The structure with low-rank parameters.
1363  *
1364  *******************************************************************************
1365  *
1366  * @return The number of flops performed
1367  *
1368  *******************************************************************************/
1369 static inline pastix_fixdbl_t
1371  pastix_trans_t trans,
1372  const SolverCblk *cblk,
1373  const SolverBlok *blok,
1374  SolverCblk *fcblk,
1375  const pastix_lrblock_t *lrA,
1376  const pastix_lrblock_t *lrB,
1377  pastix_complex64_t *C,
1378  pastix_complex64_t *work,
1379  pastix_int_t lwork,
1380  const pastix_lr_t *lowrank )
1381 {
1382  const SolverBlok *iterblok;
1383  const SolverBlok *fblok;
1384  const SolverBlok *lblok;
1385 
1386  pastix_int_t N, K, shift;
1387  pastix_lrblock_t lrC;
1388  core_zlrmm_t params;
1389 
1390  pastix_fixdbl_t flops = 0.0;
1391 
1392  /* Update from a low-rank cblk to a low-rank cblk */
1393  assert( cblk->cblktype & CBLK_COMPRESSED );
1394  assert( !(fcblk->cblktype & CBLK_COMPRESSED) );
1395  assert( cblk->cblktype & CBLK_LAYOUT_2D );
1396  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1397 
1398  assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1399 
1400  shift = (sideA == PastixUCoef) ? 1 : 0;
1401 
1402  /* Get the B block and its dimensions */
1403  K = cblk_colnbr( cblk );
1404  N = blok_rownbr( blok );
1405 
1406  /*
1407  * Add contribution to C in fcblk:
1408  * Get the first facing block of the distant panel, and the last block of
1409  * the current cblk
1410  */
1411  fblok = fcblk->fblokptr;
1412  lblok = cblk[1].fblokptr;
1413 
1414  params.lowrank = lowrank;
1415  params.transA = PastixNoTrans;
1416  params.transB = trans;
1417  params.N = N;
1418  params.K = K;
1419  params.Cn = cblk_colnbr( fcblk );
1420  params.alpha = -1.0;
1421  params.beta = 1.0;
1422  params.work = work;
1423  params.lwork = lwork;
1424  params.lwused = 0;
1425  params.lock = &(fcblk->lock);
1426  params.B = lrB + (blok - cblk->fblokptr);
1427  params.C = &lrC;
1428 
1429  lrC.rk = -1;
1430  lrC.v = NULL;
1431 
1432  /* for all following blocks in block column */
1433  lrA = lrA + (blok - cblk->fblokptr) + shift;
1434  for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1435 
1436  /* Find facing blok */
1437  while (!is_block_inside_fblock( iterblok, fblok ))
1438  {
1439  fblok++;
1440  assert( fblok < fcblk[1].fblokptr );
1441  }
1442 
1443  params.M = blok_rownbr( iterblok );
1444  params.A = lrA;
1445  params.Cm = blok_rownbr( fblok );
1446  params.offx = iterblok->frownum - fblok->frownum;
1447  params.offy = blok->frownum - fcblk->fcolnum;
1448 
1449  lrC.rkmax = params.Cm;
1450  lrC.u = C + fblok->coefind;
1451 
1452  flops += core_zlrmm( &params );
1453  }
1454  return flops;
1455 }
1456 
1457 /**
1458  *******************************************************************************
1459  *
1460  * @brief Compute the updates associated to one off-diagonal block.
1461  *
1462  *******************************************************************************
1463  *
1464  * @param[in] sideA
1465  * Specify if A and C belong to the lower part, or to the upper part.
1466  * If sideA == PastixLCoef, the contribution of:
1467  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1468  * C, otherwise the contribution:
1469  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1470  * to C.
1471  * The pointer to the data structure that describes the panel from
1472  * which we compute the contributions. Next column blok must be
1473  * accessible through cblk[1].
1474  *
1475  * @param[in] trans
1476  * Specify the transposition used for the B matrix. It has to be either
1477  * PastixTrans or PastixConjTrans.
1478  *
1479  * @param[in] cblk
1480  * The cblk structure to which block belongs to. The A and B pointers
1481  * must be the coeftab of this column block.
1482  * Next column blok must be accessible through cblk[1].
1483  *
1484  * @param[in] blok
1485  * The block from which we compute the contributions.
1486  *
1487  * @param[inout] fcblk
1488  * The pointer to the data structure that describes the panel on which
1489  * we compute the contributions. The C pointer must be one of the
1490  * coeftab from this fcblk. Next column blok must be accessible through
1491  * fcblk[1].
1492  *
1493  * @param[in] A
1494  * The pointer to the correct representation of A.
1495  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1496  * - pastix_lr_block if the block is compressed.
1497  *
1498  * @param[in] B
1499  * The pointer to the correct representation of B.
1500  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1501  * - pastix_lr_block if the block is compressed.
1502  *
1503  * @param[inout] C
1504  * The pointer to the correct representation of C.
1505  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1506  * - pastix_lr_block if the block is compressed.
1507  *
1508  * @param[in] work
1509  * Temporary memory buffer.
1510  *
1511  * @param[in] lwork
1512  * TODO
1513  *
1514  * @param[in] lowrank
1515  * The structure with low-rank parameters.
1516  *
1517  *******************************************************************************
1518  *
1519  * @return The number of static pivoting during factorization of the diagonal
1520  * block.
1521  *
1522  *******************************************************************************/
1523 pastix_fixdbl_t
1525  pastix_trans_t trans,
1526  const SolverCblk *cblk,
1527  const SolverBlok *blok,
1528  SolverCblk *fcblk,
1529  const void *A,
1530  const void *B,
1531  void *C,
1532  pastix_complex64_t *work,
1533  pastix_int_t lwork,
1534  const pastix_lr_t *lowrank )
1535 {
1536  pastix_ktype_t ktype;
1537  pastix_fixdbl_t time, flops = 0.0;
1538  pastix_int_t m = cblk->stride;
1539  pastix_int_t n = blok_rownbr( blok );
1540  pastix_int_t k = cblk_colnbr( cblk );
1541 
1542  m -= (cblk->cblktype & CBLK_LAYOUT_2D) ? blok->coefind / k : blok->coefind;
1543  m -= (sideA == PastixUCoef) ? blok_rownbr( blok ) : 0;
1544 
1545  if ( fcblk->cblktype & CBLK_COMPRESSED ) {
1546  if ( cblk->cblktype & CBLK_COMPRESSED ) {
1547  ktype = PastixKernelGEMMCblkLRLR;
1548  time = kernel_trace_start( ktype );
1549 
1550  flops = core_zgemmsp_lr( sideA, trans,
1551  cblk, blok, fcblk,
1552  A, B, C, work, lwork,
1553  lowrank );
1554  }
1555  else {
1556  ktype = PastixKernelGEMMCblkFRLR;
1557  time = kernel_trace_start( ktype );
1558 
1559  flops = core_zgemmsp_fulllr( sideA, trans,
1560  cblk, blok, fcblk,
1561  A, B, C, work, lwork,
1562  lowrank );
1563  }
1564  }
1565  else if ( fcblk->cblktype & CBLK_LAYOUT_2D ) {
1566  if ( cblk->cblktype & CBLK_COMPRESSED) {
1567  ktype = PastixKernelGEMMCblk2d2d;
1568  time = kernel_trace_start( ktype );
1569  core_zgemmsp_lrfr( sideA, trans,
1570  cblk, blok, fcblk,
1571  A, B, C, work, lwork,
1572  lowrank );
1573  }
1574  else if ( cblk->cblktype & CBLK_LAYOUT_2D ) {
1575  ktype = PastixKernelGEMMCblk2d2d;
1576  time = kernel_trace_start( ktype );
1577 
1578  core_zgemmsp_2d2d( sideA, trans,
1579  cblk, blok, fcblk,
1580  A, B, C );
1581  }
1582  else {
1583  ktype = PastixKernelGEMMCblk1d2d;
1584  time = kernel_trace_start( ktype );
1585 
1586  core_zgemmsp_1d2d( sideA, trans,
1587  cblk, blok, fcblk,
1588  A, B, C );
1589  }
1590  flops = FLOPS_ZGEMM( m, n, k );
1591  }
1592  else {
1593  assert( !(cblk->cblktype & CBLK_COMPRESSED) );
1594  ktype = PastixKernelGEMMCblk1d1d;
1595  time = kernel_trace_start( ktype );
1596 
1597  core_zgemmsp_1d1d( sideA, trans,
1598  cblk, blok, fcblk,
1599  A, B, C, work );
1600 
1601  flops = FLOPS_ZGEMM( m, n, k );
1602  }
1603 
1604  kernel_trace_stop( blok->inlast, ktype, m, n, k, flops, time );
1605 
1606  return flops;
1607 }
1608 
1609 /**
1610  *******************************************************************************
1611  *
1612  * @brief Compute the CPU gemm associated to a couple of off-diagonal blocks.
1613  *
1614  * C_l = C_l - A_l * op(B_s), with B_s = B_l, or B_u
1615  * or
1616  * C_u = C_u - A_u * op(B_s), with B_s = B_l, or B_u
1617  *
1618  *******************************************************************************
1619  *
1620  * @param[in] transB
1621  * Specify wheter B should be used as PastixNoTrans, PastixTrans, or
1622  * PastixConjTrans in the computations.
1623  *
1624  * @param[in] cblk
1625  * The cblk structure to which block A and B belong to. The A and B
1626  * pointers must be one of the [lu]coeftab of this column block.
1627  * Next column blok must be accessible through cblk[1].
1628  *
1629  * @param[inout] fcblk
1630  * The pointer to the data structure that describes the panel on which
1631  * we compute the contributions. The C pointer must be one of the
1632  * [lu]coeftab from this fcblk.
1633  * Next column blok must be accessible through fcblk[1].
1634  *
1635  * @param[in] blok_mk
1636  * Specify the index of the A block in the cblk column. This index is
1637  * 0-based for the diagonal block.
1638  *
1639  * @param[in] blok_nk
1640  * Specify the index of the B block in the cblk column. This index is
1641  * 0-based for the diagonal block.
1642  *
1643  * @param[in] blok_mn
1644  * Specify the index of the C block in the fcblk column. This index is
1645  * 0-based for the diagonal block.
1646  *
1647  * @param[in] A
1648  * The pointer to the correct representation of A.
1649  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1650  * - pastix_lr_block if the block is compressed.
1651  *
1652  * @param[in] B
1653  * The pointer to the correct representation of B.
1654  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1655  * - pastix_lr_block if the block is compressed.
1656  *
1657  * @param[in] C
1658  * The pointer to the correct representation of C.
1659  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1660  * - pastix_lr_block if the block is compressed.
1661  *
1662  * @param[in] lowrank
1663  * The structure with the low-rank parameters.
1664  *
1665  *******************************************************************************
1666  *
1667  * @return TODO
1668  *
1669  *******************************************************************************/
1670 pastix_fixdbl_t
1672  const SolverCblk *cblk,
1673  SolverCblk *fcblk,
1674  pastix_int_t blok_mk,
1675  pastix_int_t blok_nk,
1676  pastix_int_t blok_mn,
1677  const void *A,
1678  const void *B,
1679  void *C,
1680  const pastix_lr_t *lowrank )
1681 {
1682  if ( fcblk->cblktype & CBLK_COMPRESSED ) {
1683  if ( cblk->cblktype & CBLK_COMPRESSED ) {
1684  return core_zgemmsp_block_lrlr( transB,
1685  blok_mk, blok_nk, blok_mn,
1686  cblk, fcblk,
1687  A, B, C, lowrank );
1688  }
1689  else {
1690  return core_zgemmsp_block_frlr( transB,
1691  blok_mk, blok_nk, blok_mn,
1692  cblk, fcblk,
1693  A, B, C, lowrank );
1694  }
1695  }
1696  else {
1697  if ( cblk->cblktype & CBLK_COMPRESSED ) {
1698  assert(0);
1699  return 0.; /* Avoids compilation and coverity warning */
1700  }
1701  else {
1702  return core_zgemmsp_block_frfr( transB,
1703  blok_mk, blok_nk, blok_mn,
1704  cblk, fcblk,
1705  A, B, C );
1706  }
1707  }
1708 }
int core_zgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, pastix_complex64_t alpha, const pastix_complex64_t *A, pastix_int_t LDA, pastix_complex64_t beta, pastix_complex64_t *B, pastix_int_t LDB)
Add two matrices together.
Definition: core_zgeadd.c:78
static pastix_fixdbl_t core_zgemmsp_lrfr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_complex64_t *C, pastix_complex64_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static void core_zgemmsp_1d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex64_t *A, const pastix_complex64_t *B, pastix_complex64_t *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
Definition: core_zgemmsp.c:262
static pastix_fixdbl_t core_zgemmsp_block_frfr(pastix_trans_t trans, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_complex64_t *A, const pastix_complex64_t *B, pastix_complex64_t *C)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
Definition: core_zgemmsp.c:543
static pastix_fixdbl_t core_zgemmsp_lr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, pastix_complex64_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static void core_zgemmsp_2d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex64_t *A, const pastix_complex64_t *B, pastix_complex64_t *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
Definition: core_zgemmsp.c:402
static pastix_fixdbl_t core_zgemmsp_block_lrlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
Definition: core_zgemmsp.c:897
static pastix_fixdbl_t core_zgemmsp_block_frlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_complex64_t *A, const pastix_complex64_t *B, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
Definition: core_zgemmsp.c:709
static pastix_fixdbl_t core_zgemmsp_fulllr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex64_t *A, const pastix_complex64_t *B, pastix_lrblock_t *lrC, pastix_complex64_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static void core_zgemmsp_1d1d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex64_t *A, const pastix_complex64_t *B, pastix_complex64_t *C, pastix_complex64_t *work)
Compute the updates that are generated by the transposition of one single off-diagonal block.
Definition: core_zgemmsp.c:104
pastix_fixdbl_t cpucblk_zgemmsp(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const void *A, const void *B, void *C, pastix_complex64_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
pastix_fixdbl_t cpublok_zgemmsp(pastix_trans_t transB, const SolverCblk *cblk, SolverCblk *fcblk, pastix_int_t blok_mk, pastix_int_t blok_nk, pastix_int_t blok_mn, const void *A, const void *B, void *C, const pastix_lr_t *lowrank)
Compute the CPU gemm associated to a couple of off-diagonal blocks.
const pastix_lrblock_t * A
pastix_complex64_t alpha
pastix_trans_t transB
pastix_atomic_lock_t * lock
const pastix_lrblock_t * B
const pastix_lr_t * lowrank
pastix_int_t M
pastix_int_t offy
pastix_int_t lwused
pastix_trans_t transA
pastix_int_t Cm
pastix_int_t lwork
pastix_int_t N
pastix_lrblock_t * C
pastix_complex64_t beta
pastix_int_t K
pastix_int_t offx
pastix_complex64_t * work
pastix_int_t Cn
pastix_fixdbl_t core_zlrmm(core_zlrmm_t *params)
Compute the matrix matrix product when involved matrices are stored in a low-rank structure.
Definition: core_zlrmm.c:278
Structure to store all the parameters of the core_zlrmm family functions.
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
enum pastix_trans_e pastix_trans_t
Transpostion.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
@ PastixUCoef
Definition: api.h:479
@ PastixNoTrans
Definition: api.h:447
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition: solver.h:389
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition: solver.h:323
pastix_int_t fcblknm
Definition: solver.h:140
static int is_block_inside_fblock(const SolverBlok *blok, const SolverBlok *fblok)
Check if a block is included inside another one.
Definition: solver.h:498
pastix_int_t frownum
Definition: solver.h:142
pastix_atomic_lock_t lock
Definition: solver.h:157
pastix_int_t coefind
Definition: solver.h:144
SolverBlok * fblokptr
Definition: solver.h:163
int8_t inlast
Definition: solver.h:146
pastix_int_t stride
Definition: solver.h:164
int8_t cblktype
Definition: solver.h:159
pastix_int_t fcolnum
Definition: solver.h:161
Solver block structure.
Definition: solver.h:137
Solver column block structure.
Definition: solver.h:156