PaStiX Handbook  6.4.0
core_sgemmsp.c
Go to the documentation of this file.
1 /**
2  *
3  * @file core_sgemmsp.c
4  *
5  * PaStiX kernel routines operating on the solver structure.
6  *
7  * @copyright 2011-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Mathieu Faverge
12  * @author Pierre Ramet
13  * @author Xavier Lacoste
14  * @author Gregoire Pichon
15  * @author Nolan Bredel
16  * @author Alycia Lisito
17  * @author Tom Moenne-Loccoz
18  * @date 2024-07-05
19  * @generated from /builds/solverstack/pastix/kernels/core_zgemmsp.c, normal z -> s, Fri Jul 12 15:09:46 2024
20  *
21  **/
22 #include "common.h"
23 #include "cblas.h"
24 #include "blend/solver.h"
25 #include "kernels_trace.h"
26 #include "pastix_scores.h"
27 #include "pastix_slrcores.h"
28 
29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
30 static float msone = -1.0;
31 static float sone = 1.0;
32 static float szero = 0.0;
33 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
34 
35 /**
36  *******************************************************************************
37  *
38  * @ingroup kernel_fact_null
39  *
40  * @brief Compute the updates that are generated by the transposition of one
41  * single off-diagonal block.
42  *
43  * Both cblk involved in the computation are stored with the 1D storage: Column
44  * Major Layout with blocks interleaved.
45  *
46  * All the off-diagonal block below block are multiplied by the selected block
47  * and added to the facing cblk.
48  *
49  *******************************************************************************
50  *
51  * @param[in] sideA
52  * Specify if A and C belong to the lower part, or to the upper part.
53  * If sideA == PastixLCoef, the contribution of:
54  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
55  * C, otherwise the contribution:
56  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
57  * to C.
58  * The pointer to the data structure that describes the panel from
59  * which we compute the contributions. Next column blok must be
60  * accessible through cblk[1].
61  *
62  * @param[in] trans
63  * Specify the transposition used for the B matrix. It has to be either
64  * PastixTrans or PastixTrans.
65  *
66  * @param[in] cblk
67  * The cblk structure to which block belongs to. The A and B pointers
68  * must be the coeftab of this column block.
69  * Next column blok must be accessible through cblk[1].
70  *
71  * @param[in] blok
72  * The block from which we compute the contributions.
73  *
74  * @param[inout] fcblk
75  * The pointer to the data structure that describes the panel on which
76  * we compute the contributions. The C pointer must be one of the
77  * coeftab from this fcblk. Next column blok must be accessible through
78  * fcblk[1].
79  *
80  * @param[in] A
81  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
82  * coefficients of the panel when the Lower part is computed,
83  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
84  *
85  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
86  * the coefficients of the panel, if Symmetric/Symmetric cases or if
87  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
88  * cblk.stride -by- cblk.width
89  *
90  * @param[inout] C
91  * The pointer to the fcblk.lcoeftab if the lower part is computed,
92  * fcblk.ucoeftab otherwise.
93  *
94  * @param[in] work
95  * Temporary memory buffer that is at least equal to the height of the
96  * block B by the sum of the height of all the blocks below the block
97  * B.
98  *
99  *******************************************************************************
100  *
101  * @sa core_sgemmsp_1d2d
102  * @sa core_sgemmsp_2d2d
103  *
104  *******************************************************************************/
105 static inline void
107  pastix_trans_t trans,
108  const SolverCblk *cblk,
109  const SolverBlok *blok,
110  SolverCblk *fcblk,
111  const float *A,
112  const float *B,
113  float *C,
114  float *work )
115 {
116  const SolverBlok *iterblok;
117  const SolverBlok *fblok;
118  const SolverBlok *lblok;
119 
120  float *tmpC;
121  float *wtmp;
122  pastix_int_t stride, stridef, indblok;
123  pastix_int_t M, N, K, m;
124  int shift;
125 
126  /* Both cblk and fcblk are stored in 1D */
127  assert(!(cblk->cblktype & CBLK_LAYOUT_2D));
128  assert(!(fcblk->cblktype & CBLK_LAYOUT_2D));
129 
130  assert( work != NULL );
131 
132  shift = (sideA == PastixUCoef) ? 1 : 0;
133 
134  stride = cblk->stride;
135  stridef = fcblk->stride;
136  K = cblk_colnbr( cblk );
137 
138  /* First blok */
139  indblok = blok->coefind;
140 
141  N = blok_rownbr( blok );
142  M = stride - indblok - (shift * N);
143 
144  /* Matrix A = Aik */
145  A = A + indblok + (shift * N);
146  B = B + indblok;
147 
148  /*
149  * Compute update A * B'
150  */
151  wtmp = work;
152  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
153  cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
154  M, N, K,
155  (sone), A, stride,
156  B, stride,
157  (szero), wtmp, M );
158  kernel_trace_stop_lvl2( FLOPS_SGEMM( M, N, K ) );
159 
160  /*
161  * Add contribution to C in fcblk
162  */
163 
164  /* Get the first block of the distant panel */
165  fblok = fcblk->fblokptr;
166 
167  /* Move the pointer to the top of the right column */
168  C = C + (blok->frownum - fcblk->fcolnum) * stridef;
169 
170  lblok = cblk[1].fblokptr;
171 
172  /* for all following blocks in block column */
173  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
174 
175  /* Find facing blok */
176  while (!is_block_inside_fblock( iterblok, fblok ))
177  {
178  fblok++;
179  assert( fblok < fcblk[1].fblokptr );
180  }
181 
182  tmpC = C + fblok->coefind + iterblok->frownum - fblok->frownum;
183  m = blok_rownbr( iterblok );
184 
185  pastix_cblk_lock( fcblk );
186  core_sgeadd( PastixNoTrans, m, N,
187  -1.0, wtmp, M,
188  1.0, tmpC, stridef );
189  pastix_cblk_unlock( fcblk );
190 
191  /* Displacement to next block */
192  wtmp += m;
193  }
194 }
195 
196 /**
197  *******************************************************************************
198  *
199  * @ingroup kernel_fact_null
200  *
201  * @brief Compute the updates that are generated by the transposition of one
202  * single off-diagonal block.
203  *
204  * The cblk involved in the matrices A and B are stored with the 1D storage:
205  * Column Major Layout with blocks interleaved. The facing cblk of the atrix C,
206  * is stored with the 2D storage where each block is stored continuously one
207  * after another. (Similar to dense tile storage with variant tile size)
208  *
209  * All the off-diagonal block below block are multiplied by the selected block
210  * and added to the facing cblk.
211  *
212  *******************************************************************************
213  *
214  * @param[in] sideA
215  * Specify if A and C belong to the lower part, or to the upper part.
216  * If sideA == PastixLCoef, the contribution of:
217  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
218  * C, otherwise the contribution:
219  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
220  * to C.
221  * The pointer to the data structure that describes the panel from
222  * which we compute the contributions. Next column blok must be
223  * accessible through cblk[1].
224  *
225  * @param[in] trans
226  * Specify the transposition used for the B matrix. It has to be either
227  * PastixTrans or PastixTrans.
228  *
229  * @param[in] cblk
230  * The cblk structure to which block belongs to. The A and B pointers
231  * must be the coeftab of this column block.
232  * Next column blok must be accessible through cblk[1].
233  *
234  * @param[in] blok
235  * The block from which we compute the contributions.
236  *
237  * @param[inout] fcblk
238  * The pointer to the data structure that describes the panel on which
239  * we compute the contributions. The C pointer must be one of the
240  * coeftab from this fcblk. Next column blok must be accessible through
241  * fcblk[1].
242  *
243  * @param[in] A
244  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
245  * coefficients of the panel when the Lower part is computed,
246  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
247  *
248  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
249  * the coefficients of the panel, if Symmetric/Symmetric cases or if
250  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
251  * cblk.stride -by- cblk.width
252  *
253  * @param[inout] C
254  * The pointer to the fcblk.lcoeftab if the lower part is computed,
255  * fcblk.ucoeftab otherwise.
256  *
257  *******************************************************************************
258  *
259  * @sa core_sgemmsp_1d1d
260  * @sa core_sgemmsp_2d2d
261  *
262  *******************************************************************************/
263 static inline void
265  pastix_trans_t trans,
266  const SolverCblk *cblk,
267  const SolverBlok *blok,
268  SolverCblk *fcblk,
269  const float *A,
270  const float *B,
271  float *C )
272 {
273  const SolverBlok *iterblok;
274  const SolverBlok *fblok;
275  const SolverBlok *lblok;
276  const float *blokA;
277  const float *blokB;
278  float *blokC;
279 
280  pastix_int_t stride, stridef;
281  pastix_int_t M, N, K;
282  int shift;
283 
284  /* cblk is stored in 1D and fcblk in 2D */
285  assert(!(cblk->cblktype & CBLK_LAYOUT_2D));
286  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
287 
288  shift = (sideA == PastixUCoef) ? 1 : 0;
289  stride = cblk->stride;
290 
291  /* Get the B block and its dimensions */
292  blokB = B + blok->coefind;
293 
294  stride = cblk->stride;
295  K = cblk_colnbr( cblk );
296  N = blok_rownbr( blok );
297 
298  /**
299  * Add contribution to C in fcblk:
300  * Get the first facing block of the distant panel, and the last block of
301  * the current cblk
302  */
303  fblok = fcblk->fblokptr;
304  lblok = cblk[1].fblokptr;
305 
306  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
307 
308  /* Find facing blok */
309  while (!is_block_inside_fblock( iterblok, fblok ))
310  {
311  fblok++;
312  assert( fblok < fcblk[1].fblokptr );
313  }
314 
315  stridef = blok_rownbr(fblok);
316 
317  /* Get the A block and its dimensions */
318  blokA = A + iterblok->coefind;
319  M = blok_rownbr( iterblok );
320 
321  blokC = C + fblok->coefind
322  + iterblok->frownum - fblok->frownum
323  + (blok->frownum - fcblk->fcolnum) * stridef;
324 
325  pastix_cblk_lock( fcblk );
326  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
327  cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
328  M, N, K,
329  (msone), blokA, stride,
330  blokB, stride,
331  (sone), blokC, stridef );
332  kernel_trace_stop_lvl2( FLOPS_SGEMM( M, N, K ) );
333  pastix_cblk_unlock( fcblk );
334  }
335 }
336 
337 /**
338  *******************************************************************************
339  *
340  * @ingroup kernel_fact_null
341  *
342  * @brief Compute the updates that are generated by the transposition of one
343  * single off-diagonal block.
344  *
345  * Both cblk involved in the matrices A, B and C are stored with the 2D storage
346  * where each block is stored continuously one after another. (Similar to dense
347  * tile storage with variant tile size)
348  *
349  * All the off-diagonal block below block are multiplied by the selected block
350  * and added to the facing cblk.
351  *
352  *******************************************************************************
353  *
354  * @param[in] sideA
355  * Specify if A and C belong to the lower part, or to the upper part.
356  * If sideA == PastixLCoef, the contribution of:
357  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
358  * C, otherwise the contribution:
359  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
360  * to C.
361  * The pointer to the data structure that describes the panel from
362  * which we compute the contributions. Next column blok must be
363  * accessible through cblk[1].
364  *
365  * @param[in] trans
366  * Specify the transposition used for the B matrix. It has to be either
367  * PastixTrans or PastixTrans.
368  *
369  * @param[in] cblk
370  * The cblk structure to which block belongs to. The A and B pointers
371  * must be the coeftab of this column block.
372  * Next column blok must be accessible through cblk[1].
373  *
374  * @param[in] blok
375  * The block from which we compute the contributions.
376  *
377  * @param[inout] fcblk
378  * The pointer to the data structure that describes the panel on which
379  * we compute the contributions. The C pointer must be one of the
380  * coeftab from this fcblk. Next column blok must be accessible through
381  * fcblk[1].
382  *
383  * @param[in] A
384  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
385  * coefficients of the panel when the Lower part is computed,
386  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
387  *
388  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
389  * the coefficients of the panel, if Symmetric/Symmetric cases or if
390  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
391  * cblk.stride -by- cblk.width
392  *
393  * @param[inout] C
394  * The pointer to the fcblk.lcoeftab if the lower part is computed,
395  * fcblk.ucoeftab otherwise.
396  *
397  *******************************************************************************
398  *
399  * @sa core_sgemmsp_1d1d
400  * @sa core_sgemmsp_1d2d
401  *
402  *******************************************************************************/
403 static inline void
405  pastix_trans_t trans,
406  const SolverCblk *cblk,
407  const SolverBlok *blok,
408  SolverCblk *fcblk,
409  const float *A,
410  const float *B,
411  float *C )
412 {
413  const SolverBlok *iterblok;
414  const SolverBlok *fblok;
415  const SolverBlok *lblok;
416  const float *blokA;
417  const float *blokB;
418  float *blokC;
419 
420  pastix_int_t M, N, K, lda, ldb, ldc;
421  int shift;
422 
423  /* Both cblk and fcblk must be stored in 2D */
424  assert( cblk->cblktype & CBLK_LAYOUT_2D );
425  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
426 
427  shift = (sideA == PastixUCoef) ? 1 : 0;
428 
429  /* Get the B block and its dimensions */
430  blokB = B + blok->coefind;
431 
432  ldb = blok_rownbr( blok );
433  K = cblk_colnbr( cblk );
434  N = blok_rownbr( blok );
435 
436  /*
437  * Add contribution to C in fcblk:
438  * Get the first facing block of the distant panel, and the last block of
439  * the current cblk
440  */
441  fblok = fcblk->fblokptr;
442  lblok = cblk[1].fblokptr;
443 
444  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
445 
446  /* Find facing blok */
447  while (!is_block_inside_fblock( iterblok, fblok ))
448  {
449  fblok++;
450  assert( fblok < fcblk[1].fblokptr );
451  }
452 
453  ldc = blok_rownbr(fblok);
454 
455  /* Get the A block and its dimensions */
456  blokA = A + iterblok->coefind;
457  M = blok_rownbr( iterblok );
458  lda = M;
459 
460  blokC = C + fblok->coefind
461  + iterblok->frownum - fblok->frownum
462  + (blok->frownum - fcblk->fcolnum) * ldc;
463 
464  pastix_cblk_lock( fcblk );
465  kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
466  cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
467  M, N, K,
468  (msone), blokA, lda,
469  blokB, ldb,
470  (sone), blokC, ldc );
471  kernel_trace_stop_lvl2( FLOPS_SGEMM( M, N, K ) );
472  pastix_cblk_unlock( fcblk );
473  }
474 }
475 
476 /**
477  *******************************************************************************
478  *
479  * @ingroup kernel_fact_null
480  *
481  * @brief Compute the updates that are generated by the transposition of all the
482  * blocks facing a common diagonal block, by another similar set of blocks.
483  *
484  * This is used to performed an update:
485  *
486  * C_mn = C_mn - A_mk * op(B_kn)
487  *
488  * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
489  * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
490  * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
491  * belongs to the set of block of the cblk n facing the diagonal block of the
492  * cblk m.
493  *
494  *******************************************************************************
495  *
496  * @param[in] trans
497  * Specify the transposition used for the B matrices. It has to be either
498  * PastixTrans or PastixTrans.
499  *
500  * @param[in] blok_mk
501  * Index of the first off-diagonal block in cblk, that is used for A.
502  *
503  * @param[in] blok_kn
504  * Index of the first off-diagonal block in cblk, that is used for B.
505  *
506  * @param[in] blok_mn
507  * Index of the first off-diagonal block in fcblk, that is used for C.
508  *
509  * @param[in] cblk
510  * The cblk structure to which block belongs to. The A and B pointers
511  * must be the coeftab of this column block.
512  * Next column blok must be accessible through cblk[1].
513  *
514  * @param[inout] fcblk
515  * The pointer to the data structure that describes the panel on which
516  * we compute the contributions. The C pointer must be one of the
517  * coeftab from this fcblk. Next column blok must be accessible through
518  * fcblk[1].
519  *
520  * @param[in] A
521  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
522  * coefficients of the panel when the Lower part is computed,
523  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
524  *
525  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
526  * the coefficients of the panel, if Symmetric/Symmetric cases or if
527  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
528  * cblk.stride -by- cblk.width
529  *
530  * @param[inout] C
531  * The pointer to the fcblk.lcoeftab if the lower part is computed,
532  * fcblk.ucoeftab otherwise.
533  *
534  *******************************************************************************
535  *
536  * @return The number of flops performed
537  *
538  *******************************************************************************
539  *
540  * @sa core_sgemmsp_block_frlr
541  * @sa core_sgemmsp_block_lrlr
542  *
543  *******************************************************************************/
544 static inline pastix_fixdbl_t
546  pastix_int_t blok_mk,
547  pastix_int_t blok_kn,
548  pastix_int_t blok_mn,
549  const SolverCblk *cblk,
550  SolverCblk *fcblk,
551  const float *A,
552  const float *B,
553  float *C )
554 {
555  const SolverBlok *blokA, *blokB, *blokC;
556  const SolverBlok *bA, *bB, *bC;
557  const SolverBlok *fblokK, *lblokK;
558  const SolverBlok *fblokN, *lblokN;
559 
560  const float *Aptr, *Bptr;
561  float *Cptr;
562  pastix_int_t M, N, K, lda, ldb, ldc, cblk_n, cblk_m;
563  pastix_int_t full_m, full_n;
564  size_t offsetA, offsetB, offsetC;
565 
566  pastix_fixdbl_t flops = 0.0;
568 
569  /* Both cblk and fcblk must be stored in 2D */
570  assert( cblk->cblktype & CBLK_LAYOUT_2D );
571  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
572 
573  /*
574  * Blocs on column K
575  */
576  fblokK = cblk[0].fblokptr;
577  lblokK = cblk[1].fblokptr;
578 
579  blokB = fblokK + blok_kn;
580  offsetB = blokB->coefind;
581  cblk_n = blokB->fcblknm;
582 
583  blokA = fblokK + blok_mk;
584  offsetA = blokA->coefind;
585  cblk_m = blokA->fcblknm;
586 
587  /*
588  * Blocs on column N
589  */
590  fblokN = fcblk[0].fblokptr;
591  lblokN = fcblk[1].fblokptr;
592 
593  blokC = fblokN + blok_mn;
594  offsetC = blokC->coefind;
595  assert( blokC->lcblknm == cblk_n );
596  assert( blokC->fcblknm == cblk_m );
597 
598  K = cblk_colnbr( cblk );
599  full_m = 0;
600 
601  bC = blokC;
602  for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++) {
603  M = blok_rownbr(bA);
604  Aptr = A + bA->coefind - offsetA;
605  lda = M;
606  full_m += M;
607 
608  /* Find facing C blok */
609  while (!is_block_inside_fblock( bA, bC )) {
610  bC++;
611  assert( bC < lblokN );
612  }
613 
614  Cptr = C + bC->coefind - offsetC;
615  ldc = blok_rownbr(bC);
616 
617  full_n = 0;
618  for (bB = blokB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++) {
619  N = blok_rownbr( bB );
620  full_n += N;
621  Bptr = B + bB->coefind - offsetB;
622  ldb = N;
623 
624  cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
625  M, N, K,
626  (msone), Aptr, lda,
627  Bptr, ldb,
628  (sone), Cptr + (bA->frownum - bC->frownum)
629  + (bB->frownum - fcblk->fcolnum) * ldc , ldc );
630 
631  flops += FLOPS_SGEMM( M, N, K );
632  }
633  }
634 
636  full_m, full_n, K,
637  flops, time );
638 
639  (void)lblokN;
640  return flops;
641 }
642 
643 /**
644  *******************************************************************************
645  *
646  * @ingroup kernel_fact_null
647  *
648  * @brief Compute the updates that are generated by the transposition of all the
649  * blocks facing a common diagonal block, by another similar set of blocks.
650  *
651  * This is used to perform an update:
652  *
653  * C_mn = C_mn - A_mk * op(B_kn)
654  *
655  * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
656  * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
657  * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
658  * belongs to the set of block of the cblk n facing the diagonal block of the
659  * cblk m.
660  *
661  * In this routine, all the matrices are low-rank
662  *
663  *******************************************************************************
664  *
665  * @param[in] transB
666  * Specify the transposition used for the B matrices. It has to be either
667  * PastixTrans or PastixTrans.
668  *
669  * @param[in] blok_mk
670  * Index of the first off-diagonal block in cblk, that is used for A.
671  *
672  * @param[in] blok_kn
673  * Index of the first off-diagonal block in cblk, that is used for B.
674  *
675  * @param[in] blok_mn
676  * Index of the first off-diagonal block in fcblk, that is used for C.
677  *
678  * @param[in] cblk
679  * The cblk structure to which block belongs to. The A and B pointers
680  * must be the coeftab of this column block.
681  * Next column blok must be accessible through cblk[1].
682  *
683  * @param[inout] fcblk
684  * The pointer to the data structure that describes the panel on which
685  * we compute the contributions. The C pointer must be one of the
686  * coeftab from this fcblk. Next column blok must be accessible through
687  * fcblk[1].
688  *
689  * @param[in] A
690  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
691  * coefficients of the panel when the Lower part is computed,
692  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
693  *
694  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
695  * the coefficients of the panel, if Symmetric/Symmetric cases or if
696  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
697  * cblk.stride -by- cblk.width
698  *
699  * @param[in] lrC
700  * Pointer to the low-rank representation of the block C.
701  * Must be followed by the low-rank representation of the following blocks.
702  *
703  * @param[in] lowrank
704  * The structure with low-rank parameters.
705  *
706  *******************************************************************************
707  *
708  * @return The number of flops performed
709  *
710  *******************************************************************************/
711 static inline pastix_fixdbl_t
713  pastix_int_t blok_mk,
714  pastix_int_t blok_kn,
715  pastix_int_t blok_mn,
716  const SolverCblk *cblk,
717  SolverCblk *fcblk,
718  const float *A,
719  const float *B,
720  pastix_lrblock_t *lrC,
721  const pastix_lr_t *lowrank )
722 {
723  const SolverBlok *blokA, *blokB, *blokC;
724  const SolverBlok *bA, *bB, *bC;
725  const SolverBlok *fblokK, *lblokK;
726  const SolverBlok *fblokN, *lblokN;
727 
728  pastix_lrblock_t lrA, lrB;
729  core_slrmm_t params;
730 
731  pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
732  size_t offsetA, offsetB;
733 
734  pastix_fixdbl_t flops = 0.0;
736 
737  /* Provide an internal lock as this function is already protected by the runtime dependency system */
738  pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
739 
740  /* Both cblk and fcblk must be stored in 2D */
741  assert( cblk->cblktype & CBLK_LAYOUT_2D );
742  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
743 
744  /* Both cblk and fcblk must be compressed */
745  assert(!(cblk->cblktype & CBLK_COMPRESSED) );
746  assert( fcblk->cblktype & CBLK_COMPRESSED );
747 
748  /*
749  * Blocs on column K
750  */
751  fblokK = cblk[0].fblokptr;
752  lblokK = cblk[1].fblokptr;
753 
754  blokB = fblokK + blok_kn;
755  offsetB = blokB->coefind;
756  cblk_n = blokB->fcblknm;
757  lrB.rk = -1;
758 
759  blokA = fblokK + blok_mk;
760  offsetA = blokA->coefind;
761  cblk_m = blokA->fcblknm;
762  lrA.rk = -1;
763 
764  /*
765  * Blocs on column N
766  */
767  fblokN = fcblk[0].fblokptr;
768  lblokN = fcblk[1].fblokptr;
769 
770  blokC = fblokN + blok_mn;
771  assert( blokC->lcblknm == cblk_n );
772  assert( blokC->fcblknm == cblk_m );
773 
774  full_m = 0;
775  K = cblk_colnbr( cblk );
776 
777  params.lowrank = lowrank;
778  params.transA = PastixNoTrans;
779  params.transB = transB;
780  params.K = K;
781  params.Cn = cblk_colnbr( fcblk );
782  params.alpha = -1.0;
783  params.beta = 1.0;
784  params.work = NULL;
785  params.lwork = -1;
786  params.lock = &(lock);
787 
788  bC = blokC;
789  for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++) {
790  M = blok_rownbr(bA);
791  params.M = M;
792  full_m += M;
793 
794  lrA.rkmax = M;
795  lrA.u = (float*)A + bA->coefind - offsetA;
796  lrA.v = NULL;
797 
798  /* Find facing C blok */
799  while (!is_block_inside_fblock( bA, bC )) {
800  bC++;
801  lrC++;
802  assert( bC < lblokN );
803  }
804 
805  params.A = &lrA;
806  params.C = lrC;
807  params.Cm = blok_rownbr( bC );
808 
809  full_n = 0;
810  for (bB = blokB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++) {
811 
812  params.N = blok_rownbr( bB );
813  full_n += params.N;
814  params.offx = bA->frownum - bC->frownum;
815  params.offy = bB->frownum - fcblk->fcolnum;
816 
817  lrB.rkmax = M;
818  lrB.u = (float*)B + bB->coefind - offsetB;
819  lrB.v = NULL;
820 
821  params.B = &lrB;
822 
823  flops += core_slrmm( &params );
824  }
825  }
826 
828  full_m, full_n, K,
829  flops, time );
830  (void)lblokN;
831  return flops;
832 }
833 
834 /**
835  *******************************************************************************
836  *
837  * @ingroup kernel_fact_null
838  *
839  * @brief Compute the updates that are generated by the transposition of all the
840  * blocks facing a common diagonal block, by another similar set of blocks.
841  *
842  * This is used to perform an update:
843  *
844  * C_mn = C_mn - A_mk * op(B_kn)
845  *
846  * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
847  * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
848  * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
849  * belongs to the set of block of the cblk n facing the diagonal block of the
850  * cblk m.
851  *
852  * In this routine, all the matrices are low-rank
853  *
854  *******************************************************************************
855  *
856  * @param[in] transB
857  * Specify the transposition used for the B matrices. It has to be either
858  * PastixTrans or PastixTrans.
859  *
860  * @param[in] blok_mk
861  * Index of the first off-diagonal block in cblk, that is used for A.
862  *
863  * @param[in] blok_kn
864  * Index of the first off-diagonal block in cblk, that is used for B.
865  *
866  * @param[in] blok_mn
867  * Index of the first off-diagonal block in fcblk, that is used for C.
868  *
869  * @param[in] cblk
870  * The cblk structure to which block belongs to. The A and B pointers
871  * must be the coeftab of this column block.
872  * Next column blok must be accessible through cblk[1].
873  *
874  * @param[inout] fcblk
875  * The pointer to the data structure that describes the panel on which
876  * we compute the contributions. The C pointer must be one of the
877  * coeftab from this fcblk. Next column blok must be accessible through
878  * fcblk[1].
879  *
880  * @param[inout] lrA
881  * Pointer to the low-rank representation of the block A.
882  * Must be followed by the low-rank representation of the following blocks.
883  *
884  * @param[inout] lrB
885  * Pointer to the low-rank representation of the block B.
886  * Must be followed by the low-rank representation of the following blocks.
887  *
888  * @param[inout] lrC
889  * Pointer to the low-rank representation of the block C.
890  * Must be followed by the low-rank representation of the following blocks.
891  *
892  * @param[in] lowrank
893  * The structure with low-rank parameters.
894  *
895  *******************************************************************************
896  *
897  * @return The number of flops performed
898  *
899  *******************************************************************************/
900 static inline pastix_fixdbl_t
902  pastix_int_t blok_mk,
903  pastix_int_t blok_kn,
904  pastix_int_t blok_mn,
905  const SolverCblk *cblk,
906  SolverCblk *fcblk,
907  const pastix_lrblock_t *lrA,
908  const pastix_lrblock_t *lrB,
909  pastix_lrblock_t *lrC,
910  const pastix_lr_t *lowrank )
911 {
912  const pastix_lrblock_t *blrB;
913  const SolverBlok *blokA, *blokB, *blokC;
914  const SolverBlok *bA, *bB, *bC;
915  const SolverBlok *fblokK, *lblokK;
916  const SolverBlok *fblokN, *lblokN;
917 
918  core_slrmm_t params;
919 
920  pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
921 
922  pastix_fixdbl_t flops = 0.0;
924 
925  /* Provide an internal lock as this function is already protected by the runtime dependency system */
926  pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
927 
928  /* Both cblk and fcblk must be stored in 2D */
929  assert( cblk->cblktype & CBLK_LAYOUT_2D );
930  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
931 
932  /* Both cblk and fcblk must be compressed */
933  assert( cblk->cblktype & CBLK_COMPRESSED );
934  assert( fcblk->cblktype & CBLK_COMPRESSED );
935 
936  /*
937  * Blocs on column K
938  */
939  fblokK = cblk[0].fblokptr;
940  lblokK = cblk[1].fblokptr;
941 
942  blokB = fblokK + blok_kn;
943  cblk_n = blokB->fcblknm;
944 
945  blokA = fblokK + blok_mk;
946  cblk_m = blokA->fcblknm;
947 
948  /*
949  * Blocs on column N
950  */
951  fblokN = fcblk[0].fblokptr;
952  lblokN = fcblk[1].fblokptr;
953 
954  blokC = fblokN + blok_mn;
955  assert( blokC->lcblknm == cblk_n );
956  assert( blokC->fcblknm == cblk_m );
957 
958  full_m = 0;
959  K = cblk_colnbr( cblk );
960 
961  params.lowrank = lowrank;
962  params.transA = PastixNoTrans;
963  params.transB = transB;
964  params.K = K;
965  params.Cn = cblk_colnbr( fcblk );
966  params.alpha = -1.0;
967  params.beta = 1.0;
968  params.work = NULL;
969  params.lwork = -1;
970  params.lock = &(lock);
971 
972  bC = blokC;
973  for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++, lrA++) {
974  M = blok_rownbr(bA);
975  params.M = M;
976  full_m += M;
977 
978  /* Find facing C blok */
979  while (!is_block_inside_fblock( bA, bC )) {
980  bC++;
981  lrC++;
982  assert( bC < lblokN );
983  }
984 
985  params.A = lrA;
986  params.C = lrC;
987  params.Cm = blok_rownbr( bC );
988 
989  full_n = 0;
990  for (bB = blokB, blrB = lrB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++, blrB++) {
991 
992  params.N = blok_rownbr( bB );
993  full_n += params.N;
994 
995  params.offx = bA->frownum - bC->frownum;
996  params.offy = bB->frownum - fcblk->fcolnum;
997  params.B = blrB;
998 
999  flops += core_slrmm( &params );
1000  }
1001  }
1002 
1004  full_m, full_n, K,
1005  flops, time );
1006  (void)lblokN;
1007  return flops;
1008 }
1009 
1010 /**
1011  *******************************************************************************
1012  *
1013  * @ingroup kernel_fact_null
1014  *
1015  * @brief Compute the updates associated to one off-diagonal block.
1016  *
1017  *******************************************************************************
1018  *
1019  * @param[in] sideA
1020  * Specify if A and C belong to the lower part, or to the upper part.
1021  * If sideA == PastixLCoef, the contribution of:
1022  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1023  * C, otherwise the contribution:
1024  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1025  * to C.
1026  *
1027  * @param[in] trans
1028  * Specify the transposition used for the B matrix. It has to be either
1029  * PastixTrans or PastixTrans.
1030  *
1031  * @param[in] cblk
1032  * The cblk structure to which block belongs to. The A and B pointers
1033  * must be the coeftab of this column block.
1034  * Next column blok must be accessible through cblk[1].
1035  *
1036  * @param[in] blok
1037  * The block from which we compute the contributions.
1038  *
1039  * @param[inout] fcblk
1040  * The pointer to the data structure that describes the panel on which
1041  * we compute the contributions. The C pointer must be one of the
1042  * coeftab from this fcblk. Next column blok must be accessible through
1043  * fcblk[1].
1044  *
1045  * @param[in] A
1046  * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
1047  * coefficients of the panel when the Lower part is computed,
1048  * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
1049  *
1050  * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
1051  * the coefficients of the panel, if Symmetric/Symmetric cases or if
1052  * upper part is computed; cblk.ucoeftab otherwise. Must be of size
1053  * cblk.stride -by- cblk.width
1054  *
1055  * @param[in] lrC
1056  * Pointer to the low-rank representation of the block C.
1057  * Must be followed by the low-rank representation of the following blocks.
1058  *
1059  * @param[in] work
1060  * Temporary memory buffer.
1061  *
1062  * @param[in] lwork
1063  * TODO
1064  *
1065  * @param[in] lowrank
1066  * The structure with low-rank parameters.
1067  *
1068  *******************************************************************************
1069  *
1070  * @return The number of flops performed
1071  *
1072  *******************************************************************************/
1073 static inline pastix_fixdbl_t
1075  pastix_trans_t trans,
1076  const SolverCblk *cblk,
1077  const SolverBlok *blok,
1078  SolverCblk *fcblk,
1079  const float *A,
1080  const float *B,
1081  pastix_lrblock_t *lrC,
1082  float *work,
1083  pastix_int_t lwork,
1084  const pastix_lr_t *lowrank )
1085 {
1086  const SolverBlok *iterblok;
1087  const SolverBlok *fblok;
1088  const SolverBlok *lblok;
1089  pastix_lrblock_t lrA, lrB;
1090  core_slrmm_t params;
1091 
1092  pastix_int_t stride, shift;
1093  pastix_int_t M, N;
1094 
1095  pastix_fixdbl_t flops = 0.0;
1096 
1097  /* Update from a dense block to a low rank block */
1098  assert(!(cblk->cblktype & CBLK_COMPRESSED));
1099  assert( fcblk->cblktype & CBLK_COMPRESSED );
1100  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1101 
1102  assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1103 
1104  shift = (sideA == PastixUCoef) ? 1 : 0;
1105  stride = cblk->stride;
1106 
1107  N = blok_rownbr( blok );
1108 
1109  /* Get the B block and its dimensions */
1110  lrB.rk = -1;
1111  lrB.rkmax = (cblk->cblktype & CBLK_LAYOUT_2D) ? N : stride;
1112  lrB.u = (float*)B + blok->coefind; /* lrB is const, we can cast the B pointer */
1113  lrB.v = NULL;
1114 
1115  /**
1116  * Add contribution to C in fcblk:
1117  * Get the first facing block of the distant panel, and the last block of
1118  * the current cblk
1119  */
1120  fblok = fcblk->fblokptr;
1121  lblok = cblk[1].fblokptr;
1122 
1123  params.lowrank = lowrank;
1124  params.transA = PastixNoTrans;
1125  params.transB = trans;
1126  params.N = blok_rownbr( blok );
1127  params.K = cblk_colnbr( cblk );
1128  params.Cn = cblk_colnbr( fcblk );
1129  params.alpha = -1.0;
1130  params.beta = 1.0;
1131  params.work = work;
1132  params.lwork = lwork;
1133  params.lock = &(fcblk->lock);
1134  params.B = &lrB;
1135 
1136  for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
1137 
1138  /* Find facing blok */
1139  while (!is_block_inside_fblock( iterblok, fblok ))
1140  {
1141  fblok++;
1142  lrC++;
1143  assert( fblok < fcblk[1].fblokptr );
1144  }
1145 
1146  /* Get the A block and its dimensions */
1147  M = blok_rownbr( iterblok );
1148  lrA.rk = -1;
1149  lrA.rkmax = (cblk->cblktype & CBLK_LAYOUT_2D) ? M : stride;
1150  lrA.u = (float*)A + iterblok->coefind; /* Same as for B */
1151  lrA.v = NULL;
1152 
1153  params.M = M;
1154  params.A = &lrA;
1155  params.C = lrC;
1156  params.Cm = blok_rownbr( fblok );
1157 
1158  params.offx = iterblok->frownum - fblok->frownum;
1159  params.offy = blok->frownum - fcblk->fcolnum;
1160 
1161  flops += core_slrmm( &params );
1162  }
1163  return flops;
1164 }
1165 
1166 /**
1167  *******************************************************************************
1168  *
1169  * @ingroup kernel_fact_null
1170  *
1171  * @brief Compute the updates associated to one off-diagonal block.
1172  *
1173  *******************************************************************************
1174  *
1175  * @param[in] sideA
1176  * Specify if A and C belong to the lower part, or to the upper part.
1177  * If sideA == PastixLCoef, the contribution of:
1178  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1179  * C, otherwise the contribution:
1180  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1181  * to C.
1182  *
1183  * @param[in] trans
1184  * Specify the transposition used for the B matrix. It has to be either
1185  * PastixTrans or PastixTrans.
1186  *
1187  * @param[in] cblk
1188  * The cblk structure to which block belongs to. The A and B pointers
1189  * must be the coeftab of this column block.
1190  * Next column blok must be accessible through cblk[1].
1191  *
1192  * @param[in] blok
1193  * The block from which we compute the contributions.
1194  *
1195  * @param[inout] fcblk
1196  * The pointer to the data structure that describes the panel on which
1197  * we compute the contributions. The C pointer must be one of the
1198  * coeftab from this fcblk. Next column blok must be accessible through
1199  * fcblk[1].
1200  *
1201  * @param[in] lrA
1202  * Pointer to the low-rank representation of the block A.
1203  * Must be followed by the low-rank representation of the following blocks.
1204  *
1205  * @param[in] lrB
1206  * Pointer to the low-rank representation of the block B.
1207  * Must be followed by the low-rank representation of the following blocks.
1208  *
1209  * @param[in] lrC
1210  * Pointer to the low-rank representation of the block B.
1211  * Must be followed by the low-rank representation of the following blocks.
1212  *
1213  * @param[in] work
1214  * Temporary memory buffer.
1215  *
1216  * @param[in] lwork
1217  * TODO
1218  *
1219  * @param[in] lowrank
1220  * The structure with low-rank parameters.
1221  *
1222  *******************************************************************************
1223  *
1224  * @return The number of flops performed
1225  *
1226  *******************************************************************************/
1227 static inline pastix_fixdbl_t
1229  pastix_trans_t trans,
1230  const SolverCblk *cblk,
1231  const SolverBlok *blok,
1232  SolverCblk *fcblk,
1233  const pastix_lrblock_t *lrA,
1234  const pastix_lrblock_t *lrB,
1235  pastix_lrblock_t *lrC,
1236  float *work,
1237  pastix_int_t lwork,
1238  const pastix_lr_t *lowrank )
1239 {
1240  const SolverBlok *iterblok;
1241  const SolverBlok *fblok;
1242  const SolverBlok *lblok;
1243 
1244  pastix_int_t N, K, shift;
1245  core_slrmm_t params;
1246 
1247  pastix_fixdbl_t flops = 0.0;
1248 
1249  /* Update from a low-rank cblk to a low-rank cblk */
1250  assert( cblk->cblktype & CBLK_COMPRESSED );
1251  assert( fcblk->cblktype & CBLK_COMPRESSED );
1252  assert( cblk->cblktype & CBLK_LAYOUT_2D );
1253  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1254 
1255  assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1256 
1257  shift = (sideA == PastixUCoef) ? 1 : 0;
1258 
1259  /* Get the B block and its dimensions */
1260  K = cblk_colnbr( cblk );
1261  N = blok_rownbr( blok );
1262 
1263  /*
1264  * Add contribution to C in fcblk:
1265  * Get the first facing block of the distant panel, and the last block of
1266  * the current cblk
1267  */
1268  fblok = fcblk->fblokptr;
1269  lblok = cblk[1].fblokptr;
1270 
1271  params.lowrank = lowrank;
1272  params.transA = PastixNoTrans;
1273  params.transB = trans;
1274  params.N = N;
1275  params.K = K;
1276  params.Cn = cblk_colnbr( fcblk );
1277  params.alpha = -1.0;
1278  params.beta = 1.0;
1279  params.work = work;
1280  params.lwork = lwork;
1281  params.lwused = 0;
1282  params.lock = &(fcblk->lock);
1283  params.B = lrB + (blok - cblk->fblokptr);
1284 
1285  /* for all following blocks in block column */
1286  lrA = lrA + (blok - cblk->fblokptr) + shift;
1287  for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1288 
1289  /* Find facing blok */
1290  while (!is_block_inside_fblock( iterblok, fblok ))
1291  {
1292  fblok++;
1293  lrC++;
1294  assert( fblok < fcblk[1].fblokptr );
1295  }
1296 
1297  params.M = blok_rownbr( iterblok );
1298  params.A = lrA;
1299  params.C = lrC;
1300  params.Cm = blok_rownbr( fblok );
1301 
1302  params.offx = iterblok->frownum - fblok->frownum;
1303  params.offy = blok->frownum - fcblk->fcolnum;
1304 
1305  flops += core_slrmm( &params );
1306  }
1307  return flops;
1308 }
1309 
1310 /**
1311  *******************************************************************************
1312  *
1313  * @ingroup kernel_fact_null
1314  *
1315  * @brief Compute the updates associated to one off-diagonal block.
1316  *
1317  *******************************************************************************
1318  *
1319  * @param[in] sideA
1320  * Specify if A and C belong to the lower part, or to the upper part.
1321  * If sideA == PastixLCoef, the contribution of:
1322  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1323  * C, otherwise the contribution:
1324  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1325  * to C.
1326  *
1327  *
1328  * @param[in] trans
1329  * Specify the transposition used for the B matrix. It has to be either
1330  * PastixTrans or PastixTrans.
1331  *
1332  * @param[in] cblk
1333  * The cblk structure to which block belongs to. The A and B pointers
1334  * must be the coeftab of this column block.
1335  * Next column blok must be accessible through cblk[1].
1336  *
1337  * @param[in] blok
1338  * The block from which we compute the contributions.
1339  *
1340  * @param[inout] fcblk
1341  * The pointer to the data structure that describes the panel on which
1342  * we compute the contributions. The C pointer must be one of the
1343  * coeftab from this fcblk. Next column blok must be accessible through
1344  * fcblk[1].
1345  *
1346  * @param[in] lrA
1347  * Pointer to the low-rank representation of the block A.
1348  * Must be followed by the low-rank representation of the following blocks.
1349  *
1350  * @param[in] lrB
1351  * Pointer to the low-rank representation of the block B.
1352  * Must be followed by the low-rank representation of the following blocks.
1353  *
1354  * @param[inout] C
1355  * The pointer to the fcblk.lcoeftab if the lower part is computed,
1356  * fcblk.ucoeftab otherwise.
1357  *
1358  * @param[in] work
1359  * Temporary memory buffer.
1360  *
1361  * @param[in] lwork
1362  * The length of work.
1363  * On entry, if trans = PastixTrans
1364  * lwork >= max(1, K*N). Otherwise lwork >= max(1, M*K).
1365  *
1366  * @param[in] lowrank
1367  * The structure with low-rank parameters.
1368  *
1369  *******************************************************************************
1370  *
1371  * @return The number of flops performed
1372  *
1373  *******************************************************************************/
1374 static inline pastix_fixdbl_t
1376  pastix_trans_t trans,
1377  const SolverCblk *cblk,
1378  const SolverBlok *blok,
1379  SolverCblk *fcblk,
1380  const pastix_lrblock_t *lrA,
1381  const pastix_lrblock_t *lrB,
1382  float *C,
1383  float *work,
1384  pastix_int_t lwork,
1385  const pastix_lr_t *lowrank )
1386 {
1387  const SolverBlok *iterblok;
1388  const SolverBlok *fblok;
1389  const SolverBlok *lblok;
1390 
1391  pastix_int_t N, K, shift;
1392  pastix_lrblock_t lrC;
1393  core_slrmm_t params;
1394 
1395  pastix_fixdbl_t flops = 0.0;
1396 
1397  /* Update from a low-rank cblk to a low-rank cblk */
1398  assert( cblk->cblktype & CBLK_COMPRESSED );
1399  assert( !(fcblk->cblktype & CBLK_COMPRESSED) );
1400  assert( cblk->cblktype & CBLK_LAYOUT_2D );
1401  assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1402 
1403  assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1404 
1405  shift = (sideA == PastixUCoef) ? 1 : 0;
1406 
1407  /* Get the B block and its dimensions */
1408  K = cblk_colnbr( cblk );
1409  N = blok_rownbr( blok );
1410 
1411  /*
1412  * Add contribution to C in fcblk:
1413  * Get the first facing block of the distant panel, and the last block of
1414  * the current cblk
1415  */
1416  fblok = fcblk->fblokptr;
1417  lblok = cblk[1].fblokptr;
1418 
1419  params.lowrank = lowrank;
1420  params.transA = PastixNoTrans;
1421  params.transB = trans;
1422  params.N = N;
1423  params.K = K;
1424  params.Cn = cblk_colnbr( fcblk );
1425  params.alpha = -1.0;
1426  params.beta = 1.0;
1427  params.work = work;
1428  params.lwork = lwork;
1429  params.lwused = 0;
1430  params.lock = &(fcblk->lock);
1431  params.B = lrB + (blok - cblk->fblokptr);
1432  params.C = &lrC;
1433 
1434  lrC.rk = -1;
1435  lrC.v = NULL;
1436 
1437  /* for all following blocks in block column */
1438  lrA = lrA + (blok - cblk->fblokptr) + shift;
1439  for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1440 
1441  /* Find facing blok */
1442  while (!is_block_inside_fblock( iterblok, fblok ))
1443  {
1444  fblok++;
1445  assert( fblok < fcblk[1].fblokptr );
1446  }
1447 
1448  params.M = blok_rownbr( iterblok );
1449  params.A = lrA;
1450  params.Cm = blok_rownbr( fblok );
1451  params.offx = iterblok->frownum - fblok->frownum;
1452  params.offy = blok->frownum - fcblk->fcolnum;
1453 
1454  lrC.rkmax = params.Cm;
1455  lrC.u = C + fblok->coefind;
1456 
1457  flops += core_slrmm( &params );
1458  }
1459  return flops;
1460 }
1461 
1462 /**
1463  *******************************************************************************
1464  *
1465  * @brief Compute the updates associated to one off-diagonal block.
1466  *
1467  *******************************************************************************
1468  *
1469  * @param[in] sideA
1470  * Specify if A and C belong to the lower part, or to the upper part.
1471  * If sideA == PastixLCoef, the contribution of:
1472  * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1473  * C, otherwise the contribution:
1474  * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1475  * to C.
1476  * The pointer to the data structure that describes the panel from
1477  * which we compute the contributions. Next column blok must be
1478  * accessible through cblk[1].
1479  *
1480  * @param[in] trans
1481  * Specify the transposition used for the B matrix. It has to be either
1482  * PastixTrans or PastixTrans.
1483  *
1484  * @param[in] cblk
1485  * The cblk structure to which block belongs to. The A and B pointers
1486  * must be the coeftab of this column block.
1487  * Next column blok must be accessible through cblk[1].
1488  *
1489  * @param[in] blok
1490  * The block from which we compute the contributions.
1491  *
1492  * @param[inout] fcblk
1493  * The pointer to the data structure that describes the panel on which
1494  * we compute the contributions. The C pointer must be one of the
1495  * coeftab from this fcblk. Next column blok must be accessible through
1496  * fcblk[1].
1497  *
1498  * @param[in] A
1499  * The pointer to the correct representation of A.
1500  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1501  * - pastix_lr_block if the block is compressed.
1502  *
1503  * @param[in] B
1504  * The pointer to the correct representation of B.
1505  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1506  * - pastix_lr_block if the block is compressed.
1507  *
1508  * @param[inout] C
1509  * The pointer to the correct representation of C.
1510  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1511  * - pastix_lr_block if the block is compressed.
1512  *
1513  * @param[in] work
1514  * Temporary memory buffer.
1515  *
1516  * @param[in] lwork
1517  * TODO
1518  *
1519  * @param[in] lowrank
1520  * The structure with low-rank parameters.
1521  *
1522  *******************************************************************************
1523  *
1524  * @return The number of static pivoting during factorization of the diagonal
1525  * block.
1526  *
1527  *******************************************************************************/
1530  pastix_trans_t trans,
1531  const SolverCblk *cblk,
1532  const SolverBlok *blok,
1533  SolverCblk *fcblk,
1534  const void *A,
1535  const void *B,
1536  void *C,
1537  float *work,
1538  pastix_int_t lwork,
1539  const pastix_lr_t *lowrank )
1540 {
1541  pastix_ktype_t ktype;
1542  pastix_fixdbl_t time, flops = 0.0;
1543  pastix_int_t m = cblk->stride;
1544  pastix_int_t n = blok_rownbr( blok );
1545  pastix_int_t k = cblk_colnbr( cblk );
1546 
1547  m -= (cblk->cblktype & CBLK_LAYOUT_2D) ? blok->coefind / k : blok->coefind;
1548  m -= (sideA == PastixUCoef) ? blok_rownbr( blok ) : 0;
1549 
1550  if ( fcblk->cblktype & CBLK_COMPRESSED ) {
1551  if ( cblk->cblktype & CBLK_COMPRESSED ) {
1552  ktype = PastixKernelGEMMCblkLRLR;
1553  time = kernel_trace_start( ktype );
1554 
1555  flops = core_sgemmsp_lr( sideA, trans,
1556  cblk, blok, fcblk,
1557  A, B, C, work, lwork,
1558  lowrank );
1559  }
1560  else {
1561  ktype = PastixKernelGEMMCblkFRLR;
1562  time = kernel_trace_start( ktype );
1563 
1564  flops = core_sgemmsp_fulllr( sideA, trans,
1565  cblk, blok, fcblk,
1566  A, B, C, work, lwork,
1567  lowrank );
1568  }
1569  }
1570  else if ( fcblk->cblktype & CBLK_LAYOUT_2D ) {
1571  if ( cblk->cblktype & CBLK_COMPRESSED) {
1572  ktype = PastixKernelGEMMCblk2d2d;
1573  time = kernel_trace_start( ktype );
1574  core_sgemmsp_lrfr( sideA, trans,
1575  cblk, blok, fcblk,
1576  A, B, C, work, lwork,
1577  lowrank );
1578  }
1579  else if ( cblk->cblktype & CBLK_LAYOUT_2D ) {
1580  ktype = PastixKernelGEMMCblk2d2d;
1581  time = kernel_trace_start( ktype );
1582 
1583  core_sgemmsp_2d2d( sideA, trans,
1584  cblk, blok, fcblk,
1585  A, B, C );
1586  }
1587  else {
1588  ktype = PastixKernelGEMMCblk1d2d;
1589  time = kernel_trace_start( ktype );
1590 
1591  core_sgemmsp_1d2d( sideA, trans,
1592  cblk, blok, fcblk,
1593  A, B, C );
1594  }
1595  flops = FLOPS_SGEMM( m, n, k );
1596  }
1597  else {
1598  assert( !(cblk->cblktype & CBLK_COMPRESSED) );
1599  ktype = PastixKernelGEMMCblk1d1d;
1600  time = kernel_trace_start( ktype );
1601 
1602  core_sgemmsp_1d1d( sideA, trans,
1603  cblk, blok, fcblk,
1604  A, B, C, work );
1605 
1606  flops = FLOPS_SGEMM( m, n, k );
1607  }
1608 
1609  kernel_trace_stop( blok->inlast, ktype, m, n, k, flops, time );
1610 
1611  return flops;
1612 }
1613 
1614 /**
1615  *******************************************************************************
1616  *
1617  * @brief Compute the CPU gemm associated to a couple of off-diagonal blocks.
1618  *
1619  * C_l = C_l - A_l * op(B_s), with B_s = B_l, or B_u
1620  * or
1621  * C_u = C_u - A_u * op(B_s), with B_s = B_l, or B_u
1622  *
1623  *******************************************************************************
1624  *
1625  * @param[in] transB
1626  * Specify wheter B should be used as PastixNoTrans, PastixTrans, or
1627  * PastixTrans in the computations.
1628  *
1629  * @param[in] cblk
1630  * The cblk structure to which block A and B belong to. The A and B
1631  * pointers must be one of the [lu]coeftab of this column block.
1632  * Next column blok must be accessible through cblk[1].
1633  *
1634  * @param[inout] fcblk
1635  * The pointer to the data structure that describes the panel on which
1636  * we compute the contributions. The C pointer must be one of the
1637  * [lu]coeftab from this fcblk.
1638  * Next column blok must be accessible through fcblk[1].
1639  *
1640  * @param[in] blok_mk
1641  * Specify the index of the A block in the cblk column. This index is
1642  * 0-based for the diagonal block.
1643  *
1644  * @param[in] blok_nk
1645  * Specify the index of the B block in the cblk column. This index is
1646  * 0-based for the diagonal block.
1647  *
1648  * @param[in] blok_mn
1649  * Specify the index of the C block in the fcblk column. This index is
1650  * 0-based for the diagonal block.
1651  *
1652  * @param[in] A
1653  * The pointer to the correct representation of A.
1654  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1655  * - pastix_lr_block if the block is compressed.
1656  *
1657  * @param[in] B
1658  * The pointer to the correct representation of B.
1659  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1660  * - pastix_lr_block if the block is compressed.
1661  *
1662  * @param[in] C
1663  * The pointer to the correct representation of C.
1664  * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1665  * - pastix_lr_block if the block is compressed.
1666  *
1667  * @param[in] lowrank
1668  * The structure with the low-rank parameters.
1669  *
1670  *******************************************************************************
1671  *
1672  * @return TODO
1673  *
1674  *******************************************************************************/
1677  const SolverCblk *cblk,
1678  SolverCblk *fcblk,
1679  pastix_int_t blok_mk,
1680  pastix_int_t blok_nk,
1681  pastix_int_t blok_mn,
1682  const void *A,
1683  const void *B,
1684  void *C,
1685  const pastix_lr_t *lowrank )
1686 {
1687  if ( fcblk->cblktype & CBLK_COMPRESSED ) {
1688  if ( cblk->cblktype & CBLK_COMPRESSED ) {
1689  return core_sgemmsp_block_lrlr( transB,
1690  blok_mk, blok_nk, blok_mn,
1691  cblk, fcblk,
1692  A, B, C, lowrank );
1693  }
1694  else {
1695  return core_sgemmsp_block_frlr( transB,
1696  blok_mk, blok_nk, blok_mn,
1697  cblk, fcblk,
1698  A, B, C, lowrank );
1699  }
1700  }
1701  else {
1702  if ( cblk->cblktype & CBLK_COMPRESSED ) {
1703  assert(0);
1704  return 0.; /* Avoids compilation and coverity warning */
1705  }
1706  else {
1707  return core_sgemmsp_block_frfr( transB,
1708  blok_mk, blok_nk, blok_mn,
1709  cblk, fcblk,
1710  A, B, C );
1711  }
1712  }
1713 }
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
enum pastix_ktype_e pastix_ktype_t
List of the Level 1 events that may be traced in PaStiX.
static void kernel_trace_stop(int8_t inlast, pastix_ktype_t ktype, int m, int n, int k, double flops, double starttime)
Stop the trace of a single kernel.
static double kernel_trace_start(pastix_ktype_t ktype)
Start the trace of a single kernel.
Definition: kernels_trace.h:87
@ PastixKernelGEMMCblkFRLR
Definition: kernels_enums.h:65
@ PastixKernelGEMMBlokLRLR
Definition: kernels_enums.h:68
@ PastixKernelGEMMCblk1d2d
Definition: kernels_enums.h:63
@ PastixKernelGEMMCblkLRLR
Definition: kernels_enums.h:66
@ PastixKernelGEMMCblk1d1d
Definition: kernels_enums.h:62
@ PastixKernelGEMMCblk2d2d
Definition: kernels_enums.h:64
@ PastixKernelGEMMBlok2d2d
Definition: kernels_enums.h:67
int core_sgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, float alpha, const float *A, pastix_int_t LDA, float beta, float *B, pastix_int_t LDB)
Add two matrices together.
Definition: core_sgeadd.c:78
static pastix_fixdbl_t core_sgemmsp_block_lrlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
Definition: core_sgemmsp.c:901
static pastix_fixdbl_t core_sgemmsp_lr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static void core_sgemmsp_1d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const float *A, const float *B, float *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
Definition: core_sgemmsp.c:264
static pastix_fixdbl_t core_sgemmsp_block_frlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const float *A, const float *B, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
Definition: core_sgemmsp.c:712
static void core_sgemmsp_2d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const float *A, const float *B, float *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
Definition: core_sgemmsp.c:404
static pastix_fixdbl_t core_sgemmsp_fulllr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const float *A, const float *B, pastix_lrblock_t *lrC, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static pastix_fixdbl_t core_sgemmsp_block_frfr(pastix_trans_t trans, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const float *A, const float *B, float *C)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
Definition: core_sgemmsp.c:545
static pastix_fixdbl_t core_sgemmsp_lrfr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, float *C, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static void core_sgemmsp_1d1d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const float *A, const float *B, float *C, float *work)
Compute the updates that are generated by the transposition of one single off-diagonal block.
Definition: core_sgemmsp.c:106
pastix_fixdbl_t cpucblk_sgemmsp(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const void *A, const void *B, void *C, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
pastix_fixdbl_t cpublok_sgemmsp(pastix_trans_t transB, const SolverCblk *cblk, SolverCblk *fcblk, pastix_int_t blok_mk, pastix_int_t blok_nk, pastix_int_t blok_mn, const void *A, const void *B, void *C, const pastix_lr_t *lowrank)
Compute the CPU gemm associated to a couple of off-diagonal blocks.
pastix_int_t K
const pastix_lrblock_t * B
pastix_int_t lwork
pastix_int_t N
pastix_int_t M
pastix_int_t lwused
const pastix_lrblock_t * A
pastix_trans_t transB
pastix_lrblock_t * C
pastix_int_t offy
pastix_int_t offx
const pastix_lr_t * lowrank
pastix_int_t Cn
pastix_int_t Cm
pastix_trans_t transA
pastix_atomic_lock_t * lock
pastix_fixdbl_t core_slrmm(core_slrmm_t *params)
Compute the matrix matrix product when involved matrices are stored in a low-rank structure.
Definition: core_slrmm.c:278
Structure to store all the parameters of the core_slrmm family functions.
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
enum pastix_trans_e pastix_trans_t
Transpostion.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
@ PastixUCoef
Definition: api.h:479
@ PastixNoTrans
Definition: api.h:445
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition: solver.h:395
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition: solver.h:329
pastix_int_t fcblknm
Definition: solver.h:144
static int is_block_inside_fblock(const SolverBlok *blok, const SolverBlok *fblok)
Check if a block is included inside another one.
Definition: solver.h:504
pastix_int_t frownum
Definition: solver.h:147
pastix_atomic_lock_t lock
Definition: solver.h:162
pastix_int_t coefind
Definition: solver.h:149
SolverBlok * fblokptr
Definition: solver.h:168
pastix_int_t lcblknm
Definition: solver.h:143
int8_t inlast
Definition: solver.h:151
pastix_int_t stride
Definition: solver.h:169
int8_t cblktype
Definition: solver.h:164
pastix_int_t fcolnum
Definition: solver.h:166
Solver block structure.
Definition: solver.h:141
Solver column block structure.
Definition: solver.h:161