PaStiX Handbook 6.4.0
Loading...
Searching...
No Matches
core_cgemmsp.c
Go to the documentation of this file.
1/**
2 *
3 * @file core_cgemmsp.c
4 *
5 * PaStiX kernel routines operating on the solver structure.
6 *
7 * @copyright 2011-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8 * Univ. Bordeaux. All rights reserved.
9 *
10 * @version 6.4.0
11 * @author Mathieu Faverge
12 * @author Pierre Ramet
13 * @author Xavier Lacoste
14 * @author Gregoire Pichon
15 * @author Nolan Bredel
16 * @author Alycia Lisito
17 * @author Tom Moenne-Loccoz
18 * @date 2024-07-05
19 * @generated from /builds/2mk6rsew/0/solverstack/pastix/kernels/core_zgemmsp.c, normal z -> c, Tue Feb 25 14:34:50 2025
20 *
21 **/
22#include "common.h"
23#include "cblas.h"
24#include "blend/solver.h"
25#include "kernels_trace.h"
26#include "pastix_ccores.h"
27#include "pastix_clrcores.h"
28
29#ifndef DOXYGEN_SHOULD_SKIP_THIS
30static pastix_complex32_t mcone = -1.0;
31static pastix_complex32_t cone = 1.0;
32static pastix_complex32_t czero = 0.0;
33#endif /* DOXYGEN_SHOULD_SKIP_THIS */
34
35/**
36 *******************************************************************************
37 *
38 * @ingroup kernel_fact_null
39 *
40 * @brief Compute the updates that are generated by the transposition of one
41 * single off-diagonal block.
42 *
43 * Both cblk involved in the computation are stored with the 1D storage: Column
44 * Major Layout with blocks interleaved.
45 *
46 * All the off-diagonal block below block are multiplied by the selected block
47 * and added to the facing cblk.
48 *
49 *******************************************************************************
50 *
51 * @param[in] sideA
52 * Specify if A and C belong to the lower part, or to the upper part.
53 * If sideA == PastixLCoef, the contribution of:
54 * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
55 * C, otherwise the contribution:
56 * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
57 * to C.
58 * The pointer to the data structure that describes the panel from
59 * which we compute the contributions. Next column blok must be
60 * accessible through cblk[1].
61 *
62 * @param[in] trans
63 * Specify the transposition used for the B matrix. It has to be either
64 * PastixTrans or PastixConjTrans.
65 *
66 * @param[in] cblk
67 * The cblk structure to which block belongs to. The A and B pointers
68 * must be the coeftab of this column block.
69 * Next column blok must be accessible through cblk[1].
70 *
71 * @param[in] blok
72 * The block from which we compute the contributions.
73 *
74 * @param[inout] fcblk
75 * The pointer to the data structure that describes the panel on which
76 * we compute the contributions. The C pointer must be one of the
77 * coeftab from this fcblk. Next column blok must be accessible through
78 * fcblk[1].
79 *
80 * @param[in] A
81 * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
82 * coefficients of the panel when the Lower part is computed,
83 * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
84 *
85 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
86 * the coefficients of the panel, if Symmetric/Hermitian cases or if
87 * upper part is computed; cblk.ucoeftab otherwise. Must be of size
88 * cblk.stride -by- cblk.width
89 *
90 * @param[inout] C
91 * The pointer to the fcblk.lcoeftab if the lower part is computed,
92 * fcblk.ucoeftab otherwise.
93 *
94 * @param[in] work
95 * Temporary memory buffer that is at least equal to the height of the
96 * block B by the sum of the height of all the blocks below the block
97 * B.
98 *
99 *******************************************************************************
100 *
101 * @sa core_cgemmsp_1d2d
102 * @sa core_cgemmsp_2d2d
103 *
104 *******************************************************************************/
105static inline void
107 pastix_trans_t trans,
108 const SolverCblk *cblk,
109 const SolverBlok *blok,
110 SolverCblk *fcblk,
111 const pastix_complex32_t *A,
112 const pastix_complex32_t *B,
114 pastix_complex32_t *work )
115{
116 const SolverBlok *iterblok;
117 const SolverBlok *fblok;
118 const SolverBlok *lblok;
119
120 pastix_complex32_t *tmpC;
121 pastix_complex32_t *wtmp;
122 pastix_int_t stride, stridef, indblok;
123 pastix_int_t M, N, K, m;
124 int shift;
125
126 /* Both cblk and fcblk are stored in 1D */
127 assert(!(cblk->cblktype & CBLK_LAYOUT_2D));
128 assert(!(fcblk->cblktype & CBLK_LAYOUT_2D));
129
130 assert( work != NULL );
131
132 shift = (sideA == PastixUCoef) ? 1 : 0;
133
134 stride = cblk->stride;
135 stridef = fcblk->stride;
136 K = cblk_colnbr( cblk );
137
138 /* First blok */
139 indblok = blok->coefind;
140
141 N = blok_rownbr( blok );
142 M = stride - indblok - (shift * N);
143
144 /* Matrix A = Aik */
145 A = A + indblok + (shift * N);
146 B = B + indblok;
147
148 /*
149 * Compute update A * B'
150 */
151 wtmp = work;
152 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
153 cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
154 M, N, K,
155 CBLAS_SADDR(cone), A, stride,
156 B, stride,
157 CBLAS_SADDR(czero), wtmp, M );
158 kernel_trace_stop_lvl2( FLOPS_CGEMM( M, N, K ) );
159
160 /*
161 * Add contribution to C in fcblk
162 */
163
164 /* Get the first block of the distant panel */
165 fblok = fcblk->fblokptr;
166
167 /* Move the pointer to the top of the right column */
168 C = C + (blok->frownum - fcblk->fcolnum) * stridef;
169
170 lblok = cblk[1].fblokptr;
171
172 /* for all following blocks in block column */
173 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
174
175 /* Find facing blok */
176 while (!is_block_inside_fblock( iterblok, fblok ))
177 {
178 fblok++;
179 assert( fblok < fcblk[1].fblokptr );
180 }
181
182 tmpC = C + fblok->coefind + iterblok->frownum - fblok->frownum;
183 m = blok_rownbr( iterblok );
184
185 pastix_cblk_lock( fcblk );
187 -1.0, wtmp, M,
188 1.0, tmpC, stridef );
189 pastix_cblk_unlock( fcblk );
190
191 /* Displacement to next block */
192 wtmp += m;
193 }
194}
195
196/**
197 *******************************************************************************
198 *
199 * @ingroup kernel_fact_null
200 *
201 * @brief Compute the updates that are generated by the transposition of one
202 * single off-diagonal block.
203 *
204 * The cblk involved in the matrices A and B are stored with the 1D storage:
205 * Column Major Layout with blocks interleaved. The facing cblk of the atrix C,
206 * is stored with the 2D storage where each block is stored continuously one
207 * after another. (Similar to dense tile storage with variant tile size)
208 *
209 * All the off-diagonal block below block are multiplied by the selected block
210 * and added to the facing cblk.
211 *
212 *******************************************************************************
213 *
214 * @param[in] sideA
215 * Specify if A and C belong to the lower part, or to the upper part.
216 * If sideA == PastixLCoef, the contribution of:
217 * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
218 * C, otherwise the contribution:
219 * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
220 * to C.
221 * The pointer to the data structure that describes the panel from
222 * which we compute the contributions. Next column blok must be
223 * accessible through cblk[1].
224 *
225 * @param[in] trans
226 * Specify the transposition used for the B matrix. It has to be either
227 * PastixTrans or PastixConjTrans.
228 *
229 * @param[in] cblk
230 * The cblk structure to which block belongs to. The A and B pointers
231 * must be the coeftab of this column block.
232 * Next column blok must be accessible through cblk[1].
233 *
234 * @param[in] blok
235 * The block from which we compute the contributions.
236 *
237 * @param[inout] fcblk
238 * The pointer to the data structure that describes the panel on which
239 * we compute the contributions. The C pointer must be one of the
240 * coeftab from this fcblk. Next column blok must be accessible through
241 * fcblk[1].
242 *
243 * @param[in] A
244 * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
245 * coefficients of the panel when the Lower part is computed,
246 * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
247 *
248 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
249 * the coefficients of the panel, if Symmetric/Hermitian cases or if
250 * upper part is computed; cblk.ucoeftab otherwise. Must be of size
251 * cblk.stride -by- cblk.width
252 *
253 * @param[inout] C
254 * The pointer to the fcblk.lcoeftab if the lower part is computed,
255 * fcblk.ucoeftab otherwise.
256 *
257 *******************************************************************************
258 *
259 * @sa core_cgemmsp_1d1d
260 * @sa core_cgemmsp_2d2d
261 *
262 *******************************************************************************/
263static inline void
265 pastix_trans_t trans,
266 const SolverCblk *cblk,
267 const SolverBlok *blok,
268 SolverCblk *fcblk,
269 const pastix_complex32_t *A,
270 const pastix_complex32_t *B,
272{
273 const SolverBlok *iterblok;
274 const SolverBlok *fblok;
275 const SolverBlok *lblok;
276 const pastix_complex32_t *blokA;
277 const pastix_complex32_t *blokB;
278 pastix_complex32_t *blokC;
279
280 pastix_int_t stride, stridef;
281 pastix_int_t M, N, K;
282 int shift;
283
284 /* cblk is stored in 1D and fcblk in 2D */
285 assert(!(cblk->cblktype & CBLK_LAYOUT_2D));
286 assert( fcblk->cblktype & CBLK_LAYOUT_2D );
287
288 shift = (sideA == PastixUCoef) ? 1 : 0;
289 stride = cblk->stride;
290
291 /* Get the B block and its dimensions */
292 blokB = B + blok->coefind;
293
294 stride = cblk->stride;
295 K = cblk_colnbr( cblk );
296 N = blok_rownbr( blok );
297
298 /**
299 * Add contribution to C in fcblk:
300 * Get the first facing block of the distant panel, and the last block of
301 * the current cblk
302 */
303 fblok = fcblk->fblokptr;
304 lblok = cblk[1].fblokptr;
305
306 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
307
308 /* Find facing blok */
309 while (!is_block_inside_fblock( iterblok, fblok ))
310 {
311 fblok++;
312 assert( fblok < fcblk[1].fblokptr );
313 }
314
315 stridef = blok_rownbr(fblok);
316
317 /* Get the A block and its dimensions */
318 blokA = A + iterblok->coefind;
319 M = blok_rownbr( iterblok );
320
321 blokC = C + fblok->coefind
322 + iterblok->frownum - fblok->frownum
323 + (blok->frownum - fcblk->fcolnum) * stridef;
324
325 pastix_cblk_lock( fcblk );
326 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
327 cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
328 M, N, K,
329 CBLAS_SADDR(mcone), blokA, stride,
330 blokB, stride,
331 CBLAS_SADDR(cone), blokC, stridef );
332 kernel_trace_stop_lvl2( FLOPS_CGEMM( M, N, K ) );
333 pastix_cblk_unlock( fcblk );
334 }
335}
336
337/**
338 *******************************************************************************
339 *
340 * @ingroup kernel_fact_null
341 *
342 * @brief Compute the updates that are generated by the transposition of one
343 * single off-diagonal block.
344 *
345 * Both cblk involved in the matrices A, B and C are stored with the 2D storage
346 * where each block is stored continuously one after another. (Similar to dense
347 * tile storage with variant tile size)
348 *
349 * All the off-diagonal block below block are multiplied by the selected block
350 * and added to the facing cblk.
351 *
352 *******************************************************************************
353 *
354 * @param[in] sideA
355 * Specify if A and C belong to the lower part, or to the upper part.
356 * If sideA == PastixLCoef, the contribution of:
357 * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
358 * C, otherwise the contribution:
359 * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
360 * to C.
361 * The pointer to the data structure that describes the panel from
362 * which we compute the contributions. Next column blok must be
363 * accessible through cblk[1].
364 *
365 * @param[in] trans
366 * Specify the transposition used for the B matrix. It has to be either
367 * PastixTrans or PastixConjTrans.
368 *
369 * @param[in] cblk
370 * The cblk structure to which block belongs to. The A and B pointers
371 * must be the coeftab of this column block.
372 * Next column blok must be accessible through cblk[1].
373 *
374 * @param[in] blok
375 * The block from which we compute the contributions.
376 *
377 * @param[inout] fcblk
378 * The pointer to the data structure that describes the panel on which
379 * we compute the contributions. The C pointer must be one of the
380 * coeftab from this fcblk. Next column blok must be accessible through
381 * fcblk[1].
382 *
383 * @param[in] A
384 * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
385 * coefficients of the panel when the Lower part is computed,
386 * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
387 *
388 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
389 * the coefficients of the panel, if Symmetric/Hermitian cases or if
390 * upper part is computed; cblk.ucoeftab otherwise. Must be of size
391 * cblk.stride -by- cblk.width
392 *
393 * @param[inout] C
394 * The pointer to the fcblk.lcoeftab if the lower part is computed,
395 * fcblk.ucoeftab otherwise.
396 *
397 *******************************************************************************
398 *
399 * @sa core_cgemmsp_1d1d
400 * @sa core_cgemmsp_1d2d
401 *
402 *******************************************************************************/
403static inline void
405 pastix_trans_t trans,
406 const SolverCblk *cblk,
407 const SolverBlok *blok,
408 SolverCblk *fcblk,
409 const pastix_complex32_t *A,
410 const pastix_complex32_t *B,
412{
413 const SolverBlok *iterblok;
414 const SolverBlok *fblok;
415 const SolverBlok *lblok;
416 const pastix_complex32_t *blokA;
417 const pastix_complex32_t *blokB;
418 pastix_complex32_t *blokC;
419
420 pastix_int_t M, N, K, lda, ldb, ldc;
421 int shift;
422
423 /* Both cblk and fcblk must be stored in 2D */
424 assert( cblk->cblktype & CBLK_LAYOUT_2D );
425 assert( fcblk->cblktype & CBLK_LAYOUT_2D );
426
427 shift = (sideA == PastixUCoef) ? 1 : 0;
428
429 /* Get the B block and its dimensions */
430 blokB = B + blok->coefind;
431
432 ldb = blok_rownbr( blok );
433 K = cblk_colnbr( cblk );
434 N = blok_rownbr( blok );
435
436 /*
437 * Add contribution to C in fcblk:
438 * Get the first facing block of the distant panel, and the last block of
439 * the current cblk
440 */
441 fblok = fcblk->fblokptr;
442 lblok = cblk[1].fblokptr;
443
444 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
445
446 /* Find facing blok */
447 while (!is_block_inside_fblock( iterblok, fblok ))
448 {
449 fblok++;
450 assert( fblok < fcblk[1].fblokptr );
451 }
452
453 ldc = blok_rownbr(fblok);
454
455 /* Get the A block and its dimensions */
456 blokA = A + iterblok->coefind;
457 M = blok_rownbr( iterblok );
458 lda = M;
459
460 blokC = C + fblok->coefind
461 + iterblok->frownum - fblok->frownum
462 + (blok->frownum - fcblk->fcolnum) * ldc;
463
464 pastix_cblk_lock( fcblk );
465 kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
466 cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
467 M, N, K,
468 CBLAS_SADDR(mcone), blokA, lda,
469 blokB, ldb,
470 CBLAS_SADDR(cone), blokC, ldc );
471 kernel_trace_stop_lvl2( FLOPS_CGEMM( M, N, K ) );
472 pastix_cblk_unlock( fcblk );
473 }
474}
475
476/**
477 *******************************************************************************
478 *
479 * @ingroup kernel_fact_null
480 *
481 * @brief Compute the updates that are generated by the transposition of all the
482 * blocks facing a common diagonal block, by another similar set of blocks.
483 *
484 * This is used to performed an update:
485 *
486 * C_mn = C_mn - A_mk * op(B_kn)
487 *
488 * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
489 * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
490 * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
491 * belongs to the set of block of the cblk n facing the diagonal block of the
492 * cblk m.
493 *
494 *******************************************************************************
495 *
496 * @param[in] trans
497 * Specify the transposition used for the B matrices. It has to be either
498 * PastixTrans or PastixConjTrans.
499 *
500 * @param[in] blok_mk
501 * Index of the first off-diagonal block in cblk, that is used for A.
502 *
503 * @param[in] blok_kn
504 * Index of the first off-diagonal block in cblk, that is used for B.
505 *
506 * @param[in] blok_mn
507 * Index of the first off-diagonal block in fcblk, that is used for C.
508 *
509 * @param[in] cblk
510 * The cblk structure to which block belongs to. The A and B pointers
511 * must be the coeftab of this column block.
512 * Next column blok must be accessible through cblk[1].
513 *
514 * @param[inout] fcblk
515 * The pointer to the data structure that describes the panel on which
516 * we compute the contributions. The C pointer must be one of the
517 * coeftab from this fcblk. Next column blok must be accessible through
518 * fcblk[1].
519 *
520 * @param[in] A
521 * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
522 * coefficients of the panel when the Lower part is computed,
523 * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
524 *
525 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
526 * the coefficients of the panel, if Symmetric/Hermitian cases or if
527 * upper part is computed; cblk.ucoeftab otherwise. Must be of size
528 * cblk.stride -by- cblk.width
529 *
530 * @param[inout] C
531 * The pointer to the fcblk.lcoeftab if the lower part is computed,
532 * fcblk.ucoeftab otherwise.
533 *
534 *******************************************************************************
535 *
536 * @return The number of flops performed
537 *
538 *******************************************************************************
539 *
540 * @sa core_cgemmsp_block_frlr
541 * @sa core_cgemmsp_block_lrlr
542 *
543 *******************************************************************************/
544static inline pastix_fixdbl_t
546 pastix_int_t blok_mk,
547 pastix_int_t blok_kn,
548 pastix_int_t blok_mn,
549 const SolverCblk *cblk,
550 SolverCblk *fcblk,
551 const pastix_complex32_t *A,
552 const pastix_complex32_t *B,
554{
555 const SolverBlok *blokA, *blokB, *blokC;
556 const SolverBlok *bA, *bB, *bC;
557 const SolverBlok *fblokK, *lblokK;
558 const SolverBlok *fblokN, *lblokN;
559
560 const pastix_complex32_t *Aptr, *Bptr;
561 pastix_complex32_t *Cptr;
562 pastix_int_t M, N, K, lda, ldb, ldc, cblk_n, cblk_m;
563 pastix_int_t full_m, full_n;
564 size_t offsetA, offsetB, offsetC;
565
566 pastix_fixdbl_t flops = 0.0;
568
569 /* Both cblk and fcblk must be stored in 2D */
570 assert( cblk->cblktype & CBLK_LAYOUT_2D );
571 assert( fcblk->cblktype & CBLK_LAYOUT_2D );
572
573 /*
574 * Blocs on column K
575 */
576 fblokK = cblk[0].fblokptr;
577 lblokK = cblk[1].fblokptr;
578
579 blokB = fblokK + blok_kn;
580 offsetB = blokB->coefind;
581 cblk_n = blokB->fcblknm;
582
583 blokA = fblokK + blok_mk;
584 offsetA = blokA->coefind;
585 cblk_m = blokA->fcblknm;
586
587 /*
588 * Blocs on column N
589 */
590 fblokN = fcblk[0].fblokptr;
591 lblokN = fcblk[1].fblokptr;
592
593 blokC = fblokN + blok_mn;
594 offsetC = blokC->coefind;
595 assert( blokC->lcblknm == cblk_n );
596 assert( blokC->fcblknm == cblk_m );
597
598 K = cblk_colnbr( cblk );
599 full_m = 0;
600
601 bC = blokC;
602 for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++) {
603 M = blok_rownbr(bA);
604 Aptr = A + bA->coefind - offsetA;
605 lda = M;
606 full_m += M;
607
608 /* Find facing C blok */
609 while (!is_block_inside_fblock( bA, bC )) {
610 bC++;
611 assert( bC < lblokN );
612 }
613
614 Cptr = C + bC->coefind - offsetC;
615 ldc = blok_rownbr(bC);
616
617 full_n = 0;
618 for (bB = blokB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++) {
619 N = blok_rownbr( bB );
620 full_n += N;
621 Bptr = B + bB->coefind - offsetB;
622 ldb = N;
623
624 cblas_cgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
625 M, N, K,
626 CBLAS_SADDR(mcone), Aptr, lda,
627 Bptr, ldb,
628 CBLAS_SADDR(cone), Cptr + (bA->frownum - bC->frownum)
629 + (bB->frownum - fcblk->fcolnum) * ldc , ldc );
630
631 flops += FLOPS_CGEMM( M, N, K );
632 }
633 }
634
636 full_m, full_n, K,
637 flops, time );
638
639 (void)lblokN;
640 return flops;
641}
642
643/**
644 *******************************************************************************
645 *
646 * @ingroup kernel_fact_null
647 *
648 * @brief Compute the updates that are generated by the transposition of all the
649 * blocks facing a common diagonal block, by another similar set of blocks.
650 *
651 * This is used to perform an update:
652 *
653 * C_mn = C_mn - A_mk * op(B_kn)
654 *
655 * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
656 * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
657 * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
658 * belongs to the set of block of the cblk n facing the diagonal block of the
659 * cblk m.
660 *
661 * In this routine, all the matrices are low-rank
662 *
663 *******************************************************************************
664 *
665 * @param[in] transB
666 * Specify the transposition used for the B matrices. It has to be either
667 * PastixTrans or PastixConjTrans.
668 *
669 * @param[in] blok_mk
670 * Index of the first off-diagonal block in cblk, that is used for A.
671 *
672 * @param[in] blok_kn
673 * Index of the first off-diagonal block in cblk, that is used for B.
674 *
675 * @param[in] blok_mn
676 * Index of the first off-diagonal block in fcblk, that is used for C.
677 *
678 * @param[in] cblk
679 * The cblk structure to which block belongs to. The A and B pointers
680 * must be the coeftab of this column block.
681 * Next column blok must be accessible through cblk[1].
682 *
683 * @param[inout] fcblk
684 * The pointer to the data structure that describes the panel on which
685 * we compute the contributions. The C pointer must be one of the
686 * coeftab from this fcblk. Next column blok must be accessible through
687 * fcblk[1].
688 *
689 * @param[in] A
690 * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
691 * coefficients of the panel when the Lower part is computed,
692 * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
693 *
694 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
695 * the coefficients of the panel, if Symmetric/Hermitian cases or if
696 * upper part is computed; cblk.ucoeftab otherwise. Must be of size
697 * cblk.stride -by- cblk.width
698 *
699 * @param[in] lrC
700 * Pointer to the low-rank representation of the block C.
701 * Must be followed by the low-rank representation of the following blocks.
702 *
703 * @param[in] lowrank
704 * The structure with low-rank parameters.
705 *
706 *******************************************************************************
707 *
708 * @return The number of flops performed
709 *
710 *******************************************************************************/
711static inline pastix_fixdbl_t
713 pastix_int_t blok_mk,
714 pastix_int_t blok_kn,
715 pastix_int_t blok_mn,
716 const SolverCblk *cblk,
717 SolverCblk *fcblk,
718 const pastix_complex32_t *A,
719 const pastix_complex32_t *B,
720 pastix_lrblock_t *lrC,
721 const pastix_lr_t *lowrank )
722{
723 const SolverBlok *blokA, *blokB, *blokC;
724 const SolverBlok *bA, *bB, *bC;
725 const SolverBlok *fblokK, *lblokK;
726 const SolverBlok *fblokN, *lblokN;
727
728 pastix_lrblock_t lrA, lrB;
729 core_clrmm_t params;
730
731 pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
732 size_t offsetA, offsetB;
733
734 pastix_fixdbl_t flops = 0.0;
736
737 /* Provide an internal lock as this function is already protected by the runtime dependency system */
738 pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
739
740 /* Both cblk and fcblk must be stored in 2D */
741 assert( cblk->cblktype & CBLK_LAYOUT_2D );
742 assert( fcblk->cblktype & CBLK_LAYOUT_2D );
743
744 /* Both cblk and fcblk must be compressed */
745 assert(!(cblk->cblktype & CBLK_COMPRESSED) );
746 assert( fcblk->cblktype & CBLK_COMPRESSED );
747
748 /*
749 * Blocs on column K
750 */
751 fblokK = cblk[0].fblokptr;
752 lblokK = cblk[1].fblokptr;
753
754 blokB = fblokK + blok_kn;
755 offsetB = blokB->coefind;
756 cblk_n = blokB->fcblknm;
757 lrB.rk = -1;
758
759 blokA = fblokK + blok_mk;
760 offsetA = blokA->coefind;
761 cblk_m = blokA->fcblknm;
762 lrA.rk = -1;
763
764 /*
765 * Blocs on column N
766 */
767 fblokN = fcblk[0].fblokptr;
768 lblokN = fcblk[1].fblokptr;
769
770 blokC = fblokN + blok_mn;
771 assert( blokC->lcblknm == cblk_n );
772 assert( blokC->fcblknm == cblk_m );
773
774 full_m = 0;
775 K = cblk_colnbr( cblk );
776
777 params.lowrank = lowrank;
778 params.transA = PastixNoTrans;
779 params.transB = transB;
780 params.K = K;
781 params.Cn = cblk_colnbr( fcblk );
782 params.alpha = -1.0;
783 params.beta = 1.0;
784 params.work = NULL;
785 params.lwork = -1;
786 params.lock = &(lock);
787
788 bC = blokC;
789 for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++) {
790 M = blok_rownbr(bA);
791 params.M = M;
792 full_m += M;
793
794 lrA.rkmax = M;
795 lrA.u = (pastix_complex32_t*)A + bA->coefind - offsetA;
796 lrA.v = NULL;
797
798 /* Find facing C blok */
799 while (!is_block_inside_fblock( bA, bC )) {
800 bC++;
801 lrC++;
802 assert( bC < lblokN );
803 }
804
805 params.A = &lrA;
806 params.C = lrC;
807 params.Cm = blok_rownbr( bC );
808
809 full_n = 0;
810 for (bB = blokB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++) {
811
812 params.N = blok_rownbr( bB );
813 full_n += params.N;
814 params.offx = bA->frownum - bC->frownum;
815 params.offy = bB->frownum - fcblk->fcolnum;
816
817 lrB.rkmax = M;
818 lrB.u = (pastix_complex32_t*)B + bB->coefind - offsetB;
819 lrB.v = NULL;
820
821 params.B = &lrB;
822
823 flops += core_clrmm( &params );
824 }
825 }
826
828 full_m, full_n, K,
829 flops, time );
830 (void)lblokN;
831 return flops;
832}
833
834/**
835 *******************************************************************************
836 *
837 * @ingroup kernel_fact_null
838 *
839 * @brief Compute the updates that are generated by the transposition of all the
840 * blocks facing a common diagonal block, by another similar set of blocks.
841 *
842 * This is used to perform an update:
843 *
844 * C_mn = C_mn - A_mk * op(B_kn)
845 *
846 * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
847 * cblk m; B_kn is the set of blocks in cblk n facing the diagonal block of the
848 * cblk k; and C_mn is the set of blocks impacted by this update, it necessarily
849 * belongs to the set of block of the cblk n facing the diagonal block of the
850 * cblk m.
851 *
852 * In this routine, all the matrices are low-rank
853 *
854 *******************************************************************************
855 *
856 * @param[in] transB
857 * Specify the transposition used for the B matrices. It has to be either
858 * PastixTrans or PastixConjTrans.
859 *
860 * @param[in] blok_mk
861 * Index of the first off-diagonal block in cblk, that is used for A.
862 *
863 * @param[in] blok_kn
864 * Index of the first off-diagonal block in cblk, that is used for B.
865 *
866 * @param[in] blok_mn
867 * Index of the first off-diagonal block in fcblk, that is used for C.
868 *
869 * @param[in] cblk
870 * The cblk structure to which block belongs to. The A and B pointers
871 * must be the coeftab of this column block.
872 * Next column blok must be accessible through cblk[1].
873 *
874 * @param[inout] fcblk
875 * The pointer to the data structure that describes the panel on which
876 * we compute the contributions. The C pointer must be one of the
877 * coeftab from this fcblk. Next column blok must be accessible through
878 * fcblk[1].
879 *
880 * @param[inout] lrA
881 * Pointer to the low-rank representation of the block A.
882 * Must be followed by the low-rank representation of the following blocks.
883 *
884 * @param[inout] lrB
885 * Pointer to the low-rank representation of the block B.
886 * Must be followed by the low-rank representation of the following blocks.
887 *
888 * @param[inout] lrC
889 * Pointer to the low-rank representation of the block C.
890 * Must be followed by the low-rank representation of the following blocks.
891 *
892 * @param[in] lowrank
893 * The structure with low-rank parameters.
894 *
895 *******************************************************************************
896 *
897 * @return The number of flops performed
898 *
899 *******************************************************************************/
900static inline pastix_fixdbl_t
902 pastix_int_t blok_mk,
903 pastix_int_t blok_kn,
904 pastix_int_t blok_mn,
905 const SolverCblk *cblk,
906 SolverCblk *fcblk,
907 const pastix_lrblock_t *lrA,
908 const pastix_lrblock_t *lrB,
909 pastix_lrblock_t *lrC,
910 const pastix_lr_t *lowrank )
911{
912 const pastix_lrblock_t *blrB;
913 const SolverBlok *blokA, *blokB, *blokC;
914 const SolverBlok *bA, *bB, *bC;
915 const SolverBlok *fblokK, *lblokK;
916 const SolverBlok *fblokN, *lblokN;
917
918 core_clrmm_t params;
919
920 pastix_int_t M, K, cblk_n, cblk_m, full_m, full_n;
921
922 pastix_fixdbl_t flops = 0.0;
924
925 /* Provide an internal lock as this function is already protected by the runtime dependency system */
926 pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;
927
928 /* Both cblk and fcblk must be stored in 2D */
929 assert( cblk->cblktype & CBLK_LAYOUT_2D );
930 assert( fcblk->cblktype & CBLK_LAYOUT_2D );
931
932 /* Both cblk and fcblk must be compressed */
933 assert( cblk->cblktype & CBLK_COMPRESSED );
934 assert( fcblk->cblktype & CBLK_COMPRESSED );
935
936 /*
937 * Blocs on column K
938 */
939 fblokK = cblk[0].fblokptr;
940 lblokK = cblk[1].fblokptr;
941
942 blokB = fblokK + blok_kn;
943 cblk_n = blokB->fcblknm;
944
945 blokA = fblokK + blok_mk;
946 cblk_m = blokA->fcblknm;
947
948 /*
949 * Blocs on column N
950 */
951 fblokN = fcblk[0].fblokptr;
952 lblokN = fcblk[1].fblokptr;
953
954 blokC = fblokN + blok_mn;
955 assert( blokC->lcblknm == cblk_n );
956 assert( blokC->fcblknm == cblk_m );
957
958 full_m = 0;
959 K = cblk_colnbr( cblk );
960
961 params.lowrank = lowrank;
962 params.transA = PastixNoTrans;
963 params.transB = transB;
964 params.K = K;
965 params.Cn = cblk_colnbr( fcblk );
966 params.alpha = -1.0;
967 params.beta = 1.0;
968 params.work = NULL;
969 params.lwork = -1;
970 params.lock = &(lock);
971
972 bC = blokC;
973 for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++, lrA++) {
974 M = blok_rownbr(bA);
975 params.M = M;
976 full_m += M;
977
978 /* Find facing C blok */
979 while (!is_block_inside_fblock( bA, bC )) {
980 bC++;
981 lrC++;
982 assert( bC < lblokN );
983 }
984
985 params.A = lrA;
986 params.C = lrC;
987 params.Cm = blok_rownbr( bC );
988
989 full_n = 0;
990 for (bB = blokB, blrB = lrB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++, blrB++) {
991
992 params.N = blok_rownbr( bB );
993 full_n += params.N;
994
995 params.offx = bA->frownum - bC->frownum;
996 params.offy = bB->frownum - fcblk->fcolnum;
997 params.B = blrB;
998
999 flops += core_clrmm( &params );
1000 }
1001 }
1002
1004 full_m, full_n, K,
1005 flops, time );
1006 (void)lblokN;
1007 return flops;
1008}
1009
1010/**
1011 *******************************************************************************
1012 *
1013 * @ingroup kernel_fact_null
1014 *
1015 * @brief Compute the updates associated to one off-diagonal block.
1016 *
1017 *******************************************************************************
1018 *
1019 * @param[in] sideA
1020 * Specify if A and C belong to the lower part, or to the upper part.
1021 * If sideA == PastixLCoef, the contribution of:
1022 * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1023 * C, otherwise the contribution:
1024 * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1025 * to C.
1026 *
1027 * @param[in] trans
1028 * Specify the transposition used for the B matrix. It has to be either
1029 * PastixTrans or PastixConjTrans.
1030 *
1031 * @param[in] cblk
1032 * The cblk structure to which block belongs to. The A and B pointers
1033 * must be the coeftab of this column block.
1034 * Next column blok must be accessible through cblk[1].
1035 *
1036 * @param[in] blok
1037 * The block from which we compute the contributions.
1038 *
1039 * @param[inout] fcblk
1040 * The pointer to the data structure that describes the panel on which
1041 * we compute the contributions. The C pointer must be one of the
1042 * coeftab from this fcblk. Next column blok must be accessible through
1043 * fcblk[1].
1044 *
1045 * @param[in] A
1046 * The pointer to the coeftab of the cblk.lcoeftab matrix storing the
1047 * coefficients of the panel when the Lower part is computed,
1048 * cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
1049 *
1050 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
1051 * the coefficients of the panel, if Symmetric/Hermitian cases or if
1052 * upper part is computed; cblk.ucoeftab otherwise. Must be of size
1053 * cblk.stride -by- cblk.width
1054 *
1055 * @param[in] lrC
1056 * Pointer to the low-rank representation of the block C.
1057 * Must be followed by the low-rank representation of the following blocks.
1058 *
1059 * @param[in] work
1060 * Temporary memory buffer.
1061 *
1062 * @param[in] lwork
1063 * TODO
1064 *
1065 * @param[in] lowrank
1066 * The structure with low-rank parameters.
1067 *
1068 *******************************************************************************
1069 *
1070 * @return The number of flops performed
1071 *
1072 *******************************************************************************/
1073static inline pastix_fixdbl_t
1075 pastix_trans_t trans,
1076 const SolverCblk *cblk,
1077 const SolverBlok *blok,
1078 SolverCblk *fcblk,
1079 const pastix_complex32_t *A,
1080 const pastix_complex32_t *B,
1081 pastix_lrblock_t *lrC,
1082 pastix_complex32_t *work,
1083 pastix_int_t lwork,
1084 const pastix_lr_t *lowrank )
1085{
1086 const SolverBlok *iterblok;
1087 const SolverBlok *fblok;
1088 const SolverBlok *lblok;
1089 pastix_lrblock_t lrA, lrB;
1090 core_clrmm_t params;
1091
1092 pastix_int_t stride, shift;
1093 pastix_int_t M, N;
1094
1095 pastix_fixdbl_t flops = 0.0;
1096
1097 /* Update from a dense block to a low rank block */
1098 assert(!(cblk->cblktype & CBLK_COMPRESSED));
1099 assert( fcblk->cblktype & CBLK_COMPRESSED );
1100 assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1101
1102 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1103
1104 shift = (sideA == PastixUCoef) ? 1 : 0;
1105 stride = cblk->stride;
1106
1107 N = blok_rownbr( blok );
1108
1109 /* Get the B block and its dimensions */
1110 lrB.rk = -1;
1111 lrB.rkmax = (cblk->cblktype & CBLK_LAYOUT_2D) ? N : stride;
1112 lrB.u = (pastix_complex32_t*)B + blok->coefind; /* lrB is const, we can cast the B pointer */
1113 lrB.v = NULL;
1114
1115 /**
1116 * Add contribution to C in fcblk:
1117 * Get the first facing block of the distant panel, and the last block of
1118 * the current cblk
1119 */
1120 fblok = fcblk->fblokptr;
1121 lblok = cblk[1].fblokptr;
1122
1123 params.lowrank = lowrank;
1124 params.transA = PastixNoTrans;
1125 params.transB = trans;
1126 params.N = blok_rownbr( blok );
1127 params.K = cblk_colnbr( cblk );
1128 params.Cn = cblk_colnbr( fcblk );
1129 params.alpha = -1.0;
1130 params.beta = 1.0;
1131 params.work = work;
1132 params.lwork = lwork;
1133 params.lock = &(fcblk->lock);
1134 params.B = &lrB;
1135
1136 for (iterblok=blok+shift; iterblok<lblok; iterblok++) {
1137
1138 /* Find facing blok */
1139 while (!is_block_inside_fblock( iterblok, fblok ))
1140 {
1141 fblok++;
1142 lrC++;
1143 assert( fblok < fcblk[1].fblokptr );
1144 }
1145
1146 /* Get the A block and its dimensions */
1147 M = blok_rownbr( iterblok );
1148 lrA.rk = -1;
1149 lrA.rkmax = (cblk->cblktype & CBLK_LAYOUT_2D) ? M : stride;
1150 lrA.u = (pastix_complex32_t*)A + iterblok->coefind; /* Same as for B */
1151 lrA.v = NULL;
1152
1153 params.M = M;
1154 params.A = &lrA;
1155 params.C = lrC;
1156 params.Cm = blok_rownbr( fblok );
1157
1158 params.offx = iterblok->frownum - fblok->frownum;
1159 params.offy = blok->frownum - fcblk->fcolnum;
1160
1161 flops += core_clrmm( &params );
1162 }
1163 return flops;
1164}
1165
1166/**
1167 *******************************************************************************
1168 *
1169 * @ingroup kernel_fact_null
1170 *
1171 * @brief Compute the updates associated to one off-diagonal block.
1172 *
1173 *******************************************************************************
1174 *
1175 * @param[in] sideA
1176 * Specify if A and C belong to the lower part, or to the upper part.
1177 * If sideA == PastixLCoef, the contribution of:
1178 * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1179 * C, otherwise the contribution:
1180 * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1181 * to C.
1182 *
1183 * @param[in] trans
1184 * Specify the transposition used for the B matrix. It has to be either
1185 * PastixTrans or PastixConjTrans.
1186 *
1187 * @param[in] cblk
1188 * The cblk structure to which block belongs to. The A and B pointers
1189 * must be the coeftab of this column block.
1190 * Next column blok must be accessible through cblk[1].
1191 *
1192 * @param[in] blok
1193 * The block from which we compute the contributions.
1194 *
1195 * @param[inout] fcblk
1196 * The pointer to the data structure that describes the panel on which
1197 * we compute the contributions. The C pointer must be one of the
1198 * coeftab from this fcblk. Next column blok must be accessible through
1199 * fcblk[1].
1200 *
1201 * @param[in] lrA
1202 * Pointer to the low-rank representation of the block A.
1203 * Must be followed by the low-rank representation of the following blocks.
1204 *
1205 * @param[in] lrB
1206 * Pointer to the low-rank representation of the block B.
1207 * Must be followed by the low-rank representation of the following blocks.
1208 *
1209 * @param[in] lrC
1210 * Pointer to the low-rank representation of the block B.
1211 * Must be followed by the low-rank representation of the following blocks.
1212 *
1213 * @param[in] work
1214 * Temporary memory buffer.
1215 *
1216 * @param[in] lwork
1217 * TODO
1218 *
1219 * @param[in] lowrank
1220 * The structure with low-rank parameters.
1221 *
1222 *******************************************************************************
1223 *
1224 * @return The number of flops performed
1225 *
1226 *******************************************************************************/
1227static inline pastix_fixdbl_t
1229 pastix_trans_t trans,
1230 const SolverCblk *cblk,
1231 const SolverBlok *blok,
1232 SolverCblk *fcblk,
1233 const pastix_lrblock_t *lrA,
1234 const pastix_lrblock_t *lrB,
1235 pastix_lrblock_t *lrC,
1236 pastix_complex32_t *work,
1237 pastix_int_t lwork,
1238 const pastix_lr_t *lowrank )
1239{
1240 const SolverBlok *iterblok;
1241 const SolverBlok *fblok;
1242 const SolverBlok *lblok;
1243
1244 pastix_int_t N, K, shift;
1245 core_clrmm_t params;
1246
1247 pastix_fixdbl_t flops = 0.0;
1248
1249 /* Update from a low-rank cblk to a low-rank cblk */
1250 assert( cblk->cblktype & CBLK_COMPRESSED );
1251 assert( fcblk->cblktype & CBLK_COMPRESSED );
1252 assert( cblk->cblktype & CBLK_LAYOUT_2D );
1253 assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1254
1255 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1256
1257 shift = (sideA == PastixUCoef) ? 1 : 0;
1258
1259 /* Get the B block and its dimensions */
1260 K = cblk_colnbr( cblk );
1261 N = blok_rownbr( blok );
1262
1263 /*
1264 * Add contribution to C in fcblk:
1265 * Get the first facing block of the distant panel, and the last block of
1266 * the current cblk
1267 */
1268 fblok = fcblk->fblokptr;
1269 lblok = cblk[1].fblokptr;
1270
1271 params.lowrank = lowrank;
1272 params.transA = PastixNoTrans;
1273 params.transB = trans;
1274 params.N = N;
1275 params.K = K;
1276 params.Cn = cblk_colnbr( fcblk );
1277 params.alpha = -1.0;
1278 params.beta = 1.0;
1279 params.work = work;
1280 params.lwork = lwork;
1281 params.lwused = 0;
1282 params.lock = &(fcblk->lock);
1283 params.B = lrB + (blok - cblk->fblokptr);
1284
1285 /* for all following blocks in block column */
1286 lrA = lrA + (blok - cblk->fblokptr) + shift;
1287 for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1288
1289 /* Find facing blok */
1290 while (!is_block_inside_fblock( iterblok, fblok ))
1291 {
1292 fblok++;
1293 lrC++;
1294 assert( fblok < fcblk[1].fblokptr );
1295 }
1296
1297 params.M = blok_rownbr( iterblok );
1298 params.A = lrA;
1299 params.C = lrC;
1300 params.Cm = blok_rownbr( fblok );
1301
1302 params.offx = iterblok->frownum - fblok->frownum;
1303 params.offy = blok->frownum - fcblk->fcolnum;
1304
1305 flops += core_clrmm( &params );
1306 }
1307 return flops;
1308}
1309
1310/**
1311 *******************************************************************************
1312 *
1313 * @ingroup kernel_fact_null
1314 *
1315 * @brief Compute the updates associated to one off-diagonal block.
1316 *
1317 *******************************************************************************
1318 *
1319 * @param[in] sideA
1320 * Specify if A and C belong to the lower part, or to the upper part.
1321 * If sideA == PastixLCoef, the contribution of:
1322 * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1323 * C, otherwise the contribution:
1324 * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1325 * to C.
1326 *
1327 *
1328 * @param[in] trans
1329 * Specify the transposition used for the B matrix. It has to be either
1330 * PastixTrans or PastixConjTrans.
1331 *
1332 * @param[in] cblk
1333 * The cblk structure to which block belongs to. The A and B pointers
1334 * must be the coeftab of this column block.
1335 * Next column blok must be accessible through cblk[1].
1336 *
1337 * @param[in] blok
1338 * The block from which we compute the contributions.
1339 *
1340 * @param[inout] fcblk
1341 * The pointer to the data structure that describes the panel on which
1342 * we compute the contributions. The C pointer must be one of the
1343 * coeftab from this fcblk. Next column blok must be accessible through
1344 * fcblk[1].
1345 *
1346 * @param[in] lrA
1347 * Pointer to the low-rank representation of the block A.
1348 * Must be followed by the low-rank representation of the following blocks.
1349 *
1350 * @param[in] lrB
1351 * Pointer to the low-rank representation of the block B.
1352 * Must be followed by the low-rank representation of the following blocks.
1353 *
1354 * @param[inout] C
1355 * The pointer to the fcblk.lcoeftab if the lower part is computed,
1356 * fcblk.ucoeftab otherwise.
1357 *
1358 * @param[in] work
1359 * Temporary memory buffer.
1360 *
1361 * @param[in] lwork
1362 * The length of work.
1363 * On entry, if trans = PastixTrans
1364 * lwork >= max(1, K*N). Otherwise lwork >= max(1, M*K).
1365 *
1366 * @param[in] lowrank
1367 * The structure with low-rank parameters.
1368 *
1369 *******************************************************************************
1370 *
1371 * @return The number of flops performed
1372 *
1373 *******************************************************************************/
1374static inline pastix_fixdbl_t
1376 pastix_trans_t trans,
1377 const SolverCblk *cblk,
1378 const SolverBlok *blok,
1379 SolverCblk *fcblk,
1380 const pastix_lrblock_t *lrA,
1381 const pastix_lrblock_t *lrB,
1383 pastix_complex32_t *work,
1384 pastix_int_t lwork,
1385 const pastix_lr_t *lowrank )
1386{
1387 const SolverBlok *iterblok;
1388 const SolverBlok *fblok;
1389 const SolverBlok *lblok;
1390
1391 pastix_int_t N, K, shift;
1392 pastix_lrblock_t lrC;
1393 core_clrmm_t params;
1394
1395 pastix_fixdbl_t flops = 0.0;
1396
1397 /* Update from a low-rank cblk to a low-rank cblk */
1398 assert( cblk->cblktype & CBLK_COMPRESSED );
1399 assert( !(fcblk->cblktype & CBLK_COMPRESSED) );
1400 assert( cblk->cblktype & CBLK_LAYOUT_2D );
1401 assert( fcblk->cblktype & CBLK_LAYOUT_2D );
1402
1403 assert( (lwork == 0) || ((lwork > 0) && (work != NULL)) );
1404
1405 shift = (sideA == PastixUCoef) ? 1 : 0;
1406
1407 /* Get the B block and its dimensions */
1408 K = cblk_colnbr( cblk );
1409 N = blok_rownbr( blok );
1410
1411 /*
1412 * Add contribution to C in fcblk:
1413 * Get the first facing block of the distant panel, and the last block of
1414 * the current cblk
1415 */
1416 fblok = fcblk->fblokptr;
1417 lblok = cblk[1].fblokptr;
1418
1419 params.lowrank = lowrank;
1420 params.transA = PastixNoTrans;
1421 params.transB = trans;
1422 params.N = N;
1423 params.K = K;
1424 params.Cn = cblk_colnbr( fcblk );
1425 params.alpha = -1.0;
1426 params.beta = 1.0;
1427 params.work = work;
1428 params.lwork = lwork;
1429 params.lwused = 0;
1430 params.lock = &(fcblk->lock);
1431 params.B = lrB + (blok - cblk->fblokptr);
1432 params.C = &lrC;
1433
1434 lrC.rk = -1;
1435 lrC.v = NULL;
1436
1437 /* for all following blocks in block column */
1438 lrA = lrA + (blok - cblk->fblokptr) + shift;
1439 for (iterblok=blok+shift; iterblok<lblok; iterblok++, lrA++) {
1440
1441 /* Find facing blok */
1442 while (!is_block_inside_fblock( iterblok, fblok ))
1443 {
1444 fblok++;
1445 assert( fblok < fcblk[1].fblokptr );
1446 }
1447
1448 params.M = blok_rownbr( iterblok );
1449 params.A = lrA;
1450 params.Cm = blok_rownbr( fblok );
1451 params.offx = iterblok->frownum - fblok->frownum;
1452 params.offy = blok->frownum - fcblk->fcolnum;
1453
1454 lrC.rkmax = params.Cm;
1455 lrC.u = C + fblok->coefind;
1456
1457 flops += core_clrmm( &params );
1458 }
1459 return flops;
1460}
1461
1462/**
1463 *******************************************************************************
1464 *
1465 * @brief Compute the updates associated to one off-diagonal block.
1466 *
1467 *******************************************************************************
1468 *
1469 * @param[in] sideA
1470 * Specify if A and C belong to the lower part, or to the upper part.
1471 * If sideA == PastixLCoef, the contribution of:
1472 * (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
1473 * C, otherwise the contribution:
1474 * (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
1475 * to C.
1476 * The pointer to the data structure that describes the panel from
1477 * which we compute the contributions. Next column blok must be
1478 * accessible through cblk[1].
1479 *
1480 * @param[in] trans
1481 * Specify the transposition used for the B matrix. It has to be either
1482 * PastixTrans or PastixConjTrans.
1483 *
1484 * @param[in] cblk
1485 * The cblk structure to which block belongs to. The A and B pointers
1486 * must be the coeftab of this column block.
1487 * Next column blok must be accessible through cblk[1].
1488 *
1489 * @param[in] blok
1490 * The block from which we compute the contributions.
1491 *
1492 * @param[inout] fcblk
1493 * The pointer to the data structure that describes the panel on which
1494 * we compute the contributions. The C pointer must be one of the
1495 * coeftab from this fcblk. Next column blok must be accessible through
1496 * fcblk[1].
1497 *
1498 * @param[in] A
1499 * The pointer to the correct representation of A.
1500 * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1501 * - pastix_lr_block if the block is compressed.
1502 *
1503 * @param[in] B
1504 * The pointer to the correct representation of B.
1505 * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1506 * - pastix_lr_block if the block is compressed.
1507 *
1508 * @param[inout] C
1509 * The pointer to the correct representation of C.
1510 * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1511 * - pastix_lr_block if the block is compressed.
1512 *
1513 * @param[in] work
1514 * Temporary memory buffer.
1515 *
1516 * @param[in] lwork
1517 * TODO
1518 *
1519 * @param[in] lowrank
1520 * The structure with low-rank parameters.
1521 *
1522 *******************************************************************************
1523 *
1524 * @return The number of static pivoting during factorization of the diagonal
1525 * block.
1526 *
1527 *******************************************************************************/
1530 pastix_trans_t trans,
1531 const SolverCblk *cblk,
1532 const SolverBlok *blok,
1533 SolverCblk *fcblk,
1534 const void *A,
1535 const void *B,
1536 void *C,
1537 pastix_complex32_t *work,
1538 pastix_int_t lwork,
1539 const pastix_lr_t *lowrank )
1540{
1541 pastix_ktype_t ktype;
1542 pastix_fixdbl_t time, flops = 0.0;
1543 pastix_int_t m = cblk->stride;
1544 pastix_int_t n = blok_rownbr( blok );
1545 pastix_int_t k = cblk_colnbr( cblk );
1546
1547 m -= (cblk->cblktype & CBLK_LAYOUT_2D) ? blok->coefind / k : blok->coefind;
1548 m -= (sideA == PastixUCoef) ? blok_rownbr( blok ) : 0;
1549
1550 if ( fcblk->cblktype & CBLK_COMPRESSED ) {
1551 if ( cblk->cblktype & CBLK_COMPRESSED ) {
1553 time = kernel_trace_start( ktype );
1554
1555 flops = core_cgemmsp_lr( sideA, trans,
1556 cblk, blok, fcblk,
1557 A, B, C, work, lwork,
1558 lowrank );
1559 }
1560 else {
1562 time = kernel_trace_start( ktype );
1563
1564 flops = core_cgemmsp_fulllr( sideA, trans,
1565 cblk, blok, fcblk,
1566 A, B, C, work, lwork,
1567 lowrank );
1568 }
1569 }
1570 else if ( fcblk->cblktype & CBLK_LAYOUT_2D ) {
1571 if ( cblk->cblktype & CBLK_COMPRESSED) {
1573 time = kernel_trace_start( ktype );
1574 core_cgemmsp_lrfr( sideA, trans,
1575 cblk, blok, fcblk,
1576 A, B, C, work, lwork,
1577 lowrank );
1578 }
1579 else if ( cblk->cblktype & CBLK_LAYOUT_2D ) {
1581 time = kernel_trace_start( ktype );
1582
1583 core_cgemmsp_2d2d( sideA, trans,
1584 cblk, blok, fcblk,
1585 A, B, C );
1586 }
1587 else {
1589 time = kernel_trace_start( ktype );
1590
1591 core_cgemmsp_1d2d( sideA, trans,
1592 cblk, blok, fcblk,
1593 A, B, C );
1594 }
1595 flops = FLOPS_CGEMM( m, n, k );
1596 }
1597 else {
1598 assert( !(cblk->cblktype & CBLK_COMPRESSED) );
1600 time = kernel_trace_start( ktype );
1601
1602 core_cgemmsp_1d1d( sideA, trans,
1603 cblk, blok, fcblk,
1604 A, B, C, work );
1605
1606 flops = FLOPS_CGEMM( m, n, k );
1607 }
1608
1609 kernel_trace_stop( blok->inlast, ktype, m, n, k, flops, time );
1610
1611 return flops;
1612}
1613
1614/**
1615 *******************************************************************************
1616 *
1617 * @brief Compute the CPU gemm associated to a couple of off-diagonal blocks.
1618 *
1619 * C_l = C_l - A_l * op(B_s), with B_s = B_l, or B_u
1620 * or
1621 * C_u = C_u - A_u * op(B_s), with B_s = B_l, or B_u
1622 *
1623 *******************************************************************************
1624 *
1625 * @param[in] transB
1626 * Specify wheter B should be used as PastixNoTrans, PastixTrans, or
1627 * PastixConjTrans in the computations.
1628 *
1629 * @param[in] cblk
1630 * The cblk structure to which block A and B belong to. The A and B
1631 * pointers must be one of the [lu]coeftab of this column block.
1632 * Next column blok must be accessible through cblk[1].
1633 *
1634 * @param[inout] fcblk
1635 * The pointer to the data structure that describes the panel on which
1636 * we compute the contributions. The C pointer must be one of the
1637 * [lu]coeftab from this fcblk.
1638 * Next column blok must be accessible through fcblk[1].
1639 *
1640 * @param[in] blok_mk
1641 * Specify the index of the A block in the cblk column. This index is
1642 * 0-based for the diagonal block.
1643 *
1644 * @param[in] blok_nk
1645 * Specify the index of the B block in the cblk column. This index is
1646 * 0-based for the diagonal block.
1647 *
1648 * @param[in] blok_mn
1649 * Specify the index of the C block in the fcblk column. This index is
1650 * 0-based for the diagonal block.
1651 *
1652 * @param[in] A
1653 * The pointer to the correct representation of A.
1654 * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1655 * - pastix_lr_block if the block is compressed.
1656 *
1657 * @param[in] B
1658 * The pointer to the correct representation of B.
1659 * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1660 * - pastix_lr_block if the block is compressed.
1661 *
1662 * @param[in] C
1663 * The pointer to the correct representation of C.
1664 * - coeftab if the block is in full rank. Must be of size cblk.stride -by- cblk.width.
1665 * - pastix_lr_block if the block is compressed.
1666 *
1667 * @param[in] lowrank
1668 * The structure with the low-rank parameters.
1669 *
1670 *******************************************************************************
1671 *
1672 * @return TODO
1673 *
1674 *******************************************************************************/
1677 const SolverCblk *cblk,
1678 SolverCblk *fcblk,
1679 pastix_int_t blok_mk,
1680 pastix_int_t blok_nk,
1681 pastix_int_t blok_mn,
1682 const void *A,
1683 const void *B,
1684 void *C,
1685 const pastix_lr_t *lowrank )
1686{
1687 if ( fcblk->cblktype & CBLK_COMPRESSED ) {
1688 if ( cblk->cblktype & CBLK_COMPRESSED ) {
1689 return core_cgemmsp_block_lrlr( transB,
1690 blok_mk, blok_nk, blok_mn,
1691 cblk, fcblk,
1692 A, B, C, lowrank );
1693 }
1694 else {
1695 return core_cgemmsp_block_frlr( transB,
1696 blok_mk, blok_nk, blok_mn,
1697 cblk, fcblk,
1698 A, B, C, lowrank );
1699 }
1700 }
1701 else {
1702 if ( cblk->cblktype & CBLK_COMPRESSED ) {
1703 assert(0);
1704 return 0.; /* Avoids compilation and coverity warning */
1705 }
1706 else {
1707 return core_cgemmsp_block_frfr( transB,
1708 blok_mk, blok_nk, blok_mn,
1709 cblk, fcblk,
1710 A, B, C );
1711 }
1712 }
1713}
BEGIN_C_DECLS typedef int pastix_int_t
Definition datatypes.h:51
float _Complex pastix_complex32_t
Definition datatypes.h:76
double pastix_fixdbl_t
Definition datatypes.h:65
enum pastix_ktype_e pastix_ktype_t
List of the Level 1 events that may be traced in PaStiX.
static void kernel_trace_stop(int8_t inlast, pastix_ktype_t ktype, int m, int n, int k, double flops, double starttime)
Stop the trace of a single kernel.
static double kernel_trace_start(pastix_ktype_t ktype)
Start the trace of a single kernel.
@ PastixKernelGEMMCblkFRLR
@ PastixKernelGEMMBlokLRLR
@ PastixKernelGEMMCblk1d2d
@ PastixKernelGEMMCblkLRLR
@ PastixKernelGEMMCblk1d1d
@ PastixKernelGEMMCblk2d2d
@ PastixKernelGEMMBlok2d2d
int core_cgeadd(pastix_trans_t trans, pastix_int_t M, pastix_int_t N, pastix_complex32_t alpha, const pastix_complex32_t *A, pastix_int_t LDA, pastix_complex32_t beta, pastix_complex32_t *B, pastix_int_t LDB)
Add two matrices together.
Definition core_cgeadd.c:78
static void core_cgemmsp_1d1d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C, pastix_complex32_t *work)
Compute the updates that are generated by the transposition of one single off-diagonal block.
static pastix_fixdbl_t core_cgemmsp_block_frlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
static void core_cgemmsp_1d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
static void core_cgemmsp_2d2d(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C)
Compute the updates that are generated by the transposition of one single off-diagonal block.
static pastix_fixdbl_t core_cgemmsp_block_frfr(pastix_trans_t trans, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_complex32_t *C)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
static pastix_fixdbl_t core_cgemmsp_lr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static pastix_fixdbl_t core_cgemmsp_lrfr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_complex32_t *C, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
static pastix_fixdbl_t core_cgemmsp_block_lrlr(pastix_trans_t transB, pastix_int_t blok_mk, pastix_int_t blok_kn, pastix_int_t blok_mn, const SolverCblk *cblk, SolverCblk *fcblk, const pastix_lrblock_t *lrA, const pastix_lrblock_t *lrB, pastix_lrblock_t *lrC, const pastix_lr_t *lowrank)
Compute the updates that are generated by the transposition of all the blocks facing a common diagona...
static pastix_fixdbl_t core_cgemmsp_fulllr(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const pastix_complex32_t *A, const pastix_complex32_t *B, pastix_lrblock_t *lrC, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
pastix_fixdbl_t cpucblk_cgemmsp(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const void *A, const void *B, void *C, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
pastix_fixdbl_t cpublok_cgemmsp(pastix_trans_t transB, const SolverCblk *cblk, SolverCblk *fcblk, pastix_int_t blok_mk, pastix_int_t blok_nk, pastix_int_t blok_mn, const void *A, const void *B, void *C, const pastix_lr_t *lowrank)
Compute the CPU gemm associated to a couple of off-diagonal blocks.
pastix_int_t N
pastix_int_t offx
pastix_atomic_lock_t * lock
pastix_complex32_t beta
pastix_int_t M
pastix_lrblock_t * C
pastix_trans_t transA
pastix_complex32_t alpha
pastix_int_t lwused
const pastix_lr_t * lowrank
pastix_int_t Cn
const pastix_lrblock_t * B
pastix_int_t K
const pastix_lrblock_t * A
pastix_int_t offy
pastix_trans_t transB
pastix_int_t lwork
pastix_complex32_t * work
pastix_int_t Cm
pastix_fixdbl_t core_clrmm(core_clrmm_t *params)
Compute the matrix matrix product when involved matrices are stored in a low-rank structure.
Definition core_clrmm.c:278
Structure to store all the parameters of the core_clrmm family functions.
Structure to define the type of function to use for the low-rank kernels and their parameters.
The block low-rank structure to hold a matrix in low-rank form.
enum pastix_trans_e pastix_trans_t
Transpostion.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
@ PastixUCoef
Definition api.h:479
@ PastixNoTrans
Definition api.h:445
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition solver.h:395
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition solver.h:329
pastix_int_t fcblknm
Definition solver.h:144
static int is_block_inside_fblock(const SolverBlok *blok, const SolverBlok *fblok)
Check if a block is included inside another one.
Definition solver.h:504
pastix_int_t frownum
Definition solver.h:147
pastix_atomic_lock_t lock
Definition solver.h:162
pastix_int_t coefind
Definition solver.h:149
SolverBlok * fblokptr
Definition solver.h:168
pastix_int_t lcblknm
Definition solver.h:143
int8_t inlast
Definition solver.h:151
pastix_int_t stride
Definition solver.h:169
int8_t cblktype
Definition solver.h:164
pastix_int_t fcolnum
Definition solver.h:166
Solver block structure.
Definition solver.h:141
Solver column block structure.
Definition solver.h:161