PaStiX Handbook 6.4.0
Loading...
Searching...
No Matches
codelet_blok_cgemmsp.c
Go to the documentation of this file.
1/**
2 *
3 * @file codelet_blok_cgemmsp.c
4 *
5 * StarPU codelets for blas-like functions
6 *
7 * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8 * Univ. Bordeaux. All rights reserved.
9 *
10 * @version 6.4.0
11 * @author Mathieu Faverge
12 * @author Pierre Ramet
13 * @author Ian Masliah
14 * @author Tom Moenne-Loccoz
15 * @date 2024-07-05
16 *
17 * @generated from /builds/2mk6rsew/0/solverstack/pastix/sopalin/starpu/codelet_blok_zgemmsp.c, normal z -> c, Tue Feb 25 14:35:18 2025
18 *
19 * @addtogroup pastix_starpu
20 * @{
21 *
22 **/
23#ifndef DOXYGEN_SHOULD_SKIP_THIS
24#define _GNU_SOURCE
25#endif /* DOXYGEN_SHOULD_SKIP_THIS */
26#include "common.h"
27#include "blend/solver.h"
28#include "sopalin/sopalin_data.h"
29#include "pastix_ccores.h"
30#if defined(PASTIX_WITH_CUDA)
31#include "pastix_ccuda.h"
32#endif
33#include "pastix_starpu.h"
34#include "pastix_cstarpu.h"
35#include "codelets.h"
36
37/**
38 * @brief Main structure for all tasks of blok_cgemmsp type
39 */
40struct cl_blok_cgemmsp_args_s {
41 profile_data_t profile_data;
42 sopalin_data_t *sopalin_data;
43 pastix_trans_t trans;
44 const SolverCblk *cblk;
45 SolverCblk *fcblk;
46 pastix_int_t blok_mk;
47 pastix_int_t blok_nk;
48 pastix_int_t blok_mn;
49};
50
51#if defined( PASTIX_STARPU_PROFILING )
52/**
53 * @brief Functions to profile the codelet
54 *
55 * Two levels of profiling are available:
56 * 1) A generic one that returns the flops per worker
57 * 2) A more detailed one that generate logs of the performance for each kernel
58 */
59starpu_profile_t blok_cgemmsp_profile = {
60 .next = NULL,
61 .name = "blok_cgemmsp"
62};
63
64/**
65 * @brief Profiling registration function
66 */
67void blok_cgemmsp_profile_register( void ) __attribute__( ( constructor ) );
68void
69blok_cgemmsp_profile_register( void )
70{
71 profiling_register_cl( &blok_cgemmsp_profile );
72}
73
74#ifndef DOXYGEN_SHOULD_SKIP_THIS
75#if defined(PASTIX_STARPU_PROFILING_LOG)
76static void
77cl_profiling_cb_blok_cgemmsp( void *callback_arg )
78{
79 cl_profiling_callback( callback_arg );
80
81 struct starpu_task *task = starpu_task_get_current();
82 struct starpu_profiling_task_info *info = task->profiling_info;
83
84 /* Quick return */
85 if ( info == NULL ) {
86 return;
87 }
88
89 struct cl_blok_cgemmsp_args_s *args = (struct cl_blok_cgemmsp_args_s *) callback_arg;
90 pastix_fixdbl_t flops = args->profile_data.flops;
91 pastix_fixdbl_t duration = starpu_timing_timespec_delay_us( &info->start_time, &info->end_time );
92 pastix_fixdbl_t speed = flops / ( 1000.0 * duration );
93
94 pastix_int_t M = blok_rownbr_ext( args->cblk->fblokptr + args->blok_mk );
95 pastix_int_t N = blok_rownbr_ext( args->cblk->fblokptr + args->blok_nk );
96 pastix_int_t K = cblk_colnbr( args->cblk );
97
98 cl_profiling_log_register( task->name, "blok_cgemmsp", M, N, K, flops, speed );
99}
100#endif
101
102#if defined(PASTIX_STARPU_PROFILING_LOG)
103static void (*blok_cgemmsp_callback)(void*) = cl_profiling_cb_blok_cgemmsp;
104#else
105static void (*blok_cgemmsp_callback)(void*) = cl_profiling_callback;
106#endif
107#endif /* DOXYGEN_SHOULD_SKIP_THIS */
108
109#endif /* defined( PASTIX_STARPU_PROFILING ) */
110
111#if defined(PASTIX_STARPU_COST_PER_ARCH)
112/**
113 *******************************************************************************
114 *
115 * @brief Cost model function
116 *
117 * The user can switch from the pastix static model to an history based model
118 * computed automatically.
119 *
120 *******************************************************************************
121 *
122 * @param[in] task
123 * TODO
124 *
125 * @param[in] arch
126 * TODO
127 *
128 * @param[in] nimpl
129 * TODO
130 *
131 *******************************************************************************
132 *
133 * @retval TODO
134 *
135 *******************************************************************************/
136static inline pastix_fixdbl_t
137fct_blok_cgemmsp_cost( struct starpu_task *task,
138 struct starpu_perfmodel_arch *arch,
139 unsigned nimpl )
140{
141 struct cl_blok_cgemmsp_args_s *args = (struct cl_blok_cgemmsp_args_s *)(task->cl_arg);
142
144 pastix_fixdbl_t *coefs;
145 pastix_int_t M = blok_rownbr_ext( args->cblk->fblokptr + args->blok_mk );
146 pastix_int_t N = blok_rownbr_ext( args->cblk->fblokptr + args->blok_nk );
147 pastix_int_t K = cblk_colnbr( args->cblk );
148
149 switch( arch->devices->type ) {
150 case STARPU_CPU_WORKER:
151 coefs = &(args->sopalin_data->cpu_models->coefficients[PastixComplex32-2][PastixKernelGEMMBlok2d2d][0]);
152 break;
153 case STARPU_CUDA_WORKER:
154 coefs = &(args->sopalin_data->gpu_models->coefficients[PastixComplex32-2][PastixKernelGEMMBlok2d2d][0]);
155 break;
156 default:
157 assert(0);
158 return 0.;
159 }
160
161 /* Get cost in us */
162 cost = modelsGetCost3Param( coefs, M, N, K ) * 1e6;
163
164 (void)nimpl;
165 return cost;
166}
167#endif
168
169#ifndef DOXYGEN_SHOULD_SKIP_THIS
170static struct starpu_perfmodel starpu_blok_cgemmsp_model = {
171#if defined(PASTIX_STARPU_COST_PER_ARCH)
172 .type = STARPU_PER_ARCH,
173 .arch_cost_function = fct_blok_cgemmsp_cost,
174#else
175 .type = STARPU_HISTORY_BASED,
176#endif
177 .symbol = "blok_cgemmsp",
178};
179#endif /* DOXYGEN_SHOULD_SKIP_THIS */
180
181#if !defined(PASTIX_STARPU_SIMULATION)
182/**
183 *******************************************************************************
184 *
185 * @brief StarPU CPU implementation
186 *
187 *******************************************************************************
188 *
189 * @param[in] descr
190 * TODO
191 *
192 * @param[in] cl_arg
193 * TODO
194 *
195 *******************************************************************************/
196static void
197fct_blok_cgemmsp_cpu( void *descr[], void *cl_arg )
198{
199 struct cl_blok_cgemmsp_args_s *args = (struct cl_blok_cgemmsp_args_s *)cl_arg;
200 const void *A;
201 const void *B;
202 void *C;
203
204 A = pastix_starpu_blok_get_ptr( descr[0] );
205 B = pastix_starpu_blok_get_ptr( descr[1] );
206 C = pastix_starpu_blok_get_ptr( descr[2] );
207
208 assert( args->cblk->cblktype & CBLK_TASKS_2D );
209 assert( args->fcblk->cblktype & CBLK_TASKS_2D );
210
211 args->profile_data.flops = cpublok_cgemmsp( args->trans,
212 args->cblk, args->fcblk,
213 args->blok_mk, args->blok_nk, args->blok_mn,
214 A, B, C,
215 &(args->sopalin_data->solvmtx->lowrank) );
216}
217
218/**
219 * @brief StarPU GPU implementation
220 */
221#if defined(PASTIX_WITH_CUDA)
222static void
223fct_blok_cgemmsp_gpu( void *descr[], void *cl_arg )
224{
225 struct cl_blok_cgemmsp_args_s *args = (struct cl_blok_cgemmsp_args_s *)cl_arg;
226 const void *A;
227 const void *B;
228 void *C;
229
230 A = pastix_starpu_blok_get_ptr( descr[0] );
231 B = pastix_starpu_blok_get_ptr( descr[1] );
232 C = pastix_starpu_blok_get_ptr( descr[2] );
233
234 assert( args->cblk->cblktype & CBLK_TASKS_2D );
235 assert( args->fcblk->cblktype & CBLK_TASKS_2D );
236
237 args->profile_data.flops = gpublok_cgemmsp( args->trans,
238 args->cblk, args->fcblk,
239 args->blok_mk, args->blok_nk, args->blok_mn,
240 A, B, C,
241 &(args->sopalin_data->solvmtx->lowrank),
242 starpu_cuda_get_local_stream() );
243}
244#endif /* defined(PASTIX_WITH_CUDA) */
245#endif /* !defined(PASTIX_STARPU_SIMULATION) */
246
247#ifndef DOXYGEN_SHOULD_SKIP_THIS
/*
 * Instantiate the codelet of the blok_cgemmsp kernel.
 * NOTE(review): presumably this macro (from codelets.h) defines the
 * cl_blok_cgemmsp_any object used by the submission function, with 3 data
 * buffers (A, B, C) and asynchronous CUDA execution - confirm against the
 * macro definition.
 */
248CODELETS_GPU( blok_cgemmsp, 3, STARPU_CUDA_ASYNC );
249#endif /* DOXYGEN_SHOULD_SKIP_THIS */
250
251/**
252 *******************************************************************************
253 *
254 * @brief TODO
255 *
256 *******************************************************************************
257 *
258 * @param[in] sopalin_data
259 * TODO
260 *
261 * @param[in] sideA
262 * TODO
263 *
264 * @param[in] sideB
265 * TODO
266 *
267 * @param[in] trans
268 * TODO
269 *
270 * @param[in] cblk
271 * TODO
272 *
273 * @param[in] fcblk
274 * TODO
275 *
276 * @param[in] blokA
277 * TODO
278 *
279 * @param[in] blokB
280 * TODO
281 *
282 * @param[in] prio
283 * TODO
284 *
285 *******************************************************************************/
286void
287starpu_task_blok_cgemmsp( sopalin_data_t *sopalin_data,
288 pastix_coefside_t sideA,
289 pastix_coefside_t sideB,
290 pastix_trans_t trans,
291 SolverCblk *cblk,
292 SolverCblk *fcblk,
293 const SolverBlok *blokA,
294 const SolverBlok *blokB,
295 int prio )
296{
297 struct cl_blok_cgemmsp_args_s *cl_arg = NULL;
298 long long execute_where = cl_blok_cgemmsp_any.where;
299 int need_exec = 1;
300#if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
301 char *task_name;
302#endif
303
304 pastix_int_t frownum;
305 pastix_int_t lrownum;
306 pastix_int_t blok_mn = 0, j = 0;
307 pastix_int_t blok_mk = blokA - cblk->fblokptr;
308 pastix_int_t blok_nk = blokB - cblk->fblokptr;
309 SolverBlok *blokC = fcblk->fblokptr;
310
311 assert( blok_nk <= blok_mk );
312
313 do {
314 frownum = blokC->frownum;
315 lrownum = blokC->lrownum;
316 blok_mn += j;
317 j = 1;
318
319 /* Increase lrownum as long as blocks are facing the same cblk */
320 while( (blokC < fcblk[1].fblokptr-1) &&
321 (blokC[0].fcblknm == blokC[1].fcblknm) &&
322 (blokC[0].lcblknm == blokC[1].lcblknm) )
323 {
324 blokC++; j++;
325 lrownum = blokC->lrownum;
326 }
327 blokC++;
328 }
329 while( !((blokA->frownum >= frownum) &&
330 (blokA->lrownum <= lrownum)) );
331
332 blokC = fcblk->fblokptr + blok_mn;
333
334 assert( blokA->lcblknm == blokB->lcblknm );
335 assert( blokB->fcblknm == blokC->lcblknm );
336 assert( blokC->frownum <= blokA->frownum );
337 assert( blokA[-1].fcblknm != blokA[0].fcblknm );
338 assert( blokB[-1].fcblknm != blokB[0].fcblknm );
339 assert( (blok_mn == 0) || (blokC[-1].fcblknm != blokC[0].fcblknm) );
340
341 /*
342 * Check if it needs to be submitted
343 */
344#if defined(PASTIX_WITH_MPI)
345 {
346 int need_submit = 0;
347 if ( cblk->ownerid == sopalin_data->solvmtx->clustnum ) {
348 need_submit = 1;
349 }
350 if ( (fcblk->cblktype & CBLK_FANIN) ||
351 (fcblk->ownerid == sopalin_data->solvmtx->clustnum) ) {
352 need_submit = 1;
353 }
354 else {
355 need_exec = 0;
356 }
357 if ( starpu_mpi_cached_receive( blokC->handler[sideA] ) ) {
358 need_submit = 1;
359 }
360 if ( !need_submit ) {
361 return;
362 }
363 }
364#endif
365
366 /*
367 * Create the arguments array
368 */
369 if ( need_exec ) {
370 cl_arg = malloc( sizeof( struct cl_blok_cgemmsp_args_s ) );
371 cl_arg->sopalin_data = sopalin_data;
372#if defined(PASTIX_STARPU_PROFILING)
373 cl_arg->profile_data.measures = blok_cgemmsp_profile.measures;
374 cl_arg->profile_data.flops = NAN;
375#endif
376 cl_arg->trans = trans;
377 cl_arg->cblk = cblk;
378 cl_arg->fcblk = fcblk;
379 cl_arg->blok_mk = blok_mk;
380 cl_arg->blok_nk = blok_nk;
381 cl_arg->blok_mn = blok_mn;
382
383#if defined(PASTIX_WITH_CUDA)
384 if ( (cblk->cblktype & CBLK_COMPRESSED) ||
385 (fcblk->cblktype & CBLK_COMPRESSED) )
386 {
387 /* Disable CUDA */
388 execute_where &= (~STARPU_CUDA);
389 }
390#endif
391 }
392
393#if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
394 /* This actually generates a memory leak */
395 asprintf( &task_name, "%s( %ld, %ld, %ld, %ld )",
396 cl_blok_cgemmsp_any.name,
397 (long)(blokA - sopalin_data->solvmtx->bloktab),
398 (long)(blokB - sopalin_data->solvmtx->bloktab),
399 (long)(blokC - sopalin_data->solvmtx->bloktab),
400 (long)sideA );
401#endif
402
403 pastix_starpu_insert_task(
404 &cl_blok_cgemmsp_any,
405 STARPU_CL_ARGS, cl_arg, sizeof( struct cl_blok_cgemmsp_args_s ),
406 STARPU_EXECUTE_WHERE, execute_where,
407#if defined(PASTIX_STARPU_PROFILING)
408 STARPU_CALLBACK_WITH_ARG_NFREE, blok_cgemmsp_callback, cl_arg,
409#endif
410 STARPU_R, blokA->handler[sideA],
411 STARPU_R, blokB->handler[sideB],
412 STARPU_RW, blokC->handler[sideA],
413#if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
414 STARPU_NAME, task_name,
415#endif
416#if defined(PASTIX_STARPU_HETEROPRIO)
417 STARPU_PRIORITY, BucketGEMM2D,
418#else
419 STARPU_PRIORITY, prio,
420#endif
421 0);
422 (void)prio;
423}
424
425/**
426 * @}
427 */
BEGIN_C_DECLS typedef int pastix_int_t
Definition datatypes.h:51
double pastix_fixdbl_t
Definition datatypes.h:65
@ PastixKernelGEMMBlok2d2d
pastix_fixdbl_t cpublok_cgemmsp(pastix_trans_t transB, const SolverCblk *cblk, SolverCblk *fcblk, pastix_int_t blok_mk, pastix_int_t blok_nk, pastix_int_t blok_mn, const void *A, const void *B, void *C, const pastix_lr_t *lowrank)
Compute the CPU gemm associated to a couple of off-diagonal blocks.
enum pastix_trans_e pastix_trans_t
Transposition.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
static void fct_blok_cgemmsp_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
void starpu_task_blok_cgemmsp(sopalin_data_t *sopalin_data, pastix_coefside_t sideA, pastix_coefside_t sideB, pastix_trans_t trans, SolverCblk *cblk, SolverCblk *fcblk, const SolverBlok *blokA, const SolverBlok *blokB, int prio)
StarPU GPU implementation.
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t blok_rownbr_ext(const SolverBlok *blok)
Compute the number of rows of a contiguous block in front of the same cblk.
Definition solver.h:407
pastix_int_t lrownum
Definition solver.h:148
void * handler[2]
Definition solver.h:142
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition solver.h:329
pastix_int_t fcblknm
Definition solver.h:144
pastix_int_t frownum
Definition solver.h:147
SolverBlok * fblokptr
Definition solver.h:168
pastix_int_t lcblknm
Definition solver.h:143
int8_t cblktype
Definition solver.h:164
Solver block structure.
Definition solver.h:141
Solver column block structure.
Definition solver.h:161