PaStiX Handbook 6.4.0
Loading...
Searching...
No Matches
codelet_blok_cadd.c
Go to the documentation of this file.
1/**
2 *
3 * @file codelet_blok_cadd.c
4 *
5 * StarPU codelet to sum fanin blocks together.
6 *
7 * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8 * Univ. Bordeaux. All rights reserved.
9 *
10 * @version 6.4.0
11 * @author Alycia Lisito
12 * @author Mathieu Faverge
13 * @date 2024-07-05
14 *
15 * @generated from /builds/2mk6rsew/0/solverstack/pastix/sopalin/starpu/codelet_blok_zadd.c, normal z -> c, Tue Feb 25 14:35:18 2025
16 *
17 * @addtogroup pastix_starpu
18 * @{
19 *
20 **/
21#ifndef DOXYGEN_SHOULD_SKIP_THIS
22#define _GNU_SOURCE
23#endif /* DOXYGEN_SHOULD_SKIP_THIS */
24#include "common.h"
25#include "blend/solver.h"
26#include "sopalin/sopalin_data.h"
27#include "pastix_ccores.h"
28#if defined(PASTIX_WITH_CUDA)
29#include "pastix_ccuda.h"
30#endif
31#include "pastix_starpu.h"
32#include "pastix_cstarpu.h"
33#include "codelets.h"
34
35/**
36 * @brief Main structure for all tasks of blok_cadd type
37 */
38struct cl_blok_cadd_args_s {
39 profile_data_t profile_data;
40 sopalin_data_t *sopalin_data;
42 const SolverCblk *cblk;
43 SolverCblk *fcblk;
44 pastix_int_t blok_m;
45 pastix_int_t fblok_m;
46};
47
48#if defined( PASTIX_STARPU_PROFILING )
49/**
50 * @brief Functions to profile the codelet
51 *
52 * Two levels of profiling are available:
53 * 1) A generic one that returns the flops per worker
54 * 2) A more detailed one that generate logs of the performance for each kernel
55 */
56starpu_profile_t blok_cadd_profile = {
57 .next = NULL,
58 .name = "blok_cadd"
59};
60
61/**
62 * @brief Profiling registration function
63 */
64void blok_cadd_profile_register( void ) __attribute__( ( constructor ) );
65void
66blok_cadd_profile_register( void )
67{
68 profiling_register_cl( &blok_cadd_profile );
69}
70
71#ifndef DOXYGEN_SHOULD_SKIP_THIS
72#if defined(PASTIX_STARPU_PROFILING_LOG)
73static void
74cl_profiling_cb_blok_cadd( void *callback_arg )
75{
76 cl_profiling_callback( callback_arg );
77
78 struct starpu_task *task = starpu_task_get_current();
79 struct starpu_profiling_task_info *info = task->profiling_info;
80
81 /* Quick return */
82 if ( info == NULL ) {
83 return;
84 }
85
86 struct cl_blok_cadd_args_s *args = (struct cl_blok_cadd_args_s *) callback_arg;
87 pastix_fixdbl_t flops = args->profile_data.flops;
88 pastix_fixdbl_t duration = starpu_timing_timespec_delay_us( &info->start_time, &info->end_time );
89 pastix_fixdbl_t speed = flops / ( 1000.0 * duration );
90
91 const SolverBlok *blok = args->cblk->fblokptr + args->blok_m;
92 pastix_int_t M = blok_rownbr( blok );
93 pastix_int_t N = cblk_colnbr( args->cblk );
94
95 cl_profiling_log_register( task->name, "blok_cadd", M, N, 0, flops, speed );
96}
97#endif
98
99#if defined(PASTIX_STARPU_PROFILING_LOG)
100static void (*blok_cadd_callback)(void*) = cl_profiling_cb_blok_cadd;
101#else
102static void (*blok_cadd_callback)(void*) = cl_profiling_callback;
103#endif
104#endif /* DOXYGEN_SHOULD_SKIP_THIS */
105
106#endif /* defined( PASTIX_STARPU_PROFILING ) */
107
108
109#if defined(PASTIX_STARPU_COST_PER_ARCH)
110/**
111 *******************************************************************************
112 *
113 * @brief Cost model function
114 *
115 * The user can switch from the pastix static model to an history based model
116 * computed automatically.
117 *
118 *******************************************************************************
119 *
120 * @param[in] task
121 * TODO
122 *
123 * @param[in] arch
124 * TODO
125 *
126 * @param[in] nimpl
127 * TODO
128 *
129 *******************************************************************************
130 *
131 * @retval TODO
132 *
133 *******************************************************************************/
134static inline pastix_fixdbl_t
135fct_blok_cadd_cost( struct starpu_task *task,
136 struct starpu_perfmodel_arch *arch,
137 unsigned nimpl )
138{
139 struct cl_blok_cadd_args_s *args = (struct cl_blok_cadd_args_s *)(task->cl_arg);
140
142 pastix_fixdbl_t *coefs;
143 const SolverBlok *blok = args->cblk->fblokptr + args->blok_m;
144 pastix_int_t M = blok_rownbr( blok );
145 pastix_int_t N = cblk_colnbr( args->cblk );
146
147 switch( arch->devices->type ) {
148 case STARPU_CPU_WORKER:
149 coefs = &(args->sopalin_data->cpu_models->coefficients[PastixComplex32-2][PastixKernelGEADDCblkFRFR][0]);
150 break;
151 case STARPU_CUDA_WORKER:
152 coefs = &(args->sopalin_data->gpu_models->coefficients[PastixComplex32-2][PastixKernelGEADDCblkFRFR][0]);
153 break;
154 default:
155 assert(0);
156 return 0.;
157 }
158
159 /* Get cost in us */
160 cost = modelsGetCost2Param( coefs, M, N ) * 1e6;
161
162 (void)nimpl;
163 return cost;
164}
165#endif
166
167#ifndef DOXYGEN_SHOULD_SKIP_THIS
168static struct starpu_perfmodel starpu_blok_cadd_model = {
169#if defined(PASTIX_STARPU_COST_PER_ARCH)
170 .type = STARPU_PER_ARCH,
171 .arch_cost_function = fct_blok_cadd_cost,
172#else
173 .type = STARPU_HISTORY_BASED,
174#endif
175 .symbol = "blok_cadd",
176};
177#endif /* DOXYGEN_SHOULD_SKIP_THIS */
178
179#if !defined(PASTIX_STARPU_SIMULATION)
180/**
181 *******************************************************************************
182 *
183 * @brief StarPU CPU implementation
184 *
185 *******************************************************************************
186 *
187 * @param[in] descr
188 * TODO
189 *
190 * @param[in] cl_arg
191 * TODO
192 *
193 *******************************************************************************/
194static void
195fct_blok_cadd_cpu( void *descr[], void *cl_arg )
196{
197 struct cl_blok_cadd_args_s *args = (struct cl_blok_cadd_args_s *)cl_arg;
198 const void *A;
199 void *B;
200
201 A = pastix_starpu_blok_get_ptr( descr[0] );
202 B = pastix_starpu_blok_get_ptr( descr[1] );
203
204 assert( args->cblk->cblktype & CBLK_LAYOUT_2D );
205 assert( args->fcblk->cblktype & CBLK_LAYOUT_2D );
206
207 args->profile_data.flops = cpublok_cadd( 1., args->cblk, args->fcblk, args->blok_m, args->fblok_m,
208 A, B, NULL, 0,
209 &( args->sopalin_data->solvmtx->lowrank ) );
210}
211
212#if defined(PASTIX_WITH_CUDA) && 0
213/**
214 *******************************************************************************
215 *
216 * @brief StarPU GPU implementation
217 *
218 *******************************************************************************
219 *
220 * @param[in] descr
221 * TODO
222 *
223 * @param[in] cl_arg
224 * TODO
225 *
226 *******************************************************************************/
227static void
228fct_blok_cadd_gpu( void *descr[], void *cl_arg )
229{
230 struct cl_template_args_s *args = (struct cl_template_args_s *)cl_arg;
231 const void *A;
232 void *B;
233
234 A = pastix_starpu_blok_get_ptr( descr[0] );
235 B = pastix_starpu_blok_get_ptr( descr[1] );
236
237 assert( args->cblk->cblktype & CBLK_TASKS_2D );
238 assert( args->fcblk->cblktype & CBLK_TASKS_2D );
239
240 args->profile_data.flops = gpublok_cadd( 1., args->cblk, args->fcblk, args->blok_m, args->fblok_m,
241 A, B,
242 &( args->sopalin_data->solvmtx->lowrank ),
243 starpu_cuda_get_local_stream() );
244
245}
246#endif /* defined(PASTIX_WITH_CUDA) */
247#endif /* !defined(PASTIX_STARPU_SIMULATION) */
248
249#ifndef DOXYGEN_SHOULD_SKIP_THIS
250CODELETS_CPU( blok_cadd, 2 );
251#endif /* DOXYGEN_SHOULD_SKIP_THIS */
252
253/**
254 *******************************************************************************
255 *
256 * @brief Insert the task to add a fanin cblk on the receiver side (The fanin is
257 * seen on this side as the RECV cblk). Note that the caller always execute the
258 * task.
259 *
260 *******************************************************************************
261 *
262 * @param[in] sopalin_data
263 * Solver matrix information structure that will guide the algorithm.
264 *
265 * @param[in] side
266 * Define which side of the cblk must be tested.
267 * @arg PastixLCoef if lower part only
268 * @arg PastixUCoef if upper part only
269 * @arg PastixLUCoef if both sides.
270 *
271 * @param[in] cblk
272 * The column block of the matrix.
273 *
274 * @param[in] blok
275 * The block of the matrix.
276 *
277 * @param[in] fcblk
278 * The facing column block of the matrix.
279 *
280 * @param[in] fblok
281 * The facing block of the matrix.
282 *
283 * @param[in] prio
284 * The task priority.
285 *
286 *******************************************************************************/
287void
288starpu_task_blok_cadd_recv( sopalin_data_t *sopalin_data,
290 const SolverCblk *cblk,
291 const SolverBlok *blok,
292 SolverCblk *fcblk,
293 SolverBlok *fblok,
294 int prio )
295{
296 struct cl_blok_cadd_args_s *cl_arg = NULL;
297#if defined(PASTIX_DEBUG_STARPU)
298 char *task_name;
299#endif
300
301 assert( blok->fcblknm == fblok->fcblknm );
302
303 /*
304 * Create the arguments array
305 */
306 cl_arg = malloc( sizeof( struct cl_blok_cadd_args_s) );
307 cl_arg->sopalin_data = sopalin_data;
308#if defined(PASTIX_STARPU_PROFILING)
309 cl_arg->profile_data.measures = blok_cadd_profile.measures;
310 cl_arg->profile_data.flops = NAN;
311#endif
312 cl_arg->side = side;
313 cl_arg->cblk = cblk;
314 cl_arg->fcblk = fcblk;
315 cl_arg->blok_m = blok - cblk->fblokptr;
316 cl_arg->fblok_m = fblok - fcblk->fblokptr;
317
318#if defined(PASTIX_DEBUG_STARPU)
319 /* This actually generates a memory leak */
320 asprintf( &task_name, "%s( %ld )",
321 cl_blok_cadd_cpu.name,
322 (long)(cblk - sopalin_data->solvmtx->cblktab) );
323#endif
324
325 assert( cblk->cblktype & CBLK_RECV );
326 assert( !(fcblk->cblktype & (CBLK_RECV|CBLK_FANIN)) );
327
328 pastix_starpu_insert_task(
329 &cl_blok_cadd_cpu,
330 STARPU_CL_ARGS, cl_arg, sizeof( struct cl_blok_cadd_args_s ),
331 STARPU_EXECUTE_ON_NODE, fcblk->ownerid,
332#if defined(PASTIX_STARPU_PROFILING)
333 STARPU_CALLBACK_WITH_ARG_NFREE, blok_cadd_callback, cl_arg,
334#endif
335 STARPU_R, blok->handler[side],
336 STARPU_RW, fblok->handler[side],
337#if defined(PASTIX_DEBUG_STARPU)
338 STARPU_NAME, task_name,
339#endif
340#if defined(PASTIX_STARPU_HETEROPRIO)
341 STARPU_PRIORITY, BucketFacto1D,
342#else
343 STARPU_PRIORITY, prio,
344#endif
345 0);
346
347 (void)prio;
348}
349
350/**
351 *******************************************************************************
352 *
353 * @brief Insert the task to add a fanin cblk on the emitter side. Note that
354 * this task is submitted only to emit a send to the owner of the associated
355 * recv cblk that will perform the add. Thus, the task is always submitted but
356 * never executed.
357 *
358 *******************************************************************************
359 *
360 * @param[in] sopalin_data
361 * Solver matrix information structure that will guide the algorithm.
362 *
363 * @param[in] side
364 * Define which side of the cblk must be tested.
365 * @arg PastixLCoef if lower part only
366 * @arg PastixUCoef if upper part only
367 * @arg PastixLUCoef if both sides.
368 *
369 * @param[in] cblk
370 * The column block of the matrix.
371 *
372 * @param[in] blok
373 * The block of the matrix.
374 *
375 * @param[in] prio
376 * The task priority.
377 *
378 *******************************************************************************/
379void
380starpu_task_blok_cadd_fanin( sopalin_data_t *sopalin_data,
382 const SolverCblk *cblk,
383 const SolverBlok *blok,
384 int prio )
385{
386 assert( cblk->cblktype & CBLK_FANIN );
387
388 pastix_starpu_insert_task(
389 NULL,
390 STARPU_EXECUTE_ON_NODE, cblk->ownerid,
391 STARPU_R, blok->handler[side],
392#if defined(PASTIX_STARPU_HETEROPRIO)
393 STARPU_PRIORITY, BucketFacto1D,
394#else
395 STARPU_PRIORITY, prio,
396#endif
397 0);
398
399 (void)sopalin_data;
400 (void)prio;
401}
402
403/**
404 * @}
405 */
BEGIN_C_DECLS typedef int pastix_int_t
Definition datatypes.h:51
double pastix_fixdbl_t
Definition datatypes.h:65
@ PastixKernelGEADDCblkFRFR
pastix_fixdbl_t cpublok_cadd(pastix_complex32_t alpha, const SolverCblk *cblkA, SolverCblk *cblkB, pastix_int_t blokA_m, pastix_int_t blokB_m, const void *A, void *B, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add two bloks.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
void starpu_task_blok_cadd_recv(sopalin_data_t *sopalin_data, pastix_coefside_t side, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, SolverBlok *fblok, int prio)
Insert the task to add a fanin cblk on the receiver side (The fanin is seen on this side as the RECV ...
void starpu_task_blok_cadd_fanin(sopalin_data_t *sopalin_data, pastix_coefside_t side, const SolverCblk *cblk, const SolverBlok *blok, int prio)
Insert the task to add a fanin cblk on the emitter side. Note that this task is submitted only to emi...
static void fct_blok_cadd_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition solver.h:395
void * handler[2]
Definition solver.h:142
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition solver.h:329
pastix_int_t fcblknm
Definition solver.h:144
SolverBlok * fblokptr
Definition solver.h:168
int8_t cblktype
Definition solver.h:164
Solver block structure.
Definition solver.h:141
Solver column block structure.
Definition solver.h:161