PaStiX Handbook  6.3.2
codelet_blok_dadd.c
Go to the documentation of this file.
1 /**
2  *
3  * @file codelet_blok_dadd.c
4  *
5  * StarPU codelet to sum fanin blocks together.
6  *
7  * @copyright 2016-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.3.1
11  * @author Alycia Lisito
12  * @author Mathieu Faverge
13  * @date 2023-12-01
14  *
15  * @generated from /builds/solverstack/pastix/sopalin/starpu/codelet_blok_zadd.c, normal z -> d, Wed Dec 13 12:09:26 2023
16  *
17  * @addtogroup pastix_starpu
18  * @{
19  *
20  **/
21 #ifndef DOXYGEN_SHOULD_SKIP_THIS
22 #define _GNU_SOURCE
23 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
24 #include "common.h"
25 #include "blend/solver.h"
26 #include "sopalin/sopalin_data.h"
27 #include "pastix_dcores.h"
28 #if defined(PASTIX_WITH_CUDA)
29 #include "pastix_dcuda.h"
30 #endif
31 #include "pastix_starpu.h"
32 #include "pastix_dstarpu.h"
33 #include "codelets.h"
34 
35 /**
36  * @brief Main structure for all tasks of blok_dadd type
37  */
38 struct cl_blok_dadd_args_s {
39  profile_data_t profile_data;
40  sopalin_data_t *sopalin_data;
41  pastix_coefside_t side;
42  const SolverCblk *cblk;
43  SolverCblk *fcblk;
44  pastix_int_t blok_m;
45  pastix_int_t fblok_m;
46 };
47 
48 #if defined( PASTIX_STARPU_PROFILING )
49 /**
50  * @brief Functions to profile the codelet
51  *
52  * Two levels of profiling are available:
53  * 1) A generic one that returns the flops per worker
54  * 2) A more detailed one that generate logs of the performance for each kernel
55  */
56 starpu_profile_t blok_dadd_profile = {
57  .next = NULL,
58  .name = "blok_dadd"
59 };
60 
61 /**
62  * @brief Profiling registration function
63  */
64 void blok_dadd_profile_register( void ) __attribute__( ( constructor ) );
65 void
66 blok_dadd_profile_register( void )
67 {
68  profiling_register_cl( &blok_dadd_profile );
69 }
70 
71 #ifndef DOXYGEN_SHOULD_SKIP_THIS
#if defined(PASTIX_STARPU_PROFILING_LOG)
/**
 * @brief Task callback that logs the performance of one blok_dadd kernel
 *
 * The generic profiling callback is always called first. The detailed log
 * entry is only added when StarPU attached profiling information to the
 * current task.
 *
 * @param[in] callback_arg
 *          The struct cl_blok_dadd_args_s of the task.
 */
static void
cl_profiling_cb_blok_dadd( void *callback_arg )
{
    struct cl_blok_dadd_args_s        *cb_args = (struct cl_blok_dadd_args_s *) callback_arg;
    struct starpu_task                *current;
    struct starpu_profiling_task_info *pinfo;

    cl_profiling_callback( callback_arg );

    current = starpu_task_get_current();
    pinfo   = current->profiling_info;

    /* Nothing more to log without the StarPU timings */
    if ( pinfo != NULL ) {
        pastix_fixdbl_t   nflops  = cb_args->profile_data.flops;
        pastix_fixdbl_t   time_us = starpu_timing_timespec_delay_us( &pinfo->start_time,
                                                                     &pinfo->end_time );
        /* flops / ( 1000 * microseconds ) = flop per nanosecond, i.e. GFlop/s */
        pastix_fixdbl_t   gflops  = nflops / ( 1000.0 * time_us );
        const SolverBlok *src     = cb_args->cblk->fblokptr + cb_args->blok_m;
        pastix_int_t      nrows   = blok_rownbr( src );
        pastix_int_t      ncols   = cblk_colnbr( cb_args->cblk );

        cl_profiling_log_register( current->name, "blok_dadd", nrows, ncols, 0, nflops, gflops );
    }
}
#endif
98 
99 #if defined(PASTIX_STARPU_PROFILING_LOG)
100 static void (*blok_dadd_callback)(void*) = cl_profiling_cb_blok_dadd;
101 #else
102 static void (*blok_dadd_callback)(void*) = cl_profiling_callback;
103 #endif
104 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
105 
106 #endif /* defined( PASTIX_STARPU_PROFILING ) */
107 
108 
109 /**
110  *******************************************************************************
111  *
112  * @brief Cost model function
113  *
114  * The user can switch from the pastix static model to an history based model
115  * computed automatically.
116  *
117  *******************************************************************************
118  *
119  * @param[in] task
120  * TODO
121  *
122  * @param[in] arch
123  * TODO
124  *
125  * @param[in] nimpl
126  * TODO
127  *
128  *******************************************************************************
129  *
130  * @retval TODO
131  *
132  *******************************************************************************/
133 static inline pastix_fixdbl_t
134 fct_blok_dadd_cost( struct starpu_task *task,
135  struct starpu_perfmodel_arch *arch,
136  unsigned nimpl )
137 {
138  struct cl_blok_dadd_args_s *args = (struct cl_blok_dadd_args_s *)(task->cl_arg);
139 
140  pastix_fixdbl_t cost = 0.;
141  pastix_fixdbl_t *coefs;
142  const SolverBlok *blok = args->cblk->fblokptr + args->blok_m;
143  pastix_int_t M = blok_rownbr( blok );
144  pastix_int_t N = cblk_colnbr( args->cblk );
145 
146  switch( arch->devices->type ) {
147  case STARPU_CPU_WORKER:
148  coefs = &(args->sopalin_data->cpu_models->coefficients[PastixDouble-2][PastixKernelGEADDCblkFRFR][0]);
149  break;
150  case STARPU_CUDA_WORKER:
151  coefs = &(args->sopalin_data->gpu_models->coefficients[PastixDouble-2][PastixKernelGEADDCblkFRFR][0]);
152  break;
153  default:
154  assert(0);
155  return 0.;
156  }
157 
158  /* Get cost in us */
159  cost = modelsGetCost2Param( coefs, M, N ) * 1e6;
160 
161  (void)nimpl;
162  return cost;
163 }
164 
165 #ifndef DOXYGEN_SHOULD_SKIP_THIS
166 static struct starpu_perfmodel starpu_blok_dadd_model = {
167 #if defined( PASTIX_STARPU_COST_PER_ARCH )
168  .type = STARPU_PER_ARCH,
169  .arch_cost_function = fct_blok_dadd_cost,
170 #else
171  .type = STARPU_HISTORY_BASED,
172 #endif
173  .symbol = "blok_dadd",
174 };
175 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
176 
177 #if !defined(PASTIX_STARPU_SIMULATION)
178 /**
179  *******************************************************************************
180  *
181  * @brief StarPU CPU implementation
182  *
183  *******************************************************************************
184  *
185  * @param[in] descr
186  * TODO
187  *
188  * @param[in] cl_arg
189  * TODO
190  *
191  *******************************************************************************/
192 static void
193 fct_blok_dadd_cpu( void *descr[], void *cl_arg )
194 {
195  struct cl_blok_dadd_args_s *args = (struct cl_blok_dadd_args_s *)cl_arg;
196  const void *A;
197  void *B;
198 
199  A = pastix_starpu_blok_get_ptr( descr[0] );
200  B = pastix_starpu_blok_get_ptr( descr[1] );
201 
202  assert( args->cblk->cblktype & CBLK_LAYOUT_2D );
203  assert( args->fcblk->cblktype & CBLK_LAYOUT_2D );
204 
205  args->profile_data.flops = cpublok_dadd( 1., args->cblk, args->fcblk, args->blok_m, args->fblok_m,
206  A, B, NULL, 0,
207  &( args->sopalin_data->solvmtx->lowrank ) );
208 }
209 
#if defined(PASTIX_WITH_CUDA) && 0
/**
 *******************************************************************************
 *
 * @brief StarPU GPU implementation
 *
 * This implementation is currently disabled by the "&& 0" of the enclosing
 * preprocessor condition, and the codelet is declared as CPU only.
 *
 *******************************************************************************
 *
 * @param[in] descr
 *          The StarPU buffers of the task: descr[0] is used as the source
 *          block (A) and descr[1] as the updated facing block (B).
 *
 * @param[inout] cl_arg
 *          The struct cl_blok_dadd_args_s of the task. Its profile_data.flops
 *          field is updated.
 *
 *******************************************************************************/
static void
fct_blok_dadd_gpu( void *descr[], void *cl_arg )
{
    /* The arguments of this codelet are always a struct cl_blok_dadd_args_s */
    struct cl_blok_dadd_args_s *args = (struct cl_blok_dadd_args_s *)cl_arg;
    const void                 *A;
    void                       *B;

    A = pastix_starpu_blok_get_ptr( descr[0] );
    B = pastix_starpu_blok_get_ptr( descr[1] );

    assert( args->cblk->cblktype  & CBLK_TASKS_2D );
    assert( args->fcblk->cblktype & CBLK_TASKS_2D );

    args->profile_data.flops = gpublok_dadd( 1., args->cblk, args->fcblk, args->blok_m, args->fblok_m,
                                             A, B,
                                             &( args->sopalin_data->solvmtx->lowrank ),
                                             starpu_cuda_get_local_stream() );

}
#endif /* defined(PASTIX_WITH_CUDA) && 0 */
245 #endif /* !defined(PASTIX_STARPU_SIMULATION) */
246 
247 #ifndef DOXYGEN_SHOULD_SKIP_THIS
248 CODELETS_CPU( blok_dadd, 2 );
249 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
250 
251 /**
252  *******************************************************************************
253  *
254  * @brief Insert the task to add a fanin cblk on the receiver side (The fanin is
255  * seen on this side as the RECV cblk). Note that the caller always execute the
256  * task.
257  *
258  *******************************************************************************
259  *
260  * @param[in] sopalin_data
261  * Solver matrix information structure that will guide the algorithm.
262  *
263  * @param[in] side
264  * Define which side of the cblk must be tested.
265  * @arg PastixLCoef if lower part only
266  * @arg PastixUCoef if upper part only
267  * @arg PastixLUCoef if both sides.
268  *
269  * @param[in] cblk
270  * The column block of the matrix.
271  *
272  * @param[in] blok
273  * The block of the matrix.
274  *
275  * @param[in] fcblk
276  * The facing column block of the matrix.
277  *
278  * @param[in] fblok
279  * The facing block of the matrix.
280  *
281  * @param[in] prio
282  * The task priority.
283  *
284  *******************************************************************************/
285 void
286 starpu_task_blok_dadd_recv( sopalin_data_t *sopalin_data,
287  pastix_coefside_t side,
288  const SolverCblk *cblk,
289  const SolverBlok *blok,
290  SolverCblk *fcblk,
291  SolverBlok *fblok,
292  int prio )
293 {
294  struct cl_blok_dadd_args_s *cl_arg = NULL;
295 #if defined(PASTIX_DEBUG_STARPU)
296  char *task_name;
297 #endif
298 
299  assert( blok->fcblknm == fblok->fcblknm );
300 
301 #if !defined(HAVE_STARPU_DATA_PARTITION_CLEAN_NODE)
302  /* Mark the facing cblk as partitionned */
303  fcblk->partitioned |= (side + 1);
304 #endif
305 
306  /*
307  * Create the arguments array
308  */
309  cl_arg = malloc( sizeof( struct cl_blok_dadd_args_s) );
310  cl_arg->sopalin_data = sopalin_data;
311 #if defined(PASTIX_STARPU_PROFILING)
312  cl_arg->profile_data.measures = blok_dadd_profile.measures;
313  cl_arg->profile_data.flops = NAN;
314 #endif
315  cl_arg->side = side;
316  cl_arg->cblk = cblk;
317  cl_arg->fcblk = fcblk;
318  cl_arg->blok_m = blok - cblk->fblokptr;
319  cl_arg->fblok_m = fblok - fcblk->fblokptr;
320 
321 #if defined(PASTIX_DEBUG_STARPU)
322  /* This actually generates a memory leak */
323  asprintf( &task_name, "%s( %ld )",
324  cl_blok_dadd_cpu.name,
325  (long)(cblk - sopalin_data->solvmtx->cblktab) );
326 #endif
327 
328  assert( cblk->cblktype & CBLK_RECV );
329  assert( !(fcblk->cblktype & (CBLK_RECV|CBLK_FANIN)) );
330 
331  pastix_starpu_insert_task(
332  &cl_blok_dadd_cpu,
333  STARPU_CL_ARGS, cl_arg, sizeof( struct cl_blok_dadd_args_s ),
334  STARPU_EXECUTE_ON_NODE, fcblk->ownerid,
335 #if defined(PASTIX_STARPU_PROFILING)
336  STARPU_CALLBACK_WITH_ARG_NFREE, blok_dadd_callback, cl_arg,
337 #endif
338  STARPU_R, blok->handler[side],
339  STARPU_RW, fblok->handler[side],
340 #if defined(PASTIX_DEBUG_STARPU)
341  STARPU_NAME, task_name,
342 #endif
343 #if defined(PASTIX_STARPU_HETEROPRIO)
344  STARPU_PRIORITY, BucketFacto1D,
345 #else
346  STARPU_PRIORITY, prio,
347 #endif
348  0);
349 
350  (void)prio;
351 }
352 
353 /**
354  *******************************************************************************
355  *
356  * @brief Insert the task to add a fanin cblk on the emitter side. Note that
357  * this task is submitted only to emit a send to the owner of the associated
358  * recv cblk that will perform the add. Thus, the task is always submitted but
359  * never executed.
360  *
361  *******************************************************************************
362  *
363  * @param[in] sopalin_data
364  * Solver matrix information structure that will guide the algorithm.
365  *
366  * @param[in] side
367  * Define which side of the cblk must be tested.
368  * @arg PastixLCoef if lower part only
369  * @arg PastixUCoef if upper part only
370  * @arg PastixLUCoef if both sides.
371  *
372  * @param[in] cblk
373  * The column block of the matrix.
374  *
375  * @param[in] blok
376  * The block of the matrix.
377  *
378  * @param[in] prio
379  * The task priority.
380  *
381  *******************************************************************************/
382 void
383 starpu_task_blok_dadd_fanin( sopalin_data_t *sopalin_data,
384  pastix_coefside_t side,
385  const SolverCblk *cblk,
386  const SolverBlok *blok,
387  int prio )
388 {
389  assert( cblk->cblktype & CBLK_FANIN );
390 
391  pastix_starpu_insert_task(
392  NULL,
393  STARPU_EXECUTE_ON_NODE, cblk->ownerid,
394  STARPU_R, blok->handler[side],
395 #if defined(PASTIX_STARPU_HETEROPRIO)
396  STARPU_PRIORITY, BucketFacto1D,
397 #else
398  STARPU_PRIORITY, prio,
399 #endif
400  0);
401 
402  (void)prio;
403 }
404 
405 /**
406  * @}
407  */
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
@ PastixKernelGEADDCblkFRFR
Definition: kernels_enums.h:69
pastix_fixdbl_t cpublok_dadd(double alpha, const SolverCblk *cblkA, SolverCblk *cblkB, pastix_int_t blokA_m, pastix_int_t blokB_m, const void *A, void *B, double *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add two bloks.
Definition: cpublok_dadd.c:431
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
static pastix_fixdbl_t fct_blok_dadd_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
Cost model function.
void starpu_task_blok_dadd_fanin(sopalin_data_t *sopalin_data, pastix_coefside_t side, const SolverCblk *cblk, const SolverBlok *blok, int prio)
Insert the task to add a fanin cblk on the emitter side. Note that this task is submitted only to emi...
void starpu_task_blok_dadd_recv(sopalin_data_t *sopalin_data, pastix_coefside_t side, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, SolverBlok *fblok, int prio)
Insert the task to add a fanin cblk on the receiver side (The fanin is seen on this side as the RECV ...
static void fct_blok_dadd_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition: solver.h:390
void * handler[2]
Definition: solver.h:138
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition: solver.h:324
pastix_int_t fcblknm
Definition: solver.h:140
SolverBlok * fblokptr
Definition: solver.h:163
int8_t cblktype
Definition: solver.h:159
int ownerid
Definition: solver.h:175
int8_t partitioned
Definition: solver.h:160
Solver block structure.
Definition: solver.h:137
Solver column block structure.
Definition: solver.h:156