PaStiX Handbook  6.4.0
codelet_cblk_sadd.c
Go to the documentation of this file.
1 /**
2  *
3  * @file codelet_cblk_sadd.c
4  *
5  * StarPU codelet to sum fanin cblk together.
6  *
7  * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Alycia Lisito
12  * @author Mathieu Faverge
13  * @date 2024-07-05
14  *
15  * @generated from /builds/solverstack/pastix/sopalin/starpu/codelet_cblk_zadd.c, normal z -> s, Thu Aug 29 14:20:32 2024
16  *
17  * @addtogroup pastix_starpu
18  * @{
19  *
20  **/
21 #ifndef DOXYGEN_SHOULD_SKIP_THIS
22 #define _GNU_SOURCE
23 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
24 #include "common.h"
25 #include "blend/solver.h"
26 #include "sopalin/sopalin_data.h"
27 #include "pastix_scores.h"
28 #if defined(PASTIX_WITH_CUDA)
29 #include "pastix_scuda.h"
30 #endif
31 #include "pastix_starpu.h"
32 #include "pastix_sstarpu.h"
33 #include "codelets.h"
34 
35 /**
36  * @brief Main structure for all tasks of cblk_sadd type
37  */
38 struct cl_cblk_sadd_args_s { /* Argument bundle allocated per task by starpu_task_cblk_sadd_recv() */
39  profile_data_t profile_data; /* Profiling data (measures, flops). NOTE(review): presumably must stay first, since the whole struct pointer is handed to cl_profiling_callback() - confirm */
40  sopalin_data_t *sopalin_data; /* Gives access to the cpu/gpu cost models and to solvmtx->lowrank */
41  pastix_coefside_t side; /* Side the task was inserted for; stored at insertion, not read by the codelet functions in this file */
42  const SolverCblk *cblk; /* Source column block (the contribution that is added), read-only */
43  SolverCblk *fcblk; /* Destination column block that receives the sum */
44 };
45 
46 #if defined( PASTIX_STARPU_PROFILING )
47 /**
48  * @brief Functions to profile the codelet
49  *
50  * Two levels of profiling are available:
51  * 1) A generic one that returns the flops per worker
52  * 2) A more detailed one that generate logs of the performance for each kernel
53  */
54 starpu_profile_t cblk_sadd_profile = {
55  .next = NULL,
56  .name = "cblk_sadd"
57 };
58 
59 /**
60  * @brief Profiling registration function
61  */
62 void cblk_sadd_profile_register( void ) __attribute__( ( constructor ) );
63 void
64 cblk_sadd_profile_register( void )
65 {
66  profiling_register_cl( &cblk_sadd_profile );
67 }
68 
69 #ifndef DOXYGEN_SHOULD_SKIP_THIS
70 #if defined(PASTIX_STARPU_PROFILING_LOG)
71 static void
72 cl_profiling_cb_cblk_sadd( void *callback_arg )
73 {
74  cl_profiling_callback( callback_arg );
75 
76  struct starpu_task *task = starpu_task_get_current();
77  struct starpu_profiling_task_info *info = task->profiling_info;
78 
79  /* Quick return */
80  if ( info == NULL ) {
81  return;
82  }
83 
84  struct cl_cblk_sadd_args_s *args = (struct cl_cblk_sadd_args_s *) callback_arg;
85  pastix_fixdbl_t flops = args->profile_data.flops;
86  pastix_fixdbl_t duration = starpu_timing_timespec_delay_us( &info->start_time, &info->end_time );
87  pastix_fixdbl_t speed = flops / ( 1000.0 * duration );
88 
89  pastix_int_t M = args->cblk->stride;
90  pastix_int_t N = cblk_colnbr( args->cblk );
91 
92  cl_profiling_log_register( task->name, "cblk_sadd", M, N, 0, flops, speed );
93 }
94 #endif
95 
96 #if defined(PASTIX_STARPU_PROFILING_LOG)
97 static void (*cblk_sadd_callback)(void*) = cl_profiling_cb_cblk_sadd;
98 #else
99 static void (*cblk_sadd_callback)(void*) = cl_profiling_callback;
100 #endif
101 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
102 
103 #endif /* defined( PASTIX_STARPU_PROFILING ) */
104 
105 /**
106  *******************************************************************************
107  *
108  * @brief Cost model function
109  *
110  * The user can switch from the pastix static model to an history based model
111  * computed automatically.
112  *
113  *******************************************************************************
114  *
115  * @param[in] task
116  * TODO
117  *
118  * @param[in] arch
119  * TODO
120  *
121  * @param[in] nimpl
122  * TODO
123  *
124  *******************************************************************************
125  *
126  * @retval TODO
127  *
128  *******************************************************************************/
129 static inline pastix_fixdbl_t
130 fct_cblk_sadd_cost( struct starpu_task *task,
131  struct starpu_perfmodel_arch *arch,
132  unsigned nimpl )
133 {
134  struct cl_cblk_sadd_args_s *args = (struct cl_cblk_sadd_args_s *)(task->cl_arg);
135 
136  pastix_fixdbl_t cost = 0.;
137  pastix_fixdbl_t *coefs;
138  pastix_int_t M = args->cblk->stride;
139  pastix_int_t N = cblk_colnbr( args->cblk );
140 
141  switch( arch->devices->type ) {
142  case STARPU_CPU_WORKER:
143  coefs = &(args->sopalin_data->cpu_models->coefficients[PastixFloat-2][PastixKernelGEADDCblkFRFR][0]);
144  break;
145  case STARPU_CUDA_WORKER:
146  coefs = &(args->sopalin_data->gpu_models->coefficients[PastixFloat-2][PastixKernelGEADDCblkFRFR][0]);
147  break;
148  default:
149  assert(0);
150  return 0.;
151  }
152 
153  /* Get cost in us */
154  cost = modelsGetCost2Param( coefs, M, N ) * 1e6;
155 
156  (void)nimpl;
157  return cost;
158 }
159 
160 #ifndef DOXYGEN_SHOULD_SKIP_THIS
161 static struct starpu_perfmodel starpu_cblk_sadd_model = {
162 #if defined( PASTIX_STARPU_COST_PER_ARCH )
163  .type = STARPU_PER_ARCH,
164  .arch_cost_function = fct_cblk_sadd_cost,
165 #else
166  .type = STARPU_HISTORY_BASED,
167 #endif
168  .symbol = "cblk_sadd",
169 };
170 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
171 
172 #if !defined(PASTIX_STARPU_SIMULATION)
173 /**
174  *******************************************************************************
175  *
176  * @brief StarPU CPU implementation
177  *
178  *******************************************************************************
179  *
180  * @param[in] descr
181  * TODO
182  *
183  * @param[in] cl_arg
184  * TODO
185  *
186  *******************************************************************************/
187 static void
188 fct_cblk_sadd_cpu( void *descr[], void *cl_arg )
189 {
190  struct cl_cblk_sadd_args_s *args = (struct cl_cblk_sadd_args_s *)cl_arg;
191  const void *A;
192  void *B;
193 
194  A = pastix_starpu_cblk_get_ptr( descr[0] );
195  B = pastix_starpu_cblk_get_ptr( descr[1] );
196 
197  assert( args->cblk->cblktype & CBLK_LAYOUT_2D );
198  assert( args->fcblk->cblktype & CBLK_LAYOUT_2D );
199 
200  args->profile_data.flops = cpucblk_sadd( 1., args->cblk, args->fcblk, A, B, NULL, 0,
201  &( args->sopalin_data->solvmtx->lowrank ) );
202 }
203 
204 #if defined(PASTIX_WITH_CUDA) && 0
205 /**
206  *******************************************************************************
207  *
208  * @brief StarPU GPU implementation
209  *
210  *******************************************************************************
211  *
212  * @param[in] descr
213  * TODO
214  *
215  * @param[in] cl_arg
216  * TODO
217  *
218  *******************************************************************************/
219 static void
220 fct_cblk_sadd_gpu( void *descr[], void *cl_arg )
221 {
222  struct cl_template_args_s *args = (struct cl_template_args_s *)cl_arg;
223  const void *A;
224  void *B;
225 
226  A = pastix_starpu_cblk_get_ptr( descr[0] );
227  B = pastix_starpu_cblk_get_ptr( descr[1] );
228 
229  assert( args->cblk->cblktype & CBLK_TASKS_2D );
230  assert( args->fcblk->cblktype & CBLK_TASKS_2D );
231 
232  args->profile_data.flops = gpucblk_sadd( 1., args->cblk, args->fcblk, 1, B,
233  &( args->sopalin_data->solvmtx->lowrank ),
234  starpu_cuda_get_local_stream() );
235 
236 }
237 #endif /* defined(PASTIX_WITH_CUDA) */
238 #endif /* !defined(PASTIX_STARPU_SIMULATION) */
239 
240 #ifndef DOXYGEN_SHOULD_SKIP_THIS
241 CODELETS_CPU( cblk_sadd, 2 ); /* NOTE(review): presumably declares the cl_cblk_sadd_cpu codelet used below, with 2 data buffers (source and destination cblk) - confirm in codelets.h */
242 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
243 
244 /**
245  *******************************************************************************
246  *
247  * @brief Insert the task to add a fanin cblk on the receiver side (The fanin is
248  * seen on this side as the RECV cblk). Note that the caller always execute the
249  * task.
250  *
251  *******************************************************************************
252  *
253  * @param[in] sopalin_data
254  * Solver matrix information structure that will guide the algorithm.
255  *
256  * @param[in] side
257  * Define which side of the cblk must be tested.
258  * @arg PastixLCoef if lower part only
259  * @arg PastixUCoef if upper part only
260  * @arg PastixLUCoef if both sides.
261  *
262  * @param[in] cblk
263  * The column block of the matrix.
264  *
265  * @param[in] fcblk
266  * The facing column block of the matrix.
267  *
268  * @param[in] prio
269  * The task priority.
270  *
271  *******************************************************************************/
272 void
273 starpu_task_cblk_sadd_recv( sopalin_data_t *sopalin_data,
274  pastix_coefside_t side,
275  const SolverCblk *cblk,
276  SolverCblk *fcblk,
277  int prio )
278 {
279  struct cl_cblk_sadd_args_s *cl_arg = NULL;
280 #if defined(PASTIX_DEBUG_STARPU)
281  char *task_name;
282 #endif
283 
284  /*
285  * Create the arguments array
286  */
287  cl_arg = malloc( sizeof( struct cl_cblk_sadd_args_s) );
288  cl_arg->sopalin_data = sopalin_data;
289 #if defined(PASTIX_STARPU_PROFILING)
290  cl_arg->profile_data.measures = cblk_sadd_profile.measures;
291  cl_arg->profile_data.flops = NAN;
292 #endif
293  cl_arg->side = side;
294  cl_arg->cblk = cblk;
295  cl_arg->fcblk = fcblk;
296 
297 #if defined(PASTIX_DEBUG_STARPU)
298  /* This actually generates a memory leak */
299  asprintf( &task_name, "%s( %ld )",
300  cl_cblk_sadd_cpu.name,
301  (long)(cblk - sopalin_data->solvmtx->cblktab) );
302 #endif
303 
304  assert( cblk->cblktype & CBLK_RECV );
305  assert( !(fcblk->cblktype & (CBLK_RECV|CBLK_FANIN)) );
306 
307  pastix_starpu_insert_task(
308  &cl_cblk_sadd_cpu,
309  STARPU_CL_ARGS, cl_arg, sizeof( struct cl_cblk_sadd_args_s ),
310  STARPU_EXECUTE_ON_NODE, fcblk->ownerid,
311 #if defined(PASTIX_STARPU_PROFILING)
312  STARPU_CALLBACK_WITH_ARG_NFREE, cblk_sadd_callback, cl_arg,
313 #endif
314  STARPU_R, cblk->handler[side],
315  STARPU_RW, fcblk->handler[side],
316 #if defined(PASTIX_DEBUG_STARPU)
317  STARPU_NAME, task_name,
318 #endif
319 #if defined(PASTIX_STARPU_HETEROPRIO)
320  STARPU_PRIORITY, BucketFacto1D,
321 #else
322  STARPU_PRIORITY, prio,
323 #endif
324  0);
325 
326  (void)prio;
327 }
328 
329 /**
330  *******************************************************************************
331  *
332  * @brief Insert the task to add a fanin cblk on the emitter side. Note that
333  * this task is submitted only to emit a send to the owner of the associated
334  * recv cblk that will perform the add. Thus, the task is always submitted but
335  * never executed.
336  *
337  *******************************************************************************
338  *
339  * @param[in] sopalin_data
340  * Solver matrix information structure that will guide the algorithm.
341  *
342  * @param[in] side
343  * Define which side of the cblk must be tested.
344  * @arg PastixLCoef if lower part only
345  * @arg PastixUCoef if upper part only
346  * @arg PastixLUCoef if both sides.
347  *
348  * @param[in] cblk
349  * The column block of the matrix.
350  *
351  * @param[in] prio
352  * The task priority.
353  *
354  *******************************************************************************/
355 void
356 starpu_task_cblk_sadd_fanin( sopalin_data_t *sopalin_data,
357  pastix_coefside_t side,
358  const SolverCblk *cblk,
359  int prio )
360 {
361  assert( cblk->cblktype & CBLK_FANIN );
362 
363  pastix_starpu_insert_task(
364  NULL,
365  STARPU_EXECUTE_ON_NODE, cblk->ownerid,
366  STARPU_R, cblk->handler[side],
367 #if defined(PASTIX_STARPU_HETEROPRIO)
368  STARPU_PRIORITY, BucketFacto1D,
369 #else
370  STARPU_PRIORITY, prio,
371 #endif
372  0);
373 
374  (void)prio;
375 }
376 
377 /**
378  * @}
379  */
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
@ PastixKernelGEADDCblkFRFR
Definition: kernels_enums.h:69
pastix_fixdbl_t cpucblk_sadd(float alpha, const SolverCblk *cblkA, SolverCblk *cblkB, const void *A, void *B, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add two column blocks in full rank format.
Definition: cpucblk_sadd.c:391
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
static void fct_cblk_sadd_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
static pastix_fixdbl_t fct_cblk_sadd_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
Cost model function.
void starpu_task_cblk_sadd_fanin(sopalin_data_t *sopalin_data, pastix_coefside_t side, const SolverCblk *cblk, int prio)
Insert the task to add a fanin cblk on the emitter side. Note that this task is submitted only to emi...
void starpu_task_cblk_sadd_recv(sopalin_data_t *sopalin_data, pastix_coefside_t side, const SolverCblk *cblk, SolverCblk *fcblk, int prio)
Insert the task to add a fanin cblk on the receiver side (The fanin is seen on this side as the RECV ...
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition: solver.h:329
void * handler[2]
Definition: solver.h:179
int8_t cblktype
Definition: solver.h:164
int ownerid
Definition: solver.h:181
Solver column block structure.
Definition: solver.h:161