PaStiX Handbook  6.4.0
codelet_cblk_cadd.c
Go to the documentation of this file.
1 /**
2  *
3  * @file codelet_cblk_cadd.c
4  *
5  * StarPU codelet to sum fanin cblk together.
6  *
7  * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Alycia Lisito
12  * @author Mathieu Faverge
13  * @date 2024-07-05
14  *
15  * @generated from /builds/solverstack/pastix/sopalin/starpu/codelet_cblk_zadd.c, normal z -> c, Tue Oct 8 14:17:34 2024
16  *
17  * @addtogroup pastix_starpu
18  * @{
19  *
20  **/
21 #ifndef DOXYGEN_SHOULD_SKIP_THIS
22 #define _GNU_SOURCE
23 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
24 #include "common.h"
25 #include "blend/solver.h"
26 #include "sopalin/sopalin_data.h"
27 #include "pastix_ccores.h"
28 #if defined(PASTIX_WITH_CUDA)
29 #include "pastix_ccuda.h"
30 #endif
31 #include "pastix_starpu.h"
32 #include "pastix_cstarpu.h"
33 #include "codelets.h"
34 
35 /**
36  * @brief Main structure for all tasks of cblk_cadd type
37  */
38 struct cl_cblk_cadd_args_s {
39  profile_data_t profile_data;
40  sopalin_data_t *sopalin_data;
41  pastix_coefside_t side;
42  const SolverCblk *cblk;
43  SolverCblk *fcblk;
44 };
45 
46 #if defined( PASTIX_STARPU_PROFILING )
47 /**
48  * @brief Functions to profile the codelet
49  *
50  * Two levels of profiling are available:
51  * 1) A generic one that returns the flops per worker
52  * 2) A more detailed one that generate logs of the performance for each kernel
53  */
54 starpu_profile_t cblk_cadd_profile = {
55  .next = NULL,
56  .name = "cblk_cadd"
57 };
58 
59 /**
60  * @brief Profiling registration function
61  */
62 void cblk_cadd_profile_register( void ) __attribute__( ( constructor ) );
63 void
64 cblk_cadd_profile_register( void )
65 {
66  profiling_register_cl( &cblk_cadd_profile );
67 }
68 
69 #ifndef DOXYGEN_SHOULD_SKIP_THIS
70 #if defined(PASTIX_STARPU_PROFILING_LOG)
71 static void
72 cl_profiling_cb_cblk_cadd( void *callback_arg )
73 {
74  cl_profiling_callback( callback_arg );
75 
76  struct starpu_task *task = starpu_task_get_current();
77  struct starpu_profiling_task_info *info = task->profiling_info;
78 
79  /* Quick return */
80  if ( info == NULL ) {
81  return;
82  }
83 
84  struct cl_cblk_cadd_args_s *args = (struct cl_cblk_cadd_args_s *) callback_arg;
85  pastix_fixdbl_t flops = args->profile_data.flops;
86  pastix_fixdbl_t duration = starpu_timing_timespec_delay_us( &info->start_time, &info->end_time );
87  pastix_fixdbl_t speed = flops / ( 1000.0 * duration );
88 
89  pastix_int_t M = args->cblk->stride;
90  pastix_int_t N = cblk_colnbr( args->cblk );
91 
92  cl_profiling_log_register( task->name, "cblk_cadd", M, N, 0, flops, speed );
93 }
94 #endif
95 
96 #if defined(PASTIX_STARPU_PROFILING_LOG)
97 static void (*cblk_cadd_callback)(void*) = cl_profiling_cb_cblk_cadd;
98 #else
99 static void (*cblk_cadd_callback)(void*) = cl_profiling_callback;
100 #endif
101 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
102 
103 #endif /* defined( PASTIX_STARPU_PROFILING ) */
104 
105 /**
106  *******************************************************************************
107  *
108  * @brief Cost model function
109  *
110  * The user can switch from the pastix static model to an history based model
111  * computed automatically.
112  *
113  *******************************************************************************
114  *
115  * @param[in] task
116  * TODO
117  *
118  * @param[in] arch
119  * TODO
120  *
121  * @param[in] nimpl
122  * TODO
123  *
124  *******************************************************************************
125  *
126  * @retval TODO
127  *
128  *******************************************************************************/
129 static inline pastix_fixdbl_t
130 fct_cblk_cadd_cost( struct starpu_task *task,
131  struct starpu_perfmodel_arch *arch,
132  unsigned nimpl )
133 {
134  struct cl_cblk_cadd_args_s *args = (struct cl_cblk_cadd_args_s *)(task->cl_arg);
135 
136  pastix_fixdbl_t cost = 0.;
137  pastix_fixdbl_t *coefs;
138  pastix_int_t M = args->cblk->stride;
139  pastix_int_t N = cblk_colnbr( args->cblk );
140 
141  switch( arch->devices->type ) {
142  case STARPU_CPU_WORKER:
143  coefs = &(args->sopalin_data->cpu_models->coefficients[PastixComplex32-2][PastixKernelGEADDCblkFRFR][0]);
144  break;
145  case STARPU_CUDA_WORKER:
146  coefs = &(args->sopalin_data->gpu_models->coefficients[PastixComplex32-2][PastixKernelGEADDCblkFRFR][0]);
147  break;
148  default:
149  assert(0);
150  return 0.;
151  }
152 
153  /* Get cost in us */
154  cost = modelsGetCost2Param( coefs, M, N ) * 1e6;
155 
156  (void)nimpl;
157  return cost;
158 }
159 
160 #ifndef DOXYGEN_SHOULD_SKIP_THIS
161 static struct starpu_perfmodel starpu_cblk_cadd_model = {
162 #if defined( PASTIX_STARPU_COST_PER_ARCH )
163  .type = STARPU_PER_ARCH,
164  .arch_cost_function = fct_cblk_cadd_cost,
165 #else
166  .type = STARPU_HISTORY_BASED,
167 #endif
168  .symbol = "cblk_cadd",
169 };
170 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
171 
172 #if !defined(PASTIX_STARPU_SIMULATION)
173 /**
174  *******************************************************************************
175  *
176  * @brief StarPU CPU implementation
177  *
178  *******************************************************************************
179  *
180  * @param[in] descr
181  * TODO
182  *
183  * @param[in] cl_arg
184  * TODO
185  *
186  *******************************************************************************/
187 static void
188 fct_cblk_cadd_cpu( void *descr[], void *cl_arg )
189 {
190  struct cl_cblk_cadd_args_s *args = (struct cl_cblk_cadd_args_s *)cl_arg;
191  const void *A;
192  void *B;
193 
194  A = pastix_starpu_cblk_get_ptr( descr[0] );
195  B = pastix_starpu_cblk_get_ptr( descr[1] );
196 
197  assert( args->cblk->cblktype & CBLK_LAYOUT_2D );
198  assert( args->fcblk->cblktype & CBLK_LAYOUT_2D );
199 
200  args->profile_data.flops = cpucblk_cadd( 1., args->cblk, args->fcblk, A, B, NULL, 0,
201  &( args->sopalin_data->solvmtx->lowrank ) );
202 }
203 
204 #if defined(PASTIX_WITH_CUDA) && 0
205 /**
206  *******************************************************************************
207  *
208  * @brief StarPU GPU implementation
209  *
210  *******************************************************************************
211  *
212  * @param[in] descr
213  * TODO
214  *
215  * @param[in] cl_arg
216  * TODO
217  *
218  *******************************************************************************/
219 static void
220 fct_cblk_cadd_gpu( void *descr[], void *cl_arg )
221 {
222  struct cl_template_args_s *args = (struct cl_template_args_s *)cl_arg;
223  const void *A;
224  void *B;
225 
226  A = pastix_starpu_cblk_get_ptr( descr[0] );
227  B = pastix_starpu_cblk_get_ptr( descr[1] );
228 
229  assert( args->cblk->cblktype & CBLK_TASKS_2D );
230  assert( args->fcblk->cblktype & CBLK_TASKS_2D );
231 
232  args->profile_data.flops = gpucblk_cadd( 1., args->cblk, args->fcblk, 1, B,
233  &( args->sopalin_data->solvmtx->lowrank ),
234  starpu_cuda_get_local_stream() );
235 
236 }
237 #endif /* defined(PASTIX_WITH_CUDA) */
238 #endif /* !defined(PASTIX_STARPU_SIMULATION) */
239 
240 #ifndef DOXYGEN_SHOULD_SKIP_THIS
241 CODELETS_CPU( cblk_cadd, 2 );
242 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
243 
244 /**
245  *******************************************************************************
246  *
247  * @brief Insert the task to add a fanin cblk on the receiver side (The fanin is
248  * seen on this side as the RECV cblk). Note that the caller always execute the
249  * task.
250  *
251  *******************************************************************************
252  *
253  * @param[in] sopalin_data
254  * Solver matrix information structure that will guide the algorithm.
255  *
256  * @param[in] side
257  * Define which side of the cblk must be tested.
258  * @arg PastixLCoef if lower part only
259  * @arg PastixUCoef if upper part only
260  * @arg PastixLUCoef if both sides.
261  *
262  * @param[in] cblk
263  * The column block of the matrix.
264  *
265  * @param[in] fcblk
266  * The facing column block of the matrix.
267  *
268  * @param[in] prio
269  * The task priority.
270  *
271  *******************************************************************************/
272 void
273 starpu_task_cblk_cadd_recv( sopalin_data_t *sopalin_data,
274  pastix_coefside_t side,
275  const SolverCblk *cblk,
276  SolverCblk *fcblk,
277  int prio )
278 {
279  struct cl_cblk_cadd_args_s *cl_arg = NULL;
280 #if defined(PASTIX_DEBUG_STARPU)
281  char *task_name;
282 #endif
283 
284  /*
285  * Create the arguments array
286  */
287  cl_arg = malloc( sizeof( struct cl_cblk_cadd_args_s) );
288  cl_arg->sopalin_data = sopalin_data;
289 #if defined(PASTIX_STARPU_PROFILING)
290  cl_arg->profile_data.measures = cblk_cadd_profile.measures;
291  cl_arg->profile_data.flops = NAN;
292 #endif
293  cl_arg->side = side;
294  cl_arg->cblk = cblk;
295  cl_arg->fcblk = fcblk;
296 
297 #if defined(PASTIX_DEBUG_STARPU)
298  /* This actually generates a memory leak */
299  asprintf( &task_name, "%s( %ld )",
300  cl_cblk_cadd_cpu.name,
301  (long)(cblk - sopalin_data->solvmtx->cblktab) );
302 #endif
303 
304  assert( cblk->cblktype & CBLK_RECV );
305  assert( !(fcblk->cblktype & (CBLK_RECV|CBLK_FANIN)) );
306 
307  pastix_starpu_insert_task(
308  &cl_cblk_cadd_cpu,
309  STARPU_CL_ARGS, cl_arg, sizeof( struct cl_cblk_cadd_args_s ),
310  STARPU_EXECUTE_ON_NODE, fcblk->ownerid,
311 #if defined(PASTIX_STARPU_PROFILING)
312  STARPU_CALLBACK_WITH_ARG_NFREE, cblk_cadd_callback, cl_arg,
313 #endif
314  STARPU_R, cblk->handler[side],
315  STARPU_RW, fcblk->handler[side],
316 #if defined(PASTIX_DEBUG_STARPU)
317  STARPU_NAME, task_name,
318 #endif
319 #if defined(PASTIX_STARPU_HETEROPRIO)
320  STARPU_PRIORITY, BucketFacto1D,
321 #else
322  STARPU_PRIORITY, prio,
323 #endif
324  0);
325 
326  (void)prio;
327 }
328 
329 /**
330  *******************************************************************************
331  *
332  * @brief Insert the task to add a fanin cblk on the emitter side. Note that
333  * this task is submitted only to emit a send to the owner of the associated
334  * recv cblk that will perform the add. Thus, the task is always submitted but
335  * never executed.
336  *
337  *******************************************************************************
338  *
339  * @param[in] sopalin_data
340  * Solver matrix information structure that will guide the algorithm.
341  *
342  * @param[in] side
343  * Define which side of the cblk must be tested.
344  * @arg PastixLCoef if lower part only
345  * @arg PastixUCoef if upper part only
346  * @arg PastixLUCoef if both sides.
347  *
348  * @param[in] cblk
349  * The column block of the matrix.
350  *
351  * @param[in] prio
352  * The task priority.
353  *
354  *******************************************************************************/
355 void
356 starpu_task_cblk_cadd_fanin( sopalin_data_t *sopalin_data,
357  pastix_coefside_t side,
358  const SolverCblk *cblk,
359  int prio )
360 {
361  assert( cblk->cblktype & CBLK_FANIN );
362 
363  pastix_starpu_insert_task(
364  NULL,
365  STARPU_EXECUTE_ON_NODE, cblk->ownerid,
366  STARPU_R, cblk->handler[side],
367 #if defined(PASTIX_STARPU_HETEROPRIO)
368  STARPU_PRIORITY, BucketFacto1D,
369 #else
370  STARPU_PRIORITY, prio,
371 #endif
372  0);
373 
374  (void)prio;
375 }
376 
377 /**
378  * @}
379  */
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
@ PastixKernelGEADDCblkFRFR
Definition: kernels_enums.h:69
pastix_fixdbl_t cpucblk_cadd(pastix_complex32_t alpha, const SolverCblk *cblkA, SolverCblk *cblkB, const void *A, void *B, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add two column blocks in full rank format.
Definition: cpucblk_cadd.c:391
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
void starpu_task_cblk_cadd_recv(sopalin_data_t *sopalin_data, pastix_coefside_t side, const SolverCblk *cblk, SolverCblk *fcblk, int prio)
Insert the task to add a fanin cblk on the receiver side (The fanin is seen on this side as the RECV ...
static void fct_cblk_cadd_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
static pastix_fixdbl_t fct_cblk_cadd_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
Cost model function.
void starpu_task_cblk_cadd_fanin(sopalin_data_t *sopalin_data, pastix_coefside_t side, const SolverCblk *cblk, int prio)
Insert the task to add a fanin cblk on the emitter side. Note that this task is submitted only to emi...
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition: solver.h:329
void * handler[2]
Definition: solver.h:179
int8_t cblktype
Definition: solver.h:164
int ownerid
Definition: solver.h:181
Solver column block structure.
Definition: solver.h:161