PaStiX Handbook  6.4.0
codelet_cblk_dgemmsp.c
Go to the documentation of this file.
1 /**
2  *
3  * @file codelet_cblk_dgemmsp.c
4  *
5  * StarPU codelets for blas-like functions
6  *
7  * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Mathieu Faverge
12  * @author Pierre Ramet
13  * @author Ian Masliah
14  * @author Tom Moenne-Loccoz
15  * @author Alycia Lisito
16  * @date 2024-07-05
17  *
18  * @generated from /builds/solverstack/pastix/sopalin/starpu/codelet_cblk_zgemmsp.c, normal z -> d, Tue Oct 8 14:17:34 2024
19  *
20  * @addtogroup pastix_starpu
21  * @{
22  *
23  **/
24 #ifndef DOXYGEN_SHOULD_SKIP_THIS
25 #define _GNU_SOURCE
26 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
27 #include "common.h"
28 #include "blend/solver.h"
29 #include "sopalin/sopalin_data.h"
30 #include "pastix_dcores.h"
31 #if defined(PASTIX_WITH_CUDA)
32 #include "pastix_dcuda.h"
33 #endif
34 #include "pastix_starpu.h"
35 #include "pastix_dstarpu.h"
36 #include "codelets.h"
37 
38 /**
39  * @brief Main structure for all tasks of cblk_dgemmsp type
40  */
41 struct cl_cblk_dgemmsp_args_s {
42  profile_data_t profile_data;
43  sopalin_data_t *sopalin_data;
44  pastix_coefside_t sideA;
45  pastix_trans_t trans;
46  const SolverCblk *cblk;
47  const SolverBlok *blok;
48  SolverCblk *fcblk;
49 };
50 
51 #if defined( PASTIX_STARPU_PROFILING )
52 /**
53  * @brief Functions to profile the codelet
54  *
55  * Two levels of profiling are available:
56  * 1) A generic one that returns the flops per worker
57  * 2) A more detailed one that generate logs of the performance for each kernel
58  */
59 starpu_profile_t cblk_dgemmsp_profile = {
60  .next = NULL,
61  .name = "cblk_dgemmsp"
62 };
63 
64 /**
65  * @brief Profiling registration function
66  */
67 void cblk_dgemmsp_profile_register( void ) __attribute__( ( constructor ) );
68 void
69 cblk_dgemmsp_profile_register( void )
70 {
71  profiling_register_cl( &cblk_dgemmsp_profile );
72 }
73 
74 #ifndef DOXYGEN_SHOULD_SKIP_THIS
75 #if defined(PASTIX_STARPU_PROFILING_LOG)
76 static void
77 cl_profiling_cb_cblk_dgemmsp( void *callback_arg )
78 {
79  cl_profiling_callback( callback_arg );
80 
81  struct starpu_task *task = starpu_task_get_current();
82  struct starpu_profiling_task_info *info = task->profiling_info;
83 
84  /* Quick return */
85  if ( info == NULL ) {
86  return;
87  }
88 
89  struct cl_cblk_dgemmsp_args_s *args = (struct cl_cblk_dgemmsp_args_s *) callback_arg;
90  pastix_fixdbl_t flops = args->profile_data.flops;
91  pastix_fixdbl_t duration = starpu_timing_timespec_delay_us( &info->start_time, &info->end_time );
92  pastix_fixdbl_t speed = flops / ( 1000.0 * duration );
93 
94  pastix_int_t M = args->cblk->stride;
95  pastix_int_t N = blok_rownbr( args->blok );
96  pastix_int_t K = cblk_colnbr( args->cblk );
97 
98  M -= (args->cblk->cblktype & CBLK_LAYOUT_2D) ? args->blok->coefind / K : args->blok->coefind;
99  M -= (args->sideA == PastixUCoef) ? blok_rownbr( args->blok ) : 0;
100 
101  cl_profiling_log_register( task->name, "cblk_dgemmsp", M, N, K, flops, speed );
102 }
103 #endif
104 
105 #if defined(PASTIX_STARPU_PROFILING_LOG)
106 static void (*cblk_dgemmsp_callback)(void*) = cl_profiling_cb_cblk_dgemmsp;
107 #else
108 static void (*cblk_dgemmsp_callback)(void*) = cl_profiling_callback;
109 #endif
110 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
111 
112 #endif /* defined( PASTIX_STARPU_PROFILING ) */
113 
114 /**
115  *******************************************************************************
116  *
117  * @brief Cost model function
118  *
119  * The user can switch from the pastix static model to an history based model
120  * computed automatically.
121  *
122  *******************************************************************************
123  *
124  * @param[in] task
125  * TODO
126  *
127  * @param[in] arch
128  * TODO
129  *
130  * @param[in] nimpl
131  * TODO
132  *
133  *******************************************************************************
134  *
135  * @retval TODO
136  *
137  *******************************************************************************/
138 static inline pastix_fixdbl_t
139 fct_cblk_dgemmsp_cost( struct starpu_task *task,
140  struct starpu_perfmodel_arch *arch,
141  unsigned nimpl )
142 {
143  struct cl_cblk_dgemmsp_args_s *args = (struct cl_cblk_dgemmsp_args_s *)(task->cl_arg);
144 
145  pastix_fixdbl_t cost = 0.;
146  pastix_fixdbl_t *coefs;
147  pastix_int_t M = args->cblk->stride;
148  pastix_int_t N = blok_rownbr( args->blok );
149  pastix_int_t K = cblk_colnbr( args->cblk );
150 
151  M -= (args->cblk->cblktype & CBLK_LAYOUT_2D) ? args->blok->coefind / K : args->blok->coefind;
152  M -= (args->sideA == PastixUCoef) ? blok_rownbr( args->blok ) : 0;
153 
154  switch( arch->devices->type ) {
155  case STARPU_CPU_WORKER:
156  coefs = &(args->sopalin_data->cpu_models->coefficients[PastixDouble-2][PastixKernelGEMMCblk2d2d][0]);
157  break;
158  case STARPU_CUDA_WORKER:
159  coefs = &(args->sopalin_data->gpu_models->coefficients[PastixDouble-2][PastixKernelGEMMCblk2d2d][0]);
160  break;
161  default:
162  assert(0);
163  return 0.;
164  }
165 
166  /* Get cost in us */
167  cost = modelsGetCost3Param( coefs, M, N, K ) * 1e6;
168 
169  (void)nimpl;
170  return cost;
171 }
172 
173 #ifndef DOXYGEN_SHOULD_SKIP_THIS
174 static struct starpu_perfmodel starpu_cblk_dgemmsp_model = {
175 #if defined(PASTIX_STARPU_COST_PER_ARCH)
176  .type = STARPU_PER_ARCH,
177  .arch_cost_function = fct_cblk_dgemmsp_cost,
178 #else
179  .type = STARPU_HISTORY_BASED,
180 #endif
181  .symbol = "cblk_dgemmsp",
182 };
183 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
184 
185 #if !defined(PASTIX_STARPU_SIMULATION)
186 /**
187  *******************************************************************************
188  *
189  * @brief StarPU CPU implementation
190  *
191  *******************************************************************************
192  *
193  * @param[in] descr
194  * TODO
195  *
196  * @param[in] cl_arg
197  * TODO
198  *
199  *******************************************************************************/
200 static void
201 fct_cblk_dgemmsp_cpu( void *descr[], void *cl_arg )
202 {
203  struct cl_cblk_dgemmsp_args_s *args = (struct cl_cblk_dgemmsp_args_s *)cl_arg;
204  const void *A;
205  const void *B;
206  void *C;
207 
208  A = pastix_starpu_cblk_get_ptr( descr[0] );
209  B = pastix_starpu_cblk_get_ptr( descr[1] );
210  C = pastix_starpu_cblk_get_ptr( descr[2] );
211 
212  /* Check layout due to NULL workspace for now */
213  assert( args->cblk->cblktype & CBLK_LAYOUT_2D );
214  assert( args->fcblk->cblktype & CBLK_LAYOUT_2D );
215 
216  args->profile_data.flops = cpucblk_dgemmsp( args->sideA, args->trans,
217  args->cblk, args->blok, args->fcblk,
218  A, B, C, NULL, 0,
219  &( args->sopalin_data->solvmtx->lowrank ) );
220 }
221 
#if defined(PASTIX_WITH_CUDA)
/**
 *******************************************************************************
 *
 * @brief StarPU GPU implementation
 *
 * Calls gpucblk_dgemmsp() on the CUDA stream returned by
 * starpu_cuda_get_local_stream(), and stores the returned number of flops in
 * the profiling data of the task.
 *
 *******************************************************************************
 *
 * @param[in] descr
 *          The StarPU data descriptors of the task: descr[0] and descr[1] are
 *          read as A and B, descr[2] is updated as C.
 *
 * @param[inout] cl_arg
 *          The struct cl_cblk_dgemmsp_args_s of the task. On exit, its
 *          profile_data.flops field holds the value returned by the kernel.
 *
 *******************************************************************************/
static void
fct_cblk_dgemmsp_gpu( void *descr[], void *cl_arg )
{
    struct cl_cblk_dgemmsp_args_s *params = cl_arg;

    const void *A = pastix_starpu_cblk_get_ptr( descr[0] );
    const void *B = pastix_starpu_cblk_get_ptr( descr[1] );
    void       *C = pastix_starpu_cblk_get_ptr( descr[2] );

    params->profile_data.flops =
        gpucblk_dgemmsp( params->sideA, params->trans,
                         params->cblk, params->blok, params->fcblk,
                         A, B, C,
                         &( params->sopalin_data->solvmtx->lowrank ),
                         starpu_cuda_get_local_stream() );
}
#endif /* defined(PASTIX_WITH_CUDA) */
245 #endif /* !defined(PASTIX_STARPU_SIMULATION) */
246 
247 #ifndef DOXYGEN_SHOULD_SKIP_THIS
/*
 * NOTE(review): CODELETS_GPU() is not defined in this file. It presumably
 * declares the cl_cblk_dgemmsp_any codelet used by starpu_task_cblk_dgemmsp()
 * from the fct_cblk_dgemmsp_{cpu,gpu} functions and the
 * starpu_cblk_dgemmsp_model performance model, with 3 data handles - TODO
 * confirm against the macro definition.
 */
248 CODELETS_GPU( cblk_dgemmsp, 3, STARPU_CUDA_ASYNC );
249 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
250 
251 /**
252  *******************************************************************************
253  *
254  * @brief TODO
255  *
256  *******************************************************************************
257  *
258  * @param[in] sopalin_data
259  * TODO
260  *
261  * @param[in] sideA
262  * TODO
263  *
264  * @param[in] sideB
265  * TODO
266  *
267  * @param[in] trans
268  * TODO
269  *
270  * @param[in] cblk
271  * TODO
272  *
273  * @param[in] blok
274  *
275  * @param[in] fcblk
276  * TODO
277  *
278  * @param[in] prio
279  * TODO
280  *
281  *******************************************************************************/
282 void
283 starpu_task_cblk_dgemmsp( sopalin_data_t *sopalin_data,
284  pastix_coefside_t sideA,
285  pastix_coefside_t sideB,
286  pastix_trans_t trans,
287  const SolverCblk *cblk,
288  const SolverBlok *blok,
289  SolverCblk *fcblk,
290  int prio )
291 {
292  struct cl_cblk_dgemmsp_args_s *cl_arg = NULL;
293  long long execute_where = cl_cblk_dgemmsp_any.where;
294  int need_exec = 1;
295 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
296  char *task_name;
297 #endif
298 
299  /*
300  * Check if it needs to be submitted
301  */
302 #if defined(PASTIX_WITH_MPI)
303  {
304  int need_submit = 0;
305  if ( cblk->ownerid == sopalin_data->solvmtx->clustnum ) {
306  need_submit = 1;
307  }
308  if ( (fcblk->cblktype & CBLK_FANIN) ||
309  (fcblk->ownerid == sopalin_data->solvmtx->clustnum) )
310  {
311  need_submit = 1;
312  }
313  else {
314  need_exec = 0;
315  }
316  if ( starpu_mpi_cached_receive( fcblk->handler[sideA] ) ) {
317  need_submit = 1;
318  }
319  if ( !need_submit ) {
320  return;
321  }
322  }
323 #endif
324 
325  /*
326  * Create the arguments array
327  */
328  if ( need_exec ) {
329  cl_arg = malloc( sizeof( struct cl_cblk_dgemmsp_args_s ) );
330  cl_arg->sopalin_data = sopalin_data;
331 #if defined(PASTIX_STARPU_PROFILING)
332  cl_arg->profile_data.measures = cblk_dgemmsp_profile.measures;
333  cl_arg->profile_data.flops = NAN;
334 #endif
335  cl_arg->sideA = sideA;
336  cl_arg->trans = trans;
337  cl_arg->cblk = cblk;
338  cl_arg->blok = blok;
339  cl_arg->fcblk = fcblk;
340 
341 #if defined(PASTIX_WITH_CUDA)
342  if ( (cblk->cblktype & CBLK_COMPRESSED) ||
343  (fcblk->cblktype & CBLK_COMPRESSED) )
344  {
345  /* Disable CUDA */
346  execute_where &= (~STARPU_CUDA);
347  }
348 #endif
349  }
350 
351 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
352  /* This actually generates a memory leak */
353  asprintf( &task_name, "%s( %ld, %ld, %ld )",
354  cl_cblk_dgemmsp_any.name,
355  (long)(cblk - sopalin_data->solvmtx->cblktab),
356  (long)(blok - sopalin_data->solvmtx->bloktab),
357  (long)sideA );
358 #endif
359 
360  pastix_starpu_insert_task(
361  &cl_cblk_dgemmsp_any,
362  STARPU_CL_ARGS, cl_arg, sizeof( struct cl_cblk_dgemmsp_args_s ),
363  STARPU_EXECUTE_WHERE, execute_where,
364 #if defined(PASTIX_STARPU_PROFILING)
365  STARPU_CALLBACK_WITH_ARG_NFREE, cblk_dgemmsp_callback, cl_arg,
366 #endif
367  STARPU_R, cblk->handler[sideA],
368  STARPU_R, cblk->handler[sideB],
369  STARPU_RW, fcblk->handler[sideA],
370 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
371  STARPU_NAME, task_name,
372 #endif
373 #if defined(PASTIX_STARPU_HETEROPRIO)
374  STARPU_PRIORITY, BucketGEMM1D,
375 #else
376  STARPU_PRIORITY, prio,
377 #endif
378  0);
379  (void)prio;
380 }
381 
382 /**
383  * @}
384  */
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
@ PastixKernelGEMMCblk2d2d
Definition: kernels_enums.h:64
pastix_fixdbl_t cpucblk_dgemmsp(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const void *A, const void *B, void *C, double *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
enum pastix_trans_e pastix_trans_t
Transposition.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
@ PastixUCoef
Definition: api.h:479
static void fct_cblk_dgemmsp_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
static pastix_fixdbl_t fct_cblk_dgemmsp_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
Cost model function.
void starpu_task_cblk_dgemmsp(sopalin_data_t *sopalin_data, pastix_coefside_t sideA, pastix_coefside_t sideB, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, int prio)
Submit a cblk_dgemmsp task to StarPU.
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition: solver.h:395
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition: solver.h:329
void * handler[2]
Definition: solver.h:179
int8_t cblktype
Definition: solver.h:164
int ownerid
Definition: solver.h:181
Solver block structure.
Definition: solver.h:141
Solver column block structure.
Definition: solver.h:161