PaStiX Handbook  6.4.0
codelet_blok_strsmsp.c
Go to the documentation of this file.
1 /**
2  *
3  * @file codelet_blok_strsmsp.c
4  *
5  * StarPU codelets for blas-like functions
6  *
7  * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8  * Univ. Bordeaux. All rights reserved.
9  *
10  * @version 6.4.0
11  * @author Mathieu Faverge
12  * @author Pierre Ramet
13  * @author Ian Masliah
14  * @author Tom Moenne-Loccoz
15  * @date 2024-07-05
16  *
17  * @generated from /builds/solverstack/pastix/sopalin/starpu/codelet_blok_ztrsmsp.c, normal z -> s, Tue Oct 8 14:17:33 2024
18  *
19  * @addtogroup pastix_starpu
20  * @{
21  *
22  **/
23 #ifndef DOXYGEN_SHOULD_SKIP_THIS
24 #define _GNU_SOURCE
25 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
26 #include "common.h"
27 #include "blend/solver.h"
28 #include "sopalin/sopalin_data.h"
29 #include "pastix_scores.h"
30 #if defined(PASTIX_WITH_CUDA)
31 #include "pastix_scuda.h"
32 #endif
33 #include "pastix_starpu.h"
34 #include "pastix_sstarpu.h"
35 #include "codelets.h"
36 
37 /**
38  * @brief Main structure for all tasks of blok_strsmsp type
39  */
40 struct cl_blok_strsmsp_args_s {
41  profile_data_t profile_data; /**< Profiling record; the CPU/GPU kernels store their flop count in its flops field. NOTE(review): presumably must stay the first member because the generic cl_profiling_callback() receives this structure through a void pointer - confirm */
42  sopalin_data_t *sopalin_data; /**< Solver data; gives access to the cost models and to the low-rank parameters of the solver matrix */
43  pastix_side_t side; /**< Side argument forwarded to the blok_strsmsp kernel */
44  pastix_uplo_t uplo; /**< Uplo argument forwarded to the blok_strsmsp kernel */
45  pastix_trans_t trans; /**< Trans argument forwarded to the blok_strsmsp kernel */
46  pastix_diag_t diag; /**< Diag argument forwarded to the blok_strsmsp kernel */
47  const SolverCblk *cblk; /**< Column block that holds the block handled by the task */
48  pastix_int_t blok_m; /**< Offset of the handled block from cblk->fblokptr */
49 };
50 
51 #if defined( PASTIX_STARPU_PROFILING )
52 /**
53  * @brief Functions to profile the codelet
54  *
55  * Two levels of profiling are available:
56  * 1) A generic one that returns the flops per worker
57  * 2) A more detailed one that generates logs of the performance for each kernel
58  */
59 starpu_profile_t blok_strsmsp_profile = {
60  .next = NULL,
61  .name = "blok_strsmsp"
62 };
63 
64 /**
65  * @brief Profiling registration function
66  */
67 void blok_strsmsp_profile_register( void ) __attribute__( ( constructor ) );
68 void
69 blok_strsmsp_profile_register( void )
70 {
71  profiling_register_cl( &blok_strsmsp_profile );
72 }
73 
74 #ifndef DOXYGEN_SHOULD_SKIP_THIS
75 #if defined(PASTIX_STARPU_PROFILING_LOG)
76 static void
77 cl_profiling_cb_blok_strsmsp( void *callback_arg )
78 {
79  cl_profiling_callback( callback_arg );
80 
81  struct starpu_task *task = starpu_task_get_current();
82  struct starpu_profiling_task_info *info = task->profiling_info;
83 
84  /* Quick return */
85  if ( info == NULL ) {
86  return;
87  }
88 
89  struct cl_blok_strsmsp_args_s *args = (struct cl_blok_strsmsp_args_s *) callback_arg;
90  pastix_fixdbl_t flops = args->profile_data.flops;
91  pastix_fixdbl_t duration = starpu_timing_timespec_delay_us( &info->start_time, &info->end_time );
92  pastix_fixdbl_t speed = flops / ( 1000.0 * duration );
93 
94  pastix_int_t M = blok_rownbr_ext( args->cblk->fblokptr + args->blok_m );
95  pastix_int_t N = cblk_colnbr( args->cblk );
96 
97  cl_profiling_log_register( task->name, "blok_strsmsp", M, N, 0, flops, speed );
98 }
99 #endif
100 
101 #if defined(PASTIX_STARPU_PROFILING_LOG)
102 static void (*blok_strsmsp_callback)(void*) = cl_profiling_cb_blok_strsmsp;
103 #else
104 static void (*blok_strsmsp_callback)(void*) = cl_profiling_callback;
105 #endif
106 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
107 
108 #endif /* defined( PASTIX_STARPU_PROFILING ) */
109 
110 /**
111  *******************************************************************************
112  *
113  * @brief Cost model function
114  *
115  * The user can switch from the pastix static model to an history based model
116  * computed automatically.
117  *
118  *******************************************************************************
119  *
120  * @param[in] task
121  * TODO
122  *
123  * @param[in] arch
124  * TODO
125  *
126  * @param[in] nimpl
127  * TODO
128  *
129  *******************************************************************************
130  *
131  * @retval TODO
132  *
133  *******************************************************************************/
134 static inline pastix_fixdbl_t
135 fct_blok_strsmsp_cost( struct starpu_task *task,
136  struct starpu_perfmodel_arch *arch,
137  unsigned nimpl )
138 {
139  struct cl_blok_strsmsp_args_s *args = (struct cl_blok_strsmsp_args_s *)(task->cl_arg);
140 
141  pastix_fixdbl_t cost = 0.;
142  pastix_fixdbl_t *coefs;
143  pastix_int_t M = blok_rownbr_ext( args->cblk->fblokptr + args->blok_m );
144  pastix_int_t N = cblk_colnbr( args->cblk );
145 
146  switch( arch->devices->type ) {
147  case STARPU_CPU_WORKER:
148  coefs = &(args->sopalin_data->cpu_models->coefficients[PastixFloat-2][PastixKernelTRSMBlok2d][0]);
149  break;
150  case STARPU_CUDA_WORKER:
151  coefs = &(args->sopalin_data->gpu_models->coefficients[PastixFloat-2][PastixKernelTRSMBlok2d][0]);
152  break;
153  default:
154  assert(0);
155  return 0.;
156  }
157 
158  /* Get cost in us */
159  cost = modelsGetCost2Param( coefs, M, N ) * 1e6;
160 
161  (void)nimpl;
162  return cost;
163 }
164 
165 #ifndef DOXYGEN_SHOULD_SKIP_THIS
166 static struct starpu_perfmodel starpu_blok_strsmsp_model = {
167 #if defined(PASTIX_STARPU_COST_PER_ARCH)
168  .type = STARPU_PER_ARCH,
169  .arch_cost_function = fct_blok_strsmsp_cost,
170 #else
171  .type = STARPU_HISTORY_BASED,
172 #endif
173  .symbol = "blok_strsmsp",
174 };
175 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
176 
177 #if !defined(PASTIX_STARPU_SIMULATION)
178 /**
179  *******************************************************************************
180  *
181  * @brief StarPU CPU implementation
182  *
183  *******************************************************************************
184  *
185  * @param[in] descr
186  * TODO
187  *
188  * @param[in] cl_arg
189  * TODO
190  *
191  *******************************************************************************/
192 static void
193 fct_blok_strsmsp_cpu( void *descr[], void *cl_arg )
194 {
195  struct cl_blok_strsmsp_args_s *args = (struct cl_blok_strsmsp_args_s *)cl_arg;
196  const void *A;
197  void *C;
198 
199  A = pastix_starpu_blok_get_ptr( descr[0] );
200  C = pastix_starpu_blok_get_ptr( descr[1] );
201 
202  assert( args->cblk->cblktype & CBLK_TASKS_2D );
203 
204  args->profile_data.flops = cpublok_strsmsp( args->side, args->uplo,
205  args->trans, args->diag,
206  args->cblk, args->blok_m, A, C,
207  &(args->sopalin_data->solvmtx->lowrank) );
208 }
209 
#if defined(PASTIX_WITH_CUDA)
/**
 *******************************************************************************
 *
 * @brief StarPU GPU implementation
 *
 * Only compiled when PASTIX_WITH_CUDA is defined. This comment is kept inside
 * the preprocessor guard on purpose: placed before the guard, it is picked up
 * by the next documented symbol whenever the guarded code is skipped.
 *
 * Calls gpublok_strsmsp() with the CUDA stream returned by
 * starpu_cuda_get_local_stream(), and stores the value it returns in the flops
 * field of the task profiling data.
 *
 *******************************************************************************
 *
 * @param[in] descr
 *          The StarPU data interfaces of the task: descr[0] provides the
 *          read-only block A and descr[1] the block C passed to the kernel
 *          as output.
 *
 * @param[in] cl_arg
 *          Pointer to the cl_blok_strsmsp_args_s structure of the task. The
 *          column block it references must have the CBLK_TASKS_2D flag set.
 *
 *******************************************************************************/
static void
fct_blok_strsmsp_gpu( void *descr[], void *cl_arg )
{
    struct cl_blok_strsmsp_args_s *args = (struct cl_blok_strsmsp_args_s *)cl_arg;
    const void                    *A;
    void                          *C;

    A = pastix_starpu_blok_get_ptr( descr[0] );
    C = pastix_starpu_blok_get_ptr( descr[1] );

    assert( args->cblk->cblktype & CBLK_TASKS_2D );

    args->profile_data.flops = gpublok_strsmsp( args->side, args->uplo,
                                                args->trans, args->diag,
                                                args->cblk, args->blok_m, A, C,
                                                &(args->sopalin_data->solvmtx->lowrank),
                                                starpu_cuda_get_local_stream() );
}
#endif /* defined(PASTIX_WITH_CUDA) */
233 #endif /* !defined(PASTIX_STARPU_SIMULATION) */
234 
235 #ifndef DOXYGEN_SHOULD_SKIP_THIS
236 CODELETS_GPU( blok_strsmsp, 2, STARPU_CUDA_ASYNC ); /* Macro defined outside this file; presumably it declares the cl_blok_strsmsp_any codelet used by starpu_task_blok_strsmsp(). NOTE(review): 2 looks like the number of data handles of the task (A and C) - confirm against the macro definition */
237 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
238 
239 /**
240  *******************************************************************************
241  *
242  * @brief TODO
243  *
244  *******************************************************************************
245  *
246  * @param[in] sopalin_data
247  * TODO
248  *
249  * @param[in] coef
250  * TODO
251  *
252  * @param[in] side
253  * TODO
254  *
255  * @param[in] uplo
256  * TODO
257  *
258  * @param[in] trans
259  * TODO
260  *
261  * @param[in] diag
262  * TODO
263  *
264  * @param[in] cblk
265  * TODO
266  *
267  * @param[in] blok
268  * TODO
269  *
270  * @param[in] prio
271  * TODO
272  *
273  *******************************************************************************/
274 void
275 starpu_task_blok_strsmsp( sopalin_data_t *sopalin_data,
276  pastix_coefside_t coef,
277  pastix_side_t side,
278  pastix_uplo_t uplo,
279  pastix_trans_t trans,
280  pastix_diag_t diag,
281  const SolverCblk *cblk,
282  SolverBlok *blok,
283  int prio )
284 {
285  struct cl_blok_strsmsp_args_s *cl_arg = NULL;
286  long long execute_where = cl_blok_strsmsp_any.where;
287  int need_exec = 1;
288 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
289  char *task_name;
290 #endif
291 
292  pastix_int_t blok_m = blok - cblk->fblokptr;
293 
294  /*
295  * Check if it needs to be submitted
296  */
297 #if defined(PASTIX_WITH_MPI)
298  {
299  int need_submit = 0;
300  if ( cblk->ownerid == sopalin_data->solvmtx->clustnum ) {
301  need_submit = 1;
302  }
303  else {
304  need_exec = 0;
305  }
306  if ( starpu_mpi_cached_receive( blok->handler[coef] ) ) {
307  need_submit = 1;
308  }
309  if ( !need_submit ) {
310  return;
311  }
312  }
313 #endif
314 
315  /*
316  * Create the arguments array
317  */
318  if ( need_exec ) {
319  cl_arg = malloc( sizeof( struct cl_blok_strsmsp_args_s ) );
320  cl_arg->sopalin_data = sopalin_data;
321 #if defined(PASTIX_STARPU_PROFILING)
322  cl_arg->profile_data.measures = blok_strsmsp_profile.measures;
323  cl_arg->profile_data.flops = NAN;
324 #endif
325  cl_arg->side = side;
326  cl_arg->uplo = uplo;
327  cl_arg->trans = trans;
328  cl_arg->diag = diag;
329  cl_arg->cblk = cblk;
330  cl_arg->blok_m = blok_m;
331 
332 #if defined(PASTIX_WITH_CUDA)
333  if ( (cblk->cblktype & CBLK_COMPRESSED) ) {
334  execute_where &= (~STARPU_CUDA);
335  }
336 #endif
337  }
338 
339 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
340  /* This actually generates a memory leak */
341  asprintf( &task_name, "%s( %ld, %ld, %ld )",
342  cl_blok_strsmsp_any.name,
343  (long)(cblk - sopalin_data->solvmtx->cblktab),
344  (long)(blok - sopalin_data->solvmtx->bloktab),
345  (long)coef );
346 #endif
347 
348  pastix_starpu_insert_task(
349  &cl_blok_strsmsp_any,
350  STARPU_CL_ARGS, cl_arg, sizeof( struct cl_blok_strsmsp_args_s ),
351  STARPU_EXECUTE_WHERE, execute_where,
352 #if defined(PASTIX_STARPU_PROFILING)
353  STARPU_CALLBACK_WITH_ARG_NFREE, blok_strsmsp_callback, cl_arg,
354 #endif
355  STARPU_R, cblk->fblokptr->handler[coef],
356  STARPU_RW, blok->handler[coef],
357 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
358  STARPU_NAME, task_name,
359 #endif
360 #if defined(PASTIX_STARPU_HETEROPRIO)
361  STARPU_PRIORITY, BucketTRSM2D,
362 #else
363  STARPU_PRIORITY, prio,
364 #endif
365  0);
366  (void)prio;
367 }
368 
369 /**
370  * @}
371  */
BEGIN_C_DECLS typedef int pastix_int_t
Definition: datatypes.h:51
double pastix_fixdbl_t
Definition: datatypes.h:65
@ PastixKernelTRSMBlok2d
Definition: kernels_enums.h:60
pastix_fixdbl_t cpublok_strsmsp(pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, pastix_int_t blok_m, const void *A, void *C, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
Definition: core_strsmsp.c:690
enum pastix_diag_e pastix_diag_t
Diagonal.
enum pastix_uplo_e pastix_uplo_t
Upper/Lower part.
enum pastix_side_e pastix_side_t
Side of the operation.
enum pastix_trans_e pastix_trans_t
Transposition.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
static pastix_fixdbl_t fct_blok_strsmsp_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
Cost model function.
static void fct_blok_strsmsp_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
void starpu_task_blok_strsmsp(sopalin_data_t *sopalin_data, pastix_coefside_t coef, pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, SolverBlok *blok, int prio)
StarPU GPU implementation.
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t blok_rownbr_ext(const SolverBlok *blok)
Compute the number of rows of a contiguous block in front of the same cblk.
Definition: solver.h:407
void * handler[2]
Definition: solver.h:142
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition: solver.h:329
SolverBlok * fblokptr
Definition: solver.h:168
int8_t cblktype
Definition: solver.h:164
int ownerid
Definition: solver.h:181
Solver block structure.
Definition: solver.h:141
Solver column block structure.
Definition: solver.h:161