PaStiX Handbook 6.4.0
Loading...
Searching...
No Matches
codelet_cblk_sgemmsp.c
Go to the documentation of this file.
1/**
2 *
3 * @file codelet_cblk_sgemmsp.c
4 *
5 * StarPU codelets for blas-like functions
6 *
7 * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8 * Univ. Bordeaux. All rights reserved.
9 *
10 * @version 6.4.0
11 * @author Mathieu Faverge
12 * @author Pierre Ramet
13 * @author Ian Masliah
14 * @author Tom Moenne-Loccoz
15 * @author Alycia Lisito
16 * @date 2024-07-05
17 *
18 * @generated from /builds/2mk6rsew/0/solverstack/pastix/sopalin/starpu/codelet_cblk_zgemmsp.c, normal z -> s, Tue Feb 25 14:35:23 2025
19 *
20 * @addtogroup pastix_starpu
21 * @{
22 *
23 **/
24#ifndef DOXYGEN_SHOULD_SKIP_THIS
25#define _GNU_SOURCE
26#endif /* DOXYGEN_SHOULD_SKIP_THIS */
27#include "common.h"
28#include "blend/solver.h"
29#include "sopalin/sopalin_data.h"
30#include "pastix_scores.h"
31#if defined(PASTIX_WITH_CUDA)
32#include "pastix_scuda.h"
33#endif
34#include "pastix_starpu.h"
35#include "pastix_sstarpu.h"
36#include "codelets.h"
37
38/**
39 * @brief Main structure for all tasks of cblk_sgemmsp type
40 */
41struct cl_cblk_sgemmsp_args_s {
42 profile_data_t profile_data;
43 sopalin_data_t *sopalin_data;
45 pastix_trans_t trans;
46 const SolverCblk *cblk;
47 const SolverBlok *blok;
48 SolverCblk *fcblk;
49};
50
51#if defined( PASTIX_STARPU_PROFILING )
52/**
53 * @brief Functions to profile the codelet
54 *
55 * Two levels of profiling are available:
56 * 1) A generic one that returns the flops per worker
57 * 2) A more detailed one that generate logs of the performance for each kernel
58 */
59starpu_profile_t cblk_sgemmsp_profile = {
60 .next = NULL,
61 .name = "cblk_sgemmsp"
62};
63
64/**
65 * @brief Profiling registration function
66 */
67void cblk_sgemmsp_profile_register( void ) __attribute__( ( constructor ) );
68void
69cblk_sgemmsp_profile_register( void )
70{
71 profiling_register_cl( &cblk_sgemmsp_profile );
72}
73
74#ifndef DOXYGEN_SHOULD_SKIP_THIS
75#if defined(PASTIX_STARPU_PROFILING_LOG)
76static void
77cl_profiling_cb_cblk_sgemmsp( void *callback_arg )
78{
79 cl_profiling_callback( callback_arg );
80
81 struct starpu_task *task = starpu_task_get_current();
82 struct starpu_profiling_task_info *info = task->profiling_info;
83
84 /* Quick return */
85 if ( info == NULL ) {
86 return;
87 }
88
89 struct cl_cblk_sgemmsp_args_s *args = (struct cl_cblk_sgemmsp_args_s *) callback_arg;
90 pastix_fixdbl_t flops = args->profile_data.flops;
91 pastix_fixdbl_t duration = starpu_timing_timespec_delay_us( &info->start_time, &info->end_time );
92 pastix_fixdbl_t speed = flops / ( 1000.0 * duration );
93
94 pastix_int_t M = args->cblk->stride;
95 pastix_int_t N = blok_rownbr( args->blok );
96 pastix_int_t K = cblk_colnbr( args->cblk );
97
98 M -= (args->cblk->cblktype & CBLK_LAYOUT_2D) ? args->blok->coefind / K : args->blok->coefind;
99 M -= (args->sideA == PastixUCoef) ? blok_rownbr( args->blok ) : 0;
100
101 cl_profiling_log_register( task->name, "cblk_sgemmsp", M, N, K, flops, speed );
102}
103#endif
104
105#if defined(PASTIX_STARPU_PROFILING_LOG)
106static void (*cblk_sgemmsp_callback)(void*) = cl_profiling_cb_cblk_sgemmsp;
107#else
108static void (*cblk_sgemmsp_callback)(void*) = cl_profiling_callback;
109#endif
110#endif /* DOXYGEN_SHOULD_SKIP_THIS */
111
112#endif /* defined( PASTIX_STARPU_PROFILING ) */
113
114#if defined(PASTIX_STARPU_COST_PER_ARCH)
115/**
116 *******************************************************************************
117 *
118 * @brief Cost model function
119 *
120 * The user can switch from the pastix static model to an history based model
121 * computed automatically.
122 *
123 *******************************************************************************
124 *
125 * @param[in] task
126 * TODO
127 *
128 * @param[in] arch
129 * TODO
130 *
131 * @param[in] nimpl
132 * TODO
133 *
134 *******************************************************************************
135 *
136 * @retval TODO
137 *
138 *******************************************************************************/
139static inline pastix_fixdbl_t
140fct_cblk_sgemmsp_cost( struct starpu_task *task,
141 struct starpu_perfmodel_arch *arch,
142 unsigned nimpl )
143{
144 struct cl_cblk_sgemmsp_args_s *args = (struct cl_cblk_sgemmsp_args_s *)(task->cl_arg);
145
147 pastix_fixdbl_t *coefs;
148 pastix_int_t M = args->cblk->stride;
149 pastix_int_t N = blok_rownbr( args->blok );
150 pastix_int_t K = cblk_colnbr( args->cblk );
151
152 M -= (args->cblk->cblktype & CBLK_LAYOUT_2D) ? args->blok->coefind / K : args->blok->coefind;
153 M -= (args->sideA == PastixUCoef) ? blok_rownbr( args->blok ) : 0;
154
155 switch( arch->devices->type ) {
156 case STARPU_CPU_WORKER:
157 coefs = &(args->sopalin_data->cpu_models->coefficients[PastixFloat-2][PastixKernelGEMMCblk2d2d][0]);
158 break;
159 case STARPU_CUDA_WORKER:
160 coefs = &(args->sopalin_data->gpu_models->coefficients[PastixFloat-2][PastixKernelGEMMCblk2d2d][0]);
161 break;
162 default:
163 assert(0);
164 return 0.;
165 }
166
167 /* Get cost in us */
168 cost = modelsGetCost3Param( coefs, M, N, K ) * 1e6;
169
170 (void)nimpl;
171 return cost;
172}
173#endif
174
175#ifndef DOXYGEN_SHOULD_SKIP_THIS
176static struct starpu_perfmodel starpu_cblk_sgemmsp_model = {
177#if defined(PASTIX_STARPU_COST_PER_ARCH)
178 .type = STARPU_PER_ARCH,
179 .arch_cost_function = fct_cblk_sgemmsp_cost,
180#else
181 .type = STARPU_HISTORY_BASED,
182#endif
183 .symbol = "cblk_sgemmsp",
184};
185#endif /* DOXYGEN_SHOULD_SKIP_THIS */
186
187#if !defined(PASTIX_STARPU_SIMULATION)
188/**
189 *******************************************************************************
190 *
191 * @brief StarPU CPU implementation
192 *
193 *******************************************************************************
194 *
195 * @param[in] descr
196 * TODO
197 *
198 * @param[in] cl_arg
199 * TODO
200 *
201 *******************************************************************************/
202static void
203fct_cblk_sgemmsp_cpu( void *descr[], void *cl_arg )
204{
205 struct cl_cblk_sgemmsp_args_s *args = (struct cl_cblk_sgemmsp_args_s *)cl_arg;
206 const void *A;
207 const void *B;
208 void *C;
209
210 A = pastix_starpu_cblk_get_ptr( descr[0] );
211 B = pastix_starpu_cblk_get_ptr( descr[1] );
212 C = pastix_starpu_cblk_get_ptr( descr[2] );
213
214 /* Check layout due to NULL workspace for now */
215 assert( args->cblk->cblktype & CBLK_LAYOUT_2D );
216 assert( args->fcblk->cblktype & CBLK_LAYOUT_2D );
217
218 args->profile_data.flops = cpucblk_sgemmsp( args->sideA, args->trans,
219 args->cblk, args->blok, args->fcblk,
220 A, B, C, NULL, 0,
221 &( args->sopalin_data->solvmtx->lowrank ) );
222}
223
#if defined(PASTIX_WITH_CUDA)
/**
 * @brief StarPU GPU implementation
 *
 * Retrieves the A, B and C pointers from the StarPU buffers, calls
 * gpucblk_sgemmsp() on the local CUDA stream of the worker, and stores the
 * returned flops in the profiling data of the task.
 *
 * @param[in] descr
 *          The StarPU buffers of the task: descr[0] is A, descr[1] is B and
 *          descr[2] is C.
 *
 * @param[in,out] cl_arg
 *          Pointer to the struct cl_cblk_sgemmsp_args_s of the task. Its
 *          profile_data.flops field is updated.
 */
static void
fct_cblk_sgemmsp_gpu( void *descr[], void *cl_arg )
{
    struct cl_cblk_sgemmsp_args_s *args = (struct cl_cblk_sgemmsp_args_s *)cl_arg;
    const void                    *A    = pastix_starpu_cblk_get_ptr( descr[0] );
    const void                    *B    = pastix_starpu_cblk_get_ptr( descr[1] );
    void                          *C    = pastix_starpu_cblk_get_ptr( descr[2] );
    pastix_fixdbl_t                flops;

    flops = gpucblk_sgemmsp( args->sideA, args->trans,
                             args->cblk, args->blok, args->fcblk,
                             A, B, C,
                             &( args->sopalin_data->solvmtx->lowrank ),
                             starpu_cuda_get_local_stream() );

    args->profile_data.flops = flops;
}
#endif /* defined(PASTIX_WITH_CUDA) */
247#endif /* !defined(PASTIX_STARPU_SIMULATION) */
248
249#ifndef DOXYGEN_SHOULD_SKIP_THIS
250CODELETS_GPU( cblk_sgemmsp, 3, STARPU_CUDA_ASYNC );
251#endif /* DOXYGEN_SHOULD_SKIP_THIS */
252
253/**
254 *******************************************************************************
255 *
256 * @brief TODO
257 *
258 *******************************************************************************
259 *
260 * @param[in] sopalin_data
261 * TODO
262 *
263 * @param[in] sideA
264 * TODO
265 *
266 * @param[in] sideB
267 * TODO
268 *
269 * @param[in] trans
270 * TODO
271 *
272 * @param[in] cblk
273 * TODO
274 *
275 * @param[in] blok
276 *
277 * @param[in] fcblk
278 * TODO
279 *
280 * @param[in] prio
281 * TODO
282 *
283 *******************************************************************************/
284void
285starpu_task_cblk_sgemmsp( sopalin_data_t *sopalin_data,
286 pastix_coefside_t sideA,
287 pastix_coefside_t sideB,
288 pastix_trans_t trans,
289 const SolverCblk *cblk,
290 const SolverBlok *blok,
291 SolverCblk *fcblk,
292 int prio )
293{
294 struct cl_cblk_sgemmsp_args_s *cl_arg = NULL;
295 long long execute_where = cl_cblk_sgemmsp_any.where;
296 int need_exec = 1;
297#if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
298 char *task_name;
299#endif
300
301 /*
302 * Check if it needs to be submitted
303 */
304#if defined(PASTIX_WITH_MPI)
305 {
306 int need_submit = 0;
307 if ( cblk->ownerid == sopalin_data->solvmtx->clustnum ) {
308 need_submit = 1;
309 }
310 if ( (fcblk->cblktype & CBLK_FANIN) ||
311 (fcblk->ownerid == sopalin_data->solvmtx->clustnum) )
312 {
313 need_submit = 1;
314 }
315 else {
316 need_exec = 0;
317 }
318 if ( starpu_mpi_cached_receive( fcblk->handler[sideA] ) ) {
319 need_submit = 1;
320 }
321 if ( !need_submit ) {
322 return;
323 }
324 }
325#endif
326
327 /*
328 * Create the arguments array
329 */
330 if ( need_exec ) {
331 cl_arg = malloc( sizeof( struct cl_cblk_sgemmsp_args_s ) );
332 cl_arg->sopalin_data = sopalin_data;
333#if defined(PASTIX_STARPU_PROFILING)
334 cl_arg->profile_data.measures = cblk_sgemmsp_profile.measures;
335 cl_arg->profile_data.flops = NAN;
336#endif
337 cl_arg->sideA = sideA;
338 cl_arg->trans = trans;
339 cl_arg->cblk = cblk;
340 cl_arg->blok = blok;
341 cl_arg->fcblk = fcblk;
342
343#if defined(PASTIX_WITH_CUDA)
344 if ( (cblk->cblktype & CBLK_COMPRESSED) ||
345 (fcblk->cblktype & CBLK_COMPRESSED) )
346 {
347 /* Disable CUDA */
348 execute_where &= (~STARPU_CUDA);
349 }
350#endif
351 }
352
353#if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
354 /* This actually generates a memory leak */
355 asprintf( &task_name, "%s( %ld, %ld, %ld )",
356 cl_cblk_sgemmsp_any.name,
357 (long)(cblk - sopalin_data->solvmtx->cblktab),
358 (long)(blok - sopalin_data->solvmtx->bloktab),
359 (long)sideA );
360#endif
361
362 pastix_starpu_insert_task(
363 &cl_cblk_sgemmsp_any,
364 STARPU_CL_ARGS, cl_arg, sizeof( struct cl_cblk_sgemmsp_args_s ),
365 STARPU_EXECUTE_WHERE, execute_where,
366#if defined(PASTIX_STARPU_PROFILING)
367 STARPU_CALLBACK_WITH_ARG_NFREE, cblk_sgemmsp_callback, cl_arg,
368#endif
369 STARPU_R, cblk->handler[sideA],
370 STARPU_R, cblk->handler[sideB],
371 STARPU_RW, fcblk->handler[sideA],
372#if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
373 STARPU_NAME, task_name,
374#endif
375#if defined(PASTIX_STARPU_HETEROPRIO)
376 STARPU_PRIORITY, BucketGEMM1D,
377#else
378 STARPU_PRIORITY, prio,
379#endif
380 0);
381 (void)prio;
382}
383
384/**
385 * @}
386 */
BEGIN_C_DECLS typedef int pastix_int_t
Definition datatypes.h:51
double pastix_fixdbl_t
Definition datatypes.h:65
@ PastixKernelGEMMCblk2d2d
pastix_fixdbl_t cpucblk_sgemmsp(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const void *A, const void *B, void *C, float *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
enum pastix_trans_e pastix_trans_t
Transposition.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
@ PastixUCoef
Definition api.h:479
static void fct_cblk_sgemmsp_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
void starpu_task_cblk_sgemmsp(sopalin_data_t *sopalin_data, pastix_coefside_t sideA, pastix_coefside_t sideB, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, int prio)
Submit a cblk_sgemmsp task to StarPU.
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
Definition solver.h:395
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Definition solver.h:329
void * handler[2]
Definition solver.h:179
int8_t cblktype
Definition solver.h:164
Solver block structure.
Definition solver.h:141
Solver column block structure.
Definition solver.h:161