24 #ifndef DOXYGEN_SHOULD_SKIP_THIS
29 #include "sopalin/sopalin_data.h"
31 #if defined(PASTIX_WITH_CUDA)
41 struct cl_cblk_dgemmsp_args_s {
43 sopalin_data_t *sopalin_data;
51 #if defined( PASTIX_STARPU_PROFILING )
59 starpu_profile_t cblk_dgemmsp_profile = {
61 .name =
"cblk_dgemmsp"
67 void cblk_dgemmsp_profile_register(
void ) __attribute__( ( constructor ) );
69 cblk_dgemmsp_profile_register(
void )
71 profiling_register_cl( &cblk_dgemmsp_profile );
74 #ifndef DOXYGEN_SHOULD_SKIP_THIS
75 #if defined(PASTIX_STARPU_PROFILING_LOG)
77 cl_profiling_cb_cblk_dgemmsp(
void *callback_arg )
79 cl_profiling_callback( callback_arg );
81 struct starpu_task *task = starpu_task_get_current();
82 struct starpu_profiling_task_info *info = task->profiling_info;
89 struct cl_cblk_dgemmsp_args_s *args = (
struct cl_cblk_dgemmsp_args_s *) callback_arg;
91 pastix_fixdbl_t duration = starpu_timing_timespec_delay_us( &info->start_time, &info->end_time );
98 M -= (args->cblk->cblktype & CBLK_LAYOUT_2D) ? args->blok->coefind / K : args->blok->coefind;
101 cl_profiling_log_register( task->name,
"cblk_dgemmsp", M, N, K, flops, speed );
105 #if defined(PASTIX_STARPU_PROFILING_LOG)
106 static void (*cblk_dgemmsp_callback)(
void*) = cl_profiling_cb_cblk_dgemmsp;
108 static void (*cblk_dgemmsp_callback)(
void*) = cl_profiling_callback;
140 struct starpu_perfmodel_arch *arch,
143 struct cl_cblk_dgemmsp_args_s *args = (
struct cl_cblk_dgemmsp_args_s *)(task->cl_arg);
151 M -= (args->cblk->cblktype & CBLK_LAYOUT_2D) ? args->blok->coefind / K : args->blok->coefind;
154 switch( arch->devices->type ) {
155 case STARPU_CPU_WORKER:
158 case STARPU_CUDA_WORKER:
167 cost = modelsGetCost3Param( coefs, M, N, K ) * 1e6;
173 #ifndef DOXYGEN_SHOULD_SKIP_THIS
174 static struct starpu_perfmodel starpu_cblk_dgemmsp_model = {
175 #if defined(PASTIX_STARPU_COST_PER_ARCH)
176 .type = STARPU_PER_ARCH,
179 .type = STARPU_HISTORY_BASED,
181 .symbol =
"cblk_dgemmsp",
185 #if !defined(PASTIX_STARPU_SIMULATION)
203 struct cl_cblk_dgemmsp_args_s *args = (
struct cl_cblk_dgemmsp_args_s *)cl_arg;
208 A = pastix_starpu_cblk_get_ptr( descr[0] );
209 B = pastix_starpu_cblk_get_ptr( descr[1] );
210 C = pastix_starpu_cblk_get_ptr( descr[2] );
213 assert( args->cblk->cblktype & CBLK_LAYOUT_2D );
214 assert( args->fcblk->cblktype & CBLK_LAYOUT_2D );
217 args->cblk, args->blok, args->fcblk,
219 &( args->sopalin_data->solvmtx->lowrank ) );
225 #if defined(PASTIX_WITH_CUDA)
227 fct_cblk_dgemmsp_gpu(
void *descr[],
void *cl_arg )
229 struct cl_cblk_dgemmsp_args_s *args = (
struct cl_cblk_dgemmsp_args_s *)cl_arg;
234 A = pastix_starpu_cblk_get_ptr( descr[0] );
235 B = pastix_starpu_cblk_get_ptr( descr[1] );
236 C = pastix_starpu_cblk_get_ptr( descr[2] );
238 args->profile_data.flops = gpucblk_dgemmsp( args->sideA, args->trans,
239 args->cblk, args->blok, args->fcblk,
241 &( args->sopalin_data->solvmtx->lowrank ),
242 starpu_cuda_get_local_stream() );
247 #ifndef DOXYGEN_SHOULD_SKIP_THIS
248 CODELETS_GPU( cblk_dgemmsp, 3, STARPU_CUDA_ASYNC );
292 struct cl_cblk_dgemmsp_args_s *cl_arg = NULL;
293 long long execute_where = cl_cblk_dgemmsp_any.where;
295 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
302 #if defined(PASTIX_WITH_MPI)
305 if ( cblk->
ownerid == sopalin_data->solvmtx->clustnum ) {
308 if ( (fcblk->
cblktype & CBLK_FANIN) ||
309 (fcblk->
ownerid == sopalin_data->solvmtx->clustnum) )
316 if ( starpu_mpi_cached_receive( fcblk->
handler[sideA] ) ) {
319 if ( !need_submit ) {
329 cl_arg = malloc(
sizeof(
struct cl_cblk_dgemmsp_args_s ) );
330 cl_arg->sopalin_data = sopalin_data;
331 #if defined(PASTIX_STARPU_PROFILING)
332 cl_arg->profile_data.measures = cblk_dgemmsp_profile.measures;
333 cl_arg->profile_data.flops = NAN;
335 cl_arg->sideA = sideA;
336 cl_arg->trans = trans;
339 cl_arg->fcblk = fcblk;
341 #if defined(PASTIX_WITH_CUDA)
342 if ( (cblk->
cblktype & CBLK_COMPRESSED) ||
343 (fcblk->
cblktype & CBLK_COMPRESSED) )
346 execute_where &= (~STARPU_CUDA);
351 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
353 asprintf( &task_name,
"%s( %ld, %ld, %ld )",
354 cl_cblk_dgemmsp_any.name,
355 (
long)(cblk - sopalin_data->solvmtx->cblktab),
356 (
long)(blok - sopalin_data->solvmtx->bloktab),
360 pastix_starpu_insert_task(
361 &cl_cblk_dgemmsp_any,
362 STARPU_CL_ARGS, cl_arg,
sizeof(
struct cl_cblk_dgemmsp_args_s ),
363 STARPU_EXECUTE_WHERE, execute_where,
364 #
if defined(PASTIX_STARPU_PROFILING)
365 STARPU_CALLBACK_WITH_ARG_NFREE, cblk_dgemmsp_callback, cl_arg,
367 STARPU_R, cblk->
handler[sideA],
368 STARPU_R, cblk->
handler[sideB],
369 STARPU_RW, fcblk->
handler[sideA],
370 #
if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
371 STARPU_NAME, task_name,
373 #
if defined(PASTIX_STARPU_HETEROPRIO)
374 STARPU_PRIORITY, BucketGEMM1D,
376 STARPU_PRIORITY, prio,
BEGIN_C_DECLS typedef int pastix_int_t
@ PastixKernelGEMMCblk2d2d
pastix_fixdbl_t cpucblk_dgemmsp(pastix_coefside_t sideA, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, const void *A, const void *B, void *C, double *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Compute the updates associated to one off-diagonal block.
enum pastix_trans_e pastix_trans_t
Transposition.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
static void fct_cblk_dgemmsp_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
static pastix_fixdbl_t fct_cblk_dgemmsp_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
Cost model function.
void starpu_task_cblk_dgemmsp(sopalin_data_t *sopalin_data, pastix_coefside_t sideA, pastix_coefside_t sideB, pastix_trans_t trans, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, int prio)
Insert the cblk_dgemmsp task into StarPU.
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Solver column block structure.