23 #ifndef DOXYGEN_SHOULD_SKIP_THIS
28 #include "sopalin/sopalin_data.h"
30 #if defined(PASTIX_WITH_CUDA)
40 struct cl_blok_sgemmsp_args_s {
42 sopalin_data_t *sopalin_data;
51 #if defined( PASTIX_STARPU_PROFILING )
59 starpu_profile_t blok_sgemmsp_profile = {
61 .name =
"blok_sgemmsp"
67 void blok_sgemmsp_profile_register(
void ) __attribute__( ( constructor ) );
69 blok_sgemmsp_profile_register(
void )
71 profiling_register_cl( &blok_sgemmsp_profile );
74 #ifndef DOXYGEN_SHOULD_SKIP_THIS
75 #if defined(PASTIX_STARPU_PROFILING_LOG)
77 cl_profiling_cb_blok_sgemmsp(
void *callback_arg )
79 cl_profiling_callback( callback_arg );
81 struct starpu_task *task = starpu_task_get_current();
82 struct starpu_profiling_task_info *info = task->profiling_info;
89 struct cl_blok_sgemmsp_args_s *args = (
struct cl_blok_sgemmsp_args_s *) callback_arg;
91 pastix_fixdbl_t duration = starpu_timing_timespec_delay_us( &info->start_time, &info->end_time );
98 cl_profiling_log_register( task->name,
"blok_sgemmsp", M, N, K, flops, speed );
102 #if defined(PASTIX_STARPU_PROFILING_LOG)
103 static void (*blok_sgemmsp_callback)(
void*) = cl_profiling_cb_blok_sgemmsp;
105 static void (*blok_sgemmsp_callback)(
void*) = cl_profiling_callback;
137 struct starpu_perfmodel_arch *arch,
140 struct cl_blok_sgemmsp_args_s *args = (
struct cl_blok_sgemmsp_args_s *)(task->cl_arg);
148 switch( arch->devices->type ) {
149 case STARPU_CPU_WORKER:
152 case STARPU_CUDA_WORKER:
161 cost = modelsGetCost3Param( coefs, M, N, K ) * 1e6;
168 #ifndef DOXYGEN_SHOULD_SKIP_THIS
169 static struct starpu_perfmodel starpu_blok_sgemmsp_model = {
170 #if defined(PASTIX_STARPU_COST_PER_ARCH)
171 .type = STARPU_PER_ARCH,
174 .type = STARPU_HISTORY_BASED,
176 .symbol =
"blok_sgemmsp",
180 #if !defined(PASTIX_STARPU_SIMULATION)
198 struct cl_blok_sgemmsp_args_s *args = (
struct cl_blok_sgemmsp_args_s *)cl_arg;
203 A = pastix_starpu_blok_get_ptr( descr[0] );
204 B = pastix_starpu_blok_get_ptr( descr[1] );
205 C = pastix_starpu_blok_get_ptr( descr[2] );
207 assert( args->cblk->cblktype & CBLK_TASKS_2D );
208 assert( args->fcblk->cblktype & CBLK_TASKS_2D );
211 args->cblk, args->fcblk,
212 args->blok_mk, args->blok_nk, args->blok_mn,
214 &(args->sopalin_data->solvmtx->lowrank) );
220 #if defined(PASTIX_WITH_CUDA)
222 fct_blok_sgemmsp_gpu(
void *descr[],
void *cl_arg )
224 struct cl_blok_sgemmsp_args_s *args = (
struct cl_blok_sgemmsp_args_s *)cl_arg;
229 A = pastix_starpu_blok_get_ptr( descr[0] );
230 B = pastix_starpu_blok_get_ptr( descr[1] );
231 C = pastix_starpu_blok_get_ptr( descr[2] );
233 assert( args->cblk->cblktype & CBLK_TASKS_2D );
234 assert( args->fcblk->cblktype & CBLK_TASKS_2D );
236 args->profile_data.flops = gpublok_sgemmsp( args->trans,
237 args->cblk, args->fcblk,
238 args->blok_mk, args->blok_nk, args->blok_mn,
240 &(args->sopalin_data->solvmtx->lowrank),
241 starpu_cuda_get_local_stream() );
246 #ifndef DOXYGEN_SHOULD_SKIP_THIS
247 CODELETS_GPU( blok_sgemmsp, 3, STARPU_CUDA_ASYNC );
296 struct cl_blok_sgemmsp_args_s *cl_arg = NULL;
297 long long execute_where = cl_blok_sgemmsp_any.where;
299 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
310 assert( blok_nk <= blok_mk );
319 while( (blokC < fcblk[1].fblokptr-1) &&
320 (blokC[0].fcblknm == blokC[1].fcblknm) &&
321 (blokC[0].lcblknm == blokC[1].lcblknm) )
328 while( !((blokA->
frownum >= frownum) &&
329 (blokA->
lrownum <= lrownum)) );
336 assert( blokA[-1].fcblknm != blokA[0].fcblknm );
337 assert( blokB[-1].fcblknm != blokB[0].fcblknm );
338 assert( (blok_mn == 0) || (blokC[-1].fcblknm != blokC[0].fcblknm) );
343 #if defined(PASTIX_WITH_MPI)
346 if ( cblk->
ownerid == sopalin_data->solvmtx->clustnum ) {
349 if ( (fcblk->
cblktype & CBLK_FANIN) ||
350 (fcblk->
ownerid == sopalin_data->solvmtx->clustnum) ) {
356 if ( starpu_mpi_cached_receive( blokC->
handler[sideA] ) ) {
359 if ( !need_submit ) {
369 cl_arg = malloc(
sizeof(
struct cl_blok_sgemmsp_args_s ) );
370 cl_arg->sopalin_data = sopalin_data;
371 #if defined(PASTIX_STARPU_PROFILING)
372 cl_arg->profile_data.measures = blok_sgemmsp_profile.measures;
373 cl_arg->profile_data.flops = NAN;
375 cl_arg->trans = trans;
377 cl_arg->fcblk = fcblk;
378 cl_arg->blok_mk = blok_mk;
379 cl_arg->blok_nk = blok_nk;
380 cl_arg->blok_mn = blok_mn;
382 #if defined(PASTIX_WITH_CUDA)
383 if ( (cblk->
cblktype & CBLK_COMPRESSED) ||
384 (fcblk->
cblktype & CBLK_COMPRESSED) )
387 execute_where &= (~STARPU_CUDA);
392 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
394 asprintf( &task_name,
"%s( %ld, %ld, %ld, %ld )",
395 cl_blok_sgemmsp_any.name,
396 (
long)(blokA - sopalin_data->solvmtx->bloktab),
397 (
long)(blokB - sopalin_data->solvmtx->bloktab),
398 (
long)(blokC - sopalin_data->solvmtx->bloktab),
402 pastix_starpu_insert_task(
403 &cl_blok_sgemmsp_any,
404 STARPU_CL_ARGS, cl_arg,
sizeof(
struct cl_blok_sgemmsp_args_s ),
405 STARPU_EXECUTE_WHERE, execute_where,
406 #
if defined(PASTIX_STARPU_PROFILING)
407 STARPU_CALLBACK_WITH_ARG_NFREE, blok_sgemmsp_callback, cl_arg,
409 STARPU_R, blokA->
handler[sideA],
410 STARPU_R, blokB->
handler[sideB],
411 STARPU_RW, blokC->
handler[sideA],
412 #
if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
413 STARPU_NAME, task_name,
415 #
if defined(PASTIX_STARPU_HETEROPRIO)
416 STARPU_PRIORITY, BucketGEMM2D,
418 STARPU_PRIORITY, prio,
BEGIN_C_DECLS typedef int pastix_int_t
@ PastixKernelGEMMBlok2d2d
pastix_fixdbl_t cpublok_sgemmsp(pastix_trans_t transB, const SolverCblk *cblk, SolverCblk *fcblk, pastix_int_t blok_mk, pastix_int_t blok_nk, pastix_int_t blok_mn, const void *A, const void *B, void *C, const pastix_lr_t *lowrank)
Compute the CPU gemm associated to a couple of off-diagonal blocks.
enum pastix_trans_e pastix_trans_t
Transposition.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
static void fct_blok_sgemmsp_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
static pastix_fixdbl_t fct_blok_sgemmsp_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
Cost model function.
void starpu_task_blok_sgemmsp(sopalin_data_t *sopalin_data, pastix_coefside_t sideA, pastix_coefside_t sideB, pastix_trans_t trans, SolverCblk *cblk, SolverCblk *fcblk, const SolverBlok *blokA, const SolverBlok *blokB, int prio)
Insert the StarPU task that runs the blok_sgemmsp codelet.
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t blok_rownbr_ext(const SolverBlok *blok)
Compute the number of rows of a contiguous block in front of the same cblk.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Solver column block structure.