21#ifndef DOXYGEN_SHOULD_SKIP_THIS
26#include "sopalin/sopalin_data.h"
28#if defined(PASTIX_WITH_CUDA)
38struct cl_blok_cadd_args_s {
40 sopalin_data_t *sopalin_data;
48#if defined( PASTIX_STARPU_PROFILING )
56starpu_profile_t blok_cadd_profile = {
64void blok_cadd_profile_register(
void ) __attribute__( ( constructor ) );
66blok_cadd_profile_register(
void )
68 profiling_register_cl( &blok_cadd_profile );
71#ifndef DOXYGEN_SHOULD_SKIP_THIS
72#if defined(PASTIX_STARPU_PROFILING_LOG)
74cl_profiling_cb_blok_cadd(
void *callback_arg )
76 cl_profiling_callback( callback_arg );
78 struct starpu_task *task = starpu_task_get_current();
79 struct starpu_profiling_task_info *info = task->profiling_info;
86 struct cl_blok_cadd_args_s *args = (
struct cl_blok_cadd_args_s *) callback_arg;
88 pastix_fixdbl_t duration = starpu_timing_timespec_delay_us( &info->start_time, &info->end_time );
91 const SolverBlok *blok = args->cblk->fblokptr + args->blok_m;
95 cl_profiling_log_register( task->name,
"blok_cadd", M, N, 0, flops, speed );
99#if defined(PASTIX_STARPU_PROFILING_LOG)
100static void (*blok_cadd_callback)(
void*) = cl_profiling_cb_blok_cadd;
102static void (*blok_cadd_callback)(
void*) = cl_profiling_callback;
109#if defined(PASTIX_STARPU_COST_PER_ARCH)
135fct_blok_cadd_cost(
struct starpu_task *task,
136 struct starpu_perfmodel_arch *arch,
139 struct cl_blok_cadd_args_s *args = (
struct cl_blok_cadd_args_s *)(task->cl_arg);
143 const SolverBlok *blok = args->cblk->fblokptr + args->blok_m;
147 switch( arch->devices->type ) {
148 case STARPU_CPU_WORKER:
151 case STARPU_CUDA_WORKER:
160 cost = modelsGetCost2Param( coefs, M, N ) * 1e6;
167#ifndef DOXYGEN_SHOULD_SKIP_THIS
168static struct starpu_perfmodel starpu_blok_cadd_model = {
169#if defined(PASTIX_STARPU_COST_PER_ARCH)
170 .type = STARPU_PER_ARCH,
171 .arch_cost_function = fct_blok_cadd_cost,
173 .type = STARPU_HISTORY_BASED,
175 .symbol =
"blok_cadd",
179#if !defined(PASTIX_STARPU_SIMULATION)
197 struct cl_blok_cadd_args_s *args = (
struct cl_blok_cadd_args_s *)cl_arg;
201 A = pastix_starpu_blok_get_ptr( descr[0] );
202 B = pastix_starpu_blok_get_ptr( descr[1] );
204 assert( args->cblk->cblktype & CBLK_LAYOUT_2D );
205 assert( args->fcblk->cblktype & CBLK_LAYOUT_2D );
207 args->profile_data.flops =
cpublok_cadd( 1., args->cblk, args->fcblk, args->blok_m, args->fblok_m,
209 &( args->sopalin_data->solvmtx->lowrank ) );
212#if defined(PASTIX_WITH_CUDA) && 0
228fct_blok_cadd_gpu(
void *descr[],
void *cl_arg )
230 struct cl_template_args_s *args = (
struct cl_template_args_s *)cl_arg;
234 A = pastix_starpu_blok_get_ptr( descr[0] );
235 B = pastix_starpu_blok_get_ptr( descr[1] );
237 assert( args->cblk->cblktype & CBLK_TASKS_2D );
238 assert( args->fcblk->cblktype & CBLK_TASKS_2D );
240 args->profile_data.flops = gpublok_cadd( 1., args->cblk, args->fcblk, args->blok_m, args->fblok_m,
242 &( args->sopalin_data->solvmtx->lowrank ),
243 starpu_cuda_get_local_stream() );
249#ifndef DOXYGEN_SHOULD_SKIP_THIS
250CODELETS_CPU( blok_cadd, 2 );
296 struct cl_blok_cadd_args_s *cl_arg = NULL;
297#if defined(PASTIX_DEBUG_STARPU)
306 cl_arg = malloc(
sizeof(
struct cl_blok_cadd_args_s) );
307 cl_arg->sopalin_data = sopalin_data;
308#if defined(PASTIX_STARPU_PROFILING)
309 cl_arg->profile_data.measures = blok_cadd_profile.measures;
310 cl_arg->profile_data.flops = NAN;
314 cl_arg->fcblk = fcblk;
315 cl_arg->blok_m = blok - cblk->
fblokptr;
316 cl_arg->fblok_m = fblok - fcblk->
fblokptr;
318#if defined(PASTIX_DEBUG_STARPU)
320 asprintf( &task_name,
"%s( %ld )",
321 cl_blok_cadd_cpu.name,
322 (
long)(cblk - sopalin_data->solvmtx->cblktab) );
325 assert( cblk->
cblktype & CBLK_RECV );
326 assert( !(fcblk->
cblktype & (CBLK_RECV|CBLK_FANIN)) );
328 pastix_starpu_insert_task(
330 STARPU_CL_ARGS, cl_arg,
sizeof(
struct cl_blok_cadd_args_s ),
331 STARPU_EXECUTE_ON_NODE, fcblk->
ownerid,
332#
if defined(PASTIX_STARPU_PROFILING)
333 STARPU_CALLBACK_WITH_ARG_NFREE, blok_cadd_callback, cl_arg,
336 STARPU_RW, fblok->
handler[side],
337#
if defined(PASTIX_DEBUG_STARPU)
338 STARPU_NAME, task_name,
340#
if defined(PASTIX_STARPU_HETEROPRIO)
341 STARPU_PRIORITY, BucketFacto1D,
343 STARPU_PRIORITY, prio,
386 assert( cblk->
cblktype & CBLK_FANIN );
388 pastix_starpu_insert_task(
390 STARPU_EXECUTE_ON_NODE, cblk->
ownerid,
392#
if defined(PASTIX_STARPU_HETEROPRIO)
393 STARPU_PRIORITY, BucketFacto1D,
395 STARPU_PRIORITY, prio,
BEGIN_C_DECLS typedef int pastix_int_t
@ PastixKernelGEADDCblkFRFR
pastix_fixdbl_t cpublok_cadd(pastix_complex32_t alpha, const SolverCblk *cblkA, SolverCblk *cblkB, pastix_int_t blokA_m, pastix_int_t blokB_m, const void *A, void *B, pastix_complex32_t *work, pastix_int_t lwork, const pastix_lr_t *lowrank)
Add two bloks.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
void starpu_task_blok_cadd_recv(sopalin_data_t *sopalin_data, pastix_coefside_t side, const SolverCblk *cblk, const SolverBlok *blok, SolverCblk *fcblk, SolverBlok *fblok, int prio)
Insert the task to add a fanin cblk on the receiver side (The fanin is seen on this side as the RECV ...
void starpu_task_blok_cadd_fanin(sopalin_data_t *sopalin_data, pastix_coefside_t side, const SolverCblk *cblk, const SolverBlok *blok, int prio)
Insert the task to add a fanin cblk on the emitter side. Note that this task is submitted only to emi...
static void fct_blok_cadd_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t blok_rownbr(const SolverBlok *blok)
Compute the number of rows of a block.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Solver column block structure.