23 #ifndef DOXYGEN_SHOULD_SKIP_THIS
28 #include "sopalin/sopalin_data.h"
30 #if defined(PASTIX_WITH_CUDA)
40 struct cl_blok_strsmsp_args_s {
42 sopalin_data_t *sopalin_data;
51 #if defined( PASTIX_STARPU_PROFILING )
59 starpu_profile_t blok_strsmsp_profile = {
61 .name =
"blok_strsmsp"
67 void blok_strsmsp_profile_register(
void ) __attribute__( ( constructor ) );
69 blok_strsmsp_profile_register(
void )
71 profiling_register_cl( &blok_strsmsp_profile );
74 #ifndef DOXYGEN_SHOULD_SKIP_THIS
75 #if defined(PASTIX_STARPU_PROFILING_LOG)
77 cl_profiling_cb_blok_strsmsp(
void *callback_arg )
79 cl_profiling_callback( callback_arg );
81 struct starpu_task *task = starpu_task_get_current();
82 struct starpu_profiling_task_info *info = task->profiling_info;
89 struct cl_blok_strsmsp_args_s *args = (
struct cl_blok_strsmsp_args_s *) callback_arg;
91 pastix_fixdbl_t duration = starpu_timing_timespec_delay_us( &info->start_time, &info->end_time );
97 cl_profiling_log_register( task->name,
"blok_strsmsp", M, N, 0, flops, speed );
101 #if defined(PASTIX_STARPU_PROFILING_LOG)
102 static void (*blok_strsmsp_callback)(
void*) = cl_profiling_cb_blok_strsmsp;
104 static void (*blok_strsmsp_callback)(
void*) = cl_profiling_callback;
136 struct starpu_perfmodel_arch *arch,
139 struct cl_blok_strsmsp_args_s *args = (
struct cl_blok_strsmsp_args_s *)(task->cl_arg);
146 switch( arch->devices->type ) {
147 case STARPU_CPU_WORKER:
150 case STARPU_CUDA_WORKER:
159 cost = modelsGetCost2Param( coefs, M, N ) * 1e6;
165 #ifndef DOXYGEN_SHOULD_SKIP_THIS
166 static struct starpu_perfmodel starpu_blok_strsmsp_model = {
167 #if defined(PASTIX_STARPU_COST_PER_ARCH)
168 .type = STARPU_PER_ARCH,
171 .type = STARPU_HISTORY_BASED,
173 .symbol =
"blok_strsmsp",
177 #if !defined(PASTIX_STARPU_SIMULATION)
195 struct cl_blok_strsmsp_args_s *args = (
struct cl_blok_strsmsp_args_s *)cl_arg;
199 A = pastix_starpu_blok_get_ptr( descr[0] );
200 C = pastix_starpu_blok_get_ptr( descr[1] );
202 assert( args->cblk->cblktype & CBLK_TASKS_2D );
205 args->trans, args->diag,
206 args->cblk, args->blok_m, A, C,
207 &(args->sopalin_data->solvmtx->lowrank) );
213 #if defined(PASTIX_WITH_CUDA)
215 fct_blok_strsmsp_gpu(
void *descr[],
void *cl_arg )
217 struct cl_blok_strsmsp_args_s *args = (
struct cl_blok_strsmsp_args_s *)cl_arg;
221 A = pastix_starpu_blok_get_ptr( descr[0] );
222 C = pastix_starpu_blok_get_ptr( descr[1] );
224 assert( args->cblk->cblktype & CBLK_TASKS_2D );
226 args->profile_data.flops = gpublok_strsmsp( args->side, args->uplo,
227 args->trans, args->diag,
228 args->cblk, args->blok_m, A, C,
229 &(args->sopalin_data->solvmtx->lowrank),
230 starpu_cuda_get_local_stream() );
235 #ifndef DOXYGEN_SHOULD_SKIP_THIS
236 CODELETS_GPU( blok_strsmsp, 2, STARPU_CUDA_ASYNC );
285 struct cl_blok_strsmsp_args_s *cl_arg = NULL;
286 long long execute_where = cl_blok_strsmsp_any.where;
288 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
297 #if defined(PASTIX_WITH_MPI)
300 if ( cblk->
ownerid == sopalin_data->solvmtx->clustnum ) {
306 if ( starpu_mpi_cached_receive( blok->
handler[coef] ) ) {
309 if ( !need_submit ) {
319 cl_arg = malloc(
sizeof(
struct cl_blok_strsmsp_args_s ) );
320 cl_arg->sopalin_data = sopalin_data;
321 #if defined(PASTIX_STARPU_PROFILING)
322 cl_arg->profile_data.measures = blok_strsmsp_profile.measures;
323 cl_arg->profile_data.flops = NAN;
327 cl_arg->trans = trans;
330 cl_arg->blok_m = blok_m;
332 #if defined(PASTIX_WITH_CUDA)
333 if ( (cblk->
cblktype & CBLK_COMPRESSED) ) {
334 execute_where &= (~STARPU_CUDA);
339 #if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
341 asprintf( &task_name,
"%s( %ld, %ld, %ld )",
342 cl_blok_strsmsp_any.name,
343 (
long)(cblk - sopalin_data->solvmtx->cblktab),
344 (
long)(blok - sopalin_data->solvmtx->bloktab),
348 pastix_starpu_insert_task(
349 &cl_blok_strsmsp_any,
350 STARPU_CL_ARGS, cl_arg,
sizeof(
struct cl_blok_strsmsp_args_s ),
351 STARPU_EXECUTE_WHERE, execute_where,
352 #
if defined(PASTIX_STARPU_PROFILING)
353 STARPU_CALLBACK_WITH_ARG_NFREE, blok_strsmsp_callback, cl_arg,
356 STARPU_RW, blok->
handler[coef],
357 #
if defined(PASTIX_DEBUG_STARPU) || defined(PASTIX_STARPU_PROFILING_LOG)
358 STARPU_NAME, task_name,
360 #
if defined(PASTIX_STARPU_HETEROPRIO)
361 STARPU_PRIORITY, BucketTRSM2D,
363 STARPU_PRIORITY, prio,
BEGIN_C_DECLS typedef int pastix_int_t
pastix_fixdbl_t cpublok_strsmsp(pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, pastix_int_t blok_m, const void *A, void *C, const pastix_lr_t *lowrank)
Compute the updates associated with one off-diagonal block.
enum pastix_diag_e pastix_diag_t
Diagonal.
enum pastix_uplo_e pastix_uplo_t
Upper/Lower part.
enum pastix_side_e pastix_side_t
Side of the operation.
enum pastix_trans_e pastix_trans_t
Transposition.
enum pastix_coefside_e pastix_coefside_t
Data blocks used in the kernel.
static pastix_fixdbl_t fct_blok_strsmsp_cost(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
Cost model function.
static void fct_blok_strsmsp_cpu(void *descr[], void *cl_arg)
StarPU CPU implementation.
void starpu_task_blok_strsmsp(sopalin_data_t *sopalin_data, pastix_coefside_t coef, pastix_side_t side, pastix_uplo_t uplo, pastix_trans_t trans, pastix_diag_t diag, const SolverCblk *cblk, SolverBlok *blok, int prio)
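Insert the blok_strsmsp task into StarPU (allocates the codelet arguments and submits the task through pastix_starpu_insert_task).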
Base structure to all codelet arguments that include the profiling data.
static double cost(symbol_cblk_t *cblk)
Computes the cost of a cblk.
static pastix_int_t blok_rownbr_ext(const SolverBlok *blok)
Compute the number of rows of a contiguous block in front of the same cblk.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
Solver column block structure.