16 #ifndef DOXYGEN_SHOULD_SKIP_THIS
21 #include "common/common.h"
24 #include "kernels_trace.h"
47 #ifndef DOXYGEN_SHOULD_SKIP_THIS
/* Flop accumulators, one slot per index below PastixKernelLvl1Nbr; the whole array is zeroed when the trace is started. */
48 volatile double kernels_flops[PastixKernelLvl1Nbr];
/* Start/stop counter: atomically incremented by the start routine and atomically decremented by the stop routine. */
50 volatile int32_t kernels_trace_started = 0;
52 #if defined(PASTIX_WITH_EZTRACE)
/* EZTrace level; defaults to 1 and is overwritten at start from the PASTIX_EZTRACE_LEVEL environment variable. */
54 int pastix_eztrace_level = 1;
58 #if defined(PASTIX_GENERATE_MODEL)
/* Storage for the model entries: allocated at start, dumped to model.csv and freed at stop. */
60 pastix_model_entry_t *model_entries = NULL;
/* Reset to -1 at stop; the dump loop runs i = 0..model_entries_nbr inclusive, so this looks like the index of the last entry rather than a count - TODO confirm against the code that fills the array. */
61 volatile int32_t model_entries_nbr = -1;
/* Number of entries allocated in model_entries (set at start to the estimated total number of tasks). */
62 int32_t model_size = 0;
/* Lock taken by the start and stop routines around the update of kernels_trace_started and the associated set-up/tear-down. */
66 pastix_atomic_lock_t lock_flops = PASTIX_ATOMIC_UNLOCKED;
/* Three flop totals, reset to 0.0 at start; their sum is stored in DPARM_FACT_RLFLOPS at stop. */
67 double overall_flops[3] = { 0.0, 0.0, 0.0 };
/* Interior of the trace start routine: bumps the start counter under lock_flops, optionally configures EZTrace and the model storage, then resets the flop counters. */
85 const SolverMatrix *solvmtx = pastix_data->solvmatr;
/* The counter update and the initialization below are done while holding lock_flops. */
88 pastix_atomic_lock( &lock_flops );
89 nbstart = pastix_atomic_inc_32b( &(kernels_trace_started) );
/* NOTE(review): a second unlock appears at the end of this routine; presumably this one belongs to an early-exit path whose condition is not visible in this listing - confirm. */
91 pastix_atomic_unlock( &lock_flops );
95 #if defined(PASTIX_WITH_EZTRACE)
/* Read the trace level from the environment. NOTE(review): atoi() must not receive NULL; confirm the value is checked before this call (no guard is visible here). */
97 char *level = pastix_getenv(
"PASTIX_EZTRACE_LEVEL");
99 pastix_eztrace_level = atoi(level);
100 pastix_cleanenv(level);
/* When a global output directory is set, export it as EZTRACE_TRACE_DIR (last argument 1: presumably "overwrite", as with setenv - confirm). */
103 if ( pastix_data->dir_global != NULL ) {
104 pastix_setenv(
"EZTRACE_TRACE_DIR", pastix_data->dir_global, 1 );
110 #if defined(PASTIX_GENERATE_MODEL)
/* Estimate the number of fact/trsm/gemm tasks in order to size the model_entries array. */
112 pastix_int_t cblknbr = solvmtx->cblknbr;
113 pastix_int_t cblkmin2d = solvmtx->cblkmin2d;
114 pastix_int_t total_number_of_tasks = 0;
115 pastix_int_t nbfact, nbtrsm, nbgemm;
116 pastix_int_t cblknum;
/* One trsm per column block below cblkmin2d, and up to cblkmaxblk per column block from cblkmin2d onward. */
123 nbtrsm = cblkmin2d + (cblknbr - cblkmin2d) * solvmtx->cblkmaxblk;
/* Base gemm count: number of blocks minus number of column blocks. */
129 nbgemm = solvmtx->bloknbr - cblknbr;
/* Add extra gemm tasks for the column blocks in [cblkmin2d, cblknbr). */
134 cblk = solvmtx->cblktab+cblkmin2d;
135 for(cblknum = cblkmin2d; cblknum < cblknbr; cblknum++, cblk++ ) {
/* Two alternative counts follow (nbodb^2 or nbodb*(nbodb-1)/2); the condition selecting between them is not visible in this listing. */
139 nbgemm += nbodb * nbodb;
142 nbgemm += (nbodb * (nbodb-1)) / 2;
146 total_number_of_tasks = nbfact + nbtrsm + nbgemm;
/* NOTE(review): the malloc result is not checked in the visible lines - confirm that allocation failure is handled. */
147 model_entries = malloc( total_number_of_tasks *
sizeof(pastix_model_entry_t) );
148 model_size = total_number_of_tasks;
/* Reset all flop accumulators; the cast drops the volatile qualifier so the array can be passed to memset. */
152 memset( (
void*)kernels_flops, 0, PastixKernelLvl1Nbr *
sizeof(
double) );
154 overall_flops[0] = 0.0;
155 overall_flops[1] = 0.0;
156 overall_flops[2] = 0.0;
/* Force the counter to 1 once initialization is complete, then release the lock. */
157 kernels_trace_started = 1;
160 pastix_atomic_unlock( &lock_flops );
/* Interior of the trace stop routine: decrements the start counter under lock_flops, optionally dumps the model entries to model.csv, publishes the flop total and resets the counter. */
183 double total_flops = 0.0;
/* Stopping is only valid after a matching start. */
186 assert( kernels_trace_started > 0 );
187 pastix_atomic_lock( &lock_flops );
188 nbstart = pastix_atomic_dec_32b( &(kernels_trace_started) );
/* NOTE(review): a second unlock appears at the end of this routine; presumably this one belongs to an early-exit path whose condition is not visible in this listing - confirm. */
190 pastix_atomic_unlock( &lock_flops );
194 #if defined(PASTIX_WITH_EZTRACE)
198 #if defined(PASTIX_GENERATE_MODEL)
/* Human-readable precision labels, indexed by (flttype - 2) below. */
200 char *prec_names[4] = {
201 "s - single real",
"d - double real",
202 "c - single complex",
"z - double complex"
204 pastix_model_entry_t *entry = model_entries;
205 pastix_int_t i, gpucase;
/* The model is written to model.csv in the current working directory. NOTE(review): confirm the fopen result is checked before use (no check is visible here). */
208 f = fopen(
"model.csv",
"w" );
/* One of the two headers below is written; presumably selected by gpucase - the condition is not visible in this listing. */
215 fprintf(f,
"# GPU Model data\n");
218 fprintf(f,
"# CPU Model data\n");
/* assumes flttype - 2 lies in 0..3 so that it is a valid index into prec_names - TODO confirm against the flttype enum values. */
221 fprintf( f,
"# Precision: %d - %s\n", pastix_data->bcsc->flttype - 2, prec_names[ pastix_data->bcsc->flttype - 2 ] );
222 fprintf( f,
"Kernel;M;N;K;Time\n" );
/* Inclusive upper bound: entries 0..model_entries_nbr are visited. */
224 for(i=0; i <= model_entries_nbr; i++, entry++ ) {
/* The kernel types listed below all fall through to a common branch; its body and the remaining cases are not fully visible in this listing. */
225 switch( entry->ktype ) {
226 case PastixKernelGETRF: pastix_attr_fallthrough;
227 case PastixKernelHETRF: pastix_attr_fallthrough;
228 case PastixKernelPOTRF: pastix_attr_fallthrough;
229 case PastixKernelPXTRF: pastix_attr_fallthrough;
230 case PastixKernelSYTRF: pastix_attr_fallthrough;
231 case PastixKernelSCALOCblk: pastix_attr_fallthrough;
232 case PastixKernelSCALOBlok: pastix_attr_fallthrough;
233 case PastixKernelTRSMCblk1d: pastix_attr_fallthrough;
234 case PastixKernelTRSMCblk2d: pastix_attr_fallthrough;
235 case PastixKernelTRSMCblkLR: pastix_attr_fallthrough;
236 case PastixKernelTRSMBlokLR: pastix_attr_fallthrough;
237 case PastixKernelGEMMCblk1d1d: pastix_attr_fallthrough;
238 case PastixKernelGEMMCblkFRLR: pastix_attr_fallthrough;
239 case PastixKernelGEMMCblkLRLR: pastix_attr_fallthrough;
240 case PastixKernelGEMMBlokLRLR:
245 pastix_attr_fallthrough;
/* One CSV record per entry: kernel id, the three dimensions, and the measured time. */
247 fprintf( f,
"%d;%d;%d;%d;%e\n",
248 entry->ktype, entry->m, entry->n, entry->k, entry->time );
/* Release the model storage and restore the globals to their initial "empty" values. */
254 free( model_entries );
257 model_entries = NULL;
258 model_entries_nbr = -1;
/* Publish the sum of the three flop totals through the dparm array. */
265 pastix_data->dparm[
DPARM_FACT_RLFLOPS] = overall_flops[0] + overall_flops[1] + overall_flops[2];
/* Optional breakdown of the three totals, only compiled with PASTIX_SUPERNODE_STATS. */
267 #if defined(PASTIX_SUPERNODE_STATS)
270 " Details of the number of operations:\n"
271 " - POTRF(A11) + TRSM(A11, A21): %6.2lf %cFlops\n"
272 " - HERK(A21, A22) : %6.2lf %cFlops\n"
273 " - POTRF(A22) : %6.2lf %cFlops\n"
274 " Total : %6.2lf %cFlops\n",
275 pastix_print_value( overall_flops[0] ), pastix_print_unit( overall_flops[0] ),
276 pastix_print_value( overall_flops[1] ), pastix_print_unit( overall_flops[1] ),
277 pastix_print_value( overall_flops[2] ), pastix_print_unit( overall_flops[2] ),
/* Mark the trace module as fully stopped, then release the lock. */
283 kernels_trace_started = 0;
284 pastix_atomic_unlock( &lock_flops );
static pastix_int_t core_get_rklimit_end(pastix_int_t M, pastix_int_t N)
Compute the maximal rank accepted for a given matrix size for Just-In-Time strategy.
pastix_int_t(* core_get_rklimit)(pastix_int_t M, pastix_int_t N)
Compute the maximal rank accepted for a given matrix size. The pointer is set according to the low-rank strategy used.
double kernelsTraceStop(const pastix_data_t *pastix_data)
Stop the trace module.
void kernelsTraceStart(const pastix_data_t *pastix_data)
Start the trace module.
Solver column block structure.