#include "order/order_internal.h"
#include "frobeniusupdate.h"
#if defined(PASTIX_WITH_MPI)
/* MPI user function merging two (scale, sumsq) pairs so that the reduced
 * pair still satisfies norm = scale * sqrt(sumsq). */
void
bvec_smpi_frb_merge( float *dist,
                     float *loc,
                     /* ... */ )
{
    /* ... */
    frobenius_merge( dist[0], dist[1], loc, loc+1 );
    /* ... */
}
#endif
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    float data[] = { 0., 1. }; /* (scale, sumsq) pair: norm = scale * sqrt(sumsq) */
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        valptr = (const float*)(x + scblk->lcolidx);
        /* ... */
        for( j=0; j < colnbr; j++, valptr++ )
        {
            frobenius_update( 1, data, data + 1, valptr );
#if defined(PRECISION_z) || defined(PRECISION_c)
            /* ... second update for the imaginary part */
            frobenius_update( 1, data, data + 1, valptr );
#endif
        }
    }
#if defined(PASTIX_WITH_MPI)
    /* ... */
    MPI_Op_create( (MPI_User_function *)bvec_smpi_frb_merge, 1, &merge );
    MPI_Allreduce( MPI_IN_PLACE, data, 2, MPI_FLOAT, merge, solvmtx->solv_comm );
    MPI_Op_free( &merge );
#endif
    /* ... */
    norm = data[0] * sqrtf( data[1] );
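/*
 * Illustrative sketch (not the PaStiX source, requires <math.h>):
 * frobenius_update() follows the LAPACK ?lassq recurrence that the
 * (scale, sumsq) pair above relies on.  It maintains the invariant
 * ||x||_2 = scale * sqrt(sumsq) without ever squaring a large entry,
 * so the accumulation can neither overflow nor underflow prematurely.
 * The name sketch_frobenius_update() is ours.
 */
static inline void
sketch_frobenius_update( float *scale, float *sumsq, const float *value )
{
    float v = fabsf( *value );
    if ( v == 0.f ) {
        return;
    }
    if ( *scale < v ) {
        float ratio = *scale / v;
        /* re-express the running sum with the new, larger scale */
        *sumsq = 1.f + (*sumsq) * ratio * ratio;
        *scale = v;
    }
    else {
        float ratio = v / *scale;
        *sumsq += ratio * ratio;
    }
}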
struct s_argument_nrm2_s {
    /* ... */
    pastix_atomic_lock_t lock;
    /* ... */
};
    struct s_argument_nrm2_s *arg = (struct s_argument_nrm2_s*)args;
    /* ... */
    const float *x = arg->x;
    /* ... */
    float *valptr = (float*)x;
    /* ... */
    size = ctx->global_ctx->world_size;
    /* ... */
    begin = (n / size) * rank;
    if (rank == (size - 1)) {
        end = n; /* the last thread absorbs the n % size remainder */
    }
    else {
        end = (n / size) * (rank + 1);
    }
    /* ... */
#if defined(PRECISION_z) || defined(PRECISION_c)
    /* ... */
#endif
    /* ... */
    for( i = begin; i < end; i++, valptr++ )
    {
        frobenius_update( 1, &scale, &sumsq, valptr );
#if defined(PRECISION_z) || defined(PRECISION_c)
        /* ... */
        frobenius_update( 1, &scale, &sumsq, valptr );
#endif
    }
    /* ... */
    pastix_atomic_lock( &(arg->lock) );
    frobenius_merge( scale, sumsq, &(arg->scale), &(arg->sumsq) );
    pastix_atomic_unlock( &(arg->lock) );
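/*
 * Sketch of the pairwise reduction behind frobenius_merge() (illustrative
 * only, name is ours): a partial result (scl, ssq) is folded into
 * (*scale, *sumsq) under the larger of the two scales, preserving
 * norm = scale * sqrt(sumsq).  This is what each worker applies under
 * arg->lock above, and what the MPI user op applies across ranks.
 */
static inline void
sketch_frobenius_merge( float scl, float ssq, float *scale, float *sumsq )
{
    if ( *scale < scl ) {
        float ratio = *scale / scl;
        *sumsq = ssq + (*sumsq) * ratio * ratio;
        *scale = scl;
    }
    else if ( *scale > 0.f ) {
        float ratio = scl / *scale;
        *sumsq += ssq * ratio * ratio;
    }
}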
    struct s_argument_nrm2_s arg = { n, x, PASTIX_ATOMIC_UNLOCKED, 0., 1. };
    /* ... */
#if defined(PASTIX_WITH_MPI)
    /* ... */
    MPI_Op_create( (MPI_User_function *)bvec_smpi_frb_merge, 1, &merge );
    MPI_Allreduce( MPI_IN_PLACE, &(arg.scale), 2, MPI_FLOAT, merge,
                   pastix_data->solvmatr->solv_comm );
    MPI_Op_free( &merge );
#endif
    /* ... */
    return arg.scale * sqrtf( arg.sumsq );
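/*
 * Minimal standalone illustration (our own example, not PaStiX code) of
 * the reduction pattern used by bvec_snrm2_smp(): each rank owns one
 * (scale, sumsq) pair and a user-defined commutative MPI_Op combines the
 * pairs across ranks with the frobenius_merge() algebra.
 */
#include <mpi.h>
#include <math.h>
#include <stdio.h>

static void merge_pairs( void *in, void *inout, int *len, MPI_Datatype *dt )
{
    float *a = (float*)in;
    float *b = (float*)inout;
    (void)dt; (void)len;            /* called with len == 2: one pair     */
    if ( b[0] < a[0] ) {            /* same algebra as frobenius_merge()  */
        float r = b[0] / a[0];
        b[1] = a[1] + b[1] * r * r;
        b[0] = a[0];
    }
    else if ( b[0] > 0.f ) {
        float r = a[0] / b[0];
        b[1] += a[1] * r * r;
    }
}

int main( int argc, char **argv )
{
    MPI_Init( &argc, &argv );
    int rank;
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    float data[2] = { (float)(rank + 1), 1.f }; /* local (scale, sumsq)   */
    MPI_Op merge;
    MPI_Op_create( merge_pairs, 1, &merge );
    MPI_Allreduce( MPI_IN_PLACE, data, 2, MPI_FLOAT, merge, MPI_COMM_WORLD );
    MPI_Op_free( &merge );
    if ( rank == 0 ) {
        printf( "norm = %g\n", data[0] * sqrtf( data[1] ) );
    }
    MPI_Finalize();
    return 0;
}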
#if defined(PASTIX_WITH_MPI) && 0 /* distributed variant disabled by the "&& 0" */
    /* ... */
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        cblas_sscal( n, (alpha), x + scblk->lcolidx, 1 );
    }
#else
    /* ... */
    cblas_sscal( n, (alpha), x, 1 );
#endif
struct s_argument_scal_s { /* ... */ };
    struct s_argument_scal_s *arg = (struct s_argument_scal_s*)args;
    /* ... */
    float alpha = arg->alpha;
    /* ... */
    begin = (n/size) * rank;
    if (rank == (size - 1)) {
        end = n;
    }
    else {
        end = (n/size) * (rank + 1);
    }
    /* ... */
    if ( (end - begin) > 0 ) {
        cblas_sscal( end - begin, (alpha), x + begin, 1 );
    }
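/*
 * All of the pthread_bvec_* kernels in this file slice the vector with the
 * same static partition.  A hypothetical helper making it explicit (rank
 * and size come from the isched thread context; the name is ours):
 */
static inline void
sketch_partition( pastix_int_t n, int size, int rank,
                  pastix_int_t *begin, pastix_int_t *end )
{
    pastix_int_t chunk = n / size;
    *begin = chunk * rank;
    /* the last thread absorbs the n % size leftover entries */
    *end   = ( rank == size - 1 ) ? n : chunk * ( rank + 1 );
}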
    struct s_argument_scal_s arg = {n, alpha, x};
#if defined(PASTIX_WITH_MPI) && 0
    /* ... */
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ){
        /* ... */
        cblas_saxpy( n, (alpha),
                     /* ... */ );
    }
#else
    /* ... */
    cblas_saxpy( n, (alpha), x, 1, y, 1 );
#endif
struct s_argument_axpy_s { /* ... */ };
    struct s_argument_axpy_s *arg = (struct s_argument_axpy_s*)args;
    /* ... */
    float alpha = arg->alpha;
    const float *x = arg->x;
    /* ... */
    if( (y == NULL) || (x == NULL) ) {
        return;
    }
    /* ... */
    if( alpha == (float)0.0 ) {
        return; /* nothing to accumulate */
    }
    /* ... */
    begin = (n/size) * rank;
    if (rank == (size - 1)) {
        end = n;
    }
    else {
        end = (n/size) * (rank + 1);
    }
    /* ... */
    if ( (end - begin) > 0 ) {
        cblas_saxpy( end - begin, (alpha), x + begin, 1, y + begin, 1 );
    }
    struct s_argument_axpy_s args = {n, alpha, x, y};
struct s_argument_dot_s {
    /* ... */
    pastix_atomic_lock_t lock;
    /* ... */
};
#if defined(PRECISION_z) || defined(PRECISION_c)
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        for( j=0; j<n; j++, xptr++, yptr++ ) {
            r += (*xptr) * (*yptr);
        }
    }
    /* ... */
#if defined(PASTIX_WITH_MPI)
    MPI_Allreduce( MPI_IN_PLACE, &r, 1, PASTIX_MPI_FLOAT,
                   MPI_SUM, solvmtx->solv_comm );
#endif
    struct s_argument_dot_s *arg = (struct s_argument_dot_s*)args;
    /* ... */
    const float *xptr = arg->x;
    const float *yptr = arg->y;
    /* ... */
    begin = (n/size) * rank;
    if (rank != size - 1) {
        end = (n/size) * (rank + 1);
    }
    else {
        end = n;
    }
    /* ... */
    for ( i = begin; i < end; i++, xptr++, yptr++ )
    {
        r += (*xptr) * (*yptr);
    }
    /* ... */
    if ( fabsf(r) > 0. ) {
        pastix_atomic_lock( &(arg->lock) );
        arg->sum += r; /* merge the local partial sum */
        pastix_atomic_unlock( &(arg->lock) );
    }
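/*
 * Note on the merge above: the fabsf(r) > 0. test lets threads whose
 * partial dot product is exactly zero skip the critical section entirely,
 * so arg->lock is only contended by threads that actually have a
 * contribution to add to arg->sum.
 */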
    struct s_argument_dot_s arg = {n, x, y, PASTIX_ATOMIC_UNLOCKED, 0.0};
    /* ... */
#if defined(PASTIX_WITH_MPI)
    MPI_Allreduce( MPI_IN_PLACE, &(arg.sum), 1, PASTIX_MPI_FLOAT,
                   MPI_SUM, pastix_data->solvmatr->solv_comm );
#endif
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        for( j=0; j<n; j++, xptr++, yptr++ ) {
            r += (*xptr) * (*yptr);
        }
    }
    /* ... */
#if defined(PASTIX_WITH_MPI)
    MPI_Allreduce( MPI_IN_PLACE, &r, 1, PASTIX_MPI_FLOAT,
                   MPI_SUM, solvmtx->solv_comm );
#endif
    struct s_argument_dot_s *arg = (struct s_argument_dot_s*)args;
    /* ... */
    const float *x = arg->x;
    const float *y = arg->y;
    /* ... */
    begin = (n / size) * rank;
    if ( rank == (size - 1)) {
        end = n;
    }
    else {
        end = (n / size) * (rank + 1);
    }
    /* ... */
    for (i=begin; i<end; i++, xptr++, yptr++)
    {
        r += (*xptr) * (*yptr);
    }
    /* ... */
    if ( fabsf(r) > 0. ) {
        pastix_atomic_lock( &(arg->lock) );
        arg->sum += r;
        pastix_atomic_unlock( &(arg->lock) );
    }
    struct s_argument_dot_s arg = {n, x, y, PASTIX_ATOMIC_UNLOCKED, 0.0};
    /* ... */
#if defined(PASTIX_WITH_MPI)
    MPI_Allreduce( MPI_IN_PLACE, &(arg.sum), 1, PASTIX_MPI_FLOAT,
                   MPI_SUM, pastix_data->solvmatr->solv_comm );
#endif
#if defined(PASTIX_WITH_MPI) && 0
    /* ... */
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        memcpy( y + scblk->lcolidx, x + scblk->lcolidx, n * sizeof(float) );
    }
#else
    /* ... */
    memcpy( y, x, n * sizeof(float) );
#endif
struct argument_copy_s { /* ... */ };
    struct argument_copy_s *arg = (struct argument_copy_s*)args;
    /* ... */
    begin = (n/size) * rank;
    /* ... */
    if (rank == (size - 1)) {
        end = n;
    }
    else {
        end = (n/size) * (rank + 1);
    }
    /* ... */
    if ( (end - begin) > 0 ) {
        memcpy( arg->y + begin, arg->x + begin, (end - begin) * sizeof(float) );
    }
    struct argument_copy_s args = {n, x, y};
        .flttype = PastixFloat,
        .m       = pastix_data->bcsc->n,
        /* ... */
        .ld      = pastix_data->bcsc->n,
        /* ... */
#if defined(PRECISION_z) || defined(PRECISION_d)
    /* ... */
    rc = LAPACKE_slag2d_work( LAPACK_COL_MAJOR, n, nrhs,
                              /* ... */ );
    /* ... */
    rc = LAPACKE_slag2d_work( LAPACK_COL_MAJOR, n, nrhs,
                              /* ... */ );
    /* ... */
#endif
    /* ... */
    assert(work == NULL);
    /* ... */
    if ( rhsb.cblkb != NULL ) {
        /* ... */
    }
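/*
 * Illustrative round trip (our example, not the PaStiX source) for the
 * mixed-precision path guarded above: a single-precision right-hand side
 * is promoted with slag2d before a double-precision computation, then
 * truncated back with dlag2s.  Buffer and function names are hypothetical.
 */
#include <lapacke.h>

static int promote_and_demote( lapack_int n, lapack_int nrhs,
                               float *b32, double *b64 )
{
    lapack_int rc;
    /* float -> double: exact, cannot overflow */
    rc = LAPACKE_slag2d_work( LAPACK_COL_MAJOR, n, nrhs, b32, n, b64, n );
    if ( rc != 0 ) return rc;
    /* ... double-precision computation on b64 ... */
    /* double -> float: rc > 0 if an entry overflows the float range */
    rc = LAPACKE_dlag2s_work( LAPACK_COL_MAJOR, n, nrhs, b64, n, b32, n );
    return rc;
}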
#if defined(PASTIX_WITH_MPI) && 0
    /* ... */
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        cblas_sgemv( CblasColMajor, CblasNoTrans, m, n,
                     (alpha), A + scblk->lcolidx, lda, x, 1,
                     (beta),  y + scblk->lcolidx, 1 );
    }
#else
    /* ... */
    cblas_sgemv( CblasColMajor, CblasNoTrans, m, n,
                 (alpha), A, lda, x, 1,
                 /* ... */ );
#endif
    struct s_gemv_s *arg = (struct s_gemv_s*)args;
    /* ... */
    float alpha = arg->alpha;
    const float *A = arg->A;
    /* ... */
    const float *x = arg->x;
    /* ... */
    float beta = arg->beta;
    /* ... */
    Aptr = A + sub_m * rank;
    /* ... */
    yptr = y + sub_m * rank;
    /* ... */
    if (rank == (size - 1)) {
        /* ... */
    }
    /* ... */
    cblas_sgemv( CblasColMajor, CblasNoTrans, sub_m, n,
                 (alpha), Aptr, lda, xptr, 1,
                 /* ... */ );
    /* ... */
    struct s_gemv_s arg = {m, n, alpha, A, lda, x, beta, y};
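/*
 * Why Aptr = A + sub_m * rank works: A is column-major with leading
 * dimension lda, so the block of rows [sub_m*rank, sub_m*(rank+1)) starts
 * at A + sub_m*rank and keeps the same lda.  Each thread thus writes an
 * independent slice of y, so no reduction is needed, unlike the dot and
 * nrm2 kernels above.
 */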
#if defined( PASTIX_WITH_MPI )
    /* ... */
    for ( i = 0; i < cblknbr; i++, cblk++ ) {
        if ( cblk->cblktype & (CBLK_FANIN|CBLK_RECV) ) {
            continue;
        }
        /* ... */
        if ( cblk->fcolnum != lastindex ) {
            /* ... */
            memset( y + lastindex, 0, ( cblk->fcolnum - lastindex ) * sizeof( float ) );
        }
        /* ... */
        lastindex = cblk->lcolnum + 1;
    }
    /* ... */
    if ( lastindex < n ) {
        /* ... */
        memset( y + lastindex, 0, ( n - lastindex ) * sizeof( float ) );
    }
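/*
 * The scan above zeroes exactly the entries of y that no local column
 * block owns: FANIN/RECV blocks are skipped, lastindex tracks the end of
 * the previous local block, and every [lastindex, fcolnum) gap, plus the
 * final [lastindex, n) tail, is memset to zero.
 */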
#if defined( PASTIX_WITH_MPI )
    /* ... */
    float *yglobal = NULL;
    /* ... */
    MPI_Request request_c = MPI_REQUEST_NULL;
    MPI_Request request_n = MPI_REQUEST_NULL;
    /* ... */
    yglobal = malloc( gn * sizeof(float) );
#if !defined(NDEBUG)
    memset( yglobal, 0xff, gn * sizeof(float) );
#endif
    /* ... */
    MPI_Allgather( &ln, 1, PASTIX_MPI_INT,
                   all_n, 1, PASTIX_MPI_INT, pastix_data->pastix_comm );
    /* ... */
    MPI_Allgather( &lcblknbr, 1, PASTIX_MPI_INT,
                   all_cblknbr, 1, PASTIX_MPI_INT, pastix_data->pastix_comm );

    for( c=0; c<pastix_data->procnbr; c++ )
    {
        /* ... */
        max_n       = pastix_imax( max_n, all_n[c] );
        max_cblknbr = pastix_imax( max_cblknbr, all_cblknbr[c] );
    }
    /* ... */
    ytmp    = malloc( max_n * sizeof(float) );
    indices = malloc( max_cblknbr * 2 * sizeof(pastix_int_t) );
    /* ... */
    for( c=0; c<pastix_data->procnbr; c++ )
    {
        /* ... */
        if ( all_n[c] == 0 ) {
            continue;
        }
        /* ... */
        if ( c == pastix_data->procnum ) {
            MPI_Ibcast( (float*)y, ln, PASTIX_MPI_FLOAT, c, pastix_data->pastix_comm, &request_n );
            /* ... */
            for ( i = 0; i < cblknbr; i++, cblk++ ) {
                if ( cblk->cblktype & (CBLK_FANIN|CBLK_RECV) ) {
                    continue;
                }
                /* ... */
                yto = yglobal + cblk->fcolnum;
                /* ... */
                memcpy( yto, yfr, cblk_colnbr( cblk ) * sizeof( float ) );
                /* ... */
                indices[ 2*cblki   ] = cblk->fcolnum;
                indices[ 2*cblki+1 ] = cblk->lcolnum;
                /* ... */
            }
            assert( cblki == lcblknbr );
            /* ... */
            MPI_Ibcast( indices, 2 * lcblknbr, PASTIX_MPI_INT, c, pastix_data->pastix_comm, &request_c );
            MPI_Wait( &request_n, MPI_STATUS_IGNORE );
            MPI_Wait( &request_c, MPI_STATUS_IGNORE );
        }
        else {
            /* ... */
            MPI_Ibcast( ytmp, all_n[c], PASTIX_MPI_FLOAT, c, pastix_data->pastix_comm, &request_n );
            MPI_Ibcast( indices, all_cblknbr[c] * 2, PASTIX_MPI_INT, c, pastix_data->pastix_comm, &request_c );
            MPI_Wait( &request_n, MPI_STATUS_IGNORE );
            MPI_Wait( &request_c, MPI_STATUS_IGNORE );
            /* ... */
            for ( i = 0; i < all_cblknbr[c]; i++ ) {
                yto = yglobal + indices[2*i];
                /* ... */
                colnbr = indices[2*i+1] - indices[2*i] + 1;
                memcpy( yto, yfr, colnbr * sizeof( float ) );
                /* ... */
            }
        }
    }
    /* ... */
    free( all_cblknbr );
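/*
 * Broadcast protocol sketch for the gather above: every rank takes a turn
 * as root c.  The root broadcasts its ln local values and the 2*lcblknbr
 * (fcolnum, lcolnum) interval bounds of its local column blocks; the other
 * ranks receive both into ytmp / indices and scatter each interval into
 * yglobal at its global offset.  Issuing the two MPI_Ibcast calls per turn
 * before the MPI_Wait pair lets the value and index transfers overlap.
 */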
#if defined( PASTIX_WITH_MPI )
    /* ... */
    MPI_Allreduce( MPI_IN_PLACE,
                   y, pastix_data->csc->gNexp,
                   PASTIX_MPI_FLOAT, MPI_SUM,
                   /* ... */ );
typedef int pastix_int_t
static void pthread_bvec_saxpy(isched_thread_t *ctx, void *args)
Compute y <- alpha * x + y (Parallel version).
static void pthread_bvec_sgemv(isched_thread_t *ctx, void *args)
Compute y <- alpha * A * x + beta * y (Parallel version).
static void pthread_bvec_sdot(isched_thread_t *ctx, void *args)
Compute the scalar product x.y. (Parallel version)
static void pthread_bvec_scopy(isched_thread_t *ctx, void *args)
Copy a vector y = x (parallel version)
void bvec_sgemv_seq(pastix_data_t *pastix_data, pastix_int_t m, pastix_int_t n, float alpha, const float *A, pastix_int_t lda, const float *x, float beta, float *y)
Compute y <- alpha * A * x + beta * y (Sequential version).
const float * bvec_sgather_remote(const pastix_data_t *pastix_data, const float *y)
Gather a distributed right hand side (bvec storage) on all nodes.
float bvec_sdot_smp(pastix_data_t *pastix_data, pastix_int_t n, const float *x, const float *y)
Compute a regular scalar product x.y (Parallel version)
void bvec_scopy_smp(pastix_data_t *pastix_data, pastix_int_t n, const float *x, float *y)
Copy a vector y = x (parallel version)
static void pthread_bvec_sscal(isched_thread_t *ctx, void *args)
Scale a vector (Parallel version)
void bvec_sscal_smp(pastix_data_t *pastix_data, pastix_int_t n, float alpha, float *x)
Scale a vector (Parallel version)
void bvec_sscal_seq(pastix_data_t *pastix_data, pastix_int_t n, float alpha, float *x)
Scale a vector by the scalar alpha. (Sequential version)
void bvec_saxpy_seq(pastix_data_t *pastix_data, pastix_int_t n, float alpha, const float *x, float *y)
Compute y <- alpha * x + y. (Sequential version)
void bcsc_sspsv(pastix_data_t *pastix_data, float *b, float *work)
Solve A x = b with A the sparse matrix.
void bvec_scopy_seq(pastix_data_t *pastix_data, pastix_int_t n, const float *x, float *y)
Copy a vector y = x (Sequential version)
void bvec_sgemv_smp(pastix_data_t *pastix_data, pastix_int_t m, pastix_int_t n, float alpha, const float *A, pastix_int_t lda, const float *x, float beta, float *y)
Compute y <- alpha * A * x + beta * y (Parallel version).
float bvec_snrm2_smp(pastix_data_t *pastix_data, pastix_int_t n, const float *x)
Compute the norm 2 of a vector. (Parallel version)
void bvec_sallreduce(const pastix_data_t *pastix_data, float *y)
Apply an all reduce of the vector on all nodes.
float bvec_snrm2_seq(pastix_data_t *pastix_data, pastix_int_t n, const float *x)
Compute the norm 2 of a vector. (Sequential version)
float bvec_sdot_seq(pastix_data_t *pastix_data, pastix_int_t n, const float *x, const float *y)
Compute the scalar product x.y. (Sequential version)
static void pthread_bvec_snrm2(isched_thread_t *ctx, void *args)
Compute the norm 2 of a vector. (Parallel version)
void bvec_snullify_remote(const pastix_data_t *pastix_data, float *y)
Set to 0 remote coefficients.
void bvec_saxpy_smp(pastix_data_t *pastix_data, pastix_int_t n, float alpha, const float *x, float *y)
Perform y = alpha * x + y (Parallel version)
Compressed colptr format for the bcsc.
int pastix_subtask_solve(pastix_data_t *pastix_data, pastix_rhs_t b)
Solve the given problem without applying the permutation.
pastix_coeftype_t flttype
PASTIX_Comm inter_node_comm
Main PaStiX data structure.
Main PaStiX RHS structure.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
SolverCblk *restrict cblktab
Solver column block structure.