27 #include "order/order_internal.h"
28 #include "frobeniusupdate.h"
32 #if defined(PASTIX_WITH_MPI)
34 bvec_zmpi_frb_merge( double *dist,
40 frobenius_merge( dist[0], dist[1], loc, loc+1 );
73 const pastix_complex64_t *x )
77 pastix_bcsc_t *bcsc = pastix_data->bcsc;
80 double data[] = { 0., 1. };
85 cblknbr = bcsc->cscfnbr;
86 for( i = 0; i < cblknbr; i++, bcblk++ ) {
89 valptr = (const double*)(x + scblk->lcolidx);
91 for( j=0; j < colnbr; j++, valptr++ )
94 frobenius_update( 1, data, data + 1, valptr );
95 #if defined(PRECISION_z) || defined(PRECISION_c)
98 frobenius_update( 1, data, data + 1, valptr );
103 #if defined(PASTIX_WITH_MPI)
107 MPI_Op_create( (MPI_User_function *)bvec_zmpi_frb_merge, 1, &merge );
108 MPI_Allreduce( MPI_IN_PLACE, data, 2, MPI_DOUBLE, merge, solvmtx->solv_comm );
109 MPI_Op_free( &merge );
113 norm = data[0] * sqrt( data[1] );
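/*
 * The { scale, sumsq } pair accumulated in data[] above is the usual
 * LAPACK-style lassq accumulator: the norm of all values folded in so far is
 * scale * sqrt( sumsq ), which avoids overflow on large entries.  The real
 * implementations live in frobeniusupdate.h; the helpers below are only an
 * illustrative sketch (hypothetical names and signatures) of the per-value
 * update and of the merge used as the custom MPI reduction above.
 */
#include <math.h>

static inline void
sketch_frobenius_update( double *scale, double *sumsq, double value )
{
    double v = fabs( value );
    if ( v == 0.0 ) {
        return;
    }
    if ( *scale < v ) {
        double ratio = *scale / v;
        *sumsq = 1.0 + (*sumsq) * ratio * ratio; /* rescale the previous sum */
        *scale = v;
    }
    else {
        double ratio = v / *scale;
        *sumsq += ratio * ratio;
    }
}

static inline void
sketch_frobenius_merge( double scale_in, double sumsq_in,
                        double *scale_out, double *sumsq_out )
{
    /* Combine two accumulators into (scale_out, sumsq_out). */
    if ( *scale_out < scale_in ) {
        double ratio = *scale_out / scale_in;
        *sumsq_out = sumsq_in + (*sumsq_out) * ratio * ratio;
        *scale_out = scale_in;
    }
    else if ( scale_in > 0.0 ) {
        double ratio = scale_in / *scale_out;
        *sumsq_out += sumsq_in * ratio * ratio;
    }
}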
119 struct z_argument_nrm2_s
122 const pastix_complex64_t *x;
123 pastix_atomic_lock_t lock;
149 struct z_argument_nrm2_s *arg = (struct z_argument_nrm2_s*)args;
151 const pastix_complex64_t *x = arg->x;
154 double *valptr = (double*)x;
158 size = ctx->global_ctx->world_size;
161 begin = (n / size) * rank;
162 if (rank == (size - 1)) {
165 end = (n / size) * (rank + 1);
169 #if defined(PRECISION_z) || defined(PRECISION_c)
173 for( i = begin; i < end; i++, valptr++ )
175 frobenius_update( 1, &scale, &sumsq, valptr );
176 #if defined(PRECISION_z) || defined(PRECISION_c)
178 frobenius_update( 1, &scale, &sumsq, valptr );
184 pastix_atomic_lock( &(arg->lock) );
185 frobenius_merge( scale, sumsq, &(arg->scale), &(arg->sumsq) );
186 pastix_atomic_unlock( &(arg->lock) );
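/*
 * The begin/end bounds computed above (and reused by every pthread_* kernel
 * in this file) follow a plain block partition: each of the `size` threads
 * owns n/size consecutive entries and the last thread absorbs the remainder.
 * A hypothetical helper (not part of this file) would read:
 */
static inline void
sketch_chunk_bounds( pastix_int_t n, int size, int rank,
                     pastix_int_t *begin, pastix_int_t *end )
{
    *begin = ( n / size ) * rank;
    *end   = ( rank == (size - 1) ) ? n : ( n / size ) * ( rank + 1 );
}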
217 const pastix_complex64_t *x )
219 struct z_argument_nrm2_s arg = { n, x, PASTIX_ATOMIC_UNLOCKED, 0., 1. };
222 #if defined(PASTIX_WITH_MPI)
226 MPI_Op_create( (MPI_User_function *)bvec_zmpi_frb_merge, 1, &merge );
227 MPI_Allreduce( MPI_IN_PLACE, &(arg.scale), 2, MPI_DOUBLE, merge, pastix_data->solvmatr->solv_comm );
228 MPI_Op_free( &merge );
232 return arg.scale * sqrt( arg.sumsq );
261 pastix_complex64_t alpha,
262 pastix_complex64_t *x )
264 #if defined(PASTIX_WITH_MPI) && 0
267 pastix_bcsc_t *bcsc = pastix_data->bcsc;
271 cblknbr = bcsc->cscfnbr;
272 for( i = 0; i < cblknbr; i++, bcblk++ ) {
276 cblas_zscal( n, CBLAS_SADDR(alpha), x + scblk->lcolidx, 1 );
280 cblas_zscal( n, CBLAS_SADDR(alpha), x, 1 );
285 struct z_argument_scal_s
288 pastix_complex64_t alpha;
289 pastix_complex64_t *x;
313 struct z_argument_scal_s *arg = (struct z_argument_scal_s*)args;
314 pastix_complex64_t *x = arg->x;
316 pastix_complex64_t alpha = arg->alpha;
326 begin = (n/size) * rank;
327 if (rank == (size - 1)) {
330 end = (n/size) * (rank + 1);
333 if ( (end - begin) > 0 ) {
334 cblas_zscal( end - begin, CBLAS_SADDR(alpha), x + begin, 1 );
364 pastix_complex64_t alpha,
365 pastix_complex64_t *x )
367 struct z_argument_scal_s arg = {n, alpha, x};
400 pastix_complex64_t alpha,
401 const pastix_complex64_t *x,
402 pastix_complex64_t *y)
404 #if defined(PASTIX_WITH_MPI) && 0
407 pastix_bcsc_t *bcsc = pastix_data->bcsc;
411 cblknbr = bcsc->cscfnbr;
412 for( i = 0; i < cblknbr; i++, bcblk++ ){
416 cblas_zaxpy( n, CBLAS_SADDR(alpha),
422 cblas_zaxpy( n, CBLAS_SADDR(alpha), x, 1, y, 1 );
426 struct z_argument_axpy_s
429 pastix_complex64_t alpha;
430 const pastix_complex64_t *x;
431 pastix_complex64_t *y;
455 struct z_argument_axpy_s *arg = (struct z_argument_axpy_s*)args;
457 pastix_complex64_t alpha = arg->alpha;
458 const pastix_complex64_t *x = arg->x;
459 pastix_complex64_t *y = arg->y;
463 if( (y == NULL) || (x == NULL) ) {
467 if( alpha == (pastix_complex64_t)0.0 ) {
474 begin = (n/size) * rank;
475 if (rank == (size - 1)) {
478 end = (n/size) * (rank + 1);
481 if ( (end - begin) > 0 ) {
482 cblas_zaxpy( end - begin, CBLAS_SADDR(alpha), x + begin, 1, y + begin, 1 );
515 pastix_complex64_t alpha,
516 const pastix_complex64_t *x,
517 pastix_complex64_t *y )
519 struct z_argument_axpy_s args = {n, alpha, x, y};
523 struct z_argument_dotc_s
526 const pastix_complex64_t *x;
527 const pastix_complex64_t *y;
528 pastix_atomic_lock_t lock;
529 pastix_complex64_t sum;
532 #if defined(PRECISION_z) || defined(PRECISION_c)
563 const pastix_complex64_t *x,
564 const pastix_complex64_t *y )
568 pastix_bcsc_t *bcsc = pastix_data->bcsc;
571 const pastix_complex64_t *xptr;
572 const pastix_complex64_t *yptr;
573 pastix_complex64_t r = 0.0;
575 cblknbr = bcsc->cscfnbr;
576 for( i = 0; i < cblknbr; i++, bcblk++ ) {
582 for( j=0; j<n; j++, xptr++, yptr++ ) {
583 r += conj(*xptr) * (*yptr);
587 #if defined(PASTIX_WITH_MPI)
588 MPI_Allreduce( MPI_IN_PLACE, &r, 1, PASTIX_MPI_COMPLEX64,
589 MPI_SUM, solvmtx->solv_comm );
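/*
 * Illustrative sketch (hypothetical helper, not in this file): the local part
 * of the conjugated dot product computed by the loop above.  Each rank only
 * traverses its own column blocks, so the MPI_Allreduce with MPI_SUM combines
 * disjoint partial sums into the global result.
 */
#include <complex.h>

static pastix_complex64_t
sketch_zdotc_local( pastix_int_t n,
                    const pastix_complex64_t *x,
                    const pastix_complex64_t *y )
{
    pastix_complex64_t r = 0.0;
    pastix_int_t       i;

    for ( i = 0; i < n; i++ ) {
        r += conj( x[i] ) * y[i]; /* conjugate the left operand */
    }
    return r;
}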
612 pthread_bvec_zdotc( isched_thread_t *ctx,
615 struct z_argument_dotc_s *arg = (struct z_argument_dotc_s*)args;
618 const pastix_complex64_t *xptr = arg->x;
619 const pastix_complex64_t *yptr = arg->y;
621 pastix_complex64_t r = 0.0;
626 begin = (n/size) * rank;
627 if (rank != size - 1) {
628 end = (n/size) * (rank + 1);
636 for ( i = begin; i < end; i++, xptr++, yptr++ )
638 r += conj(*xptr) * (*yptr);
641 if ( cabs(r) > 0. ) {
642 pastix_atomic_lock( &(arg->lock) );
644 pastix_atomic_unlock( &(arg->lock) );
678 const pastix_complex64_t *x,
679 const pastix_complex64_t *y )
681 struct z_argument_dotc_s arg = {n, x, y, PASTIX_ATOMIC_UNLOCKED, 0.0};
682 isched_parallel_call( pastix_data->isched, pthread_bvec_zdotc, &arg );
684 #if defined(PASTIX_WITH_MPI)
685 MPI_Allreduce( MPI_IN_PLACE, &(arg.sum), 1, PASTIX_MPI_COMPLEX64,
686 MPI_SUM, pastix_data->solvmatr->solv_comm );
723 const pastix_complex64_t *x,
724 const pastix_complex64_t *y )
728 pastix_bcsc_t *bcsc = pastix_data->bcsc;
731 const pastix_complex64_t *xptr;
732 const pastix_complex64_t *yptr;
733 pastix_complex64_t r = 0.0;
735 cblknbr = bcsc->cscfnbr;
736 for( i = 0; i < cblknbr; i++, bcblk++ ) {
742 for( j=0; j<n; j++, xptr++, yptr++ ) {
743 r += (*xptr) * (*yptr);
747 #if defined(PASTIX_WITH_MPI)
748 MPI_Allreduce( MPI_IN_PLACE, &r, 1, PASTIX_MPI_COMPLEX64,
749 MPI_SUM, solvmtx->solv_comm );
775 struct z_argument_dotc_s *arg = (struct z_argument_dotc_s*)args;
778 const pastix_complex64_t *x = arg->x;
779 const pastix_complex64_t *y = arg->y;
780 const pastix_complex64_t *xptr;
781 const pastix_complex64_t *yptr;
782 pastix_complex64_t r = 0.0;
789 begin = (n / size) * rank;
790 if ( rank == (size - 1)) {
793 end = (n / size) * (rank + 1);
799 for (i=begin; i<end; i++, xptr++, yptr++)
801 r += (*xptr) * (*yptr);
804 if ( cabs(r) > 0. ) {
805 pastix_atomic_lock( &(arg->lock) );
807 pastix_atomic_unlock( &(arg->lock) );
841 const pastix_complex64_t *x,
842 const pastix_complex64_t *y )
844 struct z_argument_dotc_s arg = {n, x, y, PASTIX_ATOMIC_UNLOCKED, 0.0};
847 #if defined(PASTIX_WITH_MPI)
848 MPI_Allreduce( MPI_IN_PLACE, &(arg.sum), 1, PASTIX_MPI_COMPLEX64,
849 MPI_SUM, pastix_data->solvmatr->solv_comm );
881 const pastix_complex64_t *x,
882 pastix_complex64_t *y )
884 #if defined(PASTIX_WITH_MPI) && 0
887 pastix_bcsc_t *bcsc = pastix_data->bcsc;
891 cblknbr = bcsc->cscfnbr;
892 for( i = 0; i < cblknbr; i++, bcblk++ ) {
896 memcpy( y + scblk->lcolidx, x + scblk->lcolidx, n * sizeof(pastix_complex64_t) );
900 memcpy( y, x, n * sizeof(pastix_complex64_t) );
904 struct argument_copy_s {
906 const pastix_complex64_t *x;
907 pastix_complex64_t *y;
933 struct argument_copy_s *arg = (struct argument_copy_s*)args;
941 begin = (n/size) * rank;
943 if (rank == (size - 1)) {
946 end = (n/size) * (rank + 1);
949 if ( (end - begin) > 0 ) {
950 memcpy( arg->y + begin, arg->x + begin, (end - begin) * sizeof(pastix_complex64_t) );
982 const pastix_complex64_t *x,
983 pastix_complex64_t *y )
985 struct argument_copy_s args = {n, x, y};
1018 pastix_complex64_t *b,
1023 .flttype = PastixComplex64,
1024 .m = pastix_data->bcsc->n,
1026 .ld = pastix_data->bcsc->n,
1036 #if defined(PRECISION_z) || defined(PRECISION_d)
1042 rhsb.flttype = PastixComplex32;
1046 rc = LAPACKE_zlag2c_work( LAPACK_COL_MAJOR, n, nrhs,
1053 rc = LAPACKE_clag2z_work( LAPACK_COL_MAJOR, n, nrhs,
1060 assert(work == NULL);
1064 if ( rhsb.cblkb != NULL ) {
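/*
 * Mixed-precision round trip suggested by lines 1046/1053 (the argument list
 * beyond n and nrhs is an assumption; b, work and the leading dimension ld
 * are taken from the surrounding fragments): the double-complex right-hand
 * side is demoted to single precision before the solve and promoted back
 * afterwards.
 */
rc = LAPACKE_zlag2c_work( LAPACK_COL_MAJOR, n, nrhs, b, ld, work, ld ); /* z -> c */
/* ... single-precision solve operating on work ... */
rc = LAPACKE_clag2z_work( LAPACK_COL_MAJOR, n, nrhs, work, ld, b, ld ); /* c -> z */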
1115 pastix_complex64_t alpha,
1116 const pastix_complex64_t *A,
1118 const pastix_complex64_t *x,
1119 pastix_complex64_t beta,
1120 pastix_complex64_t *y )
1122 #if defined(PASTIX_WITH_MPI) && 0
1125 pastix_bcsc_t *bcsc = pastix_data->bcsc;
1129 cblknbr = bcsc->cscfnbr;
1130 for( i = 0; i < cblknbr; i++, bcblk++ ) {
1134 cblas_zgemv( CblasColMajor, CblasNoTrans, m, n,
1135 CBLAS_SADDR(alpha), A + scblk->lcolidx, lda, x, 1,
1136 CBLAS_SADDR(beta), y + scblk->lcolidx, 1 );
1140 cblas_zgemv( CblasColMajor, CblasNoTrans, m, n,
1141 CBLAS_SADDR(alpha), A, lda, x, 1,
1142 CBLAS_SADDR(beta), y, 1 );
1151 pastix_complex64_t alpha;
1152 const pastix_complex64_t *A;
1154 const pastix_complex64_t *x;
1155 pastix_complex64_t beta;
1156 pastix_complex64_t *y;
1185 struct z_gemv_s *arg = (struct z_gemv_s*)args;
1189 pastix_complex64_t alpha = arg->alpha;
1190 const pastix_complex64_t *A = arg->A;
1191 const pastix_complex64_t *Aptr;
1193 const pastix_complex64_t *x = arg->x;
1194 const pastix_complex64_t *xptr;
1195 pastix_complex64_t beta = arg->beta;
1196 pastix_complex64_t *y = arg->y;
1197 pastix_complex64_t *yptr;
1206 Aptr = A + sub_m * rank;
1209 yptr = y + sub_m * rank;
1211 if (rank == (size - 1)) {
1216 cblas_zgemv( CblasColMajor, CblasNoTrans, sub_m, n,
1217 CBLAS_SADDR(alpha), Aptr, lda, xptr, 1,
1218 CBLAS_SADDR(beta), yptr, 1 );
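/*
 * Row-block split sketch (assumption: sub_m = m / size, consistent with the
 * Aptr and yptr offsets above): every thread applies a full-width gemv to its
 * own panel of rows, the last thread keeping the remainder, so the whole
 * y = alpha * A * x + beta * y is covered without overlap.
 */
pastix_int_t row0  = sub_m * rank;
pastix_int_t m_loc = ( rank == (size - 1) ) ? ( m - row0 ) : sub_m;

cblas_zgemv( CblasColMajor, CblasNoTrans, m_loc, n,
             CBLAS_SADDR(alpha), A + row0, lda, x, 1,
             CBLAS_SADDR(beta),  y + row0, 1 );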
1265 pastix_complex64_t alpha,
1266 const pastix_complex64_t *A,
1268 const pastix_complex64_t *x,
1269 pastix_complex64_t beta,
1270 pastix_complex64_t *y )
1272 struct z_gemv_s arg = {m, n, alpha, A, lda, x, beta, y};
1297 pastix_complex64_t *y )
1299 #if defined( PASTIX_WITH_MPI )
1307 for ( i = 0; i < cblknbr; i++, cblk++ ) {
1308 if ( cblk->cblktype & (CBLK_FANIN|CBLK_RECV) ) {
1312 if ( cblk->fcolnum != lastindex ) {
1315 memset( y + lastindex, 0, ( cblk->fcolnum - lastindex ) * sizeof( pastix_complex64_t ) );
1317 lastindex = cblk->lcolnum + 1;
1320 if ( lastindex < n ) {
1322 memset( y + lastindex, 0, ( n - lastindex ) * sizeof( pastix_complex64_t ) );
1351 const pastix_complex64_t *
1353 const pastix_complex64_t *y )
1355 #if defined( PASTIX_WITH_MPI )
1358 pastix_complex64_t *yglobal = NULL;
1359 pastix_complex64_t *yto, *ytmp;
1360 const pastix_complex64_t *yfr;
1362 MPI_Request request_c = MPI_REQUEST_NULL;
1363 MPI_Request request_n = MPI_REQUEST_NULL;
1372 yglobal = malloc( gn * sizeof(pastix_complex64_t) );
1373 #if !defined(NDEBUG)
1374 memset( yglobal, 0xff, gn * sizeof(pastix_complex64_t) );
1380 MPI_Allgather( &ln, 1, PASTIX_MPI_INT,
1381 all_n, 1, PASTIX_MPI_INT, pastix_data->pastix_comm );
1383 MPI_Allgather( &lcblknbr, 1, PASTIX_MPI_INT,
1384 all_cblknbr, 1, PASTIX_MPI_INT, pastix_data->pastix_comm );
1386 for( c=0; c<pastix_data->procnbr; c++ )
1388 max_n = pastix_imax( max_n, all_n[c] );
1389 max_cblknbr = pastix_imax( max_cblknbr, all_cblknbr[c] );
1392 ytmp = malloc( max_n * sizeof(pastix_complex64_t) );
1393 indices = malloc( max_cblknbr * 2 * sizeof(pastix_int_t) );
1395 for( c=0; c<pastix_data->procnbr; c++ )
1397 if ( all_n[c] == 0 ) {
1401 if ( c == pastix_data->procnum ) {
1402 MPI_Ibcast( (pastix_complex64_t*)y, ln, PASTIX_MPI_COMPLEX64, c, pastix_data->pastix_comm, &request_n );
1407 for ( i = 0; i < cblknbr; i++, cblk++ ) {
1408 if ( cblk->cblktype & (CBLK_FANIN|CBLK_RECV) ) {
1413 yto = yglobal + cblk->fcolnum;
1415 memcpy( yto, yfr, cblk_colnbr( cblk ) * sizeof( pastix_complex64_t ) );
1417 indices[ 2*cblki ] = cblk->fcolnum;
1418 indices[ 2*cblki+1 ] = cblk->lcolnum;
1421 assert( cblki == lcblknbr );
1423 MPI_Ibcast( indices, 2 * lcblknbr, PASTIX_MPI_INT, c, pastix_data->pastix_comm, &request_c );
1424 MPI_Wait( &request_n, MPI_STATUS_IGNORE );
1425 MPI_Wait( &request_c, MPI_STATUS_IGNORE );
1428 MPI_Ibcast( ytmp, all_n[c], PASTIX_MPI_COMPLEX64, c, pastix_data->pastix_comm, &request_n );
1429 MPI_Ibcast( indices, all_cblknbr[c] * 2, PASTIX_MPI_INT, c, pastix_data->pastix_comm, &request_c );
1430 MPI_Wait( &request_n, MPI_STATUS_IGNORE );
1431 MPI_Wait( &request_c, MPI_STATUS_IGNORE );
1436 for ( i = 0; i < all_cblknbr[c]; i++ ) {
1437 yto = yglobal + indices[2*i];
1439 colnbr = indices[2*i+1] - indices[2*i] + 1;
1440 memcpy( yto, yfr, colnbr * sizeof( pastix_complex64_t ) );
1449 free( all_cblknbr );
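/*
 * Summary of the exchange above: every rank c broadcasts its local entries of
 * y (all_n[c] values) together with the { fcolnum, lcolnum } pairs of its
 * column blocks (2 * all_cblknbr[c] indices); every rank then copies each
 * received chunk into yglobal at the broadcast offsets, so all nodes end up
 * with the complete right-hand side.
 */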
1480 pastix_complex64_t *y )
1482 #if defined( PASTIX_WITH_MPI )
1484 MPI_Allreduce( MPI_IN_PLACE,
1485 y, pastix_data->csc->gNexp,
1486 PASTIX_MPI_COMPLEX64, MPI_SUM,
static void pthread_bvec_zdotu(isched_thread_t *ctx, void *args)
Compute the scalar product x.y. (Parallel version)
static void pthread_bvec_zcopy(isched_thread_t *ctx, void *args)
Copy a vector y = x (parallel version)
static void pthread_bvec_zgemv(isched_thread_t *ctx, void *args)
Compute y = alpha * A * x + beta * y. (Parallel version)
static void pthread_bvec_zaxpy(isched_thread_t *ctx, void *args)
Compute y <- alpha * x + y (Parallel version).
void bvec_zcopy_seq(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex64_t *x, pastix_complex64_t *y)
Copy a vector y = x (Sequential version)
void bvec_zgemv_seq(pastix_data_t *pastix_data, pastix_int_t m, pastix_int_t n, pastix_complex64_t alpha, const pastix_complex64_t *A, pastix_int_t lda, const pastix_complex64_t *x, pastix_complex64_t beta, pastix_complex64_t *y)
Compute y = alpha * A * x + beta * y. (Sequential version)
void bvec_znullify_remote(const pastix_data_t *pastix_data, pastix_complex64_t *y)
Set to 0 remote coefficients.
void bvec_zaxpy_seq(pastix_data_t *pastix_data, pastix_int_t n, pastix_complex64_t alpha, const pastix_complex64_t *x, pastix_complex64_t *y)
Compute y <- alpha * x + y. (Sequential version)
pastix_complex64_t bvec_zdotu_seq(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex64_t *x, const pastix_complex64_t *y)
Compute the scalar product x.y. (Sequential version)
void bvec_zallreduce(const pastix_data_t *pastix_data, pastix_complex64_t *y)
Apply an all reduce of the vector on all nodes.
double bvec_znrm2_smp(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex64_t *x)
Compute the norm 2 of a vector. (Parallel version)
static void pthread_bvec_zscal(isched_thread_t *ctx, void *args)
Scale a vector (Parallel version)
double bvec_znrm2_seq(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex64_t *x)
Compute the norm 2 of a vector. (Sequential version)
void bvec_zgemv_smp(pastix_data_t *pastix_data, pastix_int_t m, pastix_int_t n, pastix_complex64_t alpha, const pastix_complex64_t *A, pastix_int_t lda, const pastix_complex64_t *x, pastix_complex64_t beta, pastix_complex64_t *y)
Compute y = alpha * A * x + beta * y. (Parallel version)
void bvec_zcopy_smp(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex64_t *x, pastix_complex64_t *y)
Copy a vector y = x (parallel version)
void bvec_zscal_smp(pastix_data_t *pastix_data, pastix_int_t n, pastix_complex64_t alpha, pastix_complex64_t *x)
Scale a vector (Parallel version)
static void pthread_bvec_znrm2(isched_thread_t *ctx, void *args)
Compute the norm 2 of a vector. (Parallel version)
void bvec_zaxpy_smp(pastix_data_t *pastix_data, pastix_int_t n, pastix_complex64_t alpha, const pastix_complex64_t *x, pastix_complex64_t *y)
Perform y = alpha * x + y (Parallel version)
const pastix_complex64_t * bvec_zgather_remote(const pastix_data_t *pastix_data, const pastix_complex64_t *y)
Gather a distributed right hand side (bvec storage) on all nodes.
void bvec_zscal_seq(pastix_data_t *pastix_data, pastix_int_t n, pastix_complex64_t alpha, pastix_complex64_t *x)
Scale a vector by the scalar alpha. (Sequential version)
void bcsc_zspsv(pastix_data_t *pastix_data, pastix_complex64_t *b, pastix_complex32_t *work)
Solve A x = b with A the sparse matrix.
pastix_complex64_t bvec_zdotu_smp(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex64_t *x, const pastix_complex64_t *y)
Compute a regular scalar product x.y (Parallel version)