27#include "order/order_internal.h"
28#include "frobeniusupdate.h"
32#if defined(PASTIX_WITH_MPI)
34bvec_cmpi_frb_merge(
float *dist,
40 frobenius_merge( dist[0], dist[1], loc, loc+1 );
77 pastix_bcsc_t *bcsc = pastix_data->
bcsc;
80 float data[] = { 0., 1. };
85 cblknbr = bcsc->cscfnbr;
86 for( i = 0; i < cblknbr; i++, bcblk++ ) {
89 valptr = (
const float*)(x + scblk->
lcolidx);
91 for( j=0; j < colnbr; j++, valptr++ )
94 frobenius_update( 1, data, data + 1, valptr );
95#if defined(PRECISION_z) || defined(PRECISION_c)
98 frobenius_update( 1, data, data + 1, valptr );
103#if defined(PASTIX_WITH_MPI)
107 MPI_Op_create( (MPI_User_function *)bvec_cmpi_frb_merge, 1, &merge );
108 MPI_Allreduce( MPI_IN_PLACE, data, 2, MPI_FLOAT, merge, solvmtx->solv_comm );
109 MPI_Op_free( &merge );
113 norm = data[0] * sqrtf( data[1] );
119struct c_argument_nrm2_s
123 pastix_atomic_lock_t lock;
149 struct c_argument_nrm2_s *arg = (
struct c_argument_nrm2_s*)args;
154 float *valptr = (
float*)x;
158 size = ctx->global_ctx->world_size;
161 begin = (n / size) * rank;
162 if (rank == (size - 1)) {
165 end = (n / size) * (rank + 1);
169#if defined(PRECISION_z) || defined(PRECISION_c)
173 for( i = begin; i < end; i++, valptr++ )
175 frobenius_update( 1, &scale, &sumsq, valptr );
176#if defined(PRECISION_z) || defined(PRECISION_c)
178 frobenius_update( 1, &scale, &sumsq, valptr );
184 pastix_atomic_lock( &(arg->lock) );
185 frobenius_merge( scale, sumsq, &(arg->scale), &(arg->sumsq) );
186 pastix_atomic_unlock( &(arg->lock) );
219 struct c_argument_nrm2_s arg = { n, x, PASTIX_ATOMIC_UNLOCKED, 0., 1. };
222#if defined(PASTIX_WITH_MPI)
226 MPI_Op_create( (MPI_User_function *)bvec_cmpi_frb_merge, 1, &merge );
227 MPI_Allreduce( MPI_IN_PLACE, &(arg.scale), 2, MPI_FLOAT, merge, pastix_data->
solvmatr->solv_comm );
228 MPI_Op_free( &merge );
232 return arg.scale * sqrtf( arg.sumsq );
264#if defined(PASTIX_WITH_MPI) && 0
267 pastix_bcsc_t *bcsc = pastix_data->
bcsc;
271 cblknbr = bcsc->cscfnbr;
272 for( i = 0; i < cblknbr; i++, bcblk++ ) {
276 cblas_cscal( n, CBLAS_SADDR(alpha), x + scblk->
lcolidx, 1 );
280 cblas_cscal( n, CBLAS_SADDR(alpha), x, 1 );
285struct c_argument_scal_s
313 struct c_argument_scal_s *arg = (
struct c_argument_scal_s*)args;
326 begin = (n/size) * rank;
327 if (rank == (size - 1)) {
330 end = (n/size) * (rank + 1);
333 if ( (end - begin) > 0 ) {
334 cblas_cscal( end - begin, CBLAS_SADDR(alpha), x + begin, 1 );
367 struct c_argument_scal_s arg = {n, alpha, x};
404#if defined(PASTIX_WITH_MPI) && 0
407 pastix_bcsc_t *bcsc = pastix_data->
bcsc;
411 cblknbr = bcsc->cscfnbr;
412 for( i = 0; i < cblknbr; i++, bcblk++ ){
416 cblas_caxpy( n, CBLAS_SADDR(alpha),
422 cblas_caxpy( n, CBLAS_SADDR(alpha), x, 1, y, 1 );
426struct c_argument_axpy_s
455 struct c_argument_axpy_s *arg = (
struct c_argument_axpy_s*)args;
463 if( (y == NULL) || (x == NULL) ) {
474 begin = (n/size) * rank;
475 if (rank == (size - 1)) {
478 end = (n/size) * (rank + 1);
481 if ( (end - begin) > 0 ) {
482 cblas_caxpy( end - begin, CBLAS_SADDR(alpha), x + begin, 1, y + begin, 1 );
519 struct c_argument_axpy_s args = {n, alpha, x, y};
523struct c_argument_dotc_s
528 pastix_atomic_lock_t lock;
532#if defined(PRECISION_z) || defined(PRECISION_c)
568 pastix_bcsc_t *bcsc = pastix_data->
bcsc;
575 cblknbr = bcsc->cscfnbr;
576 for( i = 0; i < cblknbr; i++, bcblk++ ) {
582 for( j=0; j<n; j++, xptr++, yptr++ ) {
583 r += conjf(*xptr) * (*yptr);
587#if defined(PASTIX_WITH_MPI)
588 MPI_Allreduce( MPI_IN_PLACE, &r, 1, PASTIX_MPI_COMPLEX32,
589 MPI_SUM, solvmtx->solv_comm );
612pthread_bvec_cdotc( isched_thread_t *ctx,
615 struct c_argument_dotc_s *arg = (
struct c_argument_dotc_s*)args;
626 begin = (n/size) * rank;
627 if (rank != size - 1) {
628 end = (n/size) * (rank + 1);
636 for ( i = begin; i < end; i++, xptr++, yptr++ )
638 r += conjf(*xptr) * (*yptr);
641 if ( cabsf(r) > 0. ) {
642 pastix_atomic_lock( &(arg->lock) );
644 pastix_atomic_unlock( &(arg->lock) );
681 struct c_argument_dotc_s arg = {n, x, y, PASTIX_ATOMIC_UNLOCKED, 0.0};
682 isched_parallel_call( pastix_data->
isched, pthread_bvec_cdotc, &arg );
684#if defined(PASTIX_WITH_MPI)
685 MPI_Allreduce( MPI_IN_PLACE, &(arg.sum), 1, PASTIX_MPI_COMPLEX32,
686 MPI_SUM, pastix_data->
solvmatr->solv_comm );
728 pastix_bcsc_t *bcsc = pastix_data->
bcsc;
735 cblknbr = bcsc->cscfnbr;
736 for( i = 0; i < cblknbr; i++, bcblk++ ) {
742 for( j=0; j<n; j++, xptr++, yptr++ ) {
743 r += (*xptr) * (*yptr);
747#if defined(PASTIX_WITH_MPI)
748 MPI_Allreduce( MPI_IN_PLACE, &r, 1, PASTIX_MPI_COMPLEX32,
749 MPI_SUM, solvmtx->solv_comm );
775 struct c_argument_dotc_s *arg = (
struct c_argument_dotc_s*)args;
789 begin = (n / size) * rank;
790 if ( rank == (size - 1)) {
793 end = (n / size) * (rank + 1);
799 for (i=begin; i<end; i++, xptr++, yptr++)
801 r += (*xptr) * (*yptr);
804 if ( cabsf(r) > 0. ) {
805 pastix_atomic_lock( &(arg->lock) );
807 pastix_atomic_unlock( &(arg->lock) );
844 struct c_argument_dotc_s arg = {n, x, y, PASTIX_ATOMIC_UNLOCKED, 0.0};
847#if defined(PASTIX_WITH_MPI)
848 MPI_Allreduce( MPI_IN_PLACE, &(arg.sum), 1, PASTIX_MPI_COMPLEX32,
849 MPI_SUM, pastix_data->
solvmatr->solv_comm );
884#if defined(PASTIX_WITH_MPI) && 0
887 pastix_bcsc_t *bcsc = pastix_data->
bcsc;
891 cblknbr = bcsc->cscfnbr;
892 for( i = 0; i < cblknbr; i++, bcblk++ ) {
904struct argument_copy_s {
933 struct argument_copy_s *arg = (
struct argument_copy_s*)args;
941 begin = (n/size) * rank;
943 if (rank == (size - 1)) {
946 end = (n/size) * (rank + 1);
949 if ( (end - begin) > 0 ) {
985 struct argument_copy_s args = {n, x, y};
1023 .flttype = PastixComplex32,
1024 .m = pastix_data->
bcsc->n,
1026 .ld = pastix_data->
bcsc->n,
1036#if defined(PRECISION_z) || defined(PRECISION_d)
1042 rhsb.
flttype = PastixComplex32;
1046 rc = LAPACKE_clag2z_work( LAPACK_COL_MAJOR,
n, nrhs,
1053 rc = LAPACKE_clag2z_work( LAPACK_COL_MAJOR,
n, nrhs,
1060 assert(work == NULL);
1064 if ( rhsb.
cblkb != NULL ) {
1122#if defined(PASTIX_WITH_MPI) && 0
1125 pastix_bcsc_t *bcsc = pastix_data->
bcsc;
1129 cblknbr = bcsc->cscfnbr;
1130 for( i = 0; i < cblknbr; i++, bcblk++ ) {
1134 cblas_cgemv( CblasColMajor, CblasNoTrans,
m,
n,
1135 CBLAS_SADDR(alpha), A + scblk->
lcolidx, lda, x, 1,
1136 CBLAS_SADDR(beta), y + scblk->
lcolidx, 1 );
1140 cblas_cgemv( CblasColMajor, CblasNoTrans,
m,
n,
1141 CBLAS_SADDR(alpha), A, lda, x, 1,
1142 CBLAS_SADDR(beta), y, 1 );
1185 struct c_gemv_s *arg = (
struct c_gemv_s*)args;
1206 Aptr = A + sub_m * rank;
1209 yptr = y + sub_m * rank;
1211 if (rank == (size - 1)) {
1216 cblas_cgemv( CblasColMajor, CblasNoTrans, sub_m, n,
1217 CBLAS_SADDR(alpha), Aptr, lda, xptr, 1,
1218 CBLAS_SADDR(beta), yptr, 1 );
1272 struct c_gemv_s arg = {m, n, alpha, A, lda, x, beta, y};
1299#if defined( PASTIX_WITH_MPI )
1307 for ( i = 0; i < cblknbr; i++, cblk++ ) {
1308 if ( cblk->
cblktype & (CBLK_FANIN|CBLK_RECV) ) {
1312 if ( cblk->
fcolnum != lastindex ) {
1317 lastindex = cblk->
lcolnum + 1;
1320 if ( lastindex < n ) {
1355#if defined( PASTIX_WITH_MPI )
1362 MPI_Request request_c = MPI_REQUEST_NULL;
1363 MPI_Request request_n = MPI_REQUEST_NULL;
1380 MPI_Allgather( &ln, 1, PASTIX_MPI_INT,
1381 all_n, 1, PASTIX_MPI_INT, pastix_data->
pastix_comm );
1383 MPI_Allgather( &lcblknbr, 1, PASTIX_MPI_INT,
1384 all_cblknbr, 1, PASTIX_MPI_INT, pastix_data->
pastix_comm );
1386 for( c=0; c<pastix_data->
procnbr; c++ )
1388 max_n = pastix_imax( max_n, all_n[c] );
1389 max_cblknbr = pastix_imax( max_cblknbr, all_cblknbr[c] );
1393 indices = malloc( max_cblknbr * 2 *
sizeof(
pastix_int_t) );
1395 for( c=0; c<pastix_data->
procnbr; c++ )
1397 if ( all_n[c] == 0 ) {
1401 if ( c == pastix_data->
procnum ) {
1407 for ( i = 0; i < cblknbr; i++, cblk++ ) {
1408 if ( cblk->
cblktype & (CBLK_FANIN|CBLK_RECV) ) {
1413 yto = yglobal + cblk->
fcolnum;
1417 indices[ 2*cblki ] = cblk->
fcolnum;
1418 indices[ 2*cblki+1 ] = cblk->
lcolnum;
1421 assert( cblki == lcblknbr );
1423 MPI_Ibcast( indices, 2 * lcblknbr, PASTIX_MPI_INT, c, pastix_data->
pastix_comm, &request_c );
1424 MPI_Wait( &request_n, MPI_STATUS_IGNORE );
1425 MPI_Wait( &request_c, MPI_STATUS_IGNORE );
1428 MPI_Ibcast( ytmp, all_n[c], PASTIX_MPI_COMPLEX32, c, pastix_data->
pastix_comm, &request_n );
1429 MPI_Ibcast( indices, all_cblknbr[c] * 2, PASTIX_MPI_INT, c, pastix_data->
pastix_comm, &request_c );
1430 MPI_Wait( &request_n, MPI_STATUS_IGNORE );
1431 MPI_Wait( &request_c, MPI_STATUS_IGNORE );
1436 for ( i = 0; i < all_cblknbr[c]; i++ ) {
1437 yto = yglobal + indices[2*i];
1439 colnbr = indices[2*i+1] - indices[2*i] + 1;
1449 free( all_cblknbr );
1482#if defined( PASTIX_WITH_MPI )
1484 MPI_Allreduce( MPI_IN_PLACE,
1485 y, pastix_data->
csc->gNexp,
1486 PASTIX_MPI_COMPLEX32, MPI_SUM,
BEGIN_C_DECLS typedef int pastix_int_t
float _Complex pastix_complex32_t
static void pthread_bvec_cdotu(isched_thread_t *ctx, void *args)
Compute the scalar product x.y. (Parallel version)
static void pthread_bvec_cgemv(isched_thread_t *ctx, void *args)
Compute.
static void pthread_bvec_caxpy(isched_thread_t *ctx, void *args)
Compute y <- alpha * x + y (Parallel version).
static void pthread_bvec_ccopy(isched_thread_t *ctx, void *args)
Copy a vector y = x (parallel version)
void bvec_cgemv_smp(pastix_data_t *pastix_data, pastix_int_t m, pastix_int_t n, pastix_complex32_t alpha, const pastix_complex32_t *A, pastix_int_t lda, const pastix_complex32_t *x, pastix_complex32_t beta, pastix_complex32_t *y)
Compute.
float bvec_cnrm2_seq(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex32_t *x)
Compute the norm 2 of a vector. (Sequential version)
const pastix_complex32_t * bvec_cgather_remote(const pastix_data_t *pastix_data, const pastix_complex32_t *y)
Gather a distributed right hand side (bvec storage) on all nodes.
void bvec_cscal_seq(pastix_data_t *pastix_data, pastix_int_t n, pastix_complex32_t alpha, pastix_complex32_t *x)
Scale a vector by the scalar alpha. (Sequential version)
void bvec_callreduce(const pastix_data_t *pastix_data, pastix_complex32_t *y)
Apply an all reduce of the vector on all nodes.
void bvec_ccopy_smp(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex32_t *x, pastix_complex32_t *y)
Copy a vector y = x (parallel version)
void bvec_ccopy_seq(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex32_t *x, pastix_complex32_t *y)
Copy a vector y = x (Sequential version)
void bvec_caxpy_seq(pastix_data_t *pastix_data, pastix_int_t n, pastix_complex32_t alpha, const pastix_complex32_t *x, pastix_complex32_t *y)
Compute y <- alpha * x + y. (Sequential version)
void bcsc_cspsv(pastix_data_t *pastix_data, pastix_complex32_t *b, pastix_complex32_t *work)
Solve A x = b with A the sparse matrix.
pastix_complex32_t bvec_cdotu_smp(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex32_t *x, const pastix_complex32_t *y)
Compute a regular scalar product x.y (Parallel version)
static void pthread_bvec_cnrm2(isched_thread_t *ctx, void *args)
Compute the norm 2 of a vector. (Parallel version)
pastix_complex32_t bvec_cdotu_seq(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex32_t *x, const pastix_complex32_t *y)
Compute the scalar product x.y. (Sequential version)
void bvec_cgemv_seq(pastix_data_t *pastix_data, pastix_int_t m, pastix_int_t n, pastix_complex32_t alpha, const pastix_complex32_t *A, pastix_int_t lda, const pastix_complex32_t *x, pastix_complex32_t beta, pastix_complex32_t *y)
Compute.
float bvec_cnrm2_smp(pastix_data_t *pastix_data, pastix_int_t n, const pastix_complex32_t *x)
Compute the norm 2 of a vector. (Parallel version)
void bvec_caxpy_smp(pastix_data_t *pastix_data, pastix_int_t n, pastix_complex32_t alpha, const pastix_complex32_t *x, pastix_complex32_t *y)
Perform y = alpha * x + y (Parallel version)
static void pthread_bvec_cscal(isched_thread_t *ctx, void *args)
Scale a vector (Parallel version)
void bvec_cscal_smp(pastix_data_t *pastix_data, pastix_int_t n, pastix_complex32_t alpha, pastix_complex32_t *x)
Scale a vector (Parallel version)
void bvec_cnullify_remote(const pastix_data_t *pastix_data, pastix_complex32_t *y)
Set to 0 remote coefficients.
Compressed colptr format for the bcsc.
int pastix_subtask_solve(pastix_data_t *pastix_data, pastix_rhs_t b)
Solve the given problem without applying the permutation.
pastix_coeftype_t flttype
PASTIX_Comm inter_node_comm
Main PaStiX data structure.
Main PaStiX RHS structure.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
SolverCblk *restrict cblktab
Solver column block structure.
Solver column block structure.