#include "order/order_internal.h"
#include "frobeniusupdate.h"
#if defined(PASTIX_WITH_MPI)
/* MPI user function merging two (scale, sumsq) pairs so that the reduced
 * pair still satisfies norm = scale * sqrt(sumsq). */
void
bvec_smpi_frb_merge( float *dist,
                     float *loc,
                     /* ... */ )
{
    /* ... */
    frobenius_merge( dist[0], dist[1], loc, loc+1 );
    /* ... */
}
#endif
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    float data[] = { 0., 1. }; /* (scale, sumsq) pair: norm = scale * sqrt(sumsq) */
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        valptr = (const float*)(x + scblk->lcolidx);
        /* ... */
        for( j=0; j < colnbr; j++, valptr++ )
        {
            frobenius_update( 1, data, data + 1, valptr );
#if defined(PRECISION_z) || defined(PRECISION_c)
            /* ... second update for the imaginary part */
            frobenius_update( 1, data, data + 1, valptr );
#endif
        }
    }
#if defined(PASTIX_WITH_MPI)
    /* ... */
    MPI_Op_create( (MPI_User_function *)bvec_smpi_frb_merge, 1, &merge );
    MPI_Allreduce( MPI_IN_PLACE, data, 2, MPI_FLOAT, merge, solvmtx->solv_comm );
    MPI_Op_free( &merge );
#endif
    /* ... */
    norm = data[0] * sqrtf( data[1] );
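/*
 * Illustrative sketch (not the PaStiX source, requires <math.h>):
 * frobenius_update() follows the LAPACK ?lassq recurrence that the
 * (scale, sumsq) pair above relies on.  It maintains the invariant
 * ||x||_2 = scale * sqrt(sumsq) without ever squaring a large entry,
 * so the accumulation can neither overflow nor underflow prematurely.
 * The name sketch_frobenius_update() is ours.
 */
static inline void
sketch_frobenius_update( float *scale, float *sumsq, const float *value )
{
    float v = fabsf( *value );
    if ( v == 0.f ) {
        return;
    }
    if ( *scale < v ) {
        float ratio = *scale / v;
        /* re-express the running sum with the new, larger scale */
        *sumsq = 1.f + (*sumsq) * ratio * ratio;
        *scale = v;
    }
    else {
        float ratio = v / *scale;
        *sumsq += ratio * ratio;
    }
}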
struct s_argument_nrm2_s {
    /* ... */
    pastix_atomic_lock_t lock;
    /* ... */
};
    struct s_argument_nrm2_s *arg = (struct s_argument_nrm2_s*)args;
    /* ... */
    const float *x = arg->x;
    /* ... */
    float *valptr = (float*)x;
    /* ... */
    size = ctx->global_ctx->world_size;
    /* ... */
    begin = (n / size) * rank;
    if (rank == (size - 1)) {
        end = n; /* the last thread absorbs the n % size remainder */
    }
    else {
        end = (n / size) * (rank + 1);
    }
    /* ... */
#if defined(PRECISION_z) || defined(PRECISION_c)
    /* ... */
#endif
    /* ... */
    for( i = begin; i < end; i++, valptr++ )
    {
        frobenius_update( 1, &scale, &sumsq, valptr );
#if defined(PRECISION_z) || defined(PRECISION_c)
        /* ... */
        frobenius_update( 1, &scale, &sumsq, valptr );
#endif
    }
    /* ... */
    pastix_atomic_lock( &(arg->lock) );
    frobenius_merge( scale, sumsq, &(arg->scale), &(arg->sumsq) );
    pastix_atomic_unlock( &(arg->lock) );
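/*
 * Sketch of the pairwise reduction behind frobenius_merge() (illustrative
 * only, name is ours): a partial result (scl, ssq) is folded into
 * (*scale, *sumsq) under the larger of the two scales, preserving
 * norm = scale * sqrt(sumsq).  This is what each worker applies under
 * arg->lock above, and what the MPI user op applies across ranks.
 */
static inline void
sketch_frobenius_merge( float scl, float ssq, float *scale, float *sumsq )
{
    if ( *scale < scl ) {
        float ratio = *scale / scl;
        *sumsq = ssq + (*sumsq) * ratio * ratio;
        *scale = scl;
    }
    else if ( *scale > 0.f ) {
        float ratio = scl / *scale;
        *sumsq += ssq * ratio * ratio;
    }
}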
    struct s_argument_nrm2_s arg = { n, x, PASTIX_ATOMIC_UNLOCKED, 0., 1. };
    /* ... */
#if defined(PASTIX_WITH_MPI)
    /* ... */
    MPI_Op_create( (MPI_User_function *)bvec_smpi_frb_merge, 1, &merge );
    MPI_Allreduce( MPI_IN_PLACE, &(arg.scale), 2, MPI_FLOAT, merge,
                   pastix_data->solvmatr->solv_comm );
    MPI_Op_free( &merge );
#endif
    /* ... */
    return arg.scale * sqrtf( arg.sumsq );
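/*
 * Minimal standalone illustration (our own example, not PaStiX code) of
 * the reduction pattern used by bvec_snrm2_smp(): each rank owns one
 * (scale, sumsq) pair and a user-defined commutative MPI_Op combines the
 * pairs across ranks with the frobenius_merge() algebra.
 */
#include <mpi.h>
#include <math.h>
#include <stdio.h>

static void merge_pairs( void *in, void *inout, int *len, MPI_Datatype *dt )
{
    float *a = (float*)in;
    float *b = (float*)inout;
    (void)dt; (void)len;            /* called with len == 2: one pair     */
    if ( b[0] < a[0] ) {            /* same algebra as frobenius_merge()  */
        float r = b[0] / a[0];
        b[1] = a[1] + b[1] * r * r;
        b[0] = a[0];
    }
    else if ( b[0] > 0.f ) {
        float r = a[0] / b[0];
        b[1] += a[1] * r * r;
    }
}

int main( int argc, char **argv )
{
    MPI_Init( &argc, &argv );
    int rank;
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    float data[2] = { (float)(rank + 1), 1.f }; /* local (scale, sumsq)   */
    MPI_Op merge;
    MPI_Op_create( merge_pairs, 1, &merge );
    MPI_Allreduce( MPI_IN_PLACE, data, 2, MPI_FLOAT, merge, MPI_COMM_WORLD );
    MPI_Op_free( &merge );
    if ( rank == 0 ) {
        printf( "norm = %g\n", data[0] * sqrtf( data[1] ) );
    }
    MPI_Finalize();
    return 0;
}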
#if defined(PASTIX_WITH_MPI) && 0 /* distributed variant disabled by the "&& 0" */
    /* ... */
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        cblas_sscal( n, (alpha), x + scblk->lcolidx, 1 );
    }
#else
    /* ... */
    cblas_sscal( n, (alpha), x, 1 );
#endif
struct s_argument_scal_s { /* ... */ };
    struct s_argument_scal_s *arg = (struct s_argument_scal_s*)args;
    /* ... */
    float alpha = arg->alpha;
    /* ... */
    begin = (n/size) * rank;
    if (rank == (size - 1)) {
        end = n;
    }
    else {
        end = (n/size) * (rank + 1);
    }
    /* ... */
    if ( (end - begin) > 0 ) {
        cblas_sscal( end - begin, (alpha), x + begin, 1 );
    }
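/*
 * All of the pthread_bvec_* kernels in this file slice the vector with the
 * same static partition.  A hypothetical helper making it explicit (rank
 * and size come from the isched thread context; the name is ours):
 */
static inline void
sketch_partition( pastix_int_t n, int size, int rank,
                  pastix_int_t *begin, pastix_int_t *end )
{
    pastix_int_t chunk = n / size;
    *begin = chunk * rank;
    /* the last thread absorbs the n % size leftover entries */
    *end   = ( rank == size - 1 ) ? n : chunk * ( rank + 1 );
}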
    struct s_argument_scal_s arg = {n, alpha, x};
#if defined(PASTIX_WITH_MPI) && 0
    /* ... */
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ){
        /* ... */
        cblas_saxpy( n, (alpha),
                     /* ... */ );
    }
#else
    /* ... */
    cblas_saxpy( n, (alpha), x, 1, y, 1 );
#endif
struct s_argument_axpy_s { /* ... */ };
    struct s_argument_axpy_s *arg = (struct s_argument_axpy_s*)args;
    /* ... */
    float alpha = arg->alpha;
    const float *x = arg->x;
    /* ... */
    if( (y == NULL) || (x == NULL) ) {
        return;
    }
    /* ... */
    if( alpha == (float)0.0 ) {
        return; /* nothing to accumulate */
    }
    /* ... */
    begin = (n/size) * rank;
    if (rank == (size - 1)) {
        end = n;
    }
    else {
        end = (n/size) * (rank + 1);
    }
    /* ... */
    if ( (end - begin) > 0 ) {
        cblas_saxpy( end - begin, (alpha), x + begin, 1, y + begin, 1 );
    }
    struct s_argument_axpy_s args = {n, alpha, x, y};
struct s_argument_dot_s {
    /* ... */
    pastix_atomic_lock_t lock;
    /* ... */
};
#if defined(PRECISION_z) || defined(PRECISION_c)
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        for( j=0; j<n; j++, xptr++, yptr++ ) {
            r += (*xptr) * (*yptr);
        }
    }
    /* ... */
#if defined(PASTIX_WITH_MPI)
    MPI_Allreduce( MPI_IN_PLACE, &r, 1, PASTIX_MPI_FLOAT,
                   MPI_SUM, solvmtx->solv_comm );
#endif
    struct s_argument_dot_s *arg = (struct s_argument_dot_s*)args;
    /* ... */
    const float *xptr = arg->x;
    const float *yptr = arg->y;
    /* ... */
    begin = (n/size) * rank;
    if (rank != size - 1) {
        end = (n/size) * (rank + 1);
    }
    else {
        end = n;
    }
    /* ... */
    for ( i = begin; i < end; i++, xptr++, yptr++ )
    {
        r += (*xptr) * (*yptr);
    }
    /* ... */
    if ( fabsf(r) > 0. ) {
        pastix_atomic_lock( &(arg->lock) );
        arg->sum += r; /* merge the local partial sum */
        pastix_atomic_unlock( &(arg->lock) );
    }
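/*
 * Note on the merge above: the fabsf(r) > 0. test lets threads whose
 * partial dot product is exactly zero skip the critical section entirely,
 * so arg->lock is only contended by threads that actually have a
 * contribution to add to arg->sum.
 */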
    struct s_argument_dot_s arg = {n, x, y, PASTIX_ATOMIC_UNLOCKED, 0.0};
    /* ... */
#if defined(PASTIX_WITH_MPI)
    MPI_Allreduce( MPI_IN_PLACE, &(arg.sum), 1, PASTIX_MPI_FLOAT,
                   MPI_SUM, pastix_data->solvmatr->solv_comm );
#endif
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        for( j=0; j<n; j++, xptr++, yptr++ ) {
            r += (*xptr) * (*yptr);
        }
    }
    /* ... */
#if defined(PASTIX_WITH_MPI)
    MPI_Allreduce( MPI_IN_PLACE, &r, 1, PASTIX_MPI_FLOAT,
                   MPI_SUM, solvmtx->solv_comm );
#endif
    struct s_argument_dot_s *arg = (struct s_argument_dot_s*)args;
    /* ... */
    const float *x = arg->x;
    const float *y = arg->y;
    /* ... */
    begin = (n / size) * rank;
    if ( rank == (size - 1)) {
        end = n;
    }
    else {
        end = (n / size) * (rank + 1);
    }
    /* ... */
    for (i=begin; i<end; i++, xptr++, yptr++)
    {
        r += (*xptr) * (*yptr);
    }
    /* ... */
    if ( fabsf(r) > 0. ) {
        pastix_atomic_lock( &(arg->lock) );
        arg->sum += r;
        pastix_atomic_unlock( &(arg->lock) );
    }
    struct s_argument_dot_s arg = {n, x, y, PASTIX_ATOMIC_UNLOCKED, 0.0};
    /* ... */
#if defined(PASTIX_WITH_MPI)
    MPI_Allreduce( MPI_IN_PLACE, &(arg.sum), 1, PASTIX_MPI_FLOAT,
                   MPI_SUM, pastix_data->solvmatr->solv_comm );
#endif
#if defined(PASTIX_WITH_MPI) && 0
    /* ... */
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        memcpy( y + scblk->lcolidx, x + scblk->lcolidx, n * sizeof(float) );
    }
#else
    /* ... */
    memcpy( y, x, n * sizeof(float) );
#endif
struct argument_copy_s { /* ... */ };
    struct argument_copy_s *arg = (struct argument_copy_s*)args;
    /* ... */
    begin = (n/size) * rank;
    /* ... */
    if (rank == (size - 1)) {
        end = n;
    }
    else {
        end = (n/size) * (rank + 1);
    }
    /* ... */
    if ( (end - begin) > 0 ) {
        memcpy( arg->y + begin, arg->x + begin, (end - begin) * sizeof(float) );
    }
    struct argument_copy_s args = {n, x, y};
        .flttype = PastixFloat,
        .m       = pastix_data->bcsc->n,
        /* ... */
        .ld      = pastix_data->bcsc->n,
        /* ... */
#if defined(PRECISION_z) || defined(PRECISION_d)
    /* ... */
    rc = LAPACKE_slag2d_work( LAPACK_COL_MAJOR, n, nrhs,
                              /* ... */ );
    /* ... */
    rc = LAPACKE_slag2d_work( LAPACK_COL_MAJOR, n, nrhs,
                              /* ... */ );
    /* ... */
#endif
    /* ... */
    assert(work == NULL);
    /* ... */
    if ( rhsb.cblkb != NULL ) {
        /* ... */
    }
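/*
 * Illustrative round trip (our example, not the PaStiX source) for the
 * mixed-precision path guarded above: a single-precision right-hand side
 * is promoted with slag2d before a double-precision computation, then
 * truncated back with dlag2s.  Buffer and function names are hypothetical.
 */
#include <lapacke.h>

static int promote_and_demote( lapack_int n, lapack_int nrhs,
                               float *b32, double *b64 )
{
    lapack_int rc;
    /* float -> double: exact, cannot overflow */
    rc = LAPACKE_slag2d_work( LAPACK_COL_MAJOR, n, nrhs, b32, n, b64, n );
    if ( rc != 0 ) return rc;
    /* ... double-precision computation on b64 ... */
    /* double -> float: rc > 0 if an entry overflows the float range */
    rc = LAPACKE_dlag2s_work( LAPACK_COL_MAJOR, n, nrhs, b64, n, b32, n );
    return rc;
}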
#if defined(PASTIX_WITH_MPI) && 0
    /* ... */
    pastix_bcsc_t *bcsc = pastix_data->bcsc;
    /* ... */
    cblknbr = bcsc->cscfnbr;
    for( i = 0; i < cblknbr; i++, bcblk++ ) {
        /* ... */
        cblas_sgemv( CblasColMajor, CblasNoTrans, m, n,
                     (alpha), A + scblk->lcolidx, lda, x, 1,
                     (beta),  y + scblk->lcolidx, 1 );
    }
#else
    /* ... */
    cblas_sgemv( CblasColMajor, CblasNoTrans, m, n,
                 (alpha), A, lda, x, 1,
                 /* ... */ );
#endif
    struct s_gemv_s *arg = (struct s_gemv_s*)args;
    /* ... */
    float alpha = arg->alpha;
    const float *A = arg->A;
    /* ... */
    const float *x = arg->x;
    /* ... */
    float beta = arg->beta;
    /* ... */
    Aptr = A + sub_m * rank;
    /* ... */
    yptr = y + sub_m * rank;
    /* ... */
    if (rank == (size - 1)) {
        /* ... */
    }
    /* ... */
    cblas_sgemv( CblasColMajor, CblasNoTrans, sub_m, n,
                 (alpha), Aptr, lda, xptr, 1,
                 /* ... */ );
    /* ... */
    struct s_gemv_s arg = {m, n, alpha, A, lda, x, beta, y};
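/*
 * Why Aptr = A + sub_m * rank works: A is column-major with leading
 * dimension lda, so the block of rows [sub_m*rank, sub_m*(rank+1)) starts
 * at A + sub_m*rank and keeps the same lda.  Each thread thus writes an
 * independent slice of y, so no reduction is needed, unlike the dot and
 * nrm2 kernels above.
 */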
#if defined( PASTIX_WITH_MPI )
    /* ... */
    for ( i = 0; i < cblknbr; i++, cblk++ ) {
        if ( cblk->cblktype & (CBLK_FANIN|CBLK_RECV) ) {
            continue;
        }
        /* ... */
        if ( cblk->fcolnum != lastindex ) {
            /* ... */
            memset( y + lastindex, 0, ( cblk->fcolnum - lastindex ) * sizeof( float ) );
        }
        /* ... */
        lastindex = cblk->lcolnum + 1;
    }
    /* ... */
    if ( lastindex < n ) {
        /* ... */
        memset( y + lastindex, 0, ( n - lastindex ) * sizeof( float ) );
    }
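/*
 * The scan above zeroes exactly the entries of y that no local column
 * block owns: FANIN/RECV blocks are skipped, lastindex tracks the end of
 * the previous local block, and every [lastindex, fcolnum) gap, plus the
 * final [lastindex, n) tail, is memset to zero.
 */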
#if defined( PASTIX_WITH_MPI )
    /* ... */
    float *yglobal = NULL;
    /* ... */
    MPI_Request request_c = MPI_REQUEST_NULL;
    MPI_Request request_n = MPI_REQUEST_NULL;
    /* ... */
    yglobal = malloc( gn * sizeof(float) );
#if !defined(NDEBUG)
    memset( yglobal, 0xff, gn * sizeof(float) );
#endif
    /* ... */
    MPI_Allgather( &ln, 1, PASTIX_MPI_INT,
                   all_n, 1, PASTIX_MPI_INT, pastix_data->pastix_comm );
    /* ... */
    MPI_Allgather( &lcblknbr, 1, PASTIX_MPI_INT,
                   all_cblknbr, 1, PASTIX_MPI_INT, pastix_data->pastix_comm );

    for( c=0; c<pastix_data->procnbr; c++ )
    {
        /* ... */
        max_n       = pastix_imax( max_n, all_n[c] );
        max_cblknbr = pastix_imax( max_cblknbr, all_cblknbr[c] );
    }
    /* ... */
    ytmp    = malloc( max_n * sizeof(float) );
    indices = malloc( max_cblknbr * 2 * sizeof(pastix_int_t) );
    /* ... */
    for( c=0; c<pastix_data->procnbr; c++ )
    {
        /* ... */
        if ( all_n[c] == 0 ) {
            continue;
        }
        /* ... */
        if ( c == pastix_data->procnum ) {
            MPI_Ibcast( (float*)y, ln, PASTIX_MPI_FLOAT, c, pastix_data->pastix_comm, &request_n );
            /* ... */
            for ( i = 0; i < cblknbr; i++, cblk++ ) {
                if ( cblk->cblktype & (CBLK_FANIN|CBLK_RECV) ) {
                    continue;
                }
                /* ... */
                yto = yglobal + cblk->fcolnum;
                /* ... */
                memcpy( yto, yfr, cblk_colnbr( cblk ) * sizeof( float ) );
                /* ... */
                indices[ 2*cblki   ] = cblk->fcolnum;
                indices[ 2*cblki+1 ] = cblk->lcolnum;
                /* ... */
            }
            assert( cblki == lcblknbr );
            /* ... */
            MPI_Ibcast( indices, 2 * lcblknbr, PASTIX_MPI_INT, c, pastix_data->pastix_comm, &request_c );
            MPI_Wait( &request_n, MPI_STATUS_IGNORE );
            MPI_Wait( &request_c, MPI_STATUS_IGNORE );
        }
        else {
            /* ... */
            MPI_Ibcast( ytmp, all_n[c], PASTIX_MPI_FLOAT, c, pastix_data->pastix_comm, &request_n );
            MPI_Ibcast( indices, all_cblknbr[c] * 2, PASTIX_MPI_INT, c, pastix_data->pastix_comm, &request_c );
            MPI_Wait( &request_n, MPI_STATUS_IGNORE );
            MPI_Wait( &request_c, MPI_STATUS_IGNORE );
            /* ... */
            for ( i = 0; i < all_cblknbr[c]; i++ ) {
                yto = yglobal + indices[2*i];
                /* ... */
                colnbr = indices[2*i+1] - indices[2*i] + 1;
                memcpy( yto, yfr, colnbr * sizeof( float ) );
                /* ... */
            }
        }
    }
    /* ... */
    free( all_cblknbr );
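/*
 * Broadcast protocol sketch for the gather above: every rank takes a turn
 * as root c.  The root broadcasts its ln local values and the 2*lcblknbr
 * (fcolnum, lcolnum) interval bounds of its local column blocks; the other
 * ranks receive both into ytmp / indices and scatter each interval into
 * yglobal at its global offset.  Issuing the two MPI_Ibcast calls per turn
 * before the MPI_Wait pair lets the value and index transfers overlap.
 */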
#if defined( PASTIX_WITH_MPI )
    /* ... */
    MPI_Allreduce( MPI_IN_PLACE,
                   y, pastix_data->csc->gNexp,
                   PASTIX_MPI_FLOAT, MPI_SUM,
                   /* ... */ );
typedef int pastix_int_t
static void pthread_bvec_saxpy(isched_thread_t *ctx, void *args)
Compute y <- alpha * x + y (Parallel version).
static void pthread_bvec_sgemv(isched_thread_t *ctx, void *args)
Compute y <- alpha * A * x + beta * y (Parallel version).
static void pthread_bvec_sdot(isched_thread_t *ctx, void *args)
Compute the scalar product x.y. (Parallel version)
static void pthread_bvec_scopy(isched_thread_t *ctx, void *args)
Copy a vector y = x (parallel version)
void bvec_sgemv_seq(pastix_data_t *pastix_data, pastix_int_t m, pastix_int_t n, float alpha, const float *A, pastix_int_t lda, const float *x, float beta, float *y)
Compute y <- alpha * A * x + beta * y (Sequential version).
const float * bvec_sgather_remote(const pastix_data_t *pastix_data, const float *y)
Gather a distributed right hand side (bvec storage) on all nodes.
float bvec_sdot_smp(pastix_data_t *pastix_data, pastix_int_t n, const float *x, const float *y)
Compute a regular scalar product x.y (Parallel version)
void bvec_scopy_smp(pastix_data_t *pastix_data, pastix_int_t n, const float *x, float *y)
Copy a vector y = x (parallel version)
static void pthread_bvec_sscal(isched_thread_t *ctx, void *args)
Scale a vector (Parallel version)
void bvec_sscal_smp(pastix_data_t *pastix_data, pastix_int_t n, float alpha, float *x)
Scale a vector (Parallel version)
void bvec_sscal_seq(pastix_data_t *pastix_data, pastix_int_t n, float alpha, float *x)
Scale a vector by the scalar alpha. (Sequential version)
void bvec_saxpy_seq(pastix_data_t *pastix_data, pastix_int_t n, float alpha, const float *x, float *y)
Compute y <- alpha * x + y. (Sequential version)
void bcsc_sspsv(pastix_data_t *pastix_data, float *b, float *work)
Solve A x = b with A the sparse matrix.
void bvec_scopy_seq(pastix_data_t *pastix_data, pastix_int_t n, const float *x, float *y)
Copy a vector y = x (Sequential version)
void bvec_sgemv_smp(pastix_data_t *pastix_data, pastix_int_t m, pastix_int_t n, float alpha, const float *A, pastix_int_t lda, const float *x, float beta, float *y)
Compute y <- alpha * A * x + beta * y (Parallel version).
float bvec_snrm2_smp(pastix_data_t *pastix_data, pastix_int_t n, const float *x)
Compute the norm 2 of a vector. (Parallel version)
void bvec_sallreduce(const pastix_data_t *pastix_data, float *y)
Apply an all reduce of the vector on all nodes.
float bvec_snrm2_seq(pastix_data_t *pastix_data, pastix_int_t n, const float *x)
Compute the norm 2 of a vector. (Sequential version)
float bvec_sdot_seq(pastix_data_t *pastix_data, pastix_int_t n, const float *x, const float *y)
Compute the scalar product x.y. (Sequential version)
static void pthread_bvec_snrm2(isched_thread_t *ctx, void *args)
Compute the norm 2 of a vector. (Parallel version)
void bvec_snullify_remote(const pastix_data_t *pastix_data, float *y)
Set to 0 remote coefficients.
void bvec_saxpy_smp(pastix_data_t *pastix_data, pastix_int_t n, float alpha, const float *x, float *y)
Perform y = alpha * x + y (Parallel version)
Compressed colptr format for the bcsc.
int pastix_subtask_solve(pastix_data_t *pastix_data, pastix_rhs_t b)
Solve the given problem without applying the permutation.
pastix_coeftype_t flttype
PASTIX_Comm inter_node_comm
Main PaStiX data structure.
Main PaStiX RHS structure.
static pastix_int_t cblk_colnbr(const SolverCblk *cblk)
Compute the number of columns in a column block.
SolverCblk *restrict cblktab
Solver column block structure.