22 #include "kernels_trace.h"
/*
 * NOTE(review): sparse excerpt. The routine's name, return type, remaining
 * parameters and most of its body are not visible here, so only the lines
 * shown are described.
 * Visible pointer operands: A and D are const-qualified, B is written through.
 */
76 const pastix_complex32_t *A,
78 const pastix_complex32_t *D,
80 pastix_complex32_t *B,
/* Factor multiplied into the entries of A below; where it is assigned is not
 * visible -- presumably loaded from D, TODO confirm. */
83 pastix_complex32_t alpha;
/* lda and ldb are each compared against max(1, M); the handling of a failed
 * check is not visible. */
99 if ( lda < pastix_imax(1,M) )
107 if ( ldb < pastix_imax(1,M) ) {
/* Section compiled only when PRECISION_z or PRECISION_c is defined. */
112 #if defined(PRECISION_z) || defined(PRECISION_c)
/* Outer loop: j runs over [0, N) and D is advanced by ldd on every iteration. */
114 for( j=0; j<N; j++, D += ldd ) {
/* Inner loop: i runs over [0, M); A and B advance together by one element. */
116 for( i=0; i<M; i++, B++, A++ ) {
/* B receives the complex conjugate of the current A entry times alpha. */
117 *B = conjf(*A) * alpha;
/* Second loop nest with the same iteration pattern; its body is not visible --
 * presumably the non-conjugated variant, TODO confirm. */
126 for( j=0; j<N; j++, D += ldd ) {
128 for( i=0; i<M; i++, B++, A++ ) {
/*
 * NOTE(review): sparse excerpt of a routine whose header is not visible.
 * The lines shown open a trace for PastixKernelSCALOCblk, pick one of three
 * branches on cblk->cblktype (CBLK_COMPRESSED, CBLK_LAYOUT_2D, or neither),
 * and close the trace at the end.
 */
178 pastix_fixdbl_t time;
179 pastix_complex32_t *LD;
/* The value returned here is kept in `time` and handed back to
 * kernel_trace_stop() at the end of the excerpt. */
181 time = kernel_trace_start( PastixKernelSCALOCblk );
191 const pastix_complex32_t *L;
192 const pastix_complex32_t *D;
193 pastix_int_t ldl, ldd, ldld;
/* Branch 1: the column block carries the CBLK_COMPRESSED flag. */
195 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
/* blok advances up to lblk; lrL (source) and lrLD (destination) advance with it. */
202 for(; blok < lblk; blok++, lrL++, lrLD++) {
/* The destination entry is expected to have rk == -1 on entry. */
205 assert( lrLD->
rk == -1 );
/* The destination takes the source's rkmax. */
209 lrLD->
rkmax = lrL->rkmax;
/* Source rk == -1 (presumably the "stored as a full block" marker -- TODO
 * confirm): M must equal rkmax and rkmax * N elements of u are copied. */
211 if ( lrL->rk == -1 ) {
212 assert( M == lrL->rkmax );
215 memcpy( lrLD->
u, lrL->u, lrL->rkmax * N *
sizeof(pastix_complex32_t) );
/* Remaining case (the else line itself is not visible): M * rk elements are
 * copied into u, v is pointed just past them inside the same buffer, and
 * N * rkmax elements are copied into v. The destination u buffer therefore
 * needs room for M * rk + N * rkmax elements; its allocation is not visible. */
225 memcpy( lrLD->
u, lrL->u, M * lrL->rk *
sizeof(pastix_complex32_t) );
226 lrLD->
v = ((pastix_complex32_t *)lrLD->
u) + M * lrL->rk;
227 memcpy( lrLD->
v, lrL->v, N * lrL->rkmax *
sizeof(pastix_complex32_t) );
/* Branch 2: the column block carries the CBLK_LAYOUT_2D flag. */
243 else if ( cblk->
cblktype & CBLK_LAYOUT_2D ) {
/* L and D both start at dataL; LD starts at dataLD. */
244 L = D = (pastix_complex32_t *)dataL;
245 LD = (pastix_complex32_t *)dataLD;
/* Per-block loop; its body is not visible. */
248 for(; blok < lblk; blok++) {
/* Same pointer setup repeated -- presumably for the remaining (else) branch,
 * TODO confirm. */
258 L = D = (pastix_complex32_t *)dataL;
259 LD = (pastix_complex32_t *)dataLD;
/* Close the trace opened above, passing M, N, 0 and M*N converted to
 * pastix_fixdbl_t together with the saved `time`. */
272 kernel_trace_stop( cblk->
fblokptr->
inlast, PastixKernelSCALOCblk, M, N, 0, (pastix_fixdbl_t)(M*N), time );
/*
 * NOTE(review): sparse excerpt of a routine whose header is not visible.
 * The lines shown require a CBLK_LAYOUT_2D column block, start at block
 * fblok + blok_m, and process consecutive blocks for as long as their
 * fcblknm equals cblk_m, with one path for compressed storage and one for
 * dense storage.
 */
323 pastix_int_t M, N, ldd, offset, cblk_m;
324 const pastix_complex32_t *lA;
326 pastix_complex32_t *D, *B, *A;
327 pastix_complex32_t *lB;
/* This routine expects the 2D layout flag to be set. */
335 assert( cblk->
cblktype & CBLK_LAYOUT_2D );
/* First block handled: offset blok_m from fblok. */
337 blok = fblok + blok_m;
/* Compressed path. */
341 if ( cblk->
cblktype & CBLK_COMPRESSED ) {
/* Stop at lblok or at the first block whose fcblknm differs from cblk_m;
 * lrA (source) and lrB (destination) advance with blok. */
346 for (; (blok < lblok) && (blok->
fcblknm == cblk_m); blok++, lrA++, lrB++) {
/* The destination takes the source's rkmax. */
351 lrB->
rkmax = lrA->rkmax;
/* The test reads the destination's rk -- presumably set from lrA->rk on a
 * line not shown, TODO confirm. In this case M must equal the source rkmax,
 * the source v must be NULL, and rkmax * N elements of u are copied. */
353 if ( lrB->
rk == -1 ) {
354 assert( M == lrA->rkmax );
355 assert( NULL == lrA->v );
358 memcpy( lrB->
u, lrA->u, lrA->rkmax * N *
sizeof(pastix_complex32_t) );
/* Remaining case (the else line itself is not visible): M * rk elements are
 * copied into u, v is pointed just past them inside the same buffer, and
 * N * rkmax elements are copied into v. */
368 memcpy( lrB->
u, lrA->u, M * lrA->rk *
sizeof(pastix_complex32_t) );
369 lrB->
v = ((pastix_complex32_t *)lrB->
u) + M * lrA->rk;
370 memcpy( lrB->
v, lrA->v, N * lrA->rkmax *
sizeof(pastix_complex32_t) );
/* Tail of a call whose callee is not visible; M is passed after both lA and lB. */
379 lA, M, D, ldd, lB, M );
/* Dense path: typed views of the three raw buffers. */
383 A = (pastix_complex32_t *)dataA;
384 D = (pastix_complex32_t *)dataD;
385 B = (pastix_complex32_t *)dataB;
/* Same stopping rule as the compressed loop. */
387 for (; (blok < lblok) && (blok->
fcblknm == cblk_m); blok++) {
/* Per-block source and destination pointers: coefind shifted back by offset. */
388 lA = A + blok->
coefind - offset;
389 lB = B + blok->
coefind - offset;
/* Tail of a call whose callee is not visible; same argument order as above. */
394 lA, M, D, ldd, lB, M );