Skip to content

Commit 6000bd4

Browse files
committed
WIP
1 parent c6d1e2a commit 6000bd4

2 files changed

Lines changed: 7 additions & 7 deletions

File tree

jsrc/ar.c

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -228,7 +228,7 @@ REDUCCPFX(tymesinsO, D, I, TYMESO)
228228
acc1=prim(acc1,acc5); acc2=prim(acc2,acc6); acc3=prim(acc3,acc7); acc0=prim(acc0,acc4); \
229229
acc2=prim(acc2,acc3); acc0=prim(acc0,acc1); acc0=prim(acc0,acc2); /* combine accumulators vertically */ \
230230
acc0=prim(acc0,_mm256_permute4x64_pd(acc0,0b11111110)); acc0=prim(acc0,_mm256_permute_pd(acc0,0xf)); /* combine accumulators horizontally 01+=23, 0+=1 */ \
231-
*(I*)z=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); ++z; /* store the single result from 0 */ \
231+
*z=_mm256_cvtsd_f64(acc0); ++z; /* store the single result from 0 */ \
232232
)
233233

234234
// f/ on rank>1, going down columns to save bandwidth
@@ -429,8 +429,8 @@ DF1(jtcompsum){
429429
c0=_mm256_add_pd(c0,_mm256_permute_pd(c0,0xf)); acc1=_mm256_permute_pd(acc0,0xf); // combine c0+c1, acc1<-1
430430
TWOSUM(acc0,acc1,acc0,c1); c0=_mm256_add_pd(c0,c1); // combine 0123, combine all low parts
431431
acc0=_mm256_add_pd(acc0,c0); // add low parts back into high in case there is overlap
432-
*(I*)zv=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); ++zv; // store the single result
433-
// _mm_storel_pd(zv++,_mm256_castpd256_pd128(acc0));
432+
*zv=_mm256_cvtsd_f64(acc0); ++zv; // store the single result
433+
// obsolete _mm_storel_pd(zv++,_mm256_castpd256_pd128(acc0));
434434
}
435435
}else{
436436
// rank>1, going down columns to save bandwidth and add accuracy

jsrc/va2.c

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -794,10 +794,10 @@ static A jtva2(J jt,AD * RESTRICT a,AD * RESTRICT w,AD * RESTRICT self,UI allran
794794
ti * RESTRICT wv1=wv+dplen; wv1=j==1?wv:wv1; \
795795
oneprod2 \
796796
if(j>1){--j; _mm_storeu_pd(zv,_mm256_castpd256_pd128 (acc000)); _mm_storeu_pd(zv+ndpi,_mm256_castpd256_pd128 (acc100)); wv+=dplen; zv +=2;} \
797-
else{*(I*)zv=_mm256_extract_epi64(_mm256_castpd_si256(acc000),0x0); *(I*)(zv+ndpi)=_mm256_extract_epi64(_mm256_castpd_si256(acc100),0x0); zv+=1;} \
797+
else{*zv=_mm256_cvtsd_f64(acc000); *(zv+ndpi)=_mm256_cvtsd_f64(acc100); zv+=1;} \
798798
}else{ \
799799
oneprod1 \
800-
*(I*)zv=_mm256_extract_epi64(_mm256_castpd_si256(acc000),0x0); \
800+
*zv=_mm256_cvtsd_f64(acc000); \
801801
zv+=1; \
802802
} \
803803
if(!--j)break; \
@@ -932,7 +932,7 @@ static A jtva2(J jt,AD * RESTRICT a,AD * RESTRICT w,AD * RESTRICT self,UI allran
932932
acc3=MUL_ACC(acc3,_mm256_maskload_pd(av,endmask),_mm256_maskload_pd(wv,endmask)); av+=((dplen-1)&(NPAR-1))+1; wv+=((dplen-1)&(NPAR-1))+1; \
933933
acc0=_mm256_add_pd(acc0,acc1); acc2=_mm256_add_pd(acc2,acc3); acc0=_mm256_add_pd(acc0,acc2); /* combine accumulators vertically */ \
934934
acc0=_mm256_add_pd(acc0,_mm256_permute4x64_pd(acc0,0b11111110)); acc0=_mm256_add_pd(acc0,_mm256_permute_pd(acc0,0xf)); /* combine accumulators horizontally 01+=23, 0+=1 */ \
935-
*(I*)zv=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); ++zv;
935+
*zv=_mm256_cvtsd_f64(acc0); ++zv;
936936
#else
937937
#define ONEPRODD D total0=0.0; D total1=0.0; if(dplen&1)total1=(D)*av++*(D)*wv++; DQ(dplen>>1, total0+=(D)*av++*(D)*wv++; total1+=(D)*av++*(D)*wv++;); *zv++=total0+total1;
938938
#endif
@@ -1100,7 +1100,7 @@ DF2(jtsumattymes1){
11001100
acc0=_mm256_add_pd(acc0,_mm256_permute_pd(acc0,0xf));
11011101
acc0=_mm256_add_pd(acc0,c0); // add low parts back into high in case there is overlap
11021102
#endif
1103-
*(I*)zv=_mm256_extract_epi64(_mm256_castpd_si256(acc0),0x0); ++zv;
1103+
*zv=_mm256_cvtsd_f64(acc0); ++zv;
11041104
if(!--j)break; av=av0; // repeat a if needed
11051105
}
11061106
}

0 commit comments

Comments
 (0)