@@ -794,10 +794,10 @@ static A jtva2(J jt,AD * RESTRICT a,AD * RESTRICT w,AD * RESTRICT self,UI allran
794794 ti * RESTRICT wv1 = wv + dplen ; wv1 = j == 1 ?wv :wv1 ; \
795795 oneprod2 \
796796 if (j > 1 ){-- j ; _mm_storeu_pd (zv ,_mm256_castpd256_pd128 (acc000 )); _mm_storeu_pd (zv + ndpi ,_mm256_castpd256_pd128 (acc100 )); wv += dplen ; zv += 2 ;} \
797- else {* ( I * ) zv = _mm256_extract_epi64 ( _mm256_castpd_si256 ( acc000 ), 0x0 ) ; * (I * )( zv + ndpi )= _mm256_extract_epi64 ( _mm256_castpd_si256 ( acc100 ), 0x0 ); zv += 1 ;} \
797+ else {* zv = _mm256_cvtsd_f64 ( acc000 ); * (zv + ndpi )= _mm256_cvtsd_f64 ( acc100 ); zv += 1 ;} \
798798 }else { \
799799 oneprod1 \
800- * ( I * ) zv = _mm256_extract_epi64 ( _mm256_castpd_si256 ( acc000 ), 0x0 ); \
800+ * zv = _mm256_cvtsd_f64 ( acc000 ); \
801801 zv += 1 ; \
802802 } \
803803 if (!-- j )break ; \
@@ -932,7 +932,7 @@ static A jtva2(J jt,AD * RESTRICT a,AD * RESTRICT w,AD * RESTRICT self,UI allran
932932 acc3 = MUL_ACC (acc3 ,_mm256_maskload_pd (av ,endmask ),_mm256_maskload_pd (wv ,endmask )); av += ((dplen - 1 )& (NPAR - 1 ))+ 1 ; wv += ((dplen - 1 )& (NPAR - 1 ))+ 1 ; \
933933 acc0 = _mm256_add_pd (acc0 ,acc1 ); acc2 = _mm256_add_pd (acc2 ,acc3 ); acc0 = _mm256_add_pd (acc0 ,acc2 ); /* combine accumulators vertically */ \
934934 acc0 = _mm256_add_pd (acc0 ,_mm256_permute4x64_pd (acc0 ,0b11111110 )); acc0 = _mm256_add_pd (acc0 ,_mm256_permute_pd (acc0 ,0xf )); /* combine accumulators horizontally 01+=23, 0+=1 */ \
935- * ( I * ) zv = _mm256_extract_epi64 ( _mm256_castpd_si256 ( acc0 ), 0x0 ); ++ zv ;
935+ * zv = _mm256_cvtsd_f64 ( acc0 ); ++ zv ;
936936#else
// ONEPRODD (#else branch): accumulates products of corresponding av and wv elements, each operand cast to D before the multiply.
// Two partial sums are kept: when dplen is odd, one product is placed in total1 first; the DQ body then adds one product to total0 and one to total1.
// NOTE(review): DQ is defined outside this view - presumably it repeats its body dplen>>1 times, making this a dplen-element dot product; confirm.
// The result total0+total1 is stored through zv, which is then advanced by one; av and wv are advanced past every element read.
937937#define ONEPRODD D total0=0.0; D total1=0.0; if(dplen&1)total1=(D)*av++*(D)*wv++; DQ(dplen>>1, total0+=(D)*av++*(D)*wv++; total1+=(D)*av++*(D)*wv++;); *zv++=total0+total1;
938938#endif
@@ -1100,7 +1100,7 @@ DF2(jtsumattymes1){
11001100 acc0 = _mm256_add_pd (acc0 ,_mm256_permute_pd (acc0 ,0xf ));
11011101 acc0 = _mm256_add_pd (acc0 ,c0 ); // add low parts back into high in case there is overlap
11021102#endif
1103- * ( I * ) zv = _mm256_extract_epi64 ( _mm256_castpd_si256 ( acc0 ), 0x0 ); ++ zv ;
1103+ * zv = _mm256_cvtsd_f64 ( acc0 ); ++ zv ;
11041104 if (!-- j )break ; av = av0 ; // repeat a if needed
11051105 }
11061106 }
0 commit comments