Skip to content

Commit c6d1e2a

Browse files
committed
128!:11 for ek/bk update
1 parent db9c300 commit c6d1e2a

11 files changed

Lines changed: 200 additions & 49 deletions

File tree

jsrc/cv.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,10 @@ F2(jtfit){F2PREFIP;A f;C c;I k,l,m,r;V*sv;
7474
case CNOT: case CXCO: case CSPARSE: case CEBAR:
7575
R fitct(a,w,cno);
7676
case CQQ: ;
77-
RE(wval=i0(w)); ASSERT(wval==0,EVDOMAIN); // only f"r!.0 is supported
78-
ASSERT(sv->valencefns[1]==jtsumattymes1,EVDOMAIN) // Must be +/@:*"1!:0
79-
R CDERIV(CFIT,0,jtsumattymes1,VIRS2, m,l,r); // supports IRS
77+
RE(wval=i0(w)); ASSERT(BETWEENC(wval,0,1),EVDOMAIN); // only f"r!.[01] is supported
78+
ASSERT(sv->valencefns[1]==jtsumattymes1,EVDOMAIN) // Must be +/@:*"1!:[01]
79+
RZ(f=CDERIV(CFIT,0,jtsumattymes1,VIRS2, m,l,r)); // supports IRS
80+
FAV(f)->localuse.lu1.fittype=wval; R f;
8081
case CSLASH: ;
8182
RE(wval=i0(w)); ASSERT(wval==0,EVDOMAIN); // only f/!.0 is supported
8283
ASSERT(FAV(sv->fgh[0])->id==CPLUS,EVDOMAIN) // Must be +/!:0

jsrc/j.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,7 +1233,7 @@ if(likely(!((I)jtinplace&JTWILLBEOPENED)))z=EPILOGNORET(z); RETF(z); \
12331233
#define GAE0(v,t,n,r,erraction) {HISTOCALL if(unlikely(!(v=jtga0(jt,(I)(t),(I)(r),(I)(n)))))erraction; AN(v)=(n);} // used when shape=0 and rank is never 1 or will always be filled in by user even if rank 1
12341234
#endif
12351235
#define GAE(v,t,n,r,s,erraction) {GAE0(v,t,n,r,erraction) MCISH(AS(v),(I*)(s),(r))} // error action
1236-
#define GA00(v,t,n,r) {GAE0(v,t,n,r,R 0)} // used when rank will always be filled in by user. Default error action is to exit
1236+
#define GA00(v,t,n,r) {GAE0(v,t,n,r,R 0)} // used when shape will always be filled in by user. Default error action is to exit
12371237
#define GA(v,t,n,r,s) {GA00(v,t,n,r) MCISH(AS(v),(I*)(s),(r))} // s points to shape
12381238
#define GA0(v,t,n,r) {GA00(v,t,n,r) *((r)==1?AS(v):jt->shapesink)=(n);} // used when shape=0 but rank may be 1 and must fill in with AN if so - never for sparse blocks
12391239
#define GA10(v,t,n) {GA00(v,t,n,1) AS(v)[0]=(n);} // used when rank is known to be 1
@@ -1904,10 +1904,12 @@ if(likely(type _i<3)){z=(I)&oneone; z=type _i>1?(I)_zzt:z; _zzt=type _i<1?(I*)z:
19041904
#if (C_AVX2&&SY_64) || EMU_AVX2
19051905
// create double-precision product of inputs
19061906
#define TWOPROD(in0,in1,outhi,outlo) outhi=_mm256_mul_pd(in0,in1); outlo=_mm256_fmsub_pd(in0,in1,outhi);
1907-
// create double-precision sum of inputs, where it is not known which is larger NOTE in0 and outhi might be identical. Needs t and signbit.
1907+
// create double-precision sum of inputs, where it is not known which is larger NOTE in0 and outhi might be identical. Needs sgnbit.
19081908
#define TWOSUM(in0,in1,outhi,outlo) {__m256d t=_mm256_andnot_pd(sgnbit,in0); outlo=_mm256_andnot_pd(sgnbit,in1); t=_mm256_sub_pd(t,outlo); \
1909-
outlo=_mm256_blendv_pd(in0,in1,t); t=_mm256_blendv_pd(in1,in0,t); \
1910-
outhi=_mm256_add_pd(in0,in1); outlo=_mm256_sub_pd(outlo,outhi); outlo=_mm256_add_pd(outlo,t);} // 1 if in1 larger; select outlo=max t=min
1909+
outlo=_mm256_blendv_pd(in0,in1,t); t=_mm256_blendv_pd(in1,in0,t); /* outlo=val with larger abs t=val with smaller abs */ \
1910+
outhi=_mm256_add_pd(in0,in1); /* single-prec sum */ \
1911+
outlo=_mm256_sub_pd(outlo,outhi); /* big-(big+small): implied val of -small after rounding */ \
1912+
outlo=_mm256_add_pd(outlo,t);} // amt by which actual value exceeds implied: this is the lost low precision
19111913
#define DPADD(hi0,lo0,hi1,lo1,outhi,outlo) outhi=_mm256_add_pd(hi0,hi1); outlo=_mm256_add_pd(lo0,lo1);
19121914
#else
19131915
#define TWOSPLIT(a,x,y) y=(a)*134217730.0; x=y-(a); x=y-x; y=(a)-x; // must avoid compiler tuning

jsrc/ja.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,10 @@
428428
#define fplus(x,y) jtfplus(jt,(x),(y))
429429
#define fpoly(x,y) jtfpoly(jt,(x),(y))
430430
#define fpolyc(x) jtfpolyc(jt,(x))
431-
#define fr(x) {if(likely((x)!=0)){I Zs = AC(x); if(likely(!ACISPERM(Zs))){if(likely(--Zs<=0))mf(x);else AC(x)=Zs;}}} // use fr for known nonrecursives, and for locales
431+
#define gmpmfree(x) {I allocsize = AN(x)+AKXR(0); jt->bytes-=allocsize; jt->malloctotal-=allocsize; jt->mfreegenallo-=allocsize; free(x);}
432+
#define frcommon(x,f) {if(likely((x)!=0)){I Zs = AC(x); if(likely(!ACISPERM(Zs))){if(likely(--Zs<=0)){f(x);}else AC(x)=Zs;}}} // use fr for known nonrecursives, and for locales
433+
#define fr(x) frcommon(x,mf)
434+
#define frgmp(x) frcommon(x,gmpmfree) // to free GMP blocks
432435
#define fram(x0,x1,x2,x3,x4) jtfram(jt,(x0),(x1),(x2),(x3),(x4))
433436
#define from(x,y) jtfrom(jt,(x),(y))
434437
#define frombs(x,y) jtfrombs(jt,(x),(y))

jsrc/je.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,7 @@ extern F2(jtdomainerr2);
448448
extern F2(jtdot);
449449
extern F2(jtdrop);
450450
extern F2(jtebar);
451+
extern F2(jtekupdate);
451452
extern F2(jteps);
452453
extern F2(jtetoiso8601);
453454
extern F2(jtiso8601toe);

jsrc/jtype.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,6 +1006,7 @@ typedef struct {
10061006
A cachedref; // for namerefs ('name'~), the cached value, or 0 if not cached
10071007
AF fork2hfn; // for dyad fork that is NOT a comparison combination or jtintersect, the function to call to process h (might be in h@][)
10081008
I forcetask; // for t., the flags extracted from n. Bits 0-7=thread pool; bit 8=worker thread only
1009+
I fittype; // for u!.t where t is a code, its value is stored here in the CFIT block
10091010
} lu1; // this is the high-use stuff in the second cacheline
10101011
};
10111012
} localuse; // always 16 bytes, 4 I4s

jsrc/m.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -976,7 +976,8 @@ void jtfamftrav(J jt,AD* RESTRICT wd,I t){I n=AN(wd);
976976
}else if(t&SYMB){wd=jtfreesymtab(jt,wd,AR(wd)); // SYMB is used as a flag; we test here AFTER NAME and ADV which are lower bits
977977
} else if(t&(RAT|XNUM|XD)) {A* RESTRICT v=AAV(wd);
978978
// single-level indirect forms. handle each block
979-
DQ(t&RAT?2*n:n, if(*v)fr(*v); ++v;);
979+
DQ(t&RAT?2*n:n, if(*v)if(AT(*v)&LIT){frgmp(*v);}else fr(*v); ++v;);
980+
// obsolete DQ(t&RAT?2*n:n, if(*v)fr(*v); ++v;);
980981
}else if(ISSPARSE(t)){P* RESTRICT v=PAV(wd);
981982
fana(SPA(v,a)); fana(SPA(v,e)); fana(SPA(v,i)); fana(SPA(v,x));
982983
// for sparse, decrement the usecount

jsrc/va2.c

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -974,7 +974,7 @@ I jtsumattymesprods(J jt,I it,void *avp, void *wvp,I dplen,I nfro,I nfri,I ndpo,
974974

975975

976976

977-
// +/@:*"1 with IRS, also +/@:*"1!.0 on float args
977+
// +/@:*"1 with IRS, also +/@:*"1!.0 on float args and +/@:*"1!.1 producing a float extended-precision result, a length-2 list per product
978978
DF2(jtsumattymes1){
979979
ARGCHK2(a,w);
980980
I ar=AR(a); I wr=AR(w); I acr=jt->ranks>>RANKTX; I wcr=jt->ranks&RMAX;
@@ -985,9 +985,12 @@ DF2(jtsumattymes1){
985985
// Now that we have used the rank info, clear jt->ranks. All verbs start with jt->ranks=RMAXX unless they have "n applied
986986
// we do this before we generate failures
987987
RESETRANK; // This is required if we go to slower code
988+
989+
I fit=0; if(unlikely(FAV(self)->id==CFIT))fit=1+FAV(self)->localuse.lu1.fittype; // fit 0=normal, 1=!.0, 2=!.1
990+
988991
// if an argument is empty, sparse, has cell-rank 0, or not a fast arithmetic type, revert to the code for f/@:g atomic
989992
if(((-((AT(a)|AT(w))&((NOUN|SPARSE)&~(B01|INT|FL))))|(AN(a)-1)|(AN(w)-1)|(acr-1)|(wcr-1))<0) { // test for all unusual cases
990-
if(FAV(self)->id==CFIT)self=FAV(self)->fgh[0]; // lose the !.0 if we revert
993+
if(fit!=0)self=FAV(self)->fgh[0]; // lose the !.[01] if we revert
991994
R rank2ex(a,w,FAV(self)->fgh[0],MIN(acr,1),MIN(wcr,1),acr,wcr,jtfslashatg);
992995
}
993996
// We can handle it here, and both ranks are at least 1.
@@ -996,11 +999,12 @@ DF2(jtsumattymes1){
996999
// This promotes the outer loops to inner loops
9971000
{I rankgiven = (acr|wcr)-1; acr=rankgiven?acr:ar; wcr=rankgiven?wcr:wr;}
9981001

1002+
9991003
// Exchange if needed to make the cell-rank of a no greater than that of w. That way we know that w will never repeat in the inner loop
10001004
if(acr>wcr){A t=w; I tr=wr; I tcr=wcr; w=a; wr=ar; wcr=acr; a=t; ar=tr; acr=tcr;}
10011005

10021006
// Convert arguments as required
1003-
I it=MAX(AT(a),AT(w)); it=FAV(self)->id==CFIT?FL:it; // if input types are dissimilar, convert to the larger. For +/@:*"1!.0, convert everything to float
1007+
I it=MAX(AT(a),AT(w)); it=fit!=0?FL:it; // if input types are dissimilar, convert to the larger. For +/@:*"1!.[01], convert everything to float
10041008
if(unlikely(it!=(AT(w)|AT(a)))){
10051009
if(TYPESNE(it,AT(a))){RZ(a=cvt(it,a));} // convert to common input type
10061010
if(TYPESNE(it,AT(w))){RZ(w=cvt(it,w));}
@@ -1017,9 +1021,9 @@ DF2(jtsumattymes1){
10171021
A z;
10181022
// if there is frame, create the outer loop values
10191023
I nfro,nfri; // outer loop counts, and which arg is repeated
1020-
if(likely(((ar-acr)|(wr-wcr))==0)){ // normal case
1024+
if(likely(((ar-acr)|(wr-wcr))==0)){ // normal case of no frame
10211025
nfro=nfri=1; // no outer loops, repeata immaterial
1022-
GA(z,FL>>(it&B01),ndpo*ndpi,wcr-1,AS(w)); // type is INT if inputs booleans, otherwise FL
1026+
GA(z,FL>>(it&B01),(ndpo*ndpi)<<(fit>>1),wcr-1+(fit>>1),AS(w)); // type is INT if inputs booleans, otherwise FL
10231027
}else{
10241028
// There is frame, analyze and check it
10251029
I af=ar-acr; I wf=wr-wcr; I commonf=wf; I *as=AS(a), *ws=AS(w); I *longs=as;
@@ -1029,14 +1033,15 @@ DF2(jtsumattymes1){
10291033
ASSERTAGREE(as,ws,commonf) // verify common frame
10301034
PROD(nfri,af,longs+commonf); PROD(nfro,commonf,longs); // number of outer loops, number of repeats
10311035
I zn = ndpo*ndpi*nfro; DPMULDE(zn,nfri,zn); // no error possible till we extend the shape
1032-
GA00(z,FL>>(it&B01),zn,af+commonf+wcr-1); I *zs=AS(z); // type is INT if inputs booleans, otherwise FL
1036+
GA00(z,FL>>(it&B01),zn<<(fit>>1),af+commonf+wcr-1+(fit>>1)); I *zs=AS(z); // type is INT if inputs booleans, otherwise FL
10331037
// install the shape
10341038
MCISH(zs,longs,af+commonf); MCISH(zs+af+commonf,ws+wr-wcr,wcr-1);
10351039
}
1040+
if(unlikely(fit==2))AS(w)[AR(w)-1]=2; // if +/@:*"1!.1, we store two atoms per sum
10361041

1037-
if(likely(FAV(self)->id!=CFIT)){RZ(jtsumattymesprods(jt,it,voidAV(a),voidAV(w),dplen,nfro,nfri,ndpo,ndpi,voidAV(z))); // eval standard dot-product, check for error
1042+
if(likely(fit==0)){RZ(jtsumattymesprods(jt,it,voidAV(a),voidAV(w),dplen,nfro,nfri,ndpo,ndpi,voidAV(z))); // eval standard dot-product, check for error
10381043
}else{
1039-
// here for +/@:*"1!.0, double-precision dot product https://www-pequan.lip6.fr/~graillat/papers/IC2012.pdf
1044+
// here for +/@:*"1!.[01], double-precision dot product https://www-pequan.lip6.fr/~graillat/papers/IC2012.pdf
10401045
NAN0;
10411046
#if (C_AVX2&&SY_64) || EMU_AVX2
10421047
#if 1 // higher precision. Required when a large product is added to a small total. Dependency loop for acc is 4 clocks; for c is 4 clocks. Total 12 insts, so unrolled 2 would do

0 commit comments

Comments
 (0)