Skip to content

Commit ab4ee4e

Browse files
committed
Properly account for OS-block space in the allocating thread
1 parent a0cbe3a commit ab4ee4e

3 files changed

Lines changed: 48 additions & 41 deletions

File tree

jsrc/ct.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,7 @@ ASSERT(0,EVNONCE)
454454
}else if(m==2){
455455
// return list of idle threads
456456
ASSERT(AR(w)==1,EVRANK) ASSERT(AN(w)==0,EVLENGTH) // only '' is allowed as an argument for now
457-
GAT0(z,INT,MAXTASKS,1) I *zv=IAV1(z); // Don't allocate under lock, and list may change: so allocate max possible
457+
GA0(z,INT,MAXTASKS,1) I *zv=IAV1(z); // Don't allocate under lock, and list may change: so allocate max possible. Don't use GAT in case MAXTASKS is too big for it
458458
I threadct=0; J mjt=MTHREAD(JJTOJ(jt)); J currjt=mjt; // # threads, master thread, current thread
459459
WRITELOCK(mjt->tasklock); while(currjt->taskidleq){zv[threadct++]=currjt->taskidleq; currjt=JTFORTHREAD(jt,currjt->taskidleq);} WRITEUNLOCK(mjt->tasklock); // copy idle threads to result. The master can never be idle
460460
AN(z)=AS(z)[0]=threadct; // install # idles found

jsrc/jt.h

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ typedef struct rngdata {
8888
} us; // access as US
8989
} uflags; // 4 bytes
9090
// TASKCOMMREGION starts here, holding parms passed at startup
91-
I bytes; // bytes currently in use - used only during 7!:1 clear for task
91+
I bytesmax; // high-water mark of "bytes" - used only during 7!:1 clear for task
9292
S etxn; // strlen(etx) but set negative to freeze changes to the error line clear for task
9393
S etxn1; // last non-zero etxn clear for task
9494
B foldrunning; // 1 if fold is running (allows Z:) clear for task
@@ -101,8 +101,6 @@ typedef struct rngdata {
101101
A xmod; // extended integer: the m in m&|@f clear for task
102102
// end of cacheline 0
103103
// end of TASKCOMMREGION
104-
// end of cacheline 0
105-
I bytesmax; // high-water mark of "bytes" - used only during 7!:1 clear for task
106104
I4 parsercalls; // # times parser was called clear for task
107105
// ************************************** here starts the part that is initialized to non0 values when the task is started. Earlier values may also be initialized
108106
UI4 ranks; // low half: rank of w high half: rank of a for IRS init for task to 3F3F should be 2 bytes?
@@ -139,8 +137,8 @@ typedef struct rngdata {
139137

140138
I shapesink[SY_64?2:4]; // garbage area used as load/store targets of operations we don't want to branch around. While waiting for work, this holds the address of the WAITBLOK we are waiting on
141139
// things needed for allocation of large blocks
142-
I mfreegenallo; // Amount allocated through malloc, biased
143-
I malloctotal; // net total of malloc/free performed in m.c only
140+
A* tstacknext; // if not 0, points to the recently-used tstack buffer, whose chain field points to tstacknext
141+
A* tstackcurr; // current allocation, holding NTSTACK bytes+1 block for alignment. First entry points to next-lower allocation
144142
PFRAME parserstackframe; // 4 words
145143
// end of cacheline 4
146144

@@ -155,18 +153,12 @@ typedef struct rngdata {
155153
I malloctotalhwmk; // highest value since most recent 7!:1
156154
// end of cacheline 5
157155

158-
A* tstacknext; // if not 0, points to the recently-used tstack buffer, whose chain field points to tstacknext
159-
A* tstackcurr; // current allocation, holding NTSTACK bytes+1 block for alignment. First entry points to next-lower allocation
156+
// seldom used, but contended during system lock
160157
C *etx; // [1+NETX]; // display text for last error (+1 for trailing 0)
161158
void *dtoa; /* use internally by dtoa.c */
162159
PSTK initparserstack[1]; // stack used for messages when we don't have a real one
163160
I4 getlasterror; // DLL error info from previous DLL call
164161
I4 dlllasterror; // DLL domain error info (before DLL call)
165-
I filler6[1];
166-
// end of cacheline 6
167-
168-
// Area used for intertask communication
169-
A repatq[-PMINL+PLIML+1]; // queue of blocks allocated in this thread but freed by other threads. Used as a lock, so put in its own cacheline. We have 5 queues to avoid muxing; could do with 1
170162
S taskidleq; // thread#s of the tasks waiting for work. Root of the idle chain is in the master.
171163
S tasklock; // lock for taskidleq. Used only in master
172164
S taskstate; // task state: modified by other tasks on a system lock
@@ -181,6 +173,13 @@ typedef struct rngdata {
181173
#else
182174
I filler7[2];
183175
#endif
176+
// end of cacheline 6
177+
178+
// Area used for intertask communication of memory allocation
179+
A repatq[-PMINL+PLIML+1]; // queue of blocks allocated in this thread but freed by other threads. Used as a lock, so put in its own cacheline. We have 5 queues to avoid muxing; could do with 1
180+
I bytes; // bytes currently in use - used only during 7!:1
181+
I mfreegenallo; // Amount allocated through malloc, biased
182+
I malloctotal; // net total of malloc/free performed in m.c only
184183
// end of cacheline 7
185184
// stats I totalpops;
186185
// stats I nonnullpops;
@@ -342,7 +341,7 @@ typedef struct JSTstruct {
342341
typedef JST* JS; // shared part of struct
343342

344343
// When the task is not running, part of the per-call area is used as a communication region to hold parameters:
345-
#define TASKCOMMREGION(jt) ((void **)(&jt->bytes)) // [0..3] are parms, each a (void *)
344+
#define TASKCOMMREGION(jt) ((void **)(&jt->bytesmax)) // [0..3] are parms, each a (void *)
346345
#define TASKAWAITBLOK(jt) (*(void **)&jt->shapesink) // pointer to WAITBLOK in a waiting task
347346

348347

jsrc/m.c

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ B jtspfree(J jt){I i;A p;
249249
#if PYXES
250250
// do space accounting for the expat blocks that we saw here
251251
if(unlikely(nexpats>0)){ // small expats will probably be rare
252-
jt->bytes-=nexpats*incr; // keep track of total allocation
252+
__atomic_fetch_sub(&jt->bytes,nexpats*incr,__ATOMIC_ACQ_REL); // keep track of total allocation
253253
jt->mfree[i].ballo-=nexpats*incr; // also number of bytes in this chain
254254
}
255255
#endif
@@ -274,12 +274,12 @@ B jtspfree(J jt){I i;A p;
274274
if(FHRHISROOTALLOFREE(AFHRH(baseblock))){ // Free fully-unused base blocks;
275275
#if 1 || ALIGNTOCACHE // with short headers, always align to cache bdy
276276
FREECHK(((I**)baseblock)[-1]); // If aligned, the word before the block points to the original block address
277-
jt->malloctotal-=PSIZE+TAILPAD+CACHELINESIZE; // return storage+bdy
278-
jt->mfreegenallo-=TAILPAD+CACHELINESIZE; // remove pad from the amount we report allocated
277+
__atomic_fetch_sub(&jt->malloctotal,PSIZE+TAILPAD+CACHELINESIZE,__ATOMIC_ACQ_REL); // return storage+bdy
278+
__atomic_fetch_sub(&jt->mfreegenallo,TAILPAD+CACHELINESIZE,__ATOMIC_ACQ_REL); // remove pad from the amount we report allocated
279279
#else
280280
FREECHK(baseblock);
281-
jt->malloctotal-=PSIZE+TAILPAD; // return storage
282-
jt->mfreegenallo-=TAILPAD; // remove pad from the amount we report allocated
281+
__atomic_fetch_sub(&jt->malloctotal,PSIZE+TAILPAD,__ATOMIC_ACQ_REL); // return storage
282+
__atomic_fetch_sub(&jt->mfreegenallo,TAILPAD,__ATOMIC_ACQ_REL); // remove pad from the amount we report allocated
283283
#endif
284284
}else{AFHRH(baseblock) = virginbase;} // restore the count to 0 in the rest
285285
p=np; // step to next base block
@@ -292,7 +292,7 @@ B jtspfree(J jt){I i;A p;
292292
// compensated for by a change to mfreegenallo. mfreegenallo must also account for the excess padding that is now being returned
293293
// This elides the step of subtracting coalesced buffers from the number of allocated buffers of size i, followed by
294294
// adding the bytes for those blocks to mfreebgenallo
295-
jt->mfreegenallo -= SBFREEB - (jt->mfree[i].ballo & ~MFREEBCOUNTING); // subtract diff between current mfreeb[] and what it will be set to
295+
__atomic_fetch_sub(&jt->mfreegenallo,SBFREEB - (jt->mfree[i].ballo & ~MFREEBCOUNTING),__ATOMIC_ACQ_REL); // subtract diff between current mfreeb[] and what it will be set to
296296
jt->mfree[i].ballo = SBFREEB + (jt->mfree[i].ballo & MFREEBCOUNTING); // set so we trigger rescan when we have allocated another SBFREEB bytes
297297
}
298298
}
@@ -418,7 +418,7 @@ R totalallo;
418418
// Also count current space, and set that into jt->bytes and the result of this function
419419
I jtspstarttracking(J jt){I i;
420420
for(i=PMINL;i<=PLIML;++i){jt->mfree[-PMINL+i].ballo |= MFREEBCOUNTING;}
421-
jt->mfreegenallo |= MFREEBCOUNTING; // same for non-pool alloc
421+
__atomic_fetch_or(&jt->mfreegenallo,MFREEBCOUNTING,__ATOMIC_ACQ_REL); // same for non-pool alloc
422422
R jt->bytes = spbytesinuse();
423423
}
424424

@@ -1015,7 +1015,7 @@ A* jttg(J jt, A *pushp){ // Filling last slot; must allocate next page.
10151015
jt->tnextpushp = pushp; // set the push pointer so we can back out the last allocation
10161016
ASSERT(0,EVWSFULL); // fail
10171017
}
1018-
jt->malloctotal += NTSTACK+NTSTACKBLOCK; // add to total allocated
1018+
__atomic_fetch_add(&jt->malloctotal,NTSTACK+NTSTACKBLOCK,__ATOMIC_ACQ_REL); // add to total allocated
10191019
// chain previous allocation to the new one
10201020
*v = (A)jt->tstackcurr; // backchain old buffers to new, including bias
10211021
jt->tstackcurr = (A*)v; // set new buffer as the one to use, biased so we can index it from pushx
@@ -1085,7 +1085,7 @@ void jttpop(J jt,A *old){A *endingtpushp;
10851085
// There is no way two allocations could back up so as to make the end of one exactly the beginning of the other
10861086
if((A*)np!=pushp-1){
10871087
// if there is another block in this allocation, step to it. Otherwise:
1088-
if(jt->tstacknext){FREECHK(jt->tstacknext); jt->malloctotal-=NTSTACK+NTSTACKBLOCK;} // account for malloc'd memory
1088+
if(jt->tstacknext){FREECHK(jt->tstacknext); __atomic_fetch_sub(&jt->malloctotal,NTSTACK+NTSTACKBLOCK,__ATOMIC_ACQ_REL);} // account for malloc'd memory
10891089
// We will set the block we are vacating as the next-to-use. We can have only 1 such; if there is one already, free it
10901090
jt->tstacknext=jt->tstackcurr; // save the next-to-use, after removing bias
10911091
jt->tstackcurr=(A*)jt->tstackcurr[0]; // back up to the previous block
@@ -1135,8 +1135,8 @@ __attribute__((noinline)) A jtgafallopool(J jt,I blockx,I n){
11351135
// allocate without alignment
11361136
ASSERT(av=MALLOC(PSIZE+TAILPAD),EVWSFULL);
11371137
#endif
1138-
I nt=jt->malloctotal+=PSIZE+TAILPAD+ALIGNPOOLTOCACHE*CACHELINESIZE; // add to total JE mem allocated
1139-
jt->mfreegenallo+=PSIZE+TAILPAD+ALIGNPOOLTOCACHE*CACHELINESIZE; // ...add them to the total bytes allocated from OS
1138+
I nt=__atomic_add_fetch(&jt->malloctotal,PSIZE+TAILPAD+ALIGNPOOLTOCACHE*CACHELINESIZE,__ATOMIC_ACQ_REL); // add to total JE mem allocated
1139+
__atomic_fetch_add(&jt->mfreegenallo,PSIZE+TAILPAD+ALIGNPOOLTOCACHE*CACHELINESIZE,__ATOMIC_ACQ_REL); // ...add them to the total bytes allocated from OS
11401140
{I ot=jt->malloctotalhwmk; ot=ot>nt?ot:nt; jt->malloctotalhwmk=ot;}
11411141
// split the allocation into blocks. Chain them together, and flag the base. We chain them in ascending order (the order doesn't matter), but
11421142
// we visit them in back-to-front order so the first-allocated headers are in cache
@@ -1157,7 +1157,7 @@ __attribute__((noinline)) A jtgafallopool(J jt,I blockx,I n){
11571157
#endif
11581158
jt->mfree[-PMINL+1+blockx].pool=(A)((C*)u+n); // the second block becomes the head of the free list
11591159
if(unlikely((((jt->mfree[-PMINL+1+blockx].ballo+=n-PSIZE)&MFREEBCOUNTING)!=0))){ // We are adding a bunch of free blocks now...
1160-
jt->bytes += n; if(jt->bytes>jt->bytesmax)jt->bytesmax=jt->bytes;
1160+
I jtbytes=__atomic_add_fetch(&jt->bytes,n,__ATOMIC_ACQ_REL); if(jtbytes>jt->bytesmax)jt->bytesmax=jtbytes;
11611161
}
11621162
A *tp=jt->tnextpushp; AZAPLOC(z)=tp; *tp++=z; jt->tnextpushp=tp; if(unlikely(((I)tp&(NTSTACKBLOCK-1))==0))RZ(z=jttgz(jt,tp,z)); // do the tpop/zaploc chaining
11631163
R z;
@@ -1175,14 +1175,16 @@ __attribute__((noinline)) A jtgafalloos(J jt,I blockx,I n){A z;
11751175
ASSERT(z=MALLOC(n),EVWSFULL);
11761176
#endif
11771177
AFHRH(z) = (US)FHRHSYSJHDR(1+blockx); // Save the size of the allocation so we know how to free it and how big it was
1178-
if(unlikely((((jt->mfreegenallo+=n)&MFREEBCOUNTING)!=0))){
1179-
jt->bytes += n; if(jt->bytes>jt->bytesmax)jt->bytesmax=jt->bytes;
1178+
if(unlikely((((__atomic_fetch_add(&jt->mfreegenallo,n,__ATOMIC_ACQ_REL))&MFREEBCOUNTING)!=0))){
1179+
I jtbytes=__atomic_add_fetch(&jt->bytes,n,__ATOMIC_ACQ_REL); if(jtbytes>jt->bytesmax)jt->bytesmax=jtbytes;
1180+
// obsolete jt->bytes += n; if(jt->bytes>jt->bytesmax)jt->bytesmax=jt->bytes;
11801181
}
1181-
I nt=jt->malloctotal+=n;
1182+
I nt=__atomic_add_fetch(&jt->malloctotal,n,__ATOMIC_ACQ_REL);
11821183
{I ot=jt->malloctotalhwmk; ot=ot>nt?ot:nt; jt->malloctotalhwmk=ot;}
11831184
A *tp=jt->tnextpushp; AZAPLOC(z)=tp; *tp++=z; jt->tnextpushp=tp; if(unlikely(((I)tp&(NTSTACKBLOCK-1))==0))RZ(z=jttgz(jt,tp,z)); // do the tpop/zaploc chaining
11841185
#if PYXES
1185-
z->lock=0; // init lock on the block to 'available'
1186+
*(I4 *)&z->origin=THREADID(jt); // init allocating thread# and clear the lock
1187+
// obsolete z->lock=0; // init lock on the block to 'available'
11861188
#endif
11871189
R z;
11881190
}
@@ -1213,7 +1215,8 @@ if((I)jt&3)SEGFAULT;
12131215
jt->mfree[-PMINL+1+blockx].pool = AFCHAIN(z); // remove & use the head of the free chain
12141216
// If the user is keeping track of memory high-water mark with 7!:2, figure it out & keep track of it. Otherwise save the cycles. All allo routines must do this
12151217
if(unlikely((((jt->mfree[-PMINL+1+blockx].ballo+=n)&MFREEBCOUNTING)!=0))){
1216-
jt->bytes += n; if(jt->bytes>jt->bytesmax)jt->bytesmax=jt->bytes;
1218+
I jtbytes=__atomic_add_fetch(&jt->bytes,n,__ATOMIC_ACQ_REL); if(jtbytes>jt->bytesmax)jt->bytesmax=jtbytes;
1219+
// obsolete jt->bytes += n; if(jt->bytes>jt->bytesmax)jt->bytesmax=jt->bytes;
12171220
}
12181221
// Put the new block into the tpop stack and point the blocks to its zappable tpop slot. We have to check for a new tpop stack block, and we cleverly
12191222
// pass z into that function, which will return it unchanged, so that we don't have to push the value in this routine
@@ -1295,7 +1298,8 @@ RESTRICTF A jtga0(J jt,I type,I rank,I atoms){A z;
12951298

12961299
// free a block. The usecount must make it freeable. If the block was a small block allocated in a different thread,
12971300
// repatriate it
1298-
void jtmf(J jt,A w,I hrh){I mfreeb;
1301+
void jtmf(J jt,A w,I hrh){
1302+
// obsolete I mfreeb;
12991303
#if MEMAUDIT&16
13001304
auditmemchains();
13011305
#endif
@@ -1325,21 +1329,24 @@ printf("%p-\n",w);
13251329
#endif
13261330
#endif
13271331
I allocsize; // size of full allocation for this block
1332+
#if PYXES
1333+
I origthread=w->origin;
1334+
#endif
13281335
if(FHRHBINISPOOL(hrh)){ // allocated from subpool
13291336
allocsize = FHRHPOOLBINSIZE(hrh);
13301337
#if MEMAUDIT&4
13311338
DO((allocsize>>LGSZI), if(i!=6)((I*)w)[i] = (I)0xdeadbeefdeadbeefLL;); // wipe the block clean before we free it - but not the reserved area
13321339
#endif
13331340
#if PYXES
1334-
I origthread=w->origin; if(likely(origthread==THREADID(jt))){ // if block was allocated from this thread
1341+
if(likely(origthread==THREADID(jt))){ // if block was allocated from this thread
13351342
#endif
1336-
jt->bytes -= allocsize; // keep track of total allocation
1337-
mfreeb = jt->mfree[blockx].ballo; // number of bytes allocated at this size (biased zero point)
1343+
__atomic_fetch_sub(&jt->bytes,allocsize,__ATOMIC_ACQ_REL); // keep track of total allocation
1344+
I mfreeb = __atomic_fetch_sub(&jt->mfree[blockx].ballo,allocsize,__ATOMIC_ACQ_REL); // number of bytes allocated at this size (biased zero point)
13381345
AFCHAIN(w)=jt->mfree[blockx].pool; // append free list to the new addition...
13391346
jt->mfree[blockx].pool=w; // ...and make new addition the new head
1340-
if(unlikely(0 > (mfreeb-=allocsize)))jt->uflags.us.uq.uq_c.spfreeneeded=1; // Indicate we have one more free buffer;
1347+
if(unlikely(mfreeb<0))jt->uflags.us.uq.uq_c.spfreeneeded=1; // Indicate we have one more free buffer;
13411348
// if this kicks the list into garbage-collection mode, indicate that
1342-
jt->mfree[blockx].ballo=mfreeb;
1349+
// obsolete jt->mfree[blockx].ballo=mfreeb;
13431350
#if PYXES
13441351
}else{
13451352
// repatriate a block allocated in another thread
@@ -1348,15 +1355,16 @@ printf("%p-\n",w);
13481355
}
13491356
#endif
13501357
}else{ // buffer allocated from malloc
1351-
mfreeb = jt->mfreegenallo;
1358+
// obsolete mfreeb = jt->mfreegenallo;
13521359
allocsize = FHRHSYSSIZE(hrh);
13531360
#if MEMAUDIT&4
13541361
DO((allocsize>>LGSZI), if(i!=6)((I*)w)[i] = (I)0xdeadbeefdeadbeefLL;); // wipe the block clean before we free it - but not the reserved area
13551362
#endif
13561363
allocsize+=TAILPAD+ALIGNTOCACHE*CACHELINESIZE; // the actual allocation had a tail pad and boundary
1357-
jt->bytes -= allocsize; // keep track of total allocation
1358-
jt->malloctotal-=allocsize;
1359-
jt->mfreegenallo-=allocsize; // account for all the bytes returned to the OS
1364+
jt=JTFORTHREAD(jt,origthread); // switch to the thread the block came from
1365+
__atomic_fetch_sub(&jt->bytes,allocsize,__ATOMIC_ACQ_REL); // keep track of total allocation
1366+
__atomic_fetch_sub(&jt->malloctotal,allocsize,__ATOMIC_ACQ_REL);
1367+
__atomic_fetch_sub(&jt->mfreegenallo,allocsize,__ATOMIC_ACQ_REL); // account for all the bytes returned to the OS
13601368
#if ALIGNTOCACHE
13611369
FREECHK(((I**)w)[-1]); // point to initial allocation and free it
13621370
#else

0 commit comments

Comments
 (0)