@@ -123,6 +123,7 @@ def write_and_launch(name, code, team):
123123 return p
124124
125125pid_team_cache = {}
126+ _any_pid_to_team = {} # populated at startup: all pid variants -> team
126127
127128def _read_environ_team (pid ):
128129 try :
@@ -135,6 +136,17 @@ def _read_environ_team(pid):
135136 pass
136137 return None
137138
139+ def _nspids (pid ):
140+ """Return all NSpid values from /proc/<pid>/status (handles PID namespaces)."""
141+ try :
142+ with open (f'/proc/{ pid } /status' ) as f :
143+ for line in f :
144+ if line .startswith ('NSpid:' ):
145+ return [int (x ) for x in line .split ()[1 :]]
146+ except (FileNotFoundError , ValueError ):
147+ pass
148+ return [pid ]
149+
138150def _ppid (pid ):
139151 try :
140152 with open (f'/proc/{ pid } /status' ) as f :
@@ -145,17 +157,44 @@ def _ppid(pid):
145157 pass
146158 return None
147159
def build_team_map():
    """Scan /proc for all processes with ALUMINATAI_TEAM, map all their PID variants.

    Every numeric entry under ``/proc`` is passed to ``_read_environ_team``;
    when a team is returned, the entry's own PID and every PID returned by
    ``_nspids`` for it are mapped to that team.

    Side effects:
        Rebinds the module-level ``_any_pid_to_team`` to the freshly built
        dict, so PIDs no longer found by the scan drop out of the map.

    Returns:
        The new ``{pid: team}`` dict (the same object now bound to
        ``_any_pid_to_team``). It may be empty or partial when ``/proc``
        cannot be listed.
    """
    global _any_pid_to_team
    result = {}
    try:
        # The context manager releases the directory handle promptly.
        with os.scandir('/proc') as entries:
            for entry in entries:
                if not entry.name.isdigit():
                    continue
                pid = int(entry.name)
                try:
                    team = _read_environ_team(pid)
                    if not team:
                        continue
                    result[pid] = team
                    for ns_pid in _nspids(pid):
                        result[ns_pid] = team
                except OSError:
                    # A process that exits or is unreadable mid-scan must
                    # only cost its own entry, not abort the whole scan.
                    continue
    except OSError:
        # /proc itself could not be listed; keep whatever was collected.
        pass
    # NOTE(review): namespace-local PIDs from different processes can collide
    # (e.g. two containers each with an inner PID 1); the last one scanned
    # wins, as before — confirm this is acceptable for the NVML lookup.
    _any_pid_to_team = result
    return result
178+
148179def get_team (pid ):
149- """Walk process tree upward until ALUMINATAI_TEAM is found ."""
180+ """Look up team by checking PID map (handles namespace gaps) then tree walk ."""
150181 if pid in pid_team_cache :
151182 return pid_team_cache [pid ]
183+ # Direct hit from pre-built map
184+ if pid in _any_pid_to_team :
185+ pid_team_cache [pid ] = _any_pid_to_team [pid ]
186+ return _any_pid_to_team [pid ]
187+ # Walk parent tree (for child CUDA processes)
152188 current , visited = pid , set ()
153189 while current and current not in visited :
154190 visited .add (current )
155191 team = _read_environ_team (current )
156192 if team :
157193 pid_team_cache [pid ] = team
158194 return team
195+ if current in _any_pid_to_team :
196+ pid_team_cache [pid ] = _any_pid_to_team [current ]
197+ return _any_pid_to_team [current ]
159198 parent = _ppid (current )
160199 if not parent or parent == current :
161200 break
@@ -196,6 +235,11 @@ def drain(name, proc):
196235
197236 time .sleep (90 )
198237
238+ # Build PID->team map (handles PID namespace gaps between /proc and NVML)
239+ print ("Building PID→team map..." )
240+ found = build_team_map ()
241+ print (f" found { len (found )} PID entries: { set (found .values ())} " )
242+
199243 # Init NVML
200244 pynvml .nvmlInit ()
201245 handle = pynvml .nvmlDeviceGetHandleByIndex (GPU_INDEX )
@@ -224,6 +268,7 @@ def drain(name, proc):
224268 samples [team ].append ((power_w * frac , mem_mb , frac ))
225269
226270 if time .time () >= next_print :
271+ build_team_map () # refresh to pick up any new child pids
227272 elapsed = int (time .time () - start )
228273 print (f"\n [{ elapsed } s] total GPU: { power_w :.1f} W" )
229274 for p in gpu_procs :
0 commit comments