Skip to content

Commit 314688a

Browse files
fix(tests): NSpid namespace mapping for NVML host PIDs in containers
RunPod containers use PID namespaces — NVML returns host PIDs while /proc uses container PIDs. Scan all /proc entries + NSpid fields at startup to build a complete pid->team map covering all namespace variants. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 101c3c1 commit 314688a

1 file changed

Lines changed: 46 additions & 1 deletion

File tree

agent/tests/multi_workload_attribution.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ def write_and_launch(name, code, team):
123123
return p
124124

125125
pid_team_cache = {}
126+
_any_pid_to_team = {} # populated at startup: all pid variants -> team
126127

127128
def _read_environ_team(pid):
128129
try:
@@ -135,6 +136,17 @@ def _read_environ_team(pid):
135136
pass
136137
return None
137138

139+
def _nspids(pid):
140+
"""Return all NSpid values from /proc/<pid>/status (handles PID namespaces)."""
141+
try:
142+
with open(f'/proc/{pid}/status') as f:
143+
for line in f:
144+
if line.startswith('NSpid:'):
145+
return [int(x) for x in line.split()[1:]]
146+
except (FileNotFoundError, ValueError):
147+
pass
148+
return [pid]
149+
138150
def _ppid(pid):
139151
try:
140152
with open(f'/proc/{pid}/status') as f:
@@ -145,17 +157,44 @@ def _ppid(pid):
145157
pass
146158
return None
147159

160+
def build_team_map():
    """Scan /proc and map every PID variant of team-tagged processes to its team.

    For each numeric entry under /proc, the team is looked up with
    ``_read_environ_team``; when one is found, every PID returned by
    ``_nspids`` plus the /proc PID itself is mapped to that team.

    Side effects:
        Rebinds the module-level ``_any_pid_to_team`` to the freshly built
        map (entries from a previous scan are discarded).

    Returns:
        dict mapping PID (int) -> team, the same object that is stored in
        ``_any_pid_to_team``.  Empty if /proc cannot be listed.
    """
    global _any_pid_to_team
    result = {}
    try:
        # Context manager closes the scandir iterator deterministically.
        with os.scandir('/proc') as entries:
            for entry in entries:
                if not entry.name.isdigit():
                    continue  # only numeric entries are processes
                pid = int(entry.name)
                try:
                    team = _read_environ_team(pid)
                    if not team:
                        continue
                    ns_pids = _nspids(pid)
                except OSError:
                    # Processes come and go while we scan; a single
                    # unreadable or vanished PID must not truncate the map
                    # for every process listed after it.
                    continue
                for ns_pid in ns_pids:
                    result[ns_pid] = team
                result[pid] = team
    except OSError:
        # /proc itself could not be listed (PermissionError is an OSError
        # subclass); keep whatever was collected so far.
        pass
    _any_pid_to_team = result
    return result
178+
148179
def get_team(pid):
149-
"""Walk process tree upward until ALUMINATAI_TEAM is found."""
180+
"""Look up team by checking PID map (handles namespace gaps) then tree walk."""
150181
if pid in pid_team_cache:
151182
return pid_team_cache[pid]
183+
# Direct hit from pre-built map
184+
if pid in _any_pid_to_team:
185+
pid_team_cache[pid] = _any_pid_to_team[pid]
186+
return _any_pid_to_team[pid]
187+
# Walk parent tree (for child CUDA processes)
152188
current, visited = pid, set()
153189
while current and current not in visited:
154190
visited.add(current)
155191
team = _read_environ_team(current)
156192
if team:
157193
pid_team_cache[pid] = team
158194
return team
195+
if current in _any_pid_to_team:
196+
pid_team_cache[pid] = _any_pid_to_team[current]
197+
return _any_pid_to_team[current]
159198
parent = _ppid(current)
160199
if not parent or parent == current:
161200
break
@@ -196,6 +235,11 @@ def drain(name, proc):
196235

197236
time.sleep(90)
198237

238+
# Build PID->team map (handles PID namespace gaps between /proc and NVML)
239+
print("Building PID→team map...")
240+
found = build_team_map()
241+
print(f" found {len(found)} PID entries: {set(found.values())}")
242+
199243
# Init NVML
200244
pynvml.nvmlInit()
201245
handle = pynvml.nvmlDeviceGetHandleByIndex(GPU_INDEX)
@@ -224,6 +268,7 @@ def drain(name, proc):
224268
samples[team].append((power_w * frac, mem_mb, frac))
225269

226270
if time.time() >= next_print:
271+
build_team_map() # refresh to pick up any new child pids
227272
elapsed = int(time.time() - start)
228273
print(f"\n [{elapsed}s] total GPU: {power_w:.1f}W")
229274
for p in gpu_procs:

0 commit comments

Comments
 (0)