Skip to content

Commit 3549393

Browse files
fix tests (#1555)
Co-authored-by: Luke Lombardi <luke@beam.cloud>
1 parent e87add0 commit 3549393

3 files changed

Lines changed: 115 additions & 39 deletions

File tree

pkg/worker/gpu_info_test.go

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package worker
22

33
import (
44
"errors"
5-
"os"
65
"testing"
76

87
"github.com/stretchr/testify/assert"
@@ -175,33 +174,6 @@ func TestAvailableGPUDevicesReturnsQueryErrors(t *testing.T) {
175174
assert.Nil(t, devices)
176175
}
177176

178-
func TestResolveVisibleDevicesUsesChildProcess(t *testing.T) {
179-
origResolve := resolveVisibleDevices
180-
defer func() { resolveVisibleDevices = origResolve }()
181-
182-
resolveVisibleDevices = func() string {
183-
return "GPU-04612b44-abcd-1234-5678-aabbccddeeff"
184-
}
185-
186-
result := resolveVisibleDevices()
187-
assert.Equal(t, "GPU-04612b44-abcd-1234-5678-aabbccddeeff", result)
188-
}
189-
190-
func TestResolveVisibleDevicesFallsBackToEnv(t *testing.T) {
191-
origResolve := resolveVisibleDevices
192-
defer func() { resolveVisibleDevices = origResolve }()
193-
194-
os.Setenv("NVIDIA_VISIBLE_DEVICES", "GPU-fallback-uuid")
195-
defer os.Unsetenv("NVIDIA_VISIBLE_DEVICES")
196-
197-
resolveVisibleDevices = func() string {
198-
return os.Getenv("NVIDIA_VISIBLE_DEVICES")
199-
}
200-
201-
result := resolveVisibleDevices()
202-
assert.Equal(t, "GPU-fallback-uuid", result)
203-
}
204-
205177
func TestAvailableGPUDevicesSingleGPUUUID(t *testing.T) {
206178
cleanup := withMockDevices(eightGPUOutput, true)
207179
defer cleanup()

pkg/worker/gpu_integration_test.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
package worker
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"strings"
7+
"testing"
8+
9+
common "github.com/beam-cloud/beta9/pkg/common"
10+
"gvisor.dev/gvisor/pkg/sync"
11+
)
12+
13+
func TestIntegrationGPUIsolation(t *testing.T) {
14+
if os.Getenv("GPU_INTEGRATION") != "1" {
15+
t.Skip("set GPU_INTEGRATION=1 to run on a real GPU node")
16+
}
17+
18+
// Step 1: Read PID 1's env (what os.Getenv sees — the broken path)
19+
pid1Env := "(unknown)"
20+
data, err := os.ReadFile("/proc/1/environ")
21+
if err == nil {
22+
for _, entry := range strings.Split(string(data), "\x00") {
23+
if strings.HasPrefix(entry, "NVIDIA_VISIBLE_DEVICES=") {
24+
pid1Env = strings.TrimPrefix(entry, "NVIDIA_VISIBLE_DEVICES=")
25+
break
26+
}
27+
}
28+
}
29+
t.Logf("PID 1 NVIDIA_VISIBLE_DEVICES = %q", pid1Env)
30+
31+
// Step 2: Call the REAL resolveVisibleDevices() from gpu_info.go
32+
resolved := resolveVisibleDevices()
33+
t.Logf("resolveVisibleDevices() = %q", resolved)
34+
35+
if resolved == "void" || resolved == "" {
36+
t.Fatalf("resolveVisibleDevices() returned %q — void bug NOT fixed", resolved)
37+
}
38+
if !strings.HasPrefix(resolved, "GPU-") {
39+
t.Fatalf("resolveVisibleDevices() returned %q — expected GPU UUID", resolved)
40+
}
41+
42+
// Step 3: Create the REAL NvidiaInfoClient with the resolved value (same as NewContainerNvidiaManager)
43+
client := &NvidiaInfoClient{visibleDevices: resolved}
44+
45+
// Step 4: Call the REAL AvailableGPUDevices()
46+
devices, err := client.AvailableGPUDevices()
47+
if err != nil {
48+
t.Fatalf("AvailableGPUDevices() error: %v", err)
49+
}
50+
t.Logf("AvailableGPUDevices() = %v", devices)
51+
52+
if len(devices) == 0 {
53+
t.Fatal("AvailableGPUDevices() returned empty — GPU not visible")
54+
}
55+
if len(devices) != 1 {
56+
t.Fatalf("AvailableGPUDevices() returned %d GPUs, expected exactly 1 for per-worker isolation", len(devices))
57+
}
58+
59+
// Step 5: Verify the OLD path (void) would have failed
60+
oldClient := &NvidiaInfoClient{visibleDevices: pid1Env}
61+
oldDevices, err := oldClient.AvailableGPUDevices()
62+
if err != nil {
63+
t.Logf("Old path error (expected): %v", err)
64+
}
65+
t.Logf("Old path (PID 1 env=%q) -> AvailableGPUDevices() = %v", pid1Env, oldDevices)
66+
67+
if pid1Env == "void" && len(oldDevices) > 0 {
68+
t.Error("Old code path with void should return empty, but got devices — test logic wrong")
69+
}
70+
71+
// Step 6: Exercise the REAL ContainerNvidiaManager.AssignGPUDevices (chooseDevices)
72+
manager := &ContainerNvidiaManager{
73+
gpuAllocationMap: common.NewSafeMap[[]int](),
74+
gpuCount: 1,
75+
mu: sync.Mutex{},
76+
infoClient: client,
77+
resolvedVisibleDevices: resolved,
78+
}
79+
80+
assigned, err := manager.AssignGPUDevices("test-container-1", 1)
81+
if err != nil {
82+
t.Fatalf("AssignGPUDevices() failed: %v", err)
83+
}
84+
t.Logf("AssignGPUDevices(\"test-container-1\", 1) = %v", assigned)
85+
86+
if len(assigned) != 1 {
87+
t.Fatalf("Expected 1 assigned GPU, got %d", len(assigned))
88+
}
89+
if assigned[0] != devices[0] {
90+
t.Fatalf("Assigned GPU %d doesn't match available GPU %d", assigned[0], devices[0])
91+
}
92+
93+
// Step 7: Verify second allocation to same worker FAILS (only 1 GPU available)
94+
_, err = manager.AssignGPUDevices("test-container-2", 1)
95+
if err == nil {
96+
t.Fatal("Second allocation should fail — only 1 GPU per worker")
97+
}
98+
t.Logf("Second allocation correctly failed: %v", err)
99+
100+
fmt.Printf("\nRESULT: resolved=%s gpu_index=%d old_path_would_fail=%v PASS\n",
101+
resolved, assigned[0], pid1Env == "void" && len(oldDevices) == 0)
102+
}

pkg/worker/nvidia.go

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,12 @@ type GPUManager interface {
3232
}
3333

3434
type ContainerNvidiaManager struct {
35-
gpuAllocationMap *common.SafeMap[[]int]
36-
gpuCount uint32
37-
mu sync.Mutex
38-
statFunc func(path string, stat *syscall.Stat_t) (err error)
39-
infoClient GPUInfoClient
35+
gpuAllocationMap *common.SafeMap[[]int]
36+
gpuCount uint32
37+
mu sync.Mutex
38+
statFunc func(path string, stat *syscall.Stat_t) (err error)
39+
infoClient GPUInfoClient
40+
resolvedVisibleDevices string
4041
}
4142

4243
func NewContainerNvidiaManager(gpuCount uint32) GPUManager {
@@ -51,11 +52,12 @@ func NewContainerNvidiaManager(gpuCount uint32) GPUManager {
5152
log.Info().Str("resolved_visible_devices", visibleDevices).Msg("resolved NVIDIA_VISIBLE_DEVICES for GPU filtering")
5253

5354
return &ContainerNvidiaManager{
54-
gpuAllocationMap: common.NewSafeMap[[]int](),
55-
gpuCount: gpuCount,
56-
mu: sync.Mutex{},
57-
statFunc: syscall.Stat,
58-
infoClient: &NvidiaInfoClient{visibleDevices: visibleDevices},
55+
gpuAllocationMap: common.NewSafeMap[[]int](),
56+
gpuCount: gpuCount,
57+
mu: sync.Mutex{},
58+
statFunc: syscall.Stat,
59+
infoClient: &NvidiaInfoClient{visibleDevices: visibleDevices},
60+
resolvedVisibleDevices: visibleDevices,
5961
}
6062
}
6163

@@ -114,7 +116,7 @@ func (c *ContainerNvidiaManager) chooseDevices(containerId string, requestedGpuC
114116
// Check if we managed to allocate the requested number of GPUs
115117
if len(allocableDevices) < int(requestedGpuCount) {
116118
return nil, fmt.Errorf("not enough GPUs available: requested=%d, allocable=%d, visible=%d, configured=%d, already_allocated=%d, NVIDIA_VISIBLE_DEVICES=%q",
117-
requestedGpuCount, len(allocableDevices), len(availableDevices), c.gpuCount, len(currentAllocations), c.infoClient.(*NvidiaInfoClient).visibleDevices)
119+
requestedGpuCount, len(allocableDevices), len(availableDevices), c.gpuCount, len(currentAllocations), c.resolvedVisibleDevices)
118120
}
119121

120122
// Allocate the requested number of GPUs

0 commit comments

Comments
 (0)