Skip to content

Commit 4e0c951

Browse files
committed
CPU fallback for Mac scrubs
1 parent bb15a59 commit 4e0c951

7 files changed

Lines changed: 370 additions & 81 deletions

File tree

build.sh

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ set -e
1010
# ./build.sh breakout --web # Emscripten web build
1111
# ./build.sh breakout --profile # Kernel profiling binary
1212

13-
ENV=${1:?Usage: ./build.sh ENV_NAME [--float] [--debug] [--local|--fast|--web|--profile]}
13+
ENV=${1:?Usage: ./build.sh ENV_NAME [--float] [--debug] [--local|--fast|--web|--profile|--cpu]}
1414
MODE=""
1515
PRECISION=""
1616
DEBUG=""
@@ -22,6 +22,7 @@ for arg in "${@:2}"; do
2222
--fast) MODE=fast ;;
2323
--web) MODE=web ;;
2424
--profile) MODE=profile ;;
25+
--cpu) MODE=cpu; PRECISION="-DPRECISION_FLOAT" ;;
2526
esac
2627
done
2728

@@ -203,6 +204,33 @@ if [ "$MODE" = "profile" ]; then
203204
exit 0
204205
fi
205206

207+
if [ "$MODE" = "cpu" ]; then
208+
echo "=== Compiling bindings_cpu.cpp ==="
209+
g++ -c -fPIC -fopenmp \
210+
-D_GLIBCXX_USE_CXX11_ABI=1 \
211+
-DPLATFORM_DESKTOP \
212+
-std=c++17 \
213+
-I. -Isrc \
214+
-I$PYTHON_INCLUDE -I$PYBIND_INCLUDE \
215+
-DOBS_TENSOR_T=$OBS_TENSOR_T \
216+
$PRECISION $LINK_OPT \
217+
src/bindings_cpu.cpp -o src/bindings_cpu.o
218+
219+
echo "=== Linking $OUTPUT (CPU) ==="
220+
LINK_CMD=(
221+
g++ -shared -fPIC -fopenmp
222+
src/bindings_cpu.o "$STATIC_LIB" "$RAYLIB_A"
223+
-lm -lpthread -lomp5
224+
$LINK_OPT
225+
)
226+
[ "$PLATFORM" = "Linux" ] && LINK_CMD+=(-Bsymbolic-functions)
227+
[ "$PLATFORM" = "Darwin" ] && LINK_CMD+=(-framework Cocoa -framework OpenGL -framework IOKit)
228+
LINK_CMD+=(-o "$OUTPUT")
229+
"${LINK_CMD[@]}"
230+
echo "=== Built: $OUTPUT (CPU) ==="
231+
exit 0
232+
fi
233+
206234
echo "=== Compiling bindings.cu ==="
207235
$NVCC -c -Xcompiler -fPIC \
208236
-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=1 \

constellation/cache_data.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -188,23 +188,11 @@ def cached_load(path, env_name, cache):
188188
#data['metrics/agent_steps'] = [e/1e6 for e in data['metrics/agent_steps']]
189189
del data['metrics/agent_steps']
190190

191-
'''
192-
for k, v in data.items():
193-
for e in v:
194-
if e is None or isinstance(e, str):
195-
continue
196-
try:
197-
if e > 1e9 or e < -1e9:
198-
breakpoint()
199-
except:
200-
breakpoint()
201-
'''
202-
203191
# Filter to pareto
192+
'''
204193
steps = data['agent_steps']
205194
costs = data['uptime']
206195
scores = data['env/score']
207-
'''
208196
idxs = pareto_idx(steps, costs, scores)
209197
for k in data:
210198
try:

pufferlib/torch_pufferl.py

Lines changed: 58 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import os
66
import glob
77
import time
8+
import ctypes
89
from collections import defaultdict
910

1011
import numpy as np
@@ -93,6 +94,20 @@ def __init__(self, ptr, shape, dtype):
9394
'version': 2,
9495
}
9596

97+
_TORCH_TO_CTYPE = {
98+
torch.uint8: ctypes.c_uint8,
99+
torch.float32: ctypes.c_float,
100+
}
101+
102+
def _cpu_tensor(ptr, shape, dtype):
103+
'''Zero-copy CPU tensor from a raw pointer via ctypes.'''
104+
ctype = _TORCH_TO_CTYPE[dtype]
105+
n = 1
106+
for s in shape:
107+
n *= s
108+
arr = (ctype * n).from_address(ptr)
109+
return torch.frombuffer(arr, dtype=dtype).reshape(shape)
110+
96111
class PuffeRL:
97112
def __init__(self, args, vec, policy, verbose=True):
98113
config = args['train']
@@ -103,16 +118,25 @@ def __init__(self, args, vec, policy, verbose=True):
103118
torch.backends.cudnn.benchmark = True
104119

105120
self._vec = vec
121+
self.gpu = vec.gpu
106122
total_agents = vec.total_agents
107123
self.total_agents = total_agents
108124
obs_dtype = _OBS_DTYPE_MAP.get(vec.obs_dtype, torch.uint8)
109125

110-
self.vec_obs = torch.as_tensor(_CudaPtr(vec.gpu_obs_ptr,
111-
(total_agents, vec.obs_size), obs_dtype))
112-
self.vec_rewards = torch.as_tensor(_CudaPtr(vec.gpu_rewards_ptr,
113-
(total_agents,), torch.float32))
114-
self.vec_terminals = torch.as_tensor(_CudaPtr(vec.gpu_terminals_ptr,
115-
(total_agents,), torch.float32))
126+
if self.gpu:
127+
self.vec_obs = torch.as_tensor(_CudaPtr(vec.gpu_obs_ptr,
128+
(total_agents, vec.obs_size), obs_dtype))
129+
self.vec_rewards = torch.as_tensor(_CudaPtr(vec.gpu_rewards_ptr,
130+
(total_agents,), torch.float32))
131+
self.vec_terminals = torch.as_tensor(_CudaPtr(vec.gpu_terminals_ptr,
132+
(total_agents,), torch.float32))
133+
else:
134+
self.vec_obs = _cpu_tensor(vec.obs_ptr,
135+
(total_agents, vec.obs_size), obs_dtype)
136+
self.vec_rewards = _cpu_tensor(vec.rewards_ptr,
137+
(total_agents,), torch.float32)
138+
self.vec_terminals = _cpu_tensor(vec.terminals_ptr,
139+
(total_agents,), torch.float32)
116140

117141
vec.reset()
118142
horizon = config['horizon']
@@ -148,7 +172,7 @@ def __init__(self, args, vec, policy, verbose=True):
148172
self.last_log_step = 0
149173
self.last_log_time = time.time()
150174
self.start_time = time.time()
151-
self.profile = Profile()
175+
self.profile = Profile(gpu=self.gpu)
152176
self.verbose = verbose
153177

154178
self.model_size = sum(p.numel() for p in policy.parameters() if p.requires_grad)
@@ -203,9 +227,13 @@ def rollouts(self):
203227
self.values[t] = value.flatten()
204228

205229
prof.mark(2)
206-
actions_gpu = (action.T if action.dim() > 1 else action.unsqueeze(-1)).to(dtype=torch.float32, device='cuda').contiguous()
207-
self._vec.step(actions_gpu.data_ptr())
208-
torch.cuda.synchronize()
230+
actions_flat = (action.T if action.dim() > 1 else action.unsqueeze(-1)).to(dtype=torch.float32).contiguous()
231+
if self.gpu:
232+
actions_flat = actions_flat.cuda()
233+
self._vec.gpu_step(actions_flat.data_ptr())
234+
torch.cuda.synchronize()
235+
else:
236+
self._vec.cpu_step(actions_flat.data_ptr())
209237
o, r, d = self.vec_obs, self.vec_rewards, self.vec_terminals
210238
prof.mark(3)
211239

@@ -348,7 +376,7 @@ def log(self):
348376
'train_misc': perf[P.TRAIN_MISC],
349377
'train_forward': perf[P.TRAIN_FORWARD],
350378
},
351-
'util': dict(_C.get_utilization(self.args.get('gpu_id', 0))),
379+
'util': dict(_C.get_utilization(self.args.get('gpu_id', 0))) if self.gpu else {},
352380
}
353381
self.last_log_time = time.time()
354382
self.last_log_step = self.global_step
@@ -376,7 +404,8 @@ def create_pufferl(cls, args):
376404
os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank)
377405

378406
args['vec']['num_buffers'] = 1
379-
vec = _C.create_vec(args)
407+
gpu = 1 if device == 'cuda' else 0
408+
vec = _C.create_vec(args, gpu)
380409
policy = load_policy(args, vec)
381410

382411
if 'LOCAL_RANK' in os.environ:
@@ -395,7 +424,8 @@ def create_pufferl(cls, args):
395424
def compute_puff_advantage(values, rewards, terminals,
396425
ratio, advantages, gamma, gae_lambda, vtrace_rho_clip, vtrace_c_clip):
397426
num_steps, horizon = values.shape
398-
_C.puff_advantage(
427+
fn = _C.puff_advantage if values.is_cuda else _C.puff_advantage_cpu
428+
fn(
399429
values.data_ptr(), rewards.data_ptr(), terminals.data_ptr(),
400430
ratio.data_ptr(), advantages.data_ptr(),
401431
num_steps, horizon,
@@ -406,16 +436,26 @@ class Profile:
406436
'''Matches pufferlib.cu profiling: accumulate ms, report seconds.'''
407437
ROLLOUT, EVAL_GPU, EVAL_ENV, TRAIN, TRAIN_MISC, TRAIN_FORWARD, NUM = range(7)
408438

409-
def __init__(self):
439+
def __init__(self, gpu=True):
410440
self.accum = [0.0] * Profile.NUM
411-
self._events = [torch.cuda.Event(enable_timing=True) for _ in range(4)]
441+
self.gpu = gpu
442+
if gpu:
443+
self._events = [torch.cuda.Event(enable_timing=True) for _ in range(4)]
444+
else:
445+
self._stamps = [0.0] * 4
412446

413447
def mark(self, idx):
414-
self._events[idx].record()
448+
if self.gpu:
449+
self._events[idx].record()
450+
else:
451+
self._stamps[idx] = time.perf_counter()
415452

416453
def elapsed(self, idx, start_ev, end_ev):
417-
self._events[end_ev].synchronize()
418-
self.accum[idx] += self._events[start_ev].elapsed_time(self._events[end_ev])
454+
if self.gpu:
455+
self._events[end_ev].synchronize()
456+
self.accum[idx] += self._events[start_ev].elapsed_time(self._events[end_ev])
457+
else:
458+
self.accum[idx] += (self._stamps[end_ev] - self._stamps[start_ev]) * 1000.0
419459

420460
def read_and_reset(self):
421461
out = [v / 1000.0 for v in self.accum]

src/bindings.cu

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -256,9 +256,10 @@ struct VecEnv {
256256
std::vector<int> act_sizes;
257257
std::string obs_dtype;
258258
size_t obs_elem_size;
259+
int gpu;
259260
};
260261

261-
std::unique_ptr<VecEnv> create_vec(py::dict args) {
262+
std::unique_ptr<VecEnv> create_vec(py::dict args, int gpu) {
262263
py::dict vec_kwargs = args["vec"].cast<py::dict>();
263264
py::dict env_kwargs = args["env"].cast<py::dict>();
264265

@@ -269,9 +270,10 @@ std::unique_ptr<VecEnv> create_vec(py::dict args) {
269270
Dict* env_dict = py_dict_to_c_dict(env_kwargs);
270271

271272
auto ve = std::make_unique<VecEnv>();
273+
ve->gpu = gpu;
272274
{
273275
py::gil_scoped_release no_gil;
274-
ve->vec = create_static_vec(total_agents, num_buffers, vec_dict, env_dict);
276+
ve->vec = create_static_vec(total_agents, num_buffers, gpu, vec_dict, env_dict);
275277
}
276278
ve->total_agents = total_agents;
277279
ve->obs_size = get_obs_size();
@@ -291,14 +293,22 @@ void vec_reset(VecEnv& ve) {
291293
static_vec_reset(ve.vec);
292294
}
293295

294-
// actions_ptr: data_ptr() of a (total_agents, num_atns) float64 CUDA tensor
295-
void vec_step(VecEnv& ve, long long actions_ptr) {
296+
void gpu_vec_step_py(VecEnv& ve, long long actions_ptr) {
296297
cudaMemcpy(ve.vec->gpu_actions, (void*)actions_ptr,
297298
(size_t)ve.total_agents * ve.num_atns * sizeof(float),
298299
cudaMemcpyDeviceToDevice);
299300
{
300301
py::gil_scoped_release no_gil;
301-
static_vec_step(ve.vec);
302+
gpu_vec_step(ve.vec);
303+
}
304+
}
305+
306+
void cpu_vec_step_py(VecEnv& ve, long long actions_ptr) {
307+
memcpy(ve.vec->actions, (void*)actions_ptr,
308+
(size_t)ve.total_agents * ve.num_atns * sizeof(float));
309+
{
310+
py::gil_scoped_release no_gil;
311+
cpu_vec_step(ve.vec);
302312
}
303313
}
304314

@@ -512,20 +522,26 @@ PYBIND11_MODULE(_C, m) {
512522
return now - pufferl.start_time;
513523
});
514524
m.def("puff_advantage", &py_puff_advantage);
515-
m.def("create_vec", &create_vec);
525+
m.def("create_vec", &create_vec, py::arg("args"), py::arg("gpu") = 1);
516526
py::class_<VecEnv, std::unique_ptr<VecEnv>>(m, "VecEnv")
517527
.def_readonly("total_agents", &VecEnv::total_agents)
518528
.def_readonly("obs_size", &VecEnv::obs_size)
519529
.def_readonly("num_atns", &VecEnv::num_atns)
520530
.def_readonly("act_sizes", &VecEnv::act_sizes)
521531
.def_readonly("obs_dtype", &VecEnv::obs_dtype)
522532
.def_readonly("obs_elem_size", &VecEnv::obs_elem_size)
533+
.def_readonly("gpu", &VecEnv::gpu)
523534
// GPU buffer pointers — wrap with torch.from_blob(..., device='cuda')
524535
.def_property_readonly("gpu_obs_ptr", [](VecEnv& ve) { return (long long)ve.vec->gpu_observations; })
525536
.def_property_readonly("gpu_rewards_ptr", [](VecEnv& ve) { return (long long)ve.vec->gpu_rewards; })
526537
.def_property_readonly("gpu_terminals_ptr", [](VecEnv& ve) { return (long long)ve.vec->gpu_terminals; })
538+
// CPU buffer pointers (same as gpu_ in CPU mode since they alias)
539+
.def_property_readonly("obs_ptr", [](VecEnv& ve) { return (long long)ve.vec->observations; })
540+
.def_property_readonly("rewards_ptr", [](VecEnv& ve) { return (long long)ve.vec->rewards; })
541+
.def_property_readonly("terminals_ptr", [](VecEnv& ve) { return (long long)ve.vec->terminals; })
527542
.def("reset", &vec_reset)
528-
.def("step", &vec_step)
543+
.def("gpu_step", &gpu_vec_step_py)
544+
.def("cpu_step", &cpu_vec_step_py)
529545
.def("log", &vec_log)
530546
.def("close", &vec_close);
531547

0 commit comments

Comments (0)