Merge pull request #501 from PufferAI/bfloatatns

jsuarez5341 · web-flow · commit 033692651dfc · 2026-03-20T08:17:35.000-04:00
bfloat atns
diff --git a/pufferlib/ocean/breakout/binding.c b/pufferlib/ocean/breakout/binding.c
@@ -3,7 +3,6 @@
 #define NUM_ATNS 1
 #define ACT_SIZES {3}
 #define OBS_TENSOR_T FloatTensor
-#define ACT_TYPE DOUBLE
 
 #define Env Breakout
 #include "vecenv.h"
diff --git a/pufferlib/ocean/breakout/breakout.h b/pufferlib/ocean/breakout/breakout.h
@@ -40,7 +40,7 @@ typedef struct Breakout {
     Client* client;
     Log log;
     float* observations;
-    double* actions;
+    float* actions;
     float* rewards;
     float* terminals;
     int num_agents;
@@ -121,7 +121,7 @@ void init(Breakout* env) {
 void allocate(Breakout* env) {
     init(env);
     env->observations = (float*)calloc(11 + env->num_bricks, sizeof(float));
-    env->actions = (double*)calloc(1, sizeof(double));
+    env->actions = (float*)calloc(1, sizeof(float));
     env->rewards = (float*)calloc(1, sizeof(float));
     env->terminals = (float*)calloc(1, sizeof(float));
 }
diff --git a/pufferlib/src/bindings.cu b/pufferlib/src/bindings.cu
@@ -453,10 +453,6 @@ PYBIND11_MODULE(_C, m) {
         .def("__repr__", [](const PrecisionTensor& t) { return std::string(puf_repr(&t)); })
         .def("ndim", [](const PrecisionTensor& t) { return ndim(t.shape); })
         .def("numel", [](const PrecisionTensor& t) { return numel(t.shape); });
-    py::class_<DoubleTensor>(m, "DoubleTensor")
-        .def("__repr__", [](const DoubleTensor& t) { return std::string(puf_repr(&t)); })
-        .def("ndim", [](const DoubleTensor& t) { return ndim(t.shape); })
-        .def("numel", [](const DoubleTensor& t) { return numel(t.shape); });
     py::class_<FloatTensor>(m, "FloatTensor")
         .def("__repr__", [](const FloatTensor& t) { return std::string(puf_repr(&t)); })
         .def("ndim", [](const FloatTensor& t) { return ndim(t.shape); })
diff --git a/pufferlib/src/kernels.cu b/pufferlib/src/kernels.cu
@@ -138,18 +138,6 @@ __global__ void transpose_102(precision_t* __restrict__ dst,
     dst[b * A * C + a * C + c] = src[idx];
 }
 
-// This exists for actions (currently fp64)
-__global__ void transpose_102(double* __restrict__ dst,
-        const double* __restrict__ src, int A, int B, int C) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    int total = A * B * C;
-    if (idx >= total) {
-        return;
-    }
-    int a = idx / (B * C), rem = idx % (B * C), b = rem / C, c = rem % C;
-    dst[b * A * C + a * C + c] = src[idx];
-}
-
 __global__ void fill_precision_kernel(precision_t* __restrict__ dst, precision_t val, int n) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < n) {
@@ -247,10 +235,6 @@ inline const char* puf_repr(const PrecisionTensor* t) {
     return _puf_repr_impl("PrecisionTensor", USE_BF16 ? "bf16" : "f32",
         t->shape, ndim(t->shape), numel(t->shape), !t->data);
 }
-inline const char* puf_repr(const DoubleTensor* t) {
-    return _puf_repr_impl("DoubleTensor", "f64",
-        t->shape, ndim(t->shape), numel(t->shape), !t->data);
-}
 inline const char* puf_repr(const FloatTensor* t) {
     return _puf_repr_impl("FloatTensor", "f32",
         t->shape, ndim(t->shape), numel(t->shape), !t->data);
@@ -431,9 +415,6 @@ void alloc_register(Allocator* a, PrecisionTensor* t) {
 void alloc_register(Allocator* a, FloatTensor* t) {
     alloc_register_impl(a, (void**)&t->data, t->shape, sizeof(float));
 }
-void alloc_register(Allocator* a, DoubleTensor* t) {
-    alloc_register_impl(a, (void**)&t->data, t->shape, sizeof(double));
-}
 void alloc_register(Allocator* a, LongTensor* t) {
     alloc_register_impl(a, (void**)&t->data, t->shape, sizeof(long));
 }
diff --git a/pufferlib/src/pufferlib.cu b/pufferlib/src/pufferlib.cu
@@ -16,7 +16,7 @@ enum LossIdx {
 
 struct RolloutBuf {
     PrecisionTensor observations;  // (horizon, segments, input_size)
-    DoubleTensor actions;          // (horizon, segments, num_atns)
+    PrecisionTensor actions;       // (horizon, segments, num_atns)
     PrecisionTensor values;        // (horizon, segments)
     PrecisionTensor logprobs;      // (horizon, segments)
     PrecisionTensor rewards;       // (horizon, segments)
@@ -49,7 +49,7 @@ void register_rollout_buffers(RolloutBuf& bufs, Allocator* alloc, int H, int S,
 struct TrainGraph {
     PrecisionTensor mb_obs;         // (S, H, input_size)
     PrecisionTensor mb_state;       // (L, S, 1, hidden)
-    DoubleTensor mb_actions;        // (S, H, num_atns)
+    PrecisionTensor mb_actions;     // (S, H, num_atns)
     PrecisionTensor mb_logprobs;    // (S, H)
     FloatTensor mb_advantages;      // (S, H) f32
     PrecisionTensor mb_prio;        // (S, 1)
@@ -62,7 +62,7 @@ struct TrainGraph {
 struct PPOGraphArgs {
     precision_t* out_ratio;
     precision_t* out_newvalue;
-    const double* actions;
+    const precision_t* actions;
     const precision_t* old_logprobs;
     const float* advantages;
     const precision_t* prio;
@@ -90,7 +90,7 @@ struct PPOKernelArgs {
 
 struct PPOBuffersPuf {
     FloatTensor loss_output, grad_loss;
-    DoubleTensor saved_for_bwd;
+    FloatTensor saved_for_bwd;
     FloatTensor grad_logits, grad_values, grad_logstd, adv_scratch;
 };
 
@@ -179,19 +179,10 @@ inline PrecisionTensor puf_slice(PrecisionTensor& p, int t, int start, int count
         return {.data = p.data + (t*S + start), .shape = {count}};
     }
 }
-inline DoubleTensor puf_slice(DoubleTensor& p, int t, int start, int count) {
-    if (ndim(p.shape) == 3) {
-        long S = p.shape[1], F = p.shape[2];
-        return {.data = p.data + (t*S + start)*F, .shape = {count, F}};
-    } else {
-        long S = p.shape[1];
-        return {.data = p.data + (t*S + start), .shape = {count}};
-    }
-}
 
 struct EnvBuf {
     OBS_TENSOR_T obs;     // (total_agents, obs_size) — type defined per-env in binding.c
-    DoubleTensor actions; // (total_agents, num_atns) f64
+    FloatTensor actions; // (total_agents, num_atns) f32
     FloatTensor rewards;  // (total_agents,) f32
     FloatTensor terminals;// (total_agents,) f32
 };
@@ -204,7 +195,7 @@ StaticVec* create_environments(int num_buffers, int total_agents,
         .shape = {total_agents, get_obs_size()},
     };
     env.actions = {
-        .data = (double*)vec->gpu_actions,
+        .data = (float*)vec->gpu_actions,
         .shape = {total_agents, get_num_atns()},
     };
     env.rewards = {
@@ -386,7 +377,7 @@ __global__ void sample_logits_kernel(
     PrecisionTensor dec_out,              // (B, fused_cols) fused logits+value from decoder
     PrecisionTensor logstd_puf,           // (1, od) log std for continuous, or empty
     IntTensor act_sizes_puf,              // (num_atns,) action head sizes
-    double* __restrict__ actions,         // (B, num_atns) output
+    precision_t* __restrict__ actions,    // (B, num_atns) output
     precision_t* __restrict__ logprobs,   // (B,) output
     precision_t* __restrict__ value_out,  // (B,) output
     uint64_t seed,
@@ -437,7 +428,7 @@ __global__ void sample_logits_kernel(
             float normalized = (action - mean) / std;
             float log_prob = -0.5f * normalized * normalized - 0.5f * LOG_2PI - log_std;
 
-            actions[idx * num_atns + h] = double(action);
+            actions[idx * num_atns + h] = from_float(action);
             total_log_prob += log_prob;
         }
     } else {
@@ -508,7 +499,7 @@ __global__ void sample_logits_kernel(
             float log_prob = sampled_logit - logsumexp;
 
             // Write action for this head
-            actions[idx * num_atns + h] = double(sampled_action);
+            actions[idx * num_atns + h] = from_float(sampled_action);
             total_log_prob += log_prob;
 
             // Advance to next action head
@@ -578,7 +569,7 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) {
     PrecisionTensor dec_puf = policy_forward(&pufferl->policy, pufferl->weights, pufferl->buffer_activations[buf], obs_dst, state_puf, stream);
 
     // Sample actions, logprobs, values into rollout buffer
-    DoubleTensor act_slice = puf_slice(rollouts.actions, t, start, block_size);
+    PrecisionTensor act_slice = puf_slice(rollouts.actions, t, start, block_size);
     PrecisionTensor lp_slice = puf_slice(rollouts.logprobs, t, start, block_size);
     PrecisionTensor val_slice = puf_slice(rollouts.values, t, start, block_size);
 
@@ -598,9 +589,8 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) {
 
     // Copy actions to env
     long act_cols = env.actions.shape[1];
-    cudaMemcpyAsync(
-        env.actions.data + start * act_cols,
-        act_slice.data, numel(act_slice.shape) * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+    cast_kernel<<<grid_size(numel(act_slice.shape)), BLOCK_SIZE, 0, stream>>>(
+            env.actions.data + start * act_cols, act_slice.data, numel(act_slice.shape));
 
     if (capturing) {
         cudagraph_capture_end(&pufferl->fused_rollout_cudagraphs[graph], cap_stream_raw);
@@ -1301,7 +1291,7 @@ __global__ void select_copy_kernel(
 
     // Compute row byte counts from tensor shapes
     int obs_row_bytes = (numel(rollouts.observations.shape) / rollouts.observations.shape[0]) * sizeof(precision_t);
-    int act_row_bytes = (numel(rollouts.actions.shape) / rollouts.actions.shape[0]) * sizeof(double);
+    int act_row_bytes = (numel(rollouts.actions.shape) / rollouts.actions.shape[0]) * sizeof(precision_t);
     int lp_row_bytes = (numel(rollouts.logprobs.shape) / rollouts.logprobs.shape[0]) * sizeof(precision_t);
     int horizon = rollouts.values.shape[1];
 
diff --git a/pufferlib/src/tensor.h b/pufferlib/src/tensor.h
@@ -10,11 +10,6 @@ typedef struct {
     int64_t shape[PUF_MAX_DIMS];
 } FloatTensor;
 
-typedef struct {
-    double* data;
-    int64_t shape[PUF_MAX_DIMS];
-} DoubleTensor;
-
 typedef struct {
     unsigned char* data;
     int64_t shape[PUF_MAX_DIMS];
diff --git a/pufferlib/src/vecenv.h b/pufferlib/src/vecenv.h
@@ -79,11 +79,11 @@ typedef struct StaticVec {
     int* buffer_env_starts;
     int* buffer_env_counts;
     void* observations;
-    double* actions;
+    float* actions;
     float* rewards;
     float* terminals;
     void* gpu_observations;
-    double* gpu_actions;
+    float* gpu_actions;
     float* gpu_rewards;
     float* gpu_terminals;
     cudaStream_t* streams;
@@ -252,7 +252,7 @@ static void* static_omp_threadmanager(void* arg) {
             cudaMemcpyAsync(
                 &vec->actions[agent_start * NUM_ATNS],
                 &vec->gpu_actions[agent_start * NUM_ATNS],
-                agents_per_buffer * NUM_ATNS * sizeof(double),
+                agents_per_buffer * NUM_ATNS * sizeof(float),
                 cudaMemcpyDeviceToHost, stream);
             cudaStreamSynchronize(stream);
             clock_gettime(CLOCK_MONOTONIC, &t1);
@@ -384,17 +384,17 @@ StaticVec* create_static_vec(int total_agents, int num_buffers, Dict* vec_kwargs
 
     size_t obs_elem_size = obs_element_size();
     cudaHostAlloc((void**)&vec->observations, total_agents * OBS_SIZE * obs_elem_size, cudaHostAllocPortable);
-    cudaHostAlloc((void**)&vec->actions, total_agents * NUM_ATNS * sizeof(double), cudaHostAllocPortable);
+    cudaHostAlloc((void**)&vec->actions, total_agents * NUM_ATNS * sizeof(float), cudaHostAllocPortable);
     cudaHostAlloc((void**)&vec->rewards, total_agents * sizeof(float), cudaHostAllocPortable);
     cudaHostAlloc((void**)&vec->terminals, total_agents * sizeof(float), cudaHostAllocPortable);
 
     cudaMalloc((void**)&vec->gpu_observations, total_agents * OBS_SIZE * obs_elem_size);
-    cudaMalloc((void**)&vec->gpu_actions, total_agents * NUM_ATNS * sizeof(double));
+    cudaMalloc((void**)&vec->gpu_actions, total_agents * NUM_ATNS * sizeof(float));
     cudaMalloc((void**)&vec->gpu_rewards, total_agents * sizeof(float));
     cudaMalloc((void**)&vec->gpu_terminals, total_agents * sizeof(float));
 
     cudaMemset(vec->gpu_observations, 0, total_agents * OBS_SIZE * obs_elem_size);
-    cudaMemset(vec->gpu_actions, 0, total_agents * NUM_ATNS * sizeof(double));
+    cudaMemset(vec->gpu_actions, 0, total_agents * NUM_ATNS * sizeof(float));
     cudaMemset(vec->gpu_rewards, 0, total_agents * sizeof(float));
     cudaMemset(vec->gpu_terminals, 0, total_agents * sizeof(float));
 
@@ -483,7 +483,7 @@ void static_vec_close(StaticVec* vec) {
 
     cudaDeviceSynchronize();
     size_t obs_bytes = vec->total_agents * OBS_SIZE * obs_element_size();
-    size_t act_bytes = vec->total_agents * NUM_ATNS * sizeof(double);
+    size_t act_bytes = vec->total_agents * NUM_ATNS * sizeof(float);
     size_t rew_bytes = vec->total_agents * sizeof(float);
     size_t term_bytes = vec->total_agents * sizeof(float);
     cudaFree(vec->gpu_observations);
@@ -578,7 +578,7 @@ size_t get_obs_elem_size(void) { return obs_element_size(); }
 void static_vec_step(StaticVec* vec) {
     // D2H: copy GPU actions to CPU pinned memory so envs can read them
     cudaMemcpy(vec->actions, vec->gpu_actions,
-        (size_t)vec->total_agents * NUM_ATNS * sizeof(double),
+        (size_t)vec->total_agents * NUM_ATNS * sizeof(float),
         cudaMemcpyDeviceToHost);
 
     memset(vec->rewards, 0, vec->total_agents * sizeof(float));