#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <nccl.h>
#include <nvml.h>
#include <nvtx3/nvToolsExt.h>

#include <cstdio>
#include <cstdlib>

#include "models.cu"
#include "muon.cu"
@@ -166,32 +166,13 @@ void register_train_buffers(TrainGraph& bufs, Allocator* alloc, int S, int H, in
166166 alloc_register (alloc, &bufs.mb_newvalue );
167167}
168168
// Minimal CUDA graph wrapper using raw APIs (no torch dependency).
// Usage: capture_begin(s); <launch work on s>; capture_end(s); then replay(s)
// any number of times; reset() releases both handles.
struct RawCudaGraph {
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t exec = nullptr;

    // Put `stream` into capture mode; subsequent launches on it are recorded
    // instead of executed.
    void capture_begin(cudaStream_t stream) {
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    }

    // Stop recording on `stream` and instantiate the captured topology as an
    // executable graph.
    void capture_end(cudaStream_t stream) {
        cudaStreamEndCapture(stream, &graph);
        cudaGraphInstantiate(&exec, graph, 0);
    }

    // Launch the previously instantiated graph on `stream`.
    void replay(cudaStream_t stream) {
        cudaGraphLaunch(exec, stream);
    }

    // Destroy whichever handles are live and null them out; calling reset()
    // twice in a row is a no-op the second time.
    void reset() {
        if (graph != nullptr) {
            cudaGraphDestroy(graph);
            graph = nullptr;
        }
        if (exec != nullptr) {
            cudaGraphExecDestroy(exec);
            exec = nullptr;
        }
    }
};
// CUDA graph helpers
//
// Finish a stream capture begun with cudaStreamBeginCapture: materialize the
// work recorded on `stream` as an executable graph in *exec, then free the
// intermediate cudaGraph_t (the exec handle is all that is needed to replay
// via cudaGraphLaunch).
//
// exec   - out parameter; receives the instantiated executable graph.
// stream - the stream currently in capture mode.
//
// Aborts with a message on any CUDA error: both calls return cudaError_t, and
// ignoring a failed capture would otherwise surface later as a confusing
// sticky error at the next, unrelated launch site.
inline void cudagraph_capture_end(cudaGraphExec_t* exec, cudaStream_t stream) {
    cudaGraph_t graph = nullptr;
    cudaError_t err = cudaStreamEndCapture(stream, &graph);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaStreamEndCapture failed: %s\n",
                cudaGetErrorString(err));
        abort();
    }
    err = cudaGraphInstantiate(exec, graph, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGraphInstantiate failed: %s\n",
                cudaGetErrorString(err));
        abort();
    }
    // The executable graph owns its own copy of the topology; the capture
    // graph can be released immediately.
    cudaGraphDestroy(graph);
}
195176
196177// Slice: select dim0 index t, then narrow dim0 from start for count.
197178// 3D (H, S, F) -> (count, F); 2D (H, S) -> (count,)
@@ -335,8 +316,8 @@ typedef struct {
335316 EnvBuf env;
336317 TrainGraph train_buf;
337318 FloatTensor advantages_puf; // Pre-allocated for train_impl (S, H) f32
338- RawCudaGraph * fused_rollout_cudagraphs; // [horizon][num_buffers]
339- RawCudaGraph train_cudagraph;
319+ cudaGraphExec_t * fused_rollout_cudagraphs; // [horizon][num_buffers]
320+ cudaGraphExec_t train_cudagraph;
340321 cudaStream_t* streams; // per-buffer raw CUDA streams
341322 cudaStream_t default_stream; // main-thread stream (captured once at init)
342323 IntTensor act_sizes_puf; // CUDA int32 tensor of action head sizes
@@ -562,7 +543,7 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) {
562543
563544 cudaStream_t current_stream = tl_stream;
564545 if (pufferl->rollout_captured ) {
565- pufferl->fused_rollout_cudagraphs [graph]. replay ( current_stream);
546+ cudaGraphLaunch ( pufferl->fused_rollout_cudagraphs [graph], current_stream);
566547 profile_end (hypers.profile );
567548 return ;
568549 }
@@ -572,7 +553,7 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) {
572553 if (capturing) {
573554 cudaStreamCreate (&cap_stream_raw);
574555 current_stream = cap_stream_raw;
575- pufferl-> fused_rollout_cudagraphs [graph]. capture_begin (cap_stream_raw);
556+ cudaStreamBeginCapture (cap_stream_raw, cudaStreamCaptureModeGlobal );
576557 }
577558
578559 RolloutBuf& rollouts = pufferl->rollouts ;
@@ -628,7 +609,7 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) {
628609 act_slice.data , numel (act_slice.shape ) * sizeof (double ), cudaMemcpyDeviceToDevice, stream);
629610
630611 if (capturing) {
631- pufferl->fused_rollout_cudagraphs [graph]. capture_end ( cap_stream_raw);
612+ cudagraph_capture_end (& pufferl->fused_rollout_cudagraphs [graph], cap_stream_raw);
632613 cudaStreamSynchronize (cap_stream_raw);
633614 cudaDeviceSynchronize ();
634615 cudaStreamDestroy (cap_stream_raw);
@@ -1460,13 +1441,13 @@ void train_impl(PuffeRL& pufferl) {
14601441
14611442 cudaEventRecord (pufferl.profile .events [3 ]); // end misc / start forward
14621443 if (pufferl.train_captured ) {
1463- pufferl.train_cudagraph . replay ( train_stream);
1444+ cudaGraphLaunch ( pufferl.train_cudagraph , train_stream);
14641445 } else {
14651446 bool capturing = pufferl.train_warmup == hypers.cudagraphs ;
14661447 cudaStream_t cap_stream_raw = train_stream;
14671448 if (capturing) {
14681449 cudaStreamCreate (&cap_stream_raw);
1469- pufferl. train_cudagraph . capture_begin (cap_stream_raw);
1450+ cudaStreamBeginCapture (cap_stream_raw, cudaStreamCaptureModeGlobal );
14701451 }
14711452
14721453 cudaStream_t stream = cap_stream_raw;
@@ -1499,9 +1480,8 @@ void train_impl(PuffeRL& pufferl) {
14991480 cast_kernel<<<grid_size(n), BLOCK_SIZE, 0 , stream>>> (
15001481 pufferl.param_puf .data , pufferl.master_weights .data , n);
15011482 }
1502-
15031483 if (capturing) {
1504- pufferl.train_cudagraph . capture_end ( cap_stream_raw);
1484+ cudagraph_capture_end (& pufferl.train_cudagraph , cap_stream_raw);
15051485 cudaStreamSynchronize (cap_stream_raw);
15061486 cudaDeviceSynchronize ();
15071487 cudaStreamDestroy (cap_stream_raw);
@@ -1636,6 +1616,7 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
16361616 .reg_rollout = encoder_reg_rollout,
16371617 .create_weights = encoder_create_weights,
16381618 .free_weights = encoder_free_weights,
1619+ .free_activations = encoder_free_activations,
16391620 .in_dim = input_size, .out_dim = hidden_size,
16401621 };
16411622 create_custom_encoder (env_name, &encoder);
@@ -1648,6 +1629,7 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
16481629 .reg_rollout = decoder_reg_rollout,
16491630 .create_weights = decoder_create_weights,
16501631 .free_weights = decoder_free_weights,
1632+ .free_activations = decoder_free_activations,
16511633 .hidden_dim = hidden_size, .output_dim = decoder_output_size, .continuous = is_continuous,
16521634 };
16531635 Network network = {
@@ -1660,6 +1642,7 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
16601642 .reg_rollout = mingru_reg_rollout,
16611643 .create_weights = mingru_create_weights,
16621644 .free_weights = mingru_free_weights,
1645+ .free_activations = mingru_free_activations,
16631646 .hidden = hidden_size, .num_layers = num_layers, .horizon = hypers.horizon ,
16641647 };
16651648 pufferl->policy = Policy{
@@ -1744,7 +1727,7 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
17441727 muon_post_create (&pufferl->muon );
17451728
17461729 if (hypers.cudagraphs >= 0 ) {
1747- pufferl->fused_rollout_cudagraphs = (RawCudaGraph *)calloc (horizon*num_buffers, sizeof (RawCudaGraph ));
1730+ pufferl->fused_rollout_cudagraphs = (cudaGraphExec_t *)calloc (horizon*num_buffers, sizeof (cudaGraphExec_t ));
17481731 pufferl->train_warmup = 0 ;
17491732
17501733 // Snapshot weights + optimizer state before init-time capture
@@ -1831,15 +1814,15 @@ void close_impl(PuffeRL& pufferl) {
18311814 cudaProfilerStop ();
18321815 }
18331816
1834- pufferl.train_cudagraph . reset ( );
1817+ cudaGraphExecDestroy ( pufferl.train_cudagraph );
18351818 for (int i = 0 ; i < pufferl.hypers .horizon * pufferl.hypers .num_buffers ; i++) {
1836- pufferl.fused_rollout_cudagraphs [i]. reset ( );
1819+ cudaGraphExecDestroy ( pufferl.fused_rollout_cudagraphs [i]);
18371820 }
18381821
18391822 policy_weights_free (&pufferl.policy , &pufferl.weights );
1840- policy_activations_free (pufferl.train_activations );
1823+ policy_activations_free (&pufferl. policy , pufferl.train_activations );
18411824 for (int buf = 0 ; buf < pufferl.hypers .num_buffers ; buf++) {
1842- policy_activations_free (pufferl.buffer_activations [buf]);
1825+ policy_activations_free (&pufferl. policy , pufferl.buffer_activations [buf]);
18431826 }
18441827
18451828 if (USE_BF16) {
0 commit comments