@@ -488,22 +488,32 @@ static void encoder_free_activations(void* activations) {
488488#include " ocean.cu"
489489
// Parameters of the decoder (output linear) layer.
//
// Shapes (see decoder_reg_params):
//   weight : {output_dim + 1, hidden_dim}  -- the extra row presumably feeds a
//            value head alongside the action logits (backward splits grads into
//            grad_logits / grad_value) -- confirm against assemble_decoder_grad.
//   bias   : {output_dim + 1}
//   logstd : {1, output_dim}, allocated only when `continuous` is set
//            (learned log-std for continuous action distributions).
struct DecoderWeights {
    PrecisionTensor weight, bias, logstd;
    int hidden_dim;    // width of the incoming hidden representation
    int output_dim;    // number of outputs before the +1 extra row
    bool continuous;   // true => logstd tensor is registered and trained
};
495495
// Per-pass activation and gradient scratch buffers for the decoder layer.
//
// Shapes (see decoder_reg_train): saved_input / grad_input are
// {B_TT, hidden_dim}; wgrad_scratch mirrors the weight ({output_dim + 1,
// hidden_dim}); bgrad_scratch mirrors the bias ({output_dim + 1});
// logstd_scratch is {1, output_dim} and only registered in continuous mode.
// out / grad_out hold B_TT * (output_dim + 1) elements (the forward pass
// bias-adds over exactly that many). saved_input may be left unallocated
// (null data) in inference-only configurations.
struct DecoderActivations {
    PrecisionTensor out, grad_out,
                    saved_input, grad_input,
                    wgrad_scratch, bgrad_scratch, logstd_scratch;
};
499499
// In-place broadcast bias add over a row-major matrix.
//
// `data` holds `total` elements laid out as consecutive rows of length `dim`;
// `bias` holds `dim` elements and is added element-wise to every row.
// Launch: 1-D grid with at least `total` threads; one element per thread.
__global__ void bias_add_kernel(precision_t* __restrict__ data,
                                const precision_t* __restrict__ bias,
                                int total, int dim) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < total) {
        // Accumulate in float: precision_t is converted through
        // to_float/from_float, so it may be a reduced-precision storage type.
        const float sum = to_float(data[i]) + to_float(bias[i % dim]);
        data[i] = from_float(sum);
    }
}
506+
// Decoder forward pass: matrix-multiplies `input` by the decoder weight
// (via puf_mm), then broadcast-adds the bias to every output row.
//
// When training buffers are registered (saved_input.data non-null), the
// input is cached for the weight-gradient matmul in the backward pass.
// All work is enqueued on `stream`; returns the output activation tensor.
static PrecisionTensor decoder_forward(void* w, void* activations,
                                       PrecisionTensor input,
                                       cudaStream_t stream) {
    DecoderWeights* dw = (DecoderWeights*)w;
    DecoderActivations* acts = (DecoderActivations*)activations;

    // Stash the input for backward; skipped in inference-only setups
    // where saved_input was never allocated.
    if (acts->saved_input.data) {
        puf_copy(&acts->saved_input, &input, stream);
    }

    puf_mm(&input, &dw->weight, &acts->out, stream);

    // Bias covers output_dim + 1 columns (logits plus the extra row the
    // weight carries); one thread per output element.
    const int rows = input.shape[0];
    const int cols = dw->output_dim + 1;
    bias_add_kernel<<<grid_size(rows * cols), BLOCK_SIZE, 0, stream>>>(
        acts->out.data, dw->bias.data, rows * cols, cols);

    return acts->out;
}
509519
@@ -514,12 +524,15 @@ static void decoder_init_weights(void* w, ulong* seed, cudaStream_t stream) {
514524 .shape = {dw->output_dim + 1 , dw->hidden_dim },
515525 };
516526 puf_kaiming_init (&wt, 0 .01f , (*seed)++, stream);
527+ cudaMemsetAsync (dw->bias .data , 0 , numel (dw->bias .shape ) * sizeof (precision_t ), stream);
517528}
518529
519530static void decoder_reg_params (void * w, Allocator* alloc) {
520531 DecoderWeights* dw = (DecoderWeights*)w;
521532 dw->weight = {.shape = {dw->output_dim + 1 , dw->hidden_dim }};
533+ dw->bias = {.shape = {dw->output_dim + 1 }};
522534 alloc_register (alloc,&dw->weight );
535+ alloc_register (alloc,&dw->bias );
523536 if (dw->continuous ) {
524537 dw->logstd = {.shape = {1 , dw->output_dim }};
525538 alloc_register (alloc,&dw->logstd );
@@ -536,13 +549,15 @@ static void decoder_reg_train(void* w, void* activations, Allocator* acts, Alloc
536549 .saved_input = {.shape = {B_TT, dw->hidden_dim }},
537550 .grad_input = {.shape = {B_TT, dw->hidden_dim }},
538551 .wgrad_scratch = {.shape = {od1, dw->hidden_dim }},
552+ .bgrad_scratch = {.shape = {od1}},
539553 .logstd_scratch = {.shape = {1 , dw->output_dim }},
540554 };
541555 alloc_register (acts,&a->out );
542556 alloc_register (acts,&a->saved_input );
543557 alloc_register (acts,&a->grad_out );
544558 alloc_register (acts,&a->grad_input );
545559 alloc_register (grads,&a->wgrad_scratch );
560+ alloc_register (grads,&a->bgrad_scratch );
546561 if (dw->continuous ) alloc_register (grads,&a->logstd_scratch );
547562}
548563
@@ -577,6 +592,8 @@ static PrecisionTensor decoder_backward(void* w, void* activations,
577592 assemble_decoder_grad_kernel<<<grid_size(B_TT * od1), BLOCK_SIZE, 0 , stream>>> (
578593 a->grad_out .data , grad_logits.data , grad_value.data , B_TT, od, od1);
579594 puf_mm_tn (&a->grad_out , &a->saved_input , &a->wgrad_scratch , stream);
595+ n3_bias_grad_kernel<<<od1, 256 , 0 , stream>>> (
596+ a->bgrad_scratch .data , a->grad_out .data , B_TT, od1);
580597 if (dw->continuous && grad_logstd.data != nullptr ) {
581598 sum_rows_to_precision_kernel<<<grid_size(dw->output_dim), BLOCK_SIZE, 0 , stream>>> (
582599 a->logstd_scratch .data , grad_logstd.data , B_TT, dw->output_dim );
0 commit comments