vortexgpgpu
diff --git a/hw/rtl/dxa/VX_dxa_cl2smem.sv b/hw/rtl/dxa/VX_dxa_cl2smem.sv
Lines changed: 94 additions & 28 deletions
diff --git a/hw/rtl/dxa/VX_dxa_rd_ctrl.sv b/hw/rtl/dxa/VX_dxa_rd_ctrl.sv
Lines changed: 18 additions & 2 deletions
@@ -100,11 +100,27 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
         : '0;
 
     // ════════════════════════════════════════════════════════════
-    // Sub-B: Fill Buffer — 2-state FSM with per-CL drain
+    // Sub-B: Fill Buffer — 3-state FSM with per-CL drain
     // ════════════════════════════════════════════════════════════
-    // ACCEPT: take CL input, always transition to DRAIN (every CL drains completely).
+    // ACCEPT: capture compressed_data and metadata into _r staging regs,
+    //         transition to LOAD. Does NOT write fb_data_r (the wide
+    //         barrel-shifted load happens in LOAD, one cycle later).
+    // LOAD:   shift the registered compressed_data into fb_data_r at
+    //         byte_offset position, transfer level/word_addr/byte_offset/
+    //         start_word/is_final from staging regs to public regs.
+    //         Unconditional single-cycle pass-through to DRAIN.
     // DRAIN:  emit SMEM words. On the LAST drain cycle (drain_will_empty),
-    //         simultaneously accept next CL if available (zero bubble).
+    //         simultaneously accept the next CL (if one is available) into
+    //         the staging regs and transition to LOAD. This costs one cycle
+    //         of latency between CLs; the old FSM overlapped with zero bubble.
+    //
+    // Why the LOAD split exists (Fix #7, 2026-04-09):
+    //   The previous 2-state FSM combined (compressed_data >> shift_amount)
+    //   AND (shifted_data << byte_offset) AND the fb_data_r load AND the
+    //   upstream rc2cs_buf output register into a single cycle. The
+    //   resulting 10-level comb cone on ~600 bits was the FPGA critical
+    //   path at 300 MHz (~3.29 ns, 73% route). Splitting the load into
+    //   its own FB_LOAD state cuts that cone in half.
     //
     // Per-CL address decomposition:
     //   cl_byte_offset = cl_in_smem_byte_addr[SMEM_OFF_W-1:0]
@@ -115,10 +131,11 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
     //   smem_out_data  = fb_data_r[SMEM_WORD_SIZE-1:0]
     // This prevents valid/ready combinational deadlock.
 
-    localparam FB_ACCEPT = 1'b0;
-    localparam FB_DRAIN  = 1'b1;
+    localparam FB_ACCEPT = 2'd0;
+    localparam FB_LOAD   = 2'd1;
+    localparam FB_DRAIN  = 2'd2;
 
-    reg                    fb_state_r;
+    reg [1:0]              fb_state_r;
     reg [FILL_CAP*8-1:0]  fb_data_r /*verilator split_var*/;
     reg [FILL_W-1:0]      fb_level_r;
     reg                    fb_is_final_cl_r;  // true if the last CL in the transfer
@@ -128,6 +145,17 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
     reg [SMEM_OFF_W-1:0]      fb_byte_offset_r;
     reg [SMEM_ADDR_WIDTH-1:0] fb_start_word_r;
 
+    // Staging registers, captured in FB_ACCEPT (and in the overlap branch
+    // of FB_DRAIN) and consumed one cycle later in FB_LOAD.
+    // fb_compressed_data_r holds the post-barrel-shift-right data; FB_LOAD
+    // applies `<< byte_offset` to it when loading fb_data_r and copies the
+    // fb_load_* metadata into the public fb_* regs.
+    reg [CL_SIZE*8-1:0]       fb_compressed_data_r /*verilator split_var*/;
+    reg [FILL_W-1:0]          fb_load_level_r;
+    reg                       fb_load_is_final_cl_r;
+    reg [SMEM_OFF_W-1:0]      fb_load_byte_offset_r;
+    reg [SMEM_ADDR_WIDTH-1:0] fb_load_start_word_r;
+
     // ── Output (register-driven, no combinational dependency) ──
     wire has_full_word = (fb_level_r >= FILL_W'(SMEM_WORD_SIZE));
     // Per-CL drain: every CL must drain completely, including partial last word.
@@ -172,8 +200,28 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
     // ── New level calculation ──
     wire [FILL_W-1:0] new_level_accept = FILL_W'(cl_byte_offset) + FILL_W'(valid_count);
 
-    // Bit offset for appending compressed data to fill buffer at byte_offset position.
-    wire [FILL_W+2:0] fb_accept_bit_offset = {FILL_W'(cl_byte_offset), 3'b000};
+    // Bit offset for appending compressed data to fill buffer at byte_offset
+    // position. With the LOAD split (Fix #7), the wide `<< byte_offset` shift
+    // runs from REGISTERED fb_load_byte_offset_r in the FB_LOAD state, not
+    // from the live combinational cl_byte_offset. Using the registered
+    // version is the whole point of the register cut.
+    wire [FILL_W+2:0] fb_load_bit_offset = {FILL_W'(fb_load_byte_offset_r), 3'b000};
+
+    // Always-clock fb_data_r through an explicit next-state mux so Vivado
+    // does not materialize a high-fanout CE from the FB_LOAD / FB_DRAIN
+    // decode onto the entire ~700-bit bank.
+    wire fb_data_load_fire  = (fb_state_r == FB_LOAD);
+    wire fb_data_clear_fire = (fb_state_r == FB_DRAIN) && drain_fire
+                            && drain_will_empty && !cl_in_fire;
+    wire fb_data_shift_fire = (fb_state_r == FB_DRAIN) && drain_fire
+                            && !drain_will_empty;
+    wire [FILL_CAP*8-1:0] fb_data_load_next =
+        (FILL_CAP*8)'(fb_compressed_data_r) << fb_load_bit_offset;
+    wire [FILL_CAP*8-1:0] fb_data_shift_next = fb_data_r >> (SMEM_WORD_SIZE * 8);
+    wire [FILL_CAP*8-1:0] fb_data_next = fb_data_load_fire  ? fb_data_load_next
+                                     : fb_data_clear_fire ? '0
+                                     : fb_data_shift_fire ? fb_data_shift_next
+                                     : fb_data_r;
 
     // ── Registered state update ──
     always @(posedge clk) begin
@@ -185,42 +233,60 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
             fb_word_addr_r    <= '0;
             fb_byte_offset_r  <= '0;
             fb_start_word_r   <= '0;
+            fb_compressed_data_r  <= '0;
+            fb_load_level_r       <= '0;
+            fb_load_is_final_cl_r <= 1'b0;
+            fb_load_byte_offset_r <= '0;
+            fb_load_start_word_r  <= '0;
         end else begin
+            fb_data_r <= fb_data_next;
             case (fb_state_r)
             FB_ACCEPT: begin
                 if (cl_in_fire) begin
-                    // Load compressed data at byte_offset position within the fill buffer.
-                    fb_data_r <= (FILL_CAP*8)'(compressed_data) << fb_accept_bit_offset;
-                    fb_level_r <= new_level_accept;
-                    fb_is_final_cl_r <= cl_in_last;
-                    fb_word_addr_r   <= cl_start_word;
-                    fb_byte_offset_r <= cl_byte_offset;
-                    fb_start_word_r  <= cl_start_word;
-                    // Always transition to DRAIN (every CL drains completely).
-                    fb_state_r <= FB_DRAIN;
+                    // Stage compressed_data and metadata for the next-cycle LOAD.
+                    // Does NOT touch fb_data_r here — that happens in FB_LOAD.
+                    fb_compressed_data_r  <= compressed_data;
+                    fb_load_level_r       <= new_level_accept;
+                    fb_load_is_final_cl_r <= cl_in_last;
+                    fb_load_byte_offset_r <= cl_byte_offset;
+                    fb_load_start_word_r  <= cl_start_word;
+                    fb_state_r            <= FB_LOAD;
                 end
             end
+            FB_LOAD: begin
+                // One-cycle register boundary between barrel-compress and
+                // the wide `<< byte_offset` load. Transfers staging regs
+                // into the public fb_* regs and transitions to DRAIN.
+                fb_level_r       <= fb_load_level_r;
+                fb_is_final_cl_r <= fb_load_is_final_cl_r;
+                fb_word_addr_r   <= fb_load_start_word_r;
+                fb_byte_offset_r <= fb_load_byte_offset_r;
+                fb_start_word_r  <= fb_load_start_word_r;
+                fb_state_r       <= FB_DRAIN;
+            end
             FB_DRAIN: begin
                 if (drain_fire) begin
                     if (drain_will_empty && cl_in_fire) begin
-                        // OVERLAP: drain last word of current CL AND accept new CL.
-                        fb_data_r  <= (FILL_CAP*8)'(compressed_data) << ({FILL_W'(cl_byte_offset), 3'b000});
-                        fb_level_r <= FILL_W'(cl_byte_offset) + FILL_W'(valid_count);
-                        fb_is_final_cl_r <= cl_in_last;
-                        fb_word_addr_r   <= cl_start_word;
-                        fb_byte_offset_r <= cl_byte_offset;
-                        fb_start_word_r  <= cl_start_word;
-                        // Always drain the new CL too.
-                        fb_state_r <= FB_DRAIN;
+                        // OVERLAP: drain last word AND accept new CL.
+                        // With the LOAD split, overlap now parks the new
+                        // CL in staging regs and transitions to FB_LOAD
+                        // so the wide shift runs in a clean next cycle.
+                        // Old behavior (single-cycle drain+load) is the
+                        // exact comb cone that capped Fmax, so keeping
+                        // the split here is important.
+                        fb_compressed_data_r  <= compressed_data;
+                        fb_load_level_r       <= FILL_W'(cl_byte_offset) + FILL_W'(valid_count);
+                        fb_load_is_final_cl_r <= cl_in_last;
+                        fb_load_byte_offset_r <= cl_byte_offset;
+                        fb_load_start_word_r  <= cl_start_word;
+                        fb_state_r <= FB_LOAD;
                     end else if (drain_will_empty) begin
                         // Last drain word, no new CL available.
                         fb_state_r <= FB_ACCEPT;
-                        fb_data_r  <= '0;
                         fb_level_r <= '0;
                         fb_is_final_cl_r <= 1'b0;
                     end else begin
                         // More words to drain from current CL.
-                        fb_data_r  <= fb_data_r >> (SMEM_WORD_SIZE * 8);
                         fb_level_r <= fb_level_r - FILL_W'(SMEM_WORD_SIZE);
                         fb_word_addr_r <= fb_word_addr_r + SMEM_ADDR_WIDTH'(1);
                     end
 
@@ -95,11 +95,27 @@ module VX_dxa_rd_ctrl import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
     wire [RSP_FIFO_SIZEW-1:0] rsp_fifo_size;
     wire rsp_fifo_alm_empty, rsp_fifo_alm_full;
 
+    // rsp_fifo: wide (~530-bit) GMEM response payload. Pre-fix config was
+    //   `OUT_REG=0, LUTRAM=1`, which placed 512-bit cl_data in distributed
+    //   RAM with an async read path — the dominant LUT consumer inside
+    //   rd_ctrl on U55C (~14K LUTs). Two orthogonal fixes applied:
+    //     1. OUT_REG=1 — add an output register stage so downstream
+    //        cl2smem/wr_ctrl consumers see a register boundary, not a
+    //        530-bit comb mux from LUTRAM. Show-ahead FIFO semantics are
+    //        preserved by VX_fifo_queue's empty->non-empty bypass path.
+    //     2. LUTRAM=0 — move the underlying DP-RAM storage to BRAM so the
+    //        wide payload lives in block RAM (a few BRAM18/36 tiles) rather
+    //        than burning ~14K LUTs on the wide LUTRAM mux tree. BRAM with
+    //        OUT_REG=1 matches the existing VX_fifo_queue drain timing (1
+    //        cycle data_out_r → downstream).
+    //   Total cost: 1 cycle of drain latency (negligible — DMA throughput
+    //   is bound by GMEM read latency, not FIFO drain), plus a few BRAM
+    //   tiles (plenty of headroom on U55C).
     VX_fifo_queue #(
         .DATAW   (RSP_FIFO_DATAW),
         .DEPTH   (RSP_FIFO_DEPTH),
-        .OUT_REG (0),
-        .LUTRAM  (1)
+        .OUT_REG (1),
+        .LUTRAM  (0)
     ) rsp_fifo (
         .clk      (clk),
         .reset    (reset),