@@ -100,11 +100,27 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
100100 : '0 ;
101101
102102 // ════════════════════════════════════════════════════════════
103- // Sub-B: Fill Buffer — 2 -state FSM with per-CL drain
103+ // Sub-B: Fill Buffer — 3-state FSM with per-CL drain
104104 // ════════════════════════════════════════════════════════════
105- // ACCEPT: take CL input, always transition to DRAIN (every CL drains completely).
105+ // ACCEPT: capture compressed_data and metadata into _r staging regs,
106+ // transition to LOAD. Does NOT write fb_data_r (the wide
107+ // barrel-shifted load happens in LOAD, one cycle later).
108+ // LOAD: shift the registered compressed_data into fb_data_r at
109+ // byte_offset position, copy level/byte_offset/start_word/
110+ // is_final from staging regs to public regs, and seed word_addr
111+ // from the staged start_word. Unconditional single-cycle pass-through to DRAIN.
106112 // DRAIN: emit SMEM words. On the LAST drain cycle (drain_will_empty),
107- // simultaneously accept next CL if available (zero bubble).
113+ // simultaneously capture the next CL (if one is available) into
114+ // the staging regs and transition to LOAD (one cycle of pipeline
115+ // latency between CLs instead of the old zero-bubble overlap).
116+ //
117+ // Why the LOAD split exists (Fix #7, 2026-04-09):
118+ // The previous 2-state FSM combined (compressed_data >> shift_amount)
119+ // AND (shifted_data << byte_offset) AND the fb_data_r load AND the
120+ // upstream rc2cs_buf output register into a single cycle. The
121+ // resulting 10-level comb cone on ~600 bits was the FPGA critical
122+ // path at 300 MHz (~3.29 ns, 73% route). Splitting the load into
123+ // its own FB_LOAD state cuts that cone in half.
108124 //
109125 // Per-CL address decomposition:
110126 // cl_byte_offset = cl_in_smem_byte_addr[SMEM_OFF_W-1:0]
@@ -115,10 +131,11 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
115131 // smem_out_data = fb_data_r[SMEM_WORD_SIZE-1:0]
116132 // This prevents valid/ready combinational deadlock.
117133
118- localparam FB_ACCEPT = 1'b0 ;
119- localparam FB_DRAIN = 1'b1 ;
134+ localparam FB_ACCEPT = 2'd0 ;
135+ localparam FB_LOAD = 2'd1 ;
136+ localparam FB_DRAIN = 2'd2 ;
120137
121- reg fb_state_r;
138+ reg [ 1 : 0 ] fb_state_r;
122139 reg [FILL_CAP * 8 - 1 : 0 ] fb_data_r /* verilator split_var*/ ;
123140 reg [FILL_W - 1 : 0 ] fb_level_r;
124141 reg fb_is_final_cl_r; // true if the last CL in the transfer
@@ -128,6 +145,17 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
128145 reg [SMEM_OFF_W - 1 : 0 ] fb_byte_offset_r;
129146 reg [SMEM_ADDR_WIDTH - 1 : 0 ] fb_start_word_r;
130147
148+ // Staging registers, captured in FB_ACCEPT (and in the overlap branch
149+ // of FB_DRAIN) and consumed in FB_LOAD to drive fb_data_r and the
150+ // public fb_* metadata regs. fb_compressed_data_r holds the
151+ // post-barrel-shift-right data one cycle before fb_data_r receives
152+ // the `<< byte_offset` shifted version.
153+ reg [CL_SIZE * 8 - 1 : 0 ] fb_compressed_data_r /* verilator split_var*/ ;
154+ reg [FILL_W - 1 : 0 ] fb_load_level_r;
155+ reg fb_load_is_final_cl_r;
156+ reg [SMEM_OFF_W - 1 : 0 ] fb_load_byte_offset_r;
157+ reg [SMEM_ADDR_WIDTH - 1 : 0 ] fb_load_start_word_r;
158+
131159 // ── Output (register-driven, no combinational dependency) ──
132160 wire has_full_word = (fb_level_r >= FILL_W ' (SMEM_WORD_SIZE ));
133161 // Per-CL drain: every CL must drain completely, including partial last word.
@@ -172,8 +200,28 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
172200 // ── New level calculation ──
173201 wire [FILL_W - 1 : 0 ] new_level_accept = FILL_W ' (cl_byte_offset) + FILL_W ' (valid_count);
174202
175- // Bit offset for appending compressed data to fill buffer at byte_offset position.
176- wire [FILL_W + 2 : 0 ] fb_accept_bit_offset = { FILL_W ' (cl_byte_offset), 3'b000 } ;
203+ // Bit offset for appending compressed data to fill buffer at byte_offset
204+ // position. With the LOAD split (Fix #7), the wide `<< byte_offset` shift
205+ // runs from REGISTERED fb_load_byte_offset_r in the FB_LOAD state, not
206+ // from the live combinational cl_byte_offset. Using the registered
207+ // version is the whole point of the register cut.
208+ wire [FILL_W + 2 : 0 ] fb_load_bit_offset = { FILL_W ' (fb_load_byte_offset_r), 3'b000 } ;
209+
210+ // Always-clock fb_data_r through an explicit next-state mux so Vivado
211+ // does not materialize a high-fanout CE from the FB_LOAD / FB_DRAIN
212+ // decode onto the entire ~700-bit bank.
213+ wire fb_data_load_fire = (fb_state_r == FB_LOAD );
214+ wire fb_data_clear_fire = (fb_state_r == FB_DRAIN ) && drain_fire
215+ && drain_will_empty && ! cl_in_fire;
216+ wire fb_data_shift_fire = (fb_state_r == FB_DRAIN ) && drain_fire
217+ && ! drain_will_empty;
218+ wire [FILL_CAP * 8 - 1 : 0 ] fb_data_load_next =
219+ (FILL_CAP * 8 )'(fb_compressed_data_r) << fb_load_bit_offset;
220+ wire [FILL_CAP * 8 - 1 : 0 ] fb_data_shift_next = fb_data_r >> (SMEM_WORD_SIZE * 8 );
221+ wire [FILL_CAP * 8 - 1 : 0 ] fb_data_next = fb_data_load_fire ? fb_data_load_next
222+ : fb_data_clear_fire ? '0
223+ : fb_data_shift_fire ? fb_data_shift_next
224+ : fb_data_r;
177225
178226 // ── Registered state update ──
179227 always @ (posedge clk) begin
@@ -185,42 +233,60 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
185233 fb_word_addr_r <= '0 ;
186234 fb_byte_offset_r <= '0 ;
187235 fb_start_word_r <= '0 ;
236+ fb_compressed_data_r <= '0 ;
237+ fb_load_level_r <= '0 ;
238+ fb_load_is_final_cl_r <= 1'b0 ;
239+ fb_load_byte_offset_r <= '0 ;
240+ fb_load_start_word_r <= '0 ;
188241 end else begin
242+ fb_data_r <= fb_data_next;
189243 case (fb_state_r)
190244 FB_ACCEPT : begin
191245 if (cl_in_fire) begin
192- // Load compressed data at byte_offset position within the fill buffer.
193- fb_data_r <= (FILL_CAP * 8 )'(compressed_data) << fb_accept_bit_offset;
194- fb_level_r <= new_level_accept;
195- fb_is_final_cl_r <= cl_in_last;
196- fb_word_addr_r <= cl_start_word;
197- fb_byte_offset_r <= cl_byte_offset;
198- fb_start_word_r <= cl_start_word;
199- // Always transition to DRAIN (every CL drains completely).
200- fb_state_r <= FB_DRAIN ;
246+ // Stage compressed_data and metadata for the next-cycle LOAD.
247+ // Does NOT touch fb_data_r here — that happens in FB_LOAD.
248+ fb_compressed_data_r <= compressed_data;
249+ fb_load_level_r <= new_level_accept;
250+ fb_load_is_final_cl_r <= cl_in_last;
251+ fb_load_byte_offset_r <= cl_byte_offset;
252+ fb_load_start_word_r <= cl_start_word;
253+ fb_state_r <= FB_LOAD ;
201254 end
202255 end
256+ FB_LOAD : begin
257+ // One-cycle register boundary between barrel-compress and
258+ // the wide `<< byte_offset` load. Transfers staging regs
259+ // into the public fb_* regs and transitions to DRAIN.
260+ fb_level_r <= fb_load_level_r;
261+ fb_is_final_cl_r <= fb_load_is_final_cl_r;
262+ fb_word_addr_r <= fb_load_start_word_r;
263+ fb_byte_offset_r <= fb_load_byte_offset_r;
264+ fb_start_word_r <= fb_load_start_word_r;
265+ fb_state_r <= FB_DRAIN ;
266+ end
203267 FB_DRAIN : begin
204268 if (drain_fire) begin
205269 if (drain_will_empty && cl_in_fire) begin
206- // OVERLAP: drain last word of current CL AND accept new CL.
207- fb_data_r <= (FILL_CAP * 8 )'(compressed_data) << ({ FILL_W ' (cl_byte_offset), 3'b000 } );
208- fb_level_r <= FILL_W ' (cl_byte_offset) + FILL_W ' (valid_count);
209- fb_is_final_cl_r <= cl_in_last;
210- fb_word_addr_r <= cl_start_word;
211- fb_byte_offset_r <= cl_byte_offset;
212- fb_start_word_r <= cl_start_word;
213- // Always drain the new CL too.
214- fb_state_r <= FB_DRAIN ;
270+ // OVERLAP: drain last word AND accept new CL.
271+ // With the LOAD split, overlap now parks the new
272+ // CL in staging regs and transitions to FB_LOAD
273+ // so the wide shift runs in a clean next cycle.
274+ // Old behavior (single-cycle drain+load) is the
275+ // exact comb cone that capped Fmax, so keeping
276+ // the split here is important.
277+ fb_compressed_data_r <= compressed_data;
278+ fb_load_level_r <= FILL_W ' (cl_byte_offset) + FILL_W ' (valid_count);
279+ fb_load_is_final_cl_r <= cl_in_last;
280+ fb_load_byte_offset_r <= cl_byte_offset;
281+ fb_load_start_word_r <= cl_start_word;
282+ fb_state_r <= FB_LOAD ;
215283 end else if (drain_will_empty) begin
216284 // Last drain word, no new CL available.
217285 fb_state_r <= FB_ACCEPT ;
218- fb_data_r <= '0 ;
219286 fb_level_r <= '0 ;
220287 fb_is_final_cl_r <= 1'b0 ;
221288 end else begin
222289 // More words to drain from current CL.
223- fb_data_r <= fb_data_r >> (SMEM_WORD_SIZE * 8 );
224290 fb_level_r <= fb_level_r - FILL_W ' (SMEM_WORD_SIZE );
225291 fb_word_addr_r <= fb_word_addr_r + SMEM_ADDR_WIDTH ' (1 );
226292 end
0 commit comments