Skip to content

Commit 1ddab2a

Browse files
committed
fix dxa timing and wg/sp+dxa tests
1 parent 248cc5f commit 1ddab2a

9 files changed

Lines changed: 439 additions & 112 deletions

File tree

hw/rtl/dxa/VX_dxa_cl2smem.sv

Lines changed: 94 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,27 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
100100
: '0;
101101

102102
// ════════════════════════════════════════════════════════════
103-
// Sub-B: Fill Buffer — 2-state FSM with per-CL drain
103+
// Sub-B: Fill Buffer — 3-state FSM with per-CL drain
104104
// ════════════════════════════════════════════════════════════
105-
// ACCEPT: take CL input, always transition to DRAIN (every CL drains completely).
105+
// ACCEPT: capture compressed_data and metadata into the staging regs (fb_compressed_data_r, fb_load_*_r),
106+
// transition to LOAD. Does NOT write fb_data_r (the wide
107+
// barrel-shifted load happens in LOAD, one cycle later).
108+
// LOAD: shift the registered compressed_data into fb_data_r at
109+
// byte_offset position, transfer level/word_addr/byte_offset/
110+
// start_word/is_final from staging regs to public regs (word_addr is seeded from the staged start_word).
111+
// Unconditional single-cycle pass-through to DRAIN.
106112
// DRAIN: emit SMEM words. On the LAST drain cycle (drain_will_empty),
107-
// simultaneously accept next CL if available (zero bubble).
113+
// simultaneously accept next CL if available into staging regs
114+
// and transition to LOAD (one cycle of pipeline latency
115+
// between CLs instead of the old zero-bubble overlap).
116+
//
117+
// Why the LOAD split exists (Fix #7, 2026-04-09):
118+
// The previous 2-state FSM combined (compressed_data >> shift_amount)
119+
// AND (shifted_data << byte_offset) AND the fb_data_r load AND the
120+
// upstream rc2cs_buf output register into a single cycle. The
121+
// resulting 10-level comb cone on ~600 bits was the FPGA critical
122+
// path at 300 MHz (~3.29 ns, 73% route). Splitting the load into
123+
// its own FB_LOAD state cuts that cone in half.
108124
//
109125
// Per-CL address decomposition:
110126
// cl_byte_offset = cl_in_smem_byte_addr[SMEM_OFF_W-1:0]
@@ -115,10 +131,11 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
115131
// smem_out_data = fb_data_r[SMEM_WORD_SIZE-1:0]
116132
// This prevents valid/ready combinational deadlock.
117133

118-
localparam FB_ACCEPT = 1'b0;
119-
localparam FB_DRAIN = 1'b1;
134+
localparam FB_ACCEPT = 2'd0;
135+
localparam FB_LOAD = 2'd1;
136+
localparam FB_DRAIN = 2'd2;
120137

121-
reg fb_state_r;
138+
reg [1:0] fb_state_r;
122139
reg [FILL_CAP*8-1:0] fb_data_r /*verilator split_var*/;
123140
reg [FILL_W-1:0] fb_level_r;
124141
reg fb_is_final_cl_r; // true if the last CL in the transfer
@@ -128,6 +145,17 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
128145
reg [SMEM_OFF_W-1:0] fb_byte_offset_r;
129146
reg [SMEM_ADDR_WIDTH-1:0] fb_start_word_r;
130147

148+
// Staging registers captured in FB_ACCEPT (and overlap branch of
149+
// FB_DRAIN) and consumed by FB_LOAD to drive fb_data_r. The
150+
// fb_compressed_data_r register stores the post-barrel-shift-right
151+
// data one cycle before fb_data_r receives the
152+
// `<< byte_offset` shifted version.
153+
reg [CL_SIZE*8-1:0] fb_compressed_data_r /*verilator split_var*/;
154+
reg [FILL_W-1:0] fb_load_level_r;
155+
reg fb_load_is_final_cl_r;
156+
reg [SMEM_OFF_W-1:0] fb_load_byte_offset_r;
157+
reg [SMEM_ADDR_WIDTH-1:0] fb_load_start_word_r;
158+
131159
// ── Output (register-driven, no combinational dependency) ──
132160
wire has_full_word = (fb_level_r >= FILL_W'(SMEM_WORD_SIZE));
133161
// Per-CL drain: every CL must drain completely, including partial last word.
@@ -172,8 +200,28 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
172200
// ── New level calculation ──
173201
wire [FILL_W-1:0] new_level_accept = FILL_W'(cl_byte_offset) + FILL_W'(valid_count);
174202

175-
// Bit offset for appending compressed data to fill buffer at byte_offset position.
176-
wire [FILL_W+2:0] fb_accept_bit_offset = {FILL_W'(cl_byte_offset), 3'b000};
203+
// Bit offset for appending compressed data to fill buffer at byte_offset
204+
// position. With the LOAD split (Fix #7), the wide `<< byte_offset` shift
205+
// runs from REGISTERED fb_load_byte_offset_r in the FB_LOAD state, not
206+
// from the live combinational cl_byte_offset. Using the registered
207+
// version is the whole point of the register cut.
208+
wire [FILL_W+2:0] fb_load_bit_offset = {FILL_W'(fb_load_byte_offset_r), 3'b000};
209+
210+
// Always-clock fb_data_r through an explicit next-state mux so Vivado
211+
// does not materialize a high-fanout CE from the FB_LOAD / FB_DRAIN
212+
// decode onto the entire ~700-bit bank.
213+
wire fb_data_load_fire = (fb_state_r == FB_LOAD);
214+
wire fb_data_clear_fire = (fb_state_r == FB_DRAIN) && drain_fire
215+
&& drain_will_empty && !cl_in_fire;
216+
wire fb_data_shift_fire = (fb_state_r == FB_DRAIN) && drain_fire
217+
&& !drain_will_empty;
218+
wire [FILL_CAP*8-1:0] fb_data_load_next =
219+
(FILL_CAP*8)'(fb_compressed_data_r) << fb_load_bit_offset;
220+
wire [FILL_CAP*8-1:0] fb_data_shift_next = fb_data_r >> (SMEM_WORD_SIZE * 8);
221+
wire [FILL_CAP*8-1:0] fb_data_next = fb_data_load_fire ? fb_data_load_next
222+
: fb_data_clear_fire ? '0
223+
: fb_data_shift_fire ? fb_data_shift_next
224+
: fb_data_r;
177225

178226
// ── Registered state update ──
179227
always @(posedge clk) begin
@@ -185,42 +233,60 @@ module VX_dxa_cl2smem import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
185233
fb_word_addr_r <= '0;
186234
fb_byte_offset_r <= '0;
187235
fb_start_word_r <= '0;
236+
fb_compressed_data_r <= '0;
237+
fb_load_level_r <= '0;
238+
fb_load_is_final_cl_r <= 1'b0;
239+
fb_load_byte_offset_r <= '0;
240+
fb_load_start_word_r <= '0;
188241
end else begin
242+
fb_data_r <= fb_data_next;
189243
case (fb_state_r)
190244
FB_ACCEPT: begin
191245
if (cl_in_fire) begin
192-
// Load compressed data at byte_offset position within the fill buffer.
193-
fb_data_r <= (FILL_CAP*8)'(compressed_data) << fb_accept_bit_offset;
194-
fb_level_r <= new_level_accept;
195-
fb_is_final_cl_r <= cl_in_last;
196-
fb_word_addr_r <= cl_start_word;
197-
fb_byte_offset_r <= cl_byte_offset;
198-
fb_start_word_r <= cl_start_word;
199-
// Always transition to DRAIN (every CL drains completely).
200-
fb_state_r <= FB_DRAIN;
246+
// Stage compressed_data and metadata for the next-cycle LOAD.
247+
// Does NOT touch fb_data_r here — that happens in FB_LOAD.
248+
fb_compressed_data_r <= compressed_data;
249+
fb_load_level_r <= new_level_accept;
250+
fb_load_is_final_cl_r <= cl_in_last;
251+
fb_load_byte_offset_r <= cl_byte_offset;
252+
fb_load_start_word_r <= cl_start_word;
253+
fb_state_r <= FB_LOAD;
201254
end
202255
end
256+
FB_LOAD: begin
257+
// One-cycle register boundary between barrel-compress and
258+
// the wide `<< byte_offset` load. Transfers staging regs
259+
// into the public fb_* regs and transitions to DRAIN.
260+
fb_level_r <= fb_load_level_r;
261+
fb_is_final_cl_r <= fb_load_is_final_cl_r;
262+
fb_word_addr_r <= fb_load_start_word_r;
263+
fb_byte_offset_r <= fb_load_byte_offset_r;
264+
fb_start_word_r <= fb_load_start_word_r;
265+
fb_state_r <= FB_DRAIN;
266+
end
203267
FB_DRAIN: begin
204268
if (drain_fire) begin
205269
if (drain_will_empty && cl_in_fire) begin
206-
// OVERLAP: drain last word of current CL AND accept new CL.
207-
fb_data_r <= (FILL_CAP*8)'(compressed_data) << ({FILL_W'(cl_byte_offset), 3'b000});
208-
fb_level_r <= FILL_W'(cl_byte_offset) + FILL_W'(valid_count);
209-
fb_is_final_cl_r <= cl_in_last;
210-
fb_word_addr_r <= cl_start_word;
211-
fb_byte_offset_r <= cl_byte_offset;
212-
fb_start_word_r <= cl_start_word;
213-
// Always drain the new CL too.
214-
fb_state_r <= FB_DRAIN;
270+
// OVERLAP: drain last word AND accept new CL.
271+
// With the LOAD split, overlap now parks the new
272+
// CL in staging regs and transitions to FB_LOAD
273+
// so the wide shift runs in a clean next cycle.
274+
// Old behavior (single-cycle drain+load) is the
275+
// exact comb cone that capped Fmax, so keeping
276+
// the split here is important.
277+
fb_compressed_data_r <= compressed_data;
278+
fb_load_level_r <= FILL_W'(cl_byte_offset) + FILL_W'(valid_count);
279+
fb_load_is_final_cl_r <= cl_in_last;
280+
fb_load_byte_offset_r <= cl_byte_offset;
281+
fb_load_start_word_r <= cl_start_word;
282+
fb_state_r <= FB_LOAD;
215283
end else if (drain_will_empty) begin
216284
// Last drain word, no new CL available.
217285
fb_state_r <= FB_ACCEPT;
218-
fb_data_r <= '0;
219286
fb_level_r <= '0;
220287
fb_is_final_cl_r <= 1'b0;
221288
end else begin
222289
// More words to drain from current CL.
223-
fb_data_r <= fb_data_r >> (SMEM_WORD_SIZE * 8);
224290
fb_level_r <= fb_level_r - FILL_W'(SMEM_WORD_SIZE);
225291
fb_word_addr_r <= fb_word_addr_r + SMEM_ADDR_WIDTH'(1);
226292
end

hw/rtl/dxa/VX_dxa_rd_ctrl.sv

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,27 @@ module VX_dxa_rd_ctrl import VX_gpu_pkg::*, VX_dxa_pkg::*; #(
9595
wire [RSP_FIFO_SIZEW-1:0] rsp_fifo_size;
9696
wire rsp_fifo_alm_empty, rsp_fifo_alm_full;
9797

98+
// rsp_fifo: wide (~530-bit) GMEM response payload. Pre-fix config was
99+
// `OUT_REG=0, LUTRAM=1`, which placed 512-bit cl_data in distributed
100+
// RAM with an async read path — the dominant LUT consumer inside
101+
// rd_ctrl on U55C (~14K LUTs). Two orthogonal fixes applied:
102+
// 1. OUT_REG=1 — add an output register stage so downstream
103+
// cl2smem/wr_ctrl consumers see a register boundary, not a
104+
// 530-bit comb mux from LUTRAM. Show-ahead FIFO semantics are
105+
// preserved by VX_fifo_queue's empty->non-empty bypass path.
106+
// 2. LUTRAM=0 — move the underlying DP-RAM storage to BRAM so the
107+
// wide payload lives in block RAM (a few BRAM18/36 tiles) rather
108+
// than burning ~14K LUTs on the wide LUTRAM mux tree. BRAM with
109+
// OUT_REG=1 matches the existing VX_fifo_queue drain timing (1
110+
// cycle data_out_r → downstream).
111+
// Total cost: 1 cycle of drain latency (negligible — DMA throughput
112+
// is bound by GMEM read latency, not FIFO drain), plus a few BRAM
113+
// tiles (plenty of headroom on U55C).
98114
VX_fifo_queue #(
99115
.DATAW (RSP_FIFO_DATAW),
100116
.DEPTH (RSP_FIFO_DEPTH),
101-
.OUT_REG (0),
102-
.LUTRAM (1)
117+
.OUT_REG (1),
118+
.LUTRAM (0)
103119
) rsp_fifo (
104120
.clk (clk),
105121
.reset (reset),

0 commit comments

Comments
 (0)