From 384bcd407376cd88bed137108ab7a1723d3d06b7 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 13:19:12 -0400 Subject: [PATCH 01/65] draft: streaming-mode cpu reduction design spec --- ...26-05-15-streaming-cpu-reduction-design.md | 222 ++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md diff --git a/docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md b/docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md new file mode 100644 index 00000000..17285fc1 --- /dev/null +++ b/docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md @@ -0,0 +1,222 @@ +# Streaming-mode CPU Reduction — Design (Draft) + +Status: **DRAFT / work-in-progress brainstorm.** Not yet a committed spec. Captures +the running design discussion so far. + +Date: 2026-05-15 + +## Problem + +`dz_hl_publisher` in streaming mode (`--ingest-mode stream`) consumes excessive +CPU on the same host as the Hyperliquid validator. When the publisher loads the +CPU, the validator falls behind; when the validator falls behind, streaming +output becomes bursty / out-of-order, which forces the publisher into +grace-fallback finalization and increases per-chunk work — a negative feedback +loop. + +Block mode is stable in production and **must not regress** from any change +made here. + +## Profiling evidence + +`perf top` over a 30s sample of the publisher in streaming mode: + +| % CPU | Symbol | +|------:|--------| +| 27.82 | `server::order_book::levels::map_to_l2_levels` | +| 16.98 | `server::order_book::levels::build_l2_level` | +| 6.93 | `libm __log10_finite` | +| 4.04 | `floor` | +| 2.60 | `server::order_book::levels::l2_levels_to_l2_levels` | +| ~7 | realloc / `Vec` growth | + +≈65% of CPU is in L2 snapshot computation and the math it relies on. 
+ +## Where the cost comes from + +`compute_l2_snapshots` ([server/src/listeners/order_book/utils.rs:216](../../../server/src/listeners/order_book/utils.rs)) runs at the end of every +`process_data` chunk ([server/src/listeners/order_book/mod.rs:1857](../../../server/src/listeners/order_book/mod.rs)). Per call, **for every coin in +parallel via rayon** it produces **7 snapshot variants**: + +1. One unbucketed base snapshot via `to_l2_snapshot(None, None, None)`. This + walks the full `BTreeMap>` and `fold`s each linked list + **twice** per price level — once for size, once for count + ([server/src/order_book/levels.rs:96-97](../../../server/src/order_book/levels.rs)). +2. Six bucketed snapshots over the base, one per supported pairing of `n_sig_figs ∈ {5,4,3,2}` with + `mantissa ∈ {None, Some(2), Some(5)}` — six pairings in total, not the full 4 × 3 product ([utils.rs:223-244](../../../server/src/listeners/order_book/utils.rs)). + +Each bucketed level calls `bucket()` → `Px::num_digits()` which computes +`(value as f64).log10().floor() as u32 + 1` ([types.rs:130](../../../server/src/order_book/types.rs)). This explains +`__log10_finite` (6.93%) and `floor` (4.04%) in perf. + +Each Vec starts at capacity 0 ([levels.rs:63,85](../../../server/src/order_book/levels.rs)) and grows by doubling — the +~7% realloc. + +### Why this dominates in streaming, not in block + +- **Block mode:** `process_data` runs once per file rotation (one block). + Snapshot is computed once per block. +- **Streaming mode:** the validator appends continuously and with + `--disable-output-file-buffering` flushes per line. `notify` fires many times + per block, and every chunk ends with `l2_snapshots(true)`. The `snapped` gate + at [state.rs:94](../../../server/src/listeners/order_book/state.rs) only deduplicates per `state.time`, so any time `state.time` + advances mid-burst, the full 7-snapshot fan-out runs again across ~200 coins. + +That's also the feedback-loop mechanism: snapshot cost scales with chunk count, +not block count. 
Validator slowness → more / smaller chunks per block → more +snapshot computes → higher publisher CPU → less CPU for the validator. + +## Key deployment context + +- **No WebSocket subscribers in production.** All 6 bucketed variants exist + solely for WebSocket `l2book` subscribers requesting a specific aggregation. + TOB reads only BBO from the unbucketed snapshot. **The 6 bucketed variants + are pure waste in this deployment**, and they're where the `log10`/`floor` + time is going (bucketing is the only consumer of `num_digits()`). + +## Scope and goals + +**In scope:** CPU reduction in streaming mode (`--ingest-mode stream`), +measured as publisher CPU% on a representative validator-co-located host +under live load. + +**Out of scope:** memory reduction, latency improvements beyond what falls out +of CPU work, WebSocket server performance, DoB wire-protocol changes, +validator-side tuning. + +**Must not regress:** block-mode behavior, byte-for-byte. Block mode is the +production-stable baseline. + +### Success criteria + +- ≥40% reduction in publisher CPU% in streaming mode at steady state on the + canary box. +- `orderbook_tob_snapshot_compute` p50/p99 drop materially. +- `orderbook_tob_suppressed_total` does not regress. +- `block_mode_multicast_e2e` and + `dual_validator_fixture_matches_block_and_stream_goldens` both pass with no + golden updates required. + +## Block-mode safety net (already in place) + +- `server/src/listeners/order_book/block_mode_multicast_e2e.rs` replays real + Hyperliquid by-block fixtures and compares TOB/DoB output **byte-for-byte** + against golden packets. +- `dual_validator_fixture_matches_block_and_stream_goldens` asserts + block-vs-stream semantic parity. +- Any shared-code change that alters a quote will fail one of these tests + loudly. None can silently break block mode. 
+ +## Approach — focus on L2 snapshot cost + +Initial brainstorm covered JSON-parser swap, per-diff batch explosion, +`tokio::spawn`-per-diff, DoB tap mutex, etc. Those are real but each is <5% vs +the ~65% in snapshot computation. They're **followups, not headline work.** + +### Levers, ordered by risk-adjusted ROI + +#### L2-1: Replace `Px::num_digits()` log10/floor with `u64::ilog10()` + +- File: [server/src/order_book/types.rs:130](../../../server/src/order_book/types.rs) +- Change: `(value as f64).log10().floor() as u32 + 1` → `value.ilog10() + 1` + (with the `if value == 0 { 1 }` guard preserved). +- Affects: both modes (shared function). +- Risk: **none.** `u64::ilog10()` returns exactly the same digit count for any + `u64 > 0` as the f64 formula. Add a unit test asserting equivalence over a + bounded sweep. +- Expected win: removes the ~11% in `__log10_finite` + `floor`. + +#### L2-3: Drop the 6 bucketed snapshot variants + +The big one, enabled by "no WS subscribers." + +- File: [server/src/listeners/order_book/utils.rs:216](../../../server/src/listeners/order_book/utils.rs) +- **L2-3a:** `compute_l2_snapshots` returns only the unbucketed snapshot. The + bucketed views move to a lazy on-demand path: if a WS `l2book` subscriber + ever connects, derive the bucketed snapshot from the base via the existing + `Snapshot::to_l2_snapshot` snapshot-from-snapshot path. +- **L2-3b:** TOB consumes only BBO. Add a cheap `to_bbo_snapshot()` that + returns `(best_bid, best_ask, time)` per coin: O(log N) BTreeMap navigation + + one linked-list fold per side per coin. The full unbucketed snapshot + becomes optional, computed only when there's a WS consumer. +- Affects: both modes (shared snapshot path). +- Risk: low-medium. Behavior change is "compute fewer variants." Wire output + for TOB is unchanged (still BBO). 
WS lazy path needs to handle "snapshot + requested at this height after we skipped it" — easiest: keep a small + cache of the base snapshot at the current height, materialize bucketed + views on demand. +- Expected win: largest single reduction. Eliminates 6 of 7 variants and + removes the only consumer of `num_digits()`. + +#### L2-4: Pre-size output Vecs + +- Files: [server/src/order_book/levels.rs:63,85](../../../server/src/order_book/levels.rs) +- Change: `Vec::with_capacity(n_levels.unwrap_or(reasonable_default))` in + `map_to_l2_levels` and `l2_levels_to_l2_levels`. +- Affects: both modes. +- Risk: none. Mechanical. +- Expected win: most of the ~7% realloc time. (Largely moot once L2-3b + collapses the ladder walk to BBO, but cheap to land.) + +#### L2-5: Coalesce streaming snapshot frequency to per-block + +- File: [server/src/listeners/order_book/mod.rs:1857](../../../server/src/listeners/order_book/mod.rs) (`process_data` tail) and + [state.rs:93](../../../server/src/listeners/order_book/state.rs) (`l2_snapshots` gating). +- Change: in streaming mode, defer snapshot compute when the chunk is mid-block; + only compute at block-height advance or on a bounded "haven't snapped for X + ms" fallback. +- Affects: streaming only (gated on `IngestMode::Stream`). **Block-mode call + site untouched.** +- Risk: low. TOB only reads the latest snapshot, so deferring within a single + block height is safe. Need to ensure that when a block finalizes we + guarantee at least one snapshot is emitted. +- Expected win: cuts the multiplier between validator chattiness and snapshot + work — directly breaks the feedback loop. Defense in depth even after L2-3. + +#### L2-2 (deferred): Cache per-level `(sz, n)` aggregates + +- File: [server/src/order_book/levels.rs](../../../server/src/order_book/levels.rs) + book mutation paths. +- Change: maintain a `(sz, n)` aggregate per price level, updated incrementally + on `add_resting_order` / `cancel_order` / `modify_sz`. 
Replaces the + double-fold in `map_to_l2_levels`. Code comment at [levels.rs:95](../../../server/src/order_book/levels.rs) already + calls this out. +- Affects: both modes (shared book). +- Risk: **highest.** Aggregate drift → wrong BBO → silent quote errors. + Mitigation: debug-mode invariant assert (`debug_assert!(cached == folded)`) + after each mutation; parity tests catch divergence. +- **Decision: defer.** With L2-3b in place we only fold one level per side, + not every level. Re-measure first. Only take this risk if L2-1/L2-3/L2-5 + aren't enough. + +### Operational mitigation (orthogonal, do anyway) + +cgroup / cpuset to cap publisher CPU and pin away from validator cores. +Doesn't fix the root cause but bounds the feedback loop while changes land. + +## Followups (out of scope for v1, captured for later) + +- Per-diff "explode batch into N single-event batches" in `receive_stream_diffs` + ([mod.rs:1088](../../../server/src/listeners/order_book/mod.rs)). +- `tokio::spawn` per individual L4 update in `publish_l4_update` + ([mod.rs:1259](../../../server/src/listeners/order_book/mod.rs)); skip entirely if no L4 subscribers. +- DoB tap `std::sync::Mutex` on sequence counter + ([dob_tap.rs:69,94,119](../../../server/src/listeners/order_book/dob_tap.rs)) → AtomicU64-per-instrument. +- Grace-fallback `warn!` rate limiting in drain loop ([mod.rs:1141,1164,1186](../../../server/src/listeners/order_book/mod.rs)). +- `serde_json` → `simd-json` for JSONL parsing. (Was the initial hypothesis; + perf data showed it's not the dominant cost.) + +## Open questions + +- Should L2-5 also apply to block mode for symmetry, or stay streaming-only to + minimize block-mode change surface? Current leaning: streaming-only. +- For L2-3b, define the BBO snapshot data type and where it lives. Likely a + per-coin `(Option, Option)` keyed by `Coin`. +- For L2-3a's lazy WS path, where does the base-snapshot cache live? On the + listener? On `OrderBookState`? 
Cache invalidation on `state.time` advance. + +## Next sections (not yet drafted) + +- Section 2: Architecture — where BBO lives, how `l2_snapshots` splits into + "cheap TOB path" vs "lazy WS path," the streaming coalescing mechanism. +- Section 3: Implementation order, parity-test plan, perf re-measurement plan. +- Section 4: Rollout — feature gate if needed, canary plan, rollback. From 00c6b9e9c732f875f378682a50a17d8ba1a7d52c Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 13:45:26 -0400 Subject: [PATCH 02/65] spec: revise streaming cpu design with --enable-websocket flag and finalization-driven snapshots --- ...26-05-15-streaming-cpu-reduction-design.md | 284 +++++++++++++----- 1 file changed, 212 insertions(+), 72 deletions(-) diff --git a/docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md b/docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md index 17285fc1..c6fb2c56 100644 --- a/docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md +++ b/docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md @@ -1,7 +1,9 @@ -# Streaming-mode CPU Reduction — Design (Draft) +# Streaming-mode CPU Reduction — Design -Status: **DRAFT / work-in-progress brainstorm.** Not yet a committed spec. Captures -the running design discussion so far. +Status: **Approved.** Section 1 (problem, scope, levers, decision log) +complete. Section 2 (architecture detail) intentionally deferred to the +implementation plan. Revised 2026-05-15 to address Codex adversarial review +findings. Date: 2026-05-15 @@ -63,39 +65,81 @@ Each Vec starts at capacity 0 ([levels.rs:63,85](../../../server/src/order_book/ advances mid-burst, the full 7-snapshot fan-out runs again across ~200 coins. That's also the feedback-loop mechanism: snapshot cost scales with chunk count, -not block count. Validator slowness → more / smaller chunks per block → more -snapshot computes → higher publisher CPU → less CPU for the validator. +not block count. 
+ -## Key deployment context +## Deployment posture: multicast-only by default -- **No WebSocket subscribers in production.** All 6 bucketed variants exist - solely for WebSocket `l2book` subscribers requesting a specific aggregation. - TOB reads only BBO from the unbucketed snapshot. **The 6 bucketed variants - are pure waste in this deployment**, and they're where the `log10`/`floor` - time is going (bucketing is the only consumer of `num_digits()`). +This publisher is deployed **multicast-only**. WebSocket subscribers are not +expected and not supported in this deployment. + +To make that posture enforceable rather than implicit, introduce a CLI flag: + +``` +--enable-websocket (default: false; new for this work) +``` + +WebSocket support is **off by default.** When the flag is unset (the default): +- The WebSocket listener at `{address}:{port}` is **not bound**. No `/ws` + endpoint is accepted. +- The L2 snapshot path produces only the variants required by TOB. +- `--address`/`--port` become optional. If supplied, they are parsed but + otherwise ignored, and a one-line warning is logged at startup so the + misconfiguration is visible. + +When `--enable-websocket` is set, behavior is unchanged from today, including +the full 7-variant L2 snapshot fan-out. This is the escape hatch for anyone +who needs WS. + +The optimization work below is the **default path.** A subscriber that +connects to a publisher started with `--enable-websocket` still gets the same +snapshots they get today. + +### Behavior change for existing users + +This is a default-behavior change: a binary started today **without** any +new flag accepts `/ws` and serves L2 subscriptions. After this work lands, +the same invocation **does not** bind the WebSocket listener. Anyone relying +on WS today must add `--enable-websocket` to their startup command. This +needs to be called out in: + +- The CHANGELOG entry for the release that lands this work. 
+- A startup `info!` log line that names the active mode ("websocket disabled" + / "websocket enabled"). +- README — update the "Setup" example to reflect the new default and document + the opt-in flag. + +This replaces the earlier "no WS subscribers" deployment assumption with an +enforceable runtime default, addressing Codex finding #1. ## Scope and goals **In scope:** CPU reduction in streaming mode (`--ingest-mode stream`), measured as publisher CPU% on a representative validator-co-located host -under live load. +under live load, in the default (WS-disabled) configuration. -**Out of scope:** memory reduction, latency improvements beyond what falls out -of CPU work, WebSocket server performance, DoB wire-protocol changes, -validator-side tuning. +**Out of scope:** memory reduction, latency improvements beyond what falls +out of CPU work, WebSocket server performance (other than the on/off switch), +DoB wire-protocol changes, validator-side tuning. -**Must not regress:** block-mode behavior, byte-for-byte. Block mode is the -production-stable baseline. +**Must not regress:** block-mode behavior, byte-for-byte, in either the +default (WS-disabled) or `--enable-websocket` configurations. Block mode is +the production-stable baseline. The publisher must continue to support +running with `--enable-websocket` for users who need WS. ### Success criteria - ≥40% reduction in publisher CPU% in streaming mode at steady state on the - canary box. + canary box, with the default (WS-disabled) `--ingest-mode stream` + configuration. - `orderbook_tob_snapshot_compute` p50/p99 drop materially. - `orderbook_tob_suppressed_total` does not regress. - `block_mode_multicast_e2e` and `dual_validator_fixture_matches_block_and_stream_goldens` both pass with no - golden updates required. + golden updates required (subject to the `num_digits` boundary-test caveat in + L2-1 below). 
+- **New:** a streaming quote-sequence test verifies that every finalized + streaming block emits its final BBO via TOB — not just that final book state + is correct (addresses Codex finding #2). ## Block-mode safety net (already in place) @@ -104,12 +148,13 @@ production-stable baseline. against golden packets. - `dual_validator_fixture_matches_block_and_stream_goldens` asserts block-vs-stream semantic parity. -- Any shared-code change that alters a quote will fail one of these tests - loudly. None can silently break block mode. +- These tests verify **final state and emitted packets**, but the existing + parity test for streaming does not explicitly assert "every block emits its + final quote at finalization time." L2-5 introduces such a test (see below). ## Approach — focus on L2 snapshot cost -Initial brainstorm covered JSON-parser swap, per-diff batch explosion, +Earlier brainstorm covered JSON-parser swap, per-diff batch explosion, `tokio::spawn`-per-diff, DoB tap mutex, etc. Those are real but each is <5% vs the ~65% in snapshot computation. They're **followups, not headline work.** @@ -119,59 +164,129 @@ the ~65% in snapshot computation. They're **followups, not headline work.** - File: [server/src/order_book/types.rs:130](../../../server/src/order_book/types.rs) - Change: `(value as f64).log10().floor() as u32 + 1` → `value.ilog10() + 1` - (with the `if value == 0 { 1 }` guard preserved). + (preserve `value == 0` guard). - Affects: both modes (shared function). -- Risk: **none.** `u64::ilog10()` returns exactly the same digit count for any - `u64 > 0` as the f64 formula. Add a unit test asserting equivalence over a - bounded sweep. +- **Risk classification: latent bug fix, not bit-equivalent replacement.** + The current f64 formula loses precision above `2^53` and rounds values + immediately below `10^n` up to exactly `10^n`. 
Example: + `999_999_999_999_999_999_u64` rounds to `1e18` as f64, giving `floor(18.0) + + 1 = 19` digits, while `ilog10() + 1` correctly gives `18`. `u64::ilog10()` is + the mathematically correct answer; the f64 formula is buggy near every + `10^n` boundary above f64 integer precision. +- **For Hyperliquid prices:** `Px` stores `price * MULTIPLIER` as `u64` with + `MULTIPLIER = 10^8`. A price of `$10^10` is `10^18` in `u64` — right at the + problem range. Realistic prices are well below this, but the spec must not + assume the bug is unreachable. +- **Decision:** take this as a correctness fix. The f64 behavior is wrong, not + load-bearing. +- **Test plan (required, not optional):** + 1. Unit test: for every `n ∈ {1..19}`, assert `Px::new(10u64.pow(n)) + .num_digits() == n + 1` and `Px::new(10u64.pow(n) - 1).num_digits() == n`. + 2. Unit test: dense sweep of `Px::new(x).num_digits()` for the realistic + Hyperliquid `Px` value range (top of book at typical liquid coins — + concrete ranges to be sourced from a recent snapshot before landing). + 3. Run `block_mode_multicast_e2e` and + `dual_validator_fixture_matches_block_and_stream_goldens` and explicitly + check whether any golden differs. If any does, investigate whether the + golden was capturing buggy f64 output at an exact `10^n` price; document + and regenerate only if so. - Expected win: removes the ~11% in `__log10_finite` + `floor`. -#### L2-3: Drop the 6 bucketed snapshot variants +#### L2-3: Drop the 6 bucketed snapshot variants in the default config -The big one, enabled by "no WS subscribers." +The big lever. Active by default; the `--enable-websocket` opt-in restores +the full fan-out. - File: [server/src/listeners/order_book/utils.rs:216](../../../server/src/listeners/order_book/utils.rs) -- **L2-3a:** `compute_l2_snapshots` returns only the unbucketed snapshot. 
The - bucketed views move to a lazy on-demand path: if a WS `l2book` subscriber - ever connects, derive the bucketed snapshot from the base via the existing - `Snapshot::to_l2_snapshot` snapshot-from-snapshot path. -- **L2-3b:** TOB consumes only BBO. Add a cheap `to_bbo_snapshot()` that - returns `(best_bid, best_ask, time)` per coin: O(log N) BTreeMap navigation - + one linked-list fold per side per coin. The full unbucketed snapshot - becomes optional, computed only when there's a WS consumer. -- Affects: both modes (shared snapshot path). -- Risk: low-medium. Behavior change is "compute fewer variants." Wire output - for TOB is unchanged (still BBO). WS lazy path needs to handle "snapshot - requested at this height after we skipped it" — easiest: keep a small - cache of the base snapshot at the current height, materialize bucketed - views on demand. +- **L2-3a:** `compute_l2_snapshots` accepts a config struct identifying which + variants to materialize. In the default config, only the unbucketed base + snapshot is produced. With `--enable-websocket`, behavior is identical to + today (all 7 variants), so opt-in behavior is preserved bit-for-bit. +- **L2-3b:** TOB consumes only BBO. Add a `to_bbo_snapshot()` path returning + `(best_bid, best_ask, time)` per coin — O(log N) BTreeMap navigation + one + linked-list fold per side per coin. In the default config, + `compute_l2_snapshots` short-circuits to BBO instead of the full unbucketed + ladder. +- **Affects:** in the default config, all paths (block + stream); with + `--enable-websocket`, no behavior change. Parity tests will run in **both** + configurations to confirm no regressions in either. +- **Risk:** WS users can no longer connect by accident; they must opt in + explicitly. The behavior-change-for-existing-users note above covers the + rollout caveat. With the flag respected, the prior concern about silently + breaking WS clients reduces to "WS users must read the release notes." +- **Test plan:** + 1. 
Existing parity tests pass unchanged with `--enable-websocket` **on**. + 2. New test: in the default (WS-disabled) config, parity tests verify TOB + output is unchanged from the `--enable-websocket` baseline (TOB only + reads BBO, so it should be invariant across both configs). + 3. New test: in the default config, assert the WS listener is not bound + (port refuses connections) and a startup log line announces it. - Expected win: largest single reduction. Eliminates 6 of 7 variants and - removes the only consumer of `num_digits()`. + removes the only consumer of `num_digits()` in the hot path. #### L2-4: Pre-size output Vecs - Files: [server/src/order_book/levels.rs:63,85](../../../server/src/order_book/levels.rs) - Change: `Vec::with_capacity(n_levels.unwrap_or(reasonable_default))` in `map_to_l2_levels` and `l2_levels_to_l2_levels`. -- Affects: both modes. +- Affects: both modes; no behavior change. - Risk: none. Mechanical. -- Expected win: most of the ~7% realloc time. (Largely moot once L2-3b - collapses the ladder walk to BBO, but cheap to land.) - -#### L2-5: Coalesce streaming snapshot frequency to per-block - -- File: [server/src/listeners/order_book/mod.rs:1857](../../../server/src/listeners/order_book/mod.rs) (`process_data` tail) and - [state.rs:93](../../../server/src/listeners/order_book/state.rs) (`l2_snapshots` gating). -- Change: in streaming mode, defer snapshot compute when the chunk is mid-block; - only compute at block-height advance or on a bounded "haven't snapped for X - ms" fallback. -- Affects: streaming only (gated on `IngestMode::Stream`). **Block-mode call - site untouched.** -- Risk: low. TOB only reads the latest snapshot, so deferring within a single - block height is safe. Need to ensure that when a block finalizes we - guarantee at least one snapshot is emitted. +- Expected win: most of the ~7% realloc time. (Largely moot once L2-3 collapses + the ladder walk to BBO, but cheap to land regardless.) 
+ +#### L2-5: Tie streaming snapshot emission to block finalization + +This is the streaming-side coalescing, redesigned to address Codex finding #2. + +**The bug in the earlier sketch:** "skip snapshot mid-block, emit on +block-height advance" can publish the BBO derived from the *first* diff of a +block and then suppress later diffs in the same block — because the height +hasn't advanced. In streaming, a block's diffs arrive across multiple chunks, +and BBO can change with any of them. + +**The corrected design:** + +- **Finalization is authoritative.** `finalize_stream_block` ([mod.rs:1248](../../../server/src/listeners/order_book/mod.rs)) + is the only point where the publisher knows the block is complete. Snapshot + emission for streaming mode must be tied to finalization, not chunk + completion. +- **Suppress chunk-level emission.** In streaming mode, the per-chunk + `l2_snapshots(true)` call at [mod.rs:1857](../../../server/src/listeners/order_book/mod.rs) is replaced with a + *book-dirty* flag set on each `apply_stream_diff` that mutates BBO. The + expensive `compute_l2_snapshots` is **not** called per chunk in streaming + mode. +- **Finalization emits.** When `finalize_stream_block` runs, if the dirty + flag is set, compute and emit one snapshot for the finalized height. Clear + the dirty flag. This is `≤ 1` snapshot per block — the same cadence as + block mode. +- **Backstop for stuck streams.** If a block hasn't finalized within a bounded + window (e.g. 5s) but the book is dirty, emit anyway. Avoids quote staleness + during grace-fallback. +- **Block mode untouched.** The change is gated on `IngestMode::Stream`. +- **Required tests (addresses Codex finding #2):** + 1. **Two-diff-in-block test:** a streaming block contains a `New` diff at + `BBO_a` followed by an `Update` diff that moves BBO to `BBO_b`. Assert + that exactly one TOB snapshot is emitted for the finalized block, and + that the snapshot carries `BBO_b` (the final state), not `BBO_a`. + 2. 
**No-spurious-emission test:** a streaming block whose diffs don't change + BBO must not emit a TOB snapshot purely because of a chunk boundary. + 3. **Finalization-emission test:** every finalized streaming block whose + diffs mutated BBO produces exactly one TOB snapshot for that height + before any later block's snapshot. + 4. **Stuck-stream backstop test:** if finalization is delayed past the + bounded window while BBO is dirty, a snapshot is emitted at the bound, + not at the next block. +- **Tests should assert quote sequences, not just final state.** Extend the + dual-validator parity test (or add a sibling) to compare the *ordered list* + of TOB Quote messages emitted by block mode vs streaming mode for the same + fixture, not just the final book state. Note that block and stream do not + emit identical packet sequences today (streaming emits per-chunk), so this + test characterizes the post-L2-5 invariant: one Quote per block whose BBO + changed, in finalization order, equal across modes. - Expected win: cuts the multiplier between validator chattiness and snapshot - work — directly breaks the feedback loop. Defense in depth even after L2-3. + work. With L2-3 already in place, this is defense in depth — but it also + directly breaks the feedback loop, because publisher work no longer scales + with chunk count. #### L2-2 (deferred): Cache per-level `(sz, n)` aggregates @@ -193,30 +308,55 @@ The big one, enabled by "no WS subscribers." cgroup / cpuset to cap publisher CPU and pin away from validator cores. Doesn't fix the root cause but bounds the feedback loop while changes land. +## Decision log (Codex findings) + +- **Finding 1 (L2 variant removal risks):** addressed by making the + multicast-only posture the **default behavior**, with `--enable-websocket` + as an explicit opt-in for users who still need WS. `compute_l2_snapshots` + is parameterized on the WS state; `--enable-websocket` preserves today's + behavior bit-for-bit. 
No "lazy" subscriber-aware materialization path is + needed because subscribers cannot connect in the default config. +- **Finding 2 (per-block coalescing partial state):** addressed by tying + streaming snapshot emission to `finalize_stream_block`, not to chunk + boundaries or height advance. Required tests verify the final BBO of each + block is emitted exactly once, in finalization order. +- **Finding 3 (`ilog10` not f64-equivalent):** accepted as a latent bug fix. + Required boundary tests around every `10^n` and a sweep over realistic `Px` + ranges. Goldens that capture the buggy f64 output (if any) will be + identified and either left as-is (if outside reachable price ranges) or + regenerated with documented justification. + ## Followups (out of scope for v1, captured for later) - Per-diff "explode batch into N single-event batches" in `receive_stream_diffs` ([mod.rs:1088](../../../server/src/listeners/order_book/mod.rs)). - `tokio::spawn` per individual L4 update in `publish_l4_update` - ([mod.rs:1259](../../../server/src/listeners/order_book/mod.rs)); skip entirely if no L4 subscribers. + ([mod.rs:1259](../../../server/src/listeners/order_book/mod.rs)); skip entirely if no L4 subscribers — naturally + redundant when WS is disabled (the default), since L4 has no consumer. - DoB tap `std::sync::Mutex` on sequence counter ([dob_tap.rs:69,94,119](../../../server/src/listeners/order_book/dob_tap.rs)) → AtomicU64-per-instrument. -- Grace-fallback `warn!` rate limiting in drain loop ([mod.rs:1141,1164,1186](../../../server/src/listeners/order_book/mod.rs)). +- Grace-fallback `warn!` rate limiting in drain loop + ([mod.rs:1141,1164,1186](../../../server/src/listeners/order_book/mod.rs)). - `serde_json` → `simd-json` for JSONL parsing. (Was the initial hypothesis; perf data showed it's not the dominant cost.) ## Open questions -- Should L2-5 also apply to block mode for symmetry, or stay streaming-only to - minimize block-mode change surface? 
Current leaning: streaming-only. -- For L2-3b, define the BBO snapshot data type and where it lives. Likely a - per-coin `(Option, Option)` keyed by `Coin`. -- For L2-3a's lazy WS path, where does the base-snapshot cache live? On the - listener? On `OrderBookState`? Cache invalidation on `state.time` advance. +- For L2-5, what should the stuck-stream backstop interval be? 5s is a + placeholder; the TOB freshness threshold is the natural ceiling. +- For L2-3, when WS is disabled (the default), should the L4 broadcast channel + ([mod.rs:1259](../../../server/src/listeners/order_book/mod.rs)) also be short-circuited (skip `tokio::spawn`)? Probably + yes; L4 has no consumer without WS. Could fold into L2-3 directly. +- For L2-1, do we want to keep the f64 formula behind a feature flag for any + reason (e.g. exact golden replay of historical recordings)? Default answer: + no. ## Next sections (not yet drafted) -- Section 2: Architecture — where BBO lives, how `l2_snapshots` splits into - "cheap TOB path" vs "lazy WS path," the streaming coalescing mechanism. -- Section 3: Implementation order, parity-test plan, perf re-measurement plan. -- Section 4: Rollout — feature gate if needed, canary plan, rollback. +- Section 2: Architecture — exact shape of the WS-disabled snapshot path, the + finalization-driven emission path, the new dirty-flag wiring. +- Section 3: Implementation order, perf re-measurement plan after each lever + lands. +- Section 4: Rollout — canary plan in default (WS-disabled) config; rollback + by passing `--enable-websocket` to restore today's behavior if any + regression appears. 
From d4fd8ef922a4b11879d501e20fadf27ff240e430 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 13:56:05 -0400 Subject: [PATCH 03/65] plan: streaming cpu reduction implementation plan --- .../2026-05-15-streaming-cpu-reduction.md | 1779 +++++++++++++++++ 1 file changed, 1779 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md diff --git a/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md b/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md new file mode 100644 index 00000000..f03ac56d --- /dev/null +++ b/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md @@ -0,0 +1,1779 @@ +# Streaming-mode CPU Reduction Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Reduce publisher CPU in streaming mode by ≥40% to break the +negative feedback loop with the co-located Hyperliquid validator, without +regressing block mode. + +**Architecture:** Five independently-landable changes. (1) Replace +`Px::num_digits()` f64 math with integer `u64::ilog10()`. (2) Pre-size output +Vecs in level construction. (3) Introduce `--enable-websocket` (default off) +that gates the WS listener bind and shrinks L2 snapshot work. (4) Drop the 6 +bucketed L2 variants when WS is disabled, with a BBO-only short-circuit for +the TOB hot path. (5) In streaming mode, tie L2 snapshot emission to +`finalize_stream_block` instead of every file-read chunk, with a book-dirty +flag and a stuck-stream backstop. L2-2 (per-level aggregate cache) is +deferred pending re-measurement after L2-1/3/4/5 land. 
+ +**Tech Stack:** Rust 2024 edition; Tokio async; rayon for snapshot fan-out; +clap CLI; cargo test + custom fixture-based golden tests in +`server/src/listeners/order_book/block_mode_multicast_e2e.rs` and +`parity_tests.rs`. + +**Reference:** [docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md](../specs/2026-05-15-streaming-cpu-reduction-design.md) (committed as `00c6b9e`). + +--- + +## Phase 0: Pre-flight + +### Task 0.1: Capture baseline perf numbers + +**Files:** +- No code changes +- Record: `docs/superpowers/plans/2026-05-15-streaming-cpu-baseline.md` (new, gitignored or committed at your discretion) + +- [ ] **Step 1: Run the publisher in the canary environment and capture perf top.** + +On the canary host, with the binary built from `main`: + +```bash +sudo perf top -p $(pgrep dz_hl_publisher) -d 30 +``` + +Record: +1. Top 10 functions by `Self%`. +2. `top -p $(pgrep dz_hl_publisher) -n 5` snapshot — CPU% column. +3. `curl -s http://127.0.0.1:9090/metrics | grep -E "^orderbook_tob_snapshot_compute|^orderbook_tob_suppressed_total|^orderbook_tob_snapshot_source_block_lag"` over a 5-minute window. + +- [ ] **Step 2: Save baseline alongside the plan.** + +Create `docs/superpowers/plans/2026-05-15-streaming-cpu-baseline.md`. Capture date, host, git SHA, the perf top excerpt, and the metric snapshots. This is the "before" anchor every later perf re-measurement compares against. 
+ +- [ ] **Step 3: Commit baseline.** + +```bash +git add docs/superpowers/plans/2026-05-15-streaming-cpu-baseline.md +git commit -m "perf: capture streaming cpu baseline before reductions" +``` + +--- + +## Phase 1: L2-1 — Replace `Px::num_digits()` f64 math with `u64::ilog10()` + +### Task 1.1: Add `Px::num_digits()` boundary unit tests (expected to fail on the current f64 implementation near large `10^n` boundaries; they pin the contract the `ilog10` version must satisfy) + +**Files:** +- Modify: `server/src/order_book/types.rs` (add tests at end of file) + +- [ ] **Step 1: Add the test module with boundary tests.** + +Open `server/src/order_book/types.rs`. After the existing `impl Sz` block (around line 149), append: + +```rust +#[cfg(test)] +mod num_digits_tests { + use super::Px; + + #[test] + fn zero_returns_one() { + assert_eq!(Px::new(0).num_digits(), 1); + } + + #[test] + fn single_digit_values() { + for v in 1u64..=9 { + assert_eq!(Px::new(v).num_digits(), 1, "value={v}"); + } + } + + #[test] + fn powers_of_ten_have_expected_digit_count() { + // 10^0 = 1 → 1 digit, 10^1 = 10 → 2 digits, ..., 10^18 → 19 digits. (10^19 also fits in u64; only 10^20 overflows.) + for n in 0u32..=18 { + let v = 10u64.pow(n); + let expected = n + 1; + assert_eq!(Px::new(v).num_digits(), expected, "10^{n} = {v} should have {expected} digits"); + } + } + + #[test] + fn just_below_powers_of_ten_have_expected_digit_count() { + // 9, 99, 999, ..., values just below 10^n must report n digits, not n+1. + // This is the f64 imprecision boundary that motivates the ilog10 swap: + // 999_999_999_999_999_999_u64 rounds to 1e18 as f64, so the old formula + // returned 19 instead of 18. + for n in 1u32..=18 { + let v = 10u64.pow(n) - 1; + let expected = n; + assert_eq!(Px::new(v).num_digits(), expected, "10^{n} - 1 = {v} should have {expected} digits"); + } + } + + #[test] + fn u64_max_has_twenty_digits() { + assert_eq!(Px::new(u64::MAX).num_digits(), 20); + } + + #[test] + fn dense_sweep_realistic_hyperliquid_range() { + // Px stores price * 10^8 as u64.
Realistic Hyperliquid prices range + // from ~1e-6 (memecoin) to ~1e6 (BTC) USD, so Px values land in roughly + // 1e2 .. 1e14. Sweep that range and cross-check against ilog10. + for exp in 2u32..=14 { + let v = 10u64.pow(exp); + assert_eq!(Px::new(v).num_digits(), exp + 1); + assert_eq!(Px::new(v - 1).num_digits(), exp); + assert_eq!(Px::new(v + 1).num_digits(), exp + 1); + } + } +} +``` + +- [ ] **Step 2: Run the tests on the current (f64) implementation.** + +```bash +cargo test -p server --lib order_book::types::num_digits_tests -- --nocapture +``` + +Expected: `just_below_powers_of_ten_have_expected_digit_count` **FAILS** at the first `n` where f64 can no longer distinguish `10^n - 1` from `10^n` — no later than `n=16`, because `10^16 - 1` exceeds 2^53 and the `as f64` cast rounds it up to `1e16` (it may already fail at `n=15` if `log10` rounds the result to `15.0`). The same rounding is why the f64 formula turns `10^18 - 1` into `1e18` and returns 19 instead of 18. +(Other tests may pass or fail depending on platform f64 behavior — record the result.) + +- [ ] **Step 3: Commit the failing tests.** + +```bash +git add server/src/order_book/types.rs +git commit -m "test: add Px::num_digits boundary tests around 10^n thresholds" +``` + +### Task 1.2: Switch `Px::num_digits()` to `u64::ilog10()` + +**Files:** +- Modify: `server/src/order_book/types.rs:128-132` + +- [ ] **Step 1: Replace the function body.** + +In `server/src/order_book/types.rs`, find the block at line 128–132: + +```rust + #[allow(clippy::cast_possible_truncation)] + #[allow(clippy::cast_sign_loss)] + pub(crate) fn num_digits(self) -> u32 { + if self.value() == 0 { 1 } else { (self.value() as f64).log10().floor() as u32 + 1 } + } +``` + +Replace with: + +```rust + pub(crate) fn num_digits(self) -> u32 { + if self.value() == 0 { 1 } else { self.value().ilog10() + 1 } + } +``` + +(The `#[allow]` attributes are no longer needed — there are no casts.) + +- [ ] **Step 2: Run the boundary tests — they should now all pass.** + +```bash +cargo test -p server --lib order_book::types::num_digits_tests -- --nocapture +``` + +Expected: all 6 tests **PASS**.
+ +- [ ] **Step 3: Run the full server test suite.** + +```bash +cargo test -p server +``` + +Expected: all tests **PASS**. If `block_mode_multicast_e2e` or `dual_validator_fixture_matches_block_and_stream_goldens` fails with a golden-bytes mismatch, the golden captured the buggy f64 output at an exact `10^n` price boundary. Investigate the diverging packet, log the price involved, and (only with explicit justification documented in the commit message) regenerate the golden. + +- [ ] **Step 4: Run clippy.** + +```bash +cargo clippy --workspace --all-targets -- -D warnings +``` + +Expected: clean. + +- [ ] **Step 5: Commit.** + +```bash +git add server/src/order_book/types.rs +git commit -m "perf: replace Px::num_digits f64 log10 with u64::ilog10" +``` + +### Task 1.3: Re-measure perf after L2-1 + +- [ ] **Step 1: Deploy the new binary to the canary and capture a 30s perf top.** + +Same procedure as Task 0.1. Append the results to `docs/superpowers/plans/2026-05-15-streaming-cpu-baseline.md` under a new heading "After L2-1 (ilog10)". + +- [ ] **Step 2: Verify that `__log10_finite` and `floor` are no longer in the top 10.** + +Document the new top 10. Expected: ~11% of CPU recovered. 
+ +--- + +## Phase 2: L2-4 — Pre-size output Vecs + +### Task 2.1: Pre-size in `map_to_l2_levels` and `l2_levels_to_l2_levels` + +**Files:** +- Modify: `server/src/order_book/levels.rs:63,85` + +- [ ] **Step 1: Update both `Vec::new()` call sites with `Vec::with_capacity`.** + +In `server/src/order_book/levels.rs`, line 63: + +```rust +fn l2_levels_to_l2_levels( + levels: &[InnerLevel], + side: Side, + n_levels: Option, + n_sig_figs: Option, + mantissa: Option, +) -> Vec { + let cap = n_levels.unwrap_or(levels.len()); + let mut new_levels = Vec::with_capacity(cap); +``` + +Line 85: + +```rust +fn map_to_l2_levels( + orders: &BTreeMap>, + side: Side, + n_levels: Option, + n_sig_figs: Option, + mantissa: Option, +) -> Vec { + let cap = n_levels.unwrap_or_else(|| orders.len().min(1024)); + let mut levels = Vec::with_capacity(cap); +``` + +The `.min(1024)` cap on the unbounded path prevents pathological pre-allocation for books with extreme level counts. + +- [ ] **Step 2: Run the full server test suite.** + +```bash +cargo test -p server +``` + +Expected: all tests **PASS** — no behavior change. + +- [ ] **Step 3: Run clippy.** + +```bash +cargo clippy --workspace --all-targets -- -D warnings +``` + +- [ ] **Step 4: Commit.** + +```bash +git add server/src/order_book/levels.rs +git commit -m "perf: pre-size L2 level output Vecs to skip realloc growth" +``` + +### Task 2.2: Re-measure perf after L2-4 + +- [ ] **Step 1: Capture perf top.** Append to baseline doc under "After L2-4 (pre-size)". Expected: ~7% realloc CPU recovered. + +--- + +## Phase 3: `--enable-websocket` CLI flag plumbing + +This phase introduces the flag end-to-end but does **not** change any L2 +snapshot logic yet. It is purely about (a) making the flag exist, (b) +gating the WS listener bind on it, (c) propagating it to `OrderBookState` +so later phases can branch on it. 
+ +### Task 3.1: Add `--enable-websocket` flag to the CLI + +**Files:** +- Modify: `binaries/src/bin/dz_hl_publisher.rs:155-157` (add flag and validation) + +- [ ] **Step 1: Add a failing test that asserts the flag defaults to false.** + +In `binaries/src/bin/dz_hl_publisher.rs`, locate the `mod tests` block (line 253). Add: + +```rust + #[test] + fn enable_websocket_defaults_off() { + let args = Args::parse_from(["dz_hl_publisher", "--address", "127.0.0.1", "--port", "8000"]); + assert!(!args.enable_websocket); + } + + #[test] + fn enable_websocket_flag_parses() { + let args = Args::parse_from([ + "dz_hl_publisher", + "--address", + "127.0.0.1", + "--port", + "8000", + "--enable-websocket", + ]); + assert!(args.enable_websocket); + } +``` + +- [ ] **Step 2: Run the tests — they should fail to compile (no `enable_websocket` field).** + +```bash +cargo test -p binaries --bin dz_hl_publisher enable_websocket -- --nocapture +``` + +Expected: **COMPILE FAIL** with "no field `enable_websocket` on type `Args`". + +- [ ] **Step 3: Add the flag to the `Args` struct.** + +In `binaries/src/bin/dz_hl_publisher.rs`, after the `separate_fill_ingest` field at line 156, add: + +```rust + /// Enable the WebSocket listener at `{address}:{port}`. Off by default — + /// this publisher is multicast-only by default. With this flag set, the + /// full L2 snapshot fan-out (7 variants per coin) is computed to preserve + /// today's WS subscription semantics. + #[arg(long, default_value_t = false)] + enable_websocket: bool, +``` + +- [ ] **Step 4: Run the tests — they should now pass.** + +```bash +cargo test -p binaries --bin dz_hl_publisher enable_websocket -- --nocapture +``` + +Expected: both tests **PASS**. 
+ +- [ ] **Step 5: Commit.** + +```bash +git add binaries/src/bin/dz_hl_publisher.rs +git commit -m "feat: add --enable-websocket cli flag (default off)" +``` + +### Task 3.2: Thread `enable_websocket` through `run_websocket_server` + +**Files:** +- Modify: `binaries/src/bin/dz_hl_publisher.rs:238-247` (pass flag to `run_websocket_server`) +- Modify: `server/src/servers/websocket_server.rs:53-62` (add parameter; log mode at startup) + +- [ ] **Step 1: Update `run_websocket_server` signature.** + +In `server/src/servers/websocket_server.rs`, line 53: + +```rust +pub async fn run_websocket_server( + address: &str, + ignore_spot: bool, + compression_level: u32, + multicast_config: Option, + dob_config: Option, + ingest_mode: IngestMode, + hl_data_root: Option, + separate_fill_ingest: bool, + enable_websocket: bool, +) -> Result<()> { +``` + +Immediately after the existing `separate_fill_ingest` validation block (line 65, right after `return Err`), add: + +```rust + if enable_websocket { + info!("websocket mode: ENABLED (listener will bind {address}, full L2 snapshot fan-out active)"); + } else { + info!("websocket mode: DISABLED (multicast-only; --address/--port ignored, L2 fan-out reduced)"); + } +``` + +- [ ] **Step 2: Update the call site in the binary.** + +In `binaries/src/bin/dz_hl_publisher.rs`, line 238: + +```rust + run_websocket_server( + &full_address, + true, + compression_level, + multicast_config, + dob_config, + ingest_mode, + args.hl_data_root, + args.separate_fill_ingest, + args.enable_websocket, + ) + .await?; +``` + +- [ ] **Step 3: Build to verify the signature change compiles.** + +```bash +cargo build --workspace +``` + +Expected: clean build. + +- [ ] **Step 4: Run the existing test suite to verify no regressions.** + +```bash +cargo test --workspace +``` + +Expected: all tests **PASS** (no behavioral change yet — flag is plumbed but not consumed). 
+ +- [ ] **Step 5: Commit.** + +```bash +git add binaries/src/bin/dz_hl_publisher.rs server/src/servers/websocket_server.rs +git commit -m "feat: thread enable_websocket through run_websocket_server with startup log" +``` + +### Task 3.3: Conditionally skip the WS `TcpListener` bind in default config + +**Files:** +- Modify: `server/src/servers/websocket_server.rs` (the section that binds the WS TcpListener and accepts connections) + +- [ ] **Step 1: Find the WS listener bind site.** + +```bash +grep -n "TcpListener::bind\|axum::serve\|listen.*await" server/src/servers/websocket_server.rs +``` + +Expected: locate the bind call (it is the `TcpListener` and routing setup that accepts `/ws` connections; typically near the end of `run_websocket_server`). + +- [ ] **Step 2: Wrap the bind + serve block in `if enable_websocket`.** + +In `server/src/servers/websocket_server.rs`, locate the `TcpListener::bind(&address)...` and the associated `axum::serve(...)` / accept loop. Wrap the entire block: + +```rust + if enable_websocket { + // ... existing bind + axum::serve / accept-loop code ... + } else { + // Multicast-only mode: keep the task running so the listener / multicast + // tasks (spawned above) stay alive. Park forever; the process exits via + // listener fatal errors or external signal. + std::future::pending::<()>().await; + } + Ok(()) +``` + +(If there is currently a different flow control — e.g. the function returns from inside the accept loop — adjust accordingly so the function does not exit immediately when WS is disabled.) + +- [ ] **Step 3: Add an integration test that proves the WS port is not bound in default config.** + +Create `server/tests/websocket_disabled_test.rs` with: + +```rust +//! Verifies that `--enable-websocket` defaults to off and the WS port is not +//! bound in that configuration. 
+ +use std::{ + net::{Ipv4Addr, SocketAddr, TcpListener as StdListener}, + time::Duration, +}; + +use server::{IngestMode, run_websocket_server}; + +#[tokio::test(flavor = "multi_thread")] +async fn ws_port_not_bound_when_websocket_disabled() { + // Pick an ephemeral free port for this test. + let probe = StdListener::bind((Ipv4Addr::LOCALHOST, 0)).expect("bind probe"); + let port = probe.local_addr().expect("local_addr").port(); + drop(probe); + + let address = format!("127.0.0.1:{port}"); + let server_handle = tokio::spawn(async move { + let _ = run_websocket_server( + &address, + true, + 1, + None, + None, + IngestMode::Block, + // No data root; the listener will fail to find files, but it should + // still NOT bind the WS port before that happens. + Some(std::env::temp_dir()), + false, + false, // enable_websocket = false + ) + .await; + }); + + // Give the server task a moment to attempt any binds. + tokio::time::sleep(Duration::from_millis(200)).await; + + // The probe port should now be re-bindable, proving the WS server did + // not bind it. + let addr = SocketAddr::from((Ipv4Addr::LOCALHOST, port)); + let rebind = StdListener::bind(addr); + assert!( + rebind.is_ok(), + "expected ws port {port} to be unbound when enable_websocket=false; rebind err: {:?}", + rebind.err() + ); + + server_handle.abort(); +} +``` + +- [ ] **Step 4: Run the new test.** + +```bash +cargo test -p server --test websocket_disabled_test -- --nocapture +``` + +Expected: **PASS**. + +- [ ] **Step 5: Run the full test suite to catch any regressions in the WS-enabled path.** + +```bash +cargo test --workspace +``` + +Expected: all tests **PASS**. Existing WS-using tests should still bind because they construct `run_websocket_server` with `enable_websocket: true` (verify this — if any test calls the new signature with `false` it may need updating). 
+ +- [ ] **Step 6: Commit.** + +```bash +git add server/src/servers/websocket_server.rs server/tests/websocket_disabled_test.rs +git commit -m "feat: skip ws tcp bind when --enable-websocket is off" +``` + +--- + +## Phase 4: L2-3 — Drop bucketed L2 snapshot variants when WS is disabled + +### Task 4.1: Thread `enable_websocket` into `OrderBookState` + +**Files:** +- Modify: `server/src/listeners/order_book/state.rs` (add field + constructor parameter) +- Modify: `server/src/listeners/order_book/mod.rs` (constructors of `OrderBookState`) + +- [ ] **Step 1: Add the field.** + +In `server/src/listeners/order_book/state.rs`, line 16, modify the struct: + +```rust +pub(super) struct OrderBookState { + order_book: OrderBooks, + height: u64, + time: u64, + snapped: bool, + ignore_spot: bool, + enable_websocket: bool, + dob_tap: Option, +} +``` + +Update `Clone` impl at line 45–58 to copy `enable_websocket`: + +```rust +impl Clone for OrderBookState { + fn clone(&self) -> Self { + Self { + order_book: self.order_book.clone(), + height: self.height, + time: self.time, + snapped: self.snapped, + ignore_spot: self.ignore_spot, + enable_websocket: self.enable_websocket, + dob_tap: None, + } + } +} +``` + +Update `from_snapshot` at line 61: + +```rust + pub(super) fn from_snapshot( + snapshot: Snapshots, + height: u64, + time: u64, + ignore_triggers: bool, + ignore_spot: bool, + enable_websocket: bool, + ) -> Self { + Self { + ignore_spot, + enable_websocket, + time, + height, + order_book: OrderBooks::from_snapshots(snapshot, ignore_triggers), + snapped: false, + dob_tap: None, + } + } +``` + +- [ ] **Step 2: Update every caller of `from_snapshot`.** + +```bash +grep -rn "OrderBookState::from_snapshot\|from_snapshot(" server/src/ | grep -v "//" +``` + +Update each call site to pass the new `enable_websocket` parameter. For non-listener callers (tests, snapshot validation cloning), default to `true` to preserve today's behavior unless context obviously demands otherwise. 
+ +The primary call site is in `OrderBookListener` — it needs to know `enable_websocket`. Add a field on the listener and thread it through `new_with_ingest_mode` or set it via a setter analogous to `set_l4_message_tx`. Concretely, in `server/src/listeners/order_book/mod.rs`, on `OrderBookListener`, add: + +```rust + enable_websocket: bool, +``` + +Initialize it to `false` in `new_with_ingest_mode` (the default is multicast-only). Add a setter: + +```rust + pub(crate) fn set_enable_websocket(&mut self, enable_websocket: bool) { + self.enable_websocket = enable_websocket; + } +``` + +Pass it into `OrderBookState::from_snapshot` wherever the listener constructs the state. + +- [ ] **Step 3: Wire the setter call from `run_websocket_server`.** + +In `server/src/servers/websocket_server.rs`, where the listener is built (around line 75–80): + +```rust + let listener = { + let market_message_tx = market_message_tx.clone(); + let mut listener = OrderBookListener::new_with_ingest_mode(Some(market_message_tx), ignore_spot, ingest_mode); + listener.set_l4_message_tx(l4_message_tx.clone()); + listener.set_enable_websocket(enable_websocket); + listener + }; +``` + +- [ ] **Step 4: Build to verify all call sites compile.** + +```bash +cargo build --workspace +``` + +If you missed a `from_snapshot` call site, fix the compile error and re-run. + +- [ ] **Step 5: Run the full test suite.** + +```bash +cargo test --workspace +``` + +Expected: all tests **PASS** (still no behavior change — the new field is unused). 
+ +- [ ] **Step 6: Commit.** + +```bash +git add server/src/listeners/order_book/state.rs server/src/listeners/order_book/mod.rs server/src/servers/websocket_server.rs +git commit -m "feat: thread enable_websocket into OrderBookListener and OrderBookState" +``` + +### Task 4.2: Parameterize `compute_l2_snapshots` on `enable_websocket` + +**Files:** +- Modify: `server/src/listeners/order_book/utils.rs:216-249` +- Modify: `server/src/listeners/order_book/state.rs:93-100` + +- [ ] **Step 1: Add a level-cap constant and update `compute_l2_snapshots` signature.** + +In `server/src/listeners/order_book/utils.rs`, replace the `compute_l2_snapshots` function (line 216) with: + +```rust +/// Number of L2 levels retained in the unbucketed snapshot when WebSocket is +/// disabled. TOB (BBO) needs exactly 1, so no margin is kept: any deeper walk +/// would re-introduce the full ladder traversal that dominates streaming-mode +/// CPU. If a downstream consumer ever needs deeper levels in the WS-disabled +/// config, raise this constant. +const DEFAULT_MULTICAST_LEVELS_WHEN_WS_DISABLED: usize = 1; + +pub(super) fn compute_l2_snapshots( + order_books: &OrderBooks, + enable_websocket: bool, +) -> L2Snapshots { + L2Snapshots( + order_books + .as_ref() + .par_iter() + .map(|(coin, order_book)| { + let mut entries: Vec<(L2SnapshotParams, Snapshot)> = Vec::with_capacity(if enable_websocket { 7 } else { 1 }); + // L2-3b: when WS is disabled, the only consumer of the unbucketed + // snapshot is TOB BBO. Cap the ladder walk at + // `DEFAULT_MULTICAST_LEVELS_WHEN_WS_DISABLED` levels per side.
+ let base_n_levels = if enable_websocket { None } else { Some(DEFAULT_MULTICAST_LEVELS_WHEN_WS_DISABLED) }; + let snapshot = order_book.to_l2_snapshot(base_n_levels, None, None); + entries.push((L2SnapshotParams { n_sig_figs: None, mantissa: None }, snapshot)); + + if enable_websocket { + let mut add_new_snapshot = |n_sig_figs: Option, mantissa: Option, idx: usize, entries: &mut Vec<(L2SnapshotParams, Snapshot)>| { + if let Some((_, last_snapshot)) = &entries.get(entries.len() - idx) { + let snapshot = last_snapshot.to_l2_snapshot(None, n_sig_figs, mantissa); + entries.push((L2SnapshotParams { n_sig_figs, mantissa }, snapshot)); + } + }; + for n_sig_figs in (2..=5).rev() { + if n_sig_figs == 5 { + for mantissa in [None, Some(2), Some(5)] { + if mantissa == Some(5) { + add_new_snapshot(Some(n_sig_figs), mantissa, 2, &mut entries); + } else { + add_new_snapshot(Some(n_sig_figs), mantissa, 1, &mut entries); + } + } + } else { + add_new_snapshot(Some(n_sig_figs), None, 1, &mut entries); + } + } + } + + (coin.clone(), entries.into_iter().collect::>>()) + }) + .collect(), + ) +} +``` + +The closure is reshaped slightly because it now needs `&mut entries` as a parameter (Rust borrow-checker; the inner `if let Some` was previously a sibling expression). + +**Note on L2-3b:** the `base_n_levels = Some(1)` change is the spec's L2-3b ("TOB consumes only BBO") — instead of adding a separate `to_bbo_snapshot()` method, we reuse the existing `to_l2_snapshot` path with `n_levels = Some(1)`. This is equivalent and avoids a parallel code path. If a downstream consumer (DoB snapshot stream, for example) is found to need deeper levels in the WS-disabled config, raise `DEFAULT_MULTICAST_LEVELS_WHEN_WS_DISABLED`. 
+ +**Before committing, verify no non-WS consumer reads beyond level 1.** + +```bash +grep -rn "L2SnapshotParams::new(None, None)\|params_map.get.*None.*None\|truncate" server/src --include="*.rs" | head -20 +``` + +Confirm every consumer in the default-multicast hot path only reads level 1 (e.g. via `truncate(1)` or `.first()`). If any consumer reads deeper, raise the constant or special-case that consumer. + +- [ ] **Step 2: Update `OrderBookState::l2_snapshots` and the test variant.** + +In `server/src/listeners/order_book/state.rs`, line 93: + +```rust + pub(super) fn l2_snapshots(&mut self, prevent_future_snaps: bool) -> Option<(u64, L2Snapshots)> { + if self.snapped { + None + } else { + self.snapped = prevent_future_snaps || self.snapped; + Some((self.time, compute_l2_snapshots(&self.order_book, self.enable_websocket))) + } + } + + #[cfg(test)] + pub(super) fn compute_l2_snapshots_for_test(&self) -> (u64, L2Snapshots) { + (self.time, compute_l2_snapshots(&self.order_book, self.enable_websocket)) + } +``` + +- [ ] **Step 3: Run the full test suite with `enable_websocket=true` paths.** + +```bash +cargo test --workspace +``` + +Most existing tests construct state with `enable_websocket: true` (after Task 4.1) or call test helpers that should preserve today's WS-enabled behavior. Tests should pass. + +If any test passes a state constructed with `enable_websocket: false` and then asserts on bucketed snapshots, update those tests to either (a) construct with `enable_websocket: true` or (b) assert only on the unbucketed variant. 
+ +- [ ] **Step 4: Add a unit test that asserts variant count by flag.** + +In `server/src/listeners/order_book/utils.rs`, at the end of the file, add a new top-level `#[cfg(test)]` module (top-level so that the `cargo test` filter path in Step 5 matches): + +```rust +#[cfg(test)] +mod compute_l2_snapshots_tests { + use super::*; + use crate::order_book::multi_book::OrderBooks; + use crate::types::inner::InnerL4Order; + + #[test] + fn ws_enabled_emits_seven_variants_per_coin() { + let order_books: OrderBooks = OrderBooks::default(); + let snapshots = compute_l2_snapshots(&order_books, /* enable_websocket = */ true); + for (_coin, variants) in snapshots.as_ref() { + assert_eq!(variants.len(), 7, "WS-enabled config must emit all 7 variants"); + } + } + + #[test] + fn ws_disabled_emits_one_variant_per_coin() { + let order_books: OrderBooks = OrderBooks::default(); + let snapshots = compute_l2_snapshots(&order_books, /* enable_websocket = */ false); + for (_coin, variants) in snapshots.as_ref() { + assert_eq!(variants.len(), 1, "WS-disabled config must emit only the unbucketed variant"); + assert!(variants.contains_key(&L2SnapshotParams { n_sig_figs: None, mantissa: None })); + } + } +} +``` + +(If `OrderBooks::default()` doesn't exist, build the books from the existing test fixtures. Either way, populate the books with at least one coin that has resting orders before calling `compute_l2_snapshots`: with an empty `OrderBooks` the `for` loops never execute and both tests pass vacuously. Also assert that the returned snapshot map is non-empty so the tests cannot silently degrade to no-ops.) + +- [ ] **Step 5: Run the new unit tests.** + +```bash +cargo test -p server --lib listeners::order_book::utils::compute_l2_snapshots_tests +``` + +Expected: **PASS**.
+ +- [ ] **Step 6: Commit.** + +```bash +git add server/src/listeners/order_book/utils.rs server/src/listeners/order_book/state.rs +git commit -m "perf: skip 6 bucketed l2 variants when websocket is disabled" +``` + +### Task 4.3: Parity sanity — run goldens in both flag configurations + +**Files:** +- Modify: `server/src/listeners/order_book/block_mode_multicast_e2e.rs` or sibling (depending on existing harness) + +- [ ] **Step 1: Inspect the existing parity test harness.** + +```bash +grep -n "fn dual_validator_fixture_matches_block_and_stream_goldens\|fn block_mode_multicast_e2e\|enable_websocket" server/src/listeners/order_book/block_mode_multicast_e2e.rs server/src/listeners/order_book/parity_tests.rs +``` + +Identify how the test currently constructs the listener / state. + +- [ ] **Step 2: Add a `#[test]` that re-runs the existing block-mode golden check with `enable_websocket=true` AND a second invocation with `enable_websocket=false`.** + +Concretely, refactor the existing golden assertion into a function that takes `enable_websocket: bool`, then have two `#[test]` functions call it. The TOB output is BBO-derived and should be identical across both flag settings. + +Example skeleton (adapt to existing harness): + +```rust +fn run_block_mode_golden_with_flag(enable_websocket: bool) { + // ... existing harness body, parameterizing the OrderBookState/OrderBookListener + // construction to pass `enable_websocket` ... +} + +#[test] +fn block_mode_multicast_e2e_websocket_enabled() { + run_block_mode_golden_with_flag(true); +} + +#[test] +fn block_mode_multicast_e2e_websocket_disabled() { + run_block_mode_golden_with_flag(false); +} +``` + +Do the same for `dual_validator_fixture_matches_block_and_stream_goldens`. 
+ +- [ ] **Step 3: Run both tests.** + +```bash +cargo test -p server listeners::order_book::block_mode_multicast_e2e -- --nocapture +cargo test -p server dual_validator_fixture_matches_block_and_stream_goldens -- --nocapture +``` + +Expected: both **PASS** for both flag values. TOB output is invariant. + +- [ ] **Step 4: Commit.** + +```bash +git add server/src/listeners/order_book/block_mode_multicast_e2e.rs server/src/listeners/order_book/parity_tests.rs +git commit -m "test: run multicast e2e goldens with --enable-websocket on and off" +``` + +### Task 4.4: Re-measure perf after L2-3 + +- [ ] **Step 1: Capture perf top with `--enable-websocket` off (the new default).** Append to baseline doc under "After L2-3 (drop bucketed variants)". Expected: dominant cost in `map_to_l2_levels` should drop sharply; this is the biggest single win. + +--- + +## Phase 5: L2-5 — Tie streaming snapshot emission to block finalization + +### Task 5.1: Add a `book_dirty` flag to `OrderBookState` + +**Files:** +- Modify: `server/src/listeners/order_book/state.rs` + +- [ ] **Step 1: Add the field and accessor methods.** + +In `server/src/listeners/order_book/state.rs`, struct definition (line 16): + +```rust +pub(super) struct OrderBookState { + order_book: OrderBooks, + height: u64, + time: u64, + snapped: bool, + ignore_spot: bool, + enable_websocket: bool, + /// Set to true on any apply that may have changed the published L2/BBO + /// state. Cleared whenever a snapshot is emitted. Used by streaming-mode + /// finalization to decide whether to compute a snapshot for the closing + /// block. + book_dirty: bool, + dob_tap: Option, +} +``` + +In the `Clone` impl, add `book_dirty: self.book_dirty`. In `from_snapshot`, initialize `book_dirty: false`. 
+ +Add accessors: + +```rust + pub(super) const fn book_dirty(&self) -> bool { + self.book_dirty + } + + pub(super) fn mark_book_dirty(&mut self) { + self.book_dirty = true; + } + + pub(super) fn clear_book_dirty(&mut self) { + self.book_dirty = false; + } +``` + +- [ ] **Step 2: Set `book_dirty = true` in `apply_stream_diff` on successful mutations.** + +In `server/src/listeners/order_book/state.rs`, function `apply_stream_diff` (line 264). Set `self.book_dirty = true;` on every path that actually mutated the resting book (New with inserted resting, Update that modified sz, Remove that cancelled). Simplest correct placement: at the end of the function, right before the final `Ok(true)` at line 344, which all successful mutation branches reach. + +Concretely, replace the function ending: + +```rust + self.height = self.height.max(block_number); + self.time = block_time_ms; + self.snapped = false; + self.book_dirty = true; + Ok(true) + } +``` + +For soft-tolerance branches (missing status / missing order), do **not** set `book_dirty`, even when they advance height — those paths do not mutate the resting book. + +Also update `apply_updates` (block-mode batched apply, line ~140 in the same file) the same way: set `book_dirty = true` after a successful mutation. Block mode currently snapshots per chunk so this is benign; streaming-mode will use it. + +- [ ] **Step 3: Run tests.** + +```bash +cargo test -p server +``` + +Expected: all tests **PASS**. No behavioral change yet (flag is set but not consumed).
+ +- [ ] **Step 4: Commit.** + +```bash +git add server/src/listeners/order_book/state.rs +git commit -m "feat: add book_dirty flag to OrderBookState, set on apply mutations" +``` + +### Task 5.2: Skip per-chunk snapshot in streaming mode + +**Files:** +- Modify: `server/src/listeners/order_book/mod.rs:1855-1890` + +- [ ] **Step 1: Wrap the per-chunk snapshot emission in a non-streaming guard.** + +In `server/src/listeners/order_book/mod.rs`, around line 1852–1855: + +```rust + if self.ingest_mode == IngestMode::Stream && event_source == EventSource::Fills { + return Ok(()); + } + + // In streaming mode, snapshots are emitted by finalize_stream_block, + // not per chunk. The dirty flag is set by apply_stream_diff and + // consumed at block finalization. Skip the per-chunk compute here. + if self.ingest_mode == IngestMode::Stream { + return Ok(()); + } + + let snapshot_source = ingest_source_label(event_source); + let snapshot_start = Instant::now(); + let snapshot = self.l2_snapshots(true); + // ... rest unchanged ... +``` + +- [ ] **Step 2: Run streaming mode tests — some are expected to FAIL because no snapshot is being emitted yet.** + +```bash +cargo test -p server stream -- --nocapture +``` + +Expected: streaming tests that assert on TOB snapshot emission **FAIL**. This is intentional — we'll re-route emission via `finalize_stream_block` in the next task. + +If a non-streaming test fails, investigate immediately — block mode should be untouched. + +- [ ] **Step 3: Do NOT commit yet.** This task is part of a multi-step change. The next task restores correctness. + +### Task 5.3: Emit snapshot from `finalize_stream_block` when book is dirty + +**Files:** +- Modify: `server/src/listeners/order_book/mod.rs:1028-1042` (and possibly call site at line 1248) + +- [ ] **Step 1: Extract the snapshot enqueue logic into a helper.** + +In `server/src/listeners/order_book/mod.rs`, currently lines 1855–1889 build and send an `InternalMessage::Snapshot`. 
Extract a method on `OrderBookListener`: + +```rust + fn emit_tob_snapshot(&mut self, source_label: &'static str, source_block_time_ms: u64, source_local_time_ms: u64) { + let snapshot_start = Instant::now(); + let snapshot = self.l2_snapshots(true); + crate::metrics::observe_tob_snapshot_compute(source_label, snapshot_start.elapsed()); + if let Some(snapshot) = snapshot { + let snapshot_height = self.order_book_state.as_ref().map(OrderBookState::height).unwrap_or(0); + let latest_heights = self.ingest_heights(); + crate::metrics::observe_tob_snapshot_enqueue_lag( + source_label, + Duration::from_millis(now_ms().saturating_sub(snapshot.0)), + ); + crate::metrics::observe_tob_snapshot_source_block_lag( + source_label, + Duration::from_millis(source_block_time_ms.saturating_sub(snapshot.0)), + ); + crate::metrics::observe_tob_snapshot_validator_write_lag( + source_label, + Duration::from_millis(source_local_time_ms.saturating_sub(source_block_time_ms)), + ); + if let Some(tx) = &self.internal_message_tx { + let enqueued_at_ms = now_ms(); + let snapshot_msg = Arc::new(InternalMessage::Snapshot { + l2_snapshots: snapshot.1, + time: snapshot.0, + height: snapshot_height, + source: source_label, + source_block_time_ms, + source_local_time_ms, + latest_heights, + enqueued_at_ms, + }); + let _unused = tx.send(snapshot_msg); + } + if let Some(state) = self.order_book_state.as_mut() { + state.clear_book_dirty(); + } + } + } +``` + +Update the existing block-mode snapshot site at line 1855–1889 to call `self.emit_tob_snapshot(snapshot_source, source_block_time_ms, source_local_time_ms);` instead of inlining (preserving `last_source_times.unwrap_or((snapshot.0, snapshot.0))` for the `(block_time, local_time)` defaults). 
+ +- [ ] **Step 2: Call `emit_tob_snapshot` from `finalize_stream_block` when dirty.** + +In `server/src/listeners/order_book/mod.rs`, `finalize_stream_block` (line 1028): + +```rust + fn finalize_stream_block(&mut self, height: u64, block: StreamingBlock, mode: StreamFinalizationMode) { + if block.boundary_open + && let (Some(state), Some(block_time_ms)) = (self.order_book_state.as_mut(), block.block_time_ms) + { + state.emit_batch_boundary(1, height, block_time_ms); + } + if let Some(last_received_at) = block.last_received_at { + crate::metrics::observe_stream_finalization_lag(mode.label(), last_received_at.elapsed()); + } + match mode { + StreamFinalizationMode::Watermark => self.streaming_state.finalized_watermark_blocks += 1, + StreamFinalizationMode::GraceFallback => self.streaming_state.finalized_grace_blocks += 1, + } + self.streaming_state.finalized_height = Some(height); + + // Streaming snapshot emission: tied to block finalization (L2-5). + let dirty = self.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty); + if dirty { + let block_time_ms = block.block_time_ms.unwrap_or(0); + let local_time_ms = block.local_time_ms.unwrap_or(block_time_ms); + self.emit_tob_snapshot( + ingest_source_label(EventSource::OrderDiffs), + block_time_ms, + local_time_ms, + ); + } + } +``` + +- [ ] **Step 3: Run streaming tests — they should now pass.** + +```bash +cargo test -p server stream -- --nocapture +cargo test -p server dual_validator_fixture_matches_block_and_stream_goldens -- --nocapture +``` + +Expected: **PASS**. If `dual_validator_fixture` fails with a packet-sequence mismatch, the golden may need regenerating against the new (per-block, not per-chunk) emission cadence. Verify this is the only diff and capture the new golden — see Task 5.6 for the explicit parity-test contract. 
+ +- [ ] **Step 4: Run full suite.** + +```bash +cargo test --workspace +``` + +- [ ] **Step 5: Commit Tasks 5.2 + 5.3 together as the L2-5 core change.** + +```bash +git add server/src/listeners/order_book/mod.rs +git commit -m "perf: emit streaming l2 snapshot at block finalization, not per chunk" +``` + +### Task 5.4: Stuck-stream backstop + +**Files:** +- Modify: `server/src/listeners/order_book/mod.rs` (the periodic ticker block around line 255–278) + +- [ ] **Step 1: Add a backstop emission on the periodic 60s ticker.** + +In `server/src/listeners/order_book/mod.rs`, the ticker arm of the `select!` at line 255–278 currently does liveness check + latency report + snapshot fetch. Add a backstop: + +```rust + _ = ticker.tick() => { + // ... existing liveness + latency + progress + health ... + + // Stuck-stream backstop (L2-5): if the book has been dirty + // longer than STREAM_DIRTY_BACKSTOP_INTERVAL and no block has finalized, + // emit a snapshot from the latest applied diff. Prevents quote + // staleness during prolonged grace_fallback. + { + let mut guard = listener.lock().await; + if guard.ingest_mode == IngestMode::Stream + && guard.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty) + { + let elapsed = guard.last_dirty_emit_at.map_or(Duration::MAX, |t| t.elapsed()); + if elapsed >= STREAM_DIRTY_BACKSTOP_INTERVAL { + if let (Some(bt), Some(lt)) = (guard.last_batch_block_time_ms, guard.last_batch_local_time_ms) { + guard.emit_tob_snapshot( + ingest_source_label(EventSource::OrderDiffs), + bt, + lt, + ); + guard.last_dirty_emit_at = Some(Instant::now()); + } + } + } + } + + // ... existing snapshot fetch ... + } +``` + +Add to `OrderBookListener`: + +```rust + last_dirty_emit_at: Option<Instant>, +``` + +Initialize to `None` in `new_with_ingest_mode`. Set it inside `emit_tob_snapshot` when streaming mode is active. 
+ +Add a const near other tuning constants in the file: + +```rust +const STREAM_DIRTY_BACKSTOP_INTERVAL: Duration = Duration::from_secs(5); +``` + +- [ ] **Step 2: Run tests.** + +```bash +cargo test -p server +``` + +Expected: all tests **PASS**. + +- [ ] **Step 3: Commit.** + +```bash +git add server/src/listeners/order_book/mod.rs +git commit -m "feat: add stuck-stream snapshot backstop for prolonged grace_fallback" +``` + +### Task 5.5a: Add streaming test helpers + +**Files:** +- Modify: `server/src/listeners/order_book/mod.rs` (add test-only constructor + ticker hook) + +- [ ] **Step 1: Add a streaming-mode test constructor.** + +In `server/src/listeners/order_book/mod.rs`, near the existing `for_test_with_snapshot` at line 1368: + +```rust + /// Test-only: like `for_test_with_snapshot` but in streaming mode and + /// with an attached `market_message_tx` so tests can capture emitted + /// `InternalMessage::Snapshot` messages. + #[cfg(test)] + pub(crate) fn for_test_streaming_with_snapshot( + snapshot: Snapshots, + height: u64, + ) -> (Self, tokio::sync::broadcast::Receiver<Arc<InternalMessage>>) { + let (tx, rx) = tokio::sync::broadcast::channel::<Arc<InternalMessage>>(256); + let mut listener = Self::new_with_ingest_mode(Some(tx), false, IngestMode::Stream); + listener.complete_stream_startup_sync_for_test(); + listener.init_from_snapshot(snapshot, height); + (listener, rx) + } +``` + +- [ ] **Step 2: Add a test-only "fire the backstop tick" helper.** + +To support the stuck-stream backstop test without sleeping, add a method that mimics what the periodic ticker arm does for the backstop check: + +```rust + /// Test-only: forces a backstop snapshot emission if the book is dirty, + /// bypassing the wall-clock interval check. Mirrors the production + /// ticker arm's backstop logic. 
+ #[cfg(test)] + pub(crate) fn fire_stream_dirty_backstop_for_test(&mut self) { + if self.ingest_mode == IngestMode::Stream + && self.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty) + && let (Some(bt), Some(lt)) = (self.last_batch_block_time_ms, self.last_batch_local_time_ms) + { + self.emit_tob_snapshot(ingest_source_label(EventSource::OrderDiffs), bt, lt); + self.last_dirty_emit_at = Some(Instant::now()); + } + } +``` + +- [ ] **Step 3: Run existing tests to verify no regressions.** + +```bash +cargo test -p server +``` + +Expected: all tests **PASS**. + +- [ ] **Step 4: Commit.** + +```bash +git add server/src/listeners/order_book/mod.rs +git commit -m "test: add streaming-mode listener test helpers (constructor, backstop hook)" +``` + +### Task 5.5b: Tests for L2-5 behaviors + +**Files:** +- Create: `server/src/listeners/order_book/stream_finalization_tests.rs` +- Modify: `server/src/listeners/order_book/mod.rs` (one-line `mod` registration) + +- [ ] **Step 1: Register the new test module.** + +In `server/src/listeners/order_book/mod.rs`, near the existing test module registrations (search for `#[cfg(test)] mod parity_tests`), add: + +```rust +#[cfg(test)] +mod stream_finalization_tests; +``` + +- [ ] **Step 2: Create the test file with all four required tests.** + +Create `server/src/listeners/order_book/stream_finalization_tests.rs`: + +```rust +//! L2-5 tests: streaming snapshot emission tied to block finalization. +//! +//! See docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md +//! "L2-5 — Required tests" for the contract these tests assert. 
+ +#![cfg(test)] + +use std::collections::HashMap; +use std::sync::Arc; + +use alloy::primitives::Address; +use chrono::NaiveDateTime; +use tokio::sync::broadcast::Receiver; + +use crate::listeners::order_book::utils::EventBatch; +use crate::listeners::order_book::{InternalMessage, L2SnapshotParams, OrderBookListener}; +use crate::order_book::multi_book::Snapshots; +use crate::order_book::{Coin, OrderBook, Px, Side, Snapshot, Sz}; +use crate::types::inner::InnerL4Order; +use crate::types::node_data::{Batch, NodeDataOrderDiff, NodeDataOrderStatus}; +use crate::types::{L4Order, OrderDiff}; + +const TEST_COIN: &str = "BTC"; + +fn dt_from_ms(block_time_ms: u64) -> NaiveDateTime { + let secs = (block_time_ms / 1_000) as i64; + let nsecs = ((block_time_ms % 1_000) * 1_000_000) as u32; + chrono::DateTime::<chrono::Utc>::from_timestamp(secs, nsecs).expect("valid timestamp").naive_utc() +} + +fn seed_snapshot() -> Snapshots { + // Pre-seed with one ask far above any test BBO so the universe is non-empty + // and the listener accepts the snapshot. We do not cancel it in tests; it + // simply sits above all test prices and does not interfere with BBO. 
+ let coin = Coin::new(TEST_COIN); + let mut book: OrderBook<InnerL4Order> = OrderBook::new(); + book.add_order(InnerL4Order { + user: Address::new([0; 20]), + coin: coin.clone(), + side: Side::Ask, + limit_px: Px::parse_from_str("99999").expect("valid px"), + sz: Sz::parse_from_str("1").expect("valid sz"), + oid: 9_000, + timestamp: 0, + trigger_condition: String::new(), + is_trigger: false, + trigger_px: String::new(), + is_position_tpsl: false, + reduce_only: false, + order_type: String::new(), + tif: None, + cloid: None, + }); + let mut map: HashMap<Coin, Snapshot<InnerL4Order>> = HashMap::new(); + map.insert(coin, book.to_snapshot()); + Snapshots::new(map) +} + +fn add_event( + block_time_ms: u64, + side: Side, + oid: u64, + px: &str, + sz: &str, +) -> (NodeDataOrderStatus, NodeDataOrderDiff) { + let user = Address::new([0; 20]); + let l4 = L4Order { + user: Some(user), + coin: TEST_COIN.to_string(), + side, + limit_px: px.to_string(), + sz: sz.to_string(), + oid, + timestamp: block_time_ms, + trigger_condition: String::new(), + is_trigger: false, + trigger_px: String::new(), + is_position_tpsl: false, + reduce_only: false, + order_type: String::new(), + tif: None, + cloid: None, + }; + let status = NodeDataOrderStatus { + time: dt_from_ms(block_time_ms), + user, + status: "open".to_string(), + order: l4, + }; + let diff = NodeDataOrderDiff::new_for_test( + user, + oid, + px.to_string(), + TEST_COIN.to_string(), + OrderDiff::New { sz: sz.to_string() }, + ); + (status, diff) +} + +fn update_diff(oid: u64, px: &str, orig_sz: &str, new_sz: &str) -> NodeDataOrderDiff { + NodeDataOrderDiff::new_for_test( + Address::new([0; 20]), + oid, + px.to_string(), + TEST_COIN.to_string(), + OrderDiff::Update { orig_sz: orig_sz.to_string(), new_sz: new_sz.to_string() }, + ) +} + +fn feed_block( + listener: &mut OrderBookListener, + height: u64, + block_time_ms: u64, + statuses: Vec<NodeDataOrderStatus>, + diffs: Vec<NodeDataOrderDiff>, +) { + listener + 
.receive_batch(EventBatch::Orders(Batch::new_for_test(height, block_time_ms, statuses))) + 
.expect("statuses batch applies"); + listener + .receive_batch(EventBatch::BookDiffs(Batch::new_for_test(height, block_time_ms, diffs))) + .expect("diffs batch applies"); +} + +/// Drains the snapshot receiver and returns only `InternalMessage::Snapshot` +/// messages in receive order. Non-snapshot messages are skipped. +fn drain_snapshots(rx: &mut Receiver<Arc<InternalMessage>>) -> Vec<Arc<InternalMessage>> { + let mut snapshots = Vec::new(); + while let Ok(msg) = rx.try_recv() { + if matches!(msg.as_ref(), InternalMessage::Snapshot { .. }) { + snapshots.push(msg); + } + } + snapshots +} + +/// Extracts the best bid `Px::value()` from an `InternalMessage::Snapshot`. +fn snapshot_best_bid(msg: &InternalMessage) -> Option<u64> { + let InternalMessage::Snapshot { l2_snapshots, .. } = msg else { return None }; + let coin = Coin::new(TEST_COIN); + let params_map = l2_snapshots.as_ref().get(&coin)?; + let unbucketed = params_map.get(&L2SnapshotParams::new(None, None))?; + let levels = unbucketed.clone().truncate(1).export_inner_snapshot(); + levels[0].first().map(|level| level.px().value()) +} + +fn snapshot_height(msg: &InternalMessage) -> u64 { + let InternalMessage::Snapshot { height, .. } = msg else { panic!("not a Snapshot") }; + *height +} + +/// L2-5 test 1: two-diff-in-block — the second diff moves BBO; finalization +/// must emit exactly one snapshot carrying the *final* BBO. +#[tokio::test(flavor = "current_thread")] +async fn two_diffs_in_block_emit_final_bbo_once() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Block 2: add bid @ 100, then add a higher bid @ 105 (moves BBO). 
+ let (status_a, diff_a) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + let (status_b, diff_b) = add_event(1_700_000_002_000, Side::Bid, 102, "105", "3"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![status_a, status_b], vec![diff_a, diff_b]); + + // Block 3: trigger the watermark so block 2 finalizes. 
+ let (status_c, diff_c) = add_event(1_700_000_003_000, Side::Bid, 103, "90", "1"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![status_c], vec![diff_c]); + + let snapshots = drain_snapshots(&mut rx); + let block_2_snapshots: Vec<_> = snapshots.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert_eq!(block_2_snapshots.len(), 1, "exactly one snapshot for height 2; got {}", block_2_snapshots.len()); + + let final_best_bid = snapshot_best_bid(block_2_snapshots[0]).expect("BBO exists"); + let expected = Px::parse_from_str("105").unwrap().value(); + assert_eq!(final_best_bid, expected, "snapshot for height 2 must carry the final BBO (105), got {final_best_bid}"); +} + +/// L2-5 test 2: no spurious emission — feeding a block with no diffs that +/// reach the book must not produce a snapshot for that height. +#[tokio::test(flavor = "current_thread")] +async fn block_without_mutations_emits_no_snapshot() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Block 2: empty statuses, empty diffs. No mutations → book_dirty stays false. + feed_block(&mut listener, 2, 1_700_000_002_000, vec![], vec![]); + + // Block 3: again empty. + feed_block(&mut listener, 3, 1_700_000_003_000, vec![], vec![]); + + let snapshots = drain_snapshots(&mut rx); + let height_2_snapshots: Vec<_> = snapshots.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert!(height_2_snapshots.is_empty(), "no snapshot should be emitted for a clean block; got {}", height_2_snapshots.len()); +} + +/// L2-5 test 3: finalization-emission ordering — BBO-changing diffs across +/// blocks N, N+1, N+2 produce snapshots in monotonic height order, one each. 
+#[tokio::test(flavor = "current_thread")] +async fn finalized_blocks_emit_snapshots_in_order() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + + let (s3, d3) = add_event(1_700_000_003_000, Side::Bid, 102, "110", "5"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![s3], vec![d3]); + + let (s4, d4) = add_event(1_700_000_004_000, Side::Bid, 103, "120", "5"); + feed_block(&mut listener, 4, 1_700_000_004_000, vec![s4], vec![d4]); + + // Trigger finalization of block 4 by feeding block 5. + let (s5, d5) = add_event(1_700_000_005_000, Side::Bid, 104, "121", "1"); + feed_block(&mut listener, 5, 1_700_000_005_000, vec![s5], vec![d5]); + + let snapshots = drain_snapshots(&mut rx); + let heights: Vec<u64> = snapshots.iter().map(|m| snapshot_height(m)).collect(); + + assert!(heights.windows(2).all(|w| w[0] < w[1]), "heights must be strictly increasing; got {heights:?}"); + assert!(heights.contains(&2), "expected snapshot for height 2"); + assert!(heights.contains(&3), "expected snapshot for height 3"); + assert!(heights.contains(&4), "expected snapshot for height 4"); + + let count_each = |h: u64| heights.iter().filter(|&&x| x == h).count(); + assert_eq!(count_each(2), 1); + assert_eq!(count_each(3), 1); + assert_eq!(count_each(4), 1); +} + +/// L2-5 test 4: stuck-stream backstop — if no later block arrives, the +/// backstop hook (firing what the production ticker fires on a 5s interval) +/// emits one snapshot for the dirty block. 
+#[tokio::test(flavor = "current_thread")] +async fn stuck_stream_backstop_emits_dirty_snapshot() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + + // No block 3 arrives. Without the backstop, no snapshot would emit. + let before_backstop = drain_snapshots(&mut rx); + assert!( + before_backstop.is_empty(), + "without finalization there should be no snapshot yet; got {} snapshot(s)", + before_backstop.len(), + ); + + // Fire the backstop manually (production fires this on a 5s ticker). + listener.fire_stream_dirty_backstop_for_test(); + + let after_backstop = drain_snapshots(&mut rx); + assert_eq!(after_backstop.len(), 1, "backstop must emit exactly one snapshot"); + assert_eq!(snapshot_height(&after_backstop[0]), 2, "backstop snapshot must carry the last applied height"); +} +``` + +- [ ] **Step 3: Run all four tests.** + +```bash +cargo test -p server stream_finalization_tests -- --nocapture +``` + +Expected: all four **PASS**. + +If `two_diffs_in_block_emit_final_bbo_once` fails with "exactly one snapshot for height 2; got 0", verify that `apply_stream_diff` is setting `book_dirty = true` (Task 5.1 Step 2). If `block_without_mutations_emits_no_snapshot` fails with a non-empty snapshot list, verify that `apply_updates` / `apply_stream_diff` does not set `book_dirty` on a no-op apply. 
+ +- [ ] **Step 4: Commit.** + +```bash +git add server/src/listeners/order_book/stream_finalization_tests.rs server/src/listeners/order_book/mod.rs +git commit -m "test: l2-5 streaming snapshot finalization tests (two-diff, no-spurious, ordering, backstop)" +``` + +### Task 5.6: Quote-sequence parity test (block vs stream) + +**Files:** +- Modify: `server/src/listeners/order_book/parity_tests.rs` (extend existing dual-validator test) + +- [ ] **Step 1: Locate the existing dual-validator fixture loader.** + +Inspect how `dual_validator_fixture_matches_block_and_stream_goldens` constructs its block-mode and stream-mode listeners. It already replays the fixture in both modes; the existing test compares final book state and golden packet bytes. We'll add a sibling that compares the ordered snapshot sequence. + +```bash +grep -n "dual_validator_fixture_matches_block_and_stream_goldens\|fn replay\|fn drive_listener" server/src/listeners/order_book/parity_tests.rs server/src/listeners/order_book/block_mode_multicast_e2e.rs +``` + +Identify the function that loads the fixture and drives the listener through it. Note its signature — typically it takes the listener and a path or in-memory event list and returns after all events have been applied. + +- [ ] **Step 2: Add a sibling helper that captures the ordered snapshot sequence.** + +Add to the same test file as the existing dual-validator test (likely `parity_tests.rs`): + +```rust +/// (height, coin_name, best_bid_value, best_ask_value) tuple per emitted +/// snapshot. Sorted-by-coin within a height for stable comparison since +/// the snapshot's internal HashMap iteration order is non-deterministic. +#[derive(Debug, PartialEq, Eq, Clone)] +struct QuoteRow { + height: u64, + coin: String, + best_bid: Option<u64>, + best_ask: Option<u64>, +} + +fn extract_quote_rows(msg: &InternalMessage) -> Vec<QuoteRow> { + let InternalMessage::Snapshot { l2_snapshots, height, .. 
} = msg else { return vec![] }; + let mut rows = Vec::new(); + for (coin, params_map) in l2_snapshots.as_ref() { + let Some(unbucketed) = params_map.get(&L2SnapshotParams::new(None, None)) else { continue }; + let levels = unbucketed.clone().truncate(1).export_inner_snapshot(); + let best_bid = levels[0].first().map(|l| l.px().value()); + let best_ask = levels[1].first().map(|l| l.px().value()); + rows.push(QuoteRow { height: *height, coin: coin.value().to_string(), best_bid, best_ask }); + } + rows.sort_by(|a, b| a.coin.cmp(&b.coin)); + rows +} + +/// Drives the fixture through a listener in the given mode and returns the +/// flattened ordered list of QuoteRow per emitted snapshot. +fn replay_fixture_and_capture_quote_rows(ingest_mode: IngestMode) -> Vec<QuoteRow> { + let (tx, mut rx) = tokio::sync::broadcast::channel::<Arc<InternalMessage>>(4096); + // Build a listener in `ingest_mode` using the same fixture loader the + // existing dual-validator parity test uses. Pseudocode (adapt to the + // helper name discovered in Step 1): + let mut listener = build_dual_validator_listener_for_test(ingest_mode, Some(tx.clone())); + drive_dual_validator_fixture(&mut listener, ingest_mode); + if ingest_mode == IngestMode::Stream { + listener.finalize_streaming_for_test().expect("finalize streaming"); + } + drop(tx); // close so try_recv terminates + + let mut all_rows = Vec::new(); + while let Ok(msg) = rx.try_recv() { + all_rows.extend(extract_quote_rows(&msg)); + } + all_rows +} + +/// L2-5 quote-sequence parity: with finalization-driven snapshot emission, +/// the ordered (per-height, per-coin) BBO sequence in block-mode replay must +/// equal the sequence in stream-mode replay of the same fixture. 
+#[test] +fn block_and_stream_emit_identical_tob_quote_sequence() { + let block_rows = replay_fixture_and_capture_quote_rows(IngestMode::Block); + let stream_rows = replay_fixture_and_capture_quote_rows(IngestMode::Stream); + + assert_eq!( + block_rows.len(), + stream_rows.len(), + "snapshot row counts differ: block={} stream={}", + block_rows.len(), + stream_rows.len(), + ); + // Compare element-by-element so a mismatch points at the first diverging + // height/coin/BBO instead of dumping two giant vectors. + for (i, (b, s)) in block_rows.iter().zip(stream_rows.iter()).enumerate() { + assert_eq!(b, s, "row {i} diverges between block and stream: block={b:?} stream={s:?}"); + } +} +``` + +`build_dual_validator_listener_for_test` and `drive_dual_validator_fixture` are the existing helpers (or close adaptations of them) located in Step 1. If the existing test inlines these, refactor it to expose them as separate functions before adding the new test — the refactor is a separate commit. + +- [ ] **Step 1b: If the helpers don't exist, refactor first.** + +If the existing `dual_validator_fixture_matches_block_and_stream_goldens` test inlines its fixture-driving logic, extract `build_dual_validator_listener_for_test(mode, tx)` and `drive_dual_validator_fixture(listener, mode)` as `#[cfg(test)] fn` helpers above the existing test. Run the existing test to verify the refactor is behavior-preserving. Commit the refactor before adding the new quote-sequence test. + +```bash +cargo test -p server dual_validator_fixture_matches_block_and_stream_goldens -- --nocapture +``` + +Expected: still **PASS** after the refactor. + +```bash +git add server/src/listeners/order_book/parity_tests.rs +git commit -m "refactor: extract dual-validator fixture driver helpers for reuse" +``` + +- [ ] **Step 3: Run the test.** + +```bash +cargo test -p server block_and_stream_emit_identical_tob_quote_sequence -- --nocapture +``` + +Expected: **PASS**. 
If it fails, capture the first diverging quote and investigate — most likely either the fixture has an intra-chunk emission that should be coalesced, or the stream finalization timing differs in a way that affects ordering. + +- [ ] **Step 4: Commit.** + +```bash +git add server/src/listeners/order_book/parity_tests.rs +git commit -m "test: assert block and stream emit identical tob quote sequences post-l2-5" +``` + +### Task 5.7: Re-measure perf after L2-5 + +- [ ] **Step 1: Capture perf top.** Append to baseline doc under "After L2-5 (finalization-driven streaming snapshot)". Expected: streaming-specific CPU drops further; the multiplier between validator chattiness and snapshot work is eliminated. + +- [ ] **Step 2: Compare cumulative CPU% reduction to the ≥40% success criterion.** + +If <40%, evaluate whether L2-2 (per-level aggregate cache) is needed. If ≥40%, L2-2 can remain deferred indefinitely. + +--- + +## Phase 6: Documentation + +### Task 6.1: Update README + +**Files:** +- Modify: `README.md` + +- [ ] **Step 1: Update the "Setup" section.** + +Find the existing `cargo run --release --bin dz_hl_publisher -- --address 0.0.0.0 --port 8000` example. Add a note that the default mode is multicast-only and that `--address`/`--port` are only used when `--enable-websocket` is set. + +```markdown +The publisher is **multicast-only by default**. The WebSocket listener is +disabled unless `--enable-websocket` is passed. In multicast-only mode, +`--address` and `--port` are accepted but ignored (a startup log line +confirms the mode). 
To run with WebSocket subscriptions enabled (today's +behavior): + +```bash +cargo run --release --bin dz_hl_publisher -- \ + --address 0.0.0.0 --port 8000 \ + --enable-websocket +``` +``` + +- [ ] **Step 2: Add a CLI flag row for `--enable-websocket` in the multicast/CLI arguments table if one exists.** + +- [ ] **Step 3: Commit.** + +```bash +git add README.md +git commit -m "docs: document --enable-websocket default and behavior" +``` + +### Task 6.2: Update CHANGELOG / release notes + +**Files:** +- Modify or create: `CHANGELOG.md` + +- [ ] **Step 1: Add an entry calling out the default-behavior change.** + +```markdown +## Unreleased + +### Breaking changes + +- WebSocket listener is now **disabled by default**. Pass `--enable-websocket` + to restore the previous behavior. The multicast publisher path (TOB/DoB) is + unaffected. This reduces L2 snapshot CPU cost in production deployments + that only use multicast. + +### Performance + +- Streaming-mode CPU reduced (target ≥40%) by: + - replacing `Px::num_digits()` f64 log10 with integer `u64::ilog10()` + - skipping bucketed L2 snapshot variants when `--enable-websocket` is off + - tying streaming L2 snapshot emission to block finalization instead of + every file-read chunk + - pre-sizing L2 level Vecs to avoid reallocation growth + +### Fixes + +- `Px::num_digits()` no longer mis-reports an extra digit for values just + below large powers of ten (latent f64 imprecision bug). +``` + +- [ ] **Step 2: Commit.** + +```bash +git add CHANGELOG.md +git commit -m "docs: changelog for streaming cpu reduction and ws default-off" +``` + +--- + +## Phase 7: Ship + +### Task 7.1: Final canary verification + +- [ ] **Step 1: Deploy the branch to canary; run with default (no `--enable-websocket`).** + +- [ ] **Step 2: Run for ≥1h. 
Capture:** + - perf top final + - `top -p ...` CPU% average + - `orderbook_tob_snapshot_compute` histograms + - `orderbook_tob_suppressed_total` delta + - `orderbook_dob_channel_drops_total` delta (must remain 0) + +- [ ] **Step 3: Append to baseline doc as the "Ship verification" section.** + +- [ ] **Step 4: Verify all success criteria from the spec are met.** + +If yes, open the PR. If a criterion isn't met, document the gap and decide on L2-2 vs alternative. + +### Task 7.2: Rollback plan documented + +- [ ] **Step 1: Confirm `--enable-websocket` is the rollback path.** + +If any regression appears after merge, operations can pass `--enable-websocket` to restore today's behavior without a code revert. The flag fully re-enables WS and all 7 L2 snapshot variants. + +If a regression is in the streaming-finalization snapshot path (L2-5) specifically and `--enable-websocket` doesn't help, the rollback is a `git revert` of the L2-5 commits in Phase 5 (Tasks 5.2 / 5.3 / 5.4 / 5.5a / 5.5b / 5.6). The other phases (L2-1, L2-4, Phase 3 plumbing, L2-3) can stand alone. + +--- + +## Notes + +- **L2-2 is intentionally not in this plan.** Per the spec, it stays deferred + until L2-1 / L2-3 / L2-4 / L2-5 have landed and been re-measured. If + cumulative CPU reduction is below 40%, open a follow-up plan for L2-2; + otherwise leave it alone. +- **Followups (not in scope here):** per-diff batch explosion in + `receive_stream_diffs` (mod.rs:1088), `tokio::spawn`-per-diff in + `publish_l4_update` (mod.rs:1259), DoB tap mutex (dob_tap.rs), grace-fallback + warn-log rate limiting. These are <5% each per the perf data and can be + scoped as separate plans if/when warranted. 
From ae3bb49e1a4d77cd563106248d7abbdbd3d12770 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 14:09:29 -0400 Subject: [PATCH 04/65] plan: address codex findings on plan v1 (recovery, backstop, dirty placement) --- .../2026-05-15-streaming-cpu-reduction.md | 451 +++++++++++++++++- 1 file changed, 434 insertions(+), 17 deletions(-) diff --git a/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md b/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md index f03ac56d..d038b1ad 100644 --- a/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md +++ b/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md @@ -931,24 +931,82 @@ Add accessors: } ``` -- [ ] **Step 2: Set `book_dirty = true` in `apply_stream_diff` on successful mutations.** +- [ ] **Step 2: Set `book_dirty = true` only inside successful-mutation branches.** -In `server/src/listeners/order_book/state.rs`, function `apply_stream_diff` (line 264). Set `self.book_dirty = true;` immediately before each `Ok(true)` / `Ok(false)` path that performed (or could have performed) a mutation. Simplest correct placement: at the end, right before `Ok(true)` at line 344, plus inside each soft-tolerance branch that advances height. To minimize risk, set it for every successful mutation branch (New with inserted resting, Update that modified sz, Remove that cancelled). +> **Codex finding #3 (medium):** placing `book_dirty = true` at the function +> exit would mark soft-tolerance no-op paths (missing-order Update/Remove) as +> dirty, producing spurious snapshots. Each successful mutation branch must +> set it individually. 
-Concretely, replace the function ending: +In `server/src/listeners/order_book/state.rs`, function `apply_stream_diff` (line 264), set `self.book_dirty = true;` inside each branch that *actually mutated* the book, not at the shared exit: ```rust + match inner_diff { + InnerOrderDiff::New { sz } => { + let Some(order) = order_status else { + // Soft-tolerance: New diff without matching opening status. + log::warn!("apply_stream_diff: New diff without matching opening status, skipping {diff:?}"); + self.height = self.height.max(block_number); + self.time = block_time_ms; + self.snapped = false; + return Ok(false); + }; + let inner_order = resting_order_from_raw_new(order, &diff, sz)?; + let order_for_tap = inner_order.clone(); + self.order_book.add_resting_order_from_diff(inner_order); + if !self.order_book.contains_order(&oid, &coin) { + log::warn!( + "apply_stream_diff: New order did not rest after raw-diff insert, later updates will be missing; \ + block_number={block_number} oid={oid:?} coin={coin:?} order={order_for_tap:?}" + ); + } else { + self.book_dirty = true; + } + if let Some(tap) = self.dob_tap.as_mut() { + tap.emit_order_add(&coin, &order_for_tap, time_ns); + } + } + InnerOrderDiff::Update { new_sz, .. } => match self.order_book.modify_sz(oid.clone(), coin.clone(), new_sz) + { + Some((old_sz, px)) => { + self.book_dirty = true; + if let Some(tap) = self.dob_tap.as_mut() { + let exec_quantity = crate::order_book::Sz::new(old_sz.value().saturating_sub(new_sz.value())); + tap.emit_order_execute(&coin, oid, px, exec_quantity, time_ns); + } + } + None => { + // Soft-tolerance: do NOT mark dirty — no mutation occurred. 
+ log::warn!( + "apply_stream_diff: Update for missing order at block {block_number}, skipping {diff:?}" + ); + } + }, + InnerOrderDiff::Remove => { + if self.order_book.cancel_order(oid.clone(), coin.clone()) { + self.book_dirty = true; + if let Some(tap) = self.dob_tap.as_mut() { + tap.emit_order_cancel(&coin, oid, time_ns); + } + } else { + // Soft-tolerance: do NOT mark dirty. + log::warn!( + "apply_stream_diff: Remove for missing order at block {block_number}, skipping {diff:?}" + ); + } + } + } + self.height = self.height.max(block_number); self.time = block_time_ms; self.snapped = false; - self.book_dirty = true; + // `book_dirty` is intentionally NOT set here — each mutation branch sets + // it itself, so soft-tolerance no-op paths leave it untouched. Ok(true) } ``` -For soft-tolerance branches (missing status / missing order), do **not** set `book_dirty` — those paths do not mutate the resting book. - -Also update `apply_updates` (block-mode batched apply, line ~140 in the same file) the same way: set `book_dirty = true` after a successful mutation. Block mode currently snapshots per chunk so this is benign; streaming-mode will use it. +Also update `apply_updates` (block-mode batched apply, around line 140 in the same file) the same way: set `book_dirty = true` only on successful mutations, never at the shared function exit. Block mode currently snapshots per chunk so this is benign, but the invariant matters for L2-5 correctness anywhere `book_dirty` is read. 
- [ ] **Step 3: Run tests.** @@ -1009,12 +1067,40 @@ If a non-streaming test fails, investigate immediately — block mode should be **Files:** - Modify: `server/src/listeners/order_book/mod.rs:1028-1042` (and possibly call site at line 1248) -- [ ] **Step 1: Extract the snapshot enqueue logic into a helper.** +- [ ] **Step 1: Extract the snapshot enqueue logic into a helper with explicit "authoritative" semantics.** + +> **Codex finding #2 (high):** the backstop must not clear the finalization +> dirty flag. If it does, a backstop fired before block finalization will +> either suppress the authoritative final snapshot (no later diffs arrive → +> finalization sees `dirty=false`) or duplicate the height (late diff +> re-dirties → finalization emits again). The fix: thread an explicit +> `is_authoritative` flag through the emit helper, and only clear `book_dirty` +> on authoritative emissions (i.e. finalization). Backstop emissions are +> provisional and leave the dirty flag set so finalization can still emit. In `server/src/listeners/order_book/mod.rs`, currently lines 1855–1889 build and send an `InternalMessage::Snapshot`. Extract a method on `OrderBookListener`: ```rust - fn emit_tob_snapshot(&mut self, source_label: &'static str, source_block_time_ms: u64, source_local_time_ms: u64) { + /// Whether the caller of `emit_tob_snapshot` is taking the authoritative + /// final snapshot for the current dirty range, or a provisional snapshot + /// (e.g. stuck-stream backstop) that does NOT close out the dirty state. + #[derive(Copy, Clone, Debug, PartialEq, Eq)] + enum SnapshotEmission { + /// Block finalization: authoritative final snapshot for this height. + /// Clears `book_dirty` because the block is closed. + Authoritative, + /// Backstop / provisional: emit current state but leave `book_dirty` + /// set so a subsequent finalization still emits its own snapshot. 
+ Provisional, + } + + fn emit_tob_snapshot( + &mut self, + source_label: &'static str, + source_block_time_ms: u64, + source_local_time_ms: u64, + emission: SnapshotEmission, + ) { let snapshot_start = Instant::now(); let snapshot = self.l2_snapshots(true); crate::metrics::observe_tob_snapshot_compute(source_label, snapshot_start.elapsed()); @@ -1047,14 +1133,24 @@ In `server/src/listeners/order_book/mod.rs`, currently lines 1855–1889 build a }); let _unused = tx.send(snapshot_msg); } - if let Some(state) = self.order_book_state.as_mut() { + if emission == SnapshotEmission::Authoritative + && let Some(state) = self.order_book_state.as_mut() + { state.clear_book_dirty(); } } } ``` -Update the existing block-mode snapshot site at line 1855–1889 to call `self.emit_tob_snapshot(snapshot_source, source_block_time_ms, source_local_time_ms);` instead of inlining (preserving `last_source_times.unwrap_or((snapshot.0, snapshot.0))` for the `(block_time, local_time)` defaults). +Update the existing block-mode snapshot site at line 1855–1889 to call `self.emit_tob_snapshot(snapshot_source, source_block_time_ms, source_local_time_ms, SnapshotEmission::Authoritative);` — block mode always emits authoritative snapshots because there's no provisional/backstop concept there. + +**Add a provisional-snapshot counter for observability** (optional but recommended): + +```rust +crate::metrics::inc_tob_snapshot_provisional_total(source_label); +``` + +…inside the `Provisional` branch. If the metrics file does not yet have this counter, add it alongside the existing `orderbook_tob_*` family with the same low-cardinality label (`source`). Skip the metric in v1 if it complicates landing; the correctness fix is the dirty-flag handling, not the counter. 
- [ ] **Step 2: Call `emit_tob_snapshot` from `finalize_stream_block` when dirty.** @@ -1077,6 +1173,9 @@ In `server/src/listeners/order_book/mod.rs`, `finalize_stream_block` (line 1028) self.streaming_state.finalized_height = Some(height); // Streaming snapshot emission: tied to block finalization (L2-5). + // This is the authoritative emission for this height — it clears the + // dirty flag. Any earlier provisional/backstop emission deliberately + // left the flag set so this call still fires. let dirty = self.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty); if dirty { let block_time_ms = block.block_time_ms.unwrap_or(0); @@ -1085,6 +1184,7 @@ In `server/src/listeners/order_book/mod.rs`, `finalize_stream_block` (line 1028) ingest_source_label(EventSource::OrderDiffs), block_time_ms, local_time_ms, + SnapshotEmission::Authoritative, ); } } @@ -1137,10 +1237,14 @@ In `server/src/listeners/order_book/mod.rs`, the ticker arm of the `select!` at let elapsed = guard.last_dirty_emit_at.map_or(Duration::MAX, |t| t.elapsed()); if elapsed >= STREAM_DIRTY_BACKSTOP_INTERVAL { if let (Some(bt), Some(lt)) = (guard.last_batch_block_time_ms, guard.last_batch_local_time_ms) { + // Provisional emission: leaves book_dirty set + // so finalize_stream_block still emits the + // authoritative final snapshot for this height. guard.emit_tob_snapshot( ingest_source_label(EventSource::OrderDiffs), bt, lt, + SnapshotEmission::Provisional, ); guard.last_dirty_emit_at = Some(Instant::now()); } @@ -1181,6 +1285,120 @@ git add server/src/listeners/order_book/mod.rs git commit -m "feat: add stuck-stream snapshot backstop for prolonged grace_fallback" ``` +### Task 5.4b: Recovery emits an authoritative snapshot + +> **Codex finding #1 (high):** `apply_recovery` at [mod.rs:1274](../../../server/src/listeners/order_book/mod.rs) calls +> `state.replace_coin_from_snapshot` and `state.remove_coin` directly, +> bypassing `apply_stream_diff` / `apply_updates`. 
After L2-5 removes the +> per-chunk snapshot emission in streaming mode, recovery no longer produces +> a TOB snapshot until an unrelated later diff arrives — multicast TOB can +> serve stale or corrupt quotes after recovery, potentially indefinitely +> for quiet instruments. The doc comment at line 1271 says "the next tick's +> `InternalMessage::Snapshot` will carry the corrected BBO," which the L2-5 +> change silently breaks. Recovery must dirty the book and emit immediately. + +**Files:** +- Modify: `server/src/listeners/order_book/mod.rs:1274-1318` (`apply_recovery`) + +- [ ] **Step 1: Mark the book dirty inside `apply_recovery`.** + +In `server/src/listeners/order_book/mod.rs`, function `apply_recovery` (line 1274), add a local `mutated` tracker and call `state.mark_book_dirty()` if any branch did work. Concretely, replace the function body's mutation section with: + +```rust + fn apply_recovery(&mut self, report: &utils::ValidationReport, fresh_snapshot: Snapshots) { + let taps = self.dob_replay_taps.as_ref(); + let Some(state) = self.order_book_state.as_mut() else { + return; + }; + + let mut fresh_map = fresh_snapshot.value(); + let mut mutated = false; + + for (coin, msg) in &report.diverged { + log::warn!("recovery: re-initializing {} (divergence: {})", coin.value(), msg); + Self::emit_dob_instrument_reset(taps, coin); + if let Some(fresh_book) = fresh_map.remove(coin) { + state.replace_coin_from_snapshot(coin.clone(), fresh_book, true); + } else { + log::warn!( + "recovery: diverged coin {} missing from fresh snapshot — dropping from state", + coin.value() + ); + state.remove_coin(coin); + } + mutated = true; + } + + for coin in &report.missing_in_fresh { + log::warn!("recovery: dropping stale coin {} not in fresh snapshot", coin.value()); + Self::emit_dob_instrument_reset(taps, coin); + state.remove_coin(coin); + mutated = true; + } + + for coin in &report.extra_in_fresh { + if let Some(fresh_book) = fresh_map.remove(coin) { + log::warn!("recovery: 
adding new coin {} from fresh snapshot", coin.value()); + Self::emit_dob_instrument_reset(taps, coin); + state.replace_coin_from_snapshot(coin.clone(), fresh_book, true); + mutated = true; + } + } + + if mutated { + state.mark_book_dirty(); + } + } +``` + +- [ ] **Step 2: Emit a snapshot immediately after a recovery that mutated state.** + +`apply_recovery` is called from the snapshot-validation flow. In streaming mode, waiting for the next finalization or backstop tick can mean seconds of stale TOB for the repaired coin. Emit the authoritative snapshot from the caller as soon as `apply_recovery` returns. + +Find the call site: + +```bash +grep -n "apply_recovery" server/src/listeners/order_book/mod.rs +``` + +At each call site, after `self.apply_recovery(&report, fresh_snapshot);`, add: + +```rust + // Recovery may have mutated the book without going through + // apply_stream_diff/apply_updates. Emit an authoritative TOB snapshot + // now so multicast subscribers see the corrected BBO without waiting + // for an unrelated future diff (L2-5 + recovery interaction). + if self.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty) { + let (bt, lt) = self + .last_batch_block_time_ms + .zip(self.last_batch_local_time_ms) + .unwrap_or((0, 0)); + self.emit_tob_snapshot( + ingest_source_label(EventSource::OrderDiffs), + bt, + lt, + SnapshotEmission::Authoritative, + ); + } +``` + +(If `last_batch_block_time_ms` is `None` because recovery ran before any batch was processed, the `unwrap_or((0, 0))` still produces a valid emission; the source-lag metrics will be uninformative for that one snapshot but the BBO content is correct.) + +- [ ] **Step 3: Run tests.** + +```bash +cargo test -p server +``` + +Expected: all tests **PASS** (no test exercises recovery yet — that's Task 5.5b test 7). 
+ +- [ ] **Step 4: Commit.** + +```bash +git add server/src/listeners/order_book/mod.rs +git commit -m "fix: emit authoritative tob snapshot after streaming recovery" +``` + ### Task 5.5a: Add streaming test helpers **Files:** @@ -1214,14 +1432,20 @@ To support the stuck-stream backstop test without sleeping, add a method that mi ```rust /// Test-only: forces a backstop snapshot emission if the book is dirty, /// bypassing the wall-clock interval check. Mirrors the production - /// ticker arm's backstop logic. + /// ticker arm's backstop logic — provisional emission, dirty flag stays + /// set so finalization can still emit the authoritative snapshot. #[cfg(test)] pub(crate) fn fire_stream_dirty_backstop_for_test(&mut self) { if self.ingest_mode == IngestMode::Stream && self.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty) && let (Some(bt), Some(lt)) = (self.last_batch_block_time_ms, self.last_batch_local_time_ms) { - self.emit_tob_snapshot(ingest_source_label(EventSource::OrderDiffs), bt, lt); + self.emit_tob_snapshot( + ingest_source_label(EventSource::OrderDiffs), + bt, + lt, + SnapshotEmission::Provisional, + ); self.last_dirty_emit_at = Some(Instant::now()); } } @@ -1512,23 +1736,200 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { assert_eq!(after_backstop.len(), 1, "backstop must emit exactly one snapshot"); assert_eq!(snapshot_height(&after_backstop[0]), 2, "backstop snapshot must carry the last applied height"); } + +/// L2-5 test 5 (Codex finding #2): a late same-height diff that arrives +/// after the provisional backstop must still result in an authoritative +/// finalization snapshot. The backstop must not have cleared the dirty +/// flag. +#[tokio::test(flavor = "current_thread")] +async fn late_diff_after_backstop_still_emits_authoritative_final() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Block 2: one BBO-changing diff. 
+ let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + + // Fire the backstop — emits a provisional snapshot at height 2. + listener.fire_stream_dirty_backstop_for_test(); + let provisional = drain_snapshots(&mut rx); + assert_eq!(provisional.len(), 1, "backstop should emit one provisional snapshot"); + assert_eq!(snapshot_height(&provisional[0]), 2); + + // Late same-height diff arrives that moves BBO higher. + let (s2b, d2b) = add_event(1_700_000_002_000, Side::Bid, 102, "110", "3"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2b], vec![d2b]); + + // Block 3 triggers finalization of block 2. + let (s3, d3) = add_event(1_700_000_003_000, Side::Bid, 103, "90", "1"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![s3], vec![d3]); + + let after_final = drain_snapshots(&mut rx); + // Must contain the authoritative finalization snapshot for height 2. + let height_2_finals: Vec<_> = after_final.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert!( + !height_2_finals.is_empty(), + "finalization must still emit an authoritative snapshot for height 2 even after a backstop" + ); + // Final snapshot must carry the final BBO (110, not 100). + let final_bbo = snapshot_best_bid(height_2_finals.last().unwrap()).expect("BBO exists"); + let expected = Px::parse_from_str("110").unwrap().value(); + assert_eq!(final_bbo, expected, "authoritative final must carry the latest BBO (110); got {final_bbo}"); +} + +/// L2-5 test 6 (Codex finding #3): tolerated soft-tolerance Update/Remove +/// branches that don't actually mutate the book must NOT mark the book +/// dirty, so no spurious snapshot is emitted. 
+#[tokio::test(flavor = "current_thread")] +async fn missing_order_update_does_not_emit_snapshot() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Block 2: feed an Update for an oid that doesn't exist in the book. + // The branch logs a warn and falls through, but must not dirty the book. + let missing_oid = 999_999u64; + let bad_update = update_diff(missing_oid, "100", "5", "3"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![], vec![bad_update]); + + // Block 3 to trigger any pending finalization of block 2. + feed_block(&mut listener, 3, 1_700_000_003_000, vec![], vec![]); + + let snapshots = drain_snapshots(&mut rx); + let height_2: Vec<_> = snapshots.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert!( + height_2.is_empty(), + "no snapshot should be emitted for a block whose only diff was a no-op missing-order Update; got {}", + height_2.len(), + ); +} + +#[tokio::test(flavor = "current_thread")] +async fn missing_order_remove_does_not_emit_snapshot() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + let missing_oid = 999_999u64; + let bad_remove = NodeDataOrderDiff::new_for_test( + Address::new([0; 20]), + missing_oid, + "100".to_string(), + TEST_COIN.to_string(), + OrderDiff::Remove, + ); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![], vec![bad_remove]); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![], vec![]); + + let snapshots = drain_snapshots(&mut rx); + let height_2: Vec<_> = snapshots.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert!( + height_2.is_empty(), + "no snapshot should be emitted for a block whose only diff was a no-op missing-order Remove; got {}", + height_2.len(), + ); +} + +/// L2-5 test 7 (Codex finding #1): recovery must emit an authoritative +/// snapshot immediately so multicast TOB does not stay stale for quiet +/// instruments after a divergence is repaired. 
+#[tokio::test(flavor = "current_thread")] +async fn recovery_emits_authoritative_snapshot_without_later_diff() { + use crate::listeners::order_book::utils::ValidationReport; + + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Apply one diff so last_batch_*_time_ms are populated; then drain. + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + let (s3, d3) = add_event(1_700_000_003_000, Side::Bid, 102, "101", "1"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![s3], vec![d3]); + let _drain_initial = drain_snapshots(&mut rx); + + // Simulate a recovery: build a fresh snapshot with a *different* BBO, + // mark BTC as diverged, and invoke the recovery path directly. + let coin = Coin::new(TEST_COIN); + let mut repaired: OrderBook = OrderBook::new(); + repaired.add_order(InnerL4Order { + user: Address::new([0; 20]), + coin: coin.clone(), + side: Side::Bid, + limit_px: Px::parse_from_str("200").expect("valid px"), + sz: Sz::parse_from_str("9").expect("valid sz"), + oid: 7_777, + timestamp: 0, + trigger_condition: String::new(), + is_trigger: false, + trigger_px: String::new(), + is_position_tpsl: false, + reduce_only: false, + order_type: String::new(), + tif: None, + cloid: None, + }); + let mut fresh_map: HashMap> = HashMap::new(); + fresh_map.insert(coin.clone(), repaired.to_snapshot()); + let fresh = Snapshots::new(fresh_map); + + let report = ValidationReport { + diverged: vec![(coin.clone(), "synthetic divergence for test".to_string())], + missing_in_fresh: vec![], + extra_in_fresh: vec![], + }; + + listener.apply_recovery_for_test(&report, fresh); + + let after = drain_snapshots(&mut rx); + assert_eq!(after.len(), 1, "recovery must emit exactly one authoritative snapshot"); + let repaired_bbo = snapshot_best_bid(&after[0]).expect("BBO exists"); + let expected = 
Px::parse_from_str("200").unwrap().value(); + assert_eq!(repaired_bbo, expected, "snapshot after recovery must carry the repaired BBO (200); got {repaired_bbo}"); +} +``` + +Note: `apply_recovery_for_test` is a `#[cfg(test)] pub(crate)` wrapper around `self.apply_recovery(&report, fresh_snapshot)` plus the post-recovery `emit_tob_snapshot` call from Task 5.4b Step 2. Add it to `OrderBookListener` alongside the other `*_for_test` helpers if it doesn't already exist: + +```rust + #[cfg(test)] + pub(crate) fn apply_recovery_for_test( + &mut self, + report: &utils::ValidationReport, + fresh_snapshot: Snapshots, + ) { + self.apply_recovery(report, fresh_snapshot); + if self.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty) { + let (bt, lt) = self.last_batch_block_time_ms.zip(self.last_batch_local_time_ms).unwrap_or((0, 0)); + self.emit_tob_snapshot( + ingest_source_label(EventSource::OrderDiffs), + bt, + lt, + SnapshotEmission::Authoritative, + ); + } + } ``` -- [ ] **Step 3: Run all four tests.** +- [ ] **Step 3: Run all seven tests.** ```bash cargo test -p server stream_finalization_tests -- --nocapture ``` -Expected: all four **PASS**. +Expected: all **PASS**. -If `two_diffs_in_block_emit_final_bbo_once` fails with "exactly one snapshot for height 2; got 0", verify that `apply_stream_diff` is setting `book_dirty = true` (Task 5.1 Step 2). If `block_without_mutations_emits_no_snapshot` fails with a non-empty snapshot list, verify that `apply_updates` / `apply_stream_diff` does not set `book_dirty` on a no-op apply. 
+| Test | What it asserts | +|---|---| +| `two_diffs_in_block_emit_final_bbo_once` | Multi-diff block emits one final BBO at finalization | +| `block_without_mutations_emits_no_snapshot` | Empty block emits nothing | +| `finalized_blocks_emit_snapshots_in_order` | Heights emit monotonically, one per dirty block | +| `stuck_stream_backstop_emits_dirty_snapshot` | Backstop emits when no finalization arrives | +| `late_diff_after_backstop_still_emits_authoritative_final` | **Codex #2:** backstop doesn't suppress finalization | +| `missing_order_update_does_not_emit_snapshot` | **Codex #3:** tolerated Update no-op stays clean | +| `missing_order_remove_does_not_emit_snapshot` | **Codex #3:** tolerated Remove no-op stays clean | +| `recovery_emits_authoritative_snapshot_without_later_diff` | **Codex #1:** recovery is visible immediately | + +If `two_diffs_in_block_emit_final_bbo_once` fails with "exactly one snapshot for height 2; got 0", verify Task 5.1 Step 2 (per-branch `book_dirty`). If `late_diff_after_backstop_still_emits_authoritative_final` fails because finalization emits nothing, verify Task 5.3 Step 1 (`SnapshotEmission::Provisional` does not clear dirty). If `missing_order_*` tests fail, verify that `apply_stream_diff` does not set dirty in the soft-tolerance branches. - [ ] **Step 4: Commit.** ```bash git add server/src/listeners/order_book/stream_finalization_tests.rs server/src/listeners/order_book/mod.rs -git commit -m "test: l2-5 streaming snapshot finalization tests (two-diff, no-spurious, ordering, backstop)" +git commit -m "test: l2-5 streaming snapshot finalization tests including codex-flagged edge cases" ``` ### Task 5.6: Quote-sequence parity test (block vs stream) @@ -1766,6 +2167,22 @@ If a regression is in the streaming-finalization snapshot path (L2-5) specifical --- +## Decision log (Codex adversarial findings on plan v1) + +- **Finding 1 (high, recovery silent after L2-5):** addressed by Task 5.4b. 
`apply_recovery` now sets `book_dirty` after any mutation, and the caller + immediately emits an authoritative TOB snapshot. Task 5.5b test 7 verifies + the corrected BBO is visible without a later diff. +- **Finding 2 (high, backstop suppresses final snapshot):** addressed by + splitting `emit_tob_snapshot` into `Authoritative` vs `Provisional` + emissions. Only authoritative clears `book_dirty`; backstop leaves it set + so finalization still emits. Task 5.5b test 5 verifies a late same-height + diff after a backstop still produces the authoritative final snapshot. +- **Finding 3 (medium, dirty flag on no-op branches):** addressed by Task + 5.1 Step 2 — `book_dirty = true` moves inside each successful mutation + branch, never the shared function exit. Task 5.5b test 6 (two test functions) verifies that + tolerated missing-order Update/Remove paths do not emit snapshots. + ## Notes - **L2-2 is intentionally not in this plan.** Per the spec, it stays deferred From 4b57d07a6158737fdd3b6fbde90c577e10c6549f Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 14:17:59 -0400 Subject: [PATCH 05/65] plan: address codex v2 findings (ws-gate, snapped gate, dedicated 5s backstop ticker) --- .../2026-05-15-streaming-cpu-reduction.md | 307 +++++++++++++++--- 1 file changed, 260 insertions(+), 47 deletions(-) diff --git a/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md b/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md index d038b1ad..b20a70e2 100644 --- a/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md +++ b/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md @@ -1028,7 +1028,13 @@ git commit -m "feat: add book_dirty flag to OrderBookState, set on apply mutatio **Files:** - Modify: `server/src/listeners/order_book/mod.rs:1855-1890` -- [ ] **Step 1: Wrap the per-chunk snapshot emission in a non-streaming guard.** +- [ ] **Step 1: Wrap the per-chunk snapshot emission in a default-multicast-only guard.** + +> **Codex finding (high):** Task 
5.2 must NOT collapse the cadence for +> WS-enabled streaming. The plan promises that `--enable-websocket` preserves +> today's behavior; that requires keeping per-chunk emission whenever WS is +> on, regardless of ingest mode. L2-5 is only active in the default +> (WS-disabled) configuration. In `server/src/listeners/order_book/mod.rs`, around line 1852–1855: @@ -1037,10 +1043,11 @@ In `server/src/listeners/order_book/mod.rs`, around line 1852–1855: return Ok(()); } - // In streaming mode, snapshots are emitted by finalize_stream_block, - // not per chunk. The dirty flag is set by apply_stream_diff and - // consumed at block finalization. Skip the per-chunk compute here. - if self.ingest_mode == IngestMode::Stream { + // L2-5: in streaming + WS-disabled, snapshots are emitted by + // finalize_stream_block (and the dirty backstop), not per chunk. + // WS-enabled streaming retains today's per-chunk cadence so WS + // subscribers see the same intra-block L2 update granularity. + if self.ingest_mode == IngestMode::Stream && !self.enable_websocket { return Ok(()); } @@ -1102,7 +1109,16 @@ In `server/src/listeners/order_book/mod.rs`, currently lines 1855–1889 build a emission: SnapshotEmission, ) { let snapshot_start = Instant::now(); - let snapshot = self.l2_snapshots(true); + // L2-5 + Codex finding: `prevent_future_snaps` mirrors the authority of + // this emission. Authoritative emissions set `snapped` so block mode's + // duplicate-chunk suppression keeps working; provisional (backstop) + // emissions leave `snapped` untouched so the later authoritative + // finalization call can still produce a snapshot. Without this, a + // backstop fired before finalization would mark `snapped=true` and the + // finalization's `l2_snapshots(true)` would return `None`, silently + // dropping the authoritative final BBO. 
+ let prevent_future_snaps = emission == SnapshotEmission::Authoritative; + let snapshot = self.l2_snapshots(prevent_future_snaps); crate::metrics::observe_tob_snapshot_compute(source_label, snapshot_start.elapsed()); if let Some(snapshot) = snapshot { let snapshot_height = self.order_book_state.as_ref().map(OrderBookState::height).unwrap_or(0); @@ -1212,77 +1228,251 @@ git add server/src/listeners/order_book/mod.rs git commit -m "perf: emit streaming l2 snapshot at block finalization, not per chunk" ``` -### Task 5.4: Stuck-stream backstop +### Task 5.4: Stuck-stream backstop on a dedicated 5s ticker + +> **Codex finding (high):** the existing ticker in `hl_listen` is +> [60s](../../../server/src/listeners/order_book/mod.rs) (`interval_at(start, Duration::from_secs(60))` at mod.rs:167) +> and runs the fatal-liveness check before any new logic placed in that arm. +> A 5s backstop bolted into the 60s ticker would fire at most every 60s and +> only after the liveness path. Additionally, `last_batch_times()` at +> [mod.rs:602](../../../server/src/listeners/order_book/mod.rs) is a take-and-clear; the backstop cannot rely on +> `last_batch_block_time_ms` / `last_batch_local_time_ms` still being +> populated when it fires. Solution: a dedicated 5s ticker on its own +> `select!` arm, plus dedicated dirty-state timestamp + source-time fields +> set inside `apply_*` mutation branches. **Files:** -- Modify: `server/src/listeners/order_book/mod.rs` (the periodic ticker block around line 255–278) +- Modify: `server/src/listeners/order_book/mod.rs` (struct fields, listener loop `select!`, mutation hooks) +- Modify: `server/src/listeners/order_book/state.rs` (record dirty source times on mutation) -- [ ] **Step 1: Add a backstop emission on the periodic 60s ticker.** +- [ ] **Step 1: Add dirty-state tracking fields.** -In `server/src/listeners/order_book/mod.rs`, the ticker arm of the `select!` at line 255–278 currently does liveness check + latency report + snapshot fetch. 
Add a backstop: +In `server/src/listeners/order_book/state.rs`, extend `OrderBookState` with the source times captured when the book first became dirty in the current dirty epoch (so the backstop has the right `(block_time_ms, local_time_ms)` to attach to the snapshot regardless of `last_batch_times()` clearing): ```rust - _ = ticker.tick() => { - // ... existing liveness + latency + progress + health ... +pub(super) struct OrderBookState { + // ... existing fields ... + enable_websocket: bool, + book_dirty: bool, + /// Source `(block_time_ms, local_time_ms)` recorded the first time the + /// book became dirty in the current dirty epoch. Reset to `None` when + /// `book_dirty` is cleared. Lets the stuck-stream backstop reach for + /// reliable times without depending on the take-and-clear + /// `OrderBookListener::last_batch_times()` field. + dirty_source_times: Option<(u64, u64)>, + /// `Instant` at which the book first became dirty in the current dirty + /// epoch. Used by the backstop to gate emission cadence. + book_dirty_since: Option<std::time::Instant>, + dob_tap: Option, +} + +impl OrderBookState { + pub(super) fn book_dirty_since(&self) -> Option<std::time::Instant> { + self.book_dirty_since + } + + pub(super) fn dirty_source_times(&self) -> Option<(u64, u64)> { + self.dirty_source_times + } + + fn mark_dirty_with_times(&mut self, block_time_ms: u64, local_time_ms: u64) { + if !self.book_dirty { + self.book_dirty_since = Some(std::time::Instant::now()); + self.dirty_source_times = Some((block_time_ms, local_time_ms)); + } + self.book_dirty = true; + } + + pub(super) fn clear_book_dirty(&mut self) { + self.book_dirty = false; + self.book_dirty_since = None; + self.dirty_source_times = None; + } +} +``` + +The existing `mark_book_dirty()` from Task 5.1 stays as a fallback (used by recovery, where source times come from the listener's last batch). The new `mark_dirty_with_times` is the preferred call site inside 
+ +- [ ] **Step 2: Call `mark_dirty_with_times` from each successful mutation branch in `apply_stream_diff`.** + +In `server/src/listeners/order_book/state.rs`, function `apply_stream_diff` (the same one updated in Task 5.1 Step 2), replace each `self.book_dirty = true;` with `self.mark_dirty_with_times(block_time_ms, local_time_ms);`. The `block_time_ms` is already a parameter; `local_time_ms` needs to be threaded through. Since `apply_stream_diff` does not currently take `local_time_ms`, add it as a parameter: - // Stuck-stream backstop (L2-5): if the book has been dirty - // longer than STREAM_DIRTY_BACKSTOP and no block has finalized, - // emit a snapshot from the latest applied diff. Prevents quote - // staleness during prolonged grace_fallback. +```rust + pub(super) fn apply_stream_diff( + &mut self, + block_number: u64, + block_time_ms: u64, + local_time_ms: u64, + diff: NodeDataOrderDiff, + order_status: Option, + ) -> Result { +``` + +Update the call site in `server/src/listeners/order_book/mod.rs` (the streaming drain at ~line 1228) to pass the diff batch's `local_time` (already available as `diff_batch.local_time_ms()`). + +For `apply_updates` (block mode batched apply), add an analogous parameter or compute `local_time_ms` from the batch. + +- [ ] **Step 3: Add a dedicated 5s backstop ticker as its own `select!` arm.** + +In `server/src/listeners/order_book/mod.rs`, the `hl_listen` function (around line 167) constructs the existing 60s ticker. Add a second ticker for the backstop and a new `select!` arm. Place the new arm **before** the existing 60s `ticker.tick()` arm so the backstop fires before any potentially-fatal liveness logic: + +```rust + let mut ticker = interval_at(start, Duration::from_secs(60)); + let mut backstop_ticker = tokio::time::interval(STREAM_DIRTY_BACKSTOP_INTERVAL); + backstop_ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + + loop { + tokio::select! 
{ + event = fs_event_rx.recv() => match event { + // ... existing code ... + }, + snapshot_fetch_res = snapshot_fetch_task_rx.recv() => { + // ... existing code ... + } + _ = backstop_ticker.tick() => { + let mut guard = listener.lock().await; + if guard.ingest_mode == IngestMode::Stream + && !guard.enable_websocket + && let Some(state) = guard.order_book_state.as_ref() + && state.book_dirty() + && let Some(since) = state.book_dirty_since() + && since.elapsed() >= STREAM_DIRTY_BACKSTOP_INTERVAL + && let Some((bt, lt)) = state.dirty_source_times() { - let mut guard = listener.lock().await; - if guard.ingest_mode == IngestMode::Stream - && guard.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty) - { - let elapsed = guard.last_dirty_emit_at.map_or(Duration::MAX, |t| t.elapsed()); - if elapsed >= STREAM_DIRTY_BACKSTOP_INTERVAL { - if let (Some(bt), Some(lt)) = (guard.last_batch_block_time_ms, guard.last_batch_local_time_ms) { - // Provisional emission: leaves book_dirty set - // so finalize_stream_block still emits the - // authoritative final snapshot for this height. - guard.emit_tob_snapshot( - ingest_source_label(EventSource::OrderDiffs), - bt, - lt, - SnapshotEmission::Provisional, - ); - guard.last_dirty_emit_at = Some(Instant::now()); - } - } + guard.emit_tob_snapshot( + ingest_source_label(EventSource::OrderDiffs), + bt, + lt, + SnapshotEmission::Provisional, + ); + // After a provisional emission, advance the dirty-since + // anchor so the next backstop fires at the same cadence + // rather than every tick. + if let Some(state) = guard.order_book_state.as_mut() { + state.bump_dirty_since_after_provisional(); } } - - // ... existing snapshot fetch ... } + _ = ticker.tick() => { + // ... existing 60s liveness + report + fetch_snapshot ... 
+ } + } + } ``` -Add to `OrderBookListener`: +Add `bump_dirty_since_after_provisional` to `OrderBookState`: ```rust - last_dirty_emit_at: Option, + pub(super) fn bump_dirty_since_after_provisional(&mut self) { + if self.book_dirty { + self.book_dirty_since = Some(std::time::Instant::now()); + } + } ``` -Initialize to `None` in `new_with_ingest_mode`. Set it inside `emit_tob_snapshot` when streaming mode is active. - -Add a const near other tuning constants in the file: +Add the constant near other tuning constants in `mod.rs`: ```rust const STREAM_DIRTY_BACKSTOP_INTERVAL: Duration = Duration::from_secs(5); ``` -- [ ] **Step 2: Run tests.** +- [ ] **Step 4: Update Task 5.4b's recovery emission to use `dirty_source_times` (not `last_batch_times`).** + +Re-open the post-recovery emission added in Task 5.4b Step 2 and change it from reading `self.last_batch_block_time_ms.zip(self.last_batch_local_time_ms)` to reading from the state's `dirty_source_times()`: + +```rust + if let Some(state) = self.order_book_state.as_ref() + && state.book_dirty() + { + let (bt, lt) = state.dirty_source_times().unwrap_or((0, 0)); + self.emit_tob_snapshot( + ingest_source_label(EventSource::OrderDiffs), + bt, + lt, + SnapshotEmission::Authoritative, + ); + } +``` + +Recovery itself should call `state.mark_dirty_with_times(...)` using the latest batch's times (from `last_batch_times` consumed once at recovery start), or fall back to `(0, 0)` for the source labels when no batch has been processed. 
+ +- [ ] **Step 5: Update the test helper `fire_stream_dirty_backstop_for_test` to use `dirty_source_times`.** + +In `server/src/listeners/order_book/mod.rs`: + +```rust + #[cfg(test)] + pub(crate) fn fire_stream_dirty_backstop_for_test(&mut self) { + if self.ingest_mode == IngestMode::Stream + && !self.enable_websocket + && let Some(state) = self.order_book_state.as_ref() + && state.book_dirty() + && let Some((bt, lt)) = state.dirty_source_times() + { + self.emit_tob_snapshot( + ingest_source_label(EventSource::OrderDiffs), + bt, + lt, + SnapshotEmission::Provisional, + ); + if let Some(state) = self.order_book_state.as_mut() { + state.bump_dirty_since_after_provisional(); + } + } + } +``` + +- [ ] **Step 6: Run tests.** ```bash cargo test -p server ``` -Expected: all tests **PASS**. +Expected: all tests **PASS**. If any L2-5 test fails because `apply_stream_diff` now takes `local_time_ms` as a parameter, update the test scaffolding (Task 5.5a `feed_block` helper) to pass it through. -- [ ] **Step 3: Commit.** +- [ ] **Step 7: Add a production-path backstop test (Codex follow-up).** + +> **Codex recommendation:** add a test that goes through `process_update` +> rather than only direct `receive_batch` calls, to catch any divergence +> between the production event path and the test helper. + +The dedicated backstop ticker is hard to test in unit scope without `tokio::time::pause`. Add a `#[tokio::test(flavor = "current_thread", start_paused = true)]` test that: +1. Constructs a streaming listener via `for_test_streaming_with_snapshot`. +2. Feeds a single BBO-changing diff via `feed_block`. +3. Asserts no snapshot has been emitted yet. +4. `tokio::time::advance(Duration::from_secs(6)).await;` +5. Calls `listener.fire_stream_dirty_backstop_for_test()` — which inside the production loop would have been driven by the 5s `backstop_ticker.tick()`. +6. Asserts exactly one provisional snapshot for the dirtied height. 
+ +Add to `stream_finalization_tests.rs`: + +```rust +#[tokio::test(flavor = "current_thread", start_paused = true)] +async fn backstop_fires_after_dirty_interval_via_production_helper() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + + // Before the interval elapses, the backstop helper should not emit. + listener.fire_stream_dirty_backstop_for_test(); + assert!(drain_snapshots(&mut rx).is_empty(), "backstop must not fire before STREAM_DIRTY_BACKSTOP_INTERVAL elapsed"); + + tokio::time::advance(std::time::Duration::from_secs(6)).await; + + listener.fire_stream_dirty_backstop_for_test(); + let snaps = drain_snapshots(&mut rx); + assert_eq!(snaps.len(), 1, "backstop must emit exactly one snapshot after the dirty interval"); +} +``` + +The existing `stuck_stream_backstop_emits_dirty_snapshot` test stays — it asserts the post-elapsed path. The new test asserts the interval gate is real. Two preconditions for that to hold: (1) `fire_stream_dirty_backstop_for_test` must apply the same `since.elapsed() >= STREAM_DIRTY_BACKSTOP_INTERVAL` check as the production `select!` arm — as written in Step 5 the helper has no elapsed check, so it emits on the first call and the "must not fire" assertion fails; (2) `book_dirty_since` must be a `tokio::time::Instant` rather than a `std::time::Instant`, because `tokio::time::advance` under `start_paused = true` only moves Tokio's clock and `std::time::Instant::elapsed()` would still read ~0. + +- [ ] **Step 8: Commit.** ```bash -git add server/src/listeners/order_book/mod.rs -git commit -m "feat: add stuck-stream snapshot backstop for prolonged grace_fallback" +git add server/src/listeners/order_book/mod.rs server/src/listeners/order_book/state.rs server/src/listeners/order_book/stream_finalization_tests.rs +git commit -m "feat: 5s stuck-stream snapshot backstop on dedicated ticker" ``` ### Task 5.4b: Recovery emits an authoritative snapshot @@ -2183,6 +2373,29 @@ If a regression is in the streaming-finalization snapshot path (L2-5) specifical branch, never the shared function exit. Task 5.5b tests 6 verify that tolerated missing-order Update/Remove paths do not emit snapshots.
+## Decision log (Codex adversarial findings on plan v2) + +- **Finding 1 (high, L2-5 cadence change leaked to `--enable-websocket`):** + addressed by Task 5.2 — the per-chunk skip is gated on + `IngestMode::Stream && !enable_websocket`. WS-enabled streaming retains + today's per-chunk cadence. The `--enable-websocket` rollback contract is + now real. +- **Finding 2 (high, `snapped` gate suppressed authoritative finalization):** + addressed by parameterizing `emit_tob_snapshot` on `SnapshotEmission`: + `Authoritative` → `l2_snapshots(true)` (sets `snapped`, prevents + duplicate at same height); `Provisional` → `l2_snapshots(false)` (no + `snapped` mutation, so finalization can still emit). Without this fix + the dirty-flag fix from v2 was hollow — the `snapped` gate was a second + suppression channel. +- **Finding 3 (high, backstop wired to 60s ticker after liveness):** Task + 5.4 rewritten. Dedicated 5s ticker on its own `select!` arm, placed + before the existing 60s ticker arm. Dirty-state tracking moves into + `OrderBookState` (`book_dirty_since`, `dirty_source_times`), populated + inside successful mutation branches — independent of the take-and-clear + `last_batch_times()`. `apply_stream_diff` gains a `local_time_ms` + parameter to feed the dirty source times. Task 5.5b adds a paused-time + test that asserts the interval gate is real, not a no-op. 
+ ## Notes - **L2-2 is intentionally not in this plan.** Per the spec, it stays deferred From cff3895feec2b62818f03b5fa19bcd47c9484290 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 14:22:48 -0400 Subject: [PATCH 06/65] plan: align stuck-stream backstop interval with tob freshness threshold (250ms = catchup_threshold_ms/2) --- .../2026-05-15-streaming-cpu-reduction.md | 24 ++++++++++++++++--- ...26-05-15-streaming-cpu-reduction-design.md | 16 +++++++++---- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md b/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md index b20a70e2..3051a474 100644 --- a/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md +++ b/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md @@ -1373,8 +1373,25 @@ Add `bump_dirty_since_after_provisional` to `OrderBookState`: Add the constant near other tuning constants in `mod.rs`: ```rust -const STREAM_DIRTY_BACKSTOP_INTERVAL: Duration = Duration::from_secs(5); -``` +/// Stuck-stream backstop interval. Set to half of +/// `MulticastPublisher::CATCHUP_THRESHOLD_MS` (500ms today) so the provisional +/// emission always fires before a dirty BBO can age past the TOB freshness +/// suppression cutoff. If `CATCHUP_THRESHOLD_MS` is ever lowered, lower this +/// in lockstep. Backstop only runs when block finalization is delayed; in +/// healthy streaming this never fires because finalization emits first. +const STREAM_DIRTY_BACKSTOP_INTERVAL: Duration = Duration::from_millis(250); + +// Compile-time check that the backstop stays within half the freshness +// suppression cutoff. If this assertion fires, update one of the constants +// rather than masking the divergence. 
+const _BACKSTOP_VS_FRESHNESS_INVARIANT: () = assert!( + STREAM_DIRTY_BACKSTOP_INTERVAL.as_millis() as u64 * 2 + <= crate::multicast::publisher::MulticastPublisher::CATCHUP_THRESHOLD_MS, + "STREAM_DIRTY_BACKSTOP_INTERVAL must be <= CATCHUP_THRESHOLD_MS / 2 so stalls cannot age TOB past suppression", +); +``` + +This requires exposing `CATCHUP_THRESHOLD_MS` so the listener module can read it. In `server/src/multicast/publisher.rs:509`, change both `#[cfg(not(test))]` and `#[cfg(test)]` declarations from `const CATCHUP_THRESHOLD_MS: u64 = 500;` to `pub(crate) const CATCHUP_THRESHOLD_MS: u64 = 500;` so the listener's compile-time check can reference it. (Alternative if the visibility change is undesirable: duplicate the value in `mod.rs` as `const CATCHUP_THRESHOLD_MS_MIRROR: u64 = 500;` with a doc comment, and run the assert against the mirror — the price is a runtime divergence risk if someone updates the publisher value without updating the mirror.) - [ ] **Step 4: Update Task 5.4b's recovery emission to use `dirty_source_times` (not `last_batch_times`).** @@ -1458,7 +1475,8 @@ async fn backstop_fires_after_dirty_interval_via_production_helper() { listener.fire_stream_dirty_backstop_for_test(); assert!(drain_snapshots(&mut rx).is_empty(), "backstop must not fire before STREAM_DIRTY_BACKSTOP_INTERVAL elapsed"); - tokio::time::advance(std::time::Duration::from_secs(6)).await; + // Advance past the 250ms backstop interval (with a small safety margin). 
+ tokio::time::advance(std::time::Duration::from_millis(260)).await; listener.fire_stream_dirty_backstop_for_test(); let snaps = drain_snapshots(&mut rx); diff --git a/docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md b/docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md index c6fb2c56..5eae05e9 100644 --- a/docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md +++ b/docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md @@ -260,8 +260,14 @@ and BBO can change with any of them. the dirty flag. This is `≤ 1` snapshot per block — the same cadence as block mode. - **Backstop for stuck streams.** If a block hasn't finalized within a bounded - window (e.g. 5s) but the book is dirty, emit anyway. Avoids quote staleness - during grace-fallback. + window (250ms — half of the TOB freshness suppression threshold + `CATCHUP_THRESHOLD_MS = 500ms`) but the book is dirty, emit a provisional + snapshot anyway. Avoids quote staleness during grace-fallback. The 250ms + number is derived from the freshness threshold so a stalled stream cannot + let TOB quotes age past suppression; if the threshold ever changes, the + backstop interval changes with it. Backstop emissions are provisional + (`SnapshotEmission::Provisional`) so the dirty flag stays set and the + authoritative finalization snapshot still emits when the block closes. - **Block mode untouched.** The change is gated on `IngestMode::Stream`. - **Required tests (addresses Codex finding #2):** 1. **Two-diff-in-block test:** a streaming block contains a `New` diff at @@ -342,8 +348,10 @@ Doesn't fix the root cause but bounds the feedback loop while changes land. ## Open questions -- For L2-5, what should the stuck-stream backstop interval be? 5s is a - placeholder; the TOB freshness threshold is the natural ceiling. +- ~~For L2-5, what should the stuck-stream backstop interval be?~~ Resolved: + 250ms = `CATCHUP_THRESHOLD_MS / 2`. 
Derived from the TOB freshness + suppression cutoff so backstop always fires before a dirty quote can age + past suppression. - For L2-3, when WS is disabled (the default), should the L4 broadcast channel ([mod.rs:1259](../../../server/src/listeners/order_book/mod.rs)) also be short-circuited (skip `tokio::spawn`)? Probably yes; L4 has no consumer without WS. Could fold into L2-3 directly. From f794ca1170ed0c6db5b30982f95e525d8179a943 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 14:53:30 -0400 Subject: [PATCH 07/65] test: add Px::num_digits boundary tests around 10^n thresholds --- server/src/order_book/types.rs | 58 ++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/server/src/order_book/types.rs b/server/src/order_book/types.rs index 06aee219..4ce021e6 100644 --- a/server/src/order_book/types.rs +++ b/server/src/order_book/types.rs @@ -198,3 +198,61 @@ mod sz_to_fixed_tests { assert_eq!(sz_to_fixed(Sz::new(0), 0), 0); } } + +#[cfg(test)] +mod num_digits_tests { + use super::Px; + + #[test] + fn zero_returns_one() { + assert_eq!(Px::new(0).num_digits(), 1); + } + + #[test] + fn single_digit_values() { + for v in 1u64..=9 { + assert_eq!(Px::new(v).num_digits(), 1, "value={v}"); + } + } + + #[test] + fn powers_of_ten_have_expected_digit_count() { + // 10^0 = 1 → 1 digit, 10^1 = 10 → 2 digits, ..., 10^18 → 19 digits. 10^19 also fits in u64 (10^20 is the first power of ten to overflow) but is not swept here. + for n in 0u32..=18 { + let v = 10u64.pow(n); + let expected = n + 1; + assert_eq!(Px::new(v).num_digits(), expected, "10^{n} = {v} should have {expected} digits"); + } + } + + #[test] + fn just_below_powers_of_ten_have_expected_digit_count() { + // 9, 99, 999, ..., values just below 10^n must report n digits, not n+1. + // This is the f64 imprecision boundary that motivates the ilog10 swap: + // 999_999_999_999_999_999_u64 rounds to 1e18 as f64, so the old formula + // returned 19 instead of 18.
+ for n in 1u32..=18 { + let v = 10u64.pow(n) - 1; + let expected = n; + assert_eq!(Px::new(v).num_digits(), expected, "10^{n} - 1 = {v} should have {expected} digits"); + } + } + + #[test] + fn u64_max_has_twenty_digits() { + assert_eq!(Px::new(u64::MAX).num_digits(), 20); + } + + #[test] + fn dense_sweep_realistic_hyperliquid_range() { + // Px stores price * 10^8 as u64. Realistic Hyperliquid prices range + // from ~1e-6 (memecoin) to ~1e6 (BTC) USD, so Px values land in roughly + // 1e2 .. 1e14. Sweep that range and cross-check against ilog10. + for exp in 2u32..=14 { + let v = 10u64.pow(exp); + assert_eq!(Px::new(v).num_digits(), exp + 1); + assert_eq!(Px::new(v - 1).num_digits(), exp); + assert_eq!(Px::new(v + 1).num_digits(), exp + 1); + } + } +} From 9e20066d0c21f31047925ca6f53cbc325b5a47ec Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 15:03:40 -0400 Subject: [PATCH 08/65] perf: replace Px::num_digits f64 log10 with u64::ilog10 --- server/src/order_book/types.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/server/src/order_book/types.rs b/server/src/order_book/types.rs index 4ce021e6..94d85369 100644 --- a/server/src/order_book/types.rs +++ b/server/src/order_book/types.rs @@ -125,10 +125,8 @@ impl Px { s.trim_end_matches('.').to_string() } - #[allow(clippy::cast_possible_truncation)] - #[allow(clippy::cast_sign_loss)] pub(crate) fn num_digits(self) -> u32 { - if self.value() == 0 { 1 } else { (self.value() as f64).log10().floor() as u32 + 1 } + if self.value() == 0 { 1 } else { self.value().ilog10() + 1 } } } From 6100fa2950689186e41a14e9cb6d31e75dd8030e Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 15:17:39 -0400 Subject: [PATCH 09/65] perf: pre-size L2 level output Vecs to skip realloc growth --- server/src/order_book/levels.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/src/order_book/levels.rs b/server/src/order_book/levels.rs index 23aad1e5..053af849 100644 
--- a/server/src/order_book/levels.rs +++ b/server/src/order_book/levels.rs @@ -60,7 +60,8 @@ fn l2_levels_to_l2_levels( n_sig_figs: Option, mantissa: Option, ) -> Vec { - let mut new_levels = Vec::new(); + let cap = n_levels.unwrap_or(levels.len()); + let mut new_levels = Vec::with_capacity(cap); if n_levels == Some(0) { return new_levels; } @@ -82,7 +83,8 @@ fn map_to_l2_levels( n_sig_figs: Option, mantissa: Option, ) -> Vec { - let mut levels = Vec::new(); + let cap = n_levels.unwrap_or_else(|| orders.len().min(1024)); + let mut levels = Vec::with_capacity(cap); if n_levels == Some(0) { return levels; } From f27a70e4f97f3d3c609b70aaa99b36e2eb4eda18 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 15:43:25 -0400 Subject: [PATCH 10/65] feat: add --enable-websocket cli flag (default off) --- binaries/src/bin/dz_hl_publisher.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/binaries/src/bin/dz_hl_publisher.rs b/binaries/src/bin/dz_hl_publisher.rs index 3f794b3b..ac537a68 100644 --- a/binaries/src/bin/dz_hl_publisher.rs +++ b/binaries/src/bin/dz_hl_publisher.rs @@ -154,6 +154,12 @@ struct Args { /// In streaming mode, tail fills on an independent listener so TOB trades do not wait behind book processing. #[arg(long, default_value_t = false)] separate_fill_ingest: bool, + + /// Enable the WebSocket listener at `{address}:{port}`. Off by default — + /// this publisher is multicast-only by default. When set, per-subscriber + /// L2 snapshot variants are computed to serve active WebSocket subscriptions. 
+ #[arg(long, default_value_t = false)] + enable_websocket: bool, } impl Args { @@ -309,4 +315,23 @@ mod tests { Args::parse_from(["dz_hl_publisher", "--address", "127.0.0.1", "--port", "8000", "--separate-fill-ingest"]); assert_eq!(args.validate().err().as_deref(), Some("--separate-fill-ingest requires --ingest-mode stream")); } + + #[test] + fn enable_websocket_defaults_off() { + let args = Args::parse_from(["dz_hl_publisher", "--address", "127.0.0.1", "--port", "8000"]); + assert!(!args.enable_websocket); + } + + #[test] + fn enable_websocket_flag_parses() { + let args = Args::parse_from([ + "dz_hl_publisher", + "--address", + "127.0.0.1", + "--port", + "8000", + "--enable-websocket", + ]); + assert!(args.enable_websocket); + } } From c9927b69e83d6e8f7d5d2815d555e7e158a8bf6d Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 16:28:07 -0400 Subject: [PATCH 11/65] feat: thread enable_websocket through run_websocket_server with startup log --- binaries/src/bin/dz_hl_publisher.rs | 1 + server/src/servers/websocket_server.rs | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/binaries/src/bin/dz_hl_publisher.rs b/binaries/src/bin/dz_hl_publisher.rs index ac537a68..3d302e86 100644 --- a/binaries/src/bin/dz_hl_publisher.rs +++ b/binaries/src/bin/dz_hl_publisher.rs @@ -250,6 +250,7 @@ async fn main() -> Result<()> { ingest_mode, args.hl_data_root, args.separate_fill_ingest, + args.enable_websocket, ) .await?; diff --git a/server/src/servers/websocket_server.rs b/server/src/servers/websocket_server.rs index 3daaf4a5..93abc394 100644 --- a/server/src/servers/websocket_server.rs +++ b/server/src/servers/websocket_server.rs @@ -50,6 +50,7 @@ use crate::{ /// separate tasks are spawned for the DoB mktdata, refdata, and heartbeat channels. /// Both channels share a single instrument registry, bootstrapped from the HL API URL /// in `multicast_config` when available. 
+#[allow(clippy::too_many_arguments)] pub async fn run_websocket_server( address: &str, ignore_spot: bool, @@ -59,11 +60,18 @@ pub async fn run_websocket_server( ingest_mode: IngestMode, hl_data_root: Option, separate_fill_ingest: bool, + enable_websocket: bool, ) -> Result<()> { if separate_fill_ingest && ingest_mode != IngestMode::Stream { return Err("--separate-fill-ingest requires streaming ingest mode".into()); } + if enable_websocket { + info!("websocket mode: ENABLED (listener will bind {address}, full L2 snapshot fan-out active)"); + } else { + info!("websocket mode: DISABLED (multicast-only; --address/--port ignored, L2 fan-out reduced)"); + } + let (market_message_tx, _) = channel::>(100); let (l4_message_tx, _) = channel::>(4096); @@ -674,7 +682,7 @@ mod tests { #[tokio::test] async fn separate_fill_ingest_rejects_block_mode_before_startup() { - let err = run_websocket_server("127.0.0.1:0", true, 1, None, None, IngestMode::Block, None, true) + let err = run_websocket_server("127.0.0.1:0", true, 1, None, None, IngestMode::Block, None, true, true) .await .err() .map(|err| err.to_string()); From 96c4e356c83d29865a88e20dc99c270c7ab42b4d Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 16:42:58 -0400 Subject: [PATCH 12/65] feat: skip ws tcp bind when --enable-websocket is off gate the TcpListener::bind + axum::serve block on `enable_websocket`; when false (the default), the ws port is never bound and the task parks via `std::future::pending` so the hl_listen and multicast publisher tasks keep running. adds an integration test that verifies the port remains rebindable in the disabled case. 
--- binaries/src/bin/dz_hl_publisher.rs | 6 ++- server/src/servers/websocket_server.rs | 17 +++++-- server/tests/websocket_disabled_test.rs | 67 +++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 6 deletions(-) create mode 100644 server/tests/websocket_disabled_test.rs diff --git a/binaries/src/bin/dz_hl_publisher.rs b/binaries/src/bin/dz_hl_publisher.rs index 3d302e86..98b1fe87 100644 --- a/binaries/src/bin/dz_hl_publisher.rs +++ b/binaries/src/bin/dz_hl_publisher.rs @@ -186,7 +186,11 @@ async fn main() -> Result<()> { }; let full_address = format!("{}:{}", args.address, args.port); - println!("Running websocket server on {full_address}"); + if args.enable_websocket { + println!("Running websocket server on {full_address}"); + } else { + println!("Running multicast-only publisher (websocket disabled; --address/--port unused)"); + } let compression_level = args.websocket_compression_level.unwrap_or(1); if !args.disable_metrics { diff --git a/server/src/servers/websocket_server.rs b/server/src/servers/websocket_server.rs index 93abc394..70521da1 100644 --- a/server/src/servers/websocket_server.rs +++ b/server/src/servers/websocket_server.rs @@ -337,12 +337,19 @@ pub async fn run_websocket_server( }), ); - let listener = TcpListener::bind(address).await?; - info!("WebSocket server running at ws://{address}"); + if enable_websocket { + let listener = TcpListener::bind(address).await?; + info!("WebSocket server running at ws://{address}"); - if let Err(err) = axum::serve(listener, app.into_make_service()).await { - error!("Server fatal error: {err}"); - std::process::exit(2); + if let Err(err) = axum::serve(listener, app.into_make_service()).await { + error!("Server fatal error: {err}"); + std::process::exit(2); + } + } else { + // Multicast-only mode (default): do not bind the WebSocket listener. + // The hl_listen + multicast publisher tasks spawned above keep the + // process alive; park here so run_websocket_server does not return. 
+ std::future::pending::<()>().await; } Ok(()) diff --git a/server/tests/websocket_disabled_test.rs b/server/tests/websocket_disabled_test.rs new file mode 100644 index 00000000..6fa64c3e --- /dev/null +++ b/server/tests/websocket_disabled_test.rs @@ -0,0 +1,67 @@ +//! Verifies that the WS port is not bound when enable_websocket is false. + +#![allow(unused_crate_dependencies)] + +use std::{ + net::{Ipv4Addr, SocketAddr, TcpListener as StdListener}, + path::PathBuf, + time::Duration, +}; + +use server::{IngestMode, run_websocket_server}; + +/// Creates the HL data directory structure required by hl_listen with IngestMode::Block. +/// Returns the root path; the caller is responsible for cleanup. +fn make_hl_data_root() -> PathBuf { + let root = std::env::temp_dir() + .join(format!("ws_disabled_test_{}", std::process::id())); + std::fs::create_dir_all(root.join("node_order_statuses_by_block")).unwrap(); + std::fs::create_dir_all(root.join("node_raw_book_diffs_by_block")).unwrap(); + std::fs::create_dir_all(root.join("node_fills_by_block")).unwrap(); + root +} + +#[tokio::test(flavor = "multi_thread")] +async fn ws_port_not_bound_when_websocket_disabled() { + let probe = StdListener::bind((Ipv4Addr::LOCALHOST, 0)).expect("bind probe"); + let port = probe.local_addr().expect("local_addr").port(); + drop(probe); + + let hl_data_root = make_hl_data_root(); + let hl_data_path = hl_data_root.clone(); + let address = format!("127.0.0.1:{port}"); + let server_handle = tokio::spawn(async move { + let _ = run_websocket_server( + &address, + true, + 1, + None, + None, + IngestMode::Block, + Some(hl_data_path), + false, + false, // enable_websocket = false + ) + .await; + }); + + // This sleep gives the server task a chance to run far enough to bind the + // WS port *if it were going to*. The race is one-directional and safe in + // the disabled case: no bind ever happens, so whether the task has been + // polled yet or not, the port stays unbound and the rebind below succeeds. 
+ // The test asserts absence-of-bind, not presence-of-park — it guards + // against a future flag-flip mistake re-introducing the bind, and cannot + // spuriously pass in the disabled path. + tokio::time::sleep(Duration::from_millis(200)).await; + + let addr = SocketAddr::from((Ipv4Addr::LOCALHOST, port)); + let rebind = StdListener::bind(addr); + assert!( + rebind.is_ok(), + "expected ws port {port} to be unbound when enable_websocket=false; rebind err: {:?}", + rebind.err() + ); + + server_handle.abort(); + let _ = std::fs::remove_dir_all(&hl_data_root); +} From 387b2248abbccbe85fdce57b1bc675531b3f33aa Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 22:25:00 -0400 Subject: [PATCH 13/65] feat: thread enable_websocket into OrderBookListener and OrderBookState --- server/src/listeners/order_book/mod.rs | 8 +++++++- server/src/listeners/order_book/state.rs | 4 ++++ server/src/servers/websocket_server.rs | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 888a71dc..01c2a24f 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -481,6 +481,7 @@ struct UnresolvedStreamNewDebug { pub(crate) struct OrderBookListener { ingest_mode: IngestMode, ignore_spot: bool, + enable_websocket: bool, fill_status_file: Option, order_status_file: Option, order_diff_file: Option, @@ -533,6 +534,7 @@ impl OrderBookListener { Self { ingest_mode, ignore_spot, + enable_websocket: false, fill_status_file: None, order_status_file: None, order_diff_file: None, @@ -614,6 +616,10 @@ impl OrderBookListener { self.l4_message_tx = Some(tx); } + pub(crate) fn set_enable_websocket(&mut self, enable_websocket: bool) { + self.enable_websocket = enable_websocket; + } + fn l4_message_tx(&self) -> Option<&Sender>> { self.l4_message_tx.as_ref().or(self.internal_message_tx.as_ref()) } @@ -1320,7 +1326,7 @@ impl OrderBookListener { fn 
init_from_snapshot(&mut self, snapshot: Snapshots, height: u64) { info!("No existing snapshot"); - let mut new_order_book = OrderBookState::from_snapshot(snapshot, height, 0, true, self.ignore_spot); + let mut new_order_book = OrderBookState::from_snapshot(snapshot, height, 0, true, self.ignore_spot, self.enable_websocket); // In stream mode, drop any buffered stream events at heights <= snapshot // height — those are already reflected in the snapshot. Replaying them // would either dup orders (New) or get rejected by apply_stream_diff's diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index 4dbd4165..266da6ee 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -19,6 +19,7 @@ pub(super) struct OrderBookState { time: u64, snapped: bool, ignore_spot: bool, + enable_websocket: bool, /// Present when the DoB emitter is wired in. The tap is NOT propagated to /// the cloned copy used for snapshot validation (validation reads only; no /// events should be emitted from it). @@ -50,6 +51,7 @@ impl Clone for OrderBookState { time: self.time, snapped: self.snapped, ignore_spot: self.ignore_spot, + enable_websocket: self.enable_websocket, // The tap is intentionally not cloned: the clone is used only for // snapshot validation and must not emit DoB events. 
dob_tap: None, @@ -64,9 +66,11 @@ impl OrderBookState { time: u64, ignore_triggers: bool, ignore_spot: bool, + enable_websocket: bool, ) -> Self { Self { ignore_spot, + enable_websocket, time, height, order_book: OrderBooks::from_snapshots(snapshot, ignore_triggers), diff --git a/server/src/servers/websocket_server.rs b/server/src/servers/websocket_server.rs index 70521da1..1fb15cd6 100644 --- a/server/src/servers/websocket_server.rs +++ b/server/src/servers/websocket_server.rs @@ -84,6 +84,7 @@ pub async fn run_websocket_server( let market_message_tx = market_message_tx.clone(); let mut listener = OrderBookListener::new_with_ingest_mode(Some(market_message_tx), ignore_spot, ingest_mode); listener.set_l4_message_tx(l4_message_tx.clone()); + listener.set_enable_websocket(enable_websocket); listener }; let listener = Arc::new(Mutex::new(listener)); From 1704c9b3529f18dea3aba8deeead03c73edd1e1f Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 22:33:02 -0400 Subject: [PATCH 14/65] test: tidy websocket_disabled_test doc backticks and unwrap allow --- server/tests/websocket_disabled_test.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/server/tests/websocket_disabled_test.rs b/server/tests/websocket_disabled_test.rs index 6fa64c3e..a6462cdb 100644 --- a/server/tests/websocket_disabled_test.rs +++ b/server/tests/websocket_disabled_test.rs @@ -1,6 +1,7 @@ -//! Verifies that the WS port is not bound when enable_websocket is false. +//! Verifies that the WS port is not bound when `enable_websocket` is false. #![allow(unused_crate_dependencies)] +#![allow(clippy::unwrap_used)] use std::{ net::{Ipv4Addr, SocketAddr, TcpListener as StdListener}, @@ -10,11 +11,11 @@ use std::{ use server::{IngestMode, run_websocket_server}; -/// Creates the HL data directory structure required by hl_listen with IngestMode::Block. -/// Returns the root path; the caller is responsible for cleanup. 
+/// Creates the HL data directory structure required by `hl_listen` with +/// `IngestMode::Block`. Returns the root path; the caller is responsible for +/// cleanup. fn make_hl_data_root() -> PathBuf { - let root = std::env::temp_dir() - .join(format!("ws_disabled_test_{}", std::process::id())); + let root = std::env::temp_dir().join(format!("ws_disabled_test_{}", std::process::id())); std::fs::create_dir_all(root.join("node_order_statuses_by_block")).unwrap(); std::fs::create_dir_all(root.join("node_raw_book_diffs_by_block")).unwrap(); std::fs::create_dir_all(root.join("node_fills_by_block")).unwrap(); From 5d9fb0ec3de524c9667ece2160c8f5c0b8180878 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 22:38:00 -0400 Subject: [PATCH 15/65] perf: skip 6 bucketed l2 variants and cap unbucketed to bbo when websocket disabled --- server/src/listeners/order_book/state.rs | 7 +- server/src/listeners/order_book/utils.rs | 129 +++++++++++++++++++---- 2 files changed, 115 insertions(+), 21 deletions(-) diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index 266da6ee..50be0870 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -99,13 +99,16 @@ impl OrderBookState { None } else { self.snapped = prevent_future_snaps || self.snapped; - Some((self.time, compute_l2_snapshots(&self.order_book))) + Some((self.time, compute_l2_snapshots(&self.order_book, self.enable_websocket))) } } #[cfg(test)] pub(super) fn compute_l2_snapshots_for_test(&self) -> (u64, L2Snapshots) { - (self.time, compute_l2_snapshots(&self.order_book)) + // Always compute the full 7-variant map for test introspection, + // independent of the runtime enable_websocket flag, so tests that + // assert on bucketed L2 levels are not silently given a reduced map. 
+ (self.time, compute_l2_snapshots(&self.order_book, true)) } pub(super) fn compute_universe(&self) -> HashSet { diff --git a/server/src/listeners/order_book/utils.rs b/server/src/listeners/order_book/utils.rs index fc4fbfad..b882826b 100644 --- a/server/src/listeners/order_book/utils.rs +++ b/server/src/listeners/order_book/utils.rs @@ -213,35 +213,77 @@ impl L2SnapshotParams { } } -pub(super) fn compute_l2_snapshots(order_books: &OrderBooks) -> L2Snapshots { +/// Number of L2 levels retained in the unbucketed snapshot when WebSocket is +/// disabled. TOB (BBO) needs 1; we keep this tight because the only consumer +/// in the WS-disabled config is the TOB best-bid/ask. If a downstream consumer +/// ever needs deeper levels in the WS-disabled config, raise this constant. +/// +/// `pub(super)` so the listener module can reference it in a compile-time +/// invariant check against the TOB freshness threshold (see L2-5 backstop). +pub(super) const DEFAULT_MULTICAST_LEVELS_WHEN_WS_DISABLED: usize = 1; + +/// Derives one bucketed L2 variant from an earlier entry and appends it. +/// +/// `source_offset` selects which already-pushed entry to bucket from, counting +/// back from the end (`1` = the most recent entry, `2` = the one before it). +/// The `Some(5)` mantissa case uses `2` deliberately: it must bucket from the +/// un-mantissa'd `n_sig_figs=5` entry, not from the `Some(2)` one, because +/// `Some(2)` is NOT a superset of the `Some(5)` information. 
+fn push_bucketed_variant( + entries: &mut Vec<(L2SnapshotParams, Snapshot)>, + n_sig_figs: Option, + mantissa: Option, + source_offset: usize, +) { + debug_assert!( + entries.len() >= source_offset, + "push_bucketed_variant source_offset {source_offset} exceeds entries.len() {}", + entries.len() + ); + if let Some((_, src)) = entries.get(entries.len() - source_offset) { + let snapshot = src.to_l2_snapshot(None, n_sig_figs, mantissa); + entries.push((L2SnapshotParams { n_sig_figs, mantissa }, snapshot)); + } +} + +pub(super) fn compute_l2_snapshots( + order_books: &OrderBooks, + enable_websocket: bool, +) -> L2Snapshots { L2Snapshots( order_books .as_ref() .par_iter() .map(|(coin, order_book)| { - let mut entries = Vec::new(); - let snapshot = order_book.to_l2_snapshot(None, None, None); - entries.push((L2SnapshotParams { n_sig_figs: None, mantissa: None }, snapshot)); - let mut add_new_snapshot = |n_sig_figs: Option, mantissa: Option, idx: usize| { - if let Some((_, last_snapshot)) = &entries.get(entries.len() - idx) { - let snapshot = last_snapshot.to_l2_snapshot(None, n_sig_figs, mantissa); - entries.push((L2SnapshotParams { n_sig_figs, mantissa }, snapshot)); - } + let mut entries: Vec<(L2SnapshotParams, Snapshot)> = + Vec::with_capacity(if enable_websocket { 7 } else { 1 }); + // L2-3b: when WS is disabled, the only consumer of the + // unbucketed snapshot is TOB BBO. Cap the ladder walk at + // DEFAULT_MULTICAST_LEVELS_WHEN_WS_DISABLED levels per side. + let base_n_levels = if enable_websocket { + None + } else { + Some(DEFAULT_MULTICAST_LEVELS_WHEN_WS_DISABLED) }; - for n_sig_figs in (2..=5).rev() { - if n_sig_figs == 5 { - for mantissa in [None, Some(2), Some(5)] { - if mantissa == Some(5) { - // Some(2) is NOT a superset of this info! 
- add_new_snapshot(Some(n_sig_figs), mantissa, 2); - } else { - add_new_snapshot(Some(n_sig_figs), mantissa, 1); + let snapshot = order_book.to_l2_snapshot(base_n_levels, None, None); + entries.push((L2SnapshotParams { n_sig_figs: None, mantissa: None }, snapshot)); + + if enable_websocket { + for n_sig_figs in (2..=5).rev() { + if n_sig_figs == 5 { + for mantissa in [None, Some(2), Some(5)] { + if mantissa == Some(5) { + push_bucketed_variant(&mut entries, Some(n_sig_figs), mantissa, 2); + } else { + push_bucketed_variant(&mut entries, Some(n_sig_figs), mantissa, 1); + } } + } else { + push_bucketed_variant(&mut entries, Some(n_sig_figs), None, 1); } - } else { - add_new_snapshot(Some(n_sig_figs), None, 1); } } + (coin.clone(), entries.into_iter().collect::>>()) }) .collect(), @@ -283,3 +325,52 @@ impl BatchQueue { self.deque.front() } } + +#[cfg(test)] +mod compute_l2_snapshots_tests { + use super::*; + use crate::order_book::multi_book::OrderBooks; + use crate::test_fixtures::build_one_coin_snapshot; + use crate::types::inner::InnerL4Order; + + /// Build a minimal `OrderBooks` with one coin and the given + /// number of orders per side so tests can control how many levels exist. 
+ fn build_order_books(coin_str: &str, num_orders: usize) -> OrderBooks { + let snapshot = build_one_coin_snapshot(coin_str, num_orders, 0); + OrderBooks::from_snapshots(snapshot, false) + } + + #[test] + fn ws_enabled_emits_seven_variants_per_coin() { + let order_books = build_order_books("BTC", 1); + let snapshots = compute_l2_snapshots(&order_books, true); + for variants in snapshots.as_ref().values() { + assert_eq!(variants.len(), 7, "WS-enabled must emit all 7 variants"); + } + } + + #[test] + fn ws_disabled_emits_one_variant_per_coin() { + let order_books = build_order_books("BTC", 1); + let snapshots = compute_l2_snapshots(&order_books, false); + for variants in snapshots.as_ref().values() { + assert_eq!(variants.len(), 1, "WS-disabled must emit only the unbucketed variant"); + assert!(variants.contains_key(&L2SnapshotParams { n_sig_figs: None, mantissa: None })); + } + } + + #[test] + fn ws_disabled_unbucketed_capped_at_bbo() { + // Use >1 level per side so the cap is meaningful. 
+ let order_books = build_order_books("BTC", 5); + let snapshots = compute_l2_snapshots(&order_books, false); + for variants in snapshots.as_ref().values() { + let unbucketed = variants + .get(&L2SnapshotParams { n_sig_figs: None, mantissa: None }) + .expect("unbucketed variant present"); + let levels = unbucketed.clone().export_inner_snapshot(); + assert!(levels[0].len() <= 1, "bids capped at BBO when WS disabled"); + assert!(levels[1].len() <= 1, "asks capped at BBO when WS disabled"); + } + } +} From f9eae5d25925ed3ce4d97514ba9d2c0d34b57d31 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Fri, 15 May 2026 22:51:14 -0400 Subject: [PATCH 16/65] test: assert tob bbo is identical across enable_websocket configs --- server/src/listeners/order_book/utils.rs | 62 ++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/server/src/listeners/order_book/utils.rs b/server/src/listeners/order_book/utils.rs index b882826b..899dcfe1 100644 --- a/server/src/listeners/order_book/utils.rs +++ b/server/src/listeners/order_book/utils.rs @@ -373,4 +373,66 @@ mod compute_l2_snapshots_tests { assert!(levels[1].len() <= 1, "asks capped at BBO when WS disabled"); } } + + /// The core TOB-invariance property: the BBO (level 0 of the unbucketed + /// variant) is identical whether `enable_websocket` is true or false. + /// + /// TOB (multicast) derives its best bid/ask exclusively from level 0 of the + /// `(n_sig_figs=None, mantissa=None)` variant. That level is computed by a + /// ladder walk capped at `n_levels=1` when WS is disabled, and by a full, + /// uncapped walk when WS is enabled. The BBO price and size at level 0 must + /// be identical in both cases — this test asserts that property directly. + /// + /// Multiple coins are tested to rule out a single-coin coincidence.
+ #[test] + fn tob_bbo_identical_across_websocket_flag() { + use crate::order_book::{Coin, Snapshot}; + use std::collections::HashMap; + + // Build a two-coin Snapshots with >1 level per side so the WS-disabled + // cap (n_levels=1) is meaningful. Using build_one_coin_snapshot for BTC + // (5 orders per side) and ETH (3 orders per side) to exercise the + // multi-coin parallel path. + let btc_snapshots = build_one_coin_snapshot("BTC", 5, 0); + let eth_snapshots = build_one_coin_snapshot("ETH", 3, 10_000); + + // Merge into a single Snapshots map. + let mut combined: HashMap> = btc_snapshots.value(); + combined.extend(eth_snapshots.value()); + let order_books = OrderBooks::from_snapshots(Snapshots::new(combined), false); + + let ws_on = compute_l2_snapshots(&order_books, true); + let ws_off = compute_l2_snapshots(&order_books, false); + + // Both maps must cover the same set of coins. + assert_eq!( + ws_on.as_ref().len(), + ws_off.as_ref().len(), + "same coin count regardless of enable_websocket" + ); + + let unbucketed_key = L2SnapshotParams { n_sig_figs: None, mantissa: None }; + + for (coin, on_variants) in ws_on.as_ref() { + let off_variants = ws_off.as_ref().get(coin).expect("coin present in ws-off map"); + + let on_unbucketed = on_variants.get(&unbucketed_key).expect("ws-on has unbucketed variant"); + let off_unbucketed = off_variants.get(&unbucketed_key).expect("ws-off has unbucketed variant"); + + // TOB reads exactly 1 level (BBO). Compare the BBO derived from + // each flag configuration — they must be byte-identical. + let on_bbo = on_unbucketed.truncate(1).export_inner_snapshot(); + let off_bbo = off_unbucketed.truncate(1).export_inner_snapshot(); + + assert_eq!( + on_bbo, off_bbo, + "TOB BBO must be identical across enable_websocket for coin {coin:?}: \ + ws-on={on_bbo:?} vs ws-off={off_bbo:?}", + ); + + // Sanity: the BBO must be non-empty (both coins have resting orders). 
+ assert!(!on_bbo[0].is_empty(), "BBO bid must be non-empty for {coin:?}"); + assert!(!on_bbo[1].is_empty(), "BBO ask must be non-empty for {coin:?}"); + } + } } From acce6699d18295fafb68907395a4eaf2ab93915c Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Sun, 17 May 2026 21:36:23 -0400 Subject: [PATCH 17/65] fix: reject startup when no market-data output is configured --- README.md | 4 ++- binaries/src/bin/dz_hl_publisher.rs | 54 +++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 41562142..1a7e6b60 100644 --- a/README.md +++ b/README.md @@ -34,9 +34,11 @@ The `l4book` subscription first sends a snapshot of the entire book and then for 2. Then run this local server: ```bash -cargo run --release --bin dz_hl_publisher -- --address 0.0.0.0 --port 8000 +cargo run --release --bin dz_hl_publisher -- --address 0.0.0.0 --port 8000 --enable-websocket ``` +The publisher is multicast-only by default and requires at least one output mode (`--enable-websocket`, `--multicast-group`, or `--dob-group`); starting without any of these is rejected at startup. + By default the server reads `$HOME/hl/data/node_*_by_block`. 
To opt into streaming disk ingest, use: ```bash diff --git a/binaries/src/bin/dz_hl_publisher.rs b/binaries/src/bin/dz_hl_publisher.rs index 98b1fe87..e47e1cab 100644 --- a/binaries/src/bin/dz_hl_publisher.rs +++ b/binaries/src/bin/dz_hl_publisher.rs @@ -168,6 +168,13 @@ impl Args { if self.separate_fill_ingest && ingest_mode != IngestMode::Stream { return Err("--separate-fill-ingest requires --ingest-mode stream".to_owned()); } + if !self.enable_websocket && self.multicast_group.is_none() && self.dob_group.is_none() { + return Err( + "no market-data output configured: pass at least one of --enable-websocket, \ + --multicast-group, or --dob-group" + .to_owned(), + ); + } Ok(ingest_mode) } } @@ -339,4 +346,51 @@ mod tests { ]); assert!(args.enable_websocket); } + + #[test] + fn rejects_when_no_output_configured() { + let args = Args::parse_from(["dz_hl_publisher", "--address", "0.0.0.0", "--port", "8000"]); + assert!( + args.validate() + .err() + .is_some_and(|e| e.contains("no market-data output configured")), + "bare --address/--port with no output mode must be rejected" + ); + } + + #[test] + fn accepts_when_enable_websocket_set() { + let args = Args::parse_from([ + "dz_hl_publisher", "--address", "0.0.0.0", "--port", "8000", "--enable-websocket", + ]); + assert!(args.validate().is_ok(), "--enable-websocket alone is a valid output mode"); + } + + #[test] + fn accepts_when_multicast_group_set() { + let args = Args::parse_from([ + "dz_hl_publisher", + "--address", + "0.0.0.0", + "--port", + "8000", + "--multicast-group", + "239.0.0.1", + ]); + assert!(args.validate().is_ok(), "--multicast-group alone is a valid output mode"); + } + + #[test] + fn accepts_when_dob_group_set() { + let args = Args::parse_from([ + "dz_hl_publisher", + "--address", + "0.0.0.0", + "--port", + "8000", + "--dob-group", + "239.0.0.2", + ]); + assert!(args.validate().is_ok(), "--dob-group alone is a valid output mode"); + } } From 3f9f006b44cdc14ca73a6871c43eaabafdf2cdf6 Mon Sep 17 00:00:00 
2001 From: Steve Shaw Date: Sun, 17 May 2026 21:44:13 -0400 Subject: [PATCH 18/65] feat: add book_dirty flag to OrderBookState set only on real mutations --- server/src/listeners/order_book/state.rs | 79 +++++++++++++++++++----- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index 50be0870..eaa3d6ff 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -20,6 +20,12 @@ pub(super) struct OrderBookState { snapped: bool, ignore_spot: bool, enable_websocket: bool, + /// Set true only when a mutation actually changed the resting book since + /// the last snapshot. Cleared by an authoritative snapshot emission. + /// L2-5 uses this so streaming finalization can decide whether to emit a + /// snapshot for the closing block. Soft-tolerance no-op branches MUST NOT + /// set this — see `apply_stream_diff`. + book_dirty: bool, /// Present when the DoB emitter is wired in. The tap is NOT propagated to /// the cloned copy used for snapshot validation (validation reads only; no /// events should be emitted from it). @@ -52,6 +58,7 @@ impl Clone for OrderBookState { snapped: self.snapped, ignore_spot: self.ignore_spot, enable_websocket: self.enable_websocket, + book_dirty: self.book_dirty, // The tap is intentionally not cloned: the clone is used only for // snapshot validation and must not emit DoB events. 
dob_tap: None, @@ -75,6 +82,7 @@ impl OrderBookState { height, order_book: OrderBooks::from_snapshots(snapshot, ignore_triggers), snapped: false, + book_dirty: false, dob_tap: None, } } @@ -88,6 +96,18 @@ impl OrderBookState { self.height } + pub(super) const fn book_dirty(&self) -> bool { + self.book_dirty + } + + pub(super) const fn mark_book_dirty(&mut self) { + self.book_dirty = true; + } + + pub(super) const fn clear_book_dirty(&mut self) { + self.book_dirty = false; + } + // forcibly take snapshot - (time, height, snapshot) pub(super) fn compute_snapshot(&self) -> TimedSnapshots { TimedSnapshots { time: self.time, height: self.height, snapshot: self.order_book.to_snapshots_par() } @@ -199,7 +219,12 @@ impl OrderBookState { let inner_order = resting_order_from_raw_new(order, &diff, sz)?; let order_for_tap = inner_order.clone(); self.order_book.add_resting_order_from_diff(inner_order); - if !self.order_book.contains_order(&oid, &coin) { + if self.order_book.contains_order(&oid, &coin) { + // Order actually rested — book mutated. + self.book_dirty = true; + } else { + // Soft-tolerance no-op: order did not rest, so the book + // was NOT mutated — do NOT set book_dirty here. log::warn!( "apply_updates: New order did not rest after raw-diff insert, later updates will be missing; \ height={height} oid={oid:?} coin={coin:?} order={order_for_tap:?}" @@ -209,16 +234,20 @@ impl OrderBookState { tap.emit_order_add(&coin, &order_for_tap, time_ns); } } else { - // Soft-tolerance: New diff without matching opening status. - // We can't add the order without the user/time/cloid info from - // the status, so skip and let snapshot validation reconcile. - // Same shape as the Update/Remove missing-order branches below. + // Soft-tolerance no-op: New diff without matching opening + // status. We can't add the order without user/time/cloid + // from the status, so skip and let snapshot validation + // reconcile. The book was NOT mutated — do NOT set + // book_dirty. 
Same shape as the Update/Remove + // missing-order branches below. log::warn!("apply_updates: New diff without matching opening status, skipping {diff:?}"); } } InnerOrderDiff::Update { new_sz, .. } => { match self.order_book.modify_sz(oid.clone(), coin.clone(), new_sz) { Some((old_sz, px)) => { + // Order size modified — book mutated. + self.book_dirty = true; if let Some(tap) = self.dob_tap.as_mut() { // exec_quantity = reduction in resting size let exec_quantity = @@ -227,23 +256,27 @@ impl OrderBookState { } } None => { - // Soft-tolerance: the order isn't on our book, but the venue - // has an Update for it. Snapshot validation runs every 60s - // and applies surgical recovery on divergence; missing this - // event won't permanently corrupt state. Crashing here would - // turn what's likely a transient ordering race into a hard - // failure cycle. + // Soft-tolerance no-op: the order isn't on our book, but + // the venue has an Update for it. The book was NOT + // mutated — do NOT set book_dirty. Snapshot validation + // runs every 60s and applies surgical recovery on + // divergence; missing this event won't permanently + // corrupt state. Crashing here would turn what's likely + // a transient ordering race into a hard failure cycle. log::warn!("apply_updates: Update for missing order at height {height}, skipping {diff:?}"); } } } InnerOrderDiff::Remove => { if self.order_book.cancel_order(oid.clone(), coin.clone()) { + // Order cancelled — book mutated. + self.book_dirty = true; if let Some(tap) = self.dob_tap.as_mut() { tap.emit_order_cancel(&coin, oid, time_ns); } } else { - // Soft-tolerance — see Update branch above. + // Soft-tolerance no-op — see Update branch above. Book NOT + // mutated; do NOT set book_dirty. 
log::warn!("apply_updates: Remove for missing order at height {height}, skipping {diff:?}"); } } @@ -285,6 +318,7 @@ impl OrderBookState { let oid = diff.oid(); let coin = diff.coin(); if coin.is_spot() && self.ignore_spot { + // Spot-ignored no-op: book NOT mutated; do NOT set book_dirty. self.height = self.height.max(block_number); self.time = block_time_ms; return Ok(false); @@ -294,9 +328,10 @@ impl OrderBookState { match inner_diff { InnerOrderDiff::New { sz } => { let Some(order) = order_status else { - // Soft-tolerance: New diff without matching opening status. - // Snapshot validation will reconcile. Advance height anyway so - // we don't replay this diff and so subsequent diffs apply. + // Soft-tolerance no-op: New diff without matching opening + // status. Snapshot validation will reconcile. Advance height + // anyway so we don't replay this diff and so subsequent diffs + // apply. The book was NOT mutated — do NOT set book_dirty. log::warn!("apply_stream_diff: New diff without matching opening status, skipping {diff:?}"); self.height = self.height.max(block_number); self.time = block_time_ms; @@ -306,7 +341,12 @@ impl OrderBookState { let inner_order = resting_order_from_raw_new(order, &diff, sz)?; let order_for_tap = inner_order.clone(); self.order_book.add_resting_order_from_diff(inner_order); - if !self.order_book.contains_order(&oid, &coin) { + if self.order_book.contains_order(&oid, &coin) { + // Order actually rested — book mutated. + self.book_dirty = true; + } else { + // Soft-tolerance no-op: order did not rest, so the book was + // NOT mutated — do NOT set book_dirty here. log::warn!( "apply_stream_diff: New order did not rest after raw-diff insert, later updates will be missing; \ block_number={block_number} oid={oid:?} coin={coin:?} order={order_for_tap:?}" @@ -319,13 +359,16 @@ impl OrderBookState { InnerOrderDiff::Update { new_sz, .. 
} => match self.order_book.modify_sz(oid.clone(), coin.clone(), new_sz) { Some((old_sz, px)) => { + // Order size modified — book mutated. + self.book_dirty = true; if let Some(tap) = self.dob_tap.as_mut() { let exec_quantity = crate::order_book::Sz::new(old_sz.value().saturating_sub(new_sz.value())); tap.emit_order_execute(&coin, oid, px, exec_quantity, time_ns); } } None => { - // Soft-tolerance: see apply_updates' matching branch. + // Soft-tolerance no-op: see apply_updates' matching branch. + // Book NOT mutated; do NOT set book_dirty. log::warn!( "apply_stream_diff: Update for missing order at block {block_number}, skipping {diff:?}" ); @@ -333,6 +376,8 @@ impl OrderBookState { }, InnerOrderDiff::Remove => { if self.order_book.cancel_order(oid.clone(), coin.clone()) { + // Order cancelled — book mutated. + self.book_dirty = true; if let Some(tap) = self.dob_tap.as_mut() { tap.emit_order_cancel(&coin, oid, time_ns); } From 2ab72b4f6bfdac262dc01a976b21f85aa8f6046a Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Sun, 17 May 2026 22:17:02 -0400 Subject: [PATCH 19/65] perf: emit streaming l2 snapshot at block finalization, not per chunk L2-5: in streaming + WS-disabled, TOB L2 snapshots now emit once per finalized block (when the book changed) instead of once per file-read chunk. Eliminates the per-chunk snapshot CPU multiplier. Streaming TOB goldens regenerated: the per-chunk path emitted ~27 duplicate quotes per block-time; the new path emits one authoritative quote per finalized dirty block. Every regenerated quote is byte-identical to the block-mode golden at the same source timestamp (verified). Block-mode goldens unchanged. finalize_streaming_for_test updated to mirror finalize_stream_block's authoritative dirty-emission so the regenerated goldens reflect production behavior including the final block. 
--- server/src/listeners/order_book/mod.rs | 151 ++++++++++++++---- server/src/listeners/order_book/state.rs | 12 ++ .../golden/stream_tob_marketdata.bin | Bin 12608 -> 3192 bytes .../golden/parity_report.json | 6 +- .../golden/stream_tob_marketdata.bin | Bin 118224 -> 5232 bytes .../golden/tob_end_of_block_quotes.bin | Bin 3200 -> 3136 bytes 6 files changed, 131 insertions(+), 38 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 01c2a24f..de21b352 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -469,6 +469,19 @@ impl StreamFinalizationMode { } } +/// Whether `emit_tob_snapshot` is taking the authoritative final snapshot for +/// the current dirty range, or a provisional snapshot (e.g. stuck-stream +/// backstop) that does NOT close out the dirty state. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum SnapshotEmission { + /// Block finalization: authoritative final snapshot for this height. + /// Clears `book_dirty` because the block is closed. + Authoritative, + /// Backstop / provisional: emit current state but leave `book_dirty` + /// set so a subsequent finalization still emits its own snapshot. + Provisional, +} + struct UnresolvedStreamNewDebug { height: u64, oid: Oid, @@ -1045,6 +1058,86 @@ impl OrderBookListener { StreamFinalizationMode::GraceFallback => self.streaming_state.finalized_grace_blocks += 1, } self.streaming_state.finalized_height = Some(height); + + // Streaming snapshot emission tied to block finalization (L2-5). + // Authoritative: clears book_dirty. Any earlier provisional/backstop + // emission deliberately left the flag set so this call still fires. + self.emit_authoritative_block_snapshot(&block); + } + + /// Emits the authoritative end-of-block TOB snapshot for `block` if the + /// book changed since the last snapshot. 
Shared by streaming finalization + /// and the test finalization helper so the two cannot drift (the test + /// helper previously lagged this logic). + fn emit_authoritative_block_snapshot(&mut self, block: &StreamingBlock) { + let dirty = self.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty); + if dirty { + let block_time_ms = block.block_time_ms.unwrap_or(0); + let local_time_ms = block.local_time_ms.unwrap_or(block_time_ms); + self.emit_tob_snapshot( + ingest_source_label(EventSource::OrderDiffs), + block_time_ms, + local_time_ms, + SnapshotEmission::Authoritative, + ); + } + } + + fn emit_tob_snapshot( + &mut self, + source_label: &'static str, + source_block_time_ms: u64, + source_local_time_ms: u64, + emission: SnapshotEmission, + ) { + let snapshot_start = Instant::now(); + // L2-5 + Codex finding: `prevent_future_snaps` mirrors the authority of + // this emission. Authoritative emissions set `snapped` so block mode's + // duplicate-chunk suppression keeps working; provisional (backstop) + // emissions leave `snapped` untouched so the later authoritative + // finalization call can still produce a snapshot. Without this, a + // backstop fired before finalization would mark `snapped=true` and the + // finalization's `l2_snapshots(true)` would return `None`, silently + // dropping the authoritative final BBO. + let prevent_future_snaps = emission == SnapshotEmission::Authoritative; + let snapshot = self.l2_snapshots(prevent_future_snaps); + crate::metrics::observe_tob_snapshot_compute(source_label, snapshot_start.elapsed()); + if let Some(snapshot) = snapshot { + // l2_snapshots returned Some, so order_book_state is present; the unwrap_or(0) fallback is purely defensive and should be unreachable.
+ let snapshot_height = self.order_book_state.as_ref().map(OrderBookState::height).unwrap_or(0); + let latest_heights = self.ingest_heights(); + crate::metrics::observe_tob_snapshot_enqueue_lag( + source_label, + Duration::from_millis(now_ms().saturating_sub(snapshot.0)), + ); + crate::metrics::observe_tob_snapshot_source_block_lag( + source_label, + Duration::from_millis(source_block_time_ms.saturating_sub(snapshot.0)), + ); + crate::metrics::observe_tob_snapshot_validator_write_lag( + source_label, + Duration::from_millis(source_local_time_ms.saturating_sub(source_block_time_ms)), + ); + if let Some(tx) = &self.internal_message_tx { + let enqueued_at_ms = now_ms(); + let snapshot_msg = Arc::new(InternalMessage::Snapshot { + l2_snapshots: snapshot.1, + time: snapshot.0, + height: snapshot_height, + source: source_label, + source_block_time_ms, + source_local_time_ms, + latest_heights, + enqueued_at_ms, + }); + let _unused = tx.send(snapshot_msg); + } + if emission == SnapshotEmission::Authoritative + && let Some(state) = self.order_book_state.as_mut() + { + state.clear_book_dirty(); + } + } } fn receive_stream_statuses(&mut self, batch: Batch) -> Result<()> { @@ -1326,7 +1419,8 @@ impl OrderBookListener { fn init_from_snapshot(&mut self, snapshot: Snapshots, height: u64) { info!("No existing snapshot"); - let mut new_order_book = OrderBookState::from_snapshot(snapshot, height, 0, true, self.ignore_spot, self.enable_websocket); + let mut new_order_book = + OrderBookState::from_snapshot(snapshot, height, 0, true, self.ignore_spot, self.enable_websocket); // In stream mode, drop any buffered stream events at heights <= snapshot // height — those are already reflected in the snapshot. 
Replaying them // would either dup orders (New) or get rejected by apply_stream_diff's @@ -1417,6 +1511,7 @@ impl OrderBookListener { state.emit_batch_boundary(1, height, block_time_ms); } self.streaming_state.finalized_height = Some(height); + self.emit_authoritative_block_snapshot(&block); } Ok(()) } @@ -1855,44 +1950,30 @@ impl DirectoryListener for OrderBookListener { } let _reported = self.progress.report_if_due(); } + // Fills do not drive TOB snapshots in streaming mode. if self.ingest_mode == IngestMode::Stream && event_source == EventSource::Fills { return Ok(()); } - let snapshot_source = ingest_source_label(event_source); - let snapshot_start = Instant::now(); - let snapshot = self.l2_snapshots(true); - crate::metrics::observe_tob_snapshot_compute(snapshot_source, snapshot_start.elapsed()); - if let Some(snapshot) = snapshot { - let snapshot_height = self.order_book_state.as_ref().map(OrderBookState::height).unwrap_or(0); - let latest_heights = self.ingest_heights(); - let (source_block_time_ms, source_local_time_ms) = last_source_times.unwrap_or((snapshot.0, snapshot.0)); - crate::metrics::observe_tob_snapshot_enqueue_lag( - snapshot_source, - Duration::from_millis(now_ms().saturating_sub(snapshot.0)), - ); - crate::metrics::observe_tob_snapshot_source_block_lag( - snapshot_source, - Duration::from_millis(source_block_time_ms.saturating_sub(snapshot.0)), - ); - crate::metrics::observe_tob_snapshot_validator_write_lag( - snapshot_source, - Duration::from_millis(source_local_time_ms.saturating_sub(source_block_time_ms)), - ); - if let Some(tx) = &self.internal_message_tx { - let enqueued_at_ms = now_ms(); - let snapshot = Arc::new(InternalMessage::Snapshot { - l2_snapshots: snapshot.1, - time: snapshot.0, - height: snapshot_height, - source: snapshot_source, - source_block_time_ms, - source_local_time_ms, - latest_heights, - enqueued_at_ms, - }); - let _unused = tx.send(snapshot); - } + // L2-5: in streaming + WS-disabled, snapshots are emitted by + // 
finalize_stream_block (and the dirty backstop in Task 5.4), not per + // chunk. WS-enabled streaming retains today's per-chunk cadence so WS + // subscribers see the same intra-block L2 update granularity. + if self.ingest_mode == IngestMode::Stream && !self.enable_websocket { + return Ok(()); } + let snapshot_source = ingest_source_label(event_source); + // The original per-chunk fallback for `last_source_times` used + // `snapshot.0` (= state.time()) for both source_block_time_ms and + // source_local_time_ms. Compute that equivalent here before calling the + // helper so block-mode wire output stays byte-identical. + let state_time_ms = self.order_book_state.as_ref().map(OrderBookState::time).unwrap_or(0); + let (source_block_time_ms, source_local_time_ms) = last_source_times.unwrap_or((state_time_ms, state_time_ms)); + self.emit_tob_snapshot( + snapshot_source, + source_block_time_ms, + source_local_time_ms, + SnapshotEmission::Authoritative, + ); Ok(()) } } diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index eaa3d6ff..6f91ec2f 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -96,10 +96,22 @@ impl OrderBookState { self.height } + /// Returns the timestamp of the most-recent order book update, mirroring + /// what `l2_snapshots` returns as `snapshot.0`. Used by + /// `emit_tob_snapshot` to compute the default source times when no file + /// rows were processed in the current chunk. + pub(super) const fn time(&self) -> u64 { + self.time + } + pub(super) const fn book_dirty(&self) -> bool { self.book_dirty } + // Wired into the recovery path in Task 5.4b (apply_recovery marks the book + // dirty so the corrected BBO is emitted). Direct mutation sites set + // self.book_dirty inline; this accessor exists for callers outside state.rs. 
+ #[allow(dead_code)] pub(super) const fn mark_book_dirty(&mut self) { self.book_dirty = true; } diff --git a/server/tests/fixtures/hl_block_mode/golden/stream_tob_marketdata.bin b/server/tests/fixtures/hl_block_mode/golden/stream_tob_marketdata.bin index a2fdf609f8aefe754a423474a808d4b828308126..733409b40897a21f1c33ca359ad284f620484ee6 100644 GIT binary patch delta 90 zcmV-g0Hy!HV)z)4QIlB#u#>q0ACuq%kdrV6v6J}#v6BfH wXOl1+pp&2=u#zHKv$-P&0h53yu#=c7u(MDs2mzB|E{Kz$FrbkrDU)e0FWmAZ=Kufz delta 499 zcma*ftxv;17zS`kyR}ykBp6T&0s@8)L&l$gY!JMXpqgNcViH-o<}t`x)V)p3WF-RcMii(yH??@-UA0$r{C!wWC-v?@=!!K7KPoT7MZ9 fWh|WqGINI8-I2dIKEYFb&HMtBxb1Dm*PZek8{4e) diff --git a/server/tests/fixtures/hl_dual_validator/golden/parity_report.json b/server/tests/fixtures/hl_dual_validator/golden/parity_report.json index 49cb643b..78fe87b5 100644 --- a/server/tests/fixtures/hl_dual_validator/golden/parity_report.json +++ b/server/tests/fixtures/hl_dual_validator/golden/parity_report.json @@ -21,9 +21,9 @@ "tob": { "block_end_quote_count": 52, "block_marketdata_packets": 64, - "matched_end_quote_count": 50, - "stream_end_quote_count": 50, - "stream_marketdata_packets": 1345, + "matched_end_quote_count": 49, + "stream_end_quote_count": 49, + "stream_marketdata_packets": 61, "trade_count": 11 } } diff --git a/server/tests/fixtures/hl_dual_validator/golden/stream_tob_marketdata.bin b/server/tests/fixtures/hl_dual_validator/golden/stream_tob_marketdata.bin index eb05cdcfb87e21b83773e81018531f9d4a22087e..2e0a1281243590fc0a22ab0123f3613f1a75fd22 100644 GIT binary patch delta 345 zcmXAku}cDR9L0GAr3QKJ4Njudi7na`4WSrxxpZz;i;%=YkV;TOP()A%zw!}589`!8 z)=*1dQazVCg$@9Uh63?g=fMi*hVHJnx2kf~xmQw;*9%8!MV z8%(OxP)W6lrqYg5YS2k5|C|0FSnFq1`%_GPjpbbkCICZOUmAU*FlkR9#7IE&Y8(rU z#-Z0Hk)Smt`k&vm@+CQFS2h-vpSZ0hBC(N+L>oWhHOvEPgZz^Ud+28A1H?jWGnk^gZj-=cD#T@8bM8Sc42M&eFfqirT zWXccI1I-W6%_Q@X_{#AXx2~SLUc_1P4ExfuBI_5+Z?1fgm2T4~GpF!~E@oe|1KV#LdyO|N z-Q>SGijRRW{Uy?1}avd*pDW8FZKMpO)tqr;4NiV*6r~Cf6&QRj5 
zpXnZm&Y4Djn5k7anFj~xQXOhu(39bNa-e*nTbO)P`zM@8@ir?^KZK@x->;|Ac!TwC z3cB|74~j$U>an(-nBNe(5oUJXpi6lM9KgZZPr?uI1N>mE!%Fu7hzrC8;=+oH+b_K~ z6b{g}*N4D2Gv5b4zz@!SQSbx&06$nBkoF5AE)W-p3&e#zOUipEw>QK`nC5;XkNFbl(mEeFfCH}s zV&4_~06)MFjLG{z5EqCG#0BC)oU4KSK=}Y1z#-c{G1N`eP1H@)P1Mco`)QFUktdNS zktdNSktgl*A+dg8txQFaPH@R1^?(aT#262J7 zKwO}oL_djs68$94gy<(-?`^@n9OmUTp2J2ziGGsjP~2zXK8yPx-}lSKu6QyL#(pm7 zQXd5l;Na{hz5I~wem3*`TQg4|)~O#ydS8_LJP_#m^BMS3e+CZV!1IgfQ{e~r0e(2V zUq@UZE($p=*ng+kFYJr3Q}|yauznJ{6c>fYp>28MH12~4JC%P4%n#5dKZM1h^}R1* z{CY*0efa{rZqTLc1`goB<4fER!4L2Q{BZdF5aI%Hfw(|i9BIEC@&ob%@&ob%@6!(B$o^0RvNh_}{#msCSUG7t%OMPphap;+P|5@Y*v;8HqA3FDc z`YqDcd_K#o&O?`DW#CKkR%jd&H~likuV1n5^QIng0bTM#p>f!;;h|aN2hHcvJ^TP& z@&h2JDq`yRIE<%Tf5{RCbd zpiB2LVR2y7&-yC&^AOGVUwB=&M9=PwuHJhT3SBq-!6sLJ5FCbT?(dU1w9cJS>&!bs zH)^Qc{^m_e-4a9H6{*@$s__oXQ{ARF2G^;^TWUDop1Uqr>JB&5ZCUn$Qnw7ccHS9i z92V9-G+O1~azj5rm+CJ#^bZc~w43exP;S`oq;LOzl{4OiZkeHOTkBuSRl38WYhO2T zApe?ji2b$KwNF9#x75&Q$-8#ji9+mu5W2$*b=e!8O5GAe-PDq|N>%X|9bmd$|GYuT zVW=SoS+_Y#{uR1$3;y|Yi})pQeTrC|729uS-#+Hha;<;^EzJRgs$fENZ?EJsNmp>Lu&Q$Q+N>T&XdGE_(GTZc5na(9$%t>U4&y)w34k>y7W9DIQZj`dhVP??%Tnd@9XyB3tj40!J#lXtQ)e;RVT!C8)6t2Vd+-1jvmS5 z_TYMHCXR<1ju*P*S8(Vb96Fc3pVK_5T;F6rK$rXh4&dOdL-0eeepq$u)2`e1C_(V&F*_0NMz-}k#N=RB0x zIQ{Mygl?u!2VeUf6!<~yhxYoNuJs#{7j(~|mH8%TJ?=V>SLo`VLo4fcExg~gZY*?l z&x<`;y8FJf(sdrM7_aU*wDNeNOZ6NajxL9`3IE9HJV-D9LYJ;1I1DTfvF7FMo>!z08pw!hp|60~v_0e9X?g-8EVrAX7 znjg8&u@@XFplkO#rW`u1dQr)@(vWZKiD!@D^RD7~8FAgJ40Y?HPbhUqLf7^KIM96n zIDiAsKO*nI5AZ{ne&~30)>pXxNX_@%9oFx>vc*;3WZk&+XZTqUk#)s-$U9e@dG_2# z_(<5aq%pljz{aG?6r-#NtIym*7N4h0n#(9O)pfiJzE1RTJ@@!s!a zl^+&%mfHL7M(NIPdH6TBvZayt6QeZW2kONk-*tPp*4XbQ7xw{Tymf{gpiBKPIDmt* zKWiAbX0r3XMZ}wF-q|sA+X*V&TEjR_KleSQZjGVtfd{rvQjNFTP`B^;cPFZJN9L(+ z#ovFb9B-B3c%zs6X@ZJFWuEFby){QU-U`FP1X&5dEW=SAX&5_E);DJM^55@A>bIr*}K+rF^~QaYFZ09wye) z&qumm?&q2k;A`K2lE+eE_!qk5 zUvK~iXMKYo;D>_tL)qW9gi zusc@2?92;Z3Pi(&F6Pz-PF0M z3ZA&ceng=g%@f^p;(^zk9OUuFJ~(3rAHVi}5OnRn0eq?d4~@g<)UHtc3tjSWfpe(1 
zXtOK-ihQQqN2R;w?REMrDBT2Gz7M()1`e5dlCb=6_ix|J=CkDG+h_7RFP^vXdq0#Y zFx^<+oqb_&fG+tL914>|x@_x5&bVm$c(Uu9B#|FBuZx&Oud zj?gVP)ZH=o+pgz=h3*JL-JWZv)vLx^VW=D3UgCOAT#UEURQJ4ll)6;`>n5Li*6x3G z&pD9uSyUP?bSaYjrl>!#LysMOUxw?)=%ziqzjToJ)R_neudrQ7!UOUm);oK>4z(H+<^MXF>T4y5uvNL&GWA<2Y_LS$_6C$hu&jpLtS}0)m^V~`km)v$4HmWsyUhWtD`mhJ+JYGrQ2NW9-$Jyo2|2l6-vs_w33%Vu!0+V$6iFLWvY?)<>h+;v(1a%DTOk z<dx{+d68jQn%Jnch};Tu5}PG-Wo&Q6`S4bD?+#0P`Bj=ao75w&>d;0 zo4oqndguL{(A8acl>NZQU8vO6T_=@wdn-P6trv^&>aOd`y1UZ9Q0nTgL&vUpcbqey z$+~TA)%H9ebOYird%tU6g5ayWFG0R;tJYU4b#?bqKsQr|q`2rR?Q-o?5!X$3|BK8w zRbHml)!nZHUG;Sfi*Iu2y_L>7B>bSe?@qpM{Y}^JI*Z_Na9;M$oGxSAPrB`vj_uYu zOy=WvJ%nAfd7`tf1;rJ*6lX_^L(4Sxb3!7{w9l86{7`7R$^Sj(dR{6h4*gBnU%s){ z?cMTAIdj1hLX>YkuH1n UMteVr?sJr^aZgh=4zlk50bb0NEdT%j diff --git a/server/tests/fixtures/hl_dual_validator/golden/tob_end_of_block_quotes.bin b/server/tests/fixtures/hl_dual_validator/golden/tob_end_of_block_quotes.bin index c82b35b74b446c890d122c8aa944932a63e25ae5..758b9a9e074495424bc6ecb3a3fc669e0812ca14 100644 GIT binary patch delta 8 PcmZpWJRq^r!GQ+=4Zi}f delta 72 zcmX>g(IDwy!@$763?hJlk%1Y)0FrzeJYe Date: Sun, 17 May 2026 22:36:59 -0400 Subject: [PATCH 20/65] feat: 5s stuck-stream snapshot backstop on dedicated 250ms ticker --- server/src/listeners/order_book/mod.rs | 70 ++++++++++++++++++++++++ server/src/listeners/order_book/state.rs | 57 ++++++++++++++++--- server/src/multicast/publisher.rs | 8 +-- 3 files changed, 122 insertions(+), 13 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index de21b352..9b9f2b6a 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -66,6 +66,20 @@ const STREAM_REORDER_FALLBACK_WINDOW: Duration = Duration::from_secs(5); #[cfg(test)] const STREAM_REORDER_FALLBACK_WINDOW: Duration = Duration::from_millis(50); +/// Stuck-stream backstop interval. 
Half of +/// `MulticastPublisher::CATCHUP_THRESHOLD_MS` (500ms) so a provisional +/// emission always fires before a dirty BBO ages past the TOB freshness +/// suppression cutoff. Backstop only runs when block finalization is +/// delayed; in healthy streaming, finalization emits first and this never +/// fires. +const STREAM_DIRTY_BACKSTOP_INTERVAL: Duration = Duration::from_millis(250); + +const _BACKSTOP_VS_FRESHNESS_INVARIANT: () = assert!( + STREAM_DIRTY_BACKSTOP_INTERVAL.as_millis() * 2 + <= crate::multicast::publisher::MulticastPublisher::CATCHUP_THRESHOLD_MS as u128, + "STREAM_DIRTY_BACKSTOP_INTERVAL must be <= CATCHUP_THRESHOLD_MS / 2 so stalls cannot age TOB past suppression", +); + /// Wall-clock nanoseconds since the Unix epoch. Mirrors the helper in /// `multicast::dob` (kept module-local there) so `apply_recovery` can stamp /// `InstrumentReset.timestamp_ns` without crossing a privacy boundary. @@ -165,6 +179,14 @@ pub(crate) async fn hl_listen( watcher.watch(&order_diffs_dir, RecursiveMode::Recursive)?; let start = Instant::now() + Duration::from_secs(5); let mut ticker = interval_at(start, Duration::from_secs(60)); + let mut backstop_ticker = tokio::time::interval(STREAM_DIRTY_BACKSTOP_INTERVAL); + backstop_ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + // Compute once: the backstop arm is only useful in stream mode with WS disabled. + // When inactive, the arm is never polled so the ticker never contends the lock. + let backstop_active = ingest_mode == IngestMode::Stream && { + let g = listener.lock().await; + !g.enable_websocket + }; // Track last event time for liveness detection (replaces the sleep(5) branch // that was recreating a Sleep future on every select! 
iteration) let mut last_event_time: Option = None; @@ -252,6 +274,10 @@ pub(crate) async fn hl_listen( Some(Ok(())) => {} } } + _ = backstop_ticker.tick(), if backstop_active => { + let mut guard = listener.lock().await; + guard.try_emit_stuck_stream_backstop(); + } _ = ticker.tick() => { // Liveness check: replaces the per-iteration sleep(5) future. // Detection window is 5–15 s instead of exactly 5 s, which is @@ -1065,6 +1091,38 @@ impl OrderBookListener { self.emit_authoritative_block_snapshot(&block); } + /// Stuck-stream backstop: if streaming + WS-disabled and the book has been + /// dirty for >= STREAM_DIRTY_BACKSTOP_INTERVAL without a finalization + /// emission, emit a provisional snapshot (does NOT clear book_dirty, so the + /// eventual authoritative finalization snapshot still fires). Shared by the + /// hl_listen backstop ticker arm and the test helper so the two cannot drift. + fn try_emit_stuck_stream_backstop(&mut self) -> bool { + let should_emit = self.ingest_mode == IngestMode::Stream + && !self.enable_websocket + && self.order_book_state.as_ref().is_some_and(|s| { + s.book_dirty() + && s.book_dirty_since() + .is_some_and(|since| since.elapsed() >= STREAM_DIRTY_BACKSTOP_INTERVAL) + }); + if should_emit + && let Some((bt, lt)) = + self.order_book_state.as_ref().and_then(OrderBookState::dirty_source_times) + { + self.emit_tob_snapshot( + ingest_source_label(EventSource::OrderDiffs), + bt, + lt, + SnapshotEmission::Provisional, + ); + if let Some(state) = self.order_book_state.as_mut() { + state.bump_dirty_since_after_provisional(); + } + true + } else { + false + } + } + /// Emits the authoritative end-of-block TOB snapshot for `block` if the /// book changed since the last snapshot. 
Shared by streaming finalization /// and the test finalization helper so the two cannot drift (the test @@ -1314,6 +1372,7 @@ impl OrderBookListener { let diff_batch = block.diffs.pop_front().expect("front diff exists"); let diff = diff_batch.events_ref().first().expect("stream diff batch has one event").clone(); let block_time_ms = diff_batch.block_time(); + let local_time_ms = diff_batch.local_time_ms(); if self.is_ready() { if !block.boundary_open && !(diff.coin().is_spot() && self.ignore_spot) { @@ -1327,6 +1386,7 @@ impl OrderBookListener { self.order_book_state.as_mut().ok_or("streaming order book state not ready")?.apply_stream_diff( height, block_time_ms, + local_time_ms, diff, status, )?; @@ -1516,6 +1576,16 @@ impl OrderBookListener { Ok(()) } + /// Test-only equivalent of the production `backstop_ticker.tick()` arm in + /// `hl_listen`. Fires a provisional TOB snapshot if the book is dirty and + /// `book_dirty_since` has aged past `STREAM_DIRTY_BACKSTOP_INTERVAL`. + /// The elapsed-time gate is intentional: Task 5.5b's paused-time test can + /// assert the interval is real by advancing a fake clock before calling. + #[cfg(test)] + pub(crate) fn fire_stream_dirty_backstop_for_test(&mut self) { + self.try_emit_stuck_stream_backstop(); + } + // forcibly grab current snapshot pub(crate) fn compute_snapshot(&mut self) -> Option { self.order_book_state.as_mut().map(|o| o.compute_snapshot()) diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index 6f91ec2f..fe843931 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -26,6 +26,15 @@ pub(super) struct OrderBookState { /// snapshot for the closing block. Soft-tolerance no-op branches MUST NOT /// set this — see `apply_stream_diff`. book_dirty: bool, + /// Source `(block_time_ms, local_time_ms)` recorded the first time the + /// book became dirty in the current dirty epoch. 
Reset to `None` when + /// `book_dirty` is cleared. Lets the stuck-stream backstop attach + /// reliable times without depending on the take-and-clear + /// `OrderBookListener::last_batch_times()`. + dirty_source_times: Option<(u64, u64)>, + /// `Instant` at which the book first became dirty in the current dirty + /// epoch. Used by the backstop to gate emission cadence. + book_dirty_since: Option, /// Present when the DoB emitter is wired in. The tap is NOT propagated to /// the cloned copy used for snapshot validation (validation reads only; no /// events should be emitted from it). @@ -59,6 +68,8 @@ impl Clone for OrderBookState { ignore_spot: self.ignore_spot, enable_websocket: self.enable_websocket, book_dirty: self.book_dirty, + dirty_source_times: self.dirty_source_times, + book_dirty_since: self.book_dirty_since, // The tap is intentionally not cloned: the clone is used only for // snapshot validation and must not emit DoB events. dob_tap: None, @@ -83,6 +94,8 @@ impl OrderBookState { order_book: OrderBooks::from_snapshots(snapshot, ignore_triggers), snapped: false, book_dirty: false, + dirty_source_times: None, + book_dirty_since: None, dob_tap: None, } } @@ -108,9 +121,32 @@ impl OrderBookState { self.book_dirty } + pub(super) const fn book_dirty_since(&self) -> Option { + self.book_dirty_since + } + + pub(super) const fn dirty_source_times(&self) -> Option<(u64, u64)> { + self.dirty_source_times + } + + fn mark_dirty_with_times(&mut self, block_time_ms: u64, local_time_ms: u64) { + if !self.book_dirty { + self.book_dirty_since = Some(std::time::Instant::now()); + self.dirty_source_times = Some((block_time_ms, local_time_ms)); + } + self.book_dirty = true; + } + + pub(super) fn bump_dirty_since_after_provisional(&mut self) { + if self.book_dirty { + self.book_dirty_since = Some(std::time::Instant::now()); + } + } + // Wired into the recovery path in Task 5.4b (apply_recovery marks the book - // dirty so the corrected BBO is emitted). 
Direct mutation sites set - // self.book_dirty inline; this accessor exists for callers outside state.rs. + // dirty so the corrected BBO is re-emitted). All current mutation sites call + // `mark_dirty_with_times`; this fallback exists for callers that have only a + // block_time and no separate local_time. #[allow(dead_code)] pub(super) const fn mark_book_dirty(&mut self) { self.book_dirty = true; @@ -118,6 +154,8 @@ impl OrderBookState { pub(super) const fn clear_book_dirty(&mut self) { self.book_dirty = false; + self.book_dirty_since = None; + self.dirty_source_times = None; } // forcibly take snapshot - (time, height, snapshot) @@ -233,7 +271,9 @@ impl OrderBookState { self.order_book.add_resting_order_from_diff(inner_order); if self.order_book.contains_order(&oid, &coin) { // Order actually rested — book mutated. - self.book_dirty = true; + // Block mode has no separate local_time; pass block_time for both. The + // dirty_source_times field is only consumed by the stream-mode backstop. + self.mark_dirty_with_times(time, time); } else { // Soft-tolerance no-op: order did not rest, so the book // was NOT mutated — do NOT set book_dirty here. @@ -259,7 +299,7 @@ impl OrderBookState { match self.order_book.modify_sz(oid.clone(), coin.clone(), new_sz) { Some((old_sz, px)) => { // Order size modified — book mutated. - self.book_dirty = true; + self.mark_dirty_with_times(time, time); if let Some(tap) = self.dob_tap.as_mut() { // exec_quantity = reduction in resting size let exec_quantity = @@ -282,7 +322,7 @@ impl OrderBookState { InnerOrderDiff::Remove => { if self.order_book.cancel_order(oid.clone(), coin.clone()) { // Order cancelled — book mutated. 
- self.book_dirty = true; + self.mark_dirty_with_times(time, time); if let Some(tap) = self.dob_tap.as_mut() { tap.emit_order_cancel(&coin, oid, time_ns); } @@ -317,6 +357,7 @@ impl OrderBookState { &mut self, block_number: u64, block_time_ms: u64, + local_time_ms: u64, diff: NodeDataOrderDiff, order_status: Option, ) -> Result { @@ -355,7 +396,7 @@ impl OrderBookState { self.order_book.add_resting_order_from_diff(inner_order); if self.order_book.contains_order(&oid, &coin) { // Order actually rested — book mutated. - self.book_dirty = true; + self.mark_dirty_with_times(block_time_ms, local_time_ms); } else { // Soft-tolerance no-op: order did not rest, so the book was // NOT mutated — do NOT set book_dirty here. @@ -372,7 +413,7 @@ impl OrderBookState { { Some((old_sz, px)) => { // Order size modified — book mutated. - self.book_dirty = true; + self.mark_dirty_with_times(block_time_ms, local_time_ms); if let Some(tap) = self.dob_tap.as_mut() { let exec_quantity = crate::order_book::Sz::new(old_sz.value().saturating_sub(new_sz.value())); tap.emit_order_execute(&coin, oid, px, exec_quantity, time_ns); @@ -389,7 +430,7 @@ impl OrderBookState { InnerOrderDiff::Remove => { if self.order_book.cancel_order(oid.clone(), coin.clone()) { // Order cancelled — book mutated. 
- self.book_dirty = true; + self.mark_dirty_with_times(block_time_ms, local_time_ms); if let Some(tap) = self.dob_tap.as_mut() { tap.emit_order_cancel(&coin, oid, time_ns); } diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index b5b19c7f..92644988 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -505,11 +505,9 @@ impl MulticastPublisher { self.seq.fetch_add(1, Ordering::Relaxed) } - #[cfg(not(test))] - const CATCHUP_THRESHOLD_MS: u64 = 500; - - #[cfg(test)] - const CATCHUP_THRESHOLD_MS: u64 = 500; + // STREAM_DIRTY_BACKSTOP_INTERVAL in listeners/order_book/mod.rs must remain + // <= CATCHUP_THRESHOLD_MS / 2; a compile-time assert there enforces this. + pub(crate) const CATCHUP_THRESHOLD_MS: u64 = 500; #[cfg(test)] fn should_publish_lag(_lag_ms: u64) -> bool { From 161410dbeb95a3078403737bc9f33b04e116d9d4 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Sun, 17 May 2026 22:54:36 -0400 Subject: [PATCH 21/65] fix: emit authoritative tob snapshot after streaming recovery --- server/src/listeners/order_book/mod.rs | 118 +++++++++++++++-------- server/src/listeners/order_book/state.rs | 11 +-- 2 files changed, 79 insertions(+), 50 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 9b9f2b6a..0321876c 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -1427,53 +1427,91 @@ impl OrderBookListener { /// Instead of exiting the process when the internal book diverges from a /// fresh snapshot, we re-initialize just the affected coins in place. /// The rest of the channel continues untouched — other instruments are - /// unaffected, and the next tick's `InternalMessage::Snapshot` will - /// carry the corrected BBO for the repaired coins through the normal - /// publish path. + /// unaffected. 
When at least one coin is repaired the book is marked dirty + /// and an authoritative TOB snapshot is emitted immediately so multicast + /// subscribers see the corrected BBO without waiting for an unrelated future + /// diff or the next finalization (L2-5 + recovery interaction, Task 5.4b). fn apply_recovery(&mut self, report: &utils::ValidationReport, fresh_snapshot: Snapshots) { - // Disjoint borrows: `state` is the &mut on the book, `taps` is the - // &shared on the DoB channels. Splitting at the field level lets the - // emit helper borrow `taps` while `state` mutation continues. - let taps = self.dob_replay_taps.as_ref(); - let Some(state) = self.order_book_state.as_mut() else { - return; - }; + // Capture source timestamps BEFORE taking the &mut state borrow so the + // borrow checker allows us to call &mut self methods later. + let recovery_block_time_ms = self.last_batch_block_time_ms.unwrap_or(0); + let recovery_local_time_ms = self.last_batch_local_time_ms.unwrap_or(0); - let mut fresh_map = fresh_snapshot.value(); - - // Repair coins that diverged: replace the internal book with the fresh one. - // Emit InstrumentReset BEFORE the mutation so subscribers discard - // delta state for the affected instrument before any new deltas - // derived from the replaced book begin flowing. - for (coin, msg) in &report.diverged { - log::warn!("recovery: re-initializing {} (divergence: {})", coin.value(), msg); - Self::emit_dob_instrument_reset(taps, coin); - if let Some(fresh_book) = fresh_map.remove(coin) { - state.replace_coin_from_snapshot(coin.clone(), fresh_book, true); - } else { - log::warn!( - "recovery: diverged coin {} missing from fresh snapshot — dropping from state", - coin.value() - ); + let mut mutated = false; + + { + // Disjoint borrows: `state` is the &mut on the book, `taps` is the + // &shared on the DoB channels. Splitting at the field level lets the + // emit helper borrow `taps` while `state` mutation continues. 
+ let taps = self.dob_replay_taps.as_ref(); + let Some(state) = self.order_book_state.as_mut() else { + return; + }; + + let mut fresh_map = fresh_snapshot.value(); + + // Repair coins that diverged: replace the internal book with the fresh one. + // Emit InstrumentReset BEFORE the mutation so subscribers discard + // delta state for the affected instrument before any new deltas + // derived from the replaced book begin flowing. + for (coin, msg) in &report.diverged { + log::warn!("recovery: re-initializing {} (divergence: {})", coin.value(), msg); + Self::emit_dob_instrument_reset(taps, coin); + if let Some(fresh_book) = fresh_map.remove(coin) { + state.replace_coin_from_snapshot(coin.clone(), fresh_book, true); + } else { + log::warn!( + "recovery: diverged coin {} missing from fresh snapshot — dropping from state", + coin.value() + ); + state.remove_coin(coin); + } + mutated = true; + } + + // Drop coins that we have but the fresh snapshot doesn't (stale data). + for coin in &report.missing_in_fresh { + log::warn!("recovery: dropping stale coin {} not in fresh snapshot", coin.value()); + Self::emit_dob_instrument_reset(taps, coin); state.remove_coin(coin); + mutated = true; } - } - // Drop coins that we have but the fresh snapshot doesn't (stale data). - for coin in &report.missing_in_fresh { - log::warn!("recovery: dropping stale coin {} not in fresh snapshot", coin.value()); - Self::emit_dob_instrument_reset(taps, coin); - state.remove_coin(coin); - } + // Add coins that are in the fresh snapshot but not in our state (new listings + // that appeared between our bootstrap and now). 
+ for coin in &report.extra_in_fresh { + if let Some(fresh_book) = fresh_map.remove(coin) { + log::warn!("recovery: adding new coin {} from fresh snapshot", coin.value()); + Self::emit_dob_instrument_reset(taps, coin); + state.replace_coin_from_snapshot(coin.clone(), fresh_book, true); + mutated = true; + } + } - // Add coins that are in the fresh snapshot but not in our state (new listings - // that appeared between our bootstrap and now). - for coin in &report.extra_in_fresh { - if let Some(fresh_book) = fresh_map.remove(coin) { - log::warn!("recovery: adding new coin {} from fresh snapshot", coin.value()); - Self::emit_dob_instrument_reset(taps, coin); - state.replace_coin_from_snapshot(coin.clone(), fresh_book, true); + if mutated { + // Mark dirty with timestamps so both the immediate emit below + // and the 250ms backstop (if the immediate emit is suppressed + // by the snapped gate) carry consistent source times. + state.mark_dirty_with_times(recovery_block_time_ms, recovery_local_time_ms); } + } // `state` borrow ends here; `&mut self` is free again + + // Recovery mutated the book outside apply_stream_diff/apply_updates. + // Emit an authoritative TOB snapshot now so multicast subscribers see + // the corrected BBO immediately instead of waiting for an unrelated + // future diff or the next finalization (L2-5 + recovery interaction). 
+ if self.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty) { + let (bt, lt) = self + .order_book_state + .as_ref() + .and_then(OrderBookState::dirty_source_times) + .unwrap_or((0, 0)); + self.emit_tob_snapshot( + ingest_source_label(EventSource::OrderDiffs), + bt, + lt, + SnapshotEmission::Authoritative, + ); } } diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index fe843931..0579d004 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -129,7 +129,7 @@ impl OrderBookState { self.dirty_source_times } - fn mark_dirty_with_times(&mut self, block_time_ms: u64, local_time_ms: u64) { + pub(super) fn mark_dirty_with_times(&mut self, block_time_ms: u64, local_time_ms: u64) { if !self.book_dirty { self.book_dirty_since = Some(std::time::Instant::now()); self.dirty_source_times = Some((block_time_ms, local_time_ms)); @@ -143,15 +143,6 @@ impl OrderBookState { } } - // Wired into the recovery path in Task 5.4b (apply_recovery marks the book - // dirty so the corrected BBO is re-emitted). All current mutation sites call - // `mark_dirty_with_times`; this fallback exists for callers that have only a - // block_time and no separate local_time. 
- #[allow(dead_code)] - pub(super) const fn mark_book_dirty(&mut self) { - self.book_dirty = true; - } - pub(super) const fn clear_book_dirty(&mut self) { self.book_dirty = false; self.book_dirty_since = None; From 8b49e3e69cfada736bf3f33c6b171c2c0712d564 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Sun, 17 May 2026 23:01:21 -0400 Subject: [PATCH 22/65] test: add streaming-mode listener test constructor --- server/src/listeners/order_book/mod.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 0321876c..2e5f7320 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -1569,6 +1569,22 @@ impl OrderBookListener { listener } + /// Test-only: like `for_test_with_snapshot` but in streaming mode, with an + /// attached internal-message broadcast so tests can capture emitted + /// `InternalMessage::Snapshot` messages. WS is disabled (multicast-only, + /// the L2-5 default), so streaming snapshots are finalization-driven. + #[cfg(test)] + pub(crate) fn for_test_streaming_with_snapshot( + snapshot: Snapshots, + height: u64, + ) -> (Self, tokio::sync::broadcast::Receiver>) { + let (tx, rx) = tokio::sync::broadcast::channel::>(256); + let mut listener = Self::new_with_ingest_mode(Some(tx), false, IngestMode::Stream); + listener.complete_stream_startup_sync_for_test(); + listener.init_from_snapshot(snapshot, height); + (listener, rx) + } + /// Test-only entry point: feeds a synthetic `(order_statuses, order_diffs)` /// pair through the same `apply_updates` path the file-watcher uses. 
The /// DoB tap (if attached) fires on every successful apply, and the From 4eca5554fda1b9ab100c165158e4d3a89ef874d7 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Sun, 17 May 2026 23:13:20 -0400 Subject: [PATCH 23/65] test: l2-5 streaming finalization behavior suite 8 tests covering the l2-5 behavioral contracts: one authoritative snapshot per finalized bbo-changing block; no snapshot for clean blocks; finalization in height order; backstop only after 250ms dirty interval; backstop provisional does not suppress later authoritative finalization; tolerated no-op update/remove stays clean; recovery emits authoritative snapshot immediately. --- server/src/listeners/order_book/mod.rs | 31 +- .../order_book/stream_finalization_tests.rs | 468 ++++++++++++++++++ 2 files changed, 495 insertions(+), 4 deletions(-) create mode 100644 server/src/listeners/order_book/stream_finalization_tests.rs diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 2e5f7320..cf99bb90 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -47,6 +47,8 @@ pub(crate) mod dob_tap; pub(crate) mod latency; #[cfg(test)] mod parity_tests; +#[cfg(test)] +mod stream_finalization_tests; mod progress; mod state; mod utils; @@ -1097,12 +1099,21 @@ impl OrderBookListener { /// eventual authoritative finalization snapshot still fires). Shared by the /// hl_listen backstop ticker arm and the test helper so the two cannot drift. fn try_emit_stuck_stream_backstop(&mut self) -> bool { + self.try_emit_stuck_stream_backstop_inner(true) + } + + /// Inner implementation. `enforce_interval = true` in all production paths + /// (ticker arm, existing test helper). `enforce_interval = false` is exposed + /// only via the `#[cfg(test)]` helper below, allowing tests to bypass the + /// elapsed-time gate without affecting any other backstop condition. 
+ fn try_emit_stuck_stream_backstop_inner(&mut self, enforce_interval: bool) -> bool { let should_emit = self.ingest_mode == IngestMode::Stream && !self.enable_websocket && self.order_book_state.as_ref().is_some_and(|s| { s.book_dirty() - && s.book_dirty_since() - .is_some_and(|since| since.elapsed() >= STREAM_DIRTY_BACKSTOP_INTERVAL) + && (!enforce_interval + || s.book_dirty_since() + .is_some_and(|since| since.elapsed() >= STREAM_DIRTY_BACKSTOP_INTERVAL)) }); if should_emit && let Some((bt, lt)) = @@ -1633,13 +1644,25 @@ impl OrderBookListener { /// Test-only equivalent of the production `backstop_ticker.tick()` arm in /// `hl_listen`. Fires a provisional TOB snapshot if the book is dirty and /// `book_dirty_since` has aged past `STREAM_DIRTY_BACKSTOP_INTERVAL`. - /// The elapsed-time gate is intentional: Task 5.5b's paused-time test can - /// assert the interval is real by advancing a fake clock before calling. + /// The elapsed-time gate is ENFORCED (enforce_interval=true), so calling + /// this immediately after a dirty block will NOT emit — exercising the real + /// gate. #[cfg(test)] pub(crate) fn fire_stream_dirty_backstop_for_test(&mut self) { self.try_emit_stuck_stream_backstop(); } + /// Test-only backstop variant that bypasses ONLY the elapsed-time gate + /// (`book_dirty_since().elapsed() >= STREAM_DIRTY_BACKSTOP_INTERVAL`). + /// All other conditions are real: streaming mode, WS disabled, book_dirty, + /// dirty_source_times present, emits `SnapshotEmission::Provisional`, calls + /// `bump_dirty_since_after_provisional`. Use this to exercise the provisional + /// emit path deterministically without a real sleep. 
+ #[cfg(test)] + pub(crate) fn fire_stream_dirty_backstop_ignoring_interval_for_test(&mut self) { + self.try_emit_stuck_stream_backstop_inner(false); + } + // forcibly grab current snapshot pub(crate) fn compute_snapshot(&mut self) -> Option { self.order_book_state.as_mut().map(|o| o.compute_snapshot()) diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs new file mode 100644 index 00000000..ab9f5a69 --- /dev/null +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -0,0 +1,468 @@ +//! L2-5 tests: streaming snapshot emission tied to block finalization. +//! +//! See docs/superpowers/specs/2026-05-15-streaming-cpu-reduction-design.md +//! "L2-5 — Required tests" for the contract these tests assert. +//! +//! These tests drive a real `OrderBookListener` in streaming mode through the +//! same `receive_batch` path production uses, asserting the five behavioral +//! contracts for L2-5: +//! +//! (1) One authoritative snapshot per finalized BBO-changing block. +//! (2) Clean (no-mutation) blocks emit nothing. +//! (3) Finalized blocks emit in height order. +//! (4) Backstop fires only after the 250ms dirty interval, emits Provisional, +//! does NOT suppress the later authoritative finalization snapshot. +//! (5) Recovery emits an authoritative snapshot with the repaired BBO +//! immediately. 
+ +#![allow(clippy::unwrap_used, clippy::expect_used)] +#![allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)] + +use std::collections::HashMap; +use std::sync::Arc; + +use alloy::primitives::Address; +use chrono::NaiveDateTime; +use tokio::sync::broadcast::Receiver; + +use crate::listeners::order_book::utils::EventBatch; +use crate::listeners::order_book::{InternalMessage, L2SnapshotParams, OrderBookListener}; +use crate::order_book::multi_book::Snapshots; +use crate::order_book::{Coin, OrderBook, Px, Side, Snapshot, Sz}; +use crate::types::inner::InnerL4Order; +use crate::types::node_data::{Batch, NodeDataOrderDiff, NodeDataOrderStatus}; +use crate::types::{L4Order, OrderDiff}; + +const TEST_COIN: &str = "BTC"; + +fn dt_from_ms(block_time_ms: u64) -> NaiveDateTime { + let secs = (block_time_ms / 1_000) as i64; + let nsecs = ((block_time_ms % 1_000) * 1_000_000) as u32; + chrono::DateTime::::from_timestamp(secs, nsecs).expect("valid timestamp").naive_utc() +} + +/// Pre-seed with one far-out ask so the listener accepts the snapshot and the +/// book universe is non-empty. The ask sits at 99999 and does not interfere +/// with any test BBO. 
+fn seed_snapshot() -> Snapshots { + let coin = Coin::new(TEST_COIN); + let mut book: OrderBook = OrderBook::new(); + book.add_order(InnerL4Order { + user: Address::new([0; 20]), + coin: coin.clone(), + side: Side::Ask, + limit_px: Px::parse_from_str("99999").expect("valid px"), + sz: Sz::parse_from_str("1").expect("valid sz"), + oid: 9_000, + timestamp: 0, + trigger_condition: String::new(), + is_trigger: false, + trigger_px: String::new(), + is_position_tpsl: false, + reduce_only: false, + order_type: String::new(), + tif: None, + cloid: None, + }); + let mut map: HashMap> = HashMap::new(); + map.insert(coin, book.to_snapshot()); + Snapshots::new(map) +} + +fn add_event( + block_time_ms: u64, + side: Side, + oid: u64, + px: &str, + sz: &str, +) -> (NodeDataOrderStatus, NodeDataOrderDiff) { + let user = Address::new([0; 20]); + let l4 = L4Order { + user: Some(user), + coin: TEST_COIN.to_string(), + side, + limit_px: px.to_string(), + sz: sz.to_string(), + oid, + timestamp: block_time_ms, + trigger_condition: String::new(), + is_trigger: false, + trigger_px: String::new(), + is_position_tpsl: false, + reduce_only: false, + order_type: String::new(), + tif: None, + cloid: None, + }; + let status = NodeDataOrderStatus { time: dt_from_ms(block_time_ms), user, status: "open".to_string(), order: l4 }; + let diff = NodeDataOrderDiff::new_for_test( + user, + oid, + px.to_string(), + TEST_COIN.to_string(), + OrderDiff::New { sz: sz.to_string() }, + ); + (status, diff) +} + +fn update_diff(oid: u64, px: &str, orig_sz: &str, new_sz: &str) -> NodeDataOrderDiff { + NodeDataOrderDiff::new_for_test( + Address::new([0; 20]), + oid, + px.to_string(), + TEST_COIN.to_string(), + OrderDiff::Update { orig_sz: orig_sz.to_string(), new_sz: new_sz.to_string() }, + ) +} + +/// Feed one streaming block: send the statuses batch then the diffs batch. 
+/// In streaming mode, `receive_batch` calls `drain_streaming_blocks` after +/// each batch, so feeding the diffs batch may finalize an earlier block. +fn feed_block( + listener: &mut OrderBookListener, + height: u64, + block_time_ms: u64, + statuses: Vec, + diffs: Vec, +) { + listener + .receive_batch(EventBatch::Orders(Batch::new_for_test(height, block_time_ms, statuses))) + .expect("statuses batch applies"); + listener + .receive_batch(EventBatch::BookDiffs(Batch::new_for_test(height, block_time_ms, diffs))) + .expect("diffs batch applies"); +} + +/// Drains the snapshot receiver and returns only `InternalMessage::Snapshot` +/// messages in receive order. Non-snapshot messages are skipped. +fn drain_snapshots(rx: &mut Receiver>) -> Vec> { + let mut snapshots = Vec::new(); + while let Ok(msg) = rx.try_recv() { + if matches!(msg.as_ref(), InternalMessage::Snapshot { .. }) { + snapshots.push(msg); + } + } + snapshots +} + +/// Extracts the best bid `Px::value()` from an `InternalMessage::Snapshot`. +/// +/// Uses the unbucketed `L2SnapshotParams::new(None, None)` variant and reads +/// the first bid level's `InnerLevel.px` directly (no string round-trip). +fn snapshot_best_bid(msg: &InternalMessage) -> Option { + let InternalMessage::Snapshot { l2_snapshots, .. } = msg else { return None }; + let coin = Coin::new(TEST_COIN); + let params_map = l2_snapshots.as_ref().get(&coin)?; + let unbucketed = params_map.get(&L2SnapshotParams::new(None, None))?; + // Index 0 = bids; the snapshot is already truncated to 1 level in + // WS-disabled mode, so `first()` gives the best bid. + unbucketed.as_ref()[0].first().map(|level| level.px.value()) +} + +fn snapshot_height(msg: &InternalMessage) -> u64 { + let InternalMessage::Snapshot { height, .. 
} = msg else { panic!("not a Snapshot") }; + *height +} + +// --------------------------------------------------------------------------- +// Test 1: Two diffs in the same block — finalization must emit exactly one +// snapshot carrying the FINAL BBO. +// --------------------------------------------------------------------------- + +/// L2-5 contract (1): multi-diff block emits one authoritative snapshot at +/// finalization, carrying the final BBO after all diffs in that block. +#[tokio::test(flavor = "current_thread")] +async fn two_diffs_in_block_emit_final_bbo_once() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Block 2: add bid @ 100, then add a higher bid @ 105 (moves BBO). + let (status_a, diff_a) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + let (status_b, diff_b) = add_event(1_700_000_002_000, Side::Bid, 102, "105", "3"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![status_a, status_b], vec![diff_a, diff_b]); + + // Block 3: trigger the watermark so block 2 finalizes. + let (status_c, diff_c) = add_event(1_700_000_003_000, Side::Bid, 103, "90", "1"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![status_c], vec![diff_c]); + + let snapshots = drain_snapshots(&mut rx); + let block_2_snapshots: Vec<_> = snapshots.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert_eq!(block_2_snapshots.len(), 1, "exactly one snapshot for height 2; got {}", block_2_snapshots.len()); + + let final_best_bid = snapshot_best_bid(block_2_snapshots[0]).expect("BBO exists"); + let expected = Px::parse_from_str("105").unwrap().value(); + assert_eq!(final_best_bid, expected, "snapshot for height 2 must carry the final BBO (105), got {final_best_bid}"); +} + +// --------------------------------------------------------------------------- +// Test 2: No spurious emission for a block with no mutations. 
+// --------------------------------------------------------------------------- + +/// L2-5 contract (2): a block with no diffs that reach the book must not +/// produce a snapshot for that height. +#[tokio::test(flavor = "current_thread")] +async fn block_without_mutations_emits_no_snapshot() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Block 2: empty statuses and diffs. No mutations → book_dirty stays false. + feed_block(&mut listener, 2, 1_700_000_002_000, vec![], vec![]); + + // Block 3: again empty. Finalizes block 2. + feed_block(&mut listener, 3, 1_700_000_003_000, vec![], vec![]); + + let snapshots = drain_snapshots(&mut rx); + let height_2_snapshots: Vec<_> = snapshots.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert!( + height_2_snapshots.is_empty(), + "no snapshot should be emitted for a clean block; got {}", + height_2_snapshots.len() + ); +} + +// --------------------------------------------------------------------------- +// Test 3: Finalized blocks emit snapshots in height order, one per block. +// --------------------------------------------------------------------------- + +/// L2-5 contract (3): BBO-changing diffs across blocks N, N+1, N+2 produce +/// snapshots in monotonic height order, one snapshot per block. 
+#[tokio::test(flavor = "current_thread")] +async fn finalized_blocks_emit_snapshots_in_order() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + + let (s3, d3) = add_event(1_700_000_003_000, Side::Bid, 102, "110", "5"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![s3], vec![d3]); + + let (s4, d4) = add_event(1_700_000_004_000, Side::Bid, 103, "120", "5"); + feed_block(&mut listener, 4, 1_700_000_004_000, vec![s4], vec![d4]); + + // Block 5 triggers finalization of block 4 (and cascades to 2 and 3). + let (s5, d5) = add_event(1_700_000_005_000, Side::Bid, 104, "121", "1"); + feed_block(&mut listener, 5, 1_700_000_005_000, vec![s5], vec![d5]); + + let snapshots = drain_snapshots(&mut rx); + let heights: Vec = snapshots.iter().map(|m| snapshot_height(m)).collect(); + + assert!(heights.windows(2).all(|w| w[0] < w[1]), "heights must be strictly increasing; got {heights:?}"); + assert!(heights.contains(&2), "expected snapshot for height 2"); + assert!(heights.contains(&3), "expected snapshot for height 3"); + assert!(heights.contains(&4), "expected snapshot for height 4"); + + let count_each = |h: u64| heights.iter().filter(|&&x| x == h).count(); + assert_eq!(count_each(2), 1, "exactly one snapshot for height 2"); + assert_eq!(count_each(3), 1, "exactly one snapshot for height 3"); + assert_eq!(count_each(4), 1, "exactly one snapshot for height 4"); +} + +// --------------------------------------------------------------------------- +// Tests 4 + C1b (merged): Stuck-stream backstop — interval gate + emit path. +// +// Two-call design (Option C / Task 5.5b): +// 1. `fire_stream_dirty_backstop_for_test()` — enforces the real 250ms gate. +// Called immediately after the dirty block (~0ms elapsed), it must NOT +// emit. 
This assertion is load-bearing: it proves the interval gate is +// genuinely active on the production path. +// 2. `fire_stream_dirty_backstop_ignoring_interval_for_test()` — bypasses +// ONLY the elapsed-time comparison; all other conditions (streaming, +// !ws, book_dirty, dirty_source_times) remain real. This exercises the +// provisional emit path deterministically without any real sleep. +// --------------------------------------------------------------------------- + +/// L2-5 contract (4): when no later block arrives, the backstop hook +/// (which production fires on a 5s ticker) emits one Provisional snapshot +/// for the dirty block AFTER the 250ms interval elapses — but NOT before. +/// +/// Also covers C1b (deferred Task 5.4 Step 7): `fire_stream_dirty_backstop_for_test` +/// gates on the real 250ms elapsed-time interval and does not emit early. +#[tokio::test(flavor = "current_thread")] +async fn stuck_stream_backstop_emits_dirty_snapshot() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + + // Real gate: firing immediately (~0ms elapsed) must NOT emit — proves the + // interval check is active on the production-path helper. + listener.fire_stream_dirty_backstop_for_test(); + let before_interval = drain_snapshots(&mut rx); + assert!( + before_interval.is_empty(), + "backstop must NOT fire before the 250ms dirty interval; got {} snapshot(s)", + before_interval.len(), + ); + + // Bypass only the elapsed check (all other conditions remain real); the + // backstop must now emit exactly one Provisional snapshot. 
+ listener.fire_stream_dirty_backstop_ignoring_interval_for_test(); + let after_interval = drain_snapshots(&mut rx); + assert_eq!(after_interval.len(), 1, "backstop must emit exactly one snapshot after interval; got {}", after_interval.len()); + assert_eq!(snapshot_height(&after_interval[0]), 2, "backstop snapshot must carry the last applied height (2)"); +} + +// --------------------------------------------------------------------------- +// Test 5 (Codex finding #2): late diff after backstop still gets authoritative +// final — the provisional emit must NOT clear the dirty flag. +// --------------------------------------------------------------------------- + +/// L2-5 contract (4) + Codex finding #2: a late same-height diff that arrives +/// after the provisional backstop must still produce an authoritative +/// finalization snapshot. The backstop must not have cleared book_dirty. +#[tokio::test(flavor = "current_thread")] +async fn late_diff_after_backstop_still_emits_authoritative_final() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Block 2: one BBO-changing diff at 100. + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + + // Trigger the provisional emit via the interval-bypassing helper (no sleep). + listener.fire_stream_dirty_backstop_ignoring_interval_for_test(); + let provisional = drain_snapshots(&mut rx); + assert_eq!(provisional.len(), 1, "backstop should emit one provisional snapshot; got {}", provisional.len()); + assert_eq!(snapshot_height(&provisional[0]), 2, "provisional snapshot must be for height 2"); + + // Late same-height-2 diff moves BBO to 110. + let (s2b, d2b) = add_event(1_700_000_002_000, Side::Bid, 102, "110", "3"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2b], vec![d2b]); + + // Block 3 triggers finalization of block 2. 
+ let (s3, d3) = add_event(1_700_000_003_000, Side::Bid, 103, "90", "1"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![s3], vec![d3]); + + let after_final = drain_snapshots(&mut rx); + let height_2_finals: Vec<_> = after_final.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert!( + !height_2_finals.is_empty(), + "finalization must still emit an authoritative snapshot for height 2 even after a backstop" + ); + // The final snapshot must carry the LATER BBO (110, not 100). + let final_bbo = snapshot_best_bid(height_2_finals.last().unwrap()).expect("BBO exists"); + let expected = Px::parse_from_str("110").unwrap().value(); + assert_eq!(final_bbo, expected, "authoritative final must carry the latest BBO (110); got {final_bbo}"); +} + +// --------------------------------------------------------------------------- +// Tests 6a/6b (Codex finding #3): tolerated no-op Update/Remove branches must +// NOT mark the book dirty — so no spurious snapshot is emitted. +// --------------------------------------------------------------------------- + +/// L2-5 contract (2) + Codex finding #3: an Update diff for an oid that does +/// not exist in the book is tolerated silently but must NOT dirty the book. +#[tokio::test(flavor = "current_thread")] +async fn missing_order_update_does_not_emit_snapshot() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Block 2: Update for a non-existent oid. + let missing_oid = 999_999u64; + let bad_update = update_diff(missing_oid, "100", "5", "3"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![], vec![bad_update]); + + // Block 3 triggers any pending finalization of block 2. 
+ feed_block(&mut listener, 3, 1_700_000_003_000, vec![], vec![]); + + let snapshots = drain_snapshots(&mut rx); + let height_2: Vec<_> = snapshots.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert!( + height_2.is_empty(), + "no snapshot should be emitted for a block whose only diff was a no-op missing-order Update; got {}", + height_2.len(), + ); +} + +/// L2-5 contract (2) + Codex finding #3: a Remove diff for an oid that does +/// not exist in the book is tolerated silently but must NOT dirty the book. +#[tokio::test(flavor = "current_thread")] +async fn missing_order_remove_does_not_emit_snapshot() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + let missing_oid = 999_999u64; + let bad_remove = NodeDataOrderDiff::new_for_test( + Address::new([0; 20]), + missing_oid, + "100".to_string(), + TEST_COIN.to_string(), + OrderDiff::Remove, + ); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![], vec![bad_remove]); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![], vec![]); + + let snapshots = drain_snapshots(&mut rx); + let height_2: Vec<_> = snapshots.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert!( + height_2.is_empty(), + "no snapshot should be emitted for a block whose only diff was a no-op missing-order Remove; got {}", + height_2.len(), + ); +} + +// --------------------------------------------------------------------------- +// Test 7 (Codex finding #1): recovery emits an authoritative snapshot +// immediately so multicast TOB does not stay stale after a divergence repair. +// --------------------------------------------------------------------------- + +/// L2-5 contract (5) + Codex finding #1: `apply_recovery` must emit an +/// authoritative snapshot immediately, visible on the internal-message +/// channel, without requiring a subsequent diff or block finalization. 
+/// +/// C2 (plan correction): Task 5.4b folded the authoritative emit INTO +/// `apply_recovery` directly. No wrapper is needed; we call the private +/// method directly (child modules can access parent-module private items). +#[tokio::test(flavor = "current_thread")] +async fn recovery_emits_authoritative_snapshot_without_later_diff() { + use super::utils::ValidationReport; + + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Apply a couple of blocks so last_batch_*_time_ms are populated. + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + let (s3, d3) = add_event(1_700_000_003_000, Side::Bid, 102, "101", "1"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![s3], vec![d3]); + // Finalize remaining buffered blocks so snapped is cleared for the + // recovery path (replace_coin_from_snapshot clears snapped). + let _ignored = listener.finalize_streaming_for_test(); + let _drain_initial = drain_snapshots(&mut rx); + + // Build a fresh Snapshots with a DIFFERENT best bid (200) to simulate + // a repaired divergence. + let coin = Coin::new(TEST_COIN); + let mut repaired: OrderBook = OrderBook::new(); + repaired.add_order(InnerL4Order { + user: Address::new([0; 20]), + coin: coin.clone(), + side: Side::Bid, + limit_px: Px::parse_from_str("200").expect("valid px"), + sz: Sz::parse_from_str("9").expect("valid sz"), + oid: 7_777, + timestamp: 0, + trigger_condition: String::new(), + is_trigger: false, + trigger_px: String::new(), + is_position_tpsl: false, + reduce_only: false, + order_type: String::new(), + tif: None, + cloid: None, + }); + let mut fresh_map: HashMap> = HashMap::new(); + fresh_map.insert(coin.clone(), repaired.to_snapshot()); + let fresh = Snapshots::new(fresh_map); + + // Mark BTC as diverged and invoke apply_recovery directly. 
+ // (Task 5.4b folded the authoritative emit into apply_recovery, so no + // wrapper is required — the emit fires unconditionally when mutated.) + let report = ValidationReport { + diverged: vec![(coin, "synthetic divergence for test".to_string())], + missing_in_fresh: vec![], + extra_in_fresh: vec![], + }; + + listener.apply_recovery(&report, fresh); + + let after = drain_snapshots(&mut rx); + assert_eq!(after.len(), 1, "recovery must emit exactly one authoritative snapshot; got {}", after.len()); + let repaired_bbo = snapshot_best_bid(&after[0]).expect("BBO exists after recovery"); + let expected = Px::parse_from_str("200").unwrap().value(); + assert_eq!(repaired_bbo, expected, "snapshot after recovery must carry the repaired BBO (200); got {repaired_bbo}"); +} From e4e469647763b73f527863ff7e009810d740f0d4 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Sun, 17 May 2026 23:36:51 -0400 Subject: [PATCH 24/65] refactor: expose dual-validator fixture snapshot capture for reuse --- .../order_book/block_mode_multicast_e2e.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/server/src/listeners/order_book/block_mode_multicast_e2e.rs b/server/src/listeners/order_book/block_mode_multicast_e2e.rs index a3f6b48f..0bfb6553 100644 --- a/server/src/listeners/order_book/block_mode_multicast_e2e.rs +++ b/server/src/listeners/order_book/block_mode_multicast_e2e.rs @@ -1419,6 +1419,10 @@ struct CapturedReplay { tob_refdata: Vec>, dob_mktdata: Vec>, dob_refdata: Vec>, + /// Ordered sequence of `InternalMessage::Snapshot` messages emitted during + /// replay, in emission order. Used by parity tests that assert the full + /// per-height TOB quote sequence is identical between block and stream mode. 
+ snapshots: Vec>, } async fn capture_fixture_replay(root: &Path, ingest_mode: IngestMode, replay_coin: &str) -> CapturedReplay { @@ -1429,6 +1433,8 @@ async fn capture_fixture_replay(root: &Path, ingest_mode: IngestMode, replay_coi let registry = new_shared_registry(test_registry_state_for(replay_coin)); let (internal_tx, _) = broadcast_channel::>(8192); + // Subscribe before any replay so we capture every emitted Snapshot in order. + let mut snapshot_rx = internal_tx.subscribe(); let mut listener = OrderBookListener::new_with_ingest_mode(Some(internal_tx.clone()), true, ingest_mode); listener.init_from_snapshot(snapshot, snapshot_height); @@ -1488,6 +1494,14 @@ async fn capture_fixture_replay(root: &Path, ingest_mode: IngestMode, replay_coi let l2_snapshot = stable_l2_debug(&guard.l2_snapshots_for_test().unwrap().1); drop(guard); + // Drain every InternalMessage::Snapshot that was emitted during the replay. + let mut captured_snapshots: Vec> = Vec::new(); + while let Ok(msg) = snapshot_rx.try_recv() { + if matches!(msg.as_ref(), InternalMessage::Snapshot { .. 
}) { + captured_snapshots.push(msg); + } + } + CapturedReplay { height, l4_snapshot, @@ -1506,6 +1520,7 @@ async fn capture_fixture_replay(root: &Path, ingest_mode: IngestMode, replay_coi normalize_dob_refdata(dob_refdata.finish()), &[dob_const::MSG_TYPE_INSTRUMENT_DEF, dob_const::MSG_TYPE_MANIFEST_SUMMARY], ), + snapshots: captured_snapshots, } } From 9075ef3a6c1db6da13214739894e4737e74ba96e Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Sun, 17 May 2026 23:39:24 -0400 Subject: [PATCH 25/65] test: assert stream quote sequence is an ordered block subsequence with identical bbo (l2-5) --- .../2026-05-15-streaming-cpu-reduction.md | 3 + .../order_book/block_mode_multicast_e2e.rs | 257 +++++++++++++++++- 2 files changed, 258 insertions(+), 2 deletions(-) diff --git a/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md b/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md index 3051a474..b93692b3 100644 --- a/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md +++ b/docs/superpowers/plans/2026-05-15-streaming-cpu-reduction.md @@ -2261,6 +2261,9 @@ git add server/src/listeners/order_book/parity_tests.rs git commit -m "test: assert block and stream emit identical tob quote sequences post-l2-5" ``` +> **NOTE (post-implementation):** The original spec above asserts that block and stream emit *equal* ordered sequences. This is **incorrect**. The true L2-5 contract, established by forensic classification of the dual-validator fixture's 3 block-only heights, is: +> **stream is an ordered subsequence of block with identical BBO at shared heights and no real BBO change dropped** — block redundantly emits no-op-block snapshots (empty/absent blocks, and an init-artifact at ts=0) that stream intentionally suppresses (the L2-5 CPU win). 
The 3 block-only heights on the dual-validator fixture are: 985148181 (init-snapshot artifact: block emits ts=0 quote at state.time=0 that stream structurally cannot emit), 985148193 and 985148228 (zero-BTC-event blocks entirely absent from the streaming feed; book unchanged → book_dirty=false → L2-5 correctly suppresses). The test was renamed to `stream_quote_sequence_is_block_subsequence_with_identical_bbo` and the assertion replaced with three checks: (1) stream_rows is an ordered subsequence of block_rows under full QuoteRow equality, (2) every distinct BBO that block records as a change also appears in stream_rows (content-match on coin+bid+ask, robust to block's init-artifact height), (3) every block-only row carries the fixture's constant BBO (asserting by value, not by hardcoded height set). The pre-existing `assert_stream_end_quotes_match_block` independently guarantees per-timestamp content correctness. + ### Task 5.7: Re-measure perf after L2-5 - [ ] **Step 1: Capture perf top.** Append to baseline doc under "After L2-5 (finalization-driven streaming snapshot)". Expected: streaming-specific CPU drops further; the multiplier between validator chattiness and snapshot work is eliminated. 
diff --git a/server/src/listeners/order_book/block_mode_multicast_e2e.rs b/server/src/listeners/order_book/block_mode_multicast_e2e.rs index 0bfb6553..1db582c0 100644 --- a/server/src/listeners/order_book/block_mode_multicast_e2e.rs +++ b/server/src/listeners/order_book/block_mode_multicast_e2e.rs @@ -22,7 +22,7 @@ use tokio::{ time::{sleep, timeout}, }; -use super::{EventSource, IngestMode, InternalMessage, OrderBookListener, now_ms, utils::EventBatch}; +use super::{EventSource, IngestMode, InternalMessage, L2SnapshotParams, OrderBookListener, now_ms, utils::EventBatch}; use crate::{ instruments::{InstrumentInfo, RegistryState, UniverseEntry, make_symbol, new_shared_registry}, listeners::{ @@ -44,7 +44,7 @@ use crate::{ protocol::{constants as tob_const, dob::constants as dob_const}, types::{ L4Order, OrderDiff, - inner::InnerL4Order, + inner::{InnerL4Order, InnerLevel}, node_data::{Batch, NodeDataFill, NodeDataOrderDiff, NodeDataOrderStatus}, }, }; @@ -565,6 +565,259 @@ async fn dual_validator_fixture_matches_block_and_stream_goldens() { ); } +// --------------------------------------------------------------------------- +// Task 5.6 — L2-5 block-vs-stream TOB quote-sequence subsequence contract +// +// L2-5 makes the streaming listener suppress snapshot emission for blocks where +// the book is unchanged (`book_dirty=false`). Block mode unconditionally emits +// a snapshot after every `apply_updates` call (clearing `snapped`), so it emits +// even for empty/absent blocks. This means stream emits a strict ordered +// SUBSEQUENCE of block's snapshot sequence, not an equal sequence. +// +// The correct L2-5 contract (verified by forensic classification of the 3 +// dual-validator block-only heights): +// • stream ⊆ block — stream_rows is an ordered subsequence of block_rows +// with full QuoteRow equality at shared positions (height+coin+bbo). 
+// • No real BBO change is dropped — every distinct BBO value (coin+bid+ask) +// that block records as a change from its previously emitted BBO for that +// coin must also appear in stream_rows; heights are not compared here. +// • All block-only rows carry the unchanged BBO — every row in block that +// stream does NOT emit carries the fixture's constant BBO, confirming it +// was a no-op/absent-block suppression and not a dropped real change. +// +// The 3 forensically-classified block-only heights on the dual-validator fixture: +// • 985148181 — init-snapshot artifact: block emits ts=0 quote at state.time=0; +// stream structurally cannot/should not emit at init. +// • 985148193, 985148228 — zero-BTC-event blocks entirely absent from the +// streaming feed (book unchanged → book_dirty=false → L2-5 +// correctly suppresses). +// +// `assert_stream_end_quotes_match_block` independently guarantees per-timestamp +// content correctness; this test closes the completeness and ordering gap. +// --------------------------------------------------------------------------- + +/// A single row extracted from one `InternalMessage::Snapshot` for one coin. +/// +/// `best_bid` / `best_ask` are `Px::value()` u64s from the unbucketed +/// `Snapshot` (`L2SnapshotParams::new(None, None)`). +/// `None` means that side has no resting orders in the snapshot. +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +struct QuoteRow { + height: u64, + coin: String, + best_bid: Option, + best_ask: Option, +} + +/// Map one `InternalMessage::Snapshot` to a `Vec`, one entry per +/// coin in the snapshot. Rows are sorted by coin name so HashMap iteration +/// order does not affect the comparison. Non-Snapshot messages return an +/// empty vec. +fn extract_quote_rows(msg: &InternalMessage) -> Vec { + let InternalMessage::Snapshot { l2_snapshots, height, .. 
} = msg else { + return Vec::new(); + }; + let mut rows: Vec = l2_snapshots + .as_ref() + .iter() + .filter_map(|(coin, params_map)| { + // WS-disabled config produces exactly one entry: the unbucketed variant. + let unbucketed = params_map.get(&L2SnapshotParams::new(None, None))?; + let sides = unbucketed.as_ref(); + // sides[0] = bids (descending), sides[1] = asks (ascending). + // .first() gives the best level on each side. + let best_bid = sides[0].first().map(|l: &InnerLevel| l.px.value()); + let best_ask = sides[1].first().map(|l: &InnerLevel| l.px.value()); + Some(QuoteRow { height: *height, coin: coin.value().to_string(), best_bid, best_ask }) + }) + .collect(); + rows.sort_by(|a, b| a.coin.cmp(&b.coin)); + rows +} + +/// Flatten an ordered snapshot sequence into one `Vec` preserving +/// emission order across snapshots and stable coin order within each snapshot. +fn quote_rows_from_snapshots(snapshots: &[Arc]) -> Vec { + snapshots.iter().flat_map(|msg| extract_quote_rows(msg.as_ref())).collect() +} + +/// L2-5 subsequence contract: stream emits an ordered SUBSEQUENCE of block's +/// TOB snapshot sequence, with content-identical BBO at every shared position, +/// and no real BBO change dropped. +/// +/// Three assertions: +/// 1. stream_rows is an ordered subsequence of block_rows under full QuoteRow +/// equality — proves no wrong BBO, no fabricated/duplicate/reordered +/// stream emission, and stream never emits a (height,bbo) block didn't. +/// 2. Every distinct BBO value (coin+bid+ask) that block records as a change +/// from its previously emitted BBO for that coin also appears somewhere in +/// stream_rows, regardless of height — proves stream never loses a BBO value. +/// This is the load-bearing completeness assertion: it is non-vacuous on +/// any fixture with real BBO movement, where it would FAIL if block moved +/// to a new BBO value that stream never emits at all. +/// 3. 
Every block-only row (in block but not in stream) carries the fixture's +/// constant BBO — proves all suppressions were benign no-op/absent-block +/// suppressions, not dropped real changes. Asserted by BBO value (robust), +/// not by hardcoding the 3 known heights (brittle). +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn stream_quote_sequence_is_block_subsequence_with_identical_bbo() { + let fixture = extract_dual_validator_fixture(); + let root = fixture.path(); + let manifest = dual_validator_manifest(); + let replay_coin = dual_fixture_coin(&manifest); + + let block = capture_fixture_replay(root, IngestMode::Block, &replay_coin).await; + let stream = capture_fixture_replay(root, IngestMode::Stream, &replay_coin).await; + + let block_rows = quote_rows_from_snapshots(&block.snapshots); + let stream_rows = quote_rows_from_snapshots(&stream.snapshots); + + assert!( + !block_rows.is_empty(), + "block mode must emit at least one TOB snapshot during dual-validator fixture replay" + ); + assert!( + !stream_rows.is_empty(), + "stream mode must emit at least one TOB snapshot during dual-validator fixture replay \ + (guards against vacuous pass where stream emitted nothing)" + ); + + // ----------------------------------------------------------------------- + // Assertion 1: stream_rows is an ordered subsequence of block_rows. + // + // Walk block_rows with a cursor; for each stream_row advance the block + // cursor until an equal QuoteRow is found. If the block list is exhausted + // before matching a stream row, the invariant is violated. + // ----------------------------------------------------------------------- + { + let mut block_cursor = 0usize; + for stream_row in &stream_rows { + let found_at = block_rows[block_cursor..].iter().position(|b| b == stream_row); + match found_at { + Some(offset) => { + block_cursor += offset + 1; + } + None => { + // Gather a window of block around the cursor for context. 
+ let window_start = block_cursor.saturating_sub(2); + let window_end = (block_cursor + 5).min(block_rows.len()); + let window = &block_rows[window_start..window_end]; + panic!( + "stream_rows is NOT an ordered subsequence of block_rows.\n\ + First unmatched stream row: {stream_row:?}\n\ + Block window around cursor [{block_cursor}] (indices {window_start}..{window_end}): {window:#?}" + ); + } + } + } + } + + // ----------------------------------------------------------------------- + // Assertion 2: no real BBO change is dropped. + // + // Walk block_rows per coin tracking the last-emitted (best_bid, best_ask). + // Collect the distinct (coin, best_bid, best_ask) BBO values that represent + // real transitions: the first BBO per coin (nothing → BBO) and every + // subsequent BBO that differs from the last. + // + // We match on (coin, best_bid, best_ask) — NOT on height — because block + // may emit the same BBO value at an init-artifact height (985148181, ts=0) + // that stream structurally cannot emit, while stream correctly emits that + // same BBO value at the first real data height. The semantic contract is: + // "stream must have observed every distinct BBO that block recorded, though + // not necessarily at the exact same height as any block artifact emission." + // + // On this constant-BBO fixture block_bbo_changes contains exactly one + // distinct BBO per coin (the nothing→BBO transition). Stream emits that + // BBO at the first real data height, so the assertion is satisfied. + // This CHECK IS NOT VACUOUS: it would FAIL on any fixture where stream + // drops a block where block recorded a BBO change to a NEW value that + // stream never emits at all. + // ----------------------------------------------------------------------- + { + use std::collections::{HashMap, HashSet}; + + // Collect distinct BBO values that block transitions through per coin. 
+ let mut last_bbo_per_coin: HashMap<&str, (Option, Option)> = HashMap::new(); + // Set of (coin, best_bid, best_ask) triples that represent real BBO changes. + let mut block_bbo_change_values: HashSet<(&str, Option, Option)> = + HashSet::new(); + for row in &block_rows { + let bbo = (row.best_bid, row.best_ask); + match last_bbo_per_coin.get(row.coin.as_str()) { + None => { + // First row for this coin — always a change (nothing → BBO). + block_bbo_change_values.insert((&row.coin, row.best_bid, row.best_ask)); + last_bbo_per_coin.insert(&row.coin, bbo); + } + Some(&prev) if prev != bbo => { + block_bbo_change_values.insert((&row.coin, row.best_bid, row.best_ask)); + last_bbo_per_coin.insert(&row.coin, bbo); + } + _ => {} + } + } + + // Build the set of (coin, bid, ask) values present in stream_rows. + let stream_bbo_values: HashSet<(&str, Option, Option)> = stream_rows + .iter() + .map(|r| (r.coin.as_str(), r.best_bid, r.best_ask)) + .collect(); + + for &(coin, bid, ask) in &block_bbo_change_values { + if !stream_bbo_values.contains(&(coin, bid, ask)) { + // Find the first block row with this BBO for the error message. + let example_block_row = block_rows + .iter() + .find(|r| r.coin.as_str() == coin && r.best_bid == bid && r.best_ask == ask) + .unwrap(); + panic!( + "stream_rows never emits BBO ({bid:?}, {ask:?}) for coin {coin:?}, \ + which block recorded as a real BBO change.\n\ + Example block row with this BBO: {example_block_row:?}\n\ + This is a genuine L2-5 completeness failure — stream dropped a BBO \ + transition that block recorded." + ); + } + } + } + + // ----------------------------------------------------------------------- + // Assertion 3: every block-only row carries the constant fixture BBO. + // + // Block-only rows are those in block_rows that are NOT in stream_rows. + // We assert by BBO value (robust), not by hardcoding the 3 known heights + // (brittle). 
The 3 forensically-classified benign heights are: + // • 985148181 — init-snapshot artifact (block emits ts=0; stream cannot) + // • 985148193, 985148228 — zero-BTC-event/absent blocks (book unchanged) + // All three carry the fixture's constant BBO, so the value-based check + // catches any future case where a block-only row has a different BBO + // (which would indicate a dropped real change and must fail the test). + // ----------------------------------------------------------------------- + { + const FIXTURE_CONSTANT_BID: u64 = 8_130_700_000_000; + const FIXTURE_CONSTANT_ASK: u64 = 8_130_800_000_000; + + let stream_row_set: std::collections::HashSet<&QuoteRow> = + stream_rows.iter().collect(); + let block_only: Vec<&QuoteRow> = + block_rows.iter().filter(|r| !stream_row_set.contains(r)).collect(); + + for row in &block_only { + let bid_ok = row.best_bid == Some(FIXTURE_CONSTANT_BID); + let ask_ok = row.best_ask == Some(FIXTURE_CONSTANT_ASK); + if !bid_ok || !ask_ok { + panic!( + "block-only row carries a non-constant BBO — this is a real dropped \ + BBO change, not a benign no-op suppression.\n\ + Offending block-only row: {row:?}\n\ + Expected constant BBO: bid=Some({FIXTURE_CONSTANT_BID}) ask=Some({FIXTURE_CONSTANT_ASK})" + ); + } + } + } +} + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] #[ignore = "requires EDGE_MULTICAST_REF_DIR pointing at the pinned edge-multicast-ref checkout"] async fn dual_validator_tob_streaming_feed_is_consumed_by_edge_parser() { From 190383d122b0d26f004e6ff8b04744120bcb187e Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 00:01:43 -0400 Subject: [PATCH 26/65] docs: document --enable-websocket multicast-only default in README --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1a7e6b60..6ae053e8 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ The `l4book` subscription first sends a snapshot of the entire book and then for cargo run 
--release --bin dz_hl_publisher -- --address 0.0.0.0 --port 8000 --enable-websocket ``` -The publisher is multicast-only by default and requires at least one output mode (`--enable-websocket`, `--multicast-group`, or `--dob-group`); starting without any of these is rejected at startup. +The publisher is **multicast-only by default**. The WebSocket listener is **not bound** unless `--enable-websocket` is passed — without that flag, `--address` and `--port` are still accepted but ignored (a startup log line states the active output mode). At least one output mode (`--enable-websocket`, `--multicast-group`, or `--dob-group`) is required; starting without any of these is rejected at startup. By default the server reads `$HOME/hl/data/node_*_by_block`. To opt into streaming disk ingest, use: @@ -188,6 +188,7 @@ cargo run --release --bin dz_hl_publisher -- \ | Flag | Default | Description | |------|---------|-------------| +| `--enable-websocket` | *(off)* | Bind the WebSocket listener at `--address`:`--port`. Off by default — publisher is multicast-only unless this flag is passed. | | `--multicast-group` | *(none — multicast disabled)* | Multicast group address (e.g. `239.0.0.1`). Enables multicast when set. | | `--multicast-port` | `5000` | UDP port for multicast traffic. | | `--multicast-bind-addr` | *(required when group is set)* | Local address to bind the multicast UDP socket. | From eeed11cb80b5319c7a95c7892ec4f4055760d866 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 00:02:22 -0400 Subject: [PATCH 27/65] docs: changelog for streaming cpu reduction and ws default-off --- CHANGELOG.md | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..de07c839 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,52 @@ +# Changelog + +All notable changes to this project will be documented in this file. 
+ +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). + +## Unreleased + +### Breaking changes + +- The WebSocket listener is now **disabled by default**. The publisher is + multicast-only unless `--enable-websocket` is passed. Existing deployments + that relied on the default `--address`/`--port` WebSocket server MUST add + `--enable-websocket` to retain that behavior. Startup is now rejected if no + output mode (`--enable-websocket`, `--multicast-group`, or `--dob-group`) + is configured. + + **Rollback:** to restore the previous behavior without a code revert, pass + `--enable-websocket` — this re-enables the WS listener and the full L2 + snapshot fan-out (all 7 variants per coin). If a regression is isolated to + the streaming finalization-driven snapshot path (L2-5) and `--enable-websocket` + does not address it, revert the L2-5 commits (`feat: 5s stuck-stream snapshot + backstop on dedicated 250ms ticker`, `perf: emit streaming l2 snapshot at + block finalization, not per chunk`, `feat: add book_dirty flag to + OrderBookState set only on real mutations`, `fix: emit authoritative tob + snapshot after streaming recovery`); the L2-1 (`perf: replace Px::num_digits + f64 log10 with u64::ilog10`), L2-4 (`perf: pre-size L2 level output Vecs to + skip realloc growth`), and L2-3 (`perf: skip 6 bucketed l2 variants and cap + unbucketed to bbo when websocket disabled`) changes are independent and can + stand alone. 
+ +### Performance + +- Streaming-mode CPU reduced by: + - replacing `Px::num_digits()` f64 `log10().floor()` with integer + `u64::ilog10()` (also fixes a latent off-by-one for values just below + large powers of ten); + - in the default (WS-disabled) config, computing only the unbucketed + best-bid/ask L2 snapshot instead of all 7 variants (1 unbucketed + 6 bucketed) per coin; + - in streaming mode (WS-disabled), emitting the TOB L2 snapshot once per + finalized block instead of once per file-read chunk, with a 250ms + stuck-stream backstop and recovery-path emission so corrected BBOs are + never withheld; + - pre-sizing L2 level output vectors to avoid reallocation growth. + +### Fixed + +- `Px::num_digits()` no longer reports an extra digit for `u64` values just + below large powers of ten (f64 cast imprecision). +- Streaming recovery now emits an authoritative TOB snapshot immediately, so + a per-coin divergence repair is reflected to multicast subscribers without + waiting for an unrelated later diff. From 274804d190a78c3f23f92b9cb67d16bc62923a92 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 00:17:04 -0400 Subject: [PATCH 28/65] fix: authoritative tob snapshots bypass staleness suppression so stalled-stream provisional is always corrected --- server/src/listeners/order_book/mod.rs | 6 +++ server/src/multicast/publisher.rs | 60 +++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index cf99bb90..988cc009 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -1198,6 +1198,7 @@ impl OrderBookListener { source_local_time_ms, latest_heights, enqueued_at_ms, + authoritative: emission == SnapshotEmission::Authoritative, }); let _unused = tx.send(snapshot_msg); } @@ -2157,6 +2158,11 @@ pub(crate) enum InternalMessage { source_local_time_ms: u64, latest_heights: IngestHeights, enqueued_at_ms: u64, + /// `true` for 
block-finalization and recovery snapshots (canonical + /// closed-block state); `false` for the 250 ms stuck-stream provisional + /// backstop. Authoritative snapshots bypass the publisher's staleness + /// gate so they always supersede any provisional already published. + authoritative: bool, }, Fills { batch: Batch, diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index 92644988..172c394e 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -519,6 +519,17 @@ impl MulticastPublisher { lag_ms <= Self::CATCHUP_THRESHOLD_MS } + /// Decides whether to publish a snapshot message. + /// + /// Authoritative snapshots (block finalization / per-coin recovery) are the + /// canonical closed-block state and MUST reach subscribers even if + /// wall-clock-stale — they supersede any provisional backstop snapshot + /// already published for the block. Provisional (stuck-stream 250 ms + /// backstop) snapshots keep the existing freshness gate. + const fn snapshot_should_publish(lag_ms: u64, authoritative: bool) -> bool { + authoritative || lag_ms <= Self::CATCHUP_THRESHOLD_MS + } + fn should_warn_for_receiver_lag(lag_ms: u64) -> bool { lag_ms <= Self::CATCHUP_THRESHOLD_MS } @@ -977,6 +988,7 @@ impl MulticastPublisher { source_local_time_ms, latest_heights, enqueued_at_ms, + authoritative, } => { cached_snapshot = Some(msg.clone()); let now_ms = Self::now_ms(); @@ -992,7 +1004,11 @@ impl MulticastPublisher { Duration::from_millis(listener_to_publisher_ms), ); let lag_ms = now_ms.saturating_sub(*time); - if Self::should_publish_lag(lag_ms) { + // Provisional (stuck-stream backstop) snapshots keep the + // freshness gate; authoritative (block finalization / recovery) + // snapshots bypass it so a stalled-stream provisional is always + // superseded by its authoritative correction. 
+ if Self::snapshot_should_publish(lag_ms, *authoritative) { health.observe_publishable_lag(lag_ms); crate::metrics::observe_tob_source_lag( "snapshot", @@ -1840,4 +1856,46 @@ mod tests { assert_eq!(report.receiver_lag_events, 3); assert_eq!(report.receiver_lagged_messages, 19); } + + /// Decision-matrix test for `snapshot_should_publish`. + /// + /// Together with `stream_finalization_tests::late_diff_after_backstop_still_emits_authoritative_final` + /// (which proves the listener emits an authoritative snapshot even when a provisional + /// backstop was already sent) this constitutes end-to-end proof: the listener emits + /// the authoritative snapshot, and the publisher unconditionally publishes it even + /// when wall-clock-stale — so a stalled-stream provisional is always superseded. + #[test] + fn authoritative_snapshot_bypasses_staleness_suppression() { + let thr = MulticastPublisher::CATCHUP_THRESHOLD_MS; + + // provisional, stale → suppressed (existing gate preserved) + assert!( + !MulticastPublisher::snapshot_should_publish(thr + 1, false), + "provisional stale snapshot must be suppressed" + ); + + // authoritative, stale → published (the bug fix) + assert!( + MulticastPublisher::snapshot_should_publish(thr + 1, true), + "authoritative stale snapshot must bypass suppression" + ); + + // provisional, fresh → published (normal operating case) + assert!( + MulticastPublisher::snapshot_should_publish(0, false), + "provisional fresh snapshot must be published" + ); + + // provisional, exactly at threshold → published (boundary inclusive, matches <=) + assert!( + MulticastPublisher::snapshot_should_publish(thr, false), + "provisional at-threshold snapshot must be published" + ); + + // authoritative, arbitrarily stale → still published + assert!( + MulticastPublisher::snapshot_should_publish(thr + 10_000, true), + "arbitrarily stale authoritative snapshot must be published" + ); + } } From 26c00f3cfd03fb206b4fd1360a913f440d661beb Mon Sep 17 00:00:00 2001 From: 
Steve Shaw Date: Mon, 18 May 2026 00:49:35 -0400 Subject: [PATCH 29/65] fix: narrow staleness bypass to corrections only and decouple recovery from stream dirty epoch --- server/src/listeners/order_book/mod.rs | 124 +++++++++----- server/src/listeners/order_book/state.rs | 22 +++ .../order_book/stream_finalization_tests.rs | 159 ++++++++++++++++++ server/src/multicast/publisher.rs | 72 ++++---- 4 files changed, 301 insertions(+), 76 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 988cc009..cf148a1a 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -498,16 +498,23 @@ impl StreamFinalizationMode { } /// Whether `emit_tob_snapshot` is taking the authoritative final snapshot for -/// the current dirty range, or a provisional snapshot (e.g. stuck-stream -/// backstop) that does NOT close out the dirty state. +/// the current dirty range, a provisional backstop snapshot, or a correction +/// that must bypass the publisher's staleness gate. #[derive(Copy, Clone, Debug, PartialEq, Eq)] enum SnapshotEmission { - /// Block finalization: authoritative final snapshot for this height. - /// Clears `book_dirty` because the block is closed. - Authoritative, - /// Backstop / provisional: emit current state but leave `book_dirty` - /// set so a subsequent finalization still emits its own snapshot. + /// Stuck-stream 250ms backstop. Lag-gated at the publisher. Does NOT + /// close the dirty epoch; marks that a provisional was published so the + /// epoch's finalization knows to emit as `Correction`. Provisional, + /// Normal block finalization / block-mode per-chunk emit. Lag-gated at + /// the publisher (catch-up suppression applies). Closes the stream + /// dirty epoch. 
+ Authoritative, + /// Must reach subscribers regardless of staleness: (a) recovery + /// divergence correction, (b) the finalization that closes a dirty + /// epoch in which a `Provisional` was already published (it supersedes + /// that provisional). Bypasses the publisher freshness gate. + Correction, } struct UnresolvedStreamNewDebug { @@ -1124,9 +1131,11 @@ impl OrderBookListener { bt, lt, SnapshotEmission::Provisional, + false, // prevent_future_snaps=false: later finalization must still emit ); if let Some(state) = self.order_book_state.as_mut() { state.bump_dirty_since_after_provisional(); + state.mark_provisional_published(); } true } else { @@ -1138,37 +1147,57 @@ impl OrderBookListener { /// book changed since the last snapshot. Shared by streaming finalization /// and the test finalization helper so the two cannot drift (the test /// helper previously lagged this logic). + /// + /// This is the ONLY place that calls `clear_book_dirty()` — it owns the + /// stream dirty epoch lifecycle. Recovery and Provisional never clear it. fn emit_authoritative_block_snapshot(&mut self, block: &StreamingBlock) { let dirty = self.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty); if dirty { let block_time_ms = block.block_time_ms.unwrap_or(0); let local_time_ms = block.local_time_ms.unwrap_or(block_time_ms); + // If a provisional was already published for this dirty epoch, emit + // as Correction (bypasses publisher staleness gate, supersedes the + // provisional). Otherwise emit as Authoritative (lag-gated, catch-up + // suppression applies — fixes Bug A). 
+ let kind = if self + .order_book_state + .as_ref() + .is_some_and(OrderBookState::provisional_published_this_epoch) + { + SnapshotEmission::Correction + } else { + SnapshotEmission::Authoritative + }; self.emit_tob_snapshot( ingest_source_label(EventSource::OrderDiffs), block_time_ms, local_time_ms, - SnapshotEmission::Authoritative, + kind, + false, // prevent_future_snaps=false: book_dirty guards stream epoch; snapped not needed ); + // Close the dirty epoch: clear dirty flag and reset provisional tracker. + if let Some(state) = self.order_book_state.as_mut() { + state.clear_book_dirty(); + } } } + /// `prevent_future_snaps`: if true, sets the `snapped` flag after emitting, + /// which prevents a second emission at the same height. Used only by block + /// mode's per-chunk path to guard against double-emit across chunk boundaries. + /// Stream finalization and recovery do NOT set this — stream mode uses + /// `book_dirty` as the epoch guard, and recovery is orthogonal to the stream + /// epoch (setting snapped=true in recovery would suppress the stream block's + /// finalization snapshot, causing Bug B). fn emit_tob_snapshot( &mut self, source_label: &'static str, source_block_time_ms: u64, source_local_time_ms: u64, emission: SnapshotEmission, + prevent_future_snaps: bool, ) { let snapshot_start = Instant::now(); - // L2-5 + Codex finding: `prevent_future_snaps` mirrors the authority of - // this emission. Authoritative emissions set `snapped` so block mode's - // duplicate-chunk suppression keeps working; provisional (backstop) - // emissions leave `snapped` untouched so the later authoritative - // finalization call can still produce a snapshot. Without this, a - // backstop fired before finalization would mark `snapped=true` and the - // finalization's `l2_snapshots(true)` would return `None`, silently - // dropping the authoritative final BBO. 
- let prevent_future_snaps = emission == SnapshotEmission::Authoritative; let snapshot = self.l2_snapshots(prevent_future_snaps); crate::metrics::observe_tob_snapshot_compute(source_label, snapshot_start.elapsed()); if let Some(snapshot) = snapshot { @@ -1198,15 +1227,13 @@ impl OrderBookListener { source_local_time_ms, latest_heights, enqueued_at_ms, - authoritative: emission == SnapshotEmission::Authoritative, + bypass_staleness: matches!(emission, SnapshotEmission::Correction), }); let _unused = tx.send(snapshot_msg); } - if emission == SnapshotEmission::Authoritative - && let Some(state) = self.order_book_state.as_mut() - { - state.clear_book_dirty(); - } + // NOTE: `clear_book_dirty` is NOT called here. The stream-finalization + // epoch-closer (`emit_authoritative_block_snapshot`) is the only caller + // that clears the dirty epoch. Recovery and Provisional never clear it. } } @@ -1500,29 +1527,27 @@ impl OrderBookListener { } } - if mutated { - // Mark dirty with timestamps so both the immediate emit below - // and the 250ms backstop (if the immediate emit is suppressed - // by the snapped gate) carry consistent source times. - state.mark_dirty_with_times(recovery_block_time_ms, recovery_local_time_ms); - } + // Recovery does NOT participate in the stream dirty epoch: + // no mark_dirty_with_times, no clear_book_dirty. A concurrent + // unfinalized stream block's dirty epoch is left fully intact. } // `state` borrow ends here; `&mut self` is free again // Recovery mutated the book outside apply_stream_diff/apply_updates. - // Emit an authoritative TOB snapshot now so multicast subscribers see - // the corrected BBO immediately instead of waiting for an unrelated - // future diff or the next finalization (L2-5 + recovery interaction). 
- if self.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty) { - let (bt, lt) = self - .order_book_state - .as_ref() - .and_then(OrderBookState::dirty_source_times) - .unwrap_or((0, 0)); + // Emit a Correction snapshot immediately so multicast subscribers see + // the corrected BBO without waiting for an unrelated future diff or + // block finalization (L2-5 + recovery interaction, Bug B fix). + // Correction always bypasses the publisher staleness gate. + // Recovery NEVER touches book_dirty / book_dirty_since / + // dirty_source_times / provisional_published_this_epoch — the stream + // dirty epoch for any concurrent unfinalized block is preserved intact. + if mutated { self.emit_tob_snapshot( ingest_source_label(EventSource::OrderDiffs), - bt, - lt, - SnapshotEmission::Authoritative, + recovery_block_time_ms, + recovery_local_time_ms, + SnapshotEmission::Correction, + false, // prevent_future_snaps=false: recovery is orthogonal to stream epoch; + // setting snapped=true would suppress the stream block's finalization (Bug B) ); } } @@ -2116,11 +2141,17 @@ impl DirectoryListener for OrderBookListener { // helper so block-mode wire output stays byte-identical. let state_time_ms = self.order_book_state.as_ref().map(OrderBookState::time).unwrap_or(0); let (source_block_time_ms, source_local_time_ms) = last_source_times.unwrap_or((state_time_ms, state_time_ms)); + // Authoritative: block-mode per-chunk snapshots are lag-gated in production + // (the freshness gate suppresses stale catch-up/backlog quotes, matching + // pre-L2-5 behavior). Golden determinism is preserved by should_publish_lag's + // #[cfg(test)] → true override at the publish call site, not by bypassing + // staleness. prevent_future_snaps=true keeps the original per-chunk snapped dedup. 
self.emit_tob_snapshot( snapshot_source, source_block_time_ms, source_local_time_ms, SnapshotEmission::Authoritative, + true, // prevent_future_snaps=true: block mode guards against double-emit across chunks ); Ok(()) } @@ -2158,11 +2189,12 @@ pub(crate) enum InternalMessage { source_local_time_ms: u64, latest_heights: IngestHeights, enqueued_at_ms: u64, - /// `true` for block-finalization and recovery snapshots (canonical - /// closed-block state); `false` for the 250 ms stuck-stream provisional - /// backstop. Authoritative snapshots bypass the publisher's staleness - /// gate so they always supersede any provisional already published. - authoritative: bool, + /// `true` only for `Correction` emissions (recovery divergence correction, + /// or finalization that supersedes a provisional already published this epoch). + /// Bypasses the publisher's staleness gate. `false` for `Provisional` and + /// `Authoritative` emissions, which remain lag-gated (catch-up suppression + /// applies to normal finalization). + bypass_staleness: bool, }, Fills { batch: Batch, diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index 0579d004..2a1fc32e 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -35,6 +35,11 @@ pub(super) struct OrderBookState { /// `Instant` at which the book first became dirty in the current dirty /// epoch. Used by the backstop to gate emission cadence. book_dirty_since: Option, + /// True if a `Provisional` snapshot was already published during the + /// current dirty epoch (i.e. since the last `clear_book_dirty`). When the + /// epoch's finalization fires it emits as `Correction` (bypasses staleness + /// gate) to supersede the provisional. Reset with `clear_book_dirty`. + provisional_published_this_epoch: bool, /// Present when the DoB emitter is wired in. 
The tap is NOT propagated to /// the cloned copy used for snapshot validation (validation reads only; no /// events should be emitted from it). @@ -70,6 +75,7 @@ impl Clone for OrderBookState { book_dirty: self.book_dirty, dirty_source_times: self.dirty_source_times, book_dirty_since: self.book_dirty_since, + provisional_published_this_epoch: self.provisional_published_this_epoch, // The tap is intentionally not cloned: the clone is used only for // snapshot validation and must not emit DoB events. dob_tap: None, @@ -96,6 +102,7 @@ impl OrderBookState { book_dirty: false, dirty_source_times: None, book_dirty_since: None, + provisional_published_this_epoch: false, dob_tap: None, } } @@ -143,10 +150,25 @@ impl OrderBookState { } } + /// Records that a `Provisional` snapshot was published for the current + /// dirty epoch. The epoch's finalization will then emit as `Correction` + /// (bypasses publisher staleness gate) to supersede it. + pub(super) fn mark_provisional_published(&mut self) { + self.provisional_published_this_epoch = true; + } + + /// Returns `true` if a `Provisional` snapshot was already published + /// during the current dirty epoch. + pub(super) const fn provisional_published_this_epoch(&self) -> bool { + self.provisional_published_this_epoch + } + pub(super) const fn clear_book_dirty(&mut self) { self.book_dirty = false; self.book_dirty_since = None; self.dirty_source_times = None; + // Epoch-scoped: reset alongside the dirty epoch it belongs to. 
+ self.provisional_published_this_epoch = false; } // forcibly take snapshot - (time, height, snapshot) diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index ab9f5a69..1c7167a0 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -164,6 +164,13 @@ fn snapshot_height(msg: &InternalMessage) -> u64 { *height } +/// Returns the `bypass_staleness` field of a Snapshot message, or `None` if +/// the message is not a Snapshot. +fn snapshot_bypass(msg: &InternalMessage) -> Option { + let InternalMessage::Snapshot { bypass_staleness, .. } = msg else { return None }; + Some(*bypass_staleness) +} + // --------------------------------------------------------------------------- // Test 1: Two diffs in the same block — finalization must emit exactly one // snapshot carrying the FINAL BBO. @@ -466,3 +473,155 @@ async fn recovery_emits_authoritative_snapshot_without_later_diff() { let expected = Px::parse_from_str("200").unwrap().value(); assert_eq!(repaired_bbo, expected, "snapshot after recovery must carry the repaired BBO (200); got {repaired_bbo}"); } + +// --------------------------------------------------------------------------- +// Tests 8a/8b/8c (Task 5.9): 3-state emission model correctness. 
+// +// 8a: normal finalization (no prior provisional) → bypass_staleness=false +// (Bug A fix: catch-up suppression preserved for normal finalization) +// 8b: finalization after provisional → bypass_staleness=true (Correction) +// (supersedes the provisional, must reach subscribers regardless of lag) +// 8c: recovery over an unfinalized dirty block does NOT consume or clear +// that block's dirty epoch — the stream block still emits its own +// finalization snapshot (Bug B fix) +// --------------------------------------------------------------------------- + +/// Task 5.9 Bug A fix: a normal block finalization with no prior provisional +/// must emit as Authoritative (bypass_staleness=false) so catch-up suppression +/// applies to historical/backlog replay. This proves Bug A is fixed. +#[tokio::test(flavor = "current_thread")] +async fn normal_finalization_snapshot_is_lag_gated_not_bypassed() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Block 2: one BBO-changing diff — no backstop fired. + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + + // Block 3: triggers finalization of block 2. 
+ let (s3, d3) = add_event(1_700_000_003_000, Side::Bid, 103, "90", "1"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![s3], vec![d3]); + + let snapshots = drain_snapshots(&mut rx); + let block_2: Vec<_> = snapshots.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert_eq!(block_2.len(), 1, "exactly one snapshot for height 2; got {}", block_2.len()); + + let bypass = snapshot_bypass(block_2[0]).expect("snapshot has bypass_staleness"); + assert!( + !bypass, + "normal finalization (no prior provisional) must have bypass_staleness=false; got true" + ); +} + +/// Task 5.9: finalization that follows a provisional in the same dirty epoch +/// must emit as Correction (bypass_staleness=true) to supersede the provisional. +#[tokio::test(flavor = "current_thread")] +async fn finalization_after_provisional_emits_correction() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Block 2: one BBO-changing diff. + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + + // Fire the backstop (bypassing interval gate) → Provisional published, sets + // provisional_published_this_epoch. + listener.fire_stream_dirty_backstop_ignoring_interval_for_test(); + let provisional = drain_snapshots(&mut rx); + assert_eq!(provisional.len(), 1, "backstop must emit one provisional; got {}", provisional.len()); + assert!( + !snapshot_bypass(&provisional[0]).unwrap_or(true), + "provisional snapshot must have bypass_staleness=false" + ); + + // Block 3: triggers finalization of block 2. 
+ let (s3, d3) = add_event(1_700_000_003_000, Side::Bid, 103, "90", "1"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![s3], vec![d3]); + + let after_final = drain_snapshots(&mut rx); + let block_2_finals: Vec<_> = after_final.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert_eq!(block_2_finals.len(), 1, "exactly one finalization snapshot for height 2; got {}", block_2_finals.len()); + + let bypass = snapshot_bypass(block_2_finals[0]).expect("snapshot has bypass_staleness"); + assert!( + bypass, + "finalization after provisional must emit Correction (bypass_staleness=true); got false" + ); +} + +/// Task 5.9 Bug B fix: recovery over an unfinalized dirty block must NOT +/// consume or clear that block's dirty epoch. After recovery emits its own +/// Correction snapshot, the original stream block must STILL emit its own +/// authoritative finalization snapshot when it eventually finalizes. +#[tokio::test(flavor = "current_thread")] +async fn recovery_does_not_consume_unfinalized_stream_dirty_block() { + use super::utils::ValidationReport; + use std::collections::HashMap; + + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Block 2: BBO-changing diff — dirty epoch started. Do NOT finalize yet. + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + + // Drain any snapshots emitted so far (none expected, block 2 not finalized). + let before_recovery = drain_snapshots(&mut rx); + assert!(before_recovery.is_empty(), "no snapshot before recovery; got {}", before_recovery.len()); + + // Apply recovery with a different coin (simulating divergence). + // Recovery must emit a Correction snapshot immediately. 
+ let coin = Coin::new(TEST_COIN); + let mut repaired: crate::order_book::OrderBook = crate::order_book::OrderBook::new(); + repaired.add_order(InnerL4Order { + user: alloy::primitives::Address::new([0; 20]), + coin: coin.clone(), + side: Side::Ask, + limit_px: Px::parse_from_str("88888").expect("valid px"), + sz: Sz::parse_from_str("2").expect("valid sz"), + oid: 8_888, + timestamp: 0, + trigger_condition: String::new(), + is_trigger: false, + trigger_px: String::new(), + is_position_tpsl: false, + reduce_only: false, + order_type: String::new(), + tif: None, + cloid: None, + }); + let mut fresh_map: HashMap> = HashMap::new(); + fresh_map.insert(coin.clone(), repaired.to_snapshot()); + let fresh = crate::order_book::multi_book::Snapshots::new(fresh_map); + + let report = ValidationReport { + diverged: vec![(coin, "synthetic divergence for bug-B test".to_string())], + missing_in_fresh: vec![], + extra_in_fresh: vec![], + }; + + listener.apply_recovery(&report, fresh); + + // Recovery must have emitted exactly one Correction snapshot. + let recovery_snapshots = drain_snapshots(&mut rx); + assert_eq!( + recovery_snapshots.len(), + 1, + "recovery must emit exactly one Correction snapshot; got {}", + recovery_snapshots.len() + ); + let recovery_bypass = snapshot_bypass(&recovery_snapshots[0]).expect("has bypass_staleness"); + assert!(recovery_bypass, "recovery snapshot must have bypass_staleness=true (Correction)"); + + // Now finalize block 2 by feeding block 3. + // Bug B: before the fix, recovery would have cleared book_dirty so the + // stream block 2 finalization would have seen a clean book and emitted + // nothing. With the fix, book_dirty is untouched by recovery, so block 2 + // still emits its own snapshot. 
+ let (s3, d3) = add_event(1_700_000_003_000, Side::Bid, 104, "95", "1"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![s3], vec![d3]); + + let after_finalize = drain_snapshots(&mut rx); + let block_2_finals: Vec<_> = after_finalize.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert!( + !block_2_finals.is_empty(), + "Bug B regression: stream block 2 must still emit its own finalization snapshot after recovery; got none" + ); +} diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index 172c394e..948af28c 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -519,15 +519,22 @@ impl MulticastPublisher { lag_ms <= Self::CATCHUP_THRESHOLD_MS } - /// Decides whether to publish a snapshot message. + /// Decides whether to publish a snapshot message given production (real) + /// lag gating logic. Used by tests to assert the decision-matrix logic + /// directly without the test-mode lag override. The publisher loop uses + /// `bypass_staleness || should_publish_lag(lag_ms)` so that in test builds + /// (where `should_publish_lag` always returns `true`) fixture replays with + /// historical block times are not suppressed — matching the fills path. /// - /// Authoritative snapshots (block finalization / per-coin recovery) are the - /// canonical closed-block state and MUST reach subscribers even if - /// wall-clock-stale — they supersede any provisional backstop snapshot - /// already published for the block. Provisional (stuck-stream 250 ms - /// backstop) snapshots keep the existing freshness gate. - const fn snapshot_should_publish(lag_ms: u64, authoritative: bool) -> bool { - authoritative || lag_ms <= Self::CATCHUP_THRESHOLD_MS + /// Only `Correction` emissions (recovery divergence correction, or + /// finalization that supersedes a provisional already published this epoch) + /// bypass the staleness gate (`bypass_staleness = true`). 
Both + /// `Provisional` (stuck-stream 250ms backstop) and normal `Authoritative` + /// (block finalization) emissions keep the existing freshness gate so that + /// catch-up suppression still applies to historical/backlog replays. + #[cfg(test)] + pub(crate) const fn snapshot_should_publish(lag_ms: u64, bypass_staleness: bool) -> bool { + bypass_staleness || lag_ms <= Self::CATCHUP_THRESHOLD_MS } fn should_warn_for_receiver_lag(lag_ms: u64) -> bool { @@ -988,7 +995,7 @@ impl MulticastPublisher { source_local_time_ms, latest_heights, enqueued_at_ms, - authoritative, + bypass_staleness, } => { cached_snapshot = Some(msg.clone()); let now_ms = Self::now_ms(); @@ -1004,11 +1011,16 @@ impl MulticastPublisher { Duration::from_millis(listener_to_publisher_ms), ); let lag_ms = now_ms.saturating_sub(*time); - // Provisional (stuck-stream backstop) snapshots keep the - // freshness gate; authoritative (block finalization / recovery) - // snapshots bypass it so a stalled-stream provisional is always - // superseded by its authoritative correction. - if Self::snapshot_should_publish(lag_ms, *authoritative) { + // Correction snapshots (recovery divergence correction, or + // finalization superseding a provisional) bypass the staleness + // gate. Provisional and normal Authoritative snapshots remain + // lag-gated (catch-up suppression preserved — Bug A fix). + // `should_publish_lag` is used for the lag check so that in + // test builds (where it always returns true) fixture replays + // with historical block times are not suppressed — same + // pattern as the fills path. `snapshot_should_publish` is + // kept as a pure function used by the decision-matrix test. + if *bypass_staleness || Self::should_publish_lag(lag_ms) { health.observe_publishable_lag(lag_ms); crate::metrics::observe_tob_source_lag( "snapshot", @@ -1859,43 +1871,43 @@ mod tests { /// Decision-matrix test for `snapshot_should_publish`. 
/// - /// Together with `stream_finalization_tests::late_diff_after_backstop_still_emits_authoritative_final` - /// (which proves the listener emits an authoritative snapshot even when a provisional - /// backstop was already sent) this constitutes end-to-end proof: the listener emits - /// the authoritative snapshot, and the publisher unconditionally publishes it even - /// when wall-clock-stale — so a stalled-stream provisional is always superseded. + /// Together with `stream_finalization_tests::finalization_after_provisional_emits_correction` + /// (which proves the listener emits a Correction when a provisional was already + /// published this epoch) this constitutes end-to-end proof: a stalled-stream + /// provisional is always superseded by the correction, and normal finalization + /// is lag-gated (catch-up suppression preserved). #[test] - fn authoritative_snapshot_bypasses_staleness_suppression() { + fn correction_snapshot_bypasses_staleness_suppression() { let thr = MulticastPublisher::CATCHUP_THRESHOLD_MS; - // provisional, stale → suppressed (existing gate preserved) + // non-bypass (Provisional/Authoritative), stale → suppressed assert!( !MulticastPublisher::snapshot_should_publish(thr + 1, false), - "provisional stale snapshot must be suppressed" + "provisional/authoritative stale snapshot must be suppressed" ); - // authoritative, stale → published (the bug fix) + // correction (bypass_staleness=true), stale → published assert!( MulticastPublisher::snapshot_should_publish(thr + 1, true), - "authoritative stale snapshot must bypass suppression" + "correction stale snapshot must bypass suppression" ); - // provisional, fresh → published (normal operating case) + // non-bypass, fresh → published (normal operating case) assert!( MulticastPublisher::snapshot_should_publish(0, false), - "provisional fresh snapshot must be published" + "non-bypass fresh snapshot must be published" ); - // provisional, exactly at threshold → published (boundary inclusive, 
matches <=) + // non-bypass, exactly at threshold → published (boundary inclusive, matches <=) assert!( MulticastPublisher::snapshot_should_publish(thr, false), - "provisional at-threshold snapshot must be published" + "non-bypass at-threshold snapshot must be published" ); - // authoritative, arbitrarily stale → still published + // correction, arbitrarily stale → still published assert!( MulticastPublisher::snapshot_should_publish(thr + 10_000, true), - "arbitrarily stale authoritative snapshot must be published" + "arbitrarily stale correction snapshot must be published" ); } } From 54fe8c892b7e01472861c6dc6030d5108d4f3cd6 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 01:09:25 -0400 Subject: [PATCH 30/65] fix: gate L2-5 finalization and recovery emission on !enable_websocket so --enable-websocket preserves pre-L2-5 streaming cadence --- server/src/listeners/order_book/mod.rs | 35 ++++- .../order_book/stream_finalization_tests.rs | 128 ++++++++++++++++++ 2 files changed, 157 insertions(+), 6 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index cf148a1a..6d3d3677 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -1150,7 +1150,24 @@ impl OrderBookListener { /// /// This is the ONLY place that calls `clear_book_dirty()` — it owns the /// stream dirty epoch lifecycle. Recovery and Provisional never clear it. + /// + /// When `enable_websocket=true`, the per-chunk path in `process_data` is + /// active (guard did not fire) and owns all TOB emission, preserving the + /// pre-L2-5 cadence exactly. Finalization must NOT emit an additional + /// snapshot in that case (doing so would produce duplicate L2/TOB + /// emissions and break the `--enable-websocket` rollback contract). 
+ /// We still clear `book_dirty` so the flag does not accumulate stale state + /// across blocks — it was set by `apply_stream_diff` but is unused when + /// WS is enabled, exactly as in block mode. fn emit_authoritative_block_snapshot(&mut self, block: &StreamingBlock) { + // WS-enabled: per-chunk path owns all emission (pre-L2-5 cadence). + // Clear book_dirty for tidiness but do NOT emit a second snapshot. + if self.enable_websocket { + if let Some(state) = self.order_book_state.as_mut() { + state.clear_book_dirty(); + } + return; + } let dirty = self.order_book_state.as_ref().is_some_and(OrderBookState::book_dirty); if dirty { let block_time_ms = block.block_time_ms.unwrap_or(0); @@ -1533,14 +1550,20 @@ impl OrderBookListener { } // `state` borrow ends here; `&mut self` is free again // Recovery mutated the book outside apply_stream_diff/apply_updates. - // Emit a Correction snapshot immediately so multicast subscribers see - // the corrected BBO without waiting for an unrelated future diff or - // block finalization (L2-5 + recovery interaction, Bug B fix). - // Correction always bypasses the publisher staleness gate. - // Recovery NEVER touches book_dirty / book_dirty_since / + // + // WS-disabled (L2-5, default): emit a Correction snapshot immediately + // so multicast subscribers see the corrected BBO without waiting for an + // unrelated future diff or block finalization (L2-5 + recovery + // interaction, Bug B fix). Correction always bypasses the publisher + // staleness gate. Recovery NEVER touches book_dirty / book_dirty_since / // dirty_source_times / provisional_published_this_epoch — the stream // dirty epoch for any concurrent unfinalized block is preserved intact. - if mutated { + // + // WS-enabled (pre-L2-5 rollback path): the per-chunk path in + // `process_data` is active; recovery's corrected BBO is carried by the + // next per-chunk snapshot exactly as pre-L2-5. 
Emitting here would + // produce an extra Correction emission that breaks the rollback contract. + if mutated && !self.enable_websocket { self.emit_tob_snapshot( ingest_source_label(EventSource::OrderDiffs), recovery_block_time_ms, diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index 1c7167a0..66b05d95 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -625,3 +625,131 @@ async fn recovery_does_not_consume_unfinalized_stream_dirty_block() { "Bug B regression: stream block 2 must still emit its own finalization snapshot after recovery; got none" ); } + +// --------------------------------------------------------------------------- +// Tests 9 + 10 (Task 5.10): --enable-websocket restores pre-L2-5 cadence. +// +// With enable_websocket=true, the per-chunk path in process_data is active +// and owns all TOB emission (exactly as pre-L2-5). The L2-5 finalization- +// driven snapshot and the recovery explicit-Correction emit must both be +// suppressed so WS-enabled streaming is byte-for-byte pre-L2-5. +// +// Tests are discriminating: they FAIL if the WS-enabled guard is absent. +// --------------------------------------------------------------------------- + +/// Task 5.10: with enable_websocket=true, block finalization must NOT emit +/// an additional TOB snapshot (the per-chunk path in process_data already +/// owns it). Any finalization-origin snapshot would be a duplicate that +/// breaks the `--enable-websocket` rollback contract. +/// +/// Discriminating: fails if `emit_authoritative_block_snapshot` still emits +/// when enable_websocket=true. 
+#[tokio::test(flavor = "current_thread")] +async fn ws_enabled_streaming_finalization_does_not_double_emit() { + // Build a WS-enabled streaming listener: start with the WS-disabled + // streaming constructor (enable_websocket=false by default) then flip WS + // on via the pub(crate) setter BEFORE feeding any data. + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + listener.set_enable_websocket(true); + + // Block 2: BBO-changing diff marks the book dirty. + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + + // Block 3: triggers finalization of block 2. In WS-enabled streaming, the + // per-chunk path is active and already emitted any snapshot it would emit; + // finalize_stream_block must NOT emit an additional snapshot for height 2. + let (s3, d3) = add_event(1_700_000_003_000, Side::Bid, 103, "90", "1"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![s3], vec![d3]); + + // Drain everything. In the test harness the per-chunk guard fires because + // enable_websocket=true, so WS-owned snapshots may arrive — we only care + // that NO finalization-origin extra snapshot brings the total above what + // the per-chunk path alone would produce. + // + // Easiest discriminating assertion: force finalization directly via the + // test helper and verify it adds ZERO new snapshots for height 2. + let _before_force = drain_snapshots(&mut rx); + + // Force finalize any remaining buffered blocks. + listener.finalize_streaming_for_test().expect("finalize_streaming_for_test"); + let after_force = drain_snapshots(&mut rx); + + // finalize_streaming_for_test calls emit_authoritative_block_snapshot + // internally. With enable_websocket=true the gate must suppress the emit, + // so no additional snapshots should arrive on the channel. 
+ assert!( + after_force.is_empty(), + "ws_enabled: finalize_streaming_for_test must NOT emit any snapshot (finalization-driven \ + emit is gated on !enable_websocket); got {} snapshot(s)", + after_force.len() + ); +} + +/// Task 5.10: with enable_websocket=true, `apply_recovery` must NOT emit an +/// explicit Correction snapshot. The per-chunk path is active; recovery's +/// corrected BBO is carried by the next per-chunk snapshot exactly as pre-L2-5. +/// +/// Discriminating: fails if the recovery explicit-emit guard (`!enable_websocket`) +/// is absent. +#[tokio::test(flavor = "current_thread")] +async fn ws_enabled_recovery_does_not_explicitly_emit() { + use super::utils::ValidationReport; + + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + listener.set_enable_websocket(true); + + // Feed a couple of blocks so last_batch_*_time_ms are populated (required + // by emit_tob_snapshot's source-time plumbing). + let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + let (s3, d3) = add_event(1_700_000_003_000, Side::Bid, 102, "101", "1"); + feed_block(&mut listener, 3, 1_700_000_003_000, vec![s3], vec![d3]); + listener.finalize_streaming_for_test().expect("finalize_streaming_for_test"); + + // Drain all setup snapshots (WS-enabled per-chunk snapshots, etc.). + let _drain_initial = drain_snapshots(&mut rx); + + // Build a repair snapshot with a different BBO to simulate divergence. 
+ let coin = Coin::new(TEST_COIN); + let mut repaired: crate::order_book::OrderBook = crate::order_book::OrderBook::new(); + repaired.add_order(InnerL4Order { + user: alloy::primitives::Address::new([0; 20]), + coin: coin.clone(), + side: Side::Bid, + limit_px: Px::parse_from_str("300").expect("valid px"), + sz: Sz::parse_from_str("7").expect("valid sz"), + oid: 6_666, + timestamp: 0, + trigger_condition: String::new(), + is_trigger: false, + trigger_px: String::new(), + is_position_tpsl: false, + reduce_only: false, + order_type: String::new(), + tif: None, + cloid: None, + }); + let mut fresh_map: std::collections::HashMap> = + std::collections::HashMap::new(); + fresh_map.insert(coin.clone(), repaired.to_snapshot()); + let fresh = crate::order_book::multi_book::Snapshots::new(fresh_map); + + let report = ValidationReport { + diverged: vec![(coin, "synthetic divergence for ws-enabled recovery test".to_string())], + missing_in_fresh: vec![], + extra_in_fresh: vec![], + }; + + listener.apply_recovery(&report, fresh); + + // WS-enabled: the explicit Correction emit in apply_recovery is gated on + // !enable_websocket, so NO snapshot should be emitted by recovery itself. 
+ let after_recovery = drain_snapshots(&mut rx); + assert!( + after_recovery.is_empty(), + "ws_enabled: apply_recovery must NOT emit an explicit Correction snapshot \ + (gated on !enable_websocket); got {} snapshot(s)", + after_recovery.len() + ); +} From 3f6d91bcbfcc354108cbdee81b26c26731ae2334 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 01:23:48 -0400 Subject: [PATCH 31/65] order book: atomic live-height recheck before surgical recovery --- server/src/listeners/order_book/mod.rs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 6d3d3677..d022fb4e 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -418,7 +418,28 @@ fn fetch_snapshot( report.missing_in_fresh.len(), report.extra_in_fresh.len(), ); - listener.lock().await.apply_recovery(&report, expected_snapshot); + // Off-lock validation (above) means the live book may have + // advanced past `height` while we computed/validated without + // the lock. Re-lock and re-check the live height *atomically* + // with apply_recovery (single held guard, no gap): only apply + // if the book is still exactly at the validated height. + // Otherwise the report is stale — discard it; a future + // validation cycle will re-derive against current state. + // This matters because apply_recovery emits a staleness- + // bypassing Correction snapshot; applying a raced report + // would broadcast a stale rollback to multicast subscribers. 
+ let mut recovery_guard = listener.lock().await; + let live_height = recovery_guard + .order_book_state + .as_ref() + .map(|s| s.height()); + if live_height == Some(height) { + recovery_guard.apply_recovery(&report, expected_snapshot); + } else { + log::warn!( + "snapshot validation: discarding stale recovery report — live height {live_height:?} no longer matches validated height {height}" + ); + } Ok(()) } } From 75bf8e4faaa7f94eb6e4a471f1bb96c0f85b4ae6 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 09:53:17 -0400 Subject: [PATCH 32/65] order book: only mark provisional published when publisher will deliver it --- server/src/listeners/order_book/mod.rs | 99 ++++++++++++++++---------- server/src/multicast/publisher.rs | 4 +- 2 files changed, 63 insertions(+), 40 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index d022fb4e..acf62131 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -1147,7 +1147,7 @@ impl OrderBookListener { && let Some((bt, lt)) = self.order_book_state.as_ref().and_then(OrderBookState::dirty_source_times) { - self.emit_tob_snapshot( + let delivered = self.emit_tob_snapshot( ingest_source_label(EventSource::OrderDiffs), bt, lt, @@ -1156,7 +1156,15 @@ impl OrderBookListener { ); if let Some(state) = self.order_book_state.as_mut() { state.bump_dirty_since_after_provisional(); - state.mark_provisional_published(); + // Only treat the provisional as "published this epoch" if the publisher + // will actually deliver it. During catch-up/backlog replay the provisional + // is staleness-suppressed (lag > CATCHUP_THRESHOLD_MS); subscribers never + // see it, so finalization must emit a normal lag-gated Authoritative (not + // a staleness-bypassing Correction). Otherwise catch-up suppression is + // defeated and stale backlog quotes leak to multicast subscribers. 
+ if delivered == Some(true) { + state.mark_provisional_published(); + } } true } else { @@ -1227,6 +1235,11 @@ impl OrderBookListener { /// `book_dirty` as the epoch guard, and recovery is orthogonal to the stream /// epoch (setting snapped=true in recovery would suppress the stream block's /// finalization snapshot, causing Bug B). + /// + /// Returns Some(true)/Some(false) = the emitted snapshot will be delivered / + /// suppressed by the publisher's staleness gate; None = no snapshot produced. + /// Used by the stuck-stream backstop so a suppressed provisional is not + /// mistaken for a delivered one. fn emit_tob_snapshot( &mut self, source_label: &'static str, @@ -1234,45 +1247,55 @@ impl OrderBookListener { source_local_time_ms: u64, emission: SnapshotEmission, prevent_future_snaps: bool, - ) { + ) -> Option { let snapshot_start = Instant::now(); let snapshot = self.l2_snapshots(prevent_future_snaps); crate::metrics::observe_tob_snapshot_compute(source_label, snapshot_start.elapsed()); - if let Some(snapshot) = snapshot { - // l2_snapshots returned Some, so order_book_state is present; unwrap_or(0) is unreachable defensively. 
- let snapshot_height = self.order_book_state.as_ref().map(OrderBookState::height).unwrap_or(0); - let latest_heights = self.ingest_heights(); - crate::metrics::observe_tob_snapshot_enqueue_lag( - source_label, - Duration::from_millis(now_ms().saturating_sub(snapshot.0)), - ); - crate::metrics::observe_tob_snapshot_source_block_lag( - source_label, - Duration::from_millis(source_block_time_ms.saturating_sub(snapshot.0)), - ); - crate::metrics::observe_tob_snapshot_validator_write_lag( - source_label, - Duration::from_millis(source_local_time_ms.saturating_sub(source_block_time_ms)), - ); - if let Some(tx) = &self.internal_message_tx { - let enqueued_at_ms = now_ms(); - let snapshot_msg = Arc::new(InternalMessage::Snapshot { - l2_snapshots: snapshot.1, - time: snapshot.0, - height: snapshot_height, - source: source_label, - source_block_time_ms, - source_local_time_ms, - latest_heights, - enqueued_at_ms, - bypass_staleness: matches!(emission, SnapshotEmission::Correction), - }); - let _unused = tx.send(snapshot_msg); - } - // NOTE: `clear_book_dirty` is NOT called here. The stream-finalization - // epoch-closer (`emit_authoritative_block_snapshot`) is the only caller - // that clears the dirty epoch. Recovery and Provisional never clear it. - } + let Some(snapshot) = snapshot else { + return None; + }; + // l2_snapshots returned Some, so order_book_state is present; unwrap_or(0) is unreachable defensively. + let snapshot_height = self.order_book_state.as_ref().map(OrderBookState::height).unwrap_or(0); + let latest_heights = self.ingest_heights(); + // Single source of truth for snapshot lag: the publisher gates on + // `now_ms().saturating_sub(snapshot.0)`, so predict delivery with the + // exact same value and the exact same gate function it uses. 
+ let snapshot_lag_ms = now_ms().saturating_sub(snapshot.0); + crate::metrics::observe_tob_snapshot_enqueue_lag( + source_label, + Duration::from_millis(snapshot_lag_ms), + ); + crate::metrics::observe_tob_snapshot_source_block_lag( + source_label, + Duration::from_millis(source_block_time_ms.saturating_sub(snapshot.0)), + ); + crate::metrics::observe_tob_snapshot_validator_write_lag( + source_label, + Duration::from_millis(source_local_time_ms.saturating_sub(source_block_time_ms)), + ); + let bypass = matches!(emission, SnapshotEmission::Correction); + if let Some(tx) = &self.internal_message_tx { + let enqueued_at_ms = now_ms(); + let snapshot_msg = Arc::new(InternalMessage::Snapshot { + l2_snapshots: snapshot.1, + time: snapshot.0, + height: snapshot_height, + source: source_label, + source_block_time_ms, + source_local_time_ms, + latest_heights, + enqueued_at_ms, + bypass_staleness: bypass, + }); + let _unused = tx.send(snapshot_msg); + } + // NOTE: `clear_book_dirty` is NOT called here. The stream-finalization + // epoch-closer (`emit_authoritative_block_snapshot`) is the only caller + // that clears the dirty epoch. Recovery and Provisional never clear it. 
+ Some( + bypass + || crate::multicast::publisher::MulticastPublisher::should_publish_lag(snapshot_lag_ms), + ) } fn receive_stream_statuses(&mut self, batch: Batch) -> Result<()> { diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index 948af28c..1403e5b8 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -510,12 +510,12 @@ impl MulticastPublisher { pub(crate) const CATCHUP_THRESHOLD_MS: u64 = 500; #[cfg(test)] - fn should_publish_lag(_lag_ms: u64) -> bool { + pub(crate) fn should_publish_lag(_lag_ms: u64) -> bool { true } #[cfg(not(test))] - fn should_publish_lag(lag_ms: u64) -> bool { + pub(crate) fn should_publish_lag(lag_ms: u64) -> bool { lag_ms <= Self::CATCHUP_THRESHOLD_MS } From 886adc5cb24d4f109556ef53032ec87823838b44 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 10:10:04 -0400 Subject: [PATCH 33/65] order book: finalization never bypasses staleness; only recovery corrects (remove unsound provisional prediction) --- server/src/listeners/order_book/mod.rs | 128 +++++++----------- server/src/listeners/order_book/state.rs | 22 --- .../order_book/stream_finalization_tests.rs | 24 ++-- server/src/multicast/publisher.rs | 27 ++-- 4 files changed, 79 insertions(+), 122 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index acf62131..6a1cad08 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -1147,7 +1147,7 @@ impl OrderBookListener { && let Some((bt, lt)) = self.order_book_state.as_ref().and_then(OrderBookState::dirty_source_times) { - let delivered = self.emit_tob_snapshot( + self.emit_tob_snapshot( ingest_source_label(EventSource::OrderDiffs), bt, lt, @@ -1156,15 +1156,6 @@ impl OrderBookListener { ); if let Some(state) = self.order_book_state.as_mut() { state.bump_dirty_since_after_provisional(); - // Only treat the provisional as "published this epoch" if the 
publisher - // will actually deliver it. During catch-up/backlog replay the provisional - // is staleness-suppressed (lag > CATCHUP_THRESHOLD_MS); subscribers never - // see it, so finalization must emit a normal lag-gated Authoritative (not - // a staleness-bypassing Correction). Otherwise catch-up suppression is - // defeated and stale backlog quotes leak to multicast subscribers. - if delivered == Some(true) { - state.mark_provisional_published(); - } } true } else { @@ -1201,27 +1192,23 @@ impl OrderBookListener { if dirty { let block_time_ms = block.block_time_ms.unwrap_or(0); let local_time_ms = block.local_time_ms.unwrap_or(block_time_ms); - // If a provisional was already published for this dirty epoch, emit - // as Correction (bypasses publisher staleness gate, supersedes the - // provisional). Otherwise emit as Authoritative (lag-gated, catch-up - // suppression applies — fixes Bug A). - let kind = if self - .order_book_state - .as_ref() - .is_some_and(OrderBookState::provisional_published_this_epoch) - { - SnapshotEmission::Correction - } else { - SnapshotEmission::Authoritative - }; + // Finalization is ALWAYS Authoritative (lag-gated; catch-up + // suppression applies — fixes Bug A). Only recovery emits + // Correction (it corrects incorrect data after snapshot-divergence + // detection, a legitimate staleness bypass). Provisional delivery + // cannot be confirmed listener-side — the publisher applies the + // staleness gate later with a different clock — so finalization + // never bypasses staleness. A subscriber that saw a provisional + // holds that consistent, recent snapshot and is corrected by the + // next lag-gated Authoritative once caught up. 
self.emit_tob_snapshot( ingest_source_label(EventSource::OrderDiffs), block_time_ms, local_time_ms, - kind, + SnapshotEmission::Authoritative, false, // prevent_future_snaps=false: book_dirty guards stream epoch; snapped not needed ); - // Close the dirty epoch: clear dirty flag and reset provisional tracker. + // Close the dirty epoch: clear dirty flag. if let Some(state) = self.order_book_state.as_mut() { state.clear_book_dirty(); } @@ -1235,11 +1222,6 @@ impl OrderBookListener { /// `book_dirty` as the epoch guard, and recovery is orthogonal to the stream /// epoch (setting snapped=true in recovery would suppress the stream block's /// finalization snapshot, causing Bug B). - /// - /// Returns Some(true)/Some(false) = the emitted snapshot will be delivered / - /// suppressed by the publisher's staleness gate; None = no snapshot produced. - /// Used by the stuck-stream backstop so a suppressed provisional is not - /// mistaken for a delivered one. fn emit_tob_snapshot( &mut self, source_label: &'static str, @@ -1247,55 +1229,45 @@ impl OrderBookListener { source_local_time_ms: u64, emission: SnapshotEmission, prevent_future_snaps: bool, - ) -> Option { + ) { let snapshot_start = Instant::now(); let snapshot = self.l2_snapshots(prevent_future_snaps); crate::metrics::observe_tob_snapshot_compute(source_label, snapshot_start.elapsed()); - let Some(snapshot) = snapshot else { - return None; - }; - // l2_snapshots returned Some, so order_book_state is present; unwrap_or(0) is unreachable defensively. - let snapshot_height = self.order_book_state.as_ref().map(OrderBookState::height).unwrap_or(0); - let latest_heights = self.ingest_heights(); - // Single source of truth for snapshot lag: the publisher gates on - // `now_ms().saturating_sub(snapshot.0)`, so predict delivery with the - // exact same value and the exact same gate function it uses. 
- let snapshot_lag_ms = now_ms().saturating_sub(snapshot.0); - crate::metrics::observe_tob_snapshot_enqueue_lag( - source_label, - Duration::from_millis(snapshot_lag_ms), - ); - crate::metrics::observe_tob_snapshot_source_block_lag( - source_label, - Duration::from_millis(source_block_time_ms.saturating_sub(snapshot.0)), - ); - crate::metrics::observe_tob_snapshot_validator_write_lag( - source_label, - Duration::from_millis(source_local_time_ms.saturating_sub(source_block_time_ms)), - ); - let bypass = matches!(emission, SnapshotEmission::Correction); - if let Some(tx) = &self.internal_message_tx { - let enqueued_at_ms = now_ms(); - let snapshot_msg = Arc::new(InternalMessage::Snapshot { - l2_snapshots: snapshot.1, - time: snapshot.0, - height: snapshot_height, - source: source_label, - source_block_time_ms, - source_local_time_ms, - latest_heights, - enqueued_at_ms, - bypass_staleness: bypass, - }); - let _unused = tx.send(snapshot_msg); - } - // NOTE: `clear_book_dirty` is NOT called here. The stream-finalization - // epoch-closer (`emit_authoritative_block_snapshot`) is the only caller - // that clears the dirty epoch. Recovery and Provisional never clear it. - Some( - bypass - || crate::multicast::publisher::MulticastPublisher::should_publish_lag(snapshot_lag_ms), - ) + if let Some(snapshot) = snapshot { + // l2_snapshots returned Some, so order_book_state is present; unwrap_or(0) is unreachable defensively. 
+ let snapshot_height = self.order_book_state.as_ref().map(OrderBookState::height).unwrap_or(0); + let latest_heights = self.ingest_heights(); + crate::metrics::observe_tob_snapshot_enqueue_lag( + source_label, + Duration::from_millis(now_ms().saturating_sub(snapshot.0)), + ); + crate::metrics::observe_tob_snapshot_source_block_lag( + source_label, + Duration::from_millis(source_block_time_ms.saturating_sub(snapshot.0)), + ); + crate::metrics::observe_tob_snapshot_validator_write_lag( + source_label, + Duration::from_millis(source_local_time_ms.saturating_sub(source_block_time_ms)), + ); + if let Some(tx) = &self.internal_message_tx { + let enqueued_at_ms = now_ms(); + let snapshot_msg = Arc::new(InternalMessage::Snapshot { + l2_snapshots: snapshot.1, + time: snapshot.0, + height: snapshot_height, + source: source_label, + source_block_time_ms, + source_local_time_ms, + latest_heights, + enqueued_at_ms, + bypass_staleness: matches!(emission, SnapshotEmission::Correction), + }); + let _unused = tx.send(snapshot_msg); + } + // NOTE: `clear_book_dirty` is NOT called here. The stream-finalization + // epoch-closer (`emit_authoritative_block_snapshot`) is the only caller + // that clears the dirty epoch. Recovery and Provisional never clear it. + } } fn receive_stream_statuses(&mut self, batch: Batch) -> Result<()> { @@ -1600,8 +1572,8 @@ impl OrderBookListener { // unrelated future diff or block finalization (L2-5 + recovery // interaction, Bug B fix). Correction always bypasses the publisher // staleness gate. Recovery NEVER touches book_dirty / book_dirty_since / - // dirty_source_times / provisional_published_this_epoch — the stream - // dirty epoch for any concurrent unfinalized block is preserved intact. + // dirty_source_times — the stream dirty epoch for any concurrent + // unfinalized block is preserved intact. 
// // WS-enabled (pre-L2-5 rollback path): the per-chunk path in // `process_data` is active; recovery's corrected BBO is carried by the diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index 2a1fc32e..0579d004 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -35,11 +35,6 @@ pub(super) struct OrderBookState { /// `Instant` at which the book first became dirty in the current dirty /// epoch. Used by the backstop to gate emission cadence. book_dirty_since: Option, - /// True if a `Provisional` snapshot was already published during the - /// current dirty epoch (i.e. since the last `clear_book_dirty`). When the - /// epoch's finalization fires it emits as `Correction` (bypasses staleness - /// gate) to supersede the provisional. Reset with `clear_book_dirty`. - provisional_published_this_epoch: bool, /// Present when the DoB emitter is wired in. The tap is NOT propagated to /// the cloned copy used for snapshot validation (validation reads only; no /// events should be emitted from it). @@ -75,7 +70,6 @@ impl Clone for OrderBookState { book_dirty: self.book_dirty, dirty_source_times: self.dirty_source_times, book_dirty_since: self.book_dirty_since, - provisional_published_this_epoch: self.provisional_published_this_epoch, // The tap is intentionally not cloned: the clone is used only for // snapshot validation and must not emit DoB events. dob_tap: None, @@ -102,7 +96,6 @@ impl OrderBookState { book_dirty: false, dirty_source_times: None, book_dirty_since: None, - provisional_published_this_epoch: false, dob_tap: None, } } @@ -150,25 +143,10 @@ impl OrderBookState { } } - /// Records that a `Provisional` snapshot was published for the current - /// dirty epoch. The epoch's finalization will then emit as `Correction` - /// (bypasses publisher staleness gate) to supersede it. 
- pub(super) fn mark_provisional_published(&mut self) { - self.provisional_published_this_epoch = true; - } - - /// Returns `true` if a `Provisional` snapshot was already published - /// during the current dirty epoch. - pub(super) const fn provisional_published_this_epoch(&self) -> bool { - self.provisional_published_this_epoch - } - pub(super) const fn clear_book_dirty(&mut self) { self.book_dirty = false; self.book_dirty_since = None; self.dirty_source_times = None; - // Epoch-scoped: reset alongside the dirty epoch it belongs to. - self.provisional_published_this_epoch = false; } // forcibly take snapshot - (time, height, snapshot) diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index 66b05d95..73c2017b 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -479,8 +479,11 @@ async fn recovery_emits_authoritative_snapshot_without_later_diff() { // // 8a: normal finalization (no prior provisional) → bypass_staleness=false // (Bug A fix: catch-up suppression preserved for normal finalization) -// 8b: finalization after provisional → bypass_staleness=true (Correction) -// (supersedes the provisional, must reach subscribers regardless of lag) +// 8b: finalization after a provisional → still Authoritative +// (bypass_staleness=false, lag-gated). The listener cannot confirm +// whether the publisher actually delivered the provisional (the gate is +// applied later with a different clock), so finalization never bypasses +// staleness. Only recovery (incorrect-data correction) bypasses. 
// 8c: recovery over an unfinalized dirty block does NOT consume or clear // that block's dirty epoch — the stream block still emits its own // finalization snapshot (Bug B fix) @@ -512,18 +515,21 @@ async fn normal_finalization_snapshot_is_lag_gated_not_bypassed() { ); } -/// Task 5.9: finalization that follows a provisional in the same dirty epoch -/// must emit as Correction (bypass_staleness=true) to supersede the provisional. +/// Finalization that follows a provisional in the same dirty epoch must still +/// emit as Authoritative (bypass_staleness=false, lag-gated). The listener +/// cannot reliably know whether the publisher delivered the provisional (the +/// staleness gate is applied later with a different clock), so any listener-side +/// prediction is unsound and could leak stale backlog quotes. Only recovery +/// (incorrect-data correction) bypasses staleness. #[tokio::test(flavor = "current_thread")] -async fn finalization_after_provisional_emits_correction() { +async fn finalization_after_provisional_is_lag_gated_not_bypassed() { let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); // Block 2: one BBO-changing diff. let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); - // Fire the backstop (bypassing interval gate) → Provisional published, sets - // provisional_published_this_epoch. + // Fire the backstop (bypassing interval gate) → Provisional published. 
listener.fire_stream_dirty_backstop_ignoring_interval_for_test(); let provisional = drain_snapshots(&mut rx); assert_eq!(provisional.len(), 1, "backstop must emit one provisional; got {}", provisional.len()); @@ -542,8 +548,8 @@ async fn finalization_after_provisional_emits_correction() { let bypass = snapshot_bypass(block_2_finals[0]).expect("snapshot has bypass_staleness"); assert!( - bypass, - "finalization after provisional must emit Correction (bypass_staleness=true); got false" + !bypass, + "finalization after provisional must be lag-gated Authoritative (bypass_staleness=false); listener cannot confirm provisional delivery so only recovery bypasses" ); } diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index 1403e5b8..81e25c95 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -510,12 +510,12 @@ impl MulticastPublisher { pub(crate) const CATCHUP_THRESHOLD_MS: u64 = 500; #[cfg(test)] - pub(crate) fn should_publish_lag(_lag_ms: u64) -> bool { + fn should_publish_lag(_lag_ms: u64) -> bool { true } #[cfg(not(test))] - pub(crate) fn should_publish_lag(lag_ms: u64) -> bool { + fn should_publish_lag(lag_ms: u64) -> bool { lag_ms <= Self::CATCHUP_THRESHOLD_MS } @@ -526,12 +526,11 @@ impl MulticastPublisher { /// (where `should_publish_lag` always returns `true`) fixture replays with /// historical block times are not suppressed — matching the fills path. /// - /// Only `Correction` emissions (recovery divergence correction, or - /// finalization that supersedes a provisional already published this epoch) - /// bypass the staleness gate (`bypass_staleness = true`). Both - /// `Provisional` (stuck-stream 250ms backstop) and normal `Authoritative` - /// (block finalization) emissions keep the existing freshness gate so that - /// catch-up suppression still applies to historical/backlog replays. 
+ /// Only `Correction` emissions (recovery divergence correction) bypass the + /// staleness gate (`bypass_staleness = true`). Both `Provisional` + /// (stuck-stream 250ms backstop) and `Authoritative` (block finalization) + /// emissions keep the existing freshness gate so that catch-up suppression + /// still applies to historical/backlog replays. #[cfg(test)] pub(crate) const fn snapshot_should_publish(lag_ms: u64, bypass_staleness: bool) -> bool { bypass_staleness || lag_ms <= Self::CATCHUP_THRESHOLD_MS @@ -1871,11 +1870,13 @@ mod tests { /// Decision-matrix test for `snapshot_should_publish`. /// - /// Together with `stream_finalization_tests::finalization_after_provisional_emits_correction` - /// (which proves the listener emits a Correction when a provisional was already - /// published this epoch) this constitutes end-to-end proof: a stalled-stream - /// provisional is always superseded by the correction, and normal finalization - /// is lag-gated (catch-up suppression preserved). + /// Together with + /// `stream_finalization_tests::finalization_after_provisional_is_lag_gated_not_bypassed` + /// (which proves finalization stays lag-gated Authoritative even after a + /// provisional, since the listener cannot confirm provisional delivery) this + /// constitutes end-to-end proof: only recovery Corrections bypass staleness, + /// while both provisional and finalization emissions stay lag-gated + /// (catch-up suppression preserved). 
#[test] fn correction_snapshot_bypasses_staleness_suppression() { let thr = MulticastPublisher::CATCHUP_THRESHOLD_MS; From ddc9f47e32e4603a6cda242fcadcc457a817d82b Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 10:11:03 -0400 Subject: [PATCH 34/65] publisher: reject --dob-group without --multicast-group (registry not bootstrapped otherwise) --- binaries/src/bin/dz_hl_publisher.rs | 36 +++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/binaries/src/bin/dz_hl_publisher.rs b/binaries/src/bin/dz_hl_publisher.rs index e47e1cab..bd22e842 100644 --- a/binaries/src/bin/dz_hl_publisher.rs +++ b/binaries/src/bin/dz_hl_publisher.rs @@ -175,6 +175,14 @@ impl Args { .to_owned(), ); } + if self.dob_group.is_some() && self.multicast_group.is_none() { + return Err( + "--dob-group requires --multicast-group: the shared instrument registry is \ + only bootstrapped from the HL API in multicast mode, so a DoB-only publisher \ + would resolve no instruments and emit no market data" + .to_owned(), + ); + } Ok(ingest_mode) } } @@ -381,16 +389,40 @@ mod tests { } #[test] - fn accepts_when_dob_group_set() { + fn rejects_dob_group_without_multicast_group() { + let args = Args::parse_from([ + "dz_hl_publisher", + "--address", + "0.0.0.0", + "--port", + "8000", + "--dob-group", + "239.0.0.2", + ]); + assert!( + args.validate() + .err() + .is_some_and(|e| e.contains("--dob-group requires --multicast-group")), + "--dob-group alone must be rejected: the registry is only bootstrapped in multicast mode" + ); + } + + #[test] + fn accepts_when_dob_and_multicast_groups_set() { let args = Args::parse_from([ "dz_hl_publisher", "--address", "0.0.0.0", "--port", "8000", + "--multicast-group", + "239.0.0.1", "--dob-group", "239.0.0.2", ]); - assert!(args.validate().is_ok(), "--dob-group alone is a valid output mode"); + assert!( + args.validate().is_ok(), + "--dob-group together with --multicast-group is a valid output mode" + ); } } From 
068d1011b4e64cddcc5f38e2f47391116dc148ea Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 10:20:23 -0400 Subject: [PATCH 35/65] websocket server: reject no-output config at the runner boundary, not just the cli --- server/src/servers/websocket_server.rs | 9 +++++ server/tests/websocket_disabled_test.rs | 54 ++++++++++--------------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/server/src/servers/websocket_server.rs b/server/src/servers/websocket_server.rs index 1fb15cd6..8bb6fe0b 100644 --- a/server/src/servers/websocket_server.rs +++ b/server/src/servers/websocket_server.rs @@ -66,6 +66,15 @@ pub async fn run_websocket_server( return Err("--separate-fill-ingest requires streaming ingest mode".into()); } + if !enable_websocket && multicast_config.is_none() && dob_config.is_none() { + return Err( + "no market-data output configured: enable_websocket is false and both \ + multicast_config and dob_config are None — the server would tail HL \ + data with no output. Enable WebSocket or pass a multicast/DoB config." + .into(), + ); + } + if enable_websocket { info!("websocket mode: ENABLED (listener will bind {address}, full L2 snapshot fan-out active)"); } else { diff --git a/server/tests/websocket_disabled_test.rs b/server/tests/websocket_disabled_test.rs index a6462cdb..11d5cab3 100644 --- a/server/tests/websocket_disabled_test.rs +++ b/server/tests/websocket_disabled_test.rs @@ -1,4 +1,4 @@ -//! Verifies that the WS port is not bound when `enable_websocket` is false. +//! Verifies run_websocket_server rejects a no-output config (enable_websocket=false, no multicast, no DoB) and does not bind the WS port. 
#![allow(unused_crate_dependencies)] #![allow(clippy::unwrap_used)] @@ -6,7 +6,6 @@ use std::{ net::{Ipv4Addr, SocketAddr, TcpListener as StdListener}, path::PathBuf, - time::Duration, }; use server::{IngestMode, run_websocket_server}; @@ -23,46 +22,37 @@ fn make_hl_data_root() -> PathBuf { } #[tokio::test(flavor = "multi_thread")] -async fn ws_port_not_bound_when_websocket_disabled() { +async fn run_websocket_server_rejects_no_output_config() { let probe = StdListener::bind((Ipv4Addr::LOCALHOST, 0)).expect("bind probe"); let port = probe.local_addr().expect("local_addr").port(); drop(probe); let hl_data_root = make_hl_data_root(); - let hl_data_path = hl_data_root.clone(); let address = format!("127.0.0.1:{port}"); - let server_handle = tokio::spawn(async move { - let _ = run_websocket_server( - &address, - true, - 1, - None, - None, - IngestMode::Block, - Some(hl_data_path), - false, - false, // enable_websocket = false - ) - .await; - }); - // This sleep gives the server task a chance to run far enough to bind the - // WS port *if it were going to*. The race is one-directional and safe in - // the disabled case: no bind ever happens, so whether the task has been - // polled yet or not, the port stays unbound and the rebind below succeeds. - // The test asserts absence-of-bind, not presence-of-park — it guards - // against a future flag-flip mistake re-introducing the bind, and cannot - // spuriously pass in the disabled path. 
- tokio::time::sleep(Duration::from_millis(200)).await; + let result = run_websocket_server( + &address, + true, + 1, + None, + None, + IngestMode::Block, + Some(hl_data_root.clone()), + false, + false, // enable_websocket = false + ) + .await; + + assert!( + result.is_err(), + "no-output config must be rejected at the run_websocket_server boundary; got: {result:?}" + ); let addr = SocketAddr::from((Ipv4Addr::LOCALHOST, port)); - let rebind = StdListener::bind(addr); assert!( - rebind.is_ok(), - "expected ws port {port} to be unbound when enable_websocket=false; rebind err: {:?}", - rebind.err() + StdListener::bind(addr).is_ok(), + "ws port {port} must remain unbound when the runner rejects the config" ); - server_handle.abort(); - let _ = std::fs::remove_dir_all(&hl_data_root); + drop(std::fs::remove_dir_all(&hl_data_root)); } From d641d6530540458b82747116576ef334bb236a98 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 10:27:54 -0400 Subject: [PATCH 36/65] websocket server: reject dob-only config at the runner boundary to match the cli --- server/src/servers/websocket_server.rs | 9 ++++ server/tests/websocket_disabled_test.rs | 62 +++++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/server/src/servers/websocket_server.rs b/server/src/servers/websocket_server.rs index 8bb6fe0b..1a517437 100644 --- a/server/src/servers/websocket_server.rs +++ b/server/src/servers/websocket_server.rs @@ -75,6 +75,15 @@ pub async fn run_websocket_server( ); } + if dob_config.is_some() && multicast_config.is_none() { + return Err( + "dob_config requires multicast_config: the shared instrument registry is \ + only bootstrapped from the HL API when a multicast config is present, so a \ + DoB-only publisher would resolve no instruments and emit no market data" + .into(), + ); + } + if enable_websocket { info!("websocket mode: ENABLED (listener will bind {address}, full L2 snapshot fan-out active)"); } else { diff --git 
a/server/tests/websocket_disabled_test.rs b/server/tests/websocket_disabled_test.rs index 11d5cab3..c111ea34 100644 --- a/server/tests/websocket_disabled_test.rs +++ b/server/tests/websocket_disabled_test.rs @@ -56,3 +56,65 @@ async fn run_websocket_server_rejects_no_output_config() { drop(std::fs::remove_dir_all(&hl_data_root)); } + +#[tokio::test(flavor = "multi_thread")] +async fn run_websocket_server_rejects_dob_only_config() { + use std::time::Duration; + + use server::DobConfig; + + let probe = StdListener::bind((Ipv4Addr::LOCALHOST, 0)).expect("bind probe"); + let port = probe.local_addr().expect("local_addr").port(); + drop(probe); + + let hl_data_root = make_hl_data_root(); + let address = format!("127.0.0.1:{port}"); + + // dob_config=Some with multicast_config=None: the registry is never + // bootstrapped (DoB-only would be a dark publisher), so the runner must + // reject this at the library boundary exactly as the CLI does — and must + // not bind the WS port. enable_websocket value is irrelevant to the + // invariant (WS does not bootstrap the instrument registry); use false. 
+ let dob = DobConfig { + group_addr: Ipv4Addr::new(239, 0, 0, 2), + mktdata_port: 0, + refdata_port: 0, + snapshot_port: 0, + bind_addr: Ipv4Addr::LOCALHOST, + channel_id: 1, + source_id: 1, + mtu: 1500, + heartbeat_interval: Duration::from_secs(1), + definition_cycle: Duration::from_secs(1), + manifest_cadence: Duration::from_secs(1), + channel_bound: 1, + snapshot_round_duration: Duration::from_secs(1), + snapshot_mtu: 1500, + }; + + let result = run_websocket_server( + &address, + true, + 1, + None, // multicast_config + Some(dob), // dob_config + IngestMode::Block, + Some(hl_data_root.clone()), + false, + false, // enable_websocket + ) + .await; + + assert!( + result.is_err(), + "DoB-only config (dob_config=Some, multicast_config=None) must be rejected at the run_websocket_server boundary" + ); + + let addr = SocketAddr::from((Ipv4Addr::LOCALHOST, port)); + assert!( + StdListener::bind(addr).is_ok(), + "ws port {port} must remain unbound when the runner rejects a DoB-only config" + ); + + drop(std::fs::remove_dir_all(&hl_data_root)); +} From fcd31f1d8c5f6b16866b166811ba12e80d15ab4f Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 10:41:35 -0400 Subject: [PATCH 37/65] publisher: own the provisional-supersede decision (resolves catch-up leak vs stranded-provisional dilemma) --- server/src/listeners/order_book/mod.rs | 45 ++--- .../order_book/stream_finalization_tests.rs | 86 +++++---- server/src/multicast/publisher.rs | 171 ++++++++++++------ 3 files changed, 194 insertions(+), 108 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 6a1cad08..40231579 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -518,23 +518,27 @@ impl StreamFinalizationMode { } } -/// Whether `emit_tob_snapshot` is taking the authoritative final snapshot for -/// the current dirty range, a provisional backstop snapshot, or a correction -/// that must bypass the 
publisher's staleness gate. +/// Emission semantics for `emit_tob_snapshot`. The listener no longer pre-bakes +/// a publish-vs-suppress boolean: it passes this kind through and the publisher +/// owns the decision and the provisional-supersede guarantee (the publisher is +/// the only component that knows whether a provisional was actually delivered +/// AND applies the freshness gate to the finalization, both with the same +/// clock). #[derive(Copy, Clone, Debug, PartialEq, Eq)] -enum SnapshotEmission { - /// Stuck-stream 250ms backstop. Lag-gated at the publisher. Does NOT - /// close the dirty epoch; marks that a provisional was published so the - /// epoch's finalization knows to emit as `Correction`. +pub(crate) enum SnapshotEmission { + /// Stuck-stream 250ms backstop. The publisher publishes it iff fresh + /// (lag-gated); if it is actually delivered, the publisher remembers that + /// a provisional is pending so the block's finalization is force-published + /// even if it later turns stale. Does NOT close the dirty epoch. Provisional, - /// Normal block finalization / block-mode per-chunk emit. Lag-gated at - /// the publisher (catch-up suppression applies). Closes the stream - /// dirty epoch. + /// Normal block finalization / block-mode per-chunk emit. The publisher + /// publishes it iff fresh OR a provisional is pending delivery (the + /// supersede guarantee: subscribers are never stranded on a delivered + /// provisional that omits the block's final diffs). Closes the stream + /// dirty epoch (listener side). Authoritative, - /// Must reach subscribers regardless of staleness: (a) recovery - /// divergence correction, (b) the finalization that closes a dirty - /// epoch in which a `Provisional` was already published (it supersedes - /// that provisional). Bypasses the publisher freshness gate. + /// Recovery divergence correction. The publisher always publishes it + /// (recovery corrects *incorrect* data, not merely stale data). 
Correction, } @@ -1260,7 +1264,7 @@ impl OrderBookListener { source_local_time_ms, latest_heights, enqueued_at_ms, - bypass_staleness: matches!(emission, SnapshotEmission::Correction), + emission, }); let _unused = tx.send(snapshot_msg); } @@ -2228,12 +2232,11 @@ pub(crate) enum InternalMessage { source_local_time_ms: u64, latest_heights: IngestHeights, enqueued_at_ms: u64, - /// `true` only for `Correction` emissions (recovery divergence correction, - /// or finalization that supersedes a provisional already published this epoch). - /// Bypasses the publisher's staleness gate. `false` for `Provisional` and - /// `Authoritative` emissions, which remain lag-gated (catch-up suppression - /// applies to normal finalization). - bypass_staleness: bool, + /// Emission semantics; the publisher decides publish-vs-suppress and + /// owns the provisional-supersede guarantee. Provisional/Authoritative + /// are lag-gated; Authoritative is force-published when it supersedes a + /// delivered provisional; Correction always publishes. + emission: SnapshotEmission, }, Fills { batch: Batch, diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index 73c2017b..a6b88b5d 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -164,11 +164,11 @@ fn snapshot_height(msg: &InternalMessage) -> u64 { *height } -/// Returns the `bypass_staleness` field of a Snapshot message, or `None` if -/// the message is not a Snapshot. -fn snapshot_bypass(msg: &InternalMessage) -> Option { - let InternalMessage::Snapshot { bypass_staleness, .. } = msg else { return None }; - Some(*bypass_staleness) +/// Returns the `emission` kind of a Snapshot message, or `None` if the message +/// is not a Snapshot. +fn snapshot_emission(msg: &InternalMessage) -> Option { + let InternalMessage::Snapshot { emission, .. 
} = msg else { return None }; + Some(*emission) } // --------------------------------------------------------------------------- @@ -477,21 +477,25 @@ async fn recovery_emits_authoritative_snapshot_without_later_diff() { // --------------------------------------------------------------------------- // Tests 8a/8b/8c (Task 5.9): 3-state emission model correctness. // -// 8a: normal finalization (no prior provisional) → bypass_staleness=false -// (Bug A fix: catch-up suppression preserved for normal finalization) -// 8b: finalization after a provisional → still Authoritative -// (bypass_staleness=false, lag-gated). The listener cannot confirm -// whether the publisher actually delivered the provisional (the gate is -// applied later with a different clock), so finalization never bypasses -// staleness. Only recovery (incorrect-data correction) bypasses. -// 8c: recovery over an unfinalized dirty block does NOT consume or clear -// that block's dirty epoch — the stream block still emits its own -// finalization snapshot (Bug B fix) +// 8a: normal finalization (no prior provisional) → emits `Authoritative` +// (catch-up suppression preserved for normal finalization; the publisher +// lag-gates it) +// 8b: a backstopped block emits `Provisional`, then its finalization emits +// `Authoritative`. The supersede guarantee (a delivered provisional is +// superseded by its block's finalization even when that finalization is +// now stale) lives in the publisher — the listener cannot confirm +// provisional delivery. This listener-level test only verifies the +// emission kinds; the publisher decision-matrix test +// (`snapshot_publish_decision_matrix`) proves the stale-provisional +// supersede behavior. 
+// 8c: recovery over an unfinalized dirty block emits `Correction` and does +// NOT consume or clear that block's dirty epoch — the stream block still +// emits its own finalization snapshot (Bug B fix) // --------------------------------------------------------------------------- /// Task 5.9 Bug A fix: a normal block finalization with no prior provisional -/// must emit as Authoritative (bypass_staleness=false) so catch-up suppression -/// applies to historical/backlog replay. This proves Bug A is fixed. +/// must emit as `Authoritative` so the publisher lag-gates it and catch-up +/// suppression applies to historical/backlog replay. This proves Bug A is fixed. #[tokio::test(flavor = "current_thread")] async fn normal_finalization_snapshot_is_lag_gated_not_bypassed() { let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); @@ -508,19 +512,23 @@ async fn normal_finalization_snapshot_is_lag_gated_not_bypassed() { let block_2: Vec<_> = snapshots.iter().filter(|m| snapshot_height(m) == 2).collect(); assert_eq!(block_2.len(), 1, "exactly one snapshot for height 2; got {}", block_2.len()); - let bypass = snapshot_bypass(block_2[0]).expect("snapshot has bypass_staleness"); - assert!( - !bypass, - "normal finalization (no prior provisional) must have bypass_staleness=false; got true" + let emission = snapshot_emission(block_2[0]).expect("snapshot has emission"); + assert_eq!( + emission, + super::SnapshotEmission::Authoritative, + "normal finalization must emit Authoritative" ); } -/// Finalization that follows a provisional in the same dirty epoch must still -/// emit as Authoritative (bypass_staleness=false, lag-gated). The listener -/// cannot reliably know whether the publisher delivered the provisional (the -/// staleness gate is applied later with a different clock), so any listener-side -/// prediction is unsound and could leak stale backlog quotes. Only recovery -/// (incorrect-data correction) bypasses staleness. 
+/// A backstopped block emits a `Provisional`; its finalization emits an +/// `Authoritative`. The listener cannot reliably know whether the publisher +/// delivered the provisional (the staleness gate is applied later with a +/// different clock), so the supersede guarantee (a delivered provisional is +/// superseded by its block's finalization even when that finalization is now +/// stale) lives in the publisher, keyed off the publisher's own observed +/// delivery. This listener-level test only verifies the emission kinds; the +/// publisher decision-matrix test (`snapshot_publish_decision_matrix`) proves +/// the stale-provisional-supersede behavior. #[tokio::test(flavor = "current_thread")] async fn finalization_after_provisional_is_lag_gated_not_bypassed() { let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); @@ -533,9 +541,10 @@ async fn finalization_after_provisional_is_lag_gated_not_bypassed() { listener.fire_stream_dirty_backstop_ignoring_interval_for_test(); let provisional = drain_snapshots(&mut rx); assert_eq!(provisional.len(), 1, "backstop must emit one provisional; got {}", provisional.len()); - assert!( - !snapshot_bypass(&provisional[0]).unwrap_or(true), - "provisional snapshot must have bypass_staleness=false" + assert_eq!( + snapshot_emission(&provisional[0]).expect("snapshot has emission"), + super::SnapshotEmission::Provisional, + "backstop snapshot must emit Provisional" ); // Block 3: triggers finalization of block 2. 
@@ -546,10 +555,11 @@ async fn finalization_after_provisional_is_lag_gated_not_bypassed() { let block_2_finals: Vec<_> = after_final.iter().filter(|m| snapshot_height(m) == 2).collect(); assert_eq!(block_2_finals.len(), 1, "exactly one finalization snapshot for height 2; got {}", block_2_finals.len()); - let bypass = snapshot_bypass(block_2_finals[0]).expect("snapshot has bypass_staleness"); - assert!( - !bypass, - "finalization after provisional must be lag-gated Authoritative (bypass_staleness=false); listener cannot confirm provisional delivery so only recovery bypasses" + let emission = snapshot_emission(block_2_finals[0]).expect("snapshot has emission"); + assert_eq!( + emission, + super::SnapshotEmission::Authoritative, + "finalization after provisional must emit Authoritative; the publisher owns the supersede decision (listener cannot confirm provisional delivery)" ); } @@ -613,8 +623,12 @@ async fn recovery_does_not_consume_unfinalized_stream_dirty_block() { "recovery must emit exactly one Correction snapshot; got {}", recovery_snapshots.len() ); - let recovery_bypass = snapshot_bypass(&recovery_snapshots[0]).expect("has bypass_staleness"); - assert!(recovery_bypass, "recovery snapshot must have bypass_staleness=true (Correction)"); + let recovery_emission = snapshot_emission(&recovery_snapshots[0]).expect("has emission"); + assert_eq!( + recovery_emission, + super::SnapshotEmission::Correction, + "recovery snapshot must emit Correction" + ); // Now finalize block 2 by feeding block 3. 
// Bug B: before the fix, recovery would have cleared book_dirty so the diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index 81e25c95..56422c8e 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -9,6 +9,7 @@ use log::{info, warn}; use tokio::net::UdpSocket; use crate::instruments::{InstrumentInfo, SharedRegistry, price_to_fixed, qty_to_fixed}; +use crate::listeners::order_book::SnapshotEmission; use crate::multicast::config::MulticastConfig; use crate::protocol::constants::{ AGGRESSOR_BUY, AGGRESSOR_SELL, ASSET_CLASS_CRYPTO_SPOT, CHANNEL_RESET_SIZE, END_OF_SESSION_SIZE, FLAG_SNAPSHOT, @@ -519,21 +520,34 @@ impl MulticastPublisher { lag_ms <= Self::CATCHUP_THRESHOLD_MS } - /// Decides whether to publish a snapshot message given production (real) - /// lag gating logic. Used by tests to assert the decision-matrix logic - /// directly without the test-mode lag override. The publisher loop uses - /// `bypass_staleness || should_publish_lag(lag_ms)` so that in test builds - /// (where `should_publish_lag` always returns `true`) fixture replays with - /// historical block times are not suppressed — matching the fills path. + /// Pure decision-matrix model of the publisher's snapshot publish-vs-suppress + /// + provisional-supersede logic, used by the decision-matrix test to assert + /// it directly without the test-mode lag override. The publisher loop uses + /// `should_publish_lag(lag_ms)` for the freshness gate so that in test builds + /// (where it always returns `true`) fixture replays with historical block + /// times are not suppressed — matching the fills path. /// - /// Only `Correction` emissions (recovery divergence correction) bypass the - /// staleness gate (`bypass_staleness = true`). 
Both `Provisional` - /// (stuck-stream 250ms backstop) and `Authoritative` (block finalization) - /// emissions keep the existing freshness gate so that catch-up suppression - /// still applies to historical/backlog replays. + /// Semantics (mirrors the loop): + /// - `Correction`: always published (recovery corrects *incorrect*, not + /// merely stale, data). + /// - `Provisional`: published only when fresh (lag within threshold), so + /// catch-up suppression is preserved. + /// - `Authoritative`: published when fresh, OR when `pending_provisional` + /// is set (a provisional was delivered and must be superseded by its + /// block's finalization even if that finalization is now stale). #[cfg(test)] - pub(crate) const fn snapshot_should_publish(lag_ms: u64, bypass_staleness: bool) -> bool { - bypass_staleness || lag_ms <= Self::CATCHUP_THRESHOLD_MS + pub(crate) const fn snapshot_should_publish( + emission: SnapshotEmission, + lag_ms: u64, + pending_provisional: bool, + ) -> bool { + match emission { + SnapshotEmission::Correction => true, + SnapshotEmission::Provisional => lag_ms <= Self::CATCHUP_THRESHOLD_MS, + SnapshotEmission::Authoritative => { + lag_ms <= Self::CATCHUP_THRESHOLD_MS || pending_provisional + } + } } fn should_warn_for_receiver_lag(lag_ms: u64) -> bool { @@ -974,6 +988,12 @@ impl MulticastPublisher { let mut cached_snapshot: Option> = None; let mut had_activity = false; let mut caught_up = false; + // Set true when a `Provisional` snapshot was actually published to + // subscribers. While true, the next `Authoritative` is force-published + // even if it is now stale, so subscribers are never stranded on a + // delivered provisional that omits the block's final diffs. Cleared + // when an Authoritative or Correction is published. 
+ let mut pending_provisional = false; let mut health = TobPublisherHealth::new(Self::now_ms()); let mut fill_pairs = FillPairAccumulator::new(Self::now_ms()); @@ -994,7 +1014,7 @@ impl MulticastPublisher { source_local_time_ms, latest_heights, enqueued_at_ms, - bypass_staleness, + emission, } => { cached_snapshot = Some(msg.clone()); let now_ms = Self::now_ms(); @@ -1010,16 +1030,33 @@ impl MulticastPublisher { Duration::from_millis(listener_to_publisher_ms), ); let lag_ms = now_ms.saturating_sub(*time); - // Correction snapshots (recovery divergence correction, or - // finalization superseding a provisional) bypass the staleness - // gate. Provisional and normal Authoritative snapshots remain - // lag-gated (catch-up suppression preserved — Bug A fix). - // `should_publish_lag` is used for the lag check so that in - // test builds (where it always returns true) fixture replays - // with historical block times are not suppressed — same - // pattern as the fills path. `snapshot_should_publish` is - // kept as a pure function used by the decision-matrix test. - if *bypass_staleness || Self::should_publish_lag(lag_ms) { + // Publish-vs-suppress + provisional-supersede is + // owned here (the publisher is the only component + // that knows whether a provisional was actually + // delivered AND applies the freshness gate to the + // finalization, both with the same clock): + // Correction -> always (recovery corrects + // incorrect, not merely stale, + // data). + // Provisional -> only when fresh (catch-up + // suppression preserved); if + // delivered, a provisional is + // now pending. + // Authoritative -> when fresh, OR when a + // provisional is pending + // delivery (force-publish so + // subscribers are not stranded + // on the stale provisional). + // `should_publish_lag` is the freshness gate + // (cfg(test) => always true so fixture replays are + // not suppressed — same pattern as the fills path). 
+ let fresh = Self::should_publish_lag(lag_ms); + let publish = match emission { + SnapshotEmission::Correction => true, + SnapshotEmission::Provisional => fresh, + SnapshotEmission::Authoritative => fresh || pending_provisional, + }; + if publish { health.observe_publishable_lag(lag_ms); crate::metrics::observe_tob_source_lag( "snapshot", @@ -1041,6 +1078,7 @@ impl MulticastPublisher { had_activity = true; heartbeat_interval.reset(); } + pending_provisional = matches!(emission, SnapshotEmission::Provisional); } else { caught_up = false; crate::metrics::observe_tob_source_lag( @@ -1868,47 +1906,78 @@ mod tests { assert_eq!(report.receiver_lagged_messages, 19); } - /// Decision-matrix test for `snapshot_should_publish`. + /// Decision-matrix test for `snapshot_should_publish` under the + /// publisher-authority supersede model. /// /// Together with /// `stream_finalization_tests::finalization_after_provisional_is_lag_gated_not_bypassed` - /// (which proves finalization stays lag-gated Authoritative even after a - /// provisional, since the listener cannot confirm provisional delivery) this - /// constitutes end-to-end proof: only recovery Corrections bypass staleness, - /// while both provisional and finalization emissions stay lag-gated - /// (catch-up suppression preserved). + /// (which proves the listener emits `Provisional` then `Authoritative` for a + /// backstopped-then-finalized block, leaving the supersede decision to the + /// publisher) this constitutes end-to-end proof: recovery `Correction` + /// always publishes; `Provisional`/`Authoritative` stay lag-gated (catch-up + /// suppression preserved); a delivered provisional is always superseded by + /// its block's `Authoritative` finalization even when that finalization is + /// now stale (`pending_provisional` force-publishes it). 
#[test] - fn correction_snapshot_bypasses_staleness_suppression() { + fn snapshot_publish_decision_matrix() { let thr = MulticastPublisher::CATCHUP_THRESHOLD_MS; - // non-bypass (Provisional/Authoritative), stale → suppressed - assert!( - !MulticastPublisher::snapshot_should_publish(thr + 1, false), - "provisional/authoritative stale snapshot must be suppressed" - ); + // Correction: always published, regardless of lag or pending state. + for pending in [false, true] { + assert!( + MulticastPublisher::snapshot_should_publish(SnapshotEmission::Correction, 0, pending), + "correction at lag 0 must publish (pending={pending})" + ); + assert!( + MulticastPublisher::snapshot_should_publish(SnapshotEmission::Correction, thr, pending), + "correction at threshold must publish (pending={pending})" + ); + assert!( + MulticastPublisher::snapshot_should_publish( + SnapshotEmission::Correction, + thr + 10_000, + pending + ), + "arbitrarily stale correction must publish (pending={pending})" + ); + } - // correction (bypass_staleness=true), stale → published - assert!( - MulticastPublisher::snapshot_should_publish(thr + 1, true), - "correction stale snapshot must bypass suppression" - ); + // Provisional: lag-gated, independent of pending state. + for pending in [false, true] { + assert!( + MulticastPublisher::snapshot_should_publish(SnapshotEmission::Provisional, thr, pending), + "provisional at threshold must publish (pending={pending})" + ); + assert!( + !MulticastPublisher::snapshot_should_publish( + SnapshotEmission::Provisional, + thr + 1, + pending + ), + "provisional past threshold must be suppressed (pending={pending})" + ); + } - // non-bypass, fresh → published (normal operating case) + // Authoritative, no pending provisional: plain lag gate. 
assert!( - MulticastPublisher::snapshot_should_publish(0, false), - "non-bypass fresh snapshot must be published" + MulticastPublisher::snapshot_should_publish(SnapshotEmission::Authoritative, thr, false), + "authoritative at threshold must publish when no provisional pending" ); - - // non-bypass, exactly at threshold → published (boundary inclusive, matches <=) assert!( - MulticastPublisher::snapshot_should_publish(thr, false), - "non-bypass at-threshold snapshot must be published" + !MulticastPublisher::snapshot_should_publish(SnapshotEmission::Authoritative, thr + 1, false), + "authoritative past threshold must be suppressed when no provisional pending" ); - // correction, arbitrarily stale → still published + // Authoritative, provisional pending: supersede guarantee — published + // even arbitrarily stale (Codex's "provisional delivered, status + // arrives after the freshness threshold, final BBO still delivered"). assert!( - MulticastPublisher::snapshot_should_publish(thr + 10_000, true), - "arbitrarily stale correction snapshot must be published" + MulticastPublisher::snapshot_should_publish( + SnapshotEmission::Authoritative, + thr + 10_000, + true + ), + "stale authoritative must publish to supersede a delivered provisional" ); } } From 99ec7b9e38bc4b1169beddb58e9909cd20dae226 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 10:58:12 -0400 Subject: [PATCH 38/65] backstop: trigger on source age not local clock; tie supersede flag to actual send --- server/src/listeners/order_book/mod.rs | 70 +++++---- server/src/listeners/order_book/state.rs | 21 +-- .../order_book/stream_finalization_tests.rs | 146 +++++++++++++----- server/src/multicast/publisher.rs | 24 ++- 4 files changed, 179 insertions(+), 82 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 40231579..25190bb1 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -68,14 +68,23 
@@ const STREAM_REORDER_FALLBACK_WINDOW: Duration = Duration::from_secs(5); #[cfg(test)] const STREAM_REORDER_FALLBACK_WINDOW: Duration = Duration::from_millis(50); -/// Stuck-stream backstop interval. Half of -/// `MulticastPublisher::CATCHUP_THRESHOLD_MS` (500ms) so a provisional -/// emission always fires before a dirty BBO ages past the TOB freshness -/// suppression cutoff. Backstop only runs when block finalization is -/// delayed; in healthy streaming, finalization emits first and this never +/// Stuck-stream backstop interval, measured in *source time* (HL block time) +/// consistent with the multicast publisher's freshness gate (which keys off +/// `now_ms() - snapshot.time`, the source block time — not local elapsed +/// time). Half of `MulticastPublisher::CATCHUP_THRESHOLD_MS` (500ms) so a +/// provisional emission always fires before a dirty BBO ages past the TOB +/// freshness suppression cutoff. Backstop only runs when block finalization +/// is delayed; in healthy streaming, finalization emits first and this never /// fires. const STREAM_DIRTY_BACKSTOP_INTERVAL: Duration = Duration::from_millis(250); +// The invariant still holds and now reads in source-time terms: a provisional +// is emitted once the dirtying event's source age reaches +// STREAM_DIRTY_BACKSTOP_INTERVAL (>=250ms). Because both the backstop trigger +// and the publisher's freshness gate measure source age on the same basis, a +// provisional emitted at source-age >=250ms still has >=250ms of headroom +// before the 500ms (CATCHUP_THRESHOLD_MS) gate — that margin absorbs queue / +// scheduler delay between the listener emit and the publisher's gate check. 
const _BACKSTOP_VS_FRESHNESS_INVARIANT: () = assert!( STREAM_DIRTY_BACKSTOP_INTERVAL.as_millis() * 2 <= crate::multicast::publisher::MulticastPublisher::CATCHUP_THRESHOLD_MS as u128, @@ -1125,27 +1134,34 @@ impl OrderBookListener { self.emit_authoritative_block_snapshot(&block); } - /// Stuck-stream backstop: if streaming + WS-disabled and the book has been - /// dirty for >= STREAM_DIRTY_BACKSTOP_INTERVAL without a finalization - /// emission, emit a provisional snapshot (does NOT clear book_dirty, so the - /// eventual authoritative finalization snapshot still fires). Shared by the - /// hl_listen backstop ticker arm and the test helper so the two cannot drift. + /// Stuck-stream backstop: if streaming + WS-disabled and the dirtying + /// event's SOURCE age (`now_ms() - dirty_block_time_ms`) has reached + /// STREAM_DIRTY_BACKSTOP_INTERVAL without a finalization emission, emit a + /// provisional snapshot (does NOT clear book_dirty, so the eventual + /// authoritative finalization snapshot still fires). The gate is measured + /// in source time (not local elapsed) so it matches the multicast + /// publisher's freshness gate. Shared by the hl_listen backstop ticker arm + /// and the test helper so the two cannot drift. fn try_emit_stuck_stream_backstop(&mut self) -> bool { self.try_emit_stuck_stream_backstop_inner(true) } /// Inner implementation. `enforce_interval = true` in all production paths - /// (ticker arm, existing test helper). `enforce_interval = false` is exposed - /// only via the `#[cfg(test)]` helper below, allowing tests to bypass the - /// elapsed-time gate without affecting any other backstop condition. + /// (ticker arm, existing test helper) gates emission on the dirtying + /// event's source age reaching STREAM_DIRTY_BACKSTOP_INTERVAL. + /// `enforce_interval = false` is exposed only via the `#[cfg(test)]` + /// helper below, allowing tests to bypass the source-age gate without + /// affecting any other backstop condition. 
fn try_emit_stuck_stream_backstop_inner(&mut self, enforce_interval: bool) -> bool { let should_emit = self.ingest_mode == IngestMode::Stream && !self.enable_websocket && self.order_book_state.as_ref().is_some_and(|s| { s.book_dirty() && (!enforce_interval - || s.book_dirty_since() - .is_some_and(|since| since.elapsed() >= STREAM_DIRTY_BACKSTOP_INTERVAL)) + || s.dirty_source_times().is_some_and(|(bt, _)| { + now_ms().saturating_sub(bt) + >= STREAM_DIRTY_BACKSTOP_INTERVAL.as_millis() as u64 + })) }); if should_emit && let Some((bt, lt)) = @@ -1158,9 +1174,6 @@ impl OrderBookListener { SnapshotEmission::Provisional, false, // prevent_future_snaps=false: later finalization must still emit ); - if let Some(state) = self.order_book_state.as_mut() { - state.bump_dirty_since_after_provisional(); - } true } else { false @@ -1575,7 +1588,7 @@ impl OrderBookListener { // so multicast subscribers see the corrected BBO without waiting for an // unrelated future diff or block finalization (L2-5 + recovery // interaction, Bug B fix). Correction always bypasses the publisher - // staleness gate. Recovery NEVER touches book_dirty / book_dirty_since / + // staleness gate. Recovery NEVER touches book_dirty / // dirty_source_times — the stream dirty epoch for any concurrent // unfinalized block is preserved intact. // @@ -1712,21 +1725,22 @@ impl OrderBookListener { /// Test-only equivalent of the production `backstop_ticker.tick()` arm in /// `hl_listen`. Fires a provisional TOB snapshot if the book is dirty and - /// `book_dirty_since` has aged past `STREAM_DIRTY_BACKSTOP_INTERVAL`. - /// The elapsed-time gate is ENFORCED (enforce_interval=true), so calling - /// this immediately after a dirty block will NOT emit — exercising the real - /// gate. + /// the dirtying event's SOURCE age (`now_ms() - dirty_block_time_ms`) has + /// reached `STREAM_DIRTY_BACKSTOP_INTERVAL`. 
The source-age gate is + /// ENFORCED (enforce_interval=true), so calling this while the dirty + /// epoch's source time is still fresh (source-age < 250ms) will NOT emit — + /// exercising the real gate. #[cfg(test)] pub(crate) fn fire_stream_dirty_backstop_for_test(&mut self) { self.try_emit_stuck_stream_backstop(); } - /// Test-only backstop variant that bypasses ONLY the elapsed-time gate - /// (`book_dirty_since().elapsed() >= STREAM_DIRTY_BACKSTOP_INTERVAL`). + /// Test-only backstop variant that bypasses ONLY the source-age gate + /// (`now_ms() - dirty_block_time_ms >= STREAM_DIRTY_BACKSTOP_INTERVAL`). /// All other conditions are real: streaming mode, WS disabled, book_dirty, - /// dirty_source_times present, emits `SnapshotEmission::Provisional`, calls - /// `bump_dirty_since_after_provisional`. Use this to exercise the provisional - /// emit path deterministically without a real sleep. + /// dirty_source_times present, emits `SnapshotEmission::Provisional`. Use + /// this to exercise the provisional emit path deterministically regardless + /// of the dirty epoch's source age and without a real sleep. #[cfg(test)] pub(crate) fn fire_stream_dirty_backstop_ignoring_interval_for_test(&mut self) { self.try_emit_stuck_stream_backstop_inner(false); diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index 0579d004..a6c8d090 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -32,9 +32,6 @@ pub(super) struct OrderBookState { /// reliable times without depending on the take-and-clear /// `OrderBookListener::last_batch_times()`. dirty_source_times: Option<(u64, u64)>, - /// `Instant` at which the book first became dirty in the current dirty - /// epoch. Used by the backstop to gate emission cadence. - book_dirty_since: Option, /// Present when the DoB emitter is wired in. 
The tap is NOT propagated to /// the cloned copy used for snapshot validation (validation reads only; no /// events should be emitted from it). @@ -69,7 +66,6 @@ impl Clone for OrderBookState { enable_websocket: self.enable_websocket, book_dirty: self.book_dirty, dirty_source_times: self.dirty_source_times, - book_dirty_since: self.book_dirty_since, // The tap is intentionally not cloned: the clone is used only for // snapshot validation and must not emit DoB events. dob_tap: None, @@ -95,7 +91,6 @@ impl OrderBookState { snapped: false, book_dirty: false, dirty_source_times: None, - book_dirty_since: None, dob_tap: None, } } @@ -121,31 +116,23 @@ impl OrderBookState { self.book_dirty } - pub(super) const fn book_dirty_since(&self) -> Option { - self.book_dirty_since - } - pub(super) const fn dirty_source_times(&self) -> Option<(u64, u64)> { self.dirty_source_times } + /// Marks the book dirty and, on the first transition of the current dirty + /// epoch, records the source `(block_time_ms, local_time_ms)`. The + /// stuck-stream backstop gates emission on the source age of this + /// `block_time_ms` (no local `Instant` is stamped). 
pub(super) fn mark_dirty_with_times(&mut self, block_time_ms: u64, local_time_ms: u64) { if !self.book_dirty { - self.book_dirty_since = Some(std::time::Instant::now()); self.dirty_source_times = Some((block_time_ms, local_time_ms)); } self.book_dirty = true; } - pub(super) fn bump_dirty_since_after_provisional(&mut self) { - if self.book_dirty { - self.book_dirty_since = Some(std::time::Instant::now()); - } - } - pub(super) const fn clear_book_dirty(&mut self) { self.book_dirty = false; - self.book_dirty_since = None; self.dirty_source_times = None; } diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index a6b88b5d..84a887a8 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -10,8 +10,9 @@ //! (1) One authoritative snapshot per finalized BBO-changing block. //! (2) Clean (no-mutation) blocks emit nothing. //! (3) Finalized blocks emit in height order. -//! (4) Backstop fires only after the 250ms dirty interval, emits Provisional, -//! does NOT suppress the later authoritative finalization snapshot. +//! (4) Backstop fires only once the dirtying event's source age reaches the +//! 250ms interval, emits Provisional, does NOT suppress the later +//! authoritative finalization snapshot. //! (5) Recovery emits an authoritative snapshot with the repaired BBO //! immediately. @@ -263,48 +264,123 @@ async fn finalized_blocks_emit_snapshots_in_order() { } // --------------------------------------------------------------------------- -// Tests 4 + C1b (merged): Stuck-stream backstop — interval gate + emit path. +// Tests 4 + C1b (merged): Stuck-stream backstop — SOURCE-age gate + emit path. // -// Two-call design (Option C / Task 5.5b): -// 1. `fire_stream_dirty_backstop_for_test()` — enforces the real 250ms gate. -// Called immediately after the dirty block (~0ms elapsed), it must NOT -// emit. 
This assertion is load-bearing: it proves the interval gate is -// genuinely active on the production path. -// 2. `fire_stream_dirty_backstop_ignoring_interval_for_test()` — bypasses -// ONLY the elapsed-time comparison; all other conditions (streaming, -// !ws, book_dirty, dirty_source_times) remain real. This exercises the -// provisional emit path deterministically without any real sleep. +// The backstop now gates on the dirtying event's SOURCE age +// (`now_ms() - dirty_block_time_ms`) reaching `STREAM_DIRTY_BACKSTOP_INTERVAL` +// (250ms), measured on the same basis as the multicast publisher's freshness +// gate — NOT on local elapsed wall time. The test proves the source-age gate +// deterministically with block times taken relative to *now* (no sleeps, no +// mock clock): +// - Case A: a dirty epoch whose source-age is still < 250ms must NOT emit +// under the enforced gate (`fire_stream_dirty_backstop_for_test`). This +// assertion is load-bearing: it proves the source-age gate genuinely +// blocks a still-fresh dirty epoch on the production path. +// - Case B: a dirty epoch whose source-age has reached >= 250ms must emit +// exactly one Provisional snapshot under the enforced gate. +// - Case C: `fire_stream_dirty_backstop_ignoring_interval_for_test()` +// bypasses ONLY the source-age comparison; all other conditions +// (streaming, !ws, book_dirty, dirty_source_times) remain real, so it +// force-emits regardless of source age. // --------------------------------------------------------------------------- +/// Current epoch millis on the SAME basis as the listener's `now_ms()` +/// (`SystemTime::now().duration_since(UNIX_EPOCH).as_millis() as u64`), so the +/// source-age gate can be exercised deterministically with block times taken +/// relative to "now" — no sleeps, no mock clock. 
+fn now_ms_test() -> u64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0) +} + /// L2-5 contract (4): when no later block arrives, the backstop hook -/// (which production fires on a 5s ticker) emits one Provisional snapshot -/// for the dirty block AFTER the 250ms interval elapses — but NOT before. +/// (which production fires on a 250ms ticker) emits one Provisional snapshot +/// for the dirty block once the dirtying event's SOURCE age has reached the +/// 250ms backstop interval — but NOT before. /// -/// Also covers C1b (deferred Task 5.4 Step 7): `fire_stream_dirty_backstop_for_test` -/// gates on the real 250ms elapsed-time interval and does not emit early. +/// Also covers C1b: `fire_stream_dirty_backstop_for_test` gates on the real +/// source-age interval and does not emit while the dirty epoch is still fresh. #[tokio::test(flavor = "current_thread")] async fn stuck_stream_backstop_emits_dirty_snapshot() { - let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); - - let (s2, d2) = add_event(1_700_000_002_000, Side::Bid, 101, "100", "5"); - feed_block(&mut listener, 2, 1_700_000_002_000, vec![s2], vec![d2]); + // Case A (gate active — must NOT emit): the dirtying block's source time is + // only ~100ms old (< 250ms), so the enforced source-age gate must block. 
+ { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + let fresh_block_time = now_ms_test() - 100; + let (s2, d2) = add_event(fresh_block_time, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, fresh_block_time, vec![s2], vec![d2]); + + listener.fire_stream_dirty_backstop_for_test(); + let before_interval = drain_snapshots(&mut rx); + assert!( + before_interval.is_empty(), + "backstop must NOT fire while the dirty epoch's source-age (~100ms) is below the 250ms interval; got {} snapshot(s)", + before_interval.len(), + ); + } - // Real gate: firing immediately (~0ms elapsed) must NOT emit — proves the - // interval check is active on the production-path helper. - listener.fire_stream_dirty_backstop_for_test(); - let before_interval = drain_snapshots(&mut rx); - assert!( - before_interval.is_empty(), - "backstop must NOT fire before the 250ms dirty interval; got {} snapshot(s)", - before_interval.len(), - ); + // Case B (gate satisfied — must emit exactly one Provisional): the dirtying + // block's source time is ~300ms old (>= 250ms), so the enforced source-age + // gate is met and the backstop emits one Provisional snapshot. 
+ { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + let aged_block_time = now_ms_test() - 300; + let (s2, d2) = add_event(aged_block_time, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, aged_block_time, vec![s2], vec![d2]); + + listener.fire_stream_dirty_backstop_for_test(); + let emitted = drain_snapshots(&mut rx); + assert_eq!( + emitted.len(), + 1, + "backstop must emit exactly one snapshot once source-age (~300ms) reaches the 250ms interval; got {}", + emitted.len(), + ); + assert_eq!( + snapshot_emission(&emitted[0]).expect("snapshot has emission"), + super::SnapshotEmission::Provisional, + "backstop snapshot must emit Provisional", + ); + assert_eq!( + snapshot_height(&emitted[0]), + 2, + "backstop snapshot must carry the last applied height (2)", + ); + } - // Bypass only the elapsed check (all other conditions remain real); the - // backstop must now emit exactly one Provisional snapshot. - listener.fire_stream_dirty_backstop_ignoring_interval_for_test(); - let after_interval = drain_snapshots(&mut rx); - assert_eq!(after_interval.len(), 1, "backstop must emit exactly one snapshot after interval; got {}", after_interval.len()); - assert_eq!(snapshot_height(&after_interval[0]), 2, "backstop snapshot must carry the last applied height (2)"); + // Case C (bypass): the interval-ignoring helper force-emits regardless of + // source age — here a still-fresh (~50ms) dirty epoch still emits one + // Provisional, proving the bypass seam is intact. 
+ { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + let fresh_block_time = now_ms_test() - 50; + let (s2, d2) = add_event(fresh_block_time, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, fresh_block_time, vec![s2], vec![d2]); + + listener.fire_stream_dirty_backstop_ignoring_interval_for_test(); + let emitted = drain_snapshots(&mut rx); + assert_eq!( + emitted.len(), + 1, + "interval-bypassing backstop must force-emit one snapshot regardless of source age; got {}", + emitted.len(), + ); + assert_eq!( + snapshot_emission(&emitted[0]).expect("snapshot has emission"), + super::SnapshotEmission::Provisional, + "bypassed backstop snapshot must emit Provisional", + ); + assert_eq!( + snapshot_height(&emitted[0]), + 2, + "bypassed backstop snapshot must carry the last applied height (2)", + ); + } } // --------------------------------------------------------------------------- diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index 56422c8e..eed8749b 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -1074,11 +1074,31 @@ impl MulticastPublisher { caught_up = true; } let snapshot_map = l2_snapshots.as_ref(); - if self.publish_quotes(snapshot_map, *time, false).await { + let sent = self.publish_quotes(snapshot_map, *time, false).await; + if sent { had_activity = true; heartbeat_interval.reset(); } - pending_provisional = matches!(emission, SnapshotEmission::Provisional); + // Drive the supersede flag from ACTUAL delivery, not the + // publish decision: a provisional only "pends" if it sent + // at least one quote; a superseding Authoritative/Correction + // only clears the pending state if it actually sent. If a + // supersede send fails while a provisional is pending, the + // flag stays set so a later snapshot still forces it through + // (subscribers are never stranded on the provisional). 
+ match emission { + SnapshotEmission::Provisional => { + if sent { + pending_provisional = true; + } + } + SnapshotEmission::Authoritative + | SnapshotEmission::Correction => { + if sent { + pending_provisional = false; + } + } + } } else { caught_up = false; crate::metrics::observe_tob_source_lag( From 64e90759f05cd631238c6d8acea6d8006ea8d079 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 11:48:14 -0400 Subject: [PATCH 39/65] stream cpu: skip l4 fanout when websocket disabled; one-shot backstop per epoch --- .../order_book/block_mode_multicast_e2e.rs | 5 ++ server/src/listeners/order_book/mod.rs | 27 ++++++++ server/src/listeners/order_book/state.rs | 16 +++++ .../order_book/stream_finalization_tests.rs | 69 +++++++++++++++++++ 4 files changed, 117 insertions(+) diff --git a/server/src/listeners/order_book/block_mode_multicast_e2e.rs b/server/src/listeners/order_book/block_mode_multicast_e2e.rs index 1db582c0..bd0faf1f 100644 --- a/server/src/listeners/order_book/block_mode_multicast_e2e.rs +++ b/server/src/listeners/order_book/block_mode_multicast_e2e.rs @@ -303,6 +303,11 @@ async fn streaming_l4_flood_does_not_starve_tob_marketdata() { let mut listener = OrderBookListener::new_with_ingest_mode(Some(market_tx.clone()), true, IngestMode::Stream); listener.set_l4_message_tx(l4_tx); + // L4 fanout only occurs when WebSocket is enabled (the dedicated L4 channel + // feeds WS subscribers; multicast ignores L4BookUpdates and multicast-only + // mode skips L4 fanout entirely for CPU). This starvation guard is therefore + // only meaningful in the WS-enabled config. 
+ listener.set_enable_websocket(true); listener.init_from_snapshot(snapshot, snapshot_height); let listener = Arc::new(Mutex::new(listener)); diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 25190bb1..99d41b37 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -1142,6 +1142,14 @@ impl OrderBookListener { /// in source time (not local elapsed) so it matches the multicast /// publisher's freshness gate. Shared by the hl_listen backstop ticker arm /// and the test helper so the two cannot drift. + /// + /// Emits AT MOST ONE Provisional per dirty epoch (one-shot): the per-epoch + /// `provisional_emitted_this_epoch` flag suppresses every later tick within + /// the same stuck block, so a multi-second stall no longer re-runs the full + /// TOB recompute on every 250ms ticker fire. The flag is cleared at + /// finalization by `clear_book_dirty` (owned by + /// `emit_authoritative_block_snapshot`), so the next dirty epoch starts + /// fresh and the authoritative finalization snapshot is unaffected. fn try_emit_stuck_stream_backstop(&mut self) -> bool { self.try_emit_stuck_stream_backstop_inner(true) } @@ -1152,11 +1160,18 @@ impl OrderBookListener { /// `enforce_interval = false` is exposed only via the `#[cfg(test)]` /// helper below, allowing tests to bypass the source-age gate without /// affecting any other backstop condition. + /// + /// One-shot per dirty epoch: regardless of `enforce_interval`, emission is + /// gated on `!provisional_emitted_this_epoch()` and sets that flag after + /// emitting, so at most one Provisional fires per dirty block. The flag is + /// cleared only at finalization (`clear_book_dirty`), leaving the + /// authoritative finalization snapshot unaffected. 
fn try_emit_stuck_stream_backstop_inner(&mut self, enforce_interval: bool) -> bool { let should_emit = self.ingest_mode == IngestMode::Stream && !self.enable_websocket && self.order_book_state.as_ref().is_some_and(|s| { s.book_dirty() + && !s.provisional_emitted_this_epoch() && (!enforce_interval || s.dirty_source_times().is_some_and(|(bt, _)| { now_ms().saturating_sub(bt) @@ -1174,6 +1189,9 @@ impl OrderBookListener { SnapshotEmission::Provisional, false, // prevent_future_snaps=false: later finalization must still emit ); + if let Some(state) = self.order_book_state.as_mut() { + state.mark_provisional_emitted(); + } true } else { false @@ -1502,6 +1520,15 @@ impl OrderBookListener { } fn publish_l4_update(&self, diff_batch: Batch, status_batch: Batch) { + // L4 book updates are consumed only by WebSocket subscribers; multicast + // ignores InternalMessage::L4BookUpdates. In multicast-only mode + // (enable_websocket=false, the default) skip the per-diff clone + task + // spawn + broadcast entirely — it is pure CPU/alloc waste with no + // consumer. (`l4_message_tx()` would otherwise fall back to the market + // channel and broadcast anyway.) + if !self.enable_websocket { + return; + } if let Some(tx) = self.l4_message_tx() { let tx = tx.clone(); tokio::spawn(async move { diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index a6c8d090..19320909 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -32,6 +32,11 @@ pub(super) struct OrderBookState { /// reliable times without depending on the take-and-clear /// `OrderBookListener::last_batch_times()`. dirty_source_times: Option<(u64, u64)>, + /// True once the stuck-stream backstop has emitted its single Provisional + /// snapshot for the current dirty epoch. Makes the backstop one-shot per + /// epoch (no per-tick TOB recompute during a stall). Reset by + /// `clear_book_dirty` at epoch close. 
+ provisional_emitted_this_epoch: bool, /// Present when the DoB emitter is wired in. The tap is NOT propagated to /// the cloned copy used for snapshot validation (validation reads only; no /// events should be emitted from it). @@ -66,6 +71,7 @@ impl Clone for OrderBookState { enable_websocket: self.enable_websocket, book_dirty: self.book_dirty, dirty_source_times: self.dirty_source_times, + provisional_emitted_this_epoch: self.provisional_emitted_this_epoch, // The tap is intentionally not cloned: the clone is used only for // snapshot validation and must not emit DoB events. dob_tap: None, @@ -91,6 +97,7 @@ impl OrderBookState { snapped: false, book_dirty: false, dirty_source_times: None, + provisional_emitted_this_epoch: false, dob_tap: None, } } @@ -120,6 +127,14 @@ impl OrderBookState { self.dirty_source_times } + pub(super) const fn provisional_emitted_this_epoch(&self) -> bool { + self.provisional_emitted_this_epoch + } + + pub(super) fn mark_provisional_emitted(&mut self) { + self.provisional_emitted_this_epoch = true; + } + /// Marks the book dirty and, on the first transition of the current dirty /// epoch, records the source `(block_time_ms, local_time_ms)`. 
The /// stuck-stream backstop gates emission on the source age of this @@ -134,6 +149,7 @@ impl OrderBookState { pub(super) const fn clear_book_dirty(&mut self) { self.book_dirty = false; self.dirty_source_times = None; + self.provisional_emitted_this_epoch = false; } // forcibly take snapshot - (time, height, snapshot) diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index 84a887a8..1f44c86c 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -383,6 +383,75 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { } } +// --------------------------------------------------------------------------- +// Test 4b: the stuck-stream backstop is ONE-SHOT per dirty epoch. +// +// During a multi-second stall the production 250ms ticker re-runs the backstop +// many times for the same stuck block. The backstop must emit AT MOST ONE +// Provisional snapshot per dirty epoch: the per-epoch +// `provisional_emitted_this_epoch` flag suppresses every later tick within the +// same stuck block (no repeated full TOB recompute / quote spam). The flag is +// cleared only at finalization by `clear_book_dirty`, so the authoritative +// finalization snapshot for that block is unaffected and the next dirty epoch +// starts fresh. +// --------------------------------------------------------------------------- + +/// L2-5 contract (4) one-shot guard: under the enforced source-age gate, two +/// consecutive backstop ticks within the same dirty epoch emit exactly ONE +/// Provisional snapshot (the second tick is a no-op). Finalizing the block +/// afterwards still emits the Authoritative snapshot — proving the one-shot +/// guard does not suppress finalization (the epoch's `clear_book_dirty` reset +/// the flag). 
+#[tokio::test(flavor = "current_thread")] +async fn backstop_is_one_shot_per_dirty_epoch() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Dirty block 2 whose source-age is ~300ms (>= 250ms), so the enforced + // source-age gate is satisfied. The same relative time is passed to both + // `add_event` and `feed_block`. + let aged_block_time = now_ms_test() - 300; + let (s2, d2) = add_event(aged_block_time, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, aged_block_time, vec![s2], vec![d2]); + + // Fire the enforced backstop TWICE in the same dirty epoch. + listener.fire_stream_dirty_backstop_for_test(); + listener.fire_stream_dirty_backstop_for_test(); + + let emitted = drain_snapshots(&mut rx); + assert_eq!( + emitted.len(), + 1, + "backstop must emit exactly one Provisional per dirty epoch (the second tick is a no-op); got {}", + emitted.len(), + ); + assert_eq!( + snapshot_emission(&emitted[0]).expect("snapshot has emission"), + super::SnapshotEmission::Provisional, + "the single backstop snapshot must be Provisional", + ); + assert_eq!(snapshot_height(&emitted[0]), 2, "backstop snapshot must carry height 2"); + + // Block 3 finalizes block 2. The one-shot guard must NOT suppress the + // authoritative finalization snapshot — `clear_book_dirty` at epoch close + // resets `provisional_emitted_this_epoch`. 
+ let finalize_block_time = now_ms_test() - 100; + let (s3, d3) = add_event(finalize_block_time, Side::Bid, 103, "90", "1"); + feed_block(&mut listener, 3, finalize_block_time, vec![s3], vec![d3]); + + let after_final = drain_snapshots(&mut rx); + let height_2_authoritative: Vec<_> = after_final + .iter() + .filter(|m| snapshot_height(m) == 2) + .filter(|m| snapshot_emission(m) == Some(super::SnapshotEmission::Authoritative)) + .collect(); + assert_eq!( + height_2_authoritative.len(), + 1, + "finalization must still emit exactly one Authoritative snapshot for height 2 after a one-shot backstop; got {}", + height_2_authoritative.len(), + ); +} + // --------------------------------------------------------------------------- // Test 5 (Codex finding #2): late diff after backstop still gets authoritative // final — the provisional emit must NOT clear the dirty flag. From 8916d433f3c30c9eebb8d2698db7390dab12a4cc Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 11:50:22 -0400 Subject: [PATCH 40/65] gitignore: exclude .claude/*.lock harness artifacts --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index cb679d53..d91929f2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ target/ server/tmp/ .worktrees/ +.claude/*.lock From 5790c04b504a742320bfab65d684f2c666796de3 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 12:01:27 -0400 Subject: [PATCH 41/65] publisher: recovery correction must not clear the provisional-supersede obligation --- server/src/multicast/publisher.rs | 134 +++++++++++++++++++++++++----- 1 file changed, 114 insertions(+), 20 deletions(-) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index eed8749b..c58be5b5 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -550,6 +550,31 @@ impl MulticastPublisher { } } + /// Pure transition for the provisional-supersede obligation. 
+ /// + /// A delivered `Provisional` creates the obligation. Only a delivered + /// `Authoritative` (the block's own finalization) discharges it. A + /// `Correction` is recovery divergence repair, which is orthogonal to the + /// stream dirty epoch — the listener never closes that epoch on recovery — + /// so it must leave the obligation untouched, otherwise a later stale + /// `Authoritative` finalization would be suppressed and strand subscribers + /// on the provisional. A snapshot that did not actually send (`sent == + /// false`) changes nothing. + const fn next_pending_provisional( + prev: bool, + emission: SnapshotEmission, + sent: bool, + ) -> bool { + if !sent { + return prev; + } + match emission { + SnapshotEmission::Provisional => true, + SnapshotEmission::Authoritative => false, + SnapshotEmission::Correction => prev, + } + } + fn should_warn_for_receiver_lag(lag_ms: u64) -> bool { lag_ms <= Self::CATCHUP_THRESHOLD_MS } @@ -1079,26 +1104,19 @@ impl MulticastPublisher { had_activity = true; heartbeat_interval.reset(); } - // Drive the supersede flag from ACTUAL delivery, not the - // publish decision: a provisional only "pends" if it sent - // at least one quote; a superseding Authoritative/Correction - // only clears the pending state if it actually sent. If a - // supersede send fails while a provisional is pending, the - // flag stays set so a later snapshot still forces it through - // (subscribers are never stranded on the provisional). 
- match emission { - SnapshotEmission::Provisional => { - if sent { - pending_provisional = true; - } - } - SnapshotEmission::Authoritative - | SnapshotEmission::Correction => { - if sent { - pending_provisional = false; - } - } - } + // Drive the supersede flag from actual delivery via + // `next_pending_provisional`: a delivered Provisional sets + // the obligation; only the block's own delivered + // Authoritative finalization clears it; a recovery + // Correction is orthogonal to the stream dirty epoch and + // leaves it unchanged. A snapshot that did not send leaves + // the obligation untouched, so a later send still forces it + // through (subscribers are never stranded on the provisional). + pending_provisional = Self::next_pending_provisional( + pending_provisional, + *emission, + sent, + ); } else { caught_up = false; crate::metrics::observe_tob_source_lag( @@ -2000,4 +2018,80 @@ mod tests { "stale authoritative must publish to supersede a delivered provisional" ); } + + /// Locks in the recovery-orthogonality fix for the provisional-supersede + /// obligation. The dangerous interleaving is: a delivered `Provisional` + /// creates the obligation, then a recovery `Correction` is delivered + /// (repairing a diverged coin without closing the stream dirty epoch), + /// then the block's own `Authoritative` finalization arrives stale. If the + /// `Correction` cleared the obligation, that stale `Authoritative` would be + /// suppressed and subscribers would be stranded on the provisional. This + /// asserts every case of `next_pending_provisional`, in particular that a + /// delivered `Correction` leaves a pending obligation intact so the later + /// stale `Authoritative` still force-publishes. + #[test] + fn pending_provisional_transition_matrix() { + // Delivered Provisional creates the obligation regardless of prior state. 
+ assert!( + MulticastPublisher::next_pending_provisional(false, SnapshotEmission::Provisional, true), + "delivered provisional must create the obligation" + ); + assert!( + MulticastPublisher::next_pending_provisional(true, SnapshotEmission::Provisional, true), + "delivered provisional keeps the obligation set" + ); + + // Delivered Authoritative (the block's own finalization) discharges it. + assert!( + !MulticastPublisher::next_pending_provisional( + true, + SnapshotEmission::Authoritative, + true + ), + "delivered authoritative finalization must discharge the obligation" + ); + assert!( + !MulticastPublisher::next_pending_provisional( + false, + SnapshotEmission::Authoritative, + true + ), + "delivered authoritative with no obligation stays clear" + ); + + // THE FIX: delivered recovery Correction must NOT clear a pending + // obligation — it is orthogonal to the stream dirty epoch. + assert!( + MulticastPublisher::next_pending_provisional(true, SnapshotEmission::Correction, true), + "recovery correction must not clear a pending provisional obligation" + ); + assert!( + !MulticastPublisher::next_pending_provisional( + false, + SnapshotEmission::Correction, + true + ), + "recovery correction with no obligation leaves it clear" + ); + + // Not sent: obligation is unchanged for every emission. A failed + // Authoritative send must preserve the obligation so a later send + // still force-publishes the finalization. 
+ assert!( + MulticastPublisher::next_pending_provisional(true, SnapshotEmission::Provisional, false), + "unsent provisional leaves the obligation unchanged" + ); + assert!( + MulticastPublisher::next_pending_provisional( + true, + SnapshotEmission::Authoritative, + false + ), + "failed authoritative send must preserve the obligation" + ); + assert!( + MulticastPublisher::next_pending_provisional(true, SnapshotEmission::Correction, false), + "unsent correction leaves the obligation unchanged" + ); + } } From 6523ca5131201a41cda178076c74e3fc7a611a48 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 12:12:15 -0400 Subject: [PATCH 42/65] backstop: re-arm one-shot guard on post-provisional mutation so long stalls don't strand subscribers --- server/src/listeners/order_book/mod.rs | 31 +++++++----- server/src/listeners/order_book/state.rs | 8 +++ .../order_book/stream_finalization_tests.rs | 50 +++++++++++++++++++ 3 files changed, 77 insertions(+), 12 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 99d41b37..76186abf 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -1143,13 +1143,17 @@ impl OrderBookListener { /// publisher's freshness gate. Shared by the hl_listen backstop ticker arm /// and the test helper so the two cannot drift. /// - /// Emits AT MOST ONE Provisional per dirty epoch (one-shot): the per-epoch - /// `provisional_emitted_this_epoch` flag suppresses every later tick within - /// the same stuck block, so a multi-second stall no longer re-runs the full - /// TOB recompute on every 250ms ticker fire. The flag is cleared at - /// finalization by `clear_book_dirty` (owned by - /// `emit_authoritative_block_snapshot`), so the next dirty epoch starts - /// fresh and the authoritative finalization snapshot is unaffected. 
+ /// Emits at most one Provisional per *unchanged* dirty state: the + /// `provisional_emitted_this_epoch` guard suppresses repeat emits while the + /// book is unchanged (no per-tick TOB recompute during a static stall, so a + /// multi-second stall does not re-run the full recompute on every 250ms + /// ticker fire), but every real BBO-affecting mutation + /// (`mark_dirty_with_times`) re-arms the guard so a long stall with ongoing + /// diffs still re-publishes the updated provisional rather than stranding + /// subscribers on a stale BBO. The guard is also cleared at finalization by + /// `clear_book_dirty` (owned by `emit_authoritative_block_snapshot`), so the + /// next dirty epoch starts fresh and the authoritative finalization snapshot + /// is unaffected. fn try_emit_stuck_stream_backstop(&mut self) -> bool { self.try_emit_stuck_stream_backstop_inner(true) } @@ -1161,11 +1165,14 @@ impl OrderBookListener { /// helper below, allowing tests to bypass the source-age gate without /// affecting any other backstop condition. /// - /// One-shot per dirty epoch: regardless of `enforce_interval`, emission is - /// gated on `!provisional_emitted_this_epoch()` and sets that flag after - /// emitting, so at most one Provisional fires per dirty block. The flag is - /// cleared only at finalization (`clear_book_dirty`), leaving the - /// authoritative finalization snapshot unaffected. + /// One-shot per *unchanged* dirty state: regardless of `enforce_interval`, + /// emission is gated on `!provisional_emitted_this_epoch()` and sets that + /// flag after emitting, so at most one Provisional fires while the book is + /// unchanged. Every real BBO-affecting mutation (`mark_dirty_with_times`) + /// re-arms the guard, so a long stall with ongoing diffs re-publishes the + /// updated provisional. The flag is also cleared at finalization + /// (`clear_book_dirty`), leaving the authoritative finalization snapshot + /// unaffected. 
fn try_emit_stuck_stream_backstop_inner(&mut self, enforce_interval: bool) -> bool { let should_emit = self.ingest_mode == IngestMode::Stream && !self.enable_websocket diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index 19320909..eb198ed5 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -139,11 +139,19 @@ impl OrderBookState { /// epoch, records the source `(block_time_ms, local_time_ms)`. The /// stuck-stream backstop gates emission on the source age of this /// `block_time_ms` (no local `Instant` is stamped). + /// + /// Every real BBO-affecting mutation also re-arms the one-shot backstop + /// guard (`provisional_emitted_this_epoch = false`): a single provisional + /// covers an unchanged dirty state, but a post-provisional mutation during + /// a long stall must be re-published so subscribers are not stranded on a + /// stale BBO. (Recovery never calls this — it is orthogonal to the dirty + /// epoch by design — so recovery cannot re-arm the guard.) 
pub(super) fn mark_dirty_with_times(&mut self, block_time_ms: u64, local_time_ms: u64) { if !self.book_dirty { self.dirty_source_times = Some((block_time_ms, local_time_ms)); } self.book_dirty = true; + self.provisional_emitted_this_epoch = false; } pub(super) const fn clear_book_dirty(&mut self) { diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index 1f44c86c..0c32ae44 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -452,6 +452,56 @@ async fn backstop_is_one_shot_per_dirty_epoch() { ); } +// --------------------------------------------------------------------------- +// Test 4c (Codex finding): the one-shot guard RE-ARMS on a post-provisional +// BBO mutation, so a long stall with ongoing diffs does not strand subscribers +// on a stale provisional. +// --------------------------------------------------------------------------- + +/// Codex finding: the one-shot backstop guard must RE-ARM on a post-provisional +/// BBO mutation, so a long stall with ongoing diffs does not strand subscribers +/// on a stale provisional. First provisional emits; a later same-height +/// BBO-changing diff (block still unfinalized) must let the next backstop tick +/// emit a SECOND provisional carrying the updated BBO. +#[tokio::test(flavor = "current_thread")] +async fn backstop_re_emits_after_post_provisional_mutation() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Stuck block 2, source-age ~300ms (>= 250ms gate). First BBO = bid 100. 
+ let bt1 = now_ms_test() - 300; + let (s2, d2) = add_event(bt1, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, bt1, vec![s2], vec![d2]); + + listener.fire_stream_dirty_backstop_for_test(); + let first = drain_snapshots(&mut rx); + assert_eq!(first.len(), 1, "first backstop must emit one provisional; got {}", first.len()); + assert_eq!(snapshot_emission(&first[0]), Some(super::SnapshotEmission::Provisional)); + let bid_1 = snapshot_best_bid(&first[0]).expect("first provisional has a bid"); + + // Later same-height-2 BBO-changing diff (block still NOT finalized). + let bt2 = now_ms_test() - 300; + let (s2b, d2b) = add_event(bt2, Side::Bid, 102, "110", "3"); + feed_block(&mut listener, 2, bt2, vec![s2b], vec![d2b]); + + // The post-provisional mutation must have re-armed the guard: the next + // backstop tick emits a SECOND provisional with the UPDATED BBO. + listener.fire_stream_dirty_backstop_for_test(); + let second = drain_snapshots(&mut rx); + assert_eq!(second.len(), 1, "post-mutation backstop must re-emit one provisional; got {}", second.len()); + assert_eq!(snapshot_emission(&second[0]), Some(super::SnapshotEmission::Provisional)); + let bid_2 = snapshot_best_bid(&second[0]).expect("second provisional has a bid"); + assert_ne!(bid_1, bid_2, "second provisional must carry the updated BBO (re-arm worked); bid_1={bid_1:?} bid_2={bid_2:?}"); + + // Sanity: finalization still emits the authoritative snapshot. 
+ let bt3 = now_ms_test() - 100; + let (s3, d3) = add_event(bt3, Side::Bid, 103, "90", "1"); + feed_block(&mut listener, 3, bt3, vec![s3], vec![d3]); + let finals = drain_snapshots(&mut rx); + let h2_finals: Vec<_> = finals.iter().filter(|m| snapshot_height(m) == 2).collect(); + assert!(!h2_finals.is_empty(), "finalization must still emit authoritative for height 2"); + assert_eq!(snapshot_emission(h2_finals[0]), Some(super::SnapshotEmission::Authoritative)); +} + // --------------------------------------------------------------------------- // Test 5 (Codex finding #2): late diff after backstop still gets authoritative // final — the provisional emit must NOT clear the dirty flag. From ee981aa57a6c9391254ad555b6f088d0d4950890 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 12:25:02 -0400 Subject: [PATCH 43/65] publisher: only enter caught-up state on a fresh publish, not a forced-stale supersede --- server/src/multicast/publisher.rs | 33 +++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index c58be5b5..c094d104 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -575,6 +575,19 @@ impl MulticastPublisher { } } + /// `caught_up` means the source is genuinely fresh and the publisher is + /// keeping up — it gates the periodic `snapshot_interval` resend of the + /// cached snapshot. A *forced-stale* publish (an `Authoritative` + /// superseding a pending provisional, or a recovery `Correction`) delivers + /// required data once but does NOT mean we are caught up; flipping + /// `caught_up` on such a publish would re-enable periodic stale resends and + /// defeat the freshness gate during stalls/recovery. Only a fresh publish + /// transitions into the caught-up state; a stale forced publish leaves the + /// prior state unchanged (a separate suppress path clears it). 
+ const fn caught_up_after_publish(prev_caught_up: bool, fresh: bool) -> bool { + prev_caught_up || fresh + } + fn should_warn_for_receiver_lag(lag_ms: u64) -> bool { lag_ms <= Self::CATCHUP_THRESHOLD_MS } @@ -1088,7 +1101,9 @@ impl MulticastPublisher { "published", Duration::from_millis(lag_ms), ); - if !caught_up { + let was_caught_up = caught_up; + caught_up = Self::caught_up_after_publish(caught_up, fresh); + if caught_up && !was_caught_up { info!( "tob marketdata caught up: publishing quote frames source={} height={} source_lag_ms={} dest={}", source, @@ -1096,7 +1111,6 @@ impl MulticastPublisher { lag_ms, self.config.dest(), ); - caught_up = true; } let snapshot_map = l2_snapshots.as_ref(); let sent = self.publish_quotes(snapshot_map, *time, false).await; @@ -2094,4 +2108,19 @@ mod tests { "unsent correction leaves the obligation unchanged" ); } + + /// A forced-stale publish (Authoritative superseding a pending provisional, + /// or a Correction) must NOT flip `caught_up` true, or the periodic + /// snapshot resend would rebroadcast stale quotes as if caught up. Only a + /// fresh publish transitions into caught-up. 
+ #[test] + fn caught_up_after_publish_matrix() { + // fresh publish: transitions / stays caught up + assert!(MulticastPublisher::caught_up_after_publish(false, true)); + assert!(MulticastPublisher::caught_up_after_publish(true, true)); + // stale forced publish: must NOT flip false -> true + assert!(!MulticastPublisher::caught_up_after_publish(false, false)); + // stale forced publish while already caught up: leaves prior state + assert!(MulticastPublisher::caught_up_after_publish(true, false)); + } } From 7c9e04428a98c3b11a05f3bd0eb6f4f65e3e997a Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 12:38:46 -0400 Subject: [PATCH 44/65] publisher: clear caught-up on a forced-stale publish so periodic resend stays gated --- server/src/multicast/publisher.rs | 43 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index c094d104..b25f5574 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -575,17 +575,17 @@ impl MulticastPublisher { } } - /// `caught_up` means the source is genuinely fresh and the publisher is - /// keeping up — it gates the periodic `snapshot_interval` resend of the - /// cached snapshot. A *forced-stale* publish (an `Authoritative` - /// superseding a pending provisional, or a recovery `Correction`) delivers - /// required data once but does NOT mean we are caught up; flipping - /// `caught_up` on such a publish would re-enable periodic stale resends and - /// defeat the freshness gate during stalls/recovery. Only a fresh publish - /// transitions into the caught-up state; a stale forced publish leaves the - /// prior state unchanged (a separate suppress path clears it). 
- const fn caught_up_after_publish(prev_caught_up: bool, fresh: bool) -> bool { - prev_caught_up || fresh + /// `caught_up` means the most recent publish was genuinely fresh and the + /// publisher is keeping up — it gates the periodic `snapshot_interval` + /// resend of the cached snapshot. A *forced-stale* publish (an + /// `Authoritative` superseding a pending provisional, or a recovery + /// `Correction`, with `fresh == false`) delivers required data exactly + /// once but does NOT mean we are caught up; it must clear `caught_up` so + /// the periodic resend does not rebroadcast stale quotes during a stall or + /// recovery catch-up. Therefore the post-publish state is exactly `fresh` + /// (the suppress path independently sets it `false`). + const fn caught_up_after_publish(fresh: bool) -> bool { + fresh } fn should_warn_for_receiver_lag(lag_ms: u64) -> bool { @@ -1102,7 +1102,7 @@ impl MulticastPublisher { Duration::from_millis(lag_ms), ); let was_caught_up = caught_up; - caught_up = Self::caught_up_after_publish(caught_up, fresh); + caught_up = Self::caught_up_after_publish(fresh); if caught_up && !was_caught_up { info!( "tob marketdata caught up: publishing quote frames source={} height={} source_lag_ms={} dest={}", @@ -2109,18 +2109,15 @@ mod tests { ); } - /// A forced-stale publish (Authoritative superseding a pending provisional, - /// or a Correction) must NOT flip `caught_up` true, or the periodic - /// snapshot resend would rebroadcast stale quotes as if caught up. Only a - /// fresh publish transitions into caught-up. + /// `caught_up` after a publish is exactly `fresh`. A fresh publish enters + /// caught-up; a forced-stale supersede/Correction (`fresh == false`) clears + /// it so the periodic snapshot resend does not rebroadcast stale quotes. + /// Scenario locked in: fresh Provisional (fresh=true → caught_up=true) then + /// stale forced Authoritative (fresh=false → caught_up=false → no periodic + /// stale resend). 
#[test] fn caught_up_after_publish_matrix() { - // fresh publish: transitions / stays caught up - assert!(MulticastPublisher::caught_up_after_publish(false, true)); - assert!(MulticastPublisher::caught_up_after_publish(true, true)); - // stale forced publish: must NOT flip false -> true - assert!(!MulticastPublisher::caught_up_after_publish(false, false)); - // stale forced publish while already caught up: leaves prior state - assert!(MulticastPublisher::caught_up_after_publish(true, false)); + assert!(MulticastPublisher::caught_up_after_publish(true)); + assert!(!MulticastPublisher::caught_up_after_publish(false)); } } From 269fb6ffcda600e0365504f832f75d686eb18538 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 12:40:06 -0400 Subject: [PATCH 45/65] l2: don't reserve full source depth for bucketed variants (bounded snapshot memory) --- server/src/order_book/levels.rs | 56 ++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/server/src/order_book/levels.rs b/server/src/order_book/levels.rs index 053af849..81d0d7c3 100644 --- a/server/src/order_book/levels.rs +++ b/server/src/order_book/levels.rs @@ -60,7 +60,17 @@ fn l2_levels_to_l2_levels( n_sig_figs: Option, mantissa: Option, ) -> Vec { - let cap = n_levels.unwrap_or(levels.len()); + // Pre-size to avoid realloc growth (L2-4) ONLY where the output length + // tracks the input: an explicit `n_levels` cap, or the non-bucketing + // identity/truncate transform. A bucketed transform (`n_sig_figs` set) + // collapses many source levels into far fewer, so reserving source depth + // would retain huge unused capacity in every stored snapshot — reserve + // nothing and let the small bucketed output grow. 
+ let cap = match (n_levels, n_sig_figs) { + (Some(n), _) => n, + (None, None) => levels.len(), + (None, Some(_)) => 0, + }; let mut new_levels = Vec::with_capacity(cap); if n_levels == Some(0) { return new_levels; @@ -139,3 +149,47 @@ pub(super) fn build_l2_level( } false } + +#[cfg(test)] +mod tests { + use super::*; + + /// A bucketed L2 transform (`n_sig_figs` set, `n_levels == None`) collapses + /// thousands of source levels into a handful of buckets. The output Vec + /// must NOT be pre-reserved at full source depth, or every stored snapshot + /// would retain ~source-depth unused capacity (the over-reservation bug). + #[test] + fn bucketed_variant_does_not_reserve_full_source_depth() { + const DEPTH: usize = 4000; + // Deep bid side: distinct, monotonically descending prices in + // [100000, 103999] (all 6 digits). With n_sig_figs=2 the bid bucket is + // (px / 10000) * 10000, so every level collapses into the single + // 100000 bucket — a 4000 -> 1 collapse. + let mut deep_bids = Vec::with_capacity(DEPTH); + for i in 0..DEPTH { + let px = 103_999 - i as u64; + deep_bids.push(InnerLevel { px: Px::new(px), sz: Sz::new(10 + i as u64), n: 1 }); + } + + let bids = l2_levels_to_l2_levels(&deep_bids, Side::Bid, None, Some(2), None); + + // Bucketing must collapse the 4000 source levels into far fewer. + assert!( + bids.len() < 4000, + "bucketing should collapse levels, got len={}", + bids.len() + ); + // Capacity must not have been reserved at source depth. 
+ assert!( + bids.capacity() <= bids.len().max(64), + "bucketed output over-reserved: capacity={} len={}", + bids.capacity(), + bids.len() + ); + assert!( + bids.capacity() < 4000, + "bucketed output retained ~source-depth capacity: {}", + bids.capacity() + ); + } +} From 651d830505802c2981be6483c85160baa5301b26 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 12:52:39 -0400 Subject: [PATCH 46/65] recovery: reject off-lock report on same-height intra-block race via mutation seq --- server/src/listeners/order_book/mod.rs | 72 +++++++++++++++---- server/src/listeners/order_book/state.rs | 13 ++++ .../order_book/stream_finalization_tests.rs | 53 ++++++++++++++ 3 files changed, 123 insertions(+), 15 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 76186abf..6531ca48 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -374,6 +374,23 @@ pub(crate) async fn hl_listen_fills_only( } } +/// The off-lock-validated recovery report is only safe to apply if, after +/// re-locking, the live book is still at the validated height AND no real +/// mutation raced in (height alone is insufficient — streaming applies many +/// diffs at the same block number; `mutation_seq` advances on every real +/// BBO-affecting mutation). +const fn recovery_report_still_applicable( + validated_height: u64, + validated_mutation_seq: u64, + live_height: Option<u64>, + live_mutation_seq: Option<u64>, +) -> bool { + matches!( + (live_height, live_mutation_seq), + (Some(h), Some(s)) if h == validated_height && s == validated_mutation_seq + ) +} + fn fetch_snapshot( dir: PathBuf, listener: Arc<Mutex<OrderBookListener>>, @@ -412,6 +429,7 @@ fn fetch_snapshot( // is found, re-lock and apply surgical per-coin // recovery rather than tearing down the feed.
let state = guard.order_book_state.clone().expect("is_ready checked above"); + let validated_mutation_seq = state.mutation_seq(); drop(guard); let stored_snapshot = state.compute_snapshot().snapshot; info!("Validating snapshot at height {height}"); @@ -428,25 +446,35 @@ fn fetch_snapshot( report.extra_in_fresh.len(), ); // Off-lock validation (above) means the live book may have - // advanced past `height` while we computed/validated without - // the lock. Re-lock and re-check the live height *atomically* - // with apply_recovery (single held guard, no gap): only apply - // if the book is still exactly at the validated height. - // Otherwise the report is stale — discard it; a future - // validation cycle will re-derive against current state. - // This matters because apply_recovery emits a staleness- - // bypassing Correction snapshot; applying a raced report - // would broadcast a stale rollback to multicast subscribers. + // advanced — or, in streaming mode, mutated *within the same + // block number* — while we computed/validated without the + // lock. Re-lock and re-check BOTH the live height AND a + // monotonic mutation seq *atomically* with apply_recovery + // (single held guard, no gap): only apply if the book is + // still exactly at the validated height and no real + // BBO-affecting mutation raced in. Height alone is + // insufficient — streaming applies many diffs at the same + // block number, so a same-height intra-block race would + // otherwise slip through. Otherwise the report is stale — + // discard it; a future validation cycle will re-derive + // against current state. This matters because apply_recovery + // emits a staleness-bypassing Correction snapshot; applying + // a raced report would broadcast a stale rollback to + // multicast subscribers. 
let mut recovery_guard = listener.lock().await; - let live_height = recovery_guard - .order_book_state - .as_ref() - .map(|s| s.height()); - if live_height == Some(height) { + let live_state = recovery_guard.order_book_state.as_ref(); + let live_height = live_state.map(|s| s.height()); + let live_mutation_seq = live_state.map(|s| s.mutation_seq()); + if recovery_report_still_applicable( + height, + validated_mutation_seq, + live_height, + live_mutation_seq, + ) { recovery_guard.apply_recovery(&report, expected_snapshot); } else { log::warn!( - "snapshot validation: discarding stale recovery report — live height {live_height:?} no longer matches validated height {height}" + "snapshot validation: discarding stale recovery report — book moved since off-lock validation (validated height={height} seq={validated_mutation_seq}, live height={live_height:?} seq={live_mutation_seq:?})" ); } Ok(()) @@ -1738,6 +1766,20 @@ impl OrderBookListener { self.order_book_state.as_ref().map(|o| o.compute_l2_snapshots_for_test()) } + /// Test-only accessor for the live book height (mirrors the off-lock + /// recovery guard's `s.height()` read). + #[cfg(test)] + pub(crate) fn test_state_height(&self) -> u64 { + self.order_book_state.as_ref().map(|s| s.height()).unwrap_or(0) + } + + /// Test-only accessor for the live book mutation seq (mirrors the off-lock + /// recovery guard's `s.mutation_seq()` read). + #[cfg(test)] + pub(crate) fn test_state_mutation_seq(&self) -> u64 { + self.order_book_state.as_ref().map(|s| s.mutation_seq()).unwrap_or(0) + } + #[cfg(test)] pub(crate) fn finalize_streaming_for_test(&mut self) -> Result<()> { self.drain_streaming_blocks()?; diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index eb198ed5..aebd4c8f 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -37,6 +37,12 @@ pub(super) struct OrderBookState { /// epoch (no per-tick TOB recompute during a stall). 
Reset by /// `clear_book_dirty` at epoch close. provisional_emitted_this_epoch: bool, + /// Monotonic counter bumped on every real BBO-affecting book mutation (via + /// `mark_dirty_with_times`). Captured with a cloned state by the off-lock + /// snapshot-validation path so a recovery report can be rejected if any + /// intra-block mutation raced between clone and apply (height alone is + /// insufficient — streaming applies many diffs at the same block number). + mutation_seq: u64, /// Present when the DoB emitter is wired in. The tap is NOT propagated to /// the cloned copy used for snapshot validation (validation reads only; no /// events should be emitted from it). @@ -72,6 +78,7 @@ impl Clone for OrderBookState { book_dirty: self.book_dirty, dirty_source_times: self.dirty_source_times, provisional_emitted_this_epoch: self.provisional_emitted_this_epoch, + mutation_seq: self.mutation_seq, // The tap is intentionally not cloned: the clone is used only for // snapshot validation and must not emit DoB events. dob_tap: None, @@ -98,6 +105,7 @@ impl OrderBookState { book_dirty: false, dirty_source_times: None, provisional_emitted_this_epoch: false, + mutation_seq: 0, dob_tap: None, } } @@ -111,6 +119,10 @@ impl OrderBookState { self.height } + pub(super) const fn mutation_seq(&self) -> u64 { + self.mutation_seq + } + /// Returns the timestamp of the most-recent order book update, mirroring /// what `l2_snapshots` returns as `snapshot.0`. 
Used by /// `emit_tob_snapshot` to compute the default source times when no file @@ -152,6 +164,7 @@ impl OrderBookState { } self.book_dirty = true; self.provisional_emitted_this_epoch = false; + self.mutation_seq = self.mutation_seq.wrapping_add(1); } pub(super) const fn clear_book_dirty(&mut self) { diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index 0c32ae44..64555fd8 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -968,3 +968,56 @@ async fn ws_enabled_recovery_does_not_explicitly_emit() { after_recovery.len() ); } + +// --------------------------------------------------------------------------- +// Codex finding: off-lock recovery guard must reject a same-height intra-block +// race (height alone is insufficient — streaming applies many diffs at the +// same block number). +// --------------------------------------------------------------------------- + +/// Codex finding: the off-lock recovery guard must reject a report when an +/// intra-block diff raced in even though the block number (height) is +/// unchanged. Proves `mutation_seq` advances on a same-height real mutation +/// while `height()` does not, so `recovery_report_still_applicable` rejects. +#[tokio::test(flavor = "current_thread")] +async fn recovery_report_rejected_on_same_height_intra_block_race() { + let (mut listener, mut _rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Apply a first diff at block 2 (this is the "clone-time" content). + let bt = now_ms_test() - 100; + let (s2, d2) = add_event(bt, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, bt, vec![s2], vec![d2]); + + // Capture what the off-lock validator would have captured at clone time. 
+ let validated_height = listener.test_state_height(); + let validated_seq = listener.test_state_mutation_seq(); + + // Race: another diff arrives at the SAME block height 2 (still unfinalized). + let (s2b, d2b) = add_event(bt, Side::Bid, 102, "110", "3"); + feed_block(&mut listener, 2, bt, vec![s2b], vec![d2b]); + + let live_height = listener.test_state_height(); + let live_seq = listener.test_state_mutation_seq(); + + // Height is unchanged (the old height-only guard would WRONGLY accept) ... + assert_eq!(live_height, validated_height, "same block: height must be unchanged"); + // ... but a real mutation happened, so mutation_seq advanced ... + assert_ne!(live_seq, validated_seq, "a same-height intra-block diff must advance mutation_seq"); + // ... so the new guard correctly REJECTS the stale report. + assert!( + !super::recovery_report_still_applicable( + validated_height, + validated_seq, + Some(live_height), + Some(live_seq), + ), + "guard must reject a recovery report raced by a same-height intra-block mutation" + ); + // And it ACCEPTS when nothing raced (sanity). 
+ assert!(super::recovery_report_still_applicable( + validated_height, + validated_seq, + Some(validated_height), + Some(validated_seq), + )); +} From 7dae2f43be78a970b2ed353fef21cfdd2ac6547b Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 13:05:51 -0400 Subject: [PATCH 47/65] recovery: only validate finalized stream heights; bump mutation seq on recovery mutations --- server/src/listeners/order_book/mod.rs | 44 +++++++++++ server/src/listeners/order_book/state.rs | 14 ++-- .../order_book/stream_finalization_tests.rs | 76 +++++++++++++++++++ 3 files changed, 129 insertions(+), 5 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 6531ca48..49f3b301 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -391,6 +391,26 @@ const fn recovery_report_still_applicable( ) } +/// In stream mode `OrderBookState::height()` advances as same-block diffs are +/// applied, but the book only completely and consistently represents a height +/// once that block is FINALIZED. Validating a partially-applied in-flight +/// block against a complete reference snapshot at the same height would report +/// spurious divergence and broadcast a staleness-bypassing Correction. Only +/// validate/recover a stream height that is finalized and has no block still +/// buffered for it. Block mode applies whole blocks atomically (height == +/// finalized), so it is always validatable. +const fn stream_recovery_height_is_consistent( + is_stream: bool, + height: u64, + finalized_height: Option, + has_buffered_block_at_height: bool, +) -> bool { + if !is_stream { + return true; + } + matches!(finalized_height, Some(f) if f >= height) && !has_buffered_block_at_height +} + fn fetch_snapshot( dir: PathBuf, listener: Arc>, @@ -423,6 +443,29 @@ fn fetch_snapshot( debug!("Validation skipped: our height {our_height} > snapshot height {height}"); Ok(()) } else { + // our_height == height. 
In stream mode this height + // may be a partially-applied, not-yet-finalized + // block; validating it against a complete + // reference snapshot would report spurious + // divergence and broadcast a staleness-bypassing + // Correction. Only proceed when the block is + // finalized and nothing is still buffered for it; + // otherwise defer to a future cycle (post-finalize). + let is_stream = guard.ingest_mode == IngestMode::Stream; + let finalized_height = guard.streaming_state.finalized_height; + let has_buffered_block_at_height = + guard.streaming_state.blocks.contains_key(&height); + if !stream_recovery_height_is_consistent( + is_stream, + height, + finalized_height, + has_buffered_block_at_height, + ) { + debug!( + "Validation skipped: stream height {height} not consistent yet (finalized={finalized_height:?}, buffered_at_height={has_buffered_block_at_height})" + ); + Ok(()) + } else { // Heights match — clone state under lock, then // compute snapshot + validate outside the lock // so we don't block the hot path. If divergence @@ -479,6 +522,7 @@ fn fetch_snapshot( } Ok(()) } + } } } else { guard.init_from_snapshot(expected_snapshot, height); diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index aebd4c8f..5511395d 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -37,11 +37,13 @@ pub(super) struct OrderBookState { /// epoch (no per-tick TOB recompute during a stall). Reset by /// `clear_book_dirty` at epoch close. provisional_emitted_this_epoch: bool, - /// Monotonic counter bumped on every real BBO-affecting book mutation (via - /// `mark_dirty_with_times`). Captured with a cloned state by the off-lock - /// snapshot-validation path so a recovery report can be rejected if any - /// intra-block mutation raced between clone and apply (height alone is - /// insufficient — streaming applies many diffs at the same block number). 
+ /// Monotonic counter bumped on every book-content mutation (hot-path real + /// mutations via `mark_dirty_with_times`, and recovery repairs via + /// `replace_coin_from_snapshot`/`remove_coin`). Captured with a cloned + /// state by the off-lock snapshot-validation path so a recovery report is + /// rejected if ANY mutation (hot-path OR an overlapping recovery) raced + /// between clone and apply (height alone is insufficient — streaming + /// applies many diffs at the same block number). mutation_seq: u64, /// Present when the DoB emitter is wired in. The tap is NOT propagated to /// the cloned copy used for snapshot validation (validation reads only; no @@ -220,6 +222,7 @@ impl OrderBookState { ) { self.order_book.replace_coin_from_snapshot(coin, snapshot, ignore_triggers); self.snapped = false; + self.mutation_seq = self.mutation_seq.wrapping_add(1); } /// Removes a coin's book entirely. Used when a coin is in our state but missing @@ -227,6 +230,7 @@ impl OrderBookState { pub(super) fn remove_coin(&mut self, coin: &Coin) { self.order_book.remove_coin(coin); self.snapped = false; + self.mutation_seq = self.mutation_seq.wrapping_add(1); } pub(super) fn apply_updates( diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index 64555fd8..cd15ba4b 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -1021,3 +1021,79 @@ async fn recovery_report_rejected_on_same_height_intra_block_race() { Some(validated_seq), )); } + +/// Finding 1: a stream height that is not yet finalized (or still has a +/// buffered block) must NOT be validated/recovered — only a finalized, +/// fully-drained height is a consistent snapshot to compare. Block mode is +/// always validatable. 
+#[test] +fn stream_recovery_height_consistency_matrix() { + use super::stream_recovery_height_is_consistent as ok; + // block mode: always validatable regardless of finalize state + assert!(ok(false, 10, None, true)); + assert!(ok(false, 10, Some(3), true)); + // stream: finalized >= height AND nothing buffered -> validatable + assert!(ok(true, 10, Some(10), false)); + assert!(ok(true, 10, Some(12), false)); + // stream: not finalized to height -> NOT validatable (in-flight block) + assert!(!ok(true, 10, Some(9), false)); + assert!(!ok(true, 10, None, false)); + // stream: finalized but a block still buffered at height -> NOT validatable + assert!(!ok(true, 10, Some(10), true)); +} + +/// Finding 2: a recovery mutation (replace_coin_from_snapshot / remove_coin) +/// must advance `mutation_seq`, so an overlapping validation task that +/// validated against the pre-recovery book is rejected by the height+seq +/// guard (no duplicate / stale recovery replay). +#[tokio::test(flavor = "current_thread")] +async fn recovery_mutation_advances_mutation_seq() { + use super::utils::ValidationReport; + + // Build a ready streaming state with at least one coin (seed_snapshot + // pre-loads one coin); mirror the construction used by + // `recovery_does_not_consume_unfinalized_stream_dirty_block`. + let (mut listener, mut _rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + let before = listener.test_state_mutation_seq(); + + // Drive a recovery (diverged coin) — apply_recovery calls + // replace_coin_from_snapshot internally, which must bump mutation_seq. + // Same fresh-Snapshots + ValidationReport shape as + // recovery_does_not_consume_unfinalized_stream_dirty_block. 
+ let coin = Coin::new(TEST_COIN); + let mut repaired: crate::order_book::OrderBook = crate::order_book::OrderBook::new(); + repaired.add_order(InnerL4Order { + user: alloy::primitives::Address::new([0; 20]), + coin: coin.clone(), + side: Side::Ask, + limit_px: Px::parse_from_str("88888").expect("valid px"), + sz: Sz::parse_from_str("2").expect("valid sz"), + oid: 8_888, + timestamp: 0, + trigger_condition: String::new(), + is_trigger: false, + trigger_px: String::new(), + is_position_tpsl: false, + reduce_only: false, + order_type: String::new(), + tif: None, + cloid: None, + }); + let mut fresh_map: HashMap> = HashMap::new(); + fresh_map.insert(coin.clone(), repaired.to_snapshot()); + let fresh = crate::order_book::multi_book::Snapshots::new(fresh_map); + + let report = ValidationReport { + diverged: vec![(coin, "synthetic divergence for mutation-seq test".to_string())], + missing_in_fresh: vec![], + extra_in_fresh: vec![], + }; + + listener.apply_recovery(&report, fresh); + + let after = listener.test_state_mutation_seq(); + assert_ne!( + after, before, + "a recovery mutation must advance mutation_seq so overlapping stale validation reports are rejected" + ); +} From 0f7262c2e6f65e64b9bde8f6930a539542002856 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 13:22:31 -0400 Subject: [PATCH 48/65] backstop: freshness-window emit (retry-while-fresh, silent-when-stale); drop one-shot state; don't cache provisional for resend --- server/src/listeners/order_book/mod.rs | 119 ++++++------ server/src/listeners/order_book/state.rs | 37 +--- .../order_book/stream_finalization_tests.rs | 172 ++++++++++-------- server/src/multicast/publisher.rs | 10 +- 4 files changed, 179 insertions(+), 159 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 49f3b301..e2a55d04 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -78,13 +78,14 @@ const 
STREAM_REORDER_FALLBACK_WINDOW: Duration = Duration::from_millis(50); /// fires. const STREAM_DIRTY_BACKSTOP_INTERVAL: Duration = Duration::from_millis(250); -// The invariant still holds and now reads in source-time terms: a provisional -// is emitted once the dirtying event's source age reaches -// STREAM_DIRTY_BACKSTOP_INTERVAL (>=250ms). Because both the backstop trigger -// and the publisher's freshness gate measure source age on the same basis, a -// provisional emitted at source-age >=250ms still has >=250ms of headroom -// before the 500ms (CATCHUP_THRESHOLD_MS) gate — that margin absorbs queue / -// scheduler delay between the listener emit and the publisher's gate check. +// The backstop emits a provisional only while the dirty book's source age is +// inside the freshness window `[STREAM_DIRTY_BACKSTOP_INTERVAL, +// CATCHUP_THRESHOLD_MS]` = `[250ms, 500ms]`: stuck enough to warrant a backstop +// AND still fresh enough that the publisher would actually deliver it. The +// invariant `2*INTERVAL <= CATCHUP_THRESHOLD_MS` guarantees that window is +// non-empty (lower bound 250ms <= upper bound 500ms) with margin to spare, so a +// stalled-but-fresh dirty epoch always has at least one tick inside the window +// to be re-published before it ages past the publisher's suppression cutoff. 
const _BACKSTOP_VS_FRESHNESS_INVARIANT: () = assert!( STREAM_DIRTY_BACKSTOP_INTERVAL.as_millis() * 2 <= crate::multicast::publisher::MulticastPublisher::CATCHUP_THRESHOLD_MS as u128, @@ -1206,55 +1207,71 @@ impl OrderBookListener { self.emit_authoritative_block_snapshot(&block); } - /// Stuck-stream backstop: if streaming + WS-disabled and the dirtying - /// event's SOURCE age (`now_ms() - dirty_block_time_ms`) has reached - /// STREAM_DIRTY_BACKSTOP_INTERVAL without a finalization emission, emit a + /// Stuck-stream backstop: if streaming + WS-disabled and the dirty book's + /// SOURCE age (`now_ms() - dirty_block_time_ms`) is inside the freshness + /// window `[STREAM_DIRTY_BACKSTOP_INTERVAL, CATCHUP_THRESHOLD_MS]`, emit a /// provisional snapshot (does NOT clear book_dirty, so the eventual - /// authoritative finalization snapshot still fires). The gate is measured + /// authoritative finalization snapshot still fires). The window is measured /// in source time (not local elapsed) so it matches the multicast /// publisher's freshness gate. Shared by the hl_listen backstop ticker arm /// and the test helper so the two cannot drift. /// - /// Emits at most one Provisional per *unchanged* dirty state: the - /// `provisional_emitted_this_epoch` guard suppresses repeat emits while the - /// book is unchanged (no per-tick TOB recompute during a static stall, so a - /// multi-second stall does not re-run the full recompute on every 250ms - /// ticker fire), but every real BBO-affecting mutation - /// (`mark_dirty_with_times`) re-arms the guard so a long stall with ongoing - /// diffs still re-publishes the updated provisional rather than stranding - /// subscribers on a stale BBO. The guard is also cleared at finalization by - /// `clear_book_dirty` (owned by `emit_authoritative_block_snapshot`), so the - /// next dirty epoch starts fresh and the authoritative finalization snapshot - /// is unaffected. 
+ /// Re-emits a Provisional on EVERY tick while the dirty book is inside the + /// window — it is deliberately NOT one-shot. The listener cannot observe + /// whether the publisher's multicast broadcast actually delivered an earlier + /// provisional (the staleness gate is applied later with a different clock), + /// so latching a one-shot at enqueue would strand subscribers if a fresh + /// provisional was dropped in transit. Re-emitting every ~250ms tick within + /// the ~250ms-wide window is bounded (≤~2 emits per stuck gap → CPU bound) + /// and retries a transport-dropped fresh provisional. Once the source ages + /// past `CATCHUP_THRESHOLD_MS` the publisher would suppress it anyway, so + /// the backstop goes silent (no wasted recompute). `dirty_source_times` + /// tracks the LATEST mutation, so ongoing diffs during a stall keep the + /// window fresh and re-publish updated state, while a no-diff stall ages out + /// and the backstop correctly stops. The authoritative finalization snapshot + /// (owned by `emit_authoritative_block_snapshot` via `clear_book_dirty`) is + /// unaffected. fn try_emit_stuck_stream_backstop(&mut self) -> bool { self.try_emit_stuck_stream_backstop_inner(true) } /// Inner implementation. `enforce_interval = true` in all production paths - /// (ticker arm, existing test helper) gates emission on the dirtying - /// event's source age reaching STREAM_DIRTY_BACKSTOP_INTERVAL. - /// `enforce_interval = false` is exposed only via the `#[cfg(test)]` - /// helper below, allowing tests to bypass the source-age gate without - /// affecting any other backstop condition. + /// (ticker arm, existing test helper) gates emission on the dirty book's + /// source age being inside the freshness window + /// `[STREAM_DIRTY_BACKSTOP_INTERVAL, CATCHUP_THRESHOLD_MS]`. 
+ /// `enforce_interval = false` is exposed only via the `#[cfg(test)]` helper + /// below and bypasses the WHOLE timing window (force-emit), leaving every + /// other backstop condition real. /// - /// One-shot per *unchanged* dirty state: regardless of `enforce_interval`, - /// emission is gated on `!provisional_emitted_this_epoch()` and sets that - /// flag after emitting, so at most one Provisional fires while the book is - /// unchanged. Every real BBO-affecting mutation (`mark_dirty_with_times`) - /// re-arms the guard, so a long stall with ongoing diffs re-publishes the - /// updated provisional. The flag is also cleared at finalization - /// (`clear_book_dirty`), leaving the authoritative finalization snapshot - /// unaffected. + /// Not one-shot: regardless of `enforce_interval`, emission is NOT latched — + /// every tick inside the fresh window re-emits a Provisional so a + /// broadcast-dropped fresh provisional is retried (the listener cannot + /// observe publisher delivery). Re-emits stop once the source ages past the + /// freshness threshold (the publisher would suppress it anyway). The + /// authoritative finalization snapshot (`clear_book_dirty`) is unaffected. fn try_emit_stuck_stream_backstop_inner(&mut self, enforce_interval: bool) -> bool { + // Freshness window: emit a provisional while the dirty book is BOTH + // stuck enough to warrant a backstop (source age >= INTERVAL) AND still + // fresh enough that the publisher would actually deliver it (source age + // <= CATCHUP_THRESHOLD_MS). Re-emitting every tick inside this ~INTERVAL- + // wide window is bounded (CPU) and retries a broadcast-dropped fresh + // provisional (the listener cannot observe publisher delivery, so it + // must not latch a one-shot at enqueue). Once the source ages past the + // freshness threshold the publisher would suppress it anyway, so the + // backstop goes silent (no wasted recompute). 
`dirty_source_times` + // tracks the LATEST mutation, so ongoing diffs during a stall keep the + // window fresh and re-publish updated state. + const UPPER_MS: u64 = + crate::multicast::publisher::MulticastPublisher::CATCHUP_THRESHOLD_MS; + let lower_ms = STREAM_DIRTY_BACKSTOP_INTERVAL.as_millis() as u64; let should_emit = self.ingest_mode == IngestMode::Stream && !self.enable_websocket && self.order_book_state.as_ref().is_some_and(|s| { s.book_dirty() - && !s.provisional_emitted_this_epoch() && (!enforce_interval || s.dirty_source_times().is_some_and(|(bt, _)| { - now_ms().saturating_sub(bt) - >= STREAM_DIRTY_BACKSTOP_INTERVAL.as_millis() as u64 + let age = now_ms().saturating_sub(bt); + age >= lower_ms && age <= UPPER_MS })) }); if should_emit @@ -1268,9 +1285,6 @@ impl OrderBookListener { SnapshotEmission::Provisional, false, // prevent_future_snaps=false: later finalization must still emit ); - if let Some(state) = self.order_book_state.as_mut() { - state.mark_provisional_emitted(); - } true } else { false @@ -1845,22 +1859,23 @@ impl OrderBookListener { /// Test-only equivalent of the production `backstop_ticker.tick()` arm in /// `hl_listen`. Fires a provisional TOB snapshot if the book is dirty and - /// the dirtying event's SOURCE age (`now_ms() - dirty_block_time_ms`) has - /// reached `STREAM_DIRTY_BACKSTOP_INTERVAL`. The source-age gate is - /// ENFORCED (enforce_interval=true), so calling this while the dirty - /// epoch's source time is still fresh (source-age < 250ms) will NOT emit — - /// exercising the real gate. + /// the dirty book's SOURCE age (`now_ms() - dirty_block_time_ms`) is inside + /// the freshness window `[STREAM_DIRTY_BACKSTOP_INTERVAL, + /// CATCHUP_THRESHOLD_MS]`. The window is ENFORCED (enforce_interval=true): + /// calling this while the dirty epoch is still too fresh (source-age < + /// 250ms) OR already stale (source-age > 500ms) will NOT emit. Within the + /// window it re-emits on every call — the backstop is not one-shot. 
#[cfg(test)] pub(crate) fn fire_stream_dirty_backstop_for_test(&mut self) { self.try_emit_stuck_stream_backstop(); } - /// Test-only backstop variant that bypasses ONLY the source-age gate - /// (`now_ms() - dirty_block_time_ms >= STREAM_DIRTY_BACKSTOP_INTERVAL`). - /// All other conditions are real: streaming mode, WS disabled, book_dirty, - /// dirty_source_times present, emits `SnapshotEmission::Provisional`. Use - /// this to exercise the provisional emit path deterministically regardless - /// of the dirty epoch's source age and without a real sleep. + /// Test-only backstop variant that bypasses the WHOLE freshness-window + /// timing gate (force-emit). All other conditions are real: streaming mode, + /// WS disabled, book_dirty, dirty_source_times present, emits + /// `SnapshotEmission::Provisional`. Use this to exercise the provisional + /// emit path deterministically with historical fixture block times + /// (regardless of source age) and without a real sleep. #[cfg(test)] pub(crate) fn fire_stream_dirty_backstop_ignoring_interval_for_test(&mut self) { self.try_emit_stuck_stream_backstop_inner(false); diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index 5511395d..7780c497 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -32,11 +32,6 @@ pub(super) struct OrderBookState { /// reliable times without depending on the take-and-clear /// `OrderBookListener::last_batch_times()`. dirty_source_times: Option<(u64, u64)>, - /// True once the stuck-stream backstop has emitted its single Provisional - /// snapshot for the current dirty epoch. Makes the backstop one-shot per - /// epoch (no per-tick TOB recompute during a stall). Reset by - /// `clear_book_dirty` at epoch close. 
- provisional_emitted_this_epoch: bool, /// Monotonic counter bumped on every book-content mutation (hot-path real /// mutations via `mark_dirty_with_times`, and recovery repairs via /// `replace_coin_from_snapshot`/`remove_coin`). Captured with a cloned @@ -79,7 +74,6 @@ impl Clone for OrderBookState { enable_websocket: self.enable_websocket, book_dirty: self.book_dirty, dirty_source_times: self.dirty_source_times, - provisional_emitted_this_epoch: self.provisional_emitted_this_epoch, mutation_seq: self.mutation_seq, // The tap is intentionally not cloned: the clone is used only for // snapshot validation and must not emit DoB events. @@ -106,7 +100,6 @@ impl OrderBookState { snapped: false, book_dirty: false, dirty_source_times: None, - provisional_emitted_this_epoch: false, mutation_seq: 0, dob_tap: None, } @@ -141,38 +134,20 @@ impl OrderBookState { self.dirty_source_times } - pub(super) const fn provisional_emitted_this_epoch(&self) -> bool { - self.provisional_emitted_this_epoch - } - - pub(super) fn mark_provisional_emitted(&mut self) { - self.provisional_emitted_this_epoch = true; - } - - /// Marks the book dirty and, on the first transition of the current dirty - /// epoch, records the source `(block_time_ms, local_time_ms)`. The - /// stuck-stream backstop gates emission on the source age of this - /// `block_time_ms` (no local `Instant` is stamped). - /// - /// Every real BBO-affecting mutation also re-arms the one-shot backstop - /// guard (`provisional_emitted_this_epoch = false`): a single provisional - /// covers an unchanged dirty state, but a post-provisional mutation during - /// a long stall must be re-published so subscribers are not stranded on a - /// stale BBO. (Recovery never calls this — it is orthogonal to the dirty - /// epoch by design — so recovery cannot re-arm the guard.) 
pub(super) fn mark_dirty_with_times(&mut self, block_time_ms: u64, local_time_ms: u64) { - if !self.book_dirty { - self.dirty_source_times = Some((block_time_ms, local_time_ms)); - } + // Always update to the latest real mutation's source times (not just the + // first dirty transition): the stuck-stream backstop's freshness window + // is measured from the most recent mutation, so ongoing diffs during a + // stall keep emitting fresh provisionals while a no-diff stall correctly + // ages out of the window. + self.dirty_source_times = Some((block_time_ms, local_time_ms)); self.book_dirty = true; - self.provisional_emitted_this_epoch = false; self.mutation_seq = self.mutation_seq.wrapping_add(1); } pub(super) const fn clear_book_dirty(&mut self) { self.book_dirty = false; self.dirty_source_times = None; - self.provisional_emitted_this_epoch = false; } // forcibly take snapshot - (time, height, snapshot) diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index cd15ba4b..13598424 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -264,24 +264,27 @@ async fn finalized_blocks_emit_snapshots_in_order() { } // --------------------------------------------------------------------------- -// Tests 4 + C1b (merged): Stuck-stream backstop — SOURCE-age gate + emit path. +// Tests 4 + C1b (merged): Stuck-stream backstop — freshness window + emit path. // -// The backstop now gates on the dirtying event's SOURCE age -// (`now_ms() - dirty_block_time_ms`) reaching `STREAM_DIRTY_BACKSTOP_INTERVAL` -// (250ms), measured on the same basis as the multicast publisher's freshness -// gate — NOT on local elapsed wall time. 
The test proves the source-age gate -// deterministically with block times taken relative to *now* (no sleeps, no -// mock clock): -// - Case A: a dirty epoch whose source-age is still < 250ms must NOT emit -// under the enforced gate (`fire_stream_dirty_backstop_for_test`). This -// assertion is load-bearing: it proves the source-age gate genuinely -// blocks a still-fresh dirty epoch on the production path. -// - Case B: a dirty epoch whose source-age has reached >= 250ms must emit -// exactly one Provisional snapshot under the enforced gate. -// - Case C: `fire_stream_dirty_backstop_ignoring_interval_for_test()` -// bypasses ONLY the source-age comparison; all other conditions -// (streaming, !ws, book_dirty, dirty_source_times) remain real, so it -// force-emits regardless of source age. +// The backstop emits a provisional only while the dirty book's SOURCE age +// (`now_ms() - dirty_block_time_ms`) is inside the freshness window +// `[STREAM_DIRTY_BACKSTOP_INTERVAL, CATCHUP_THRESHOLD_MS]` = `[250ms, 500ms]`: +// stuck enough to warrant a backstop AND still fresh enough that the publisher +// would actually deliver it. Measured on the same basis as the multicast +// publisher's freshness gate — NOT on local elapsed wall time. The test proves +// the window deterministically with block times taken relative to *now* (no +// sleeps, no mock clock): +// - Case A: source-age ~100ms (< 250ms lower bound) must NOT emit under the +// enforced window. Load-bearing: proves a still-too-fresh dirty epoch is +// blocked on the production path. +// - Case B: source-age ~300ms (inside [250, 500]) emits exactly one +// Provisional snapshot under the enforced window. +// - Case C: source-age ~1000ms (> 500ms upper bound, stale) must NOT emit: +// the publisher would suppress it anyway, so the backstop must not waste a +// recompute. 
+// - Case D: `fire_stream_dirty_backstop_ignoring_interval_for_test()` +// bypasses the WHOLE timing window; all other conditions (streaming, !ws, +// book_dirty, dirty_source_times) remain real, so it force-emits. // --------------------------------------------------------------------------- /// Current epoch millis on the SAME basis as the listener's `now_ms()` @@ -295,17 +298,18 @@ fn now_ms_test() -> u64 { .unwrap_or(0) } -/// L2-5 contract (4): when no later block arrives, the backstop hook -/// (which production fires on a 250ms ticker) emits one Provisional snapshot -/// for the dirty block once the dirtying event's SOURCE age has reached the -/// 250ms backstop interval — but NOT before. -/// -/// Also covers C1b: `fire_stream_dirty_backstop_for_test` gates on the real -/// source-age interval and does not emit while the dirty epoch is still fresh. +/// L2-5 contract (4): the backstop emits a Provisional only while the dirty +/// book's SOURCE age is inside the freshness window `[250ms, 500ms]` — +/// stuck enough to warrant a backstop AND still fresh enough that the publisher +/// would deliver it. Three cases prove the window with deterministic relative +/// block times (no sleeps): too-fresh (no emit), in-window (one Provisional), +/// stale (no emit, the publisher would suppress it so the backstop must not +/// waste a recompute). Case D proves the interval-bypassing seam still +/// force-emits regardless of source age. #[tokio::test(flavor = "current_thread")] async fn stuck_stream_backstop_emits_dirty_snapshot() { - // Case A (gate active — must NOT emit): the dirtying block's source time is - // only ~100ms old (< 250ms), so the enforced source-age gate must block. + // Case A (too fresh — must NOT emit): source-age ~100ms is below the 250ms + // lower bound, so the enforced window blocks. 
{ let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); @@ -317,14 +321,13 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { let before_interval = drain_snapshots(&mut rx); assert!( before_interval.is_empty(), - "backstop must NOT fire while the dirty epoch's source-age (~100ms) is below the 250ms interval; got {} snapshot(s)", + "backstop must NOT fire while the dirty epoch's source-age (~100ms) is below the 250ms lower bound; got {} snapshot(s)", before_interval.len(), ); } - // Case B (gate satisfied — must emit exactly one Provisional): the dirtying - // block's source time is ~300ms old (>= 250ms), so the enforced source-age - // gate is met and the backstop emits one Provisional snapshot. + // Case B (in window — must emit exactly one Provisional): source-age ~300ms + // is inside [250, 500], so the enforced window is satisfied. { let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); @@ -337,7 +340,7 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { assert_eq!( emitted.len(), 1, - "backstop must emit exactly one snapshot once source-age (~300ms) reaches the 250ms interval; got {}", + "backstop must emit exactly one snapshot when source-age (~300ms) is inside the [250,500] window; got {}", emitted.len(), ); assert_eq!( @@ -352,7 +355,26 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { ); } - // Case C (bypass): the interval-ignoring helper force-emits regardless of + // Case C (stale — must NOT emit): source-age ~1000ms is past the 500ms + // upper bound. The publisher would suppress a provisional this stale, so + // the backstop must go silent and not waste a recompute. 
+ { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + let stale_block_time = now_ms_test() - 1000; + let (s2, d2) = add_event(stale_block_time, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, stale_block_time, vec![s2], vec![d2]); + + listener.fire_stream_dirty_backstop_for_test(); + let emitted = drain_snapshots(&mut rx); + assert!( + emitted.is_empty(), + "backstop must NOT fire once the dirty epoch's source-age (~1000ms) is past the 500ms upper bound (publisher would suppress it); got {} snapshot(s)", + emitted.len(), + ); + } + + // Case D (bypass): the interval-ignoring helper force-emits regardless of // source age — here a still-fresh (~50ms) dirty epoch still emits one // Provisional, proving the bypass seam is intact. { @@ -384,56 +406,54 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { } // --------------------------------------------------------------------------- -// Test 4b: the stuck-stream backstop is ONE-SHOT per dirty epoch. +// Test 4b (iter-15 regression): the backstop RE-EMITS within the fresh window +// to retry a transport-dropped provisional. // -// During a multi-second stall the production 250ms ticker re-runs the backstop -// many times for the same stuck block. The backstop must emit AT MOST ONE -// Provisional snapshot per dirty epoch: the per-epoch -// `provisional_emitted_this_epoch` flag suppresses every later tick within the -// same stuck block (no repeated full TOB recompute / quote spam). The flag is -// cleared only at finalization by `clear_book_dirty`, so the authoritative -// finalization snapshot for that block is unaffected and the next dirty epoch -// starts fresh. +// The listener cannot observe whether the publisher's multicast broadcast +// actually delivered an earlier provisional, so it must NOT latch a one-shot at +// enqueue. 
Two consecutive backstop ticks inside the fresh window — with NO +// intervening book change — must each emit a Provisional, so a fresh +// provisional dropped in transit is re-delivered on the next tick. // --------------------------------------------------------------------------- -/// L2-5 contract (4) one-shot guard: under the enforced source-age gate, two -/// consecutive backstop ticks within the same dirty epoch emit exactly ONE -/// Provisional snapshot (the second tick is a no-op). Finalizing the block -/// afterwards still emits the Authoritative snapshot — proving the one-shot -/// guard does not suppress finalization (the epoch's `clear_book_dirty` reset -/// the flag). +/// iter-15 regression: two consecutive enforced backstop ticks within the +/// fresh window, with no intervening mutation, must emit TWO Provisional +/// snapshots. This proves the backstop is NOT one-shot: the listener cannot +/// observe broadcast delivery, so it re-emits each tick while fresh to +/// re-deliver a transport-dropped provisional. #[tokio::test(flavor = "current_thread")] -async fn backstop_is_one_shot_per_dirty_epoch() { +async fn backstop_re_emits_within_fresh_window_to_retry_drops() { let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); - // Dirty block 2 whose source-age is ~300ms (>= 250ms), so the enforced - // source-age gate is satisfied. The same relative time is passed to both - // `add_event` and `feed_block`. + // Dirty block 2 whose source-age is ~300ms (inside [250,500]). The same + // relative time is passed to both `add_event` and `feed_block`. let aged_block_time = now_ms_test() - 300; let (s2, d2) = add_event(aged_block_time, Side::Bid, 101, "100", "5"); feed_block(&mut listener, 2, aged_block_time, vec![s2], vec![d2]); - // Fire the enforced backstop TWICE in the same dirty epoch. + // Fire the enforced backstop TWICE with NO intervening change. 
listener.fire_stream_dirty_backstop_for_test(); listener.fire_stream_dirty_backstop_for_test(); let emitted = drain_snapshots(&mut rx); assert_eq!( emitted.len(), - 1, - "backstop must emit exactly one Provisional per dirty epoch (the second tick is a no-op); got {}", + 2, + "backstop must re-emit each tick while fresh (NOT one-shot) so a dropped fresh provisional is retried; got {}", emitted.len(), ); - assert_eq!( - snapshot_emission(&emitted[0]).expect("snapshot has emission"), - super::SnapshotEmission::Provisional, - "the single backstop snapshot must be Provisional", - ); - assert_eq!(snapshot_height(&emitted[0]), 2, "backstop snapshot must carry height 2"); + for snap in &emitted { + assert_eq!( + snapshot_emission(snap).expect("snapshot has emission"), + super::SnapshotEmission::Provisional, + "every re-emitted backstop snapshot must be Provisional", + ); + assert_eq!(snapshot_height(snap), 2, "backstop snapshot must carry height 2"); + } - // Block 3 finalizes block 2. The one-shot guard must NOT suppress the - // authoritative finalization snapshot — `clear_book_dirty` at epoch close - // resets `provisional_emitted_this_epoch`. + // Block 3 finalizes block 2. The non-one-shot backstop must NOT suppress + // the authoritative finalization snapshot — `clear_book_dirty` at epoch + // close still owns the lifecycle. 
let finalize_block_time = now_ms_test() - 100; let (s3, d3) = add_event(finalize_block_time, Side::Bid, 103, "90", "1"); feed_block(&mut listener, 3, finalize_block_time, vec![s3], vec![d3]); @@ -447,27 +467,28 @@ async fn backstop_is_one_shot_per_dirty_epoch() { assert_eq!( height_2_authoritative.len(), 1, - "finalization must still emit exactly one Authoritative snapshot for height 2 after a one-shot backstop; got {}", + "finalization must still emit exactly one Authoritative snapshot for height 2 after backstop re-emits; got {}", height_2_authoritative.len(), ); } // --------------------------------------------------------------------------- -// Test 4c (Codex finding): the one-shot guard RE-ARMS on a post-provisional -// BBO mutation, so a long stall with ongoing diffs does not strand subscribers +// Test 4c (Codex finding): a post-provisional BBO mutation keeps the freshness +// window fresh, so a long stall with ongoing diffs does not strand subscribers // on a stale provisional. // --------------------------------------------------------------------------- -/// Codex finding: the one-shot backstop guard must RE-ARM on a post-provisional -/// BBO mutation, so a long stall with ongoing diffs does not strand subscribers -/// on a stale provisional. First provisional emits; a later same-height -/// BBO-changing diff (block still unfinalized) must let the next backstop tick -/// emit a SECOND provisional carrying the updated BBO. +/// Codex finding: under the freshness-window model, re-emission is inherent +/// (not gated by a re-arm guard). `dirty_source_times` tracks the LATEST +/// mutation, so a later same-height BBO-changing diff (block still unfinalized) +/// refreshes the window and the next backstop tick emits a SECOND provisional +/// carrying the updated BBO — a long stall with ongoing diffs does not strand +/// subscribers on a stale provisional. 
#[tokio::test(flavor = "current_thread")] async fn backstop_re_emits_after_post_provisional_mutation() { let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); - // Stuck block 2, source-age ~300ms (>= 250ms gate). First BBO = bid 100. + // Stuck block 2, source-age ~300ms (inside [250,500] window). First BBO = bid 100. let bt1 = now_ms_test() - 300; let (s2, d2) = add_event(bt1, Side::Bid, 101, "100", "5"); feed_block(&mut listener, 2, bt1, vec![s2], vec![d2]); @@ -483,14 +504,15 @@ async fn backstop_re_emits_after_post_provisional_mutation() { let (s2b, d2b) = add_event(bt2, Side::Bid, 102, "110", "3"); feed_block(&mut listener, 2, bt2, vec![s2b], vec![d2b]); - // The post-provisional mutation must have re-armed the guard: the next - // backstop tick emits a SECOND provisional with the UPDATED BBO. + // The post-provisional mutation refreshed `dirty_source_times` (keeping the + // window fresh): the next backstop tick emits a SECOND provisional with the + // UPDATED BBO. listener.fire_stream_dirty_backstop_for_test(); let second = drain_snapshots(&mut rx); assert_eq!(second.len(), 1, "post-mutation backstop must re-emit one provisional; got {}", second.len()); assert_eq!(snapshot_emission(&second[0]), Some(super::SnapshotEmission::Provisional)); let bid_2 = snapshot_best_bid(&second[0]).expect("second provisional has a bid"); - assert_ne!(bid_1, bid_2, "second provisional must carry the updated BBO (re-arm worked); bid_1={bid_1:?} bid_2={bid_2:?}"); + assert_ne!(bid_1, bid_2, "second provisional must carry the updated BBO (window refreshed by latest mutation); bid_1={bid_1:?} bid_2={bid_2:?}"); // Sanity: finalization still emits the authoritative snapshot. 
let bt3 = now_ms_test() - 100; diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index b25f5574..eb7951d7 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -1054,7 +1054,15 @@ impl MulticastPublisher { enqueued_at_ms, emission, } => { - cached_snapshot = Some(msg.clone()); + // Never cache a Provisional for the periodic + // snapshot resend: it is a partial, mid-stall + // book and the interval path rebroadcasts the + // cache as a full/authoritative snapshot. Only + // Authoritative/Correction may be periodically + // resent. + if !matches!(emission, SnapshotEmission::Provisional) { + cached_snapshot = Some(msg.clone()); + } let now_ms = Self::now_ms(); let queue_delay_ms = now_ms.saturating_sub(*enqueued_at_ms); let listener_to_publisher_ms = now_ms.saturating_sub(*source_local_time_ms); From 271b74f3bb738254c6294e0849e67b150c55f225 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 13:29:27 -0400 Subject: [PATCH 49/65] publisher: a provisional publish must not mark caught-up (no stale authoritative resend mid-stall) --- server/src/multicast/publisher.rs | 47 ++++++++++++++++++------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index eb7951d7..2e45f1a6 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -575,17 +575,17 @@ impl MulticastPublisher { } } - /// `caught_up` means the most recent publish was genuinely fresh and the - /// publisher is keeping up — it gates the periodic `snapshot_interval` - /// resend of the cached snapshot. 
A *forced-stale* publish (an - /// `Authoritative` superseding a pending provisional, or a recovery - /// `Correction`, with `fresh == false`) delivers required data exactly - /// once but does NOT mean we are caught up; it must clear `caught_up` so - /// the periodic resend does not rebroadcast stale quotes during a stall or - /// recovery catch-up. Therefore the post-publish state is exactly `fresh` - /// (the suppress path independently sets it `false`). - const fn caught_up_after_publish(fresh: bool) -> bool { - fresh + /// `caught_up` gates the periodic `snapshot_interval` rebroadcast of the + /// cached authoritative snapshot. It must be true ONLY when the most recent + /// publish was a genuinely fresh `Authoritative` or `Correction` (a real, + /// full snapshot we are keeping up with). A `Provisional` is partial mid- + /// stall state that is intentionally NOT cached, so flipping `caught_up` + /// on it would let the interval path rebroadcast the OLDER cached + /// authoritative and roll subscribers back during the stall. A forced- + /// stale Authoritative/Correction (`fresh == false`) also must not enable + /// periodic resends. Hence: caught up iff fresh AND not a Provisional. + const fn caught_up_after_publish(emission: SnapshotEmission, fresh: bool) -> bool { + fresh && !matches!(emission, SnapshotEmission::Provisional) } fn should_warn_for_receiver_lag(lag_ms: u64) -> bool { @@ -1110,7 +1110,7 @@ impl MulticastPublisher { Duration::from_millis(lag_ms), ); let was_caught_up = caught_up; - caught_up = Self::caught_up_after_publish(fresh); + caught_up = Self::caught_up_after_publish(*emission, fresh); if caught_up && !was_caught_up { info!( "tob marketdata caught up: publishing quote frames source={} height={} source_lag_ms={} dest={}", @@ -2117,15 +2117,22 @@ mod tests { ); } - /// `caught_up` after a publish is exactly `fresh`. 
A fresh publish enters - /// caught-up; a forced-stale supersede/Correction (`fresh == false`) clears - /// it so the periodic snapshot resend does not rebroadcast stale quotes. - /// Scenario locked in: fresh Provisional (fresh=true → caught_up=true) then - /// stale forced Authoritative (fresh=false → caught_up=false → no periodic - /// stale resend). + /// `caught_up` after a publish: true ONLY for a fresh Authoritative or + /// Correction. A Provisional (even fresh) must NOT enable periodic resend + /// (it would rebroadcast the older cached authoritative during a stall and + /// roll subscribers back). A forced-stale publish (fresh=false) also stays + /// not-caught-up. #[test] fn caught_up_after_publish_matrix() { - assert!(MulticastPublisher::caught_up_after_publish(true)); - assert!(!MulticastPublisher::caught_up_after_publish(false)); + use SnapshotEmission::{Authoritative, Correction, Provisional}; + // fresh authoritative / correction -> caught up + assert!(MulticastPublisher::caught_up_after_publish(Authoritative, true)); + assert!(MulticastPublisher::caught_up_after_publish(Correction, true)); + // fresh provisional -> NOT caught up (must not trigger periodic resend) + assert!(!MulticastPublisher::caught_up_after_publish(Provisional, true)); + // stale (forced) anything -> NOT caught up + assert!(!MulticastPublisher::caught_up_after_publish(Authoritative, false)); + assert!(!MulticastPublisher::caught_up_after_publish(Correction, false)); + assert!(!MulticastPublisher::caught_up_after_publish(Provisional, false)); } } From 793837e770ece499553570a4994d1f350426e2ed Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 13:45:04 -0400 Subject: [PATCH 50/65] backstop: dedup provisional by mutation seq instead of a phase-skippable freshness window --- server/src/listeners/order_book/mod.rs | 156 +++++++------ server/src/listeners/order_book/state.rs | 35 ++- .../order_book/stream_finalization_tests.rs | 207 ++++++++---------- 3 files changed, 206 
insertions(+), 192 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index e2a55d04..6bd0d765 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -78,14 +78,15 @@ const STREAM_REORDER_FALLBACK_WINDOW: Duration = Duration::from_millis(50); /// fires. const STREAM_DIRTY_BACKSTOP_INTERVAL: Duration = Duration::from_millis(250); -// The backstop emits a provisional only while the dirty book's source age is -// inside the freshness window `[STREAM_DIRTY_BACKSTOP_INTERVAL, -// CATCHUP_THRESHOLD_MS]` = `[250ms, 500ms]`: stuck enough to warrant a backstop -// AND still fresh enough that the publisher would actually deliver it. The -// invariant `2*INTERVAL <= CATCHUP_THRESHOLD_MS` guarantees that window is -// non-empty (lower bound 250ms <= upper bound 500ms) with margin to spare, so a -// stalled-but-fresh dirty epoch always has at least one tick inside the window -// to be re-published before it ages past the publisher's suppression cutoff. +// The backstop emits a provisional once the dirty book's source age reaches +// `STREAM_DIRTY_BACKSTOP_INTERVAL` (no upper-bound window — a fixed-period +// ticker not phase-aligned to the dirty epoch could step OVER a bounded window +// entirely; CPU is instead bounded by per-content dedup on `mutation_seq`). +// The invariant `2*INTERVAL <= CATCHUP_THRESHOLD_MS` documents that a +// provisional emitted right at the >=INTERVAL stuck threshold still has at +// least INTERVAL of margin before the publisher's freshness cutoff, so the +// FIRST attempt for a freshly-stuck epoch is normally still deliverable (the +// publisher's own freshness gate, not the listener, decides staleness). 
const _BACKSTOP_VS_FRESHNESS_INVARIANT: () = assert!( STREAM_DIRTY_BACKSTOP_INTERVAL.as_millis() * 2 <= crate::multicast::publisher::MulticastPublisher::CATCHUP_THRESHOLD_MS as u128, @@ -1207,71 +1208,78 @@ impl OrderBookListener { self.emit_authoritative_block_snapshot(&block); } - /// Stuck-stream backstop: if streaming + WS-disabled and the dirty book's - /// SOURCE age (`now_ms() - dirty_block_time_ms`) is inside the freshness - /// window `[STREAM_DIRTY_BACKSTOP_INTERVAL, CATCHUP_THRESHOLD_MS]`, emit a - /// provisional snapshot (does NOT clear book_dirty, so the eventual - /// authoritative finalization snapshot still fires). The window is measured - /// in source time (not local elapsed) so it matches the multicast - /// publisher's freshness gate. Shared by the hl_listen backstop ticker arm - /// and the test helper so the two cannot drift. + /// Stuck-stream backstop: if streaming + WS-disabled, the book is dirty, + /// the dirty epoch has been stuck for at least + /// `STREAM_DIRTY_BACKSTOP_INTERVAL` of SOURCE time + /// (`now_ms() - dirty_block_time_ms`), AND no provisional has yet been + /// attempted for the current dirty-content state, emit a Provisional + /// snapshot (does NOT clear book_dirty, so the eventual authoritative + /// finalization snapshot still fires). Shared by the hl_listen backstop + /// ticker arm and the test helper so the two cannot drift. /// - /// Re-emits a Provisional on EVERY tick while the dirty book is inside the - /// window — it is deliberately NOT one-shot. The listener cannot observe - /// whether the publisher's multicast broadcast actually delivered an earlier - /// provisional (the staleness gate is applied later with a different clock), - /// so latching a one-shot at enqueue would strand subscribers if a fresh - /// provisional was dropped in transit. 
Re-emitting every ~250ms tick within - /// the ~250ms-wide window is bounded (≤~2 emits per stuck gap → CPU bound) - /// and retries a transport-dropped fresh provisional. Once the source ages - /// past `CATCHUP_THRESHOLD_MS` the publisher would suppress it anyway, so - /// the backstop goes silent (no wasted recompute). `dirty_source_times` - /// tracks the LATEST mutation, so ongoing diffs during a stall keep the - /// window fresh and re-publish updated state, while a no-diff stall ages out - /// and the backstop correctly stops. The authoritative finalization snapshot - /// (owned by `emit_authoritative_block_snapshot` via `clear_book_dirty`) is - /// unaffected. + /// NO upper-bound window. Production drives this from a fixed ~250ms ticker + /// that is not phase-aligned to the dirty epoch; a bounded + /// `[INTERVAL, CATCHUP_THRESHOLD_MS]` window of ~INTERVAL width could be + /// phase-skipped entirely (a mutation lands just after a tick → the next + /// tick is already past the upper bound → no provisional ever enqueued for + /// that epoch). Removing the upper bound guarantees every stuck epoch gets + /// a provisional attempt as soon as it is stuck >= INTERVAL. CPU is bounded + /// instead by CONTENT de-duplication on the monotonic `mutation_seq`: at + /// most one Provisional per distinct dirty-content state. A no-new-diff + /// stall keeps `mutation_seq` static → exactly one emit then deduped (no + /// per-tick recompute). A stall with ongoing diffs bumps `mutation_seq` on + /// every real mutation → the next tick emits ONE updated provisional + /// (subscribers see current state). A transport-dropped provisional of + /// UNCHANGED content is intentionally NOT retried — that is accepted + /// best-effort for a degraded-stream safety net; the publisher-authority + /// supersede + fresh-Authoritative/periodic-resend machinery resyncs + /// subscribers when the stream recovers. The publisher's own freshness gate + /// still suppresses a stale provisional. 
The authoritative finalization + /// snapshot (owned by `emit_authoritative_block_snapshot` via + /// `clear_book_dirty`, which also resets the dedup tracker) is unaffected. fn try_emit_stuck_stream_backstop(&mut self) -> bool { self.try_emit_stuck_stream_backstop_inner(true) } /// Inner implementation. `enforce_interval = true` in all production paths - /// (ticker arm, existing test helper) gates emission on the dirty book's - /// source age being inside the freshness window - /// `[STREAM_DIRTY_BACKSTOP_INTERVAL, CATCHUP_THRESHOLD_MS]`. - /// `enforce_interval = false` is exposed only via the `#[cfg(test)]` helper - /// below and bypasses the WHOLE timing window (force-emit), leaving every - /// other backstop condition real. + /// (ticker arm, existing test helper) gates emission on the dirty epoch + /// having been stuck for at least `STREAM_DIRTY_BACKSTOP_INTERVAL` of + /// source time (lower bound only — there is NO upper bound, see + /// `try_emit_stuck_stream_backstop`). `enforce_interval = false` is exposed + /// only via the `#[cfg(test)]` helper below and bypasses the stuck-time + /// gate (force-emit), leaving every other backstop condition real + /// (including the per-content dedup). /// - /// Not one-shot: regardless of `enforce_interval`, emission is NOT latched — - /// every tick inside the fresh window re-emits a Provisional so a - /// broadcast-dropped fresh provisional is retried (the listener cannot - /// observe publisher delivery). Re-emits stop once the source ages past the - /// freshness threshold (the publisher would suppress it anyway). The + /// Emission is bounded by content dedup on `mutation_seq`, not by tick + /// rate: at most one Provisional per distinct dirty-content state. A + /// no-diff stall recomputes once then dedups (CPU bound); an ongoing-diff + /// stall re-emits updated state because each real mutation bumps + /// `mutation_seq` (clearing the dedup). 
A transport-dropped provisional of + /// unchanged content is intentionally not retried — recovery is handled by + /// the publisher's freshness gate + supersede/resync machinery. The /// authoritative finalization snapshot (`clear_book_dirty`) is unaffected. fn try_emit_stuck_stream_backstop_inner(&mut self, enforce_interval: bool) -> bool { - // Freshness window: emit a provisional while the dirty book is BOTH - // stuck enough to warrant a backstop (source age >= INTERVAL) AND still - // fresh enough that the publisher would actually deliver it (source age - // <= CATCHUP_THRESHOLD_MS). Re-emitting every tick inside this ~INTERVAL- - // wide window is bounded (CPU) and retries a broadcast-dropped fresh - // provisional (the listener cannot observe publisher delivery, so it - // must not latch a one-shot at enqueue). Once the source ages past the - // freshness threshold the publisher would suppress it anyway, so the - // backstop goes silent (no wasted recompute). `dirty_source_times` - // tracks the LATEST mutation, so ongoing diffs during a stall keep the - // window fresh and re-publish updated state. - const UPPER_MS: u64 = - crate::multicast::publisher::MulticastPublisher::CATCHUP_THRESHOLD_MS; + // Emit a provisional once the dirty book is stuck >= INTERVAL of source + // time (lower bound only — NO upper bound: a fixed-period ticker not + // phase-aligned to the dirty epoch could phase-skip a bounded window + // entirely, so a stuck epoch could get no provisional at all). CPU is + // bounded instead by per-content dedup on the monotonic `mutation_seq`: + // at most one Provisional per distinct dirty-content state. A no-new- + // diff stall keeps `mutation_seq` static → one emit then deduped; an + // ongoing-diff stall bumps `mutation_seq` each real mutation → + // re-emits updated state on the next tick. 
A transport-dropped + // provisional of unchanged content is intentionally not retried (the + // publisher's supersede/resync machinery handles recovery); the + // publisher's own freshness gate still suppresses stale provisionals. let lower_ms = STREAM_DIRTY_BACKSTOP_INTERVAL.as_millis() as u64; let should_emit = self.ingest_mode == IngestMode::Stream && !self.enable_websocket && self.order_book_state.as_ref().is_some_and(|s| { s.book_dirty() + && !s.provisional_already_attempted_for_current_content() && (!enforce_interval || s.dirty_source_times().is_some_and(|(bt, _)| { - let age = now_ms().saturating_sub(bt); - age >= lower_ms && age <= UPPER_MS + now_ms().saturating_sub(bt) >= lower_ms })) }); if should_emit @@ -1285,6 +1293,9 @@ impl OrderBookListener { SnapshotEmission::Provisional, false, // prevent_future_snaps=false: later finalization must still emit ); + if let Some(state) = self.order_book_state.as_mut() { + state.mark_provisional_attempt(); + } true } else { false @@ -1858,24 +1869,29 @@ impl OrderBookListener { } /// Test-only equivalent of the production `backstop_ticker.tick()` arm in - /// `hl_listen`. Fires a provisional TOB snapshot if the book is dirty and - /// the dirty book's SOURCE age (`now_ms() - dirty_block_time_ms`) is inside - /// the freshness window `[STREAM_DIRTY_BACKSTOP_INTERVAL, - /// CATCHUP_THRESHOLD_MS]`. The window is ENFORCED (enforce_interval=true): - /// calling this while the dirty epoch is still too fresh (source-age < - /// 250ms) OR already stale (source-age > 500ms) will NOT emit. Within the - /// window it re-emits on every call — the backstop is not one-shot. + /// `hl_listen`. Fires a provisional TOB snapshot if the book is dirty, the + /// dirty epoch has been stuck for at least `STREAM_DIRTY_BACKSTOP_INTERVAL` + /// of SOURCE time (`now_ms() - dirty_block_time_ms` >= 250ms; lower bound + /// only — there is NO upper bound), AND no provisional has yet been + /// attempted for the current dirty-content state. 
The stuck-time lower + /// bound is ENFORCED (enforce_interval=true): calling this while the dirty + /// epoch is still too fresh (source-age < 250ms) will NOT emit. Emission is + /// content-deduped on `mutation_seq`: a second call with no intervening + /// mutation does NOT re-emit (this is the CPU bound that replaced the old + /// upper-bound window); a real mutation bumps `mutation_seq` and the next + /// call re-emits updated state. #[cfg(test)] pub(crate) fn fire_stream_dirty_backstop_for_test(&mut self) { self.try_emit_stuck_stream_backstop(); } - /// Test-only backstop variant that bypasses the WHOLE freshness-window - /// timing gate (force-emit). All other conditions are real: streaming mode, - /// WS disabled, book_dirty, dirty_source_times present, emits - /// `SnapshotEmission::Provisional`. Use this to exercise the provisional - /// emit path deterministically with historical fixture block times - /// (regardless of source age) and without a real sleep. + /// Test-only backstop variant that bypasses the stuck-time lower-bound gate + /// (force-emit). All other conditions are real: streaming mode, WS + /// disabled, book_dirty, dirty_source_times present, the per-content dedup + /// on `mutation_seq` (so a second force-call with no intervening mutation + /// does NOT re-emit), emits `SnapshotEmission::Provisional`. Use this to + /// exercise the provisional emit path deterministically with historical + /// fixture block times (regardless of source age) and without a real sleep. 
#[cfg(test)] pub(crate) fn fire_stream_dirty_backstop_ignoring_interval_for_test(&mut self) { self.try_emit_stuck_stream_backstop_inner(false); diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index 7780c497..6f4ccfc0 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -40,6 +40,12 @@ pub(super) struct OrderBookState { /// between clone and apply (height alone is insufficient — streaming /// applies many diffs at the same block number). mutation_seq: u64, + /// `mutation_seq` value at the last stuck-stream backstop provisional + /// attempt for the current dirty epoch. The backstop emits at most one + /// provisional per distinct dirty-content state (dedup by `mutation_seq`), + /// so a static stall does not recompute every tick while ongoing diffs + /// still re-publish updated state. Reset (None) at epoch close. + last_provisional_attempt_seq: Option<u64>, /// Present when the DoB emitter is wired in. The tap is NOT propagated to /// the cloned copy used for snapshot validation (validation reads only; no /// events should be emitted from it). @@ -75,6 +81,7 @@ impl Clone for OrderBookState { book_dirty: self.book_dirty, dirty_source_times: self.dirty_source_times, mutation_seq: self.mutation_seq, + last_provisional_attempt_seq: self.last_provisional_attempt_seq, // The tap is intentionally not cloned: the clone is used only for // snapshot validation and must not emit DoB events. dob_tap: None, @@ -101,6 +108,7 @@ impl OrderBookState { book_dirty: false, dirty_source_times: None, mutation_seq: 0, + last_provisional_attempt_seq: None, dob_tap: None, } } @@ -118,6 +126,21 @@ impl OrderBookState { self.mutation_seq } + /// True iff a stuck-stream backstop provisional has already been attempted + /// for the CURRENT dirty-content state (same `mutation_seq`). The backstop
The backstop + /// uses this to emit at most one provisional per distinct content state: + /// a no-new-diff stall recomputes once then dedups (CPU bound), while any + /// real mutation bumps `mutation_seq` and clears this automatically so an + /// ongoing-diff stall re-emits updated state on the next tick. + pub(super) const fn provisional_already_attempted_for_current_content(&self) -> bool { + matches!(self.last_provisional_attempt_seq, Some(s) if s == self.mutation_seq) + } + + /// Records that a provisional was attempted for the current content state. + pub(super) const fn mark_provisional_attempt(&mut self) { + self.last_provisional_attempt_seq = Some(self.mutation_seq); + } + /// Returns the timestamp of the most-recent order book update, mirroring /// what `l2_snapshots` returns as `snapshot.0`. Used by /// `emit_tob_snapshot` to compute the default source times when no file @@ -136,10 +159,10 @@ impl OrderBookState { pub(super) fn mark_dirty_with_times(&mut self, block_time_ms: u64, local_time_ms: u64) { // Always update to the latest real mutation's source times (not just the - // first dirty transition): the stuck-stream backstop's freshness window - // is measured from the most recent mutation, so ongoing diffs during a - // stall keep emitting fresh provisionals while a no-diff stall correctly - // ages out of the window. + // first dirty transition): the stuck-stream backstop emits the most + // recent mutation's source times, so an ongoing-diff stall re-publishes + // current state. The mutation_seq bump below also clears the backstop's + // per-content dedup, so the next tick re-emits an updated provisional. 
self.dirty_source_times = Some((block_time_ms, local_time_ms)); self.book_dirty = true; self.mutation_seq = self.mutation_seq.wrapping_add(1); @@ -148,6 +171,10 @@ impl OrderBookState { pub(super) const fn clear_book_dirty(&mut self) { self.book_dirty = false; self.dirty_source_times = None; + // Epoch close resets the dedup tracker so the next dirty epoch's first + // stuck state emits a provisional (the backstop dedups by mutation_seq + // only WITHIN an epoch). + self.last_provisional_attempt_seq = None; } // forcibly take snapshot - (time, height, snapshot) diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index 13598424..e2d7e045 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -264,27 +264,35 @@ async fn finalized_blocks_emit_snapshots_in_order() { } // --------------------------------------------------------------------------- -// Tests 4 + C1b (merged): Stuck-stream backstop — freshness window + emit path. +// Tests 4 + C1b (merged): Stuck-stream backstop — stuck-time lower bound + +// per-content dedup (NO upper-bound window). // -// The backstop emits a provisional only while the dirty book's SOURCE age -// (`now_ms() - dirty_block_time_ms`) is inside the freshness window -// `[STREAM_DIRTY_BACKSTOP_INTERVAL, CATCHUP_THRESHOLD_MS]` = `[250ms, 500ms]`: -// stuck enough to warrant a backstop AND still fresh enough that the publisher -// would actually deliver it. Measured on the same basis as the multicast -// publisher's freshness gate — NOT on local elapsed wall time. The test proves -// the window deterministically with block times taken relative to *now* (no +// The backstop emits a Provisional once the dirty book's SOURCE age +// (`now_ms() - dirty_block_time_ms`) reaches `STREAM_DIRTY_BACKSTOP_INTERVAL` +// (250ms) — a LOWER bound only. 
There is NO upper-bound window: a fixed-period +// ticker not phase-aligned to the dirty epoch could phase-skip a bounded +// window entirely, leaving a stuck epoch with no provisional at all. CPU is +// bounded instead by per-content dedup on the monotonic `mutation_seq`: at +// most one Provisional per distinct dirty-content state. The publisher (not +// the listener) decides staleness via its own freshness gate. The test proves +// the contract deterministically with block times taken relative to *now* (no // sleeps, no mock clock): // - Case A: source-age ~100ms (< 250ms lower bound) must NOT emit under the -// enforced window. Load-bearing: proves a still-too-fresh dirty epoch is -// blocked on the production path. -// - Case B: source-age ~300ms (inside [250, 500]) emits exactly one -// Provisional snapshot under the enforced window. -// - Case C: source-age ~1000ms (> 500ms upper bound, stale) must NOT emit: -// the publisher would suppress it anyway, so the backstop must not waste a -// recompute. +// enforced lower bound. Load-bearing: proves a still-too-fresh dirty epoch +// is blocked on the production path. +// - Case B: source-age ~300ms (>= 250ms) first call emits exactly one +// Provisional snapshot under the enforced lower bound. +// - Case B2: an immediate SECOND call with no intervening diff (same +// `mutation_seq`) does NOT emit — content-dedup, the CPU bound that +// replaced the deleted upper-bound window (explicitly NOT the old +// "retry every tick" behavior). +// - Case C: source-age ~1000ms (well past the OLD 500ms upper bound) still +// emits exactly one Provisional — the upper bound is intentionally gone +// (no phase-miss); the publisher, not the listener, decides staleness. // - Case D: `fire_stream_dirty_backstop_ignoring_interval_for_test()` -// bypasses the WHOLE timing window; all other conditions (streaming, !ws, -// book_dirty, dirty_source_times) remain real, so it force-emits. 
+// bypasses the stuck-time lower bound; all other conditions (streaming, +// !ws, book_dirty, dirty_source_times, per-content dedup) remain real, so +// the first call force-emits. // --------------------------------------------------------------------------- /// Current epoch millis on the SAME basis as the listener's `now_ms()` @@ -298,18 +306,20 @@ fn now_ms_test() -> u64 { .unwrap_or(0) } -/// L2-5 contract (4): the backstop emits a Provisional only while the dirty -/// book's SOURCE age is inside the freshness window `[250ms, 500ms]` — -/// stuck enough to warrant a backstop AND still fresh enough that the publisher -/// would deliver it. Three cases prove the window with deterministic relative -/// block times (no sleeps): too-fresh (no emit), in-window (one Provisional), -/// stale (no emit, the publisher would suppress it so the backstop must not -/// waste a recompute). Case D proves the interval-bypassing seam still -/// force-emits regardless of source age. +/// L2-5 contract (4): the backstop emits a Provisional once the dirty book's +/// SOURCE age reaches the 250ms lower bound (NO upper bound), at most once per +/// distinct dirty-content state (dedup on `mutation_seq`). Four cases prove +/// the contract with deterministic relative block times (no sleeps): +/// too-fresh (no emit), stuck >= 250ms first call (one Provisional), an +/// immediate second call with no intervening diff (NO emit — content-dedup, +/// the CPU bound that replaced the deleted upper-bound window), and well past +/// the OLD 500ms upper bound (still emits — the upper bound is intentionally +/// gone; the publisher, not the listener, decides staleness). Case D proves +/// the lower-bound-bypassing seam still force-emits on its first call. #[tokio::test(flavor = "current_thread")] async fn stuck_stream_backstop_emits_dirty_snapshot() { // Case A (too fresh — must NOT emit): source-age ~100ms is below the 250ms - // lower bound, so the enforced window blocks. 
+ // lower bound, so the enforced lower bound blocks. { let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); @@ -326,8 +336,11 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { ); } - // Case B (in window — must emit exactly one Provisional): source-age ~300ms - // is inside [250, 500], so the enforced window is satisfied. + // Case B (stuck >= 250ms, first call — must emit exactly one Provisional) + // + Case B2 (immediate second call, no intervening diff — must NOT emit: + // content-dedup on `mutation_seq`; this is the CPU bound that replaced the + // deleted upper-bound window, and is explicitly NOT the old retry-each-tick + // behavior). { let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); @@ -340,7 +353,7 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { assert_eq!( emitted.len(), 1, - "backstop must emit exactly one snapshot when source-age (~300ms) is inside the [250,500] window; got {}", + "backstop must emit exactly one snapshot when source-age (~300ms) is >= the 250ms lower bound (first content state); got {}", emitted.len(), ); assert_eq!( @@ -353,11 +366,24 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { 2, "backstop snapshot must carry the last applied height (2)", ); + + // Case B2: a SECOND enforced call immediately after, with NO + // intervening diff (same `mutation_seq`) → NO emit. Content-dedup is + // the CPU bound that replaced the deleted upper-bound window — and is + // explicitly NOT the old "retry every tick" behavior. + listener.fire_stream_dirty_backstop_for_test(); + let deduped = drain_snapshots(&mut rx); + assert!( + deduped.is_empty(), + "second backstop call with no intervening diff must be content-deduped (same mutation_seq) and emit nothing; got {} snapshot(s)", + deduped.len(), + ); } - // Case C (stale — must NOT emit): source-age ~1000ms is past the 500ms - // upper bound. 
The publisher would suppress a provisional this stale, so - // the backstop must go silent and not waste a recompute. + // Case C (well past the OLD 500ms upper bound — must STILL emit): the + // upper bound is intentionally gone (a fixed-period ticker could + // phase-skip a bounded window). source-age ~1000ms still emits exactly one + // Provisional; the publisher, not the listener, decides staleness. { let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); @@ -367,16 +393,27 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { listener.fire_stream_dirty_backstop_for_test(); let emitted = drain_snapshots(&mut rx); - assert!( - emitted.is_empty(), - "backstop must NOT fire once the dirty epoch's source-age (~1000ms) is past the 500ms upper bound (publisher would suppress it); got {} snapshot(s)", + assert_eq!( emitted.len(), + 1, + "backstop must STILL emit a Provisional at source-age ~1000ms (NO upper bound — the publisher, not the listener, decides staleness); got {}", + emitted.len(), + ); + assert_eq!( + snapshot_emission(&emitted[0]).expect("snapshot has emission"), + super::SnapshotEmission::Provisional, + "backstop snapshot must emit Provisional", + ); + assert_eq!( + snapshot_height(&emitted[0]), + 2, + "backstop snapshot must carry the last applied height (2)", ); } - // Case D (bypass): the interval-ignoring helper force-emits regardless of - // source age — here a still-fresh (~50ms) dirty epoch still emits one - // Provisional, proving the bypass seam is intact. + // Case D (bypass): the lower-bound-ignoring helper force-emits on its + // first call regardless of source age — here a still-fresh (~50ms) dirty + // epoch still emits one Provisional, proving the bypass seam is intact. 
{ let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); @@ -389,7 +426,7 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { assert_eq!( emitted.len(), 1, - "interval-bypassing backstop must force-emit one snapshot regardless of source age; got {}", + "lower-bound-bypassing backstop must force-emit one snapshot regardless of source age; got {}", emitted.len(), ); assert_eq!( @@ -406,89 +443,23 @@ async fn stuck_stream_backstop_emits_dirty_snapshot() { } // --------------------------------------------------------------------------- -// Test 4b (iter-15 regression): the backstop RE-EMITS within the fresh window -// to retry a transport-dropped provisional. -// -// The listener cannot observe whether the publisher's multicast broadcast -// actually delivered an earlier provisional, so it must NOT latch a one-shot at -// enqueue. Two consecutive backstop ticks inside the fresh window — with NO -// intervening book change — must each emit a Provisional, so a fresh -// provisional dropped in transit is re-delivered on the next tick. -// --------------------------------------------------------------------------- - -/// iter-15 regression: two consecutive enforced backstop ticks within the -/// fresh window, with no intervening mutation, must emit TWO Provisional -/// snapshots. This proves the backstop is NOT one-shot: the listener cannot -/// observe broadcast delivery, so it re-emits each tick while fresh to -/// re-deliver a transport-dropped provisional. -#[tokio::test(flavor = "current_thread")] -async fn backstop_re_emits_within_fresh_window_to_retry_drops() { - let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); - - // Dirty block 2 whose source-age is ~300ms (inside [250,500]). The same - // relative time is passed to both `add_event` and `feed_block`. 
- let aged_block_time = now_ms_test() - 300; - let (s2, d2) = add_event(aged_block_time, Side::Bid, 101, "100", "5"); - feed_block(&mut listener, 2, aged_block_time, vec![s2], vec![d2]); - - // Fire the enforced backstop TWICE with NO intervening change. - listener.fire_stream_dirty_backstop_for_test(); - listener.fire_stream_dirty_backstop_for_test(); - - let emitted = drain_snapshots(&mut rx); - assert_eq!( - emitted.len(), - 2, - "backstop must re-emit each tick while fresh (NOT one-shot) so a dropped fresh provisional is retried; got {}", - emitted.len(), - ); - for snap in &emitted { - assert_eq!( - snapshot_emission(snap).expect("snapshot has emission"), - super::SnapshotEmission::Provisional, - "every re-emitted backstop snapshot must be Provisional", - ); - assert_eq!(snapshot_height(snap), 2, "backstop snapshot must carry height 2"); - } - - // Block 3 finalizes block 2. The non-one-shot backstop must NOT suppress - // the authoritative finalization snapshot — `clear_book_dirty` at epoch - // close still owns the lifecycle. - let finalize_block_time = now_ms_test() - 100; - let (s3, d3) = add_event(finalize_block_time, Side::Bid, 103, "90", "1"); - feed_block(&mut listener, 3, finalize_block_time, vec![s3], vec![d3]); - - let after_final = drain_snapshots(&mut rx); - let height_2_authoritative: Vec<_> = after_final - .iter() - .filter(|m| snapshot_height(m) == 2) - .filter(|m| snapshot_emission(m) == Some(super::SnapshotEmission::Authoritative)) - .collect(); - assert_eq!( - height_2_authoritative.len(), - 1, - "finalization must still emit exactly one Authoritative snapshot for height 2 after backstop re-emits; got {}", - height_2_authoritative.len(), - ); -} - -// --------------------------------------------------------------------------- -// Test 4c (Codex finding): a post-provisional BBO mutation keeps the freshness -// window fresh, so a long stall with ongoing diffs does not strand subscribers -// on a stale provisional. 
+// Test 4c (Codex finding): a real post-provisional BBO mutation bumps +// `mutation_seq`, clearing the per-content dedup, so a long stall with ongoing +// diffs does not strand subscribers on a stale provisional. // --------------------------------------------------------------------------- -/// Codex finding: under the freshness-window model, re-emission is inherent -/// (not gated by a re-arm guard). `dirty_source_times` tracks the LATEST -/// mutation, so a later same-height BBO-changing diff (block still unfinalized) -/// refreshes the window and the next backstop tick emits a SECOND provisional +/// Codex finding: under the per-content dedup model, re-emission is inherent +/// for a CHANGED content state (not gated by a window or a re-arm flag). The +/// first backstop call emits (first content state) then dedups; a later +/// same-height BBO-changing diff (block still unfinalized) bumps `mutation_seq` +/// which clears the dedup, so the next backstop tick emits a SECOND provisional /// carrying the updated BBO — a long stall with ongoing diffs does not strand /// subscribers on a stale provisional. #[tokio::test(flavor = "current_thread")] async fn backstop_re_emits_after_post_provisional_mutation() { let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); - // Stuck block 2, source-age ~300ms (inside [250,500] window). First BBO = bid 100. + // Stuck block 2, source-age ~300ms (>= 250ms lower bound). First BBO = bid 100. 
let bt1 = now_ms_test() - 300; let (s2, d2) = add_event(bt1, Side::Bid, 101, "100", "5"); feed_block(&mut listener, 2, bt1, vec![s2], vec![d2]); @@ -504,15 +475,15 @@ async fn backstop_re_emits_after_post_provisional_mutation() { let (s2b, d2b) = add_event(bt2, Side::Bid, 102, "110", "3"); feed_block(&mut listener, 2, bt2, vec![s2b], vec![d2b]); - // The post-provisional mutation refreshed `dirty_source_times` (keeping the - // window fresh): the next backstop tick emits a SECOND provisional with the - // UPDATED BBO. + // The post-provisional mutation bumped `mutation_seq`, clearing the + // per-content dedup: the next backstop tick emits a SECOND provisional + // with the UPDATED BBO. listener.fire_stream_dirty_backstop_for_test(); let second = drain_snapshots(&mut rx); assert_eq!(second.len(), 1, "post-mutation backstop must re-emit one provisional; got {}", second.len()); assert_eq!(snapshot_emission(&second[0]), Some(super::SnapshotEmission::Provisional)); let bid_2 = snapshot_best_bid(&second[0]).expect("second provisional has a bid"); - assert_ne!(bid_1, bid_2, "second provisional must carry the updated BBO (window refreshed by latest mutation); bid_1={bid_1:?} bid_2={bid_2:?}"); + assert_ne!(bid_1, bid_2, "second provisional must carry the updated BBO (dedup cleared by the mutation_seq bump); bid_1={bid_1:?} bid_2={bid_2:?}"); // Sanity: finalization still emits the authoritative snapshot. 
let bt3 = now_ms_test() - 100; From 77a137d25333f157b60b0714bf37f70826b2ed62 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 13:57:17 -0400 Subject: [PATCH 51/65] docs: reconcile readme/changelog with final design (dob requires multicast; flag-based rollback) --- CHANGELOG.md | 37 ++++++++++++++++++++----------------- README.md | 4 ++-- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de07c839..31942800 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,23 +11,26 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - The WebSocket listener is now **disabled by default**. The publisher is multicast-only unless `--enable-websocket` is passed. Existing deployments that relied on the default `--address`/`--port` WebSocket server MUST add - `--enable-websocket` to retain that behavior. Startup is now rejected if no - output mode (`--enable-websocket`, `--multicast-group`, or `--dob-group`) - is configured. - - **Rollback:** to restore the previous behavior without a code revert, pass - `--enable-websocket` — this re-enables the WS listener and the full L2 - snapshot fan-out (all 7 variants per coin). If a regression is isolated to - the streaming finalization-driven snapshot path (L2-5) and `--enable-websocket` - does not address it, revert the L2-5 commits (`feat: 5s stuck-stream snapshot - backstop on dedicated 250ms ticker`, `perf: emit streaming l2 snapshot at - block finalization, not per chunk`, `feat: add book_dirty flag to - OrderBookState set only on real mutations`, `fix: emit authoritative tob - snapshot after streaming recovery`); the L2-1 (`perf: replace Px::num_digits - f64 log10 with u64::ilog10`), L2-4 (`perf: pre-size L2 level output Vecs to - skip realloc growth`), and L2-3 (`perf: skip 6 bucketed l2 variants and cap - unbucketed to bbo when websocket disabled`) changes are independent and can - stand alone. + `--enable-websocket` to retain that behavior. 
+- Output-mode configuration is now validated. A run with no output configured + is rejected, and `--dob-group` is **not** a standalone mode — it requires + `--multicast-group` (the shared instrument registry is only bootstrapped + from the HL API in multicast mode, so a DoB-only publisher would resolve no + instruments). Valid output configurations are `--enable-websocket` and/or + `--multicast-group` (the latter optionally with `--dob-group`). Both checks + are enforced by the CLI **and** by the `run_websocket_server` library entry + point, so alternate callers cannot start a publisher that emits nothing. + + **Rollback:** the durable rollback contract is the `--enable-websocket` + flag. Passing it restores the pre-L2-5 streaming cadence **byte-for-byte** + (per-chunk TOB emission, full 7-variant L2 fan-out, recovery carried by the + next per-chunk snapshot) — this equivalence is enforced by discriminating + WS-enabled tests and the streaming goldens, and held across the entire + change set, so it is a safe operational revert without touching code. The + WS-disabled L2-1 (integer `u64::ilog10` digit count), L2-3 (BBO-only L2 in + multicast-only mode), and L2-4 (pre-sized L2 level vectors) perf levers are + independent of the streaming finalization/backstop path (L2-5) and of each + other; any one can be reverted alone if isolated to it. ### Performance diff --git a/README.md b/README.md index 6ae053e8..cc812874 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ The `l4book` subscription first sends a snapshot of the entire book and then for cargo run --release --bin dz_hl_publisher -- --address 0.0.0.0 --port 8000 --enable-websocket ``` -The publisher is **multicast-only by default**. The WebSocket listener is **not bound** unless `--enable-websocket` is passed — `--address` and `--port` are accepted but unused in that mode (a startup log line states the active output mode). 
At least one output mode (`--enable-websocket`, `--multicast-group`, or `--dob-group`) is required; starting without any of these is rejected at startup. +The publisher is **multicast-only by default**. The WebSocket listener is **not bound** unless `--enable-websocket` is passed — `--address` and `--port` are accepted but unused in that mode (a startup log line states the active output mode). At least one output mode (`--enable-websocket` or `--multicast-group`) is required; starting with no output configured is rejected. `--dob-group` is **not** a standalone output mode — the shared instrument registry is only bootstrapped from the HL API in multicast mode, so `--dob-group` requires `--multicast-group` (a DoB-only configuration would resolve no instruments and is rejected). These checks are enforced both by the CLI and by the `run_websocket_server` library entry point. By default the server reads `$HOME/hl/data/node_*_by_block`. To opt into streaming disk ingest, use: @@ -235,7 +235,7 @@ This joins the multicast group and prints received datagrams to stdout. ## DZ-DoB -Binary depth-of-book multicast (frame magic `0x4444`). Three streams off the same `--dob-group`: +Binary depth-of-book multicast (frame magic `0x4444`). `--dob-group` requires `--multicast-group` (the instrument registry is bootstrapped only in multicast mode; a DoB-only publisher is rejected at startup). Three streams off the same `--dob-group`: - **mktdata** (`--dob-mktdata-port`, default `6000`) — incremental L4 events. - **refdata** (`--dob-refdata-port`, default `6001`) — `InstrumentDefinition` retransmissions. 
From 9fa87c61913c189e3db24d4bc1c9e9721717bd4a Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 14:10:00 -0400 Subject: [PATCH 52/65] publisher: discharge supersede only on full local send; resend cached authoritative while obligation pending --- server/src/multicast/publisher.rs | 212 ++++++++++++++++-------------- 1 file changed, 115 insertions(+), 97 deletions(-) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index 2e45f1a6..9f05e47d 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -288,6 +288,19 @@ enum TobHealthLevel { Warn, } +/// Local send outcome for a quote batch. `None` = nothing was sent (no +/// frames, or every `send_frame` failed locally); `Partial` = some frames +/// sent locally but at least one failed; `All` = every attempted frame was +/// accepted by the local socket (>=1 attempted). This is about LOCAL socket +/// acceptance only — UDP multicast has no delivery guarantee; cross-network +/// loss is recovered by the periodic authoritative resync. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum QuoteSendOutcome { + None, + Partial, + All, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] struct TobHealthReport { level: TobHealthLevel, @@ -552,26 +565,44 @@ impl MulticastPublisher { /// Pure transition for the provisional-supersede obligation. /// - /// A delivered `Provisional` creates the obligation. Only a delivered - /// `Authoritative` (the block's own finalization) discharges it. A - /// `Correction` is recovery divergence repair, which is orthogonal to the - /// stream dirty epoch — the listener never closes that epoch on recovery — - /// so it must leave the obligation untouched, otherwise a later stale - /// `Authoritative` finalization would be suppressed and strand subscribers - /// on the provisional. A snapshot that did not actually send (`sent == - /// false`) changes nothing. + /// A delivered `Provisional` creates the obligation. 
Only a FULLY-sent + /// `Authoritative` (the block's own finalization, every frame accepted by + /// the local socket) discharges it: a `Partial` or `None` supersede is + /// INCOMPLETE, so the obligation must be preserved and re-forced by a later + /// snapshot — otherwise a subscriber that saw the provisional could be + /// stranded on a partial/old book. A `Correction` is recovery divergence + /// repair, orthogonal to the stream dirty epoch (the listener never closes + /// that epoch on recovery), so it must leave the obligation untouched + /// regardless of outcome; otherwise a later stale `Authoritative` + /// finalization would be suppressed and strand subscribers on the + /// provisional. By outcome: + /// - `None` (nothing delivered locally): obligation unchanged. + /// - `Partial`: a partially-delivered Provisional still creates the + /// obligation; a partial Authoritative supersede does NOT discharge it. + /// - `All` (full local send): previous semantics — Provisional sets, + /// Authoritative clears, Correction unchanged. const fn next_pending_provisional( prev: bool, emission: SnapshotEmission, - sent: bool, + outcome: QuoteSendOutcome, ) -> bool { - if !sent { - return prev; - } - match emission { - SnapshotEmission::Provisional => true, - SnapshotEmission::Authoritative => false, - SnapshotEmission::Correction => prev, + match outcome { + // Nothing delivered locally — obligation unchanged. + QuoteSendOutcome::None => prev, + // Partial local send: a provisional that partially reached + // subscribers still must be superseded; a partial supersede is + // INCOMPLETE so the obligation must NOT be discharged. + QuoteSendOutcome::Partial => match emission { + SnapshotEmission::Provisional => true, + SnapshotEmission::Authoritative => prev, + SnapshotEmission::Correction => prev, + }, + // Full local send. 
+ QuoteSendOutcome::All => match emission { + SnapshotEmission::Provisional => true, + SnapshotEmission::Authoritative => false, + SnapshotEmission::Correction => prev, + }, } } @@ -827,12 +858,14 @@ impl MulticastPublisher { >, time: u64, is_snapshot: bool, - ) -> bool { + ) -> QuoteSendOutcome { let flags = if is_snapshot { FLAG_SNAPSHOT } else { 0 }; let source_timestamp_ns = time * 1_000_000; // HL time is ms let mut fb = FrameBuilder::new(0, self.next_seq(), Self::now_ns(), self.config.mtu); let mut sent_any = false; + let mut any_failed = false; + let mut attempted = false; let default_params = crate::listeners::order_book::L2SnapshotParams::new(None, None); // Snapshot-then-lookup against the lock-free ArcSwap. Holding the load @@ -912,7 +945,10 @@ impl MulticastPublisher { } Err(FrameError::ExceedsMtu { .. } | FrameError::MaxMessages) => { if !fb.is_empty() { - sent_any |= self.send_frame(fb.finalize(), self.config.dest(), "marketdata", "quote").await; + let ok = self.send_frame(fb.finalize(), self.config.dest(), "marketdata", "quote").await; + attempted = true; + sent_any |= ok; + any_failed |= !ok; } fb = FrameBuilder::new(0, self.next_seq(), Self::now_ns(), self.config.mtu); let buf = fb.message_buffer(QUOTE_SIZE).expect("Quote fits in empty frame"); @@ -923,9 +959,18 @@ impl MulticastPublisher { } if !fb.is_empty() { - sent_any |= self.send_frame(fb.finalize(), self.config.dest(), "marketdata", "quote").await; + let ok = self.send_frame(fb.finalize(), self.config.dest(), "marketdata", "quote").await; + attempted = true; + sent_any |= ok; + any_failed |= !ok; + } + if !attempted || !sent_any { + QuoteSendOutcome::None + } else if any_failed { + QuoteSendOutcome::Partial + } else { + QuoteSendOutcome::All } - sent_any } /// Encodes fills as Trade messages, batching into frames. 
@@ -1121,23 +1166,23 @@ impl MulticastPublisher { ); } let snapshot_map = l2_snapshots.as_ref(); - let sent = self.publish_quotes(snapshot_map, *time, false).await; - if sent { + let outcome = self.publish_quotes(snapshot_map, *time, false).await; + if outcome != QuoteSendOutcome::None { had_activity = true; heartbeat_interval.reset(); } - // Drive the supersede flag from actual delivery via - // `next_pending_provisional`: a delivered Provisional sets - // the obligation; only the block's own delivered - // Authoritative finalization clears it; a recovery - // Correction is orthogonal to the stream dirty epoch and - // leaves it unchanged. A snapshot that did not send leaves - // the obligation untouched, so a later send still forces it - // through (subscribers are never stranded on the provisional). + // Drive the supersede obligation from LOCAL + // send outcome: a (even partially) delivered + // Provisional creates it; only a FULLY-sent + // Authoritative finalization discharges it (a + // partial/failed supersede keeps it so a later + // snapshot re-forces the correction); a + // recovery Correction is orthogonal and leaves + // it unchanged. pending_provisional = Self::next_pending_provisional( pending_provisional, *emission, - sent, + outcome, ); } else { caught_up = false; @@ -1279,10 +1324,24 @@ impl MulticastPublisher { } } _ = snapshot_interval.tick() => { - if caught_up { + // Resend the cached authoritative snapshot when we are + // caught up (normal periodic refresh) OR while a provisional + // supersede obligation is still pending — in the latter case + // a subscriber that saw a provisional has not been confirmed + // to have received the (fully-sent) correction, so keep + // re-attempting the cached authoritative. 
This is bounded: + // `pending_provisional` is a single obligation cleared by a + // fully-sent Authoritative or a fresh one, and is chronically + // set only during a genuine stall (catch-up backlog finalizes + // fast and clears it), so this does NOT reintroduce catch-up + // stale flooding. `Provisional` is never cached, so the + // resent snapshot is always the correct authoritative state. + if caught_up || pending_provisional { if let Some(ref cached) = cached_snapshot && let InternalMessage::Snapshot { l2_snapshots, time, .. } = cached.as_ref() { - if self.publish_quotes(l2_snapshots.as_ref(), *time, true).await { + if self.publish_quotes(l2_snapshots.as_ref(), *time, true).await + != QuoteSendOutcome::None + { had_activity = true; heartbeat_interval.reset(); } @@ -1632,9 +1691,9 @@ mod tests { config.port = recv_socket.local_addr().unwrap().port(); let publisher = MulticastPublisher::new(send_socket, config, test_registry()); - let sent = publisher.publish_quotes(&HashMap::new(), MulticastPublisher::now_ms(), false).await; + let outcome = publisher.publish_quotes(&HashMap::new(), MulticastPublisher::now_ms(), false).await; - assert!(!sent, "empty quote batch must not count as marketdata activity"); + assert_eq!(outcome, QuoteSendOutcome::None, "empty quote batch must not count as marketdata activity"); assert!( tokio::time::timeout( Duration::from_millis(50), @@ -2051,70 +2110,29 @@ mod tests { /// asserts every case of `next_pending_provisional`, in particular that a /// delivered `Correction` leaves a pending obligation intact so the later /// stale `Authoritative` still force-publishes. + /// + /// The `Partial` cases lock in the iter-18 partial-send finding: a partial + /// local send of a superseding `Authoritative` is INCOMPLETE and must NOT + /// discharge the obligation, while a partially-delivered `Provisional` + /// still creates it. 
#[test] fn pending_provisional_transition_matrix() { - // Delivered Provisional creates the obligation regardless of prior state. - assert!( - MulticastPublisher::next_pending_provisional(false, SnapshotEmission::Provisional, true), - "delivered provisional must create the obligation" - ); - assert!( - MulticastPublisher::next_pending_provisional(true, SnapshotEmission::Provisional, true), - "delivered provisional keeps the obligation set" - ); - - // Delivered Authoritative (the block's own finalization) discharges it. - assert!( - !MulticastPublisher::next_pending_provisional( - true, - SnapshotEmission::Authoritative, - true - ), - "delivered authoritative finalization must discharge the obligation" - ); - assert!( - !MulticastPublisher::next_pending_provisional( - false, - SnapshotEmission::Authoritative, - true - ), - "delivered authoritative with no obligation stays clear" - ); - - // THE FIX: delivered recovery Correction must NOT clear a pending - // obligation — it is orthogonal to the stream dirty epoch. - assert!( - MulticastPublisher::next_pending_provisional(true, SnapshotEmission::Correction, true), - "recovery correction must not clear a pending provisional obligation" - ); - assert!( - !MulticastPublisher::next_pending_provisional( - false, - SnapshotEmission::Correction, - true - ), - "recovery correction with no obligation leaves it clear" - ); - - // Not sent: obligation is unchanged for every emission. A failed - // Authoritative send must preserve the obligation so a later send - // still force-publishes the finalization. 
- assert!( - MulticastPublisher::next_pending_provisional(true, SnapshotEmission::Provisional, false), - "unsent provisional leaves the obligation unchanged" - ); - assert!( - MulticastPublisher::next_pending_provisional( - true, - SnapshotEmission::Authoritative, - false - ), - "failed authoritative send must preserve the obligation" - ); - assert!( - MulticastPublisher::next_pending_provisional(true, SnapshotEmission::Correction, false), - "unsent correction leaves the obligation unchanged" - ); + use QuoteSendOutcome::{All, None, Partial}; + use SnapshotEmission::{Authoritative, Correction, Provisional}; + // No local send: obligation unchanged. + assert!(MulticastPublisher::next_pending_provisional(true, Authoritative, None)); + assert!(!MulticastPublisher::next_pending_provisional(false, Provisional, None)); + // Provisional delivered (even partially) -> obligation set. + assert!(MulticastPublisher::next_pending_provisional(false, Provisional, Partial)); + assert!(MulticastPublisher::next_pending_provisional(false, Provisional, All)); + // Authoritative supersede: cleared ONLY on a full send. + assert!(!MulticastPublisher::next_pending_provisional(true, Authoritative, All)); + assert!(MulticastPublisher::next_pending_provisional(true, Authoritative, Partial)); // iter-18 fix: partial supersede does NOT discharge + assert!(MulticastPublisher::next_pending_provisional(true, Authoritative, None)); + // Correction is orthogonal to the stream dirty epoch -> obligation unchanged. 
+ assert!(MulticastPublisher::next_pending_provisional(true, Correction, All)); + assert!(!MulticastPublisher::next_pending_provisional(false, Correction, All)); + assert!(MulticastPublisher::next_pending_provisional(true, Correction, Partial)); } /// `caught_up` after a publish: true ONLY for a fresh Authoritative or From edee645c2c2fa7559e36136cd79bf2dbdd307288 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 14:25:20 -0400 Subject: [PATCH 53/65] fix: revert pending-driven resend (rolls back subscribers); gate backstop on dirty-epoch age not latest mutation --- server/src/listeners/order_book/mod.rs | 47 +++++++++------ server/src/listeners/order_book/state.rs | 41 +++++++++---- .../order_book/stream_finalization_tests.rs | 58 +++++++++++++++++++ server/src/multicast/publisher.rs | 49 ++++++++++------ 4 files changed, 149 insertions(+), 46 deletions(-) diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 6bd0d765..3dec65ff 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -1259,17 +1259,24 @@ impl OrderBookListener { /// the publisher's freshness gate + supersede/resync machinery. The /// authoritative finalization snapshot (`clear_book_dirty`) is unaffected. fn try_emit_stuck_stream_backstop_inner(&mut self, enforce_interval: bool) -> bool { - // Emit a provisional once the dirty book is stuck >= INTERVAL of source - // time (lower bound only — NO upper bound: a fixed-period ticker not - // phase-aligned to the dirty epoch could phase-skip a bounded window - // entirely, so a stuck epoch could get no provisional at all). CPU is - // bounded instead by per-content dedup on the monotonic `mutation_seq`: - // at most one Provisional per distinct dirty-content state. 
A no-new- - // diff stall keeps `mutation_seq` static → one emit then deduped; an - // ongoing-diff stall bumps `mutation_seq` each real mutation → - // re-emits updated state on the next tick. A transport-dropped - // provisional of unchanged content is intentionally not retried (the - // publisher's supersede/resync machinery handles recovery); the + // Emit a provisional once the dirty EPOCH has been stuck >= INTERVAL of + // source time (lower bound only — NO upper bound: a fixed-period ticker + // not phase-aligned to the dirty epoch could phase-skip a bounded + // window entirely, so a stuck epoch could get no provisional at all). + // The age gate measures the dirty-EPOCH age (now - the epoch's FIRST + // mutation time, via `dirty_epoch_started_at_ms`), NOT time since the + // latest mutation: a stream that keeps applying fresh sub-INTERVAL- + // spaced diffs while finalization is BLOCKED would otherwise perpetually + // reset a latest-mutation clock below threshold and the backstop would + // never fire during the stall. `dirty_source_times()` (latest mutation) + // is used ONLY for the emitted snapshot's source metadata below, never + // for this gate. CPU is still bounded by per-content dedup on the + // monotonic `mutation_seq`: at most one Provisional per distinct dirty- + // content state. A no-new-diff stall keeps `mutation_seq` static → one + // emit then deduped; an ongoing-diff stall bumps `mutation_seq` each + // real mutation → re-emits updated state on the next tick. A transport- + // dropped provisional of unchanged content is intentionally not retried + // (the publisher's supersede/resync machinery handles recovery); the // publisher's own freshness gate still suppresses stale provisionals. 
let lower_ms = STREAM_DIRTY_BACKSTOP_INTERVAL.as_millis() as u64; let should_emit = self.ingest_mode == IngestMode::Stream @@ -1278,8 +1285,8 @@ impl OrderBookListener { s.book_dirty() && !s.provisional_already_attempted_for_current_content() && (!enforce_interval - || s.dirty_source_times().is_some_and(|(bt, _)| { - now_ms().saturating_sub(bt) >= lower_ms + || s.dirty_epoch_started_at_ms().is_some_and(|epoch_started| { + now_ms().saturating_sub(epoch_started) >= lower_ms })) }); if should_emit @@ -1871,11 +1878,13 @@ impl OrderBookListener { /// Test-only equivalent of the production `backstop_ticker.tick()` arm in /// `hl_listen`. Fires a provisional TOB snapshot if the book is dirty, the /// dirty epoch has been stuck for at least `STREAM_DIRTY_BACKSTOP_INTERVAL` - /// of SOURCE time (`now_ms() - dirty_block_time_ms` >= 250ms; lower bound - /// only — there is NO upper bound), AND no provisional has yet been - /// attempted for the current dirty-content state. The stuck-time lower - /// bound is ENFORCED (enforce_interval=true): calling this while the dirty - /// epoch is still too fresh (source-age < 250ms) will NOT emit. Emission is + /// of SOURCE time (`now_ms() - dirty_epoch_started_at_ms` >= 250ms — the + /// dirty-EPOCH age, anchored at the epoch's FIRST mutation, NOT time since + /// the latest mutation; lower bound only — there is NO upper bound), AND no + /// provisional has yet been attempted for the current dirty-content state. + /// The stuck-time lower bound is ENFORCED (enforce_interval=true): calling + /// this while the dirty epoch is still too fresh (epoch-age < 250ms) will + /// NOT emit. 
Emission is /// content-deduped on `mutation_seq`: a second call with no intervening /// mutation does NOT re-emit (this is the CPU bound that replaced the old /// upper-bound window); a real mutation bumps `mutation_seq` and the next @@ -1887,7 +1896,7 @@ impl OrderBookListener { /// Test-only backstop variant that bypasses the stuck-time lower-bound gate /// (force-emit). All other conditions are real: streaming mode, WS - /// disabled, book_dirty, dirty_source_times present, the per-content dedup + /// disabled, book_dirty, dirty_epoch_started_at_ms present, the per-content dedup /// on `mutation_seq` (so a second force-call with no intervening mutation /// does NOT re-emit), emits `SnapshotEmission::Provisional`. Use this to /// exercise the provisional emit path deterministically with historical diff --git a/server/src/listeners/order_book/state.rs b/server/src/listeners/order_book/state.rs index 6f4ccfc0..d2b723bc 100644 --- a/server/src/listeners/order_book/state.rs +++ b/server/src/listeners/order_book/state.rs @@ -26,12 +26,22 @@ pub(super) struct OrderBookState { /// snapshot for the closing block. Soft-tolerance no-op branches MUST NOT /// set this — see `apply_stream_diff`. book_dirty: bool, - /// Source `(block_time_ms, local_time_ms)` recorded the first time the - /// book became dirty in the current dirty epoch. Reset to `None` when - /// `book_dirty` is cleared. Lets the stuck-stream backstop attach - /// reliable times without depending on the take-and-clear - /// `OrderBookListener::last_batch_times()`. + /// Source `(block_time_ms, local_time_ms)` of the LATEST book-content + /// mutation in the current dirty epoch (updated on every mutation). Reset + /// to `None` when `book_dirty` is cleared. Lets the stuck-stream backstop + /// attach reliable source times to the emitted provisional without + /// depending on the take-and-clear `OrderBookListener::last_batch_times()`. 
+ /// NOTE: this is the latest-mutation time, NOT the epoch-age clock — the + /// backstop age gate uses `dirty_epoch_started_at_ms` instead. dirty_source_times: Option<(u64, u64)>, + /// Source `block_time_ms` of the FIRST mutation of the current dirty epoch + /// (set once on the clean→dirty transition, reset at epoch close). The + /// stuck-stream backstop age gate uses THIS (epoch age), not the latest + /// mutation time, so a stream that keeps mutating while finalization is + /// blocked still trips the backstop instead of perpetually resetting the + /// age. `dirty_source_times` continues to track the LATEST mutation for the + /// emitted provisional's source metadata. + dirty_epoch_started_at_ms: Option, /// Monotonic counter bumped on every book-content mutation (hot-path real /// mutations via `mark_dirty_with_times`, and recovery repairs via /// `replace_coin_from_snapshot`/`remove_coin`). Captured with a cloned @@ -80,6 +90,7 @@ impl Clone for OrderBookState { enable_websocket: self.enable_websocket, book_dirty: self.book_dirty, dirty_source_times: self.dirty_source_times, + dirty_epoch_started_at_ms: self.dirty_epoch_started_at_ms, mutation_seq: self.mutation_seq, last_provisional_attempt_seq: self.last_provisional_attempt_seq, // The tap is intentionally not cloned: the clone is used only for @@ -107,6 +118,7 @@ impl OrderBookState { snapped: false, book_dirty: false, dirty_source_times: None, + dirty_epoch_started_at_ms: None, mutation_seq: 0, last_provisional_attempt_seq: None, dob_tap: None, @@ -157,12 +169,20 @@ impl OrderBookState { self.dirty_source_times } + pub(super) const fn dirty_epoch_started_at_ms(&self) -> Option { + self.dirty_epoch_started_at_ms + } + pub(super) fn mark_dirty_with_times(&mut self, block_time_ms: u64, local_time_ms: u64) { - // Always update to the latest real mutation's source times (not just the - // first dirty transition): the stuck-stream backstop emits the most - // recent mutation's source times, so an 
ongoing-diff stall re-publishes - // current state. The mutation_seq bump below also clears the backstop's - // per-content dedup, so the next tick re-emits an updated provisional. + if !self.book_dirty { + // First mutation of a new dirty epoch: anchor the epoch-age clock. + self.dirty_epoch_started_at_ms = Some(block_time_ms); + } + // Latest mutation's source times — used for the emitted provisional's + // snapshot metadata (NOT for the backstop age gate, which uses the + // epoch-start anchor above). The mutation_seq bump below also clears the + // backstop's per-content dedup, so the next tick re-emits an updated + // provisional. self.dirty_source_times = Some((block_time_ms, local_time_ms)); self.book_dirty = true; self.mutation_seq = self.mutation_seq.wrapping_add(1); @@ -171,6 +191,7 @@ impl OrderBookState { pub(super) const fn clear_book_dirty(&mut self) { self.book_dirty = false; self.dirty_source_times = None; + self.dirty_epoch_started_at_ms = None; // Epoch close resets the dedup tracker so the next dirty epoch's first // stuck state emits a provisional (the backstop dedups by mutation_seq // only WITHIN an epoch). diff --git a/server/src/listeners/order_book/stream_finalization_tests.rs b/server/src/listeners/order_book/stream_finalization_tests.rs index e2d7e045..bd1f6e13 100644 --- a/server/src/listeners/order_book/stream_finalization_tests.rs +++ b/server/src/listeners/order_book/stream_finalization_tests.rs @@ -495,6 +495,64 @@ async fn backstop_re_emits_after_post_provisional_mutation() { assert_eq!(snapshot_emission(h2_finals[0]), Some(super::SnapshotEmission::Authoritative)); } +// --------------------------------------------------------------------------- +// Test 4d (Finding #2 regression): the backstop age gate must measure the +// dirty-EPOCH age (now - the epoch's FIRST mutation), NOT time since the +// latest mutation. 
A stream that keeps applying fresh sub-250ms-spaced diffs +// while finalization is BLOCKED previously kept resetting `dirty_source_times` +// below the 250ms threshold so the backstop never fired during the stall — +// TOB multicast went stale on an actively-stalled stream. The fix anchors the +// age clock at the clean→dirty transition (`dirty_epoch_started_at_ms`). +// --------------------------------------------------------------------------- + +/// Finding #2 fix: the backstop fires during a CONTINUOUSLY-mutating stall. +/// A dirty epoch is opened with a 300ms-old block (epoch age >= 250), then a +/// SECOND same-height (still-unfinalized) mutation arrives with a FRESH 10ms-old +/// block time. Under the old latest-mutation gate the age would be +/// `now - (now-10) = 10 < 250` and the backstop would NOT fire; under the +/// epoch-age gate the age is `now - (now-300) = 300 >= 250` so exactly one +/// Provisional is emitted despite the fresh latest mutation. +#[tokio::test(flavor = "current_thread")] +async fn backstop_fires_during_continuously_mutating_stall() { + let (mut listener, mut rx) = OrderBookListener::for_test_streaming_with_snapshot(seed_snapshot(), 1); + + // Open the dirty epoch with a 300ms-old block: epoch-start anchored at + // now-300 (epoch age 300 >= 250). + let epoch_start_bt = now_ms_test() - 300; + let (s2, d2) = add_event(epoch_start_bt, Side::Bid, 101, "100", "5"); + feed_block(&mut listener, 2, epoch_start_bt, vec![s2], vec![d2]); + + // A SECOND same-height-2 mutation (block still NOT finalized — same height, + // no higher block) with a FRESH ~10ms-old block time. This advances + // `dirty_source_times` to a near-now value (latest-mutation age ~10ms < + // 250) while the epoch-start anchor stays at now-300. 
+ let fresh_bt = now_ms_test() - 10; + let (s2b, d2b) = add_event(fresh_bt, Side::Bid, 102, "110", "3"); + feed_block(&mut listener, 2, fresh_bt, vec![s2b], vec![d2b]); + + // Enforced-interval backstop: the epoch-age gate (300 >= 250) fires even + // though the latest-mutation age (~10ms) is well below 250 — the old + // latest-mutation gate would NOT have emitted here. + listener.fire_stream_dirty_backstop_for_test(); + let emitted = drain_snapshots(&mut rx); + assert_eq!( + emitted.len(), + 1, + "backstop must fire on the dirty-EPOCH age (300ms >= 250ms) despite a fresh latest mutation (~10ms); the old latest-mutation gate would NOT have emitted; got {} snapshot(s)", + emitted.len(), + ); + assert_eq!( + snapshot_emission(&emitted[0]).expect("snapshot has emission"), + super::SnapshotEmission::Provisional, + "stuck-stream backstop snapshot must emit Provisional", + ); + assert_eq!( + snapshot_height(&emitted[0]), + 2, + "backstop snapshot must carry the last applied height (2)", + ); +} + // --------------------------------------------------------------------------- // Test 5 (Codex finding #2): late diff after backstop still gets authoritative // final — the provisional emit must NOT clear the dirty flag. diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index 9f05e47d..b87f06cb 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -1175,10 +1175,19 @@ impl MulticastPublisher { // send outcome: a (even partially) delivered // Provisional creates it; only a FULLY-sent // Authoritative finalization discharges it (a - // partial/failed supersede keeps it so a later - // snapshot re-forces the correction); a - // recovery Correction is orthogonal and leaves - // it unchanged. + // partial/failed supersede keeps it). 
This + // obligation is tracked so the IN-LOOP publish + // decision `Authoritative => fresh || + // pending_provisional` still force-publishes a + // later same-epoch Authoritative even if + // stale, and so a PARTIAL local send of a + // supersede does not falsely discharge it. It + // intentionally does NOT drive periodic + // resends — doing so would rebroadcast the + // prior block's authoritative (see the + // `snapshot_interval` arm). A recovery + // Correction is orthogonal and leaves it + // unchanged. pending_provisional = Self::next_pending_provisional( pending_provisional, *emission, @@ -1324,19 +1333,25 @@ impl MulticastPublisher { } } _ = snapshot_interval.tick() => { - // Resend the cached authoritative snapshot when we are - // caught up (normal periodic refresh) OR while a provisional - // supersede obligation is still pending — in the latter case - // a subscriber that saw a provisional has not been confirmed - // to have received the (fully-sent) correction, so keep - // re-attempting the cached authoritative. This is bounded: - // `pending_provisional` is a single obligation cleared by a - // fully-sent Authoritative or a fresh one, and is chronically - // set only during a genuine stall (catch-up backlog finalizes - // fast and clears it), so this does NOT reintroduce catch-up - // stale flooding. `Provisional` is never cached, so the - // resent snapshot is always the correct authoritative state. - if caught_up || pending_provisional { + // Periodic refresh of the last fresh full snapshot. Gated on + // `caught_up` ONLY: `caught_up` is true exactly when the most + // recent publish was a genuinely fresh Authoritative/ + // Correction, i.e. `cached_snapshot` is a current, correct + // full book. 
It is deliberately NOT extended to + // `pending_provisional`: while a provisional is pending but + // its block has not finalized, `cached_snapshot` is the + // PREVIOUS block's authoritative (the provisional is not + // cached; the awaited supersede does not exist yet), so + // resending it here would roll subscribers backward. The + // awaited correction is delivered at finalization (the + // forced supersede). If that forced-stale supersede is lost + // on the wire during a CONTINUED stall, the subscriber is + // recovered by the normal fresh-snapshot resync when the + // stream catches up — there is no fresher correct data + // during the stall, so this is the accepted best-effort + // contract for a degraded feed (no stale rebroadcast, no + // rollback). + if caught_up { if let Some(ref cached) = cached_snapshot && let InternalMessage::Snapshot { l2_snapshots, time, .. } = cached.as_ref() { if self.publish_quotes(l2_snapshots.as_ref(), *time, true).await From 6de440dd016b7ccaa1ad10f119ecfd1f4dec6151 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 14:38:34 -0400 Subject: [PATCH 54/65] publisher: retry only the cached forced-stale supersede (S2), never the prior block (S1) --- server/src/multicast/publisher.rs | 122 ++++++++++++++++++++++++------ 1 file changed, 100 insertions(+), 22 deletions(-) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index b87f06cb..eb2b456e 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -619,6 +619,43 @@ impl MulticastPublisher { fresh && !matches!(emission, SnapshotEmission::Provisional) } + /// Whether the periodic `snapshot_interval` arm must keep retrying the + /// cached snapshot because it is a forced-stale supersede whose LOCAL send + /// did not complete. 
+ /// + /// Distinguishes the two provisional-supersede states the single + /// `pending_provisional` bool cannot: + /// - A `Provisional` is never cached and (S1) the awaited supersede does + /// not exist yet — `cached_snapshot` is the PRIOR block's authoritative, + /// so this MUST stay unchanged (resending the cache would roll + /// subscribers back). + /// - An `Authoritative`/`Correction` IS cached. If it published fresh + /// (`caught_up`), the normal `caught_up` periodic refresh covers it → + /// clear. If it was a forced-stale supersede that fully sent locally + /// (`All`) there is nothing to retry → clear. If its local send was + /// `Partial`/`None` (S2), `cached_snapshot` IS that supersede → arm the + /// retry so the interval re-sends exactly it (delivering the awaited + /// correction, not a rollback) until a full local send. + const fn next_retry_cached_supersede( + prev: bool, + emission: SnapshotEmission, + caught_up: bool, + outcome: QuoteSendOutcome, + ) -> bool { + match emission { + SnapshotEmission::Provisional => prev, + SnapshotEmission::Authoritative | SnapshotEmission::Correction => { + if caught_up { + false + } else if matches!(outcome, QuoteSendOutcome::All) { + false + } else { + true + } + } + } + } + fn should_warn_for_receiver_lag(lag_ms: u64) -> bool { lag_ms <= Self::CATCHUP_THRESHOLD_MS } @@ -1077,6 +1114,12 @@ impl MulticastPublisher { // delivered provisional that omits the block's final diffs. Cleared // when an Authoritative or Correction is published. let mut pending_provisional = false; + // S2 retry flag: set only when a forced-stale Authoritative/Correction + // supersede was emitted+cached but its local send did not complete; + // drives a bounded `snapshot_interval` retry of that cached supersede + // until a full local send. Never set by a Provisional (S1: cache is the + // prior block — must not resend). 
+ let mut retry_cached_supersede = false; let mut health = TobPublisherHealth::new(Self::now_ms()); let mut fill_pairs = FillPairAccumulator::new(Self::now_ms()); @@ -1193,6 +1236,12 @@ impl MulticastPublisher { *emission, outcome, ); + retry_cached_supersede = Self::next_retry_cached_supersede( + retry_cached_supersede, + *emission, + caught_up, + outcome, + ); } else { caught_up = false; crate::metrics::observe_tob_source_lag( @@ -1333,33 +1382,38 @@ impl MulticastPublisher { } } _ = snapshot_interval.tick() => { - // Periodic refresh of the last fresh full snapshot. Gated on - // `caught_up` ONLY: `caught_up` is true exactly when the most - // recent publish was a genuinely fresh Authoritative/ - // Correction, i.e. `cached_snapshot` is a current, correct - // full book. It is deliberately NOT extended to - // `pending_provisional`: while a provisional is pending but - // its block has not finalized, `cached_snapshot` is the - // PREVIOUS block's authoritative (the provisional is not - // cached; the awaited supersede does not exist yet), so - // resending it here would roll subscribers backward. The - // awaited correction is delivered at finalization (the - // forced supersede). If that forced-stale supersede is lost - // on the wire during a CONTINUED stall, the subscriber is - // recovered by the normal fresh-snapshot resync when the - // stream catches up — there is no fresher correct data - // during the stall, so this is the accepted best-effort - // contract for a degraded feed (no stale rebroadcast, no - // rollback). - if caught_up { + // Resend the cached full snapshot when either: + // - `caught_up`: the cache is a current fresh + // Authoritative/Correction (normal periodic refresh); or + // - `retry_cached_supersede`: the cache IS a forced-stale + // Authoritative/Correction supersede whose local send did + // not complete (S2) — retry exactly it until a full local + // send. 
This is NOT gated on bare `pending_provisional`: + // in S1 (provisional delivered, block not yet finalized) + // the cache is the PRIOR block's authoritative and + // resending it would roll subscribers back; + // `retry_cached_supersede` is armed only AFTER the + // supersede itself is cached, so the resend here is + // always the awaited correction, never a rollback. It is + // bounded: cleared on a full local send below or when a + // fresh snapshot supersedes (see + // `next_retry_cached_supersede`), so it cannot become a + // catch-up stale flood. + if caught_up || retry_cached_supersede { if let Some(ref cached) = cached_snapshot && let InternalMessage::Snapshot { l2_snapshots, time, .. } = cached.as_ref() { - if self.publish_quotes(l2_snapshots.as_ref(), *time, true).await - != QuoteSendOutcome::None - { + let outcome = + self.publish_quotes(l2_snapshots.as_ref(), *time, true).await; + if outcome != QuoteSendOutcome::None { had_activity = true; heartbeat_interval.reset(); } + if outcome == QuoteSendOutcome::All { + // The cached supersede (or periodic refresh) + // fully sent locally — the S2 retry + // obligation is discharged. + retry_cached_supersede = false; + } } } } @@ -2150,6 +2204,30 @@ mod tests { assert!(MulticastPublisher::next_pending_provisional(true, Correction, Partial)); } + /// Locks the S1/S2 separation: a Provisional never arms the retry (S1 — the + /// cache is the prior block; resending it would roll subscribers back), and + /// a forced-stale Authoritative/Correction supersede with an incomplete + /// local send (S2) arms it so the interval re-sends exactly that cached + /// supersede until a full local send. + #[test] + fn next_retry_cached_supersede_matrix() { + use QuoteSendOutcome::{All, None, Partial}; + use SnapshotEmission::{Authoritative, Correction, Provisional}; + // S1: Provisional never (un)arms — unchanged from prev. 
+ assert!(!MulticastPublisher::next_retry_cached_supersede(false, Provisional, false, None)); + assert!(MulticastPublisher::next_retry_cached_supersede(true, Provisional, false, All)); + // Fresh Authoritative/Correction: normal caught_up refresh covers it -> clear. + assert!(!MulticastPublisher::next_retry_cached_supersede(true, Authoritative, true, All)); + assert!(!MulticastPublisher::next_retry_cached_supersede(true, Correction, true, Partial)); + // Forced-stale supersede fully sent locally -> nothing to retry -> clear. + assert!(!MulticastPublisher::next_retry_cached_supersede(true, Authoritative, false, All)); + // S2: forced-stale supersede, incomplete local send -> arm the retry. + assert!(MulticastPublisher::next_retry_cached_supersede(false, Authoritative, false, Partial)); + assert!(MulticastPublisher::next_retry_cached_supersede(false, Authoritative, false, None)); + assert!(MulticastPublisher::next_retry_cached_supersede(false, Correction, false, Partial)); + assert!(MulticastPublisher::next_retry_cached_supersede(false, Correction, false, None)); + } + /// `caught_up` after a publish: true ONLY for a fresh Authoritative or /// Correction. 
A Provisional (even fresh) must NOT enable periodic resend /// (it would rebroadcast the older cached authoritative during a stall and From 19d545e1f17465782c9dab93edc3b1c964b28685 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 14:50:04 -0400 Subject: [PATCH 55/65] publisher: cache only published snapshots so a suppressed stale one can't poison the resend/retry cache --- server/src/multicast/publisher.rs | 46 +++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index eb2b456e..cbccbb80 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -1142,15 +1142,6 @@ impl MulticastPublisher { enqueued_at_ms, emission, } => { - // Never cache a Provisional for the periodic - // snapshot resend: it is a partial, mid-stall - // book and the interval path rebroadcasts the - // cache as a full/authoritative snapshot. Only - // Authoritative/Correction may be periodically - // resent. - if !matches!(emission, SnapshotEmission::Provisional) { - cached_snapshot = Some(msg.clone()); - } let now_ms = Self::now_ms(); let queue_delay_ms = now_ms.saturating_sub(*enqueued_at_ms); let listener_to_publisher_ms = now_ms.saturating_sub(*source_local_time_ms); @@ -1191,6 +1182,22 @@ impl MulticastPublisher { SnapshotEmission::Authoritative => fresh || pending_provisional, }; if publish { + // Cache ONLY a snapshot we actually publish, + // and never a Provisional. A suppressed + // (catch-up-gated) snapshot must not touch + // the cache: it would poison the periodic + // resend and silently cancel an armed + // `retry_cached_supersede` for a real + // incompletely-sent supersede. With this, + // `cached_snapshot` is exactly the last + // PUBLISHED full (Authoritative/Correction) + // snapshot — the invariant the periodic + // resend and S2 retry both assume. 
A + // published Provisional is still excluded + // (partial mid-stall book). + if !matches!(emission, SnapshotEmission::Provisional) { + cached_snapshot = Some(msg.clone()); + } health.observe_publishable_lag(lag_ms); crate::metrics::observe_tob_source_lag( "snapshot", @@ -2228,6 +2235,27 @@ mod tests { assert!(MulticastPublisher::next_retry_cached_supersede(false, Correction, false, None)); } + /// Regression: a suppressed stale Authoritative (no pending provisional) + /// must not disarm an armed `retry_cached_supersede`. Suppression never + /// reaches the publish branch, so it cannot run `next_retry_cached_supersede` + /// and cannot touch `cached_snapshot` (the assignment now lives inside the + /// publish branch). This asserts the transition fn itself never *clears* an + /// armed retry for the inputs a suppressed-then-stale sequence would imply + /// IF it (incorrectly) reached the publish branch — i.e. the only legitimate + /// disarms are a fresh publish or a full local send. + #[test] + fn suppressed_snapshot_must_not_disarm_supersede_retry() { + use QuoteSendOutcome::{All, None, Partial}; + use SnapshotEmission::{Authoritative, Correction}; + // Armed (prev=true). The ONLY disarms are: fresh publish, or full send. + assert!(!MulticastPublisher::next_retry_cached_supersede(true, Authoritative, true, All)); // fresh -> clear (legit) + assert!(!MulticastPublisher::next_retry_cached_supersede(true, Authoritative, false, All)); // full send -> clear (legit) + // A forced-stale incomplete send keeps it armed (must NOT be cleared). + assert!(MulticastPublisher::next_retry_cached_supersede(true, Authoritative, false, Partial)); + assert!(MulticastPublisher::next_retry_cached_supersede(true, Authoritative, false, None)); + assert!(MulticastPublisher::next_retry_cached_supersede(true, Correction, false, Partial)); + } + /// `caught_up` after a publish: true ONLY for a fresh Authoritative or /// Correction. 
A Provisional (even fresh) must NOT enable periodic resend /// (it would rebroadcast the older cached authoritative during a stall and From a1623d7d61d61eab211b32443fc3bf6c5b25fceb Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 14:59:43 -0400 Subject: [PATCH 56/65] publisher: a fully-sent interval retry discharges the provisional obligation (no stuck pending) --- server/src/multicast/publisher.rs | 62 ++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index cbccbb80..bfd2c22b 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -1408,7 +1408,13 @@ impl MulticastPublisher { // catch-up stale flood. if caught_up || retry_cached_supersede { if let Some(ref cached) = cached_snapshot - && let InternalMessage::Snapshot { l2_snapshots, time, .. } = cached.as_ref() { + && let InternalMessage::Snapshot { + l2_snapshots, + time, + emission, + .. + } = cached.as_ref() + { let outcome = self.publish_quotes(l2_snapshots.as_ref(), *time, true).await; if outcome != QuoteSendOutcome::None { @@ -1416,6 +1422,22 @@ impl MulticastPublisher { heartbeat_interval.reset(); } if outcome == QuoteSendOutcome::All { + // A fully-sent retry of the cached supersede + // discharges the provisional obligation + // exactly as a fully-sent first attempt + // would — apply the SAME transition the + // primary publish path uses, so a cached + // Authoritative that fully sends clears + // `pending_provisional` (a Correction is + // orthogonal and leaves it unchanged). + // Without this the obligation stays stuck + // true and a later stale Authoritative would + // force-publish past the freshness gate. + pending_provisional = Self::next_pending_provisional( + pending_provisional, + *emission, + QuoteSendOutcome::All, + ); // The cached supersede (or periodic refresh) // fully sent locally — the S2 retry // obligation is discharged. 
@@ -2274,4 +2296,42 @@ mod tests { assert!(!MulticastPublisher::caught_up_after_publish(Correction, false)); assert!(!MulticastPublisher::caught_up_after_publish(Provisional, false)); } + + /// Regression: a fully-sent interval RETRY of a cached supersede must + /// discharge the provisional obligation exactly like a fully-sent first + /// attempt, otherwise `pending_provisional` stays stuck true and a later + /// stale Authoritative force-publishes past the freshness gate. + /// + /// Composed proof using the pure transitions the loop applies: + /// 1. Provisional delivered -> pending = true + /// 2. forced-stale Authoritative, Partial -> pending stays true (kept), + /// retry armed + /// 3. interval retry, All (Authoritative) -> pending discharged (false) + /// 4. later stale Authoritative, no pending -> NOT published (suppressed) + #[test] + fn interval_retry_full_send_discharges_supersede() { + use QuoteSendOutcome::{All, Partial}; + use SnapshotEmission::{Authoritative, Provisional}; + let thr = MulticastPublisher::CATCHUP_THRESHOLD_MS; + + // 1. provisional delivered + let pending = MulticastPublisher::next_pending_provisional(false, Provisional, All); + assert!(pending, "delivered provisional creates the obligation"); + + // 2. forced-stale Authoritative, partial local send -> obligation kept + let pending = MulticastPublisher::next_pending_provisional(pending, Authoritative, Partial); + assert!(pending, "partial supersede must NOT discharge the obligation"); + + // 3. interval retry fully sends the cached Authoritative supersede + let pending = MulticastPublisher::next_pending_provisional(pending, Authoritative, All); + assert!(!pending, "a fully-sent retry of the supersede discharges the obligation"); + + // 4. a later STALE Authoritative with no pending obligation: the publish + // decision is `fresh || pending`; both are false -> suppressed. 
+ let fresh_false_lag = thr + 10_000; + assert!( + !MulticastPublisher::snapshot_should_publish(Authoritative, fresh_false_lag, pending), + "after the supersede is discharged, a later stale Authoritative must stay suppressed" + ); + } } From fc77a81cccd03f3368a8035dcf3367e5f5e1a398 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 15:13:16 -0400 Subject: [PATCH 57/65] publisher: drop racy cached-supersede resend; rely on in-loop force-publish + standard caught-up resync --- server/src/multicast/publisher.rs | 245 +++++------------------------- 1 file changed, 41 insertions(+), 204 deletions(-) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index bfd2c22b..6a20ddad 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -619,43 +619,6 @@ impl MulticastPublisher { fresh && !matches!(emission, SnapshotEmission::Provisional) } - /// Whether the periodic `snapshot_interval` arm must keep retrying the - /// cached snapshot because it is a forced-stale supersede whose LOCAL send - /// did not complete. - /// - /// Distinguishes the two provisional-supersede states the single - /// `pending_provisional` bool cannot: - /// - A `Provisional` is never cached and (S1) the awaited supersede does - /// not exist yet — `cached_snapshot` is the PRIOR block's authoritative, - /// so this MUST stay unchanged (resending the cache would roll - /// subscribers back). - /// - An `Authoritative`/`Correction` IS cached. If it published fresh - /// (`caught_up`), the normal `caught_up` periodic refresh covers it → - /// clear. If it was a forced-stale supersede that fully sent locally - /// (`All`) there is nothing to retry → clear. If its local send was - /// `Partial`/`None` (S2), `cached_snapshot` IS that supersede → arm the - /// retry so the interval re-sends exactly it (delivering the awaited - /// correction, not a rollback) until a full local send. 
- const fn next_retry_cached_supersede( - prev: bool, - emission: SnapshotEmission, - caught_up: bool, - outcome: QuoteSendOutcome, - ) -> bool { - match emission { - SnapshotEmission::Provisional => prev, - SnapshotEmission::Authoritative | SnapshotEmission::Correction => { - if caught_up { - false - } else if matches!(outcome, QuoteSendOutcome::All) { - false - } else { - true - } - } - } - } - fn should_warn_for_receiver_lag(lag_ms: u64) -> bool { lag_ms <= Self::CATCHUP_THRESHOLD_MS } @@ -1114,12 +1077,6 @@ impl MulticastPublisher { // delivered provisional that omits the block's final diffs. Cleared // when an Authoritative or Correction is published. let mut pending_provisional = false; - // S2 retry flag: set only when a forced-stale Authoritative/Correction - // supersede was emitted+cached but its local send did not complete; - // drives a bounded `snapshot_interval` retry of that cached supersede - // until a full local send. Never set by a Provisional (S1: cache is the - // prior block — must not resend). - let mut retry_cached_supersede = false; let mut health = TobPublisherHealth::new(Self::now_ms()); let mut fill_pairs = FillPairAccumulator::new(Self::now_ms()); @@ -1186,15 +1143,13 @@ impl MulticastPublisher { // and never a Provisional. A suppressed // (catch-up-gated) snapshot must not touch // the cache: it would poison the periodic - // resend and silently cancel an armed - // `retry_cached_supersede` for a real - // incompletely-sent supersede. With this, - // `cached_snapshot` is exactly the last - // PUBLISHED full (Authoritative/Correction) - // snapshot — the invariant the periodic - // resend and S2 retry both assume. A - // published Provisional is still excluded - // (partial mid-stall book). + // `caught_up` resend with stale state. With + // this, `cached_snapshot` is exactly the + // last PUBLISHED full (Authoritative/ + // Correction) snapshot — the invariant the + // periodic resync assumes. 
A published + // Provisional is still excluded (partial + // mid-stall book). if !matches!(emission, SnapshotEmission::Provisional) { cached_snapshot = Some(msg.clone()); } @@ -1225,30 +1180,24 @@ impl MulticastPublisher { // send outcome: a (even partially) delivered // Provisional creates it; only a FULLY-sent // Authoritative finalization discharges it (a - // partial/failed supersede keeps it). This - // obligation is tracked so the IN-LOOP publish - // decision `Authoritative => fresh || - // pending_provisional` still force-publishes a - // later same-epoch Authoritative even if - // stale, and so a PARTIAL local send of a - // supersede does not falsely discharge it. It - // intentionally does NOT drive periodic - // resends — doing so would rebroadcast the - // prior block's authoritative (see the - // `snapshot_interval` arm). A recovery - // Correction is orthogonal and leaves it - // unchanged. + // partial/failed supersede keeps it). A + // `Partial`/`None` local send keeps + // `pending_provisional` so the NEXT real + // Authoritative/Correction for the epoch is + // force-published by the in-loop + // `Authoritative => fresh || + // pending_provisional` decision — a race-free + // retry that always uses the newest snapshot, + // never a stale cached one. Stream-recovery + // resync is covered separately by the + // `caught_up`-gated periodic resend. A + // recovery Correction is orthogonal and + // leaves it unchanged. 
pending_provisional = Self::next_pending_provisional( pending_provisional, *emission, outcome, ); - retry_cached_supersede = Self::next_retry_cached_supersede( - retry_cached_supersede, - *emission, - caught_up, - outcome, - ); } else { caught_up = false; crate::metrics::observe_tob_source_lag( @@ -1389,60 +1338,31 @@ impl MulticastPublisher { } } _ = snapshot_interval.tick() => { - // Resend the cached full snapshot when either: - // - `caught_up`: the cache is a current fresh - // Authoritative/Correction (normal periodic refresh); or - // - `retry_cached_supersede`: the cache IS a forced-stale - // Authoritative/Correction supersede whose local send did - // not complete (S2) — retry exactly it until a full local - // send. This is NOT gated on bare `pending_provisional`: - // in S1 (provisional delivered, block not yet finalized) - // the cache is the PRIOR block's authoritative and - // resending it would roll subscribers back; - // `retry_cached_supersede` is armed only AFTER the - // supersede itself is cached, so the resend here is - // always the awaited correction, never a rollback. It is - // bounded: cleared on a full local send below or when a - // fresh snapshot supersedes (see - // `next_retry_cached_supersede`), so it cannot become a - // catch-up stale flood. - if caught_up || retry_cached_supersede { + // Standard periodic full-snapshot resync: rebroadcast the + // last PUBLISHED full (Authoritative/Correction) snapshot + // while caught up. This is the recovery path for subscribers + // that missed datagrams over fire-and-forget UDP multicast. + // Gated on `caught_up` ONLY (true exactly when the most + // recent publish was a genuinely fresh Authoritative/ + // Correction — i.e. `cached_snapshot` is current and + // correct). It is deliberately NOT extended to a pending + // provisional / supersede-retry: resending a cached snapshot + // from a timer races newer state and can roll subscribers + // back. 
A missed forced-stale supersede is instead retried + // race-free by the in-loop `Authoritative => fresh || + // pending_provisional` decision (the next real finalization/ + // recovery snapshot is force-published) and, on stream + // recovery, by this caught-up resync — the accepted + // best-effort contract for a degraded multicast feed. + if caught_up { if let Some(ref cached) = cached_snapshot - && let InternalMessage::Snapshot { - l2_snapshots, - time, - emission, - .. - } = cached.as_ref() - { - let outcome = - self.publish_quotes(l2_snapshots.as_ref(), *time, true).await; - if outcome != QuoteSendOutcome::None { + && let InternalMessage::Snapshot { l2_snapshots, time, .. } = cached.as_ref() { + if self.publish_quotes(l2_snapshots.as_ref(), *time, true).await + != QuoteSendOutcome::None + { had_activity = true; heartbeat_interval.reset(); } - if outcome == QuoteSendOutcome::All { - // A fully-sent retry of the cached supersede - // discharges the provisional obligation - // exactly as a fully-sent first attempt - // would — apply the SAME transition the - // primary publish path uses, so a cached - // Authoritative that fully sends clears - // `pending_provisional` (a Correction is - // orthogonal and leaves it unchanged). - // Without this the obligation stays stuck - // true and a later stale Authoritative would - // force-publish past the freshness gate. - pending_provisional = Self::next_pending_provisional( - pending_provisional, - *emission, - QuoteSendOutcome::All, - ); - // The cached supersede (or periodic refresh) - // fully sent locally — the S2 retry - // obligation is discharged. 
- retry_cached_supersede = false; - } } } } @@ -2233,51 +2153,6 @@ mod tests { assert!(MulticastPublisher::next_pending_provisional(true, Correction, Partial)); } - /// Locks the S1/S2 separation: a Provisional never arms the retry (S1 — the - /// cache is the prior block; resending it would roll subscribers back), and - /// a forced-stale Authoritative/Correction supersede with an incomplete - /// local send (S2) arms it so the interval re-sends exactly that cached - /// supersede until a full local send. - #[test] - fn next_retry_cached_supersede_matrix() { - use QuoteSendOutcome::{All, None, Partial}; - use SnapshotEmission::{Authoritative, Correction, Provisional}; - // S1: Provisional never (un)arms — unchanged from prev. - assert!(!MulticastPublisher::next_retry_cached_supersede(false, Provisional, false, None)); - assert!(MulticastPublisher::next_retry_cached_supersede(true, Provisional, false, All)); - // Fresh Authoritative/Correction: normal caught_up refresh covers it -> clear. - assert!(!MulticastPublisher::next_retry_cached_supersede(true, Authoritative, true, All)); - assert!(!MulticastPublisher::next_retry_cached_supersede(true, Correction, true, Partial)); - // Forced-stale supersede fully sent locally -> nothing to retry -> clear. - assert!(!MulticastPublisher::next_retry_cached_supersede(true, Authoritative, false, All)); - // S2: forced-stale supersede, incomplete local send -> arm the retry. - assert!(MulticastPublisher::next_retry_cached_supersede(false, Authoritative, false, Partial)); - assert!(MulticastPublisher::next_retry_cached_supersede(false, Authoritative, false, None)); - assert!(MulticastPublisher::next_retry_cached_supersede(false, Correction, false, Partial)); - assert!(MulticastPublisher::next_retry_cached_supersede(false, Correction, false, None)); - } - - /// Regression: a suppressed stale Authoritative (no pending provisional) - /// must not disarm an armed `retry_cached_supersede`. 
Suppression never - /// reaches the publish branch, so it cannot run `next_retry_cached_supersede` - /// and cannot touch `cached_snapshot` (the assignment now lives inside the - /// publish branch). This asserts the transition fn itself never *clears* an - /// armed retry for the inputs a suppressed-then-stale sequence would imply - /// IF it (incorrectly) reached the publish branch — i.e. the only legitimate - /// disarms are a fresh publish or a full local send. - #[test] - fn suppressed_snapshot_must_not_disarm_supersede_retry() { - use QuoteSendOutcome::{All, None, Partial}; - use SnapshotEmission::{Authoritative, Correction}; - // Armed (prev=true). The ONLY disarms are: fresh publish, or full send. - assert!(!MulticastPublisher::next_retry_cached_supersede(true, Authoritative, true, All)); // fresh -> clear (legit) - assert!(!MulticastPublisher::next_retry_cached_supersede(true, Authoritative, false, All)); // full send -> clear (legit) - // A forced-stale incomplete send keeps it armed (must NOT be cleared). - assert!(MulticastPublisher::next_retry_cached_supersede(true, Authoritative, false, Partial)); - assert!(MulticastPublisher::next_retry_cached_supersede(true, Authoritative, false, None)); - assert!(MulticastPublisher::next_retry_cached_supersede(true, Correction, false, Partial)); - } - /// `caught_up` after a publish: true ONLY for a fresh Authoritative or /// Correction. 
A Provisional (even fresh) must NOT enable periodic resend /// (it would rebroadcast the older cached authoritative during a stall and @@ -2296,42 +2171,4 @@ mod tests { assert!(!MulticastPublisher::caught_up_after_publish(Correction, false)); assert!(!MulticastPublisher::caught_up_after_publish(Provisional, false)); } - - /// Regression: a fully-sent interval RETRY of a cached supersede must - /// discharge the provisional obligation exactly like a fully-sent first - /// attempt, otherwise `pending_provisional` stays stuck true and a later - /// stale Authoritative force-publishes past the freshness gate. - /// - /// Composed proof using the pure transitions the loop applies: - /// 1. Provisional delivered -> pending = true - /// 2. forced-stale Authoritative, Partial -> pending stays true (kept), - /// retry armed - /// 3. interval retry, All (Authoritative) -> pending discharged (false) - /// 4. later stale Authoritative, no pending -> NOT published (suppressed) - #[test] - fn interval_retry_full_send_discharges_supersede() { - use QuoteSendOutcome::{All, Partial}; - use SnapshotEmission::{Authoritative, Provisional}; - let thr = MulticastPublisher::CATCHUP_THRESHOLD_MS; - - // 1. provisional delivered - let pending = MulticastPublisher::next_pending_provisional(false, Provisional, All); - assert!(pending, "delivered provisional creates the obligation"); - - // 2. forced-stale Authoritative, partial local send -> obligation kept - let pending = MulticastPublisher::next_pending_provisional(pending, Authoritative, Partial); - assert!(pending, "partial supersede must NOT discharge the obligation"); - - // 3. interval retry fully sends the cached Authoritative supersede - let pending = MulticastPublisher::next_pending_provisional(pending, Authoritative, All); - assert!(!pending, "a fully-sent retry of the supersede discharges the obligation"); - - // 4. 
a later STALE Authoritative with no pending obligation: the publish - // decision is `fresh || pending`; both are false -> suppressed. - let fresh_false_lag = thr + 10_000; - assert!( - !MulticastPublisher::snapshot_should_publish(Authoritative, fresh_false_lag, pending), - "after the supersede is discharged, a later stale Authoritative must stay suppressed" - ); - } } From 618c460bf442e9884d65050a96996dc1cb96a03c Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 15:25:01 -0400 Subject: [PATCH 58/65] publisher: drop caught-up on broadcast lag so stale cache is not rebroadcast as current --- server/src/multicast/publisher.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index 6a20ddad..08aedac1 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -1324,6 +1324,18 @@ impl MulticastPublisher { } }, Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => { + // The bounded broadcast channel dropped `n` messages + // (shared with high-volume fills). One of them may + // have been the sole per-finalized-block authoritative + // snapshot, so `cached_snapshot` can no longer be + // asserted current. Per the `caught_up` invariant + // (true only when the most recent publish was a + // genuinely fresh Authoritative/Correction), drop + // caught-up state: the periodic resync must NOT keep + // rebroadcasting a possibly-stale cached snapshot as + // if current. The next genuine fresh snapshot + // re-establishes `caught_up` and refreshes the cache. 
+ caught_up = false; crate::metrics::inc_tob_receiver_lag("event", 1); crate::metrics::inc_tob_receiver_lag("message", n); if let Some(report) = health.record_receiver_lag(n, Self::now_ms()) { From a23e3c9a01a1dd17a11704e3b7c4344fbc46a9cc Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 15:33:30 -0400 Subject: [PATCH 59/65] publisher: corrections stay resync-eligible; broadcast lag also clears pending provisional --- server/src/multicast/publisher.rs | 102 +++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 30 deletions(-) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index 08aedac1..9d725fd8 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -607,16 +607,44 @@ impl MulticastPublisher { } /// `caught_up` gates the periodic `snapshot_interval` rebroadcast of the - /// cached authoritative snapshot. It must be true ONLY when the most recent - /// publish was a genuinely fresh `Authoritative` or `Correction` (a real, - /// full snapshot we are keeping up with). A `Provisional` is partial mid- - /// stall state that is intentionally NOT cached, so flipping `caught_up` - /// on it would let the interval path rebroadcast the OLDER cached - /// authoritative and roll subscribers back during the stall. A forced- - /// stale Authoritative/Correction (`fresh == false`) also must not enable - /// periodic resends. Hence: caught up iff fresh AND not a Provisional. + /// cached authoritative snapshot. Rule, per emission kind: + /// + /// - `Correction`: ALWAYS resync-eligible, regardless of source lag. A + /// Correction is divergence-repair that bypasses the publish freshness + /// gate precisely so a subscriber is never stranded on incorrect + /// (diverged) data; it must likewise stay eligible for the periodic + /// resync, which is the only recovery for a Correction datagram lost + /// over fire-and-forget UDP. 
Its content is the corrected book from a + /// fresh venue snapshot, and Corrections are recovery-only (rare), so + /// periodically resending the single cached Correction is correct + /// recovery resync, not stale catch-up flooding. + /// - `Authoritative`: resync-eligible iff `fresh`. A forced-stale + /// Authoritative (`fresh == false`) must NOT enable the periodic + /// resend, which would rebroadcast stale data as current (catch-up + /// suppression). + /// - `Provisional`: NEVER. It is partial mid-stall state that is + /// intentionally NOT cached, so flipping `caught_up` on it would let + /// the interval path rebroadcast the OLDER cached authoritative and + /// roll subscribers back during the stall. const fn caught_up_after_publish(emission: SnapshotEmission, fresh: bool) -> bool { - fresh && !matches!(emission, SnapshotEmission::Provisional) + match emission { + // Divergence-repair: a Correction bypasses the publish freshness + // gate AND must stay eligible for the periodic resync (the only + // recovery for a datagram lost over fire-and-forget UDP). Its + // content is the corrected book from a fresh venue snapshot; a + // subscriber must never be stranded on incorrect (diverged) data. + // Corrections are recovery-only (rare), so periodic resync of the + // single cached Correction is not catch-up flooding. + SnapshotEmission::Correction => true, + // A fresh Authoritative is current → resync-eligible. A + // forced-stale Authoritative (fresh == false) must NOT enable the + // periodic resend (it would rebroadcast stale data as current — + // catch-up suppression). + SnapshotEmission::Authoritative => fresh, + // A Provisional is a partial mid-stall book — never periodically + // resync (would rebroadcast an incomplete book / risk rollback). 
+ SnapshotEmission::Provisional => false, + } } fn should_warn_for_receiver_lag(lag_ms: u64) -> bool { @@ -1325,17 +1353,29 @@ impl MulticastPublisher { }, Err(tokio::sync::broadcast::error::RecvError::Lagged(n)) => { // The bounded broadcast channel dropped `n` messages - // (shared with high-volume fills). One of them may - // have been the sole per-finalized-block authoritative - // snapshot, so `cached_snapshot` can no longer be - // asserted current. Per the `caught_up` invariant - // (true only when the most recent publish was a - // genuinely fresh Authoritative/Correction), drop - // caught-up state: the periodic resync must NOT keep - // rebroadcasting a possibly-stale cached snapshot as - // if current. The next genuine fresh snapshot - // re-establishes `caught_up` and refreshes the cache. + // (shared with high-volume fills). The drop may have + // taken the sole per-finalized-block authoritative + // snapshot AND/OR a delivered Provisional and its + // superseding Authoritative, so neither the + // caught-up state nor the in-flight provisional model + // can be asserted any longer. Invalidate BOTH: + // + // - Drop caught-up state: the periodic resync must + // NOT keep rebroadcasting a possibly-stale cached + // snapshot as if current. The next genuine fresh + // snapshot re-establishes `caught_up` and refreshes + // the cache. caught_up = false; + // A Lagged drop may also have dropped a delivered + // Provisional and/or its superseding Authoritative, + // so the publisher's in-flight provisional/supersede + // model is unreliable. Clear it so a later unrelated + // Authoritative is not force-published as current via + // a stuck `fresh || pending_provisional` (stale + // backlog leak). A genuinely stranded subscriber is + // resynced by the standard caught-up resync once a + // fresh Authoritative/Correction re-establishes it. 
+ pending_provisional = false; crate::metrics::inc_tob_receiver_lag("event", 1); crate::metrics::inc_tob_receiver_lag("message", n); if let Some(report) = health.record_receiver_lag(n, Self::now_ms()) { @@ -2165,22 +2205,24 @@ mod tests { assert!(MulticastPublisher::next_pending_provisional(true, Correction, Partial)); } - /// `caught_up` after a publish: true ONLY for a fresh Authoritative or - /// Correction. A Provisional (even fresh) must NOT enable periodic resend - /// (it would rebroadcast the older cached authoritative during a stall and - /// roll subscribers back). A forced-stale publish (fresh=false) also stays - /// not-caught-up. + /// `caught_up` after a publish: a Correction is ALWAYS resync-eligible + /// (fresh or not) — it is divergence-repair and must stay recoverable over + /// lossy UDP. An Authoritative is resync-eligible iff fresh (a forced-stale + /// one must not enable periodic resend — catch-up suppression). A + /// Provisional is NEVER resync-eligible (it would rebroadcast the older + /// cached authoritative during a stall and roll subscribers back). #[test] fn caught_up_after_publish_matrix() { use SnapshotEmission::{Authoritative, Correction, Provisional}; - // fresh authoritative / correction -> caught up - assert!(MulticastPublisher::caught_up_after_publish(Authoritative, true)); + // Correction is divergence-repair: ALWAYS resync-eligible (fresh or not). assert!(MulticastPublisher::caught_up_after_publish(Correction, true)); - // fresh provisional -> NOT caught up (must not trigger periodic resend) - assert!(!MulticastPublisher::caught_up_after_publish(Provisional, true)); - // stale (forced) anything -> NOT caught up + assert!(MulticastPublisher::caught_up_after_publish(Correction, false)); + // Authoritative: resync-eligible iff fresh (forced-stale must not enable + // periodic resend — catch-up suppression). 
+ assert!(MulticastPublisher::caught_up_after_publish(Authoritative, true)); assert!(!MulticastPublisher::caught_up_after_publish(Authoritative, false)); - assert!(!MulticastPublisher::caught_up_after_publish(Correction, false)); + // Provisional: never (partial mid-stall book). + assert!(!MulticastPublisher::caught_up_after_publish(Provisional, true)); assert!(!MulticastPublisher::caught_up_after_publish(Provisional, false)); } } From 3951bcee27913b10a4a8ea10adfb7b637f4704f5 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 15:41:37 -0400 Subject: [PATCH 60/65] publisher: keep pending-provisional across broadcast lag (next authoritative supersedes; revert lag-clear) --- server/src/multicast/publisher.rs | 41 ++++++++++++++++--------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/server/src/multicast/publisher.rs b/server/src/multicast/publisher.rs index 9d725fd8..2f4da48d 100644 --- a/server/src/multicast/publisher.rs +++ b/server/src/multicast/publisher.rs @@ -1355,27 +1355,28 @@ impl MulticastPublisher { // The bounded broadcast channel dropped `n` messages // (shared with high-volume fills). The drop may have // taken the sole per-finalized-block authoritative - // snapshot AND/OR a delivered Provisional and its - // superseding Authoritative, so neither the - // caught-up state nor the in-flight provisional model - // can be asserted any longer. Invalidate BOTH: - // - // - Drop caught-up state: the periodic resync must - // NOT keep rebroadcasting a possibly-stale cached - // snapshot as if current. The next genuine fresh - // snapshot re-establishes `caught_up` and refreshes - // the cache. + // snapshot, so caught-up state can no longer be + // asserted: drop it so the periodic resync does not + // keep rebroadcasting a possibly-stale cached + // snapshot as if current. The next genuine fresh + // snapshot re-establishes `caught_up` and refreshes + // the cache. 
caught_up = false; - // A Lagged drop may also have dropped a delivered - // Provisional and/or its superseding Authoritative, - // so the publisher's in-flight provisional/supersede - // model is unreliable. Clear it so a later unrelated - // Authoritative is not force-published as current via - // a stuck `fresh || pending_provisional` (stale - // backlog leak). A genuinely stranded subscriber is - // resynced by the standard caught-up resync once a - // fresh Authoritative/Correction re-establishes it. - pending_provisional = false; + // `pending_provisional` is deliberately NOT cleared + // here. If the drop took a delivered provisional's + // superseding Authoritative, the obligation must + // survive so the NEXT finalized-block Authoritative + // is force-published (`fresh || pending_provisional`) + // and supersedes the partial provisional the + // subscriber is on — the correct, complete, + // most-recent finalized book (beneficial resync, not + // a backlog flood: this is the L2-5 per-block + // cadence, and the stuck-stream backstop barely fires + // during fast catch-up so the obligation is + // realistically set only during a genuine stall). + // Its only discharge remains a fully-sent + // superseding Authoritative (`QuoteSendOutcome::All`) + // or a fresh one — the single race-free rule. 
crate::metrics::inc_tob_receiver_lag("event", 1); crate::metrics::inc_tob_receiver_lag("message", n); if let Some(report) = health.record_receiver_lag(n, Self::now_ms()) { From 31ac40f84092144c06582c89ee6e3e610169efc5 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 17:19:02 -0400 Subject: [PATCH 61/65] gitignore: exclude docker cross-build output dir --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d91929f2..0f73cc95 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ target/ server/tmp/ .worktrees/ .claude/*.lock +target-docker/ From d57d8c0bd45f8f16066998c58893d5301c42e327 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 18:35:08 -0400 Subject: [PATCH 62/65] docs: update ARCHITECTURE.md for multicast-only default, L2-5 finalization emission, supersede model, recovery guards --- ARCHITECTURE.md | 133 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 125 insertions(+), 8 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index d30ad398..11bb784a 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -7,8 +7,10 @@ the file tree alone. The main binary is `dz_hl_publisher` in `binaries/src/bin/dz_hl_publisher.rs`. It can serve WebSocket clients, publish top-of-book (TOB) multicast, publish depth-of-book (DoB) multicast, and expose -Prometheus metrics. Production deployments in this repo mostly use it as a -multicast publisher. +Prometheus metrics. Production deployments in this repo run it **multicast-only: +the WebSocket listener is disabled by default** and is opt-in via +`--enable-websocket`. At least one output must be configured (see Process +Startup); a no-output run is rejected. ## High-Level Model @@ -59,11 +61,19 @@ Important source files: `dz_hl_publisher` parses CLI flags and constructs: - `IngestMode`: `block` or `stream`. +- WebSocket listener: off unless `--enable-websocket` is passed. - Optional TOB `MulticastConfig`. - Optional DoB `DobConfig`. 
- Optional Prometheus metrics listener. - Optional streaming-only separate fill ingest. +**Output-mode validation.** A run with no output (`--enable-websocket` absent +and neither `--multicast-group` nor `--dob-group`) is rejected. `--dob-group` is +not a standalone mode: the instrument registry is only bootstrapped from the HL +API in multicast mode, so `--dob-group` requires `--multicast-group`. Both rules +are enforced at the CLI **and** again at the `run_websocket_server` library +boundary, so an alternate caller cannot start a publisher that emits nothing. + It then calls `run_websocket_server`. The function name is historical: this is also the top-level coordinator for multicast-only deployments. @@ -72,7 +82,11 @@ also the top-level coordinator for multicast-only deployments. - `market_message_tx`: carries `InternalMessage::Snapshot` and `InternalMessage::Fills` for TOB and WebSocket L2/trade consumers. - `l4_message_tx`: carries `InternalMessage::L4BookUpdates` for WebSocket L4 - consumers. TOB does not subscribe to this high-volume channel. + consumers. TOB does not subscribe to this high-volume channel. When the + WebSocket listener is disabled, the per-diff L4 fan-out is skipped entirely + (`publish_l4_update` returns early): in multicast-only mode there are no L4 + consumers, so the clone + task-spawn + broadcast per applied diff is pure + overhead and is elided. If TOB or DoB is enabled, a shared instrument registry is also created. TOB uses it to map coins to instrument definitions. DoB uses it to resolve internal coins @@ -121,9 +135,23 @@ After initialization: - Every validation interval, the listener compares its computed L4 snapshot with a fresh validator snapshot. +Validation is done off-lock to avoid blocking the hot path: the listener clones +book state under the lock (capturing the snapshot height and a monotonic +`mutation_seq`), drops the lock, computes and compares the snapshot, then +re-acquires the lock to apply repairs. 
Before applying, it re-checks that the +live height **and** `mutation_seq` still match what was validated, and (in +streaming mode) that the height is finalized with no block still buffered for +it. If anything moved, the report is stale and discarded — a later validation +cycle re-derives against current state. This prevents a raced report from +overwriting newer book state. Recovery's own per-coin mutations also bump +`mutation_seq`, so an overlapping validation task cannot replay a stale repair. + When validation finds a per-coin divergence, the listener repairs only affected coins. If DoB is enabled, this recovery also emits an `InstrumentReset` and -queues a priority DoB snapshot for the affected instrument. +queues a priority DoB snapshot for the affected instrument. On the TOB side, +recovery emits a `Correction` snapshot (see TOB Publishing Hot Path) so the +corrected book reaches subscribers without waiting for an unrelated future +diff. ## Block Mode Hot Path @@ -166,6 +194,9 @@ Key properties: mutations. - Fills publish TOB trades but do not mutate book state. - TOB snapshots are emitted once per file-read chunk, not per individual diff. + This per-chunk cadence applies to block mode and to WebSocket-enabled + streaming. Multicast-only streaming instead emits per finalized block (see + Streaming Finalization). - In streaming mode, optional separate fill ingest sends fills directly to the market broadcast and does not compute book snapshots for fill-only rows. @@ -196,7 +227,7 @@ sequenceDiagram L->>B: drain earliest block B->>S: apply_stream_diff when ready S->>D: immediate DoB mutation event - L->>M: Snapshot after read chunk + L->>M: Snapshot at block finalization (per chunk if --enable-websocket) L->>M: Fills as fill rows arrive M->>T: pair fill sides by tid M->>T: freshness decision, then Quote/Trade or suppression @@ -266,6 +297,20 @@ meaningful diffs after finalization are fatal because they would mutate a closed DoB block. 
Late statuses after finalization are metadata-only unless a pending raw diff can consume them, so they are counted and ignored. +In multicast-only streaming, finalization also drives the **authoritative TOB +L2 snapshot**: instead of emitting once per file-read chunk, the listener emits +one snapshot when a block finalizes. A real BBO-affecting mutation opens a +`book_dirty` epoch (stamped with the dirtying row's source time and a +`mutation_seq`); finalization emits the authoritative snapshot and closes the +epoch. If a block stays unfinalized, a stuck-stream backstop emits a +`Provisional` snapshot — gated so it only fires once the dirty epoch's source +age is at least `STREAM_DIRTY_BACKSTOP_INTERVAL`, deduplicated by `mutation_seq` +so a static stall does not recompute every tick while ongoing diffs still +re-publish updated state. Recovery (per-coin divergence repair) emits a +`Correction`. WebSocket-enabled streaming bypasses all of this and keeps the +pre-existing per-chunk cadence byte-for-byte; that flag is the durable rollback +path for this emission model. + ## TOB Publishing Hot Path TOB has two UDP channels: @@ -281,6 +326,17 @@ The TOB publisher subscribes only to `market_message_tx`. It receives: It intentionally ignores `InternalMessage::L4BookUpdates`, so high-volume L4 traffic cannot evict TOB snapshots from the TOB receiver. +The L2 snapshot carried in `InternalMessage::Snapshot` is computed by +`compute_l2_snapshots`. With the WebSocket listener enabled it produces the full +set of WebSocket `l2Book` aggregation variants (the `n_sig_figs`/`mantissa` +bucket combinations, 7 per coin). With the WebSocket listener disabled there are +no `l2Book` subscribers, so it computes a single unbucketed snapshot — the TOB +publisher only ever uses level 1 (best bid/ask) of it anyway. This affects the +TOB/WebSocket-L2 path only. 
The DoB feed is a separate path (per-order L4 deltas +via `DobApplyTap` and full per-instrument resting-order snapshots via +`clone_coin_orders`) and is unaffected — DoB depth/level coverage does not +change with this setting. + ```mermaid flowchart LR Listener["OrderBookListener"] -->|"Snapshot"| MarketTx["market broadcast"] @@ -308,6 +364,40 @@ marketdata is suppressed. Heartbeats and refdata can still be emitted. Activity tracking is based on actual marketdata datagrams, so suppressed messages do not starve heartbeats. +### Snapshot emission kinds and the supersede model + +`InternalMessage::Snapshot` carries an emission kind that the publisher owns the +interpretation of (the publisher is the only component that both knows whether a +provisional was actually sent and applies the freshness gate, with one clock): + +- **Authoritative** — normal block finalization / block-mode per-chunk emit. + Lag-gated: published only when fresh, so catch-up backlog is suppressed. +- **Provisional** — stuck-stream backstop. Lag-gated like Authoritative. +- **Correction** — recovery divergence repair. Always published, bypassing the + freshness gate, because it corrects *incorrect* (diverged) data rather than + merely stale data. + +A `Provisional` that is actually sent creates a supersede obligation: the next +finalized-block `Authoritative` is force-published even if it is now stale, so a +subscriber that saw the partial provisional converges to the block's final book. +The obligation is discharged only by a *fully locally sent* Authoritative (a +partial UDP send keeps it, so a later snapshot re-forces it); a recovery +`Correction` is orthogonal to the stream dirty epoch and never discharges it. + +`caught_up` gates the periodic `snapshot_interval` resync (a bounded +rebroadcast of the last published full snapshot — the recovery path for +subscribers that missed a datagram over fire-and-forget UDP). 
It is true only +when the most recent publish was a genuinely fresh `Authoritative`, or any +`Correction` (a correction must stay resync-eligible — leaving a subscriber on +diverged data is worse than stale). It is cleared on suppression, on a +forced-stale publish, and on broadcast-receiver `Lagged` (a lag drop may have +skipped the only finalization snapshot, so the cache can no longer be asserted +current). Broadcast `Lagged` does not clear the supersede obligation: keeping it +lets the next finalized Authoritative resync a stranded subscriber. There is +one documented irreducible best-effort gap here: a lost forced-stale supersede +during a continued stall is recovered by the next snapshot / the caught-up +resync once the stream recovers, not by unbounded stale rebroadcast. + Important latency components: - Validator/source lag: `local_time - block_time`. @@ -384,9 +474,17 @@ There are three notable fanout boundaries: | `l4_message_tx` | Tokio broadcast | WebSocket `L4BookUpdates` | High-volume L4 traffic should not affect TOB. | | DoB MPSC | bounded Tokio MPSC | `DobEvent` | Full queue drops DoB mutation events. | +The DoB MPSC depth (and the DoB snapshot-request channel) is sized by +`--dob-channel-bound`. Hyperliquid emits large `OrderAdd` bursts at block +boundaries, so this must be sized for burst absorption, not average rate; a +too-shallow bound drops events during normal block-boundary bursts. Production +runs use a deep bound (`65536`); the historical `4096` default is too shallow. + Broadcast lag and bounded-channel drops are intentionally different failure -modes. Broadcast lag is receiver-side loss on an internal pub-sub channel. DoB -MPSC full is producer-side backpressure at the apply tap. +modes. Broadcast lag is receiver-side loss on an internal pub-sub channel; the +TOB publisher mitigates it by invalidating `caught_up` on `Lagged` so it does +not rebroadcast a possibly-stale cached snapshot as current. 
DoB MPSC full is +producer-side backpressure at the apply tap and is correctness-significant. ## Metrics and Observability @@ -492,10 +590,29 @@ active instrument. This usually points at stale or incomplete instrument metadata. The mutation still applies to the internal book, but no DoB event is emitted for that coin. +### `dob_tap: channel full, dropping ...` + +The bounded DoB MPSC backpressured and the tap dropped a mutation event. This is +correctness-significant: downstream reconstructed DoB books are inconsistent for +that instrument until the next reset/snapshot. The usual cause is a +`--dob-channel-bound` too shallow to absorb Hyperliquid block-boundary +`OrderAdd` bursts (it is bursty/activity-dependent, not a constant rate). The +fix is a deeper bound (production: `65536`); persistent drops at a deep bound +instead indicate the DoB emitter cannot keep up with steady-state load and need +a throughput investigation, not a larger buffer. + ## Design Constraints -- Block mode remains supported and is the compatibility baseline. +- Block mode remains supported and is the compatibility baseline; it must not + regress. - Streaming mode is opt-in and optimized for lower latency. +- The WebSocket listener is off by default. `--enable-websocket` is the durable + rollback contract: it restores the pre-finalization-driven streaming emission + byte-for-byte (per-chunk cadence, full L2 fan-out, recovery carried by the + next per-chunk snapshot). +- A run with no configured output is rejected; `--dob-group` requires + `--multicast-group`. These are enforced at the CLI and the + `run_websocket_server` boundary. - Fills do not mutate book state. - Raw-book `New` diffs insert resting orders directly; they do not invoke local matching. 
The raw diff price/size are authoritative for the resting book; From 7693bc6015644dfdb6f93c95ee8a41bc403250b3 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 18:38:45 -0400 Subject: [PATCH 63/65] docs: add repo claude.md onboarding pointing at architecture.md --- CLAUDE.md | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..8ebc9612 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,105 @@ +# CLAUDE.md + +Guidance for Claude Code (and any coding agent) working in this repository. + +## First: become an expert in the system + +**Before doing any work, read [`ARCHITECTURE.md`](ARCHITECTURE.md) in full and +treat it as the authoritative mental model of this system.** It is organized +around runtime dataflow and the TOB/DoB publishing hot paths, not the file tree, +and it documents the non-obvious invariants. Do not propose or make changes to +the listener, order book, finalization, recovery, or publisher paths until you +can explain, from `ARCHITECTURE.md`: + +- the block-mode vs streaming-mode hot paths and how they converge on the same + raw-diff apply semantics; +- streaming finalization (watermark vs grace fallback) and the L2-5 + finalization-driven TOB snapshot emission model (`book_dirty` epoch, + stuck-stream `Provisional` backstop, recovery `Correction`); +- the TOB publisher supersede model (`Authoritative`/`Provisional`/`Correction`, + `caught_up`, the supersede obligation, broadcast-`Lagged` hygiene); +- the off-lock snapshot-validation/recovery guard (`mutation_seq` + finalized + height re-check); +- TOB vs DZ-DoB being independent feeds. + +Also skim `README.md` for the CLI flag reference, the metric reference, and the +fixture/regeneration notes. 
Keep `ARCHITECTURE.md` updated in the same change +whenever you alter an architecture-relevant path (dataflow, channels, emission +model, finalization, recovery, validation, output modes). + +## Source map + +`ARCHITECTURE.md` has the authoritative file table. Quick orientation: + +| Area | Files | +|------|-------| +| Binary / CLI | `binaries/src/bin/dz_hl_publisher.rs` | +| Runtime orchestration | `server/src/servers/websocket_server.rs` | +| Listener / ingest / finalization | `server/src/listeners/order_book/mod.rs` | +| Book mutation | `server/src/listeners/order_book/state.rs`, `server/src/order_book/` | +| DoB apply tap | `server/src/listeners/order_book/dob_tap.rs` | +| TOB publisher | `server/src/multicast/publisher.rs` | +| DoB emitters | `server/src/multicast/dob.rs` | +| Wire formats | `server/src/protocol/`, `server/src/protocol/dob/` | +| E2E / parity tests | `server/src/listeners/order_book/block_mode_multicast_e2e.rs` | + +## Build / test / lint + +Rust workspace (`server`, `binaries`), Rust 1.90. Linux release binaries are +cross-built via Docker `linux/amd64` (Debian bookworm; `openssl-sys` cannot +cross-compile from macOS). For any hot-path change run: + +```bash +cargo test -p server +cargo test -p server listeners::order_book::block_mode_multicast_e2e -- --nocapture +cargo test -p server dual_validator_fixture_matches_block_and_stream_goldens -- --nocapture +cargo clippy --workspace --all-targets +``` + +CI runs `cargo clippy --workspace --all-targets` and `cargo test --workspace` on +`main` and PRs. Clippy is `warn`-level pedantic/nursery: there is a large +pre-existing baseline of warnings; do not chase them — only ensure your change +adds no *new* warnings in files you touch. + +## Hard invariants — do not violate + +- **Goldens are byte-identical.** Block-mode and stream goldens + (`server/tests/fixtures/**`) must not change unless a behavior change is + intended and deliberate. 
A moved `.bin`/`.json` golden is a stop-and-explain + signal, never a silent regenerate. +- **Block mode must not regress.** It is the compatibility baseline. Streaming + changes use separate code but must converge on the same raw-diff apply + semantics. +- **`--enable-websocket` is the durable rollback contract.** WS-enabled + streaming is byte-for-byte the pre-L2-5 per-chunk cadence. Do not couple + WS-enabled behavior to the finalization-driven path. +- **DoB drops are correctness-significant** (unlike TOB freshness suppression). + The DoB MPSC is sized by `--dob-channel-bound`; it must absorb Hyperliquid + block-boundary `OrderAdd` bursts (production runs `65536`, not the shallow + `4096` default). +- **The stuck-stream backstop / supersede model contains an irreducible + best-effort tradeoff** over fire-and-forget UDP multicast during a stalled + feed. The convergent design is: lag-gated Provisional/Authoritative, only + Correction bypasses freshness, race-free in-loop force-publish of the next + finalized Authoritative, standard `caught_up` periodic resync. Do **not** + re-introduce a cached-supersede periodic-retry mechanism — it was removed + after multiple iterations because resending a *cached* snapshot from a timer + is fundamentally racy (rollback / cache-poison). See `ARCHITECTURE.md` + "Snapshot emission kinds and the supersede model". +- **Output-config validation** (no-output rejected; `--dob-group` requires + `--multicast-group`) is enforced at both the CLI and the + `run_websocket_server` boundary. Keep both in sync. +- **Metrics labels stay low-cardinality.** Never add coin, oid, block height, or + subscription labels. +- **Recovery is off-lock.** Validation clones under the lock, validates without + it, and re-checks live height + `mutation_seq` (+ stream finalized/not + buffered) before applying. Do not apply a recovery report without that + re-check; recovery mutations must keep bumping `mutation_seq`. 
+ +## Conventions + +- Branch names use an `ss/` prefix. +- Commit messages: lowercase, concise, no `Co-Authored-By` lines. +- Prefer in-place doc edits over appended change-logs (keep `ARCHITECTURE.md` + and `README.md` reading as the current system, not a history). +- Production deployments are multicast-only; never assume a WS server is bound. From 13dc3e81be8e195b2cc0a3a19ae5c9c6dbe64da3 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Mon, 18 May 2026 19:01:49 -0400 Subject: [PATCH 64/65] binaries: default --dob-channel-bound to 65536 (burst-safe); reconcile docs --- ARCHITECTURE.md | 21 +++++++++++---------- CHANGELOG.md | 5 +++++ CLAUDE.md | 7 ++++--- binaries/src/bin/dz_hl_publisher.rs | 7 +++++-- 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 11bb784a..ea9b4d6f 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -475,10 +475,12 @@ There are three notable fanout boundaries: | DoB MPSC | bounded Tokio MPSC | `DobEvent` | Full queue drops DoB mutation events. | The DoB MPSC depth (and the DoB snapshot-request channel) is sized by -`--dob-channel-bound`. Hyperliquid emits large `OrderAdd` bursts at block -boundaries, so this must be sized for burst absorption, not average rate; a -too-shallow bound drops events during normal block-boundary bursts. Production -runs use a deep bound (`65536`); the historical `4096` default is too shallow. +`--dob-channel-bound`, which **defaults to `65536`**. Hyperliquid emits large +`OrderAdd` bursts at block boundaries, so this must be sized for burst +absorption, not average rate; the default is chosen accordingly. Overriding it +to a shallow value (e.g. the old `4096`) drops correctness-significant DoB +mutation events during normal block-boundary bursts — do not lower it without +evidence the workload's bursts fit. Broadcast lag and bounded-channel drops are intentionally different failure modes. 
Broadcast lag is receiver-side loss on an internal pub-sub channel; the @@ -594,12 +596,11 @@ emitted for that coin. The bounded DoB MPSC backpressured and the tap dropped a mutation event. This is correctness-significant: downstream reconstructed DoB books are inconsistent for -that instrument until the next reset/snapshot. The usual cause is a -`--dob-channel-bound` too shallow to absorb Hyperliquid block-boundary -`OrderAdd` bursts (it is bursty/activity-dependent, not a constant rate). The -fix is a deeper bound (production: `65536`); persistent drops at a deep bound -instead indicate the DoB emitter cannot keep up with steady-state load and need -a throughput investigation, not a larger buffer. +that instrument until the next reset/snapshot. With the default +`--dob-channel-bound` (`65536`) this should not occur under normal +block-boundary bursts; if it does, either the bound was overridden too low or +the DoB emitter cannot keep up with steady-state load — the latter needs a +throughput investigation, not a larger buffer. ## Design Constraints diff --git a/CHANGELOG.md b/CHANGELOG.md index 31942800..81867bf6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -53,3 +53,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - Streaming recovery now emits an authoritative TOB snapshot immediately, so a per-coin divergence repair is reflected to multicast subscribers without waiting for an unrelated later diff. +- `--dob-channel-bound` default raised `4096` → `65536`. The old default + dropped correctness-significant DoB mutation events during normal Hyperliquid + block-boundary `OrderAdd` bursts; the bound must be sized for burst + absorption, so the safe value is now the default (deployments no longer have + to remember to pass it explicitly). diff --git a/CLAUDE.md b/CLAUDE.md index 8ebc9612..0638e67c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -74,9 +74,10 @@ adds no *new* warnings in files you touch. 
streaming is byte-for-byte the pre-L2-5 per-chunk cadence. Do not couple WS-enabled behavior to the finalization-driven path. - **DoB drops are correctness-significant** (unlike TOB freshness suppression). - The DoB MPSC is sized by `--dob-channel-bound`; it must absorb Hyperliquid - block-boundary `OrderAdd` bursts (production runs `65536`, not the shallow - `4096` default). + The DoB MPSC is sized by `--dob-channel-bound` (default `65536`, chosen for + Hyperliquid block-boundary `OrderAdd` burst absorption). Do not lower it + without evidence the workload's bursts fit; a shallow bound silently drops + DoB mutation events. - **The stuck-stream backstop / supersede model contains an irreducible best-effort tradeoff** over fire-and-forget UDP multicast during a stalled feed. The convergent design is: lag-gated Provisional/Authoritative, only diff --git a/binaries/src/bin/dz_hl_publisher.rs b/binaries/src/bin/dz_hl_publisher.rs index bd22e842..d8fd3639 100644 --- a/binaries/src/bin/dz_hl_publisher.rs +++ b/binaries/src/bin/dz_hl_publisher.rs @@ -126,8 +126,11 @@ struct Args { #[arg(long, default_value_t = 1232)] dob_mtu: u16, - /// Bound on the MPSC channel between L4 apply and the DoB emitter. - #[arg(long, default_value_t = 4096)] + /// Bound on the MPSC channel between L4 apply and the DoB emitter (also + /// the DoB snapshot-request channel). Sized for Hyperliquid block-boundary + /// `OrderAdd` burst absorption: a shallow bound drops correctness- + /// significant DoB mutation events during normal bursts. + #[arg(long, default_value_t = 65536)] dob_channel_bound: usize, /// Target round-robin duration for the DoB snapshot stream (seconds). 
From e352d03709a67bd05f712c8595333df6ae0c6239 Mon Sep 17 00:00:00 2001 From: Steve Shaw Date: Tue, 19 May 2026 13:00:20 -0400 Subject: [PATCH 65/65] order book: gate block-mode l4 fanout on enable_websocket block-mode receive_block_batch broadcast InternalMessage::L4BookUpdates whenever l4_message_tx() was set, which run_websocket_server does unconditionally. in multicast-only mode (enable_websocket=false) that is pure clone+spawn+broadcast waste with no consumer (the multicast publisher ignores L4BookUpdates) and contradicted the architecture claim that ws-disabled skips l4 fanout entirely. apply the same guard the streaming path's publish_l4_update already uses. adds discriminating block-mode tests for both ws configs; reconciles ARCHITECTURE.md. --- ARCHITECTURE.md | 9 +-- .../order_book/block_mode_multicast_e2e.rs | 61 +++++++++++++++++++ server/src/listeners/order_book/mod.rs | 11 +++- 3 files changed, 76 insertions(+), 5 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index ea9b4d6f..97ef6199 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -83,10 +83,11 @@ also the top-level coordinator for multicast-only deployments. `InternalMessage::Fills` for TOB and WebSocket L2/trade consumers. - `l4_message_tx`: carries `InternalMessage::L4BookUpdates` for WebSocket L4 consumers. TOB does not subscribe to this high-volume channel. When the - WebSocket listener is disabled, the per-diff L4 fan-out is skipped entirely - (`publish_l4_update` returns early): in multicast-only mode there are no L4 - consumers, so the clone + task-spawn + broadcast per applied diff is pure - overhead and is elided. + WebSocket listener is disabled, the L4 fan-out is skipped entirely on both + ingest paths — streaming (`publish_l4_update` returns early) and block mode + (`receive_block_batch` is `enable_websocket`-gated): in multicast-only mode + there are no L4 consumers, so the clone + task-spawn + broadcast per applied + block/diff is pure overhead and is elided. 
If TOB or DoB is enabled, a shared instrument registry is also created. TOB uses it to map coins to instrument definitions. DoB uses it to resolve internal coins diff --git a/server/src/listeners/order_book/block_mode_multicast_e2e.rs b/server/src/listeners/order_book/block_mode_multicast_e2e.rs index bd0faf1f..d8a98308 100644 --- a/server/src/listeners/order_book/block_mode_multicast_e2e.rs +++ b/server/src/listeners/order_book/block_mode_multicast_e2e.rs @@ -328,6 +328,67 @@ async fn streaming_l4_flood_does_not_starve_tob_marketdata() { assert_has_msg_type(&tob_quote_packets, tob_const::MSG_TYPE_TRADE, "TOB trade under L4 flood"); } +// Block-mode L4 fanout must obey the same `enable_websocket` gate as the +// streaming path's `publish_l4_update`: in multicast-only mode (WS disabled, +// the default) `receive_block_batch` must NOT clone/spawn/broadcast +// InternalMessage::L4BookUpdates — the multicast publisher ignores it and +// `l4_message_tx()` would otherwise fall back to the market channel and +// broadcast pure waste. Discriminating: fails if the `enable_websocket` guard +// in `receive_block_batch` is absent. +async fn block_mode_l4_fanout_emits(enable_websocket: bool) -> bool { + let root = fixture_root(); + let (snapshot_height, snapshot) = + load_snapshots_from_json::(&root.join("out.json")) + .await + .expect("fixture snapshot loads"); + + let (market_tx, _) = broadcast_channel::>(8); + let (l4_tx, mut l4_rx) = broadcast_channel::>(8); + let mut listener = + OrderBookListener::new_with_ingest_mode(Some(market_tx), true, IngestMode::Block); + listener.set_l4_message_tx(l4_tx); + listener.set_enable_websocket(enable_websocket); + listener.init_from_snapshot(snapshot, snapshot_height); + + // A status/diff pair at the SAME height makes `pop_cache` return the pair + // and the apply path run, reaching the L4 fanout block. 
The phantom New + // (no matching opening status) is warn-skipped, not an error — exactly the + // production race shape — and the L4 send block still runs after it. + let next_height = snapshot_height + 1; + let phantom_new_diff = NodeDataOrderDiff::new_for_test( + Address::new([0; 20]), + u64::MAX, + "1.0".to_string(), + "BTC".to_string(), + OrderDiff::New { sz: "1.0".to_string() }, + ); + let status_batch = + Batch::new_for_test(next_height, 1_700_000_000_000, Vec::::new()); + let diff_batch = Batch::new_for_test(next_height, 1_700_000_000_000, vec![phantom_new_diff]); + listener.receive_batch(EventBatch::Orders(status_batch)).expect("status batch"); + listener.receive_batch(EventBatch::BookDiffs(diff_batch)).expect("phantom New is skipped, not error"); + + // The L4 send is `tokio::spawn`ed; give it a chance to run before checking. + sleep(Duration::from_millis(100)).await; + matches!(l4_rx.try_recv(), Ok(msg) if matches!(msg.as_ref(), InternalMessage::L4BookUpdates { .. })) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn block_mode_multicast_only_skips_l4_fanout() { + assert!( + !block_mode_l4_fanout_emits(false).await, + "multicast-only block mode (enable_websocket=false) must NOT broadcast L4BookUpdates" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn block_mode_ws_enabled_still_emits_l4_fanout() { + assert!( + block_mode_l4_fanout_emits(true).await, + "WS-enabled block mode must still broadcast L4BookUpdates (rollback contract)" + ); +} + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn streaming_live_shape_split_fill_rows_match_block_tob_trades() { let root = fixture_root(); diff --git a/server/src/listeners/order_book/mod.rs b/server/src/listeners/order_book/mod.rs index 3dec65ff..d4ea7e17 100644 --- a/server/src/listeners/order_book/mod.rs +++ b/server/src/listeners/order_book/mod.rs @@ -929,7 +929,16 @@ impl OrderBookListener { if let Some(cache) = &mut 
self.fetched_snapshot_cache { cache.push_back((order_statuses.clone(), order_diffs.clone())); } - if let Some(tx) = self.l4_message_tx() { + // L4 book updates are consumed only by WebSocket subscribers; + // multicast ignores InternalMessage::L4BookUpdates. In + // multicast-only mode (enable_websocket=false, the default) + // skip the clone + task spawn + broadcast entirely — pure + // CPU/alloc waste with no consumer, and `l4_message_tx()` would + // otherwise fall back to the market channel and broadcast + // anyway. Mirrors the streaming path's `publish_l4_update`. + if self.enable_websocket + && let Some(tx) = self.l4_message_tx() + { let tx = tx.clone(); tokio::spawn(async move { let updates = Arc::new(InternalMessage::L4BookUpdates {