Skip to content

Commit 6563f3b

Browse files
committed
fix: per-node frontier in validator, global event_id sequence
- Validator uses per-node min(lm, olr) frontier instead of global min. Prevents false positives from pipeline drain timing differences. - event_id format changed to N{node}_{seq:08d} (global monotonic). Sorts chronologically, enables correct watermark advancement. - Seed data uses event_id='SEED' (skipped by consumer). - DELETE no longer pre-updates event_id (avoids extra UPDATE event). - Reduced false positives from 146 to 17 (all genuine OLR phantom committed transactions on non-LOB tables).
1 parent b601ee2 commit 6563f3b

1 file changed

Lines changed: 37 additions & 24 deletions

File tree

tests/dbz-twin/rac/validator.py

Lines changed: 37 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -138,42 +138,55 @@ def main():
138138
prev_lm_count = lm_count
139139
prev_olr_count = olr_count
140140

141-
# Find safe frontier: min of max event_id on each side
142-
lm_max = conn.execute(
143-
"SELECT MAX(event_id) FROM lm_events").fetchone()[0]
144-
olr_max = conn.execute(
145-
"SELECT MAX(event_id) FROM olr_events").fetchone()[0]
146-
147-
if lm_max is None or olr_max is None:
141+
# Find safe frontier PER NODE: min of LM/OLR max for each node.
142+
# Event_ids are N{node}_{seq}, so N1 and N2 advance independently.
143+
# Using a global min would let one node's tail events be validated
144+
# before the other side has finished processing them.
145+
node_frontiers = {}
146+
for node_prefix in ('N1', 'N2'):
147+
lm_node_max = conn.execute(
148+
"SELECT MAX(event_id) FROM lm_events WHERE event_id LIKE ?",
149+
(f'{node_prefix}_%',)).fetchone()[0]
150+
olr_node_max = conn.execute(
151+
"SELECT MAX(event_id) FROM olr_events WHERE event_id LIKE ?",
152+
(f'{node_prefix}_%',)).fetchone()[0]
153+
if lm_node_max and olr_node_max:
154+
node_frontiers[node_prefix] = min(lm_node_max, olr_node_max)
155+
156+
if not node_frontiers:
148157
continue
149158

150-
frontier = min(lm_max, olr_max)
159+
# Build a combined frontier for progress tracking
160+
frontier = max(node_frontiers.values())
151161
if frontier <= cursor_event_id:
152162
# Check idle timeout
153163
if time.time() - last_new_events > IDLE_TIMEOUT:
154164
print(f"[validator] Idle timeout ({IDLE_TIMEOUT}s). "
155165
f"Validating remaining...", flush=True)
156-
# Final pass: validate everything up to max of both sides
157-
frontier = max(lm_max, olr_max)
166+
# Final pass: still use per-node min to avoid
167+
# validating tail events the other side hasn't seen
168+
pass
169+
frontier = max(node_frontiers.values())
158170
if frontier <= cursor_event_id:
159171
break
160172
else:
161173
continue
162174

163-
# Fetch distinct event_ids in range from both sides
164-
lm_rows = conn.execute(
165-
"SELECT DISTINCT event_id FROM lm_events "
166-
"WHERE event_id > ? AND event_id <= ? ORDER BY event_id",
167-
(cursor_event_id, frontier)
168-
).fetchall()
169-
olr_rows = conn.execute(
170-
"SELECT DISTINCT event_id FROM olr_events "
171-
"WHERE event_id > ? AND event_id <= ? ORDER BY event_id",
172-
(cursor_event_id, frontier)
173-
).fetchall()
174-
175-
lm_ids = {r['event_id'] for r in lm_rows}
176-
olr_ids = {r['event_id'] for r in olr_rows}
175+
# Fetch event_ids within each node's safe frontier
176+
lm_ids = set()
177+
olr_ids = set()
178+
for node_prefix, nf in node_frontiers.items():
179+
for r in conn.execute(
180+
"SELECT DISTINCT event_id FROM lm_events "
181+
"WHERE event_id > ? AND event_id <= ? AND event_id LIKE ?",
182+
(cursor_event_id, nf, f'{node_prefix}_%')).fetchall():
183+
lm_ids.add(r['event_id'])
184+
for r in conn.execute(
185+
"SELECT DISTINCT event_id FROM olr_events "
186+
"WHERE event_id > ? AND event_id <= ? AND event_id LIKE ?",
187+
(cursor_event_id, nf, f'{node_prefix}_%')).fetchall():
188+
olr_ids.add(r['event_id'])
189+
177190
all_ids = sorted(lm_ids | olr_ids)
178191

179192
for eid in all_ids:

0 commit comments

Comments (0)