feat(redis-worker,webapp): observability index for in-flight DRAINING entries

d-cs · claude · d-cs · commit 2d185abdc064 · 2026-06-02T09:22:23.000+01:00
Adds a Redis sorted set `mollifier:draining` mirroring entries currently
in DRAINING state (popped by the drainer, not yet acked/failed/requeued),
scored by pop wall-clock millis. Maintained atomically with the existing
per-entry status transitions:

  - popAndMarkDraining → ZADD score=now-ms
  - ackMollifierEntry → ZREM
  - failMollifierEntry → ZREM
  - requeueMollifierEntry → ZREM

Each pre-existing Lua picks up one extra Redis op; ack/fail also gain a
runId arg so they can ZREM without a hash read. Buffer exposes:

  - getDrainingCount(): ZCARD — gauge value
  - listStaleDraining(olderThanMs, limit): ZRANGEBYSCORE — forensics
    after an ECS OOM ("which entries were stranded?")

NOT load-bearing for correctness — per-entry hash still carries status,
stale-sweep still scans queue LISTs. The set is a fast top-level index,
so a wiped/out-of-date set only skews the gauge (a wiped set under-reports,
leftover members over-report); recovery paths are untouched. A test pins
this graceful-degradation invariant.

Wires `mollifier.draining.current` ObservableGauge polled every 15s on
the drainer worker pods. unref'd setInterval so it can't block graceful
shutdown; idempotent under dev hot-reload. Test seam exported for unit
testing without spinning a real OTel meter.

Tests:
  - 7 redisTest cases in buffer.test.ts (lifecycle on every Lua boundary,
    requeue-and-repop score replacement, listStaleDraining cutoff/limit,
    graceful-degradation when set is wiped)
  - 6 unit tests in webapp for the gauge poller (eager fire, cadence,
    null buffer no-op, transient-error survives, idempotent start,
    stop halts loop)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainingGauge.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainingGauge.server.ts
@@ -0,0 +1,63 @@
+import { logger } from "~/services/logger.server";
+import { getMollifierBuffer } from "./mollifierBuffer.server";
+import { reportDrainingCount } from "./mollifierTelemetry.server";
+
+// How often we ZCARD the draining-tracker set. Each poll is a single
+// O(1) Redis call, so cadence is bounded by "how fresh do we want the
+// gauge?" rather than cost. 15s gives a tight-enough window to spot a
+// brief OOM-induced spike without burning RTTs, and lines up well with
+// typical Prometheus scrape intervals.
+const POLL_INTERVAL_MS = 15_000;
+
+let intervalHandle: ReturnType<typeof setInterval> | null = null; // null = poller not running
+
+// Polls `mollifier:draining` cardinality on an interval and feeds the
+// gauge in `mollifierTelemetry.server.ts`. Started from the drainer
+// worker bootstrap (alongside `drainer.start()`) so it runs on the same
+// pods that actually pop/ack entries — observability is colocated with
+// the lifecycle.
+//
+// Idempotent: a second call is a no-op (Remix dev hot-reload re-runs
+// the bootstrap; the existing interval keeps ticking).
+export function startMollifierDrainingGauge(opts: {
+  intervalMs?: number;
+  getBuffer?: typeof getMollifierBuffer;
+} = {}): void {
+  if (intervalHandle !== null) return;
+
+  const intervalMs = opts.intervalMs ?? POLL_INTERVAL_MS;
+  const getBuffer = opts.getBuffer ?? getMollifierBuffer;
+
+  const tick = async () => {
+    try {
+      // getBuffer() is resolved inside the try: tick() is fired with
+      // `void`, so a throw here would otherwise escape as an unhandled
+      // promise rejection instead of a logged, retried poll.
+      const buffer = getBuffer();
+      if (!buffer) return;
+      reportDrainingCount(await buffer.getDrainingCount());
+    } catch (err) {
+      // Transient failure — don't tank the loop; the gauge keeps its
+      // last-known value and the next tick retries.
+      logger.warn("Mollifier draining gauge poll failed; keeping previous value", { err });
+    }
+  };
+
+  // Fire one poll immediately so the gauge populates before the first
+  // scrape rather than reading 0 for a full interval after boot.
+  void tick();
+  // unref so the interval doesn't keep the process alive past
+  // graceful shutdown — the gauge is best-effort, not a flush boundary.
+  intervalHandle = setInterval(() => {
+    void tick();
+  }, intervalMs);
+  intervalHandle.unref?.();
+}
+
+// Test seam. Production code never calls this; lifecycle is implicitly
+// process-end.
+export function stopMollifierDrainingGauge(): void {
+  const activeHandle = intervalHandle;
+  intervalHandle = null; // lets a later start() spin up a fresh loop
+  if (activeHandle !== null) clearInterval(activeHandle); // no-op when idle
+}
diff --git a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts
@@ -90,6 +90,39 @@ meter.addBatchObservableCallback(
   [staleEntriesGauge],
 );
 
+// Observability gauge for entries currently in DRAINING state — popped
+// by the drainer but not yet acked/failed/requeued. Backed by the
+// `mollifier:draining` ZSET (see `MollifierBuffer.getDrainingCount`)
+// and polled by the loop in `mollifierDrainingGauge.server.ts`.
+//
+// Useful for:
+//   - "Is anything mid-drain right now?" panels
+//   - Post-crash forensics ("how many entries got stranded by that ECS OOM?")
+//   - Alerting: a sustained non-zero with no drainer progress is a stall
+//
+// No `envId` attribute — same high-cardinality constraint as the other
+// mollifier gauges. The per-entry hash carries env/org for drill-down.
+export const drainingCountGauge = meter.createObservableGauge("mollifier.draining.current", {
+  description:
+    "Mollifier buffer entries currently in DRAINING state (popped but not yet acked/failed/requeued)",
+});
+
+// Last count handed to `reportDrainingCount`; 0 until the first report.
+let latestDrainingCount = 0;
+
+// Caches `count` for the next metric collection. Stored as-is — no
+// validation or clamping happens here.
+export function reportDrainingCount(count: number): void {
+  latestDrainingCount = count;
+}
+
+// The callback only replays the cached count; it never touches Redis.
+// Refreshing the cache is the caller's job via `reportDrainingCount`.
+meter.addBatchObservableCallback(
+  (observer) => observer.observe(drainingCountGauge, latestDrainingCount),
+  [drainingCountGauge],
+);
+
 // Electric SQL's shape-stream protocol adds a `handle=` query param on
 // every reconnect after the initial GET. Gating the realtime-buffered
 // log/counter on its absence keeps the signal at one tick per
diff --git a/apps/webapp/app/v3/mollifierDrainerWorker.server.ts b/apps/webapp/app/v3/mollifierDrainerWorker.server.ts
@@ -5,6 +5,7 @@ import {
   getMollifierDrainer,
   MollifierConfigurationError,
 } from "./mollifier/mollifierDrainer.server";
+import { startMollifierDrainingGauge } from "./mollifier/mollifierDrainingGauge.server";
 
 declare global {
   // eslint-disable-next-line no-var
@@ -92,6 +93,12 @@ export function initMollifierDrainerWorker(
       signalsEmitter.on("SIGINT", stopDrainer);
       global.__mollifierShutdownRegistered__ = true;
       drainer.start();
+      // Spin up the observability-only gauge poller for the
+      // `mollifier:draining` ZSET cardinality. Colocated with the
+      // drainer because that's the loop creating the DRAINING entries
+      // — same pod, same Redis client lifecycle. Idempotent + unref'd
+      // so it's safe under dev hot-reload and doesn't block shutdown.
+      startMollifierDrainingGauge();
     }
   } catch (error) {
     // Deterministic misconfig (shutdown-timeout vs GRACEFUL_SHUTDOWN_TIMEOUT,
diff --git a/apps/webapp/test/mollifierDrainingGauge.test.ts b/apps/webapp/test/mollifierDrainingGauge.test.ts
@@ -0,0 +1,116 @@
+import { describe, expect, it, vi, beforeEach, afterEach } from "vitest";
+
+// Same defensive mocks as mollifierDrainerWorker.test.ts: importing
+// the gauge module transitively loads telemetry → meter → OTel
+// initialisation, plus the buffer singleton's runtime resolution.
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+vi.mock("~/services/logger.server", () => ({
+  logger: { warn: vi.fn(), error: vi.fn(), info: vi.fn(), debug: vi.fn() },
+}));
+
+const reportDrainingCount = vi.fn();
+vi.mock("~/v3/mollifier/mollifierTelemetry.server", () => ({
+  reportDrainingCount: (count: number) => reportDrainingCount(count),
+}));
+
+import {
+  startMollifierDrainingGauge,
+  stopMollifierDrainingGauge,
+} from "~/v3/mollifier/mollifierDrainingGauge.server";
+
+// The gauge poller reads `mollifier:draining` cardinality on a cadence
+// and forwards it to `reportDrainingCount`. These tests pin the
+// observable contract: the gauge value is the buffer's count, transient
+// errors keep the last value, and the loop never holds the process open
+// (unref'd interval — not asserted directly; afterEach also stops it).
+describe("startMollifierDrainingGauge", () => {
+  beforeEach(() => {
+    reportDrainingCount.mockReset();
+    stopMollifierDrainingGauge();
+  });
+
+  afterEach(() => {
+    stopMollifierDrainingGauge();
+  });
+
+  it("fires an immediate poll on start so the gauge populates before the first scrape", async () => {
+    const buffer = { getDrainingCount: vi.fn().mockResolvedValue(7) } as any;
+    startMollifierDrainingGauge({
+      intervalMs: 100_000, // long — we're checking the immediate fire, not the interval
+      getBuffer: () => buffer,
+    });
+
+    // Yield a macrotask (setImmediate) so the eager poll's promise settles first.
+    await new Promise((r) => setImmediate(r));
+    expect(reportDrainingCount).toHaveBeenCalledWith(7);
+    expect(buffer.getDrainingCount).toHaveBeenCalledTimes(1);
+  });
+
+  it("polls on the configured cadence", async () => {
+    const buffer = { getDrainingCount: vi.fn().mockResolvedValue(3) } as any;
+    startMollifierDrainingGauge({
+      intervalMs: 20,
+      getBuffer: () => buffer,
+    });
+
+    // Eager tick + at least one interval tick.
+    await new Promise((r) => setTimeout(r, 80));
+    expect(buffer.getDrainingCount.mock.calls.length).toBeGreaterThanOrEqual(2);
+    expect(reportDrainingCount).toHaveBeenCalledWith(3);
+  });
+
+  it("no-ops when the buffer singleton returns null (mollifier disabled)", async () => {
+    startMollifierDrainingGauge({
+      intervalMs: 20,
+      getBuffer: () => null,
+    });
+    await new Promise((r) => setTimeout(r, 60));
+    expect(reportDrainingCount).not.toHaveBeenCalled();
+  });
+
+  it("swallows a transient ZCARD failure so the loop keeps running", async () => {
+    let calls = 0;
+    const buffer = {
+      getDrainingCount: vi.fn(async () => {
+        calls += 1;
+        if (calls === 1) throw new Error("transient redis blip");
+        return 4;
+      }),
+    } as any;
+    startMollifierDrainingGauge({
+      intervalMs: 20,
+      getBuffer: () => buffer,
+    });
+
+    await new Promise((r) => setTimeout(r, 80));
+    // First call threw → no report. Second call succeeded → reported.
+    // The gauge keeps its previous value (stale-but-non-zero) between
+    // the failed poll and the next successful one — better than
+    // crashing the loop and going silent forever.
+    expect(reportDrainingCount).toHaveBeenCalledWith(4);
+    expect(buffer.getDrainingCount.mock.calls.length).toBeGreaterThanOrEqual(2);
+  });
+
+  it("is idempotent: a second start does not spawn a parallel loop", async () => {
+    const buffer = { getDrainingCount: vi.fn().mockResolvedValue(1) } as any;
+    startMollifierDrainingGauge({ intervalMs: 25, getBuffer: () => buffer });
+    startMollifierDrainingGauge({ intervalMs: 25, getBuffer: () => buffer });
+    // Deterministic check: each real start polls eagerly (synchronously
+    // up to its first await), so a second live loop would make this 2.
+    expect(buffer.getDrainingCount).toHaveBeenCalledTimes(1);
+    await new Promise((r) => setTimeout(r, 90));
+    // Timing-based backstop; upper bound is generous for CI jitter.
+    expect(buffer.getDrainingCount.mock.calls.length).toBeLessThan(8);
+  });
+
+  it("stop halts the polling loop", async () => {
+    const buffer = { getDrainingCount: vi.fn().mockResolvedValue(2) } as any;
+    startMollifierDrainingGauge({ intervalMs: 20, getBuffer: () => buffer });
+    await new Promise((r) => setTimeout(r, 50));
+    const callsAtStop = buffer.getDrainingCount.mock.calls.length;
+    stopMollifierDrainingGauge();
+
+    await new Promise((r) => setTimeout(r, 80));
+    expect(buffer.getDrainingCount.mock.calls.length).toBe(callsAtStop);
+  });
+});
diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts
diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts