feat(webapp): drainer emits admin-only LOG event with buffered window

d-cs · claude · d-cs · commit 607661027ee9 · 2026-05-29T18:09:03.000+01:00
After engine.trigger lands the PG row, the drainer calls
recordRunDebugLog with the original bufferedAt as startTime and the
dwell duration as the event duration. The helper flips the event kind to
TaskEventKind.LOG, which the trace view + logs download already gate
behind admin (eventRepository.server.ts:108,
resources.runs.$runParam.logs.download.ts:118).

Admins now see "Mollifier buffered for Xms" rendered at the historical
instant inside the run's existing trace, sitting between trigger and
dequeue. Customers see no change — the LOG-kind filter strips the event
from their view. No schema change on TaskRun; the audit trail lives in
the same ClickHouse store the rest of the trace events use.

Best-effort: recordRunDebugLog has its own try/catch and returns a
result. The drainer logs failures other than RUN_NOT_FOUND, but a failed
audit-trail write never fails materialisation.

The event is skipped on the cancel-bifurcation path (the run never ran)
and on the terminal SYSTEM_FAILURE path (the customer-visible outcome is
the failure row, not a "buffered for Xms" note).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts
@@ -7,6 +7,7 @@ import type {
   MollifierDrainerTerminalFailureHandler,
 } from "@trigger.dev/redis-worker";
 import { logger } from "~/services/logger.server";
+import { recordRunDebugLog } from "~/v3/eventRepository/index.server";
 import { PerformTaskRunAlertsService } from "~/v3/services/alerts/performTaskRunAlerts.server";
 import { startSpan } from "~/v3/tracing.server";
 import type { MollifierSnapshot } from "./mollifierSnapshot.server";
@@ -129,8 +130,10 @@ export function createDrainerHandler(deps: {
         span.setAttribute("mollifier.run_friendly_id", input.runId);
         span.setAttribute("taskRunId", input.runId);
 
+        let triggerSucceeded = false;
         try {
           await deps.engine.trigger(input.payload as any, deps.prisma);
+          triggerSucceeded = true;
         } catch (err) {
           // The retryable-PG class re-throws so the drainer's outer
           // worker loop can `buffer.requeue` (handled in
@@ -179,6 +182,47 @@ export function createDrainerHandler(deps: {
             throw err;
           }
         }
+
+        // Admin-only audit trail emitted once engine.trigger has
+        // landed a PG row. `recordRunDebugLog` flips this to
+        // `TaskEventKind.LOG`, which the trace view + logs download
+        // already gate behind admin
+        // (`eventRepository.server.ts:108`,
+        // `resources.runs.$runParam.logs.download.ts:118`). Encoding
+        // the buffered window as `startTime` + `duration` makes the
+        // event render at the historical instant inside the run's
+        // existing trace — admins see "Mollifier buffered for Xms"
+        // sitting between trigger and dequeue. Best-effort: the
+        // helper has its own try/catch and returns a result, so it
+        // never throws into the materialisation path. Failures are
+        // logged but not surfaced because the customer-visible run
+        // has already landed.
+        if (triggerSucceeded) {
+          const debugResult = await recordRunDebugLog(
+            RunId.fromFriendlyId(input.runId),
+            `Mollifier buffered for ${dwellMs}ms before materialising`,
+            {
+              attributes: {
+                runId: input.runId,
+                metadata: {
+                  "mollifier.bufferedAt": input.createdAt.toISOString(),
+                  "mollifier.materialisedAt": new Date().toISOString(),
+                  "mollifier.dwellMs": dwellMs,
+                  "mollifier.attempts": input.attempts,
+                },
+              },
+              startTime: input.createdAt,
+              duration: dwellMs * 1_000_000,
+              parentId: snapshotSpanId,
+            }
+          );
+          if (!debugResult.success && debugResult.code !== "RUN_NOT_FOUND") {
+            logger.warn("mollifier drainer: failed to record admin debug log", {
+              runId: input.runId,
+              code: debugResult.code,
+            });
+          }
+        }
       });
     });
   };
diff --git a/apps/webapp/test/mollifierDrainerHandler.test.ts b/apps/webapp/test/mollifierDrainerHandler.test.ts
@@ -19,6 +19,22 @@ vi.mock("~/v3/services/alerts/performTaskRunAlerts.server", () => ({
   },
 }));
 
+// The drainer calls `recordRunDebugLog` after a successful engine.trigger
+// to emit an admin-only LOG-kind event encoding the buffered window.
+// The real implementation imports the configured event repository (prisma
+// + clickhouse + env), which has heavy side-effects on first import.
+// Stub it to a vi.fn so the unit tests can assert call shape without
+// dragging the whole eventRepository graph into webapp test setup.
+// `vi.hoisted` is required because `vi.mock` factories are hoisted above
+// regular `const`s — referencing a top-level variable from inside the
+// factory otherwise fires `Cannot access 'X' before initialization`.
+const { recordRunDebugLogMock } = vi.hoisted(() => ({
+  recordRunDebugLogMock: vi.fn(async () => ({ success: true as const })),
+}));
+vi.mock("~/v3/eventRepository/index.server", () => ({
+  recordRunDebugLog: recordRunDebugLogMock,
+}));
+
 import {
   createDrainerHandler,
   isRetryablePgError,
@@ -371,4 +387,105 @@ describe("createDrainerHandler", () => {
     ).rejects.toThrow("engine rejected the snapshot");
     expect(createFailedTaskRun).not.toHaveBeenCalled();
   });
+
+  it("emits an admin-only LOG-kind event with the buffered window after engine.trigger succeeds", async () => {
+    // The drainer's audit trail rides the existing TaskEventKind.LOG
+    // filter pattern (`eventRepository.server.ts:108` + `logs.download.ts:118`)
+    // — admins see the buffered window in the trace; non-admins don't.
+    recordRunDebugLogMock.mockClear();
+    const trigger = vi.fn(async () => ({ friendlyId: "run_z" }));
+    const handler = createDrainerHandler({
+      engine: { trigger } as any,
+      prisma: {} as any,
+    });
+
+    const bufferedAt = new Date(Date.now() - 4_000);
+    await handler({
+      runId: "run_z",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: { taskIdentifier: "t", spanId: "snapspan", traceId: "snaptrace" },
+      attempts: 2,
+      createdAt: bufferedAt,
+    } as any);
+
+    expect(recordRunDebugLogMock).toHaveBeenCalledOnce();
+    const [callRunId, message, options] = recordRunDebugLogMock.mock.calls[0] as [
+      string,
+      string,
+      any,
+    ];
+    // Internal cuid derived from the friendlyId, mirroring what
+    // `findRunForEventCreation` queries on.
+    expect(callRunId).toBe("z");
+    expect(message).toMatch(/Mollifier buffered for \d+ms/);
+    // Encodes the historical buffered window so the trace view places
+    // the LOG event between trigger and dequeue (not at "now").
+    expect(options.startTime).toBe(bufferedAt);
+    expect(options.duration).toBeGreaterThan(0);
+    expect(options.parentId).toBe("snapspan");
+    expect(options.attributes.metadata["mollifier.bufferedAt"]).toBe(bufferedAt.toISOString());
+    expect(options.attributes.metadata["mollifier.attempts"]).toBe(2);
+  });
+
+  it("does NOT emit the admin LOG event when engine.trigger fails non-retryably", async () => {
+    // The audit trail is for runs that actually materialised. On a
+    // terminal SYSTEM_FAILURE path the customer-visible outcome is the
+    // failure row; emitting a "buffered for Xms" event next to it would
+    // imply the buffered window completed normally.
+    recordRunDebugLogMock.mockClear();
+    const trigger = vi.fn(async () => {
+      throw new Error("engine rejected the snapshot");
+    });
+    const createFailedTaskRun = vi.fn(async () => ({ id: "internal" }));
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: "run_z",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: { taskIdentifier: "t", environment: envFixture },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(recordRunDebugLogMock).not.toHaveBeenCalled();
+  });
+
+  it("does NOT emit the admin LOG event on the cancel-bifurcation path", async () => {
+    // Cancel-bifurcation writes a CANCELED row directly without calling
+    // engine.trigger. There's no buffered-then-materialised window to
+    // describe — the run never ran.
+    recordRunDebugLogMock.mockClear();
+    const friendlyId = RunId.generate().friendlyId;
+    const createCancelledRun = vi.fn(async () => ({
+      id: "internal",
+      friendlyId,
+      status: "CANCELED",
+    }));
+    const handler = createDrainerHandler({
+      engine: { createCancelledRun } as any,
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: friendlyId,
+      envId: "env_a",
+      orgId: "org_1",
+      payload: {
+        friendlyId,
+        taskIdentifier: "t",
+        environment: envFixture,
+        cancelledAt: new Date().toISOString(),
+        cancelReason: "Canceled by user",
+      },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(recordRunDebugLogMock).not.toHaveBeenCalled();
+  });
 });