fix(webapp): close cross-env mollifier mutation gap + Devin review followups

d-cs · claude · d-cs · commit 77fc3b7823b6 · 2026-05-29T15:51:35.000+01:00
Four Devin-flagged issues on PR #3756:

* Cross-env auth gate on the buffer mutation path (#2). mutateWithFallback and applyMetadataMutationToBufferedRun now verify that entry.envId/orgId match the caller's environmentId/organizationId before any buffer write, so a token authed in env A can't mutate a buffered run in env B by guessing the friendlyId. Mismatches return not_found (no existence leak), mirroring the env scoping the PG path already enforces via Prisma filters.
* Unhandled error in routeOperationsToRun (#0). The parent/root op fan-out is documented as best-effort but a Redis throw used to 500 the request even though the primary mutation already landed. The buffer fallback now runs through tryCatch and warns instead of throwing.
* Silent no-op when parent metadata routes to an internal id (#1). The PG service accepts internal ids, but the buffer is keyed by friendlyId; passing an internal cuid to the fallback was a silent miss. Made it an intentional skip (with a comment explaining why a buffered child's parent is always materialised already).
* BufferedReplayInputSchema strips seedMetadata (#3). Replays from a buffered source were silently losing initial metadata vs PG-sourced replays. Added seedMetadata + seedMetadataType to the schema.

Tests added: cross-env + cross-org gate cases on both helpers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts
@@ -86,20 +86,41 @@ async function routeOperationsToRun(
   if (!error) return;
 
   // PG service threw — commonly "Cannot update metadata for a completed
-  // run", but it could also be a transient PG failure. The parent/root
-  // ops are auxiliary, so we stay best-effort and don't surface this to
-  // the caller — but we must not swallow the failure silently, otherwise
-  // a genuine PG outage on these ops is invisible. Warn, then try the
-  // buffer in case the target is itself buffered.
-  logger.warn("metadata route: parent/root PG op failed, falling back to buffer", {
+  // run", but it could also be a transient PG failure. Parent/root ops
+  // are auxiliary (the caller's primary mutation already landed); stay
+  // best-effort and don't surface this to the caller — but warn so a
+  // genuine PG outage on these ops isn't invisible.
+  logger.warn("metadata route: parent/root PG op failed", {
     targetRunId,
     error: error instanceof Error ? error.message : String(error),
   });
 
-  await applyMetadataMutationToBufferedRun({
-    runId: targetRunId,
-    body: { operations },
-  });
+  // Buffer fallback only makes sense for friendlyId-keyed entries. The
+  // PG-side parent/root IDs are internal cuids; the buffer keys entries
+  // by friendlyId, so passing the internal id would silently no-op.
+  // Skip explicitly — a buffered child's parent is always materialised
+  // in PG already (a buffered run hasn't executed, so it can't have
+  // triggered the child), so the buffered-parent branch isn't actually
+  // reachable. The skip makes that no-op intentional rather than incidental.
+  if (!targetRunId.startsWith("run_")) return;
+
+  // Best-effort buffer fallback. Wrap so a transient Redis throw on
+  // this auxiliary op can't 500 the request after the primary mutation
+  // already succeeded.
+  const [bufferError] = await tryCatch(
+    applyMetadataMutationToBufferedRun({
+      runId: targetRunId,
+      environmentId: env.id,
+      organizationId: env.organizationId,
+      body: { operations },
+    })
+  );
+  if (bufferError) {
+    logger.warn("metadata route: buffer fallback for parent/root op failed", {
+      targetRunId,
+      error: bufferError instanceof Error ? bufferError.message : String(bufferError),
+    });
+  }
 }
 
 const { action } = createActionApiRoute(
@@ -133,6 +154,8 @@ const { action } = createActionApiRoute(
     // PG miss. Target run is either buffered or genuinely absent.
     const bufferOutcome = await applyMetadataMutationToBufferedRun({
       runId,
+      environmentId: env.id,
+      organizationId: env.organizationId,
       body: { metadata: body.metadata, operations: body.operations },
     });
 
diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts
@@ -38,6 +38,13 @@ const BufferedReplayInputSchema = z.object({
   workerQueue: z.string().nullable().optional(),
   machinePreset: z.string().nullable().optional(),
   realtimeStreamsVersion: z.string().nullable().optional(),
+  // ReplayTaskRunService.getExistingMetadata reads these to preserve
+  // the original run's metadata on replay. Without them in the schema
+  // they'd be dropped by Zod's default strip-unknown-keys behaviour, and
+  // a buffered-source replay would silently lose metadata that a
+  // PG-source replay carries over.
+  seedMetadata: z.string().nullable().optional(),
+  seedMetadataType: z.string().nullable().optional(),
 });
 
 export async function action({ request, params }: ActionFunctionArgs) {
diff --git a/apps/webapp/app/v3/mollifier/applyMetadataMutation.server.ts b/apps/webapp/app/v3/mollifier/applyMetadataMutation.server.ts
@@ -19,6 +19,13 @@ export type ApplyMetadataMutationOutcome =
 // callers never lose an increment / append / set.
 export async function applyMetadataMutationToBufferedRun(input: {
   runId: string;
+  // Env+org scoping closes a cross-environment write gap on the buffer
+  // path: the route's PG path is already env-scoped via Prisma filters,
+  // and this helper now enforces the same isolation before any buffer
+  // write so a caller authed in env A can't mutate a buffered run that
+  // belongs to env B.
+  environmentId: string;
+  organizationId: string;
   body: Pick<FlushedRunMetadata, "metadata" | "operations">;
   buffer?: MollifierBuffer | null;
   maxRetries?: number;
@@ -37,6 +44,14 @@ export async function applyMetadataMutationToBufferedRun(input: {
   for (let attempt = 0; attempt <= maxRetries; attempt++) {
     const entry = await buffer.getEntry(input.runId);
     if (!entry) return { kind: "not_found" };
+    // Env+org check: an entry from a different env is treated as a
+    // miss (not 403) so existence in other envs doesn't leak.
+    if (
+      entry.envId !== input.environmentId ||
+      entry.orgId !== input.organizationId
+    ) {
+      return { kind: "not_found" };
+    }
     if (entry.status !== "QUEUED" || entry.materialised) {
       return { kind: "busy" };
     }
diff --git a/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts b/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts
@@ -82,6 +82,27 @@ export async function mutateWithFallback<TResponse>(
     return { kind: "not_found" };
   }
 
+  // Env-scoped authorization for the buffer path. The replica/writer
+  // lookups above are already env-scoped via findRunInPg; this closes
+  // the same gap on the buffer side so a caller authed in env A can't
+  // mutate a buffered run that belongs to env B (or a different org)
+  // by guessing its friendlyId. Non-atomic w.r.t. the mutateSnapshot
+  // call below, but the TOCTOU is benign: runIds are globally unique,
+  // so a cross-env entry can't suddenly appear after a same-env check.
+  // A genuinely-missing entry (entryForAuth === null) falls through and is
+  // handled by the existing not_found / writer-recovery path below.
+  const entryForAuth = await buffer.getEntry(input.runId);
+  if (
+    entryForAuth &&
+    (entryForAuth.envId !== input.environmentId ||
+      entryForAuth.orgId !== input.organizationId)
+  ) {
+    // Hide existence on env mismatch: return not_found, same shape as
+    // a true miss, rather than 403 which would leak that the runId
+    // exists in some other env.
+    return { kind: "not_found" };
+  }
+
   // Path 2 — buffer snapshot mutation.
   const result: MutateSnapshotResult = await buffer.mutateSnapshot(
     input.runId,
diff --git a/apps/webapp/test/mollifierApplyMetadataMutation.test.ts b/apps/webapp/test/mollifierApplyMetadataMutation.test.ts
@@ -86,6 +86,8 @@ describe("applyMetadataMutationToBufferedRun — retry behaviour", () => {
     const { buffer, state } = makeBufferStub();
     const result = await applyMetadataMutationToBufferedRun({
       runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
       body: { metadata: { counter: 1 } },
       buffer,
     });
@@ -99,6 +101,8 @@ describe("applyMetadataMutationToBufferedRun — retry behaviour", () => {
     state.pendingConflictsForNextN = 5;
     const result = await applyMetadataMutationToBufferedRun({
       runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
       body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
       buffer,
     });
@@ -124,6 +128,8 @@ describe("applyMetadataMutationToBufferedRun — retry behaviour", () => {
     stub.state.pendingConflictsForNextN = 11;
     const result = await applyMetadataMutationToBufferedRun({
       runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
       body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
       buffer: stub.buffer,
     });
@@ -137,6 +143,8 @@ describe("applyMetadataMutationToBufferedRun — retry behaviour", () => {
     stub.state.pendingConflictsForNextN = 99;
     const result = await applyMetadataMutationToBufferedRun({
       runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
       body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
       buffer: stub.buffer,
       maxRetries: 12,
@@ -152,13 +160,46 @@ describe("applyMetadataMutationToBufferedRun — retry behaviour", () => {
     stub.state.pendingConflictsForNextN = 8;
     const result = await applyMetadataMutationToBufferedRun({
       runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
       body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
       buffer: stub.buffer,
       maxRetries: 3,
     });
     expect(result.kind).toBe("version_exhausted");
   });
 
+  it("returns not_found when the buffered entry belongs to a different env (cross-env auth gate)", async () => {
+    // Same shape as a normal apply call, but the caller's environmentId
+    // doesn't match the entry's envId. The helper must refuse the
+    // mutation and return not_found (without leaking existence) and
+    // must NOT call casSetMetadata.
+    const stub = makeBufferStub();
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_OTHER",
+      organizationId: "org_1",
+      body: { metadata: { counter: 1 } },
+      buffer: stub.buffer,
+    });
+    expect(result.kind).toBe("not_found");
+    expect(stub.buffer.casSetMetadata).not.toHaveBeenCalled();
+    expect(stub.state.version).toBe(0);
+  });
+
+  it("returns not_found when the buffered entry belongs to a different org (cross-org auth gate)", async () => {
+    const stub = makeBufferStub();
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_OTHER",
+      body: { metadata: { counter: 1 } },
+      buffer: stub.buffer,
+    });
+    expect(result.kind).toBe("not_found");
+    expect(stub.buffer.casSetMetadata).not.toHaveBeenCalled();
+  });
+
   it("N-way concurrent applies all converge under default budget", async () => {
     // Simulate N parallel writers against a shared state. Each writer
     // reads, applies a delta, CAS-writes. The Lua CAS forces them to
@@ -173,6 +214,8 @@ describe("applyMetadataMutationToBufferedRun — retry behaviour", () => {
     const calls = Array.from({ length: N }, () =>
       applyMetadataMutationToBufferedRun({
         runId: "run_1",
+        environmentId: "env_a",
+        organizationId: "org_1",
         body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
         buffer: sharedStub.buffer,
       }),
diff --git a/apps/webapp/test/mollifierMutateWithFallback.test.ts b/apps/webapp/test/mollifierMutateWithFallback.test.ts
@@ -23,18 +23,37 @@ function fakePrisma(rows: Array<TaskRun | null>): PrismaStub {
   return { taskRun: { findFirst: fn } };
 }
 
+// Env-matching entry returned by the env-pre-check getEntry call that
+// mutateWithFallback now does before any buffer write (cross-env auth
+// gate). Same envId/orgId as `baseInput` so the check passes and the
+// flow under test proceeds to mutateSnapshot.
+const preCheckEntry = (): BufferEntry =>
+  ({
+    envId: "env_a",
+    orgId: "org_1",
+    status: "QUEUED",
+    materialised: false,
+  }) as unknown as BufferEntry;
+
 function bufferReturning(result: MutateSnapshotResult): MollifierBuffer {
+  const getEntry = vi.fn(async () => preCheckEntry());
   return {
     mutateSnapshot: vi.fn(async () => result),
-    getEntry: vi.fn(async () => null),
+    getEntry,
   } as unknown as MollifierBuffer;
 }
 
 // Buffer whose mutateSnapshot returns "busy" and whose getEntry walks a
-// scripted sequence of entry states (the drainer's progress). The last
-// element repeats once the sequence is exhausted.
+// scripted sequence of entry states. The pre-check getEntry call (one
+// extra read before the busy-wait loop, used for env authorization)
+// is served a prepended env-matching entry, then the busy-wait loop pops
+// the scripted sequence; the last element repeats once it is exhausted.
 function bufferBusy(entries: Array<BufferEntry | null>): MollifierBuffer {
   const getEntry = vi.fn();
+  // Pre-check consumes one entry. Use a QUEUED env-matching entry so
+  // the env-check passes and the flow reaches mutateSnapshot (which
+  // returns "busy") and enters the wait-loop.
+  getEntry.mockResolvedValueOnce(preCheckEntry());
   for (const e of entries) getEntry.mockResolvedValueOnce(e);
   getEntry.mockResolvedValue(entries.length ? entries[entries.length - 1] : null);
   return {
@@ -44,11 +63,26 @@ function bufferBusy(entries: Array<BufferEntry | null>): MollifierBuffer {
 }
 
 const entryDraining = (): BufferEntry =>
-  ({ status: "DRAINING", materialised: false }) as unknown as BufferEntry;
+  ({
+    envId: "env_a",
+    orgId: "org_1",
+    status: "DRAINING",
+    materialised: false,
+  }) as unknown as BufferEntry;
 const entryQueued = (): BufferEntry =>
-  ({ status: "QUEUED", materialised: false }) as unknown as BufferEntry;
+  ({
+    envId: "env_a",
+    orgId: "org_1",
+    status: "QUEUED",
+    materialised: false,
+  }) as unknown as BufferEntry;
 const entryMaterialised = (): BufferEntry =>
-  ({ status: "DRAINING", materialised: true }) as unknown as BufferEntry;
+  ({
+    envId: "env_a",
+    orgId: "org_1",
+    status: "DRAINING",
+    materialised: true,
+  }) as unknown as BufferEntry;
 
 const fakeRun = (overrides: Partial<TaskRun> = {}): TaskRun =>
   ({
@@ -150,8 +184,9 @@ describe("mutateWithFallback", () => {
     });
     expect(result).toEqual({ kind: "pg", response: "pg-after-wait" });
     expect(pgMutation).toHaveBeenCalledWith(row);
-    // Detection happened against Redis (3 polls), the primary exactly once.
-    expect(buffer.getEntry).toHaveBeenCalledTimes(3);
+    // One env-pre-check call + 3 busy-wait polls = 4 getEntry reads;
+    // primary read exactly once.
+    expect(buffer.getEntry).toHaveBeenCalledTimes(4);
     expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(1);
   });
 
@@ -227,7 +262,8 @@ describe("mutateWithFallback", () => {
       random: () => 0,
     });
     expect(result).toEqual({ kind: "pg", response: "pg-after-requeue" });
-    expect(buffer.getEntry).toHaveBeenCalledTimes(3);
+    // One env-pre-check + 3 busy-wait polls.
+    expect(buffer.getEntry).toHaveBeenCalledTimes(4);
     expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(1);
   });
 
@@ -278,8 +314,8 @@ describe("mutateWithFallback", () => {
       abortSignal: controller.signal,
     });
     expect(result).toEqual({ kind: "timed_out" });
-    // One buffer poll happened before the sleep+abort; primary untouched.
-    expect(buffer.getEntry).toHaveBeenCalledTimes(1);
+    // One env-pre-check + one busy-wait poll before sleep+abort; primary untouched.
+    expect(buffer.getEntry).toHaveBeenCalledTimes(2);
     expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(0);
   });
 
@@ -313,6 +349,64 @@ describe("mutateWithFallback", () => {
     ).rejects.toThrow(/limit_exceeded/);
   });
 
+  it("replica miss + buffer entry belongs to a different env → not_found (cross-env auth gate)", async () => {
+    // Same flow as the applied_to_snapshot test, except the entry's
+    // envId doesn't match input.environmentId. mutateWithFallback must
+    // refuse the write and return not_found (without leaking that the
+    // runId exists in another env), and must NOT call mutateSnapshot.
+    const crossEnvEntry: BufferEntry = {
+      envId: "env_OTHER",
+      orgId: "org_1",
+      status: "QUEUED",
+      materialised: false,
+    } as unknown as BufferEntry;
+    const mutateSnapshot = vi.fn(async () => "applied_to_snapshot" as const);
+    const buffer = {
+      mutateSnapshot,
+      getEntry: vi.fn(async () => crossEnvEntry),
+    } as unknown as MollifierBuffer;
+
+    const pgMutation = vi.fn(async () => "pg");
+    const synthesisedResponse = vi.fn(() => "snap");
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation,
+      synthesisedResponse,
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => buffer,
+    });
+    expect(result).toEqual({ kind: "not_found" });
+    expect(mutateSnapshot).not.toHaveBeenCalled();
+    expect(pgMutation).not.toHaveBeenCalled();
+    expect(synthesisedResponse).not.toHaveBeenCalled();
+  });
+
+  it("replica miss + buffer entry belongs to a different org → not_found (cross-org auth gate)", async () => {
+    const crossOrgEntry: BufferEntry = {
+      envId: "env_a",
+      orgId: "org_OTHER",
+      status: "QUEUED",
+      materialised: false,
+    } as unknown as BufferEntry;
+    const mutateSnapshot = vi.fn(async () => "applied_to_snapshot" as const);
+    const buffer = {
+      mutateSnapshot,
+      getEntry: vi.fn(async () => crossOrgEntry),
+    } as unknown as MollifierBuffer;
+
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation: async () => "pg",
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => buffer,
+    });
+    expect(result).toEqual({ kind: "not_found" });
+    expect(mutateSnapshot).not.toHaveBeenCalled();
+  });
+
   it("buffer is null (mollifier disabled) → not_found after replica miss", async () => {
     const result = await mutateWithFallback({
       ...baseInput,