refactor(webapp): unify Phase 2 retry idempotency check across all 3 branches

matt-aitken · claude · matt-aitken · commit 5a25abf2cd61 · 2026-05-27T15:18:56.000-07:00
After settling the operational contract — ABORTED throws because zero
TaskRun records exist for the customer to monitor; every other terminal
state returns sealed:true because TaskRun records exist (some may be
in failed state, but per-run signals reach the customer through run
monitoring) — three inconsistencies remained between the pre-loop
check and the two post-loop race handlers:

1. Seal-failed branch threw "unexpected state" on sealed=true + PENDING,
   which is the legitimate post-callback "all runs created" state
   (V2 batchCompletionCallback resets PROCESSING → PENDING and leaves
   sealed=true). Pre-loop and count-mismatch both accept this state.
2. Count-mismatch branch admitted sealed=true + ABORTED via the bare
   `currentBatch?.sealed` clause, returning sealed:true. Pre-loop
   throws on this state. The count-mismatch outcome would silently
   hide a batch where zero TaskRuns were created.
3. Count-mismatch branch's fall-through return (sealed:false) implies
   "retry with missing items", which is wrong for ABORTED — a fresh
   batch is needed.

Extracted the per-status policy into an exported helper:

  isIdempotentRetrySuccess(status, sealed) returns true for
  PROCESSING, COMPLETED, PARTIAL_FAILED, or (sealed && PENDING).
  ABORTED is excluded so the customer's batchTrigger() retry fires.

All three branches now call the same helper. The count-mismatch
branch additionally throws explicitly on ABORTED before falling
through to the sealed:false return.

Tests (TDD red-then-green):
- New: seal-failed race with sealed=true + PENDING returns sealed:true
  (was throwing "unexpected state"). Uses racingPrisma to set the
  exact post-callback shape during the seal updateMany.
- New: count-mismatch race with sealed=true + ABORTED throws
  ServiceValidationError (was returning sealed:true via the bare
  `currentBatch?.sealed` clause). Uses a call-counter on findFirst to
  flip the batch state between the pre-loop read and the re-query.
All 36 tests in streamBatchItems.test.ts pass; webapp typecheck clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/apps/webapp/app/runEngine/services/streamBatchItems.server.ts b/apps/webapp/app/runEngine/services/streamBatchItems.server.ts
@@ -4,12 +4,45 @@ import {
 } from "@trigger.dev/core/v3";
 import { BatchId } from "@trigger.dev/core/v3/isomorphic";
 import type { BatchItem, RunEngine } from "@internal/run-engine";
+import type { BatchTaskRunStatus } from "@trigger.dev/database";
 import { prisma, type PrismaClientOrTransaction } from "~/db.server";
 import type { AuthenticatedEnvironment } from "~/services/apiAuth.server";
 import { logger } from "~/services/logger.server";
 import { ServiceValidationError, WithRunEngine } from "../../v3/services/baseService.server";
 import { BatchPayloadProcessor } from "../concerns/batchPayloads.server";
 
+/**
+ * Phase 2 retry idempotency check (TRI-9944).
+ *
+ * Returns true when the batch is in a state that means the Phase 2 stream's
+ * job has already been done by an earlier (or concurrent) request — every
+ * item is enqueued, runs have been or are being created, and at least one
+ * TaskRun record exists for the customer to monitor. A retry should return
+ * sealed:true in these states so the SDK stops retrying.
+ *
+ *  - PROCESSING / sealed=true + PENDING: original sealed; runs are executing
+ *    (PENDING after callback "all runs created") or about to.
+ *  - COMPLETED: every run reached a terminal state (tryCompleteBatch).
+ *  - PARTIAL_FAILED: at least one TaskRun record exists; per-run failures
+ *    are visible at the run level.
+ *
+ * ABORTED is intentionally excluded — it means ZERO TaskRun records were
+ * created (every per-item attempt failed AND the pre-failed-TaskRun fallback
+ * also failed). The customer has nothing to monitor at the run level, so
+ * the trigger call must throw to give their retry/error handling a chance.
+ */
+export function isIdempotentRetrySuccess(
+  status: BatchTaskRunStatus | null | undefined,
+  sealed: boolean | null | undefined
+): boolean {
+  return (
+    status === "PROCESSING" ||
+    status === "COMPLETED" ||
+    status === "PARTIAL_FAILED" ||
+    (sealed === true && status === "PENDING")
+  );
+}
+
 export type StreamBatchItemsServiceOptions = {
   maxItemBytes: number;
 };
@@ -100,26 +133,7 @@ export class StreamBatchItemsService extends WithRunEngine {
           throw new ServiceValidationError(`Batch ${batchFriendlyId} not found`);
         }
 
-        // Phase 2 retry idempotency (TRI-9944): a successful original request
-        // sealed the batch (sealed=true, status=PROCESSING) and the V2 batch
-        // completion callback can then independently update status to:
-        //   - PENDING (all runs created — sealed stays true)
-        //   - PARTIAL_FAILED (some run creations failed — sealed stays true/false)
-        //   - COMPLETED (set by tryCompleteBatch after every run reaches a final
-        //     state — sealed is NOT set by this path)
-        // For all of these the Phase 2 stream did its job, so a retry should
-        // return sealed:true and the SDK stops retrying.
-        //
-        // ABORTED is explicitly excluded — it means every run-creation attempt
-        // failed and the batch is terminally broken; surface that as an error
-        // rather than masking it as success.
-        const isIdempotentRetrySuccess =
-          batch.status === "PROCESSING" ||
-          batch.status === "COMPLETED" ||
-          batch.status === "PARTIAL_FAILED" ||
-          (batch.sealed && batch.status === "PENDING");
-
-        if (isIdempotentRetrySuccess) {
+        if (isIdempotentRetrySuccess(batch.status, batch.sealed)) {
           logger.info("Batch already sealed/completed - treating Phase 2 retry as success", {
             batchId: batchFriendlyId,
             batchSealed: batch.sealed,
@@ -137,6 +151,8 @@ export class StreamBatchItemsService extends WithRunEngine {
 
         if (batch.status !== "PENDING") {
           // ABORTED or any other unexpected non-PENDING state — surface as an error.
+          // For ABORTED specifically, throwing is required so the customer's
+          // batchTrigger() retries (a new batch) can recreate the runs.
           throw new ServiceValidationError(
             `Batch ${batchFriendlyId} is not in PENDING status (current: ${batch.status})`
           );
@@ -253,18 +269,14 @@ export class StreamBatchItemsService extends WithRunEngine {
             select: { sealed: true, status: true },
           });
 
-          if (
-            currentBatch?.sealed ||
-            currentBatch?.status === "COMPLETED" ||
-            currentBatch?.status === "PARTIAL_FAILED"
-          ) {
+          if (isIdempotentRetrySuccess(currentBatch?.status, currentBatch?.sealed)) {
             logger.info("Batch already sealed before count check (fast completion)", {
               batchId: batchFriendlyId,
               itemsAccepted,
               itemsDeduplicated,
               enqueuedCount,
               expectedCount: batch.runCount,
-              batchStatus: currentBatch.status,
+              batchStatus: currentBatch?.status,
             });
 
             return {
@@ -276,6 +288,15 @@ export class StreamBatchItemsService extends WithRunEngine {
             };
           }
 
+          if (currentBatch?.status === "ABORTED") {
+            // Zero TaskRuns exist — the count-mismatch sealed:false semantics
+            // ("retry with missing items") would mislead the SDK. Throw so the
+            // customer's batchTrigger() retry creates a fresh batch.
+            throw new ServiceValidationError(
+              `Batch ${batchFriendlyId} is not in PENDING status (current: ABORTED)`
+            );
+          }
+
           logger.warn("Batch item count mismatch", {
             batchId: batchFriendlyId,
             expected: batch.runCount,
@@ -337,18 +358,14 @@ export class StreamBatchItemsService extends WithRunEngine {
             },
           });
 
-          if (
-            (currentBatch?.sealed && currentBatch.status === "PROCESSING") ||
-            currentBatch?.status === "COMPLETED" ||
-            currentBatch?.status === "PARTIAL_FAILED"
-          ) {
+          if (isIdempotentRetrySuccess(currentBatch?.status, currentBatch?.sealed)) {
             logger.info("Batch already sealed/completed by concurrent path", {
               batchId: batchFriendlyId,
               itemsAccepted,
               itemsDeduplicated,
               envId: environment.id,
-              batchStatus: currentBatch.status,
-              batchSealed: currentBatch.sealed,
+              batchStatus: currentBatch?.status,
+              batchSealed: currentBatch?.sealed,
             });
 
             span.setAttribute("itemsAccepted", itemsAccepted);
diff --git a/apps/webapp/test/engine/streamBatchItems.test.ts b/apps/webapp/test/engine/streamBatchItems.test.ts
@@ -1083,6 +1083,245 @@ describe("StreamBatchItemsService", () => {
       await engine.quit();
     }
   );
+
+  containerTest(
+    "should return sealed=true when seal-failed race produces sealed=true + PENDING (post-callback all-created)",
+    async ({ prisma, redisOptions }) => {
+      const engine = new RunEngine({
+        prisma,
+        worker: {
+          redis: redisOptions,
+          workers: 1,
+          tasksPerWorker: 10,
+          pollIntervalMs: 100,
+          disabled: true,
+        },
+        queue: {
+          redis: redisOptions,
+        },
+        runLock: {
+          redis: redisOptions,
+        },
+        machines: {
+          defaultMachine: "small-1x",
+          machines: {
+            "small-1x": {
+              name: "small-1x" as const,
+              cpu: 0.5,
+              memory: 0.5,
+              centsPerMs: 0.0001,
+            },
+          },
+          baseCostInCents: 0.0005,
+        },
+        batchQueue: {
+          redis: redisOptions,
+        },
+        tracer: trace.getTracer("test", "0.0.0"),
+      });
+
+      const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+      const batch = await createBatch(prisma, authenticatedEnvironment.id, {
+        runCount: 2,
+        status: "PENDING",
+        sealed: false,
+      });
+
+      await engine.initializeBatch({
+        batchId: batch.id,
+        friendlyId: batch.friendlyId,
+        environmentId: authenticatedEnvironment.id,
+        environmentType: authenticatedEnvironment.type,
+        organizationId: authenticatedEnvironment.organizationId,
+        projectId: authenticatedEnvironment.projectId,
+        runCount: 2,
+        processingConcurrency: 10,
+      });
+
+      await engine.enqueueBatchItem(batch.id, authenticatedEnvironment.id, 0, {
+        task: "test-task",
+        payload: JSON.stringify({ data: "item1" }),
+        payloadType: "application/json",
+      });
+      await engine.enqueueBatchItem(batch.id, authenticatedEnvironment.id, 1, {
+        task: "test-task",
+        payload: JSON.stringify({ data: "item2" }),
+        payloadType: "application/json",
+      });
+
+      // Simulate the race where a concurrent path seals the batch (sealed=true,
+      // PROCESSING), then the V2 batchCompletionCallback fires with all runs
+      // created successfully and resets status to PENDING (sealed stays true).
+      // Our seal updateMany then fails the conditional (sealed=false no longer
+      // matches), and the re-query sees sealed=true + PENDING — a perfectly
+      // valid post-callback state that the SDK retry should treat as success.
+      const racingPrisma = {
+        ...prisma,
+        batchTaskRun: {
+          ...prisma.batchTaskRun,
+          findFirst: prisma.batchTaskRun.findFirst.bind(prisma.batchTaskRun),
+          updateMany: async () => {
+            await prisma.batchTaskRun.update({
+              where: { id: batch.id },
+              data: {
+                sealed: true,
+                sealedAt: new Date(),
+                // Intentionally leave status as PENDING — that's exactly what
+                // the V2 batchCompletionCallback does after all runs are
+                // created (status PROCESSING → PENDING).
+              },
+            });
+            return { count: 0 };
+          },
+          findUnique: prisma.batchTaskRun.findUnique.bind(prisma.batchTaskRun),
+        },
+      } as unknown as PrismaClient;
+
+      const service = new StreamBatchItemsService({
+        prisma: racingPrisma,
+        engine,
+      });
+
+      const result = await service.call(
+        authenticatedEnvironment,
+        batch.friendlyId,
+        itemsToAsyncIterable([]),
+        {
+          maxItemBytes: 1024 * 1024,
+        }
+      );
+
+      expect(result.sealed).toBe(true);
+      expect(result.id).toBe(batch.friendlyId);
+
+      const updatedBatch = await prisma.batchTaskRun.findUnique({
+        where: { id: batch.id },
+      });
+
+      expect(updatedBatch?.sealed).toBe(true);
+      expect(updatedBatch?.status).toBe("PENDING");
+
+      await engine.quit();
+    }
+  );
+
+  containerTest(
+    "should throw when count-mismatch race produces sealed=true + ABORTED (no TaskRuns created)",
+    async ({ prisma, redisOptions }) => {
+      const engine = new RunEngine({
+        prisma,
+        worker: {
+          redis: redisOptions,
+          workers: 1,
+          tasksPerWorker: 10,
+          pollIntervalMs: 100,
+          disabled: true,
+        },
+        queue: {
+          redis: redisOptions,
+        },
+        runLock: {
+          redis: redisOptions,
+        },
+        machines: {
+          defaultMachine: "small-1x",
+          machines: {
+            "small-1x": {
+              name: "small-1x" as const,
+              cpu: 0.5,
+              memory: 0.5,
+              centsPerMs: 0.0001,
+            },
+          },
+          baseCostInCents: 0.0005,
+        },
+        batchQueue: {
+          redis: redisOptions,
+        },
+        tracer: trace.getTracer("test", "0.0.0"),
+      });
+
+      const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+      const batch = await createBatch(prisma, authenticatedEnvironment.id, {
+        runCount: 3,
+        status: "PENDING",
+        sealed: false,
+      });
+
+      await engine.initializeBatch({
+        batchId: batch.id,
+        friendlyId: batch.friendlyId,
+        environmentId: authenticatedEnvironment.id,
+        environmentType: authenticatedEnvironment.type,
+        organizationId: authenticatedEnvironment.organizationId,
+        projectId: authenticatedEnvironment.projectId,
+        runCount: 3,
+        processingConcurrency: 10,
+      });
+
+      // Only enqueue 2 items so the post-loop count check trips into the
+      // mismatch handler. The race we're simulating: between our pre-loop
+      // findFirst and the count-mismatch re-query, a concurrent path sealed
+      // the batch, runs were attempted, every run-creation failed AND the
+      // pre-failed-TaskRun fallback also failed → callback sets ABORTED.
+      // The customer has zero TaskRun records to monitor, so the retry must
+      // throw rather than silently succeed.
+      await engine.enqueueBatchItem(batch.id, authenticatedEnvironment.id, 0, {
+        task: "test-task",
+        payload: JSON.stringify({ data: "item1" }),
+        payloadType: "application/json",
+      });
+      await engine.enqueueBatchItem(batch.id, authenticatedEnvironment.id, 1, {
+        task: "test-task",
+        payload: JSON.stringify({ data: "item2" }),
+        payloadType: "application/json",
+      });
+
+      // Override findFirst to flip the batch to sealed=true + ABORTED on the
+      // re-query that happens INSIDE the count-mismatch branch. The first
+      // findFirst (pre-loop) must still see PENDING + sealed=false so we
+      // pass through and reach the count-mismatch branch.
+      let findFirstCallCount = 0;
+      const racingPrisma = {
+        ...prisma,
+        batchTaskRun: {
+          ...prisma.batchTaskRun,
+          findFirst: async (args: Parameters<typeof prisma.batchTaskRun.findFirst>[0]) => {
+            findFirstCallCount++;
+            if (findFirstCallCount >= 2) {
+              await prisma.batchTaskRun.update({
+                where: { id: batch.id },
+                data: {
+                  sealed: true,
+                  sealedAt: new Date(),
+                  status: "ABORTED",
+                  completedAt: new Date(),
+                },
+              });
+            }
+            return prisma.batchTaskRun.findFirst.call(prisma.batchTaskRun, args);
+          },
+          updateMany: prisma.batchTaskRun.updateMany.bind(prisma.batchTaskRun),
+          findUnique: prisma.batchTaskRun.findUnique.bind(prisma.batchTaskRun),
+        },
+      } as unknown as PrismaClient;
+
+      const service = new StreamBatchItemsService({
+        prisma: racingPrisma,
+        engine,
+      });
+
+      await expect(
+        service.call(authenticatedEnvironment, batch.friendlyId, itemsToAsyncIterable([]), {
+          maxItemBytes: 1024 * 1024,
+        })
+      ).rejects.toThrow(ServiceValidationError);
+
+      await engine.quit();
+    }
+  );
 });
 
 describe("createNdjsonParserStream", () => {