diff --git a/apps/api/openapi.json b/apps/api/openapi.json index de6f8265f9..2552f8594c 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -3749,7 +3749,7 @@ "failure": { "anyOf": [ { - "$ref": "#/components/schemas/_ConductorFailureResponse" + "$ref": "#/components/schemas/ConductorFailureResponse" }, { "type": "null" @@ -3956,6 +3956,47 @@ "title": "ConductRunResponse", "type": "object" }, + "ConductorFailureResponse": { + "description": "JSON wire shape for a `ConductorFailure`.", + "properties": { + "error_class": { + "title": "Error Class", + "type": "string" + }, + "message": { + "title": "Message", + "type": "string" + }, + "source_kind": { + "title": "Source Kind", + "type": "string" + }, + "step_index": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Step Index" + }, + "target": { + "title": "Target", + "type": "string" + } + }, + "required": [ + "step_index", + "source_kind", + "target", + "error_class", + "message" + ], + "title": "ConductorFailureResponse", + "type": "object" + }, "ConduitListResponse": { "description": "Page of conduits plus opaque next-page cursor.", "properties": { @@ -7054,6 +7095,23 @@ "title": "HoldCampaignRequest", "type": "object" }, + "HoldProcedureRequest": { + "description": "Body for `POST /procedures/{procedure_id}/hold`.", + "properties": { + "reason": { + "description": "Free-form reason for the hold (1-500 chars after trimming). 
Required: pausing a halted conduct is a deliberate operator act (unlike a routine RunHeld, which carries no reason).", + "maxLength": 500, + "minLength": 1, + "title": "Reason", + "type": "string" + } + }, + "required": [ + "reason" + ], + "title": "HoldProcedureRequest", + "type": "object" + }, "HoldVisitRequest": { "description": "Body for `POST /visits/{visit_id}/hold`.\n\n`reason` is operator-supplied free text (audit-log breadcrumb).\nExamples: \"beam dump\", \"equipment fault\", \"safety hold pending\nradiation door reset\", \"extended user break\". MUST NOT contain PII.", "properties": { @@ -9655,10 +9713,11 @@ "type": "object" }, "ProcedureStatus": { - "description": "The Procedure's lifecycle state.\n\nFive values declared day one for forward-compat\n(additive-state pattern; legacy events fold cleanly because\nonly DEFINED is reachable after register_procedure):\n\n - `Defined` -- registration-time genesis; pre-execution.\n Operator can edit / inspect / submit for\n review (future Decision BC integration).\n Cannot accept step events yet.\n - `Running` -- post-start_procedure. Step events accepted\n via append_activities.\n - `Completed` -- happy path via complete_procedure.\n Strict-not-idempotent.\n - `Aborted` -- emergency exit via abort_procedure.\n - `Truncated` -- retroactive cleanup via truncate_procedure.\n Mirrors RunTruncated.\n\n`Verifying` and `Held / Resumed` are deliberately NOT in this\nenum. Per [[project_operation_design]] standards-corpus research:\n`Verifying` is NOT standards-blessed at FSM level (PackML uses\n`Completing` for closeout/check work; OPC UA Programs has no\nVerify state). Per-step Check happens within Running synchronously\n(via the Step logbook's check_passed field). Held / Resumed\ndeferred until pilot operator feedback surfaces a need.\n\nNaming convention (per Run BC gate review): gerund /\nadjective for active steady-states (matches PackML / Bluesky);\npast-participle for terminals. 
`Defined` is past-participle (a\nprocedure WAS defined); `Running` is gerund-as-adjective; the\nrest are past-participle terminals.\n\nEnum values are PascalCase strings (matches BC-map status\nvocabulary; log lines and DTOs read naturally without mapping).", + "description": "The Procedure's lifecycle state.\n\nSix values declared for forward-compat (additive-state pattern;\nlegacy events fold cleanly because only DEFINED is reachable after\nregister_procedure):\n\n - `Defined` -- registration-time genesis; pre-execution.\n Operator can edit / inspect / submit for\n review (future Decision BC integration).\n Cannot accept step events yet.\n - `Running` -- post-start_procedure. Step events accepted\n via append_activities.\n - `Held` -- operator-paused mid-conduct via hold_procedure\n (Running <-> Held, resumable via\n resume_procedure). The resumable-conduct\n pause state; mirrors `RunStatus.HELD`. No step\n events accepted while Held; the conduct is\n paused, not advancing.\n - `Completed` -- happy path via complete_procedure.\n Strict-not-idempotent.\n - `Aborted` -- emergency exit via abort_procedure.\n - `Truncated` -- retroactive cleanup via truncate_procedure.\n Mirrors RunTruncated.\n\n`Verifying` is deliberately NOT in this enum. Per\n[[project_operation_design]] standards-corpus research: `Verifying`\nis NOT standards-blessed at FSM level (PackML uses `Completing` for\ncloseout/check work; OPC UA Programs has no Verify state). Per-step\nCheck happens within Running synchronously (via the Step logbook's\ncheck_passed field).\n\n`Held` lands in Tier 1 of [[project_resumable_conduct_design]]:\noperator-pause of a halted conduct, additive to the Layer-1 FSM,\nmirroring `RunStatus.HELD` (Procedure is an execution-FSM sibling of\nRun). The PackML operator=`Held` / external-blocker=`Suspended`\nsplit is honored: this is the operator-pause, so `Held`, not\n`Suspended`. 
The `HOLDING` / `RESTARTING` transient states are\ndeliberately omitted (Run-precedent deferral).\n\nNaming convention (per Run BC gate review): gerund /\nadjective for active steady-states (matches PackML / Bluesky);\npast-participle for the pause-state and terminals. `Defined` is\npast-participle (a procedure WAS defined); `Running` is\ngerund-as-adjective; `Held` is past-participle (mirrors\n`RunStatus.HELD`); the rest are past-participle terminals.\n\nEnum values are PascalCase strings (matches BC-map status\nvocabulary; log lines and DTOs read naturally without mapping).", "enum": [ "Defined", "Running", + "Held", "Completed", "Aborted", "Truncated" @@ -10314,6 +10373,79 @@ "title": "RecipeResponse", "type": "object" }, + "ReconductProcedureRequest": { + "additionalProperties": false, + "description": "Body for `POST /procedures/{procedure_id}/reconduct`.", + "properties": { + "re_establishment_boundary": { + "description": "Index in the pinned resolved step list from which the resume re-drives setpoints and re-runs checks. >= 0 (0 = re-establish from the first step). NOT a continuity proof.", + "minimum": 0.0, + "title": "Re Establishment Boundary", + "type": "integer" + } + }, + "required": [ + "re_establishment_boundary" + ], + "title": "ReconductProcedureRequest", + "type": "object" + }, + "ReconductProcedureResponse": { + "description": "Response body for the reconduct_procedure slice.\n\n`succeeded` is the replay's pass/fail bit. `acquisition_halt` is True\niff the replay stopped at an acquisition needing an operator decision\n(the Procedure is left Running). 
`failure` is non-null iff `succeeded`\nis False (a halt or a genuine step failure).", + "properties": { + "acquisition_halt": { + "title": "Acquisition Halt", + "type": "boolean" + }, + "actuation_kind": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Actuation Kind" + }, + "completed_count": { + "title": "Completed Count", + "type": "integer" + }, + "failure": { + "anyOf": [ + { + "$ref": "#/components/schemas/ConductorFailureResponse" + }, + { + "type": "null" + } + ] + }, + "procedure_id": { + "format": "uuid", + "title": "Procedure Id", + "type": "string" + }, + "re_establishment_boundary": { + "title": "Re Establishment Boundary", + "type": "integer" + }, + "succeeded": { + "title": "Succeeded", + "type": "boolean" + } + }, + "required": [ + "procedure_id", + "completed_count", + "succeeded", + "re_establishment_boundary", + "acquisition_halt" + ], + "title": "ReconductProcedureResponse", + "type": "object" + }, "RecordAcquisitionRequest": { "additionalProperties": false, "description": "Body for `POST /acquisitions`.", @@ -12495,6 +12627,22 @@ "title": "RestoreSupplyRequest", "type": "object" }, + "ResumeProcedureRequest": { + "description": "Body for `POST /procedures/{procedure_id}/resume`.", + "properties": { + "re_establishment_boundary": { + "description": "Index in the pinned resolved step list from which the resume re-drives setpoints and re-runs checks. >= 0 (0 = re-establish from the first step). 
NOT a continuity proof.", + "minimum": 0.0, + "title": "Re Establishment Boundary", + "type": "integer" + } + }, + "required": [ + "re_establishment_boundary" + ], + "title": "ResumeProcedureRequest", + "type": "object" + }, "RetireCautionRequest": { "description": "Body for `POST /cautions/{caution_id}/retire`.", "properties": { @@ -14479,6 +14627,92 @@ "title": "TruncateRunRequest", "type": "object" }, + "TryConductProcedureRequest": { + "additionalProperties": false, + "description": "Body for `POST /procedures/{procedure_id}/try-conduct`.", + "properties": { + "steps": { + "description": "Steps the Conductor walks in order (0-500). Empty list is valid: start + complete fire with no steps.", + "items": { + "discriminator": { + "mapping": { + "action": "#/components/schemas/_ActionStepRequest", + "check": "#/components/schemas/_CheckStepRequest", + "setpoint": "#/components/schemas/_SetpointStepRequest" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/components/schemas/_SetpointStepRequest" + }, + { + "$ref": "#/components/schemas/_ActionStepRequest" + }, + { + "$ref": "#/components/schemas/_CheckStepRequest" + } + ] + }, + "maxItems": 500, + "title": "Steps", + "type": "array" + } + }, + "title": "TryConductProcedureRequest", + "type": "object" + }, + "TryConductProcedureResponse": { + "description": "Response body for the try_conduct_procedure slice.\n\n`succeeded` is the canonical pass/fail bit; `failure` is non-null iff\n`succeeded` is False. `held` is True iff a recoverable step failure paused\nthe Procedure to `Held` (resumable via `reconduct`); a terminal `Aborted`\noutcome carries `succeeded=False` + `failure` + `held=False`.\n\n`actuation_kind` is the raw `ActuationKind` value the Conductor observed,\nor None when nothing instrumented was actuated. 
Read-only operator\nvisibility; the gate that consumes it reads the value server-side off the\nProcedure stream, never back from this response.", + "properties": { + "actuation_kind": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Actuation Kind" + }, + "completed_count": { + "title": "Completed Count", + "type": "integer" + }, + "failure": { + "anyOf": [ + { + "$ref": "#/components/schemas/ConductorFailureResponse" + }, + { + "type": "null" + } + ] + }, + "held": { + "default": false, + "title": "Held", + "type": "boolean" + }, + "procedure_id": { + "format": "uuid", + "title": "Procedure Id", + "type": "string" + }, + "succeeded": { + "title": "Succeeded", + "type": "boolean" + } + }, + "required": [ + "procedure_id", + "completed_count", + "succeeded" + ], + "title": "TryConductProcedureResponse", + "type": "object" + }, "UnbindPlanRoleRequest": { "description": "Body for `POST /plans/{plan_id}/unbind-role`.", "properties": { @@ -15319,47 +15553,6 @@ "title": "_CheckStepRequest", "type": "object" }, - "_ConductorFailureResponse": { - "description": "JSON wire shape for `ConductorFailure`.", - "properties": { - "error_class": { - "title": "Error Class", - "type": "string" - }, - "message": { - "title": "Message", - "type": "string" - }, - "source_kind": { - "title": "Source Kind", - "type": "string" - }, - "step_index": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ], - "title": "Step Index" - }, - "target": { - "title": "Target", - "type": "string" - } - }, - "required": [ - "step_index", - "source_kind", - "target", - "error_class", - "message" - ], - "title": "_ConductorFailureResponse", - "type": "object" - }, "_EqualsCriterion": { "additionalProperties": false, "description": "JSON wire shape for an `EqualsCriterion`.", @@ -35894,7 +36087,7 @@ } }, { - "description": "Optional status filter (one of: Defined, Running, Completed, Aborted, Truncated). 
Omit to return all statuses.", + "description": "Optional status filter (one of: Defined, Running, Held, Completed, Aborted, Truncated). Omit to return all statuses.", "in": "query", "name": "status", "required": false, @@ -35904,6 +36097,7 @@ "enum": [ "Defined", "Running", + "Held", "Completed", "Aborted", "Truncated" @@ -35914,7 +36108,7 @@ "type": "null" } ], - "description": "Optional status filter (one of: Defined, Running, Completed, Aborted, Truncated). Omit to return all statuses.", + "description": "Optional status filter (one of: Defined, Running, Held, Completed, Aborted, Truncated). Omit to return all statuses.", "title": "Status" } }, @@ -36593,29 +36787,273 @@ } } }, - "description": "Procedure is not in `Running` status (complete requires `Running` today; re-completing a `Completed` procedure raises, completing an `Aborted` procedure raises), OR a concurrent write to the same procedure stream conflicted (optimistic concurrency)." + "description": "Procedure is not in `Running` status (complete requires `Running` today; re-completing a `Completed` procedure raises, completing an `Aborted` procedure raises), OR a concurrent write to the same procedure stream conflicted (optimistic concurrency)." + }, + "422": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + }, + "description": "Validation Error" + } + }, + "summary": "Mark an existing Procedure as completed (happy-path terminal)", + "tags": [ + "operation" + ] + } + }, + "/procedures/{procedure_id}/conduct": { + "post": { + "description": "Conduct a Procedure end-to-end. 
Failures land in the response body.", + "operationId": "post_procedures_conduct_procedures__procedure_id__conduct_post", + "parameters": [ + { + "description": "Target procedure's id.", + "in": "path", + "name": "procedure_id", + "required": true, + "schema": { + "description": "Target procedure's id.", + "format": "uuid", + "title": "Procedure Id", + "type": "string" + } + }, + { + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "in": "header", + "name": "X-Principal-Id", + "required": false, + "schema": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. 
Behavior when absent: see Settings.require_authenticated_principal.", + "title": "X-Principal-Id" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ConductProcedureRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ConductProcedureResponse" + } + } + }, + "description": "Successful Response" + }, + "403": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Authorize port denied the command." + }, + "422": { + "description": "Request body failed schema validation: unknown step kind, missing required field, batch over cap, invalid criterion shape." + } + }, + "summary": "Conduct a Procedure: start -> walk steps via ControlPort + actions + checks -> complete (success) or abort (failure).", + "tags": [ + "operation" + ] + } + }, + "/procedures/{procedure_id}/hold": { + "post": { + "operationId": "post_procedures_hold_procedures__procedure_id__hold_post", + "parameters": [ + { + "description": "Target procedure's id.", + "in": "path", + "name": "procedure_id", + "required": true, + "schema": { + "description": "Target procedure's id.", + "format": "uuid", + "title": "Procedure Id", + "type": "string" + } + }, + { + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. 
Behavior when absent: see Settings.require_authenticated_principal.", + "in": "header", + "name": "X-Principal-Id", + "required": false, + "schema": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "title": "X-Principal-Id" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HoldProcedureRequest" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "Successful Response" + }, + "400": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Domain invariant violated: whitespace-only reason." + }, + "403": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Authorize port denied the command." + }, + "404": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "No procedure exists with the given id." 
+ }, + "409": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Procedure is not in `Running` status (hold requires `Running`; holding a `Defined` / `Held` / terminal procedure raises), OR a concurrent write to the same procedure stream conflicted (optimistic concurrency)." }, "422": { + "description": "Path parameter or request body failed schema validation." + } + }, + "summary": "Pause an actively-running Procedure conduct (Running -> Held)", + "tags": [ + "operation" + ] + } + }, + "/procedures/{procedure_id}/iterations": { + "get": { + "operationId": "list_procedure_iterations_procedures__procedure_id__iterations_get", + "parameters": [ + { + "description": "Target procedure's id.", + "in": "path", + "name": "procedure_id", + "required": true, + "schema": { + "description": "Target procedure's id.", + "format": "uuid", + "title": "Procedure Id", + "type": "string" + } + }, + { + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "in": "header", + "name": "X-Principal-Id", + "required": false, + "schema": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "description": "Legacy principal-id header (trust-the-proxy shape). 
When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "title": "X-Principal-Id" + } + } + ], + "responses": { + "200": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/HTTPValidationError" + "$ref": "#/components/schemas/ProcedureIterationsResponse" } } }, - "description": "Validation Error" + "description": "Successful Response" + }, + "403": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Authorize port denied the query." + }, + "422": { + "description": "Path parameter failed schema validation." } }, - "summary": "Mark an existing Procedure as completed (happy-path terminal)", + "summary": "List the convergence-loop iterations of a Procedure", "tags": [ "operation" ] } }, - "/procedures/{procedure_id}/conduct": { + "/procedures/{procedure_id}/iterations/end": { "post": { - "description": "Conduct a Procedure end-to-end. 
Failures land in the response body.", - "operationId": "post_procedures_conduct_procedures__procedure_id__conduct_post", + "operationId": "post_procedures_end_iteration_procedures__procedure_id__iterations_end_post", "parameters": [ { "description": "Target procedure's id.", @@ -36653,22 +37091,25 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ConductProcedureRequest" + "$ref": "#/components/schemas/EndProcedureIterationRequest" } } }, "required": true }, "responses": { - "200": { + "204": { + "description": "Successful Response" + }, + "400": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ConductProcedureResponse" + "$ref": "#/components/schemas/ErrorResponse" } } }, - "description": "Successful Response" + "description": "Domain invariant violated: whitespace-only reason." }, "403": { "content": { @@ -36680,19 +37121,39 @@ }, "description": "Authorize port denied the command." }, + "404": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "No procedure exists with the given id." + }, + "409": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Procedure is not in `Running`, no iteration is open, or iteration_index does not match the open iteration, OR a concurrent write to the same procedure stream conflicted (optimistic concurrency)." + }, "422": { - "description": "Request body failed schema validation: unknown step kind, missing required field, batch over cap, invalid criterion shape." + "description": "Path parameter or request body failed schema validation." 
} }, - "summary": "Conduct a Procedure: start -> walk steps via ControlPort + actions + checks -> complete (success) or abort (failure).", + "summary": "Close the open convergence-loop iteration on a Running Procedure", "tags": [ "operation" ] } }, - "/procedures/{procedure_id}/iterations": { - "get": { - "operationId": "list_procedure_iterations_procedures__procedure_id__iterations_get", + "/procedures/{procedure_id}/iterations/start": { + "post": { + "operationId": "post_procedures_start_iteration_procedures__procedure_id__iterations_start_post", "parameters": [ { "description": "Target procedure's id.", @@ -36726,18 +37187,31 @@ } } ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/StartProcedureIterationRequest" + } + } + }, + "required": true + }, "responses": { - "200": { + "204": { + "description": "Successful Response" + }, + "403": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ProcedureIterationsResponse" + "$ref": "#/components/schemas/ErrorResponse" } } }, - "description": "Successful Response" + "description": "Authorize port denied the command." }, - "403": { + "404": { "content": { "application/json": { "schema": { @@ -36745,21 +37219,32 @@ } } }, - "description": "Authorize port denied the query." + "description": "No procedure exists with the given id." + }, + "409": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Procedure is not in `Running`, an iteration is already open, or iteration_index is not the strict successor of the current iteration_count, OR a concurrent write to the same procedure stream conflicted (optimistic concurrency)." }, "422": { - "description": "Path parameter failed schema validation." + "description": "Path parameter or request body failed schema validation." 
} }, - "summary": "List the convergence-loop iterations of a Procedure", + "summary": "Begin one convergence-loop iteration on a Running Procedure", "tags": [ "operation" ] } }, - "/procedures/{procedure_id}/iterations/end": { + "/procedures/{procedure_id}/reconduct": { "post": { - "operationId": "post_procedures_end_iteration_procedures__procedure_id__iterations_end_post", + "description": "Resume + replay a Held Procedure. Replay outcomes land in the body.", + "operationId": "post_procedures_reconduct_procedures__procedure_id__reconduct_post", "parameters": [ { "description": "Target procedure's id.", @@ -36797,14 +37282,21 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/EndProcedureIterationRequest" + "$ref": "#/components/schemas/ReconductProcedureRequest" } } }, "required": true }, "responses": { - "204": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ReconductProcedureResponse" + } + } + }, "description": "Successful Response" }, "400": { @@ -36815,7 +37307,7 @@ } } }, - "description": "Domain invariant violated: whitespace-only reason." + "description": "re_establishment_boundary is past the pinned resolved step count." }, "403": { "content": { @@ -36845,21 +37337,31 @@ } } }, - "description": "Procedure is not in `Running`, no iteration is open, or iteration_index does not match the open iteration, OR a concurrent write to the same procedure stream conflicted (optimistic concurrency)." + "description": "Procedure is not in `Held` status, OR its parent Run is itself `Held` (off-diagonal guard)." }, "422": { "description": "Path parameter or request body failed schema validation." + }, + "500": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Held Procedure is missing its pinned resolved steps (corruption)." 
} }, - "summary": "Close the open convergence-loop iteration on a Running Procedure", + "summary": "Resume a held Procedure and replay its pinned step-list tail (Held -> Running)", "tags": [ "operation" ] } }, - "/procedures/{procedure_id}/iterations/start": { + "/procedures/{procedure_id}/resume": { "post": { - "operationId": "post_procedures_start_iteration_procedures__procedure_id__iterations_start_post", + "operationId": "post_procedures_resume_procedures__procedure_id__resume_post", "parameters": [ { "description": "Target procedure's id.", @@ -36897,7 +37399,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/StartProcedureIterationRequest" + "$ref": "#/components/schemas/ResumeProcedureRequest" } } }, @@ -36907,6 +37409,16 @@ "204": { "description": "Successful Response" }, + "400": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Domain invariant violated: negative re_establishment_boundary." + }, "403": { "content": { "application/json": { @@ -36935,13 +37447,13 @@ } } }, - "description": "Procedure is not in `Running`, an iteration is already open, or iteration_index is not the strict successor of the current iteration_count, OR a concurrent write to the same procedure stream conflicted (optimistic concurrency)." + "description": "Procedure is not in `Held` status (resume requires `Held`; resuming a `Running` / `Defined` / terminal procedure raises), OR a concurrent write to the same procedure stream conflicted (optimistic concurrency)." }, "422": { "description": "Path parameter or request body failed schema validation." 
} }, - "summary": "Begin one convergence-loop iteration on a Running Procedure", + "summary": "Resume a held Procedure conduct (Held -> Running)", "tags": [ "operation" ] @@ -37134,6 +37646,84 @@ ] } }, + "/procedures/{procedure_id}/try-conduct": { + "post": { + "description": "Conduct a Procedure, pausing to Held on a recoverable failure.", + "operationId": "post_procedures_try_conduct_procedures__procedure_id__try_conduct_post", + "parameters": [ + { + "description": "Target procedure's id.", + "in": "path", + "name": "procedure_id", + "required": true, + "schema": { + "description": "Target procedure's id.", + "format": "uuid", + "title": "Procedure Id", + "type": "string" + } + }, + { + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "in": "header", + "name": "X-Principal-Id", + "required": false, + "schema": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. 
When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "title": "X-Principal-Id" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TryConductProcedureRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TryConductProcedureResponse" + } + } + }, + "description": "Successful Response" + }, + "403": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Authorize port denied the command." + }, + "422": { + "description": "Request body failed schema validation: unknown step kind, missing required field, batch over cap, invalid criterion shape." + } + }, + "summary": "Conduct a Procedure, pausing to Held on a recoverable failure: start -> walk steps -> complete (success) / pause to Held (recoverable setpoint or check failure) / abort (acquisition failure).", + "tags": [ + "operation" + ] + } + }, "/recipes": { "post": { "operationId": "post_recipes_recipes_post", diff --git a/apps/api/src/cora/operation/_conduct_preparation.py b/apps/api/src/cora/operation/_conduct_preparation.py new file mode 100644 index 0000000000..c4368723dd --- /dev/null +++ b/apps/api/src/cora/operation/_conduct_preparation.py @@ -0,0 +1,249 @@ +"""Shared pre-conduct pipeline for the conduct verb-family slices. + +`conduct_procedure` and `try_conduct_procedure` resolve the SAME step list +the same way before handing it to the Conductor, then pin it identically: + + 1. 
recipe re-expansion when the Procedure was created from a recipe + (the five-step replay gate per [[project-run-procedure-replay-design]]); + 2. pseudoaxis -> constituent expansion when the Procedure is a Run phase + (resolve each virtual-axis SetpointStep's constituents from the Run's + Plan wires); + 3. pin the FINAL resolved list as a `ResolvedStepsRecorded` provenance + event BEFORE any step executes, so a later resume replays this exact + list rather than re-deriving it. + +A slice cannot import a sibling slice (the cross-slice-independence fitness), +so this BC-level module owns the shared pipeline, mirroring `_conduct_wire` +(shared HTTP/MCP shapes) and `_resolved_steps_replay` (the resume-side read). +The pin is emitted inline rather than via a dedicated command slice: +`ResolvedStepsRecorded` is an internal provenance event with no operator +entry point, exactly like `RecipeExpansionRecorded`. +""" + +from collections.abc import Mapping, Sequence +from datetime import datetime +from typing import TYPE_CHECKING, Any +from uuid import UUID + +from cora.infrastructure.event_envelope import to_new_event +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.ports import EventStore +from cora.infrastructure.ports.event_store import StoredEvent +from cora.operation._recipe_replay import ( + find_recipe_expansion_record, + pins_from_payload, + verify_bindings_hash, + verify_steps_hash, +) +from cora.operation.aggregates.procedure import ( + Procedure, + ProcedureBoundCapabilityDeprecatedError, + ProcedureStatus, + ProcedureStepsForbiddenForRecipeDrivenError, + RecipeExpanderVersionMismatchError, + RecipeExpansionRecordNotFoundError, + ResolvedStepsRecorded, + event_type_name, + to_payload, +) +from cora.operation.conductor import Step, step_to_payload +from cora.operation.ports.recipe_expander import RecipeExpander +from cora.recipe.aggregates.capability import CapabilityStatus, load_capability +from cora.recipe.aggregates.plan import ( + 
PlanNotFoundError, + constituents_from_wires, + load_plan, +) +from cora.recipe.aggregates.recipe import load_recipe_at_version +from cora.run.aggregates.run import RunNotFoundError, load_run + +if TYPE_CHECKING: + from cora.operation._pseudoaxis_expander import ConstituentResolver + + +def decide_resolved_steps_recorded( + state: Procedure | None, + resolved_steps: Sequence[Mapping[str, Any]], + *, + now: datetime, +) -> list[ResolvedStepsRecorded]: + """Pin the resolved step list iff the Procedure is pre-conduct (Defined). + + Returns a single `ResolvedStepsRecorded` when `state` is `Defined` + (the normal conduct path, before `start_procedure` transitions it to + `Running`). Returns `[]` when `state` is None or not `Defined`: a + conduct of a missing / already-running / terminal Procedure records no + resolved steps and lets the Conductor's `start_procedure` produce the + normal lifecycle failure, preserving the conduct route's failures-in-body + contract instead of raising a fresh HTTP error here. Kept as a pure + function so the decision is unit-testable without an event store. + """ + if state is None or state.status is not ProcedureStatus.DEFINED: + return [] + steps = tuple(dict(step) for step in resolved_steps) + return [ + ResolvedStepsRecorded( + procedure_id=state.id, + resolved_steps=steps, + step_count=len(steps), + occurred_at=now, + ) + ] + + +async def resolve_and_pin_conduct_steps( + deps: Kernel, + *, + command_name: str, + procedure: Procedure, + stored_events: list[StoredEvent], + caller_steps: Sequence[Step], + expansion_port: RecipeExpander, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None, +) -> tuple[Step, ...]: + """Resolve the final conduct step list + pin it as `ResolvedStepsRecorded`. + + The shared pre-Conductor work for `conduct` / `try_conduct`: recipe + re-expansion (recipe-driven Procedures) -> pseudoaxis constituent + expansion (Run-phase Procedures) -> pin. 
Returns the resolved steps to + hand to the Conductor. `command_name` rides the pinned event's metadata. + """ + if procedure.recipe_id is not None: + steps = await _re_expand_steps( + procedure_id=procedure.id, + recipe_id=procedure.recipe_id, + caller_steps=caller_steps, + stored_events=stored_events, + event_store=deps.event_store, + expansion_port=expansion_port, + ) + else: + steps = tuple(caller_steps) + + # A Phase-of-Run Procedure resolves a pseudoaxis's constituent motors from + # its Run's Plan wires: parent_run_id -> Run.plan_id -> Plan.wires (the + # same load chain start_procedure walks for its Supply gate). A missing + # Run / Plan in that chain is corruption, so raise rather than silently + # skip. Standalone / recipe-driven Procedures (no parent_run_id) pass no + # resolver, so any pseudoaxis SetpointStep hits the wiring-deferred default + # and is rejected with PartitionRuleNotFoundError. + constituent_resolver: ConstituentResolver | None = None + if procedure.parent_run_id is not None: + parent_run = await load_run(deps.event_store, procedure.parent_run_id) + if parent_run is None: + raise RunNotFoundError(procedure.parent_run_id) + plan = await load_plan(deps.event_store, parent_run.plan_id) + if plan is None: + raise PlanNotFoundError(parent_run.plan_id) + + def _resolve_constituents(asset_id: UUID) -> tuple[UUID, ...]: + return constituents_from_wires(plan, asset_id) + + constituent_resolver = _resolve_constituents + + # Pre-Conductor PseudoAxis expansion: rewrite any virtual-axis SetpointStep + # into N sequential constituent SetpointSteps so the Conductor's dispatch + # loop walks the constituents in declared order. ActionStep / CheckStep + # pass through unchanged ([[project-pseudoaxis-design]] v3). 
+ steps = await expansion_port.expand_pseudoaxis( + steps, + event_store=deps.event_store, + correlation_id=correlation_id, + constituent_resolver=constituent_resolver, + ) + + # Pin the resolved step list (after recipe + pseudoaxis expansion) BEFORE + # conducting, so a future resume replays this exact list. The helper emits + # the event only while the Procedure is still Defined and returns [] + # otherwise, leaving the Conductor's start_procedure to surface a lifecycle + # failure (keeps the conduct route's failures-in-body contract). + resolved_steps_events = decide_resolved_steps_recorded( + procedure, + tuple(step_to_payload(step) for step in steps), + now=deps.clock.now(), + ) + if resolved_steps_events: + _, current_version = await deps.event_store.load( + stream_type="Procedure", stream_id=procedure.id + ) + await deps.event_store.append( + stream_type="Procedure", + stream_id=procedure.id, + expected_version=current_version, + events=[ + to_new_event( + event_type=event_type_name(event), + payload=to_payload(event), + occurred_at=event.occurred_at, + event_id=deps.id_generator.new_id(), + command_name=command_name, + correlation_id=correlation_id, + causation_id=causation_id, + principal_id=principal_id, + ) + for event in resolved_steps_events + ], + ) + + return steps + + +async def _re_expand_steps( + *, + procedure_id: UUID, + recipe_id: UUID, + caller_steps: Sequence[Step], + stored_events: list[StoredEvent], + event_store: EventStore, + expansion_port: RecipeExpander, +) -> tuple[Step, ...]: + """Run the recipe-replay gate per [[project-run-procedure-replay-design]]. 
+ + Gate sequence, in order: reject non-empty caller steps -> find_recipe_expansion_record + (raise RecipeExpansionRecordNotFoundError on None) -> pins_from_payload + -> port-version strict-equals (raise RecipeExpanderVersionMismatchError + on drift) -> load_recipe_at_version (raise RecipeExpansionRecordNotFoundError + when None on a recipe-driven Procedure; RecipeVersionNotFoundError + propagates from helper) -> load_capability + reject Deprecated + (raise ProcedureBoundCapabilityDeprecatedError, symmetric to + start_run's RunBoundPlanDeprecatedError) -> verify_bindings_hash -> + expand -> verify_steps_hash -> return the re-expanded tuple. + """ + if list(caller_steps): + raise ProcedureStepsForbiddenForRecipeDrivenError(procedure_id) + + record = find_recipe_expansion_record(stored_events) + if record is None: + raise RecipeExpansionRecordNotFoundError(procedure_id) + + pins = pins_from_payload(procedure_id, record.payload) + + if pins.expansion_port_version != expansion_port.version: + raise RecipeExpanderVersionMismatchError( + procedure_id, + pins.expansion_port_version, + expansion_port.version, + ) + + recipe = await load_recipe_at_version( + event_store, + recipe_id, + pins.recipe_version, + ) + if recipe is None: + raise RecipeExpansionRecordNotFoundError(procedure_id) + + # Capability-deprecation gate: reject conduct against a tombstoned + # Capability before running the expansion port. Symmetric to start_run's + # RunBoundPlanDeprecatedError: re-expanding a Recipe against a Deprecated + # Capability would silently execute against a contract operators retired. 
+ capability = await load_capability(event_store, recipe.capability_id) + if capability is not None and capability.status == CapabilityStatus.DEPRECATED: + raise ProcedureBoundCapabilityDeprecatedError(procedure_id, recipe.capability_id) + + verify_bindings_hash(procedure_id, pins) + expanded = expansion_port.expand(recipe.steps, dict(pins.bindings)) + verify_steps_hash(procedure_id, expanded, pins) + return expanded diff --git a/apps/api/src/cora/operation/_conduct_wire.py b/apps/api/src/cora/operation/_conduct_wire.py new file mode 100644 index 0000000000..9f542a86c9 --- /dev/null +++ b/apps/api/src/cora/operation/_conduct_wire.py @@ -0,0 +1,163 @@ +"""Shared HTTP/MCP wire shapes for the conduct verb-family slices. + +`conduct_procedure` and `try_conduct_procedure` accept the SAME step-list +body and surface the SAME per-step failure shape; this BC-level module owns +those wire types + converters so both slices reuse them. A slice cannot +import a sibling slice (the cross-slice-independence fitness), so the shared +seam lives here, outside `features/`, exactly like the resolved-steps replay +helper (`_resolved_steps_replay`) and preparation pipeline +(`_conduct_preparation`). + +The Conductor's `Step = SetpointStep | ActionStep | CheckStep` and +`CheckCriterion = EqualsCriterion | WithinToleranceCriterion` discriminated +unions land on the wire as JSON discriminated unions with a `kind` field. +Pydantic's `Field(discriminator="kind")` validates the union at parse time so +a malformed step kind fails the request with a 422 before the handler runs. + +Per-step `value` and `criterion.expected` are typed broadly +(`int | float | bool | str | list[Any]`) to match the ControlPort's value +vocabulary. Tuples-on-the-wire arrive as lists; the converter widens to +tuple for the in-process Conductor. 
+""" + +from typing import Annotated, Any, Literal, cast + +from pydantic import BaseModel, Field + +from cora.operation.conductor import ( + ActionStep, + CheckCriterion, + CheckStep, + ConductorFailure, + EqualsCriterion, + SetpointStep, + Step, + WithinToleranceCriterion, +) + +STEP_BATCH_MAX = 500 +"""Mirror of `append_activities`'s batch cap. A single conduct request never +carries more than this many steps; larger procedures split client-side via +multiple sequential runs.""" + + +class _SetpointStepRequest(BaseModel): + """JSON wire shape for a `SetpointStep`.""" + + kind: Literal["setpoint"] + address: str = Field(..., min_length=1) + value: int | float | bool | str | list[Any] + verify: bool = False + + model_config = {"extra": "forbid"} + + +class _ActionStepRequest(BaseModel): + """JSON wire shape for an `ActionStep`.""" + + kind: Literal["action"] + name: str = Field(..., min_length=1) + params: dict[str, Any] = Field(default_factory=dict) + + model_config = {"extra": "forbid"} + + +class _EqualsCriterion(BaseModel): + """JSON wire shape for an `EqualsCriterion`.""" + + kind: Literal["equals"] + expected: int | float | bool | str | list[Any] + + model_config = {"extra": "forbid"} + + +class _WithinToleranceCriterion(BaseModel): + """JSON wire shape for a `WithinToleranceCriterion`.""" + + kind: Literal["within_tolerance"] + expected: float + tolerance: float = Field(..., ge=0.0) + + model_config = {"extra": "forbid"} + + +_CriterionRequest = Annotated[ + _EqualsCriterion | _WithinToleranceCriterion, + Field(discriminator="kind"), +] + + +class _CheckStepRequest(BaseModel): + """JSON wire shape for a `CheckStep`.""" + + kind: Literal["check"] + address: str = Field(..., min_length=1) + criterion: _CriterionRequest + + model_config = {"extra": "forbid"} + + +StepRequest = Annotated[ + _SetpointStepRequest | _ActionStepRequest | _CheckStepRequest, + Field(discriminator="kind"), +] +"""The wire-side step union a conduct request body carries 
(`list[StepRequest]`).""" + + +class ConductorFailureResponse(BaseModel): + """JSON wire shape for a `ConductorFailure`.""" + + step_index: int | None + source_kind: str + target: str + error_class: str + message: str + + +def criterion_from_wire( + wire: _EqualsCriterion | _WithinToleranceCriterion, +) -> CheckCriterion: + """Build a Conductor `CheckCriterion` from a Pydantic wire model. + + The seam between the JSON shape and the in-process Conductor type; REST + routes + MCP tools across the conduct family share it. + """ + if isinstance(wire, _EqualsCriterion): + expected: Any = wire.expected + if isinstance(expected, list): + # wire.expected is a JSON list of Any; tuple-coerce for the in-process EqualsCriterion + return EqualsCriterion(expected=cast("tuple[Any, ...]", tuple(expected))) # pyright: ignore[reportUnknownArgumentType] + return EqualsCriterion(expected=expected) + return WithinToleranceCriterion(expected=wire.expected, tolerance=wire.tolerance) + + +def step_from_wire( + wire: _SetpointStepRequest | _ActionStepRequest | _CheckStepRequest, +) -> Step: + """Build a Conductor `Step` from a Pydantic wire model (REST + MCP share it).""" + if isinstance(wire, _SetpointStepRequest): + value: Any = wire.value + if isinstance(value, list): + return SetpointStep( + address=wire.address, + value=cast("tuple[Any, ...]", tuple(value)), # pyright: ignore[reportUnknownArgumentType] + verify=wire.verify, + ) + return SetpointStep(address=wire.address, value=value, verify=wire.verify) + if isinstance(wire, _ActionStepRequest): + return ActionStep(name=wire.name, params=wire.params) + return CheckStep( + address=wire.address, + criterion=criterion_from_wire(wire.criterion), + ) + + +def failure_to_wire(failure: ConductorFailure) -> ConductorFailureResponse: + """Project a `ConductorFailure` onto its JSON wire shape.""" + return ConductorFailureResponse( + step_index=failure.step_index, + source_kind=failure.source_kind, + target=failure.target, + 
error_class=failure.error_class, + message=failure.message, + ) diff --git a/apps/api/src/cora/operation/_resolved_steps_replay.py b/apps/api/src/cora/operation/_resolved_steps_replay.py new file mode 100644 index 0000000000..b91f3064f8 --- /dev/null +++ b/apps/api/src/cora/operation/_resolved_steps_replay.py @@ -0,0 +1,41 @@ +"""Resolved-steps replay helper for the `reconduct_procedure` handler. + +The resume path replays a halted conduct from PINNED resolved steps rather +than re-deriving the step list. This module locates the +`ResolvedStepsRecorded` provenance event (pinned once at conduct start by +`_conduct_preparation.resolve_and_pin_conduct_steps`) in a Procedure stream so +the handler can parse `resolved_steps` back into `Step`s via +`conductor.steps_from_payload` and hand them to `Conductor.execute_from`. + +Sibling of `_recipe_replay.find_recipe_expansion_record` (the recipe +genesis provenance finder), kept separate because that module's tuple of +helpers is recipe-expansion-specific. This is the SECOND handler-tier +payload-direct reader; per the replay-design rule-of-three note, when a +THIRD lands the two `find_*_record` head-scanners should hoist to a +generic `cora.infrastructure.event_payload` helper. +""" + +from collections.abc import Iterable + +from cora.infrastructure.ports.event_store import StoredEvent + + +def find_resolved_steps_record( + stored_events: Iterable[StoredEvent], +) -> StoredEvent | None: + """Locate the `ResolvedStepsRecorded` event in a Procedure stream. + + Scans linearly from head, returns the first match, early-exits on the + first hit. A conduct pins exactly one `ResolvedStepsRecorded` at start + (only while the Procedure is `Defined`), so a Held Procedure that has + been conducted carries exactly one; head-scan returns it. + + Returns `None` when no match. 
The caller decides whether None is an + error: the `reconduct_procedure` handler raises + `ResolvedStepsRecordNotFoundError` (a Held Procedure missing its pinned + resolved steps is corruption, not an operational outcome). + """ + for event in stored_events: + if event.event_type == "ResolvedStepsRecorded": + return event + return None diff --git a/apps/api/src/cora/operation/aggregates/procedure/__init__.py b/apps/api/src/cora/operation/aggregates/procedure/__init__.py index 227f068df7..c43a31c3fc 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/__init__.py +++ b/apps/api/src/cora/operation/aggregates/procedure/__init__.py @@ -21,9 +21,11 @@ ProcedureActivitiesLogbookOpened, ProcedureCompleted, ProcedureEvent, + ProcedureHeld, ProcedureIterationEnded, ProcedureIterationStarted, ProcedureRegistered, + ProcedureResumed, ProcedureStarted, ProcedureTruncated, RecipeExpansionRecorded, @@ -45,11 +47,13 @@ STEP_KIND_VALUES, STEPS_LOGBOOK_SCHEMA, InvalidProcedureAbortReasonError, + InvalidProcedureHoldReasonError, InvalidProcedureInterruptedAtError, InvalidProcedureIterationCapError, InvalidProcedureIterationEndReasonError, InvalidProcedureKindError, InvalidProcedureNameError, + InvalidProcedureReEstablishmentBoundaryError, InvalidProcedureTruncateReasonError, InvalidRecipeBindingsError, InvalidStepKindError, @@ -61,11 +65,14 @@ ProcedureCannotAbortError, ProcedureCannotCompleteError, ProcedureCannotEndIterationError, + ProcedureCannotHoldError, + ProcedureCannotResumeError, ProcedureCannotStartError, ProcedureCannotStartIterationError, ProcedureCannotTruncateError, ProcedureCapabilityExecutorMismatchError, ProcedureEnclosureCoverageMismatchError, + ProcedureHoldReason, ProcedureIterationLimitReachedError, ProcedureName, ProcedureNotFoundError, @@ -84,7 +91,9 @@ RecipeExpansionOverflowError, RecipeExpansionRecordNotFoundError, RecipeExpansionReplayMismatchError, + ResolvedStepsRecordNotFoundError, StepKind, + merge_actuation_kinds, ) __all__ = [ @@ -98,11 
+107,13 @@ "ActivityStore", "InMemoryActivityStore", "InvalidProcedureAbortReasonError", + "InvalidProcedureHoldReasonError", "InvalidProcedureInterruptedAtError", "InvalidProcedureIterationCapError", "InvalidProcedureIterationEndReasonError", "InvalidProcedureKindError", "InvalidProcedureNameError", + "InvalidProcedureReEstablishmentBoundaryError", "InvalidProcedureTruncateReasonError", "InvalidRecipeBindingsError", "InvalidStepKindError", @@ -117,6 +128,8 @@ "ProcedureCannotAbortError", "ProcedureCannotCompleteError", "ProcedureCannotEndIterationError", + "ProcedureCannotHoldError", + "ProcedureCannotResumeError", "ProcedureCannotStartError", "ProcedureCannotStartIterationError", "ProcedureCannotTruncateError", @@ -124,6 +137,8 @@ "ProcedureCompleted", "ProcedureEnclosureCoverageMismatchError", "ProcedureEvent", + "ProcedureHeld", + "ProcedureHoldReason", "ProcedureIterationEnded", "ProcedureIterationLimitReachedError", "ProcedureIterationStarted", @@ -134,6 +149,7 @@ "ProcedureRequiresAvailableSupplyError", "ProcedureRequiresOpenBeamShuttersError", "ProcedureRequiresPermittedEnclosureError", + "ProcedureResumed", "ProcedureStarted", "ProcedureStatus", "ProcedureStepsForbiddenForRecipeDrivenError", @@ -148,6 +164,7 @@ "RecipeExpansionRecordNotFoundError", "RecipeExpansionRecorded", "RecipeExpansionReplayMismatchError", + "ResolvedStepsRecordNotFoundError", "ResolvedStepsRecorded", "StepKind", "event_type_name", @@ -156,5 +173,6 @@ "from_stored", "load_procedure", "load_procedure_with_events", + "merge_actuation_kinds", "to_payload", ] diff --git a/apps/api/src/cora/operation/aggregates/procedure/entries.py b/apps/api/src/cora/operation/aggregates/procedure/entries.py index 2111fe85c0..4b72db4d9a 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/entries.py +++ b/apps/api/src/cora/operation/aggregates/procedure/entries.py @@ -77,7 +77,6 @@ # adapter class. The dataclass + Protocol stay strictly typed for # every caller above the boundary. 
-import json from dataclasses import dataclass from datetime import datetime from typing import Any, Protocol @@ -170,12 +169,17 @@ async def append(self, rows: list[Activity]) -> None: row.actor_id, row.command_name, row.step_kind, - # asyncpg encodes Python dict to jsonb when the - # column is jsonb-typed; explicit json.dumps - # keeps the contract obvious and matches the - # decision_reasonings adapter's posture (which - # also has a JSON body column). - json.dumps(row.payload), + # Pass the dict; the pool's jsonb codec (pool.py + # set_type_codec encoder=json.dumps) serializes it ONCE + # into a real jsonb OBJECT, exactly like the event store + # passes event.payload. An EXTRA json.dumps here (the + # former code) double-encoded it into a jsonb SCALAR + # string, which made server-side `payload->>'key'` + # return NULL and silently no-op'd the conductor's + # in-flight-marker filters. (The decision_inferences + # adapter still json.dumps-es into jsonb; harmless only + # while nothing queries its jsonb server-side.) + row.payload, row.sampled_at, row.occurred_at, row.correlation_id, diff --git a/apps/api/src/cora/operation/aggregates/procedure/events.py b/apps/api/src/cora/operation/aggregates/procedure/events.py index 60150889cb..de64a56df3 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/events.py +++ b/apps/api/src/cora/operation/aggregates/procedure/events.py @@ -23,8 +23,10 @@ `ProcedureActivitiesLogbookOpened` is the lazy envelope event for the per-step logbook table. `ProcedureTruncated` mirrors RunTruncated. -`ProcedureHeld` / `ProcedureResumed` are deferred until the pilot -needs the surface. +`ProcedureHeld` (Running -> Held) / `ProcedureResumed` (Held -> Running) +are the operator-pause / resume pair for resumable conduct (Tier 1 of +[[project_resumable_conduct_design]]); the state name mirrors +`RunStatus.HELD`. 
`ProcedureIterationStarted` / `ProcedureIterationEnded` are the first-class boundary pair for the convergence-driven iteration loop @@ -177,7 +179,7 @@ class RecipeExpansionRecorded: @dataclass(frozen=True) class ProcedureStarted: - """A Procedure transitioned out of Defined into Running (10c-b). + """A Procedure transitioned out of Defined into Running. Slim payload by design: the start fact is what the event encodes. Status is implicit (`Running`); the evolver sets it. No reason @@ -264,7 +266,7 @@ class ProcedureActivitiesLogbookOpened: @dataclass(frozen=True) class ProcedureTruncated: - """A Procedure reached its partial-data terminal (Running -> Truncated, 10c-c). + """A Procedure reached its partial-data terminal (Running | Held -> Truncated). Cleanup terminal for a Procedure that became de-facto dead through interruption (power loss, process crash, hardware fault, weekend @@ -287,7 +289,7 @@ class ProcedureTruncated: emergency exit while the system is still responsive; Truncated is a cleanup mechanism for known-dead Procedures. The system itself does not detect de-facto-dead Procedures (separate liveness - concern, out of scope for 10c-c); operators must invoke truncate + concern, out of scope here); operators must invoke truncate explicitly. Mirrors `RunTruncated` from Run BC's 6f-4. """ @@ -299,7 +301,7 @@ class ProcedureTruncated: @dataclass(frozen=True) class ProcedureAborted: - """A Procedure reached its emergency-exit terminal (Running -> Aborted). + """A Procedure reached its emergency-exit terminal (Running | Held -> Aborted). `reason` is a free-form string (1-500 chars after trimming), captured verbatim from the operator. Mirror of RunAborted.reason @@ -312,9 +314,9 @@ class ProcedureAborted: fold via `payload.get("actuation_kind")` -> None. Carries honest provenance for a Dataset produced by an aborted conduct. - Single-source guard at the decider (Running only). 
Held/Resumed - deferred to 10c-c per pilot need; if Held lands, the abort source - set widens to `Running | Held` to match Run BC's precedent. + Multi-source guard at the decider: `Running | Held` (a paused + conduct stays abortable; resumable conduct widened the source set, + matching Run BC's `abort_run`). """ procedure_id: UUID @@ -323,6 +325,80 @@ class ProcedureAborted: actuation_kind: str | None = None +@dataclass(frozen=True) +class ProcedureHeld: + """A Procedure conduct was operator-paused (Running -> Held). + + Tier 1 of [[project_resumable_conduct_design]]: the operator pauses + a halted conduct so it can be re-established and resumed later rather + than aborted-and-reseeded. Additive to the Layer-1 FSM; the state + name mirrors `RunStatus.HELD` (Procedure is an execution-FSM sibling + of Run). + + `reason` is a free-form string (1-500 chars after trimming), captured + verbatim. REQUIRED, unlike `RunHeld` (slim, no reason: a routine Run + pause): pausing a halted conduct is a deliberate, high-information + operator act, matching `AgentSuspended.reason`. Same future-additive + structured-taxonomy posture as `ProcedureAborted.reason`. + + `decided_by_decision_id` mirrors `RunHeld`: optional Decision-causation + link to the Decision BC record that justified this hold. None for + operator-routed holds; set when an in-process agent runtime issues the + hold. NO existence check per the cross-BC eventual-consistency stance. + Forward-compat via `payload.get("decided_by_decision_id")` -> None. + + `actuation_kind` is the raw `ActuationKind` value the Conductor observed + in the conduct UP TO this pause (None for an operator hold issued outside + a conduct). It is carried so a later resume can fold the pre-hold + provenance with the replay tail's: without it, a `reconduct` from a + boundary past a simulated prefix would complete as `Physical` and slip + past the `promote_dataset` Simulated/Hybrid gate. 
The evolver merges it + into `Procedure.actuation_kind` (via `merge_actuation_kinds`); + `ProcedureResumed` then carries it forward. Additive: legacy streams fold + via `payload.get("actuation_kind")` -> None. + + Status is NOT carried (the event type encodes the transition); the + evolver maps `ProcedureHeld -> HELD`. + """ + + procedure_id: UUID + reason: str + occurred_at: datetime + decided_by_decision_id: UUID | None = None + actuation_kind: str | None = None + + +@dataclass(frozen=True) +class ProcedureResumed: + """A held Procedure conduct was resumed (Held -> Running). + + Inverse of `ProcedureHeld`. Mirrors `RunResumed`. Hold <-> Resume is + bidirectional and unlimited-cycle within one conduct. + + `re_establishment_boundary` is the index in the pinned resolved + step list from which resume re-drives setpoints + re-runs checks (NOT + a continuity proof; the pre-effect in-flight marker is the only + continuity fact the aggregate owns). It is `>= 0`; the Conductor's + `execute_from` consumes it to replay the pinned step-list tail. Per + [[project_resumable_conduct_design]] the field is the + re-establishment boundary, deliberately NOT a "verified continuity" + claim. + + `decided_by_decision_id` mirrors `RunResumed`: optional + Decision-causation link; None for operator-routed resumes, set when + an in-process agent runtime issues an autonomous resume. NO existence + check (cross-BC eventual-consistency). Forward-compat via + `payload.get("decided_by_decision_id")` -> None. + + Status is NOT carried; the evolver maps `ProcedureResumed -> RUNNING`. + """ + + procedure_id: UUID + re_establishment_boundary: int + occurred_at: datetime + decided_by_decision_id: UUID | None = None + + @dataclass(frozen=True) class ProcedureIterationStarted: """One convergence-loop iteration began on a Running Procedure. 
@@ -418,6 +494,8 @@ class ResolvedStepsRecorded: | ProcedureCompleted | ProcedureAborted | ProcedureTruncated + | ProcedureHeld + | ProcedureResumed | ProcedureActivitiesLogbookOpened | ProcedureIterationStarted | ProcedureIterationEnded @@ -514,6 +592,36 @@ def to_payload(event: ProcedureEvent) -> dict[str, Any]: "interrupted_at": interrupted_at_iso, "occurred_at": occurred_at.isoformat(), } + case ProcedureHeld( + procedure_id=procedure_id, + reason=reason, + occurred_at=occurred_at, + decided_by_decision_id=decided_by_decision_id, + actuation_kind=actuation_kind, + ): + return { + "procedure_id": str(procedure_id), + "reason": reason, + "decided_by_decision_id": ( + str(decided_by_decision_id) if decided_by_decision_id is not None else None + ), + "occurred_at": occurred_at.isoformat(), + "actuation_kind": actuation_kind, + } + case ProcedureResumed( + procedure_id=procedure_id, + re_establishment_boundary=re_establishment_boundary, + occurred_at=occurred_at, + decided_by_decision_id=decided_by_decision_id, + ): + return { + "procedure_id": str(procedure_id), + "re_establishment_boundary": re_establishment_boundary, + "decided_by_decision_id": ( + str(decided_by_decision_id) if decided_by_decision_id is not None else None + ), + "occurred_at": occurred_at.isoformat(), + } case ProcedureActivitiesLogbookOpened( procedure_id=procedure_id, logbook_id=logbook_id, @@ -690,6 +798,36 @@ def _build_truncated() -> ProcedureTruncated: ) return deserialize_or_raise("ProcedureTruncated", _build_truncated) + case "ProcedureHeld": + + def _build_held() -> ProcedureHeld: + raw_decided_by = payload.get("decided_by_decision_id") + return ProcedureHeld( + procedure_id=UUID(payload["procedure_id"]), + reason=payload["reason"], + decided_by_decision_id=( + UUID(raw_decided_by) if raw_decided_by is not None else None + ), + occurred_at=datetime.fromisoformat(payload["occurred_at"]), + # Additive: pre-activation streams omit the key -> None. 
+ actuation_kind=payload.get("actuation_kind"), + ) + + return deserialize_or_raise("ProcedureHeld", _build_held) + case "ProcedureResumed": + + def _build_resumed() -> ProcedureResumed: + raw_decided_by = payload.get("decided_by_decision_id") + return ProcedureResumed( + procedure_id=UUID(payload["procedure_id"]), + re_establishment_boundary=int(payload["re_establishment_boundary"]), + decided_by_decision_id=( + UUID(raw_decided_by) if raw_decided_by is not None else None + ), + occurred_at=datetime.fromisoformat(payload["occurred_at"]), + ) + + return deserialize_or_raise("ProcedureResumed", _build_resumed) case "ProcedureActivitiesLogbookOpened": return deserialize_or_raise( "ProcedureActivitiesLogbookOpened", @@ -759,9 +897,11 @@ def _build_truncated() -> ProcedureTruncated: "ProcedureActivitiesLogbookOpened", "ProcedureCompleted", "ProcedureEvent", + "ProcedureHeld", "ProcedureIterationEnded", "ProcedureIterationStarted", "ProcedureRegistered", + "ProcedureResumed", "ProcedureStarted", "ProcedureTruncated", "RecipeExpansionRecorded", diff --git a/apps/api/src/cora/operation/aggregates/procedure/evolver.py b/apps/api/src/cora/operation/aggregates/procedure/evolver.py index 95548d6c99..7a4d50e81e 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/evolver.py +++ b/apps/api/src/cora/operation/aggregates/procedure/evolver.py @@ -10,6 +10,8 @@ - `ProcedureCompleted` -> COMPLETED (happy-path terminal) - `ProcedureAborted` -> ABORTED (emergency-exit terminal) - `ProcedureTruncated` -> TRUNCATED (partial-data terminal; mirrors RunTruncated) + - `ProcedureHeld` -> HELD (operator-pause; mirrors RunHeld) + - `ProcedureResumed` -> RUNNING (resume from Held; mirrors RunResumed) - `ProcedureActivitiesLogbookOpened` -> STATUS UNCHANGED (sets activity_logbook_id; lazy-open envelope event from append_activities, orthogonal to lifecycle) @@ -67,9 +69,11 @@ ProcedureActivitiesLogbookOpened, ProcedureCompleted, ProcedureEvent, + ProcedureHeld, ProcedureIterationEnded, 
ProcedureIterationStarted, ProcedureRegistered, + ProcedureResumed, ProcedureStarted, ProcedureTruncated, RecipeExpansionRecorded, @@ -79,6 +83,7 @@ Procedure, ProcedureName, ProcedureStatus, + merge_actuation_kinds, ) @@ -193,6 +198,59 @@ def evolve(state: Procedure | None, event: ProcedureEvent) -> Procedure: ), actuation_kind=prior.actuation_kind, ) + case ProcedureHeld(actuation_kind=held_actuation_kind): + # Operator-pause transition (Running -> Held). Status-only change; + # every non-status field carries verbatim from prior (especially + # the iteration denorms). Mirrors RunHeld. EXCEPT actuation_kind: + # the conduct's observed-so-far kind rides ProcedureHeld and is + # MERGED into state so it survives the hold->resume boundary (a + # reconduct from a boundary past a simulated prefix would otherwise + # complete as Physical and bypass the promote_dataset gate). Merge, + # not set, so a manual operator hold (actuation_kind=None) cannot + # wipe a prior conduct's recorded kind. + prior = require_state(state, "ProcedureHeld") + return Procedure( + id=prior.id, + name=prior.name, + kind=prior.kind, + target_asset_ids=prior.target_asset_ids, + status=ProcedureStatus.HELD, + parent_run_id=prior.parent_run_id, + activity_logbook_id=prior.activity_logbook_id, + capability_id=prior.capability_id, + recipe_id=prior.recipe_id, + current_iteration_index=prior.current_iteration_index, + iteration_count=prior.iteration_count, + consecutive_unconverged_iterations=prior.consecutive_unconverged_iterations, + max_consecutive_unconverged_iterations=( + prior.max_consecutive_unconverged_iterations + ), + actuation_kind=merge_actuation_kinds(prior.actuation_kind, held_actuation_kind), + ) + case ProcedureResumed(): + # Resume transition (Held -> Running). Status-only change; every + # non-status field carries verbatim from prior. The + # re_establishment_boundary rides the event for the Conductor's + # replay, not folded into state. Mirrors RunResumed. 
+ prior = require_state(state, "ProcedureResumed") + return Procedure( + id=prior.id, + name=prior.name, + kind=prior.kind, + target_asset_ids=prior.target_asset_ids, + status=ProcedureStatus.RUNNING, + parent_run_id=prior.parent_run_id, + activity_logbook_id=prior.activity_logbook_id, + capability_id=prior.capability_id, + recipe_id=prior.recipe_id, + current_iteration_index=prior.current_iteration_index, + iteration_count=prior.iteration_count, + consecutive_unconverged_iterations=prior.consecutive_unconverged_iterations, + max_consecutive_unconverged_iterations=( + prior.max_consecutive_unconverged_iterations + ), + actuation_kind=prior.actuation_kind, + ) case ProcedureActivitiesLogbookOpened(logbook_id=logbook_id): # Lazy open-on-first-write: preserve all # prior state, set activity_logbook_id. Status NOT touched -- the diff --git a/apps/api/src/cora/operation/aggregates/procedure/state.py b/apps/api/src/cora/operation/aggregates/procedure/state.py index 4f8bf8367c..cbeca2b2d0 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/state.py +++ b/apps/api/src/cora/operation/aggregates/procedure/state.py @@ -22,9 +22,9 @@ Full FSM (Running / Completed / Aborted / Truncated transitions) + per-step logbook follow. Projection + list_procedures follow. -## ProcedureStatus FSM (locked initial) +## ProcedureStatus FSM - Defined -> Running -> Completed | Aborted | Truncated + Defined -> Running <-> Held -> Completed | Aborted | Truncated REVISED from BC map's `Idle -> Starting -> Running -> Verifying -> Complete | Aborted` per the standards-corpus research at @@ -32,7 +32,9 @@ at FSM level (PackML uses `Completing` for closeout/check work; OPC UA Programs has no Verify state); per-step Check happens within Running; transient states deferred until real async window appears -(Run BC precedent). Held/Resumed deferred to 10c-c per pilot need. +(Run BC precedent). 
`Held` is the operator-pause state for resumable +conduct (Tier 1 of [[project_resumable_conduct_design]]; mirrors +`RunStatus.HELD`). ## Status as enum-in-state, derived-from-event-type-in-evolver @@ -180,9 +182,9 @@ class ProcedureStatus(StrEnum): """The Procedure's lifecycle state. - Five values declared day one for forward-compat - (additive-state pattern; legacy events fold cleanly because - only DEFINED is reachable after register_procedure): + Six values declared for forward-compat (additive-state pattern; + legacy events fold cleanly because only DEFINED is reachable after + register_procedure): - `Defined` -- registration-time genesis; pre-execution. Operator can edit / inspect / submit for @@ -190,25 +192,39 @@ class ProcedureStatus(StrEnum): Cannot accept step events yet. - `Running` -- post-start_procedure. Step events accepted via append_activities. + - `Held` -- operator-paused mid-conduct via hold_procedure + (Running <-> Held, resumable via + resume_procedure). The resumable-conduct + pause state; mirrors `RunStatus.HELD`. No step + events accepted while Held; the conduct is + paused, not advancing. - `Completed` -- happy path via complete_procedure. Strict-not-idempotent. - `Aborted` -- emergency exit via abort_procedure. - `Truncated` -- retroactive cleanup via truncate_procedure. Mirrors RunTruncated. - `Verifying` and `Held / Resumed` are deliberately NOT in this - enum. Per [[project_operation_design]] standards-corpus research: - `Verifying` is NOT standards-blessed at FSM level (PackML uses - `Completing` for closeout/check work; OPC UA Programs has no - Verify state). Per-step Check happens within Running synchronously - (via the Step logbook's check_passed field). Held / Resumed - deferred until pilot operator feedback surfaces a need. + `Verifying` is deliberately NOT in this enum. 
Per + [[project_operation_design]] standards-corpus research: `Verifying` + is NOT standards-blessed at FSM level (PackML uses `Completing` for + closeout/check work; OPC UA Programs has no Verify state). Per-step + Check happens within Running synchronously (via the Step logbook's + check_passed field). + + `Held` lands in Tier 1 of [[project_resumable_conduct_design]]: + operator-pause of a halted conduct, additive to the Layer-1 FSM, + mirroring `RunStatus.HELD` (Procedure is an execution-FSM sibling of + Run). The PackML operator=`Held` / external-blocker=`Suspended` + split is honored: this is the operator-pause, so `Held`, not + `Suspended`. The `HOLDING` / `RESTARTING` transient states are + deliberately omitted (Run-precedent deferral). Naming convention (per Run BC gate review): gerund / adjective for active steady-states (matches PackML / Bluesky); - past-participle for terminals. `Defined` is past-participle (a - procedure WAS defined); `Running` is gerund-as-adjective; the - rest are past-participle terminals. + past-participle for the pause-state and terminals. `Defined` is + past-participle (a procedure WAS defined); `Running` is + gerund-as-adjective; `Held` is past-participle (mirrors + `RunStatus.HELD`); the rest are past-participle terminals. Enum values are PascalCase strings (matches BC-map status vocabulary; log lines and DTOs read naturally without mapping). @@ -216,6 +232,7 @@ class ProcedureStatus(StrEnum): DEFINED = "Defined" RUNNING = "Running" + HELD = "Held" COMPLETED = "Completed" ABORTED = "Aborted" TRUNCATED = "Truncated" @@ -457,6 +474,28 @@ def __init__(self, procedure_id: UUID) -> None: self.procedure_id = procedure_id +class ResolvedStepsRecordNotFoundError(Exception): + """A Held Procedure cannot locate its pinned `ResolvedStepsRecorded` record. + + Raised by the `reconduct_procedure` (resume-and-replay) handler when a + Held Procedure's stream carries no `ResolvedStepsRecorded` event. 
A + conduct pins exactly one at start (while `Defined`), so a conducted + Procedure always has it; its absence is corruption (stream truncation, + a manual event-store write, or a partial-write failure), not an + operational outcome. Kept OUT of the conduct/reconduct failures-in-body + contract (that is for step outcomes like an IOC rejecting a write). + Sibling of `RecipeExpansionRecordNotFoundError`. Mapped to HTTP 500. + """ + + def __init__(self, procedure_id: UUID) -> None: + super().__init__( + f"Procedure {procedure_id} is Held but its pinned " + f"ResolvedStepsRecorded record could not be located; resume " + f"replay cannot proceed." + ) + self.procedure_id = procedure_id + + class RecipeExpansionReplayMismatchError(Exception): """Replay-time hash drift on a recipe-driven Procedure. @@ -744,30 +783,32 @@ def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> Non class ProcedureCannotAbortError(Exception): - """Attempted to abort a Procedure not in `Running`. + """Attempted to abort a Procedure not in `Running` or `Held`. - Single-source guard: `abort_procedure` accepts only `Running` (no - Held state in the Procedure FSM today; deferred to 10c-c per pilot - need). Aborting a `Defined` Procedure raises (use a different - workflow, for example: never start it, then leave it Defined or - extend the FSM with a cancel-defined slice if real); aborting any + Source guard: `abort_procedure` accepts `Running | Held` (a paused + conduct stays abortable; resumable conduct widened the set, mirroring + Run BC's `abort_run`). Aborting a `Defined` Procedure raises (use a + different workflow, for example: never start it, then leave it Defined + or extend the FSM with a cancel-defined slice if real); aborting any terminal raises (strict-not-idempotent). Mapped to HTTP 409. 
""" def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> None: super().__init__( f"Procedure {procedure_id} cannot be aborted: currently in status " - f"{current_status.value}, abort requires {ProcedureStatus.RUNNING.value}" + f"{current_status.value}, abort requires " + f"{ProcedureStatus.RUNNING.value} or {ProcedureStatus.HELD.value}" ) self.procedure_id = procedure_id self.current_status = current_status class ProcedureCannotTruncateError(Exception): - """Attempted to truncate a Procedure not in `Running`. + """Attempted to truncate a Procedure not in `Running` or `Held`. - Single-source guard: `truncate_procedure` accepts only `Running` - today (Held/Resumed deferred to future iteration). Mirrors + Source guard: `truncate_procedure` accepts `Running | Held` (a paused + Procedure that became de-facto dead can be closed retroactively; + resumable conduct widened the set alongside abort). Mirrors `ProcedureCannotAbortError`'s source set: a Defined Procedure hasn't started so there's no execution to truncate; terminal Procedures are already closed (re-truncating a `Truncated` @@ -783,12 +824,78 @@ class ProcedureCannotTruncateError(Exception): def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> None: super().__init__( f"Procedure {procedure_id} cannot be truncated: currently in status " - f"{current_status.value}, truncate requires {ProcedureStatus.RUNNING.value}" + f"{current_status.value}, truncate requires " + f"{ProcedureStatus.RUNNING.value} or {ProcedureStatus.HELD.value}" + ) + self.procedure_id = procedure_id + self.current_status = current_status + + +class ProcedureCannotHoldError(Exception): + """Attempted to hold a Procedure not in `Running`. + + Single-source guard: `hold_procedure` accepts only `Running`. + Re-holding an already-`Held` Procedure raises (strict-not- + idempotent); holding a `Defined` or terminal Procedure raises. + Mirrors `RunCannotHoldError`. 
Hold <-> Resume is bidirectional and + unlimited-cycle: an operator can hold -> resume -> hold repeatedly + within one conduct, each hold requiring an intervening resume. + Mapped to HTTP 409. + """ + + def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> None: + super().__init__( + f"Procedure {procedure_id} cannot be held: currently in status " + f"{current_status.value}, hold requires {ProcedureStatus.RUNNING.value}" ) self.procedure_id = procedure_id self.current_status = current_status +class ProcedureCannotResumeError(Exception): + """Attempted to resume a Procedure that cannot be resumed. + + Two refusal reasons, both HTTP 409: + - status guard: `resume_procedure` accepts only `Held` (the + inverse of hold, which requires `Running`). Resuming an + already-`Running` Procedure raises (strict-not-idempotent); + resuming a `Defined` or terminal Procedure raises. Mirrors + `RunCannotResumeError`. + - off-diagonal guard (`parent_run_held=True`): a Held Procedure + whose parent Run is itself `Held` cannot resume to `Running` + and walk real setpoints while the Run is paused. The + one-directional Operation -> Run read enforces this; there is + NO cascade from Run-resume into Procedure-resume (that is a + Layer-3 saga, deferred). See [[project_resumable_conduct_design]]. + + `parent_run_held` distinguishes the two for operator-facing + messaging; `current_status` is carried in both cases. + """ + + def __init__( + self, + procedure_id: UUID, + current_status: "ProcedureStatus", + *, + parent_run_held: bool = False, + ) -> None: + if parent_run_held: + message = ( + f"Procedure {procedure_id} cannot be resumed: its parent Run is " + f"{ProcedureStatus.HELD.value}. Resume the Run first; CORA does not " + f"cascade a Run resume into its Procedures." 
+ ) + else: + message = ( + f"Procedure {procedure_id} cannot be resumed: currently in status " + f"{current_status.value}, resume requires {ProcedureStatus.HELD.value}" + ) + super().__init__(message) + self.procedure_id = procedure_id + self.current_status = current_status + self.parent_run_held = parent_run_held + + class ProcedureCannotStartIterationError(Exception): """Attempted to start an iteration that fails a start-gate. @@ -835,7 +942,9 @@ class ProcedureCannotEndIterationError(Exception): """Attempted to end an iteration that fails an end-gate. Raised by the `end_iteration` decider when: - - The Procedure is not in `Running`. + - The Procedure is not in `Running` or `Held` (an open iteration can + be closed even while the conduct is paused; resumable conduct + widened the source set, but `start_iteration` stays Running-only). - No iteration is currently open (`current_iteration_index` is None); there is nothing to end. - The supplied `iteration_index` does not match the open @@ -1043,6 +1152,50 @@ def __init__(self, value: str) -> None: self.value = value +class InvalidProcedureHoldReasonError(ValueError): + """The supplied hold reason is empty, whitespace-only, or too long. + + Validated at the API boundary via Pydantic min_length / max_length, + AND defensively at the decider via the `ProcedureHoldReason` VO so + direct in-process callers (sagas, tests) get the same protection. + Sibling of `InvalidProcedureAbortReasonError`; distinct class for + BC-local HTTP-status registration. Mapped to HTTP 400. + + Unlike `RunHeld` (slim, no reason: a routine Run pause), a Procedure + hold carries a required reason because pausing a halted conduct is a + deliberate, high-information operator act (matching + `AgentSuspended.reason` and [[project_resumable_conduct_design]]). + The state NAME mirrors Run (`Held`); the reason payload follows the + operator-pause-with-context precedent. 
+ """ + + def __init__(self, value: str) -> None: + super().__init__( + f"Procedure hold reason must be 1-{REASON_MAX_LENGTH} chars " + f"after trimming (got: {value!r})" + ) + self.value = value + + +class InvalidProcedureReEstablishmentBoundaryError(ValueError): + """The supplied resume re-establishment boundary is negative. + + `re_establishment_boundary` is the index in the pinned resolved + step list from which resume re-drives setpoints + re-runs checks. It + must be >= 0 (a step position; 0 means re-establish from the very + first step). Validated at the API boundary via Pydantic `ge=0` AND + defensively at the `resume_procedure` decider. The upper bound + (boundary vs step-list length) is NOT enforced by the decider (the + step list is not folded into Procedure state); `execute_from` replays + an empty tail when boundary >= len(steps). Mapped to HTTP 400. See + [[project_resumable_conduct_design]]. + """ + + def __init__(self, value: int) -> None: + super().__init__(f"re_establishment_boundary must be >= 0 (got: {value})") + self.value = value + + @bounded_name( max_length=PROCEDURE_NAME_MAX_LENGTH, error_class=InvalidProcedureNameError, @@ -1099,6 +1252,29 @@ def __post_init__(self) -> None: object.__setattr__(self, "value", trimmed) +@dataclass(frozen=True) +class ProcedureHoldReason: + """Free-form hold reason. Trimmed; 1-500 chars. + + Sibling of `ProcedureAbortReason`; same shape (trimmed + bounded), + distinct class for BC-local HTTP-status registration. The + on-the-wire representation in `ProcedureHeld.reason` is `str` + (post-trim); the VO exists at decider-input time only. A Procedure + hold REQUIRES a reason (unlike Run's slim `RunHeld`); see + `InvalidProcedureHoldReasonError`. 
+ """ + + value: str + + def __post_init__(self) -> None: + trimmed = validate_bounded_text( + self.value, + max_length=REASON_MAX_LENGTH, + error_class=InvalidProcedureHoldReasonError, + ) + object.__setattr__(self, "value", trimmed) + + @dataclass(frozen=True) class Procedure: """Aggregate root: one execution of an episodic operational task. @@ -1247,3 +1423,34 @@ class Procedure: enum; state stores the raw string (cross-BC string-snapshot seam, mirroring how the Data BC stores it). Additive-state default None: legacy + pre-activation streams fold cleanly.""" + + +def merge_actuation_kinds(first: str | None, second: str | None) -> str | None: + """Combine two observed actuation-kind values into the honest aggregate kind. + + Mirrors the Conductor `_ActuationObserver`'s flag collapse, but over the + persisted raw string values (an `ActuationKind` value or None) so a resume + can fold the PRE-HOLD conduct's observed kind (carried on `ProcedureHeld`) + with the replay tail's kind before the terminal event. Without this, a + reconduct from a boundary past a simulated prefix would complete as + `Physical` and slip past the `promote_dataset` Simulated/Hybrid gate. None + contributes nothing; a `Physical` + `Simulated` mix (or either with + `Hybrid`) collapses to `Hybrid`. 
Pure + no `ActuationKind` import: the + aggregate stores the raw string by design (the cross-BC snapshot seam).""" + simulated_seen = False + physical_seen = False + for kind in (first, second): + if kind == "Simulated": + simulated_seen = True + elif kind == "Physical": + physical_seen = True + elif kind == "Hybrid": + simulated_seen = True + physical_seen = True + if simulated_seen and physical_seen: + return "Hybrid" + if simulated_seen: + return "Simulated" + if physical_seen: + return "Physical" + return None diff --git a/apps/api/src/cora/operation/conductor.py b/apps/api/src/cora/operation/conductor.py index 5285e9ca74..8e9197f8e1 100644 --- a/apps/api/src/cora/operation/conductor.py +++ b/apps/api/src/cora/operation/conductor.py @@ -37,6 +37,29 @@ False, "reason": "out_of_range"}`); the Conductor treats any return from a body as success-shaped at this tier. +## Resume (execute_from) + +`execute_from` replays the PINNED resolved step list from a re-establishment +boundary rather than re-deriving the step list: re-drive setpoints, re-run +checks as fresh gates, and halt-for-operator on an acquisition +(`ActionStep`). It is the Tier-1 resumable-conduct primitive +([[project_resumable_conduct_design]]); the step list comes from +`ResolvedStepsRecorded` parsed via `steps_from_payload`. Like `execute` +it drives no FSM transition. + +## Pre-effect in-flight marker (side-effecting steps) + +A setpoint and an action are side-effecting: each records a SEPARATE +`result="in_flight"` step entry BEFORE the effect runs, then the +`ok` / `failed` outcome entry after. This doubles the per-step append +count for those two kinds. A check is a pure read (always safe to +re-run), so it records no marker, only its single outcome entry. 
The +marker is the resume substrate: an `in_flight` entry with no matching +outcome for the same `step_index` is the one step that was mid-flight +when a conduct halted, even if the halt was a crash or cancellation +(the marker append completes before the effect). See +[[project_resumable_conduct_design]] Tier 1. + ## Check semantics A `CheckStep` carries an address + an acceptance criterion. The @@ -90,8 +113,9 @@ import asyncio import contextlib from collections.abc import AsyncIterator, Awaitable, Callable, Mapping, Sequence -from dataclasses import dataclass, field -from typing import Any, Protocol +from dataclasses import dataclass, field, replace +from enum import StrEnum +from typing import Any, Protocol, cast from uuid import UUID from cora.infrastructure.ports.clock import Clock @@ -99,7 +123,7 @@ from cora.infrastructure.ports.id_generator import IdGenerator from cora.infrastructure.routing import NIL_SENTINEL_ID from cora.operation._control_dispatch_context import with_dispatch_correlation_id -from cora.operation.aggregates.procedure import ProcedureNotFoundError +from cora.operation.aggregates.procedure import ProcedureNotFoundError, merge_actuation_kinds from cora.operation.errors import CheckFailedError, UnauthorizedError, UnknownActionError from cora.operation.features.abort_procedure.command import AbortProcedure from cora.operation.features.abort_procedure.handler import Handler as AbortProcedureHandler @@ -114,6 +138,10 @@ from cora.operation.features.complete_procedure.handler import ( Handler as CompleteProcedureHandler, ) +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.hold_procedure.handler import Handler as HoldProcedureHandler +from cora.operation.features.resume_procedure.command import ResumeProcedure +from cora.operation.features.resume_procedure.handler import Handler as ResumeProcedureHandler from cora.operation.features.start_procedure.command import StartProcedure from 
cora.operation.features.start_procedure.handler import Handler as StartProcedureHandler from cora.operation.ports.control_port import ( @@ -197,8 +225,50 @@ read-side filters can separate successful vs failed steps without parsing the message string.""" +_RESULT_IN_FLIGHT = "in_flight" +"""Pre-effect in-flight marker discriminator, written to a SEPARATE +step entry BEFORE a side-effecting step (setpoint / action) actuates, +then followed by the `ok` / `failed` outcome entry after. A check is a +pure read (always safe to re-run), so it records NO marker -- only its +single outcome entry. + +The marker is what lets a future resume identify the one step that was +mid-flight when a conduct halted: an `in_flight` entry with no matching +outcome entry for the same `step_index` is the interrupted step. The +marker is recorded even when the effect then raises or is cancelled +(the marker append completes before the effect runs); that is the +point -- a crashed write leaves a marker-without-outcome behind so the +step is recoverable. See [[project_resumable_conduct_design]] Tier 1.""" + _QUALITY_GOOD = "Good" +_RESUME_HALT_ERROR_CLASS = "AcquisitionResumeRequiresOperator" +"""`error_class` on the `ConductorFailure` that `execute_from` returns when +a resume reaches an `ActionStep` (an acquisition). It is NOT an exception +and NOT a step failure: re-running an interrupted acquisition is +non-idempotent (fly-scan triggers are one-shot, a mid-arm collect reads +identically for finished / aborted / never-armed), so resume HALTS and +hands the decision (redo-fresh vs reseed) back to the operator rather than +auto-skipping or auto-rerunning. See [[project_resumable_conduct_design]].""" + + +class ResumePolicy(StrEnum): + """How `execute_from` re-establishes state while replaying a step-list tail. 
+ + `RE_ESTABLISH` (the only member today): re-drive setpoints (absolute + writes are idempotent; CORA has no relative-setpoint type), re-run + checks as fresh gates, and HALT on an acquisition (`ActionStep`) for an + operator decision. This is the locked default per + [[project_resumable_conduct_design]]. + + A future `COMPARE` member (read-and-compare instead of re-drive) is an + Anti-hook-until-lease: its single-writer guarantee is unsatisfiable on a + multi-writer floor until a Conduit/Surface write-ownership lease exists, + so it is deliberately absent rather than stubbed. + """ + + RE_ESTABLISH = "re_establish" + @dataclass(frozen=True) class SetpointStep: @@ -409,12 +479,21 @@ class ConductorResult: (any simulator touch is disqualifying), so the failure-path result still reports the kind. Do not "fix" the observe-before-dispatch ordering in `_ActuationObserver` without revisiting this contract. + + `held` is True ONLY when `try_conduct` paused the Procedure to `Held` + on a recoverable step failure (and the hold transition itself + succeeded). Every other path (`execute` / `conduct` / `execute_from` + / `reconduct`, and a `try_conduct` whose hold itself failed) leaves it + False. It reflects the ACTUAL transition, not the mere recoverability + of the failure, so a caller can distinguish a resumable `Held` outcome + from a terminal `Aborted` one (both carry `succeeded=False` + `failure`). 
""" procedure_id: UUID completed_count: int failure: ConductorFailure | None = None actuation_kind: ActuationKind | None = None + held: bool = False @property def succeeded(self) -> bool: @@ -509,6 +588,8 @@ def __init__( start_procedure: StartProcedureHandler | None = None, complete_procedure: CompleteProcedureHandler | None = None, abort_procedure: AbortProcedureHandler | None = None, + resume_procedure: ResumeProcedureHandler | None = None, + hold_procedure: HoldProcedureHandler | None = None, ) -> None: self._control_port = control_port self._append_step = append_step @@ -518,6 +599,8 @@ def __init__( self._start_procedure = start_procedure self._complete_procedure = complete_procedure self._abort_procedure = abort_procedure + self._resume_procedure = resume_procedure + self._hold_procedure = hold_procedure async def execute( self, @@ -573,6 +656,104 @@ async def execute( actuation_kind=observer.actuation_kind, ) + async def execute_from( + self, + *, + procedure_id: UUID, + principal_id: UUID, + correlation_id: UUID, + steps: Sequence[Step], + boundary: int, + policy: ResumePolicy = ResumePolicy.RE_ESTABLISH, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> ConductorResult: + """Resume a halted conduct by REPLAYING the pinned resolved steps from `boundary`. + + `steps` is the FINAL resolved step list pinned on + `ResolvedStepsRecorded` at first conduct (parse the event's + `resolved_steps` back via `steps_from_payload`). Resume NEVER + re-derives the step list -- a re-derived list could silently skip or + mis-target a step (the end-of-run "home to 0" aliasing the + start-of-run "home to 0" after an index shift). It replays + `steps[boundary:]` verbatim: + + - `SetpointStep` -> RE-DRIVE (idempotent absolute write). The + recorded `step_index` is the ABSOLUTE position in the step list, so the + replayed journal lines up with the original conduct. 
+ - `CheckStep` -> RE-RUN as a fresh gate (a passing check proves + "now", not "continuously", so it is re-evaluated). + - `ActionStep` -> HALT for an operator decision (an interrupted + acquisition is non-idempotent; see `_RESUME_HALT_ERROR_CLASS`). + The action is NOT executed and NOTHING is recorded for it; the + returned `ConductorResult.failure` carries the halt so the + caller (a resume orchestrator) routes the decision. + + `boundary` is the re-establishment boundary from `ProcedureResumed`: + the index from which re-drive + re-run resumes. `boundary >= + len(steps)` replays an empty tail (a no-op resume). Like + `execute`, this drives no FSM transition; it walks + records. + + See [[project_resumable_conduct_design]] Tier 1. + """ + if boundary < 0: + msg = f"boundary must be >= 0 (got {boundary})" + raise ValueError(msg) + if policy is not ResumePolicy.RE_ESTABLISH: # pragma: no cover - only member today + msg = f"unsupported resume policy: {policy}" + raise ValueError(msg) + envelope = _Envelope( + procedure_id=procedure_id, + principal_id=principal_id, + correlation_id=correlation_id, + causation_id=causation_id, + surface_id=surface_id, + ) + observer = _ActuationObserver(self._control_port) + completed = 0 + for index in range(boundary, len(steps)): + step = steps[index] + if isinstance(step, ActionStep): + # Halt-for-operator: do not re-run an interrupted acquisition. 
+ return ConductorResult( + procedure_id=procedure_id, + completed_count=completed, + failure=ConductorFailure( + step_index=index, + source_kind=_STEP_KIND_ACTION, + target=step.name, + error_class=_RESUME_HALT_ERROR_CLASS, + message=( + f"resume halted at step {index} (action {step.name!r}): an " + "interrupted acquisition needs an operator decision " + "(redo-fresh vs reseed); not auto-rerun, not auto-skipped" + ), + ), + actuation_kind=observer.actuation_kind, + ) + with with_dispatch_correlation_id(correlation_id): + if isinstance(step, SetpointStep): + failure = await self._run_setpoint( + step, index=index, envelope=envelope, port=observer + ) + else: + failure = await self._run_check( + step, index=index, envelope=envelope, port=observer + ) + if failure is not None: + return ConductorResult( + procedure_id=procedure_id, + completed_count=completed, + failure=failure, + actuation_kind=observer.actuation_kind, + ) + completed += 1 + return ConductorResult( + procedure_id=procedure_id, + completed_count=completed, + actuation_kind=observer.actuation_kind, + ) + async def conduct( self, *, @@ -711,7 +892,7 @@ async def conduct( # that is what the caller needs to triage. failure = result.failure assert failure is not None # not result.succeeded implies failure - reason = _derive_abort_reason(failure) + reason = _derive_failure_reason(failure) with contextlib.suppress(Exception): await self._abort_procedure( AbortProcedure( @@ -727,6 +908,306 @@ async def conduct( ) return result + async def try_conduct( + self, + *, + procedure_id: UUID, + principal_id: UUID, + correlation_id: UUID, + steps: Sequence[Step], + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> ConductorResult: + """Drive the lifecycle like `conduct()`, but PAUSE to Held on a recoverable failure. + + The pause-capable twin of `conduct()`. 
Identical start -> execute -> + complete-on-success path; the only divergence is the failure branch: + + - a RECOVERABLE step failure (setpoint / check: re-drivable / + re-runnable on resume) -> best-effort `hold_procedure` (Running -> + Held). On a successful hold the result carries `held=True` so the + caller can offer `reconduct`; if the hold itself fails the + Procedure is left Running (same posture as conduct's best-effort + abort that fails) and `held` stays False. + - a NON-recoverable step failure (an action: an interrupted + acquisition is not auto-resumable, Tier 2) -> best-effort + `abort_procedure`, exactly like `conduct()`. Holding here would + strand a Procedure whose replay tail starts with an acquisition + that `reconduct` can only halt-for-operator on. + - lifecycle failures (start / complete rejected) and a mid-execute + `CancelledError` keep `conduct()`'s behavior verbatim (no hold). + + Requires `start_procedure` + `complete_procedure` + `abort_procedure` + + `hold_procedure` handlers at __init__; raises `RuntimeError` (a + wiring bug) otherwise. + + This is the Tier-1 producer that makes a Held + pinned-resolved-steps + state reachable, so the `reconduct` resume path has something to + resume. See [[project_resumable_conduct_design]] Tier 1. + """ + if ( + self._start_procedure is None + or self._complete_procedure is None + or self._abort_procedure is None + or self._hold_procedure is None + ): + raise RuntimeError( + "Conductor.try_conduct() requires start_procedure + complete_procedure + " + "abort_procedure + hold_procedure handlers at __init__; only execute() is " + "available without them." 
+ ) + envelope_kwargs: dict[str, Any] = { + "principal_id": principal_id, + "correlation_id": correlation_id, + "causation_id": causation_id, + "surface_id": surface_id, + } + try: + await self._start_procedure( + StartProcedure(procedure_id=procedure_id), **envelope_kwargs + ) + except _LIFECYCLE_RERAISE: + raise + except Exception as exc: + return ConductorResult( + procedure_id=procedure_id, + completed_count=0, + failure=ConductorFailure( + step_index=None, + source_kind=_SOURCE_KIND_LIFECYCLE, + target=_LIFECYCLE_TARGET_START, + error_class=type(exc).__name__, + message=str(exc), + ), + ) + try: + result = await self.execute( + procedure_id=procedure_id, + principal_id=principal_id, + correlation_id=correlation_id, + steps=steps, + causation_id=causation_id, + surface_id=surface_id, + ) + except asyncio.CancelledError: + # Mirror conduct(): best-effort abort so the FSM is not orphaned in + # Running, then re-raise. A cancellation is not a recoverable step + # failure, so it aborts rather than pausing to Held. 
+ with contextlib.suppress(Exception): + await self._abort_procedure( + AbortProcedure(procedure_id=procedure_id, reason="cancelled mid-execute"), + **envelope_kwargs, + ) + raise + actuation_kind = result.actuation_kind.value if result.actuation_kind is not None else None + if result.succeeded: + try: + await self._complete_procedure( + CompleteProcedure(procedure_id=procedure_id, actuation_kind=actuation_kind), + **envelope_kwargs, + ) + except _LIFECYCLE_RERAISE: + raise + except Exception as exc: + return ConductorResult( + procedure_id=procedure_id, + completed_count=result.completed_count, + failure=ConductorFailure( + step_index=None, + source_kind=_SOURCE_KIND_LIFECYCLE, + target=_LIFECYCLE_TARGET_COMPLETE, + error_class=type(exc).__name__, + message=str(exc), + ), + ) + return result + failure = result.failure + assert failure is not None # not result.succeeded implies failure + if _is_recoverable_failure(failure): + # Pause-to-Held instead of abort: a setpoint / check failure is + # re-drivable / re-runnable, so keep the conduct resumable. The + # hold is best-effort: if it fails, leave the Procedure Running + # (held stays False) and surface the original step failure, the + # same posture as conduct()'s best-effort abort that fails. + held_ok = False + with contextlib.suppress(Exception): + await self._hold_procedure( + HoldProcedure( + procedure_id=procedure_id, + reason=_derive_failure_reason(failure), + # Carry the observed-so-far kind so a later reconduct + # folds the pre-hold provenance with the replay tail. + actuation_kind=actuation_kind, + ), + **envelope_kwargs, + ) + held_ok = True + if held_ok: + return ConductorResult( + procedure_id=procedure_id, + completed_count=result.completed_count, + failure=failure, + actuation_kind=result.actuation_kind, + held=True, + ) + return result + # Non-recoverable step failure (action): best-effort abort, exactly + # like conduct(). 
Holding would strand a Procedure whose replay tail + # starts with an interrupted acquisition. + with contextlib.suppress(Exception): + await self._abort_procedure( + AbortProcedure( + procedure_id=procedure_id, + reason=_derive_failure_reason(failure), + actuation_kind=actuation_kind, + ), + **envelope_kwargs, + ) + return result + + async def reconduct( + self, + *, + procedure_id: UUID, + principal_id: UUID, + correlation_id: UUID, + steps: Sequence[Step], + boundary: int, + prior_actuation_kind: str | None = None, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> ConductorResult: + """Resume a Held Procedure and REPLAY its pinned resolved steps from `boundary`. + + The resume twin of `conduct()`: where `conduct()` drives + start -> execute -> complete | abort, this drives + resume -> execute_from -> complete | (leave Running) | abort. + + 1. Issue `resume_procedure` (transitions Held -> Running). Its OWN + authz + off-diagonal parent-Run-Held guard fire here; a non-Held + Procedure or a held parent Run raises `ProcedureCannotResumeError` + which PROPAGATES (mapped to 409 at the route) rather than landing + in the result body. A refused resume is a guard outcome, not a + replay outcome, and no replay has happened yet. + 2. Call `self.execute_from(steps, boundary)`: re-drive setpoints, + re-run checks, halt-for-operator on an acquisition. + 3. Terminalize three-way: + - clean tail (incl. empty) -> `complete_procedure` (Completed). + - acquisition halt -> NO transition; the Procedure stays Running + and the operator decides redo-fresh vs reseed from the result. + - genuine step failure -> best-effort `abort_procedure` (if the + abort itself fails, the original step failure is what + surfaces, mirroring `conduct()`). + + `steps` is the parsed `ResolvedStepsRecorded.resolved_steps`: the + caller locates + parses the PINNED record (resume NEVER re-derives the + step list). 
`boundary` is single-sourced: it rides into both + `ProcedureResumed.re_establishment_boundary` (audit) and + `execute_from(boundary=...)` (replay). + + Requires `resume_procedure` + `complete_procedure` + `abort_procedure` + handlers at __init__; raises `RuntimeError` (a wiring bug) otherwise. + + Unlike `conduct()`, this does NOT best-effort-abort on a mid-replay + `CancelledError`: a cancellation after the resume leaves the Procedure + Running with partial replay history, the same posture as the + acquisition-halt branch (the operator reconciles). See + [[project_resumable_conduct_design]] Tier 1. + """ + if ( + self._resume_procedure is None + or self._complete_procedure is None + or self._abort_procedure is None + ): + raise RuntimeError( + "Conductor.reconduct() requires resume_procedure + complete_procedure + " + "abort_procedure handlers at __init__; only execute_from() is available " + "without them." + ) + envelope_kwargs: dict[str, Any] = { + "principal_id": principal_id, + "correlation_id": correlation_id, + "causation_id": causation_id, + "surface_id": surface_id, + } + # Held -> Running. Refusals (not-Held / held parent Run / authz deny / + # not-found) propagate to the route as their mapped HTTP codes; no + # replay has happened, so they are NOT swallowed into the result body. + await self._resume_procedure( + ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + **envelope_kwargs, + ) + result = await self.execute_from( + procedure_id=procedure_id, + principal_id=principal_id, + correlation_id=correlation_id, + steps=steps, + boundary=boundary, + causation_id=causation_id, + surface_id=surface_id, + ) + # Fold the pre-hold conduct's kind (carried on the Held procedure, + # passed in by the handler) with the replay tail's observed kind, so a + # boundary>0 resume past a simulated prefix does not complete as + # Physical and bypass the promote_dataset gate. 
boundary=0 re-observes + # everything, so the merge is a no-op there. + tail_actuation_kind = ( + result.actuation_kind.value if result.actuation_kind is not None else None + ) + actuation_kind = merge_actuation_kinds(prior_actuation_kind, tail_actuation_kind) + # Report the merged kind on the result too, so the response body matches + # the kind threaded onto the terminal event (not just the replay tail). + merged_result = replace( + result, + actuation_kind=(ActuationKind(actuation_kind) if actuation_kind is not None else None), + ) + if result.succeeded: + # Clean tail (incl. empty tail): auto-complete, threading the + # merged observed kind onto ProcedureCompleted (Data BC gate carrier). + try: + await self._complete_procedure( + CompleteProcedure(procedure_id=procedure_id, actuation_kind=actuation_kind), + **envelope_kwargs, + ) + except _LIFECYCLE_RERAISE: + raise + except Exception as exc: + return ConductorResult( + procedure_id=procedure_id, + completed_count=result.completed_count, + failure=ConductorFailure( + step_index=None, + source_kind=_SOURCE_KIND_LIFECYCLE, + target=_LIFECYCLE_TARGET_COMPLETE, + error_class=type(exc).__name__, + message=str(exc), + ), + ) + return merged_result + if is_acquisition_halt(result.failure): + # Halt-for-operator: leave the Procedure Running; no transition. + # RESIDUAL: the replay tail's observed kind is NOT persisted here + # (no terminal event), so a later manual complete/abort -- which + # SETs actuation_kind from the command, not merges -- could stamp + # over a tail simulator touch. Narrower than the hold->resume gap + # this method closes; the design-memo second-writer hazard, aligned + # with the Tier-2 acquisition-decomposition deferral. + return merged_result + # Genuine step failure: best-effort abort (if abort itself fails, the + # original step failure is what surfaces). Mirrors conduct(). 
+ failure = result.failure + assert failure is not None # not succeeded + not halt -> failure + with contextlib.suppress(Exception): + await self._abort_procedure( + AbortProcedure( + procedure_id=procedure_id, + reason=_derive_failure_reason(failure), + actuation_kind=actuation_kind, + ), + **envelope_kwargs, + ) + return merged_result + async def _dispatch( self, step: Step, @@ -756,6 +1237,16 @@ async def _run_setpoint( port: ControlPort, ) -> ConductorFailure | None: payload_body: dict[str, Any] = {"address": step.address, "value": step.value} + # Pre-effect in-flight marker (side-effecting step): record intent + # BEFORE the write so a halt mid-write leaves a marker-without-outcome + # the resume reader can identify. See `_RESULT_IN_FLIGHT`. + await self._record( + envelope=envelope, + index=index, + step_kind=_STEP_KIND_SETPOINT, + body=payload_body, + result=_RESULT_IN_FLIGHT, + ) try: await port.write(step.address, step.value, wait=True) except _CONTROL_ERRORS as exc: @@ -817,6 +1308,18 @@ async def _run_action( port: ControlPort, ) -> ConductorFailure | None: payload_body: dict[str, Any] = {"name": step.name, "params": dict(step.params)} + # Pre-effect in-flight marker (side-effecting step): record intent + # BEFORE the action body runs so a halt mid-action leaves a + # marker-without-outcome the resume reader can identify. An unknown + # action still records the marker (the step kind is side-effecting) + # then its failure outcome. See `_RESULT_IN_FLIGHT`. + await self._record( + envelope=envelope, + index=index, + step_kind=_STEP_KIND_ACTION, + body=payload_body, + result=_RESULT_IN_FLIGHT, + ) body = self._action_registry.lookup(step.name) if body is None: exc = UnknownActionError(step.name) @@ -969,7 +1472,7 @@ async def _record( `index` is the step's zero-based position in the conducted step list; it rides the payload as `step_index` so a future resume can map a recorded outcome back to its position in the pinned - conduct manifest. 
+ resolved step list. """ payload: dict[str, Any] = {**body, "step_index": index, "result": result} if error_class is not None: @@ -1020,11 +1523,12 @@ def _criterion_to_dict(criterion: CheckCriterion) -> dict[str, Any]: def step_to_payload(step: Step) -> dict[str, Any]: - """Serialize a `Step` to a JSON-clean dict (inverse of `step_from_wire`). + """Serialize a `Step` to a JSON-clean dict (inverse of `steps_from_payload`). Mirrors the conduct route's wire shape (the `kind` discriminator + field names) so the resolved step list pinned on `ResolvedStepsRecorded` - round-trips back to `Step` objects via `step_from_wire` at resume. A + round-trips back to `Step` objects via `steps_from_payload` at resume + (and via the route's Pydantic `step_from_wire` on the live HTTP path). A tuple `value` serializes as a list (JSON has no tuple); the criterion reuses `_criterion_to_dict` so the wire shape stays single-sourced. """ @@ -1045,6 +1549,83 @@ def step_to_payload(step: Step) -> dict[str, Any]: } +def _criterion_from_dict(criterion: Mapping[str, Any]) -> CheckCriterion: + """Rebuild a `CheckCriterion` from its `_criterion_to_dict` shape.""" + kind = criterion["kind"] + if kind == "equals": + expected: Any = criterion["expected"] + if isinstance(expected, list): + expected = cast("tuple[Any, ...]", tuple(expected)) # pyright: ignore[reportUnknownArgumentType] + return EqualsCriterion(expected=expected) + if kind == "within_tolerance": + return WithinToleranceCriterion( + expected=criterion["expected"], tolerance=criterion["tolerance"] + ) + msg = f"unknown criterion kind: {kind!r}" + raise ValueError(msg) + + +def _step_from_payload(payload: Mapping[str, Any]) -> Step: + """Rebuild one `Step` from its `step_to_payload` wire shape.""" + kind = payload["kind"] + if kind == "setpoint": + value: Any = payload["value"] + if isinstance(value, list): + value = cast("tuple[Any, ...]", tuple(value)) # pyright: ignore[reportUnknownArgumentType] + return SetpointStep( + 
address=payload["address"], value=value, verify=payload.get("verify", False) + ) + if kind == "action": + return ActionStep(name=payload["name"], params=dict(payload.get("params", {}))) + if kind == "check": + return CheckStep( + address=payload["address"], criterion=_criterion_from_dict(payload["criterion"]) + ) + msg = f"unknown step kind: {kind!r}" + raise ValueError(msg) + + +def steps_from_payload(resolved_steps: Sequence[Mapping[str, Any]]) -> tuple[Step, ...]: + """Parse the pinned `ResolvedStepsRecorded.resolved_steps` back into `Step`s. + + The exact inverse of `step_to_payload` (the serialization used to pin the + resolved step list). A resume reads the pinned event's `resolved_steps`, + parses them with this helper, and hands the result to + `Conductor.execute_from` -- it NEVER re-derives the step list from live + `Plan.wires` / partition rules. Pure; no Pydantic (that lives at the HTTP + boundary in `step_from_wire`). See [[project_resumable_conduct_design]]. + """ + return tuple(_step_from_payload(step) for step in resolved_steps) + + +def is_acquisition_halt(failure: ConductorFailure | None) -> bool: + """True iff `failure` is `execute_from`'s halt-for-operator on an acquisition. + + Distinguishes the resume halt (an `ActionStep` reached during replay, + which is a needs-operator-decision hand-off, NOT a failure) from a + genuine step failure (a setpoint/check that failed). A resume + orchestration completes on success, leaves the Procedure Running on an + acquisition halt, and aborts on a genuine failure -- this predicate is + the branch. See `_RESUME_HALT_ERROR_CLASS` and + [[project_resumable_conduct_design]].""" + return failure is not None and failure.error_class == _RESUME_HALT_ERROR_CLASS + + +def _is_recoverable_failure(failure: ConductorFailure) -> bool: + """True iff a conduct step failure is safe to PAUSE-and-resume, not abort. 
+ + Recoverable = a setpoint or check failure: on `reconduct` a setpoint is + re-driven (idempotent absolute write) and a check is re-run as a fresh + gate, so the conduct can honestly continue from the boundary. An action + failure is NOT recoverable here: an interrupted acquisition is + non-idempotent (Tier 2 per-point decomposition is the real fix), and a + Held Procedure whose replay tail starts with that acquisition could only + halt-for-operator on `reconduct`. This is `try_conduct`'s hold-vs-abort + branch; lifecycle failures never reach it (handled before the step-failure + branch). See [[project_resumable_conduct_design]] Tier 1.""" + return failure.source_kind in (_STEP_KIND_SETPOINT, _STEP_KIND_CHECK) + + def _criterion_matches(criterion: CheckCriterion, value: Any) -> bool: """True iff `value` satisfies `criterion`. @@ -1068,14 +1649,15 @@ def _mismatch_reason(criterion: CheckCriterion, value: Any) -> str: return f"value {value!r} not within {criterion.tolerance} of expected {criterion.expected}" -def _derive_abort_reason(failure: ConductorFailure) -> str: - """Build a Procedure-aggregate-compliant abort reason from a step failure. +def _derive_failure_reason(failure: ConductorFailure) -> str: + """Build a Procedure-aggregate-compliant reason string from a step failure. - Truncates to `REASON_MAX_LENGTH` so the AbortProcedure - handler does not reject the cleanup call. The format leads with - the step pointer (kind + index + target) so an operator scanning - the abort reason knows immediately which step in the conducted - sequence killed the Procedure. + Used for both the abort path (`conduct` / `reconduct`) and the + pause-to-Held path (`try_conduct`). Truncates to `REASON_MAX_LENGTH` so + the AbortProcedure / HoldProcedure handler does not reject the call. The + format leads with the step pointer (kind + index + target) so an operator + scanning the reason knows immediately which step in the conducted sequence + halted the Procedure. 
""" if failure.step_index is None: prefix = f"{failure.source_kind} {failure.target}" @@ -1113,8 +1695,11 @@ def _reading_to_dict(reading: Reading) -> dict[str, Any]: "ConductorResult", "EqualsCriterion", "InMemoryActionRegistry", + "ResumePolicy", "SetpointStep", "Step", "WithinToleranceCriterion", + "is_acquisition_halt", "step_to_payload", + "steps_from_payload", ] diff --git a/apps/api/src/cora/operation/features/__init__.py b/apps/api/src/cora/operation/features/__init__.py index 3d2669bf13..ca82555e5b 100644 --- a/apps/api/src/cora/operation/features/__init__.py +++ b/apps/api/src/cora/operation/features/__init__.py @@ -25,11 +25,18 @@ - `truncate_procedure` (Running -> Truncated; partial-data terminal mirroring RunTruncated; reason + optional interrupted_at) +Resumable-conduct pause/resume pair (Tier 1 of +[[project_resumable_conduct_design]]; the state name mirrors +`RunStatus.HELD`): + - `hold_procedure` (Running -> Held; operator-pause of a halted + conduct, required reason) + - `resume_procedure` (Held -> Running; carries the + `re_establishment_boundary` the Conductor replays from) + Read side: - projection (`proj_operation_procedure_summary`) + `list_procedures` (cursor-paginated; status / kind / parent_run_id / target_asset_id filters) - - Held / Resumed only if pilot operator feedback surfaces a need """ from cora.operation.features import ( @@ -37,8 +44,11 @@ append_activities, complete_procedure, get_procedure, + hold_procedure, list_procedures, + reconduct_procedure, register_procedure, + resume_procedure, start_procedure, truncate_procedure, ) @@ -48,8 +58,11 @@ "append_activities", "complete_procedure", "get_procedure", + "hold_procedure", "list_procedures", + "reconduct_procedure", "register_procedure", + "resume_procedure", "start_procedure", "truncate_procedure", ] diff --git a/apps/api/src/cora/operation/features/abort_procedure/decider.py b/apps/api/src/cora/operation/features/abort_procedure/decider.py index 88bca4d23e..888aa528a7 100644 
--- a/apps/api/src/cora/operation/features/abort_procedure/decider.py +++ b/apps/api/src/cora/operation/features/abort_procedure/decider.py @@ -1,9 +1,10 @@ """Pure decider for the `AbortProcedure` command. -Single-source emergency-exit terminal: `Running -> Aborted`. Source -set is just `Running` today (Held / Resumed deferred to 10c-c per -pilot need; if Held lands, this source set widens to `Running | Held` -to mirror Run BC's `abort_run` precedent). +Multi-source emergency-exit terminal: `Running | Held -> Aborted`. +`Held` was added when resumable conduct landed +([[project_resumable_conduct_design]] Tier 1); abort widened to accept +it so a paused Procedure stays abortable rather than stranded. Mirrors +Run BC's `abort_run` (`Running | Held` source set). `reason` validation goes through the `ProcedureAbortReason` VO (which calls the shared `validate_bounded_text` helper). The on-the-wire @@ -13,7 +14,7 @@ - State must not be None -> ProcedureNotFoundError - command.reason must be 1-500 chars after trimming -> InvalidProcedureAbortReasonError - - State.status must be in {Running} + - State.status must be in {Running, Held} -> ProcedureCannotAbortError(current_status=...) """ @@ -29,7 +30,10 @@ ) from cora.operation.features.abort_procedure.command import AbortProcedure -_ABORTABLE_STATUSES: tuple[ProcedureStatus, ...] = (ProcedureStatus.RUNNING,) +_ABORTABLE_STATUSES: tuple[ProcedureStatus, ...] = ( + ProcedureStatus.RUNNING, + ProcedureStatus.HELD, +) def decide( diff --git a/apps/api/src/cora/operation/features/conduct_procedure/handler.py b/apps/api/src/cora/operation/features/conduct_procedure/handler.py index 8392c3237a..bf016f3411 100644 --- a/apps/api/src/cora/operation/features/conduct_procedure/handler.py +++ b/apps/api/src/cora/operation/features/conduct_procedure/handler.py @@ -42,53 +42,25 @@ call site. 
""" -from collections.abc import Sequence -from typing import TYPE_CHECKING, Protocol +from typing import Protocol from uuid import UUID -from cora.infrastructure.event_envelope import to_new_event from cora.infrastructure.kernel import Kernel from cora.infrastructure.logging import get_logger -from cora.infrastructure.ports import Deny, EventStore -from cora.infrastructure.ports.event_store import StoredEvent +from cora.infrastructure.ports import Deny from cora.infrastructure.routing import NIL_SENTINEL_ID -from cora.operation._recipe_replay import ( - find_recipe_expansion_record, - pins_from_payload, - verify_bindings_hash, - verify_steps_hash, -) +from cora.operation._conduct_preparation import resolve_and_pin_conduct_steps from cora.operation.aggregates.procedure import ( - ProcedureBoundCapabilityDeprecatedError, ProcedureNotFoundError, - ProcedureStepsForbiddenForRecipeDrivenError, - RecipeExpanderVersionMismatchError, - RecipeExpansionRecordNotFoundError, - event_type_name, load_procedure_with_events, - to_payload, ) -from cora.operation.conductor import Conductor, Step, step_to_payload +from cora.operation.conductor import Conductor from cora.operation.errors import UnauthorizedError from cora.operation.features.conduct_procedure.command import ( ConductProcedure, ConductProcedureResult, ) -from cora.operation.features.conduct_procedure.manifest import ( - decide_resolved_steps_recorded, -) from cora.operation.ports.recipe_expander import RecipeExpander -from cora.recipe.aggregates.capability import CapabilityStatus, load_capability -from cora.recipe.aggregates.plan import ( - PlanNotFoundError, - constituents_from_wires, - load_plan, -) -from cora.recipe.aggregates.recipe import load_recipe_at_version -from cora.run.aggregates.run import RunNotFoundError, load_run - -if TYPE_CHECKING: - from cora.operation._pseudoaxis_expander import ConstituentResolver _COMMAND_NAME = "ConductProcedure" @@ -167,97 +139,18 @@ async def handler( if procedure is None: 
raise ProcedureNotFoundError(command.procedure_id) - if procedure.recipe_id is not None: - steps = await _re_expand_steps( - procedure_id=procedure.id, - recipe_id=procedure.recipe_id, - caller_steps=command.steps, - stored_events=stored_events, - event_store=deps.event_store, - expansion_port=expansion_port, - ) - else: - steps = tuple(command.steps) - - # A Phase-of-Run Procedure resolves a pseudoaxis's constituent - # motors from its Run's Plan wires: parent_run_id -> Run.plan_id - # -> Plan.wires (the same load chain start_procedure walks for - # its Supply gate). A missing Run / Plan in that chain is - # corruption, so raise rather than silently skip. Standalone / - # recipe-driven Procedures (no parent_run_id) pass no resolver, so - # any pseudoaxis SetpointStep hits the wiring-deferred default and - # is rejected with PartitionRuleNotFoundError. - # - # This composes with recipe-driven expansion rather than - # conflicting: the recipe block above produces the STEPS; the wires - # resolve each pseudoaxis step's CONSTITUENTS. A Procedure that is - # both recipe-driven and a Run phase gets recipe steps with - # wire-resolved constituents. (Watch item: this loads Run + Plan - # once per conduct; if a high-frequency re-conduct loop makes that - # latency matter, cache per command-lifetime.) 
- constituent_resolver: ConstituentResolver | None = None - if procedure.parent_run_id is not None: - parent_run = await load_run(deps.event_store, procedure.parent_run_id) - if parent_run is None: - raise RunNotFoundError(procedure.parent_run_id) - plan = await load_plan(deps.event_store, parent_run.plan_id) - if plan is None: - raise PlanNotFoundError(parent_run.plan_id) - - def _resolve_constituents(asset_id: UUID) -> tuple[UUID, ...]: - return constituents_from_wires(plan, asset_id) - - constituent_resolver = _resolve_constituents - - # Pre-Conductor PseudoAxis expansion: rewrite any virtual-axis - # SetpointStep into N sequential constituent SetpointSteps so - # the Conductor's existing dispatch loop walks the constituents - # in declared order. ActionStep / CheckStep pass through - # unchanged. PseudoAxis evaluator errors propagate to the - # routes layer for HTTP status mapping - # ([[project-pseudoaxis-design]] v3). - steps = await expansion_port.expand_pseudoaxis( - steps, - event_store=deps.event_store, + steps = await resolve_and_pin_conduct_steps( + deps, + command_name=_COMMAND_NAME, + procedure=procedure, + stored_events=stored_events, + caller_steps=command.steps, + expansion_port=expansion_port, + principal_id=principal_id, correlation_id=correlation_id, - constituent_resolver=constituent_resolver, + causation_id=causation_id, ) - # Pin the resolved step list (after recipe + pseudoaxis expansion) - # BEFORE conducting, so a future resume replays this exact list - # instead of re-deriving it from live Plan.wires / partition rules. - # Provenance-only ResolvedStepsRecorded; the helper emits it only - # while the Procedure is still Defined and returns [] otherwise, - # leaving the Conductor's start_procedure to surface a lifecycle - # failure (keeps the conduct route's failures-in-body contract). 
- manifest_events = decide_resolved_steps_recorded( - procedure, - tuple(step_to_payload(step) for step in steps), - now=deps.clock.now(), - ) - if manifest_events: - _, current_version = await deps.event_store.load( - stream_type="Procedure", stream_id=command.procedure_id - ) - await deps.event_store.append( - stream_type="Procedure", - stream_id=command.procedure_id, - expected_version=current_version, - events=[ - to_new_event( - event_type=event_type_name(event), - payload=to_payload(event), - occurred_at=event.occurred_at, - event_id=deps.id_generator.new_id(), - command_name=_COMMAND_NAME, - correlation_id=correlation_id, - causation_id=causation_id, - principal_id=principal_id, - ) - for event in manifest_events - ], - ) - result = await conductor.conduct( procedure_id=command.procedure_id, principal_id=principal_id, @@ -287,64 +180,3 @@ def _resolve_constituents(asset_id: UUID) -> tuple[UUID, ...]: ) return handler - - -async def _re_expand_steps( - *, - procedure_id: UUID, - recipe_id: UUID, - caller_steps: Sequence[Step], - stored_events: list[StoredEvent], - event_store: EventStore, - expansion_port: RecipeExpander, -) -> tuple[Step, ...]: - """Run the recipe-replay gate per [[project-run-procedure-replay-design]]. - - Six steps: reject non-empty caller steps -> find_recipe_expansion_record - (raise RecipeExpansionRecordNotFoundError on None) -> pins_from_payload - -> port-version strict-equals (raise RecipeExpanderVersionMismatchError - on drift) -> load_recipe_at_version (raise RecipeExpansionRecordNotFoundError - when None on a recipe-driven Procedure; RecipeVersionNotFoundError - propagates from helper) -> load_capability + reject Deprecated - (raise ProcedureBoundCapabilityDeprecatedError, symmetric to - start_run's RunBoundPlanDeprecatedError) -> verify_bindings_hash -> - expand -> verify_steps_hash -> return the re-expanded tuple. 
- """ - if list(caller_steps): - raise ProcedureStepsForbiddenForRecipeDrivenError(procedure_id) - - record = find_recipe_expansion_record(stored_events) - if record is None: - raise RecipeExpansionRecordNotFoundError(procedure_id) - - pins = pins_from_payload(procedure_id, record.payload) - - if pins.expansion_port_version != expansion_port.version: - raise RecipeExpanderVersionMismatchError( - procedure_id, - pins.expansion_port_version, - expansion_port.version, - ) - - recipe = await load_recipe_at_version( - event_store, - recipe_id, - pins.recipe_version, - ) - if recipe is None: - raise RecipeExpansionRecordNotFoundError(procedure_id) - - # Capability-deprecation gate: reject conduct against a tombstoned - # Capability before running the expansion port. Symmetric to - # start_run's RunBoundPlanDeprecatedError. Per the 2026-06-04 domain - # harmony audit: re-expanding a Recipe against a Deprecated - # Capability would silently execute against a contract operators - # have retired. - capability = await load_capability(event_store, recipe.capability_id) - if capability is not None and capability.status == CapabilityStatus.DEPRECATED: - raise ProcedureBoundCapabilityDeprecatedError(procedure_id, recipe.capability_id) - - verify_bindings_hash(procedure_id, pins) - expanded = expansion_port.expand(recipe.steps, dict(pins.bindings)) - verify_steps_hash(procedure_id, expanded, pins) - return expanded diff --git a/apps/api/src/cora/operation/features/conduct_procedure/manifest.py b/apps/api/src/cora/operation/features/conduct_procedure/manifest.py deleted file mode 100644 index 11e95f6ee0..0000000000 --- a/apps/api/src/cora/operation/features/conduct_procedure/manifest.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Pure decision helper: pin the resolved step list at conduct start. 
- -The `conduct_procedure` orchestration handler calls this AFTER it has -resolved the final step list (recipe re-expansion + pseudoaxis + -constituent resolution) and BEFORE handing the list to the Conductor, so -every conduct pins its manifest before any step executes. - -Emitted inline from the conduct flow rather than via a dedicated command -slice: `ResolvedStepsRecorded` is an internal provenance event with no -operator entry point, exactly like `RecipeExpansionRecorded`. Kept as a -pure function so the decision is unit-testable without an event store. -""" - -from collections.abc import Mapping, Sequence -from datetime import datetime -from typing import Any - -from cora.operation.aggregates.procedure import ( - Procedure, - ProcedureStatus, - ResolvedStepsRecorded, -) - - -def decide_resolved_steps_recorded( - state: Procedure | None, - resolved_steps: Sequence[Mapping[str, Any]], - *, - now: datetime, -) -> list[ResolvedStepsRecorded]: - """Pin the resolved step list iff the Procedure is pre-conduct (Defined). - - Returns a single `ResolvedStepsRecorded` when `state` is `Defined` - (the normal conduct path, before `start_procedure` transitions it to - `Running`). Returns `[]` when `state` is None or not `Defined`: a - conduct of a missing / already-running / terminal Procedure records no - manifest and lets the Conductor's `start_procedure` produce the normal - lifecycle failure, preserving the conduct route's failures-in-body - contract instead of raising a fresh HTTP error here. 
- """ - if state is None or state.status is not ProcedureStatus.DEFINED: - return [] - steps = tuple(dict(step) for step in resolved_steps) - return [ - ResolvedStepsRecorded( - procedure_id=state.id, - resolved_steps=steps, - step_count=len(steps), - occurred_at=now, - ) - ] diff --git a/apps/api/src/cora/operation/features/conduct_procedure/route.py b/apps/api/src/cora/operation/features/conduct_procedure/route.py index 971593d5e9..97bc90aa68 100644 --- a/apps/api/src/cora/operation/features/conduct_procedure/route.py +++ b/apps/api/src/cora/operation/features/conduct_procedure/route.py @@ -23,20 +23,13 @@ ## Pydantic wire types -The Conductor's `Step = SetpointStep | ActionStep | CheckStep` and -`CheckCriterion = EqualsCriterion | WithinToleranceCriterion` discriminated unions -land on the wire as JSON discriminated unions with a `kind` field. -Pydantic's `Field(discriminator="kind")` validates the union at -parse time so a malformed step kind fails the request with a 422 -before the handler ever runs. - -Per-step `value` and `criterion.expected` are typed broadly -(`int | float | bool | str | list[Any]`) to match the -ControlPort's value vocabulary. Tuples-on-the-wire arrive as lists; -the converter widens to tuple for the in-process Conductor. +The shared step-list body + per-step failure shape live in the BC-level +`cora.operation._conduct_wire` module (reused by `try_conduct_procedure`, +which a slice cannot import directly). This slice owns only the +conduct-specific request/response envelope. 
""" -from typing import Annotated, Any, Literal, cast +from typing import Annotated from uuid import UUID from fastapi import APIRouter, Depends, Path, Request, status @@ -48,15 +41,12 @@ get_principal_id, get_surface_id, ) -from cora.operation.conductor import ( - ActionStep, - CheckCriterion, - CheckStep, - ConductorFailure, - EqualsCriterion, - SetpointStep, - Step, - WithinToleranceCriterion, +from cora.operation._conduct_wire import ( + STEP_BATCH_MAX, + ConductorFailureResponse, + StepRequest, + failure_to_wire, + step_from_wire, ) from cora.operation.features.conduct_procedure.command import ( ConductProcedure, @@ -64,82 +54,15 @@ ) from cora.operation.features.conduct_procedure.handler import Handler -_STEP_BATCH_MAX = 500 -"""Mirror of `append_activities`'s batch cap. A single -`ConductProcedure` request never carries more than this many steps; -larger procedures split client-side via multiple sequential runs.""" - - -class _SetpointStepRequest(BaseModel): - """JSON wire shape for a `SetpointStep`.""" - - kind: Literal["setpoint"] - address: str = Field(..., min_length=1) - value: int | float | bool | str | list[Any] - verify: bool = False - - model_config = {"extra": "forbid"} - - -class _ActionStepRequest(BaseModel): - """JSON wire shape for an `ActionStep`.""" - - kind: Literal["action"] - name: str = Field(..., min_length=1) - params: dict[str, Any] = Field(default_factory=dict) - - model_config = {"extra": "forbid"} - - -class _EqualsCriterion(BaseModel): - """JSON wire shape for an `EqualsCriterion`.""" - - kind: Literal["equals"] - expected: int | float | bool | str | list[Any] - - model_config = {"extra": "forbid"} - - -class _WithinToleranceCriterion(BaseModel): - """JSON wire shape for a `WithinToleranceCriterion`.""" - - kind: Literal["within_tolerance"] - expected: float - tolerance: float = Field(..., ge=0.0) - - model_config = {"extra": "forbid"} - - -_CriterionRequest = Annotated[ - _EqualsCriterion | _WithinToleranceCriterion, - 
Field(discriminator="kind"), -] - - -class _CheckStepRequest(BaseModel): - """JSON wire shape for a `CheckStep`.""" - - kind: Literal["check"] - address: str = Field(..., min_length=1) - criterion: _CriterionRequest - - model_config = {"extra": "forbid"} - - -_StepRequest = Annotated[ - _SetpointStepRequest | _ActionStepRequest | _CheckStepRequest, - Field(discriminator="kind"), -] - class ConductProcedureRequest(BaseModel): """Body for `POST /procedures/{procedure_id}/conduct`.""" - steps: list[_StepRequest] = Field( - default_factory=list[_StepRequest], - max_length=_STEP_BATCH_MAX, + steps: list[StepRequest] = Field( + default_factory=list[StepRequest], + max_length=STEP_BATCH_MAX, description=( - f"Steps the Conductor walks in order (0-{_STEP_BATCH_MAX}). " + f"Steps the Conductor walks in order (0-{STEP_BATCH_MAX}). " "Empty list is valid: start + complete fire with no steps." ), ) @@ -147,16 +70,6 @@ class ConductProcedureRequest(BaseModel): model_config = {"extra": "forbid"} -class _ConductorFailureResponse(BaseModel): - """JSON wire shape for `ConductorFailure`.""" - - step_index: int | None - source_kind: str - target: str - error_class: str - message: str - - class ConductProcedureResponse(BaseModel): """Response body for the conduct_procedure slice. @@ -174,63 +87,10 @@ class ConductProcedureResponse(BaseModel): procedure_id: UUID completed_count: int succeeded: bool - failure: _ConductorFailureResponse | None = None + failure: ConductorFailureResponse | None = None actuation_kind: str | None = None -def criterion_from_wire( - wire: _EqualsCriterion | _WithinToleranceCriterion, -) -> CheckCriterion: - """Build a Conductor `CheckCriterion` from a Pydantic wire model. - - Public because `tool.py` calls it too (MCP + REST share the same - wire schema; the converter is the seam between the JSON shape - and the in-process Conductor type). 
- """ - if isinstance(wire, _EqualsCriterion): - expected: Any = wire.expected - if isinstance(expected, list): - # wire.expected is a JSON list of Any; tuple-coerce for the in-process EqualsCriterion - return EqualsCriterion(expected=cast("tuple[Any, ...]", tuple(expected))) # pyright: ignore[reportUnknownArgumentType] - return EqualsCriterion(expected=expected) - return WithinToleranceCriterion(expected=wire.expected, tolerance=wire.tolerance) - - -def step_from_wire( - wire: _SetpointStepRequest | _ActionStepRequest | _CheckStepRequest, -) -> Step: - """Build a Conductor `Step` from a Pydantic wire model. - - Public because `tool.py` calls it too (MCP + REST share the same - wire schema). - """ - if isinstance(wire, _SetpointStepRequest): - value: Any = wire.value - if isinstance(value, list): - return SetpointStep( - address=wire.address, - value=cast("tuple[Any, ...]", tuple(value)), # pyright: ignore[reportUnknownArgumentType] - verify=wire.verify, - ) - return SetpointStep(address=wire.address, value=value, verify=wire.verify) - if isinstance(wire, _ActionStepRequest): - return ActionStep(name=wire.name, params=wire.params) - return CheckStep( - address=wire.address, - criterion=criterion_from_wire(wire.criterion), - ) - - -def _failure_to_wire(failure: ConductorFailure) -> _ConductorFailureResponse: - return _ConductorFailureResponse( - step_index=failure.step_index, - source_kind=failure.source_kind, - target=failure.target, - error_class=failure.error_class, - message=failure.message, - ) - - def result_to_wire(result: ConductProcedureResult) -> ConductProcedureResponse: """Build a `ConductProcedureResponse` from the slice's `ConductProcedureResult`. 
@@ -240,7 +100,7 @@ def result_to_wire(result: ConductProcedureResult) -> ConductProcedureResponse: procedure_id=result.procedure_id, completed_count=result.completed_count, succeeded=result.succeeded, - failure=_failure_to_wire(result.failure) if result.failure is not None else None, + failure=failure_to_wire(result.failure) if result.failure is not None else None, actuation_kind=result.actuation_kind, ) diff --git a/apps/api/src/cora/operation/features/conduct_procedure/tool.py b/apps/api/src/cora/operation/features/conduct_procedure/tool.py index 41363cf85d..2baa0c9160 100644 --- a/apps/api/src/cora/operation/features/conduct_procedure/tool.py +++ b/apps/api/src/cora/operation/features/conduct_procedure/tool.py @@ -16,13 +16,13 @@ from cora.infrastructure.mcp_principal import get_mcp_principal_id from cora.infrastructure.observability import current_correlation_id from cora.infrastructure.routing import get_mcp_surface_id +from cora.operation._conduct_wire import step_from_wire from cora.operation.features.conduct_procedure.command import ConductProcedure from cora.operation.features.conduct_procedure.handler import Handler from cora.operation.features.conduct_procedure.route import ( ConductProcedureRequest, ConductProcedureResponse, result_to_wire, - step_from_wire, ) diff --git a/apps/api/src/cora/operation/features/end_iteration/decider.py b/apps/api/src/cora/operation/features/end_iteration/decider.py index e025ec01c7..14f280111e 100644 --- a/apps/api/src/cora/operation/features/end_iteration/decider.py +++ b/apps/api/src/cora/operation/features/end_iteration/decider.py @@ -1,9 +1,13 @@ """Pure decider for the `EndProcedureIteration` command. -Closes the currently-open convergence-loop iteration on a Running -Procedure. Iteration is orthogonal to the lifecycle FSM (the Procedure -stays Running); this folds onto the iteration denorm by clearing the -open-index marker. +Closes the currently-open convergence-loop iteration on a Running or +Held Procedure. 
Iteration is orthogonal to the lifecycle FSM; this +folds onto the iteration denorm by clearing the open-index marker. + +`Held` is accepted (alongside `Running`) so an iteration left open when +an operator paused the conduct can still be closed while paused +([[project_resumable_conduct_design]] Tier 1). `start_iteration` is NOT +widened: a new iteration cannot be opened while paused (resume first). `reason` is optional; when present it is trimmed and bounded 1-500 chars via the shared `validate_bounded_text` helper (matching the @@ -15,7 +19,7 @@ - state is None -> ProcedureNotFoundError - command.reason, when present, must be 1-500 chars after trimming -> InvalidProcedureIterationEndReasonError - - status is not Running, OR no iteration is open + - status is not in {Running, Held}, OR no iteration is open (current_iteration_index is None), OR iteration_index does not equal the open current_iteration_index -> ProcedureCannotEndIterationError """ @@ -34,6 +38,11 @@ from cora.shared.bounded_text import validate_bounded_text from cora.shared.text_bounds import REASON_MAX_LENGTH +_ITERATION_ENDABLE_STATUSES: tuple[ProcedureStatus, ...] = ( + ProcedureStatus.RUNNING, + ProcedureStatus.HELD, +) + def decide( state: Procedure | None, @@ -54,7 +63,7 @@ def decide( else None ) if ( - state.status is not ProcedureStatus.RUNNING + state.status not in _ITERATION_ENDABLE_STATUSES or state.current_iteration_index is None or command.iteration_index != state.current_iteration_index ): diff --git a/apps/api/src/cora/operation/features/hold_procedure/__init__.py b/apps/api/src/cora/operation/features/hold_procedure/__init__.py new file mode 100644 index 0000000000..fcae57cb76 --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/__init__.py @@ -0,0 +1,23 @@ +"""Vertical slice for the `HoldProcedure` command. 
+ +from cora.operation.features import hold_procedure + +cmd = hold_procedure.HoldProcedure(procedure_id=..., reason="...") +handler = hold_procedure.bind(deps) +await handler(cmd, principal_id=..., correlation_id=...) +""" + +from cora.operation.features.hold_procedure import tool +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.hold_procedure.decider import decide +from cora.operation.features.hold_procedure.handler import Handler, bind +from cora.operation.features.hold_procedure.route import router + +__all__ = [ + "Handler", + "HoldProcedure", + "bind", + "decide", + "router", + "tool", +] diff --git a/apps/api/src/cora/operation/features/hold_procedure/command.py b/apps/api/src/cora/operation/features/hold_procedure/command.py new file mode 100644 index 0000000000..861194ca32 --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/command.py @@ -0,0 +1,34 @@ +"""The `HoldProcedure` command -- intent dataclass for this slice. + +Single-source pause transition: `Running -> Held`. The operator pauses +a halted conduct so it can be re-established and resumed rather than +aborted-and-reseeded (Tier 1 of [[project_resumable_conduct_design]]). + +Carries a REQUIRED free-form `reason` (1-500 chars after trim; validated +at the API boundary AND defensively at the decider via the +`ProcedureHoldReason` VO). Unlike `HoldRun` (slim, no reason: a routine +Run pause), pausing a halted conduct is a deliberate, high-information +operator act, so the reason is mandatory (matching `AgentSuspended.reason`). + +`decided_by_decision_id` mirrors `HoldRun`: optional Decision-causation +link. The operator-facing route leaves it None; an in-process agent +runtime sets it to link an autonomous hold to its Decision. NO existence +check at the decider per the cross-BC eventual-consistency stance. 
+""" + +from dataclasses import dataclass +from uuid import UUID + + +@dataclass(frozen=True) +class HoldProcedure: + """Pause an actively-running Procedure conduct (Running -> Held).""" + + procedure_id: UUID + reason: str + decided_by_decision_id: UUID | None = None + actuation_kind: str | None = None + """The raw `ActuationKind` value the Conductor observed in the conduct up + to this pause. `Conductor.try_conduct` sets it so the pre-hold provenance + survives the hold->resume boundary (see `ProcedureHeld.actuation_kind`); an + operator hold issued outside a conduct leaves it None.""" diff --git a/apps/api/src/cora/operation/features/hold_procedure/decider.py b/apps/api/src/cora/operation/features/hold_procedure/decider.py new file mode 100644 index 0000000000..41b249faf4 --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/decider.py @@ -0,0 +1,58 @@ +"""Pure decider for the `HoldProcedure` command. + +Single-source pause transition: `Running -> Held`. Re-holding an +already-`Held` Procedure raises (strict-not-idempotent); holding a +`Defined` or terminal Procedure raises. Mirrors `hold_run`. + +Hold <-> Resume is bidirectional and unlimited-cycle: an operator can +hold -> resume -> hold repeatedly within one conduct, each hold +requiring an intervening resume. + +`reason` validation goes through the `ProcedureHoldReason` VO (which +calls the shared `validate_bounded_text` helper). The on-the-wire +payload in `ProcedureHeld.reason` carries the trimmed string. + +Invariants: + - State must not be None -> ProcedureNotFoundError + - command.reason must be 1-500 chars after trimming + -> InvalidProcedureHoldReasonError + - State.status must be in {Running} + -> ProcedureCannotHoldError(current_status=...) 
+""" + +from datetime import datetime + +from cora.operation.aggregates.procedure import ( + Procedure, + ProcedureCannotHoldError, + ProcedureHeld, + ProcedureHoldReason, + ProcedureNotFoundError, + ProcedureStatus, +) +from cora.operation.features.hold_procedure.command import HoldProcedure + +_HOLDABLE_STATUSES: tuple[ProcedureStatus, ...] = (ProcedureStatus.RUNNING,) + + +def decide( + state: Procedure | None, + command: HoldProcedure, + *, + now: datetime, +) -> list[ProcedureHeld]: + """Decide the events produced by holding an existing Procedure.""" + if state is None: + raise ProcedureNotFoundError(command.procedure_id) + reason = ProcedureHoldReason(command.reason) + if state.status not in _HOLDABLE_STATUSES: + raise ProcedureCannotHoldError(state.id, current_status=state.status) + return [ + ProcedureHeld( + procedure_id=state.id, + reason=reason.value, + decided_by_decision_id=command.decided_by_decision_id, + occurred_at=now, + actuation_kind=command.actuation_kind, + ) + ] diff --git a/apps/api/src/cora/operation/features/hold_procedure/handler.py b/apps/api/src/cora/operation/features/hold_procedure/handler.py new file mode 100644 index 0000000000..ff342be306 --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/handler.py @@ -0,0 +1,45 @@ +"""Application handler for the `hold_procedure` slice. + +Update-style handler. Canonical body lives in +`cora.operation._procedure_update_handler.make_procedure_update_handler`; +this module is a thin slice-specific bind, mirroring abort_procedure / +truncate_procedure. + +The command's `reason` field IS captured on the emitted `ProcedureHeld` +event payload but is intentionally NOT logged at the handler boundary +(mirrors abort_procedure / hold_run precedent), so this slice does not +pass `extra_log_fields`. 
+""" + +from typing import Protocol +from uuid import UUID + +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation._procedure_update_handler import make_procedure_update_handler +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.hold_procedure.decider import decide + + +class Handler(Protocol): + """Callable interface every hold_procedure handler implements.""" + + async def __call__( + self, + command: HoldProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> None: ... + + +def bind(deps: Kernel) -> Handler: + """Build a hold_procedure handler closed over the shared deps.""" + return make_procedure_update_handler( + deps, + command_name="HoldProcedure", + log_prefix="hold_procedure", + decide_fn=decide, + ) diff --git a/apps/api/src/cora/operation/features/hold_procedure/route.py b/apps/api/src/cora/operation/features/hold_procedure/route.py new file mode 100644 index 0000000000..48b5b31810 --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/route.py @@ -0,0 +1,91 @@ +"""HTTP route for the `hold_procedure` slice. + +Action endpoint at `POST /procedures/{procedure_id}/hold`. Body carries +`reason` (1-500 chars). 204 No Content on success. Mirrors abort_procedure. 
+""" + +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, Depends, Path, Request, status +from pydantic import BaseModel, Field + +from cora.infrastructure.routing import ( + ErrorResponse, + get_correlation_id, + get_principal_id, + get_surface_id, +) +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.hold_procedure.handler import Handler +from cora.shared.text_bounds import REASON_MAX_LENGTH + + +class HoldProcedureRequest(BaseModel): + """Body for `POST /procedures/{procedure_id}/hold`.""" + + reason: str = Field( + ..., + min_length=1, + max_length=REASON_MAX_LENGTH, + description=( + "Free-form reason for the hold (1-500 chars after trimming). " + "Required: pausing a halted conduct is a deliberate operator act " + "(unlike a routine RunHeld, which carries no reason)." + ), + ) + + +def _get_handler(request: Request) -> Handler: + handler: Handler = request.app.state.operation.hold_procedure + return handler + + +router = APIRouter(tags=["operation"]) + + +@router.post( + "/procedures/{procedure_id}/hold", + status_code=status.HTTP_204_NO_CONTENT, + responses={ + status.HTTP_400_BAD_REQUEST: { + "model": ErrorResponse, + "description": "Domain invariant violated: whitespace-only reason.", + }, + status.HTTP_403_FORBIDDEN: { + "model": ErrorResponse, + "description": "Authorize port denied the command.", + }, + status.HTTP_404_NOT_FOUND: { + "model": ErrorResponse, + "description": "No procedure exists with the given id.", + }, + status.HTTP_409_CONFLICT: { + "model": ErrorResponse, + "description": ( + "Procedure is not in `Running` status (hold requires " + "`Running`; holding a `Defined` / `Held` / terminal procedure " + "raises), OR a concurrent write to the same procedure stream " + "conflicted (optimistic concurrency)." 
+ ), + }, + status.HTTP_422_UNPROCESSABLE_CONTENT: { + "description": "Path parameter or request body failed schema validation.", + }, + }, + summary="Pause an actively-running Procedure conduct (Running -> Held)", +) +async def post_procedures_hold( + procedure_id: Annotated[UUID, Path(description="Target procedure's id.")], + body: HoldProcedureRequest, + handler: Annotated[Handler, Depends(_get_handler)], + cid: Annotated[UUID, Depends(get_correlation_id)], + principal_id: Annotated[UUID, Depends(get_principal_id)], + surface_id: Annotated[UUID, Depends(get_surface_id)], +) -> None: + await handler( + HoldProcedure(procedure_id=procedure_id, reason=body.reason), + principal_id=principal_id, + correlation_id=cid, + surface_id=surface_id, + ) diff --git a/apps/api/src/cora/operation/features/hold_procedure/tool.py b/apps/api/src/cora/operation/features/hold_procedure/tool.py new file mode 100644 index 0000000000..437cf02fa0 --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/tool.py @@ -0,0 +1,52 @@ +"""MCP tool for the `hold_procedure` slice.""" + +from collections.abc import Callable +from typing import Annotated, Any +from uuid import UUID + +from mcp.server.fastmcp import Context, FastMCP +from pydantic import Field + +from cora.infrastructure.mcp_principal import get_mcp_principal_id +from cora.infrastructure.observability import current_correlation_id +from cora.infrastructure.routing import get_mcp_surface_id +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.hold_procedure.handler import Handler +from cora.shared.text_bounds import REASON_MAX_LENGTH + + +def register(mcp: FastMCP, *, get_handler: Callable[[], Handler]) -> None: + """Register the `hold_procedure` tool on the given MCP server.""" + + @mcp.tool( + name="hold_procedure", + description=( + "Pause an actively-running Procedure conduct (Running -> Held) so it " + "can be re-established and resumed later. 
The inverse of resume_procedure. " + "Requires the Procedure to currently be in `Running`. Holding a " + "`Defined` / `Held` / terminal Procedure raises. Reason is required " + "(1-500 chars), captured verbatim for audit." + ), + ) + async def hold_procedure_tool( # pyright: ignore[reportUnusedFunction] + ctx: Context[Any, Any, Any], + procedure_id: Annotated[ + UUID, + Field(description="Target procedure's id."), + ], + reason: Annotated[ + str, + Field( + min_length=1, + max_length=REASON_MAX_LENGTH, + description="Free-form reason for the hold (1-500 chars after trimming).", + ), + ], + ) -> None: + handler = get_handler() + await handler( + HoldProcedure(procedure_id=procedure_id, reason=reason), + principal_id=get_mcp_principal_id(ctx), + correlation_id=current_correlation_id(), + surface_id=get_mcp_surface_id(), + ) diff --git a/apps/api/src/cora/operation/features/list_procedures/query.py b/apps/api/src/cora/operation/features/list_procedures/query.py index 02fd8467b8..2e283c8220 100644 --- a/apps/api/src/cora/operation/features/list_procedures/query.py +++ b/apps/api/src/cora/operation/features/list_procedures/query.py @@ -1,15 +1,16 @@ """The `ListProcedures` query: intent dataclass for keyset-paginated list of procedures from the projection. -Four optional filters: status (one of the 5 ProcedureStatus values), +Four optional filters: status (one of the 6 ProcedureStatus values), kind (free-form bare-str discriminator, exact match), parent_run_id (UUID for Phase-of-Run filtering), target_asset_id (UUID for "procedures targeting this Asset" via the GIN index on the target_asset_ids UUID[] column). -`ProcedureStatusFilter` is locked at the full enum width day one -(Defined / Running / Completed / Aborted / Truncated). Same forward- -compat motivation as ListSupplies's SupplyStatusFilter. +`ProcedureStatusFilter` mirrors the full `ProcedureStatus` enum +(Defined / Running / Held / Completed / Aborted / Truncated). 
`Held` +was added when resumable conduct surfaced it in the read model (the +projection folds ProcedureHeld -> status='Held'). Cursor encodes (registered_at, procedure_id) -- `registered_at` is set once at ProcedureRegistered (immutable), so it's a stable keyset @@ -23,6 +24,7 @@ ProcedureStatusFilter = Literal[ "Defined", "Running", + "Held", "Completed", "Aborted", "Truncated", @@ -40,7 +42,7 @@ class ListProcedures: """Page size cap. Default 50, max 100 (route enforces).""" status: ProcedureStatusFilter | None = None - """Optional status filter (one of the five ProcedureStatus values).""" + """Optional status filter (one of the six ProcedureStatus values).""" kind: str | None = None """Optional kind filter (free-form, exact match; for example 'bakeout').""" diff --git a/apps/api/src/cora/operation/features/list_procedures/route.py b/apps/api/src/cora/operation/features/list_procedures/route.py index c630499542..b294571fb7 100644 --- a/apps/api/src/cora/operation/features/list_procedures/route.py +++ b/apps/api/src/cora/operation/features/list_procedures/route.py @@ -99,7 +99,7 @@ async def list_procedures( Query( alias="status", description=( - "Optional status filter (one of: Defined, Running, " + "Optional status filter (one of: Defined, Running, Held, " "Completed, Aborted, Truncated). Omit to return all statuses." ), ), diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/__init__.py b/apps/api/src/cora/operation/features/reconduct_procedure/__init__.py new file mode 100644 index 0000000000..13dd8f4493 --- /dev/null +++ b/apps/api/src/cora/operation/features/reconduct_procedure/__init__.py @@ -0,0 +1,42 @@ +"""Vertical slice for the `ReconductProcedure` command. 
+ +Operator-facing resume-and-replay orchestration: resumes a Held +Procedure and hands control to the `Conductor` runtime, which replays the +pinned step-list tail from the re-establishment boundary (re-drive +setpoints, re-run checks, halt-for-operator on an acquisition), then +auto-completes on a clean tail / aborts on a step failure / leaves +Running on an acquisition halt. Returns a structured +`ReconductProcedureResult`; replay outcomes are encoded in the result, +not raised. + + from cora.operation.features import reconduct_procedure + + cmd = reconduct_procedure.ReconductProcedure(procedure_id=..., re_establishment_boundary=K) + handler = reconduct_procedure.bind( + deps, conductor=conductor + ) + result = await handler(cmd, principal_id=..., correlation_id=...) +""" + +from cora.operation.features.reconduct_procedure import tool +from cora.operation.features.reconduct_procedure.command import ( + ReconductProcedure, + ReconductProcedureResult, +) +from cora.operation.features.reconduct_procedure.handler import Handler, bind +from cora.operation.features.reconduct_procedure.route import ( + ReconductProcedureRequest, + ReconductProcedureResponse, + router, +) + +__all__ = [ + "Handler", + "ReconductProcedure", + "ReconductProcedureRequest", + "ReconductProcedureResponse", + "ReconductProcedureResult", + "bind", + "router", + "tool", +] diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/command.py b/apps/api/src/cora/operation/features/reconduct_procedure/command.py new file mode 100644 index 0000000000..4b228e565d --- /dev/null +++ b/apps/api/src/cora/operation/features/reconduct_procedure/command.py @@ -0,0 +1,46 @@ +"""The `ReconductProcedure` command + result -- intent dataclass for this slice.
+ +Resume-and-replay orchestration: resume a `Held` Procedure and replay its +PINNED step-list tail from the re-establishment boundary (Tier 1 of +[[project_resumable_conduct_design]]). Mirrors `ConductProcedure` (the +conduct orchestration) but for the resume path; carries the +`re_establishment_boundary` (single-sourced -- it rides into both +`ProcedureResumed` and `Conductor.execute_from`). +""" + +from dataclasses import dataclass +from uuid import UUID + +from cora.operation.conductor import ConductorFailure + + +@dataclass(frozen=True) +class ReconductProcedure: + """Resume a held Procedure and replay its pinned step-list tail.""" + + procedure_id: UUID + re_establishment_boundary: int + + +@dataclass(frozen=True) +class ReconductProcedureResult: + """Outcome of a reconduct (resume + replay). + + `succeeded` is the canonical pass/fail bit (the replay's outcome). + `acquisition_halt` is True iff replay stopped at an acquisition that + needs an operator decision (redo-fresh vs reseed): in that case the + Procedure is LEFT Running (no complete, no abort) and `failure` carries + the halt. On a clean replay the Procedure is auto-completed; on a + genuine step failure it is aborted. `completed_count` is the number of + re-driven / re-run tail steps that succeeded; `actuation_kind` is the + Conductor's observed kind over the replay (None when nothing + instrumented was actuated). + """ + + procedure_id: UUID + completed_count: int + succeeded: bool + re_establishment_boundary: int + acquisition_halt: bool = False + failure: ConductorFailure | None = None + actuation_kind: str | None = None diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/handler.py b/apps/api/src/cora/operation/features/reconduct_procedure/handler.py new file mode 100644 index 0000000000..2f6c415ded --- /dev/null +++ b/apps/api/src/cora/operation/features/reconduct_procedure/handler.py @@ -0,0 +1,208 @@ +"""Application handler for the `reconduct_procedure` slice. 
+ +Resume-and-replay orchestration. Mirrors `conduct_procedure`: a thin +slice handler that authz-checks + loads + locates the pinned resolved steps, +then delegates the resume + replay + terminalize composition to +`Conductor.reconduct` (the resume twin of `Conductor.conduct`). No +`decider.py`: like `conduct_procedure` this is an orchestration entry +point, not an aggregate-state-mutating decider. + +This handler imports NO sibling slice: the resume / complete / abort +handlers it composes live on the injected `Conductor` (a non-slice +module), exactly as `conduct_procedure` delegates start / complete / +abort to `Conductor.conduct`. That keeps the slice independent (the +cross-slice fitness) and the composition in the one place that already +owns lifecycle-handler orchestration. + +## Flow + + 1. authz `ReconductProcedure`. + 2. load the Procedure + its raw events. + 3. status guard FIRST: a non-Held Procedure is a `ProcedureCannotResumeError` + (409), raised BEFORE the step-list lookup so a Defined / Completed + Procedure is never a misleading 500 and no resume-then-fail partial + state can occur. + 4. locate the PINNED `ResolvedStepsRecorded` (a conducted, Held Procedure + ALWAYS has exactly one; its absence is corruption -> + `ResolvedStepsRecordNotFoundError`, 500) and parse it back into `Step`s + via `steps_from_payload` -- resume NEVER re-derives the step list. + 5. `Conductor.reconduct(steps, boundary)`: resume (Held -> Running, with + its own authz + off-diagonal parent-Run-Held guard) -> `execute_from` + (re-drive setpoints, re-run checks, halt-for-operator on an acquisition) + -> terminalize (complete on a clean tail / leave Running on an + acquisition halt / best-effort abort on a genuine step failure). + 6. project the `ConductorResult` onto `ReconductProcedureResult` + (`acquisition_halt` is the named branch on the resume halt). 
+ +The `re_establishment_boundary` is single-sourced: the operator supplies +it once; `Conductor.reconduct` rides it into both +`ProcedureResumed.re_establishment_boundary` (audit) and +`execute_from(boundary=...)` (replay). + +## Authorization scope + +`ReconductProcedure` is authz-checked as its own command. The wrapped +`resume_procedure` / `complete_procedure` / `abort_procedure` handlers +(on the Conductor) each authz internally with their OWN command names; an +operator authorized to call `ReconductProcedure` is NOT automatically +authorized for those individually. Same layering as `conduct_procedure`. +""" + +from typing import Protocol +from uuid import UUID + +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.logging import get_logger +from cora.infrastructure.ports import Deny +from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation._resolved_steps_replay import find_resolved_steps_record +from cora.operation.aggregates.procedure import ( + InvalidProcedureReEstablishmentBoundaryError, + ProcedureCannotResumeError, + ProcedureNotFoundError, + ProcedureStatus, + ResolvedStepsRecordNotFoundError, + load_procedure_with_events, +) +from cora.operation.conductor import Conductor, is_acquisition_halt, steps_from_payload +from cora.operation.errors import UnauthorizedError +from cora.operation.features.reconduct_procedure.command import ( + ReconductProcedure, + ReconductProcedureResult, +) + +_COMMAND_NAME = "ReconductProcedure" + +_log = get_logger(__name__) + + +class Handler(Protocol): + """Callable interface every reconduct_procedure handler implements.""" + + async def __call__( + self, + command: ReconductProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> ReconductProcedureResult: ... + + +def bind(deps: Kernel, *, conductor: Conductor) -> Handler: + """Build a reconduct_procedure handler closed over deps + the Conductor. 
+ + `conductor` is the same BC-internal Conductor `conduct_procedure` uses; + it carries the resume / complete / abort handlers (wired at app + composition) that `Conductor.reconduct` composes, so the internal + transitions land with the same observability shape as direct REST / MCP + calls. + """ + + async def handler( + command: ReconductProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> ReconductProcedureResult: + _log.info( + "reconduct_procedure.start", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + re_establishment_boundary=command.re_establishment_boundary, + principal_id=str(principal_id), + correlation_id=str(correlation_id), + causation_id=str(causation_id) if causation_id is not None else None, + ) + + authz = await deps.authz.authorize( + principal_id=principal_id, + command_name=_COMMAND_NAME, + conduit_id=NIL_SENTINEL_ID, + surface_id=surface_id, + ) + if isinstance(authz, Deny): + _log.info( + "reconduct_procedure.denied", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + principal_id=str(principal_id), + correlation_id=str(correlation_id), + reason=authz.reason, + ) + raise UnauthorizedError(authz.reason) + + procedure, stored_events = await load_procedure_with_events( + deps.event_store, command.procedure_id + ) + if procedure is None: + raise ProcedureNotFoundError(command.procedure_id) + + # Status guard FIRST (mirrors resume's `{Held}` source set): a + # non-Held Procedure is a 409, not a 500. This keeps the + # missing-record case below as genuine corruption (a conducted, + # Held Procedure ALWAYS has its pinned resolved steps) and avoids resuming + # then failing to find them. The off-diagonal parent-Run-Held + # guard stays inside Conductor.reconduct's resume call. 
+ if procedure.status is not ProcedureStatus.HELD: + raise ProcedureCannotResumeError(command.procedure_id, current_status=procedure.status) + + # Replay the PINNED resolved steps, never re-derive. A Held Procedure that + # was conducted always has exactly one ResolvedStepsRecorded; its + # absence here is corruption (500), not an operational outcome. + record = find_resolved_steps_record(stored_events) + if record is None: + raise ResolvedStepsRecordNotFoundError(command.procedure_id) + steps = steps_from_payload(record.payload["resolved_steps"]) + + # Upper-bound guard: a boundary PAST the pinned step count would replay + # an empty tail and silently auto-complete with nothing re-driven. The + # resume decider only floors at 0 (it has no manifest to size against); + # the bound lives here, where the manifest is known. `boundary == + # len(steps)` is allowed (a deliberate "everything already done, + # complete" resume); only strictly-past is rejected. + if command.re_establishment_boundary > len(steps): + raise InvalidProcedureReEstablishmentBoundaryError(command.re_establishment_boundary) + + result = await conductor.reconduct( + procedure_id=command.procedure_id, + principal_id=principal_id, + correlation_id=correlation_id, + steps=steps, + boundary=command.re_establishment_boundary, + # The pre-hold conduct's observed kind (folded onto the Held + # Procedure) so the terminal event reflects the FULL provenance, + # not just the replay tail -- guards the promote_dataset gate + # against a boundary>0 resume past a simulated prefix. 
+ prior_actuation_kind=procedure.actuation_kind, + causation_id=causation_id, + surface_id=surface_id, + ) + + actuation_kind = result.actuation_kind.value if result.actuation_kind is not None else None + acquisition_halt = is_acquisition_halt(result.failure) + + _log.info( + "reconduct_procedure.success", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + completed_count=result.completed_count, + succeeded=result.succeeded, + acquisition_halt=acquisition_halt, + failure_class=(result.failure.error_class if result.failure is not None else None), + ) + + return ReconductProcedureResult( + procedure_id=command.procedure_id, + completed_count=result.completed_count, + succeeded=result.succeeded, + re_establishment_boundary=command.re_establishment_boundary, + acquisition_halt=acquisition_halt, + failure=result.failure, + actuation_kind=actuation_kind, + ) + + return handler diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/route.py b/apps/api/src/cora/operation/features/reconduct_procedure/route.py new file mode 100644 index 0000000000..2e854cb30e --- /dev/null +++ b/apps/api/src/cora/operation/features/reconduct_procedure/route.py @@ -0,0 +1,148 @@ +"""HTTP route for the `reconduct_procedure` slice. + +`POST /procedures/{procedure_id}/reconduct` resumes a Held Procedure and +replays its pinned step-list tail from `re_establishment_boundary`. + +## Response code: 200, replay outcomes in body + +Like `conduct_procedure`, replay outcomes (a step that failed, an +acquisition that needs an operator decision) are NORMAL operational +results that land in the body, not HTTP errors. Only protocol / auth / +guard faults map to HTTP codes: 400 (boundary past the pinned step count), +403 (authz deny), 404 (no procedure), 409 (Procedure not Held, or parent +Run Held -- from the resume guard), 422 (negative boundary / malformed id), +500 (Held Procedure missing its pinned resolved steps -- corruption).
+""" + +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, Depends, Path, Request, status +from pydantic import BaseModel, Field + +from cora.infrastructure.routing import ( + ErrorResponse, + get_correlation_id, + get_principal_id, + get_surface_id, +) +from cora.operation._conduct_wire import ConductorFailureResponse, failure_to_wire +from cora.operation.features.reconduct_procedure.command import ( + ReconductProcedure, + ReconductProcedureResult, +) +from cora.operation.features.reconduct_procedure.handler import Handler + + +class ReconductProcedureRequest(BaseModel): + """Body for `POST /procedures/{procedure_id}/reconduct`.""" + + re_establishment_boundary: int = Field( + ..., + ge=0, + description=( + "Index in the pinned resolved step list from which the resume " + "re-drives setpoints and re-runs checks. >= 0 (0 = re-establish " + "from the first step). NOT a continuity proof." + ), + ) + + model_config = {"extra": "forbid"} + + +class ReconductProcedureResponse(BaseModel): + """Response body for the reconduct_procedure slice. + + `succeeded` is the replay's pass/fail bit. `acquisition_halt` is True + iff the replay stopped at an acquisition needing an operator decision + (the Procedure is left Running). `failure` is non-null iff `succeeded` + is False (a halt or a genuine step failure). + """ + + procedure_id: UUID + completed_count: int + succeeded: bool + re_establishment_boundary: int + acquisition_halt: bool + failure: ConductorFailureResponse | None = None + actuation_kind: str | None = None + + +def result_to_wire(result: ReconductProcedureResult) -> ReconductProcedureResponse: + """Build a `ReconductProcedureResponse` from the slice result. + + Public because `tool.py` calls it too. 
+ """ + return ReconductProcedureResponse( + procedure_id=result.procedure_id, + completed_count=result.completed_count, + succeeded=result.succeeded, + re_establishment_boundary=result.re_establishment_boundary, + acquisition_halt=result.acquisition_halt, + failure=failure_to_wire(result.failure) if result.failure is not None else None, + actuation_kind=result.actuation_kind, + ) + + +def _get_handler(request: Request) -> Handler: + handler: Handler = request.app.state.operation.reconduct_procedure + return handler + + +router = APIRouter(tags=["operation"]) + + +@router.post( + "/procedures/{procedure_id}/reconduct", + status_code=status.HTTP_200_OK, + response_model=ReconductProcedureResponse, + responses={ + status.HTTP_400_BAD_REQUEST: { + "model": ErrorResponse, + "description": "re_establishment_boundary is past the pinned resolved step count.", + }, + status.HTTP_403_FORBIDDEN: { + "model": ErrorResponse, + "description": "Authorize port denied the command.", + }, + status.HTTP_404_NOT_FOUND: { + "model": ErrorResponse, + "description": "No procedure exists with the given id.", + }, + status.HTTP_409_CONFLICT: { + "model": ErrorResponse, + "description": ( + "Procedure is not in `Held` status, OR its parent Run is " + "itself `Held` (off-diagonal guard)." 
+ ), + }, + status.HTTP_422_UNPROCESSABLE_CONTENT: { + "description": "Path parameter or request body failed schema validation.", + }, + status.HTTP_500_INTERNAL_SERVER_ERROR: { + "model": ErrorResponse, + "description": "Held Procedure is missing its pinned resolved steps (corruption).", + }, + }, + summary="Resume a held Procedure and replay its pinned step-list tail (Held -> Running)", +) +async def post_procedures_reconduct( + procedure_id: Annotated[UUID, Path(description="Target procedure's id.")], + body: ReconductProcedureRequest, + handler: Annotated[Handler, Depends(_get_handler)], + cid: Annotated[UUID, Depends(get_correlation_id)], + principal_id: Annotated[UUID, Depends(get_principal_id)], + surface_id: Annotated[UUID, Depends(get_surface_id)], +) -> ReconductProcedureResponse: + """Resume + replay a Held Procedure. Replay outcomes land in the body.""" + command = ReconductProcedure( + procedure_id=procedure_id, + re_establishment_boundary=body.re_establishment_boundary, + ) + result = await handler( + command, + principal_id=principal_id, + correlation_id=cid, + surface_id=surface_id, + ) + return result_to_wire(result) diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/tool.py b/apps/api/src/cora/operation/features/reconduct_procedure/tool.py new file mode 100644 index 0000000000..680ee07468 --- /dev/null +++ b/apps/api/src/cora/operation/features/reconduct_procedure/tool.py @@ -0,0 +1,91 @@ +"""MCP tool for the `reconduct_procedure` slice. + +Mirrors the REST route: resumes a Held Procedure and replays its pinned +step-list tail, returning a structured summary. Replay outcomes (a step +failure, an acquisition halt) land in the return value, not raised; the +LLM caller inspects `succeeded` / `acquisition_halt` / `failure`. 
+""" + +from collections.abc import Callable +from typing import Annotated, Any +from uuid import UUID + +from mcp.server.fastmcp import Context, FastMCP +from pydantic import BaseModel, Field + +from cora.infrastructure.mcp_principal import get_mcp_principal_id +from cora.infrastructure.observability import current_correlation_id +from cora.infrastructure.routing import get_mcp_surface_id +from cora.operation.features.reconduct_procedure.command import ReconductProcedure +from cora.operation.features.reconduct_procedure.handler import Handler +from cora.operation.features.reconduct_procedure.route import ( + ReconductProcedureResponse, + result_to_wire, +) + + +class _ToolResult(BaseModel): + """MCP-shape mirror of `ReconductProcedureResponse` for tool-output validation.""" + + procedure_id: UUID + completed_count: int + succeeded: bool + re_establishment_boundary: int + acquisition_halt: bool + failure: dict[str, Any] | None = None + actuation_kind: str | None = None + + +def register(mcp: FastMCP, *, get_handler: Callable[[], Handler]) -> None: + """Register the `reconduct_procedure` tool on the given MCP server.""" + + @mcp.tool( + name="reconduct_procedure", + description=( + "Resume a Held Procedure and replay its pinned step-list tail from " + "re_establishment_boundary: re-drive setpoints, re-run checks, and " + "HALT for an operator decision at an acquisition. On a clean tail " + "the Procedure auto-completes; on an acquisition halt it stays " + "Running (acquisition_halt=True); on a genuine step failure it " + "aborts. Returns a structured summary; outcomes DO NOT raise. " + "Requires the Procedure to be Held (and, for a Phase-of-Run " + "Procedure, its parent Run not Held)." 
+ ), + ) + async def reconduct_procedure_tool( # pyright: ignore[reportUnusedFunction] + ctx: Context[Any, Any, Any], + procedure_id: Annotated[ + UUID, + Field(description="Target procedure's id."), + ], + re_establishment_boundary: Annotated[ + int, + Field( + ge=0, + description=( + "Step-list index the resume re-drives setpoints / re-runs " + "checks from (>= 0; 0 = from the first step)." + ), + ), + ], + ) -> _ToolResult: + handler = get_handler() + result = await handler( + ReconductProcedure( + procedure_id=procedure_id, + re_establishment_boundary=re_establishment_boundary, + ), + principal_id=get_mcp_principal_id(ctx), + correlation_id=current_correlation_id(), + surface_id=get_mcp_surface_id(), + ) + wire: ReconductProcedureResponse = result_to_wire(result) + return _ToolResult( + procedure_id=wire.procedure_id, + completed_count=wire.completed_count, + succeeded=wire.succeeded, + re_establishment_boundary=wire.re_establishment_boundary, + acquisition_halt=wire.acquisition_halt, + failure=wire.failure.model_dump() if wire.failure is not None else None, + actuation_kind=wire.actuation_kind, + ) diff --git a/apps/api/src/cora/operation/features/resume_procedure/__init__.py b/apps/api/src/cora/operation/features/resume_procedure/__init__.py new file mode 100644 index 0000000000..421023b376 --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/__init__.py @@ -0,0 +1,23 @@ +"""Vertical slice for the `ResumeProcedure` command. + +from cora.operation.features import resume_procedure + +cmd = resume_procedure.ResumeProcedure(procedure_id=..., re_establishment_boundary=0) +handler = resume_procedure.bind(deps) +await handler(cmd, principal_id=..., correlation_id=...) 
+""" + +from cora.operation.features.resume_procedure import tool +from cora.operation.features.resume_procedure.command import ResumeProcedure +from cora.operation.features.resume_procedure.decider import decide +from cora.operation.features.resume_procedure.handler import Handler, bind +from cora.operation.features.resume_procedure.route import router + +__all__ = [ + "Handler", + "ResumeProcedure", + "bind", + "decide", + "router", + "tool", +] diff --git a/apps/api/src/cora/operation/features/resume_procedure/command.py b/apps/api/src/cora/operation/features/resume_procedure/command.py new file mode 100644 index 0000000000..a65c713911 --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/command.py @@ -0,0 +1,27 @@ +"""The `ResumeProcedure` command -- intent dataclass for this slice. + +Single-source resume transition: `Held -> Running`. The inverse of +hold_procedure. Carries `re_establishment_boundary`: the index in the +pinned resolved step list from which a resume re-drives setpoints and +re-runs checks (Tier 1 of [[project_resumable_conduct_design]]). It is +NOT a continuity proof; it is the re-establishment boundary the +Conductor's `execute_from` replays from. + +`decided_by_decision_id` mirrors `ResumeRun`: optional Decision-causation +link. The operator-facing route leaves it None; an in-process agent +runtime sets it to link an autonomous, safety-gated resume to its +Decision. NO existence check at the decider per the cross-BC +eventual-consistency stance. 
+""" + +from dataclasses import dataclass +from uuid import UUID + + +@dataclass(frozen=True) +class ResumeProcedure: + """Resume a held Procedure conduct (Held -> Running).""" + + procedure_id: UUID + re_establishment_boundary: int + decided_by_decision_id: UUID | None = None diff --git a/apps/api/src/cora/operation/features/resume_procedure/decider.py b/apps/api/src/cora/operation/features/resume_procedure/decider.py new file mode 100644 index 0000000000..f958845b6d --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/decider.py @@ -0,0 +1,72 @@ +"""Pure decider for the `ResumeProcedure` command. + +Single-source resume transition: `Held -> Running`. The inverse of +hold (which requires `Running`). Resuming an already-`Running` Procedure +raises (strict-not-idempotent); resuming a `Defined` or terminal +Procedure raises. Mirrors `resume_run`. + +Off-diagonal guard: a Held Procedure whose parent Run is itself `Held` +cannot resume to `Running` and walk real setpoints while the Run is +paused. The decider takes a `parent_run_held` fact the handler derives +from a one-directional Operation -> Run read (tach-legal); there is NO +cascade from Run-resume into Procedure-resume (that is a Layer-3 saga, +deferred). `parent_run_held` defaults False, which is correct for a +standalone Procedure (no parent Run). See +[[project_resumable_conduct_design]]. + +Invariants: + - State must not be None -> ProcedureNotFoundError + - command.re_establishment_boundary must be >= 0 + -> InvalidProcedureReEstablishmentBoundaryError + - State.status must be in {Held} + -> ProcedureCannotResumeError(current_status=...) 
+ - parent_run_held must be False + -> ProcedureCannotResumeError(parent_run_held=True) +""" + +from datetime import datetime + +from cora.operation.aggregates.procedure import ( + InvalidProcedureReEstablishmentBoundaryError, + Procedure, + ProcedureCannotResumeError, + ProcedureNotFoundError, + ProcedureResumed, + ProcedureStatus, +) +from cora.operation.features.resume_procedure.command import ResumeProcedure + +_RESUMABLE_STATUSES: tuple[ProcedureStatus, ...] = (ProcedureStatus.HELD,) + + +def decide( + state: Procedure | None, + command: ResumeProcedure, + *, + parent_run_held: bool = False, + now: datetime, +) -> list[ProcedureResumed]: + """Decide the events produced by resuming a held Procedure. + + `parent_run_held` is the handler-derived fact that this Procedure's + parent Run is currently `Held`; standalone Procedures (no parent Run) + pass the default False. + """ + if state is None: + raise ProcedureNotFoundError(command.procedure_id) + if command.re_establishment_boundary < 0: + raise InvalidProcedureReEstablishmentBoundaryError(command.re_establishment_boundary) + if state.status not in _RESUMABLE_STATUSES: + raise ProcedureCannotResumeError(state.id, current_status=state.status) + if parent_run_held: + raise ProcedureCannotResumeError( + state.id, current_status=state.status, parent_run_held=True + ) + return [ + ProcedureResumed( + procedure_id=state.id, + re_establishment_boundary=command.re_establishment_boundary, + decided_by_decision_id=command.decided_by_decision_id, + occurred_at=now, + ) + ] diff --git a/apps/api/src/cora/operation/features/resume_procedure/handler.py b/apps/api/src/cora/operation/features/resume_procedure/handler.py new file mode 100644 index 0000000000..725eaea411 --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/handler.py @@ -0,0 +1,155 @@ +"""Application handler for the `resume_procedure` slice. 
+ +Update-style handler with a custom body (NOT the update-handler +factory): resume reads the parent Run to enforce the off-diagonal guard +(a Held Procedure cannot resume while its parent Run is itself Held). +The factory at `cora.infrastructure.update_handler` loads exactly one +event-store stream; this slice loads a second (the parent Run), so it +stays longhand -- same reason `start_procedure`'s handler is custom. + +## Off-diagonal guard + +For a Phase-of-Run Procedure (`parent_run_id` set), the handler loads +the parent Run and passes `parent_run_held = (Run.status == Held)` into +the pure decider, which refuses with `ProcedureCannotResumeError` so a +Procedure cannot resume to Running and walk real setpoints while the Run +is paused. This is a one-directional Operation -> Run read +(tach-legal); there is NO cascade from Run-resume into Procedure-resume +(a Layer-3 saga, deferred). Standalone Procedures (no parent_run_id) +skip the load and pass `parent_run_held=False`. +""" + +from typing import Protocol +from uuid import UUID + +from cora.infrastructure.event_envelope import to_new_event +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.logging import get_logger +from cora.infrastructure.ports import Deny +from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation.aggregates.procedure import ( + ProcedureNotFoundError, + event_type_name, + fold, + from_stored, + to_payload, +) +from cora.operation.errors import UnauthorizedError +from cora.operation.features.resume_procedure.command import ResumeProcedure +from cora.operation.features.resume_procedure.decider import decide +from cora.run.aggregates.run import RunNotFoundError, RunStatus, load_run + +_STREAM_TYPE = "Procedure" +_COMMAND_NAME = "ResumeProcedure" + +_log = get_logger(__name__) + + +class Handler(Protocol): + """Callable interface every resume_procedure handler implements.""" + + async def __call__( + self, + command: ResumeProcedure, + *, + 
principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> None: ... + + +def bind(deps: Kernel) -> Handler: + """Build a resume_procedure handler closed over the shared deps.""" + + async def handler( + command: ResumeProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> None: + _log.info( + "resume_procedure.start", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + principal_id=str(principal_id), + correlation_id=str(correlation_id), + causation_id=str(causation_id) if causation_id is not None else None, + ) + + decision = await deps.authz.authorize( + principal_id=principal_id, + command_name=_COMMAND_NAME, + conduit_id=NIL_SENTINEL_ID, + surface_id=surface_id, + ) + if isinstance(decision, Deny): + _log.info( + "resume_procedure.denied", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + principal_id=str(principal_id), + correlation_id=str(correlation_id), + reason=decision.reason, + ) + raise UnauthorizedError(decision.reason) + + stored, version = await deps.event_store.load(_STREAM_TYPE, command.procedure_id) + state = fold([from_stored(s) for s in stored]) + if state is None: + raise ProcedureNotFoundError(command.procedure_id) + + # Off-diagonal guard: a Phase-of-Run Procedure cannot resume while + # its parent Run is Held. One-directional Operation -> Run read; a + # missing parent Run in the chain is corruption, so raise rather + # than silently skip the guard (mirrors start_procedure). Standalone + # Procedures (no parent_run_id) pass parent_run_held=False. 
+ parent_run_held = False + if state.parent_run_id is not None: + parent_run = await load_run(deps.event_store, state.parent_run_id) + if parent_run is None: + raise RunNotFoundError(state.parent_run_id) + parent_run_held = parent_run.status == RunStatus.HELD + + now = deps.clock.now() + domain_events = decide( + state=state, + command=command, + parent_run_held=parent_run_held, + now=now, + ) + + new_events = [ + to_new_event( + event_type=event_type_name(event), + payload=to_payload(event), + occurred_at=event.occurred_at, + event_id=deps.id_generator.new_id(), + command_name=_COMMAND_NAME, + correlation_id=correlation_id, + causation_id=causation_id, + principal_id=principal_id, + ) + for event in domain_events + ] + await deps.event_store.append( + stream_type=_STREAM_TYPE, + stream_id=command.procedure_id, + expected_version=version, + events=new_events, + ) + + _log.info( + "resume_procedure.success", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + parent_run_held=parent_run_held, + principal_id=str(principal_id), + correlation_id=str(correlation_id), + causation_id=str(causation_id) if causation_id is not None else None, + event_count=len(new_events), + ) + + return handler diff --git a/apps/api/src/cora/operation/features/resume_procedure/route.py b/apps/api/src/cora/operation/features/resume_procedure/route.py new file mode 100644 index 0000000000..08e977fa45 --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/route.py @@ -0,0 +1,92 @@ +"""HTTP route for the `resume_procedure` slice. + +Action endpoint at `POST /procedures/{procedure_id}/resume`. Body carries +`re_establishment_boundary` (>= 0). 204 No Content on success. 
+""" + +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, Depends, Path, Request, status +from pydantic import BaseModel, Field + +from cora.infrastructure.routing import ( + ErrorResponse, + get_correlation_id, + get_principal_id, + get_surface_id, +) +from cora.operation.features.resume_procedure.command import ResumeProcedure +from cora.operation.features.resume_procedure.handler import Handler + + +class ResumeProcedureRequest(BaseModel): + """Body for `POST /procedures/{procedure_id}/resume`.""" + + re_establishment_boundary: int = Field( + ..., + ge=0, + description=( + "Index in the pinned resolved step list from which the resume " + "re-drives setpoints and re-runs checks. >= 0 (0 = re-establish " + "from the first step). NOT a continuity proof." + ), + ) + + +def _get_handler(request: Request) -> Handler: + handler: Handler = request.app.state.operation.resume_procedure + return handler + + +router = APIRouter(tags=["operation"]) + + +@router.post( + "/procedures/{procedure_id}/resume", + status_code=status.HTTP_204_NO_CONTENT, + responses={ + status.HTTP_400_BAD_REQUEST: { + "model": ErrorResponse, + "description": "Domain invariant violated: negative re_establishment_boundary.", + }, + status.HTTP_403_FORBIDDEN: { + "model": ErrorResponse, + "description": "Authorize port denied the command.", + }, + status.HTTP_404_NOT_FOUND: { + "model": ErrorResponse, + "description": "No procedure exists with the given id.", + }, + status.HTTP_409_CONFLICT: { + "model": ErrorResponse, + "description": ( + "Procedure is not in `Held` status (resume requires `Held`; " + "resuming a `Running` / `Defined` / terminal procedure raises), " + "OR a concurrent write to the same procedure stream conflicted " + "(optimistic concurrency)." 
+ ), + }, + status.HTTP_422_UNPROCESSABLE_CONTENT: { + "description": "Path parameter or request body failed schema validation.", + }, + }, + summary="Resume a held Procedure conduct (Held -> Running)", +) +async def post_procedures_resume( + procedure_id: Annotated[UUID, Path(description="Target procedure's id.")], + body: ResumeProcedureRequest, + handler: Annotated[Handler, Depends(_get_handler)], + cid: Annotated[UUID, Depends(get_correlation_id)], + principal_id: Annotated[UUID, Depends(get_principal_id)], + surface_id: Annotated[UUID, Depends(get_surface_id)], +) -> None: + await handler( + ResumeProcedure( + procedure_id=procedure_id, + re_establishment_boundary=body.re_establishment_boundary, + ), + principal_id=principal_id, + correlation_id=cid, + surface_id=surface_id, + ) diff --git a/apps/api/src/cora/operation/features/resume_procedure/tool.py b/apps/api/src/cora/operation/features/resume_procedure/tool.py new file mode 100644 index 0000000000..94a4acc553 --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/tool.py @@ -0,0 +1,56 @@ +"""MCP tool for the `resume_procedure` slice.""" + +from collections.abc import Callable +from typing import Annotated, Any +from uuid import UUID + +from mcp.server.fastmcp import Context, FastMCP +from pydantic import Field + +from cora.infrastructure.mcp_principal import get_mcp_principal_id +from cora.infrastructure.observability import current_correlation_id +from cora.infrastructure.routing import get_mcp_surface_id +from cora.operation.features.resume_procedure.command import ResumeProcedure +from cora.operation.features.resume_procedure.handler import Handler + + +def register(mcp: FastMCP, *, get_handler: Callable[[], Handler]) -> None: + """Register the `resume_procedure` tool on the given MCP server.""" + + @mcp.tool( + name="resume_procedure", + description=( + "Resume a held Procedure conduct (Held -> Running). The inverse of " + "hold_procedure. 
Requires the Procedure to currently be in `Held`. " + "Resuming a `Running` / `Defined` / terminal Procedure raises. " + "re_establishment_boundary (>= 0) is the step-list index the resume " + "re-drives setpoints / re-runs checks from." + ), + ) + async def resume_procedure_tool( # pyright: ignore[reportUnusedFunction] + ctx: Context[Any, Any, Any], + procedure_id: Annotated[ + UUID, + Field(description="Target procedure's id."), + ], + re_establishment_boundary: Annotated[ + int, + Field( + ge=0, + description=( + "Index in the pinned resolved step list the resume re-drives " + "setpoints / re-runs checks from (>= 0; 0 = from the first step)." + ), + ), + ], + ) -> None: + handler = get_handler() + await handler( + ResumeProcedure( + procedure_id=procedure_id, + re_establishment_boundary=re_establishment_boundary, + ), + principal_id=get_mcp_principal_id(ctx), + correlation_id=current_correlation_id(), + surface_id=get_mcp_surface_id(), + ) diff --git a/apps/api/src/cora/operation/features/truncate_procedure/decider.py b/apps/api/src/cora/operation/features/truncate_procedure/decider.py index b446ab9936..93d756eff4 100644 --- a/apps/api/src/cora/operation/features/truncate_procedure/decider.py +++ b/apps/api/src/cora/operation/features/truncate_procedure/decider.py @@ -1,9 +1,10 @@ """Pure decider for the `TruncateProcedure` command. -Single-source partial-data terminal: `Running -> Truncated`. Source -set is just `Running` today (Held / Resumed deferred per pilot need; -if Held lands, this source set widens to `Running | Held` to mirror -Run BC's `truncate_run` precedent). +Multi-source partial-data terminal: `Running | Held -> Truncated`. +`Held` was added when resumable conduct landed +([[project_resumable_conduct_design]] Tier 1); truncate widened to +accept it so a paused-then-de-facto-dead Procedure can be closed +retroactively. Mirrors Run BC's `truncate_run` (`Running | Held`). 
Truncating any terminal (Completed | Aborted | Truncated) raises; re-truncating a `Truncated` Procedure raises (strict-not-idempotent). @@ -29,7 +30,7 @@ -> InvalidProcedureTruncateReasonError - command.interrupted_at, when set, must not be in the future -> InvalidProcedureInterruptedAtError - - State.status must be in {Running} + - State.status must be in {Running, Held} -> ProcedureCannotTruncateError(current_status=...) """ @@ -46,7 +47,10 @@ ) from cora.operation.features.truncate_procedure.command import TruncateProcedure -_TRUNCATABLE_STATUSES: tuple[ProcedureStatus, ...] = (ProcedureStatus.RUNNING,) +_TRUNCATABLE_STATUSES: tuple[ProcedureStatus, ...] = ( + ProcedureStatus.RUNNING, + ProcedureStatus.HELD, +) def decide( diff --git a/apps/api/src/cora/operation/features/try_conduct_procedure/__init__.py b/apps/api/src/cora/operation/features/try_conduct_procedure/__init__.py new file mode 100644 index 0000000000..fdcb64653c --- /dev/null +++ b/apps/api/src/cora/operation/features/try_conduct_procedure/__init__.py @@ -0,0 +1,39 @@ +"""Vertical slice for the `TryConductProcedure` command. + +Pause-capable conduct: the conduct verb-family's third member (conduct = +run-to-terminal, reconduct = resume-and-replay, try-conduct = +pause-to-Held-on-recoverable-failure). Hands control to the `Conductor` +runtime which, on a recoverable step failure, pauses the Procedure to `Held` +instead of aborting it, so an operator can `reconduct` from the pinned +resolved steps. Returns a structured `TryConductProcedureResult` whose `held` +flag distinguishes a paused (resumable) outcome from a terminal one. + + from cora.operation.features import try_conduct_procedure + + cmd = try_conduct_procedure.TryConductProcedure(procedure_id=..., steps=(...)) + handler = try_conduct_procedure.bind(deps, conductor=conductor, expansion_port=...) + result = await handler(cmd, principal_id=..., correlation_id=...) 
+""" + +from cora.operation.features.try_conduct_procedure import tool +from cora.operation.features.try_conduct_procedure.command import ( + TryConductProcedure, + TryConductProcedureResult, +) +from cora.operation.features.try_conduct_procedure.handler import Handler, bind +from cora.operation.features.try_conduct_procedure.route import ( + TryConductProcedureRequest, + TryConductProcedureResponse, + router, +) + +__all__ = [ + "Handler", + "TryConductProcedure", + "TryConductProcedureRequest", + "TryConductProcedureResponse", + "TryConductProcedureResult", + "bind", + "router", + "tool", +] diff --git a/apps/api/src/cora/operation/features/try_conduct_procedure/command.py b/apps/api/src/cora/operation/features/try_conduct_procedure/command.py new file mode 100644 index 0000000000..07ea1d1563 --- /dev/null +++ b/apps/api/src/cora/operation/features/try_conduct_procedure/command.py @@ -0,0 +1,47 @@ +"""The `TryConductProcedure` command -- pause-capable conduct entry point. + +Like `ConductProcedure`, hands control to the `Conductor` runtime; the one +difference is the failure posture. On a RECOVERABLE step failure (a setpoint +or check: re-drivable / re-runnable on resume) the Conductor PAUSES the +Procedure to `Held` instead of aborting it, so the operator can fix the cause +and `reconduct` from the pinned resolved steps. A NON-recoverable failure (an +action: an interrupted acquisition), a lifecycle failure, and a mid-execute +cancellation keep `conduct`'s abort posture. + +`steps` is the caller-supplied sequence the Conductor walks (same wire shape +as `ConductProcedure`). 
+""" + +from collections.abc import Sequence +from dataclasses import dataclass +from uuid import UUID + +from cora.operation.conductor import ConductorFailure, Step + + +@dataclass(frozen=True) +class TryConductProcedure: + """Conduct a Procedure, pausing to Held on a recoverable step failure.""" + + procedure_id: UUID + steps: Sequence[Step] + + +@dataclass(frozen=True) +class TryConductProcedureResult: + """Summary of a `TryConductProcedure` invocation. + + Mirrors `ConductProcedureResult` plus `held`: True iff a recoverable step + failure paused the Procedure to `Held` AND the pause transition itself + succeeded. `held` is what distinguishes a resumable outcome from a + terminal `Aborted` one: both carry `succeeded=False` + `failure`, but only + a `held` Procedure can be `reconduct`-ed. A `held` Procedure whose hold + transition failed (left Running) reports `held=False`. + """ + + procedure_id: UUID + completed_count: int + succeeded: bool + held: bool = False + failure: ConductorFailure | None = None + actuation_kind: str | None = None diff --git a/apps/api/src/cora/operation/features/try_conduct_procedure/handler.py b/apps/api/src/cora/operation/features/try_conduct_procedure/handler.py new file mode 100644 index 0000000000..c4558404d6 --- /dev/null +++ b/apps/api/src/cora/operation/features/try_conduct_procedure/handler.py @@ -0,0 +1,166 @@ +"""Application handler for the `try_conduct_procedure` slice. + +Pause-capable conduct. A thin orchestrator that delegates to +`Conductor.try_conduct()` (the pause-to-Held twin of `Conductor.conduct()`): +on a recoverable step failure the Conductor pauses the Procedure to `Held` +rather than aborting it, so the operator can `reconduct` from the pinned +resolved steps. This is the Tier-1 producer that makes a Held + pinned-steps +state reachable so the `reconduct` resume path has something to resume. 
+ +Shares the pre-Conductor pipeline (recipe re-expansion + pseudoaxis + +resolved-steps pin) with `conduct_procedure` via the BC-level +`resolve_and_pin_conduct_steps`, and the HTTP/MCP wire shapes via +`_conduct_wire`. It imports NO sibling slice: the cross-slice-independence +fitness forbids that, and the shared seams live outside `features/`. + +## Why no `_decider` + +Like `conduct_procedure`, records no new events directly: the wrapped +start / append / complete / abort / hold handlers (on the Conductor) write. +An orchestration entry point, not an aggregate-state-mutating decider. + +## Authorization scope + +`TryConductProcedure` is authz-checked as its own command. The wrapped +transition handlers each authz internally with their OWN command names; an +operator authorized to call `TryConductProcedure` is NOT automatically +authorized for those individually. Same layering as `conduct_procedure`. +""" + +from typing import Protocol +from uuid import UUID + +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.logging import get_logger +from cora.infrastructure.ports import Deny +from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation._conduct_preparation import resolve_and_pin_conduct_steps +from cora.operation.aggregates.procedure import ( + ProcedureNotFoundError, + load_procedure_with_events, +) +from cora.operation.conductor import Conductor +from cora.operation.errors import UnauthorizedError +from cora.operation.features.try_conduct_procedure.command import ( + TryConductProcedure, + TryConductProcedureResult, +) +from cora.operation.ports.recipe_expander import RecipeExpander + +_COMMAND_NAME = "TryConductProcedure" + +_log = get_logger(__name__) + + +class Handler(Protocol): + """Callable interface every try_conduct_procedure handler implements.""" + + async def __call__( + self, + command: TryConductProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: 
UUID = NIL_SENTINEL_ID, + ) -> TryConductProcedureResult: ... + + +def bind( + deps: Kernel, + *, + conductor: Conductor, + expansion_port: RecipeExpander, +) -> Handler: + """Build a try_conduct_procedure handler closed over deps + Conductor + port. + + `conductor` is the same BC-internal Conductor `conduct_procedure` uses; it + carries the start / complete / abort / hold handlers (wired at app + composition) that `Conductor.try_conduct` composes. `expansion_port` is + the same instance wired for `register_procedure_from_recipe` + conduct. + """ + + async def handler( + command: TryConductProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> TryConductProcedureResult: + _log.info( + "try_conduct_procedure.start", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + step_count=len(command.steps), + principal_id=str(principal_id), + correlation_id=str(correlation_id), + causation_id=str(causation_id) if causation_id is not None else None, + ) + + authz = await deps.authz.authorize( + principal_id=principal_id, + command_name=_COMMAND_NAME, + conduit_id=NIL_SENTINEL_ID, + surface_id=surface_id, + ) + if isinstance(authz, Deny): + _log.info( + "try_conduct_procedure.denied", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + principal_id=str(principal_id), + correlation_id=str(correlation_id), + reason=authz.reason, + ) + raise UnauthorizedError(authz.reason) + + procedure, stored_events = await load_procedure_with_events( + deps.event_store, command.procedure_id + ) + if procedure is None: + raise ProcedureNotFoundError(command.procedure_id) + + steps = await resolve_and_pin_conduct_steps( + deps, + command_name=_COMMAND_NAME, + procedure=procedure, + stored_events=stored_events, + caller_steps=command.steps, + expansion_port=expansion_port, + principal_id=principal_id, + correlation_id=correlation_id, + causation_id=causation_id, + ) + 
+ result = await conductor.try_conduct( + procedure_id=command.procedure_id, + principal_id=principal_id, + correlation_id=correlation_id, + causation_id=causation_id, + surface_id=surface_id, + steps=steps, + ) + + _log.info( + "try_conduct_procedure.success", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + completed_count=result.completed_count, + succeeded=result.succeeded, + held=result.held, + failure_class=(result.failure.error_class if result.failure is not None else None), + ) + + return TryConductProcedureResult( + procedure_id=result.procedure_id, + completed_count=result.completed_count, + succeeded=result.succeeded, + held=result.held, + failure=result.failure, + actuation_kind=( + result.actuation_kind.value if result.actuation_kind is not None else None + ), + ) + + return handler diff --git a/apps/api/src/cora/operation/features/try_conduct_procedure/route.py b/apps/api/src/cora/operation/features/try_conduct_procedure/route.py new file mode 100644 index 0000000000..ed7c14319c --- /dev/null +++ b/apps/api/src/cora/operation/features/try_conduct_procedure/route.py @@ -0,0 +1,151 @@ +"""HTTP route for the `try_conduct_procedure` slice. + +`POST /procedures/{procedure_id}/try-conduct` accepts the same step-list body +as conduct, but on a RECOVERABLE step failure (a setpoint / check) the +Procedure is PAUSED to `Held` (resumable via `reconduct`) instead of aborted. + +## Response code: always 200, failures in body + +Like `conduct`, this is an orchestration endpoint: step-level failures + the +pause-to-Held outcome are NORMAL operational results that land in the response +body, not HTTP 4xx / 5xx. `held` distinguishes a paused (resumable) outcome +from a terminal `Aborted` one (both carry `succeeded=False` + `failure`). +Only true protocol / auth / validation faults map to HTTP error codes (422 +for malformed JSON, 403 for authz deny). 
+ +## Pydantic wire types + +The shared step-list body + per-step failure shape live in the BC-level +`cora.operation._conduct_wire` module (shared with `conduct_procedure`). This +slice owns only the try-conduct-specific request/response envelope, which adds +the `held` discriminator. +""" + +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, Depends, Path, Request, status +from pydantic import BaseModel, Field + +from cora.infrastructure.routing import ( + ErrorResponse, + get_correlation_id, + get_principal_id, + get_surface_id, +) +from cora.operation._conduct_wire import ( + STEP_BATCH_MAX, + ConductorFailureResponse, + StepRequest, + failure_to_wire, + step_from_wire, +) +from cora.operation.features.try_conduct_procedure.command import ( + TryConductProcedure, + TryConductProcedureResult, +) +from cora.operation.features.try_conduct_procedure.handler import Handler + + +class TryConductProcedureRequest(BaseModel): + """Body for `POST /procedures/{procedure_id}/try-conduct`.""" + + steps: list[StepRequest] = Field( + default_factory=list[StepRequest], + max_length=STEP_BATCH_MAX, + description=( + f"Steps the Conductor walks in order (0-{STEP_BATCH_MAX}). " + "Empty list is valid: start + complete fire with no steps." + ), + ) + + model_config = {"extra": "forbid"} + + +class TryConductProcedureResponse(BaseModel): + """Response body for the try_conduct_procedure slice. + + `succeeded` is the canonical pass/fail bit; `failure` is non-null iff + `succeeded` is False. `held` is True iff a recoverable step failure paused + the Procedure to `Held` (resumable via `reconduct`); a terminal `Aborted` + outcome carries `succeeded=False` + `failure` + `held=False`. + + `actuation_kind` is the raw `ActuationKind` value the Conductor observed, + or None when nothing instrumented was actuated. Read-only operator + visibility; the gate that consumes it reads the value server-side off the + Procedure stream, never back from this response. 
+ """ + + procedure_id: UUID + completed_count: int + succeeded: bool + held: bool = False + failure: ConductorFailureResponse | None = None + actuation_kind: str | None = None + + +def result_to_wire(result: TryConductProcedureResult) -> TryConductProcedureResponse: + """Build a `TryConductProcedureResponse` from the slice's result. + + Public because `tool.py` calls it too. + """ + return TryConductProcedureResponse( + procedure_id=result.procedure_id, + completed_count=result.completed_count, + succeeded=result.succeeded, + held=result.held, + failure=failure_to_wire(result.failure) if result.failure is not None else None, + actuation_kind=result.actuation_kind, + ) + + +def _get_handler(request: Request) -> Handler: + handler: Handler = request.app.state.operation.try_conduct_procedure + return handler + + +router = APIRouter(tags=["operation"]) + + +@router.post( + "/procedures/{procedure_id}/try-conduct", + status_code=status.HTTP_200_OK, + response_model=TryConductProcedureResponse, + responses={ + status.HTTP_403_FORBIDDEN: { + "model": ErrorResponse, + "description": "Authorize port denied the command.", + }, + status.HTTP_422_UNPROCESSABLE_CONTENT: { + "description": ( + "Request body failed schema validation: unknown step kind, " + "missing required field, batch over cap, invalid criterion shape." + ), + }, + }, + summary=( + "Conduct a Procedure, pausing to Held on a recoverable failure: " + "start -> walk steps -> complete (success) / pause to Held " + "(recoverable setpoint or check failure) / abort (acquisition failure)." 
+ ), +) +async def post_procedures_try_conduct( + procedure_id: Annotated[UUID, Path(description="Target procedure's id.")], + body: TryConductProcedureRequest, + handler: Annotated[Handler, Depends(_get_handler)], + cid: Annotated[UUID, Depends(get_correlation_id)], + principal_id: Annotated[UUID, Depends(get_principal_id)], + surface_id: Annotated[UUID, Depends(get_surface_id)], +) -> TryConductProcedureResponse: + """Conduct a Procedure, pausing to Held on a recoverable failure.""" + command = TryConductProcedure( + procedure_id=procedure_id, + steps=tuple(step_from_wire(s) for s in body.steps), + ) + result = await handler( + command, + principal_id=principal_id, + correlation_id=cid, + surface_id=surface_id, + ) + return result_to_wire(result) diff --git a/apps/api/src/cora/operation/features/try_conduct_procedure/tool.py b/apps/api/src/cora/operation/features/try_conduct_procedure/tool.py new file mode 100644 index 0000000000..a9f1da08eb --- /dev/null +++ b/apps/api/src/cora/operation/features/try_conduct_procedure/tool.py @@ -0,0 +1,86 @@ +"""MCP tool for the `try_conduct_procedure` slice. + +Mirrors the REST route: accepts a discriminated step list, returns a +structured summary. On a recoverable step failure the Procedure is PAUSED to +`Held` (resumable) instead of aborted; `held` in the return value flags that. +Failures land in the return value (not raised); the LLM caller inspects +`succeeded` + `held` + `failure` to decide reconduct / abort / escalation. 
+""" + +from collections.abc import Callable +from typing import Annotated, Any +from uuid import UUID + +from mcp.server.fastmcp import Context, FastMCP +from pydantic import BaseModel, Field + +from cora.infrastructure.mcp_principal import get_mcp_principal_id +from cora.infrastructure.observability import current_correlation_id +from cora.infrastructure.routing import get_mcp_surface_id +from cora.operation._conduct_wire import step_from_wire +from cora.operation.features.try_conduct_procedure.command import TryConductProcedure +from cora.operation.features.try_conduct_procedure.handler import Handler +from cora.operation.features.try_conduct_procedure.route import ( + TryConductProcedureRequest, + TryConductProcedureResponse, + result_to_wire, +) + + +class _ToolResult(BaseModel): + """MCP-shape mirror of `TryConductProcedureResponse` for tool-output validation.""" + + procedure_id: UUID + completed_count: int + succeeded: bool + held: bool = False + failure: dict[str, Any] | None = None + actuation_kind: str | None = None + + +def register(mcp: FastMCP, *, get_handler: Callable[[], Handler]) -> None: + """Register the `try_conduct_procedure` tool on the given MCP server.""" + + @mcp.tool( + name="try_conduct_procedure", + description=( + "Conduct an existing Procedure end-to-end like conduct_procedure, " + "but on a RECOVERABLE step failure (a setpoint write or read-back " + "check) PAUSE the Procedure to Held (resumable via " + "reconduct_procedure) instead of aborting it. An acquisition " + "(action) failure still aborts. Returns a structured summary; " + "`held` is True when the Procedure was paused (resumable). " + "Failures DO NOT raise." 
+ ), + ) + async def try_conduct_procedure_tool( # pyright: ignore[reportUnusedFunction] + ctx: Context[Any, Any, Any], + procedure_id: Annotated[ + UUID, + Field(description="Target procedure's id."), + ], + body: Annotated[ + TryConductProcedureRequest, + Field(description="Step list the Conductor walks in order."), + ], + ) -> _ToolResult: + handler = get_handler() + command = TryConductProcedure( + procedure_id=procedure_id, + steps=tuple(step_from_wire(s) for s in body.steps), + ) + result = await handler( + command, + principal_id=get_mcp_principal_id(ctx), + correlation_id=current_correlation_id(), + surface_id=get_mcp_surface_id(), + ) + wire: TryConductProcedureResponse = result_to_wire(result) + return _ToolResult( + procedure_id=wire.procedure_id, + completed_count=wire.completed_count, + succeeded=wire.succeeded, + held=wire.held, + failure=wire.failure.model_dump() if wire.failure is not None else None, + actuation_kind=wire.actuation_kind, + ) diff --git a/apps/api/src/cora/operation/projections/procedure.py b/apps/api/src/cora/operation/projections/procedure.py index 78b8e4ed66..fd2700243a 100644 --- a/apps/api/src/cora/operation/projections/procedure.py +++ b/apps/api/src/cora/operation/projections/procedure.py @@ -12,6 +12,11 @@ - ProcedureTruncated -> UPDATE status='Truncated' + status-change ts + last_status_reason + interrupted_at + - ProcedureHeld -> UPDATE status='Held' + status-change ts + + last_status_reason + - ProcedureResumed -> UPDATE status='Running' + status-change ts + (clears last_status_reason: + Running is not reason-bearing) - ProcedureActivitiesLogbookOpened -> UPDATE activity_logbook_id (status NOT touched; logbook is orthogonal to lifecycle) @@ -26,16 +31,19 @@ ordered per-stream delivery; equals the count because the start decider enforces strict-successor indexing). 
-The 4 status-change UPDATEs share the same SQL shape (status literal + -status-change timestamp + optional reason); per-event arms differ only -in which status string + which payload fields they pull. A future -parameterized `_UPDATE_STATUS_SQL` hoist (mirroring proj_supply_summary's -later cleanup) becomes worthwhile when a 5th status-change arm -lands -- today the 4 arms keep the dispatch readable. +The 6 status-change UPDATEs (Started / Completed / Aborted / Truncated / +Held / Resumed) keep per-event SQL constants rather than a parameterized +`_UPDATE_STATUS_SQL`. The "hoist at the 5th arm" note from the 4-arm era +was re-evaluated when Held/Resumed landed: the arms are NOT uniform +(Truncated also sets interrupted_at, Resumed CLEARS last_status_reason +rather than setting it), so a single parameterized SQL would need +conditional columns and read worse than the explicit constants. Revisit +only if a future arm restores uniformity. -All branches idempotent. The CHECK constraint on `status` is locked -with the full enum values day one (5 statuses) so no future migration -is needed even if Held/Resumed land later. +All branches idempotent. The status CHECK was widened to admit 'Held' in +migration `20260621060000_proc_summary_status_admit_held` (Resumed maps +back to 'Running', so 'Held' is the only new persisted value). See +[[project_resumable_conduct_design]]. 
""" # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false @@ -91,6 +99,24 @@ WHERE procedure_id = $1 """ +_UPDATE_HELD_SQL = """ +UPDATE proj_operation_procedure_summary +SET status = 'Held', + last_status_changed_at = $2, + last_status_reason = $3, + updated_at = now() +WHERE procedure_id = $1 +""" + +_UPDATE_RESUMED_SQL = """ +UPDATE proj_operation_procedure_summary +SET status = 'Running', + last_status_changed_at = $2, + last_status_reason = NULL, + updated_at = now() +WHERE procedure_id = $1 +""" + _UPDATE_STEPS_LOGBOOK_OPENED_SQL = """ UPDATE proj_operation_procedure_summary SET activity_logbook_id = $2, @@ -117,6 +143,8 @@ class ProcedureSummaryProjection: "ProcedureCompleted", "ProcedureAborted", "ProcedureTruncated", + "ProcedureHeld", + "ProcedureResumed", "ProcedureActivitiesLogbookOpened", "ProcedureIterationStarted", } @@ -191,6 +219,23 @@ async def apply( ) return + if event.event_type == "ProcedureHeld": + await conn.execute( + _UPDATE_HELD_SQL, + UUID(event.payload["procedure_id"]), + datetime.fromisoformat(event.payload["occurred_at"]), + event.payload["reason"], + ) + return + + if event.event_type == "ProcedureResumed": + await conn.execute( + _UPDATE_RESUMED_SQL, + UUID(event.payload["procedure_id"]), + datetime.fromisoformat(event.payload["occurred_at"]), + ) + return + if event.event_type == "ProcedureActivitiesLogbookOpened": await conn.execute( _UPDATE_STEPS_LOGBOOK_OPENED_SQL, diff --git a/apps/api/src/cora/operation/routes.py b/apps/api/src/cora/operation/routes.py index 1ca6622aa9..9c71460360 100644 --- a/apps/api/src/cora/operation/routes.py +++ b/apps/api/src/cora/operation/routes.py @@ -36,11 +36,13 @@ from cora.operation.aggregates.procedure import ( InvalidProcedureAbortReasonError, + InvalidProcedureHoldReasonError, InvalidProcedureInterruptedAtError, InvalidProcedureIterationCapError, InvalidProcedureIterationEndReasonError, InvalidProcedureKindError, InvalidProcedureNameError, + 
InvalidProcedureReEstablishmentBoundaryError, InvalidProcedureTruncateReasonError, InvalidRecipeBindingsError, InvalidStepKindError, @@ -50,6 +52,8 @@ ProcedureCannotAbortError, ProcedureCannotCompleteError, ProcedureCannotEndIterationError, + ProcedureCannotHoldError, + ProcedureCannotResumeError, ProcedureCannotStartError, ProcedureCannotStartIterationError, ProcedureCannotTruncateError, @@ -70,6 +74,7 @@ RecipeExpansionOverflowError, RecipeExpansionRecordNotFoundError, RecipeExpansionReplayMismatchError, + ResolvedStepsRecordNotFoundError, ) from cora.operation.errors import ( AssetNotPseudoAxisError, @@ -89,13 +94,17 @@ conduct_procedure, end_iteration, get_procedure, + hold_procedure, list_procedure_iterations, list_procedures, + reconduct_procedure, register_procedure, register_procedure_from_recipe, + resume_procedure, start_iteration, start_procedure, truncate_procedure, + try_conduct_procedure, ) @@ -230,6 +239,9 @@ def register_operation_routes(app: FastAPI) -> None: app.include_router(complete_procedure.router) app.include_router(abort_procedure.router) app.include_router(truncate_procedure.router) + app.include_router(hold_procedure.router) + app.include_router(resume_procedure.router) + app.include_router(reconduct_procedure.router) app.include_router(start_iteration.router) app.include_router(end_iteration.router) app.include_router(append_activities.router) @@ -237,14 +249,17 @@ def register_operation_routes(app: FastAPI) -> None: app.include_router(list_procedures.router) app.include_router(list_procedure_iterations.router) app.include_router(conduct_procedure.router) + app.include_router(try_conduct_procedure.router) for validation_cls in ( InvalidProcedureNameError, InvalidProcedureKindError, InvalidProcedureAbortReasonError, + InvalidProcedureHoldReasonError, InvalidProcedureTruncateReasonError, InvalidProcedureIterationEndReasonError, InvalidProcedureIterationCapError, InvalidProcedureInterruptedAtError, + 
InvalidProcedureReEstablishmentBoundaryError, InvalidStepKindError, # Recipe-driven conduct_procedure path: caller-supplied steps with # recipe_id set are rejected up front per the replay-design lock @@ -267,6 +282,10 @@ def register_operation_routes(app: FastAPI) -> None: ProcedureCannotCompleteError, ProcedureCannotAbortError, ProcedureCannotTruncateError, + # resumable-conduct pause/resume guards (Running->Held->Running): + # holding a non-Running procedure, or resuming a non-Held one. + ProcedureCannotHoldError, + ProcedureCannotResumeError, # iteration boundary guards (start/end): not-Running, no/already-open # iteration, and non-sequential / mismatched operator-supplied index. ProcedureCannotStartIterationError, @@ -349,6 +368,9 @@ def register_operation_routes(app: FastAPI) -> None: RecipeExpanderVersionMismatchError, RecipeExpansionRecordNotFoundError, RecipeExpansionReplayMismatchError, + # resumable conduct: a Held Procedure missing its pinned resolved steps + # (corruption); kept out of the reconduct failures-in-body contract. 
+ ResolvedStepsRecordNotFoundError, # PseudoAxis pre-Conductor expansion ([[project-pseudoaxis-design]] # v3): the partition-rule math kernel returned a non-finite result, # rejected an unsupported AggregatorKind / PartitionKind variant, diff --git a/apps/api/src/cora/operation/tools.py b/apps/api/src/cora/operation/tools.py index db4cf52261..04c4e76560 100644 --- a/apps/api/src/cora/operation/tools.py +++ b/apps/api/src/cora/operation/tools.py @@ -17,17 +17,21 @@ from cora.operation.features.conduct_procedure import tool as conduct_procedure_tool from cora.operation.features.end_iteration import tool as end_iteration_tool from cora.operation.features.get_procedure import tool as get_procedure_tool +from cora.operation.features.hold_procedure import tool as hold_procedure_tool from cora.operation.features.list_procedure_iterations import ( tool as list_procedure_iterations_tool, ) from cora.operation.features.list_procedures import tool as list_procedures_tool +from cora.operation.features.reconduct_procedure import tool as reconduct_procedure_tool from cora.operation.features.register_procedure import tool as register_procedure_tool from cora.operation.features.register_procedure_from_recipe import ( tool as register_procedure_from_recipe_tool, ) +from cora.operation.features.resume_procedure import tool as resume_procedure_tool from cora.operation.features.start_iteration import tool as start_iteration_tool from cora.operation.features.start_procedure import tool as start_procedure_tool from cora.operation.features.truncate_procedure import tool as truncate_procedure_tool +from cora.operation.features.try_conduct_procedure import tool as try_conduct_procedure_tool from cora.operation.wire import OperationHandlers @@ -61,6 +65,18 @@ def register_operation_tools( mcp, get_handler=lambda: get_handlers().truncate_procedure, ) + hold_procedure_tool.register( + mcp, + get_handler=lambda: get_handlers().hold_procedure, + ) + resume_procedure_tool.register( + mcp, + 
get_handler=lambda: get_handlers().resume_procedure, + ) + reconduct_procedure_tool.register( + mcp, + get_handler=lambda: get_handlers().reconduct_procedure, + ) start_iteration_tool.register( mcp, get_handler=lambda: get_handlers().start_iteration, @@ -89,3 +105,7 @@ def register_operation_tools( mcp, get_handler=lambda: get_handlers().conduct_procedure, ) + try_conduct_procedure_tool.register( + mcp, + get_handler=lambda: get_handlers().try_conduct_procedure, + ) diff --git a/apps/api/src/cora/operation/wire.py b/apps/api/src/cora/operation/wire.py index 6d7b8b86f4..ed2c664c58 100644 --- a/apps/api/src/cora/operation/wire.py +++ b/apps/api/src/cora/operation/wire.py @@ -74,13 +74,17 @@ conduct_procedure, end_iteration, get_procedure, + hold_procedure, list_procedure_iterations, list_procedures, + reconduct_procedure, register_procedure, register_procedure_from_recipe, + resume_procedure, start_iteration, start_procedure, truncate_procedure, + try_conduct_procedure, ) from cora.operation.ports.control_port import ControlPort @@ -103,6 +107,9 @@ class OperationHandlers: complete_procedure: complete_procedure.Handler abort_procedure: abort_procedure.Handler truncate_procedure: truncate_procedure.Handler + hold_procedure: hold_procedure.Handler + resume_procedure: resume_procedure.Handler + reconduct_procedure: reconduct_procedure.Handler start_iteration: start_iteration.Handler end_iteration: end_iteration.Handler append_activities: append_activities.Handler @@ -110,6 +117,7 @@ class OperationHandlers: list_procedures: list_procedures.Handler list_procedure_iterations: list_procedure_iterations.Handler conduct_procedure: conduct_procedure.Handler + try_conduct_procedure: try_conduct_procedure.Handler control_port: ControlPort """The ControlPort the Conductor talks to. 
Surfaced on the bundle so the FastAPI lifespan's teardown can call `aclose()` on it @@ -177,6 +185,22 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> command_name="AbortProcedure", bc=_BC, ) + # Hoisted to a local so the bundle field AND the Conductor share ONE + # post-tracing resume handler instance (mirrors the start/complete/abort + # hoist; Conductor.reconduct composes this resume handler). + resume_handler = with_tracing( + resume_procedure.bind(deps), + command_name="ResumeProcedure", + bc=_BC, + ) + # Hoisted likewise so the bundle field AND the Conductor share ONE + # post-tracing hold handler instance; Conductor.try_conduct composes it + # to pause-to-Held on a recoverable conduct failure. + hold_handler = with_tracing( + hold_procedure.bind(deps), + command_name="HoldProcedure", + bc=_BC, + ) append_step_handler = with_tracing( append_activities.bind(deps, step_store=step_store), command_name="AppendProcedureActivities", @@ -199,6 +223,24 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> start_procedure=start_handler, complete_procedure=complete_handler, abort_procedure=abort_handler, + resume_procedure=resume_handler, + hold_procedure=hold_handler, + ) + # Resume-and-replay orchestration: a thin slice handler over + # Conductor.reconduct (which composes resume + execute_from + + # complete/abort). Reuses the same conductor; no sibling-slice imports. + reconduct_handler = with_tracing( + reconduct_procedure.bind(deps, conductor=conductor), + command_name="ReconductProcedure", + bc=_BC, + ) + # Pause-capable conduct: a thin slice handler over Conductor.try_conduct + # (which composes start + execute + complete/hold/abort). Reuses the same + # conductor + recipe expander as conduct; no sibling-slice imports. 
+ try_conduct_handler = with_tracing( + try_conduct_procedure.bind(deps, conductor=conductor, expansion_port=recipe_expander), + command_name="TryConductProcedure", + bc=_BC, ) return OperationHandlers( register_procedure=with_tracing( @@ -235,6 +277,9 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> command_name="TruncateProcedure", bc=_BC, ), + hold_procedure=hold_handler, + resume_procedure=resume_handler, + reconduct_procedure=reconduct_handler, start_iteration=with_tracing( start_iteration.bind(deps), command_name="StartProcedureIteration", @@ -269,5 +314,6 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> command_name="ConductProcedure", bc=_BC, ), + try_conduct_procedure=try_conduct_handler, control_port=control_port, ) diff --git a/apps/api/tests/architecture/test_procedure_evolver_carry_forward.py b/apps/api/tests/architecture/test_procedure_evolver_carry_forward.py new file mode 100644 index 0000000000..2dde1ec3f6 --- /dev/null +++ b/apps/api/tests/architecture/test_procedure_evolver_carry_forward.py @@ -0,0 +1,161 @@ +"""Architecture fitness: every non-genesis, Procedure-constructing arm of +the Procedure evolver MUST carry all additive state fields through from +prior state. + +The Procedure aggregate accreted a wide additive-field set (the +iteration denorms, the recipe/capability binding, the activity logbook +id, the actuation-kind provenance carrier). Constructing +`Procedure(id=..., name=..., status=...)` on a new transition arm +without explicitly threading those fields silently WIPES them to their +defaults (empty frozenset / None / 0) on the next replay. The Tier-1 +`ProcedureHeld` / `ProcedureResumed` arms are the latest pair that must +carry the iteration denorms verbatim; this AST check pins the whole +matrix so the bug class cannot recur when a new arm lands. + +Precedent: `test_asset_evolver_lifecycle_dates_carry_forward.py` (same +structural AST shape, narrower field set). 
Behavior-side per-arm +preservation coverage lives in `tests/unit/operation/test_procedure_evolver.py`; +this fitness exists because behavior tests only catch arms someone +remembered to parametrize. + +## What is checked + +For every `case (...):` arm in `evolve` that builds a +`return Procedure(...)`: + + - the genesis arm (`ProcedureRegistered`) is exempt: it writes / + defaults every field at initial-state construction. + - provenance-only arms that return `require_state(...)` (no + `Procedure(...)` constructor) are exempt: passthrough preserves + every field by definition. + - every other arm MUST pass `<field>=prior.<field>` for each + carry-forward field, UNLESS the arm is a declared per-field writer + (it legitimately sets that field from the event or a computation). +""" + +from __future__ import annotations + +import ast +from pathlib import Path + +import pytest + +_REPO_ROOT = Path(__file__).resolve().parents[4] +_EVOLVER_PATH = ( + _REPO_ROOT + / "apps" + / "api" + / "src" + / "cora" + / "operation" + / "aggregates" + / "procedure" + / "evolver.py" +) + +_GENESIS_ARM = "ProcedureRegistered" + +# Carry-forward fields and the arms that legitimately WRITE each (so are +# exempt from the `<field>=prior.<field>` requirement for that field). The +# genesis arm writes every field and is exempt globally below.
+_WRITER_ARMS_PER_FIELD: dict[str, frozenset[str]] = { + "kind": frozenset(), + "target_asset_ids": frozenset(), + "parent_run_id": frozenset(), + "activity_logbook_id": frozenset({"ProcedureActivitiesLogbookOpened"}), + "capability_id": frozenset(), + "recipe_id": frozenset(), + "current_iteration_index": frozenset({"ProcedureIterationStarted", "ProcedureIterationEnded"}), + "iteration_count": frozenset({"ProcedureIterationStarted"}), + "consecutive_unconverged_iterations": frozenset({"ProcedureIterationEnded"}), + "max_consecutive_unconverged_iterations": frozenset(), + # Terminal arms snapshot the Conductor's observed kind from the event; + # ProcedureHeld MERGES the conduct's observed-so-far kind into state (via + # merge_actuation_kinds) so the pre-hold provenance survives the + # hold->resume boundary. + "actuation_kind": frozenset({"ProcedureCompleted", "ProcedureAborted", "ProcedureHeld"}), +} + + +def _arm_event_type_name(case_node: ast.match_case) -> str | None: + pattern = case_node.pattern + if isinstance(pattern, ast.MatchClass) and isinstance(pattern.cls, ast.Name): + return pattern.cls.id + return None + + +def _return_procedure_kwargs(case_node: ast.match_case) -> dict[str, ast.expr] | None: + """Kwargs from the `return Procedure(...)` call in this arm, or None + when the arm constructs no Procedure (it returns require_state / + state directly -- a passthrough that preserves every field).""" + for stmt in ast.walk(case_node): + if ( + isinstance(stmt, ast.Return) + and isinstance(stmt.value, ast.Call) + and isinstance(stmt.value.func, ast.Name) + and stmt.value.func.id == "Procedure" + ): + return {kw.arg: kw.value for kw in stmt.value.keywords if kw.arg is not None} + return None + + +def _is_prior_attribute_access(node: ast.expr, field: str) -> bool: + return ( + isinstance(node, ast.Attribute) + and node.attr == field + and isinstance(node.value, ast.Name) + and node.value.id == "prior" + ) + + +def _find_evolve_match_cases() -> 
list[ast.match_case]: + tree = ast.parse(_EVOLVER_PATH.read_text(encoding="utf-8")) + evolve_func = next( + (n for n in tree.body if isinstance(n, ast.FunctionDef) and n.name == "evolve"), + None, + ) + assert evolve_func is not None, "Could not locate `evolve` in evolver.py" + match_stmt = next((n for n in evolve_func.body if isinstance(n, ast.Match)), None) + assert match_stmt is not None, "Could not locate `match event:` in `evolve`" + return list(match_stmt.cases) + + +@pytest.mark.architecture +def test_procedure_evolver_non_writer_arms_carry_all_additive_fields() -> None: + """Every non-genesis Procedure-constructing arm threads each additive + field as `<field>=prior.<field>` unless it is a declared writer of + that field.""" + violations: list[str] = [] + for case in _find_evolve_match_cases(): + event_name = _arm_event_type_name(case) + if event_name is None: + continue # wildcard `case _:` (assert_never guard) + if event_name == _GENESIS_ARM: + continue # genesis writes / defaults every field + kwargs = _return_procedure_kwargs(case) + if kwargs is None: + continue # passthrough arm (returns require_state/state); preserves all + for field, writer_arms in _WRITER_ARMS_PER_FIELD.items(): + if event_name in writer_arms: + continue + value = kwargs.get(field) + if value is None: + violations.append( + f" - {event_name}: missing `{field}=prior.{field}` kwarg in Procedure(...)" + ) + continue + if not _is_prior_attribute_access(value, field): + violations.append( + f" - {event_name}: `{field}=...` is not " + f"`prior.{field}` (got `{ast.unparse(value)}`)" + ) + assert not violations, ( + "Procedure evolver arms drop an additive-state field on replay.\n" + "Every non-genesis arm that constructs `Procedure(...)` must thread\n" + "each additive field as `<field>=prior.<field>` unless it legitimately\n" + "writes that field (see `_WRITER_ARMS_PER_FIELD`). Otherwise the field\n" + "silently wipes to its default on next replay (the dropped-iteration-\n" + "denorm bug class). 
Add the carry-forward kwarg, or register a new\n" + "writer arm with rationale.\n\n" + "Violations:\n" + "\n".join(violations) + ) diff --git a/apps/api/tests/architecture/test_slice_contract.py b/apps/api/tests/architecture/test_slice_contract.py index 3ef83df5fb..5d45ee3178 100644 --- a/apps/api/tests/architecture/test_slice_contract.py +++ b/apps/api/tests/architecture/test_slice_contract.py @@ -56,6 +56,14 @@ # complete_procedure / abort_procedure handlers; no direct event # emission. See [[project_edge_runtime_design]]. "cora.operation.features.conduct_procedure", + # Resume-and-replay entry: delegates resume_procedure + + # Conductor.execute_from + complete/abort; no direct event emission. + # See [[project_resumable_conduct_design]]. + "cora.operation.features.reconduct_procedure", + # Pause-capable conduct entry: delegates Conductor.try_conduct + # (start + execute + complete/hold/abort); no direct event emission. + # See [[project_resumable_conduct_design]]. + "cora.operation.features.try_conduct_procedure", # Bulk-mint sweep: enumerates Assets missing a persistent id and # delegates each to the assign_asset_persistent_id handler; no direct # event emission. See [[project_asset_persistent_id_design]]. diff --git a/apps/api/tests/contract/test_hold_procedure_endpoint.py b/apps/api/tests/contract/test_hold_procedure_endpoint.py new file mode 100644 index 0000000000..d4b69c3b40 --- /dev/null +++ b/apps/api/tests/contract/test_hold_procedure_endpoint.py @@ -0,0 +1,100 @@ +"""Contract tests for `POST /procedures/{procedure_id}/hold`. + +Action endpoint with `reason` body, 204 on success. Covers happy path +(after register + start) plus error surfaces: 400 whitespace-only +reason, 404, 409 from-Defined, 409 re-hold, 422 missing/too-long reason +or malformed id. 
+""" + +from typing import Any +from uuid import UUID, uuid4 + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app + + +def _register_and_start(client: TestClient) -> UUID: + body: dict[str, Any] = {"name": "Vessel-A bakeout", "kind": "bakeout"} + pid = UUID(client.post("/procedures", json=body).json()["procedure_id"]) + started = client.post(f"/procedures/{pid}/start") + assert started.status_code == 204 + return pid + + +@pytest.mark.contract +def test_post_hold_returns_204_for_running_procedure() -> None: + with TestClient(create_app()) as client: + pid = _register_and_start(client) + response = client.post(f"/procedures/{pid}/hold", json={"reason": "beam dropped"}) + assert response.status_code == 204 + + +@pytest.mark.contract +def test_post_hold_marks_status_held_visible_via_get() -> None: + with TestClient(create_app()) as client: + pid = _register_and_start(client) + client.post(f"/procedures/{pid}/hold", json={"reason": "beam dropped"}) + response = client.get(f"/procedures/{pid}") + assert response.json()["status"] == "Held" + + +@pytest.mark.contract +def test_post_hold_returns_404_for_unknown_id() -> None: + with TestClient(create_app()) as client: + response = client.post(f"/procedures/{uuid4()}/hold", json={"reason": "x"}) + assert response.status_code == 404 + + +@pytest.mark.contract +def test_post_hold_returns_409_for_defined_procedure() -> None: + """Hold requires Running; from Defined raises CannotHold.""" + with TestClient(create_app()) as client: + body: dict[str, Any] = {"name": "X", "kind": "bakeout"} + pid = UUID(client.post("/procedures", json=body).json()["procedure_id"]) + response = client.post(f"/procedures/{pid}/hold", json={"reason": "test"}) + assert response.status_code == 409 + + +@pytest.mark.contract +def test_post_hold_returns_409_when_re_holding() -> None: + with TestClient(create_app()) as client: + pid = _register_and_start(client) + first = client.post(f"/procedures/{pid}/hold", 
json={"reason": "first"}) + second = client.post(f"/procedures/{pid}/hold", json={"reason": "second"}) + assert first.status_code == 204 + assert second.status_code == 409 + + +@pytest.mark.contract +def test_post_hold_returns_400_for_whitespace_only_reason() -> None: + """Whitespace-only slips past Pydantic min_length=1; the VO rejects -> 400.""" + with TestClient(create_app()) as client: + pid = _register_and_start(client) + response = client.post(f"/procedures/{pid}/hold", json={"reason": " "}) + assert response.status_code == 400 + assert "detail" in response.json() + + +@pytest.mark.contract +def test_post_hold_returns_422_for_missing_reason() -> None: + with TestClient(create_app()) as client: + pid = _register_and_start(client) + response = client.post(f"/procedures/{pid}/hold", json={}) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_hold_returns_422_for_too_long_reason() -> None: + with TestClient(create_app()) as client: + pid = _register_and_start(client) + response = client.post(f"/procedures/{pid}/hold", json={"reason": "x" * 501}) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_hold_returns_422_for_malformed_id() -> None: + with TestClient(create_app()) as client: + response = client.post("/procedures/not-a-uuid/hold", json={"reason": "x"}) + assert response.status_code == 422 diff --git a/apps/api/tests/contract/test_hold_procedure_mcp_tool.py b/apps/api/tests/contract/test_hold_procedure_mcp_tool.py new file mode 100644 index 0000000000..4bbcb99152 --- /dev/null +++ b/apps/api/tests/contract/test_hold_procedure_mcp_tool.py @@ -0,0 +1,73 @@ +"""Contract tests for the `hold_procedure` MCP tool.""" + +from uuid import UUID + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app +from tests.contract._mcp_helpers import open_session, parse_sse_data + + +def _register_and_start_via_mcp(client: TestClient, headers: dict[str, str]) -> UUID: + reg = 
client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 1, + "method": "tools/call", + "params": { + "name": "register_procedure", + "arguments": {"name": "Vessel-A bakeout", "kind": "bakeout"}, + }, + }, + headers=headers, + ) + pid = UUID(parse_sse_data(reg.text)["result"]["structuredContent"]["procedure_id"]) + client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 2, + "method": "tools/call", + "params": {"name": "start_procedure", "arguments": {"procedure_id": str(pid)}}, + }, + headers=headers, + ) + return pid + + +@pytest.mark.contract +def test_mcp_lists_hold_procedure_tool() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + response = client.post( + "/mcp", + json={"jsonrpc": "2.0", "id": 99, "method": "tools/list"}, + headers=headers, + ) + body = parse_sse_data(response.text) + tool_names = [t["name"] for t in body["result"]["tools"]] + assert "hold_procedure" in tool_names + + +@pytest.mark.contract +def test_mcp_hold_procedure_tool_succeeds_for_running() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + pid = _register_and_start_via_mcp(client, headers) + response = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 3, + "method": "tools/call", + "params": { + "name": "hold_procedure", + "arguments": {"procedure_id": str(pid), "reason": "beam dropped"}, + }, + }, + headers=headers, + ) + body = parse_sse_data(response.text) + assert body["result"]["isError"] is False diff --git a/apps/api/tests/contract/test_list_procedures_endpoint.py b/apps/api/tests/contract/test_list_procedures_endpoint.py index 42dde43c0c..ac26c01504 100644 --- a/apps/api/tests/contract/test_list_procedures_endpoint.py +++ b/apps/api/tests/contract/test_list_procedures_endpoint.py @@ -35,12 +35,10 @@ def test_get_procedures_returns_empty_page_with_no_data(client: TestClient) -> N @pytest.mark.contract @pytest.mark.parametrize( "status_value", - ["Defined", "Running", "Completed", 
"Aborted", "Truncated"], + ["Defined", "Running", "Held", "Completed", "Aborted", "Truncated"], ) -def test_get_procedures_accepts_each_status_locked_day_one( - client: TestClient, status_value: str -) -> None: - """All 5 statuses accepted; the Literal is locked at the full FSM.""" +def test_get_procedures_accepts_each_status(client: TestClient, status_value: str) -> None: + """All 6 ProcedureStatus values are accepted by the status filter.""" with client: response = client.get(f"/procedures?status={status_value}") assert response.status_code == 200 diff --git a/apps/api/tests/contract/test_reconduct_procedure_endpoint.py b/apps/api/tests/contract/test_reconduct_procedure_endpoint.py new file mode 100644 index 0000000000..6c218d4005 --- /dev/null +++ b/apps/api/tests/contract/test_reconduct_procedure_endpoint.py @@ -0,0 +1,192 @@ +"""Contract tests for `POST /procedures/{procedure_id}/reconduct`. + +Resume-and-replay: resumes a Held Procedure and replays its pinned +step-list tail. 200 with replay outcomes in body; 404/409/422/500 for +protocol / guard / corruption faults. + +The 200 happy paths are now API-reachable via `try_conduct_procedure`: it +conducts a Procedure that pauses to `Held` on a recoverable step failure, +leaving the pinned `ResolvedStepsRecorded` for `reconduct` to replay. The +test wire-up uses `InMemoryControlPort` with no pre-connected addresses, so a +setpoint fails (recoverable -> Held); reconduct then replays the pinned tail +from the operator's boundary (an empty tail completes; a tail starting with an +acquisition halts-for-operator). 
+""" + +from typing import Any +from uuid import UUID, uuid4 + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app + + +def _register(client: TestClient) -> UUID: + body: dict[str, Any] = {"name": "Vessel-A bakeout", "kind": "bakeout"} + return UUID(client.post("/procedures", json=body).json()["procedure_id"]) + + +def _try_conduct_to_held(client: TestClient, steps: list[dict[str, Any]]) -> UUID: + """Register + try-conduct a Procedure to Held (the recoverable setpoint at + index 0 fails on the unconnected port), leaving a pinned resolved-step list + `reconduct` can replay. Returns the Held Procedure's id.""" + pid = _register(client) + held = client.post(f"/procedures/{pid}/try-conduct", json={"steps": steps}) + assert held.status_code == 200 + assert held.json()["held"] is True + return pid + + +@pytest.mark.contract +def test_post_reconduct_completes_held_procedure_with_empty_tail() -> None: + """Reconduct a Held Procedure past the end of its resolved steps (empty + tail): nothing to replay, so it auto-completes (200, succeeded).""" + with TestClient(create_app()) as client: + pid = _try_conduct_to_held( + client, [{"kind": "setpoint", "address": "2bma:x", "value": 1.0}] + ) + # boundary == len(resolved steps): the replayed tail is empty. 
+ response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 1} + ) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is True + assert body["acquisition_halt"] is False + + +@pytest.mark.contract +def test_post_reconduct_halts_on_acquisition_in_replayed_tail() -> None: + """Reconduct replaying a tail that starts with an acquisition halts for the + operator (200, acquisition_halt=True), leaving the Procedure Running.""" + with TestClient(create_app()) as client: + pid = _try_conduct_to_held( + client, + [ + {"kind": "setpoint", "address": "2bma:x", "value": 1.0}, + {"kind": "action", "name": "collect"}, + ], + ) + # boundary == 1 skips the prefix setpoint; the tail starts with the action. + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 1} + ) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is False + assert body["acquisition_halt"] is True + + +@pytest.mark.contract +def test_post_reconduct_returns_404_for_unknown_id() -> None: + with TestClient(create_app()) as client: + response = client.post( + f"/procedures/{uuid4()}/reconduct", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 404 + + +@pytest.mark.contract +def test_post_reconduct_returns_409_for_defined_procedure() -> None: + """A Defined (non-Held) Procedure cannot be reconducted.""" + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 409 + + +@pytest.mark.contract +def test_post_reconduct_returns_409_for_completed_procedure_with_resolved_steps() -> None: + """A conduct pins resolved steps then completes; reconducting the (Completed) + Procedure is refused by the resume status guard (not Held).""" + with TestClient(create_app()) as client: + pid = _register(client) + # Conduct 
an EMPTY step list: pins ResolvedStepsRecorded, then + # start -> (no steps) -> complete, leaving the Procedure Completed + # WITH a pinned (empty) resolved-step list. + conducted = client.post(f"/procedures/{pid}/conduct", json={"steps": []}) + assert conducted.status_code == 200 + assert conducted.json()["succeeded"] is True + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 409 + + +@pytest.mark.contract +def test_post_reconduct_returns_500_for_held_procedure_without_resolved_steps() -> None: + """A Procedure started directly (no conduct) then held is Held WITHOUT a + pinned resolved-step list; reconduct cannot locate it (corruption-shaped 500).""" + with TestClient(create_app()) as client: + pid = _register(client) + assert client.post(f"/procedures/{pid}/start").status_code == 204 + assert client.post(f"/procedures/{pid}/hold", json={"reason": "pause"}).status_code == 204 + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 500 + + +@pytest.mark.contract +def test_post_reconduct_returns_422_for_negative_boundary() -> None: + """Pydantic ge=0 rejects a negative boundary at the wire before the handler.""" + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": -1} + ) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_reconduct_returns_422_for_missing_boundary() -> None: + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post(f"/procedures/{pid}/reconduct", json={}) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_reconduct_returns_422_for_malformed_id() -> None: + with TestClient(create_app()) as client: + response = client.post( + "/procedures/not-a-uuid/reconduct", json={"re_establishment_boundary": 0} + 
) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_reconduct_returns_400_for_boundary_past_step_count() -> None: + """A boundary strictly past the pinned step count is rejected (it would + replay an empty tail and silently auto-complete).""" + with TestClient(create_app()) as client: + pid = _try_conduct_to_held( + client, [{"kind": "setpoint", "address": "2bma:x", "value": 1.0}] + ) + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 2} + ) + assert response.status_code == 400 + + +@pytest.mark.contract +def test_post_reconduct_aborts_on_a_genuine_step_failure() -> None: + """Replaying a tail whose setpoint still fails (unconnected address) aborts: + 200 with succeeded=False + acquisition_halt=False (a genuine step failure, + not an acquisition halt).""" + with TestClient(create_app()) as client: + pid = _try_conduct_to_held( + client, [{"kind": "setpoint", "address": "2bma:x", "value": 1.0}] + ) + # boundary 0 re-drives the still-unconnected setpoint -> it fails again. 
+ response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is False + assert body["acquisition_halt"] is False + assert body["failure"]["source_kind"] == "setpoint" diff --git a/apps/api/tests/contract/test_reconduct_procedure_mcp_tool.py b/apps/api/tests/contract/test_reconduct_procedure_mcp_tool.py new file mode 100644 index 0000000000..b5df533fee --- /dev/null +++ b/apps/api/tests/contract/test_reconduct_procedure_mcp_tool.py @@ -0,0 +1,64 @@ +"""Contract tests for the `reconduct_procedure` MCP tool.""" + +from uuid import UUID + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app +from tests.contract._mcp_helpers import open_session, parse_sse_data + + +def _register_via_mcp(client: TestClient, headers: dict[str, str]) -> UUID: + reg = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 1, + "method": "tools/call", + "params": { + "name": "register_procedure", + "arguments": {"name": "Vessel-A bakeout", "kind": "bakeout"}, + }, + }, + headers=headers, + ) + return UUID(parse_sse_data(reg.text)["result"]["structuredContent"]["procedure_id"]) + + +@pytest.mark.contract +def test_mcp_lists_reconduct_procedure_tool() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + response = client.post( + "/mcp", + json={"jsonrpc": "2.0", "id": 99, "method": "tools/list"}, + headers=headers, + ) + body = parse_sse_data(response.text) + tool_names = [t["name"] for t in body["result"]["tools"]] + assert "reconduct_procedure" in tool_names + + +@pytest.mark.contract +def test_mcp_reconduct_procedure_tool_errors_for_non_held() -> None: + """Reconducting a Defined (non-Held) Procedure surfaces the resume guard + as an MCP error (the tool wiring is exercised end-to-end).""" + with TestClient(create_app()) as client: + headers = open_session(client) + pid = 
_register_via_mcp(client, headers) + response = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 3, + "method": "tools/call", + "params": { + "name": "reconduct_procedure", + "arguments": {"procedure_id": str(pid), "re_establishment_boundary": 0}, + }, + }, + headers=headers, + ) + body = parse_sse_data(response.text) + assert body["result"]["isError"] is True diff --git a/apps/api/tests/contract/test_resume_procedure_endpoint.py b/apps/api/tests/contract/test_resume_procedure_endpoint.py new file mode 100644 index 0000000000..6cf3fa8d7b --- /dev/null +++ b/apps/api/tests/contract/test_resume_procedure_endpoint.py @@ -0,0 +1,88 @@ +"""Contract tests for `POST /procedures/{procedure_id}/resume`. + +Action endpoint with `re_establishment_boundary` body, 204 on success. +Covers happy path (register + start + hold) plus error surfaces: 404, +409 from-Running (not Held), 422 missing / negative boundary, 422 +malformed id. +""" + +from typing import Any +from uuid import UUID, uuid4 + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app + + +def _register_start_hold(client: TestClient) -> UUID: + body: dict[str, Any] = {"name": "Vessel-A bakeout", "kind": "bakeout"} + pid = UUID(client.post("/procedures", json=body).json()["procedure_id"]) + assert client.post(f"/procedures/{pid}/start").status_code == 204 + assert ( + client.post(f"/procedures/{pid}/hold", json={"reason": "beam dropped"}).status_code == 204 + ) + return pid + + +@pytest.mark.contract +def test_post_resume_returns_204_for_held_procedure() -> None: + with TestClient(create_app()) as client: + pid = _register_start_hold(client) + response = client.post(f"/procedures/{pid}/resume", json={"re_establishment_boundary": 0}) + assert response.status_code == 204 + + +@pytest.mark.contract +def test_post_resume_marks_status_running_visible_via_get() -> None: + with TestClient(create_app()) as client: + pid = _register_start_hold(client) + 
client.post(f"/procedures/{pid}/resume", json={"re_establishment_boundary": 1}) + response = client.get(f"/procedures/{pid}") + assert response.json()["status"] == "Running" + + +@pytest.mark.contract +def test_post_resume_returns_404_for_unknown_id() -> None: + with TestClient(create_app()) as client: + response = client.post( + f"/procedures/{uuid4()}/resume", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 404 + + +@pytest.mark.contract +def test_post_resume_returns_409_for_running_procedure() -> None: + """Resume requires Held; from Running (never held) raises CannotResume.""" + with TestClient(create_app()) as client: + body: dict[str, Any] = {"name": "X", "kind": "bakeout"} + pid = UUID(client.post("/procedures", json=body).json()["procedure_id"]) + client.post(f"/procedures/{pid}/start") + response = client.post(f"/procedures/{pid}/resume", json={"re_establishment_boundary": 0}) + assert response.status_code == 409 + + +@pytest.mark.contract +def test_post_resume_returns_422_for_missing_boundary() -> None: + with TestClient(create_app()) as client: + pid = _register_start_hold(client) + response = client.post(f"/procedures/{pid}/resume", json={}) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_resume_returns_422_for_negative_boundary() -> None: + """Pydantic ge=0 rejects a negative boundary at the wire before the decider.""" + with TestClient(create_app()) as client: + pid = _register_start_hold(client) + response = client.post(f"/procedures/{pid}/resume", json={"re_establishment_boundary": -1}) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_resume_returns_422_for_malformed_id() -> None: + with TestClient(create_app()) as client: + response = client.post( + "/procedures/not-a-uuid/resume", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 422 diff --git a/apps/api/tests/contract/test_resume_procedure_mcp_tool.py 
b/apps/api/tests/contract/test_resume_procedure_mcp_tool.py new file mode 100644 index 0000000000..da4a0ff0a6 --- /dev/null +++ b/apps/api/tests/contract/test_resume_procedure_mcp_tool.py @@ -0,0 +1,86 @@ +"""Contract tests for the `resume_procedure` MCP tool.""" + +from uuid import UUID + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app +from tests.contract._mcp_helpers import open_session, parse_sse_data + + +def _register_start_hold_via_mcp(client: TestClient, headers: dict[str, str]) -> UUID: + reg = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 1, + "method": "tools/call", + "params": { + "name": "register_procedure", + "arguments": {"name": "Vessel-A bakeout", "kind": "bakeout"}, + }, + }, + headers=headers, + ) + pid = UUID(parse_sse_data(reg.text)["result"]["structuredContent"]["procedure_id"]) + client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 2, + "method": "tools/call", + "params": {"name": "start_procedure", "arguments": {"procedure_id": str(pid)}}, + }, + headers=headers, + ) + client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 3, + "method": "tools/call", + "params": { + "name": "hold_procedure", + "arguments": {"procedure_id": str(pid), "reason": "beam dropped"}, + }, + }, + headers=headers, + ) + return pid + + +@pytest.mark.contract +def test_mcp_lists_resume_procedure_tool() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + response = client.post( + "/mcp", + json={"jsonrpc": "2.0", "id": 99, "method": "tools/list"}, + headers=headers, + ) + body = parse_sse_data(response.text) + tool_names = [t["name"] for t in body["result"]["tools"]] + assert "resume_procedure" in tool_names + + +@pytest.mark.contract +def test_mcp_resume_procedure_tool_succeeds_for_held() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + pid = _register_start_hold_via_mcp(client, headers) + response = client.post( + 
"/mcp", + json={ + "jsonrpc": "2.0", + "id": 4, + "method": "tools/call", + "params": { + "name": "resume_procedure", + "arguments": {"procedure_id": str(pid), "re_establishment_boundary": 0}, + }, + }, + headers=headers, + ) + body = parse_sse_data(response.text) + assert body["result"]["isError"] is False diff --git a/apps/api/tests/contract/test_try_conduct_procedure_endpoint.py b/apps/api/tests/contract/test_try_conduct_procedure_endpoint.py new file mode 100644 index 0000000000..ee403945ff --- /dev/null +++ b/apps/api/tests/contract/test_try_conduct_procedure_endpoint.py @@ -0,0 +1,86 @@ +"""Contract tests for `POST /procedures/{procedure_id}/try-conduct`. + +Pause-capable conduct: like conduct, but a RECOVERABLE step failure (setpoint +/ check) PAUSES the Procedure to Held (resumable via reconduct) instead of +aborting. Always 200 with the outcome in the body; `held` flags the pause. +404 for an unknown procedure, 422 for a malformed body. + +The test wire-up uses `InMemoryControlPort` with no pre-connected addresses, +so a setpoint to any address fails with ControlNotConnectedError: that is the +recoverable failure this slice pauses on. 
+""" + +from typing import Any +from uuid import UUID, uuid4 + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app + + +def _register(client: TestClient) -> UUID: + body: dict[str, Any] = {"name": "Vessel-A bakeout", "kind": "bakeout"} + return UUID(client.post("/procedures", json=body).json()["procedure_id"]) + + +@pytest.mark.contract +def test_post_try_conduct_empty_steps_completes() -> None: + """An empty step list starts + completes the Procedure (no failure to pause on).""" + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post(f"/procedures/{pid}/try-conduct", json={"steps": []}) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is True + assert body["held"] is False + + +@pytest.mark.contract +def test_post_try_conduct_recoverable_setpoint_pauses_to_held() -> None: + """A setpoint to an unconnected address is recoverable: pause to Held, not abort.""" + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post( + f"/procedures/{pid}/try-conduct", + json={"steps": [{"kind": "setpoint", "address": "2bma:x", "value": 1.0}]}, + ) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is False + assert body["held"] is True + assert body["failure"]["source_kind"] == "setpoint" + + +@pytest.mark.contract +def test_post_try_conduct_action_failure_aborts_not_held() -> None: + """An unregistered action is an acquisition failure: abort (not held).""" + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post( + f"/procedures/{pid}/try-conduct", + json={"steps": [{"kind": "action", "name": "unregistered"}]}, + ) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is False + assert body["held"] is False + assert body["failure"]["source_kind"] == "action" + + +@pytest.mark.contract +def 
test_post_try_conduct_returns_404_for_unknown_id() -> None: + with TestClient(create_app()) as client: + response = client.post(f"/procedures/{uuid4()}/try-conduct", json={"steps": []}) + assert response.status_code == 404 + + +@pytest.mark.contract +def test_post_try_conduct_returns_422_for_unknown_step_kind() -> None: + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post( + f"/procedures/{pid}/try-conduct", + json={"steps": [{"kind": "teleport", "address": "x", "value": 1}]}, + ) + assert response.status_code == 422 diff --git a/apps/api/tests/contract/test_try_conduct_procedure_mcp_tool.py b/apps/api/tests/contract/test_try_conduct_procedure_mcp_tool.py new file mode 100644 index 0000000000..6fb1f452ff --- /dev/null +++ b/apps/api/tests/contract/test_try_conduct_procedure_mcp_tool.py @@ -0,0 +1,72 @@ +"""Contract tests for the `try_conduct_procedure` MCP tool.""" + +from uuid import UUID + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app +from tests.contract._mcp_helpers import open_session, parse_sse_data + + +def _register_via_mcp(client: TestClient, headers: dict[str, str]) -> UUID: + reg = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 1, + "method": "tools/call", + "params": { + "name": "register_procedure", + "arguments": {"name": "Vessel-A bakeout", "kind": "bakeout"}, + }, + }, + headers=headers, + ) + return UUID(parse_sse_data(reg.text)["result"]["structuredContent"]["procedure_id"]) + + +@pytest.mark.contract +def test_mcp_lists_try_conduct_procedure_tool() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + response = client.post( + "/mcp", + json={"jsonrpc": "2.0", "id": 99, "method": "tools/list"}, + headers=headers, + ) + body = parse_sse_data(response.text) + tool_names = [t["name"] for t in body["result"]["tools"]] + assert "try_conduct_procedure" in tool_names + + +@pytest.mark.contract +def 
test_mcp_try_conduct_procedure_pauses_to_held() -> None: + """A recoverable setpoint failure pauses the Procedure to Held via the tool; + the structured output carries held=True (the tool wiring is exercised + end-to-end).""" + with TestClient(create_app()) as client: + headers = open_session(client) + pid = _register_via_mcp(client, headers) + response = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 3, + "method": "tools/call", + "params": { + "name": "try_conduct_procedure", + "arguments": { + "procedure_id": str(pid), + "body": { + "steps": [{"kind": "setpoint", "address": "2bma:x", "value": 1.0}] + }, + }, + }, + }, + headers=headers, + ) + body = parse_sse_data(response.text) + structured = body["result"]["structuredContent"] + assert structured["held"] is True + assert structured["succeeded"] is False diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py index 487f17e92d..313cfde7bc 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py @@ -85,7 +85,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, datetime from typing import Any from uuid import UUID, uuid4 @@ -929,7 +928,7 @@ def t(seconds: int) -> datetime: # The final setpoint records the calibrated rotation-axis pixel position -- # the artifact a downstream science scan will read. - final_setpoint_payload = json.loads(step_rows[-1]["payload"]) + final_setpoint_payload = step_rows[-1]["payload"] assert final_setpoint_payload["channel"] == "RotationCenter" assert final_setpoint_payload["target_value"] == 1024.5 assert final_setpoint_payload["units"] == "px" @@ -938,7 +937,7 @@ def t(seconds: int) -> datetime: # judgment + supporting evidence. 
Iteration is no longer encoded here # (no `evidence['iteration']`); it is first-class, asserted via the # per-iteration read model below. - convergence_check_payload = json.loads(step_rows[11]["payload"]) + convergence_check_payload = step_rows[11]["payload"] assert convergence_check_payload["passed"] is True assert convergence_check_payload["source"] == "live_tomostream_centroid" assert convergence_check_payload["evidence"]["offset_px"] == 0.5 @@ -1056,7 +1055,7 @@ async def _read_steps(db_pool: asyncpg.Pool, procedure_id: UUID) -> list[asyncpg """ SELECT step_kind, payload, sampled_at FROM entries_operation_procedure_activities - WHERE procedure_id = $1 + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' ORDER BY sampled_at """, procedure_id, diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_focus.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_focus.py index 81f0ae7003..7337a32850 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_focus.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_focus.py @@ -657,7 +657,9 @@ async def test_focus_alignment_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 13 diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py index 8ad022bd62..fcdf0a7583 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py @@ -102,7 +102,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, 
datetime from typing import Any from uuid import UUID, uuid4 @@ -663,7 +662,9 @@ async def test_pitch_alignment_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind, payload FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 14 @@ -687,7 +688,7 @@ async def test_pitch_alignment_plays_out_end_to_end( # The convergence Check (iteration 2's 180° check) records the sharpness # delta. Iteration is no longer encoded here (no `evidence['iteration']`); # it is first-class, asserted via the per-iteration read model below. - convergence_check_payload = json.loads(rows[12]["payload"]) + convergence_check_payload = rows[12]["payload"] assert convergence_check_payload["passed"] is True assert convergence_check_payload["evidence"]["delta_sharpness"] == 0.02 assert "iteration" not in convergence_check_payload["evidence"] diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_resolution.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_resolution.py index 9d7a1f766a..4884ac342b 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_resolution.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_resolution.py @@ -592,7 +592,9 @@ async def test_resolution_alignment_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 13 diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py index e9dcb02388..728dcadee7 100644 --- 
a/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py @@ -100,7 +100,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, datetime from typing import Any from uuid import UUID, uuid4 @@ -651,7 +650,9 @@ async def test_roll_alignment_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind, payload FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 14 @@ -676,7 +677,7 @@ async def test_roll_alignment_plays_out_end_to_end( # judgment + supporting evidence. Iteration is no longer encoded here # (no `evidence['iteration']`); it is first-class, asserted via the # per-iteration read model below. 
- convergence_check_payload = json.loads(rows[12]["payload"]) + convergence_check_payload = rows[12]["payload"] assert convergence_check_payload["passed"] is True assert convergence_check_payload["evidence"]["delta_y_px"] == 0.3 assert "iteration" not in convergence_check_payload["evidence"] diff --git a/apps/api/tests/integration/scenarios/test_2bm_blade_throw_characterization.py b/apps/api/tests/integration/scenarios/test_2bm_blade_throw_characterization.py index 5947220475..eec5129885 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_blade_throw_characterization.py +++ b/apps/api/tests/integration/scenarios/test_2bm_blade_throw_characterization.py @@ -211,7 +211,9 @@ async def _read_steps(db_pool: asyncpg.Pool, procedure_id: UUID) -> list[asyncpg return await conn.fetch( "SELECT step_kind, payload, sampled_at " "FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", procedure_id, ) diff --git a/apps/api/tests/integration/scenarios/test_2bm_dark_baseline.py b/apps/api/tests/integration/scenarios/test_2bm_dark_baseline.py index 2b2d4e1856..52d25be56c 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_dark_baseline.py +++ b/apps/api/tests/integration/scenarios/test_2bm_dark_baseline.py @@ -457,7 +457,9 @@ async def test_dark_baseline_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 5 diff --git a/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py b/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py index 60cb2acf2b..120d047734 100644 --- 
a/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py +++ b/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py @@ -219,7 +219,9 @@ async def _read_steps(db_pool: asyncpg.Pool, procedure_id: UUID) -> list[asyncpg return await conn.fetch( "SELECT step_kind, payload, sampled_at " "FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", procedure_id, ) @@ -516,9 +518,8 @@ def t(seconds: int) -> datetime: "setpoint", "setpoint", # finalize: converged AX + AY ] - import json - last = json.loads(step_rows[-1]["payload"]) + last = step_rows[-1]["payload"] assert last["channel"] == "DetectorTable.AY" assert last["units"] == "deg" diff --git a/apps/api/tests/integration/scenarios/test_2bm_energy_characterization.py b/apps/api/tests/integration/scenarios/test_2bm_energy_characterization.py index 2066d4c45f..51cb059b2c 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_energy_characterization.py +++ b/apps/api/tests/integration/scenarios/test_2bm_energy_characterization.py @@ -207,7 +207,7 @@ async def _read_steps(db_pool: asyncpg.Pool, procedure_id: UUID) -> list[asyncpg """ SELECT step_kind, payload, sampled_at FROM entries_operation_procedure_activities - WHERE procedure_id = $1 + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' ORDER BY sampled_at """, procedure_id, diff --git a/apps/api/tests/integration/scenarios/test_2bm_energy_setting.py b/apps/api/tests/integration/scenarios/test_2bm_energy_setting.py index 9ebe08cd0d..bae8081cdf 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_energy_setting.py +++ b/apps/api/tests/integration/scenarios/test_2bm_energy_setting.py @@ -354,7 +354,9 @@ async def test_energy_setting_records_a_coordinated_move(db_pool: asyncpg.Pool) await _drain(db_pool) async with db_pool.acquire() as conn: rows = 
await conn.fetch( - "SELECT step_kind FROM entries_operation_procedure_activities WHERE procedure_id = $1", + "SELECT step_kind FROM entries_operation_procedure_activities " + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight'", procedure_id, ) kinds = [r["step_kind"] for r in rows] diff --git a/apps/api/tests/integration/scenarios/test_2bm_first_light.py b/apps/api/tests/integration/scenarios/test_2bm_first_light.py index 6f84888a38..6c38380104 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_first_light.py +++ b/apps/api/tests/integration/scenarios/test_2bm_first_light.py @@ -468,7 +468,9 @@ async def test_first_light_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 7 diff --git a/apps/api/tests/integration/scenarios/test_2bm_flat_baseline.py b/apps/api/tests/integration/scenarios/test_2bm_flat_baseline.py index a0ecbda1f9..0f85209dd9 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_flat_baseline.py +++ b/apps/api/tests/integration/scenarios/test_2bm_flat_baseline.py @@ -479,7 +479,9 @@ async def test_flat_baseline_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 8 diff --git a/apps/api/tests/integration/scenarios/test_2bm_hexapod_reboot.py b/apps/api/tests/integration/scenarios/test_2bm_hexapod_reboot.py index d87f875f38..48fefb3407 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_hexapod_reboot.py +++ 
b/apps/api/tests/integration/scenarios/test_2bm_hexapod_reboot.py @@ -690,7 +690,9 @@ async def test_hexapod_reboot_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind, payload FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at, event_id", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at, event_id", _PROCEDURE_ID, ) assert len(rows) == 17 diff --git a/apps/api/tests/integration/scenarios/test_2bm_motor_homing.py b/apps/api/tests/integration/scenarios/test_2bm_motor_homing.py index bb8b0fa651..1805aa18bb 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_motor_homing.py +++ b/apps/api/tests/integration/scenarios/test_2bm_motor_homing.py @@ -686,7 +686,9 @@ async def test_motor_homing_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 9 diff --git a/apps/api/tests/integration/scenarios/test_2bm_sensitivity_characterization.py b/apps/api/tests/integration/scenarios/test_2bm_sensitivity_characterization.py index 3da5e1f6e3..3733f146e0 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_sensitivity_characterization.py +++ b/apps/api/tests/integration/scenarios/test_2bm_sensitivity_characterization.py @@ -520,7 +520,9 @@ async def test_sensitivity_characterization_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind, payload FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at, event_id", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at, event_id", _PROCEDURE_ID, ) 
assert len(rows) == 8 diff --git a/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py b/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py index a622efa507..5bda2a3ed1 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py +++ b/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py @@ -22,7 +22,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, datetime from typing import Any from uuid import UUID, uuid4 @@ -184,7 +183,9 @@ async def _read_steps(db_pool: asyncpg.Pool, procedure_id: UUID) -> list[asyncpg return await conn.fetch( "SELECT step_kind, payload, sampled_at " "FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", procedure_id, ) @@ -368,7 +369,7 @@ def t(seconds: int) -> datetime: "action", "check", # close ] - close_size = json.loads(step_rows[4]["payload"]) + close_size = step_rows[4]["payload"] assert close_size["channel"] == "Hsize" assert close_size["target_value"] == 0.5 diff --git a/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py b/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py index d46cae34f1..524455f51d 100644 --- a/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py +++ b/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py @@ -23,7 +23,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, datetime from uuid import UUID @@ -121,8 +120,9 @@ async def test_conductor_runs_collect_action_against_real_softioc_and_postgres( started_event_id = UUID("01900000-0000-7000-8000-0000020d0101") logbook_id = UUID("01900000-0000-7000-8000-0000020d0102") open_event_id = 
UUID("01900000-0000-7000-8000-0000020d0103") - collect_step_id = UUID("01900000-0000-7000-8000-0000020d0104") - completed_event_id = UUID("01900000-0000-7000-8000-0000020d0105") + collect_marker_id = UUID("01900000-0000-7000-8000-0000020d0104") + collect_step_id = UUID("01900000-0000-7000-8000-0000020d0105") + completed_event_id = UUID("01900000-0000-7000-8000-0000020d0106") deps = build_postgres_deps( db_pool, @@ -131,6 +131,7 @@ async def test_conductor_runs_collect_action_against_real_softioc_and_postgres( started_event_id, logbook_id, open_event_id, + collect_marker_id, collect_step_id, completed_event_id, ], @@ -178,13 +179,13 @@ async def test_conductor_runs_collect_action_against_real_softioc_and_postgres( """ SELECT step_kind, payload FROM entries_operation_procedure_activities - WHERE procedure_id = $1 + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' ORDER BY sampled_at, event_id """, procedure_id, ) assert [r["step_kind"] for r in rows] == ["action"] - payload = json.loads(rows[0]["payload"]) + payload = rows[0]["payload"] assert payload["name"] == "collect" assert payload["result"] == "ok" result_data = payload["result_data"] @@ -211,8 +212,9 @@ async def test_conductor_runs_discrete_action_walks_axis_with_per_point_collects started_event_id = UUID("01900000-0000-7000-8000-0000020d0201") logbook_id = UUID("01900000-0000-7000-8000-0000020d0202") open_event_id = UUID("01900000-0000-7000-8000-0000020d0203") - discrete_step_id = UUID("01900000-0000-7000-8000-0000020d0204") - completed_event_id = UUID("01900000-0000-7000-8000-0000020d0205") + discrete_marker_id = UUID("01900000-0000-7000-8000-0000020d0204") + discrete_step_id = UUID("01900000-0000-7000-8000-0000020d0205") + completed_event_id = UUID("01900000-0000-7000-8000-0000020d0206") deps = build_postgres_deps( db_pool, @@ -221,6 +223,7 @@ async def test_conductor_runs_discrete_action_walks_axis_with_per_point_collects started_event_id, logbook_id, open_event_id, + 
discrete_marker_id, discrete_step_id, completed_event_id, ], @@ -269,11 +272,11 @@ async def test_conductor_runs_discrete_action_walks_axis_with_per_point_collects """ SELECT payload FROM entries_operation_procedure_activities - WHERE procedure_id = $1 + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' """, procedure_id, ) - payload = json.loads(rows[0]["payload"]) + payload = rows[0]["payload"] assert payload["name"] == "discrete" assert payload["result"] == "ok" result_data = payload["result_data"] @@ -296,8 +299,9 @@ async def test_conductor_runs_continuous_action_with_axis_sweep_against_softioc( started_event_id = UUID("01900000-0000-7000-8000-0000020d0301") logbook_id = UUID("01900000-0000-7000-8000-0000020d0302") open_event_id = UUID("01900000-0000-7000-8000-0000020d0303") - continuous_step_id = UUID("01900000-0000-7000-8000-0000020d0304") - completed_event_id = UUID("01900000-0000-7000-8000-0000020d0305") + continuous_marker_id = UUID("01900000-0000-7000-8000-0000020d0304") + continuous_step_id = UUID("01900000-0000-7000-8000-0000020d0305") + completed_event_id = UUID("01900000-0000-7000-8000-0000020d0306") deps = build_postgres_deps( db_pool, @@ -306,6 +310,7 @@ async def test_conductor_runs_continuous_action_with_axis_sweep_against_softioc( started_event_id, logbook_id, open_event_id, + continuous_marker_id, continuous_step_id, completed_event_id, ], @@ -356,11 +361,11 @@ async def test_conductor_runs_continuous_action_with_axis_sweep_against_softioc( """ SELECT payload FROM entries_operation_procedure_activities - WHERE procedure_id = $1 + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' """, procedure_id, ) - payload = json.loads(rows[0]["payload"]) + payload = rows[0]["payload"] assert payload["name"] == "continuous" assert payload["result"] == "ok" result_data = payload["result_data"] diff --git a/apps/api/tests/integration/test_append_activities_handler_postgres.py 
b/apps/api/tests/integration/test_append_activities_handler_postgres.py index 997d354695..1ca26d607e 100644 --- a/apps/api/tests/integration/test_append_activities_handler_postgres.py +++ b/apps/api/tests/integration/test_append_activities_handler_postgres.py @@ -12,7 +12,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, datetime from uuid import UUID, uuid4 @@ -207,7 +206,7 @@ async def test_append_activities_lazy_open_and_polymorphic_round_trip( # recorded_at is DEFAULT now() at the DB layer; must come AFTER occurred_at. assert setpoint_row["recorded_at"] >= setpoint_row["occurred_at"] - # asyncpg returns jsonb as a JSON string for plain SELECT; decode it. - setpoint_payload = json.loads(setpoint_row["payload"]) + # payload is read back as an already-decoded object; no json.loads needed. + setpoint_payload = setpoint_row["payload"] assert setpoint_payload == { "channel": "T_oven", "target_value": 423.0, @@ -215,10 +214,10 @@ async def test_append_activities_lazy_open_and_polymorphic_round_trip( "ramp_rate": 5.0, } - action_payload = json.loads(by_kind["action"]["payload"]) + action_payload = by_kind["action"]["payload"] assert action_payload == {"action_name": "open_valve", "params": {"valve": "V12"}} - check_payload = json.loads(by_kind["check"]["payload"]) + check_payload = by_kind["check"]["payload"] assert check_payload["passed"] is True assert check_payload["actual"] == 422.8 @@ -308,7 +307,7 @@ async def test_append_activities_dedups_on_event_id_in_postgres( rows = await _read_steps_for_procedure(db_pool, procedure_id) assert len(rows) == 1 assert rows[0]["step_kind"] == "setpoint" - payload = json.loads(rows[0]["payload"]) + payload = rows[0]["payload"] assert payload == {"channel": "X", "target_value": 1.0} @@ -317,3 +316,57 @@ async def test_postgres_step_store_handles_empty_batch(db_pool: asyncpg.Pool) -> """Empty batch is a no-op at the adapter layer (early return).""" store = PostgresActivityStore(db_pool) await store.append([])
# No exception, no rows touched. + + +@pytest.mark.integration +async def test_payload_stores_as_real_jsonb_so_server_side_filters_work( + db_pool: asyncpg.Pool, +) -> None: + """Regression: payload must persist as a real jsonb OBJECT (not a double- + encoded jsonb scalar string), so server-side `payload->>'key'` works. When + payload was double-encoded (json.dumps bound to a jsonb column with no + `::jsonb` cast), `payload->>'result'` returned NULL and the conductor's + in-flight-marker filters (`payload->>'result' IS DISTINCT FROM 'in_flight'`) + silently no-op'd, leaking marker rows into assertions.""" + procedure_id = UUID("01900000-0000-7000-8000-0000010c0d01") + logbook_id = UUID("01900000-0000-7000-8000-0000010c0d02") + open_event_id = UUID("01900000-0000-7000-8000-0000010c0d03") + deps = build_postgres_deps(db_pool, now=_NOW, ids=[logbook_id, open_event_id]) + step_store = PostgresActivityStore(db_pool) + await _seed_running_procedure(deps.event_store, procedure_id) + + handler = bind_append(deps, step_store=step_store) + await handler( + AppendProcedureActivities( + procedure_id=procedure_id, + entries=( + _entry( + event_id=UUID("01900000-0000-7000-8000-0000010c0e01"), + step_kind="setpoint", + payload={"address": "2bma:x", "result": "in_flight"}, + sampled_at=datetime(2026, 5, 15, 12, 0, 1, tzinfo=UTC), + ), + _entry( + event_id=UUID("01900000-0000-7000-8000-0000010c0e02"), + step_kind="setpoint", + payload={"address": "2bma:x", "result": "ok"}, + sampled_at=datetime(2026, 5, 15, 12, 0, 2, tzinfo=UTC), + ), + ), + ), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + async with db_pool.acquire() as conn: + # Server-side extraction returns the actual value (not NULL), so the + # marker filter excludes the in_flight row and keeps only the outcome. 
+ rows = await conn.fetch( + """ + SELECT payload->>'result' AS result + FROM entries_operation_procedure_activities + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' + """, + procedure_id, + ) + assert [r["result"] for r in rows] == ["ok"] diff --git a/apps/api/tests/integration/test_conductor_against_softioc_postgres.py b/apps/api/tests/integration/test_conductor_against_softioc_postgres.py index 14c5cf5609..88c24c0439 100644 --- a/apps/api/tests/integration/test_conductor_against_softioc_postgres.py +++ b/apps/api/tests/integration/test_conductor_against_softioc_postgres.py @@ -104,9 +104,10 @@ async def test_conductor_runs_setpoint_check_against_real_softioc_and_postgres( logbook_id = UUID("01900000-0000-7000-8000-0000020c0101") open_event_id = UUID("01900000-0000-7000-8000-0000020c0102") started_event_id = UUID("01900000-0000-7000-8000-0000020c0103") - setpoint_step_id = UUID("01900000-0000-7000-8000-0000020c0104") - check_step_id = UUID("01900000-0000-7000-8000-0000020c0105") - completed_event_id = UUID("01900000-0000-7000-8000-0000020c0106") + setpoint_marker_id = UUID("01900000-0000-7000-8000-0000020c0104") + setpoint_outcome_id = UUID("01900000-0000-7000-8000-0000020c0105") + check_step_id = UUID("01900000-0000-7000-8000-0000020c0106") + completed_event_id = UUID("01900000-0000-7000-8000-0000020c0107") deps = build_postgres_deps( db_pool, @@ -115,7 +116,8 @@ async def test_conductor_runs_setpoint_check_against_real_softioc_and_postgres( started_event_id, logbook_id, open_event_id, - setpoint_step_id, + setpoint_marker_id, + setpoint_outcome_id, check_step_id, completed_event_id, ], @@ -167,17 +169,29 @@ async def test_conductor_runs_setpoint_check_against_real_softioc_and_postgres( """, procedure_id, ) - assert [r["step_kind"] for r in rows] == ["setpoint", "check"] - import json - setpoint_payload = json.loads(rows[0]["payload"]) + parsed = [(r["step_kind"], r["payload"]) for r in rows] + # The setpoint is side-effecting: it 
records a pre-effect in-flight + # marker then the `ok` outcome, both round-tripping into Postgres. The + # check (pure read) records only its outcome -- no marker. + assert [(k, p["result"]) for k, p in parsed] == [ + ("setpoint", "in_flight"), + ("setpoint", "ok"), + ("check", "ok"), + ] + setpoint_marker = parsed[0][1] + assert setpoint_marker["address"] == f"{softioc}double_value" + assert setpoint_marker["value"] == 7.5 + assert "post_reading" not in setpoint_marker # marker precedes the write + + setpoint_payload = parsed[1][1] assert setpoint_payload["address"] == f"{softioc}double_value" assert setpoint_payload["value"] == 7.5 assert setpoint_payload["result"] == "ok" assert setpoint_payload["post_reading"]["value"] == 7.5 assert setpoint_payload["post_reading"]["quality"] == "Good" - check_payload = json.loads(rows[1]["payload"]) + check_payload = parsed[2][1] assert check_payload["address"] == f"{softioc}double_value" assert check_payload["criterion"] == { "kind": "within_tolerance", @@ -204,8 +218,9 @@ async def test_conductor_aborts_procedure_when_setpoint_fails_against_softioc( started_event_id = UUID("01900000-0000-7000-8000-0000020c0201") logbook_id = UUID("01900000-0000-7000-8000-0000020c0202") open_event_id = UUID("01900000-0000-7000-8000-0000020c0203") - setpoint_step_id = UUID("01900000-0000-7000-8000-0000020c0204") - aborted_event_id = UUID("01900000-0000-7000-8000-0000020c0205") + setpoint_marker_id = UUID("01900000-0000-7000-8000-0000020c0204") + setpoint_outcome_id = UUID("01900000-0000-7000-8000-0000020c0205") + aborted_event_id = UUID("01900000-0000-7000-8000-0000020c0206") deps = build_postgres_deps( db_pool, @@ -214,7 +229,8 @@ async def test_conductor_aborts_procedure_when_setpoint_fails_against_softioc( started_event_id, logbook_id, open_event_id, - setpoint_step_id, + setpoint_marker_id, + setpoint_outcome_id, aborted_event_id, ], ) @@ -271,9 +287,10 @@ async def test_conductor_completes_procedure_with_equals_check_against_softioc( 
started_event_id = UUID("01900000-0000-7000-8000-0000020c0301") logbook_id = UUID("01900000-0000-7000-8000-0000020c0302") open_event_id = UUID("01900000-0000-7000-8000-0000020c0303") - setpoint_step_id = UUID("01900000-0000-7000-8000-0000020c0304") - check_step_id = UUID("01900000-0000-7000-8000-0000020c0305") - completed_event_id = UUID("01900000-0000-7000-8000-0000020c0306") + setpoint_marker_id = UUID("01900000-0000-7000-8000-0000020c0304") + setpoint_outcome_id = UUID("01900000-0000-7000-8000-0000020c0305") + check_step_id = UUID("01900000-0000-7000-8000-0000020c0306") + completed_event_id = UUID("01900000-0000-7000-8000-0000020c0307") deps = build_postgres_deps( db_pool, @@ -282,7 +299,8 @@ async def test_conductor_completes_procedure_with_equals_check_against_softioc( started_event_id, logbook_id, open_event_id, - setpoint_step_id, + setpoint_marker_id, + setpoint_outcome_id, check_step_id, completed_event_id, ], diff --git a/apps/api/tests/integration/test_held_status_projection_postgres.py b/apps/api/tests/integration/test_held_status_projection_postgres.py new file mode 100644 index 0000000000..c1e4f66b7b --- /dev/null +++ b/apps/api/tests/integration/test_held_status_projection_postgres.py @@ -0,0 +1,113 @@ +"""End-to-end: the Held/Resumed FSM drives `status` in the +`proj_operation_procedure_summary` read model against real Postgres. + +This is the only place the widened status CHECK (migration +20260621060000, admitting 'Held') is actually exercised: the projection +unit tests use a mocked connection, so the column constraint is enforced +only here. + +Pins: + - ProcedureHeld folds to status='Held' + last_status_reason (the hold + reason), proving the CHECK admits 'Held'. + - ProcedureResumed folds back to status='Running' and clears + last_status_reason (Running is not reason-bearing). + - The list_procedures read path surfaces + filters on status='Held'. 
+""" + +# pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false + +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import asyncpg +import pytest + +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.projection import ProjectionRegistry, drain_projections +from cora.operation._projections import register_operation_projections +from cora.operation.features.hold_procedure import HoldProcedure +from cora.operation.features.hold_procedure import bind as bind_hold +from cora.operation.features.list_procedures import ListProcedures +from cora.operation.features.list_procedures import bind as bind_list +from cora.operation.features.register_procedure import RegisterProcedure +from cora.operation.features.register_procedure import bind as bind_register +from cora.operation.features.resume_procedure import ResumeProcedure +from cora.operation.features.resume_procedure import bind as bind_resume +from cora.operation.features.start_procedure import StartProcedure +from cora.operation.features.start_procedure import bind as bind_start +from tests.integration._helpers import build_postgres_deps + +_NOW = datetime(2026, 6, 21, 12, 0, 0, tzinfo=UTC) +_PRINCIPAL_ID = UUID("01900000-0000-7000-8000-000000000099") +_CORRELATION_ID = UUID("01900000-0000-7000-8000-0000000000aa") + + +def _build_deps(db_pool: asyncpg.Pool, ids: list[UUID]) -> Kernel: + return build_postgres_deps(db_pool, now=_NOW, ids=ids) + + +async def _drain(db_pool: asyncpg.Pool) -> None: + registry = ProjectionRegistry() + register_operation_projections(registry) + await drain_projections(db_pool, registry, deadline_seconds=2.0) + + +async def _status_row(db_pool: asyncpg.Pool, proc_id: UUID) -> asyncpg.Record: + async with db_pool.acquire() as conn: + row = await conn.fetchrow( + "SELECT status, last_status_reason FROM proj_operation_procedure_summary " + "WHERE procedure_id = $1", + proc_id, + ) + assert row is not None + 
return row + + +@pytest.mark.integration +async def test_hold_then_resume_drives_status_in_read_model(db_pool: asyncpg.Pool) -> None: + proc_id = uuid4() + deps = _build_deps(db_pool, [proc_id, *[uuid4() for _ in range(8)]]) + + await bind_register(deps)( + RegisterProcedure(name="2-BM center alignment", kind="center_alignment"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + await bind_start(deps)( + StartProcedure(procedure_id=proc_id), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + # Hold: the projection writes status='Held' (the CHECK must admit it) + + # the hold reason. + await bind_hold(deps)( + HoldProcedure(procedure_id=proc_id, reason="beam dropped"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + await _drain(db_pool) + held = await _status_row(db_pool, proc_id) + assert held["status"] == "Held" + assert held["last_status_reason"] == "beam dropped" + + # The list read path surfaces + filters on the new status. + page = await bind_list(deps)( + ListProcedures(status="Held"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + item = next(i for i in page.items if i.procedure_id == proc_id) + assert item.status == "Held" + assert item.last_status_reason == "beam dropped" + + # Resume: back to Running, hold reason cleared. 
+ await bind_resume(deps)( + ResumeProcedure(procedure_id=proc_id, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + await _drain(db_pool) + resumed = await _status_row(db_pool, proc_id) + assert resumed["status"] == "Running" + assert resumed["last_status_reason"] is None diff --git a/apps/api/tests/unit/operation/test_abort_procedure_decider.py b/apps/api/tests/unit/operation/test_abort_procedure_decider.py index 3654ae0a40..2b5111dd3c 100644 --- a/apps/api/tests/unit/operation/test_abort_procedure_decider.py +++ b/apps/api/tests/unit/operation/test_abort_procedure_decider.py @@ -57,6 +57,21 @@ def test_decide_emits_procedure_aborted_when_running() -> None: assert events[0].actuation_kind is None +@pytest.mark.unit +def test_decide_emits_procedure_aborted_when_held() -> None: + """Resumable conduct: a paused (Held) Procedure stays abortable.""" + proc = _procedure(status=ProcedureStatus.HELD) + events = abort_procedure.decide( + state=proc, + command=AbortProcedure(procedure_id=proc.id, reason="paused then abandoned"), + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureAborted) + assert events[0].procedure_id == proc.id + assert events[0].reason == "paused then abandoned" + + @pytest.mark.unit @pytest.mark.parametrize("kind", ["Physical", "Simulated", "Hybrid"]) def test_decide_snapshots_actuation_kind_onto_aborted_event(kind: str) -> None: diff --git a/apps/api/tests/unit/operation/test_abort_procedure_decider_properties.py b/apps/api/tests/unit/operation/test_abort_procedure_decider_properties.py index 2cdc86eab5..7dc8559514 100644 --- a/apps/api/tests/unit/operation/test_abort_procedure_decider_properties.py +++ b/apps/api/tests/unit/operation/test_abort_procedure_decider_properties.py @@ -10,8 +10,8 @@ - state=None always raises `ProcedureNotFoundError` carrying command.procedure_id. 
- - The source-state partition is total over `ProcedureStatus`: the - sole source `{Running}` emits exactly one `ProcedureAborted` + - The source-state partition is total over `ProcedureStatus`: each + source in `{Running, Held}` emits exactly one `ProcedureAborted` (procedure_id=state.id, reason threaded, occurred_at=now); every other status raises `ProcedureCannotAbortError` carrying the current status. @@ -46,7 +46,7 @@ _REASON = printable_ascii_text(min_size=1, max_size=500) -_ABORTABLE_SOURCES = (ProcedureStatus.RUNNING,) +_ABORTABLE_SOURCES = (ProcedureStatus.RUNNING, ProcedureStatus.HELD) _DISALLOWED_SOURCES = tuple(s for s in ProcedureStatus if s not in frozenset(_ABORTABLE_SOURCES)) @@ -91,7 +91,7 @@ def test_abort_from_permitted_source_emits_single_event( reason: str, now: datetime, ) -> None: - """Running emits one ProcedureAborted with the threaded reason.""" + """Running or Held emits one ProcedureAborted with the threaded reason.""" events = abort_procedure.decide( state=_procedure(procedure_id=procedure_id, status=source), command=AbortProcedure(procedure_id=procedure_id, reason=reason), @@ -113,7 +113,7 @@ def test_abort_from_disallowed_source_always_raises_cannot_abort( reason: str, now: datetime, ) -> None: - """Any non-Running source raises ProcedureCannotAbortError carrying the status. + """Any source outside {Running, Held} raises ProcedureCannotAbortError. A valid reason is supplied so the source-state guard is what fires (reason validation runs first in the decider). 
diff --git a/apps/api/tests/unit/operation/test_collect_action_body.py b/apps/api/tests/unit/operation/test_collect_action_body.py index 569df29bdf..2c1f6ffa51 100644 --- a/apps/api/tests/unit/operation/test_collect_action_body.py +++ b/apps/api/tests/unit/operation/test_collect_action_body.py @@ -475,7 +475,7 @@ async def test_conductor_executes_collect_action_and_records_step_entry() -> Non control_port=port, append_step=appender, clock=FakeClock(_FIXED_NOW), - id_generator=_SequenceIdGenerator([uuid4()]), + id_generator=_SequenceIdGenerator([uuid4(), uuid4()]), action_registry=registry, ) result = await conductor.execute( @@ -496,7 +496,9 @@ async def test_conductor_executes_collect_action_and_records_step_entry() -> Non ) assert result.succeeded is True assert result.completed_count == 1 - entry = appender.calls[0].command.entries[0] + # calls[0] is the pre-effect in-flight marker; calls[1] is the outcome. + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + entry = appender.calls[1].command.entries[0] assert entry.step_kind == "action" assert entry.payload["name"] == "collect" assert entry.payload["result"] == "ok" diff --git a/apps/api/tests/unit/operation/test_conduct_procedure_handler.py b/apps/api/tests/unit/operation/test_conduct_procedure_handler.py index 9b6026bc51..8d25a4dd45 100644 --- a/apps/api/tests/unit/operation/test_conduct_procedure_handler.py +++ b/apps/api/tests/unit/operation/test_conduct_procedure_handler.py @@ -29,6 +29,7 @@ from cora.infrastructure.ports.clock import FakeClock from cora.infrastructure.ports.id_generator import UUIDv7Generator from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation._conduct_wire import criterion_from_wire, step_from_wire from cora.operation.adapters.in_memory_recipe_expander import ( InMemoryRecipeExpander, ) @@ -58,9 +59,7 @@ from cora.operation.features.conduct_procedure.handler import bind from cora.operation.features.conduct_procedure.route import ( 
ConductProcedureRequest, - criterion_from_wire, result_to_wire, - step_from_wire, ) _NOW = datetime(2026, 6, 2, 12, 0, 0, tzinfo=UTC) @@ -225,7 +224,7 @@ async def test_conduct_procedure_handler_dispatches_to_conductor_with_envelope() @pytest.mark.unit async def test_conduct_procedure_pins_resolved_steps_before_conducting() -> None: - """The handler appends a ResolvedStepsRecorded manifest (the resolved + """The handler appends a ResolvedStepsRecorded event (the resolved step list) to the Procedure stream before dispatching to the Conductor.""" procedure_id = uuid4() store = InMemoryEventStore() @@ -248,14 +247,14 @@ async def test_conduct_procedure_pins_resolved_steps_before_conducting() -> None ) stored, _ = await store.load(stream_type="Procedure", stream_id=procedure_id) - manifests = [ + recorded = [ event for event in (from_stored(s) for s in stored) if isinstance(event, ResolvedStepsRecorded) ] - assert len(manifests) == 1 - assert manifests[0].step_count == 2 - assert manifests[0].resolved_steps == tuple(step_to_payload(step) for step in steps) + assert len(recorded) == 1 + assert recorded[0].step_count == 2 + assert recorded[0].resolved_steps == tuple(step_to_payload(step) for step in steps) @pytest.mark.unit diff --git a/apps/api/tests/unit/operation/test_conductor.py b/apps/api/tests/unit/operation/test_conductor.py index faa2aea184..967e5bc435 100644 --- a/apps/api/tests/unit/operation/test_conductor.py +++ b/apps/api/tests/unit/operation/test_conductor.py @@ -2,11 +2,18 @@ Coverage spans both step kinds shipped to date (setpoint + action): + Pre-effect in-flight marker (side-effecting steps): + - a setpoint / action records a `result="in_flight"` marker entry + BEFORE the effect, then the `ok` / `failed` outcome after (two + appends per side-effecting step); a check records no marker + - a cancelled / crashing effect still leaves the marker behind + (marker-without-outcome = the interrupted step), enabling resume + Setpoint: - empty steps -> trivially 
succeeds, no handler call - - 3 setpoints -> 3 ControlPort writes + 3 step entries recorded + - 3 setpoints -> 3 ControlPort writes; each records marker + outcome - first write raises ControlNotConnectedError -> halt at index 0, - failure entry recorded, ConductorResult.failure populated + marker + failure entry recorded, ConductorResult.failure populated - middle write raises ControlTimeoutError -> halt at index N, earlier successes recorded, failure entry for the failing step, remaining steps untouched @@ -140,19 +147,23 @@ class _SequenceIdGenerator: """Deterministic id_generator that returns a pre-supplied list of ids. Lets tests pin event_id values into the recorded entries so the - payload assertion is exact. Raises on exhaustion so missing ids - are loud, not silent. + payload assertion is exact. The pre-effect in-flight marker doubles + the per-step append count for setpoint / action steps, so a test + supplies only the ids it actually asserts on (pinned first, in + order) and lets the rest auto-generate. Append-COUNT assertions are + pinned via `len(appender.calls)`, not via id exhaustion, so lenient + generation here masks no over-append bug. 
""" ids: list[UUID] _index: int = 0 def new_id(self) -> UUID: - if self._index >= len(self.ids): - raise RuntimeError("FixedIdGenerator exhausted") - out = self.ids[self._index] - self._index += 1 - return out + if self._index < len(self.ids): + out = self.ids[self._index] + self._index += 1 + return out + return uuid4() def _conductor( @@ -219,34 +230,45 @@ async def test_execute_setpoints_writes_each_step_via_control_port_in_order() -> @pytest.mark.unit -async def test_execute_setpoint_records_success_entry_with_expected_payload() -> None: - """Each successful write produces one append call with the expected payload.""" +async def test_execute_setpoint_records_in_flight_marker_then_success_entry() -> None: + """A successful write produces two append calls: the pre-effect in-flight + marker first, then the `ok` outcome entry, both carrying the envelope.""" port = InMemoryControlPort(now=lambda: _FIXED_NOW) port.simulate_connect("2bma:rot:val") appender = _FakeAppendStep() procedure_id = uuid4() principal_id = uuid4() correlation_id = uuid4() - event_id = uuid4() - conductor = _conductor(port, appender, ids=[event_id]) + marker_id = uuid4() + outcome_id = uuid4() + conductor = _conductor(port, appender, ids=[marker_id, outcome_id]) await conductor.execute( procedure_id=procedure_id, principal_id=principal_id, correlation_id=correlation_id, steps=(SetpointStep(address="2bma:rot:val", value=12.5),), ) - assert len(appender.calls) == 1 - call = appender.calls[0] - assert call.command.procedure_id == procedure_id - assert call.principal_id == principal_id - assert call.correlation_id == correlation_id - assert len(call.command.entries) == 1 - entry = call.command.entries[0] - assert entry.event_id == event_id - assert entry.step_kind == "setpoint" - assert entry.sampled_at == _FIXED_NOW - assert entry.occurred_at == _FIXED_NOW - assert entry.payload == { + assert len(appender.calls) == 2 + for call in appender.calls: + assert call.command.procedure_id == procedure_id + 
assert call.principal_id == principal_id + assert call.correlation_id == correlation_id + assert len(call.command.entries) == 1 + marker = appender.calls[0].command.entries[0] + assert marker.event_id == marker_id + assert marker.step_kind == "setpoint" + assert marker.payload == { + "address": "2bma:rot:val", + "value": 12.5, + "step_index": 0, + "result": "in_flight", + } + outcome = appender.calls[1].command.entries[0] + assert outcome.event_id == outcome_id + assert outcome.step_kind == "setpoint" + assert outcome.sampled_at == _FIXED_NOW + assert outcome.occurred_at == _FIXED_NOW + assert outcome.payload == { "address": "2bma:rot:val", "value": 12.5, "step_index": 0, @@ -254,6 +276,26 @@ async def test_execute_setpoint_records_success_entry_with_expected_payload() -> } +@pytest.mark.unit +async def test_execute_check_records_no_in_flight_marker() -> None: + """A check is a pure read (always safe to re-run), so it records its + single outcome entry only -- no pre-effect in-flight marker.""" + port = InMemoryControlPort() + port.set_reading("2bma:rot:rbv", _good_reading(45.0)) + appender = _FakeAppendStep() + conductor = _conductor(port, appender) + await conductor.execute( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=(CheckStep(address="2bma:rot:rbv", criterion=EqualsCriterion(expected=45.0)),), + ) + assert len(appender.calls) == 1 + assert appender.calls[0].command.entries[0].payload["result"] == "ok" + results = [c.command.entries[0].payload["result"] for c in appender.calls] + assert "in_flight" not in results + + @pytest.mark.unit async def test_execute_halts_at_first_not_connected_error_on_setpoint() -> None: """First write raises ControlNotConnectedError -> failure at index 0.""" @@ -279,9 +321,11 @@ async def test_execute_halts_at_first_not_connected_error_on_setpoint() -> None: error_class="ControlNotConnectedError", message="Control address '2bma:rot:val' not connected", ) - # Exactly one failure entry recorded; 
the second step is untouched. - assert len(appender.calls) == 1 - failure_entry = appender.calls[0].command.entries[0] + # In-flight marker then the failure outcome for step 0; the second + # step is untouched (no marker, no outcome). + assert len(appender.calls) == 2 + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + failure_entry = appender.calls[1].command.entries[0] assert failure_entry.payload["result"] == "failed" assert failure_entry.payload["error_class"] == "ControlNotConnectedError" assert "not connected" in failure_entry.payload["message"] @@ -310,10 +354,11 @@ async def test_execute_records_earlier_setpoint_successes_before_middle_failure( assert result.failure is not None assert result.failure.step_index == 1 assert result.failure.target == "2bma:cam:exposure" - # 2 append calls: one OK at index 0, one FAILED at index 1; index 2 never tried. - assert len(appender.calls) == 2 - assert appender.calls[0].command.entries[0].payload["result"] == "ok" - assert appender.calls[1].command.entries[0].payload["result"] == "failed" + # 4 append calls: marker+ok at index 0, marker+failed at index 1; + # index 2 never tried (no marker). + assert len(appender.calls) == 4 + results = [c.command.entries[0].payload["result"] for c in appender.calls] + assert results == ["in_flight", "ok", "in_flight", "failed"] @pytest.mark.unit @@ -336,10 +381,12 @@ async def test_execute_records_step_index_matching_conduct_position() -> None: ) assert result.failure is not None assert result.failure.step_index == 1 - # Success entry at index 0, failure entry at index 1; index 2 never tried. - assert appender.calls[0].command.entries[0].payload["step_index"] == 0 - assert appender.calls[1].command.entries[0].payload["step_index"] == 1 - assert appender.calls[1].command.entries[0].payload["result"] == "failed" + # marker+ok at index 0, marker+failed at index 1; index 2 never tried. + # Every entry (marker and outcome) carries its step's position. 
+ step_indices = [c.command.entries[0].payload["step_index"] for c in appender.calls] + results = [c.command.entries[0].payload["result"] for c in appender.calls] + assert step_indices == [0, 0, 1, 1] + assert results == ["in_flight", "ok", "in_flight", "failed"] @pytest.mark.unit @@ -365,7 +412,12 @@ async def test_execute_passes_through_causation_and_surface_ids() -> None: @pytest.mark.unit async def test_execute_does_not_catch_non_port_exceptions_on_setpoint() -> None: - """A CancelledError mid-write propagates; nothing is recorded for it.""" + """A CancelledError mid-write propagates; the pre-effect in-flight marker + IS recorded (it lands before the write), but no outcome entry follows. + + The marker-without-outcome is exactly the resume substrate: the + interrupted step is identifiable after a crash / cancellation. + """ class _CancellingPort: async def read(self, _address: str) -> Reading: # pragma: no cover # unused @@ -391,7 +443,9 @@ def subscribe(self, _address: str) -> AsyncIterator[Reading]: # pragma: no cove correlation_id=uuid4(), steps=(SetpointStep(address="anywhere", value=1.0),), ) - assert appender.calls == [] + # Only the in-flight marker; the cancelled write recorded no outcome. + assert len(appender.calls) == 1 + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" @pytest.mark.unit @@ -441,7 +495,17 @@ async def home_motor(ctx: ActionContext) -> Mapping[str, Any]: port.simulate_connect("2bma:rot:val") await captured[0].control_port.write("2bma:rot:val", 4.2) assert (await port.read("2bma:rot:val")).value == 4.2 - entry = appender.calls[0].command.entries[0] + # marker (no result_data yet) then the outcome carrying result_data. 
+ assert len(appender.calls) == 2 + marker = appender.calls[0].command.entries[0] + assert marker.step_kind == "action" + assert marker.payload == { + "name": "home_motor", + "params": {"axis": "rot"}, + "step_index": 0, + "result": "in_flight", + } + entry = appender.calls[1].command.entries[0] assert entry.step_kind == "action" assert entry.payload == { "name": "home_motor", @@ -474,9 +538,10 @@ async def test_execute_action_unknown_name_records_failure_and_halts() -> None: assert result.failure.source_kind == "action" assert result.failure.target == "nope" assert result.failure.error_class == "UnknownActionError" - # Only one record (the failure); the second action is untouched. - assert len(appender.calls) == 1 - payload = appender.calls[0].command.entries[0].payload + # marker then the failure outcome; the second action is untouched. + assert len(appender.calls) == 2 + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "failed" assert payload["error_class"] == "UnknownActionError" assert payload["name"] == "nope" @@ -504,14 +569,19 @@ async def picky(_ctx: ActionContext) -> Mapping[str, Any]: assert result.failure.error_class == "ControlTimeoutError" assert result.failure.source_kind == "action" assert result.failure.target == "picky" - payload = appender.calls[0].command.entries[0].payload + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "failed" assert payload["error_class"] == "ControlTimeoutError" @pytest.mark.unit async def test_execute_action_body_raising_non_port_exception_propagates() -> None: - """Generic exceptions in a body propagate; the Conductor does not swallow them.""" + """Generic exceptions in a body propagate; the Conductor does not swallow them. 
+ + The pre-effect in-flight marker IS recorded (it lands before the body + runs), but no outcome entry follows the propagating exception. + """ async def buggy(_ctx: ActionContext) -> Mapping[str, Any]: raise RuntimeError("oops") @@ -527,7 +597,9 @@ async def buggy(_ctx: ActionContext) -> Mapping[str, Any]: correlation_id=uuid4(), steps=(ActionStep(name="buggy"),), ) - assert appender.calls == [] + # Only the in-flight marker; the crashing body recorded no outcome. + assert len(appender.calls) == 1 + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" @pytest.mark.unit @@ -563,9 +635,14 @@ async def close_shutter(_ctx: ActionContext) -> Mapping[str, Any]: assert result.succeeded is True assert result.completed_count == 3 assert invocations == ["open_shutter", "close_shutter"] - # 3 recorded entries in order: action / setpoint / action. - kinds = [c.command.entries[0].step_kind for c in appender.calls] - assert kinds == ["action", "setpoint", "action"] + # Outcome entries in order: action / setpoint / action (each preceded + # by its in-flight marker, filtered out here). + outcome_kinds = [ + c.command.entries[0].step_kind + for c in appender.calls + if c.command.entries[0].payload["result"] != "in_flight" + ] + assert outcome_kinds == ["action", "setpoint", "action"] @pytest.mark.unit @@ -820,8 +897,14 @@ async def open_shutter(_ctx: ActionContext) -> Mapping[str, Any]: ) assert result.succeeded is True assert result.completed_count == 3 - kinds = [c.command.entries[0].step_kind for c in appender.calls] - assert kinds == ["setpoint", "action", "check"] + # Outcome entries in order: setpoint / action / check. The setpoint and + # action each prepend an in-flight marker; the check (pure read) does not. 
+ outcome_kinds = [ + c.command.entries[0].step_kind + for c in appender.calls + if c.command.entries[0].payload["result"] != "in_flight" + ] + assert outcome_kinds == ["setpoint", "action", "check"] # --- conduct (FSM lifecycle) coverage ----------------------------------- @@ -940,6 +1023,35 @@ async def test_conduct_without_lifecycle_handlers_raises_runtime_error() -> None ) +@pytest.mark.unit +async def test_try_conduct_without_handlers_raises_runtime_error() -> None: + """try_conduct() requires start + complete + abort + hold; a missing one is + a wiring bug, not a runtime failure, so it propagates.""" + conductor = _conductor(InMemoryControlPort(), _FakeAppendStep()) # no FSM handlers + with pytest.raises(RuntimeError, match="try_conduct"): + await conductor.try_conduct( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=(), + ) + + +@pytest.mark.unit +async def test_reconduct_without_handlers_raises_runtime_error() -> None: + """reconduct() requires resume + complete + abort; a missing one is a + wiring bug, so it propagates.""" + conductor = _conductor(InMemoryControlPort(), _FakeAppendStep()) # no FSM handlers + with pytest.raises(RuntimeError, match="reconduct"): + await conductor.reconduct( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=(), + boundary=0, + ) + + @pytest.mark.unit async def test_conduct_start_failure_records_lifecycle_failure_without_execute() -> None: """start_procedure rejection -> lifecycle failure; no steps attempted.""" @@ -1053,7 +1165,8 @@ async def test_setpoint_default_verify_omits_post_reading_from_payload() -> None correlation_id=uuid4(), steps=(SetpointStep(address="2bma:rot:val", value=1.0),), ) - payload = appender.calls[0].command.entries[0].payload + # calls[0] is the in-flight marker; calls[1] is the outcome. 
+ payload = appender.calls[1].command.entries[0].payload assert "post_reading" not in payload assert "post_read_error" not in payload @@ -1071,7 +1184,8 @@ async def test_setpoint_verify_attaches_post_reading_to_payload() -> None: correlation_id=uuid4(), steps=(SetpointStep(address="2bma:rot:val", value=4.2, verify=True),), ) - payload = appender.calls[0].command.entries[0].payload + # calls[0] is the in-flight marker (no post_reading); calls[1] the outcome. + payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "ok" assert payload["post_reading"]["value"] == 4.2 assert payload["post_reading"]["quality"] == "Good" @@ -1125,7 +1239,8 @@ def subscribe(self, _address: str) -> AsyncIterator[Reading]: # pragma: no cove steps=(SetpointStep(address="2bma:rot:val", value=4.2, verify=True),), ) assert result.succeeded is True - payload = appender.calls[0].command.entries[0].payload + # calls[0] is the in-flight marker; calls[1] is the outcome. + payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "ok" assert payload["post_reading"]["quality"] == "Bad" assert payload["post_reading"]["quality_detail"] == "alarm_status=3" @@ -1162,7 +1277,8 @@ def subscribe(self, _address: str) -> AsyncIterator[Reading]: # pragma: no cove steps=(SetpointStep(address="lonely", value=1.0, verify=True),), ) assert result.succeeded is True - payload = appender.calls[0].command.entries[0].payload + # calls[0] is the in-flight marker; calls[1] is the outcome. 
+ payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "ok" assert "post_reading" not in payload assert payload["post_read_error"]["error_class"] == "ControlNotConnectedError" @@ -1182,7 +1298,10 @@ async def test_setpoint_verify_does_not_change_write_failure_halt_behavior() -> steps=(SetpointStep(address="missing", value=1.0, verify=True),), ) assert result.succeeded is False - payload = appender.calls[0].command.entries[0].payload + # calls[0] is the in-flight marker; calls[1] is the failure outcome + # (the write failed before the verify post-read, so no post_reading). + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "failed" assert payload["error_class"] == "ControlNotConnectedError" assert "post_reading" not in payload @@ -1461,9 +1580,10 @@ async def test_execute_setpoint_via_registry_with_unrouted_address_records_failu assert result.failure is not None assert result.failure.error_class == "NoAdapterForAddressError" assert result.failure.source_kind == "setpoint" - # Recorded in logbook, not propagated as a 500. - assert len(appender.calls) == 1 - assert appender.calls[0].command.entries[0].payload["result"] == "failed" + # Recorded in logbook (marker + failure), not propagated as a 500. + assert len(appender.calls) == 2 + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + assert appender.calls[1].command.entries[0].payload["result"] == "failed" # --- actuation provenance (ActuationKind) ------------------------------- diff --git a/apps/api/tests/unit/operation/test_conductor_execute_from.py b/apps/api/tests/unit/operation/test_conductor_execute_from.py new file mode 100644 index 0000000000..b2e1e19c34 --- /dev/null +++ b/apps/api/tests/unit/operation/test_conductor_execute_from.py @@ -0,0 +1,334 @@ +"""Behavioural tests for `Conductor.execute_from` (resumable conduct, Tier 1). 
+ +`execute_from` REPLAYS a pinned resolved step list from a re-establishment +boundary rather than re-deriving the step list: + + - setpoint -> re-drive (idempotent absolute write) + - check -> re-run as a fresh gate + - action -> HALT for an operator decision (interrupted acquisition) + +Headline acceptance test (per the design memo): replay walks the pinned +tail BYTE-FOR-BYTE -- two identical setpoints land on the in-memory port, +identical to what the original conduct wrote. `steps_from_payload` is the +exact inverse of `step_to_payload`, so the pinned `ResolvedStepsRecorded` +step list round-trips into the replayed `Step`s. +""" + +from collections.abc import AsyncIterator, Mapping +from dataclasses import dataclass, field +from datetime import UTC, datetime +from typing import Any +from uuid import UUID, uuid4 + +import pytest + +from cora.infrastructure.ports.clock import FakeClock +from cora.operation.conductor import ( + ActionStep, + CheckStep, + Conductor, + EqualsCriterion, + ResumePolicy, + SetpointStep, + Step, + WithinToleranceCriterion, + step_to_payload, + steps_from_payload, +) +from cora.operation.features.append_activities.command import AppendProcedureActivities +from cora.operation.ports.control_port import ControlNotConnectedError, Reading + +_FIXED_NOW = datetime(2026, 6, 21, 9, 0, 0, tzinfo=UTC) + + +@dataclass +class _FakeAppendStep: + """Captures each append call (the replayed journal).""" + + calls: list[AppendProcedureActivities] = field(default_factory=list[AppendProcedureActivities]) + + async def __call__(self, command: AppendProcedureActivities, **_kwargs: Any) -> int: + self.calls.append(command) + return len(command.entries) + + +@dataclass +class _LenientIds: + """id_generator that never exhausts (markers double appends).""" + + def new_id(self) -> UUID: + return uuid4() + + +@dataclass +class _RecordingControlPort: + """Captures writes in order (for byte-for-byte assertions); reads from a seed.""" + + writes: list[tuple[str, 
Any]] = field(default_factory=list[tuple[str, Any]]) + readings: dict[str, Reading] = field(default_factory=dict[str, Reading]) + + async def read(self, address: str) -> Reading: + if address not in self.readings: + raise ControlNotConnectedError(address) + return self.readings[address] + + async def write( + self, + address: str, + value: int | float | bool | str | tuple[Any, ...], + *, + wait: bool = True, + timeout_s: float = 30.0, + ) -> None: + _ = (wait, timeout_s) + self.writes.append((address, value)) + + def subscribe(self, address: str) -> AsyncIterator[Reading]: # pragma: no cover - unused + raise NotImplementedError + + +def _conductor(port: _RecordingControlPort, appender: _FakeAppendStep) -> Conductor: + return Conductor( + control_port=port, # type: ignore[arg-type] + append_step=appender, # type: ignore[arg-type] + clock=FakeClock(_FIXED_NOW), + id_generator=_LenientIds(), # type: ignore[arg-type] + ) + + +def _good_reading(value: Any) -> Reading: + return Reading(value=value, kind="Scalar", quality="Good", sampled_at=_FIXED_NOW) + + +def _pin_and_parse(steps: tuple[Step, ...]) -> tuple[Step, ...]: + """Serialize steps the way conduct pins them, then parse back (the + ResolvedStepsRecorded round-trip a real resume performs).""" + steps_wire = tuple(step_to_payload(s) for s in steps) + return steps_from_payload(steps_wire) + + +# --- headline acceptance: byte-for-byte replay of the pinned tail ---------- + + +@pytest.mark.unit +async def test_execute_from_replays_pinned_tail_byte_for_byte() -> None: + """Two setpoints pinned on the step list re-drive byte-for-byte on resume.""" + original = ( + SetpointStep(address="2bma:rot:val", value=45.0), + SetpointStep(address="2bma:cam:exposure", value=0.025), + ) + steps = _pin_and_parse(original) + assert steps == original # the pinned step list round-trips to the same Steps + + port = _RecordingControlPort() + appender = _FakeAppendStep() + result = await _conductor(port, appender).execute_from( + 
procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=steps, + boundary=0, + ) + + assert result.succeeded is True + assert result.completed_count == 2 + # Byte-for-byte: the replayed writes equal the pinned step list's setpoints. + assert port.writes == [("2bma:rot:val", 45.0), ("2bma:cam:exposure", 0.025)] + + +@pytest.mark.unit +async def test_execute_from_boundary_skips_the_prefix() -> None: + """boundary=K re-drives only steps[K:]; the prefix is not re-driven.""" + steps = _pin_and_parse( + ( + SetpointStep(address="2bma:a", value=1.0), + SetpointStep(address="2bma:b", value=2.0), + SetpointStep(address="2bma:c", value=3.0), + ) + ) + port = _RecordingControlPort() + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=steps, + boundary=1, + ) + assert result.completed_count == 2 + assert port.writes == [("2bma:b", 2.0), ("2bma:c", 3.0)] # 2bma:a (prefix) untouched + + +@pytest.mark.unit +async def test_execute_from_records_marker_and_outcome_with_absolute_index() -> None: + """A re-driven setpoint records the in-flight marker + ok outcome, each + carrying its ABSOLUTE position in the step list (so the replayed journal lines up).""" + steps = _pin_and_parse( + ( + SetpointStep(address="2bma:a", value=1.0), + SetpointStep(address="2bma:b", value=2.0), + ) + ) + appender = _FakeAppendStep() + await _conductor(_RecordingControlPort(), appender).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=steps, + boundary=1, + ) + payloads = [c.entries[0].payload for c in appender.calls] + # Only the boundary step (index 1) replayed: marker then outcome, both index 1.
+ assert [(p["step_index"], p["result"]) for p in payloads] == [(1, "in_flight"), (1, "ok")] + + +@pytest.mark.unit +async def test_execute_from_on_action_requires_operator_decision() -> None: + """An acquisition (ActionStep) is NOT re-run: resume halts for an operator + decision; the action and everything after it are untouched.""" + steps = _pin_and_parse( + ( + SetpointStep(address="2bma:a", value=1.0), + ActionStep(name="collect", params={"dwell": 0.1}), + SetpointStep(address="2bma:c", value=3.0), + ) + ) + port = _RecordingControlPort() + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=steps, + boundary=0, + ) + assert result.succeeded is False + assert result.completed_count == 1 # the leading setpoint re-driven + assert result.failure is not None + assert result.failure.step_index == 1 + assert result.failure.source_kind == "action" + assert result.failure.target == "collect" + assert result.failure.error_class == "AcquisitionResumeRequiresOperator" + # The action did not run and the trailing setpoint was never reached. 
+ assert port.writes == [("2bma:a", 1.0)] + + +@pytest.mark.unit +async def test_execute_from_reruns_check_fresh() -> None: + """A check in the tail is re-run as a fresh gate (read + evaluate).""" + steps = _pin_and_parse( + (CheckStep(address="2bma:rbv", criterion=EqualsCriterion(expected=45.0)),) + ) + port = _RecordingControlPort(readings={"2bma:rbv": _good_reading(45.0)}) + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=steps, + boundary=0, + ) + assert result.succeeded is True + assert result.completed_count == 1 + + +@pytest.mark.unit +async def test_execute_from_check_mismatch_on_rerun_halts() -> None: + """A re-run check whose criterion no longer matches halts the resume.""" + steps = _pin_and_parse( + (CheckStep(address="2bma:rbv", criterion=EqualsCriterion(expected=45.0)),) + ) + port = _RecordingControlPort(readings={"2bma:rbv": _good_reading(12.5)}) + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=steps, + boundary=0, + ) + assert result.succeeded is False + assert result.failure is not None + assert result.failure.error_class == "CheckFailedError" + assert result.failure.source_kind == "check" + + +@pytest.mark.unit +async def test_execute_from_boundary_past_end_is_a_no_op() -> None: + """Boundary >= len(steps) replays an empty tail (a no-op resume).""" + steps = _pin_and_parse((SetpointStep(address="2bma:a", value=1.0),)) + port = _RecordingControlPort() + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=steps, + boundary=5, + ) + assert result.succeeded is True + assert result.completed_count == 0 + assert port.writes == [] + + +@pytest.mark.unit +async def test_execute_from_rejects_negative_boundary() -> None: + with pytest.raises(ValueError, 
match="boundary must be >= 0"): + await _conductor(_RecordingControlPort(), _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=(), + boundary=-1, + ) + + +@pytest.mark.unit +async def test_execute_from_explicit_re_establish_policy_is_the_default() -> None: + """Passing the only policy member behaves identically to the default.""" + steps = _pin_and_parse((SetpointStep(address="2bma:a", value=1.0),)) + port = _RecordingControlPort() + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=steps, + boundary=0, + policy=ResumePolicy.RE_ESTABLISH, + ) + assert result.succeeded is True + assert port.writes == [("2bma:a", 1.0)] + + +# --- steps_from_payload is the exact inverse of step_to_payload ----------- + + +@pytest.mark.unit +@pytest.mark.parametrize( + "step", + [ + SetpointStep(address="2bma:rot", value=12.5, verify=True), + SetpointStep(address="2bma:energy", value=(1, 2, 3)), + ActionStep(name="collect", params={"dwell": 0.1, "detector": "2bma:cam1"}), + CheckStep(address="2bma:shutter", criterion=EqualsCriterion(expected="Open")), + CheckStep(address="2bma:idx", criterion=EqualsCriterion(expected=(1, 2))), + CheckStep( + address="2bma:temp", + criterion=WithinToleranceCriterion(expected=100.0, tolerance=0.5), + ), + ], +) +def test_steps_from_payload_round_trips_step_to_payload(step: Step) -> None: + assert steps_from_payload((step_to_payload(step),)) == (step,) + + +@pytest.mark.unit +def test_steps_from_payload_rejects_unknown_kind() -> None: + with pytest.raises(ValueError, match="unknown step kind"): + steps_from_payload(({"kind": "bogus"},)) + + +@pytest.mark.unit +def test_steps_from_payload_rejects_unknown_criterion_kind() -> None: + bad: Mapping[str, Any] = { + "kind": "check", + "address": "x", + "criterion": {"kind": "bogus"}, + } + with pytest.raises(ValueError, match="unknown criterion kind"): + 
steps_from_payload((bad,)) diff --git a/apps/api/tests/unit/operation/test_continuous_action_body.py b/apps/api/tests/unit/operation/test_continuous_action_body.py index 9a913443f1..0055f47df0 100644 --- a/apps/api/tests/unit/operation/test_continuous_action_body.py +++ b/apps/api/tests/unit/operation/test_continuous_action_body.py @@ -406,7 +406,7 @@ async def test_conductor_executes_continuous_action_and_records_step_entry() -> control_port=port, append_step=appender, clock=FakeClock(_FIXED_NOW), - id_generator=_SequenceIdGenerator([uuid4()]), + id_generator=_SequenceIdGenerator([uuid4(), uuid4()]), action_registry=registry, ) result = await conductor.execute( @@ -432,7 +432,9 @@ async def test_conductor_executes_continuous_action_and_records_step_entry() -> ) assert result.succeeded is True assert result.completed_count == 1 - entry = appender.calls[0].command.entries[0] + # calls[0] is the pre-effect in-flight marker; calls[1] is the outcome. + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + entry = appender.calls[1].command.entries[0] assert entry.step_kind == "action" assert entry.payload["name"] == "continuous" assert entry.payload["result"] == "ok" diff --git a/apps/api/tests/unit/operation/test_discrete_action_body.py b/apps/api/tests/unit/operation/test_discrete_action_body.py index 0e2c9a43de..7397ed8f72 100644 --- a/apps/api/tests/unit/operation/test_discrete_action_body.py +++ b/apps/api/tests/unit/operation/test_discrete_action_body.py @@ -371,7 +371,7 @@ async def test_conductor_executes_discrete_action_and_records_step_entry() -> No control_port=port, append_step=appender, clock=FakeClock(_FIXED_NOW), - id_generator=_SequenceIdGenerator([uuid4()]), + id_generator=_SequenceIdGenerator([uuid4(), uuid4()]), action_registry=registry, ) result = await conductor.execute( @@ -393,7 +393,9 @@ async def test_conductor_executes_discrete_action_and_records_step_entry() -> No ) assert result.succeeded is True assert 
result.completed_count == 1 - entry = appender.calls[0].command.entries[0] + # calls[0] is the pre-effect in-flight marker; calls[1] is the outcome. + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + entry = appender.calls[1].command.entries[0] assert entry.step_kind == "action" assert entry.payload["name"] == "discrete" assert entry.payload["result"] == "ok" diff --git a/apps/api/tests/unit/operation/test_end_iteration_decider.py b/apps/api/tests/unit/operation/test_end_iteration_decider.py index 1928b6c4c0..6c141be807 100644 --- a/apps/api/tests/unit/operation/test_end_iteration_decider.py +++ b/apps/api/tests/unit/operation/test_end_iteration_decider.py @@ -65,6 +65,23 @@ def test_decide_emits_iteration_ended_with_verdict() -> None: assert event.occurred_at == _NOW +@pytest.mark.unit +def test_decide_emits_iteration_ended_when_held() -> None: + """Resumable conduct: an iteration left open when the conduct was paused + can still be closed while Held (start_iteration stays Running-only).""" + proc = _procedure(status=ProcedureStatus.HELD) # iteration 1 open, paused + events = end_iteration.decide( + state=proc, + command=EndProcedureIteration( + procedure_id=proc.id, iteration_index=1, converged=False, reason=None + ), + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureIterationEnded) + assert events[0].iteration_index == 1 + + @pytest.mark.unit def test_decide_passes_none_verdict_and_none_reason() -> None: proc = _procedure() diff --git a/apps/api/tests/unit/operation/test_end_iteration_decider_properties.py b/apps/api/tests/unit/operation/test_end_iteration_decider_properties.py index 5bfd1dbe99..77217ede46 100644 --- a/apps/api/tests/unit/operation/test_end_iteration_decider_properties.py +++ b/apps/api/tests/unit/operation/test_end_iteration_decider_properties.py @@ -2,10 +2,10 @@ Universal claims across generated inputs: - - Running + open iteration + matching index emits exactly one + - Running or 
Held + open iteration + matching index emits exactly one ProcedureIterationEnded carrying the verdict/reason verbatim and now. - state=None always raises ProcedureNotFoundError. - - A non-Running status always raises ProcedureCannotEndIterationError. + - A disallowed status (not Running/Held) always raises ProcedureCannotEndIterationError. - No open iteration always raises ProcedureCannotEndIterationError. - A mismatched index always raises ProcedureCannotEndIterationError. - Pure: same (state, command, now) returns the same events. @@ -36,7 +36,7 @@ from datetime import datetime from uuid import UUID -_NON_RUNNING = st.sampled_from( +_DISALLOWED_STATUSES = st.sampled_from( [ ProcedureStatus.DEFINED, ProcedureStatus.COMPLETED, @@ -74,6 +74,7 @@ def _procedure( @pytest.mark.unit @given( procedure_id=st.uuids(), + status=st.sampled_from([ProcedureStatus.RUNNING, ProcedureStatus.HELD]), open_index=st.integers(min_value=1, max_value=500), converged=_CONVERGED, reason=_REASON, @@ -81,12 +82,18 @@ def _procedure( ) def test_end_iteration_emits_single_event_carrying_verdict( procedure_id: UUID, + status: ProcedureStatus, open_index: int, converged: bool | None, reason: str | None, now: datetime, ) -> None: - state = _procedure(procedure_id, iteration_count=open_index, current_iteration_index=open_index) + state = _procedure( + procedure_id, + status=status, + iteration_count=open_index, + current_iteration_index=open_index, + ) events = end_iteration.decide( state=state, command=EndProcedureIteration( @@ -127,11 +134,11 @@ def test_end_iteration_on_none_state_always_raises_not_found( @pytest.mark.unit @given( procedure_id=st.uuids(), - status=_NON_RUNNING, + status=_DISALLOWED_STATUSES, open_index=st.integers(min_value=1, max_value=100), now=aware_datetimes(), ) -def test_end_iteration_on_non_running_always_raises( +def test_end_iteration_on_disallowed_status_always_raises( procedure_id: UUID, status: ProcedureStatus, open_index: int, now: datetime ) -> None: state = 
_procedure( diff --git a/apps/api/tests/unit/operation/test_hold_procedure_decider.py b/apps/api/tests/unit/operation/test_hold_procedure_decider.py new file mode 100644 index 0000000000..bd89465080 --- /dev/null +++ b/apps/api/tests/unit/operation/test_hold_procedure_decider.py @@ -0,0 +1,150 @@ +"""Pure-decider tests for `hold_procedure` slice. + +Single-source pause transition: `Running -> Held`. Reason field +validated via `ProcedureHoldReason` VO (1-500 chars after trim). +Mirrors `hold_run`; the state name is `Held` (Procedure is an +execution-FSM sibling of Run), with a REQUIRED reason (unlike +slim `RunHeld`). +""" + +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.operation.aggregates.procedure import ( + InvalidProcedureHoldReasonError, + Procedure, + ProcedureCannotHoldError, + ProcedureHeld, + ProcedureName, + ProcedureNotFoundError, + ProcedureStatus, +) +from cora.operation.features import hold_procedure +from cora.operation.features.hold_procedure import HoldProcedure +from cora.shared.text_bounds import REASON_MAX_LENGTH + +_NOW = datetime(2026, 5, 15, 12, 0, 0, tzinfo=UTC) + + +def _procedure( + *, + procedure_id: UUID | None = None, + status: ProcedureStatus = ProcedureStatus.RUNNING, +) -> Procedure: + return Procedure( + id=procedure_id or uuid4(), + name=ProcedureName("X"), + kind="bakeout", + target_asset_ids=frozenset(), + status=status, + parent_run_id=None, + ) + + +@pytest.mark.unit +def test_decide_emits_procedure_held_when_running() -> None: + proc = _procedure() + events = hold_procedure.decide( + state=proc, + command=HoldProcedure(procedure_id=proc.id, reason="beam dropped"), + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureHeld) + assert events[0].procedure_id == proc.id + assert events[0].reason == "beam dropped" + assert events[0].occurred_at == _NOW + assert events[0].decided_by_decision_id is None + + +@pytest.mark.unit +def 
test_decide_threads_decided_by_decision_id() -> None: + proc = _procedure() + decision_id = uuid4() + events = hold_procedure.decide( + state=proc, + command=HoldProcedure( + procedure_id=proc.id, reason="autonomous hold", decided_by_decision_id=decision_id + ), + now=_NOW, + ) + assert events[0].decided_by_decision_id == decision_id + + +@pytest.mark.unit +def test_decide_trims_reason_via_vo() -> None: + proc = _procedure() + events = hold_procedure.decide( + state=proc, + command=HoldProcedure(procedure_id=proc.id, reason=" investigating fault "), + now=_NOW, + ) + assert events[0].reason == "investigating fault" + + +@pytest.mark.unit +def test_decide_rejects_when_state_is_none() -> None: + pid = uuid4() + with pytest.raises(ProcedureNotFoundError) as exc: + hold_procedure.decide( + state=None, + command=HoldProcedure(procedure_id=pid, reason="x"), + now=_NOW, + ) + assert exc.value.procedure_id == pid + + +@pytest.mark.unit +def test_decide_rejects_whitespace_only_reason() -> None: + proc = _procedure() + with pytest.raises(InvalidProcedureHoldReasonError): + hold_procedure.decide( + state=proc, + command=HoldProcedure(procedure_id=proc.id, reason=" "), + now=_NOW, + ) + + +@pytest.mark.unit +def test_decide_rejects_too_long_reason() -> None: + proc = _procedure() + with pytest.raises(InvalidProcedureHoldReasonError): + hold_procedure.decide( + state=proc, + command=HoldProcedure(procedure_id=proc.id, reason="x" * (REASON_MAX_LENGTH + 1)), + now=_NOW, + ) + + +@pytest.mark.unit +@pytest.mark.parametrize( + "status", + [ + ProcedureStatus.DEFINED, + ProcedureStatus.HELD, + ProcedureStatus.COMPLETED, + ProcedureStatus.ABORTED, + ProcedureStatus.TRUNCATED, + ], +) +def test_decide_rejects_non_running_status(status: ProcedureStatus) -> None: + """Holding a non-Running procedure raises (re-holding a Held one too).""" + proc = _procedure(status=status) + with pytest.raises(ProcedureCannotHoldError) as exc: + hold_procedure.decide( + state=proc, + 
command=HoldProcedure(procedure_id=proc.id, reason="x"), + now=_NOW, + ) + assert exc.value.current_status is status + + +@pytest.mark.unit +def test_decide_is_pure_same_inputs_same_outputs() -> None: + proc = _procedure() + cmd = HoldProcedure(procedure_id=proc.id, reason="break") + first = hold_procedure.decide(state=proc, command=cmd, now=_NOW) + second = hold_procedure.decide(state=proc, command=cmd, now=_NOW) + assert first == second diff --git a/apps/api/tests/unit/operation/test_hold_procedure_decider_properties.py b/apps/api/tests/unit/operation/test_hold_procedure_decider_properties.py new file mode 100644 index 0000000000..45a8d726e1 --- /dev/null +++ b/apps/api/tests/unit/operation/test_hold_procedure_decider_properties.py @@ -0,0 +1,167 @@ +"""Property-based tests for `hold_procedure.decide` (Operation BC). + +Complements the example-based `test_hold_procedure_decider.py` with +universal claims across generated inputs. The decider is a pure +single-source pause transition with a reason: + + (state, command, now) -> list[ProcedureHeld] + +Load-bearing properties: + + - state=None always raises `ProcedureNotFoundError` carrying + command.procedure_id. + - The source-state partition is total over `ProcedureStatus`: the + sole source `{Running}` emits exactly one `ProcedureHeld` + (procedure_id=state.id, reason threaded, occurred_at=now); every + other status raises `ProcedureCannotHoldError` carrying the current + status. (Adding a new status auto-extends `_DISALLOWED_SOURCES`.) + - The emitted event's procedure_id is `state.id`, never + command.procedure_id. + - Pure: same (state, command, now) returns equal events. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from hypothesis import assume, given +from hypothesis import strategies as st + +from cora.operation.aggregates.procedure import ( + Procedure, + ProcedureCannotHoldError, + ProcedureHeld, + ProcedureName, + ProcedureNotFoundError, + ProcedureStatus, +) +from cora.operation.features import hold_procedure +from cora.operation.features.hold_procedure import HoldProcedure +from tests._strategies import aware_datetimes, printable_ascii_text + +if TYPE_CHECKING: + from datetime import datetime + from uuid import UUID + +_REASON = printable_ascii_text(min_size=1, max_size=500) + +_HOLDABLE_SOURCES = (ProcedureStatus.RUNNING,) +_DISALLOWED_SOURCES = tuple(s for s in ProcedureStatus if s not in frozenset(_HOLDABLE_SOURCES)) + + +def _procedure(*, procedure_id: UUID, status: ProcedureStatus) -> Procedure: + return Procedure( + id=procedure_id, + name=ProcedureName("X"), + kind="bakeout", + target_asset_ids=frozenset(), + status=status, + parent_run_id=None, + ) + + +@pytest.mark.unit +@given(procedure_id=st.uuids(), reason=_REASON, now=aware_datetimes()) +def test_hold_with_none_state_always_raises_not_found( + procedure_id: UUID, + reason: str, + now: datetime, +) -> None: + """Empty stream always raises `ProcedureNotFoundError` carrying command id.""" + with pytest.raises(ProcedureNotFoundError) as exc: + hold_procedure.decide( + state=None, + command=HoldProcedure(procedure_id=procedure_id, reason=reason), + now=now, + ) + assert exc.value.procedure_id == procedure_id + + +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + source=st.sampled_from(_HOLDABLE_SOURCES), + reason=_REASON, + now=aware_datetimes(), +) +def test_hold_from_permitted_source_emits_single_event( + procedure_id: UUID, + source: ProcedureStatus, + reason: str, + now: datetime, +) -> None: + """Running emits one ProcedureHeld with the threaded reason.""" + events = hold_procedure.decide( + 
state=_procedure(procedure_id=procedure_id, status=source), + command=HoldProcedure(procedure_id=procedure_id, reason=reason), + now=now, + ) + assert events == [ProcedureHeld(procedure_id=procedure_id, reason=reason, occurred_at=now)] + + +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + source=st.sampled_from(_DISALLOWED_SOURCES), + reason=_REASON, + now=aware_datetimes(), +) +def test_hold_from_disallowed_source_always_raises_cannot_hold( + procedure_id: UUID, + source: ProcedureStatus, + reason: str, + now: datetime, +) -> None: + """Any non-Running source raises ProcedureCannotHoldError carrying the status. + + A valid reason is supplied so the source-state guard is what fires + (reason validation runs first in the decider). + """ + with pytest.raises(ProcedureCannotHoldError) as exc: + hold_procedure.decide( + state=_procedure(procedure_id=procedure_id, status=source), + command=HoldProcedure(procedure_id=procedure_id, reason=reason), + now=now, + ) + assert exc.value.current_status is source + + +@pytest.mark.unit +@given( + state_procedure_id=st.uuids(), + command_procedure_id=st.uuids(), + source=st.sampled_from(_HOLDABLE_SOURCES), + reason=_REASON, + now=aware_datetimes(), +) +def test_hold_uses_state_id_not_command_procedure_id( + state_procedure_id: UUID, + command_procedure_id: UUID, + source: ProcedureStatus, + reason: str, + now: datetime, +) -> None: + """The emitted event's procedure_id is state.id, not command.procedure_id.""" + assume(state_procedure_id != command_procedure_id) + events = hold_procedure.decide( + state=_procedure(procedure_id=state_procedure_id, status=source), + command=HoldProcedure(procedure_id=command_procedure_id, reason=reason), + now=now, + ) + assert events[0].procedure_id == state_procedure_id + + +@pytest.mark.unit +@given(procedure_id=st.uuids(), reason=_REASON, now=aware_datetimes()) +def test_hold_is_pure_same_input_same_output( + procedure_id: UUID, + reason: str, + now: datetime, +) -> None: + """Two calls 
with identical args return equal events (no clock leakage).""" + state = _procedure(procedure_id=procedure_id, status=ProcedureStatus.RUNNING) + command = HoldProcedure(procedure_id=procedure_id, reason=reason) + first = hold_procedure.decide(state=state, command=command, now=now) + second = hold_procedure.decide(state=state, command=command, now=now) + assert first == second diff --git a/apps/api/tests/unit/operation/test_hold_procedure_handler.py b/apps/api/tests/unit/operation/test_hold_procedure_handler.py new file mode 100644 index 0000000000..1378ea1025 --- /dev/null +++ b/apps/api/tests/unit/operation/test_hold_procedure_handler.py @@ -0,0 +1,127 @@ +"""Application-handler tests for `hold_procedure` slice. + +Update-style handler via `make_procedure_update_handler`. The reason is +captured on the emitted `ProcedureHeld` payload but NOT logged at the +handler boundary (mirrors abort_procedure precedent). +""" + +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.infrastructure.adapters.in_memory_event_store import InMemoryEventStore +from cora.operation.aggregates.procedure import ( + InvalidProcedureHoldReasonError, + ProcedureCannotHoldError, + ProcedureNotFoundError, +) +from cora.operation.errors import UnauthorizedError +from cora.operation.features import hold_procedure +from cora.operation.features.hold_procedure import HoldProcedure +from tests.unit._helpers import build_deps as _build_deps_shared +from tests.unit.operation._helpers import seed_running_procedure + +_NOW = datetime(2026, 5, 15, 12, 0, 0, tzinfo=UTC) +_PRIOR = datetime(2026, 5, 15, 11, 0, 0, tzinfo=UTC) +_PROCEDURE_ID = UUID("01900000-0000-7000-8000-0000000c0e01") +_EVENT_ID = UUID("01900000-0000-7000-8000-0000000c0e02") +_PRINCIPAL_ID = UUID("01900000-0000-7000-8000-000000000099") +_CORRELATION_ID = UUID("01900000-0000-7000-8000-0000000000aa") + + +async def _seed_running_procedure(store: InMemoryEventStore) -> None: + await 
seed_running_procedure( + store, + procedure_id=_PROCEDURE_ID, + when=_PRIOR, + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + + +@pytest.mark.unit +async def test_handler_appends_procedure_held_event_with_trimmed_reason() -> None: + store = InMemoryEventStore() + await _seed_running_procedure(store) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + handler = hold_procedure.bind(deps) + + await handler( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason=" beam dropped "), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + events, version = await store.load("Procedure", _PROCEDURE_ID) + assert version == 3 + assert events[2].event_type == "ProcedureHeld" + assert events[2].payload == { + "procedure_id": str(_PROCEDURE_ID), + "reason": "beam dropped", + "decided_by_decision_id": None, + "occurred_at": _NOW.isoformat(), + # Operator hold (no conduct observer) leaves actuation_kind None. + "actuation_kind": None, + } + + +@pytest.mark.unit +async def test_handler_raises_when_procedure_not_found() -> None: + store = InMemoryEventStore() + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + handler = hold_procedure.bind(deps) + with pytest.raises(ProcedureNotFoundError): + await handler( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason="x"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_handler_raises_cannot_hold_when_re_holding() -> None: + """Strict-not-idempotent: re-holding a Held procedure raises.""" + store = InMemoryEventStore() + await _seed_running_procedure(store) + await hold_procedure.bind(_build_deps_shared(ids=[uuid4()], now=_NOW, event_store=store))( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason="first"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + with pytest.raises(ProcedureCannotHoldError): + await hold_procedure.bind(_build_deps_shared(ids=[uuid4()], now=_NOW, 
event_store=store))( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason="second"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_handler_raises_invalid_reason_for_whitespace_only() -> None: + store = InMemoryEventStore() + await _seed_running_procedure(store) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + handler = hold_procedure.bind(deps) + with pytest.raises(InvalidProcedureHoldReasonError): + await handler( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason=" "), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_handler_raises_unauthorized_on_deny() -> None: + store = InMemoryEventStore() + await _seed_running_procedure(store) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store, deny=True) + handler = hold_procedure.bind(deps) + with pytest.raises(UnauthorizedError): + await handler( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason="r"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + _, version = await store.load("Procedure", _PROCEDURE_ID) + assert version == 2 diff --git a/apps/api/tests/unit/operation/test_procedure.py b/apps/api/tests/unit/operation/test_procedure.py index e5f592b1c2..c2f95b7816 100644 --- a/apps/api/tests/unit/operation/test_procedure.py +++ b/apps/api/tests/unit/operation/test_procedure.py @@ -68,18 +68,22 @@ def test_procedure_name_rejects_over_max_length() -> None: @pytest.mark.unit def test_procedure_status_values_locked() -> None: - """Pin the 5-state FSM values; future additions must be a deliberate test edit. + """Pin the 6-state FSM values; future additions must be a deliberate test edit. 
The FSM was REVISED from BC map's `Idle/Starting/Running/Verifying/Complete/Aborted` per standards-corpus research at [[project_operation_design]]: Verifying is NOT - standards-blessed at FSM level; transient states deferred per Run BC precedent.""" + standards-blessed at FSM level; transient states deferred per Run BC precedent. + `Held` lands in Tier 1 of [[project_resumable_conduct_design]] (operator-pause of + a halted conduct; mirrors RunStatus.HELD).""" assert ProcedureStatus.DEFINED.value == "Defined" assert ProcedureStatus.RUNNING.value == "Running" + assert ProcedureStatus.HELD.value == "Held" assert ProcedureStatus.COMPLETED.value == "Completed" assert ProcedureStatus.ABORTED.value == "Aborted" assert ProcedureStatus.TRUNCATED.value == "Truncated" assert {s.value for s in ProcedureStatus} == { "Defined", "Running", + "Held", "Completed", "Aborted", "Truncated", @@ -90,7 +94,8 @@ def test_procedure_status_values_locked() -> None: def test_procedure_status_is_terminal_partitions_the_fsm() -> None: """is_terminal is True exactly for the terminal states. Pinned so a new state must consciously classify itself (consumers like register_dataset's - terminal-producing-Procedure guard rely on this).""" + terminal-producing-Procedure guard rely on this). 
`Held` is a pause-state, + NOT terminal.""" terminal = {s for s in ProcedureStatus if s.is_terminal} assert terminal == { ProcedureStatus.COMPLETED, @@ -99,6 +104,7 @@ def test_procedure_status_is_terminal_partitions_the_fsm() -> None: } assert not ProcedureStatus.DEFINED.is_terminal assert not ProcedureStatus.RUNNING.is_terminal + assert not ProcedureStatus.HELD.is_terminal # ---------- Error class shapes ---------- diff --git a/apps/api/tests/unit/operation/test_procedure_events.py b/apps/api/tests/unit/operation/test_procedure_events.py index 3bd8bd0670..9e93702c17 100644 --- a/apps/api/tests/unit/operation/test_procedure_events.py +++ b/apps/api/tests/unit/operation/test_procedure_events.py @@ -11,9 +11,11 @@ ProcedureAborted, ProcedureActivitiesLogbookOpened, ProcedureCompleted, + ProcedureHeld, ProcedureIterationEnded, ProcedureIterationStarted, ProcedureRegistered, + ProcedureResumed, ProcedureStarted, ProcedureTruncated, RecipeExpansionRecorded, @@ -728,6 +730,8 @@ def test_iteration_ended_round_trips(converged: bool | None, reason: str | None) "ProcedureCompleted", "ProcedureAborted", "ProcedureTruncated", + "ProcedureHeld", + "ProcedureResumed", "ProcedureActivitiesLogbookOpened", "ProcedureIterationStarted", "ProcedureIterationEnded", @@ -742,3 +746,90 @@ def test_from_stored_raises_on_malformed_payload(event_type: str) -> None: in the load path.""" with pytest.raises(ValueError, match=f"Malformed {event_type} payload"): from_stored(_stored(event_type, {})) + + +# --- ProcedureHeld / ProcedureResumed (resumable conduct, Tier 1) --- + + +@pytest.mark.unit +def test_event_type_names_for_hold_resume() -> None: + held = ProcedureHeld(procedure_id=uuid4(), reason="pause", occurred_at=_NOW) + resumed = ProcedureResumed(procedure_id=uuid4(), re_establishment_boundary=0, occurred_at=_NOW) + assert event_type_name(held) == "ProcedureHeld" + assert event_type_name(resumed) == "ProcedureResumed" + + +@pytest.mark.unit +def 
test_to_payload_serializes_procedure_held() -> None: + pid = uuid4() + decision_id = uuid4() + payload = to_payload( + ProcedureHeld( + procedure_id=pid, + reason="beam dropped", + decided_by_decision_id=decision_id, + occurred_at=_NOW, + actuation_kind="Simulated", + ) + ) + assert payload == { + "procedure_id": str(pid), + "reason": "beam dropped", + "decided_by_decision_id": str(decision_id), + "occurred_at": _NOW.isoformat(), + "actuation_kind": "Simulated", + } + + +@pytest.mark.unit +def test_to_payload_serializes_procedure_resumed_with_null_decision() -> None: + pid = uuid4() + payload = to_payload( + ProcedureResumed(procedure_id=pid, re_establishment_boundary=5, occurred_at=_NOW) + ) + assert payload == { + "procedure_id": str(pid), + "re_establishment_boundary": 5, + "decided_by_decision_id": None, + "occurred_at": _NOW.isoformat(), + } + + +@pytest.mark.unit +@pytest.mark.parametrize("decision_id", [None, uuid4()]) +def test_procedure_held_round_trips(decision_id: UUID | None) -> None: + event = ProcedureHeld( + procedure_id=uuid4(), + reason="investigating fault", + decided_by_decision_id=decision_id, + occurred_at=_NOW, + ) + rebuilt = from_stored(_stored("ProcedureHeld", to_payload(event))) + assert rebuilt == event + + +@pytest.mark.unit +@pytest.mark.parametrize("decision_id", [None, uuid4()]) +def test_procedure_resumed_round_trips(decision_id: UUID | None) -> None: + event = ProcedureResumed( + procedure_id=uuid4(), + re_establishment_boundary=3, + decided_by_decision_id=decision_id, + occurred_at=_NOW, + ) + rebuilt = from_stored(_stored("ProcedureResumed", to_payload(event))) + assert rebuilt == event + + +@pytest.mark.unit +def test_from_stored_held_without_decided_by_key_folds_to_none() -> None: + """Forward-compat: a pre-supervisor stream omits decided_by_decision_id.""" + pid = uuid4() + rebuilt = from_stored( + _stored( + "ProcedureHeld", + {"procedure_id": str(pid), "reason": "pause", "occurred_at": _NOW.isoformat()}, + ) + ) + assert 
isinstance(rebuilt, ProcedureHeld) + assert rebuilt.decided_by_decision_id is None diff --git a/apps/api/tests/unit/operation/test_procedure_evolver.py b/apps/api/tests/unit/operation/test_procedure_evolver.py index d54cc9fd75..b9745401b6 100644 --- a/apps/api/tests/unit/operation/test_procedure_evolver.py +++ b/apps/api/tests/unit/operation/test_procedure_evolver.py @@ -13,10 +13,12 @@ ProcedureActivitiesLogbookOpened, ProcedureCompleted, ProcedureEvent, + ProcedureHeld, ProcedureIterationEnded, ProcedureIterationStarted, ProcedureName, ProcedureRegistered, + ProcedureResumed, ProcedureStarted, ProcedureStatus, ProcedureTruncated, @@ -377,6 +379,126 @@ def test_evolve_procedure_aborted_on_empty_state_raises() -> None: evolve(None, ProcedureAborted(procedure_id=uuid4(), reason="x", occurred_at=_NOW)) +# --- ProcedureHeld / ProcedureResumed arms (resumable conduct, Tier 1) --- + + +def _to_running(prior: Procedure) -> Procedure: + return evolve(prior, ProcedureStarted(procedure_id=prior.id, occurred_at=_NOW)) + + +@pytest.mark.unit +def test_evolve_procedure_held_sets_status_to_held() -> None: + running = _to_running(_defined()) + state = evolve( + running, ProcedureHeld(procedure_id=running.id, reason="beam dropped", occurred_at=_NOW) + ) + assert state.status is ProcedureStatus.HELD + + +@pytest.mark.unit +def test_evolve_procedure_resumed_sets_status_to_running() -> None: + running = _to_running(_defined()) + held = evolve( + running, ProcedureHeld(procedure_id=running.id, reason="beam dropped", occurred_at=_NOW) + ) + state = evolve( + held, ProcedureResumed(procedure_id=held.id, re_establishment_boundary=2, occurred_at=_NOW) + ) + assert state.status is ProcedureStatus.RUNNING + + +@pytest.mark.unit +def test_evolve_procedure_held_preserves_iteration_denorms_and_actuation_kind() -> None: + """The load-bearing carry-forward: the Held arm must not wipe the + iteration denorms (the bug class the AST fitness guards).""" + running = 
_to_running(_defined(name="alignment", kind="alignment")) + # Open + close an iteration so the denorms are non-default. + started_iter = evolve( + running, + ProcedureIterationStarted(procedure_id=running.id, iteration_index=1, occurred_at=_NOW), + ) + ended_iter = evolve( + started_iter, + ProcedureIterationEnded( + procedure_id=running.id, + iteration_index=1, + converged=False, + reason=None, + occurred_at=_NOW, + ), + ) + state = evolve( + ended_iter, ProcedureHeld(procedure_id=running.id, reason="pause", occurred_at=_NOW) + ) + assert state.status is ProcedureStatus.HELD + assert state.iteration_count == ended_iter.iteration_count == 1 + assert state.current_iteration_index is None + assert ( + state.consecutive_unconverged_iterations + == ended_iter.consecutive_unconverged_iterations + == 1 + ) + assert state.kind == "alignment" + + +@pytest.mark.unit +def test_evolve_procedure_resumed_preserves_iteration_denorms() -> None: + running = _to_running(_defined()) + started_iter = evolve( + running, + ProcedureIterationStarted(procedure_id=running.id, iteration_index=1, occurred_at=_NOW), + ) + held = evolve( + started_iter, ProcedureHeld(procedure_id=running.id, reason="pause", occurred_at=_NOW) + ) + state = evolve( + held, ProcedureResumed(procedure_id=held.id, re_establishment_boundary=0, occurred_at=_NOW) + ) + assert state.status is ProcedureStatus.RUNNING + # An iteration left open across the hold stays open on resume. 
+ assert state.current_iteration_index == 1 + assert state.iteration_count == 1 + + +@pytest.mark.unit +def test_fold_hold_resume_cycle_lands_running() -> None: + pid = uuid4() + state = fold( + [ + ProcedureRegistered( + procedure_id=pid, + name="alignment", + kind="alignment", + target_asset_ids=(), + parent_run_id=None, + occurred_at=_NOW, + ), + ProcedureStarted(procedure_id=pid, occurred_at=_NOW), + ProcedureHeld(procedure_id=pid, reason="first pause", occurred_at=_NOW), + ProcedureResumed(procedure_id=pid, re_establishment_boundary=0, occurred_at=_NOW), + ProcedureHeld(procedure_id=pid, reason="second pause", occurred_at=_NOW), + ProcedureResumed(procedure_id=pid, re_establishment_boundary=3, occurred_at=_NOW), + ] + ) + assert state is not None + assert state.status is ProcedureStatus.RUNNING + + +@pytest.mark.unit +def test_evolve_procedure_held_on_empty_state_raises() -> None: + with pytest.raises(ValueError, match="ProcedureHeld"): + evolve(None, ProcedureHeld(procedure_id=uuid4(), reason="x", occurred_at=_NOW)) + + +@pytest.mark.unit +def test_evolve_procedure_resumed_on_empty_state_raises() -> None: + with pytest.raises(ValueError, match="ProcedureResumed"): + evolve( + None, + ProcedureResumed(procedure_id=uuid4(), re_establishment_boundary=0, occurred_at=_NOW), + ) + + # --- ProcedureActivitiesLogbookOpened arm --- diff --git a/apps/api/tests/unit/operation/test_procedure_summary_projection.py b/apps/api/tests/unit/operation/test_procedure_summary_projection.py index 49f995ec3d..884b83b9ea 100644 --- a/apps/api/tests/unit/operation/test_procedure_summary_projection.py +++ b/apps/api/tests/unit/operation/test_procedure_summary_projection.py @@ -49,6 +49,8 @@ def test_projection_metadata() -> None: "ProcedureCompleted", "ProcedureAborted", "ProcedureTruncated", + "ProcedureHeld", + "ProcedureResumed", "ProcedureActivitiesLogbookOpened", "ProcedureIterationStarted", } @@ -70,6 +72,17 @@ def test_projection_does_not_subscribe_to_iteration_ended() -> None: 
assert "ProcedureIterationEnded" not in proj.subscribed_event_types +@pytest.mark.unit +def test_projection_subscribes_to_hold_resume() -> None: + """Resumable conduct now surfaces Held in the read model: migration + 20260621060000 widened the `status` CHECK to admit 'Held', so the + projection folds ProcedureHeld -> status='Held' and ProcedureResumed -> + status='Running'. See [[project_resumable_conduct_design]].""" + proj = ProcedureSummaryProjection() + assert "ProcedureHeld" in proj.subscribed_event_types + assert "ProcedureResumed" in proj.subscribed_event_types + + @pytest.mark.unit async def test_procedure_registered_inserts_with_defined_status_and_null_audit() -> None: proj = ProcedureSummaryProjection() @@ -122,11 +135,12 @@ async def test_procedure_registered_handles_null_parent_run() -> None: assert args[5] is None -# NOTE: the 4 status-change UPDATE arms (Started/Completed/Aborted/Truncated) -# use literal status strings in SQL today (per-event SQL constants in the -# projection). When `_UPDATE_STATUS_SQL` parameterized hoist lands (trigger: -# 5th status-change arm), flip these substring assertions to `"SET status = $5"` -# in lockstep with the projection refactor. +# NOTE: the 6 status-change UPDATE arms (Started/Completed/Aborted/Truncated/ +# Held/Resumed) use literal status strings in SQL via per-event SQL constants. +# The old "hoist to a parameterized `_UPDATE_STATUS_SQL` at the 5th arm" plan +# was re-evaluated when Held/Resumed landed and dropped: the arms are NOT +# uniform (Truncated also sets interrupted_at, Resumed CLEARS the reason), so a +# single parameterized SQL reads worse. These substring assertions stay. 
@pytest.mark.unit @@ -213,6 +227,46 @@ async def test_procedure_truncated_handles_null_interrupted_at() -> None: assert conn.execute.call_args.args[4] is None +@pytest.mark.unit +async def test_procedure_held_updates_status_and_reason() -> None: + proj = ProcedureSummaryProjection() + conn = AsyncMock() + event = _stored( + "ProcedureHeld", + { + "procedure_id": str(_PROCEDURE_ID), + "reason": "beam dropped", + "occurred_at": _NOW.isoformat(), + }, + ) + await proj.apply(event, conn) + sql = conn.execute.call_args.args[0] + assert "SET status = 'Held'" in sql + assert conn.execute.call_args.args[1] == _PROCEDURE_ID + assert conn.execute.call_args.args[2] == _NOW + assert conn.execute.call_args.args[3] == "beam dropped" + + +@pytest.mark.unit +async def test_procedure_resumed_updates_status_to_running_and_clears_reason() -> None: + proj = ProcedureSummaryProjection() + conn = AsyncMock() + event = _stored( + "ProcedureResumed", + { + "procedure_id": str(_PROCEDURE_ID), + "re_establishment_boundary": 0, + "occurred_at": _NOW.isoformat(), + }, + ) + await proj.apply(event, conn) + sql = conn.execute.call_args.args[0] + assert "SET status = 'Running'" in sql + assert "last_status_reason = NULL" in sql + assert conn.execute.call_args.args[1] == _PROCEDURE_ID + assert conn.execute.call_args.args[2] == _NOW + + @pytest.mark.unit async def test_procedure_steps_logbook_opened_updates_logbook_id() -> None: proj = ProcedureSummaryProjection() diff --git a/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py b/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py new file mode 100644 index 0000000000..bd7a7f9b07 --- /dev/null +++ b/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py @@ -0,0 +1,438 @@ +"""Application-handler tests for `reconduct_procedure` (resume + replay). + +Orchestration handler composing `resume_procedure` + `Conductor.execute_from` ++ complete/abort. 
Pins the three-way terminal contract and the guards: + + - clean tail -> resume + auto-complete (Completed) + - acquisition halt -> resume, NO complete/abort, stays Running, halt in result + - genuine step failure -> resume + abort (Aborted) + - missing pinned resolved steps -> ResolvedStepsRecordNotFoundError + - not Held / parent Run Held -> ProcedureCannotResumeError (no replay) + - authz deny -> UnauthorizedError +""" + +from collections.abc import Sequence +from dataclasses import dataclass +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.infrastructure.adapters.in_memory_event_store import InMemoryEventStore +from cora.infrastructure.event_envelope import to_new_event +from cora.infrastructure.kernel import Kernel +from cora.operation.adapters.control_port_registry import ControlPortRegistry +from cora.operation.adapters.in_memory_control_port import InMemoryControlPort +from cora.operation.aggregates.procedure import ( + InMemoryActivityStore, + InvalidProcedureReEstablishmentBoundaryError, + ProcedureCannotResumeError, + ProcedureHeld, + ProcedureNotFoundError, + ProcedureRegistered, + ProcedureStarted, + ProcedureStatus, + ResolvedStepsRecorded, + ResolvedStepsRecordNotFoundError, + event_type_name, + load_procedure, + to_payload, +) +from cora.operation.conductor import ActionStep, Conductor, SetpointStep, Step, step_to_payload +from cora.operation.errors import UnauthorizedError +from cora.operation.features import ( + abort_procedure, + append_activities, + complete_procedure, + reconduct_procedure, + resume_procedure, +) +from cora.operation.features.reconduct_procedure import ( + Handler as ReconductHandler, +) +from cora.operation.features.reconduct_procedure import ( + ReconductProcedure, + ReconductProcedureResult, +) +from cora.operation.ports.control_port import ActuationKind, ControlPort +from cora.run.aggregates.run import RunHeld, RunStarted +from cora.run.aggregates.run import event_type_name as 
run_event_type_name +from cora.run.aggregates.run import to_payload as run_to_payload +from tests.unit._helpers import build_deps as _build_deps_shared + +_NOW = datetime(2026, 6, 21, 12, 0, 0, tzinfo=UTC) +_PRIOR = datetime(2026, 6, 21, 11, 0, 0, tzinfo=UTC) +_PROCEDURE_ID = UUID("01900000-0000-7000-8000-0000000d0a01") +_PRINCIPAL_ID = UUID("01900000-0000-7000-8000-000000000099") +_CORRELATION_ID = UUID("01900000-0000-7000-8000-0000000000aa") + + +@dataclass +class _LenientIds: + """Conductor id_generator that never exhausts (markers double appends).""" + + def new_id(self) -> UUID: + return uuid4() + + +def _deps(store: InMemoryEventStore, *, deny: bool = False) -> Kernel: + # Generous id pool: resume + lazy logbook-open + complete/abort all draw + # from deps.id_generator (the conductor's activity rows use a lenient one). + return _build_deps_shared( + ids=[uuid4() for _ in range(30)], now=_NOW, event_store=store, deny=deny + ) + + +def _make_reconduct(deps: Kernel, port: ControlPort) -> ReconductHandler: + conductor = Conductor( + control_port=port, + append_step=append_activities.bind(deps, step_store=InMemoryActivityStore()), + clock=deps.clock, + id_generator=_LenientIds(), + resume_procedure=resume_procedure.bind(deps), + complete_procedure=complete_procedure.bind(deps), + abort_procedure=abort_procedure.bind(deps), + ) + return reconduct_procedure.bind(deps, conductor=conductor) + + +async def _seed_held_with_steps( + store: InMemoryEventStore, + *, + steps: Sequence[Step], + procedure_id: UUID = _PROCEDURE_ID, + parent_run_id: UUID | None = None, + held_actuation_kind: str | None = None, +) -> None: + """Land a conducted-then-Held Procedure: Registered + ResolvedStepsRecorded + (the pinned resolved steps) + Started + Held. 
`held_actuation_kind` is the + kind the pre-hold conduct observed (carried on ProcedureHeld).""" + resolved = tuple(step_to_payload(s) for s in steps) + events = [ + ProcedureRegistered( + procedure_id=procedure_id, + name="alignment", + kind="alignment", + target_asset_ids=(), + parent_run_id=parent_run_id, + occurred_at=_PRIOR, + ), + ResolvedStepsRecorded( + procedure_id=procedure_id, + resolved_steps=resolved, + step_count=len(resolved), + occurred_at=_PRIOR, + ), + ProcedureStarted(procedure_id=procedure_id, occurred_at=_PRIOR), + ProcedureHeld( + procedure_id=procedure_id, + reason="beam dropped", + occurred_at=_PRIOR, + actuation_kind=held_actuation_kind, + ), + ] + await store.append( + stream_type="Procedure", + stream_id=procedure_id, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(e), + payload=to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + + +async def _seed_held_run(store: InMemoryEventStore, *, run_id: UUID) -> None: + events: list[RunStarted | RunHeld] = [ + RunStarted( + run_id=run_id, name="parent", plan_id=uuid4(), subject_id=None, occurred_at=_PRIOR + ), + RunHeld(run_id=run_id, occurred_at=_PRIOR), + ] + await store.append( + stream_type="Run", + stream_id=run_id, + expected_version=0, + events=[ + to_new_event( + event_type=run_event_type_name(e), + payload=run_to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + + +async def _status(store: InMemoryEventStore) -> ProcedureStatus: + state = await load_procedure(store, _PROCEDURE_ID) + assert state is not None + return state.status + + +async def _call(handler: ReconductHandler, boundary: int) -> ReconductProcedureResult: + return await handler( + ReconductProcedure(procedure_id=_PROCEDURE_ID, 
re_establishment_boundary=boundary), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_clean_tail_resumes_then_auto_completes() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() + port.simulate_connect("2bma:a") + port.simulate_connect("2bma:b") + await _seed_held_with_steps( + store, + steps=( + SetpointStep(address="2bma:a", value=1.0), + SetpointStep(address="2bma:b", value=2.0), + ), + ) + deps = _deps(store) + result = await _call(_make_reconduct(deps, port), 0) + + assert result.succeeded is True + assert result.acquisition_halt is False + assert result.completed_count == 2 + assert await _status(store) is ProcedureStatus.COMPLETED + assert (await port.read("2bma:a")).value == 1.0 + assert (await port.read("2bma:b")).value == 2.0 + + +@pytest.mark.unit +async def test_boundary_replays_only_the_tail_then_completes() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() + port.simulate_connect("2bma:b") # only the tail step is re-driven + await _seed_held_with_steps( + store, + steps=( + SetpointStep(address="2bma:a", value=1.0), + SetpointStep(address="2bma:b", value=2.0), + ), + ) + deps = _deps(store) + result = await _call(_make_reconduct(deps, port), 1) + assert result.succeeded is True + assert result.completed_count == 1 + assert await _status(store) is ProcedureStatus.COMPLETED + # The prefix step (2bma:a) was never re-driven. 
+ with pytest.raises(Exception, match="not connected"): + await port.read("2bma:a") + + +@pytest.mark.unit +async def test_acquisition_halt_resumes_but_leaves_running() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() + port.simulate_connect("2bma:a") + await _seed_held_with_steps( + store, + steps=( + SetpointStep(address="2bma:a", value=1.0), + ActionStep(name="collect", params={"dwell": 0.1}), + ), + ) + deps = _deps(store) + result = await _call(_make_reconduct(deps, port), 0) + + assert result.succeeded is False + assert result.acquisition_halt is True + assert result.failure is not None + assert result.failure.error_class == "AcquisitionResumeRequiresOperator" + # Resumed (Held -> Running) but NOT completed/aborted: stays Running. + assert await _status(store) is ProcedureStatus.RUNNING + events, _ = await store.load("Procedure", _PROCEDURE_ID) + types = [e.event_type for e in events] + assert "ProcedureResumed" in types + assert "ProcedureCompleted" not in types + assert "ProcedureAborted" not in types + + +@pytest.mark.unit +async def test_genuine_step_failure_resumes_then_aborts() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() # 2bma:a NOT connected -> write fails + await _seed_held_with_steps(store, steps=(SetpointStep(address="2bma:a", value=1.0),)) + deps = _deps(store) + result = await _call(_make_reconduct(deps, port), 0) + + assert result.succeeded is False + assert result.acquisition_halt is False + assert result.failure is not None + assert result.failure.error_class == "ControlNotConnectedError" + assert await _status(store) is ProcedureStatus.ABORTED + + +@pytest.mark.unit +async def test_raises_when_resolved_steps_record_missing() -> None: + """A Held Procedure with no pinned ResolvedStepsRecorded is corruption.""" + store = InMemoryEventStore() + # Seed Held WITHOUT a ResolvedStepsRecorded. 
+ events = [ + ProcedureRegistered( + procedure_id=_PROCEDURE_ID, + name="x", + kind="bakeout", + target_asset_ids=(), + parent_run_id=None, + occurred_at=_PRIOR, + ), + ProcedureStarted(procedure_id=_PROCEDURE_ID, occurred_at=_PRIOR), + ProcedureHeld(procedure_id=_PROCEDURE_ID, reason="paused", occurred_at=_PRIOR), + ] + await store.append( + stream_type="Procedure", + stream_id=_PROCEDURE_ID, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(e), + payload=to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + deps = _deps(store) + with pytest.raises(ResolvedStepsRecordNotFoundError): + await _call(_make_reconduct(deps, InMemoryControlPort()), 0) + + +@pytest.mark.unit +async def test_reconduct_raises_not_found_when_procedure_absent() -> None: + store = InMemoryEventStore() + deps = _deps(store) + with pytest.raises(ProcedureNotFoundError): + await _call(_make_reconduct(deps, InMemoryControlPort()), 0) + + +@pytest.mark.unit +async def test_raises_cannot_resume_when_not_held() -> None: + """A Running (not Held) Procedure with resolved steps cannot be reconducted.""" + store = InMemoryEventStore() + # Registered + ResolvedStepsRecorded + Started (Running, has resolved steps). 
+ resolved = (step_to_payload(SetpointStep(address="2bma:a", value=1.0)),) + events = [ + ProcedureRegistered( + procedure_id=_PROCEDURE_ID, + name="x", + kind="alignment", + target_asset_ids=(), + parent_run_id=None, + occurred_at=_PRIOR, + ), + ResolvedStepsRecorded( + procedure_id=_PROCEDURE_ID, + resolved_steps=resolved, + step_count=1, + occurred_at=_PRIOR, + ), + ProcedureStarted(procedure_id=_PROCEDURE_ID, occurred_at=_PRIOR), + ] + await store.append( + stream_type="Procedure", + stream_id=_PROCEDURE_ID, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(e), + payload=to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + deps = _deps(store) + with pytest.raises(ProcedureCannotResumeError): + await _call(_make_reconduct(deps, InMemoryControlPort()), 0) + + +@pytest.mark.unit +async def test_raises_cannot_resume_when_parent_run_held() -> None: + """Off-diagonal guard: a Phase-of-Run Procedure whose parent Run is Held.""" + store = InMemoryEventStore() + parent_run_id = uuid4() + await _seed_held_run(store, run_id=parent_run_id) + await _seed_held_with_steps( + store, + steps=(SetpointStep(address="2bma:a", value=1.0),), + parent_run_id=parent_run_id, + ) + deps = _deps(store) + with pytest.raises(ProcedureCannotResumeError) as exc: + await _call(_make_reconduct(deps, InMemoryControlPort()), 0) + assert exc.value.parent_run_held is True + + +@pytest.mark.unit +async def test_raises_unauthorized_on_deny() -> None: + store = InMemoryEventStore() + await _seed_held_with_steps(store, steps=(SetpointStep(address="2bma:a", value=1.0),)) + deps = _deps(store, deny=True) + with pytest.raises(UnauthorizedError): + await _call(_make_reconduct(deps, InMemoryControlPort()), 0) + + +@pytest.mark.unit +async def test_raises_when_boundary_past_step_count() -> None: + """A boundary strictly past the pinned step count is 
rejected (it would + replay an empty tail and silently auto-complete). boundary == count is + allowed (a deliberate complete-with-nothing resume).""" + store = InMemoryEventStore() + await _seed_held_with_steps(store, steps=(SetpointStep(address="2bma:a", value=1.0),)) + deps = _deps(store) + with pytest.raises(InvalidProcedureReEstablishmentBoundaryError): + await _call(_make_reconduct(deps, InMemoryControlPort()), 2) # only 1 step pinned + + +@pytest.mark.unit +async def test_reconduct_folds_pre_hold_actuation_kind_into_completion() -> None: + """Regression (provenance gate): a conduct that touched a SIMULATED route + before the hold must not complete as Physical when reconducted over a + physical tail. The pre-hold kind carried on ProcedureHeld is merged with + the replay-tail kind, so the terminal event reports Hybrid and the + promote_dataset Simulated/Hybrid gate still bites.""" + store = InMemoryEventStore() + inner = InMemoryControlPort() + inner.simulate_connect("real:a") + registry = ControlPortRegistry() + registry.register("real:", inner, is_simulated=False) # the replay tail is physical + await _seed_held_with_steps( + store, + steps=(SetpointStep(address="real:a", value=1.0),), + held_actuation_kind="Simulated", # the pre-hold prefix touched a simulator + ) + deps = _deps(store) + result = await _call(_make_reconduct(deps, registry), 0) + + assert result.succeeded is True + # Merged, NOT the tail-only Physical -> the response + the terminal event agree. 
+ assert result.actuation_kind == ActuationKind.HYBRID.value + state = await load_procedure(store, _PROCEDURE_ID) + assert state is not None + assert state.status is ProcedureStatus.COMPLETED + assert state.actuation_kind == ActuationKind.HYBRID.value diff --git a/apps/api/tests/unit/operation/test_record_resolved_steps.py b/apps/api/tests/unit/operation/test_record_resolved_steps.py index 51bd345ec0..8540e824d8 100644 --- a/apps/api/tests/unit/operation/test_record_resolved_steps.py +++ b/apps/api/tests/unit/operation/test_record_resolved_steps.py @@ -1,4 +1,4 @@ -"""Tier-0 manifest recording: decide_resolved_steps_recorded + step_to_payload. +"""Tier-0 resolved-steps recording: decide_resolved_steps_recorded + step_to_payload. Covers: - the helper emits one ResolvedStepsRecorded for a Defined Procedure, @@ -8,7 +8,7 @@ the conduct route keeps its failures-in-body contract). - step_to_payload round-trips every step kind back to an equal Step via the public wire path (ConductProcedureRequest validation + step_from_wire), - proving a pinned manifest can be replayed. + proving a pinned step list can be replayed. 
""" from datetime import UTC, datetime @@ -16,6 +16,7 @@ import pytest +from cora.operation._conduct_preparation import decide_resolved_steps_recorded from cora.operation.aggregates.procedure import ( ProcedureRegistered, ProcedureStarted, @@ -31,9 +32,6 @@ WithinToleranceCriterion, step_to_payload, ) -from cora.operation.features.conduct_procedure.manifest import ( - decide_resolved_steps_recorded, -) from cora.operation.features.conduct_procedure.route import ( ConductProcedureRequest, step_from_wire, @@ -55,7 +53,7 @@ def _registered() -> tuple[UUID, ProcedureRegistered]: @pytest.mark.unit -def test_decide_records_manifest_for_defined_procedure() -> None: +def test_decide_records_resolved_steps_for_defined_procedure() -> None: procedure_id, registered = _registered() state = fold([registered]) steps = ( diff --git a/apps/api/tests/unit/operation/test_resume_procedure_decider.py b/apps/api/tests/unit/operation/test_resume_procedure_decider.py new file mode 100644 index 0000000000..5ac93884bc --- /dev/null +++ b/apps/api/tests/unit/operation/test_resume_procedure_decider.py @@ -0,0 +1,188 @@ +"""Pure-decider tests for `resume_procedure` slice. + +Single-source resume transition: `Held -> Running`. Carries +`re_establishment_boundary` (>= 0). Mirrors `resume_run`. The +off-diagonal guard (refuse while the parent Run is Held) lives in the +decider via the `parent_run_held` fact the handler derives from a +one-directional Operation -> Run read; these tests exercise it with +the flag directly. 
+""" + +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.operation.aggregates.procedure import ( + InvalidProcedureReEstablishmentBoundaryError, + Procedure, + ProcedureCannotResumeError, + ProcedureName, + ProcedureNotFoundError, + ProcedureResumed, + ProcedureStatus, +) +from cora.operation.features import resume_procedure +from cora.operation.features.resume_procedure import ResumeProcedure + +_NOW = datetime(2026, 5, 15, 12, 0, 0, tzinfo=UTC) + + +def _procedure( + *, + procedure_id: UUID | None = None, + status: ProcedureStatus = ProcedureStatus.HELD, +) -> Procedure: + return Procedure( + id=procedure_id or uuid4(), + name=ProcedureName("X"), + kind="bakeout", + target_asset_ids=frozenset(), + status=status, + parent_run_id=None, + ) + + +@pytest.mark.unit +def test_decide_emits_procedure_resumed_when_held() -> None: + proc = _procedure() + events = resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=3), + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureResumed) + assert events[0].procedure_id == proc.id + assert events[0].re_establishment_boundary == 3 + assert events[0].occurred_at == _NOW + assert events[0].decided_by_decision_id is None + + +@pytest.mark.unit +def test_decide_threads_decided_by_decision_id() -> None: + proc = _procedure() + decision_id = uuid4() + events = resume_procedure.decide( + state=proc, + command=ResumeProcedure( + procedure_id=proc.id, + re_establishment_boundary=0, + decided_by_decision_id=decision_id, + ), + now=_NOW, + ) + assert events[0].decided_by_decision_id == decision_id + + +@pytest.mark.unit +def test_decide_rejects_when_parent_run_held() -> None: + """Off-diagonal guard: a Held Procedure whose parent Run is Held cannot + resume (it would walk real setpoints while the Run is paused).""" + proc = _procedure() # status Held + with pytest.raises(ProcedureCannotResumeError) as exc: + 
resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=0), + parent_run_held=True, + now=_NOW, + ) + assert exc.value.parent_run_held is True + assert "parent Run is Held" in str(exc.value) + + +@pytest.mark.unit +def test_decide_allows_when_parent_run_not_held() -> None: + """A Held Procedure whose parent Run is NOT Held resumes normally.""" + proc = _procedure() + events = resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=0), + parent_run_held=False, + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureResumed) + + +@pytest.mark.unit +def test_decide_status_guard_precedes_parent_run_guard() -> None: + """A non-Held Procedure raises the status-guard form even if the parent + Run is also Held (status checked first; parent_run_held flag not set).""" + proc = _procedure(status=ProcedureStatus.RUNNING) + with pytest.raises(ProcedureCannotResumeError) as exc: + resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=0), + parent_run_held=True, + now=_NOW, + ) + assert exc.value.parent_run_held is False + assert exc.value.current_status is ProcedureStatus.RUNNING + + +@pytest.mark.unit +def test_decide_accepts_zero_boundary() -> None: + """Boundary 0 = re-establish from the first step (valid).""" + proc = _procedure() + events = resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=0), + now=_NOW, + ) + assert events[0].re_establishment_boundary == 0 + + +@pytest.mark.unit +def test_decide_rejects_negative_boundary() -> None: + proc = _procedure() + with pytest.raises(InvalidProcedureReEstablishmentBoundaryError): + resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=-1), + now=_NOW, + ) + + +@pytest.mark.unit +def 
test_decide_rejects_when_state_is_none() -> None: + pid = uuid4() + with pytest.raises(ProcedureNotFoundError) as exc: + resume_procedure.decide( + state=None, + command=ResumeProcedure(procedure_id=pid, re_establishment_boundary=0), + now=_NOW, + ) + assert exc.value.procedure_id == pid + + +@pytest.mark.unit +@pytest.mark.parametrize( + "status", + [ + ProcedureStatus.DEFINED, + ProcedureStatus.RUNNING, + ProcedureStatus.COMPLETED, + ProcedureStatus.ABORTED, + ProcedureStatus.TRUNCATED, + ], +) +def test_decide_rejects_non_held_status(status: ProcedureStatus) -> None: + """Resuming a non-Held procedure raises (resuming a Running one too).""" + proc = _procedure(status=status) + with pytest.raises(ProcedureCannotResumeError) as exc: + resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=0), + now=_NOW, + ) + assert exc.value.current_status is status + + +@pytest.mark.unit +def test_decide_is_pure_same_inputs_same_outputs() -> None: + proc = _procedure() + cmd = ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=2) + first = resume_procedure.decide(state=proc, command=cmd, now=_NOW) + second = resume_procedure.decide(state=proc, command=cmd, now=_NOW) + assert first == second diff --git a/apps/api/tests/unit/operation/test_resume_procedure_decider_properties.py b/apps/api/tests/unit/operation/test_resume_procedure_decider_properties.py new file mode 100644 index 0000000000..33185d98e6 --- /dev/null +++ b/apps/api/tests/unit/operation/test_resume_procedure_decider_properties.py @@ -0,0 +1,222 @@ +"""Property-based tests for `resume_procedure.decide` (Operation BC). + +Complements the example-based `test_resume_procedure_decider.py` with +universal claims across generated inputs. 
The decider is a pure +single-source resume transition carrying a re-establishment boundary: + + (state, command, now) -> list[ProcedureResumed] + +Load-bearing properties: + + - state=None always raises `ProcedureNotFoundError` carrying + command.procedure_id. + - A negative re_establishment_boundary always raises + `InvalidProcedureReEstablishmentBoundaryError` (validated before the + status guard). + - The source-state partition is total over `ProcedureStatus`: the sole + source `{Held}` emits exactly one `ProcedureResumed` (procedure_id= + state.id, boundary threaded, occurred_at=now); every other status + raises `ProcedureCannotResumeError`. (Adding a new status + auto-extends `_DISALLOWED_SOURCES`.) + - The emitted event's procedure_id is `state.id`, never + command.procedure_id. + - Pure: same (state, command, now) returns equal events. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from hypothesis import assume, given +from hypothesis import strategies as st + +from cora.operation.aggregates.procedure import ( + InvalidProcedureReEstablishmentBoundaryError, + Procedure, + ProcedureCannotResumeError, + ProcedureName, + ProcedureNotFoundError, + ProcedureResumed, + ProcedureStatus, +) +from cora.operation.features import resume_procedure +from cora.operation.features.resume_procedure import ResumeProcedure +from tests._strategies import aware_datetimes + +if TYPE_CHECKING: + from datetime import datetime + from uuid import UUID + +_BOUNDARY = st.integers(min_value=0, max_value=1_000_000) + +_RESUMABLE_SOURCES = (ProcedureStatus.HELD,) +_DISALLOWED_SOURCES = tuple(s for s in ProcedureStatus if s not in frozenset(_RESUMABLE_SOURCES)) + + +def _procedure(*, procedure_id: UUID, status: ProcedureStatus) -> Procedure: + return Procedure( + id=procedure_id, + name=ProcedureName("X"), + kind="bakeout", + target_asset_ids=frozenset(), + status=status, + parent_run_id=None, + ) + + +@pytest.mark.unit 
+@given(procedure_id=st.uuids(), boundary=_BOUNDARY, now=aware_datetimes()) +def test_resume_with_none_state_always_raises_not_found( + procedure_id: UUID, + boundary: int, + now: datetime, +) -> None: + """Empty stream always raises `ProcedureNotFoundError` carrying command id.""" + with pytest.raises(ProcedureNotFoundError) as exc: + resume_procedure.decide( + state=None, + command=ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + now=now, + ) + assert exc.value.procedure_id == procedure_id + + +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + source=st.sampled_from(list(ProcedureStatus)), + boundary=st.integers(max_value=-1), + now=aware_datetimes(), +) +def test_resume_with_negative_boundary_always_raises_invalid( + procedure_id: UUID, + source: ProcedureStatus, + boundary: int, + now: datetime, +) -> None: + """A negative boundary raises before the status guard, for any source state.""" + with pytest.raises(InvalidProcedureReEstablishmentBoundaryError): + resume_procedure.decide( + state=_procedure(procedure_id=procedure_id, status=source), + command=ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + now=now, + ) + + +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + source=st.sampled_from(_RESUMABLE_SOURCES), + boundary=_BOUNDARY, + now=aware_datetimes(), +) +def test_resume_from_permitted_source_emits_single_event( + procedure_id: UUID, + source: ProcedureStatus, + boundary: int, + now: datetime, +) -> None: + """Held emits one ProcedureResumed with the threaded boundary.""" + events = resume_procedure.decide( + state=_procedure(procedure_id=procedure_id, status=source), + command=ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + now=now, + ) + assert events == [ + ProcedureResumed( + procedure_id=procedure_id, re_establishment_boundary=boundary, occurred_at=now + ) + ] + + +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + 
source=st.sampled_from(_DISALLOWED_SOURCES), + boundary=_BOUNDARY, + now=aware_datetimes(), +) +def test_resume_from_disallowed_source_always_raises_cannot_resume( + procedure_id: UUID, + source: ProcedureStatus, + boundary: int, + now: datetime, +) -> None: + """Any non-Held source raises ProcedureCannotResumeError carrying the status. + + A valid (non-negative) boundary is supplied so the source-state guard + is what fires (boundary validation runs first in the decider). + """ + with pytest.raises(ProcedureCannotResumeError) as exc: + resume_procedure.decide( + state=_procedure(procedure_id=procedure_id, status=source), + command=ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + now=now, + ) + assert exc.value.current_status is source + + +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + boundary=_BOUNDARY, + now=aware_datetimes(), +) +def test_resume_with_parent_run_held_always_raises( + procedure_id: UUID, + boundary: int, + now: datetime, +) -> None: + """Off-diagonal guard: a Held Procedure whose parent Run is Held always + raises (the status guard passes, so the parent-Run guard is what fires).""" + with pytest.raises(ProcedureCannotResumeError) as exc: + resume_procedure.decide( + state=_procedure(procedure_id=procedure_id, status=ProcedureStatus.HELD), + command=ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + parent_run_held=True, + now=now, + ) + assert exc.value.parent_run_held is True + + +@pytest.mark.unit +@given( + state_procedure_id=st.uuids(), + command_procedure_id=st.uuids(), + source=st.sampled_from(_RESUMABLE_SOURCES), + boundary=_BOUNDARY, + now=aware_datetimes(), +) +def test_resume_uses_state_id_not_command_procedure_id( + state_procedure_id: UUID, + command_procedure_id: UUID, + source: ProcedureStatus, + boundary: int, + now: datetime, +) -> None: + """The emitted event's procedure_id is state.id, not command.procedure_id.""" + assume(state_procedure_id != 
command_procedure_id) + events = resume_procedure.decide( + state=_procedure(procedure_id=state_procedure_id, status=source), + command=ResumeProcedure( + procedure_id=command_procedure_id, re_establishment_boundary=boundary + ), + now=now, + ) + assert events[0].procedure_id == state_procedure_id + + +@pytest.mark.unit +@given(procedure_id=st.uuids(), boundary=_BOUNDARY, now=aware_datetimes()) +def test_resume_is_pure_same_input_same_output( + procedure_id: UUID, + boundary: int, + now: datetime, +) -> None: + """Two calls with identical args return equal events (no clock leakage).""" + state = _procedure(procedure_id=procedure_id, status=ProcedureStatus.HELD) + command = ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary) + first = resume_procedure.decide(state=state, command=command, now=now) + second = resume_procedure.decide(state=state, command=command, now=now) + assert first == second diff --git a/apps/api/tests/unit/operation/test_resume_procedure_handler.py b/apps/api/tests/unit/operation/test_resume_procedure_handler.py new file mode 100644 index 0000000000..626ff4135e --- /dev/null +++ b/apps/api/tests/unit/operation/test_resume_procedure_handler.py @@ -0,0 +1,282 @@ +"""Application-handler tests for `resume_procedure` slice. + +Custom cross-aggregate handler. Source state is `Held`, reached here by +seeding Running then holding. Covers the status-guard path AND the +off-diagonal guard (the handler loads the parent Run and refuses while +the Run is itself `Held`). 
+""" + +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.infrastructure.adapters.in_memory_event_store import InMemoryEventStore +from cora.infrastructure.event_envelope import to_new_event +from cora.operation.aggregates.procedure import ( + ProcedureCannotResumeError, + ProcedureNotFoundError, + ProcedureRegistered, + ProcedureStarted, + event_type_name, + to_payload, +) +from cora.operation.errors import UnauthorizedError +from cora.operation.features import hold_procedure, resume_procedure +from cora.operation.features.hold_procedure import HoldProcedure +from cora.operation.features.resume_procedure import ResumeProcedure +from cora.run.aggregates.run import RunHeld, RunNotFoundError, RunStarted +from cora.run.aggregates.run import event_type_name as run_event_type_name +from cora.run.aggregates.run import to_payload as run_to_payload +from tests.unit._helpers import build_deps as _build_deps_shared +from tests.unit.operation._helpers import seed_running_procedure + +_NOW = datetime(2026, 5, 15, 12, 0, 0, tzinfo=UTC) +_PRIOR = datetime(2026, 5, 15, 11, 0, 0, tzinfo=UTC) +_PROCEDURE_ID = UUID("01900000-0000-7000-8000-0000000c0f01") +_EVENT_ID = UUID("01900000-0000-7000-8000-0000000c0f02") +_PRINCIPAL_ID = UUID("01900000-0000-7000-8000-000000000099") +_CORRELATION_ID = UUID("01900000-0000-7000-8000-0000000000aa") + + +async def _seed_held_procedure( + store: InMemoryEventStore, + *, + procedure_id: UUID = _PROCEDURE_ID, + parent_run_id: UUID | None = None, +) -> None: + """Land `procedure_id` in `Held`, optionally as a Phase-of-Run Procedure.""" + if parent_run_id is None: + await seed_running_procedure( + store, + procedure_id=procedure_id, + when=_PRIOR, + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + else: + # Phase-of-Run: ProcedureRegistered carries parent_run_id, then Started. 
+ registered = ProcedureRegistered( + procedure_id=procedure_id, + name="mid-run alignment", + kind="alignment", + target_asset_ids=(), + parent_run_id=parent_run_id, + occurred_at=_PRIOR, + ) + started = ProcedureStarted(procedure_id=procedure_id, occurred_at=_PRIOR) + await store.append( + stream_type="Procedure", + stream_id=procedure_id, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(e), + payload=to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in (registered, started) + ], + ) + await hold_procedure.bind(_build_deps_shared(ids=[uuid4()], now=_PRIOR, event_store=store))( + HoldProcedure(procedure_id=procedure_id, reason="beam dropped"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +async def _seed_run(store: InMemoryEventStore, *, run_id: UUID, held: bool) -> None: + """Land a parent Run in `Running` (held=False) or `Held` (held=True).""" + events: list[object] = [ + RunStarted( + run_id=run_id, + name="parent run", + plan_id=uuid4(), + subject_id=None, + occurred_at=_PRIOR, + ) + ] + if held: + events.append(RunHeld(run_id=run_id, occurred_at=_PRIOR)) + await store.append( + stream_type="Run", + stream_id=run_id, + expected_version=0, + events=[ + to_new_event( + event_type=run_event_type_name(e), # type: ignore[arg-type] + payload=run_to_payload(e), # type: ignore[arg-type] + occurred_at=e.occurred_at, # type: ignore[attr-defined] + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + + +@pytest.mark.unit +async def test_handler_appends_procedure_resumed_event() -> None: + store = InMemoryEventStore() + await _seed_held_procedure(store) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + handler = resume_procedure.bind(deps) + + await handler( + 
ResumeProcedure(procedure_id=_PROCEDURE_ID, re_establishment_boundary=2), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + events, version = await store.load("Procedure", _PROCEDURE_ID) + assert version == 4 # Registered, Started, Held, Resumed + assert events[3].event_type == "ProcedureResumed" + assert events[3].payload == { + "procedure_id": str(_PROCEDURE_ID), + "re_establishment_boundary": 2, + "decided_by_decision_id": None, + "occurred_at": _NOW.isoformat(), + } + + +@pytest.mark.unit +async def test_handler_raises_when_procedure_not_found() -> None: + store = InMemoryEventStore() + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + handler = resume_procedure.bind(deps) + with pytest.raises(ProcedureNotFoundError): + await handler( + ResumeProcedure(procedure_id=_PROCEDURE_ID, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_handler_raises_cannot_resume_when_running() -> None: + """Resuming a Running (not Held) procedure raises.""" + store = InMemoryEventStore() + await seed_running_procedure( + store, + procedure_id=_PROCEDURE_ID, + when=_PRIOR, + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + with pytest.raises(ProcedureCannotResumeError): + await resume_procedure.bind(deps)( + ResumeProcedure(procedure_id=_PROCEDURE_ID, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_handler_round_trips_hold_then_resume_back_to_running() -> None: + """Hold then resume lands the Procedure back in Running (bidirectional cycle).""" + store = InMemoryEventStore() + await _seed_held_procedure(store) + await resume_procedure.bind(_build_deps_shared(ids=[uuid4()], now=_NOW, event_store=store))( + ResumeProcedure(procedure_id=_PROCEDURE_ID, 
re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + # A second hold now succeeds (the cycle is open again). + await hold_procedure.bind(_build_deps_shared(ids=[uuid4()], now=_NOW, event_store=store))( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason="second pause"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + events, _ = await store.load("Procedure", _PROCEDURE_ID) + assert [e.event_type for e in events] == [ + "ProcedureRegistered", + "ProcedureStarted", + "ProcedureHeld", + "ProcedureResumed", + "ProcedureHeld", + ] + + +@pytest.mark.unit +async def test_handler_raises_unauthorized_on_deny() -> None: + store = InMemoryEventStore() + await _seed_held_procedure(store) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store, deny=True) + handler = resume_procedure.bind(deps) + with pytest.raises(UnauthorizedError): + await handler( + ResumeProcedure(procedure_id=_PROCEDURE_ID, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +# --- off-diagonal guard: parent Run Held --- + +_PARENT_RUN_ID = UUID("01900000-0000-7000-8000-0000000c0f0a") +_PHASE_PROCEDURE_ID = UUID("01900000-0000-7000-8000-0000000c0f0b") + + +@pytest.mark.unit +async def test_handler_refuses_resume_when_parent_run_held() -> None: + """A Phase-of-Run Procedure cannot resume while its parent Run is Held.""" + store = InMemoryEventStore() + await _seed_run(store, run_id=_PARENT_RUN_ID, held=True) + await _seed_held_procedure( + store, procedure_id=_PHASE_PROCEDURE_ID, parent_run_id=_PARENT_RUN_ID + ) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + with pytest.raises(ProcedureCannotResumeError) as exc: + await resume_procedure.bind(deps)( + ResumeProcedure(procedure_id=_PHASE_PROCEDURE_ID, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + assert exc.value.parent_run_held is True + # No 
ProcedureResumed appended (still Held: Registered, Started, Held). + events, version = await store.load("Procedure", _PHASE_PROCEDURE_ID) + assert version == 3 + assert events[-1].event_type == "ProcedureHeld" + + +@pytest.mark.unit +async def test_handler_allows_resume_when_parent_run_running() -> None: + """A Phase-of-Run Procedure resumes when its parent Run is Running.""" + store = InMemoryEventStore() + await _seed_run(store, run_id=_PARENT_RUN_ID, held=False) + await _seed_held_procedure( + store, procedure_id=_PHASE_PROCEDURE_ID, parent_run_id=_PARENT_RUN_ID + ) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + await resume_procedure.bind(deps)( + ResumeProcedure(procedure_id=_PHASE_PROCEDURE_ID, re_establishment_boundary=4), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + events, _ = await store.load("Procedure", _PHASE_PROCEDURE_ID) + assert events[-1].event_type == "ProcedureResumed" + assert events[-1].payload["re_establishment_boundary"] == 4 + + +@pytest.mark.unit +async def test_handler_raises_run_not_found_when_parent_run_missing() -> None: + """Phase-of-Run Procedure with a parent_run_id pointing at an empty Run + stream is corruption: the handler raises rather than skipping the guard.""" + store = InMemoryEventStore() + await _seed_held_procedure( + store, procedure_id=_PHASE_PROCEDURE_ID, parent_run_id=_PARENT_RUN_ID + ) # parent Run never seeded + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + with pytest.raises(RunNotFoundError): + await resume_procedure.bind(deps)( + ResumeProcedure(procedure_id=_PHASE_PROCEDURE_ID, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) diff --git a/apps/api/tests/unit/operation/test_truncate_procedure_decider.py b/apps/api/tests/unit/operation/test_truncate_procedure_decider.py index 446e55412d..cd3cab4cbe 100644 --- a/apps/api/tests/unit/operation/test_truncate_procedure_decider.py +++ 
b/apps/api/tests/unit/operation/test_truncate_procedure_decider.py @@ -63,6 +63,25 @@ def test_decide_emits_procedure_truncated_when_running() -> None: assert events[0].occurred_at == _NOW +@pytest.mark.unit +def test_decide_emits_procedure_truncated_when_held() -> None: + """Resumable conduct: a paused (Held) Procedure that became de-facto + dead can be truncated retroactively.""" + proc = _procedure(status=ProcedureStatus.HELD) + events = truncate_procedure.decide( + state=proc, + command=TruncateProcedure( + procedure_id=proc.id, + reason="paused over the weekend, hardware died", + interrupted_at=None, + ), + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureTruncated) + assert events[0].procedure_id == proc.id + + @pytest.mark.unit def test_decide_accepts_none_interrupted_at() -> None: """interrupted_at is optional; None is valid (operator doesn't know when).""" diff --git a/apps/api/tests/unit/operation/test_truncate_procedure_decider_properties.py b/apps/api/tests/unit/operation/test_truncate_procedure_decider_properties.py index d8cb3b749e..ffa261011c 100644 --- a/apps/api/tests/unit/operation/test_truncate_procedure_decider_properties.py +++ b/apps/api/tests/unit/operation/test_truncate_procedure_decider_properties.py @@ -10,8 +10,8 @@ - state=None always raises `ProcedureNotFoundError` carrying command.procedure_id. - - The source-state partition is total over `ProcedureStatus`: the - sole source `{Running}` emits exactly one `ProcedureTruncated` + - The source-state partition is total over `ProcedureStatus`: each + source in `{Running, Held}` emits exactly one `ProcedureTruncated` (procedure_id=state.id, reason threaded, occurred_at=now); every other status raises `ProcedureCannotTruncateError` carrying the current status. 
@@ -50,7 +50,7 @@ _REASON = printable_ascii_text(min_size=1, max_size=500) -_TRUNCATABLE_SOURCES = (ProcedureStatus.RUNNING,) +_TRUNCATABLE_SOURCES = (ProcedureStatus.RUNNING, ProcedureStatus.HELD) _DISALLOWED_SOURCES = tuple(s for s in ProcedureStatus if s not in frozenset(_TRUNCATABLE_SOURCES)) @@ -97,7 +97,7 @@ def test_truncate_from_permitted_source_emits_single_event( reason: str, now: _datetime, ) -> None: - """Running emits one ProcedureTruncated with the threaded reason.""" + """Running or Held emits one ProcedureTruncated with the threaded reason.""" events = truncate_procedure.decide( state=_procedure(procedure_id=procedure_id, status=source), command=TruncateProcedure(procedure_id=procedure_id, reason=reason, interrupted_at=None), diff --git a/apps/api/tests/unit/operation/test_try_conduct_procedure_handler.py b/apps/api/tests/unit/operation/test_try_conduct_procedure_handler.py new file mode 100644 index 0000000000..5735feb911 --- /dev/null +++ b/apps/api/tests/unit/operation/test_try_conduct_procedure_handler.py @@ -0,0 +1,366 @@ +"""Application-handler tests for `try_conduct_procedure` (pause-to-Held conduct). + +Orchestration handler delegating to `Conductor.try_conduct`. 
Pins the +hold-vs-abort branch + the guards against a real Conductor + real +start/complete/abort/hold handlers over an in-memory store: + + - recoverable setpoint failure -> start + pause to Held (held=True), manifest pinned + - recoverable check failure -> start + pause to Held + - action (acquisition) failure -> start + abort (held=False, Aborted) + - clean run -> start + complete (Completed) + - hold itself fails -> left Running, original failure surfaced + - authz deny -> UnauthorizedError + - unknown procedure -> ProcedureNotFoundError +""" + +from collections.abc import Sequence +from dataclasses import dataclass +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.infrastructure.adapters.in_memory_event_store import InMemoryEventStore +from cora.infrastructure.event_envelope import to_new_event +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation.adapters.in_memory_control_port import InMemoryControlPort +from cora.operation.adapters.in_memory_recipe_expander import InMemoryRecipeExpander +from cora.operation.aggregates.procedure import ( + InMemoryActivityStore, + ProcedureNotFoundError, + ProcedureRegistered, + ProcedureStarted, + ProcedureStatus, + event_type_name, + load_procedure, + to_payload, +) +from cora.operation.conductor import ( + ActionStep, + CheckStep, + Conductor, + EqualsCriterion, + SetpointStep, + Step, +) +from cora.operation.errors import UnauthorizedError +from cora.operation.features import ( + abort_procedure, + append_activities, + complete_procedure, + hold_procedure, + start_procedure, + try_conduct_procedure, +) +from cora.operation.features.complete_procedure.command import CompleteProcedure +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.try_conduct_procedure import ( + Handler as TryConductHandler, +) +from cora.operation.features.try_conduct_procedure 
import ( + TryConductProcedure, + TryConductProcedureResult, +) +from tests.unit._helpers import build_deps as _build_deps_shared + +_NOW = datetime(2026, 6, 21, 12, 0, 0, tzinfo=UTC) +_PROCEDURE_ID = UUID("01900000-0000-7000-8000-0000000d0b01") +_PRINCIPAL_ID = UUID("01900000-0000-7000-8000-000000000099") +_CORRELATION_ID = UUID("01900000-0000-7000-8000-0000000000aa") + + +@dataclass +class _LenientIds: + """Conductor id_generator that never exhausts (markers double appends).""" + + def new_id(self) -> UUID: + return uuid4() + + +async def _raising_hold( + command: HoldProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, +) -> None: + _ = (command, principal_id, correlation_id, causation_id, surface_id) + msg = "hold backend unavailable" + raise RuntimeError(msg) + + +async def _raising_complete( + command: CompleteProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, +) -> None: + _ = (command, principal_id, correlation_id, causation_id, surface_id) + msg = "complete backend unavailable" + raise RuntimeError(msg) + + +def _deps(store: InMemoryEventStore, *, deny: bool = False) -> Kernel: + return _build_deps_shared( + ids=[uuid4() for _ in range(30)], now=_NOW, event_store=store, deny=deny + ) + + +def _make_try_conduct( + deps: Kernel, + port: InMemoryControlPort, + *, + hold_fails: bool = False, + complete_fails: bool = False, +) -> TryConductHandler: + conductor = Conductor( + control_port=port, + append_step=append_activities.bind(deps, step_store=InMemoryActivityStore()), + clock=deps.clock, + id_generator=_LenientIds(), + start_procedure=start_procedure.bind(deps), + complete_procedure=_raising_complete if complete_fails else complete_procedure.bind(deps), + abort_procedure=abort_procedure.bind(deps), + hold_procedure=_raising_hold if hold_fails else hold_procedure.bind(deps), + ) + return 
try_conduct_procedure.bind( + deps, conductor=conductor, expansion_port=InMemoryRecipeExpander() + ) + + +async def _seed_defined(store: InMemoryEventStore) -> None: + """Seed a standalone Defined Procedure (no recipe, no parent Run).""" + event = ProcedureRegistered( + procedure_id=_PROCEDURE_ID, + name="alignment", + kind="alignment", + target_asset_ids=(), + parent_run_id=None, + occurred_at=_NOW, + ) + await store.append( + stream_type="Procedure", + stream_id=_PROCEDURE_ID, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(event), + payload=to_payload(event), + occurred_at=event.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + ], + ) + + +async def _seed_running(store: InMemoryEventStore) -> None: + """Seed a Registered + Started (Running) Procedure so try_conduct's + start_procedure rejects it (Defined-only) as a lifecycle failure.""" + events = [ + ProcedureRegistered( + procedure_id=_PROCEDURE_ID, + name="alignment", + kind="alignment", + target_asset_ids=(), + parent_run_id=None, + occurred_at=_NOW, + ), + ProcedureStarted(procedure_id=_PROCEDURE_ID, occurred_at=_NOW), + ] + await store.append( + stream_type="Procedure", + stream_id=_PROCEDURE_ID, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(e), + payload=to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + + +async def _status(store: InMemoryEventStore) -> ProcedureStatus: + state = await load_procedure(store, _PROCEDURE_ID) + assert state is not None + return state.status + + +async def _event_types(store: InMemoryEventStore) -> list[str]: + events, _ = await store.load("Procedure", _PROCEDURE_ID) + return [e.event_type for e in events] + + +async def _call(handler: TryConductHandler, steps: Sequence[Step]) -> TryConductProcedureResult: + 
return await handler( + TryConductProcedure(procedure_id=_PROCEDURE_ID, steps=steps), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_recoverable_setpoint_failure_pauses_to_held() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() # 2bma:a NOT connected -> write fails (recoverable) + await _seed_defined(store) + result = await _call( + _make_try_conduct(_deps(store), port), + (SetpointStep(address="2bma:a", value=1.0),), + ) + + assert result.succeeded is False + assert result.held is True + assert result.failure is not None + assert result.failure.error_class == "ControlNotConnectedError" + assert await _status(store) is ProcedureStatus.HELD + types = await _event_types(store) + assert "ResolvedStepsRecorded" in types # manifest pinned -> reconduct-ready + assert "ProcedureHeld" in types + assert "ProcedureAborted" not in types + assert "ProcedureCompleted" not in types + + +@pytest.mark.unit +async def test_recoverable_check_failure_pauses_to_held() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() # read of unconnected address fails (recoverable) + await _seed_defined(store) + result = await _call( + _make_try_conduct(_deps(store), port), + (CheckStep(address="2bma:a", criterion=EqualsCriterion(expected=1.0)),), + ) + + assert result.held is True + assert result.failure is not None + assert result.failure.source_kind == "check" + assert await _status(store) is ProcedureStatus.HELD + + +@pytest.mark.unit +async def test_action_failure_aborts_not_held() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() + await _seed_defined(store) + # An unregistered action -> UnknownActionError (source_kind=action), which + # is NOT recoverable: an interrupted acquisition aborts rather than pausing. 
+ result = await _call(_make_try_conduct(_deps(store), port), (ActionStep(name="unregistered"),)) + + assert result.succeeded is False + assert result.held is False + assert result.failure is not None + assert result.failure.source_kind == "action" + assert await _status(store) is ProcedureStatus.ABORTED + + +@pytest.mark.unit +async def test_clean_run_completes() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() + port.simulate_connect("2bma:a") + await _seed_defined(store) + result = await _call( + _make_try_conduct(_deps(store), port), + (SetpointStep(address="2bma:a", value=1.0),), + ) + + assert result.succeeded is True + assert result.held is False + assert result.completed_count == 1 + assert await _status(store) is ProcedureStatus.COMPLETED + assert (await port.read("2bma:a")).value == 1.0 + + +@pytest.mark.unit +async def test_empty_step_list_completes() -> None: + store = InMemoryEventStore() + await _seed_defined(store) + result = await _call(_make_try_conduct(_deps(store), InMemoryControlPort()), ()) + + assert result.succeeded is True + assert result.held is False + assert await _status(store) is ProcedureStatus.COMPLETED + + +@pytest.mark.unit +async def test_hold_itself_failing_leaves_running() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() # 2bma:a not connected -> recoverable failure + await _seed_defined(store) + result = await _call( + _make_try_conduct(_deps(store), port, hold_fails=True), + (SetpointStep(address="2bma:a", value=1.0),), + ) + + # Recoverable failure, but the hold transition itself failed: leave the + # Procedure Running and surface the original step failure (held=False). 
+ assert result.held is False + assert result.succeeded is False + assert result.failure is not None + assert result.failure.error_class == "ControlNotConnectedError" + assert await _status(store) is ProcedureStatus.RUNNING + types = await _event_types(store) + assert "ProcedureHeld" not in types + assert "ProcedureAborted" not in types + + +@pytest.mark.unit +async def test_raises_unauthorized_on_deny() -> None: + store = InMemoryEventStore() + await _seed_defined(store) + deps = _deps(store, deny=True) + with pytest.raises(UnauthorizedError): + await _call(_make_try_conduct(deps, InMemoryControlPort()), ()) + + +@pytest.mark.unit +async def test_try_conduct_raises_not_found_when_procedure_absent() -> None: + store = InMemoryEventStore() + with pytest.raises(ProcedureNotFoundError): + await _call(_make_try_conduct(_deps(store), InMemoryControlPort()), ()) + + +@pytest.mark.unit +async def test_start_rejected_records_lifecycle_failure() -> None: + """An already-Running Procedure cannot start: a lifecycle failure lands in + the result (not held, not a step failure), and no step runs.""" + store = InMemoryEventStore() + await _seed_running(store) + result = await _call(_make_try_conduct(_deps(store), InMemoryControlPort()), ()) + + assert result.succeeded is False + assert result.held is False + assert result.failure is not None + assert result.failure.source_kind == "lifecycle" + assert result.failure.target == "start" + assert await _status(store) is ProcedureStatus.RUNNING + + +@pytest.mark.unit +async def test_complete_rejected_records_lifecycle_failure() -> None: + """A clean run whose complete transition itself fails records a lifecycle + failure (target=complete), not held.""" + store = InMemoryEventStore() + await _seed_defined(store) + result = await _call( + _make_try_conduct(_deps(store), InMemoryControlPort(), complete_fails=True), () + ) + + assert result.succeeded is False + assert result.held is False + assert result.failure is not None + assert 
result.failure.source_kind == "lifecycle" + assert result.failure.target == "complete" diff --git a/infra/atlas/migrations/20260621060000_proc_summary_status_admit_held.sql b/infra/atlas/migrations/20260621060000_proc_summary_status_admit_held.sql new file mode 100644 index 0000000000..74d4f056f1 --- /dev/null +++ b/infra/atlas/migrations/20260621060000_proc_summary_status_admit_held.sql @@ -0,0 +1,24 @@ +-- Procedure summary projection: admit 'Held' in the status CHECK. +-- +-- Tier-1 resumable conduct landed the Held/Resumed FSM (ProcedureHeld / +-- ProcedureResumed) and try_conduct_procedure makes a Held Procedure +-- operator-reachable. The summary read model can now surface it: widen the +-- status CHECK so the ProcedureSummaryProjection can fold ProcedureHeld into +-- status='Held'. ProcedureResumed maps back to 'Running', so 'Held' is the +-- only new persisted status value. +-- +-- The init migration declared the CHECK inline on the column, so Postgres +-- auto-named it proj_operation_procedure_summary_status_check. Drop + re-add +-- with the widened value set. Loosening a CHECK is non-destructive: no +-- existing row (one of the 5 prior statuses) can violate the wider set, so +-- this needs no backfill and no data-safety opt-out. +-- +-- Forward-only: a rollback is a new compensating migration. Mutable read +-- model; cora_app keeps its existing DML grants. 
+ +ALTER TABLE proj_operation_procedure_summary + DROP CONSTRAINT proj_operation_procedure_summary_status_check; + +ALTER TABLE proj_operation_procedure_summary + ADD CONSTRAINT proj_operation_procedure_summary_status_check + CHECK (status IN ('Defined', 'Running', 'Held', 'Completed', 'Aborted', 'Truncated')); diff --git a/infra/atlas/migrations/atlas.sum b/infra/atlas/migrations/atlas.sum index 09bb8ecf61..e37fb9565f 100644 --- a/infra/atlas/migrations/atlas.sum +++ b/infra/atlas/migrations/atlas.sum @@ -1,4 +1,4 @@ -h1:HF4+zVNUHazywE7Lwf5d+HiDdQJeythSjb2rKXsnz3w= +h1:Y+l/6SoAljHGZB3HpMSkt1T+g8sLWBXjasyGqKmGdZ8= 20260509120000_init_events.sql h1:GmgCZKfaqXu1m96/cKAks2vhaLWTdEaHTLkFtUo9FXg= 20260509170000_init_idempotency.sql h1:Nbu8DIE4Sv1WiHw3G22+tYffPhKc5Jryw3PMK8wB2zY= 20260510010000_add_event_id.sql h1:RbtYP6uMnOB20zhJ9dNXUi4YVqbmlEzf562pmygnRW8= @@ -149,3 +149,4 @@ h1:HF4+zVNUHazywE7Lwf5d+HiDdQJeythSjb2rKXsnz3w= 20260621030000_add_entries_run_observations_run_channel_recorded_idx.sql h1:Diaq8aq1dsFZLjwpLb5SyDHc4IxFaVaj6g3YDW+lkpQ= 20260621040000_init_entries_run_feed_heartbeats.sql h1:MlR+EKgFhxmTKqOpa5DD5WKchzTzwEcDeKoOAC8hTc0= 20260621050000_add_proj_run_summary_rule_inputs.sql h1:W6pzjGGbLEABcxj60nNOyNlTiczU4T+N8mYErqJJwaQ= +20260621060000_proc_summary_status_admit_held.sql h1:XNZsm+19l14iXCiquKSPJ/kMXoSbpuWqojPw+2NFS6o=