From 436fdbaa8d5135994044a5573a9e41eab4fef5b7 Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 00:07:31 +0300 Subject: [PATCH 01/12] feat(operation): record a pre-effect in-flight marker on side-effecting steps The Conductor now writes a SEPARATE journal entry (result="in_flight") BEFORE the effect for side-effecting steps (setpoint + action), then the ok/failed outcome after. A check records no marker: a check is a pure read, always safe to re-run. Why: a future resume must identify the one step that was mid-flight when a conduct halted. The marker lands before the write/action body and is durable even when the effect then raises or is cancelled, so a marker-without-outcome for a step_index pinpoints the interrupted step. This is the Tier 1 substrate for resumable conduct; no reader exists yet. This doubles the per-step append count for setpoint/action. Updated the conductor unit tests, the collect/discrete/continuous action-body tests, and the conduct-driven integration/scenario tests (which now filter in_flight markers out of their activity-row assertions). The softioc conductor test additionally asserts the marker round-trips into Postgres. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/api/src/cora/operation/conductor.py | 50 +++++ .../scenarios/test_2bm_alignment_center.py | 2 +- .../scenarios/test_2bm_alignment_focus.py | 4 +- .../scenarios/test_2bm_alignment_pitch.py | 4 +- .../test_2bm_alignment_resolution.py | 4 +- .../scenarios/test_2bm_alignment_roll.py | 4 +- .../test_2bm_blade_throw_characterization.py | 4 +- .../scenarios/test_2bm_dark_baseline.py | 4 +- .../test_2bm_detector_z_rail_alignment.py | 4 +- .../test_2bm_energy_characterization.py | 2 +- .../scenarios/test_2bm_energy_setting.py | 4 +- .../scenarios/test_2bm_first_light.py | 4 +- .../scenarios/test_2bm_flat_baseline.py | 4 +- .../scenarios/test_2bm_hexapod_reboot.py | 4 +- .../scenarios/test_2bm_motor_homing.py | 4 +- .../test_2bm_sensitivity_characterization.py | 4 +- .../scenarios/test_2bm_slit_centering.py | 4 +- ...t_acquisitions_against_softioc_postgres.py | 6 +- ...test_conductor_against_softioc_postgres.py | 19 +- .../operation/test_collect_action_body.py | 6 +- .../tests/unit/operation/test_conductor.py | 207 +++++++++++++----- .../operation/test_continuous_action_body.py | 6 +- .../operation/test_discrete_action_body.py | 6 +- 23 files changed, 274 insertions(+), 86 deletions(-) diff --git a/apps/api/src/cora/operation/conductor.py b/apps/api/src/cora/operation/conductor.py index 5285e9ca74..f4defc7153 100644 --- a/apps/api/src/cora/operation/conductor.py +++ b/apps/api/src/cora/operation/conductor.py @@ -37,6 +37,19 @@ False, "reason": "out_of_range"}`); the Conductor treats any return from a body as success-shaped at this tier. +## Pre-effect in-flight marker (side-effecting steps) + +A setpoint and an action are side-effecting: each records a SEPARATE +`result="in_flight"` step entry BEFORE the effect runs, then the +`ok` / `failed` outcome entry after. This doubles the per-step append +count for those two kinds. 
A check is a pure read (always safe to +re-run), so it records no marker, only its single outcome entry. The +marker is the resume substrate: an `in_flight` entry with no matching +outcome for the same `step_index` is the one step that was mid-flight +when a conduct halted, even if the halt was a crash or cancellation +(the marker append completes before the effect). See +[[project_resumable_conduct_design]] Tier 1. + ## Check semantics A `CheckStep` carries an address + an acceptance criterion. The @@ -197,6 +210,21 @@ read-side filters can separate successful vs failed steps without parsing the message string.""" +_RESULT_IN_FLIGHT = "in_flight" +"""Pre-effect in-flight marker discriminator, written to a SEPARATE +step entry BEFORE a side-effecting step (setpoint / action) actuates, +then followed by the `ok` / `failed` outcome entry after. A check is a +pure read (always safe to re-run), so it records NO marker -- only its +single outcome entry. + +The marker is what lets a future resume identify the one step that was +mid-flight when a conduct halted: an `in_flight` entry with no matching +outcome entry for the same `step_index` is the interrupted step. The +marker is recorded even when the effect then raises or is cancelled +(the marker append completes before the effect runs); that is the +point -- a crashed write leaves a marker-without-outcome behind so the +step is recoverable. See [[project_resumable_conduct_design]] Tier 1.""" + _QUALITY_GOOD = "Good" @@ -756,6 +784,16 @@ async def _run_setpoint( port: ControlPort, ) -> ConductorFailure | None: payload_body: dict[str, Any] = {"address": step.address, "value": step.value} + # Pre-effect in-flight marker (side-effecting step): record intent + # BEFORE the write so a halt mid-write leaves a marker-without-outcome + # the resume reader can identify. See `_RESULT_IN_FLIGHT`. 
+ await self._record( + envelope=envelope, + index=index, + step_kind=_STEP_KIND_SETPOINT, + body=payload_body, + result=_RESULT_IN_FLIGHT, + ) try: await port.write(step.address, step.value, wait=True) except _CONTROL_ERRORS as exc: @@ -817,6 +855,18 @@ async def _run_action( port: ControlPort, ) -> ConductorFailure | None: payload_body: dict[str, Any] = {"name": step.name, "params": dict(step.params)} + # Pre-effect in-flight marker (side-effecting step): record intent + # BEFORE the action body runs so a halt mid-action leaves a + # marker-without-outcome the resume reader can identify. An unknown + # action still records the marker (the step kind is side-effecting) + # then its failure outcome. See `_RESULT_IN_FLIGHT`. + await self._record( + envelope=envelope, + index=index, + step_kind=_STEP_KIND_ACTION, + body=payload_body, + result=_RESULT_IN_FLIGHT, + ) body = self._action_registry.lookup(step.name) if body is None: exc = UnknownActionError(step.name) diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py index 487f17e92d..b463f22b21 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py @@ -1056,7 +1056,7 @@ async def _read_steps(db_pool: asyncpg.Pool, procedure_id: UUID) -> list[asyncpg """ SELECT step_kind, payload, sampled_at FROM entries_operation_procedure_activities - WHERE procedure_id = $1 + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' ORDER BY sampled_at """, procedure_id, diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_focus.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_focus.py index 81f0ae7003..7337a32850 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_focus.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_focus.py @@ -657,7 +657,9 @@ async def 
test_focus_alignment_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 13 diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py index 8ad022bd62..6d15ad966d 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py @@ -663,7 +663,9 @@ async def test_pitch_alignment_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind, payload FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 14 diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_resolution.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_resolution.py index 9d7a1f766a..4884ac342b 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_resolution.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_resolution.py @@ -592,7 +592,9 @@ async def test_resolution_alignment_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 13 diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py index 
e9dcb02388..e25dc682bb 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py @@ -651,7 +651,9 @@ async def test_roll_alignment_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind, payload FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 14 diff --git a/apps/api/tests/integration/scenarios/test_2bm_blade_throw_characterization.py b/apps/api/tests/integration/scenarios/test_2bm_blade_throw_characterization.py index b63d976604..2b6535b819 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_blade_throw_characterization.py +++ b/apps/api/tests/integration/scenarios/test_2bm_blade_throw_characterization.py @@ -211,7 +211,9 @@ async def _read_steps(db_pool: asyncpg.Pool, procedure_id: UUID) -> list[asyncpg return await conn.fetch( "SELECT step_kind, payload, sampled_at " "FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", procedure_id, ) diff --git a/apps/api/tests/integration/scenarios/test_2bm_dark_baseline.py b/apps/api/tests/integration/scenarios/test_2bm_dark_baseline.py index 2b2d4e1856..52d25be56c 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_dark_baseline.py +++ b/apps/api/tests/integration/scenarios/test_2bm_dark_baseline.py @@ -457,7 +457,9 @@ async def test_dark_baseline_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER 
BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 5 diff --git a/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py b/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py index 60cb2acf2b..aee487af5e 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py +++ b/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py @@ -219,7 +219,9 @@ async def _read_steps(db_pool: asyncpg.Pool, procedure_id: UUID) -> list[asyncpg return await conn.fetch( "SELECT step_kind, payload, sampled_at " "FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", procedure_id, ) diff --git a/apps/api/tests/integration/scenarios/test_2bm_energy_characterization.py b/apps/api/tests/integration/scenarios/test_2bm_energy_characterization.py index aaf0739136..ec30980c28 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_energy_characterization.py +++ b/apps/api/tests/integration/scenarios/test_2bm_energy_characterization.py @@ -201,7 +201,7 @@ async def _read_steps(db_pool: asyncpg.Pool, procedure_id: UUID) -> list[asyncpg """ SELECT step_kind, payload, sampled_at FROM entries_operation_procedure_activities - WHERE procedure_id = $1 + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' ORDER BY sampled_at """, procedure_id, diff --git a/apps/api/tests/integration/scenarios/test_2bm_energy_setting.py b/apps/api/tests/integration/scenarios/test_2bm_energy_setting.py index 9ebe08cd0d..bae8081cdf 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_energy_setting.py +++ b/apps/api/tests/integration/scenarios/test_2bm_energy_setting.py @@ -354,7 +354,9 @@ async def test_energy_setting_records_a_coordinated_move(db_pool: asyncpg.Pool) await _drain(db_pool) async with db_pool.acquire() as conn: rows = await conn.fetch( - 
"SELECT step_kind FROM entries_operation_procedure_activities WHERE procedure_id = $1", + "SELECT step_kind FROM entries_operation_procedure_activities " + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight'", procedure_id, ) kinds = [r["step_kind"] for r in rows] diff --git a/apps/api/tests/integration/scenarios/test_2bm_first_light.py b/apps/api/tests/integration/scenarios/test_2bm_first_light.py index 6f84888a38..6c38380104 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_first_light.py +++ b/apps/api/tests/integration/scenarios/test_2bm_first_light.py @@ -468,7 +468,9 @@ async def test_first_light_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 7 diff --git a/apps/api/tests/integration/scenarios/test_2bm_flat_baseline.py b/apps/api/tests/integration/scenarios/test_2bm_flat_baseline.py index a0ecbda1f9..0f85209dd9 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_flat_baseline.py +++ b/apps/api/tests/integration/scenarios/test_2bm_flat_baseline.py @@ -479,7 +479,9 @@ async def test_flat_baseline_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 8 diff --git a/apps/api/tests/integration/scenarios/test_2bm_hexapod_reboot.py b/apps/api/tests/integration/scenarios/test_2bm_hexapod_reboot.py index d87f875f38..48fefb3407 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_hexapod_reboot.py +++ 
b/apps/api/tests/integration/scenarios/test_2bm_hexapod_reboot.py @@ -690,7 +690,9 @@ async def test_hexapod_reboot_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind, payload FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at, event_id", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at, event_id", _PROCEDURE_ID, ) assert len(rows) == 17 diff --git a/apps/api/tests/integration/scenarios/test_2bm_motor_homing.py b/apps/api/tests/integration/scenarios/test_2bm_motor_homing.py index bb8b0fa651..1805aa18bb 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_motor_homing.py +++ b/apps/api/tests/integration/scenarios/test_2bm_motor_homing.py @@ -686,7 +686,9 @@ async def test_motor_homing_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", _PROCEDURE_ID, ) assert len(rows) == 9 diff --git a/apps/api/tests/integration/scenarios/test_2bm_sensitivity_characterization.py b/apps/api/tests/integration/scenarios/test_2bm_sensitivity_characterization.py index 3da5e1f6e3..3733f146e0 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_sensitivity_characterization.py +++ b/apps/api/tests/integration/scenarios/test_2bm_sensitivity_characterization.py @@ -520,7 +520,9 @@ async def test_sensitivity_characterization_plays_out_end_to_end( async with db_pool.acquire() as conn: rows = await conn.fetch( "SELECT step_kind, payload FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at, event_id", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at, event_id", _PROCEDURE_ID, ) 
assert len(rows) == 8 diff --git a/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py b/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py index a622efa507..da1cf587d0 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py +++ b/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py @@ -184,7 +184,9 @@ async def _read_steps(db_pool: asyncpg.Pool, procedure_id: UUID) -> list[asyncpg return await conn.fetch( "SELECT step_kind, payload, sampled_at " "FROM entries_operation_procedure_activities " - "WHERE procedure_id = $1 ORDER BY sampled_at", + "WHERE procedure_id = $1 " + "AND payload->>'result' IS DISTINCT FROM 'in_flight' " + "ORDER BY sampled_at", procedure_id, ) diff --git a/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py b/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py index d46cae34f1..a860a94895 100644 --- a/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py +++ b/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py @@ -178,7 +178,7 @@ async def test_conductor_runs_collect_action_against_real_softioc_and_postgres( """ SELECT step_kind, payload FROM entries_operation_procedure_activities - WHERE procedure_id = $1 + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' ORDER BY sampled_at, event_id """, procedure_id, @@ -269,7 +269,7 @@ async def test_conductor_runs_discrete_action_walks_axis_with_per_point_collects """ SELECT payload FROM entries_operation_procedure_activities - WHERE procedure_id = $1 + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' """, procedure_id, ) @@ -356,7 +356,7 @@ async def test_conductor_runs_continuous_action_with_axis_sweep_against_softioc( """ SELECT payload FROM entries_operation_procedure_activities - WHERE procedure_id = $1 + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' """, procedure_id, ) diff 
--git a/apps/api/tests/integration/test_conductor_against_softioc_postgres.py b/apps/api/tests/integration/test_conductor_against_softioc_postgres.py index 14c5cf5609..4ded95ad05 100644 --- a/apps/api/tests/integration/test_conductor_against_softioc_postgres.py +++ b/apps/api/tests/integration/test_conductor_against_softioc_postgres.py @@ -167,17 +167,30 @@ async def test_conductor_runs_setpoint_check_against_real_softioc_and_postgres( """, procedure_id, ) - assert [r["step_kind"] for r in rows] == ["setpoint", "check"] import json - setpoint_payload = json.loads(rows[0]["payload"]) + parsed = [(r["step_kind"], json.loads(r["payload"])) for r in rows] + # The setpoint is side-effecting: it records a pre-effect in-flight + # marker then the `ok` outcome, both round-tripping into Postgres. The + # check (pure read) records only its outcome -- no marker. + assert [(k, p["result"]) for k, p in parsed] == [ + ("setpoint", "in_flight"), + ("setpoint", "ok"), + ("check", "ok"), + ] + setpoint_marker = parsed[0][1] + assert setpoint_marker["address"] == f"{softioc}double_value" + assert setpoint_marker["value"] == 7.5 + assert "post_reading" not in setpoint_marker # marker precedes the write + + setpoint_payload = parsed[1][1] assert setpoint_payload["address"] == f"{softioc}double_value" assert setpoint_payload["value"] == 7.5 assert setpoint_payload["result"] == "ok" assert setpoint_payload["post_reading"]["value"] == 7.5 assert setpoint_payload["post_reading"]["quality"] == "Good" - check_payload = json.loads(rows[1]["payload"]) + check_payload = parsed[2][1] assert check_payload["address"] == f"{softioc}double_value" assert check_payload["criterion"] == { "kind": "within_tolerance", diff --git a/apps/api/tests/unit/operation/test_collect_action_body.py b/apps/api/tests/unit/operation/test_collect_action_body.py index 569df29bdf..2c1f6ffa51 100644 --- a/apps/api/tests/unit/operation/test_collect_action_body.py +++ 
b/apps/api/tests/unit/operation/test_collect_action_body.py @@ -475,7 +475,7 @@ async def test_conductor_executes_collect_action_and_records_step_entry() -> Non control_port=port, append_step=appender, clock=FakeClock(_FIXED_NOW), - id_generator=_SequenceIdGenerator([uuid4()]), + id_generator=_SequenceIdGenerator([uuid4(), uuid4()]), action_registry=registry, ) result = await conductor.execute( @@ -496,7 +496,9 @@ async def test_conductor_executes_collect_action_and_records_step_entry() -> Non ) assert result.succeeded is True assert result.completed_count == 1 - entry = appender.calls[0].command.entries[0] + # calls[0] is the pre-effect in-flight marker; calls[1] is the outcome. + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + entry = appender.calls[1].command.entries[0] assert entry.step_kind == "action" assert entry.payload["name"] == "collect" assert entry.payload["result"] == "ok" diff --git a/apps/api/tests/unit/operation/test_conductor.py b/apps/api/tests/unit/operation/test_conductor.py index faa2aea184..f38ee0fc7e 100644 --- a/apps/api/tests/unit/operation/test_conductor.py +++ b/apps/api/tests/unit/operation/test_conductor.py @@ -2,11 +2,18 @@ Coverage spans both step kinds shipped to date (setpoint + action): + Pre-effect in-flight marker (side-effecting steps): + - a setpoint / action records a `result="in_flight"` marker entry + BEFORE the effect, then the `ok` / `failed` outcome after (two + appends per side-effecting step); a check records no marker + - a cancelled / crashing effect still leaves the marker behind + (marker-without-outcome = the interrupted step), enabling resume + Setpoint: - empty steps -> trivially succeeds, no handler call - - 3 setpoints -> 3 ControlPort writes + 3 step entries recorded + - 3 setpoints -> 3 ControlPort writes; each records marker + outcome - first write raises ControlNotConnectedError -> halt at index 0, - failure entry recorded, ConductorResult.failure populated + marker + failure 
entry recorded, ConductorResult.failure populated - middle write raises ControlTimeoutError -> halt at index N, earlier successes recorded, failure entry for the failing step, remaining steps untouched @@ -140,19 +147,23 @@ class _SequenceIdGenerator: """Deterministic id_generator that returns a pre-supplied list of ids. Lets tests pin event_id values into the recorded entries so the - payload assertion is exact. Raises on exhaustion so missing ids - are loud, not silent. + payload assertion is exact. The pre-effect in-flight marker doubles + the per-step append count for setpoint / action steps, so a test + supplies only the ids it actually asserts on (pinned first, in + order) and lets the rest auto-generate. Append-COUNT assertions are + pinned via `len(appender.calls)`, not via id exhaustion, so lenient + generation here masks no over-append bug. """ ids: list[UUID] _index: int = 0 def new_id(self) -> UUID: - if self._index >= len(self.ids): - raise RuntimeError("FixedIdGenerator exhausted") - out = self.ids[self._index] - self._index += 1 - return out + if self._index < len(self.ids): + out = self.ids[self._index] + self._index += 1 + return out + return uuid4() def _conductor( @@ -219,34 +230,45 @@ async def test_execute_setpoints_writes_each_step_via_control_port_in_order() -> @pytest.mark.unit -async def test_execute_setpoint_records_success_entry_with_expected_payload() -> None: - """Each successful write produces one append call with the expected payload.""" +async def test_execute_setpoint_records_in_flight_marker_then_success_entry() -> None: + """A successful write produces two append calls: the pre-effect in-flight + marker first, then the `ok` outcome entry, both carrying the envelope.""" port = InMemoryControlPort(now=lambda: _FIXED_NOW) port.simulate_connect("2bma:rot:val") appender = _FakeAppendStep() procedure_id = uuid4() principal_id = uuid4() correlation_id = uuid4() - event_id = uuid4() - conductor = _conductor(port, appender, ids=[event_id]) 
+ marker_id = uuid4() + outcome_id = uuid4() + conductor = _conductor(port, appender, ids=[marker_id, outcome_id]) await conductor.execute( procedure_id=procedure_id, principal_id=principal_id, correlation_id=correlation_id, steps=(SetpointStep(address="2bma:rot:val", value=12.5),), ) - assert len(appender.calls) == 1 - call = appender.calls[0] - assert call.command.procedure_id == procedure_id - assert call.principal_id == principal_id - assert call.correlation_id == correlation_id - assert len(call.command.entries) == 1 - entry = call.command.entries[0] - assert entry.event_id == event_id - assert entry.step_kind == "setpoint" - assert entry.sampled_at == _FIXED_NOW - assert entry.occurred_at == _FIXED_NOW - assert entry.payload == { + assert len(appender.calls) == 2 + for call in appender.calls: + assert call.command.procedure_id == procedure_id + assert call.principal_id == principal_id + assert call.correlation_id == correlation_id + assert len(call.command.entries) == 1 + marker = appender.calls[0].command.entries[0] + assert marker.event_id == marker_id + assert marker.step_kind == "setpoint" + assert marker.payload == { + "address": "2bma:rot:val", + "value": 12.5, + "step_index": 0, + "result": "in_flight", + } + outcome = appender.calls[1].command.entries[0] + assert outcome.event_id == outcome_id + assert outcome.step_kind == "setpoint" + assert outcome.sampled_at == _FIXED_NOW + assert outcome.occurred_at == _FIXED_NOW + assert outcome.payload == { "address": "2bma:rot:val", "value": 12.5, "step_index": 0, @@ -254,6 +276,26 @@ async def test_execute_setpoint_records_success_entry_with_expected_payload() -> } +@pytest.mark.unit +async def test_execute_check_records_no_in_flight_marker() -> None: + """A check is a pure read (always safe to re-run), so it records its + single outcome entry only -- no pre-effect in-flight marker.""" + port = InMemoryControlPort() + port.set_reading("2bma:rot:rbv", _good_reading(45.0)) + appender = _FakeAppendStep() + 
conductor = _conductor(port, appender) + await conductor.execute( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=(CheckStep(address="2bma:rot:rbv", criterion=EqualsCriterion(expected=45.0)),), + ) + assert len(appender.calls) == 1 + assert appender.calls[0].command.entries[0].payload["result"] == "ok" + results = [c.command.entries[0].payload["result"] for c in appender.calls] + assert "in_flight" not in results + + @pytest.mark.unit async def test_execute_halts_at_first_not_connected_error_on_setpoint() -> None: """First write raises ControlNotConnectedError -> failure at index 0.""" @@ -279,9 +321,11 @@ async def test_execute_halts_at_first_not_connected_error_on_setpoint() -> None: error_class="ControlNotConnectedError", message="Control address '2bma:rot:val' not connected", ) - # Exactly one failure entry recorded; the second step is untouched. - assert len(appender.calls) == 1 - failure_entry = appender.calls[0].command.entries[0] + # In-flight marker then the failure outcome for step 0; the second + # step is untouched (no marker, no outcome). + assert len(appender.calls) == 2 + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + failure_entry = appender.calls[1].command.entries[0] assert failure_entry.payload["result"] == "failed" assert failure_entry.payload["error_class"] == "ControlNotConnectedError" assert "not connected" in failure_entry.payload["message"] @@ -310,10 +354,11 @@ async def test_execute_records_earlier_setpoint_successes_before_middle_failure( assert result.failure is not None assert result.failure.step_index == 1 assert result.failure.target == "2bma:cam:exposure" - # 2 append calls: one OK at index 0, one FAILED at index 1; index 2 never tried. 
- assert len(appender.calls) == 2 - assert appender.calls[0].command.entries[0].payload["result"] == "ok" - assert appender.calls[1].command.entries[0].payload["result"] == "failed" + # 4 append calls: marker+ok at index 0, marker+failed at index 1; + # index 2 never tried (no marker). + assert len(appender.calls) == 4 + results = [c.command.entries[0].payload["result"] for c in appender.calls] + assert results == ["in_flight", "ok", "in_flight", "failed"] @pytest.mark.unit @@ -336,10 +381,12 @@ async def test_execute_records_step_index_matching_conduct_position() -> None: ) assert result.failure is not None assert result.failure.step_index == 1 - # Success entry at index 0, failure entry at index 1; index 2 never tried. - assert appender.calls[0].command.entries[0].payload["step_index"] == 0 - assert appender.calls[1].command.entries[0].payload["step_index"] == 1 - assert appender.calls[1].command.entries[0].payload["result"] == "failed" + # marker+ok at index 0, marker+failed at index 1; index 2 never tried. + # Every entry (marker and outcome) carries its step's position. + step_indices = [c.command.entries[0].payload["step_index"] for c in appender.calls] + results = [c.command.entries[0].payload["result"] for c in appender.calls] + assert step_indices == [0, 0, 1, 1] + assert results == ["in_flight", "ok", "in_flight", "failed"] @pytest.mark.unit @@ -365,7 +412,12 @@ async def test_execute_passes_through_causation_and_surface_ids() -> None: @pytest.mark.unit async def test_execute_does_not_catch_non_port_exceptions_on_setpoint() -> None: - """A CancelledError mid-write propagates; nothing is recorded for it.""" + """A CancelledError mid-write propagates; the pre-effect in-flight marker + IS recorded (it lands before the write), but no outcome entry follows. + + The marker-without-outcome is exactly the resume substrate: the + interrupted step is identifiable after a crash / cancellation. 
+ """ class _CancellingPort: async def read(self, _address: str) -> Reading: # pragma: no cover # unused @@ -391,7 +443,9 @@ def subscribe(self, _address: str) -> AsyncIterator[Reading]: # pragma: no cove correlation_id=uuid4(), steps=(SetpointStep(address="anywhere", value=1.0),), ) - assert appender.calls == [] + # Only the in-flight marker; the cancelled write recorded no outcome. + assert len(appender.calls) == 1 + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" @pytest.mark.unit @@ -441,7 +495,17 @@ async def home_motor(ctx: ActionContext) -> Mapping[str, Any]: port.simulate_connect("2bma:rot:val") await captured[0].control_port.write("2bma:rot:val", 4.2) assert (await port.read("2bma:rot:val")).value == 4.2 - entry = appender.calls[0].command.entries[0] + # marker (no result_data yet) then the outcome carrying result_data. + assert len(appender.calls) == 2 + marker = appender.calls[0].command.entries[0] + assert marker.step_kind == "action" + assert marker.payload == { + "name": "home_motor", + "params": {"axis": "rot"}, + "step_index": 0, + "result": "in_flight", + } + entry = appender.calls[1].command.entries[0] assert entry.step_kind == "action" assert entry.payload == { "name": "home_motor", @@ -474,9 +538,10 @@ async def test_execute_action_unknown_name_records_failure_and_halts() -> None: assert result.failure.source_kind == "action" assert result.failure.target == "nope" assert result.failure.error_class == "UnknownActionError" - # Only one record (the failure); the second action is untouched. - assert len(appender.calls) == 1 - payload = appender.calls[0].command.entries[0].payload + # marker then the failure outcome; the second action is untouched. 
+ assert len(appender.calls) == 2 + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "failed" assert payload["error_class"] == "UnknownActionError" assert payload["name"] == "nope" @@ -504,14 +569,19 @@ async def picky(_ctx: ActionContext) -> Mapping[str, Any]: assert result.failure.error_class == "ControlTimeoutError" assert result.failure.source_kind == "action" assert result.failure.target == "picky" - payload = appender.calls[0].command.entries[0].payload + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "failed" assert payload["error_class"] == "ControlTimeoutError" @pytest.mark.unit async def test_execute_action_body_raising_non_port_exception_propagates() -> None: - """Generic exceptions in a body propagate; the Conductor does not swallow them.""" + """Generic exceptions in a body propagate; the Conductor does not swallow them. + + The pre-effect in-flight marker IS recorded (it lands before the body + runs), but no outcome entry follows the propagating exception. + """ async def buggy(_ctx: ActionContext) -> Mapping[str, Any]: raise RuntimeError("oops") @@ -527,7 +597,9 @@ async def buggy(_ctx: ActionContext) -> Mapping[str, Any]: correlation_id=uuid4(), steps=(ActionStep(name="buggy"),), ) - assert appender.calls == [] + # Only the in-flight marker; the crashing body recorded no outcome. + assert len(appender.calls) == 1 + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" @pytest.mark.unit @@ -563,9 +635,14 @@ async def close_shutter(_ctx: ActionContext) -> Mapping[str, Any]: assert result.succeeded is True assert result.completed_count == 3 assert invocations == ["open_shutter", "close_shutter"] - # 3 recorded entries in order: action / setpoint / action. 
- kinds = [c.command.entries[0].step_kind for c in appender.calls] - assert kinds == ["action", "setpoint", "action"] + # Outcome entries in order: action / setpoint / action (each preceded + # by its in-flight marker, filtered out here). + outcome_kinds = [ + c.command.entries[0].step_kind + for c in appender.calls + if c.command.entries[0].payload["result"] != "in_flight" + ] + assert outcome_kinds == ["action", "setpoint", "action"] @pytest.mark.unit @@ -820,8 +897,14 @@ async def open_shutter(_ctx: ActionContext) -> Mapping[str, Any]: ) assert result.succeeded is True assert result.completed_count == 3 - kinds = [c.command.entries[0].step_kind for c in appender.calls] - assert kinds == ["setpoint", "action", "check"] + # Outcome entries in order: setpoint / action / check. The setpoint and + # action each prepend an in-flight marker; the check (pure read) does not. + outcome_kinds = [ + c.command.entries[0].step_kind + for c in appender.calls + if c.command.entries[0].payload["result"] != "in_flight" + ] + assert outcome_kinds == ["setpoint", "action", "check"] # --- conduct (FSM lifecycle) coverage ----------------------------------- @@ -1053,7 +1136,8 @@ async def test_setpoint_default_verify_omits_post_reading_from_payload() -> None correlation_id=uuid4(), steps=(SetpointStep(address="2bma:rot:val", value=1.0),), ) - payload = appender.calls[0].command.entries[0].payload + # calls[0] is the in-flight marker; calls[1] is the outcome. + payload = appender.calls[1].command.entries[0].payload assert "post_reading" not in payload assert "post_read_error" not in payload @@ -1071,7 +1155,8 @@ async def test_setpoint_verify_attaches_post_reading_to_payload() -> None: correlation_id=uuid4(), steps=(SetpointStep(address="2bma:rot:val", value=4.2, verify=True),), ) - payload = appender.calls[0].command.entries[0].payload + # calls[0] is the in-flight marker (no post_reading); calls[1] the outcome. 
+ payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "ok" assert payload["post_reading"]["value"] == 4.2 assert payload["post_reading"]["quality"] == "Good" @@ -1125,7 +1210,8 @@ def subscribe(self, _address: str) -> AsyncIterator[Reading]: # pragma: no cove steps=(SetpointStep(address="2bma:rot:val", value=4.2, verify=True),), ) assert result.succeeded is True - payload = appender.calls[0].command.entries[0].payload + # calls[0] is the in-flight marker; calls[1] is the outcome. + payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "ok" assert payload["post_reading"]["quality"] == "Bad" assert payload["post_reading"]["quality_detail"] == "alarm_status=3" @@ -1162,7 +1248,8 @@ def subscribe(self, _address: str) -> AsyncIterator[Reading]: # pragma: no cove steps=(SetpointStep(address="lonely", value=1.0, verify=True),), ) assert result.succeeded is True - payload = appender.calls[0].command.entries[0].payload + # calls[0] is the in-flight marker; calls[1] is the outcome. + payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "ok" assert "post_reading" not in payload assert payload["post_read_error"]["error_class"] == "ControlNotConnectedError" @@ -1182,7 +1269,10 @@ async def test_setpoint_verify_does_not_change_write_failure_halt_behavior() -> steps=(SetpointStep(address="missing", value=1.0, verify=True),), ) assert result.succeeded is False - payload = appender.calls[0].command.entries[0].payload + # calls[0] is the in-flight marker; calls[1] is the failure outcome + # (the write failed before the verify post-read, so no post_reading). 
+ assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + payload = appender.calls[1].command.entries[0].payload assert payload["result"] == "failed" assert payload["error_class"] == "ControlNotConnectedError" assert "post_reading" not in payload @@ -1461,9 +1551,10 @@ async def test_execute_setpoint_via_registry_with_unrouted_address_records_failu assert result.failure is not None assert result.failure.error_class == "NoAdapterForAddressError" assert result.failure.source_kind == "setpoint" - # Recorded in logbook, not propagated as a 500. - assert len(appender.calls) == 1 - assert appender.calls[0].command.entries[0].payload["result"] == "failed" + # Recorded in logbook (marker + failure), not propagated as a 500. + assert len(appender.calls) == 2 + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + assert appender.calls[1].command.entries[0].payload["result"] == "failed" # --- actuation provenance (ActuationKind) ------------------------------- diff --git a/apps/api/tests/unit/operation/test_continuous_action_body.py b/apps/api/tests/unit/operation/test_continuous_action_body.py index 9a913443f1..0055f47df0 100644 --- a/apps/api/tests/unit/operation/test_continuous_action_body.py +++ b/apps/api/tests/unit/operation/test_continuous_action_body.py @@ -406,7 +406,7 @@ async def test_conductor_executes_continuous_action_and_records_step_entry() -> control_port=port, append_step=appender, clock=FakeClock(_FIXED_NOW), - id_generator=_SequenceIdGenerator([uuid4()]), + id_generator=_SequenceIdGenerator([uuid4(), uuid4()]), action_registry=registry, ) result = await conductor.execute( @@ -432,7 +432,9 @@ async def test_conductor_executes_continuous_action_and_records_step_entry() -> ) assert result.succeeded is True assert result.completed_count == 1 - entry = appender.calls[0].command.entries[0] + # calls[0] is the pre-effect in-flight marker; calls[1] is the outcome. 
+ assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + entry = appender.calls[1].command.entries[0] assert entry.step_kind == "action" assert entry.payload["name"] == "continuous" assert entry.payload["result"] == "ok" diff --git a/apps/api/tests/unit/operation/test_discrete_action_body.py b/apps/api/tests/unit/operation/test_discrete_action_body.py index 0e2c9a43de..7397ed8f72 100644 --- a/apps/api/tests/unit/operation/test_discrete_action_body.py +++ b/apps/api/tests/unit/operation/test_discrete_action_body.py @@ -371,7 +371,7 @@ async def test_conductor_executes_discrete_action_and_records_step_entry() -> No control_port=port, append_step=appender, clock=FakeClock(_FIXED_NOW), - id_generator=_SequenceIdGenerator([uuid4()]), + id_generator=_SequenceIdGenerator([uuid4(), uuid4()]), action_registry=registry, ) result = await conductor.execute( @@ -393,7 +393,9 @@ async def test_conductor_executes_discrete_action_and_records_step_entry() -> No ) assert result.succeeded is True assert result.completed_count == 1 - entry = appender.calls[0].command.entries[0] + # calls[0] is the pre-effect in-flight marker; calls[1] is the outcome. + assert appender.calls[0].command.entries[0].payload["result"] == "in_flight" + entry = appender.calls[1].command.entries[0] assert entry.step_kind == "action" assert entry.payload["name"] == "discrete" assert entry.payload["result"] == "ok" From 1ca2b90f18a61873149b548e75dbca73ffae8513 Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 07:48:36 +0300 Subject: [PATCH 02/12] feat(operation): add Held/Resumed pause-resume FSM to Procedure Tier 1 of resumable conduct: an operator can pause a halted Procedure conduct (hold_procedure, Running -> Held) and later resume it (resume_procedure, Held -> Running) rather than abort-and-reseed. Adds the ProcedureHeld / ProcedureResumed events, the HELD status value, the two evolver arms, and both command slices (decider + handler + route + MCP tool + tests). 
Naming: the pause state is `Held`, mirroring RunStatus.HELD, per a naming-r3 verdict. Procedure is an execution-FSM sibling of Run (a dozen "Mirrors Run..." docstrings); Agent's config-FSM `Suspended` was the rejected alternative. This also frees `Suspended` for the data-transfer twin's intervention-required semantics and matches the PackML operator=Held / external-blocker=Suspended split. ProcedureHeld diverges from slim RunHeld by carrying a required reason (a halted-conduct pause is a deliberate, high-information act; mirrors AgentSuspended.reason); ProcedureResumed carries the re_establishment_boundary the Conductor's execute_from will replay from (a boundary, NOT a continuity proof). Adds an AST evolver carry-forward fitness test pinning that every non-genesis arm threads each additive field (the iteration denorms especially) so a new arm cannot silently wipe state on replay. Deferred to follow-up slices (intentional intermediate state): abort / truncate / end_iteration are not yet widened to accept Held (a Held Procedure is momentarily un-abortable; the abort PBT auto-flags Held as a disallowed source until then); resume's off-diagonal guard (refuse while the parent Run is Held) lands next. The proj_operation_procedure_ summary read model is deliberately left unsubscribed (its status CHECK admits only 5 values; a held Procedure reads as Running until a forward-only migration widens it) with a drift-catching test pinning the omission. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/api/openapi.json | 236 +++++++++++++++++- .../aggregates/procedure/__init__.py | 14 ++ .../operation/aggregates/procedure/events.py | 129 +++++++++- .../operation/aggregates/procedure/evolver.py | 51 ++++ .../operation/aggregates/procedure/state.py | 180 +++++++++++-- .../src/cora/operation/features/__init__.py | 13 +- .../features/hold_procedure/__init__.py | 23 ++ .../features/hold_procedure/command.py | 29 +++ .../features/hold_procedure/decider.py | 57 +++++ .../features/hold_procedure/handler.py | 45 ++++ .../features/hold_procedure/route.py | 91 +++++++ .../operation/features/hold_procedure/tool.py | 52 ++++ .../features/resume_procedure/__init__.py | 23 ++ .../features/resume_procedure/command.py | 27 ++ .../features/resume_procedure/decider.py | 56 +++++ .../features/resume_procedure/handler.py | 45 ++++ .../features/resume_procedure/route.py | 92 +++++++ .../features/resume_procedure/tool.py | 56 +++++ apps/api/src/cora/operation/routes.py | 14 ++ apps/api/src/cora/operation/tools.py | 10 + apps/api/src/cora/operation/wire.py | 14 ++ .../test_procedure_evolver_carry_forward.py | 158 ++++++++++++ .../contract/test_hold_procedure_endpoint.py | 100 ++++++++ .../contract/test_hold_procedure_mcp_tool.py | 73 ++++++ .../test_resume_procedure_endpoint.py | 88 +++++++ .../test_resume_procedure_mcp_tool.py | 86 +++++++ .../operation/test_hold_procedure_decider.py | 150 +++++++++++ .../test_hold_procedure_decider_properties.py | 167 +++++++++++++ .../operation/test_hold_procedure_handler.py | 125 ++++++++++ .../tests/unit/operation/test_procedure.py | 12 +- .../unit/operation/test_procedure_events.py | 89 +++++++ .../unit/operation/test_procedure_evolver.py | 122 +++++++++ .../test_procedure_summary_projection.py | 16 ++ .../test_resume_procedure_decider.py | 140 +++++++++++ ...est_resume_procedure_decider_properties.py | 199 +++++++++++++++ .../test_resume_procedure_handler.py | 143 +++++++++++ 36 files changed, 
2895 insertions(+), 30 deletions(-) create mode 100644 apps/api/src/cora/operation/features/hold_procedure/__init__.py create mode 100644 apps/api/src/cora/operation/features/hold_procedure/command.py create mode 100644 apps/api/src/cora/operation/features/hold_procedure/decider.py create mode 100644 apps/api/src/cora/operation/features/hold_procedure/handler.py create mode 100644 apps/api/src/cora/operation/features/hold_procedure/route.py create mode 100644 apps/api/src/cora/operation/features/hold_procedure/tool.py create mode 100644 apps/api/src/cora/operation/features/resume_procedure/__init__.py create mode 100644 apps/api/src/cora/operation/features/resume_procedure/command.py create mode 100644 apps/api/src/cora/operation/features/resume_procedure/decider.py create mode 100644 apps/api/src/cora/operation/features/resume_procedure/handler.py create mode 100644 apps/api/src/cora/operation/features/resume_procedure/route.py create mode 100644 apps/api/src/cora/operation/features/resume_procedure/tool.py create mode 100644 apps/api/tests/architecture/test_procedure_evolver_carry_forward.py create mode 100644 apps/api/tests/contract/test_hold_procedure_endpoint.py create mode 100644 apps/api/tests/contract/test_hold_procedure_mcp_tool.py create mode 100644 apps/api/tests/contract/test_resume_procedure_endpoint.py create mode 100644 apps/api/tests/contract/test_resume_procedure_mcp_tool.py create mode 100644 apps/api/tests/unit/operation/test_hold_procedure_decider.py create mode 100644 apps/api/tests/unit/operation/test_hold_procedure_decider_properties.py create mode 100644 apps/api/tests/unit/operation/test_hold_procedure_handler.py create mode 100644 apps/api/tests/unit/operation/test_resume_procedure_decider.py create mode 100644 apps/api/tests/unit/operation/test_resume_procedure_decider_properties.py create mode 100644 apps/api/tests/unit/operation/test_resume_procedure_handler.py diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 
3990bc39ea..f9f066d769 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -7011,6 +7011,23 @@ "title": "HoldCampaignRequest", "type": "object" }, + "HoldProcedureRequest": { + "description": "Body for `POST /procedures/{procedure_id}/hold`.", + "properties": { + "reason": { + "description": "Free-form reason for the hold (1-500 chars after trimming). Required: pausing a halted conduct is a deliberate operator act (unlike a routine RunHeld, which carries no reason).", + "maxLength": 500, + "minLength": 1, + "title": "Reason", + "type": "string" + } + }, + "required": [ + "reason" + ], + "title": "HoldProcedureRequest", + "type": "object" + }, "HoldVisitRequest": { "description": "Body for `POST /visits/{visit_id}/hold`.\n\n`reason` is operator-supplied free text (audit-log breadcrumb).\nExamples: \"beam dump\", \"equipment fault\", \"safety hold pending\nradiation door reset\", \"extended user break\". MUST NOT contain PII.", "properties": { @@ -9606,10 +9623,11 @@ "type": "object" }, "ProcedureStatus": { - "description": "The Procedure's lifecycle state.\n\nFive values declared day one for forward-compat\n(additive-state pattern; legacy events fold cleanly because\nonly DEFINED is reachable after register_procedure):\n\n - `Defined` -- registration-time genesis; pre-execution.\n Operator can edit / inspect / submit for\n review (future Decision BC integration).\n Cannot accept step events yet.\n - `Running` -- post-start_procedure. Step events accepted\n via append_activities.\n - `Completed` -- happy path via complete_procedure.\n Strict-not-idempotent.\n - `Aborted` -- emergency exit via abort_procedure.\n - `Truncated` -- retroactive cleanup via truncate_procedure.\n Mirrors RunTruncated.\n\n`Verifying` and `Held / Resumed` are deliberately NOT in this\nenum. 
Per [[project_operation_design]] standards-corpus research:\n`Verifying` is NOT standards-blessed at FSM level (PackML uses\n`Completing` for closeout/check work; OPC UA Programs has no\nVerify state). Per-step Check happens within Running synchronously\n(via the Step logbook's check_passed field). Held / Resumed\ndeferred until pilot operator feedback surfaces a need.\n\nNaming convention (per Run BC gate review): gerund /\nadjective for active steady-states (matches PackML / Bluesky);\npast-participle for terminals. `Defined` is past-participle (a\nprocedure WAS defined); `Running` is gerund-as-adjective; the\nrest are past-participle terminals.\n\nEnum values are PascalCase strings (matches BC-map status\nvocabulary; log lines and DTOs read naturally without mapping).", + "description": "The Procedure's lifecycle state.\n\nSix values declared for forward-compat (additive-state pattern;\nlegacy events fold cleanly because only DEFINED is reachable after\nregister_procedure):\n\n - `Defined` -- registration-time genesis; pre-execution.\n Operator can edit / inspect / submit for\n review (future Decision BC integration).\n Cannot accept step events yet.\n - `Running` -- post-start_procedure. Step events accepted\n via append_activities.\n - `Held` -- operator-paused mid-conduct via hold_procedure\n (Running <-> Held, resumable via\n resume_procedure). The resumable-conduct\n pause state; mirrors `RunStatus.HELD`. No step\n events accepted while Held; the conduct is\n paused, not advancing.\n - `Completed` -- happy path via complete_procedure.\n Strict-not-idempotent.\n - `Aborted` -- emergency exit via abort_procedure.\n - `Truncated` -- retroactive cleanup via truncate_procedure.\n Mirrors RunTruncated.\n\n`Verifying` is deliberately NOT in this enum. Per\n[[project_operation_design]] standards-corpus research: `Verifying`\nis NOT standards-blessed at FSM level (PackML uses `Completing` for\ncloseout/check work; OPC UA Programs has no Verify state). 
Per-step\nCheck happens within Running synchronously (via the Step logbook's\ncheck_passed field).\n\n`Held` lands in Tier 1 of [[project_resumable_conduct_design]]:\noperator-pause of a halted conduct, additive to the Layer-1 FSM,\nmirroring `RunStatus.HELD` (Procedure is an execution-FSM sibling of\nRun). The PackML operator=`Held` / external-blocker=`Suspended`\nsplit is honored: this is the operator-pause, so `Held`, not\n`Suspended`. The `HOLDING` / `RESTARTING` transient states are\ndeliberately omitted (Run-precedent deferral).\n\nNaming convention (per Run BC gate review): gerund /\nadjective for active steady-states (matches PackML / Bluesky);\npast-participle for the pause-state and terminals. `Defined` is\npast-participle (a procedure WAS defined); `Running` is\ngerund-as-adjective; `Held` is past-participle (mirrors\n`RunStatus.HELD`); the rest are past-participle terminals.\n\nEnum values are PascalCase strings (matches BC-map status\nvocabulary; log lines and DTOs read naturally without mapping).", "enum": [ "Defined", "Running", + "Held", "Completed", "Aborted", "Truncated" @@ -12446,6 +12464,22 @@ "title": "RestoreSupplyRequest", "type": "object" }, + "ResumeProcedureRequest": { + "description": "Body for `POST /procedures/{procedure_id}/resume`.", + "properties": { + "re_establishment_boundary": { + "description": "Index in the pinned conduct manifest from which the resume re-drives setpoints and re-runs checks. >= 0 (0 = re-establish from the first step). 
NOT a continuity proof.", + "minimum": 0.0, + "title": "Re Establishment Boundary", + "type": "integer" + } + }, + "required": [ + "re_establishment_boundary" + ], + "title": "ResumeProcedureRequest", + "type": "object" + }, "RetireCautionRequest": { "description": "Body for `POST /cautions/{caution_id}/retire`.", "properties": { @@ -36641,6 +36675,106 @@ ] } }, + "/procedures/{procedure_id}/hold": { + "post": { + "operationId": "post_procedures_hold_procedures__procedure_id__hold_post", + "parameters": [ + { + "description": "Target procedure's id.", + "in": "path", + "name": "procedure_id", + "required": true, + "schema": { + "description": "Target procedure's id.", + "format": "uuid", + "title": "Procedure Id", + "type": "string" + } + }, + { + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "in": "header", + "name": "X-Principal-Id", + "required": false, + "schema": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. 
When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "title": "X-Principal-Id" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HoldProcedureRequest" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "Successful Response" + }, + "400": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Domain invariant violated: whitespace-only reason." + }, + "403": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Authorize port denied the command." + }, + "404": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "No procedure exists with the given id." + }, + "409": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Procedure is not in `Running` status (hold requires `Running`; holding a `Defined` / `Held` / terminal procedure raises), OR a concurrent write to the same procedure stream conflicted (optimistic concurrency)." + }, + "422": { + "description": "Path parameter or request body failed schema validation." 
+ } + }, + "summary": "Pause an actively-running Procedure conduct (Running -> Held)", + "tags": [ + "operation" + ] + } + }, "/procedures/{procedure_id}/iterations": { "get": { "operationId": "list_procedure_iterations_procedures__procedure_id__iterations_get", @@ -36898,6 +37032,106 @@ ] } }, + "/procedures/{procedure_id}/resume": { + "post": { + "operationId": "post_procedures_resume_procedures__procedure_id__resume_post", + "parameters": [ + { + "description": "Target procedure's id.", + "in": "path", + "name": "procedure_id", + "required": true, + "schema": { + "description": "Target procedure's id.", + "format": "uuid", + "title": "Procedure Id", + "type": "string" + } + }, + { + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "in": "header", + "name": "X-Principal-Id", + "required": false, + "schema": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. 
When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "title": "X-Principal-Id" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ResumeProcedureRequest" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "Successful Response" + }, + "400": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Domain invariant violated: negative re_establishment_boundary." + }, + "403": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Authorize port denied the command." + }, + "404": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "No procedure exists with the given id." + }, + "409": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Procedure is not in `Held` status (resume requires `Held`; resuming a `Running` / `Defined` / terminal procedure raises), OR a concurrent write to the same procedure stream conflicted (optimistic concurrency)." + }, + "422": { + "description": "Path parameter or request body failed schema validation." 
+ } + }, + "summary": "Resume a held Procedure conduct (Held -> Running)", + "tags": [ + "operation" + ] + } + }, "/procedures/{procedure_id}/start": { "post": { "operationId": "post_procedures_start_procedures__procedure_id__start_post", diff --git a/apps/api/src/cora/operation/aggregates/procedure/__init__.py b/apps/api/src/cora/operation/aggregates/procedure/__init__.py index 227f068df7..759c01b78d 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/__init__.py +++ b/apps/api/src/cora/operation/aggregates/procedure/__init__.py @@ -21,9 +21,11 @@ ProcedureActivitiesLogbookOpened, ProcedureCompleted, ProcedureEvent, + ProcedureHeld, ProcedureIterationEnded, ProcedureIterationStarted, ProcedureRegistered, + ProcedureResumed, ProcedureStarted, ProcedureTruncated, RecipeExpansionRecorded, @@ -45,11 +47,13 @@ STEP_KIND_VALUES, STEPS_LOGBOOK_SCHEMA, InvalidProcedureAbortReasonError, + InvalidProcedureHoldReasonError, InvalidProcedureInterruptedAtError, InvalidProcedureIterationCapError, InvalidProcedureIterationEndReasonError, InvalidProcedureKindError, InvalidProcedureNameError, + InvalidProcedureReEstablishmentBoundaryError, InvalidProcedureTruncateReasonError, InvalidRecipeBindingsError, InvalidStepKindError, @@ -61,11 +65,14 @@ ProcedureCannotAbortError, ProcedureCannotCompleteError, ProcedureCannotEndIterationError, + ProcedureCannotHoldError, + ProcedureCannotResumeError, ProcedureCannotStartError, ProcedureCannotStartIterationError, ProcedureCannotTruncateError, ProcedureCapabilityExecutorMismatchError, ProcedureEnclosureCoverageMismatchError, + ProcedureHoldReason, ProcedureIterationLimitReachedError, ProcedureName, ProcedureNotFoundError, @@ -98,11 +105,13 @@ "ActivityStore", "InMemoryActivityStore", "InvalidProcedureAbortReasonError", + "InvalidProcedureHoldReasonError", "InvalidProcedureInterruptedAtError", "InvalidProcedureIterationCapError", "InvalidProcedureIterationEndReasonError", "InvalidProcedureKindError", "InvalidProcedureNameError", + 
"InvalidProcedureReEstablishmentBoundaryError", "InvalidProcedureTruncateReasonError", "InvalidRecipeBindingsError", "InvalidStepKindError", @@ -117,6 +126,8 @@ "ProcedureCannotAbortError", "ProcedureCannotCompleteError", "ProcedureCannotEndIterationError", + "ProcedureCannotHoldError", + "ProcedureCannotResumeError", "ProcedureCannotStartError", "ProcedureCannotStartIterationError", "ProcedureCannotTruncateError", @@ -124,6 +135,8 @@ "ProcedureCompleted", "ProcedureEnclosureCoverageMismatchError", "ProcedureEvent", + "ProcedureHeld", + "ProcedureHoldReason", "ProcedureIterationEnded", "ProcedureIterationLimitReachedError", "ProcedureIterationStarted", @@ -134,6 +147,7 @@ "ProcedureRequiresAvailableSupplyError", "ProcedureRequiresOpenBeamShuttersError", "ProcedureRequiresPermittedEnclosureError", + "ProcedureResumed", "ProcedureStarted", "ProcedureStatus", "ProcedureStepsForbiddenForRecipeDrivenError", diff --git a/apps/api/src/cora/operation/aggregates/procedure/events.py b/apps/api/src/cora/operation/aggregates/procedure/events.py index 60150889cb..958d5f4ed5 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/events.py +++ b/apps/api/src/cora/operation/aggregates/procedure/events.py @@ -23,8 +23,10 @@ `ProcedureActivitiesLogbookOpened` is the lazy envelope event for the per-step logbook table. `ProcedureTruncated` mirrors RunTruncated. -`ProcedureHeld` / `ProcedureResumed` are deferred until the pilot -needs the surface. +`ProcedureHeld` (Running -> Held) / `ProcedureResumed` (Held -> Running) +are the operator-pause / resume pair for resumable conduct (Tier 1 of +[[project_resumable_conduct_design]]); the state name mirrors +`RunStatus.HELD`. 
`ProcedureIterationStarted` / `ProcedureIterationEnded` are the first-class boundary pair for the convergence-driven iteration loop @@ -323,6 +325,69 @@ class ProcedureAborted: actuation_kind: str | None = None +@dataclass(frozen=True) +class ProcedureHeld: + """A Procedure conduct was operator-paused (Running -> Held). + + Tier 1 of [[project_resumable_conduct_design]]: the operator pauses + a halted conduct so it can be re-established and resumed later rather + than aborted-and-reseeded. Additive to the Layer-1 FSM; the state + name mirrors `RunStatus.HELD` (Procedure is an execution-FSM sibling + of Run). + + `reason` is a free-form string (1-500 chars after trimming), captured + verbatim. REQUIRED, unlike `RunHeld` (slim, no reason: a routine Run + pause): pausing a halted conduct is a deliberate, high-information + operator act, matching `AgentSuspended.reason`. Same future-additive + structured-taxonomy posture as `ProcedureAborted.reason`. + + `decided_by_decision_id` mirrors `RunHeld`: optional Decision-causation + link to the Decision BC record that justified this hold. None for + operator-routed holds; set when an in-process agent runtime issues the + hold. NO existence check per the cross-BC eventual-consistency stance. + Forward-compat via `payload.get("decided_by_decision_id")` -> None. + + Status is NOT carried (the event type encodes the transition); the + evolver maps `ProcedureHeld -> HELD`. + """ + + procedure_id: UUID + reason: str + occurred_at: datetime + decided_by_decision_id: UUID | None = None + + +@dataclass(frozen=True) +class ProcedureResumed: + """A held Procedure conduct was resumed (Held -> Running). + + Inverse of `ProcedureHeld`. Mirrors `RunResumed`. Hold <-> Resume is + bidirectional and unlimited-cycle within one conduct. 
+ + `re_establishment_boundary` is the index in the pinned conduct + manifest from which resume re-drives setpoints + re-runs checks (NOT + a continuity proof; the pre-effect in-flight marker is the only + continuity fact the aggregate owns). It is `>= 0`; the Conductor's + `execute_from` consumes it to replay the pinned manifest tail. Per + [[project_resumable_conduct_design]] the field is the + re-establishment boundary, deliberately NOT a "verified continuity" + claim. + + `decided_by_decision_id` mirrors `RunResumed`: optional + Decision-causation link; None for operator-routed resumes, set when + an in-process agent runtime issues an autonomous resume. NO existence + check (cross-BC eventual-consistency). Forward-compat via + `payload.get("decided_by_decision_id")` -> None. + + Status is NOT carried; the evolver maps `ProcedureResumed -> RUNNING`. + """ + + procedure_id: UUID + re_establishment_boundary: int + occurred_at: datetime + decided_by_decision_id: UUID | None = None + + @dataclass(frozen=True) class ProcedureIterationStarted: """One convergence-loop iteration began on a Running Procedure. 
@@ -418,6 +483,8 @@ class ResolvedStepsRecorded: | ProcedureCompleted | ProcedureAborted | ProcedureTruncated + | ProcedureHeld + | ProcedureResumed | ProcedureActivitiesLogbookOpened | ProcedureIterationStarted | ProcedureIterationEnded @@ -514,6 +581,34 @@ def to_payload(event: ProcedureEvent) -> dict[str, Any]: "interrupted_at": interrupted_at_iso, "occurred_at": occurred_at.isoformat(), } + case ProcedureHeld( + procedure_id=procedure_id, + reason=reason, + occurred_at=occurred_at, + decided_by_decision_id=decided_by_decision_id, + ): + return { + "procedure_id": str(procedure_id), + "reason": reason, + "decided_by_decision_id": ( + str(decided_by_decision_id) if decided_by_decision_id is not None else None + ), + "occurred_at": occurred_at.isoformat(), + } + case ProcedureResumed( + procedure_id=procedure_id, + re_establishment_boundary=re_establishment_boundary, + occurred_at=occurred_at, + decided_by_decision_id=decided_by_decision_id, + ): + return { + "procedure_id": str(procedure_id), + "re_establishment_boundary": re_establishment_boundary, + "decided_by_decision_id": ( + str(decided_by_decision_id) if decided_by_decision_id is not None else None + ), + "occurred_at": occurred_at.isoformat(), + } case ProcedureActivitiesLogbookOpened( procedure_id=procedure_id, logbook_id=logbook_id, @@ -690,6 +785,34 @@ def _build_truncated() -> ProcedureTruncated: ) return deserialize_or_raise("ProcedureTruncated", _build_truncated) + case "ProcedureHeld": + + def _build_held() -> ProcedureHeld: + raw_decided_by = payload.get("decided_by_decision_id") + return ProcedureHeld( + procedure_id=UUID(payload["procedure_id"]), + reason=payload["reason"], + decided_by_decision_id=( + UUID(raw_decided_by) if raw_decided_by is not None else None + ), + occurred_at=datetime.fromisoformat(payload["occurred_at"]), + ) + + return deserialize_or_raise("ProcedureHeld", _build_held) + case "ProcedureResumed": + + def _build_resumed() -> ProcedureResumed: + raw_decided_by = 
payload.get("decided_by_decision_id") + return ProcedureResumed( + procedure_id=UUID(payload["procedure_id"]), + re_establishment_boundary=int(payload["re_establishment_boundary"]), + decided_by_decision_id=( + UUID(raw_decided_by) if raw_decided_by is not None else None + ), + occurred_at=datetime.fromisoformat(payload["occurred_at"]), + ) + + return deserialize_or_raise("ProcedureResumed", _build_resumed) case "ProcedureActivitiesLogbookOpened": return deserialize_or_raise( "ProcedureActivitiesLogbookOpened", @@ -759,9 +882,11 @@ def _build_truncated() -> ProcedureTruncated: "ProcedureActivitiesLogbookOpened", "ProcedureCompleted", "ProcedureEvent", + "ProcedureHeld", "ProcedureIterationEnded", "ProcedureIterationStarted", "ProcedureRegistered", + "ProcedureResumed", "ProcedureStarted", "ProcedureTruncated", "RecipeExpansionRecorded", diff --git a/apps/api/src/cora/operation/aggregates/procedure/evolver.py b/apps/api/src/cora/operation/aggregates/procedure/evolver.py index 95548d6c99..d8f2b56238 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/evolver.py +++ b/apps/api/src/cora/operation/aggregates/procedure/evolver.py @@ -10,6 +10,8 @@ - `ProcedureCompleted` -> COMPLETED (happy-path terminal) - `ProcedureAborted` -> ABORTED (emergency-exit terminal) - `ProcedureTruncated` -> TRUNCATED (partial-data terminal; mirrors RunTruncated) + - `ProcedureHeld` -> HELD (operator-pause; mirrors RunHeld) + - `ProcedureResumed` -> RUNNING (resume from Held; mirrors RunResumed) - `ProcedureActivitiesLogbookOpened` -> STATUS UNCHANGED (sets activity_logbook_id; lazy-open envelope event from append_activities, orthogonal to lifecycle) @@ -67,9 +69,11 @@ ProcedureActivitiesLogbookOpened, ProcedureCompleted, ProcedureEvent, + ProcedureHeld, ProcedureIterationEnded, ProcedureIterationStarted, ProcedureRegistered, + ProcedureResumed, ProcedureStarted, ProcedureTruncated, RecipeExpansionRecorded, @@ -193,6 +197,53 @@ def evolve(state: Procedure | None, event: 
ProcedureEvent) -> Procedure: ), actuation_kind=prior.actuation_kind, ) + case ProcedureHeld(): + # Operator-pause transition (Running -> Held). Status-only + # change; every non-status field carries verbatim from prior + # (especially the iteration denorms). Mirrors RunHeld. + prior = require_state(state, "ProcedureHeld") + return Procedure( + id=prior.id, + name=prior.name, + kind=prior.kind, + target_asset_ids=prior.target_asset_ids, + status=ProcedureStatus.HELD, + parent_run_id=prior.parent_run_id, + activity_logbook_id=prior.activity_logbook_id, + capability_id=prior.capability_id, + recipe_id=prior.recipe_id, + current_iteration_index=prior.current_iteration_index, + iteration_count=prior.iteration_count, + consecutive_unconverged_iterations=prior.consecutive_unconverged_iterations, + max_consecutive_unconverged_iterations=( + prior.max_consecutive_unconverged_iterations + ), + actuation_kind=prior.actuation_kind, + ) + case ProcedureResumed(): + # Resume transition (Held -> Running). Status-only change; every + # non-status field carries verbatim from prior. The + # re_establishment_boundary rides the event for the Conductor's + # replay, not folded into state. Mirrors RunResumed. 
+ prior = require_state(state, "ProcedureResumed") + return Procedure( + id=prior.id, + name=prior.name, + kind=prior.kind, + target_asset_ids=prior.target_asset_ids, + status=ProcedureStatus.RUNNING, + parent_run_id=prior.parent_run_id, + activity_logbook_id=prior.activity_logbook_id, + capability_id=prior.capability_id, + recipe_id=prior.recipe_id, + current_iteration_index=prior.current_iteration_index, + iteration_count=prior.iteration_count, + consecutive_unconverged_iterations=prior.consecutive_unconverged_iterations, + max_consecutive_unconverged_iterations=( + prior.max_consecutive_unconverged_iterations + ), + actuation_kind=prior.actuation_kind, + ) case ProcedureActivitiesLogbookOpened(logbook_id=logbook_id): # Lazy open-on-first-write: preserve all # prior state, set activity_logbook_id. Status NOT touched -- the diff --git a/apps/api/src/cora/operation/aggregates/procedure/state.py b/apps/api/src/cora/operation/aggregates/procedure/state.py index 4f8bf8367c..dfb4837205 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/state.py +++ b/apps/api/src/cora/operation/aggregates/procedure/state.py @@ -22,9 +22,9 @@ Full FSM (Running / Completed / Aborted / Truncated transitions) + per-step logbook follow. Projection + list_procedures follow. -## ProcedureStatus FSM (locked initial) +## ProcedureStatus FSM - Defined -> Running -> Completed | Aborted | Truncated + Defined -> Running <-> Held -> Completed | Aborted | Truncated REVISED from BC map's `Idle -> Starting -> Running -> Verifying -> Complete | Aborted` per the standards-corpus research at @@ -32,7 +32,9 @@ at FSM level (PackML uses `Completing` for closeout/check work; OPC UA Programs has no Verify state); per-step Check happens within Running; transient states deferred until real async window appears -(Run BC precedent). Held/Resumed deferred to 10c-c per pilot need. +(Run BC precedent). 
`Held` is the operator-pause state for resumable +conduct (Tier 1 of [[project_resumable_conduct_design]]; mirrors +`RunStatus.HELD`). ## Status as enum-in-state, derived-from-event-type-in-evolver @@ -180,9 +182,9 @@ class ProcedureStatus(StrEnum): """The Procedure's lifecycle state. - Five values declared day one for forward-compat - (additive-state pattern; legacy events fold cleanly because - only DEFINED is reachable after register_procedure): + Six values declared for forward-compat (additive-state pattern; + legacy events fold cleanly because only DEFINED is reachable after + register_procedure): - `Defined` -- registration-time genesis; pre-execution. Operator can edit / inspect / submit for @@ -190,25 +192,39 @@ class ProcedureStatus(StrEnum): Cannot accept step events yet. - `Running` -- post-start_procedure. Step events accepted via append_activities. + - `Held` -- operator-paused mid-conduct via hold_procedure + (Running <-> Held, resumable via + resume_procedure). The resumable-conduct + pause state; mirrors `RunStatus.HELD`. No step + events accepted while Held; the conduct is + paused, not advancing. - `Completed` -- happy path via complete_procedure. Strict-not-idempotent. - `Aborted` -- emergency exit via abort_procedure. - `Truncated` -- retroactive cleanup via truncate_procedure. Mirrors RunTruncated. - `Verifying` and `Held / Resumed` are deliberately NOT in this - enum. Per [[project_operation_design]] standards-corpus research: - `Verifying` is NOT standards-blessed at FSM level (PackML uses - `Completing` for closeout/check work; OPC UA Programs has no - Verify state). Per-step Check happens within Running synchronously - (via the Step logbook's check_passed field). Held / Resumed - deferred until pilot operator feedback surfaces a need. + `Verifying` is deliberately NOT in this enum. 
Per + [[project_operation_design]] standards-corpus research: `Verifying` + is NOT standards-blessed at FSM level (PackML uses `Completing` for + closeout/check work; OPC UA Programs has no Verify state). Per-step + Check happens within Running synchronously (via the Step logbook's + check_passed field). + + `Held` lands in Tier 1 of [[project_resumable_conduct_design]]: + operator-pause of a halted conduct, additive to the Layer-1 FSM, + mirroring `RunStatus.HELD` (Procedure is an execution-FSM sibling of + Run). The PackML operator=`Held` / external-blocker=`Suspended` + split is honored: this is the operator-pause, so `Held`, not + `Suspended`. The `HOLDING` / `RESTARTING` transient states are + deliberately omitted (Run-precedent deferral). Naming convention (per Run BC gate review): gerund / adjective for active steady-states (matches PackML / Bluesky); - past-participle for terminals. `Defined` is past-participle (a - procedure WAS defined); `Running` is gerund-as-adjective; the - rest are past-participle terminals. + past-participle for the pause-state and terminals. `Defined` is + past-participle (a procedure WAS defined); `Running` is + gerund-as-adjective; `Held` is past-participle (mirrors + `RunStatus.HELD`); the rest are past-participle terminals. Enum values are PascalCase strings (matches BC-map status vocabulary; log lines and DTOs read naturally without mapping). @@ -216,6 +232,7 @@ class ProcedureStatus(StrEnum): DEFINED = "Defined" RUNNING = "Running" + HELD = "Held" COMPLETED = "Completed" ABORTED = "Aborted" TRUNCATED = "Truncated" @@ -746,12 +763,14 @@ def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> Non class ProcedureCannotAbortError(Exception): """Attempted to abort a Procedure not in `Running`. - Single-source guard: `abort_procedure` accepts only `Running` (no - Held state in the Procedure FSM today; deferred to 10c-c per pilot - need). 
Aborting a `Defined` Procedure raises (use a different - workflow, for example: never start it, then leave it Defined or - extend the FSM with a cancel-defined slice if real); aborting any - terminal raises (strict-not-idempotent). Mapped to HTTP 409. + Single-source guard: `abort_procedure` accepts only `Running` today. + The `Held` pause-state now exists (resumable conduct); widening the + abort source set to `Running | Held` so a paused Procedure stays + abortable is a follow-up slice (the abort PBT auto-includes `Held` as + a disallowed source until then). Aborting a `Defined` Procedure raises + (use a different workflow, for example: never start it, then leave it + Defined or extend the FSM with a cancel-defined slice if real); + aborting any terminal raises (strict-not-idempotent). Mapped to HTTP 409. """ def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> None: @@ -767,7 +786,8 @@ class ProcedureCannotTruncateError(Exception): """Attempted to truncate a Procedure not in `Running`. Single-source guard: `truncate_procedure` accepts only `Running` - today (Held/Resumed deferred to future iteration). Mirrors + today; widening to `Running | Held` (so a paused Procedure can be + closed retroactively) is a follow-up slice alongside abort. Mirrors `ProcedureCannotAbortError`'s source set: a Defined Procedure hasn't started so there's no execution to truncate; terminal Procedures are already closed (re-truncating a `Truncated` @@ -789,6 +809,53 @@ def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> Non self.current_status = current_status +class ProcedureCannotHoldError(Exception): + """Attempted to hold a Procedure not in `Running`. + + Single-source guard: `hold_procedure` accepts only `Running`. + Re-holding an already-`Held` Procedure raises (strict-not- + idempotent); holding a `Defined` or terminal Procedure raises. + Mirrors `RunCannotHoldError`. 
Hold <-> Resume is bidirectional and + unlimited-cycle: an operator can hold -> resume -> hold repeatedly + within one conduct, each hold requiring an intervening resume. + Mapped to HTTP 409. + """ + + def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> None: + super().__init__( + f"Procedure {procedure_id} cannot be held: currently in status " + f"{current_status.value}, hold requires {ProcedureStatus.RUNNING.value}" + ) + self.procedure_id = procedure_id + self.current_status = current_status + + +class ProcedureCannotResumeError(Exception): + """Attempted to resume a Procedure that cannot be resumed. + + Two refusal reasons, both HTTP 409: + - status guard: `resume_procedure` accepts only `Held` (the + inverse of hold, which requires `Running`). Resuming an + already-`Running` Procedure raises (strict-not-idempotent); + resuming a `Defined` or terminal Procedure raises. Mirrors + `RunCannotResumeError`. + - off-diagonal guard (added in a later slice): a Held Procedure + whose parent Run is itself `Held` cannot resume to `Running` + and walk real setpoints while the Run is paused. The + one-directional Operation -> Run read enforces this; there is + NO cascade from Run-resume into Procedure-resume (that is a + Layer-3 saga, deferred). See [[project_resumable_conduct_design]]. + """ + + def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> None: + super().__init__( + f"Procedure {procedure_id} cannot be resumed: currently in status " + f"{current_status.value}, resume requires {ProcedureStatus.HELD.value}" + ) + self.procedure_id = procedure_id + self.current_status = current_status + + class ProcedureCannotStartIterationError(Exception): """Attempted to start an iteration that fails a start-gate. @@ -1043,6 +1110,50 @@ def __init__(self, value: str) -> None: self.value = value +class InvalidProcedureHoldReasonError(ValueError): + """The supplied hold reason is empty, whitespace-only, or too long. 
+ + Validated at the API boundary via Pydantic min_length / max_length, + AND defensively at the decider via the `ProcedureHoldReason` VO so + direct in-process callers (sagas, tests) get the same protection. + Sibling of `InvalidProcedureAbortReasonError`; distinct class for + BC-local HTTP-status registration. Mapped to HTTP 400. + + Unlike `RunHeld` (slim, no reason: a routine Run pause), a Procedure + hold carries a required reason because pausing a halted conduct is a + deliberate, high-information operator act (matching + `AgentSuspended.reason` and [[project_resumable_conduct_design]]). + The state NAME mirrors Run (`Held`); the reason payload follows the + operator-pause-with-context precedent. + """ + + def __init__(self, value: str) -> None: + super().__init__( + f"Procedure hold reason must be 1-{REASON_MAX_LENGTH} chars " + f"after trimming (got: {value!r})" + ) + self.value = value + + +class InvalidProcedureReEstablishmentBoundaryError(ValueError): + """The supplied resume re-establishment boundary is negative. + + `re_establishment_boundary` is the index in the pinned conduct + manifest from which resume re-drives setpoints + re-runs checks. It + must be >= 0 (a step position; 0 means re-establish from the very + first step). Validated at the API boundary via Pydantic `ge=0` AND + defensively at the `resume_procedure` decider. The upper bound + (boundary vs manifest length) is enforced by the Conductor's + `execute_from` replay, not the decider (the manifest is not folded + into Procedure state). Mapped to HTTP 400. See + [[project_resumable_conduct_design]]. 
+ """ + + def __init__(self, value: int) -> None: + super().__init__(f"re_establishment_boundary must be >= 0 (got: {value})") + self.value = value + + @bounded_name( max_length=PROCEDURE_NAME_MAX_LENGTH, error_class=InvalidProcedureNameError, @@ -1099,6 +1210,29 @@ def __post_init__(self) -> None: object.__setattr__(self, "value", trimmed) +@dataclass(frozen=True) +class ProcedureHoldReason: + """Free-form hold reason. Trimmed; 1-500 chars. + + Sibling of `ProcedureAbortReason`; same shape (trimmed + bounded), + distinct class for BC-local HTTP-status registration. The + on-the-wire representation in `ProcedureHeld.reason` is `str` + (post-trim); the VO exists at decider-input time only. A Procedure + hold REQUIRES a reason (unlike Run's slim `RunHeld`); see + `InvalidProcedureHoldReasonError`. + """ + + value: str + + def __post_init__(self) -> None: + trimmed = validate_bounded_text( + self.value, + max_length=REASON_MAX_LENGTH, + error_class=InvalidProcedureHoldReasonError, + ) + object.__setattr__(self, "value", trimmed) + + @dataclass(frozen=True) class Procedure: """Aggregate root: one execution of an episodic operational task. 
diff --git a/apps/api/src/cora/operation/features/__init__.py b/apps/api/src/cora/operation/features/__init__.py index 3d2669bf13..4ed93561f5 100644 --- a/apps/api/src/cora/operation/features/__init__.py +++ b/apps/api/src/cora/operation/features/__init__.py @@ -25,11 +25,18 @@ - `truncate_procedure` (Running -> Truncated; partial-data terminal mirroring RunTruncated; reason + optional interrupted_at) +Resumable-conduct pause/resume pair (Tier 1 of +[[project_resumable_conduct_design]]; the state name mirrors +`RunStatus.HELD`): + - `hold_procedure` (Running -> Held; operator-pause of a halted + conduct, required reason) + - `resume_procedure` (Held -> Running; carries the + `re_establishment_boundary` the Conductor replays from) + Read side: - projection (`proj_operation_procedure_summary`) + `list_procedures` (cursor-paginated; status / kind / parent_run_id / target_asset_id filters) - - Held / Resumed only if pilot operator feedback surfaces a need """ from cora.operation.features import ( @@ -37,8 +44,10 @@ append_activities, complete_procedure, get_procedure, + hold_procedure, list_procedures, register_procedure, + resume_procedure, start_procedure, truncate_procedure, ) @@ -48,8 +57,10 @@ "append_activities", "complete_procedure", "get_procedure", + "hold_procedure", "list_procedures", "register_procedure", + "resume_procedure", "start_procedure", "truncate_procedure", ] diff --git a/apps/api/src/cora/operation/features/hold_procedure/__init__.py b/apps/api/src/cora/operation/features/hold_procedure/__init__.py new file mode 100644 index 0000000000..fcae57cb76 --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/__init__.py @@ -0,0 +1,23 @@ +"""Vertical slice for the `HoldProcedure` command. + +from cora.operation.features import hold_procedure + +cmd = hold_procedure.HoldProcedure(procedure_id=..., reason="...") +handler = hold_procedure.bind(deps) +await handler(cmd, principal_id=..., correlation_id=...) 
+""" + +from cora.operation.features.hold_procedure import tool +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.hold_procedure.decider import decide +from cora.operation.features.hold_procedure.handler import Handler, bind +from cora.operation.features.hold_procedure.route import router + +__all__ = [ + "Handler", + "HoldProcedure", + "bind", + "decide", + "router", + "tool", +] diff --git a/apps/api/src/cora/operation/features/hold_procedure/command.py b/apps/api/src/cora/operation/features/hold_procedure/command.py new file mode 100644 index 0000000000..356c6e972c --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/command.py @@ -0,0 +1,29 @@ +"""The `HoldProcedure` command -- intent dataclass for this slice. + +Single-source pause transition: `Running -> Held`. The operator pauses +a halted conduct so it can be re-established and resumed rather than +aborted-and-reseeded (Tier 1 of [[project_resumable_conduct_design]]). + +Carries a REQUIRED free-form `reason` (1-500 chars after trim; validated +at the API boundary AND defensively at the decider via the +`ProcedureHoldReason` VO). Unlike `HoldRun` (slim, no reason: a routine +Run pause), pausing a halted conduct is a deliberate, high-information +operator act, so the reason is mandatory (matching `AgentSuspended.reason`). + +`decided_by_decision_id` mirrors `HoldRun`: optional Decision-causation +link. The operator-facing route leaves it None; an in-process agent +runtime sets it to link an autonomous hold to its Decision. NO existence +check at the decider per the cross-BC eventual-consistency stance. 
+""" + +from dataclasses import dataclass +from uuid import UUID + + +@dataclass(frozen=True) +class HoldProcedure: + """Pause an actively-running Procedure conduct (Running -> Held).""" + + procedure_id: UUID + reason: str + decided_by_decision_id: UUID | None = None diff --git a/apps/api/src/cora/operation/features/hold_procedure/decider.py b/apps/api/src/cora/operation/features/hold_procedure/decider.py new file mode 100644 index 0000000000..136804a286 --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/decider.py @@ -0,0 +1,57 @@ +"""Pure decider for the `HoldProcedure` command. + +Single-source pause transition: `Running -> Held`. Re-holding an +already-`Held` Procedure raises (strict-not-idempotent); holding a +`Defined` or terminal Procedure raises. Mirrors `hold_run`. + +Hold <-> Resume is bidirectional and unlimited-cycle: an operator can +hold -> resume -> hold repeatedly within one conduct, each hold +requiring an intervening resume. + +`reason` validation goes through the `ProcedureHoldReason` VO (which +calls the shared `validate_bounded_text` helper). The on-the-wire +payload in `ProcedureHeld.reason` carries the trimmed string. + +Invariants: + - State must not be None -> ProcedureNotFoundError + - command.reason must be 1-500 chars after trimming + -> InvalidProcedureHoldReasonError + - State.status must be in {Running} + -> ProcedureCannotHoldError(current_status=...) +""" + +from datetime import datetime + +from cora.operation.aggregates.procedure import ( + Procedure, + ProcedureCannotHoldError, + ProcedureHeld, + ProcedureHoldReason, + ProcedureNotFoundError, + ProcedureStatus, +) +from cora.operation.features.hold_procedure.command import HoldProcedure + +_HOLDABLE_STATUSES: tuple[ProcedureStatus, ...] 
= (ProcedureStatus.RUNNING,) + + +def decide( + state: Procedure | None, + command: HoldProcedure, + *, + now: datetime, +) -> list[ProcedureHeld]: + """Decide the events produced by holding an existing Procedure.""" + if state is None: + raise ProcedureNotFoundError(command.procedure_id) + reason = ProcedureHoldReason(command.reason) + if state.status not in _HOLDABLE_STATUSES: + raise ProcedureCannotHoldError(state.id, current_status=state.status) + return [ + ProcedureHeld( + procedure_id=state.id, + reason=reason.value, + decided_by_decision_id=command.decided_by_decision_id, + occurred_at=now, + ) + ] diff --git a/apps/api/src/cora/operation/features/hold_procedure/handler.py b/apps/api/src/cora/operation/features/hold_procedure/handler.py new file mode 100644 index 0000000000..ff342be306 --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/handler.py @@ -0,0 +1,45 @@ +"""Application handler for the `hold_procedure` slice. + +Update-style handler. Canonical body lives in +`cora.operation._procedure_update_handler.make_procedure_update_handler`; +this module is a thin slice-specific bind, mirroring abort_procedure / +truncate_procedure. + +The command's `reason` field IS captured on the emitted `ProcedureHeld` +event payload but is intentionally NOT logged at the handler boundary +(mirrors abort_procedure / hold_run precedent), so this slice does not +pass `extra_log_fields`. 
+""" + +from typing import Protocol +from uuid import UUID + +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation._procedure_update_handler import make_procedure_update_handler +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.hold_procedure.decider import decide + + +class Handler(Protocol): + """Callable interface every hold_procedure handler implements.""" + + async def __call__( + self, + command: HoldProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> None: ... + + +def bind(deps: Kernel) -> Handler: + """Build a hold_procedure handler closed over the shared deps.""" + return make_procedure_update_handler( + deps, + command_name="HoldProcedure", + log_prefix="hold_procedure", + decide_fn=decide, + ) diff --git a/apps/api/src/cora/operation/features/hold_procedure/route.py b/apps/api/src/cora/operation/features/hold_procedure/route.py new file mode 100644 index 0000000000..48b5b31810 --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/route.py @@ -0,0 +1,91 @@ +"""HTTP route for the `hold_procedure` slice. + +Action endpoint at `POST /procedures/{procedure_id}/hold`. Body carries +`reason` (1-500 chars). 204 No Content on success. Mirrors abort_procedure. 
+""" + +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, Depends, Path, Request, status +from pydantic import BaseModel, Field + +from cora.infrastructure.routing import ( + ErrorResponse, + get_correlation_id, + get_principal_id, + get_surface_id, +) +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.hold_procedure.handler import Handler +from cora.shared.text_bounds import REASON_MAX_LENGTH + + +class HoldProcedureRequest(BaseModel): + """Body for `POST /procedures/{procedure_id}/hold`.""" + + reason: str = Field( + ..., + min_length=1, + max_length=REASON_MAX_LENGTH, + description=( + "Free-form reason for the hold (1-500 chars after trimming). " + "Required: pausing a halted conduct is a deliberate operator act " + "(unlike a routine RunHeld, which carries no reason)." + ), + ) + + +def _get_handler(request: Request) -> Handler: + handler: Handler = request.app.state.operation.hold_procedure + return handler + + +router = APIRouter(tags=["operation"]) + + +@router.post( + "/procedures/{procedure_id}/hold", + status_code=status.HTTP_204_NO_CONTENT, + responses={ + status.HTTP_400_BAD_REQUEST: { + "model": ErrorResponse, + "description": "Domain invariant violated: whitespace-only reason.", + }, + status.HTTP_403_FORBIDDEN: { + "model": ErrorResponse, + "description": "Authorize port denied the command.", + }, + status.HTTP_404_NOT_FOUND: { + "model": ErrorResponse, + "description": "No procedure exists with the given id.", + }, + status.HTTP_409_CONFLICT: { + "model": ErrorResponse, + "description": ( + "Procedure is not in `Running` status (hold requires " + "`Running`; holding a `Defined` / `Held` / terminal procedure " + "raises), OR a concurrent write to the same procedure stream " + "conflicted (optimistic concurrency)." 
+ ), + }, + status.HTTP_422_UNPROCESSABLE_CONTENT: { + "description": "Path parameter or request body failed schema validation.", + }, + }, + summary="Pause an actively-running Procedure conduct (Running -> Held)", +) +async def post_procedures_hold( + procedure_id: Annotated[UUID, Path(description="Target procedure's id.")], + body: HoldProcedureRequest, + handler: Annotated[Handler, Depends(_get_handler)], + cid: Annotated[UUID, Depends(get_correlation_id)], + principal_id: Annotated[UUID, Depends(get_principal_id)], + surface_id: Annotated[UUID, Depends(get_surface_id)], +) -> None: + await handler( + HoldProcedure(procedure_id=procedure_id, reason=body.reason), + principal_id=principal_id, + correlation_id=cid, + surface_id=surface_id, + ) diff --git a/apps/api/src/cora/operation/features/hold_procedure/tool.py b/apps/api/src/cora/operation/features/hold_procedure/tool.py new file mode 100644 index 0000000000..437cf02fa0 --- /dev/null +++ b/apps/api/src/cora/operation/features/hold_procedure/tool.py @@ -0,0 +1,52 @@ +"""MCP tool for the `hold_procedure` slice.""" + +from collections.abc import Callable +from typing import Annotated, Any +from uuid import UUID + +from mcp.server.fastmcp import Context, FastMCP +from pydantic import Field + +from cora.infrastructure.mcp_principal import get_mcp_principal_id +from cora.infrastructure.observability import current_correlation_id +from cora.infrastructure.routing import get_mcp_surface_id +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.hold_procedure.handler import Handler +from cora.shared.text_bounds import REASON_MAX_LENGTH + + +def register(mcp: FastMCP, *, get_handler: Callable[[], Handler]) -> None: + """Register the `hold_procedure` tool on the given MCP server.""" + + @mcp.tool( + name="hold_procedure", + description=( + "Pause an actively-running Procedure conduct (Running -> Held) so it " + "can be re-established and resumed later. 
The inverse of resume_procedure. " + "Requires the Procedure to currently be in `Running`. Holding a " + "`Defined` / `Held` / terminal Procedure raises. Reason is required " + "(1-500 chars), captured verbatim for audit." + ), + ) + async def hold_procedure_tool( # pyright: ignore[reportUnusedFunction] + ctx: Context[Any, Any, Any], + procedure_id: Annotated[ + UUID, + Field(description="Target procedure's id."), + ], + reason: Annotated[ + str, + Field( + min_length=1, + max_length=REASON_MAX_LENGTH, + description="Free-form reason for the hold (1-500 chars after trimming).", + ), + ], + ) -> None: + handler = get_handler() + await handler( + HoldProcedure(procedure_id=procedure_id, reason=reason), + principal_id=get_mcp_principal_id(ctx), + correlation_id=current_correlation_id(), + surface_id=get_mcp_surface_id(), + ) diff --git a/apps/api/src/cora/operation/features/resume_procedure/__init__.py b/apps/api/src/cora/operation/features/resume_procedure/__init__.py new file mode 100644 index 0000000000..421023b376 --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/__init__.py @@ -0,0 +1,23 @@ +"""Vertical slice for the `ResumeProcedure` command. + +from cora.operation.features import resume_procedure + +cmd = resume_procedure.ResumeProcedure(procedure_id=..., re_establishment_boundary=0) +handler = resume_procedure.bind(deps) +await handler(cmd, principal_id=..., correlation_id=...) 
+""" + +from cora.operation.features.resume_procedure import tool +from cora.operation.features.resume_procedure.command import ResumeProcedure +from cora.operation.features.resume_procedure.decider import decide +from cora.operation.features.resume_procedure.handler import Handler, bind +from cora.operation.features.resume_procedure.route import router + +__all__ = [ + "Handler", + "ResumeProcedure", + "bind", + "decide", + "router", + "tool", +] diff --git a/apps/api/src/cora/operation/features/resume_procedure/command.py b/apps/api/src/cora/operation/features/resume_procedure/command.py new file mode 100644 index 0000000000..a9e8996470 --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/command.py @@ -0,0 +1,27 @@ +"""The `ResumeProcedure` command -- intent dataclass for this slice. + +Single-source resume transition: `Held -> Running`. The inverse of +hold_procedure. Carries `re_establishment_boundary`: the index in the +pinned conduct manifest from which a resume re-drives setpoints and +re-runs checks (Tier 1 of [[project_resumable_conduct_design]]). It is +NOT a continuity proof; it is the re-establishment boundary the +Conductor's `execute_from` replays from. + +`decided_by_decision_id` mirrors `ResumeRun`: optional Decision-causation +link. The operator-facing route leaves it None; an in-process agent +runtime sets it to link an autonomous, safety-gated resume to its +Decision. NO existence check at the decider per the cross-BC +eventual-consistency stance. 
+""" + +from dataclasses import dataclass +from uuid import UUID + + +@dataclass(frozen=True) +class ResumeProcedure: + """Resume a held Procedure conduct (Held -> Running).""" + + procedure_id: UUID + re_establishment_boundary: int + decided_by_decision_id: UUID | None = None diff --git a/apps/api/src/cora/operation/features/resume_procedure/decider.py b/apps/api/src/cora/operation/features/resume_procedure/decider.py new file mode 100644 index 0000000000..b74b28e148 --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/decider.py @@ -0,0 +1,56 @@ +"""Pure decider for the `ResumeProcedure` command. + +Single-source resume transition: `Held -> Running`. The inverse of +hold (which requires `Running`). Resuming an already-`Running` Procedure +raises (strict-not-idempotent); resuming a `Defined` or terminal +Procedure raises. Mirrors `resume_run`. + +The off-diagonal guard (refuse while the parent Run is `Held`) is NOT +in this pure decider: it needs a cross-aggregate Run read and lands in +the handler in a follow-up slice (it raises the same +`ProcedureCannotResumeError`). See [[project_resumable_conduct_design]]. + +Invariants: + - State must not be None -> ProcedureNotFoundError + - command.re_establishment_boundary must be >= 0 + -> InvalidProcedureReEstablishmentBoundaryError + - State.status must be in {Held} + -> ProcedureCannotResumeError(current_status=...) +""" + +from datetime import datetime + +from cora.operation.aggregates.procedure import ( + InvalidProcedureReEstablishmentBoundaryError, + Procedure, + ProcedureCannotResumeError, + ProcedureNotFoundError, + ProcedureResumed, + ProcedureStatus, +) +from cora.operation.features.resume_procedure.command import ResumeProcedure + +_RESUMABLE_STATUSES: tuple[ProcedureStatus, ...] 
= (ProcedureStatus.HELD,) + + +def decide( + state: Procedure | None, + command: ResumeProcedure, + *, + now: datetime, +) -> list[ProcedureResumed]: + """Decide the events produced by resuming a held Procedure.""" + if state is None: + raise ProcedureNotFoundError(command.procedure_id) + if command.re_establishment_boundary < 0: + raise InvalidProcedureReEstablishmentBoundaryError(command.re_establishment_boundary) + if state.status not in _RESUMABLE_STATUSES: + raise ProcedureCannotResumeError(state.id, current_status=state.status) + return [ + ProcedureResumed( + procedure_id=state.id, + re_establishment_boundary=command.re_establishment_boundary, + decided_by_decision_id=command.decided_by_decision_id, + occurred_at=now, + ) + ] diff --git a/apps/api/src/cora/operation/features/resume_procedure/handler.py b/apps/api/src/cora/operation/features/resume_procedure/handler.py new file mode 100644 index 0000000000..f079f4d9cd --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/handler.py @@ -0,0 +1,45 @@ +"""Application handler for the `resume_procedure` slice. + +Update-style handler. Canonical body lives in +`cora.operation._procedure_update_handler.make_procedure_update_handler`; +this module is a thin slice-specific bind, mirroring resume_run. + +The off-diagonal guard (refuse to resume while the parent Run is `Held`) +is a cross-aggregate Run read added in a follow-up slice; it will replace +this factory bind with a custom handler (the factory loads exactly one +event-store stream). Until then the decider's status guard +(`Held -> Running`) is the only gate. 
+""" + +from typing import Protocol +from uuid import UUID + +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation._procedure_update_handler import make_procedure_update_handler +from cora.operation.features.resume_procedure.command import ResumeProcedure +from cora.operation.features.resume_procedure.decider import decide + + +class Handler(Protocol): + """Callable interface every resume_procedure handler implements.""" + + async def __call__( + self, + command: ResumeProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> None: ... + + +def bind(deps: Kernel) -> Handler: + """Build a resume_procedure handler closed over the shared deps.""" + return make_procedure_update_handler( + deps, + command_name="ResumeProcedure", + log_prefix="resume_procedure", + decide_fn=decide, + ) diff --git a/apps/api/src/cora/operation/features/resume_procedure/route.py b/apps/api/src/cora/operation/features/resume_procedure/route.py new file mode 100644 index 0000000000..52c9286c02 --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/route.py @@ -0,0 +1,92 @@ +"""HTTP route for the `resume_procedure` slice. + +Action endpoint at `POST /procedures/{procedure_id}/resume`. Body carries +`re_establishment_boundary` (>= 0). 204 No Content on success. 
+""" + +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, Depends, Path, Request, status +from pydantic import BaseModel, Field + +from cora.infrastructure.routing import ( + ErrorResponse, + get_correlation_id, + get_principal_id, + get_surface_id, +) +from cora.operation.features.resume_procedure.command import ResumeProcedure +from cora.operation.features.resume_procedure.handler import Handler + + +class ResumeProcedureRequest(BaseModel): + """Body for `POST /procedures/{procedure_id}/resume`.""" + + re_establishment_boundary: int = Field( + ..., + ge=0, + description=( + "Index in the pinned conduct manifest from which the resume " + "re-drives setpoints and re-runs checks. >= 0 (0 = re-establish " + "from the first step). NOT a continuity proof." + ), + ) + + +def _get_handler(request: Request) -> Handler: + handler: Handler = request.app.state.operation.resume_procedure + return handler + + +router = APIRouter(tags=["operation"]) + + +@router.post( + "/procedures/{procedure_id}/resume", + status_code=status.HTTP_204_NO_CONTENT, + responses={ + status.HTTP_400_BAD_REQUEST: { + "model": ErrorResponse, + "description": "Domain invariant violated: negative re_establishment_boundary.", + }, + status.HTTP_403_FORBIDDEN: { + "model": ErrorResponse, + "description": "Authorize port denied the command.", + }, + status.HTTP_404_NOT_FOUND: { + "model": ErrorResponse, + "description": "No procedure exists with the given id.", + }, + status.HTTP_409_CONFLICT: { + "model": ErrorResponse, + "description": ( + "Procedure is not in `Held` status (resume requires `Held`; " + "resuming a `Running` / `Defined` / terminal procedure raises), " + "OR a concurrent write to the same procedure stream conflicted " + "(optimistic concurrency)." 
+ ), + }, + status.HTTP_422_UNPROCESSABLE_CONTENT: { + "description": "Path parameter or request body failed schema validation.", + }, + }, + summary="Resume a held Procedure conduct (Held -> Running)", +) +async def post_procedures_resume( + procedure_id: Annotated[UUID, Path(description="Target procedure's id.")], + body: ResumeProcedureRequest, + handler: Annotated[Handler, Depends(_get_handler)], + cid: Annotated[UUID, Depends(get_correlation_id)], + principal_id: Annotated[UUID, Depends(get_principal_id)], + surface_id: Annotated[UUID, Depends(get_surface_id)], +) -> None: + await handler( + ResumeProcedure( + procedure_id=procedure_id, + re_establishment_boundary=body.re_establishment_boundary, + ), + principal_id=principal_id, + correlation_id=cid, + surface_id=surface_id, + ) diff --git a/apps/api/src/cora/operation/features/resume_procedure/tool.py b/apps/api/src/cora/operation/features/resume_procedure/tool.py new file mode 100644 index 0000000000..f80fb11366 --- /dev/null +++ b/apps/api/src/cora/operation/features/resume_procedure/tool.py @@ -0,0 +1,56 @@ +"""MCP tool for the `resume_procedure` slice.""" + +from collections.abc import Callable +from typing import Annotated, Any +from uuid import UUID + +from mcp.server.fastmcp import Context, FastMCP +from pydantic import Field + +from cora.infrastructure.mcp_principal import get_mcp_principal_id +from cora.infrastructure.observability import current_correlation_id +from cora.infrastructure.routing import get_mcp_surface_id +from cora.operation.features.resume_procedure.command import ResumeProcedure +from cora.operation.features.resume_procedure.handler import Handler + + +def register(mcp: FastMCP, *, get_handler: Callable[[], Handler]) -> None: + """Register the `resume_procedure` tool on the given MCP server.""" + + @mcp.tool( + name="resume_procedure", + description=( + "Resume a held Procedure conduct (Held -> Running). The inverse of " + "hold_procedure. 
Requires the Procedure to currently be in `Held`. " + "Resuming a `Running` / `Defined` / terminal Procedure raises. " + "re_establishment_boundary (>= 0) is the manifest index the resume " + "re-drives setpoints / re-runs checks from." + ), + ) + async def resume_procedure_tool( # pyright: ignore[reportUnusedFunction] + ctx: Context[Any, Any, Any], + procedure_id: Annotated[ + UUID, + Field(description="Target procedure's id."), + ], + re_establishment_boundary: Annotated[ + int, + Field( + ge=0, + description=( + "Index in the pinned conduct manifest the resume re-drives " + "setpoints / re-runs checks from (>= 0; 0 = from the first step)." + ), + ), + ], + ) -> None: + handler = get_handler() + await handler( + ResumeProcedure( + procedure_id=procedure_id, + re_establishment_boundary=re_establishment_boundary, + ), + principal_id=get_mcp_principal_id(ctx), + correlation_id=current_correlation_id(), + surface_id=get_mcp_surface_id(), + ) diff --git a/apps/api/src/cora/operation/routes.py b/apps/api/src/cora/operation/routes.py index 1ca6622aa9..5d922c3633 100644 --- a/apps/api/src/cora/operation/routes.py +++ b/apps/api/src/cora/operation/routes.py @@ -36,11 +36,13 @@ from cora.operation.aggregates.procedure import ( InvalidProcedureAbortReasonError, + InvalidProcedureHoldReasonError, InvalidProcedureInterruptedAtError, InvalidProcedureIterationCapError, InvalidProcedureIterationEndReasonError, InvalidProcedureKindError, InvalidProcedureNameError, + InvalidProcedureReEstablishmentBoundaryError, InvalidProcedureTruncateReasonError, InvalidRecipeBindingsError, InvalidStepKindError, @@ -50,6 +52,8 @@ ProcedureCannotAbortError, ProcedureCannotCompleteError, ProcedureCannotEndIterationError, + ProcedureCannotHoldError, + ProcedureCannotResumeError, ProcedureCannotStartError, ProcedureCannotStartIterationError, ProcedureCannotTruncateError, @@ -89,10 +93,12 @@ conduct_procedure, end_iteration, get_procedure, + hold_procedure, list_procedure_iterations, list_procedures, 
register_procedure, register_procedure_from_recipe, + resume_procedure, start_iteration, start_procedure, truncate_procedure, @@ -230,6 +236,8 @@ def register_operation_routes(app: FastAPI) -> None: app.include_router(complete_procedure.router) app.include_router(abort_procedure.router) app.include_router(truncate_procedure.router) + app.include_router(hold_procedure.router) + app.include_router(resume_procedure.router) app.include_router(start_iteration.router) app.include_router(end_iteration.router) app.include_router(append_activities.router) @@ -241,10 +249,12 @@ def register_operation_routes(app: FastAPI) -> None: InvalidProcedureNameError, InvalidProcedureKindError, InvalidProcedureAbortReasonError, + InvalidProcedureHoldReasonError, InvalidProcedureTruncateReasonError, InvalidProcedureIterationEndReasonError, InvalidProcedureIterationCapError, InvalidProcedureInterruptedAtError, + InvalidProcedureReEstablishmentBoundaryError, InvalidStepKindError, # Recipe-driven conduct_procedure path: caller-supplied steps with # recipe_id set are rejected up front per the replay-design lock @@ -267,6 +277,10 @@ def register_operation_routes(app: FastAPI) -> None: ProcedureCannotCompleteError, ProcedureCannotAbortError, ProcedureCannotTruncateError, + # resumable-conduct pause/resume guards (Running->Held->Running): + # holding a non-Running procedure, or resuming a non-Held one. + ProcedureCannotHoldError, + ProcedureCannotResumeError, # iteration boundary guards (start/end): not-Running, no/already-open # iteration, and non-sequential / mismatched operator-supplied index. 
ProcedureCannotStartIterationError, diff --git a/apps/api/src/cora/operation/tools.py b/apps/api/src/cora/operation/tools.py index db4cf52261..08e0731d55 100644 --- a/apps/api/src/cora/operation/tools.py +++ b/apps/api/src/cora/operation/tools.py @@ -17,6 +17,7 @@ from cora.operation.features.conduct_procedure import tool as conduct_procedure_tool from cora.operation.features.end_iteration import tool as end_iteration_tool from cora.operation.features.get_procedure import tool as get_procedure_tool +from cora.operation.features.hold_procedure import tool as hold_procedure_tool from cora.operation.features.list_procedure_iterations import ( tool as list_procedure_iterations_tool, ) @@ -25,6 +26,7 @@ from cora.operation.features.register_procedure_from_recipe import ( tool as register_procedure_from_recipe_tool, ) +from cora.operation.features.resume_procedure import tool as resume_procedure_tool from cora.operation.features.start_iteration import tool as start_iteration_tool from cora.operation.features.start_procedure import tool as start_procedure_tool from cora.operation.features.truncate_procedure import tool as truncate_procedure_tool @@ -61,6 +63,14 @@ def register_operation_tools( mcp, get_handler=lambda: get_handlers().truncate_procedure, ) + hold_procedure_tool.register( + mcp, + get_handler=lambda: get_handlers().hold_procedure, + ) + resume_procedure_tool.register( + mcp, + get_handler=lambda: get_handlers().resume_procedure, + ) start_iteration_tool.register( mcp, get_handler=lambda: get_handlers().start_iteration, diff --git a/apps/api/src/cora/operation/wire.py b/apps/api/src/cora/operation/wire.py index 6d7b8b86f4..8ceb274e2a 100644 --- a/apps/api/src/cora/operation/wire.py +++ b/apps/api/src/cora/operation/wire.py @@ -74,10 +74,12 @@ conduct_procedure, end_iteration, get_procedure, + hold_procedure, list_procedure_iterations, list_procedures, register_procedure, register_procedure_from_recipe, + resume_procedure, start_iteration, start_procedure, 
truncate_procedure, @@ -103,6 +105,8 @@ class OperationHandlers: complete_procedure: complete_procedure.Handler abort_procedure: abort_procedure.Handler truncate_procedure: truncate_procedure.Handler + hold_procedure: hold_procedure.Handler + resume_procedure: resume_procedure.Handler start_iteration: start_iteration.Handler end_iteration: end_iteration.Handler append_activities: append_activities.Handler @@ -235,6 +239,16 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> command_name="TruncateProcedure", bc=_BC, ), + hold_procedure=with_tracing( + hold_procedure.bind(deps), + command_name="HoldProcedure", + bc=_BC, + ), + resume_procedure=with_tracing( + resume_procedure.bind(deps), + command_name="ResumeProcedure", + bc=_BC, + ), start_iteration=with_tracing( start_iteration.bind(deps), command_name="StartProcedureIteration", diff --git a/apps/api/tests/architecture/test_procedure_evolver_carry_forward.py b/apps/api/tests/architecture/test_procedure_evolver_carry_forward.py new file mode 100644 index 0000000000..9ff5afa02f --- /dev/null +++ b/apps/api/tests/architecture/test_procedure_evolver_carry_forward.py @@ -0,0 +1,158 @@ +"""Architecture fitness: every non-genesis, Procedure-constructing arm of +the Procedure evolver MUST carry all additive state fields through from +prior state. + +The Procedure aggregate accreted a wide additive-field set (the +iteration denorms, the recipe/capability binding, the activity logbook +id, the actuation-kind provenance carrier). Constructing +`Procedure(id=..., name=..., status=...)` on a new transition arm +without explicitly threading those fields silently WIPES them to their +defaults (empty frozenset / None / 0) on the next replay. The Tier-1 +`ProcedureHeld` / `ProcedureResumed` arms are the latest pair that must +carry the iteration denorms verbatim; this AST check pins the whole +matrix so the bug class cannot recur when a new arm lands. 
+ +Precedent: `test_asset_evolver_lifecycle_dates_carry_forward.py` (same +structural AST shape, narrower field set). Behavior-side per-arm +preservation coverage lives in `tests/unit/operation/test_procedure_evolver.py`; +this fitness exists because behavior tests only catch arms someone +remembered to parametrize. + +## What is checked + +For every `case (...):` arm in `evolve` that builds a +`return Procedure(...)`: + + - the genesis arm (`ProcedureRegistered`) is exempt: it writes / + defaults every field at initial-state construction. + - provenance-only arms that return `require_state(...)` (no + `Procedure(...)` constructor) are exempt: passthrough preserves + every field by definition. + - every other arm MUST pass `<field>=prior.<field>` for each + carry-forward field, UNLESS the arm is a declared per-field writer + (it legitimately sets that field from the event or a computation). +""" + +from __future__ import annotations + +import ast +from pathlib import Path + +import pytest + +_REPO_ROOT = Path(__file__).resolve().parents[4] +_EVOLVER_PATH = ( + _REPO_ROOT + / "apps" + / "api" + / "src" + / "cora" + / "operation" + / "aggregates" + / "procedure" + / "evolver.py" +) + +_GENESIS_ARM = "ProcedureRegistered" + +# Carry-forward fields and the arms that legitimately WRITE each (so are +# exempt from the `<field>=prior.<field>` requirement for that field). The +# genesis arm writes every field and is exempt globally below. 
+_WRITER_ARMS_PER_FIELD: dict[str, frozenset[str]] = { + "kind": frozenset(), + "target_asset_ids": frozenset(), + "parent_run_id": frozenset(), + "activity_logbook_id": frozenset({"ProcedureActivitiesLogbookOpened"}), + "capability_id": frozenset(), + "recipe_id": frozenset(), + "current_iteration_index": frozenset({"ProcedureIterationStarted", "ProcedureIterationEnded"}), + "iteration_count": frozenset({"ProcedureIterationStarted"}), + "consecutive_unconverged_iterations": frozenset({"ProcedureIterationEnded"}), + "max_consecutive_unconverged_iterations": frozenset(), + # Terminal arms snapshot the Conductor's observed kind from the event. + "actuation_kind": frozenset({"ProcedureCompleted", "ProcedureAborted"}), +} + + +def _arm_event_type_name(case_node: ast.match_case) -> str | None: + pattern = case_node.pattern + if isinstance(pattern, ast.MatchClass) and isinstance(pattern.cls, ast.Name): + return pattern.cls.id + return None + + +def _return_procedure_kwargs(case_node: ast.match_case) -> dict[str, ast.expr] | None: + """Kwargs from the `return Procedure(...)` call in this arm, or None + when the arm constructs no Procedure (it returns require_state / + state directly -- a passthrough that preserves every field).""" + for stmt in ast.walk(case_node): + if ( + isinstance(stmt, ast.Return) + and isinstance(stmt.value, ast.Call) + and isinstance(stmt.value.func, ast.Name) + and stmt.value.func.id == "Procedure" + ): + return {kw.arg: kw.value for kw in stmt.value.keywords if kw.arg is not None} + return None + + +def _is_prior_attribute_access(node: ast.expr, field: str) -> bool: + return ( + isinstance(node, ast.Attribute) + and node.attr == field + and isinstance(node.value, ast.Name) + and node.value.id == "prior" + ) + + +def _find_evolve_match_cases() -> list[ast.match_case]: + tree = ast.parse(_EVOLVER_PATH.read_text(encoding="utf-8")) + evolve_func = next( + (n for n in tree.body if isinstance(n, ast.FunctionDef) and n.name == "evolve"), + None, + ) + 
assert evolve_func is not None, "Could not locate `evolve` in evolver.py" + match_stmt = next((n for n in evolve_func.body if isinstance(n, ast.Match)), None) + assert match_stmt is not None, "Could not locate `match event:` in `evolve`" + return list(match_stmt.cases) + + +@pytest.mark.architecture +def test_procedure_evolver_non_writer_arms_carry_all_additive_fields() -> None: + """Every non-genesis Procedure-constructing arm threads each additive + field as `<field>=prior.<field>` unless it is a declared writer of + that field.""" + violations: list[str] = [] + for case in _find_evolve_match_cases(): + event_name = _arm_event_type_name(case) + if event_name is None: + continue # wildcard `case _:` (assert_never guard) + if event_name == _GENESIS_ARM: + continue # genesis writes / defaults every field + kwargs = _return_procedure_kwargs(case) + if kwargs is None: + continue # passthrough arm (returns require_state/state); preserves all + for field, writer_arms in _WRITER_ARMS_PER_FIELD.items(): + if event_name in writer_arms: + continue + value = kwargs.get(field) + if value is None: + violations.append( + f" - {event_name}: missing `{field}=prior.{field}` kwarg in Procedure(...)" + ) + continue + if not _is_prior_attribute_access(value, field): + violations.append( + f" - {event_name}: `{field}=...` is not " + f"`prior.{field}` (got `{ast.unparse(value)}`)" + ) + assert not violations, ( + "Procedure evolver arms drop an additive-state field on replay.\n" + "Every non-genesis arm that constructs `Procedure(...)` must thread\n" + "each additive field as `<field>=prior.<field>` unless it legitimately\n" + "writes that field (see `_WRITER_ARMS_PER_FIELD`). Otherwise the field\n" + "silently wipes to its default on next replay (the dropped-iteration-\n" + "denorm bug class). 
Add the carry-forward kwarg, or register a new\n" + "writer arm with rationale.\n\n" + "Violations:\n" + "\n".join(violations) + ) diff --git a/apps/api/tests/contract/test_hold_procedure_endpoint.py b/apps/api/tests/contract/test_hold_procedure_endpoint.py new file mode 100644 index 0000000000..d4b69c3b40 --- /dev/null +++ b/apps/api/tests/contract/test_hold_procedure_endpoint.py @@ -0,0 +1,100 @@ +"""Contract tests for `POST /procedures/{procedure_id}/hold`. + +Action endpoint with `reason` body, 204 on success. Covers happy path +(after register + start) plus error surfaces: 400 whitespace-only +reason, 404, 409 from-Defined, 409 re-hold, 422 missing/too-long reason +or malformed id. +""" + +from typing import Any +from uuid import UUID, uuid4 + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app + + +def _register_and_start(client: TestClient) -> UUID: + body: dict[str, Any] = {"name": "Vessel-A bakeout", "kind": "bakeout"} + pid = UUID(client.post("/procedures", json=body).json()["procedure_id"]) + started = client.post(f"/procedures/{pid}/start") + assert started.status_code == 204 + return pid + + +@pytest.mark.contract +def test_post_hold_returns_204_for_running_procedure() -> None: + with TestClient(create_app()) as client: + pid = _register_and_start(client) + response = client.post(f"/procedures/{pid}/hold", json={"reason": "beam dropped"}) + assert response.status_code == 204 + + +@pytest.mark.contract +def test_post_hold_marks_status_held_visible_via_get() -> None: + with TestClient(create_app()) as client: + pid = _register_and_start(client) + client.post(f"/procedures/{pid}/hold", json={"reason": "beam dropped"}) + response = client.get(f"/procedures/{pid}") + assert response.json()["status"] == "Held" + + +@pytest.mark.contract +def test_post_hold_returns_404_for_unknown_id() -> None: + with TestClient(create_app()) as client: + response = client.post(f"/procedures/{uuid4()}/hold", json={"reason": "x"}) 
+ assert response.status_code == 404 + + +@pytest.mark.contract +def test_post_hold_returns_409_for_defined_procedure() -> None: + """Hold requires Running; from Defined raises CannotHold.""" + with TestClient(create_app()) as client: + body: dict[str, Any] = {"name": "X", "kind": "bakeout"} + pid = UUID(client.post("/procedures", json=body).json()["procedure_id"]) + response = client.post(f"/procedures/{pid}/hold", json={"reason": "test"}) + assert response.status_code == 409 + + +@pytest.mark.contract +def test_post_hold_returns_409_when_re_holding() -> None: + with TestClient(create_app()) as client: + pid = _register_and_start(client) + first = client.post(f"/procedures/{pid}/hold", json={"reason": "first"}) + second = client.post(f"/procedures/{pid}/hold", json={"reason": "second"}) + assert first.status_code == 204 + assert second.status_code == 409 + + +@pytest.mark.contract +def test_post_hold_returns_400_for_whitespace_only_reason() -> None: + """Whitespace-only slips past Pydantic min_length=1; the VO rejects -> 400.""" + with TestClient(create_app()) as client: + pid = _register_and_start(client) + response = client.post(f"/procedures/{pid}/hold", json={"reason": " "}) + assert response.status_code == 400 + assert "detail" in response.json() + + +@pytest.mark.contract +def test_post_hold_returns_422_for_missing_reason() -> None: + with TestClient(create_app()) as client: + pid = _register_and_start(client) + response = client.post(f"/procedures/{pid}/hold", json={}) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_hold_returns_422_for_too_long_reason() -> None: + with TestClient(create_app()) as client: + pid = _register_and_start(client) + response = client.post(f"/procedures/{pid}/hold", json={"reason": "x" * 501}) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_hold_returns_422_for_malformed_id() -> None: + with TestClient(create_app()) as client: + response = 
client.post("/procedures/not-a-uuid/hold", json={"reason": "x"}) + assert response.status_code == 422 diff --git a/apps/api/tests/contract/test_hold_procedure_mcp_tool.py b/apps/api/tests/contract/test_hold_procedure_mcp_tool.py new file mode 100644 index 0000000000..4bbcb99152 --- /dev/null +++ b/apps/api/tests/contract/test_hold_procedure_mcp_tool.py @@ -0,0 +1,73 @@ +"""Contract tests for the `hold_procedure` MCP tool.""" + +from uuid import UUID + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app +from tests.contract._mcp_helpers import open_session, parse_sse_data + + +def _register_and_start_via_mcp(client: TestClient, headers: dict[str, str]) -> UUID: + reg = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 1, + "method": "tools/call", + "params": { + "name": "register_procedure", + "arguments": {"name": "Vessel-A bakeout", "kind": "bakeout"}, + }, + }, + headers=headers, + ) + pid = UUID(parse_sse_data(reg.text)["result"]["structuredContent"]["procedure_id"]) + client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 2, + "method": "tools/call", + "params": {"name": "start_procedure", "arguments": {"procedure_id": str(pid)}}, + }, + headers=headers, + ) + return pid + + +@pytest.mark.contract +def test_mcp_lists_hold_procedure_tool() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + response = client.post( + "/mcp", + json={"jsonrpc": "2.0", "id": 99, "method": "tools/list"}, + headers=headers, + ) + body = parse_sse_data(response.text) + tool_names = [t["name"] for t in body["result"]["tools"]] + assert "hold_procedure" in tool_names + + +@pytest.mark.contract +def test_mcp_hold_procedure_tool_succeeds_for_running() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + pid = _register_and_start_via_mcp(client, headers) + response = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 3, + "method": "tools/call", + 
"params": { + "name": "hold_procedure", + "arguments": {"procedure_id": str(pid), "reason": "beam dropped"}, + }, + }, + headers=headers, + ) + body = parse_sse_data(response.text) + assert body["result"]["isError"] is False diff --git a/apps/api/tests/contract/test_resume_procedure_endpoint.py b/apps/api/tests/contract/test_resume_procedure_endpoint.py new file mode 100644 index 0000000000..6cf3fa8d7b --- /dev/null +++ b/apps/api/tests/contract/test_resume_procedure_endpoint.py @@ -0,0 +1,88 @@ +"""Contract tests for `POST /procedures/{procedure_id}/resume`. + +Action endpoint with `re_establishment_boundary` body, 204 on success. +Covers happy path (register + start + hold) plus error surfaces: 404, +409 from-Running (not Held), 422 missing / negative boundary, 422 +malformed id. +""" + +from typing import Any +from uuid import UUID, uuid4 + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app + + +def _register_start_hold(client: TestClient) -> UUID: + body: dict[str, Any] = {"name": "Vessel-A bakeout", "kind": "bakeout"} + pid = UUID(client.post("/procedures", json=body).json()["procedure_id"]) + assert client.post(f"/procedures/{pid}/start").status_code == 204 + assert ( + client.post(f"/procedures/{pid}/hold", json={"reason": "beam dropped"}).status_code == 204 + ) + return pid + + +@pytest.mark.contract +def test_post_resume_returns_204_for_held_procedure() -> None: + with TestClient(create_app()) as client: + pid = _register_start_hold(client) + response = client.post(f"/procedures/{pid}/resume", json={"re_establishment_boundary": 0}) + assert response.status_code == 204 + + +@pytest.mark.contract +def test_post_resume_marks_status_running_visible_via_get() -> None: + with TestClient(create_app()) as client: + pid = _register_start_hold(client) + client.post(f"/procedures/{pid}/resume", json={"re_establishment_boundary": 1}) + response = client.get(f"/procedures/{pid}") + assert response.json()["status"] == 
"Running" + + +@pytest.mark.contract +def test_post_resume_returns_404_for_unknown_id() -> None: + with TestClient(create_app()) as client: + response = client.post( + f"/procedures/{uuid4()}/resume", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 404 + + +@pytest.mark.contract +def test_post_resume_returns_409_for_running_procedure() -> None: + """Resume requires Held; from Running (never held) raises CannotResume.""" + with TestClient(create_app()) as client: + body: dict[str, Any] = {"name": "X", "kind": "bakeout"} + pid = UUID(client.post("/procedures", json=body).json()["procedure_id"]) + client.post(f"/procedures/{pid}/start") + response = client.post(f"/procedures/{pid}/resume", json={"re_establishment_boundary": 0}) + assert response.status_code == 409 + + +@pytest.mark.contract +def test_post_resume_returns_422_for_missing_boundary() -> None: + with TestClient(create_app()) as client: + pid = _register_start_hold(client) + response = client.post(f"/procedures/{pid}/resume", json={}) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_resume_returns_422_for_negative_boundary() -> None: + """Pydantic ge=0 rejects a negative boundary at the wire before the decider.""" + with TestClient(create_app()) as client: + pid = _register_start_hold(client) + response = client.post(f"/procedures/{pid}/resume", json={"re_establishment_boundary": -1}) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_resume_returns_422_for_malformed_id() -> None: + with TestClient(create_app()) as client: + response = client.post( + "/procedures/not-a-uuid/resume", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 422 diff --git a/apps/api/tests/contract/test_resume_procedure_mcp_tool.py b/apps/api/tests/contract/test_resume_procedure_mcp_tool.py new file mode 100644 index 0000000000..da4a0ff0a6 --- /dev/null +++ b/apps/api/tests/contract/test_resume_procedure_mcp_tool.py @@ -0,0 
+1,86 @@ +"""Contract tests for the `resume_procedure` MCP tool.""" + +from uuid import UUID + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app +from tests.contract._mcp_helpers import open_session, parse_sse_data + + +def _register_start_hold_via_mcp(client: TestClient, headers: dict[str, str]) -> UUID: + reg = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 1, + "method": "tools/call", + "params": { + "name": "register_procedure", + "arguments": {"name": "Vessel-A bakeout", "kind": "bakeout"}, + }, + }, + headers=headers, + ) + pid = UUID(parse_sse_data(reg.text)["result"]["structuredContent"]["procedure_id"]) + client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 2, + "method": "tools/call", + "params": {"name": "start_procedure", "arguments": {"procedure_id": str(pid)}}, + }, + headers=headers, + ) + client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 3, + "method": "tools/call", + "params": { + "name": "hold_procedure", + "arguments": {"procedure_id": str(pid), "reason": "beam dropped"}, + }, + }, + headers=headers, + ) + return pid + + +@pytest.mark.contract +def test_mcp_lists_resume_procedure_tool() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + response = client.post( + "/mcp", + json={"jsonrpc": "2.0", "id": 99, "method": "tools/list"}, + headers=headers, + ) + body = parse_sse_data(response.text) + tool_names = [t["name"] for t in body["result"]["tools"]] + assert "resume_procedure" in tool_names + + +@pytest.mark.contract +def test_mcp_resume_procedure_tool_succeeds_for_held() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + pid = _register_start_hold_via_mcp(client, headers) + response = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 4, + "method": "tools/call", + "params": { + "name": "resume_procedure", + "arguments": {"procedure_id": str(pid), "re_establishment_boundary": 0}, + }, + }, 
+ headers=headers, + ) + body = parse_sse_data(response.text) + assert body["result"]["isError"] is False diff --git a/apps/api/tests/unit/operation/test_hold_procedure_decider.py b/apps/api/tests/unit/operation/test_hold_procedure_decider.py new file mode 100644 index 0000000000..bd89465080 --- /dev/null +++ b/apps/api/tests/unit/operation/test_hold_procedure_decider.py @@ -0,0 +1,150 @@ +"""Pure-decider tests for `hold_procedure` slice. + +Single-source pause transition: `Running -> Held`. Reason field +validated via `ProcedureHoldReason` VO (1-500 chars after trim). +Mirrors `hold_run`; the state name is `Held` (Procedure is an +execution-FSM sibling of Run), with a REQUIRED reason (unlike +slim `RunHeld`). +""" + +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.operation.aggregates.procedure import ( + InvalidProcedureHoldReasonError, + Procedure, + ProcedureCannotHoldError, + ProcedureHeld, + ProcedureName, + ProcedureNotFoundError, + ProcedureStatus, +) +from cora.operation.features import hold_procedure +from cora.operation.features.hold_procedure import HoldProcedure +from cora.shared.text_bounds import REASON_MAX_LENGTH + +_NOW = datetime(2026, 5, 15, 12, 0, 0, tzinfo=UTC) + + +def _procedure( + *, + procedure_id: UUID | None = None, + status: ProcedureStatus = ProcedureStatus.RUNNING, +) -> Procedure: + return Procedure( + id=procedure_id or uuid4(), + name=ProcedureName("X"), + kind="bakeout", + target_asset_ids=frozenset(), + status=status, + parent_run_id=None, + ) + + +@pytest.mark.unit +def test_decide_emits_procedure_held_when_running() -> None: + proc = _procedure() + events = hold_procedure.decide( + state=proc, + command=HoldProcedure(procedure_id=proc.id, reason="beam dropped"), + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureHeld) + assert events[0].procedure_id == proc.id + assert events[0].reason == "beam dropped" + assert events[0].occurred_at == _NOW + assert 
events[0].decided_by_decision_id is None + + +@pytest.mark.unit +def test_decide_threads_decided_by_decision_id() -> None: + proc = _procedure() + decision_id = uuid4() + events = hold_procedure.decide( + state=proc, + command=HoldProcedure( + procedure_id=proc.id, reason="autonomous hold", decided_by_decision_id=decision_id + ), + now=_NOW, + ) + assert events[0].decided_by_decision_id == decision_id + + +@pytest.mark.unit +def test_decide_trims_reason_via_vo() -> None: + proc = _procedure() + events = hold_procedure.decide( + state=proc, + command=HoldProcedure(procedure_id=proc.id, reason=" investigating fault "), + now=_NOW, + ) + assert events[0].reason == "investigating fault" + + +@pytest.mark.unit +def test_decide_rejects_when_state_is_none() -> None: + pid = uuid4() + with pytest.raises(ProcedureNotFoundError) as exc: + hold_procedure.decide( + state=None, + command=HoldProcedure(procedure_id=pid, reason="x"), + now=_NOW, + ) + assert exc.value.procedure_id == pid + + +@pytest.mark.unit +def test_decide_rejects_whitespace_only_reason() -> None: + proc = _procedure() + with pytest.raises(InvalidProcedureHoldReasonError): + hold_procedure.decide( + state=proc, + command=HoldProcedure(procedure_id=proc.id, reason=" "), + now=_NOW, + ) + + +@pytest.mark.unit +def test_decide_rejects_too_long_reason() -> None: + proc = _procedure() + with pytest.raises(InvalidProcedureHoldReasonError): + hold_procedure.decide( + state=proc, + command=HoldProcedure(procedure_id=proc.id, reason="x" * (REASON_MAX_LENGTH + 1)), + now=_NOW, + ) + + +@pytest.mark.unit +@pytest.mark.parametrize( + "status", + [ + ProcedureStatus.DEFINED, + ProcedureStatus.HELD, + ProcedureStatus.COMPLETED, + ProcedureStatus.ABORTED, + ProcedureStatus.TRUNCATED, + ], +) +def test_decide_rejects_non_running_status(status: ProcedureStatus) -> None: + """Holding a non-Running procedure raises (re-holding a Held one too).""" + proc = _procedure(status=status) + with pytest.raises(ProcedureCannotHoldError) 
as exc: + hold_procedure.decide( + state=proc, + command=HoldProcedure(procedure_id=proc.id, reason="x"), + now=_NOW, + ) + assert exc.value.current_status is status + + +@pytest.mark.unit +def test_decide_is_pure_same_inputs_same_outputs() -> None: + proc = _procedure() + cmd = HoldProcedure(procedure_id=proc.id, reason="break") + first = hold_procedure.decide(state=proc, command=cmd, now=_NOW) + second = hold_procedure.decide(state=proc, command=cmd, now=_NOW) + assert first == second diff --git a/apps/api/tests/unit/operation/test_hold_procedure_decider_properties.py b/apps/api/tests/unit/operation/test_hold_procedure_decider_properties.py new file mode 100644 index 0000000000..45a8d726e1 --- /dev/null +++ b/apps/api/tests/unit/operation/test_hold_procedure_decider_properties.py @@ -0,0 +1,167 @@ +"""Property-based tests for `hold_procedure.decide` (Operation BC). + +Complements the example-based `test_hold_procedure_decider.py` with +universal claims across generated inputs. The decider is a pure +single-source pause transition with a reason: + + (state, command, now) -> list[ProcedureHeld] + +Load-bearing properties: + + - state=None always raises `ProcedureNotFoundError` carrying + command.procedure_id. + - The source-state partition is total over `ProcedureStatus`: the + sole source `{Running}` emits exactly one `ProcedureHeld` + (procedure_id=state.id, reason threaded, occurred_at=now); every + other status raises `ProcedureCannotHoldError` carrying the current + status. (Adding a new status auto-extends `_DISALLOWED_SOURCES`.) + - The emitted event's procedure_id is `state.id`, never + command.procedure_id. + - Pure: same (state, command, now) returns equal events. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from hypothesis import assume, given +from hypothesis import strategies as st + +from cora.operation.aggregates.procedure import ( + Procedure, + ProcedureCannotHoldError, + ProcedureHeld, + ProcedureName, + ProcedureNotFoundError, + ProcedureStatus, +) +from cora.operation.features import hold_procedure +from cora.operation.features.hold_procedure import HoldProcedure +from tests._strategies import aware_datetimes, printable_ascii_text + +if TYPE_CHECKING: + from datetime import datetime + from uuid import UUID + +_REASON = printable_ascii_text(min_size=1, max_size=500) + +_HOLDABLE_SOURCES = (ProcedureStatus.RUNNING,) +_DISALLOWED_SOURCES = tuple(s for s in ProcedureStatus if s not in frozenset(_HOLDABLE_SOURCES)) + + +def _procedure(*, procedure_id: UUID, status: ProcedureStatus) -> Procedure: + return Procedure( + id=procedure_id, + name=ProcedureName("X"), + kind="bakeout", + target_asset_ids=frozenset(), + status=status, + parent_run_id=None, + ) + + +@pytest.mark.unit +@given(procedure_id=st.uuids(), reason=_REASON, now=aware_datetimes()) +def test_hold_with_none_state_always_raises_not_found( + procedure_id: UUID, + reason: str, + now: datetime, +) -> None: + """Empty stream always raises `ProcedureNotFoundError` carrying command id.""" + with pytest.raises(ProcedureNotFoundError) as exc: + hold_procedure.decide( + state=None, + command=HoldProcedure(procedure_id=procedure_id, reason=reason), + now=now, + ) + assert exc.value.procedure_id == procedure_id + + +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + source=st.sampled_from(_HOLDABLE_SOURCES), + reason=_REASON, + now=aware_datetimes(), +) +def test_hold_from_permitted_source_emits_single_event( + procedure_id: UUID, + source: ProcedureStatus, + reason: str, + now: datetime, +) -> None: + """Running emits one ProcedureHeld with the threaded reason.""" + events = hold_procedure.decide( + 
state=_procedure(procedure_id=procedure_id, status=source), + command=HoldProcedure(procedure_id=procedure_id, reason=reason), + now=now, + ) + assert events == [ProcedureHeld(procedure_id=procedure_id, reason=reason, occurred_at=now)] + + +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + source=st.sampled_from(_DISALLOWED_SOURCES), + reason=_REASON, + now=aware_datetimes(), +) +def test_hold_from_disallowed_source_always_raises_cannot_hold( + procedure_id: UUID, + source: ProcedureStatus, + reason: str, + now: datetime, +) -> None: + """Any non-Running source raises ProcedureCannotHoldError carrying the status. + + A valid reason is supplied so the source-state guard is what fires + (reason validation runs first in the decider). + """ + with pytest.raises(ProcedureCannotHoldError) as exc: + hold_procedure.decide( + state=_procedure(procedure_id=procedure_id, status=source), + command=HoldProcedure(procedure_id=procedure_id, reason=reason), + now=now, + ) + assert exc.value.current_status is source + + +@pytest.mark.unit +@given( + state_procedure_id=st.uuids(), + command_procedure_id=st.uuids(), + source=st.sampled_from(_HOLDABLE_SOURCES), + reason=_REASON, + now=aware_datetimes(), +) +def test_hold_uses_state_id_not_command_procedure_id( + state_procedure_id: UUID, + command_procedure_id: UUID, + source: ProcedureStatus, + reason: str, + now: datetime, +) -> None: + """The emitted event's procedure_id is state.id, not command.procedure_id.""" + assume(state_procedure_id != command_procedure_id) + events = hold_procedure.decide( + state=_procedure(procedure_id=state_procedure_id, status=source), + command=HoldProcedure(procedure_id=command_procedure_id, reason=reason), + now=now, + ) + assert events[0].procedure_id == state_procedure_id + + +@pytest.mark.unit +@given(procedure_id=st.uuids(), reason=_REASON, now=aware_datetimes()) +def test_hold_is_pure_same_input_same_output( + procedure_id: UUID, + reason: str, + now: datetime, +) -> None: + """Two calls 
with identical args return equal events (no clock leakage).""" + state = _procedure(procedure_id=procedure_id, status=ProcedureStatus.RUNNING) + command = HoldProcedure(procedure_id=procedure_id, reason=reason) + first = hold_procedure.decide(state=state, command=command, now=now) + second = hold_procedure.decide(state=state, command=command, now=now) + assert first == second diff --git a/apps/api/tests/unit/operation/test_hold_procedure_handler.py b/apps/api/tests/unit/operation/test_hold_procedure_handler.py new file mode 100644 index 0000000000..6dc7ef999e --- /dev/null +++ b/apps/api/tests/unit/operation/test_hold_procedure_handler.py @@ -0,0 +1,125 @@ +"""Application-handler tests for `hold_procedure` slice. + +Update-style handler via `make_procedure_update_handler`. The reason is +captured on the emitted `ProcedureHeld` payload but NOT logged at the +handler boundary (mirrors abort_procedure precedent). +""" + +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.infrastructure.adapters.in_memory_event_store import InMemoryEventStore +from cora.operation.aggregates.procedure import ( + InvalidProcedureHoldReasonError, + ProcedureCannotHoldError, + ProcedureNotFoundError, +) +from cora.operation.errors import UnauthorizedError +from cora.operation.features import hold_procedure +from cora.operation.features.hold_procedure import HoldProcedure +from tests.unit._helpers import build_deps as _build_deps_shared +from tests.unit.operation._helpers import seed_running_procedure + +_NOW = datetime(2026, 5, 15, 12, 0, 0, tzinfo=UTC) +_PRIOR = datetime(2026, 5, 15, 11, 0, 0, tzinfo=UTC) +_PROCEDURE_ID = UUID("01900000-0000-7000-8000-0000000c0e01") +_EVENT_ID = UUID("01900000-0000-7000-8000-0000000c0e02") +_PRINCIPAL_ID = UUID("01900000-0000-7000-8000-000000000099") +_CORRELATION_ID = UUID("01900000-0000-7000-8000-0000000000aa") + + +async def _seed_running_procedure(store: InMemoryEventStore) -> None: + await 
seed_running_procedure( + store, + procedure_id=_PROCEDURE_ID, + when=_PRIOR, + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + + +@pytest.mark.unit +async def test_handler_appends_procedure_held_event_with_trimmed_reason() -> None: + store = InMemoryEventStore() + await _seed_running_procedure(store) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + handler = hold_procedure.bind(deps) + + await handler( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason=" beam dropped "), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + events, version = await store.load("Procedure", _PROCEDURE_ID) + assert version == 3 + assert events[2].event_type == "ProcedureHeld" + assert events[2].payload == { + "procedure_id": str(_PROCEDURE_ID), + "reason": "beam dropped", + "decided_by_decision_id": None, + "occurred_at": _NOW.isoformat(), + } + + +@pytest.mark.unit +async def test_handler_raises_when_procedure_not_found() -> None: + store = InMemoryEventStore() + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + handler = hold_procedure.bind(deps) + with pytest.raises(ProcedureNotFoundError): + await handler( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason="x"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_handler_raises_cannot_hold_when_re_holding() -> None: + """Strict-not-idempotent: re-holding a Held procedure raises.""" + store = InMemoryEventStore() + await _seed_running_procedure(store) + await hold_procedure.bind(_build_deps_shared(ids=[uuid4()], now=_NOW, event_store=store))( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason="first"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + with pytest.raises(ProcedureCannotHoldError): + await hold_procedure.bind(_build_deps_shared(ids=[uuid4()], now=_NOW, event_store=store))( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason="second"), + 
principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_handler_raises_invalid_reason_for_whitespace_only() -> None: + store = InMemoryEventStore() + await _seed_running_procedure(store) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + handler = hold_procedure.bind(deps) + with pytest.raises(InvalidProcedureHoldReasonError): + await handler( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason=" "), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_handler_raises_unauthorized_on_deny() -> None: + store = InMemoryEventStore() + await _seed_running_procedure(store) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store, deny=True) + handler = hold_procedure.bind(deps) + with pytest.raises(UnauthorizedError): + await handler( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason="r"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + _, version = await store.load("Procedure", _PROCEDURE_ID) + assert version == 2 diff --git a/apps/api/tests/unit/operation/test_procedure.py b/apps/api/tests/unit/operation/test_procedure.py index e5f592b1c2..c2f95b7816 100644 --- a/apps/api/tests/unit/operation/test_procedure.py +++ b/apps/api/tests/unit/operation/test_procedure.py @@ -68,18 +68,22 @@ def test_procedure_name_rejects_over_max_length() -> None: @pytest.mark.unit def test_procedure_status_values_locked() -> None: - """Pin the 5-state FSM values; future additions must be a deliberate test edit. + """Pin the 6-state FSM values; future additions must be a deliberate test edit. The FSM was REVISED from BC map's `Idle/Starting/Running/Verifying/Complete/Aborted` per standards-corpus research at [[project_operation_design]]: Verifying is NOT - standards-blessed at FSM level; transient states deferred per Run BC precedent.""" + standards-blessed at FSM level; transient states deferred per Run BC precedent. 
+ `Held` lands in Tier 1 of [[project_resumable_conduct_design]] (operator-pause of + a halted conduct; mirrors RunStatus.HELD).""" assert ProcedureStatus.DEFINED.value == "Defined" assert ProcedureStatus.RUNNING.value == "Running" + assert ProcedureStatus.HELD.value == "Held" assert ProcedureStatus.COMPLETED.value == "Completed" assert ProcedureStatus.ABORTED.value == "Aborted" assert ProcedureStatus.TRUNCATED.value == "Truncated" assert {s.value for s in ProcedureStatus} == { "Defined", "Running", + "Held", "Completed", "Aborted", "Truncated", @@ -90,7 +94,8 @@ def test_procedure_status_values_locked() -> None: def test_procedure_status_is_terminal_partitions_the_fsm() -> None: """is_terminal is True exactly for the terminal states. Pinned so a new state must consciously classify itself (consumers like register_dataset's - terminal-producing-Procedure guard rely on this).""" + terminal-producing-Procedure guard rely on this). `Held` is a pause-state, + NOT terminal.""" terminal = {s for s in ProcedureStatus if s.is_terminal} assert terminal == { ProcedureStatus.COMPLETED, @@ -99,6 +104,7 @@ def test_procedure_status_is_terminal_partitions_the_fsm() -> None: } assert not ProcedureStatus.DEFINED.is_terminal assert not ProcedureStatus.RUNNING.is_terminal + assert not ProcedureStatus.HELD.is_terminal # ---------- Error class shapes ---------- diff --git a/apps/api/tests/unit/operation/test_procedure_events.py b/apps/api/tests/unit/operation/test_procedure_events.py index 3bd8bd0670..09b5870524 100644 --- a/apps/api/tests/unit/operation/test_procedure_events.py +++ b/apps/api/tests/unit/operation/test_procedure_events.py @@ -11,9 +11,11 @@ ProcedureAborted, ProcedureActivitiesLogbookOpened, ProcedureCompleted, + ProcedureHeld, ProcedureIterationEnded, ProcedureIterationStarted, ProcedureRegistered, + ProcedureResumed, ProcedureStarted, ProcedureTruncated, RecipeExpansionRecorded, @@ -728,6 +730,8 @@ def test_iteration_ended_round_trips(converged: bool | None, reason: 
str | None) "ProcedureCompleted", "ProcedureAborted", "ProcedureTruncated", + "ProcedureHeld", + "ProcedureResumed", "ProcedureActivitiesLogbookOpened", "ProcedureIterationStarted", "ProcedureIterationEnded", @@ -742,3 +746,88 @@ def test_from_stored_raises_on_malformed_payload(event_type: str) -> None: in the load path.""" with pytest.raises(ValueError, match=f"Malformed {event_type} payload"): from_stored(_stored(event_type, {})) + + +# --- ProcedureHeld / ProcedureResumed (resumable conduct, Tier 1) --- + + +@pytest.mark.unit +def test_event_type_names_for_hold_resume() -> None: + held = ProcedureHeld(procedure_id=uuid4(), reason="pause", occurred_at=_NOW) + resumed = ProcedureResumed(procedure_id=uuid4(), re_establishment_boundary=0, occurred_at=_NOW) + assert event_type_name(held) == "ProcedureHeld" + assert event_type_name(resumed) == "ProcedureResumed" + + +@pytest.mark.unit +def test_to_payload_serializes_procedure_held() -> None: + pid = uuid4() + decision_id = uuid4() + payload = to_payload( + ProcedureHeld( + procedure_id=pid, + reason="beam dropped", + decided_by_decision_id=decision_id, + occurred_at=_NOW, + ) + ) + assert payload == { + "procedure_id": str(pid), + "reason": "beam dropped", + "decided_by_decision_id": str(decision_id), + "occurred_at": _NOW.isoformat(), + } + + +@pytest.mark.unit +def test_to_payload_serializes_procedure_resumed_with_null_decision() -> None: + pid = uuid4() + payload = to_payload( + ProcedureResumed(procedure_id=pid, re_establishment_boundary=5, occurred_at=_NOW) + ) + assert payload == { + "procedure_id": str(pid), + "re_establishment_boundary": 5, + "decided_by_decision_id": None, + "occurred_at": _NOW.isoformat(), + } + + +@pytest.mark.unit +@pytest.mark.parametrize("decision_id", [None, uuid4()]) +def test_procedure_held_round_trips(decision_id: UUID | None) -> None: + event = ProcedureHeld( + procedure_id=uuid4(), + reason="investigating fault", + decided_by_decision_id=decision_id, + occurred_at=_NOW, + ) + 
rebuilt = from_stored(_stored("ProcedureHeld", to_payload(event))) + assert rebuilt == event + + +@pytest.mark.unit +@pytest.mark.parametrize("decision_id", [None, uuid4()]) +def test_procedure_resumed_round_trips(decision_id: UUID | None) -> None: + event = ProcedureResumed( + procedure_id=uuid4(), + re_establishment_boundary=3, + decided_by_decision_id=decision_id, + occurred_at=_NOW, + ) + rebuilt = from_stored(_stored("ProcedureResumed", to_payload(event))) + assert rebuilt == event + + +@pytest.mark.unit +def test_from_stored_held_without_decided_by_key_folds_to_none() -> None: + """Forward-compat: a pre-supervisor stream omits decided_by_decision_id.""" + pid = uuid4() + rebuilt = from_stored( + _stored( + "ProcedureHeld", + {"procedure_id": str(pid), "reason": "pause", "occurred_at": _NOW.isoformat()}, + ) + ) + assert isinstance(rebuilt, ProcedureHeld) + assert rebuilt.decided_by_decision_id is None diff --git a/apps/api/tests/unit/operation/test_procedure_evolver.py b/apps/api/tests/unit/operation/test_procedure_evolver.py index d54cc9fd75..b9745401b6 100644 --- a/apps/api/tests/unit/operation/test_procedure_evolver.py +++ b/apps/api/tests/unit/operation/test_procedure_evolver.py @@ -13,10 +13,12 @@ ProcedureActivitiesLogbookOpened, ProcedureCompleted, ProcedureEvent, + ProcedureHeld, ProcedureIterationEnded, ProcedureIterationStarted, ProcedureName, ProcedureRegistered, + ProcedureResumed, ProcedureStarted, ProcedureStatus, ProcedureTruncated, @@ -377,6 +379,126 @@ def test_evolve_procedure_aborted_on_empty_state_raises() -> None: evolve(None, ProcedureAborted(procedure_id=uuid4(), reason="x", occurred_at=_NOW)) +# --- ProcedureHeld / ProcedureResumed arms (resumable conduct, Tier 1) --- + + +def _to_running(prior: Procedure) -> Procedure: + return evolve(prior, ProcedureStarted(procedure_id=prior.id, occurred_at=_NOW)) + + +@pytest.mark.unit +def test_evolve_procedure_held_sets_status_to_held() -> None: + running = _to_running(_defined()) + state = 
evolve( + running, ProcedureHeld(procedure_id=running.id, reason="beam dropped", occurred_at=_NOW) + ) + assert state.status is ProcedureStatus.HELD + + +@pytest.mark.unit +def test_evolve_procedure_resumed_sets_status_to_running() -> None: + running = _to_running(_defined()) + held = evolve( + running, ProcedureHeld(procedure_id=running.id, reason="beam dropped", occurred_at=_NOW) + ) + state = evolve( + held, ProcedureResumed(procedure_id=held.id, re_establishment_boundary=2, occurred_at=_NOW) + ) + assert state.status is ProcedureStatus.RUNNING + + +@pytest.mark.unit +def test_evolve_procedure_held_preserves_iteration_denorms_and_actuation_kind() -> None: + """The load-bearing carry-forward: the Held arm must not wipe the + iteration denorms (the bug class the AST fitness guards).""" + running = _to_running(_defined(name="alignment", kind="alignment")) + # Open + close an iteration so the denorms are non-default. + started_iter = evolve( + running, + ProcedureIterationStarted(procedure_id=running.id, iteration_index=1, occurred_at=_NOW), + ) + ended_iter = evolve( + started_iter, + ProcedureIterationEnded( + procedure_id=running.id, + iteration_index=1, + converged=False, + reason=None, + occurred_at=_NOW, + ), + ) + state = evolve( + ended_iter, ProcedureHeld(procedure_id=running.id, reason="pause", occurred_at=_NOW) + ) + assert state.status is ProcedureStatus.HELD + assert state.iteration_count == ended_iter.iteration_count == 1 + assert state.current_iteration_index is None + assert ( + state.consecutive_unconverged_iterations + == ended_iter.consecutive_unconverged_iterations + == 1 + ) + assert state.kind == "alignment" + + +@pytest.mark.unit +def test_evolve_procedure_resumed_preserves_iteration_denorms() -> None: + running = _to_running(_defined()) + started_iter = evolve( + running, + ProcedureIterationStarted(procedure_id=running.id, iteration_index=1, occurred_at=_NOW), + ) + held = evolve( + started_iter, ProcedureHeld(procedure_id=running.id, 
reason="pause", occurred_at=_NOW) + ) + state = evolve( + held, ProcedureResumed(procedure_id=held.id, re_establishment_boundary=0, occurred_at=_NOW) + ) + assert state.status is ProcedureStatus.RUNNING + # An iteration left open across the hold stays open on resume. + assert state.current_iteration_index == 1 + assert state.iteration_count == 1 + + +@pytest.mark.unit +def test_fold_hold_resume_cycle_lands_running() -> None: + pid = uuid4() + state = fold( + [ + ProcedureRegistered( + procedure_id=pid, + name="alignment", + kind="alignment", + target_asset_ids=(), + parent_run_id=None, + occurred_at=_NOW, + ), + ProcedureStarted(procedure_id=pid, occurred_at=_NOW), + ProcedureHeld(procedure_id=pid, reason="first pause", occurred_at=_NOW), + ProcedureResumed(procedure_id=pid, re_establishment_boundary=0, occurred_at=_NOW), + ProcedureHeld(procedure_id=pid, reason="second pause", occurred_at=_NOW), + ProcedureResumed(procedure_id=pid, re_establishment_boundary=3, occurred_at=_NOW), + ] + ) + assert state is not None + assert state.status is ProcedureStatus.RUNNING + + +@pytest.mark.unit +def test_evolve_procedure_held_on_empty_state_raises() -> None: + with pytest.raises(ValueError, match="ProcedureHeld"): + evolve(None, ProcedureHeld(procedure_id=uuid4(), reason="x", occurred_at=_NOW)) + + +@pytest.mark.unit +def test_evolve_procedure_resumed_on_empty_state_raises() -> None: + with pytest.raises(ValueError, match="ProcedureResumed"): + evolve( + None, + ProcedureResumed(procedure_id=uuid4(), re_establishment_boundary=0, occurred_at=_NOW), + ) + + # --- ProcedureActivitiesLogbookOpened arm --- diff --git a/apps/api/tests/unit/operation/test_procedure_summary_projection.py b/apps/api/tests/unit/operation/test_procedure_summary_projection.py index 49f995ec3d..54e2f5f575 100644 --- a/apps/api/tests/unit/operation/test_procedure_summary_projection.py +++ b/apps/api/tests/unit/operation/test_procedure_summary_projection.py @@ -70,6 +70,22 @@ def 
test_projection_does_not_subscribe_to_iteration_ended() -> None: assert "ProcedureIterationEnded" not in proj.subscribed_event_types +@pytest.mark.unit +def test_projection_does_not_subscribe_to_hold_resume() -> None: + """Tier-1 resumable conduct deliberately leaves the summary read model + untouched: the `status` CHECK constraint admits only the 5 non-Held + statuses, so subscribing to ProcedureHeld/Resumed would write a value + the column rejects. A held Procedure therefore shows its last + subscribed status (Running) in `list_procedures`; terminal states are + still captured because abort/truncate/complete require resuming to + Running first (and those events ARE subscribed). Surfacing `Held` in + the read model is a follow-up that needs a forward-only migration to + widen the CHECK. See [[project_resumable_conduct_design]].""" + proj = ProcedureSummaryProjection() + assert "ProcedureHeld" not in proj.subscribed_event_types + assert "ProcedureResumed" not in proj.subscribed_event_types + + @pytest.mark.unit async def test_procedure_registered_inserts_with_defined_status_and_null_audit() -> None: proj = ProcedureSummaryProjection() diff --git a/apps/api/tests/unit/operation/test_resume_procedure_decider.py b/apps/api/tests/unit/operation/test_resume_procedure_decider.py new file mode 100644 index 0000000000..4bb094d1b3 --- /dev/null +++ b/apps/api/tests/unit/operation/test_resume_procedure_decider.py @@ -0,0 +1,140 @@ +"""Pure-decider tests for `resume_procedure` slice. + +Single-source resume transition: `Held -> Running`. Carries +`re_establishment_boundary` (>= 0). Mirrors `resume_run`. The +off-diagonal guard (parent Run Held) is NOT in this pure decider; it +lands in the handler in a follow-up slice. 
+""" + +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.operation.aggregates.procedure import ( + InvalidProcedureReEstablishmentBoundaryError, + Procedure, + ProcedureCannotResumeError, + ProcedureName, + ProcedureNotFoundError, + ProcedureResumed, + ProcedureStatus, +) +from cora.operation.features import resume_procedure +from cora.operation.features.resume_procedure import ResumeProcedure + +_NOW = datetime(2026, 5, 15, 12, 0, 0, tzinfo=UTC) + + +def _procedure( + *, + procedure_id: UUID | None = None, + status: ProcedureStatus = ProcedureStatus.HELD, +) -> Procedure: + return Procedure( + id=procedure_id or uuid4(), + name=ProcedureName("X"), + kind="bakeout", + target_asset_ids=frozenset(), + status=status, + parent_run_id=None, + ) + + +@pytest.mark.unit +def test_decide_emits_procedure_resumed_when_held() -> None: + proc = _procedure() + events = resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=3), + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureResumed) + assert events[0].procedure_id == proc.id + assert events[0].re_establishment_boundary == 3 + assert events[0].occurred_at == _NOW + assert events[0].decided_by_decision_id is None + + +@pytest.mark.unit +def test_decide_threads_decided_by_decision_id() -> None: + proc = _procedure() + decision_id = uuid4() + events = resume_procedure.decide( + state=proc, + command=ResumeProcedure( + procedure_id=proc.id, + re_establishment_boundary=0, + decided_by_decision_id=decision_id, + ), + now=_NOW, + ) + assert events[0].decided_by_decision_id == decision_id + + +@pytest.mark.unit +def test_decide_accepts_zero_boundary() -> None: + """Boundary 0 = re-establish from the first step (valid).""" + proc = _procedure() + events = resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=0), + now=_NOW, + ) + assert 
events[0].re_establishment_boundary == 0 + + +@pytest.mark.unit +def test_decide_rejects_negative_boundary() -> None: + proc = _procedure() + with pytest.raises(InvalidProcedureReEstablishmentBoundaryError): + resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=-1), + now=_NOW, + ) + + +@pytest.mark.unit +def test_decide_rejects_when_state_is_none() -> None: + pid = uuid4() + with pytest.raises(ProcedureNotFoundError) as exc: + resume_procedure.decide( + state=None, + command=ResumeProcedure(procedure_id=pid, re_establishment_boundary=0), + now=_NOW, + ) + assert exc.value.procedure_id == pid + + +@pytest.mark.unit +@pytest.mark.parametrize( + "status", + [ + ProcedureStatus.DEFINED, + ProcedureStatus.RUNNING, + ProcedureStatus.COMPLETED, + ProcedureStatus.ABORTED, + ProcedureStatus.TRUNCATED, + ], +) +def test_decide_rejects_non_held_status(status: ProcedureStatus) -> None: + """Resuming a non-Held procedure raises (resuming a Running one too).""" + proc = _procedure(status=status) + with pytest.raises(ProcedureCannotResumeError) as exc: + resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=0), + now=_NOW, + ) + assert exc.value.current_status is status + + +@pytest.mark.unit +def test_decide_is_pure_same_inputs_same_outputs() -> None: + proc = _procedure() + cmd = ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=2) + first = resume_procedure.decide(state=proc, command=cmd, now=_NOW) + second = resume_procedure.decide(state=proc, command=cmd, now=_NOW) + assert first == second diff --git a/apps/api/tests/unit/operation/test_resume_procedure_decider_properties.py b/apps/api/tests/unit/operation/test_resume_procedure_decider_properties.py new file mode 100644 index 0000000000..441112edf8 --- /dev/null +++ b/apps/api/tests/unit/operation/test_resume_procedure_decider_properties.py @@ -0,0 +1,199 @@ +"""Property-based tests for 
`resume_procedure.decide` (Operation BC). + +Complements the example-based `test_resume_procedure_decider.py` with +universal claims across generated inputs. The decider is a pure +single-source resume transition carrying a re-establishment boundary: + + (state, command, now) -> list[ProcedureResumed] + +Load-bearing properties: + + - state=None always raises `ProcedureNotFoundError` carrying + command.procedure_id. + - A negative re_establishment_boundary always raises + `InvalidProcedureReEstablishmentBoundaryError` (validated before the + status guard). + - The source-state partition is total over `ProcedureStatus`: the sole + source `{Held}` emits exactly one `ProcedureResumed` (procedure_id= + state.id, boundary threaded, occurred_at=now); every other status + raises `ProcedureCannotResumeError`. (Adding a new status + auto-extends `_DISALLOWED_SOURCES`.) + - The emitted event's procedure_id is `state.id`, never + command.procedure_id. + - Pure: same (state, command, now) returns equal events. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from hypothesis import assume, given +from hypothesis import strategies as st + +from cora.operation.aggregates.procedure import ( + InvalidProcedureReEstablishmentBoundaryError, + Procedure, + ProcedureCannotResumeError, + ProcedureName, + ProcedureNotFoundError, + ProcedureResumed, + ProcedureStatus, +) +from cora.operation.features import resume_procedure +from cora.operation.features.resume_procedure import ResumeProcedure +from tests._strategies import aware_datetimes + +if TYPE_CHECKING: + from datetime import datetime + from uuid import UUID + +_BOUNDARY = st.integers(min_value=0, max_value=1_000_000) + +_RESUMABLE_SOURCES = (ProcedureStatus.HELD,) +_DISALLOWED_SOURCES = tuple(s for s in ProcedureStatus if s not in frozenset(_RESUMABLE_SOURCES)) + + +def _procedure(*, procedure_id: UUID, status: ProcedureStatus) -> Procedure: + return Procedure( + id=procedure_id, + name=ProcedureName("X"), + kind="bakeout", + target_asset_ids=frozenset(), + status=status, + parent_run_id=None, + ) + + +@pytest.mark.unit +@given(procedure_id=st.uuids(), boundary=_BOUNDARY, now=aware_datetimes()) +def test_resume_with_none_state_always_raises_not_found( + procedure_id: UUID, + boundary: int, + now: datetime, +) -> None: + """Empty stream always raises `ProcedureNotFoundError` carrying command id.""" + with pytest.raises(ProcedureNotFoundError) as exc: + resume_procedure.decide( + state=None, + command=ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + now=now, + ) + assert exc.value.procedure_id == procedure_id + + +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + source=st.sampled_from(list(ProcedureStatus)), + boundary=st.integers(max_value=-1), + now=aware_datetimes(), +) +def test_resume_with_negative_boundary_always_raises_invalid( + procedure_id: UUID, + source: ProcedureStatus, + boundary: int, + now: datetime, +) -> None: + """A 
negative boundary raises before the status guard, for any source state.""" + with pytest.raises(InvalidProcedureReEstablishmentBoundaryError): + resume_procedure.decide( + state=_procedure(procedure_id=procedure_id, status=source), + command=ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + now=now, + ) + + +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + source=st.sampled_from(_RESUMABLE_SOURCES), + boundary=_BOUNDARY, + now=aware_datetimes(), +) +def test_resume_from_permitted_source_emits_single_event( + procedure_id: UUID, + source: ProcedureStatus, + boundary: int, + now: datetime, +) -> None: + """Held emits one ProcedureResumed with the threaded boundary.""" + events = resume_procedure.decide( + state=_procedure(procedure_id=procedure_id, status=source), + command=ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + now=now, + ) + assert events == [ + ProcedureResumed( + procedure_id=procedure_id, re_establishment_boundary=boundary, occurred_at=now + ) + ] + + +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + source=st.sampled_from(_DISALLOWED_SOURCES), + boundary=_BOUNDARY, + now=aware_datetimes(), +) +def test_resume_from_disallowed_source_always_raises_cannot_resume( + procedure_id: UUID, + source: ProcedureStatus, + boundary: int, + now: datetime, +) -> None: + """Any non-Held source raises ProcedureCannotResumeError carrying the status. + + A valid (non-negative) boundary is supplied so the source-state guard + is what fires (boundary validation runs first in the decider). 
+ """ + with pytest.raises(ProcedureCannotResumeError) as exc: + resume_procedure.decide( + state=_procedure(procedure_id=procedure_id, status=source), + command=ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + now=now, + ) + assert exc.value.current_status is source + + +@pytest.mark.unit +@given( + state_procedure_id=st.uuids(), + command_procedure_id=st.uuids(), + source=st.sampled_from(_RESUMABLE_SOURCES), + boundary=_BOUNDARY, + now=aware_datetimes(), +) +def test_resume_uses_state_id_not_command_procedure_id( + state_procedure_id: UUID, + command_procedure_id: UUID, + source: ProcedureStatus, + boundary: int, + now: datetime, +) -> None: + """The emitted event's procedure_id is state.id, not command.procedure_id.""" + assume(state_procedure_id != command_procedure_id) + events = resume_procedure.decide( + state=_procedure(procedure_id=state_procedure_id, status=source), + command=ResumeProcedure( + procedure_id=command_procedure_id, re_establishment_boundary=boundary + ), + now=now, + ) + assert events[0].procedure_id == state_procedure_id + + +@pytest.mark.unit +@given(procedure_id=st.uuids(), boundary=_BOUNDARY, now=aware_datetimes()) +def test_resume_is_pure_same_input_same_output( + procedure_id: UUID, + boundary: int, + now: datetime, +) -> None: + """Two calls with identical args return equal events (no clock leakage).""" + state = _procedure(procedure_id=procedure_id, status=ProcedureStatus.HELD) + command = ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary) + first = resume_procedure.decide(state=state, command=command, now=now) + second = resume_procedure.decide(state=state, command=command, now=now) + assert first == second diff --git a/apps/api/tests/unit/operation/test_resume_procedure_handler.py b/apps/api/tests/unit/operation/test_resume_procedure_handler.py new file mode 100644 index 0000000000..e9a5d1c497 --- /dev/null +++ 
b/apps/api/tests/unit/operation/test_resume_procedure_handler.py @@ -0,0 +1,143 @@ +"""Application-handler tests for `resume_procedure` slice. + +Update-style handler via `make_procedure_update_handler`. Source state +is `Held`, reached here by seeding Running then holding. The +off-diagonal parent-Run-Held guard is a follow-up slice; this test +covers the status-guard handler only. +""" + +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.infrastructure.adapters.in_memory_event_store import InMemoryEventStore +from cora.operation.aggregates.procedure import ( + ProcedureCannotResumeError, + ProcedureNotFoundError, +) +from cora.operation.errors import UnauthorizedError +from cora.operation.features import hold_procedure, resume_procedure +from cora.operation.features.hold_procedure import HoldProcedure +from cora.operation.features.resume_procedure import ResumeProcedure +from tests.unit._helpers import build_deps as _build_deps_shared +from tests.unit.operation._helpers import seed_running_procedure + +_NOW = datetime(2026, 5, 15, 12, 0, 0, tzinfo=UTC) +_PRIOR = datetime(2026, 5, 15, 11, 0, 0, tzinfo=UTC) +_PROCEDURE_ID = UUID("01900000-0000-7000-8000-0000000c0f01") +_EVENT_ID = UUID("01900000-0000-7000-8000-0000000c0f02") +_PRINCIPAL_ID = UUID("01900000-0000-7000-8000-000000000099") +_CORRELATION_ID = UUID("01900000-0000-7000-8000-0000000000aa") + + +async def _seed_held_procedure(store: InMemoryEventStore) -> None: + await seed_running_procedure( + store, + procedure_id=_PROCEDURE_ID, + when=_PRIOR, + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + await hold_procedure.bind(_build_deps_shared(ids=[uuid4()], now=_PRIOR, event_store=store))( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason="beam dropped"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_handler_appends_procedure_resumed_event() -> None: + store = InMemoryEventStore() + 
await _seed_held_procedure(store) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + handler = resume_procedure.bind(deps) + + await handler( + ResumeProcedure(procedure_id=_PROCEDURE_ID, re_establishment_boundary=2), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + events, version = await store.load("Procedure", _PROCEDURE_ID) + assert version == 4 # Registered, Started, Held, Resumed + assert events[3].event_type == "ProcedureResumed" + assert events[3].payload == { + "procedure_id": str(_PROCEDURE_ID), + "re_establishment_boundary": 2, + "decided_by_decision_id": None, + "occurred_at": _NOW.isoformat(), + } + + +@pytest.mark.unit +async def test_handler_raises_when_procedure_not_found() -> None: + store = InMemoryEventStore() + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + handler = resume_procedure.bind(deps) + with pytest.raises(ProcedureNotFoundError): + await handler( + ResumeProcedure(procedure_id=_PROCEDURE_ID, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_handler_raises_cannot_resume_when_running() -> None: + """Resuming a Running (not Held) procedure raises.""" + store = InMemoryEventStore() + await seed_running_procedure( + store, + procedure_id=_PROCEDURE_ID, + when=_PRIOR, + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + with pytest.raises(ProcedureCannotResumeError): + await resume_procedure.bind(deps)( + ResumeProcedure(procedure_id=_PROCEDURE_ID, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_handler_round_trips_hold_then_resume_back_to_running() -> None: + """Hold then resume lands the Procedure back in Running (bidirectional cycle).""" + store = InMemoryEventStore() + await _seed_held_procedure(store) 
+ await resume_procedure.bind(_build_deps_shared(ids=[uuid4()], now=_NOW, event_store=store))( + ResumeProcedure(procedure_id=_PROCEDURE_ID, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + # A second hold now succeeds (the cycle is open again). + await hold_procedure.bind(_build_deps_shared(ids=[uuid4()], now=_NOW, event_store=store))( + HoldProcedure(procedure_id=_PROCEDURE_ID, reason="second pause"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + events, _ = await store.load("Procedure", _PROCEDURE_ID) + assert [e.event_type for e in events] == [ + "ProcedureRegistered", + "ProcedureStarted", + "ProcedureHeld", + "ProcedureResumed", + "ProcedureHeld", + ] + + +@pytest.mark.unit +async def test_handler_raises_unauthorized_on_deny() -> None: + store = InMemoryEventStore() + await _seed_held_procedure(store) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store, deny=True) + handler = resume_procedure.bind(deps) + with pytest.raises(UnauthorizedError): + await handler( + ResumeProcedure(procedure_id=_PROCEDURE_ID, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) From 401a2bbdfb55b20b8957040b7d06f598127f0f1c Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 08:14:33 +0300 Subject: [PATCH 03/12] feat(operation): widen abort/truncate/end_iteration to accept Held Completes the resumable-conduct Tier-1 widening: a paused (Held) Procedure is no longer stranded. abort_procedure and truncate_procedure now accept `Running | Held` (a paused conduct stays abortable, and a paused-then-de-facto-dead one can be truncated retroactively); end_iteration accepts `Held` so an iteration left open when the operator paused can still be closed. start_iteration and complete_procedure and append_activities stay Running-only (no new iteration / no completion / no steps while paused; resume first). 
Mirrors Run BC's `Running | Held` source sets. The abort/truncate property-based tests' permitted-source sets gain Held (so the enum-derived disallowed set drops it); positive Held example tests added for all three verbs; the Cannot{Abort,Truncate, EndIteration}Error docstrings and the abort/truncate runtime messages now name `Running | Held` (matching RunCannotAbortError's "requires Running or Held"). Also drops a few stale "deferred to 10c-c / if Held lands" phase-tagged docstrings the FSM addition invalidated. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../operation/aggregates/procedure/events.py | 14 ++++---- .../operation/aggregates/procedure/state.py | 36 ++++++++++--------- .../features/abort_procedure/decider.py | 16 +++++---- .../features/end_iteration/decider.py | 21 +++++++---- .../features/truncate_procedure/decider.py | 16 +++++---- .../operation/test_abort_procedure_decider.py | 15 ++++++++ ...test_abort_procedure_decider_properties.py | 10 +++--- .../operation/test_end_iteration_decider.py | 17 +++++++++ .../test_end_iteration_decider_properties.py | 19 ++++++---- .../test_truncate_procedure_decider.py | 19 ++++++++++ ...t_truncate_procedure_decider_properties.py | 8 ++--- 11 files changed, 134 insertions(+), 57 deletions(-) diff --git a/apps/api/src/cora/operation/aggregates/procedure/events.py b/apps/api/src/cora/operation/aggregates/procedure/events.py index 958d5f4ed5..b643f03e11 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/events.py +++ b/apps/api/src/cora/operation/aggregates/procedure/events.py @@ -179,7 +179,7 @@ class RecipeExpansionRecorded: @dataclass(frozen=True) class ProcedureStarted: - """A Procedure transitioned out of Defined into Running (10c-b). + """A Procedure transitioned out of Defined into Running. Slim payload by design: the start fact is what the event encodes. Status is implicit (`Running`); the evolver sets it. 
No reason @@ -266,7 +266,7 @@ class ProcedureActivitiesLogbookOpened: @dataclass(frozen=True) class ProcedureTruncated: - """A Procedure reached its partial-data terminal (Running -> Truncated, 10c-c). + """A Procedure reached its partial-data terminal (Running | Held -> Truncated). Cleanup terminal for a Procedure that became de-facto dead through interruption (power loss, process crash, hardware fault, weekend @@ -289,7 +289,7 @@ class ProcedureTruncated: emergency exit while the system is still responsive; Truncated is a cleanup mechanism for known-dead Procedures. The system itself does not detect de-facto-dead Procedures (separate liveness - concern, out of scope for 10c-c); operators must invoke truncate + concern, out of scope here); operators must invoke truncate explicitly. Mirrors `RunTruncated` from Run BC's 6f-4. """ @@ -301,7 +301,7 @@ class ProcedureTruncated: @dataclass(frozen=True) class ProcedureAborted: - """A Procedure reached its emergency-exit terminal (Running -> Aborted). + """A Procedure reached its emergency-exit terminal (Running | Held -> Aborted). `reason` is a free-form string (1-500 chars after trimming), captured verbatim from the operator. Mirror of RunAborted.reason @@ -314,9 +314,9 @@ class ProcedureAborted: fold via `payload.get("actuation_kind")` -> None. Carries honest provenance for a Dataset produced by an aborted conduct. - Single-source guard at the decider (Running only). Held/Resumed - deferred to 10c-c per pilot need; if Held lands, the abort source - set widens to `Running | Held` to match Run BC's precedent. + Multi-source guard at the decider: `Running | Held` (a paused + conduct stays abortable; resumable conduct widened the source set, + matching Run BC's `abort_run`). 
""" procedure_id: UUID diff --git a/apps/api/src/cora/operation/aggregates/procedure/state.py b/apps/api/src/cora/operation/aggregates/procedure/state.py index dfb4837205..df62e693e4 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/state.py +++ b/apps/api/src/cora/operation/aggregates/procedure/state.py @@ -761,33 +761,32 @@ def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> Non class ProcedureCannotAbortError(Exception): - """Attempted to abort a Procedure not in `Running`. - - Single-source guard: `abort_procedure` accepts only `Running` today. - The `Held` pause-state now exists (resumable conduct); widening the - abort source set to `Running | Held` so a paused Procedure stays - abortable is a follow-up slice (the abort PBT auto-includes `Held` as - a disallowed source until then). Aborting a `Defined` Procedure raises - (use a different workflow, for example: never start it, then leave it - Defined or extend the FSM with a cancel-defined slice if real); - aborting any terminal raises (strict-not-idempotent). Mapped to HTTP 409. + """Attempted to abort a Procedure not in `Running` or `Held`. + + Source guard: `abort_procedure` accepts `Running | Held` (a paused + conduct stays abortable; resumable conduct widened the set, mirroring + Run BC's `abort_run`). Aborting a `Defined` Procedure raises (use a + different workflow, for example: never start it, then leave it Defined + or extend the FSM with a cancel-defined slice if real); aborting any + terminal raises (strict-not-idempotent). Mapped to HTTP 409. 
""" def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> None: super().__init__( f"Procedure {procedure_id} cannot be aborted: currently in status " - f"{current_status.value}, abort requires {ProcedureStatus.RUNNING.value}" + f"{current_status.value}, abort requires " + f"{ProcedureStatus.RUNNING.value} or {ProcedureStatus.HELD.value}" ) self.procedure_id = procedure_id self.current_status = current_status class ProcedureCannotTruncateError(Exception): - """Attempted to truncate a Procedure not in `Running`. + """Attempted to truncate a Procedure not in `Running` or `Held`. - Single-source guard: `truncate_procedure` accepts only `Running` - today; widening to `Running | Held` (so a paused Procedure can be - closed retroactively) is a follow-up slice alongside abort. Mirrors + Source guard: `truncate_procedure` accepts `Running | Held` (a paused + Procedure that became de-facto dead can be closed retroactively; + resumable conduct widened the set alongside abort). Mirrors `ProcedureCannotAbortError`'s source set: a Defined Procedure hasn't started so there's no execution to truncate; terminal Procedures are already closed (re-truncating a `Truncated` @@ -803,7 +802,8 @@ class ProcedureCannotTruncateError(Exception): def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> None: super().__init__( f"Procedure {procedure_id} cannot be truncated: currently in status " - f"{current_status.value}, truncate requires {ProcedureStatus.RUNNING.value}" + f"{current_status.value}, truncate requires " + f"{ProcedureStatus.RUNNING.value} or {ProcedureStatus.HELD.value}" ) self.procedure_id = procedure_id self.current_status = current_status @@ -902,7 +902,9 @@ class ProcedureCannotEndIterationError(Exception): """Attempted to end an iteration that fails an end-gate. Raised by the `end_iteration` decider when: - - The Procedure is not in `Running`. 
+ - The Procedure is not in `Running` or `Held` (an open iteration can + be closed even while the conduct is paused; resumable conduct + widened the source set, but `start_iteration` stays Running-only). - No iteration is currently open (`current_iteration_index` is None); there is nothing to end. - The supplied `iteration_index` does not match the open diff --git a/apps/api/src/cora/operation/features/abort_procedure/decider.py b/apps/api/src/cora/operation/features/abort_procedure/decider.py index 88bca4d23e..888aa528a7 100644 --- a/apps/api/src/cora/operation/features/abort_procedure/decider.py +++ b/apps/api/src/cora/operation/features/abort_procedure/decider.py @@ -1,9 +1,10 @@ """Pure decider for the `AbortProcedure` command. -Single-source emergency-exit terminal: `Running -> Aborted`. Source -set is just `Running` today (Held / Resumed deferred to 10c-c per -pilot need; if Held lands, this source set widens to `Running | Held` -to mirror Run BC's `abort_run` precedent). +Multi-source emergency-exit terminal: `Running | Held -> Aborted`. +`Held` was added when resumable conduct landed +([[project_resumable_conduct_design]] Tier 1); abort widened to accept +it so a paused Procedure stays abortable rather than stranded. Mirrors +Run BC's `abort_run` (`Running | Held` source set). `reason` validation goes through the `ProcedureAbortReason` VO (which calls the shared `validate_bounded_text` helper). The on-the-wire @@ -13,7 +14,7 @@ - State must not be None -> ProcedureNotFoundError - command.reason must be 1-500 chars after trimming -> InvalidProcedureAbortReasonError - - State.status must be in {Running} + - State.status must be in {Running, Held} -> ProcedureCannotAbortError(current_status=...) """ @@ -29,7 +30,10 @@ ) from cora.operation.features.abort_procedure.command import AbortProcedure -_ABORTABLE_STATUSES: tuple[ProcedureStatus, ...] = (ProcedureStatus.RUNNING,) +_ABORTABLE_STATUSES: tuple[ProcedureStatus, ...] 
= ( + ProcedureStatus.RUNNING, + ProcedureStatus.HELD, +) def decide( diff --git a/apps/api/src/cora/operation/features/end_iteration/decider.py b/apps/api/src/cora/operation/features/end_iteration/decider.py index e025ec01c7..14f280111e 100644 --- a/apps/api/src/cora/operation/features/end_iteration/decider.py +++ b/apps/api/src/cora/operation/features/end_iteration/decider.py @@ -1,9 +1,13 @@ """Pure decider for the `EndProcedureIteration` command. -Closes the currently-open convergence-loop iteration on a Running -Procedure. Iteration is orthogonal to the lifecycle FSM (the Procedure -stays Running); this folds onto the iteration denorm by clearing the -open-index marker. +Closes the currently-open convergence-loop iteration on a Running or +Held Procedure. Iteration is orthogonal to the lifecycle FSM; this +folds onto the iteration denorm by clearing the open-index marker. + +`Held` is accepted (alongside `Running`) so an iteration left open when +an operator paused the conduct can still be closed while paused +([[project_resumable_conduct_design]] Tier 1). `start_iteration` is NOT +widened: a new iteration cannot be opened while paused (resume first). `reason` is optional; when present it is trimmed and bounded 1-500 chars via the shared `validate_bounded_text` helper (matching the @@ -15,7 +19,7 @@ - state is None -> ProcedureNotFoundError - command.reason, when present, must be 1-500 chars after trimming -> InvalidProcedureIterationEndReasonError - - status is not Running, OR no iteration is open + - status is not in {Running, Held}, OR no iteration is open (current_iteration_index is None), OR iteration_index does not equal the open current_iteration_index -> ProcedureCannotEndIterationError """ @@ -34,6 +38,11 @@ from cora.shared.bounded_text import validate_bounded_text from cora.shared.text_bounds import REASON_MAX_LENGTH +_ITERATION_ENDABLE_STATUSES: tuple[ProcedureStatus, ...] 
= ( + ProcedureStatus.RUNNING, + ProcedureStatus.HELD, +) + def decide( state: Procedure | None, @@ -54,7 +63,7 @@ def decide( else None ) if ( - state.status is not ProcedureStatus.RUNNING + state.status not in _ITERATION_ENDABLE_STATUSES or state.current_iteration_index is None or command.iteration_index != state.current_iteration_index ): diff --git a/apps/api/src/cora/operation/features/truncate_procedure/decider.py b/apps/api/src/cora/operation/features/truncate_procedure/decider.py index b446ab9936..93d756eff4 100644 --- a/apps/api/src/cora/operation/features/truncate_procedure/decider.py +++ b/apps/api/src/cora/operation/features/truncate_procedure/decider.py @@ -1,9 +1,10 @@ """Pure decider for the `TruncateProcedure` command. -Single-source partial-data terminal: `Running -> Truncated`. Source -set is just `Running` today (Held / Resumed deferred per pilot need; -if Held lands, this source set widens to `Running | Held` to mirror -Run BC's `truncate_run` precedent). +Multi-source partial-data terminal: `Running | Held -> Truncated`. +`Held` was added when resumable conduct landed +([[project_resumable_conduct_design]] Tier 1); truncate widened to +accept it so a paused-then-de-facto-dead Procedure can be closed +retroactively. Mirrors Run BC's `truncate_run` (`Running | Held`). Truncating any terminal (Completed | Aborted | Truncated) raises; re-truncating a `Truncated` Procedure raises (strict-not-idempotent). @@ -29,7 +30,7 @@ -> InvalidProcedureTruncateReasonError - command.interrupted_at, when set, must not be in the future -> InvalidProcedureInterruptedAtError - - State.status must be in {Running} + - State.status must be in {Running, Held} -> ProcedureCannotTruncateError(current_status=...) """ @@ -46,7 +47,10 @@ ) from cora.operation.features.truncate_procedure.command import TruncateProcedure -_TRUNCATABLE_STATUSES: tuple[ProcedureStatus, ...] = (ProcedureStatus.RUNNING,) +_TRUNCATABLE_STATUSES: tuple[ProcedureStatus, ...] 
= ( + ProcedureStatus.RUNNING, + ProcedureStatus.HELD, +) def decide( diff --git a/apps/api/tests/unit/operation/test_abort_procedure_decider.py b/apps/api/tests/unit/operation/test_abort_procedure_decider.py index 3654ae0a40..2b5111dd3c 100644 --- a/apps/api/tests/unit/operation/test_abort_procedure_decider.py +++ b/apps/api/tests/unit/operation/test_abort_procedure_decider.py @@ -57,6 +57,21 @@ def test_decide_emits_procedure_aborted_when_running() -> None: assert events[0].actuation_kind is None +@pytest.mark.unit +def test_decide_emits_procedure_aborted_when_held() -> None: + """Resumable conduct: a paused (Held) Procedure stays abortable.""" + proc = _procedure(status=ProcedureStatus.HELD) + events = abort_procedure.decide( + state=proc, + command=AbortProcedure(procedure_id=proc.id, reason="paused then abandoned"), + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureAborted) + assert events[0].procedure_id == proc.id + assert events[0].reason == "paused then abandoned" + + @pytest.mark.unit @pytest.mark.parametrize("kind", ["Physical", "Simulated", "Hybrid"]) def test_decide_snapshots_actuation_kind_onto_aborted_event(kind: str) -> None: diff --git a/apps/api/tests/unit/operation/test_abort_procedure_decider_properties.py b/apps/api/tests/unit/operation/test_abort_procedure_decider_properties.py index 2cdc86eab5..7dc8559514 100644 --- a/apps/api/tests/unit/operation/test_abort_procedure_decider_properties.py +++ b/apps/api/tests/unit/operation/test_abort_procedure_decider_properties.py @@ -10,8 +10,8 @@ - state=None always raises `ProcedureNotFoundError` carrying command.procedure_id. 
- - The source-state partition is total over `ProcedureStatus`: the - sole source `{Running}` emits exactly one `ProcedureAborted` + - The source-state partition is total over `ProcedureStatus`: each + source in `{Running, Held}` emits exactly one `ProcedureAborted` (procedure_id=state.id, reason threaded, occurred_at=now); every other status raises `ProcedureCannotAbortError` carrying the current status. @@ -46,7 +46,7 @@ _REASON = printable_ascii_text(min_size=1, max_size=500) -_ABORTABLE_SOURCES = (ProcedureStatus.RUNNING,) +_ABORTABLE_SOURCES = (ProcedureStatus.RUNNING, ProcedureStatus.HELD) _DISALLOWED_SOURCES = tuple(s for s in ProcedureStatus if s not in frozenset(_ABORTABLE_SOURCES)) @@ -91,7 +91,7 @@ def test_abort_from_permitted_source_emits_single_event( reason: str, now: datetime, ) -> None: - """Running emits one ProcedureAborted with the threaded reason.""" + """Running or Held emits one ProcedureAborted with the threaded reason.""" events = abort_procedure.decide( state=_procedure(procedure_id=procedure_id, status=source), command=AbortProcedure(procedure_id=procedure_id, reason=reason), @@ -113,7 +113,7 @@ def test_abort_from_disallowed_source_always_raises_cannot_abort( reason: str, now: datetime, ) -> None: - """Any non-Running source raises ProcedureCannotAbortError carrying the status. + """Any source outside {Running, Held} raises ProcedureCannotAbortError. A valid reason is supplied so the source-state guard is what fires (reason validation runs first in the decider). 
diff --git a/apps/api/tests/unit/operation/test_end_iteration_decider.py b/apps/api/tests/unit/operation/test_end_iteration_decider.py index 1928b6c4c0..6c141be807 100644 --- a/apps/api/tests/unit/operation/test_end_iteration_decider.py +++ b/apps/api/tests/unit/operation/test_end_iteration_decider.py @@ -65,6 +65,23 @@ def test_decide_emits_iteration_ended_with_verdict() -> None: assert event.occurred_at == _NOW +@pytest.mark.unit +def test_decide_emits_iteration_ended_when_held() -> None: + """Resumable conduct: an iteration left open when the conduct was paused + can still be closed while Held (start_iteration stays Running-only).""" + proc = _procedure(status=ProcedureStatus.HELD) # iteration 1 open, paused + events = end_iteration.decide( + state=proc, + command=EndProcedureIteration( + procedure_id=proc.id, iteration_index=1, converged=False, reason=None + ), + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureIterationEnded) + assert events[0].iteration_index == 1 + + @pytest.mark.unit def test_decide_passes_none_verdict_and_none_reason() -> None: proc = _procedure() diff --git a/apps/api/tests/unit/operation/test_end_iteration_decider_properties.py b/apps/api/tests/unit/operation/test_end_iteration_decider_properties.py index 5bfd1dbe99..77217ede46 100644 --- a/apps/api/tests/unit/operation/test_end_iteration_decider_properties.py +++ b/apps/api/tests/unit/operation/test_end_iteration_decider_properties.py @@ -2,10 +2,10 @@ Universal claims across generated inputs: - - Running + open iteration + matching index emits exactly one + - Running or Held + open iteration + matching index emits exactly one ProcedureIterationEnded carrying the verdict/reason verbatim and now. - state=None always raises ProcedureNotFoundError. - - A non-Running status always raises ProcedureCannotEndIterationError. + - A disallowed status (not Running/Held) always raises ProcedureCannotEndIterationError. 
- No open iteration always raises ProcedureCannotEndIterationError. - A mismatched index always raises ProcedureCannotEndIterationError. - Pure: same (state, command, now) returns the same events. @@ -36,7 +36,7 @@ from datetime import datetime from uuid import UUID -_NON_RUNNING = st.sampled_from( +_DISALLOWED_STATUSES = st.sampled_from( [ ProcedureStatus.DEFINED, ProcedureStatus.COMPLETED, @@ -74,6 +74,7 @@ def _procedure( @pytest.mark.unit @given( procedure_id=st.uuids(), + status=st.sampled_from([ProcedureStatus.RUNNING, ProcedureStatus.HELD]), open_index=st.integers(min_value=1, max_value=500), converged=_CONVERGED, reason=_REASON, @@ -81,12 +82,18 @@ def _procedure( ) def test_end_iteration_emits_single_event_carrying_verdict( procedure_id: UUID, + status: ProcedureStatus, open_index: int, converged: bool | None, reason: str | None, now: datetime, ) -> None: - state = _procedure(procedure_id, iteration_count=open_index, current_iteration_index=open_index) + state = _procedure( + procedure_id, + status=status, + iteration_count=open_index, + current_iteration_index=open_index, + ) events = end_iteration.decide( state=state, command=EndProcedureIteration( @@ -127,11 +134,11 @@ def test_end_iteration_on_none_state_always_raises_not_found( @pytest.mark.unit @given( procedure_id=st.uuids(), - status=_NON_RUNNING, + status=_DISALLOWED_STATUSES, open_index=st.integers(min_value=1, max_value=100), now=aware_datetimes(), ) -def test_end_iteration_on_non_running_always_raises( +def test_end_iteration_on_disallowed_status_always_raises( procedure_id: UUID, status: ProcedureStatus, open_index: int, now: datetime ) -> None: state = _procedure( diff --git a/apps/api/tests/unit/operation/test_truncate_procedure_decider.py b/apps/api/tests/unit/operation/test_truncate_procedure_decider.py index 446e55412d..cd3cab4cbe 100644 --- a/apps/api/tests/unit/operation/test_truncate_procedure_decider.py +++ b/apps/api/tests/unit/operation/test_truncate_procedure_decider.py @@ -63,6 
+63,25 @@ def test_decide_emits_procedure_truncated_when_running() -> None: assert events[0].occurred_at == _NOW +@pytest.mark.unit +def test_decide_emits_procedure_truncated_when_held() -> None: + """Resumable conduct: a paused (Held) Procedure that became de-facto + dead can be truncated retroactively.""" + proc = _procedure(status=ProcedureStatus.HELD) + events = truncate_procedure.decide( + state=proc, + command=TruncateProcedure( + procedure_id=proc.id, + reason="paused over the weekend, hardware died", + interrupted_at=None, + ), + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureTruncated) + assert events[0].procedure_id == proc.id + + @pytest.mark.unit def test_decide_accepts_none_interrupted_at() -> None: """interrupted_at is optional; None is valid (operator doesn't know when).""" diff --git a/apps/api/tests/unit/operation/test_truncate_procedure_decider_properties.py b/apps/api/tests/unit/operation/test_truncate_procedure_decider_properties.py index d8cb3b749e..ffa261011c 100644 --- a/apps/api/tests/unit/operation/test_truncate_procedure_decider_properties.py +++ b/apps/api/tests/unit/operation/test_truncate_procedure_decider_properties.py @@ -10,8 +10,8 @@ - state=None always raises `ProcedureNotFoundError` carrying command.procedure_id. - - The source-state partition is total over `ProcedureStatus`: the - sole source `{Running}` emits exactly one `ProcedureTruncated` + - The source-state partition is total over `ProcedureStatus`: each + source in `{Running, Held}` emits exactly one `ProcedureTruncated` (procedure_id=state.id, reason threaded, occurred_at=now); every other status raises `ProcedureCannotTruncateError` carrying the current status. 
@@ -50,7 +50,7 @@ _REASON = printable_ascii_text(min_size=1, max_size=500) -_TRUNCATABLE_SOURCES = (ProcedureStatus.RUNNING,) +_TRUNCATABLE_SOURCES = (ProcedureStatus.RUNNING, ProcedureStatus.HELD) _DISALLOWED_SOURCES = tuple(s for s in ProcedureStatus if s not in frozenset(_TRUNCATABLE_SOURCES)) @@ -97,7 +97,7 @@ def test_truncate_from_permitted_source_emits_single_event( reason: str, now: _datetime, ) -> None: - """Running emits one ProcedureTruncated with the threaded reason.""" + """Running or Held emits one ProcedureTruncated with the threaded reason.""" events = truncate_procedure.decide( state=_procedure(procedure_id=procedure_id, status=source), command=TruncateProcedure(procedure_id=procedure_id, reason=reason, interrupted_at=None), From 0c70f28dcd940eec03c0805b270ed595f33ee13c Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 08:32:40 +0300 Subject: [PATCH 04/12] feat(operation): refuse Procedure resume while the parent Run is Held The off-diagonal guard for resumable conduct: a Phase-of-Run Procedure cannot resume to Running and walk real setpoints while its parent Run is itself Held. resume_procedure's handler is now a custom cross-aggregate handler (like start_procedure) that loads the parent Run via a one-directional Operation -> Run read and passes parent_run_held into the pure decider, which raises ProcedureCannotResumeError( parent_run_held=True). There is NO cascade from Run-resume into Procedure-resume (that is a Layer-3 saga, deferred). Standalone Procedures (no parent_run_id) skip the Run load and pass parent_run_held=False. A Phase-of-Run Procedure whose parent_run_id points at an empty Run stream raises RunNotFoundError (corruption, not a happy path) -- same posture as start_procedure. ProcedureCannotResumeError gains a parent_run_held discriminator for operator-facing messaging ("its parent Run is Held; resume the Run first") distinct from the status-guard message. 
Covered by decider example + property tests (flag set, status-guard precedence) and handler tests seeding a real parent Run in Held / Running / missing. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../operation/aggregates/procedure/state.py | 30 +++- .../features/resume_procedure/decider.py | 26 ++- .../features/resume_procedure/handler.py | 142 +++++++++++++-- .../test_resume_procedure_decider.py | 52 +++++- ...est_resume_procedure_decider_properties.py | 23 +++ .../test_resume_procedure_handler.py | 165 ++++++++++++++++-- 6 files changed, 396 insertions(+), 42 deletions(-) diff --git a/apps/api/src/cora/operation/aggregates/procedure/state.py b/apps/api/src/cora/operation/aggregates/procedure/state.py index df62e693e4..3efeeb2180 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/state.py +++ b/apps/api/src/cora/operation/aggregates/procedure/state.py @@ -839,21 +839,39 @@ class ProcedureCannotResumeError(Exception): already-`Running` Procedure raises (strict-not-idempotent); resuming a `Defined` or terminal Procedure raises. Mirrors `RunCannotResumeError`. - - off-diagonal guard (added in a later slice): a Held Procedure + - off-diagonal guard (`parent_run_held=True`): a Held Procedure whose parent Run is itself `Held` cannot resume to `Running` and walk real setpoints while the Run is paused. The one-directional Operation -> Run read enforces this; there is NO cascade from Run-resume into Procedure-resume (that is a Layer-3 saga, deferred). See [[project_resumable_conduct_design]]. + + `parent_run_held` distinguishes the two for operator-facing + messaging; `current_status` is carried in both cases. 
""" - def __init__(self, procedure_id: UUID, current_status: "ProcedureStatus") -> None: - super().__init__( - f"Procedure {procedure_id} cannot be resumed: currently in status " - f"{current_status.value}, resume requires {ProcedureStatus.HELD.value}" - ) + def __init__( + self, + procedure_id: UUID, + current_status: "ProcedureStatus", + *, + parent_run_held: bool = False, + ) -> None: + if parent_run_held: + message = ( + f"Procedure {procedure_id} cannot be resumed: its parent Run is " + f"{ProcedureStatus.HELD.value}. Resume the Run first; CORA does not " + f"cascade a Run resume into its Procedures." + ) + else: + message = ( + f"Procedure {procedure_id} cannot be resumed: currently in status " + f"{current_status.value}, resume requires {ProcedureStatus.HELD.value}" + ) + super().__init__(message) self.procedure_id = procedure_id self.current_status = current_status + self.parent_run_held = parent_run_held class ProcedureCannotStartIterationError(Exception): diff --git a/apps/api/src/cora/operation/features/resume_procedure/decider.py b/apps/api/src/cora/operation/features/resume_procedure/decider.py index b74b28e148..f958845b6d 100644 --- a/apps/api/src/cora/operation/features/resume_procedure/decider.py +++ b/apps/api/src/cora/operation/features/resume_procedure/decider.py @@ -5,10 +5,14 @@ raises (strict-not-idempotent); resuming a `Defined` or terminal Procedure raises. Mirrors `resume_run`. -The off-diagonal guard (refuse while the parent Run is `Held`) is NOT -in this pure decider: it needs a cross-aggregate Run read and lands in -the handler in a follow-up slice (it raises the same -`ProcedureCannotResumeError`). See [[project_resumable_conduct_design]]. +Off-diagonal guard: a Held Procedure whose parent Run is itself `Held` +cannot resume to `Running` and walk real setpoints while the Run is +paused. 
The decider takes a `parent_run_held` fact the handler derives +from a one-directional Operation -> Run read (tach-legal); there is NO +cascade from Run-resume into Procedure-resume (that is a Layer-3 saga, +deferred). `parent_run_held` defaults False, which is correct for a +standalone Procedure (no parent Run). See +[[project_resumable_conduct_design]]. Invariants: - State must not be None -> ProcedureNotFoundError @@ -16,6 +20,8 @@ -> InvalidProcedureReEstablishmentBoundaryError - State.status must be in {Held} -> ProcedureCannotResumeError(current_status=...) + - parent_run_held must be False + -> ProcedureCannotResumeError(parent_run_held=True) """ from datetime import datetime @@ -37,15 +43,25 @@ def decide( state: Procedure | None, command: ResumeProcedure, *, + parent_run_held: bool = False, now: datetime, ) -> list[ProcedureResumed]: - """Decide the events produced by resuming a held Procedure.""" + """Decide the events produced by resuming a held Procedure. + + `parent_run_held` is the handler-derived fact that this Procedure's + parent Run is currently `Held`; standalone Procedures (no parent Run) + pass the default False. 
+ """ if state is None: raise ProcedureNotFoundError(command.procedure_id) if command.re_establishment_boundary < 0: raise InvalidProcedureReEstablishmentBoundaryError(command.re_establishment_boundary) if state.status not in _RESUMABLE_STATUSES: raise ProcedureCannotResumeError(state.id, current_status=state.status) + if parent_run_held: + raise ProcedureCannotResumeError( + state.id, current_status=state.status, parent_run_held=True + ) return [ ProcedureResumed( procedure_id=state.id, diff --git a/apps/api/src/cora/operation/features/resume_procedure/handler.py b/apps/api/src/cora/operation/features/resume_procedure/handler.py index f079f4d9cd..725eaea411 100644 --- a/apps/api/src/cora/operation/features/resume_procedure/handler.py +++ b/apps/api/src/cora/operation/features/resume_procedure/handler.py @@ -1,24 +1,48 @@ """Application handler for the `resume_procedure` slice. -Update-style handler. Canonical body lives in -`cora.operation._procedure_update_handler.make_procedure_update_handler`; -this module is a thin slice-specific bind, mirroring resume_run. - -The off-diagonal guard (refuse to resume while the parent Run is `Held`) -is a cross-aggregate Run read added in a follow-up slice; it will replace -this factory bind with a custom handler (the factory loads exactly one -event-store stream). Until then the decider's status guard -(`Held -> Running`) is the only gate. +Update-style handler with a custom body (NOT the update-handler +factory): resume reads the parent Run to enforce the off-diagonal guard +(a Held Procedure cannot resume while its parent Run is itself Held). +The factory at `cora.infrastructure.update_handler` loads exactly one +event-store stream; this slice loads a second (the parent Run), so it +stays longhand -- same reason `start_procedure`'s handler is custom. 
+ +## Off-diagonal guard + +For a Phase-of-Run Procedure (`parent_run_id` set), the handler loads +the parent Run and passes `parent_run_held = (Run.status == Held)` into +the pure decider, which refuses with `ProcedureCannotResumeError` so a +Procedure cannot resume to Running and walk real setpoints while the Run +is paused. This is a one-directional Operation -> Run read +(tach-legal); there is NO cascade from Run-resume into Procedure-resume +(a Layer-3 saga, deferred). Standalone Procedures (no parent_run_id) +skip the load and pass `parent_run_held=False`. """ from typing import Protocol from uuid import UUID +from cora.infrastructure.event_envelope import to_new_event from cora.infrastructure.kernel import Kernel +from cora.infrastructure.logging import get_logger +from cora.infrastructure.ports import Deny from cora.infrastructure.routing import NIL_SENTINEL_ID -from cora.operation._procedure_update_handler import make_procedure_update_handler +from cora.operation.aggregates.procedure import ( + ProcedureNotFoundError, + event_type_name, + fold, + from_stored, + to_payload, +) +from cora.operation.errors import UnauthorizedError from cora.operation.features.resume_procedure.command import ResumeProcedure from cora.operation.features.resume_procedure.decider import decide +from cora.run.aggregates.run import RunNotFoundError, RunStatus, load_run + +_STREAM_TYPE = "Procedure" +_COMMAND_NAME = "ResumeProcedure" + +_log = get_logger(__name__) class Handler(Protocol): @@ -37,9 +61,95 @@ async def __call__( def bind(deps: Kernel) -> Handler: """Build a resume_procedure handler closed over the shared deps.""" - return make_procedure_update_handler( - deps, - command_name="ResumeProcedure", - log_prefix="resume_procedure", - decide_fn=decide, - ) + + async def handler( + command: ResumeProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> None: + _log.info( + 
"resume_procedure.start", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + principal_id=str(principal_id), + correlation_id=str(correlation_id), + causation_id=str(causation_id) if causation_id is not None else None, + ) + + decision = await deps.authz.authorize( + principal_id=principal_id, + command_name=_COMMAND_NAME, + conduit_id=NIL_SENTINEL_ID, + surface_id=surface_id, + ) + if isinstance(decision, Deny): + _log.info( + "resume_procedure.denied", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + principal_id=str(principal_id), + correlation_id=str(correlation_id), + reason=decision.reason, + ) + raise UnauthorizedError(decision.reason) + + stored, version = await deps.event_store.load(_STREAM_TYPE, command.procedure_id) + state = fold([from_stored(s) for s in stored]) + if state is None: + raise ProcedureNotFoundError(command.procedure_id) + + # Off-diagonal guard: a Phase-of-Run Procedure cannot resume while + # its parent Run is Held. One-directional Operation -> Run read; a + # missing parent Run in the chain is corruption, so raise rather + # than silently skip the guard (mirrors start_procedure). Standalone + # Procedures (no parent_run_id) pass parent_run_held=False. 
+ parent_run_held = False + if state.parent_run_id is not None: + parent_run = await load_run(deps.event_store, state.parent_run_id) + if parent_run is None: + raise RunNotFoundError(state.parent_run_id) + parent_run_held = parent_run.status == RunStatus.HELD + + now = deps.clock.now() + domain_events = decide( + state=state, + command=command, + parent_run_held=parent_run_held, + now=now, + ) + + new_events = [ + to_new_event( + event_type=event_type_name(event), + payload=to_payload(event), + occurred_at=event.occurred_at, + event_id=deps.id_generator.new_id(), + command_name=_COMMAND_NAME, + correlation_id=correlation_id, + causation_id=causation_id, + principal_id=principal_id, + ) + for event in domain_events + ] + await deps.event_store.append( + stream_type=_STREAM_TYPE, + stream_id=command.procedure_id, + expected_version=version, + events=new_events, + ) + + _log.info( + "resume_procedure.success", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + parent_run_held=parent_run_held, + principal_id=str(principal_id), + correlation_id=str(correlation_id), + causation_id=str(causation_id) if causation_id is not None else None, + event_count=len(new_events), + ) + + return handler diff --git a/apps/api/tests/unit/operation/test_resume_procedure_decider.py b/apps/api/tests/unit/operation/test_resume_procedure_decider.py index 4bb094d1b3..5ac93884bc 100644 --- a/apps/api/tests/unit/operation/test_resume_procedure_decider.py +++ b/apps/api/tests/unit/operation/test_resume_procedure_decider.py @@ -2,8 +2,10 @@ Single-source resume transition: `Held -> Running`. Carries `re_establishment_boundary` (>= 0). Mirrors `resume_run`. The -off-diagonal guard (parent Run Held) is NOT in this pure decider; it -lands in the handler in a follow-up slice. 
+off-diagonal guard (refuse while the parent Run is Held) lives in the +decider via the `parent_run_held` fact the handler derives from a +one-directional Operation -> Run read; these tests exercise it with +the flag directly. """ from datetime import UTC, datetime @@ -73,6 +75,52 @@ def test_decide_threads_decided_by_decision_id() -> None: assert events[0].decided_by_decision_id == decision_id +@pytest.mark.unit +def test_decide_rejects_when_parent_run_held() -> None: + """Off-diagonal guard: a Held Procedure whose parent Run is Held cannot + resume (it would walk real setpoints while the Run is paused).""" + proc = _procedure() # status Held + with pytest.raises(ProcedureCannotResumeError) as exc: + resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=0), + parent_run_held=True, + now=_NOW, + ) + assert exc.value.parent_run_held is True + assert "parent Run is Held" in str(exc.value) + + +@pytest.mark.unit +def test_decide_allows_when_parent_run_not_held() -> None: + """A Held Procedure whose parent Run is NOT Held resumes normally.""" + proc = _procedure() + events = resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=0), + parent_run_held=False, + now=_NOW, + ) + assert len(events) == 1 + assert isinstance(events[0], ProcedureResumed) + + +@pytest.mark.unit +def test_decide_status_guard_precedes_parent_run_guard() -> None: + """A non-Held Procedure raises the status-guard form even if the parent + Run is also Held (status checked first; parent_run_held flag not set).""" + proc = _procedure(status=ProcedureStatus.RUNNING) + with pytest.raises(ProcedureCannotResumeError) as exc: + resume_procedure.decide( + state=proc, + command=ResumeProcedure(procedure_id=proc.id, re_establishment_boundary=0), + parent_run_held=True, + now=_NOW, + ) + assert exc.value.parent_run_held is False + assert exc.value.current_status is ProcedureStatus.RUNNING + + 
@pytest.mark.unit def test_decide_accepts_zero_boundary() -> None: """Boundary 0 = re-establish from the first step (valid).""" diff --git a/apps/api/tests/unit/operation/test_resume_procedure_decider_properties.py b/apps/api/tests/unit/operation/test_resume_procedure_decider_properties.py index 441112edf8..33185d98e6 100644 --- a/apps/api/tests/unit/operation/test_resume_procedure_decider_properties.py +++ b/apps/api/tests/unit/operation/test_resume_procedure_decider_properties.py @@ -157,6 +157,29 @@ def test_resume_from_disallowed_source_always_raises_cannot_resume( assert exc.value.current_status is source +@pytest.mark.unit +@given( + procedure_id=st.uuids(), + boundary=_BOUNDARY, + now=aware_datetimes(), +) +def test_resume_with_parent_run_held_always_raises( + procedure_id: UUID, + boundary: int, + now: datetime, +) -> None: + """Off-diagonal guard: a Held Procedure whose parent Run is Held always + raises (the status guard passes, so the parent-Run guard is what fires).""" + with pytest.raises(ProcedureCannotResumeError) as exc: + resume_procedure.decide( + state=_procedure(procedure_id=procedure_id, status=ProcedureStatus.HELD), + command=ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + parent_run_held=True, + now=now, + ) + assert exc.value.parent_run_held is True + + @pytest.mark.unit @given( state_procedure_id=st.uuids(), diff --git a/apps/api/tests/unit/operation/test_resume_procedure_handler.py b/apps/api/tests/unit/operation/test_resume_procedure_handler.py index e9a5d1c497..626ff4135e 100644 --- a/apps/api/tests/unit/operation/test_resume_procedure_handler.py +++ b/apps/api/tests/unit/operation/test_resume_procedure_handler.py @@ -1,9 +1,9 @@ """Application-handler tests for `resume_procedure` slice. -Update-style handler via `make_procedure_update_handler`. Source state -is `Held`, reached here by seeding Running then holding. 
The -off-diagonal parent-Run-Held guard is a follow-up slice; this test -covers the status-guard handler only. +Custom cross-aggregate handler. Source state is `Held`, reached here by +seeding Running then holding. Covers the status-guard path AND the +off-diagonal guard (the handler loads the parent Run and refuses while +the Run is itself `Held`). """ from datetime import UTC, datetime @@ -12,14 +12,22 @@ import pytest from cora.infrastructure.adapters.in_memory_event_store import InMemoryEventStore +from cora.infrastructure.event_envelope import to_new_event from cora.operation.aggregates.procedure import ( ProcedureCannotResumeError, ProcedureNotFoundError, + ProcedureRegistered, + ProcedureStarted, + event_type_name, + to_payload, ) from cora.operation.errors import UnauthorizedError from cora.operation.features import hold_procedure, resume_procedure from cora.operation.features.hold_procedure import HoldProcedure from cora.operation.features.resume_procedure import ResumeProcedure +from cora.run.aggregates.run import RunHeld, RunNotFoundError, RunStarted +from cora.run.aggregates.run import event_type_name as run_event_type_name +from cora.run.aggregates.run import to_payload as run_to_payload from tests.unit._helpers import build_deps as _build_deps_shared from tests.unit.operation._helpers import seed_running_procedure @@ -31,21 +39,88 @@ _CORRELATION_ID = UUID("01900000-0000-7000-8000-0000000000aa") -async def _seed_held_procedure(store: InMemoryEventStore) -> None: - await seed_running_procedure( - store, - procedure_id=_PROCEDURE_ID, - when=_PRIOR, - correlation_id=_CORRELATION_ID, - principal_id=_PRINCIPAL_ID, - ) +async def _seed_held_procedure( + store: InMemoryEventStore, + *, + procedure_id: UUID = _PROCEDURE_ID, + parent_run_id: UUID | None = None, +) -> None: + """Land `procedure_id` in `Held`, optionally as a Phase-of-Run Procedure.""" + if parent_run_id is None: + await seed_running_procedure( + store, + procedure_id=procedure_id, + 
when=_PRIOR, + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + else: + # Phase-of-Run: ProcedureRegistered carries parent_run_id, then Started. + registered = ProcedureRegistered( + procedure_id=procedure_id, + name="mid-run alignment", + kind="alignment", + target_asset_ids=(), + parent_run_id=parent_run_id, + occurred_at=_PRIOR, + ) + started = ProcedureStarted(procedure_id=procedure_id, occurred_at=_PRIOR) + await store.append( + stream_type="Procedure", + stream_id=procedure_id, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(e), + payload=to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in (registered, started) + ], + ) await hold_procedure.bind(_build_deps_shared(ids=[uuid4()], now=_PRIOR, event_store=store))( - HoldProcedure(procedure_id=_PROCEDURE_ID, reason="beam dropped"), + HoldProcedure(procedure_id=procedure_id, reason="beam dropped"), principal_id=_PRINCIPAL_ID, correlation_id=_CORRELATION_ID, ) +async def _seed_run(store: InMemoryEventStore, *, run_id: UUID, held: bool) -> None: + """Land a parent Run in `Running` (held=False) or `Held` (held=True).""" + events: list[object] = [ + RunStarted( + run_id=run_id, + name="parent run", + plan_id=uuid4(), + subject_id=None, + occurred_at=_PRIOR, + ) + ] + if held: + events.append(RunHeld(run_id=run_id, occurred_at=_PRIOR)) + await store.append( + stream_type="Run", + stream_id=run_id, + expected_version=0, + events=[ + to_new_event( + event_type=run_event_type_name(e), # type: ignore[arg-type] + payload=run_to_payload(e), # type: ignore[arg-type] + occurred_at=e.occurred_at, # type: ignore[attr-defined] + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + + @pytest.mark.unit async def test_handler_appends_procedure_resumed_event() -> None: store = 
InMemoryEventStore() @@ -141,3 +216,67 @@ async def test_handler_raises_unauthorized_on_deny() -> None: principal_id=_PRINCIPAL_ID, correlation_id=_CORRELATION_ID, ) + + +# --- off-diagonal guard: parent Run Held --- + +_PARENT_RUN_ID = UUID("01900000-0000-7000-8000-0000000c0f0a") +_PHASE_PROCEDURE_ID = UUID("01900000-0000-7000-8000-0000000c0f0b") + + +@pytest.mark.unit +async def test_handler_refuses_resume_when_parent_run_held() -> None: + """A Phase-of-Run Procedure cannot resume while its parent Run is Held.""" + store = InMemoryEventStore() + await _seed_run(store, run_id=_PARENT_RUN_ID, held=True) + await _seed_held_procedure( + store, procedure_id=_PHASE_PROCEDURE_ID, parent_run_id=_PARENT_RUN_ID + ) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + with pytest.raises(ProcedureCannotResumeError) as exc: + await resume_procedure.bind(deps)( + ResumeProcedure(procedure_id=_PHASE_PROCEDURE_ID, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + assert exc.value.parent_run_held is True + # No ProcedureResumed appended (still Held: Registered, Started, Held). 
+ events, version = await store.load("Procedure", _PHASE_PROCEDURE_ID) + assert version == 3 + assert events[-1].event_type == "ProcedureHeld" + + +@pytest.mark.unit +async def test_handler_allows_resume_when_parent_run_running() -> None: + """A Phase-of-Run Procedure resumes when its parent Run is Running.""" + store = InMemoryEventStore() + await _seed_run(store, run_id=_PARENT_RUN_ID, held=False) + await _seed_held_procedure( + store, procedure_id=_PHASE_PROCEDURE_ID, parent_run_id=_PARENT_RUN_ID + ) + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + await resume_procedure.bind(deps)( + ResumeProcedure(procedure_id=_PHASE_PROCEDURE_ID, re_establishment_boundary=4), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + events, _ = await store.load("Procedure", _PHASE_PROCEDURE_ID) + assert events[-1].event_type == "ProcedureResumed" + assert events[-1].payload["re_establishment_boundary"] == 4 + + +@pytest.mark.unit +async def test_handler_raises_run_not_found_when_parent_run_missing() -> None: + """Phase-of-Run Procedure with a parent_run_id pointing at an empty Run + stream is corruption: the handler raises rather than skipping the guard.""" + store = InMemoryEventStore() + await _seed_held_procedure( + store, procedure_id=_PHASE_PROCEDURE_ID, parent_run_id=_PARENT_RUN_ID + ) # parent Run never seeded + deps = _build_deps_shared(ids=[_EVENT_ID], now=_NOW, event_store=store) + with pytest.raises(RunNotFoundError): + await resume_procedure.bind(deps)( + ResumeProcedure(procedure_id=_PHASE_PROCEDURE_ID, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) From 6302574fd42da763a2b61ea72451f785f3b0b95a Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 08:59:03 +0300 Subject: [PATCH 05/12] feat(operation): Conductor.execute_from replays a pinned manifest on resume The resumable-conduct Tier-1 replay primitive. 
execute_from walks a PINNED manifest tail (manifest[boundary:]) rather than re-deriving the step list: re-drive setpoints (idempotent absolute writes), re-run checks as fresh gates, and HALT-for-operator on an acquisition (ActionStep) -- an interrupted acquisition is non-idempotent, so resume hands the redo-fresh-vs-reseed decision back to the operator instead of auto-skipping or auto-rerunning. Recorded step_index is the ABSOLUTE manifest position, so the replayed journal lines up with the original conduct, and re-driven setpoints inherit the pre-effect in-flight marker. Like execute(), it drives no FSM transition. `steps_from_manifest` is the exact inverse of `step_to_payload`: it parses the pinned ResolvedStepsRecorded.resolved_steps wire dicts back into Steps (pure, no Pydantic -- that lives at the HTTP boundary in step_from_wire), so resume NEVER re-derives from live Plan.wires / partition rules. A single-member `ResumePolicy.RE_ESTABLISH` enum documents the locked default; COMPARE (read-and-compare) stays an anti-hook-until-lease. Acceptance test: replay walks the pinned tail byte-for-byte -- two setpoints pinned via step_to_payload, parsed via steps_from_manifest, re-drive to two identical writes on a recording in-memory port. Plus boundary-skips-prefix, halt-on-action, re-run-check (match + mismatch), empty-tail no-op, negative-boundary reject, and the step/criterion round-trip pairs. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/api/src/cora/operation/conductor.py | 194 +++++++++- .../operation/test_conductor_execute_from.py | 334 ++++++++++++++++++ 2 files changed, 525 insertions(+), 3 deletions(-) create mode 100644 apps/api/tests/unit/operation/test_conductor_execute_from.py diff --git a/apps/api/src/cora/operation/conductor.py b/apps/api/src/cora/operation/conductor.py index f4defc7153..1137e6b8ad 100644 --- a/apps/api/src/cora/operation/conductor.py +++ b/apps/api/src/cora/operation/conductor.py @@ -37,6 +37,16 @@ False, "reason": "out_of_range"}`); the Conductor treats any return from a body as success-shaped at this tier. +## Resume (execute_from) + +`execute_from` replays a PINNED conduct manifest from a re-establishment +boundary rather than re-deriving the step list: re-drive setpoints, re-run +checks as fresh gates, and halt-for-operator on an acquisition +(`ActionStep`). It is the Tier-1 resumable-conduct primitive +([[project_resumable_conduct_design]]); the manifest comes from +`ResolvedStepsRecorded` parsed via `steps_from_manifest`. Like `execute` +it drives no FSM transition. + ## Pre-effect in-flight marker (side-effecting steps) A setpoint and an action are side-effecting: each records a SEPARATE @@ -104,7 +114,8 @@ import contextlib from collections.abc import AsyncIterator, Awaitable, Callable, Mapping, Sequence from dataclasses import dataclass, field -from typing import Any, Protocol +from enum import StrEnum +from typing import Any, Protocol, cast from uuid import UUID from cora.infrastructure.ports.clock import Clock @@ -227,6 +238,33 @@ _QUALITY_GOOD = "Good" +_RESUME_HALT_ERROR_CLASS = "AcquisitionResumeRequiresOperator" +"""`error_class` on the `ConductorFailure` that `execute_from` returns when +a resume reaches an `ActionStep` (an acquisition). 
It is NOT an exception +and NOT a step failure: re-running an interrupted acquisition is +non-idempotent (fly-scan triggers are one-shot, a mid-arm collect reads +identically for finished / aborted / never-armed), so resume HALTS and +hands the decision (redo-fresh vs reseed) back to the operator rather than +auto-skipping or auto-rerunning. See [[project_resumable_conduct_design]].""" + + +class ResumePolicy(StrEnum): + """How `execute_from` re-establishes state while replaying a manifest tail. + + `RE_ESTABLISH` (the only member today): re-drive setpoints (absolute + writes are idempotent; CORA has no relative-setpoint type), re-run + checks as fresh gates, and HALT on an acquisition (`ActionStep`) for an + operator decision. This is the locked default per + [[project_resumable_conduct_design]]. + + A future `COMPARE` member (read-and-compare instead of re-drive) is an + Anti-hook-until-lease: its single-writer guarantee is unsatisfiable on a + multi-writer floor until a Conduit/Surface write-ownership lease exists, + so it is deliberately absent rather than stubbed. + """ + + RE_ESTABLISH = "re_establish" + @dataclass(frozen=True) class SetpointStep: @@ -601,6 +639,104 @@ async def execute( actuation_kind=observer.actuation_kind, ) + async def execute_from( + self, + *, + procedure_id: UUID, + principal_id: UUID, + correlation_id: UUID, + manifest: Sequence[Step], + boundary: int, + policy: ResumePolicy = ResumePolicy.RE_ESTABLISH, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> ConductorResult: + """Resume a halted conduct by REPLAYING a pinned manifest from `boundary`. + + `manifest` is the FINAL resolved step list pinned on + `ResolvedStepsRecorded` at first conduct (parse the event's + `resolved_steps` back via `steps_from_manifest`). Resume NEVER + re-derives the step list -- a re-derived list could silently skip or + mis-target a step (the end-of-run "home to 0" aliasing the + start-of-run "home to 0" after an index shift). 
It replays + `manifest[boundary:]` verbatim: + + - `SetpointStep` -> RE-DRIVE (idempotent absolute write). The + recorded `step_index` is the ABSOLUTE manifest position, so the + replayed journal lines up with the original conduct. + - `CheckStep` -> RE-RUN as a fresh gate (a passing check proves + "now", not "continuously", so it is re-evaluated). + - `ActionStep` -> HALT for an operator decision (an interrupted + acquisition is non-idempotent; see `_RESUME_HALT_ERROR_CLASS`). + The action is NOT executed and NOTHING is recorded for it; the + returned `ConductorResult.failure` carries the halt so the + caller (a resume orchestrator) routes the decision. + + `boundary` is the re-establishment boundary from `ProcedureResumed`: + the index from which re-drive + re-run resumes. `boundary >= + len(manifest)` replays an empty tail (a no-op resume). Like + `execute`, this drives no FSM transition; it walks + records. + + See [[project_resumable_conduct_design]] Tier 1. + """ + if boundary < 0: + msg = f"boundary must be >= 0 (got {boundary})" + raise ValueError(msg) + if policy is not ResumePolicy.RE_ESTABLISH: # pragma: no cover - only member today + msg = f"unsupported resume policy: {policy}" + raise ValueError(msg) + envelope = _Envelope( + procedure_id=procedure_id, + principal_id=principal_id, + correlation_id=correlation_id, + causation_id=causation_id, + surface_id=surface_id, + ) + observer = _ActuationObserver(self._control_port) + completed = 0 + for index in range(boundary, len(manifest)): + step = manifest[index] + if isinstance(step, ActionStep): + # Halt-for-operator: do not re-run an interrupted acquisition. 
+ return ConductorResult( + procedure_id=procedure_id, + completed_count=completed, + failure=ConductorFailure( + step_index=index, + source_kind=_STEP_KIND_ACTION, + target=step.name, + error_class=_RESUME_HALT_ERROR_CLASS, + message=( + f"resume halted at step {index} (action {step.name!r}): an " + "interrupted acquisition needs an operator decision " + "(redo-fresh vs reseed); not auto-rerun, not auto-skipped" + ), + ), + actuation_kind=observer.actuation_kind, + ) + with with_dispatch_correlation_id(correlation_id): + if isinstance(step, SetpointStep): + failure = await self._run_setpoint( + step, index=index, envelope=envelope, port=observer + ) + else: + failure = await self._run_check( + step, index=index, envelope=envelope, port=observer + ) + if failure is not None: + return ConductorResult( + procedure_id=procedure_id, + completed_count=completed, + failure=failure, + actuation_kind=observer.actuation_kind, + ) + completed += 1 + return ConductorResult( + procedure_id=procedure_id, + completed_count=completed, + actuation_kind=observer.actuation_kind, + ) + async def conduct( self, *, @@ -1070,11 +1206,12 @@ def _criterion_to_dict(criterion: CheckCriterion) -> dict[str, Any]: def step_to_payload(step: Step) -> dict[str, Any]: - """Serialize a `Step` to a JSON-clean dict (inverse of `step_from_wire`). + """Serialize a `Step` to a JSON-clean dict (inverse of `steps_from_manifest`). Mirrors the conduct route's wire shape (the `kind` discriminator + field names) so the resolved step list pinned on `ResolvedStepsRecorded` - round-trips back to `Step` objects via `step_from_wire` at resume. A + round-trips back to `Step` objects via `steps_from_manifest` at resume + (and via the route's Pydantic `step_from_wire` on the live HTTP path). A tuple `value` serializes as a list (JSON has no tuple); the criterion reuses `_criterion_to_dict` so the wire shape stays single-sourced. 
""" @@ -1095,6 +1232,55 @@ def step_to_payload(step: Step) -> dict[str, Any]: } +def _criterion_from_dict(criterion: Mapping[str, Any]) -> CheckCriterion: + """Rebuild a `CheckCriterion` from its `_criterion_to_dict` shape.""" + kind = criterion["kind"] + if kind == "equals": + expected: Any = criterion["expected"] + if isinstance(expected, list): + expected = cast("tuple[Any, ...]", tuple(expected)) # pyright: ignore[reportUnknownArgumentType] + return EqualsCriterion(expected=expected) + if kind == "within_tolerance": + return WithinToleranceCriterion( + expected=criterion["expected"], tolerance=criterion["tolerance"] + ) + msg = f"unknown criterion kind: {kind!r}" + raise ValueError(msg) + + +def _step_from_payload(payload: Mapping[str, Any]) -> Step: + """Rebuild one `Step` from its `step_to_payload` wire shape.""" + kind = payload["kind"] + if kind == "setpoint": + value: Any = payload["value"] + if isinstance(value, list): + value = cast("tuple[Any, ...]", tuple(value)) # pyright: ignore[reportUnknownArgumentType] + return SetpointStep( + address=payload["address"], value=value, verify=payload.get("verify", False) + ) + if kind == "action": + return ActionStep(name=payload["name"], params=dict(payload.get("params", {}))) + if kind == "check": + return CheckStep( + address=payload["address"], criterion=_criterion_from_dict(payload["criterion"]) + ) + msg = f"unknown step kind: {kind!r}" + raise ValueError(msg) + + +def steps_from_manifest(resolved_steps: Sequence[Mapping[str, Any]]) -> tuple[Step, ...]: + """Parse the pinned `ResolvedStepsRecorded.resolved_steps` back into `Step`s. + + The exact inverse of `step_to_payload` (the serialization used to pin the + conduct manifest). A resume reads the pinned event's `resolved_steps`, + parses them with this helper, and hands the result to + `Conductor.execute_from` -- it NEVER re-derives the step list from live + `Plan.wires` / partition rules. 
Pure; no Pydantic (that lives at the HTTP + boundary in `step_from_wire`). See [[project_resumable_conduct_design]]. + """ + return tuple(_step_from_payload(step) for step in resolved_steps) + + def _criterion_matches(criterion: CheckCriterion, value: Any) -> bool: """True iff `value` satisfies `criterion`. @@ -1163,8 +1349,10 @@ def _reading_to_dict(reading: Reading) -> dict[str, Any]: "ConductorResult", "EqualsCriterion", "InMemoryActionRegistry", + "ResumePolicy", "SetpointStep", "Step", "WithinToleranceCriterion", "step_to_payload", + "steps_from_manifest", ] diff --git a/apps/api/tests/unit/operation/test_conductor_execute_from.py b/apps/api/tests/unit/operation/test_conductor_execute_from.py new file mode 100644 index 0000000000..5b363477f4 --- /dev/null +++ b/apps/api/tests/unit/operation/test_conductor_execute_from.py @@ -0,0 +1,334 @@ +"""Behavioural tests for `Conductor.execute_from` (resumable conduct, Tier 1). + +`execute_from` REPLAYS a pinned conduct manifest from a re-establishment +boundary rather than re-deriving the step list: + + - setpoint -> re-drive (idempotent absolute write) + - check -> re-run as a fresh gate + - action -> HALT for an operator decision (interrupted acquisition) + +Headline acceptance test (per the design memo): replay walks the pinned +tail BYTE-FOR-BYTE -- the same two setpoints land on the in-memory port, +matching what the original conduct wrote. `steps_from_manifest` is the +exact inverse of `step_to_payload`, so the pinned `ResolvedStepsRecorded` +manifest round-trips into the replayed `Step`s. 
+""" + +from collections.abc import AsyncIterator, Mapping +from dataclasses import dataclass, field +from datetime import UTC, datetime +from typing import Any +from uuid import UUID, uuid4 + +import pytest + +from cora.infrastructure.ports.clock import FakeClock +from cora.operation.conductor import ( + ActionStep, + CheckStep, + Conductor, + EqualsCriterion, + ResumePolicy, + SetpointStep, + Step, + WithinToleranceCriterion, + step_to_payload, + steps_from_manifest, +) +from cora.operation.features.append_activities.command import AppendProcedureActivities +from cora.operation.ports.control_port import ControlNotConnectedError, Reading + +_FIXED_NOW = datetime(2026, 6, 21, 9, 0, 0, tzinfo=UTC) + + +@dataclass +class _FakeAppendStep: + """Captures each append call (the replayed journal).""" + + calls: list[AppendProcedureActivities] = field(default_factory=list[AppendProcedureActivities]) + + async def __call__(self, command: AppendProcedureActivities, **_kwargs: Any) -> int: + self.calls.append(command) + return len(command.entries) + + +@dataclass +class _LenientIds: + """id_generator that never exhausts (markers double appends).""" + + def new_id(self) -> UUID: + return uuid4() + + +@dataclass +class _RecordingControlPort: + """Captures writes in order (for byte-for-byte assertions); reads from a seed.""" + + writes: list[tuple[str, Any]] = field(default_factory=list[tuple[str, Any]]) + readings: dict[str, Reading] = field(default_factory=dict[str, Reading]) + + async def read(self, address: str) -> Reading: + if address not in self.readings: + raise ControlNotConnectedError(address) + return self.readings[address] + + async def write( + self, + address: str, + value: int | float | bool | str | tuple[Any, ...], + *, + wait: bool = True, + timeout_s: float = 30.0, + ) -> None: + _ = (wait, timeout_s) + self.writes.append((address, value)) + + def subscribe(self, address: str) -> AsyncIterator[Reading]: # pragma: no cover - unused + raise NotImplementedError + + 
+def _conductor(port: _RecordingControlPort, appender: _FakeAppendStep) -> Conductor: + return Conductor( + control_port=port, # type: ignore[arg-type] + append_step=appender, # type: ignore[arg-type] + clock=FakeClock(_FIXED_NOW), + id_generator=_LenientIds(), # type: ignore[arg-type] + ) + + +def _good_reading(value: Any) -> Reading: + return Reading(value=value, kind="Scalar", quality="Good", sampled_at=_FIXED_NOW) + + +def _pin_and_parse(steps: tuple[Step, ...]) -> tuple[Step, ...]: + """Serialize steps the way conduct pins them, then parse back (the + ResolvedStepsRecorded round-trip a real resume performs).""" + manifest_wire = tuple(step_to_payload(s) for s in steps) + return steps_from_manifest(manifest_wire) + + +# --- headline acceptance: byte-for-byte replay of the pinned tail ---------- + + +@pytest.mark.unit +async def test_execute_from_replays_pinned_tail_byte_for_byte() -> None: + """Two setpoints pinned on the manifest re-drive byte-for-byte on resume.""" + original = ( + SetpointStep(address="2bma:rot:val", value=45.0), + SetpointStep(address="2bma:cam:exposure", value=0.025), + ) + manifest = _pin_and_parse(original) + assert manifest == original # the pinned manifest round-trips to the same Steps + + port = _RecordingControlPort() + appender = _FakeAppendStep() + result = await _conductor(port, appender).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + manifest=manifest, + boundary=0, + ) + + assert result.succeeded is True + assert result.completed_count == 2 + # Byte-for-byte: the replayed writes equal the pinned manifest's setpoints. 
+ assert port.writes == [("2bma:rot:val", 45.0), ("2bma:cam:exposure", 0.025)] + + +@pytest.mark.unit +async def test_execute_from_boundary_skips_the_prefix() -> None: + """boundary=K re-drives only manifest[K:]; the prefix is not re-driven.""" + manifest = _pin_and_parse( + ( + SetpointStep(address="2bma:a", value=1.0), + SetpointStep(address="2bma:b", value=2.0), + SetpointStep(address="2bma:c", value=3.0), + ) + ) + port = _RecordingControlPort() + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + manifest=manifest, + boundary=1, + ) + assert result.completed_count == 2 + assert port.writes == [("2bma:b", 2.0), ("2bma:c", 3.0)] # 2bma:a (prefix) untouched + + +@pytest.mark.unit +async def test_execute_from_records_marker_and_outcome_with_absolute_index() -> None: + """A re-driven setpoint records the in-flight marker + ok outcome, each + carrying its ABSOLUTE manifest position (so the replayed journal lines up).""" + manifest = _pin_and_parse( + ( + SetpointStep(address="2bma:a", value=1.0), + SetpointStep(address="2bma:b", value=2.0), + ) + ) + appender = _FakeAppendStep() + await _conductor(_RecordingControlPort(), appender).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + manifest=manifest, + boundary=1, + ) + payloads = [c.entries[0].payload for c in appender.calls] + # Only the boundary step (index 1) replayed: marker then outcome, both index 1. 
+ assert [(p["step_index"], p["result"]) for p in payloads] == [(1, "in_flight"), (1, "ok")] + + +@pytest.mark.unit +async def test_execute_from_on_action_requires_operator_decision() -> None: + """An acquisition (ActionStep) is NOT re-run: resume halts for an operator + decision; the action and everything after it are untouched.""" + manifest = _pin_and_parse( + ( + SetpointStep(address="2bma:a", value=1.0), + ActionStep(name="collect", params={"dwell": 0.1}), + SetpointStep(address="2bma:c", value=3.0), + ) + ) + port = _RecordingControlPort() + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + manifest=manifest, + boundary=0, + ) + assert result.succeeded is False + assert result.completed_count == 1 # the leading setpoint re-driven + assert result.failure is not None + assert result.failure.step_index == 1 + assert result.failure.source_kind == "action" + assert result.failure.target == "collect" + assert result.failure.error_class == "AcquisitionResumeRequiresOperator" + # The action did not run and the trailing setpoint was never reached. 
+ assert port.writes == [("2bma:a", 1.0)] + + +@pytest.mark.unit +async def test_execute_from_reruns_check_fresh() -> None: + """A check in the tail is re-run as a fresh gate (read + evaluate).""" + manifest = _pin_and_parse( + (CheckStep(address="2bma:rbv", criterion=EqualsCriterion(expected=45.0)),) + ) + port = _RecordingControlPort(readings={"2bma:rbv": _good_reading(45.0)}) + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + manifest=manifest, + boundary=0, + ) + assert result.succeeded is True + assert result.completed_count == 1 + + +@pytest.mark.unit +async def test_execute_from_check_mismatch_on_rerun_halts() -> None: + """A re-run check whose criterion no longer matches halts the resume.""" + manifest = _pin_and_parse( + (CheckStep(address="2bma:rbv", criterion=EqualsCriterion(expected=45.0)),) + ) + port = _RecordingControlPort(readings={"2bma:rbv": _good_reading(12.5)}) + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + manifest=manifest, + boundary=0, + ) + assert result.succeeded is False + assert result.failure is not None + assert result.failure.error_class == "CheckFailedError" + assert result.failure.source_kind == "check" + + +@pytest.mark.unit +async def test_execute_from_boundary_past_end_is_a_no_op() -> None: + """Boundary >= len(manifest) replays an empty tail (a no-op resume).""" + manifest = _pin_and_parse((SetpointStep(address="2bma:a", value=1.0),)) + port = _RecordingControlPort() + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + manifest=manifest, + boundary=5, + ) + assert result.succeeded is True + assert result.completed_count == 0 + assert port.writes == [] + + +@pytest.mark.unit +async def test_execute_from_rejects_negative_boundary() -> None: + with 
pytest.raises(ValueError, match="boundary must be >= 0"): + await _conductor(_RecordingControlPort(), _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + manifest=(), + boundary=-1, + ) + + +@pytest.mark.unit +async def test_execute_from_explicit_re_establish_policy_is_the_default() -> None: + """Passing the only policy member behaves identically to the default.""" + manifest = _pin_and_parse((SetpointStep(address="2bma:a", value=1.0),)) + port = _RecordingControlPort() + result = await _conductor(port, _FakeAppendStep()).execute_from( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + manifest=manifest, + boundary=0, + policy=ResumePolicy.RE_ESTABLISH, + ) + assert result.succeeded is True + assert port.writes == [("2bma:a", 1.0)] + + +# --- steps_from_manifest is the exact inverse of step_to_payload ----------- + + +@pytest.mark.unit +@pytest.mark.parametrize( + "step", + [ + SetpointStep(address="2bma:rot", value=12.5, verify=True), + SetpointStep(address="2bma:energy", value=(1, 2, 3)), + ActionStep(name="collect", params={"dwell": 0.1, "detector": "2bma:cam1"}), + CheckStep(address="2bma:shutter", criterion=EqualsCriterion(expected="Open")), + CheckStep(address="2bma:idx", criterion=EqualsCriterion(expected=(1, 2))), + CheckStep( + address="2bma:temp", + criterion=WithinToleranceCriterion(expected=100.0, tolerance=0.5), + ), + ], +) +def test_steps_from_manifest_round_trips_step_to_payload(step: Step) -> None: + assert steps_from_manifest((step_to_payload(step),)) == (step,) + + +@pytest.mark.unit +def test_steps_from_manifest_rejects_unknown_kind() -> None: + with pytest.raises(ValueError, match="unknown step kind"): + steps_from_manifest(({"kind": "bogus"},)) + + +@pytest.mark.unit +def test_steps_from_manifest_rejects_unknown_criterion_kind() -> None: + bad: Mapping[str, Any] = { + "kind": "check", + "address": "x", + "criterion": {"kind": "bogus"}, + } + with 
pytest.raises(ValueError, match="unknown criterion kind"): + steps_from_manifest((bad,)) From 4d274037c2d0f4a96ce84f46f88ad52d10de56dc Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 10:35:26 +0300 Subject: [PATCH 06/12] feat(operation): reconduct_procedure resumes a Held procedure and replays its manifest Wires Conductor.execute_from into an operator entry point. reconduct is the resume twin of conduct_procedure: a thin orchestration slice (no decider) over a new Conductor.reconduct method that composes resume + execute_from + terminalize, exactly as Conductor.conduct composes start + execute + terminalize. Putting the sibling-handler composition on the Conductor (a non-slice module) keeps the slice independent (the cross-slice fitness forbids a slice importing a sibling slice) and the composition in the one place that already owns lifecycle-handler orchestration. Conductor.reconduct replays a Held Procedure's PINNED manifest tail from the re-establishment boundary, then terminalizes three ways: - clean tail (incl. empty) -> auto-complete (Completed) - acquisition halt -> leave Running, halt in the result body (operator decides redo-fresh vs reseed) - genuine step failure -> best-effort abort (Aborted) A refused resume (not-Held / held parent Run / authz deny / not-found) PROPAGATES as its mapped HTTP code rather than landing in the result body: a refused resume is a guard outcome, not a replay outcome, and no replay has happened yet. The slice handler stays thin: authz -> load -> status guard (non-Held -> 409, BEFORE the manifest lookup so a Defined procedure is never a misleading 500) -> find the pinned ResolvedStepsRecorded (a Held procedure missing it is corruption -> 500) -> steps_from_manifest -> conductor.reconduct -> project ConductorResult onto ReconductProcedureResult. Adds `_manifest_replay.find_resolved_steps_record` (sibling of the recipe finder) and `ResolvedStepsRecordNotFoundError` (500). 
`is_acquisition_halt` stays a public conductor helper (the slice reads it for the result flag); `derive_abort_reason` reverts to private (only conductor.py uses it now). wire.py passes the hoisted resume handler into the Conductor and binds reconduct over just the conductor. Coverage: 9 handler unit tests exercise resume+replay+terminalize against seeded Held+manifest states (clean / boundary / halt / step-failure / missing-manifest / not-found / not-Held / parent-Run-Held / deny). Contract tests cover the API-reachable guard/fault surfaces (404/409/422/500). NOTE: the 200 happy path is not yet API-reachable -- the synchronous conduct pins the manifest then runs to a terminal state, so there is no API path to a Held+manifest state. Producing it (a conduct that pauses to Held on a halt, or a mid-conduct cooperative hold) is a named follow-up; until then reconduct is a tested consumer ahead of its producer. Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/api/openapi.json | 181 ++++++++ .../src/cora/operation/_manifest_replay.py | 41 ++ .../aggregates/procedure/__init__.py | 2 + .../operation/aggregates/procedure/state.py | 22 + apps/api/src/cora/operation/conductor.py | 140 +++++++ .../src/cora/operation/features/__init__.py | 2 + .../features/reconduct_procedure/__init__.py | 42 ++ .../features/reconduct_procedure/command.py | 46 +++ .../features/reconduct_procedure/handler.py | 193 +++++++++ .../features/reconduct_procedure/route.py | 164 ++++++++ .../features/reconduct_procedure/tool.py | 91 ++++ apps/api/src/cora/operation/routes.py | 6 + apps/api/src/cora/operation/tools.py | 5 + apps/api/src/cora/operation/wire.py | 26 +- .../tests/architecture/test_slice_contract.py | 4 + .../test_reconduct_procedure_endpoint.py | 111 +++++ .../test_reconduct_procedure_mcp_tool.py | 64 +++ .../test_reconduct_procedure_handler.py | 387 ++++++++++++++++++ 18 files changed, 1522 insertions(+), 5 deletions(-) create mode 100644 apps/api/src/cora/operation/_manifest_replay.py 
create mode 100644 apps/api/src/cora/operation/features/reconduct_procedure/__init__.py create mode 100644 apps/api/src/cora/operation/features/reconduct_procedure/command.py create mode 100644 apps/api/src/cora/operation/features/reconduct_procedure/handler.py create mode 100644 apps/api/src/cora/operation/features/reconduct_procedure/route.py create mode 100644 apps/api/src/cora/operation/features/reconduct_procedure/tool.py create mode 100644 apps/api/tests/contract/test_reconduct_procedure_endpoint.py create mode 100644 apps/api/tests/contract/test_reconduct_procedure_mcp_tool.py create mode 100644 apps/api/tests/unit/operation/test_reconduct_procedure_handler.py diff --git a/apps/api/openapi.json b/apps/api/openapi.json index f9f066d769..f37dded7af 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -10283,6 +10283,79 @@ "title": "RecipeResponse", "type": "object" }, + "ReconductProcedureRequest": { + "additionalProperties": false, + "description": "Body for `POST /procedures/{procedure_id}/reconduct`.", + "properties": { + "re_establishment_boundary": { + "description": "Index in the pinned conduct manifest from which the resume re-drives setpoints and re-runs checks. >= 0 (0 = re-establish from the first step). NOT a continuity proof.", + "minimum": 0.0, + "title": "Re Establishment Boundary", + "type": "integer" + } + }, + "required": [ + "re_establishment_boundary" + ], + "title": "ReconductProcedureRequest", + "type": "object" + }, + "ReconductProcedureResponse": { + "description": "Response body for the reconduct_procedure slice.\n\n`succeeded` is the replay's pass/fail bit. `acquisition_halt` is True\niff the replay stopped at an acquisition needing an operator decision\n(the Procedure is left Running). 
`failure` is non-null iff `succeeded`\nis False (a halt or a genuine step failure).", + "properties": { + "acquisition_halt": { + "title": "Acquisition Halt", + "type": "boolean" + }, + "actuation_kind": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Actuation Kind" + }, + "completed_count": { + "title": "Completed Count", + "type": "integer" + }, + "failure": { + "anyOf": [ + { + "$ref": "#/components/schemas/_ConductorFailureResponse" + }, + { + "type": "null" + } + ] + }, + "procedure_id": { + "format": "uuid", + "title": "Procedure Id", + "type": "string" + }, + "re_establishment_boundary": { + "title": "Re Establishment Boundary", + "type": "integer" + }, + "succeeded": { + "title": "Succeeded", + "type": "boolean" + } + }, + "required": [ + "procedure_id", + "completed_count", + "succeeded", + "re_establishment_boundary", + "acquisition_halt" + ], + "title": "ReconductProcedureResponse", + "type": "object" + }, "RecordAcquisitionRequest": { "additionalProperties": false, "description": "Body for `POST /acquisitions`.", @@ -37032,6 +37105,114 @@ ] } }, + "/procedures/{procedure_id}/reconduct": { + "post": { + "description": "Resume + replay a Held Procedure. Replay outcomes land in the body.", + "operationId": "post_procedures_reconduct_procedures__procedure_id__reconduct_post", + "parameters": [ + { + "description": "Target procedure's id.", + "in": "path", + "name": "procedure_id", + "required": true, + "schema": { + "description": "Target procedure's id.", + "format": "uuid", + "title": "Procedure Id", + "type": "string" + } + }, + { + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. 
When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "in": "header", + "name": "X-Principal-Id", + "required": false, + "schema": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "title": "X-Principal-Id" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ReconductProcedureRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ReconductProcedureResponse" + } + } + }, + "description": "Successful Response" + }, + "403": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Authorize port denied the command." + }, + "404": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "No procedure exists with the given id." 
+ }, + "409": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Procedure is not in `Held` status, OR its parent Run is itself `Held` (off-diagonal guard)." + }, + "422": { + "description": "Path parameter or request body failed schema validation." + }, + "500": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Held Procedure is missing its pinned manifest (corruption)." + } + }, + "summary": "Resume a held Procedure and replay its pinned manifest tail (Held -> Running)", + "tags": [ + "operation" + ] + } + }, "/procedures/{procedure_id}/resume": { "post": { "operationId": "post_procedures_resume_procedures__procedure_id__resume_post", diff --git a/apps/api/src/cora/operation/_manifest_replay.py b/apps/api/src/cora/operation/_manifest_replay.py new file mode 100644 index 0000000000..c4199a2197 --- /dev/null +++ b/apps/api/src/cora/operation/_manifest_replay.py @@ -0,0 +1,41 @@ +"""Conduct-manifest replay helper for the `reconduct_procedure` handler. + +The resume path replays a halted conduct from a PINNED manifest rather +than re-deriving the step list. This module locates the +`ResolvedStepsRecorded` provenance event (pinned once at conduct start by +`conduct_procedure/handler.py` + `manifest.py`) in a Procedure stream so +the handler can parse `resolved_steps` back into `Step`s via +`conductor.steps_from_manifest` and hand them to `Conductor.execute_from`. + +Sibling of `_recipe_replay.find_recipe_expansion_record` (the recipe +genesis provenance finder), kept separate because that module's tuple of +helpers is recipe-expansion-specific. This is the SECOND handler-tier +payload-direct reader; per the replay-design rule-of-three note, when a +THIRD lands the two `find_*_record` head-scanners should hoist to a +generic `cora.infrastructure.event_payload` helper. 
+""" + +from collections.abc import Iterable + +from cora.infrastructure.ports.event_store import StoredEvent + + +def find_resolved_steps_record( + stored_events: Iterable[StoredEvent], +) -> StoredEvent | None: + """Locate the `ResolvedStepsRecorded` event in a Procedure stream. + + Scans linearly from head, returns the first match, early-exits on the + first hit. A conduct pins exactly one `ResolvedStepsRecorded` at start + (only while the Procedure is `Defined`), so a Held Procedure that has + been conducted carries exactly one; head-scan returns it. + + Returns `None` when no match. The caller decides whether None is an + error: the `reconduct_procedure` handler raises + `ResolvedStepsRecordNotFoundError` (a Held Procedure missing its pinned + manifest is corruption, not an operational outcome). + """ + for event in stored_events: + if event.event_type == "ResolvedStepsRecorded": + return event + return None diff --git a/apps/api/src/cora/operation/aggregates/procedure/__init__.py b/apps/api/src/cora/operation/aggregates/procedure/__init__.py index 759c01b78d..8b7553473b 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/__init__.py +++ b/apps/api/src/cora/operation/aggregates/procedure/__init__.py @@ -91,6 +91,7 @@ RecipeExpansionOverflowError, RecipeExpansionRecordNotFoundError, RecipeExpansionReplayMismatchError, + ResolvedStepsRecordNotFoundError, StepKind, ) @@ -162,6 +163,7 @@ "RecipeExpansionRecordNotFoundError", "RecipeExpansionRecorded", "RecipeExpansionReplayMismatchError", + "ResolvedStepsRecordNotFoundError", "ResolvedStepsRecorded", "StepKind", "event_type_name", diff --git a/apps/api/src/cora/operation/aggregates/procedure/state.py b/apps/api/src/cora/operation/aggregates/procedure/state.py index 3efeeb2180..21d28593a1 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/state.py +++ b/apps/api/src/cora/operation/aggregates/procedure/state.py @@ -474,6 +474,28 @@ def __init__(self, procedure_id: UUID) -> None: 
self.procedure_id = procedure_id +class ResolvedStepsRecordNotFoundError(Exception): + """A Held Procedure cannot locate its pinned `ResolvedStepsRecorded` manifest. + + Raised by the `reconduct_procedure` (resume-and-replay) handler when a + Held Procedure's stream carries no `ResolvedStepsRecorded` event. A + conduct pins exactly one at start (while `Defined`), so a conducted + Procedure always has it; its absence is corruption (stream truncation, + a manual event-store write, or a partial-write failure), not an + operational outcome. Kept OUT of the conduct/reconduct failures-in-body + contract (that is for step outcomes like an IOC rejecting a write). + Sibling of `RecipeExpansionRecordNotFoundError`. Mapped to HTTP 500. + """ + + def __init__(self, procedure_id: UUID) -> None: + super().__init__( + f"Procedure {procedure_id} is Held but its pinned " + f"ResolvedStepsRecorded manifest could not be located; resume " + f"replay cannot proceed." + ) + self.procedure_id = procedure_id + + class RecipeExpansionReplayMismatchError(Exception): """Replay-time hash drift on a recipe-driven Procedure. 
diff --git a/apps/api/src/cora/operation/conductor.py b/apps/api/src/cora/operation/conductor.py index 1137e6b8ad..08bdc5a0e4 100644 --- a/apps/api/src/cora/operation/conductor.py +++ b/apps/api/src/cora/operation/conductor.py @@ -138,6 +138,8 @@ from cora.operation.features.complete_procedure.handler import ( Handler as CompleteProcedureHandler, ) +from cora.operation.features.resume_procedure.command import ResumeProcedure +from cora.operation.features.resume_procedure.handler import Handler as ResumeProcedureHandler from cora.operation.features.start_procedure.command import StartProcedure from cora.operation.features.start_procedure.handler import Handler as StartProcedureHandler from cora.operation.ports.control_port import ( @@ -575,6 +577,7 @@ def __init__( start_procedure: StartProcedureHandler | None = None, complete_procedure: CompleteProcedureHandler | None = None, abort_procedure: AbortProcedureHandler | None = None, + resume_procedure: ResumeProcedureHandler | None = None, ) -> None: self._control_port = control_port self._append_step = append_step @@ -584,6 +587,7 @@ def __init__( self._start_procedure = start_procedure self._complete_procedure = complete_procedure self._abort_procedure = abort_procedure + self._resume_procedure = resume_procedure async def execute( self, @@ -891,6 +895,128 @@ async def conduct( ) return result + async def reconduct( + self, + *, + procedure_id: UUID, + principal_id: UUID, + correlation_id: UUID, + manifest: Sequence[Step], + boundary: int, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> ConductorResult: + """Resume a Held Procedure and REPLAY its pinned manifest from `boundary`. + + The resume twin of `conduct()`: where `conduct()` drives + start -> execute -> complete | abort, this drives + resume -> execute_from -> complete | (leave Running) | abort. + + 1. Issue `resume_procedure` (transitions Held -> Running). 
Its OWN + authz + off-diagonal parent-Run-Held guard fire here; a non-Held + Procedure or a held parent Run raises `ProcedureCannotResumeError` + which PROPAGATES (mapped to 409 at the route) rather than landing + in the result body. A refused resume is a guard outcome, not a + replay outcome, and no replay has happened yet. + 2. Call `self.execute_from(manifest, boundary)`: re-drive setpoints, + re-run checks, halt-for-operator on an acquisition. + 3. Terminalize three-way: + - clean tail (incl. empty) -> `complete_procedure` (Completed). + - acquisition halt -> NO transition; the Procedure stays Running + and the operator decides redo-fresh vs reseed from the result. + - genuine step failure -> best-effort `abort_procedure` (if the + abort itself fails, the original step failure is what + surfaces, mirroring `conduct()`). + + `manifest` is the parsed `ResolvedStepsRecorded.resolved_steps`: the + caller locates + parses the PINNED record (resume NEVER re-derives the + step list). `boundary` is single-sourced: it rides into both + `ProcedureResumed.re_establishment_boundary` (audit) and + `execute_from(boundary=...)` (replay). + + Requires `resume_procedure` + `complete_procedure` + `abort_procedure` + handlers at __init__; raises `RuntimeError` (a wiring bug) otherwise. + + Unlike `conduct()`, this does NOT best-effort-abort on a mid-replay + `CancelledError`: a cancellation after the resume leaves the Procedure + Running with partial replay history, the same posture as the + acquisition-halt branch (the operator reconciles). See + [[project_resumable_conduct_design]] Tier 1. + """ + if ( + self._resume_procedure is None + or self._complete_procedure is None + or self._abort_procedure is None + ): + raise RuntimeError( + "Conductor.reconduct() requires resume_procedure + complete_procedure + " + "abort_procedure handlers at __init__; only execute_from() is available " + "without them." 
+ ) + envelope_kwargs: dict[str, Any] = { + "principal_id": principal_id, + "correlation_id": correlation_id, + "causation_id": causation_id, + "surface_id": surface_id, + } + # Held -> Running. Refusals (not-Held / held parent Run / authz deny / + # not-found) propagate to the route as their mapped HTTP codes; no + # replay has happened, so they are NOT swallowed into the result body. + await self._resume_procedure( + ResumeProcedure(procedure_id=procedure_id, re_establishment_boundary=boundary), + **envelope_kwargs, + ) + result = await self.execute_from( + procedure_id=procedure_id, + principal_id=principal_id, + correlation_id=correlation_id, + manifest=manifest, + boundary=boundary, + causation_id=causation_id, + surface_id=surface_id, + ) + actuation_kind = result.actuation_kind.value if result.actuation_kind is not None else None + if result.succeeded: + # Clean tail (incl. empty tail): auto-complete, threading the + # observed kind onto ProcedureCompleted (Data BC gate carrier). + try: + await self._complete_procedure( + CompleteProcedure(procedure_id=procedure_id, actuation_kind=actuation_kind), + **envelope_kwargs, + ) + except _LIFECYCLE_RERAISE: + raise + except Exception as exc: + return ConductorResult( + procedure_id=procedure_id, + completed_count=result.completed_count, + failure=ConductorFailure( + step_index=None, + source_kind=_SOURCE_KIND_LIFECYCLE, + target=_LIFECYCLE_TARGET_COMPLETE, + error_class=type(exc).__name__, + message=str(exc), + ), + ) + return result + if is_acquisition_halt(result.failure): + # Halt-for-operator: leave the Procedure Running; no transition. + return result + # Genuine step failure: best-effort abort (if abort itself fails, the + # original step failure is what surfaces). Mirrors conduct(). 
+ failure = result.failure + assert failure is not None # not succeeded + not halt -> failure + with contextlib.suppress(Exception): + await self._abort_procedure( + AbortProcedure( + procedure_id=procedure_id, + reason=_derive_abort_reason(failure), + actuation_kind=actuation_kind, + ), + **envelope_kwargs, + ) + return result + async def _dispatch( self, step: Step, @@ -1281,6 +1407,19 @@ def steps_from_manifest(resolved_steps: Sequence[Mapping[str, Any]]) -> tuple[St return tuple(_step_from_payload(step) for step in resolved_steps) +def is_acquisition_halt(failure: ConductorFailure | None) -> bool: + """True iff `failure` is `execute_from`'s halt-for-operator on an acquisition. + + Distinguishes the resume halt (an `ActionStep` reached during replay, + which is a needs-operator-decision hand-off, NOT a failure) from a + genuine step failure (a setpoint/check that failed). A resume + orchestration completes on success, leaves the Procedure Running on an + acquisition halt, and aborts on a genuine failure -- this predicate is + the branch. See `_RESUME_HALT_ERROR_CLASS` and + [[project_resumable_conduct_design]].""" + return failure is not None and failure.error_class == _RESUME_HALT_ERROR_CLASS + + def _criterion_matches(criterion: CheckCriterion, value: Any) -> bool: """True iff `value` satisfies `criterion`. 
@@ -1353,6 +1492,7 @@ def _reading_to_dict(reading: Reading) -> dict[str, Any]: "SetpointStep", "Step", "WithinToleranceCriterion", + "is_acquisition_halt", "step_to_payload", "steps_from_manifest", ] diff --git a/apps/api/src/cora/operation/features/__init__.py b/apps/api/src/cora/operation/features/__init__.py index 4ed93561f5..ca82555e5b 100644 --- a/apps/api/src/cora/operation/features/__init__.py +++ b/apps/api/src/cora/operation/features/__init__.py @@ -46,6 +46,7 @@ get_procedure, hold_procedure, list_procedures, + reconduct_procedure, register_procedure, resume_procedure, start_procedure, @@ -59,6 +60,7 @@ "get_procedure", "hold_procedure", "list_procedures", + "reconduct_procedure", "register_procedure", "resume_procedure", "start_procedure", diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/__init__.py b/apps/api/src/cora/operation/features/reconduct_procedure/__init__.py new file mode 100644 index 0000000000..eec770fe7c --- /dev/null +++ b/apps/api/src/cora/operation/features/reconduct_procedure/__init__.py @@ -0,0 +1,42 @@ +"""Vertical slice for the `ReconductProcedure` command. + +Operator-facing resume-and-replay orchestration: resumes a Held +Procedure and hands control to the `Conductor` runtime, which replays the +pinned manifest tail from the re-establishment boundary (re-drive +setpoints, re-run checks, halt-for-operator on an acquisition), then +auto-completes on a clean tail / aborts on a step failure / leaves +Running on an acquisition halt. Returns a structured +`ReconductProcedureResult`; replay outcomes are encoded in the result, +not raised. + + from cora.operation.features import reconduct_procedure + + cmd = reconduct_procedure.ReconductProcedure(procedure_id=..., re_establishment_boundary=K) + handler = reconduct_procedure.bind( + deps, conductor=conductor + ) + result = await handler(cmd, principal_id=..., correlation_id=...) 
+""" + +from cora.operation.features.reconduct_procedure import tool +from cora.operation.features.reconduct_procedure.command import ( + ReconductProcedure, + ReconductProcedureResult, +) +from cora.operation.features.reconduct_procedure.handler import Handler, bind +from cora.operation.features.reconduct_procedure.route import ( + ReconductProcedureRequest, + ReconductProcedureResponse, + router, +) + +__all__ = [ + "Handler", + "ReconductProcedure", + "ReconductProcedureRequest", + "ReconductProcedureResponse", + "ReconductProcedureResult", + "bind", + "router", + "tool", +] diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/command.py b/apps/api/src/cora/operation/features/reconduct_procedure/command.py new file mode 100644 index 0000000000..4850623bf1 --- /dev/null +++ b/apps/api/src/cora/operation/features/reconduct_procedure/command.py @@ -0,0 +1,46 @@ +"""The `ReconductProcedure` command + result -- intent dataclass for this slice. + +Resume-and-replay orchestration: resume a `Held` Procedure and replay its +PINNED manifest tail from the re-establishment boundary (Tier 1 of +[[project_resumable_conduct_design]]). Mirrors `ConductProcedure` (the +conduct orchestration) but for the resume path; carries the +`re_establishment_boundary` (single-sourced -- it rides into both +`ProcedureResumed` and `Conductor.execute_from`). +""" + +from dataclasses import dataclass +from uuid import UUID + +from cora.operation.conductor import ConductorFailure + + +@dataclass(frozen=True) +class ReconductProcedure: + """Resume a held Procedure and replay its pinned manifest tail.""" + + procedure_id: UUID + re_establishment_boundary: int + + +@dataclass(frozen=True) +class ReconductProcedureResult: + """Outcome of a reconduct (resume + replay). + + `succeeded` is the canonical pass/fail bit (the replay's outcome). 
+ `acquisition_halt` is True iff replay stopped at an acquisition that + needs an operator decision (redo-fresh vs reseed): in that case the + Procedure is LEFT Running (no complete, no abort) and `failure` carries + the halt. On a clean replay the Procedure is auto-completed; on a + genuine step failure it is aborted. `completed_count` is the number of + re-driven / re-run tail steps that succeeded; `actuation_kind` is the + Conductor's observed kind over the replay (None when nothing + instrumented was actuated). + """ + + procedure_id: UUID + completed_count: int + succeeded: bool + re_establishment_boundary: int + acquisition_halt: bool = False + failure: ConductorFailure | None = None + actuation_kind: str | None = None diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/handler.py b/apps/api/src/cora/operation/features/reconduct_procedure/handler.py new file mode 100644 index 0000000000..335d0371d1 --- /dev/null +++ b/apps/api/src/cora/operation/features/reconduct_procedure/handler.py @@ -0,0 +1,193 @@ +"""Application handler for the `reconduct_procedure` slice. + +Resume-and-replay orchestration. Mirrors `conduct_procedure`: a thin +slice handler that authz-checks + loads + locates the pinned manifest, +then delegates the resume + replay + terminalize composition to +`Conductor.reconduct` (the resume twin of `Conductor.conduct`). No +`decider.py`: like `conduct_procedure` this is an orchestration entry +point, not an aggregate-state-mutating decider. + +This handler imports NO sibling slice: the resume / complete / abort +handlers it composes live on the injected `Conductor` (a non-slice +module), exactly as `conduct_procedure` delegates start / complete / +abort to `Conductor.conduct`. That keeps the slice independent (the +cross-slice fitness) and the composition in the one place that already +owns lifecycle-handler orchestration. + +## Flow + + 1. authz `ReconductProcedure`. + 2. load the Procedure + its raw events. + 3. 
status guard FIRST: a non-Held Procedure is a `ProcedureCannotResumeError` + (409), raised BEFORE the manifest lookup so a Defined / Completed + Procedure is never a misleading 500 and no resume-then-fail partial + state can occur. + 4. locate the PINNED `ResolvedStepsRecorded` (a conducted, Held Procedure + ALWAYS has exactly one; its absence is corruption -> + `ResolvedStepsRecordNotFoundError`, 500) and parse it back into `Step`s + via `steps_from_manifest` -- resume NEVER re-derives the step list. + 5. `Conductor.reconduct(manifest, boundary)`: resume (Held -> Running, with + its own authz + off-diagonal parent-Run-Held guard) -> `execute_from` + (re-drive setpoints, re-run checks, halt-for-operator on an acquisition) + -> terminalize (complete on a clean tail / leave Running on an + acquisition halt / best-effort abort on a genuine step failure). + 6. project the `ConductorResult` onto `ReconductProcedureResult` + (`acquisition_halt` is the named branch on the resume halt). + +The `re_establishment_boundary` is single-sourced: the operator supplies +it once; `Conductor.reconduct` rides it into both +`ProcedureResumed.re_establishment_boundary` (audit) and +`execute_from(boundary=...)` (replay). + +## Authorization scope + +`ReconductProcedure` is authz-checked as its own command. The wrapped +`resume_procedure` / `complete_procedure` / `abort_procedure` handlers +(on the Conductor) each authz internally with their OWN command names; an +operator authorized to call `ReconductProcedure` is NOT automatically +authorized for those individually. Same layering as `conduct_procedure`. 
+""" + +from typing import Protocol +from uuid import UUID + +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.logging import get_logger +from cora.infrastructure.ports import Deny +from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation._manifest_replay import find_resolved_steps_record +from cora.operation.aggregates.procedure import ( + ProcedureCannotResumeError, + ProcedureNotFoundError, + ProcedureStatus, + ResolvedStepsRecordNotFoundError, + load_procedure_with_events, +) +from cora.operation.conductor import Conductor, is_acquisition_halt, steps_from_manifest +from cora.operation.errors import UnauthorizedError +from cora.operation.features.reconduct_procedure.command import ( + ReconductProcedure, + ReconductProcedureResult, +) + +_COMMAND_NAME = "ReconductProcedure" + +_log = get_logger(__name__) + + +class Handler(Protocol): + """Callable interface every reconduct_procedure handler implements.""" + + async def __call__( + self, + command: ReconductProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> ReconductProcedureResult: ... + + +def bind(deps: Kernel, *, conductor: Conductor) -> Handler: + """Build a reconduct_procedure handler closed over deps + the Conductor. + + `conductor` is the same BC-internal Conductor `conduct_procedure` uses; + it carries the resume / complete / abort handlers (wired at app + composition) that `Conductor.reconduct` composes, so the internal + transitions land with the same observability shape as direct REST / MCP + calls. 
+ """ + + async def handler( + command: ReconductProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> ReconductProcedureResult: + _log.info( + "reconduct_procedure.start", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + re_establishment_boundary=command.re_establishment_boundary, + principal_id=str(principal_id), + correlation_id=str(correlation_id), + causation_id=str(causation_id) if causation_id is not None else None, + ) + + authz = await deps.authz.authorize( + principal_id=principal_id, + command_name=_COMMAND_NAME, + conduit_id=NIL_SENTINEL_ID, + surface_id=surface_id, + ) + if isinstance(authz, Deny): + _log.info( + "reconduct_procedure.denied", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + principal_id=str(principal_id), + correlation_id=str(correlation_id), + reason=authz.reason, + ) + raise UnauthorizedError(authz.reason) + + procedure, stored_events = await load_procedure_with_events( + deps.event_store, command.procedure_id + ) + if procedure is None: + raise ProcedureNotFoundError(command.procedure_id) + + # Status guard FIRST (mirrors resume's `{Held}` source set): a + # non-Held Procedure is a 409, not a 500. This keeps the + # manifest-missing case below as genuine corruption (a conducted, + # Held Procedure ALWAYS has its pinned manifest) and avoids resuming + # then failing to find the manifest. The off-diagonal parent-Run-Held + # guard stays inside Conductor.reconduct's resume call. + if procedure.status is not ProcedureStatus.HELD: + raise ProcedureCannotResumeError(command.procedure_id, current_status=procedure.status) + + # Replay the PINNED manifest, never re-derive. A Held Procedure that + # was conducted always has exactly one ResolvedStepsRecorded; its + # absence here is corruption (500), not an operational outcome. 
+ record = find_resolved_steps_record(stored_events) + if record is None: + raise ResolvedStepsRecordNotFoundError(command.procedure_id) + manifest = steps_from_manifest(record.payload["resolved_steps"]) + + result = await conductor.reconduct( + procedure_id=command.procedure_id, + principal_id=principal_id, + correlation_id=correlation_id, + manifest=manifest, + boundary=command.re_establishment_boundary, + causation_id=causation_id, + surface_id=surface_id, + ) + + actuation_kind = result.actuation_kind.value if result.actuation_kind is not None else None + acquisition_halt = is_acquisition_halt(result.failure) + + _log.info( + "reconduct_procedure.success", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + completed_count=result.completed_count, + succeeded=result.succeeded, + acquisition_halt=acquisition_halt, + failure_class=(result.failure.error_class if result.failure is not None else None), + ) + + return ReconductProcedureResult( + procedure_id=command.procedure_id, + completed_count=result.completed_count, + succeeded=result.succeeded, + re_establishment_boundary=command.re_establishment_boundary, + acquisition_halt=acquisition_halt, + failure=result.failure, + actuation_kind=actuation_kind, + ) + + return handler diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/route.py b/apps/api/src/cora/operation/features/reconduct_procedure/route.py new file mode 100644 index 0000000000..5ce9182ab7 --- /dev/null +++ b/apps/api/src/cora/operation/features/reconduct_procedure/route.py @@ -0,0 +1,164 @@ +"""HTTP route for the `reconduct_procedure` slice. + +`POST /procedures/{procedure_id}/reconduct` resumes a Held Procedure and +replays its pinned manifest tail from `re_establishment_boundary`. 
+ +## Response code: 200, replay outcomes in body + +Like `conduct_procedure`, replay outcomes (a step that failed, an +acquisition that needs an operator decision) are NORMAL operational +results that land in the body, not HTTP errors. Only protocol / auth / +guard faults map to HTTP codes: 403 (authz deny), 404 (no procedure), +409 (Procedure not Held, or parent Run Held -- from the resume guard), +422 (negative boundary / malformed id), 500 (Held Procedure missing its +pinned manifest -- corruption). +""" + +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, Depends, Path, Request, status +from pydantic import BaseModel, Field + +from cora.infrastructure.routing import ( + ErrorResponse, + get_correlation_id, + get_principal_id, + get_surface_id, +) +from cora.operation.conductor import ConductorFailure +from cora.operation.features.reconduct_procedure.command import ( + ReconductProcedure, + ReconductProcedureResult, +) +from cora.operation.features.reconduct_procedure.handler import Handler + + +class ReconductProcedureRequest(BaseModel): + """Body for `POST /procedures/{procedure_id}/reconduct`.""" + + re_establishment_boundary: int = Field( + ..., + ge=0, + description=( + "Index in the pinned conduct manifest from which the resume " + "re-drives setpoints and re-runs checks. >= 0 (0 = re-establish " + "from the first step). NOT a continuity proof." + ), + ) + + model_config = {"extra": "forbid"} + + +class _ConductorFailureResponse(BaseModel): + """JSON wire shape for `ConductorFailure`.""" + + step_index: int | None + source_kind: str + target: str + error_class: str + message: str + + +class ReconductProcedureResponse(BaseModel): + """Response body for the reconduct_procedure slice. + + `succeeded` is the replay's pass/fail bit. `acquisition_halt` is True + iff the replay stopped at an acquisition needing an operator decision + (the Procedure is left Running). 
`failure` is non-null iff `succeeded` + is False (a halt or a genuine step failure). + """ + + procedure_id: UUID + completed_count: int + succeeded: bool + re_establishment_boundary: int + acquisition_halt: bool + failure: _ConductorFailureResponse | None = None + actuation_kind: str | None = None + + +def _failure_to_wire(failure: ConductorFailure) -> _ConductorFailureResponse: + return _ConductorFailureResponse( + step_index=failure.step_index, + source_kind=failure.source_kind, + target=failure.target, + error_class=failure.error_class, + message=failure.message, + ) + + +def result_to_wire(result: ReconductProcedureResult) -> ReconductProcedureResponse: + """Build a `ReconductProcedureResponse` from the slice result. + + Public because `tool.py` calls it too. + """ + return ReconductProcedureResponse( + procedure_id=result.procedure_id, + completed_count=result.completed_count, + succeeded=result.succeeded, + re_establishment_boundary=result.re_establishment_boundary, + acquisition_halt=result.acquisition_halt, + failure=_failure_to_wire(result.failure) if result.failure is not None else None, + actuation_kind=result.actuation_kind, + ) + + +def _get_handler(request: Request) -> Handler: + handler: Handler = request.app.state.operation.reconduct_procedure + return handler + + +router = APIRouter(tags=["operation"]) + + +@router.post( + "/procedures/{procedure_id}/reconduct", + status_code=status.HTTP_200_OK, + response_model=ReconductProcedureResponse, + responses={ + status.HTTP_403_FORBIDDEN: { + "model": ErrorResponse, + "description": "Authorize port denied the command.", + }, + status.HTTP_404_NOT_FOUND: { + "model": ErrorResponse, + "description": "No procedure exists with the given id.", + }, + status.HTTP_409_CONFLICT: { + "model": ErrorResponse, + "description": ( + "Procedure is not in `Held` status, OR its parent Run is " + "itself `Held` (off-diagonal guard)." 
+ ), + }, + status.HTTP_422_UNPROCESSABLE_CONTENT: { + "description": "Path parameter or request body failed schema validation.", + }, + status.HTTP_500_INTERNAL_SERVER_ERROR: { + "model": ErrorResponse, + "description": "Held Procedure is missing its pinned manifest (corruption).", + }, + }, + summary="Resume a held Procedure and replay its pinned manifest tail (Held -> Running)", +) +async def post_procedures_reconduct( + procedure_id: Annotated[UUID, Path(description="Target procedure's id.")], + body: ReconductProcedureRequest, + handler: Annotated[Handler, Depends(_get_handler)], + cid: Annotated[UUID, Depends(get_correlation_id)], + principal_id: Annotated[UUID, Depends(get_principal_id)], + surface_id: Annotated[UUID, Depends(get_surface_id)], +) -> ReconductProcedureResponse: + """Resume + replay a Held Procedure. Replay outcomes land in the body.""" + command = ReconductProcedure( + procedure_id=procedure_id, + re_establishment_boundary=body.re_establishment_boundary, + ) + result = await handler( + command, + principal_id=principal_id, + correlation_id=cid, + surface_id=surface_id, + ) + return result_to_wire(result) diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/tool.py b/apps/api/src/cora/operation/features/reconduct_procedure/tool.py new file mode 100644 index 0000000000..cc10b6a947 --- /dev/null +++ b/apps/api/src/cora/operation/features/reconduct_procedure/tool.py @@ -0,0 +1,91 @@ +"""MCP tool for the `reconduct_procedure` slice. + +Mirrors the REST route: resumes a Held Procedure and replays its pinned +manifest tail, returning a structured summary. Replay outcomes (a step +failure, an acquisition halt) land in the return value, not raised; the +LLM caller inspects `succeeded` / `acquisition_halt` / `failure`. 
+""" + +from collections.abc import Callable +from typing import Annotated, Any +from uuid import UUID + +from mcp.server.fastmcp import Context, FastMCP +from pydantic import BaseModel, Field + +from cora.infrastructure.mcp_principal import get_mcp_principal_id +from cora.infrastructure.observability import current_correlation_id +from cora.infrastructure.routing import get_mcp_surface_id +from cora.operation.features.reconduct_procedure.command import ReconductProcedure +from cora.operation.features.reconduct_procedure.handler import Handler +from cora.operation.features.reconduct_procedure.route import ( + ReconductProcedureResponse, + result_to_wire, +) + + +class _ToolResult(BaseModel): + """MCP-shape mirror of `ReconductProcedureResponse` for tool-output validation.""" + + procedure_id: UUID + completed_count: int + succeeded: bool + re_establishment_boundary: int + acquisition_halt: bool + failure: dict[str, Any] | None = None + actuation_kind: str | None = None + + +def register(mcp: FastMCP, *, get_handler: Callable[[], Handler]) -> None: + """Register the `reconduct_procedure` tool on the given MCP server.""" + + @mcp.tool( + name="reconduct_procedure", + description=( + "Resume a Held Procedure and replay its pinned manifest tail from " + "re_establishment_boundary: re-drive setpoints, re-run checks, and " + "HALT for an operator decision at an acquisition. On a clean tail " + "the Procedure auto-completes; on an acquisition halt it stays " + "Running (acquisition_halt=True); on a genuine step failure it " + "aborts. Returns a structured summary; outcomes DO NOT raise. " + "Requires the Procedure to be Held (and, for a Phase-of-Run " + "Procedure, its parent Run not Held)." 
+ ), + ) + async def reconduct_procedure_tool( # pyright: ignore[reportUnusedFunction] + ctx: Context[Any, Any, Any], + procedure_id: Annotated[ + UUID, + Field(description="Target procedure's id."), + ], + re_establishment_boundary: Annotated[ + int, + Field( + ge=0, + description=( + "Manifest index the resume re-drives setpoints / re-runs " + "checks from (>= 0; 0 = from the first step)." + ), + ), + ], + ) -> _ToolResult: + handler = get_handler() + result = await handler( + ReconductProcedure( + procedure_id=procedure_id, + re_establishment_boundary=re_establishment_boundary, + ), + principal_id=get_mcp_principal_id(ctx), + correlation_id=current_correlation_id(), + surface_id=get_mcp_surface_id(), + ) + wire: ReconductProcedureResponse = result_to_wire(result) + return _ToolResult( + procedure_id=wire.procedure_id, + completed_count=wire.completed_count, + succeeded=wire.succeeded, + re_establishment_boundary=wire.re_establishment_boundary, + acquisition_halt=wire.acquisition_halt, + failure=wire.failure.model_dump() if wire.failure is not None else None, + actuation_kind=wire.actuation_kind, + ) diff --git a/apps/api/src/cora/operation/routes.py b/apps/api/src/cora/operation/routes.py index 5d922c3633..d0c20999c5 100644 --- a/apps/api/src/cora/operation/routes.py +++ b/apps/api/src/cora/operation/routes.py @@ -74,6 +74,7 @@ RecipeExpansionOverflowError, RecipeExpansionRecordNotFoundError, RecipeExpansionReplayMismatchError, + ResolvedStepsRecordNotFoundError, ) from cora.operation.errors import ( AssetNotPseudoAxisError, @@ -96,6 +97,7 @@ hold_procedure, list_procedure_iterations, list_procedures, + reconduct_procedure, register_procedure, register_procedure_from_recipe, resume_procedure, @@ -238,6 +240,7 @@ def register_operation_routes(app: FastAPI) -> None: app.include_router(truncate_procedure.router) app.include_router(hold_procedure.router) app.include_router(resume_procedure.router) + app.include_router(reconduct_procedure.router) 
app.include_router(start_iteration.router) app.include_router(end_iteration.router) app.include_router(append_activities.router) @@ -363,6 +366,9 @@ def register_operation_routes(app: FastAPI) -> None: RecipeExpanderVersionMismatchError, RecipeExpansionRecordNotFoundError, RecipeExpansionReplayMismatchError, + # resumable conduct: a Held Procedure missing its pinned manifest + # (corruption); kept out of the reconduct failures-in-body contract. + ResolvedStepsRecordNotFoundError, # PseudoAxis pre-Conductor expansion ([[project-pseudoaxis-design]] # v3): the partition-rule math kernel returned a non-finite result, # rejected an unsupported AggregatorKind / PartitionKind variant, diff --git a/apps/api/src/cora/operation/tools.py b/apps/api/src/cora/operation/tools.py index 08e0731d55..3024ff69b5 100644 --- a/apps/api/src/cora/operation/tools.py +++ b/apps/api/src/cora/operation/tools.py @@ -22,6 +22,7 @@ tool as list_procedure_iterations_tool, ) from cora.operation.features.list_procedures import tool as list_procedures_tool +from cora.operation.features.reconduct_procedure import tool as reconduct_procedure_tool from cora.operation.features.register_procedure import tool as register_procedure_tool from cora.operation.features.register_procedure_from_recipe import ( tool as register_procedure_from_recipe_tool, @@ -71,6 +72,10 @@ def register_operation_tools( mcp, get_handler=lambda: get_handlers().resume_procedure, ) + reconduct_procedure_tool.register( + mcp, + get_handler=lambda: get_handlers().reconduct_procedure, + ) start_iteration_tool.register( mcp, get_handler=lambda: get_handlers().start_iteration, diff --git a/apps/api/src/cora/operation/wire.py b/apps/api/src/cora/operation/wire.py index 8ceb274e2a..c9dc2bd293 100644 --- a/apps/api/src/cora/operation/wire.py +++ b/apps/api/src/cora/operation/wire.py @@ -77,6 +77,7 @@ hold_procedure, list_procedure_iterations, list_procedures, + reconduct_procedure, register_procedure, register_procedure_from_recipe, 
resume_procedure, @@ -107,6 +108,7 @@ class OperationHandlers: truncate_procedure: truncate_procedure.Handler hold_procedure: hold_procedure.Handler resume_procedure: resume_procedure.Handler + reconduct_procedure: reconduct_procedure.Handler start_iteration: start_iteration.Handler end_iteration: end_iteration.Handler append_activities: append_activities.Handler @@ -181,6 +183,14 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> command_name="AbortProcedure", bc=_BC, ) + # Hoisted to a local so the bundle field AND the Conductor share ONE + # post-tracing resume handler instance (mirrors the start/complete/abort + # hoist; Conductor.reconduct composes this resume handler). + resume_handler = with_tracing( + resume_procedure.bind(deps), + command_name="ResumeProcedure", + bc=_BC, + ) append_step_handler = with_tracing( append_activities.bind(deps, step_store=step_store), command_name="AppendProcedureActivities", @@ -203,6 +213,15 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> start_procedure=start_handler, complete_procedure=complete_handler, abort_procedure=abort_handler, + resume_procedure=resume_handler, + ) + # Resume-and-replay orchestration: a thin slice handler over + # Conductor.reconduct (which composes resume + execute_from + + # complete/abort). Reuses the same conductor; no sibling-slice imports. 
+ reconduct_handler = with_tracing( + reconduct_procedure.bind(deps, conductor=conductor), + command_name="ReconductProcedure", + bc=_BC, ) return OperationHandlers( register_procedure=with_tracing( @@ -244,11 +263,8 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> command_name="HoldProcedure", bc=_BC, ), - resume_procedure=with_tracing( - resume_procedure.bind(deps), - command_name="ResumeProcedure", - bc=_BC, - ), + resume_procedure=resume_handler, + reconduct_procedure=reconduct_handler, start_iteration=with_tracing( start_iteration.bind(deps), command_name="StartProcedureIteration", diff --git a/apps/api/tests/architecture/test_slice_contract.py b/apps/api/tests/architecture/test_slice_contract.py index 3ef83df5fb..6a7cac218c 100644 --- a/apps/api/tests/architecture/test_slice_contract.py +++ b/apps/api/tests/architecture/test_slice_contract.py @@ -56,6 +56,10 @@ # complete_procedure / abort_procedure handlers; no direct event # emission. See [[project_edge_runtime_design]]. "cora.operation.features.conduct_procedure", + # Resume-and-replay entry: delegates resume_procedure + + # Conductor.execute_from + complete/abort; no direct event emission. + # See [[project_resumable_conduct_design]]. + "cora.operation.features.reconduct_procedure", # Bulk-mint sweep: enumerates Assets missing a persistent id and # delegates each to the assign_asset_persistent_id handler; no direct # event emission. See [[project_asset_persistent_id_design]]. diff --git a/apps/api/tests/contract/test_reconduct_procedure_endpoint.py b/apps/api/tests/contract/test_reconduct_procedure_endpoint.py new file mode 100644 index 0000000000..45ca219acf --- /dev/null +++ b/apps/api/tests/contract/test_reconduct_procedure_endpoint.py @@ -0,0 +1,111 @@ +"""Contract tests for `POST /procedures/{procedure_id}/reconduct`. + +Resume-and-replay: resumes a Held Procedure and replays its pinned +manifest tail. 
200 with replay outcomes in body; 404/409/422/500 for +protocol / guard / corruption faults. + +Note on coverage: the 200 happy path (a clean replay that auto-completes) +requires a `Held` Procedure that carries a PINNED `ResolvedStepsRecorded` +manifest. The synchronous conduct flow today pins the manifest then runs +to a terminal state (Completed / Aborted) in one call, so there is no +API-reachable `Held`+manifest state yet (producing it -- a conduct that +pauses to Held instead of aborting on a halt, or a mid-conduct +cooperative hold -- is a follow-up slice). The clean / halt / step-failure +replay paths are covered end-to-end in +`tests/unit/operation/test_reconduct_procedure_handler.py` against a +seeded Held+manifest state. These contract tests cover the +API-reachable guard / fault surfaces. +""" + +from typing import Any +from uuid import UUID, uuid4 + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app + + +def _register(client: TestClient) -> UUID: + body: dict[str, Any] = {"name": "Vessel-A bakeout", "kind": "bakeout"} + return UUID(client.post("/procedures", json=body).json()["procedure_id"]) + + +@pytest.mark.contract +def test_post_reconduct_returns_404_for_unknown_id() -> None: + with TestClient(create_app()) as client: + response = client.post( + f"/procedures/{uuid4()}/reconduct", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 404 + + +@pytest.mark.contract +def test_post_reconduct_returns_409_for_defined_procedure() -> None: + """A Defined (non-Held) Procedure cannot be reconducted.""" + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 409 + + +@pytest.mark.contract +def test_post_reconduct_returns_409_for_completed_procedure_with_manifest() -> None: + """A conduct pins a manifest then completes; reconducting the 
(Completed) + Procedure is refused by the resume status guard (not Held).""" + with TestClient(create_app()) as client: + pid = _register(client) + # Conduct an EMPTY step list: pins ResolvedStepsRecorded, then + # start -> (no steps) -> complete, leaving the Procedure Completed + # WITH a pinned (empty) manifest. + conducted = client.post(f"/procedures/{pid}/conduct", json={"steps": []}) + assert conducted.status_code == 200 + assert conducted.json()["succeeded"] is True + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 409 + + +@pytest.mark.contract +def test_post_reconduct_returns_500_for_held_procedure_without_manifest() -> None: + """A Procedure started directly (no conduct) then held is Held WITHOUT a + pinned manifest; reconduct cannot locate it (corruption-shaped 500).""" + with TestClient(create_app()) as client: + pid = _register(client) + assert client.post(f"/procedures/{pid}/start").status_code == 204 + assert client.post(f"/procedures/{pid}/hold", json={"reason": "pause"}).status_code == 204 + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 500 + + +@pytest.mark.contract +def test_post_reconduct_returns_422_for_negative_boundary() -> None: + """Pydantic ge=0 rejects a negative boundary at the wire before the handler.""" + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": -1} + ) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_reconduct_returns_422_for_missing_boundary() -> None: + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post(f"/procedures/{pid}/reconduct", json={}) + assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_reconduct_returns_422_for_malformed_id() -> None: + with 
TestClient(create_app()) as client: + response = client.post( + "/procedures/not-a-uuid/reconduct", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 422 diff --git a/apps/api/tests/contract/test_reconduct_procedure_mcp_tool.py b/apps/api/tests/contract/test_reconduct_procedure_mcp_tool.py new file mode 100644 index 0000000000..b5df533fee --- /dev/null +++ b/apps/api/tests/contract/test_reconduct_procedure_mcp_tool.py @@ -0,0 +1,64 @@ +"""Contract tests for the `reconduct_procedure` MCP tool.""" + +from uuid import UUID + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app +from tests.contract._mcp_helpers import open_session, parse_sse_data + + +def _register_via_mcp(client: TestClient, headers: dict[str, str]) -> UUID: + reg = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 1, + "method": "tools/call", + "params": { + "name": "register_procedure", + "arguments": {"name": "Vessel-A bakeout", "kind": "bakeout"}, + }, + }, + headers=headers, + ) + return UUID(parse_sse_data(reg.text)["result"]["structuredContent"]["procedure_id"]) + + +@pytest.mark.contract +def test_mcp_lists_reconduct_procedure_tool() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + response = client.post( + "/mcp", + json={"jsonrpc": "2.0", "id": 99, "method": "tools/list"}, + headers=headers, + ) + body = parse_sse_data(response.text) + tool_names = [t["name"] for t in body["result"]["tools"]] + assert "reconduct_procedure" in tool_names + + +@pytest.mark.contract +def test_mcp_reconduct_procedure_tool_errors_for_non_held() -> None: + """Reconducting a Defined (non-Held) Procedure surfaces the resume guard + as an MCP error (the tool wiring is exercised end-to-end).""" + with TestClient(create_app()) as client: + headers = open_session(client) + pid = _register_via_mcp(client, headers) + response = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 3, + "method": 
"tools/call", + "params": { + "name": "reconduct_procedure", + "arguments": {"procedure_id": str(pid), "re_establishment_boundary": 0}, + }, + }, + headers=headers, + ) + body = parse_sse_data(response.text) + assert body["result"]["isError"] is True diff --git a/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py b/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py new file mode 100644 index 0000000000..5a18773276 --- /dev/null +++ b/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py @@ -0,0 +1,387 @@ +"""Application-handler tests for `reconduct_procedure` (resume + replay). + +Orchestration handler composing `resume_procedure` + `Conductor.execute_from` ++ complete/abort. Pins the three-way terminal contract and the guards: + + - clean tail -> resume + auto-complete (Completed) + - acquisition halt -> resume, NO complete/abort, stays Running, halt in result + - genuine step failure -> resume + abort (Aborted) + - missing pinned manifest -> ResolvedStepsRecordNotFoundError + - not Held / parent Run Held -> ProcedureCannotResumeError (no replay) + - authz deny -> UnauthorizedError +""" + +from collections.abc import Sequence +from dataclasses import dataclass +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.infrastructure.adapters.in_memory_event_store import InMemoryEventStore +from cora.infrastructure.event_envelope import to_new_event +from cora.infrastructure.kernel import Kernel +from cora.operation.adapters.in_memory_control_port import InMemoryControlPort +from cora.operation.aggregates.procedure import ( + InMemoryActivityStore, + ProcedureCannotResumeError, + ProcedureHeld, + ProcedureNotFoundError, + ProcedureRegistered, + ProcedureStarted, + ProcedureStatus, + ResolvedStepsRecorded, + ResolvedStepsRecordNotFoundError, + event_type_name, + load_procedure, + to_payload, +) +from cora.operation.conductor import ActionStep, Conductor, SetpointStep, Step, 
step_to_payload +from cora.operation.errors import UnauthorizedError +from cora.operation.features import ( + abort_procedure, + append_activities, + complete_procedure, + reconduct_procedure, + resume_procedure, +) +from cora.operation.features.reconduct_procedure import ( + Handler as ReconductHandler, +) +from cora.operation.features.reconduct_procedure import ( + ReconductProcedure, + ReconductProcedureResult, +) +from cora.run.aggregates.run import RunHeld, RunStarted +from cora.run.aggregates.run import event_type_name as run_event_type_name +from cora.run.aggregates.run import to_payload as run_to_payload +from tests.unit._helpers import build_deps as _build_deps_shared + +_NOW = datetime(2026, 6, 21, 12, 0, 0, tzinfo=UTC) +_PRIOR = datetime(2026, 6, 21, 11, 0, 0, tzinfo=UTC) +_PROCEDURE_ID = UUID("01900000-0000-7000-8000-0000000d0a01") +_PRINCIPAL_ID = UUID("01900000-0000-7000-8000-000000000099") +_CORRELATION_ID = UUID("01900000-0000-7000-8000-0000000000aa") + + +@dataclass +class _LenientIds: + """Conductor id_generator that never exhausts (markers double appends).""" + + def new_id(self) -> UUID: + return uuid4() + + +def _deps(store: InMemoryEventStore, *, deny: bool = False) -> Kernel: + # Generous id pool: resume + lazy logbook-open + complete/abort all draw + # from deps.id_generator (the conductor's activity rows use a lenient one). 
+ return _build_deps_shared( + ids=[uuid4() for _ in range(30)], now=_NOW, event_store=store, deny=deny + ) + + +def _make_reconduct(deps: Kernel, port: InMemoryControlPort) -> ReconductHandler: + conductor = Conductor( + control_port=port, + append_step=append_activities.bind(deps, step_store=InMemoryActivityStore()), + clock=deps.clock, + id_generator=_LenientIds(), + resume_procedure=resume_procedure.bind(deps), + complete_procedure=complete_procedure.bind(deps), + abort_procedure=abort_procedure.bind(deps), + ) + return reconduct_procedure.bind(deps, conductor=conductor) + + +async def _seed_held_with_manifest( + store: InMemoryEventStore, + *, + manifest: Sequence[Step], + procedure_id: UUID = _PROCEDURE_ID, + parent_run_id: UUID | None = None, +) -> None: + """Land a conducted-then-Held Procedure: Registered + ResolvedStepsRecorded + (the pinned manifest) + Started + Held.""" + resolved = tuple(step_to_payload(s) for s in manifest) + events = [ + ProcedureRegistered( + procedure_id=procedure_id, + name="alignment", + kind="alignment", + target_asset_ids=(), + parent_run_id=parent_run_id, + occurred_at=_PRIOR, + ), + ResolvedStepsRecorded( + procedure_id=procedure_id, + resolved_steps=resolved, + step_count=len(resolved), + occurred_at=_PRIOR, + ), + ProcedureStarted(procedure_id=procedure_id, occurred_at=_PRIOR), + ProcedureHeld(procedure_id=procedure_id, reason="beam dropped", occurred_at=_PRIOR), + ] + await store.append( + stream_type="Procedure", + stream_id=procedure_id, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(e), + payload=to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + + +async def _seed_held_run(store: InMemoryEventStore, *, run_id: UUID) -> None: + events: list[RunStarted | RunHeld] = [ + RunStarted( + run_id=run_id, name="parent", plan_id=uuid4(), subject_id=None, 
occurred_at=_PRIOR + ), + RunHeld(run_id=run_id, occurred_at=_PRIOR), + ] + await store.append( + stream_type="Run", + stream_id=run_id, + expected_version=0, + events=[ + to_new_event( + event_type=run_event_type_name(e), + payload=run_to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + + +async def _status(store: InMemoryEventStore) -> ProcedureStatus: + state = await load_procedure(store, _PROCEDURE_ID) + assert state is not None + return state.status + + +async def _call(handler: ReconductHandler, boundary: int) -> ReconductProcedureResult: + return await handler( + ReconductProcedure(procedure_id=_PROCEDURE_ID, re_establishment_boundary=boundary), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_clean_tail_resumes_then_auto_completes() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() + port.simulate_connect("2bma:a") + port.simulate_connect("2bma:b") + await _seed_held_with_manifest( + store, + manifest=( + SetpointStep(address="2bma:a", value=1.0), + SetpointStep(address="2bma:b", value=2.0), + ), + ) + deps = _deps(store) + result = await _call(_make_reconduct(deps, port), 0) + + assert result.succeeded is True + assert result.acquisition_halt is False + assert result.completed_count == 2 + assert await _status(store) is ProcedureStatus.COMPLETED + assert (await port.read("2bma:a")).value == 1.0 + assert (await port.read("2bma:b")).value == 2.0 + + +@pytest.mark.unit +async def test_boundary_replays_only_the_tail_then_completes() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() + port.simulate_connect("2bma:b") # only the tail step is re-driven + await _seed_held_with_manifest( + store, + manifest=( + SetpointStep(address="2bma:a", value=1.0), + SetpointStep(address="2bma:b", value=2.0), + ), + ) + deps = _deps(store) + result = 
await _call(_make_reconduct(deps, port), 1) + assert result.succeeded is True + assert result.completed_count == 1 + assert await _status(store) is ProcedureStatus.COMPLETED + # The prefix step (2bma:a) was never re-driven. + with pytest.raises(Exception, match="not connected"): + await port.read("2bma:a") + + +@pytest.mark.unit +async def test_acquisition_halt_resumes_but_leaves_running() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() + port.simulate_connect("2bma:a") + await _seed_held_with_manifest( + store, + manifest=( + SetpointStep(address="2bma:a", value=1.0), + ActionStep(name="collect", params={"dwell": 0.1}), + ), + ) + deps = _deps(store) + result = await _call(_make_reconduct(deps, port), 0) + + assert result.succeeded is False + assert result.acquisition_halt is True + assert result.failure is not None + assert result.failure.error_class == "AcquisitionResumeRequiresOperator" + # Resumed (Held -> Running) but NOT completed/aborted: stays Running. + assert await _status(store) is ProcedureStatus.RUNNING + events, _ = await store.load("Procedure", _PROCEDURE_ID) + types = [e.event_type for e in events] + assert "ProcedureResumed" in types + assert "ProcedureCompleted" not in types + assert "ProcedureAborted" not in types + + +@pytest.mark.unit +async def test_genuine_step_failure_resumes_then_aborts() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() # 2bma:a NOT connected -> write fails + await _seed_held_with_manifest(store, manifest=(SetpointStep(address="2bma:a", value=1.0),)) + deps = _deps(store) + result = await _call(_make_reconduct(deps, port), 0) + + assert result.succeeded is False + assert result.acquisition_halt is False + assert result.failure is not None + assert result.failure.error_class == "ControlNotConnectedError" + assert await _status(store) is ProcedureStatus.ABORTED + + +@pytest.mark.unit +async def test_raises_when_manifest_record_missing() -> None: + """A Held Procedure with no 
pinned ResolvedStepsRecorded is corruption.""" + store = InMemoryEventStore() + # Seed Held WITHOUT a ResolvedStepsRecorded. + events = [ + ProcedureRegistered( + procedure_id=_PROCEDURE_ID, + name="x", + kind="bakeout", + target_asset_ids=(), + parent_run_id=None, + occurred_at=_PRIOR, + ), + ProcedureStarted(procedure_id=_PROCEDURE_ID, occurred_at=_PRIOR), + ProcedureHeld(procedure_id=_PROCEDURE_ID, reason="paused", occurred_at=_PRIOR), + ] + await store.append( + stream_type="Procedure", + stream_id=_PROCEDURE_ID, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(e), + payload=to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + deps = _deps(store) + with pytest.raises(ResolvedStepsRecordNotFoundError): + await _call(_make_reconduct(deps, InMemoryControlPort()), 0) + + +@pytest.mark.unit +async def test_reconduct_raises_not_found_when_procedure_absent() -> None: + store = InMemoryEventStore() + deps = _deps(store) + with pytest.raises(ProcedureNotFoundError): + await _call(_make_reconduct(deps, InMemoryControlPort()), 0) + + +@pytest.mark.unit +async def test_raises_cannot_resume_when_not_held() -> None: + """A Running (not Held) Procedure with a manifest cannot be reconducted.""" + store = InMemoryEventStore() + # Registered + ResolvedStepsRecorded + Started (Running, has manifest). 
+ resolved = (step_to_payload(SetpointStep(address="2bma:a", value=1.0)),) + events = [ + ProcedureRegistered( + procedure_id=_PROCEDURE_ID, + name="x", + kind="alignment", + target_asset_ids=(), + parent_run_id=None, + occurred_at=_PRIOR, + ), + ResolvedStepsRecorded( + procedure_id=_PROCEDURE_ID, + resolved_steps=resolved, + step_count=1, + occurred_at=_PRIOR, + ), + ProcedureStarted(procedure_id=_PROCEDURE_ID, occurred_at=_PRIOR), + ] + await store.append( + stream_type="Procedure", + stream_id=_PROCEDURE_ID, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(e), + payload=to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + deps = _deps(store) + with pytest.raises(ProcedureCannotResumeError): + await _call(_make_reconduct(deps, InMemoryControlPort()), 0) + + +@pytest.mark.unit +async def test_raises_cannot_resume_when_parent_run_held() -> None: + """Off-diagonal guard: a Phase-of-Run Procedure whose parent Run is Held.""" + store = InMemoryEventStore() + parent_run_id = uuid4() + await _seed_held_run(store, run_id=parent_run_id) + await _seed_held_with_manifest( + store, + manifest=(SetpointStep(address="2bma:a", value=1.0),), + parent_run_id=parent_run_id, + ) + deps = _deps(store) + with pytest.raises(ProcedureCannotResumeError) as exc: + await _call(_make_reconduct(deps, InMemoryControlPort()), 0) + assert exc.value.parent_run_held is True + + +@pytest.mark.unit +async def test_raises_unauthorized_on_deny() -> None: + store = InMemoryEventStore() + await _seed_held_with_manifest(store, manifest=(SetpointStep(address="2bma:a", value=1.0),)) + deps = _deps(store, deny=True) + with pytest.raises(UnauthorizedError): + await _call(_make_reconduct(deps, InMemoryControlPort()), 0) From fbb9afb38416f7dc9a69d3f86c6698db8ce8c2cd Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 11:53:51 +0300 
Subject: [PATCH 07/12] fix(operation): reseed FixedIdGenerator for the pre-effect in-flight marker The Tier-1 pre-effect in-flight marker records a separate in_flight activity entry before each side-effecting step (setpoint / action), which consumes one extra event id per such step. The softioc+postgres integration tests seed a FixedIdGenerator with an exact id list and were not updated, so they exhausted mid-conduct; in the abort test the swallowed best-effort abort then never appended ProcedureAborted. Add one marker id per side-effecting step, ordered before its outcome so the ORDER BY event_id activity assertions still hold. Co-Authored-By: Claude Opus 4.8 --- ...t_acquisitions_against_softioc_postgres.py | 18 ++++++++---- ...test_conductor_against_softioc_postgres.py | 28 +++++++++++-------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py b/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py index a860a94895..607acd7007 100644 --- a/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py +++ b/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py @@ -121,8 +121,9 @@ async def test_conductor_runs_collect_action_against_real_softioc_and_postgres( started_event_id = UUID("01900000-0000-7000-8000-0000020d0101") logbook_id = UUID("01900000-0000-7000-8000-0000020d0102") open_event_id = UUID("01900000-0000-7000-8000-0000020d0103") - collect_step_id = UUID("01900000-0000-7000-8000-0000020d0104") - completed_event_id = UUID("01900000-0000-7000-8000-0000020d0105") + collect_marker_id = UUID("01900000-0000-7000-8000-0000020d0104") + collect_step_id = UUID("01900000-0000-7000-8000-0000020d0105") + completed_event_id = UUID("01900000-0000-7000-8000-0000020d0106") deps = build_postgres_deps( db_pool, @@ -131,6 +132,7 @@ async def test_conductor_runs_collect_action_against_real_softioc_and_postgres( started_event_id, logbook_id, 
open_event_id, + collect_marker_id, collect_step_id, completed_event_id, ], @@ -211,8 +213,9 @@ async def test_conductor_runs_discrete_action_walks_axis_with_per_point_collects started_event_id = UUID("01900000-0000-7000-8000-0000020d0201") logbook_id = UUID("01900000-0000-7000-8000-0000020d0202") open_event_id = UUID("01900000-0000-7000-8000-0000020d0203") - discrete_step_id = UUID("01900000-0000-7000-8000-0000020d0204") - completed_event_id = UUID("01900000-0000-7000-8000-0000020d0205") + discrete_marker_id = UUID("01900000-0000-7000-8000-0000020d0204") + discrete_step_id = UUID("01900000-0000-7000-8000-0000020d0205") + completed_event_id = UUID("01900000-0000-7000-8000-0000020d0206") deps = build_postgres_deps( db_pool, @@ -221,6 +224,7 @@ async def test_conductor_runs_discrete_action_walks_axis_with_per_point_collects started_event_id, logbook_id, open_event_id, + discrete_marker_id, discrete_step_id, completed_event_id, ], @@ -296,8 +300,9 @@ async def test_conductor_runs_continuous_action_with_axis_sweep_against_softioc( started_event_id = UUID("01900000-0000-7000-8000-0000020d0301") logbook_id = UUID("01900000-0000-7000-8000-0000020d0302") open_event_id = UUID("01900000-0000-7000-8000-0000020d0303") - continuous_step_id = UUID("01900000-0000-7000-8000-0000020d0304") - completed_event_id = UUID("01900000-0000-7000-8000-0000020d0305") + continuous_marker_id = UUID("01900000-0000-7000-8000-0000020d0304") + continuous_step_id = UUID("01900000-0000-7000-8000-0000020d0305") + completed_event_id = UUID("01900000-0000-7000-8000-0000020d0306") deps = build_postgres_deps( db_pool, @@ -306,6 +311,7 @@ async def test_conductor_runs_continuous_action_with_axis_sweep_against_softioc( started_event_id, logbook_id, open_event_id, + continuous_marker_id, continuous_step_id, completed_event_id, ], diff --git a/apps/api/tests/integration/test_conductor_against_softioc_postgres.py b/apps/api/tests/integration/test_conductor_against_softioc_postgres.py index 
4ded95ad05..83b191d2b4 100644 --- a/apps/api/tests/integration/test_conductor_against_softioc_postgres.py +++ b/apps/api/tests/integration/test_conductor_against_softioc_postgres.py @@ -104,9 +104,10 @@ async def test_conductor_runs_setpoint_check_against_real_softioc_and_postgres( logbook_id = UUID("01900000-0000-7000-8000-0000020c0101") open_event_id = UUID("01900000-0000-7000-8000-0000020c0102") started_event_id = UUID("01900000-0000-7000-8000-0000020c0103") - setpoint_step_id = UUID("01900000-0000-7000-8000-0000020c0104") - check_step_id = UUID("01900000-0000-7000-8000-0000020c0105") - completed_event_id = UUID("01900000-0000-7000-8000-0000020c0106") + setpoint_marker_id = UUID("01900000-0000-7000-8000-0000020c0104") + setpoint_outcome_id = UUID("01900000-0000-7000-8000-0000020c0105") + check_step_id = UUID("01900000-0000-7000-8000-0000020c0106") + completed_event_id = UUID("01900000-0000-7000-8000-0000020c0107") deps = build_postgres_deps( db_pool, @@ -115,7 +116,8 @@ async def test_conductor_runs_setpoint_check_against_real_softioc_and_postgres( started_event_id, logbook_id, open_event_id, - setpoint_step_id, + setpoint_marker_id, + setpoint_outcome_id, check_step_id, completed_event_id, ], @@ -217,8 +219,9 @@ async def test_conductor_aborts_procedure_when_setpoint_fails_against_softioc( started_event_id = UUID("01900000-0000-7000-8000-0000020c0201") logbook_id = UUID("01900000-0000-7000-8000-0000020c0202") open_event_id = UUID("01900000-0000-7000-8000-0000020c0203") - setpoint_step_id = UUID("01900000-0000-7000-8000-0000020c0204") - aborted_event_id = UUID("01900000-0000-7000-8000-0000020c0205") + setpoint_marker_id = UUID("01900000-0000-7000-8000-0000020c0204") + setpoint_outcome_id = UUID("01900000-0000-7000-8000-0000020c0205") + aborted_event_id = UUID("01900000-0000-7000-8000-0000020c0206") deps = build_postgres_deps( db_pool, @@ -227,7 +230,8 @@ async def test_conductor_aborts_procedure_when_setpoint_fails_against_softioc( started_event_id, logbook_id, 
open_event_id, - setpoint_step_id, + setpoint_marker_id, + setpoint_outcome_id, aborted_event_id, ], ) @@ -284,9 +288,10 @@ async def test_conductor_completes_procedure_with_equals_check_against_softioc( started_event_id = UUID("01900000-0000-7000-8000-0000020c0301") logbook_id = UUID("01900000-0000-7000-8000-0000020c0302") open_event_id = UUID("01900000-0000-7000-8000-0000020c0303") - setpoint_step_id = UUID("01900000-0000-7000-8000-0000020c0304") - check_step_id = UUID("01900000-0000-7000-8000-0000020c0305") - completed_event_id = UUID("01900000-0000-7000-8000-0000020c0306") + setpoint_marker_id = UUID("01900000-0000-7000-8000-0000020c0304") + setpoint_outcome_id = UUID("01900000-0000-7000-8000-0000020c0305") + check_step_id = UUID("01900000-0000-7000-8000-0000020c0306") + completed_event_id = UUID("01900000-0000-7000-8000-0000020c0307") deps = build_postgres_deps( db_pool, @@ -295,7 +300,8 @@ async def test_conductor_completes_procedure_with_equals_check_against_softioc( started_event_id, logbook_id, open_event_id, - setpoint_step_id, + setpoint_marker_id, + setpoint_outcome_id, check_step_id, completed_event_id, ], From a9a6e5ba22cb208c07e748c0113a82e0af9497e8 Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 11:55:08 +0300 Subject: [PATCH 08/12] refactor(operation): retire "manifest" for the resolved step list (use steps) "manifest" was vestigial vocabulary. The event is already ResolvedStepsRecorded with field resolved_steps, but the Conductor parameter, a helper, two modules, and prose still said "manifest", a noun that also collides with the federation and data BCs' own manifests. 
Reuse the already-locked steps / resolved-steps / payload family instead of the orphaned noun: - execute_from(manifest=) / reconduct(manifest=) -> steps= (matches the sibling execute(steps=); closes a same-data, different-name asymmetry) - steps_from_manifest() -> steps_from_payload() (joins the existing step_to_payload / _step_from_payload serialize pair) - conduct_procedure/manifest.py -> resolved_steps.py - _manifest_replay.py -> _resolved_steps_replay.py (mirrors the _recipe_replay.py sibling: named after its event subject) - prose / Field descriptions: "conduct manifest" / "pinned manifest" -> "resolved steps" / "pinned step list"; openapi snapshot regenerated. No behavior change. naming-r3 review PASS. Co-Authored-By: Claude Opus 4.8 --- apps/api/openapi.json | 8 +-- ...st_replay.py => _resolved_steps_replay.py} | 10 +-- .../operation/aggregates/procedure/events.py | 6 +- .../operation/aggregates/procedure/state.py | 12 ++-- apps/api/src/cora/operation/conductor.py | 48 ++++++------- .../features/conduct_procedure/handler.py | 8 +-- .../{manifest.py => resolved_steps.py} | 4 +- .../features/reconduct_procedure/__init__.py | 2 +- .../features/reconduct_procedure/command.py | 4 +- .../features/reconduct_procedure/handler.py | 24 +++---- .../features/reconduct_procedure/route.py | 10 +-- .../features/reconduct_procedure/tool.py | 6 +- .../features/resume_procedure/command.py | 2 +- .../features/resume_procedure/route.py | 2 +- .../features/resume_procedure/tool.py | 4 +- apps/api/src/cora/operation/routes.py | 2 +- .../test_reconduct_procedure_endpoint.py | 18 ++--- .../test_conduct_procedure_handler.py | 10 +-- .../operation/test_conductor_execute_from.py | 72 +++++++++---------- .../test_reconduct_procedure_handler.py | 36 +++++----- .../operation/test_record_resolved_steps.py | 8 +-- 21 files changed, 148 insertions(+), 148 deletions(-) rename apps/api/src/cora/operation/{_manifest_replay.py => _resolved_steps_replay.py} (80%) rename 
apps/api/src/cora/operation/features/conduct_procedure/{manifest.py => resolved_steps.py} (92%) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index f37dded7af..4542d45d05 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -10288,7 +10288,7 @@ "description": "Body for `POST /procedures/{procedure_id}/reconduct`.", "properties": { "re_establishment_boundary": { - "description": "Index in the pinned conduct manifest from which the resume re-drives setpoints and re-runs checks. >= 0 (0 = re-establish from the first step). NOT a continuity proof.", + "description": "Index in the pinned resolved step list from which the resume re-drives setpoints and re-runs checks. >= 0 (0 = re-establish from the first step). NOT a continuity proof.", "minimum": 0.0, "title": "Re Establishment Boundary", "type": "integer" @@ -12541,7 +12541,7 @@ "description": "Body for `POST /procedures/{procedure_id}/resume`.", "properties": { "re_establishment_boundary": { - "description": "Index in the pinned conduct manifest from which the resume re-drives setpoints and re-runs checks. >= 0 (0 = re-establish from the first step). NOT a continuity proof.", + "description": "Index in the pinned resolved step list from which the resume re-drives setpoints and re-runs checks. >= 0 (0 = re-establish from the first step). NOT a continuity proof.", "minimum": 0.0, "title": "Re Establishment Boundary", "type": "integer" @@ -37204,10 +37204,10 @@ } } }, - "description": "Held Procedure is missing its pinned manifest (corruption)." + "description": "Held Procedure is missing its pinned resolved steps (corruption)." 
} }, - "summary": "Resume a held Procedure and replay its pinned manifest tail (Held -> Running)", + "summary": "Resume a held Procedure and replay its pinned step-list tail (Held -> Running)", "tags": [ "operation" ] diff --git a/apps/api/src/cora/operation/_manifest_replay.py b/apps/api/src/cora/operation/_resolved_steps_replay.py similarity index 80% rename from apps/api/src/cora/operation/_manifest_replay.py rename to apps/api/src/cora/operation/_resolved_steps_replay.py index c4199a2197..77f6aa8b2a 100644 --- a/apps/api/src/cora/operation/_manifest_replay.py +++ b/apps/api/src/cora/operation/_resolved_steps_replay.py @@ -1,11 +1,11 @@ -"""Conduct-manifest replay helper for the `reconduct_procedure` handler. +"""Resolved-steps replay helper for the `reconduct_procedure` handler. -The resume path replays a halted conduct from a PINNED manifest rather +The resume path replays a halted conduct from PINNED resolved steps rather than re-deriving the step list. This module locates the `ResolvedStepsRecorded` provenance event (pinned once at conduct start by -`conduct_procedure/handler.py` + `manifest.py`) in a Procedure stream so +`conduct_procedure/handler.py` + `resolved_steps.py`) in a Procedure stream so the handler can parse `resolved_steps` back into `Step`s via -`conductor.steps_from_manifest` and hand them to `Conductor.execute_from`. +`conductor.steps_from_payload` and hand them to `Conductor.execute_from`. Sibling of `_recipe_replay.find_recipe_expansion_record` (the recipe genesis provenance finder), kept separate because that module's tuple of @@ -33,7 +33,7 @@ def find_resolved_steps_record( Returns `None` when no match. The caller decides whether None is an error: the `reconduct_procedure` handler raises `ResolvedStepsRecordNotFoundError` (a Held Procedure missing its pinned - manifest is corruption, not an operational outcome). + resolved steps is corruption, not an operational outcome). 
""" for event in stored_events: if event.event_type == "ResolvedStepsRecorded": diff --git a/apps/api/src/cora/operation/aggregates/procedure/events.py b/apps/api/src/cora/operation/aggregates/procedure/events.py index b643f03e11..397a1c251f 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/events.py +++ b/apps/api/src/cora/operation/aggregates/procedure/events.py @@ -364,11 +364,11 @@ class ProcedureResumed: Inverse of `ProcedureHeld`. Mirrors `RunResumed`. Hold <-> Resume is bidirectional and unlimited-cycle within one conduct. - `re_establishment_boundary` is the index in the pinned conduct - manifest from which resume re-drives setpoints + re-runs checks (NOT + `re_establishment_boundary` is the index in the pinned resolved + step list from which resume re-drives setpoints + re-runs checks (NOT a continuity proof; the pre-effect in-flight marker is the only continuity fact the aggregate owns). It is `>= 0`; the Conductor's - `execute_from` consumes it to replay the pinned manifest tail. Per + `execute_from` consumes it to replay the pinned step-list tail. Per [[project_resumable_conduct_design]] the field is the re-establishment boundary, deliberately NOT a "verified continuity" claim. diff --git a/apps/api/src/cora/operation/aggregates/procedure/state.py b/apps/api/src/cora/operation/aggregates/procedure/state.py index 21d28593a1..dab6bdc831 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/state.py +++ b/apps/api/src/cora/operation/aggregates/procedure/state.py @@ -475,7 +475,7 @@ def __init__(self, procedure_id: UUID) -> None: class ResolvedStepsRecordNotFoundError(Exception): - """A Held Procedure cannot locate its pinned `ResolvedStepsRecorded` manifest. + """A Held Procedure cannot locate its pinned `ResolvedStepsRecorded` record. Raised by the `reconduct_procedure` (resume-and-replay) handler when a Held Procedure's stream carries no `ResolvedStepsRecorded` event. 
A @@ -490,7 +490,7 @@ class ResolvedStepsRecordNotFoundError(Exception): def __init__(self, procedure_id: UUID) -> None: super().__init__( f"Procedure {procedure_id} is Held but its pinned " - f"ResolvedStepsRecorded manifest could not be located; resume " + f"ResolvedStepsRecorded record could not be located; resume " f"replay cannot proceed." ) self.procedure_id = procedure_id @@ -1180,13 +1180,13 @@ def __init__(self, value: str) -> None: class InvalidProcedureReEstablishmentBoundaryError(ValueError): """The supplied resume re-establishment boundary is negative. - `re_establishment_boundary` is the index in the pinned conduct - manifest from which resume re-drives setpoints + re-runs checks. It + `re_establishment_boundary` is the index in the pinned resolved + step list from which resume re-drives setpoints + re-runs checks. It must be >= 0 (a step position; 0 means re-establish from the very first step). Validated at the API boundary via Pydantic `ge=0` AND defensively at the `resume_procedure` decider. The upper bound - (boundary vs manifest length) is enforced by the Conductor's - `execute_from` replay, not the decider (the manifest is not folded + (boundary vs step-list length) is enforced by the Conductor's + `execute_from` replay, not the decider (the step list is not folded into Procedure state). Mapped to HTTP 400. See [[project_resumable_conduct_design]]. """ diff --git a/apps/api/src/cora/operation/conductor.py b/apps/api/src/cora/operation/conductor.py index 08bdc5a0e4..0fff402a05 100644 --- a/apps/api/src/cora/operation/conductor.py +++ b/apps/api/src/cora/operation/conductor.py @@ -39,12 +39,12 @@ ## Resume (execute_from) -`execute_from` replays a PINNED conduct manifest from a re-establishment +`execute_from` replays the PINNED resolved step list from a re-establishment boundary rather than re-deriving the step list: re-drive setpoints, re-run checks as fresh gates, and halt-for-operator on an acquisition (`ActionStep`). 
It is the Tier-1 resumable-conduct primitive -([[project_resumable_conduct_design]]); the manifest comes from -`ResolvedStepsRecorded` parsed via `steps_from_manifest`. Like `execute` +([[project_resumable_conduct_design]]); the step list comes from +`ResolvedStepsRecorded` parsed via `steps_from_payload`. Like `execute` it drives no FSM transition. ## Pre-effect in-flight marker (side-effecting steps) @@ -251,7 +251,7 @@ class ResumePolicy(StrEnum): - """How `execute_from` re-establishes state while replaying a manifest tail. + """How `execute_from` re-establishes state while replaying a step-list tail. `RE_ESTABLISH` (the only member today): re-drive setpoints (absolute writes are idempotent; CORA has no relative-setpoint type), re-run @@ -649,24 +649,24 @@ async def execute_from( procedure_id: UUID, principal_id: UUID, correlation_id: UUID, - manifest: Sequence[Step], + steps: Sequence[Step], boundary: int, policy: ResumePolicy = ResumePolicy.RE_ESTABLISH, causation_id: UUID | None = None, surface_id: UUID = NIL_SENTINEL_ID, ) -> ConductorResult: - """Resume a halted conduct by REPLAYING a pinned manifest from `boundary`. + """Resume a halted conduct by REPLAYING the pinned resolved steps from `boundary`. - `manifest` is the FINAL resolved step list pinned on + `steps` is the FINAL resolved step list pinned on `ResolvedStepsRecorded` at first conduct (parse the event's - `resolved_steps` back via `steps_from_manifest`). Resume NEVER + `resolved_steps` back via `steps_from_payload`). Resume NEVER re-derives the step list -- a re-derived list could silently skip or mis-target a step (the end-of-run "home to 0" aliasing the start-of-run "home to 0" after an index shift). It replays - `manifest[boundary:]` verbatim: + `steps[boundary:]` verbatim: - `SetpointStep` -> RE-DRIVE (idempotent absolute write). 
The - recorded `step_index` is the ABSOLUTE manifest position, so the + recorded `step_index` is the ABSOLUTE position in the step list, so the replayed journal lines up with the original conduct. - `CheckStep` -> RE-RUN as a fresh gate (a passing check proves "now", not "continuously", so it is re-evaluated). @@ -678,7 +678,7 @@ async def execute_from( `boundary` is the re-establishment boundary from `ProcedureResumed`: the index from which re-drive + re-run resumes. `boundary >= - len(manifest)` replays an empty tail (a no-op resume). Like + len(steps)` replays an empty tail (a no-op resume). Like `execute`, this drives no FSM transition; it walks + records. See [[project_resumable_conduct_design]] Tier 1. @@ -698,8 +698,8 @@ async def execute_from( ) observer = _ActuationObserver(self._control_port) completed = 0 - for index in range(boundary, len(manifest)): - step = manifest[index] + for index in range(boundary, len(steps)): + step = steps[index] if isinstance(step, ActionStep): # Halt-for-operator: do not re-run an interrupted acquisition. return ConductorResult( @@ -901,12 +901,12 @@ async def reconduct( procedure_id: UUID, principal_id: UUID, correlation_id: UUID, - manifest: Sequence[Step], + steps: Sequence[Step], boundary: int, causation_id: UUID | None = None, surface_id: UUID = NIL_SENTINEL_ID, ) -> ConductorResult: - """Resume a Held Procedure and REPLAY its pinned manifest from `boundary`. + """Resume a Held Procedure and REPLAY its pinned resolved steps from `boundary`. The resume twin of `conduct()`: where `conduct()` drives start -> execute -> complete | abort, this drives @@ -918,7 +918,7 @@ async def reconduct( which PROPAGATES (mapped to 409 at the route) rather than landing in the result body. A refused resume is a guard outcome, not a replay outcome, and no replay has happened yet. - 2. Call `self.execute_from(manifest, boundary)`: re-drive setpoints, + 2. 
Call `self.execute_from(steps, boundary)`: re-drive setpoints, re-run checks, halt-for-operator on an acquisition. 3. Terminalize three-way: - clean tail (incl. empty) -> `complete_procedure` (Completed). @@ -928,7 +928,7 @@ async def reconduct( abort itself fails, the original step failure is what surfaces, mirroring `conduct()`). - `manifest` is the parsed `ResolvedStepsRecorded.resolved_steps`: the + `steps` is the parsed `ResolvedStepsRecorded.resolved_steps`: the caller locates + parses the PINNED record (resume NEVER re-derives the step list). `boundary` is single-sourced: it rides into both `ProcedureResumed.re_establishment_boundary` (audit) and @@ -970,7 +970,7 @@ async def reconduct( procedure_id=procedure_id, principal_id=principal_id, correlation_id=correlation_id, - manifest=manifest, + steps=steps, boundary=boundary, causation_id=causation_id, surface_id=surface_id, @@ -1281,7 +1281,7 @@ async def _record( `index` is the step's zero-based position in the conducted step list; it rides the payload as `step_index` so a future resume can map a recorded outcome back to its position in the pinned - conduct manifest. + resolved step list. """ payload: dict[str, Any] = {**body, "step_index": index, "result": result} if error_class is not None: @@ -1332,11 +1332,11 @@ def _criterion_to_dict(criterion: CheckCriterion) -> dict[str, Any]: def step_to_payload(step: Step) -> dict[str, Any]: - """Serialize a `Step` to a JSON-clean dict (inverse of `steps_from_manifest`). + """Serialize a `Step` to a JSON-clean dict (inverse of `steps_from_payload`). Mirrors the conduct route's wire shape (the `kind` discriminator + field names) so the resolved step list pinned on `ResolvedStepsRecorded` - round-trips back to `Step` objects via `steps_from_manifest` at resume + round-trips back to `Step` objects via `steps_from_payload` at resume (and via the route's Pydantic `step_from_wire` on the live HTTP path). 
A tuple `value` serializes as a list (JSON has no tuple); the criterion reuses `_criterion_to_dict` so the wire shape stays single-sourced. @@ -1394,11 +1394,11 @@ def _step_from_payload(payload: Mapping[str, Any]) -> Step: raise ValueError(msg) -def steps_from_manifest(resolved_steps: Sequence[Mapping[str, Any]]) -> tuple[Step, ...]: +def steps_from_payload(resolved_steps: Sequence[Mapping[str, Any]]) -> tuple[Step, ...]: """Parse the pinned `ResolvedStepsRecorded.resolved_steps` back into `Step`s. The exact inverse of `step_to_payload` (the serialization used to pin the - conduct manifest). A resume reads the pinned event's `resolved_steps`, + resolved step list). A resume reads the pinned event's `resolved_steps`, parses them with this helper, and hands the result to `Conductor.execute_from` -- it NEVER re-derives the step list from live `Plan.wires` / partition rules. Pure; no Pydantic (that lives at the HTTP @@ -1494,5 +1494,5 @@ def _reading_to_dict(reading: Reading) -> dict[str, Any]: "WithinToleranceCriterion", "is_acquisition_halt", "step_to_payload", - "steps_from_manifest", + "steps_from_payload", ] diff --git a/apps/api/src/cora/operation/features/conduct_procedure/handler.py b/apps/api/src/cora/operation/features/conduct_procedure/handler.py index 8392c3237a..58568d5277 100644 --- a/apps/api/src/cora/operation/features/conduct_procedure/handler.py +++ b/apps/api/src/cora/operation/features/conduct_procedure/handler.py @@ -74,7 +74,7 @@ ConductProcedure, ConductProcedureResult, ) -from cora.operation.features.conduct_procedure.manifest import ( +from cora.operation.features.conduct_procedure.resolved_steps import ( decide_resolved_steps_recorded, ) from cora.operation.ports.recipe_expander import RecipeExpander @@ -230,12 +230,12 @@ def _resolve_constituents(asset_id: UUID) -> tuple[UUID, ...]: # while the Procedure is still Defined and returns [] otherwise, # leaving the Conductor's start_procedure to surface a lifecycle # failure (keeps the conduct 
route's failures-in-body contract). - manifest_events = decide_resolved_steps_recorded( + resolved_steps_events = decide_resolved_steps_recorded( procedure, tuple(step_to_payload(step) for step in steps), now=deps.clock.now(), ) - if manifest_events: + if resolved_steps_events: _, current_version = await deps.event_store.load( stream_type="Procedure", stream_id=command.procedure_id ) @@ -254,7 +254,7 @@ def _resolve_constituents(asset_id: UUID) -> tuple[UUID, ...]: causation_id=causation_id, principal_id=principal_id, ) - for event in manifest_events + for event in resolved_steps_events ], ) diff --git a/apps/api/src/cora/operation/features/conduct_procedure/manifest.py b/apps/api/src/cora/operation/features/conduct_procedure/resolved_steps.py similarity index 92% rename from apps/api/src/cora/operation/features/conduct_procedure/manifest.py rename to apps/api/src/cora/operation/features/conduct_procedure/resolved_steps.py index 11e95f6ee0..53476f96d8 100644 --- a/apps/api/src/cora/operation/features/conduct_procedure/manifest.py +++ b/apps/api/src/cora/operation/features/conduct_procedure/resolved_steps.py @@ -3,7 +3,7 @@ The `conduct_procedure` orchestration handler calls this AFTER it has resolved the final step list (recipe re-expansion + pseudoaxis + constituent resolution) and BEFORE handing the list to the Conductor, so -every conduct pins its manifest before any step executes. +every conduct pins its resolved steps before any step executes. Emitted inline from the conduct flow rather than via a dedicated command slice: `ResolvedStepsRecorded` is an internal provenance event with no @@ -34,7 +34,7 @@ def decide_resolved_steps_recorded( (the normal conduct path, before `start_procedure` transitions it to `Running`). 
Returns `[]` when `state` is None or not `Defined`: a conduct of a missing / already-running / terminal Procedure records no - manifest and lets the Conductor's `start_procedure` produce the normal + resolved steps and lets the Conductor's `start_procedure` produce the normal lifecycle failure, preserving the conduct route's failures-in-body contract instead of raising a fresh HTTP error here. """ diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/__init__.py b/apps/api/src/cora/operation/features/reconduct_procedure/__init__.py index eec770fe7c..13dd8f4493 100644 --- a/apps/api/src/cora/operation/features/reconduct_procedure/__init__.py +++ b/apps/api/src/cora/operation/features/reconduct_procedure/__init__.py @@ -2,7 +2,7 @@ Operator-facing resume-and-replay orchestration: resumes a Held Procedure and hands control to the `Conductor` runtime, which replays the -pinned manifest tail from the re-establishment boundary (re-drive +pinned step-list tail from the re-establishment boundary (re-drive setpoints, re-run checks, halt-for-operator on an acquisition), then auto-completes on a clean tail / aborts on a step failure / leaves Running on an acquisition halt. Returns a structured diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/command.py b/apps/api/src/cora/operation/features/reconduct_procedure/command.py index 4850623bf1..4b228e565d 100644 --- a/apps/api/src/cora/operation/features/reconduct_procedure/command.py +++ b/apps/api/src/cora/operation/features/reconduct_procedure/command.py @@ -1,7 +1,7 @@ """The `ReconductProcedure` command + result -- intent dataclass for this slice. Resume-and-replay orchestration: resume a `Held` Procedure and replay its -PINNED manifest tail from the re-establishment boundary (Tier 1 of +PINNED step-list tail from the re-establishment boundary (Tier 1 of [[project_resumable_conduct_design]]). 
Mirrors `ConductProcedure` (the conduct orchestration) but for the resume path; carries the `re_establishment_boundary` (single-sourced -- it rides into both @@ -16,7 +16,7 @@ @dataclass(frozen=True) class ReconductProcedure: - """Resume a held Procedure and replay its pinned manifest tail.""" + """Resume a held Procedure and replay its pinned step-list tail.""" procedure_id: UUID re_establishment_boundary: int diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/handler.py b/apps/api/src/cora/operation/features/reconduct_procedure/handler.py index 335d0371d1..ab0bb6c2ba 100644 --- a/apps/api/src/cora/operation/features/reconduct_procedure/handler.py +++ b/apps/api/src/cora/operation/features/reconduct_procedure/handler.py @@ -1,7 +1,7 @@ """Application handler for the `reconduct_procedure` slice. Resume-and-replay orchestration. Mirrors `conduct_procedure`: a thin -slice handler that authz-checks + loads + locates the pinned manifest, +slice handler that authz-checks + loads + locates the pinned resolved steps, then delegates the resume + replay + terminalize composition to `Conductor.reconduct` (the resume twin of `Conductor.conduct`). No `decider.py`: like `conduct_procedure` this is an orchestration entry @@ -19,14 +19,14 @@ 1. authz `ReconductProcedure`. 2. load the Procedure + its raw events. 3. status guard FIRST: a non-Held Procedure is a `ProcedureCannotResumeError` - (409), raised BEFORE the manifest lookup so a Defined / Completed + (409), raised BEFORE the step-list lookup so a Defined / Completed Procedure is never a misleading 500 and no resume-then-fail partial state can occur. 4. locate the PINNED `ResolvedStepsRecorded` (a conducted, Held Procedure ALWAYS has exactly one; its absence is corruption -> `ResolvedStepsRecordNotFoundError`, 500) and parse it back into `Step`s - via `steps_from_manifest` -- resume NEVER re-derives the step list. - 5. 
`Conductor.reconduct(manifest, boundary)`: resume (Held -> Running, with + via `steps_from_payload` -- resume NEVER re-derives the step list. + 5. `Conductor.reconduct(steps, boundary)`: resume (Held -> Running, with its own authz + off-diagonal parent-Run-Held guard) -> `execute_from` (re-drive setpoints, re-run checks, halt-for-operator on an acquisition) -> terminalize (complete on a clean tail / leave Running on an @@ -55,7 +55,7 @@ from cora.infrastructure.logging import get_logger from cora.infrastructure.ports import Deny from cora.infrastructure.routing import NIL_SENTINEL_ID -from cora.operation._manifest_replay import find_resolved_steps_record +from cora.operation._resolved_steps_replay import find_resolved_steps_record from cora.operation.aggregates.procedure import ( ProcedureCannotResumeError, ProcedureNotFoundError, @@ -63,7 +63,7 @@ ResolvedStepsRecordNotFoundError, load_procedure_with_events, ) -from cora.operation.conductor import Conductor, is_acquisition_halt, steps_from_manifest +from cora.operation.conductor import Conductor, is_acquisition_halt, steps_from_payload from cora.operation.errors import UnauthorizedError from cora.operation.features.reconduct_procedure.command import ( ReconductProcedure, @@ -142,26 +142,26 @@ async def handler( # Status guard FIRST (mirrors resume's `{Held}` source set): a # non-Held Procedure is a 409, not a 500. This keeps the - # manifest-missing case below as genuine corruption (a conducted, - # Held Procedure ALWAYS has its pinned manifest) and avoids resuming - # then failing to find the manifest. The off-diagonal parent-Run-Held + # missing-record case below as genuine corruption (a conducted, + # Held Procedure ALWAYS has its pinned resolved steps) and avoids resuming + # then failing to find them. The off-diagonal parent-Run-Held # guard stays inside Conductor.reconduct's resume call. 
if procedure.status is not ProcedureStatus.HELD: raise ProcedureCannotResumeError(command.procedure_id, current_status=procedure.status) - # Replay the PINNED manifest, never re-derive. A Held Procedure that + # Replay the PINNED resolved steps, never re-derive. A Held Procedure that # was conducted always has exactly one ResolvedStepsRecorded; its # absence here is corruption (500), not an operational outcome. record = find_resolved_steps_record(stored_events) if record is None: raise ResolvedStepsRecordNotFoundError(command.procedure_id) - manifest = steps_from_manifest(record.payload["resolved_steps"]) + steps = steps_from_payload(record.payload["resolved_steps"]) result = await conductor.reconduct( procedure_id=command.procedure_id, principal_id=principal_id, correlation_id=correlation_id, - manifest=manifest, + steps=steps, boundary=command.re_establishment_boundary, causation_id=causation_id, surface_id=surface_id, diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/route.py b/apps/api/src/cora/operation/features/reconduct_procedure/route.py index 5ce9182ab7..f343b732f9 100644 --- a/apps/api/src/cora/operation/features/reconduct_procedure/route.py +++ b/apps/api/src/cora/operation/features/reconduct_procedure/route.py @@ -1,7 +1,7 @@ """HTTP route for the `reconduct_procedure` slice. `POST /procedures/{procedure_id}/reconduct` resumes a Held Procedure and -replays its pinned manifest tail from `re_establishment_boundary`. +replays its pinned step-list tail from `re_establishment_boundary`. ## Response code: 200, replay outcomes in body @@ -11,7 +11,7 @@ guard faults map to HTTP codes: 403 (authz deny), 404 (no procedure), 409 (Procedure not Held, or parent Run Held -- from the resume guard), 422 (negative boundary / malformed id), 500 (Held Procedure missing its -pinned manifest -- corruption). +pinned resolved steps -- corruption). 
""" from typing import Annotated @@ -41,7 +41,7 @@ class ReconductProcedureRequest(BaseModel): ..., ge=0, description=( - "Index in the pinned conduct manifest from which the resume " + "Index in the pinned resolved step list from which the resume " "re-drives setpoints and re-runs checks. >= 0 (0 = re-establish " "from the first step). NOT a continuity proof." ), @@ -137,10 +137,10 @@ def _get_handler(request: Request) -> Handler: }, status.HTTP_500_INTERNAL_SERVER_ERROR: { "model": ErrorResponse, - "description": "Held Procedure is missing its pinned manifest (corruption).", + "description": "Held Procedure is missing its pinned resolved steps (corruption).", }, }, - summary="Resume a held Procedure and replay its pinned manifest tail (Held -> Running)", + summary="Resume a held Procedure and replay its pinned step-list tail (Held -> Running)", ) async def post_procedures_reconduct( procedure_id: Annotated[UUID, Path(description="Target procedure's id.")], diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/tool.py b/apps/api/src/cora/operation/features/reconduct_procedure/tool.py index cc10b6a947..680ee07468 100644 --- a/apps/api/src/cora/operation/features/reconduct_procedure/tool.py +++ b/apps/api/src/cora/operation/features/reconduct_procedure/tool.py @@ -1,7 +1,7 @@ """MCP tool for the `reconduct_procedure` slice. Mirrors the REST route: resumes a Held Procedure and replays its pinned -manifest tail, returning a structured summary. Replay outcomes (a step +step-list tail, returning a structured summary. Replay outcomes (a step failure, an acquisition halt) land in the return value, not raised; the LLM caller inspects `succeeded` / `acquisition_halt` / `failure`. 
""" @@ -42,7 +42,7 @@ def register(mcp: FastMCP, *, get_handler: Callable[[], Handler]) -> None: @mcp.tool( name="reconduct_procedure", description=( - "Resume a Held Procedure and replay its pinned manifest tail from " + "Resume a Held Procedure and replay its pinned step-list tail from " "re_establishment_boundary: re-drive setpoints, re-run checks, and " "HALT for an operator decision at an acquisition. On a clean tail " "the Procedure auto-completes; on an acquisition halt it stays " @@ -63,7 +63,7 @@ async def reconduct_procedure_tool( # pyright: ignore[reportUnusedFunction] Field( ge=0, description=( - "Manifest index the resume re-drives setpoints / re-runs " + "Step-list index the resume re-drives setpoints / re-runs " "checks from (>= 0; 0 = from the first step)." ), ), diff --git a/apps/api/src/cora/operation/features/resume_procedure/command.py b/apps/api/src/cora/operation/features/resume_procedure/command.py index a9e8996470..a65c713911 100644 --- a/apps/api/src/cora/operation/features/resume_procedure/command.py +++ b/apps/api/src/cora/operation/features/resume_procedure/command.py @@ -2,7 +2,7 @@ Single-source resume transition: `Held -> Running`. The inverse of hold_procedure. Carries `re_establishment_boundary`: the index in the -pinned conduct manifest from which a resume re-drives setpoints and +pinned resolved step list from which a resume re-drives setpoints and re-runs checks (Tier 1 of [[project_resumable_conduct_design]]). It is NOT a continuity proof; it is the re-establishment boundary the Conductor's `execute_from` replays from. 
diff --git a/apps/api/src/cora/operation/features/resume_procedure/route.py b/apps/api/src/cora/operation/features/resume_procedure/route.py index 52c9286c02..08e977fa45 100644 --- a/apps/api/src/cora/operation/features/resume_procedure/route.py +++ b/apps/api/src/cora/operation/features/resume_procedure/route.py @@ -27,7 +27,7 @@ class ResumeProcedureRequest(BaseModel): ..., ge=0, description=( - "Index in the pinned conduct manifest from which the resume " + "Index in the pinned resolved step list from which the resume " "re-drives setpoints and re-runs checks. >= 0 (0 = re-establish " "from the first step). NOT a continuity proof." ), diff --git a/apps/api/src/cora/operation/features/resume_procedure/tool.py b/apps/api/src/cora/operation/features/resume_procedure/tool.py index f80fb11366..94a4acc553 100644 --- a/apps/api/src/cora/operation/features/resume_procedure/tool.py +++ b/apps/api/src/cora/operation/features/resume_procedure/tool.py @@ -23,7 +23,7 @@ def register(mcp: FastMCP, *, get_handler: Callable[[], Handler]) -> None: "Resume a held Procedure conduct (Held -> Running). The inverse of " "hold_procedure. Requires the Procedure to currently be in `Held`. " "Resuming a `Running` / `Defined` / terminal Procedure raises. " - "re_establishment_boundary (>= 0) is the manifest index the resume " + "re_establishment_boundary (>= 0) is the step-list index the resume " "re-drives setpoints / re-runs checks from." ), ) @@ -38,7 +38,7 @@ async def resume_procedure_tool( # pyright: ignore[reportUnusedFunction] Field( ge=0, description=( - "Index in the pinned conduct manifest the resume re-drives " + "Index in the pinned resolved step list the resume re-drives " "setpoints / re-runs checks from (>= 0; 0 = from the first step)." 
), ), diff --git a/apps/api/src/cora/operation/routes.py b/apps/api/src/cora/operation/routes.py index d0c20999c5..cbb28d7983 100644 --- a/apps/api/src/cora/operation/routes.py +++ b/apps/api/src/cora/operation/routes.py @@ -366,7 +366,7 @@ def register_operation_routes(app: FastAPI) -> None: RecipeExpanderVersionMismatchError, RecipeExpansionRecordNotFoundError, RecipeExpansionReplayMismatchError, - # resumable conduct: a Held Procedure missing its pinned manifest + # resumable conduct: a Held Procedure missing its pinned resolved steps # (corruption); kept out of the reconduct failures-in-body contract. ResolvedStepsRecordNotFoundError, # PseudoAxis pre-Conductor expansion ([[project-pseudoaxis-design]] diff --git a/apps/api/tests/contract/test_reconduct_procedure_endpoint.py b/apps/api/tests/contract/test_reconduct_procedure_endpoint.py index 45ca219acf..e3e3756a13 100644 --- a/apps/api/tests/contract/test_reconduct_procedure_endpoint.py +++ b/apps/api/tests/contract/test_reconduct_procedure_endpoint.py @@ -1,19 +1,19 @@ """Contract tests for `POST /procedures/{procedure_id}/reconduct`. Resume-and-replay: resumes a Held Procedure and replays its pinned -manifest tail. 200 with replay outcomes in body; 404/409/422/500 for +step-list tail. 200 with replay outcomes in body; 404/409/422/500 for protocol / guard / corruption faults. Note on coverage: the 200 happy path (a clean replay that auto-completes) requires a `Held` Procedure that carries a PINNED `ResolvedStepsRecorded` -manifest. The synchronous conduct flow today pins the manifest then runs +resolved step list. The synchronous conduct flow today pins the resolved steps then runs to a terminal state (Completed / Aborted) in one call, so there is no -API-reachable `Held`+manifest state yet (producing it -- a conduct that +API-reachable `Held`+resolved-steps state yet (producing it -- a conduct that pauses to Held instead of aborting on a halt, or a mid-conduct cooperative hold -- is a follow-up slice).
The clean / halt / step-failure replay paths are covered end-to-end in `tests/unit/operation/test_reconduct_procedure_handler.py` against a -seeded Held+manifest state. These contract tests cover the +seeded Held+resolved-steps state. These contract tests cover the API-reachable guard / fault surfaces. """ @@ -52,14 +52,14 @@ def test_post_reconduct_returns_409_for_defined_procedure() -> None: @pytest.mark.contract -def test_post_reconduct_returns_409_for_completed_procedure_with_manifest() -> None: - """A conduct pins a manifest then completes; reconducting the (Completed) +def test_post_reconduct_returns_409_for_completed_procedure_with_resolved_steps() -> None: + """A conduct pins resolved steps then completes; reconducting the (Completed) Procedure is refused by the resume status guard (not Held).""" with TestClient(create_app()) as client: pid = _register(client) # Conduct an EMPTY step list: pins ResolvedStepsRecorded, then # start -> (no steps) -> complete, leaving the Procedure Completed - # WITH a pinned (empty) manifest. + # WITH a pinned (empty) resolved step list.
conducted = client.post(f"/procedures/{pid}/conduct", json={"steps": []}) assert conducted.status_code == 200 assert conducted.json()["succeeded"] is True @@ -70,9 +70,9 @@ def test_post_reconduct_returns_409_for_completed_procedure_with_manifest() -> N @pytest.mark.contract -def test_post_reconduct_returns_500_for_held_procedure_without_manifest() -> None: +def test_post_reconduct_returns_500_for_held_procedure_without_resolved_steps() -> None: """A Procedure started directly (no conduct) then held is Held WITHOUT a - pinned manifest; reconduct cannot locate it (corruption-shaped 500).""" + pinned resolved step list; reconduct cannot locate it (corruption-shaped 500).""" with TestClient(create_app()) as client: pid = _register(client) assert client.post(f"/procedures/{pid}/start").status_code == 204 diff --git a/apps/api/tests/unit/operation/test_conduct_procedure_handler.py b/apps/api/tests/unit/operation/test_conduct_procedure_handler.py index 9b6026bc51..49ad04a5ab 100644 --- a/apps/api/tests/unit/operation/test_conduct_procedure_handler.py +++ b/apps/api/tests/unit/operation/test_conduct_procedure_handler.py @@ -225,7 +225,7 @@ async def test_conduct_procedure_handler_dispatches_to_conductor_with_envelope() @pytest.mark.unit async def test_conduct_procedure_pins_resolved_steps_before_conducting() -> None: - """The handler appends a ResolvedStepsRecorded manifest (the resolved + """The handler appends a ResolvedStepsRecorded event (the resolved step list) to the Procedure stream before dispatching to the Conductor.""" procedure_id = uuid4() store = InMemoryEventStore() @@ -248,14 +248,14 @@ async def test_conduct_procedure_pins_resolved_steps_before_conducting() -> None ) stored, _ = await store.load(stream_type="Procedure", stream_id=procedure_id) - manifests = [ + recorded = [ event for event in (from_stored(s) for s in stored) if isinstance(event, ResolvedStepsRecorded) ] - assert len(manifests) == 1 - assert manifests[0].step_count == 2 - assert
manifests[0].resolved_steps == tuple(step_to_payload(step) for step in steps) + assert len(recorded) == 1 + assert recorded[0].step_count == 2 + assert recorded[0].resolved_steps == tuple(step_to_payload(step) for step in steps) @pytest.mark.unit diff --git a/apps/api/tests/unit/operation/test_conductor_execute_from.py b/apps/api/tests/unit/operation/test_conductor_execute_from.py index 5b363477f4..b2e1e19c34 100644 --- a/apps/api/tests/unit/operation/test_conductor_execute_from.py +++ b/apps/api/tests/unit/operation/test_conductor_execute_from.py @@ -1,6 +1,6 @@ """Behavioural tests for `Conductor.execute_from` (resumable conduct, Tier 1). -`execute_from` REPLAYS a pinned conduct manifest from a re-establishment +`execute_from` REPLAYS a pinned resolved step list from a re-establishment boundary rather than re-deriving the step list: - setpoint -> re-drive (idempotent absolute write) @@ -9,9 +9,9 @@ Headline acceptance test (per the design memo): replay walks the pinned tail BYTE-FOR-BYTE -- two identical setpoints land on the in-memory port, -identical to what the original conduct wrote. `steps_from_manifest` is the +identical to what the original conduct wrote. `steps_from_payload` is the exact inverse of `step_to_payload`, so the pinned `ResolvedStepsRecorded` -manifest round-trips into the replayed `Step`s. +step list round-trips into the replayed `Step`s. 
""" from collections.abc import AsyncIterator, Mapping @@ -33,7 +33,7 @@ Step, WithinToleranceCriterion, step_to_payload, - steps_from_manifest, + steps_from_payload, ) from cora.operation.features.append_activities.command import AppendProcedureActivities from cora.operation.ports.control_port import ControlNotConnectedError, Reading @@ -103,8 +103,8 @@ def _good_reading(value: Any) -> Reading: def _pin_and_parse(steps: tuple[Step, ...]) -> tuple[Step, ...]: """Serialize steps the way conduct pins them, then parse back (the ResolvedStepsRecorded round-trip a real resume performs).""" - manifest_wire = tuple(step_to_payload(s) for s in steps) - return steps_from_manifest(manifest_wire) + steps_wire = tuple(step_to_payload(s) for s in steps) + return steps_from_payload(steps_wire) # --- headline acceptance: byte-for-byte replay of the pinned tail ---------- @@ -112,13 +112,13 @@ def _pin_and_parse(steps: tuple[Step, ...]) -> tuple[Step, ...]: @pytest.mark.unit async def test_execute_from_replays_pinned_tail_byte_for_byte() -> None: - """Two setpoints pinned on the manifest re-drive byte-for-byte on resume.""" + """Two setpoints pinned on the step list re-drive byte-for-byte on resume.""" original = ( SetpointStep(address="2bma:rot:val", value=45.0), SetpointStep(address="2bma:cam:exposure", value=0.025), ) - manifest = _pin_and_parse(original) - assert manifest == original # the pinned manifest round-trips to the same Steps + steps = _pin_and_parse(original) + assert steps == original # the pinned step list round-trips to the same Steps port = _RecordingControlPort() appender = _FakeAppendStep() @@ -126,20 +126,20 @@ async def test_execute_from_replays_pinned_tail_byte_for_byte() -> None: procedure_id=uuid4(), principal_id=uuid4(), correlation_id=uuid4(), - manifest=manifest, + steps=steps, boundary=0, ) assert result.succeeded is True assert result.completed_count == 2 - # Byte-for-byte: the replayed writes equal the pinned manifest's setpoints. 
+ # Byte-for-byte: the replayed writes equal the pinned step list's setpoints. assert port.writes == [("2bma:rot:val", 45.0), ("2bma:cam:exposure", 0.025)] @pytest.mark.unit async def test_execute_from_boundary_skips_the_prefix() -> None: - """boundary=K re-drives only manifest[K:]; the prefix is not re-driven.""" - manifest = _pin_and_parse( + """boundary=K re-drives only steps[K:]; the prefix is not re-driven.""" + steps = _pin_and_parse( ( SetpointStep(address="2bma:a", value=1.0), SetpointStep(address="2bma:b", value=2.0), @@ -151,7 +151,7 @@ async def test_execute_from_boundary_skips_the_prefix() -> None: procedure_id=uuid4(), principal_id=uuid4(), correlation_id=uuid4(), - manifest=manifest, + steps=steps, boundary=1, ) assert result.completed_count == 2 @@ -161,8 +161,8 @@ async def test_execute_from_boundary_skips_the_prefix() -> None: @pytest.mark.unit async def test_execute_from_records_marker_and_outcome_with_absolute_index() -> None: """A re-driven setpoint records the in-flight marker + ok outcome, each - carrying its ABSOLUTE manifest position (so the replayed journal lines up).""" - manifest = _pin_and_parse( + carrying its ABSOLUTE position in the step list (so the replayed journal lines up).""" + steps = _pin_and_parse( ( SetpointStep(address="2bma:a", value=1.0), SetpointStep(address="2bma:b", value=2.0), @@ -173,7 +173,7 @@ async def test_execute_from_records_marker_and_outcome_with_absolute_index() -> procedure_id=uuid4(), principal_id=uuid4(), correlation_id=uuid4(), - manifest=manifest, + steps=steps, boundary=1, ) payloads = [c.entries[0].payload for c in appender.calls] @@ -185,7 +185,7 @@ async def test_execute_from_records_marker_and_outcome_with_absolute_index() -> async def test_execute_from_on_action_requires_operator_decision() -> None: """An acquisition (ActionStep) is NOT re-run: resume halts for an operator decision; the action and everything after it are untouched.""" - manifest = _pin_and_parse( + steps = _pin_and_parse( (
SetpointStep(address="2bma:a", value=1.0), ActionStep(name="collect", params={"dwell": 0.1}), @@ -197,7 +197,7 @@ async def test_execute_from_on_action_requires_operator_decision() -> None: procedure_id=uuid4(), principal_id=uuid4(), correlation_id=uuid4(), - manifest=manifest, + steps=steps, boundary=0, ) assert result.succeeded is False @@ -214,7 +214,7 @@ async def test_execute_from_on_action_requires_operator_decision() -> None: @pytest.mark.unit async def test_execute_from_reruns_check_fresh() -> None: """A check in the tail is re-run as a fresh gate (read + evaluate).""" - manifest = _pin_and_parse( + steps = _pin_and_parse( (CheckStep(address="2bma:rbv", criterion=EqualsCriterion(expected=45.0)),) ) port = _RecordingControlPort(readings={"2bma:rbv": _good_reading(45.0)}) @@ -222,7 +222,7 @@ async def test_execute_from_reruns_check_fresh() -> None: procedure_id=uuid4(), principal_id=uuid4(), correlation_id=uuid4(), - manifest=manifest, + steps=steps, boundary=0, ) assert result.succeeded is True @@ -232,7 +232,7 @@ async def test_execute_from_reruns_check_fresh() -> None: @pytest.mark.unit async def test_execute_from_check_mismatch_on_rerun_halts() -> None: """A re-run check whose criterion no longer matches halts the resume.""" - manifest = _pin_and_parse( + steps = _pin_and_parse( (CheckStep(address="2bma:rbv", criterion=EqualsCriterion(expected=45.0)),) ) port = _RecordingControlPort(readings={"2bma:rbv": _good_reading(12.5)}) @@ -240,7 +240,7 @@ async def test_execute_from_check_mismatch_on_rerun_halts() -> None: procedure_id=uuid4(), principal_id=uuid4(), correlation_id=uuid4(), - manifest=manifest, + steps=steps, boundary=0, ) assert result.succeeded is False @@ -251,14 +251,14 @@ async def test_execute_from_check_mismatch_on_rerun_halts() -> None: @pytest.mark.unit async def test_execute_from_boundary_past_end_is_a_no_op() -> None: - """Boundary >= len(manifest) replays an empty tail (a no-op resume).""" - manifest = 
_pin_and_parse((SetpointStep(address="2bma:a", value=1.0),)) + """Boundary >= len(steps) replays an empty tail (a no-op resume).""" + steps = _pin_and_parse((SetpointStep(address="2bma:a", value=1.0),)) port = _RecordingControlPort() result = await _conductor(port, _FakeAppendStep()).execute_from( procedure_id=uuid4(), principal_id=uuid4(), correlation_id=uuid4(), - manifest=manifest, + steps=steps, boundary=5, ) assert result.succeeded is True @@ -273,7 +273,7 @@ async def test_execute_from_rejects_negative_boundary() -> None: procedure_id=uuid4(), principal_id=uuid4(), correlation_id=uuid4(), - manifest=(), + steps=(), boundary=-1, ) @@ -281,13 +281,13 @@ async def test_execute_from_rejects_negative_boundary() -> None: @pytest.mark.unit async def test_execute_from_explicit_re_establish_policy_is_the_default() -> None: """Passing the only policy member behaves identically to the default.""" - manifest = _pin_and_parse((SetpointStep(address="2bma:a", value=1.0),)) + steps = _pin_and_parse((SetpointStep(address="2bma:a", value=1.0),)) port = _RecordingControlPort() result = await _conductor(port, _FakeAppendStep()).execute_from( procedure_id=uuid4(), principal_id=uuid4(), correlation_id=uuid4(), - manifest=manifest, + steps=steps, boundary=0, policy=ResumePolicy.RE_ESTABLISH, ) @@ -295,7 +295,7 @@ async def test_execute_from_explicit_re_establish_policy_is_the_default() -> Non assert port.writes == [("2bma:a", 1.0)] -# --- steps_from_manifest is the exact inverse of step_to_payload ----------- +# --- steps_from_payload is the exact inverse of step_to_payload ----------- @pytest.mark.unit @@ -313,22 +313,22 @@ async def test_execute_from_explicit_re_establish_policy_is_the_default() -> Non ), ], ) -def test_steps_from_manifest_round_trips_step_to_payload(step: Step) -> None: - assert steps_from_manifest((step_to_payload(step),)) == (step,) +def test_steps_from_payload_round_trips_step_to_payload(step: Step) -> None: + assert 
steps_from_payload((step_to_payload(step),)) == (step,) @pytest.mark.unit -def test_steps_from_manifest_rejects_unknown_kind() -> None: +def test_steps_from_payload_rejects_unknown_kind() -> None: with pytest.raises(ValueError, match="unknown step kind"): - steps_from_manifest(({"kind": "bogus"},)) + steps_from_payload(({"kind": "bogus"},)) @pytest.mark.unit -def test_steps_from_manifest_rejects_unknown_criterion_kind() -> None: +def test_steps_from_payload_rejects_unknown_criterion_kind() -> None: bad: Mapping[str, Any] = { "kind": "check", "address": "x", "criterion": {"kind": "bogus"}, } with pytest.raises(ValueError, match="unknown criterion kind"): - steps_from_manifest((bad,)) + steps_from_payload((bad,)) diff --git a/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py b/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py index 5a18773276..c958787d2e 100644 --- a/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py +++ b/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py @@ -6,7 +6,7 @@ - clean tail -> resume + auto-complete (Completed) - acquisition halt -> resume, NO complete/abort, stays Running, halt in result - genuine step failure -> resume + abort (Aborted) - - missing pinned manifest -> ResolvedStepsRecordNotFoundError + - missing pinned resolved steps -> ResolvedStepsRecordNotFoundError - not Held / parent Run Held -> ProcedureCannotResumeError (no replay) - authz deny -> UnauthorizedError """ @@ -93,16 +93,16 @@ def _make_reconduct(deps: Kernel, port: InMemoryControlPort) -> ReconductHandler return reconduct_procedure.bind(deps, conductor=conductor) -async def _seed_held_with_manifest( +async def _seed_held_with_steps( store: InMemoryEventStore, *, - manifest: Sequence[Step], + steps: Sequence[Step], procedure_id: UUID = _PROCEDURE_ID, parent_run_id: UUID | None = None, ) -> None: """Land a conducted-then-Held Procedure: Registered + ResolvedStepsRecorded - (the pinned manifest) + Started + 
Held.""" - resolved = tuple(step_to_payload(s) for s in manifest) + (the pinned resolved steps) + Started + Held.""" + resolved = tuple(step_to_payload(s) for s in steps) events = [ ProcedureRegistered( procedure_id=procedure_id, @@ -186,9 +186,9 @@ async def test_clean_tail_resumes_then_auto_completes() -> None: port = InMemoryControlPort() port.simulate_connect("2bma:a") port.simulate_connect("2bma:b") - await _seed_held_with_manifest( + await _seed_held_with_steps( store, - manifest=( + steps=( SetpointStep(address="2bma:a", value=1.0), SetpointStep(address="2bma:b", value=2.0), ), @@ -209,9 +209,9 @@ async def test_boundary_replays_only_the_tail_then_completes() -> None: store = InMemoryEventStore() port = InMemoryControlPort() port.simulate_connect("2bma:b") # only the tail step is re-driven - await _seed_held_with_manifest( + await _seed_held_with_steps( store, - manifest=( + steps=( SetpointStep(address="2bma:a", value=1.0), SetpointStep(address="2bma:b", value=2.0), ), @@ -231,9 +231,9 @@ async def test_acquisition_halt_resumes_but_leaves_running() -> None: store = InMemoryEventStore() port = InMemoryControlPort() port.simulate_connect("2bma:a") - await _seed_held_with_manifest( + await _seed_held_with_steps( store, - manifest=( + steps=( SetpointStep(address="2bma:a", value=1.0), ActionStep(name="collect", params={"dwell": 0.1}), ), @@ -258,7 +258,7 @@ async def test_acquisition_halt_resumes_but_leaves_running() -> None: async def test_genuine_step_failure_resumes_then_aborts() -> None: store = InMemoryEventStore() port = InMemoryControlPort() # 2bma:a NOT connected -> write fails - await _seed_held_with_manifest(store, manifest=(SetpointStep(address="2bma:a", value=1.0),)) + await _seed_held_with_steps(store, steps=(SetpointStep(address="2bma:a", value=1.0),)) deps = _deps(store) result = await _call(_make_reconduct(deps, port), 0) @@ -270,7 +270,7 @@ async def test_genuine_step_failure_resumes_then_aborts() -> None: @pytest.mark.unit -async def 
test_raises_when_manifest_record_missing() -> None: +async def test_raises_when_resolved_steps_record_missing() -> None: """A Held Procedure with no pinned ResolvedStepsRecorded is corruption.""" store = InMemoryEventStore() # Seed Held WITHOUT a ResolvedStepsRecorded. @@ -318,9 +318,9 @@ async def test_reconduct_raises_not_found_when_procedure_absent() -> None: @pytest.mark.unit async def test_raises_cannot_resume_when_not_held() -> None: - """A Running (not Held) Procedure with a manifest cannot be reconducted.""" + """A Running (not Held) Procedure with resolved steps cannot be reconducted.""" store = InMemoryEventStore() - # Registered + ResolvedStepsRecorded + Started (Running, has manifest). + # Registered + ResolvedStepsRecorded + Started (Running, has resolved steps). resolved = (step_to_payload(SetpointStep(address="2bma:a", value=1.0)),) events = [ ProcedureRegistered( @@ -367,9 +367,9 @@ async def test_raises_cannot_resume_when_parent_run_held() -> None: store = InMemoryEventStore() parent_run_id = uuid4() await _seed_held_run(store, run_id=parent_run_id) - await _seed_held_with_manifest( + await _seed_held_with_steps( store, - manifest=(SetpointStep(address="2bma:a", value=1.0),), + steps=(SetpointStep(address="2bma:a", value=1.0),), parent_run_id=parent_run_id, ) deps = _deps(store) @@ -381,7 +381,7 @@ async def test_raises_cannot_resume_when_parent_run_held() -> None: @pytest.mark.unit async def test_raises_unauthorized_on_deny() -> None: store = InMemoryEventStore() - await _seed_held_with_manifest(store, manifest=(SetpointStep(address="2bma:a", value=1.0),)) + await _seed_held_with_steps(store, steps=(SetpointStep(address="2bma:a", value=1.0),)) deps = _deps(store, deny=True) with pytest.raises(UnauthorizedError): await _call(_make_reconduct(deps, InMemoryControlPort()), 0) diff --git a/apps/api/tests/unit/operation/test_record_resolved_steps.py b/apps/api/tests/unit/operation/test_record_resolved_steps.py index 51bd345ec0..92b1b97f05 100644 --- 
a/apps/api/tests/unit/operation/test_record_resolved_steps.py +++ b/apps/api/tests/unit/operation/test_record_resolved_steps.py @@ -1,4 +1,4 @@ -"""Tier-0 manifest recording: decide_resolved_steps_recorded + step_to_payload. +"""Tier-0 resolved-steps recording: decide_resolved_steps_recorded + step_to_payload. Covers: - the helper emits one ResolvedStepsRecorded for a Defined Procedure, @@ -8,7 +8,7 @@ the conduct route keeps its failures-in-body contract). - step_to_payload round-trips every step kind back to an equal Step via the public wire path (ConductProcedureRequest validation + step_from_wire), - proving a pinned manifest can be replayed. + proving a pinned step list can be replayed. """ from datetime import UTC, datetime @@ -31,7 +31,7 @@ WithinToleranceCriterion, step_to_payload, ) -from cora.operation.features.conduct_procedure.manifest import ( +from cora.operation.features.conduct_procedure.resolved_steps import ( decide_resolved_steps_recorded, ) from cora.operation.features.conduct_procedure.route import ( @@ -55,7 +55,7 @@ def _registered() -> tuple[UUID, ProcedureRegistered]: @pytest.mark.unit -def test_decide_records_manifest_for_defined_procedure() -> None: +def test_decide_records_resolved_steps_for_defined_procedure() -> None: procedure_id, registered = _registered() state = fold([registered]) steps = ( From c38fb781e00dedea5dbc7d640340acaf52d8e858 Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 13:16:39 +0300 Subject: [PATCH 09/12] feat(operation): try_conduct_procedure pauses a conduct to Held on a recoverable failure Closes the resumable-conduct producer gap: until now a halted conduct could only abort, so reconduct (the resume path, PR #276) had nothing to resume and its 200 happy path was not API-reachable. try_conduct is the conduct verb-family's third member (conduct = run-to-terminal, reconduct = resume-and-replay, try-conduct = pause-to-Held-on-recoverable-failure). 
On a RECOVERABLE step failure (a setpoint or check: re-drivable / re-runnable on resume) the Conductor pauses the Procedure to Held instead of aborting, leaving the pinned ResolvedStepsRecorded so an operator can reconduct from a boundary. An acquisition (action) failure keeps aborting (an interrupted acquisition is non-idempotent, Tier 2 territory; a Held tail starting with an acquisition could only halt-for-operator on reconduct). Lifecycle failures and a mid-execute cancellation keep conduct's posture verbatim. Conductor changes: - new Conductor.try_conduct: the abort-vs-hold twin of conduct(); injects a hold_procedure handler and branches the failure terminalize on _is_recoverable_failure (source_kind in {setpoint, check}). - ConductorResult.held: True only when the hold transition itself succeeded, so a caller can tell a resumable Held outcome from a terminal Aborted one (both carry succeeded=False + failure). A hold that fails leaves the Procedure Running (held=False), mirroring conduct's best-effort abort. - _derive_abort_reason -> _derive_failure_reason (now serves abort + hold). Always-200 response gains a `held` discriminator (REST + MCP); without it Held and Aborted are indistinguishable in the body. Slice composition lives on the non-slice Conductor and two BC-level modules so try_conduct imports no sibling slice (the cross-slice-independence fitness): - _conduct_wire.py: the shared step-list wire models + converters, hoisted out of conduct's route (conduct's route + tool now import from here). - _conduct_preparation.py: the shared recipe re-expansion + pseudoaxis + resolved-steps pin pipeline, hoisted out of conduct's handler (the pure pin decider decide_resolved_steps_recorded moved here; conduct/resolved_steps.py removed). conduct's behavior is byte-for-byte unchanged (verified). 
Coverage: 10 handler unit tests (recoverable setpoint/check -> Held + resolved steps pinned; action -> Aborted; clean / empty -> Completed; hold-fails -> Running; start-rejected + complete-fails -> lifecycle failure; deny; not-found), contract + MCP tests, and two new reconduct contract tests that drive the now-reachable 200 paths end-to-end (try-conduct -> Held -> reconduct: empty tail completes; acquisition tail halts-for-operator). Gate-reviewed (naming-r3 PASS on the conduct/reconduct/try-conduct family; correctness + standards clean). Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/api/openapi.json | 207 +++++++++- .../cora/operation/_conduct_preparation.py | 249 ++++++++++++ apps/api/src/cora/operation/_conduct_wire.py | 163 ++++++++ apps/api/src/cora/operation/conductor.py | 201 +++++++++- .../features/conduct_procedure/handler.py | 194 +--------- .../conduct_procedure/resolved_steps.py | 51 --- .../features/conduct_procedure/route.py | 174 +-------- .../features/conduct_procedure/tool.py | 2 +- .../try_conduct_procedure/__init__.py | 39 ++ .../features/try_conduct_procedure/command.py | 47 +++ .../features/try_conduct_procedure/handler.py | 166 ++++++++ .../features/try_conduct_procedure/route.py | 151 ++++++++ .../features/try_conduct_procedure/tool.py | 86 ++++ apps/api/src/cora/operation/routes.py | 2 + apps/api/src/cora/operation/tools.py | 5 + apps/api/src/cora/operation/wire.py | 26 +- .../tests/architecture/test_slice_contract.py | 4 + .../test_reconduct_procedure_endpoint.py | 69 +++- .../test_try_conduct_procedure_endpoint.py | 86 ++++ .../test_try_conduct_procedure_mcp_tool.py | 72 ++++ .../test_conduct_procedure_handler.py | 3 +- .../operation/test_record_resolved_steps.py | 4 +- .../test_try_conduct_procedure_handler.py | 366 ++++++++++++++++++ 23 files changed, 1946 insertions(+), 421 deletions(-) create mode 100644 apps/api/src/cora/operation/_conduct_preparation.py create mode 100644 apps/api/src/cora/operation/_conduct_wire.py delete mode 
100644 apps/api/src/cora/operation/features/conduct_procedure/resolved_steps.py create mode 100644 apps/api/src/cora/operation/features/try_conduct_procedure/__init__.py create mode 100644 apps/api/src/cora/operation/features/try_conduct_procedure/command.py create mode 100644 apps/api/src/cora/operation/features/try_conduct_procedure/handler.py create mode 100644 apps/api/src/cora/operation/features/try_conduct_procedure/route.py create mode 100644 apps/api/src/cora/operation/features/try_conduct_procedure/tool.py create mode 100644 apps/api/tests/contract/test_try_conduct_procedure_endpoint.py create mode 100644 apps/api/tests/contract/test_try_conduct_procedure_mcp_tool.py create mode 100644 apps/api/tests/unit/operation/test_try_conduct_procedure_handler.py diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 4542d45d05..37ba084f09 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -3750,7 +3750,7 @@ "failure": { "anyOf": [ { - "$ref": "#/components/schemas/_ConductorFailureResponse" + "$ref": "#/components/schemas/ConductorFailureResponse" }, { "type": "null" @@ -3913,6 +3913,47 @@ "title": "ConductRunResponse", "type": "object" }, + "ConductorFailureResponse": { + "description": "JSON wire shape for a `ConductorFailure`.", + "properties": { + "error_class": { + "title": "Error Class", + "type": "string" + }, + "message": { + "title": "Message", + "type": "string" + }, + "source_kind": { + "title": "Source Kind", + "type": "string" + }, + "step_index": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Step Index" + }, + "target": { + "title": "Target", + "type": "string" + } + }, + "required": [ + "step_index", + "source_kind", + "target", + "error_class", + "message" + ], + "title": "ConductorFailureResponse", + "type": "object" + }, "ConduitListResponse": { "description": "Page of conduits plus opaque next-page cursor.", "properties": { @@ -14537,6 +14578,92 @@ "title": "TruncateRunRequest", 
"type": "object" }, + "TryConductProcedureRequest": { + "additionalProperties": false, + "description": "Body for `POST /procedures/{procedure_id}/try-conduct`.", + "properties": { + "steps": { + "description": "Steps the Conductor walks in order (0-500). Empty list is valid: start + complete fire with no steps.", + "items": { + "discriminator": { + "mapping": { + "action": "#/components/schemas/_ActionStepRequest", + "check": "#/components/schemas/_CheckStepRequest", + "setpoint": "#/components/schemas/_SetpointStepRequest" + }, + "propertyName": "kind" + }, + "oneOf": [ + { + "$ref": "#/components/schemas/_SetpointStepRequest" + }, + { + "$ref": "#/components/schemas/_ActionStepRequest" + }, + { + "$ref": "#/components/schemas/_CheckStepRequest" + } + ] + }, + "maxItems": 500, + "title": "Steps", + "type": "array" + } + }, + "title": "TryConductProcedureRequest", + "type": "object" + }, + "TryConductProcedureResponse": { + "description": "Response body for the try_conduct_procedure slice.\n\n`succeeded` is the canonical pass/fail bit; `failure` is non-null iff\n`succeeded` is False. `held` is True iff a recoverable step failure paused\nthe Procedure to `Held` (resumable via `reconduct`); a terminal `Aborted`\noutcome carries `succeeded=False` + `failure` + `held=False`.\n\n`actuation_kind` is the raw `ActuationKind` value the Conductor observed,\nor None when nothing instrumented was actuated. 
Read-only operator\nvisibility; the gate that consumes it reads the value server-side off the\nProcedure stream, never back from this response.", + "properties": { + "actuation_kind": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Actuation Kind" + }, + "completed_count": { + "title": "Completed Count", + "type": "integer" + }, + "failure": { + "anyOf": [ + { + "$ref": "#/components/schemas/ConductorFailureResponse" + }, + { + "type": "null" + } + ] + }, + "held": { + "default": false, + "title": "Held", + "type": "boolean" + }, + "procedure_id": { + "format": "uuid", + "title": "Procedure Id", + "type": "string" + }, + "succeeded": { + "title": "Succeeded", + "type": "boolean" + } + }, + "required": [ + "procedure_id", + "completed_count", + "succeeded" + ], + "title": "TryConductProcedureResponse", + "type": "object" + }, "UnbindPlanRoleRequest": { "description": "Body for `POST /plans/{plan_id}/unbind-role`.", "properties": { @@ -37500,6 +37627,84 @@ ] } }, + "/procedures/{procedure_id}/try-conduct": { + "post": { + "description": "Conduct a Procedure, pausing to Held on a recoverable failure.", + "operationId": "post_procedures_try_conduct_procedures__procedure_id__try_conduct_post", + "parameters": [ + { + "description": "Target procedure's id.", + "in": "path", + "name": "procedure_id", + "required": true, + "schema": { + "description": "Target procedure's id.", + "format": "uuid", + "title": "Procedure Id", + "type": "string" + } + }, + { + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. 
When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "in": "header", + "name": "X-Principal-Id", + "required": false, + "schema": { + "anyOf": [ + { + "format": "uuid", + "type": "string" + }, + { + "type": "null" + } + ], + "description": "Legacy principal-id header (trust-the-proxy shape). When IDENTITY_PROVIDERS is configured (bearer-auth mode), this header is IGNORED and the verified bearer token from `BearerAuthMiddleware` (Authorization: Bearer) sets the principal. When no IdPs are configured (legacy mode), the application TRUSTS this header (no cryptographic verification) -- production deployments in legacy mode MUST front the API with an auth proxy that strips any client-supplied X-Principal-Id and sets it to the verified principal UUID. Behavior when absent: see Settings.require_authenticated_principal.", + "title": "X-Principal-Id" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TryConductProcedureRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/TryConductProcedureResponse" + } + } + }, + "description": "Successful Response" + }, + "403": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "Authorize port denied the command." + }, + "422": { + "description": "Request body failed schema validation: unknown step kind, missing required field, batch over cap, invalid criterion shape." 
+ } + }, + "summary": "Conduct a Procedure, pausing to Held on a recoverable failure: start -> walk steps -> complete (success) / pause to Held (recoverable setpoint or check failure) / abort (acquisition failure).", + "tags": [ + "operation" + ] + } + }, "/recipes": { "post": { "operationId": "post_recipes_recipes_post", diff --git a/apps/api/src/cora/operation/_conduct_preparation.py b/apps/api/src/cora/operation/_conduct_preparation.py new file mode 100644 index 0000000000..c4368723dd --- /dev/null +++ b/apps/api/src/cora/operation/_conduct_preparation.py @@ -0,0 +1,249 @@ +"""Shared pre-conduct pipeline for the conduct verb-family slices. + +`conduct_procedure` and `try_conduct_procedure` resolve the SAME step list +the same way before handing it to the Conductor, then pin it identically: + + 1. recipe re-expansion when the Procedure was created from a recipe + (the five-step replay gate per [[project-run-procedure-replay-design]]); + 2. pseudoaxis -> constituent expansion when the Procedure is a Run phase + (resolve each virtual-axis SetpointStep's constituents from the Run's + Plan wires); + 3. pin the FINAL resolved list as a `ResolvedStepsRecorded` provenance + event BEFORE any step executes, so a later resume replays this exact + list rather than re-deriving it. + +A slice cannot import a sibling slice (the cross-slice-independence fitness), +so this BC-level module owns the shared pipeline, mirroring `_conduct_wire` +(shared HTTP/MCP shapes) and `_resolved_steps_replay` (the resume-side read). +The pin is emitted inline rather than via a dedicated command slice: +`ResolvedStepsRecorded` is an internal provenance event with no operator +entry point, exactly like `RecipeExpansionRecorded`. 
+""" + +from collections.abc import Mapping, Sequence +from datetime import datetime +from typing import TYPE_CHECKING, Any +from uuid import UUID + +from cora.infrastructure.event_envelope import to_new_event +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.ports import EventStore +from cora.infrastructure.ports.event_store import StoredEvent +from cora.operation._recipe_replay import ( + find_recipe_expansion_record, + pins_from_payload, + verify_bindings_hash, + verify_steps_hash, +) +from cora.operation.aggregates.procedure import ( + Procedure, + ProcedureBoundCapabilityDeprecatedError, + ProcedureStatus, + ProcedureStepsForbiddenForRecipeDrivenError, + RecipeExpanderVersionMismatchError, + RecipeExpansionRecordNotFoundError, + ResolvedStepsRecorded, + event_type_name, + to_payload, +) +from cora.operation.conductor import Step, step_to_payload +from cora.operation.ports.recipe_expander import RecipeExpander +from cora.recipe.aggregates.capability import CapabilityStatus, load_capability +from cora.recipe.aggregates.plan import ( + PlanNotFoundError, + constituents_from_wires, + load_plan, +) +from cora.recipe.aggregates.recipe import load_recipe_at_version +from cora.run.aggregates.run import RunNotFoundError, load_run + +if TYPE_CHECKING: + from cora.operation._pseudoaxis_expander import ConstituentResolver + + +def decide_resolved_steps_recorded( + state: Procedure | None, + resolved_steps: Sequence[Mapping[str, Any]], + *, + now: datetime, +) -> list[ResolvedStepsRecorded]: + """Pin the resolved step list iff the Procedure is pre-conduct (Defined). + + Returns a single `ResolvedStepsRecorded` when `state` is `Defined` + (the normal conduct path, before `start_procedure` transitions it to + `Running`). 
Returns `[]` when `state` is None or not `Defined`: a + conduct of a missing / already-running / terminal Procedure records no + resolved steps and lets the Conductor's `start_procedure` produce the + normal lifecycle failure, preserving the conduct route's failures-in-body + contract instead of raising a fresh HTTP error here. Kept as a pure + function so the decision is unit-testable without an event store. + """ + if state is None or state.status is not ProcedureStatus.DEFINED: + return [] + steps = tuple(dict(step) for step in resolved_steps) + return [ + ResolvedStepsRecorded( + procedure_id=state.id, + resolved_steps=steps, + step_count=len(steps), + occurred_at=now, + ) + ] + + +async def resolve_and_pin_conduct_steps( + deps: Kernel, + *, + command_name: str, + procedure: Procedure, + stored_events: list[StoredEvent], + caller_steps: Sequence[Step], + expansion_port: RecipeExpander, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None, +) -> tuple[Step, ...]: + """Resolve the final conduct step list + pin it as `ResolvedStepsRecorded`. + + The shared pre-Conductor work for `conduct` / `try_conduct`: recipe + re-expansion (recipe-driven Procedures) -> pseudoaxis constituent + expansion (Run-phase Procedures) -> pin. Returns the resolved steps to + hand to the Conductor. `command_name` rides the pinned event's metadata. + """ + if procedure.recipe_id is not None: + steps = await _re_expand_steps( + procedure_id=procedure.id, + recipe_id=procedure.recipe_id, + caller_steps=caller_steps, + stored_events=stored_events, + event_store=deps.event_store, + expansion_port=expansion_port, + ) + else: + steps = tuple(caller_steps) + + # A Phase-of-Run Procedure resolves a pseudoaxis's constituent motors from + # its Run's Plan wires: parent_run_id -> Run.plan_id -> Plan.wires (the + # same load chain start_procedure walks for its Supply gate). A missing + # Run / Plan in that chain is corruption, so raise rather than silently + # skip. 
Standalone / recipe-driven Procedures (no parent_run_id) pass no + # resolver, so any pseudoaxis SetpointStep hits the wiring-deferred default + # and is rejected with PartitionRuleNotFoundError. + constituent_resolver: ConstituentResolver | None = None + if procedure.parent_run_id is not None: + parent_run = await load_run(deps.event_store, procedure.parent_run_id) + if parent_run is None: + raise RunNotFoundError(procedure.parent_run_id) + plan = await load_plan(deps.event_store, parent_run.plan_id) + if plan is None: + raise PlanNotFoundError(parent_run.plan_id) + + def _resolve_constituents(asset_id: UUID) -> tuple[UUID, ...]: + return constituents_from_wires(plan, asset_id) + + constituent_resolver = _resolve_constituents + + # Pre-Conductor PseudoAxis expansion: rewrite any virtual-axis SetpointStep + # into N sequential constituent SetpointSteps so the Conductor's dispatch + # loop walks the constituents in declared order. ActionStep / CheckStep + # pass through unchanged ([[project-pseudoaxis-design]] v3). + steps = await expansion_port.expand_pseudoaxis( + steps, + event_store=deps.event_store, + correlation_id=correlation_id, + constituent_resolver=constituent_resolver, + ) + + # Pin the resolved step list (after recipe + pseudoaxis expansion) BEFORE + # conducting, so a future resume replays this exact list. The helper emits + # the event only while the Procedure is still Defined and returns [] + # otherwise, leaving the Conductor's start_procedure to surface a lifecycle + # failure (keeps the conduct route's failures-in-body contract). 
+ resolved_steps_events = decide_resolved_steps_recorded( + procedure, + tuple(step_to_payload(step) for step in steps), + now=deps.clock.now(), + ) + if resolved_steps_events: + _, current_version = await deps.event_store.load( + stream_type="Procedure", stream_id=procedure.id + ) + await deps.event_store.append( + stream_type="Procedure", + stream_id=procedure.id, + expected_version=current_version, + events=[ + to_new_event( + event_type=event_type_name(event), + payload=to_payload(event), + occurred_at=event.occurred_at, + event_id=deps.id_generator.new_id(), + command_name=command_name, + correlation_id=correlation_id, + causation_id=causation_id, + principal_id=principal_id, + ) + for event in resolved_steps_events + ], + ) + + return steps + + +async def _re_expand_steps( + *, + procedure_id: UUID, + recipe_id: UUID, + caller_steps: Sequence[Step], + stored_events: list[StoredEvent], + event_store: EventStore, + expansion_port: RecipeExpander, +) -> tuple[Step, ...]: + """Run the recipe-replay gate per [[project-run-procedure-replay-design]]. + + In order: reject non-empty caller steps -> find_recipe_expansion_record + (raise RecipeExpansionRecordNotFoundError on None) -> pins_from_payload + -> port-version strict-equals (raise RecipeExpanderVersionMismatchError + on drift) -> load_recipe_at_version (raise RecipeExpansionRecordNotFoundError + when None on a recipe-driven Procedure; RecipeVersionNotFoundError + propagates from helper) -> load_capability + reject Deprecated + (raise ProcedureBoundCapabilityDeprecatedError, symmetric to + start_run's RunBoundPlanDeprecatedError) -> verify_bindings_hash -> + expand -> verify_steps_hash -> return the re-expanded tuple. 
+ """ + if list(caller_steps): + raise ProcedureStepsForbiddenForRecipeDrivenError(procedure_id) + + record = find_recipe_expansion_record(stored_events) + if record is None: + raise RecipeExpansionRecordNotFoundError(procedure_id) + + pins = pins_from_payload(procedure_id, record.payload) + + if pins.expansion_port_version != expansion_port.version: + raise RecipeExpanderVersionMismatchError( + procedure_id, + pins.expansion_port_version, + expansion_port.version, + ) + + recipe = await load_recipe_at_version( + event_store, + recipe_id, + pins.recipe_version, + ) + if recipe is None: + raise RecipeExpansionRecordNotFoundError(procedure_id) + + # Capability-deprecation gate: reject conduct against a tombstoned + # Capability before running the expansion port. Symmetric to start_run's + # RunBoundPlanDeprecatedError: re-expanding a Recipe against a Deprecated + # Capability would silently execute against a contract operators retired. + capability = await load_capability(event_store, recipe.capability_id) + if capability is not None and capability.status == CapabilityStatus.DEPRECATED: + raise ProcedureBoundCapabilityDeprecatedError(procedure_id, recipe.capability_id) + + verify_bindings_hash(procedure_id, pins) + expanded = expansion_port.expand(recipe.steps, dict(pins.bindings)) + verify_steps_hash(procedure_id, expanded, pins) + return expanded diff --git a/apps/api/src/cora/operation/_conduct_wire.py b/apps/api/src/cora/operation/_conduct_wire.py new file mode 100644 index 0000000000..9f542a86c9 --- /dev/null +++ b/apps/api/src/cora/operation/_conduct_wire.py @@ -0,0 +1,163 @@ +"""Shared HTTP/MCP wire shapes for the conduct verb-family slices. + +`conduct_procedure` and `try_conduct_procedure` accept the SAME step-list +body and surface the SAME per-step failure shape; this BC-level module owns +those wire types + converters so both slices reuse them. 
A slice cannot +import a sibling slice (the cross-slice-independence fitness), so the shared +seam lives here, outside `features/`, exactly like the resolved-steps replay +helper (`_resolved_steps_replay`) and preparation pipeline +(`_conduct_preparation`). + +The Conductor's `Step = SetpointStep | ActionStep | CheckStep` and +`CheckCriterion = EqualsCriterion | WithinToleranceCriterion` discriminated +unions land on the wire as JSON discriminated unions with a `kind` field. +Pydantic's `Field(discriminator="kind")` validates the union at parse time so +a malformed step kind fails the request with a 422 before the handler runs. + +Per-step `value` and `criterion.expected` are typed broadly +(`int | float | bool | str | list[Any]`) to match the ControlPort's value +vocabulary. Tuples-on-the-wire arrive as lists; the converter widens to +tuple for the in-process Conductor. +""" + +from typing import Annotated, Any, Literal, cast + +from pydantic import BaseModel, Field + +from cora.operation.conductor import ( + ActionStep, + CheckCriterion, + CheckStep, + ConductorFailure, + EqualsCriterion, + SetpointStep, + Step, + WithinToleranceCriterion, +) + +STEP_BATCH_MAX = 500 +"""Mirror of `append_activities`'s batch cap. 
A single conduct request never +carries more than this many steps; larger procedures split client-side via +multiple sequential runs.""" + + +class _SetpointStepRequest(BaseModel): + """JSON wire shape for a `SetpointStep`.""" + + kind: Literal["setpoint"] + address: str = Field(..., min_length=1) + value: int | float | bool | str | list[Any] + verify: bool = False + + model_config = {"extra": "forbid"} + + +class _ActionStepRequest(BaseModel): + """JSON wire shape for an `ActionStep`.""" + + kind: Literal["action"] + name: str = Field(..., min_length=1) + params: dict[str, Any] = Field(default_factory=dict) + + model_config = {"extra": "forbid"} + + +class _EqualsCriterion(BaseModel): + """JSON wire shape for an `EqualsCriterion`.""" + + kind: Literal["equals"] + expected: int | float | bool | str | list[Any] + + model_config = {"extra": "forbid"} + + +class _WithinToleranceCriterion(BaseModel): + """JSON wire shape for a `WithinToleranceCriterion`.""" + + kind: Literal["within_tolerance"] + expected: float + tolerance: float = Field(..., ge=0.0) + + model_config = {"extra": "forbid"} + + +_CriterionRequest = Annotated[ + _EqualsCriterion | _WithinToleranceCriterion, + Field(discriminator="kind"), +] + + +class _CheckStepRequest(BaseModel): + """JSON wire shape for a `CheckStep`.""" + + kind: Literal["check"] + address: str = Field(..., min_length=1) + criterion: _CriterionRequest + + model_config = {"extra": "forbid"} + + +StepRequest = Annotated[ + _SetpointStepRequest | _ActionStepRequest | _CheckStepRequest, + Field(discriminator="kind"), +] +"""The wire-side step union a conduct request body carries (`list[StepRequest]`).""" + + +class ConductorFailureResponse(BaseModel): + """JSON wire shape for a `ConductorFailure`.""" + + step_index: int | None + source_kind: str + target: str + error_class: str + message: str + + +def criterion_from_wire( + wire: _EqualsCriterion | _WithinToleranceCriterion, +) -> CheckCriterion: + """Build a Conductor `CheckCriterion` 
from a Pydantic wire model. + + The seam between the JSON shape and the in-process Conductor type; REST + routes + MCP tools across the conduct family share it. + """ + if isinstance(wire, _EqualsCriterion): + expected: Any = wire.expected + if isinstance(expected, list): + # wire.expected is a JSON list of Any; tuple-coerce for the in-process EqualsCriterion + return EqualsCriterion(expected=cast("tuple[Any, ...]", tuple(expected))) # pyright: ignore[reportUnknownArgumentType] + return EqualsCriterion(expected=expected) + return WithinToleranceCriterion(expected=wire.expected, tolerance=wire.tolerance) + + +def step_from_wire( + wire: _SetpointStepRequest | _ActionStepRequest | _CheckStepRequest, +) -> Step: + """Build a Conductor `Step` from a Pydantic wire model (REST + MCP share it).""" + if isinstance(wire, _SetpointStepRequest): + value: Any = wire.value + if isinstance(value, list): + return SetpointStep( + address=wire.address, + value=cast("tuple[Any, ...]", tuple(value)), # pyright: ignore[reportUnknownArgumentType] + verify=wire.verify, + ) + return SetpointStep(address=wire.address, value=value, verify=wire.verify) + if isinstance(wire, _ActionStepRequest): + return ActionStep(name=wire.name, params=wire.params) + return CheckStep( + address=wire.address, + criterion=criterion_from_wire(wire.criterion), + ) + + +def failure_to_wire(failure: ConductorFailure) -> ConductorFailureResponse: + """Project a `ConductorFailure` onto its JSON wire shape.""" + return ConductorFailureResponse( + step_index=failure.step_index, + source_kind=failure.source_kind, + target=failure.target, + error_class=failure.error_class, + message=failure.message, + ) diff --git a/apps/api/src/cora/operation/conductor.py b/apps/api/src/cora/operation/conductor.py index 0fff402a05..7c06b03f1e 100644 --- a/apps/api/src/cora/operation/conductor.py +++ b/apps/api/src/cora/operation/conductor.py @@ -138,6 +138,8 @@ from cora.operation.features.complete_procedure.handler import ( Handler 
as CompleteProcedureHandler, ) +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.hold_procedure.handler import Handler as HoldProcedureHandler from cora.operation.features.resume_procedure.command import ResumeProcedure from cora.operation.features.resume_procedure.handler import Handler as ResumeProcedureHandler from cora.operation.features.start_procedure.command import StartProcedure @@ -477,12 +479,21 @@ class ConductorResult: (any simulator touch is disqualifying), so the failure-path result still reports the kind. Do not "fix" the observe-before-dispatch ordering in `_ActuationObserver` without revisiting this contract. + + `held` is True ONLY when `try_conduct` paused the Procedure to `Held` + on a recoverable step failure (and the hold transition itself + succeeded). Every other path (`execute` / `conduct` / `execute_from` + / `reconduct`, and a `try_conduct` whose hold itself failed) leaves it + False. It reflects the ACTUAL transition, not the mere recoverability + of the failure, so a caller can distinguish a resumable `Held` outcome + from a terminal `Aborted` one (both carry `succeeded=False` + `failure`). 
""" procedure_id: UUID completed_count: int failure: ConductorFailure | None = None actuation_kind: ActuationKind | None = None + held: bool = False @property def succeeded(self) -> bool: @@ -578,6 +589,7 @@ def __init__( complete_procedure: CompleteProcedureHandler | None = None, abort_procedure: AbortProcedureHandler | None = None, resume_procedure: ResumeProcedureHandler | None = None, + hold_procedure: HoldProcedureHandler | None = None, ) -> None: self._control_port = control_port self._append_step = append_step @@ -588,6 +600,7 @@ def __init__( self._complete_procedure = complete_procedure self._abort_procedure = abort_procedure self._resume_procedure = resume_procedure + self._hold_procedure = hold_procedure async def execute( self, @@ -879,7 +892,7 @@ async def conduct( # that is what the caller needs to triage. failure = result.failure assert failure is not None # not result.succeeded implies failure - reason = _derive_abort_reason(failure) + reason = _derive_failure_reason(failure) with contextlib.suppress(Exception): await self._abort_procedure( AbortProcedure( @@ -895,6 +908,160 @@ async def conduct( ) return result + async def try_conduct( + self, + *, + procedure_id: UUID, + principal_id: UUID, + correlation_id: UUID, + steps: Sequence[Step], + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> ConductorResult: + """Drive the lifecycle like `conduct()`, but PAUSE to Held on a recoverable failure. + + The pause-capable twin of `conduct()`. Identical start -> execute -> + complete-on-success path; the only divergence is the failure branch: + + - a RECOVERABLE step failure (setpoint / check: re-drivable / + re-runnable on resume) -> best-effort `hold_procedure` (Running -> + Held). On a successful hold the result carries `held=True` so the + caller can offer `reconduct`; if the hold itself fails the + Procedure is left Running (same posture as conduct's best-effort + abort that fails) and `held` stays False. 
+ - a NON-recoverable step failure (an action: an interrupted + acquisition is not auto-resumable, Tier 2) -> best-effort + `abort_procedure`, exactly like `conduct()`. Holding here would + strand a Procedure whose replay tail starts with an acquisition + that `reconduct` can only halt-for-operator on. + - lifecycle failures (start / complete rejected) and a mid-execute + `CancelledError` keep `conduct()`'s behavior verbatim (no hold). + + Requires `start_procedure` + `complete_procedure` + `abort_procedure` + + `hold_procedure` handlers at __init__; raises `RuntimeError` (a + wiring bug) otherwise. + + This is the Tier-1 producer that makes a Held + pinned-resolved-steps + state reachable, so the `reconduct` resume path has something to + resume. See [[project_resumable_conduct_design]] Tier 1. + """ + if ( + self._start_procedure is None + or self._complete_procedure is None + or self._abort_procedure is None + or self._hold_procedure is None + ): + raise RuntimeError( + "Conductor.try_conduct() requires start_procedure + complete_procedure + " + "abort_procedure + hold_procedure handlers at __init__; only execute() is " + "available without them." 
+ ) + envelope_kwargs: dict[str, Any] = { + "principal_id": principal_id, + "correlation_id": correlation_id, + "causation_id": causation_id, + "surface_id": surface_id, + } + try: + await self._start_procedure( + StartProcedure(procedure_id=procedure_id), **envelope_kwargs + ) + except _LIFECYCLE_RERAISE: + raise + except Exception as exc: + return ConductorResult( + procedure_id=procedure_id, + completed_count=0, + failure=ConductorFailure( + step_index=None, + source_kind=_SOURCE_KIND_LIFECYCLE, + target=_LIFECYCLE_TARGET_START, + error_class=type(exc).__name__, + message=str(exc), + ), + ) + try: + result = await self.execute( + procedure_id=procedure_id, + principal_id=principal_id, + correlation_id=correlation_id, + steps=steps, + causation_id=causation_id, + surface_id=surface_id, + ) + except asyncio.CancelledError: + # Mirror conduct(): best-effort abort so the FSM is not orphaned in + # Running, then re-raise. A cancellation is not a recoverable step + # failure, so it aborts rather than pausing to Held. 
+ with contextlib.suppress(Exception): + await self._abort_procedure( + AbortProcedure(procedure_id=procedure_id, reason="cancelled mid-execute"), + **envelope_kwargs, + ) + raise + actuation_kind = result.actuation_kind.value if result.actuation_kind is not None else None + if result.succeeded: + try: + await self._complete_procedure( + CompleteProcedure(procedure_id=procedure_id, actuation_kind=actuation_kind), + **envelope_kwargs, + ) + except _LIFECYCLE_RERAISE: + raise + except Exception as exc: + return ConductorResult( + procedure_id=procedure_id, + completed_count=result.completed_count, + failure=ConductorFailure( + step_index=None, + source_kind=_SOURCE_KIND_LIFECYCLE, + target=_LIFECYCLE_TARGET_COMPLETE, + error_class=type(exc).__name__, + message=str(exc), + ), + ) + return result + failure = result.failure + assert failure is not None # not result.succeeded implies failure + if _is_recoverable_failure(failure): + # Pause-to-Held instead of abort: a setpoint / check failure is + # re-drivable / re-runnable, so keep the conduct resumable. The + # hold is best-effort: if it fails, leave the Procedure Running + # (held stays False) and surface the original step failure, the + # same posture as conduct()'s best-effort abort that fails. + held_ok = False + with contextlib.suppress(Exception): + await self._hold_procedure( + HoldProcedure( + procedure_id=procedure_id, + reason=_derive_failure_reason(failure), + ), + **envelope_kwargs, + ) + held_ok = True + if held_ok: + return ConductorResult( + procedure_id=procedure_id, + completed_count=result.completed_count, + failure=failure, + actuation_kind=result.actuation_kind, + held=True, + ) + return result + # Non-recoverable step failure (action): best-effort abort, exactly + # like conduct(). Holding would strand a Procedure whose replay tail + # starts with an interrupted acquisition. 
+ with contextlib.suppress(Exception): + await self._abort_procedure( + AbortProcedure( + procedure_id=procedure_id, + reason=_derive_failure_reason(failure), + actuation_kind=actuation_kind, + ), + **envelope_kwargs, + ) + return result + async def reconduct( self, *, @@ -1010,7 +1177,7 @@ async def reconduct( await self._abort_procedure( AbortProcedure( procedure_id=procedure_id, - reason=_derive_abort_reason(failure), + reason=_derive_failure_reason(failure), actuation_kind=actuation_kind, ), **envelope_kwargs, @@ -1420,6 +1587,21 @@ def is_acquisition_halt(failure: ConductorFailure | None) -> bool: return failure is not None and failure.error_class == _RESUME_HALT_ERROR_CLASS +def _is_recoverable_failure(failure: ConductorFailure) -> bool: + """True iff a conduct step failure is safe to PAUSE-and-resume, not abort. + + Recoverable = a setpoint or check failure: on `reconduct` a setpoint is + re-driven (idempotent absolute write) and a check is re-run as a fresh + gate, so the conduct can honestly continue from the boundary. An action + failure is NOT recoverable here: an interrupted acquisition is + non-idempotent (Tier 2 per-point decomposition is the real fix), and a + Held Procedure whose replay tail starts with that acquisition could only + halt-for-operator on `reconduct`. This is `try_conduct`'s hold-vs-abort + branch; lifecycle failures never reach it (handled before the step-failure + branch). See [[project_resumable_conduct_design]] Tier 1.""" + return failure.source_kind in (_STEP_KIND_SETPOINT, _STEP_KIND_CHECK) + + def _criterion_matches(criterion: CheckCriterion, value: Any) -> bool: """True iff `value` satisfies `criterion`. @@ -1443,14 +1625,15 @@ def _mismatch_reason(criterion: CheckCriterion, value: Any) -> str: return f"value {value!r} not within {criterion.tolerance} of expected {criterion.expected}" -def _derive_abort_reason(failure: ConductorFailure) -> str: - """Build a Procedure-aggregate-compliant abort reason from a step failure. 
+def _derive_failure_reason(failure: ConductorFailure) -> str: + """Build a Procedure-aggregate-compliant reason string from a step failure. - Truncates to `REASON_MAX_LENGTH` so the AbortProcedure - handler does not reject the cleanup call. The format leads with - the step pointer (kind + index + target) so an operator scanning - the abort reason knows immediately which step in the conducted - sequence killed the Procedure. + Used for both the abort path (`conduct` / `reconduct`) and the + pause-to-Held path (`try_conduct`). Truncates to `REASON_MAX_LENGTH` so + the AbortProcedure / HoldProcedure handler does not reject the call. The + format leads with the step pointer (kind + index + target) so an operator + scanning the reason knows immediately which step in the conducted sequence + halted the Procedure. """ if failure.step_index is None: prefix = f"{failure.source_kind} {failure.target}" diff --git a/apps/api/src/cora/operation/features/conduct_procedure/handler.py b/apps/api/src/cora/operation/features/conduct_procedure/handler.py index 58568d5277..bf016f3411 100644 --- a/apps/api/src/cora/operation/features/conduct_procedure/handler.py +++ b/apps/api/src/cora/operation/features/conduct_procedure/handler.py @@ -42,53 +42,25 @@ call site. 
""" -from collections.abc import Sequence -from typing import TYPE_CHECKING, Protocol +from typing import Protocol from uuid import UUID -from cora.infrastructure.event_envelope import to_new_event from cora.infrastructure.kernel import Kernel from cora.infrastructure.logging import get_logger -from cora.infrastructure.ports import Deny, EventStore -from cora.infrastructure.ports.event_store import StoredEvent +from cora.infrastructure.ports import Deny from cora.infrastructure.routing import NIL_SENTINEL_ID -from cora.operation._recipe_replay import ( - find_recipe_expansion_record, - pins_from_payload, - verify_bindings_hash, - verify_steps_hash, -) +from cora.operation._conduct_preparation import resolve_and_pin_conduct_steps from cora.operation.aggregates.procedure import ( - ProcedureBoundCapabilityDeprecatedError, ProcedureNotFoundError, - ProcedureStepsForbiddenForRecipeDrivenError, - RecipeExpanderVersionMismatchError, - RecipeExpansionRecordNotFoundError, - event_type_name, load_procedure_with_events, - to_payload, ) -from cora.operation.conductor import Conductor, Step, step_to_payload +from cora.operation.conductor import Conductor from cora.operation.errors import UnauthorizedError from cora.operation.features.conduct_procedure.command import ( ConductProcedure, ConductProcedureResult, ) -from cora.operation.features.conduct_procedure.resolved_steps import ( - decide_resolved_steps_recorded, -) from cora.operation.ports.recipe_expander import RecipeExpander -from cora.recipe.aggregates.capability import CapabilityStatus, load_capability -from cora.recipe.aggregates.plan import ( - PlanNotFoundError, - constituents_from_wires, - load_plan, -) -from cora.recipe.aggregates.recipe import load_recipe_at_version -from cora.run.aggregates.run import RunNotFoundError, load_run - -if TYPE_CHECKING: - from cora.operation._pseudoaxis_expander import ConstituentResolver _COMMAND_NAME = "ConductProcedure" @@ -167,97 +139,18 @@ async def handler( if procedure is 
None: raise ProcedureNotFoundError(command.procedure_id) - if procedure.recipe_id is not None: - steps = await _re_expand_steps( - procedure_id=procedure.id, - recipe_id=procedure.recipe_id, - caller_steps=command.steps, - stored_events=stored_events, - event_store=deps.event_store, - expansion_port=expansion_port, - ) - else: - steps = tuple(command.steps) - - # A Phase-of-Run Procedure resolves a pseudoaxis's constituent - # motors from its Run's Plan wires: parent_run_id -> Run.plan_id - # -> Plan.wires (the same load chain start_procedure walks for - # its Supply gate). A missing Run / Plan in that chain is - # corruption, so raise rather than silently skip. Standalone / - # recipe-driven Procedures (no parent_run_id) pass no resolver, so - # any pseudoaxis SetpointStep hits the wiring-deferred default and - # is rejected with PartitionRuleNotFoundError. - # - # This composes with recipe-driven expansion rather than - # conflicting: the recipe block above produces the STEPS; the wires - # resolve each pseudoaxis step's CONSTITUENTS. A Procedure that is - # both recipe-driven and a Run phase gets recipe steps with - # wire-resolved constituents. (Watch item: this loads Run + Plan - # once per conduct; if a high-frequency re-conduct loop makes that - # latency matter, cache per command-lifetime.) 
- constituent_resolver: ConstituentResolver | None = None - if procedure.parent_run_id is not None: - parent_run = await load_run(deps.event_store, procedure.parent_run_id) - if parent_run is None: - raise RunNotFoundError(procedure.parent_run_id) - plan = await load_plan(deps.event_store, parent_run.plan_id) - if plan is None: - raise PlanNotFoundError(parent_run.plan_id) - - def _resolve_constituents(asset_id: UUID) -> tuple[UUID, ...]: - return constituents_from_wires(plan, asset_id) - - constituent_resolver = _resolve_constituents - - # Pre-Conductor PseudoAxis expansion: rewrite any virtual-axis - # SetpointStep into N sequential constituent SetpointSteps so - # the Conductor's existing dispatch loop walks the constituents - # in declared order. ActionStep / CheckStep pass through - # unchanged. PseudoAxis evaluator errors propagate to the - # routes layer for HTTP status mapping - # ([[project-pseudoaxis-design]] v3). - steps = await expansion_port.expand_pseudoaxis( - steps, - event_store=deps.event_store, + steps = await resolve_and_pin_conduct_steps( + deps, + command_name=_COMMAND_NAME, + procedure=procedure, + stored_events=stored_events, + caller_steps=command.steps, + expansion_port=expansion_port, + principal_id=principal_id, correlation_id=correlation_id, - constituent_resolver=constituent_resolver, + causation_id=causation_id, ) - # Pin the resolved step list (after recipe + pseudoaxis expansion) - # BEFORE conducting, so a future resume replays this exact list - # instead of re-deriving it from live Plan.wires / partition rules. - # Provenance-only ResolvedStepsRecorded; the helper emits it only - # while the Procedure is still Defined and returns [] otherwise, - # leaving the Conductor's start_procedure to surface a lifecycle - # failure (keeps the conduct route's failures-in-body contract). 
- resolved_steps_events = decide_resolved_steps_recorded( - procedure, - tuple(step_to_payload(step) for step in steps), - now=deps.clock.now(), - ) - if resolved_steps_events: - _, current_version = await deps.event_store.load( - stream_type="Procedure", stream_id=command.procedure_id - ) - await deps.event_store.append( - stream_type="Procedure", - stream_id=command.procedure_id, - expected_version=current_version, - events=[ - to_new_event( - event_type=event_type_name(event), - payload=to_payload(event), - occurred_at=event.occurred_at, - event_id=deps.id_generator.new_id(), - command_name=_COMMAND_NAME, - correlation_id=correlation_id, - causation_id=causation_id, - principal_id=principal_id, - ) - for event in resolved_steps_events - ], - ) - result = await conductor.conduct( procedure_id=command.procedure_id, principal_id=principal_id, @@ -287,64 +180,3 @@ def _resolve_constituents(asset_id: UUID) -> tuple[UUID, ...]: ) return handler - - -async def _re_expand_steps( - *, - procedure_id: UUID, - recipe_id: UUID, - caller_steps: Sequence[Step], - stored_events: list[StoredEvent], - event_store: EventStore, - expansion_port: RecipeExpander, -) -> tuple[Step, ...]: - """Run the recipe-replay gate per [[project-run-procedure-replay-design]]. - - Six steps: reject non-empty caller steps -> find_recipe_expansion_record - (raise RecipeExpansionRecordNotFoundError on None) -> pins_from_payload - -> port-version strict-equals (raise RecipeExpanderVersionMismatchError - on drift) -> load_recipe_at_version (raise RecipeExpansionRecordNotFoundError - when None on a recipe-driven Procedure; RecipeVersionNotFoundError - propagates from helper) -> load_capability + reject Deprecated - (raise ProcedureBoundCapabilityDeprecatedError, symmetric to - start_run's RunBoundPlanDeprecatedError) -> verify_bindings_hash -> - expand -> verify_steps_hash -> return the re-expanded tuple. 
- """ - if list(caller_steps): - raise ProcedureStepsForbiddenForRecipeDrivenError(procedure_id) - - record = find_recipe_expansion_record(stored_events) - if record is None: - raise RecipeExpansionRecordNotFoundError(procedure_id) - - pins = pins_from_payload(procedure_id, record.payload) - - if pins.expansion_port_version != expansion_port.version: - raise RecipeExpanderVersionMismatchError( - procedure_id, - pins.expansion_port_version, - expansion_port.version, - ) - - recipe = await load_recipe_at_version( - event_store, - recipe_id, - pins.recipe_version, - ) - if recipe is None: - raise RecipeExpansionRecordNotFoundError(procedure_id) - - # Capability-deprecation gate: reject conduct against a tombstoned - # Capability before running the expansion port. Symmetric to - # start_run's RunBoundPlanDeprecatedError. Per the 2026-06-04 domain - # harmony audit: re-expanding a Recipe against a Deprecated - # Capability would silently execute against a contract operators - # have retired. - capability = await load_capability(event_store, recipe.capability_id) - if capability is not None and capability.status == CapabilityStatus.DEPRECATED: - raise ProcedureBoundCapabilityDeprecatedError(procedure_id, recipe.capability_id) - - verify_bindings_hash(procedure_id, pins) - expanded = expansion_port.expand(recipe.steps, dict(pins.bindings)) - verify_steps_hash(procedure_id, expanded, pins) - return expanded diff --git a/apps/api/src/cora/operation/features/conduct_procedure/resolved_steps.py b/apps/api/src/cora/operation/features/conduct_procedure/resolved_steps.py deleted file mode 100644 index 53476f96d8..0000000000 --- a/apps/api/src/cora/operation/features/conduct_procedure/resolved_steps.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Pure decision helper: pin the resolved step list at conduct start. 
- -The `conduct_procedure` orchestration handler calls this AFTER it has -resolved the final step list (recipe re-expansion + pseudoaxis + -constituent resolution) and BEFORE handing the list to the Conductor, so -every conduct pins its resolved steps before any step executes. - -Emitted inline from the conduct flow rather than via a dedicated command -slice: `ResolvedStepsRecorded` is an internal provenance event with no -operator entry point, exactly like `RecipeExpansionRecorded`. Kept as a -pure function so the decision is unit-testable without an event store. -""" - -from collections.abc import Mapping, Sequence -from datetime import datetime -from typing import Any - -from cora.operation.aggregates.procedure import ( - Procedure, - ProcedureStatus, - ResolvedStepsRecorded, -) - - -def decide_resolved_steps_recorded( - state: Procedure | None, - resolved_steps: Sequence[Mapping[str, Any]], - *, - now: datetime, -) -> list[ResolvedStepsRecorded]: - """Pin the resolved step list iff the Procedure is pre-conduct (Defined). - - Returns a single `ResolvedStepsRecorded` when `state` is `Defined` - (the normal conduct path, before `start_procedure` transitions it to - `Running`). Returns `[]` when `state` is None or not `Defined`: a - conduct of a missing / already-running / terminal Procedure records no - resolved steps and lets the Conductor's `start_procedure` produce the normal - lifecycle failure, preserving the conduct route's failures-in-body - contract instead of raising a fresh HTTP error here. 
- """ - if state is None or state.status is not ProcedureStatus.DEFINED: - return [] - steps = tuple(dict(step) for step in resolved_steps) - return [ - ResolvedStepsRecorded( - procedure_id=state.id, - resolved_steps=steps, - step_count=len(steps), - occurred_at=now, - ) - ] diff --git a/apps/api/src/cora/operation/features/conduct_procedure/route.py b/apps/api/src/cora/operation/features/conduct_procedure/route.py index 971593d5e9..97bc90aa68 100644 --- a/apps/api/src/cora/operation/features/conduct_procedure/route.py +++ b/apps/api/src/cora/operation/features/conduct_procedure/route.py @@ -23,20 +23,13 @@ ## Pydantic wire types -The Conductor's `Step = SetpointStep | ActionStep | CheckStep` and -`CheckCriterion = EqualsCriterion | WithinToleranceCriterion` discriminated unions -land on the wire as JSON discriminated unions with a `kind` field. -Pydantic's `Field(discriminator="kind")` validates the union at -parse time so a malformed step kind fails the request with a 422 -before the handler ever runs. - -Per-step `value` and `criterion.expected` are typed broadly -(`int | float | bool | str | list[Any]`) to match the -ControlPort's value vocabulary. Tuples-on-the-wire arrive as lists; -the converter widens to tuple for the in-process Conductor. +The shared step-list body + per-step failure shape live in the BC-level +`cora.operation._conduct_wire` module (reused by `try_conduct_procedure`, +which a slice cannot import directly). This slice owns only the +conduct-specific request/response envelope. 
""" -from typing import Annotated, Any, Literal, cast +from typing import Annotated from uuid import UUID from fastapi import APIRouter, Depends, Path, Request, status @@ -48,15 +41,12 @@ get_principal_id, get_surface_id, ) -from cora.operation.conductor import ( - ActionStep, - CheckCriterion, - CheckStep, - ConductorFailure, - EqualsCriterion, - SetpointStep, - Step, - WithinToleranceCriterion, +from cora.operation._conduct_wire import ( + STEP_BATCH_MAX, + ConductorFailureResponse, + StepRequest, + failure_to_wire, + step_from_wire, ) from cora.operation.features.conduct_procedure.command import ( ConductProcedure, @@ -64,82 +54,15 @@ ) from cora.operation.features.conduct_procedure.handler import Handler -_STEP_BATCH_MAX = 500 -"""Mirror of `append_activities`'s batch cap. A single -`ConductProcedure` request never carries more than this many steps; -larger procedures split client-side via multiple sequential runs.""" - - -class _SetpointStepRequest(BaseModel): - """JSON wire shape for a `SetpointStep`.""" - - kind: Literal["setpoint"] - address: str = Field(..., min_length=1) - value: int | float | bool | str | list[Any] - verify: bool = False - - model_config = {"extra": "forbid"} - - -class _ActionStepRequest(BaseModel): - """JSON wire shape for an `ActionStep`.""" - - kind: Literal["action"] - name: str = Field(..., min_length=1) - params: dict[str, Any] = Field(default_factory=dict) - - model_config = {"extra": "forbid"} - - -class _EqualsCriterion(BaseModel): - """JSON wire shape for an `EqualsCriterion`.""" - - kind: Literal["equals"] - expected: int | float | bool | str | list[Any] - - model_config = {"extra": "forbid"} - - -class _WithinToleranceCriterion(BaseModel): - """JSON wire shape for a `WithinToleranceCriterion`.""" - - kind: Literal["within_tolerance"] - expected: float - tolerance: float = Field(..., ge=0.0) - - model_config = {"extra": "forbid"} - - -_CriterionRequest = Annotated[ - _EqualsCriterion | _WithinToleranceCriterion, - 
Field(discriminator="kind"), -] - - -class _CheckStepRequest(BaseModel): - """JSON wire shape for a `CheckStep`.""" - - kind: Literal["check"] - address: str = Field(..., min_length=1) - criterion: _CriterionRequest - - model_config = {"extra": "forbid"} - - -_StepRequest = Annotated[ - _SetpointStepRequest | _ActionStepRequest | _CheckStepRequest, - Field(discriminator="kind"), -] - class ConductProcedureRequest(BaseModel): """Body for `POST /procedures/{procedure_id}/conduct`.""" - steps: list[_StepRequest] = Field( - default_factory=list[_StepRequest], - max_length=_STEP_BATCH_MAX, + steps: list[StepRequest] = Field( + default_factory=list[StepRequest], + max_length=STEP_BATCH_MAX, description=( - f"Steps the Conductor walks in order (0-{_STEP_BATCH_MAX}). " + f"Steps the Conductor walks in order (0-{STEP_BATCH_MAX}). " "Empty list is valid: start + complete fire with no steps." ), ) @@ -147,16 +70,6 @@ class ConductProcedureRequest(BaseModel): model_config = {"extra": "forbid"} -class _ConductorFailureResponse(BaseModel): - """JSON wire shape for `ConductorFailure`.""" - - step_index: int | None - source_kind: str - target: str - error_class: str - message: str - - class ConductProcedureResponse(BaseModel): """Response body for the conduct_procedure slice. @@ -174,63 +87,10 @@ class ConductProcedureResponse(BaseModel): procedure_id: UUID completed_count: int succeeded: bool - failure: _ConductorFailureResponse | None = None + failure: ConductorFailureResponse | None = None actuation_kind: str | None = None -def criterion_from_wire( - wire: _EqualsCriterion | _WithinToleranceCriterion, -) -> CheckCriterion: - """Build a Conductor `CheckCriterion` from a Pydantic wire model. - - Public because `tool.py` calls it too (MCP + REST share the same - wire schema; the converter is the seam between the JSON shape - and the in-process Conductor type). 
- """ - if isinstance(wire, _EqualsCriterion): - expected: Any = wire.expected - if isinstance(expected, list): - # wire.expected is a JSON list of Any; tuple-coerce for the in-process EqualsCriterion - return EqualsCriterion(expected=cast("tuple[Any, ...]", tuple(expected))) # pyright: ignore[reportUnknownArgumentType] - return EqualsCriterion(expected=expected) - return WithinToleranceCriterion(expected=wire.expected, tolerance=wire.tolerance) - - -def step_from_wire( - wire: _SetpointStepRequest | _ActionStepRequest | _CheckStepRequest, -) -> Step: - """Build a Conductor `Step` from a Pydantic wire model. - - Public because `tool.py` calls it too (MCP + REST share the same - wire schema). - """ - if isinstance(wire, _SetpointStepRequest): - value: Any = wire.value - if isinstance(value, list): - return SetpointStep( - address=wire.address, - value=cast("tuple[Any, ...]", tuple(value)), # pyright: ignore[reportUnknownArgumentType] - verify=wire.verify, - ) - return SetpointStep(address=wire.address, value=value, verify=wire.verify) - if isinstance(wire, _ActionStepRequest): - return ActionStep(name=wire.name, params=wire.params) - return CheckStep( - address=wire.address, - criterion=criterion_from_wire(wire.criterion), - ) - - -def _failure_to_wire(failure: ConductorFailure) -> _ConductorFailureResponse: - return _ConductorFailureResponse( - step_index=failure.step_index, - source_kind=failure.source_kind, - target=failure.target, - error_class=failure.error_class, - message=failure.message, - ) - - def result_to_wire(result: ConductProcedureResult) -> ConductProcedureResponse: """Build a `ConductProcedureResponse` from the slice's `ConductProcedureResult`. 
@@ -240,7 +100,7 @@ def result_to_wire(result: ConductProcedureResult) -> ConductProcedureResponse: procedure_id=result.procedure_id, completed_count=result.completed_count, succeeded=result.succeeded, - failure=_failure_to_wire(result.failure) if result.failure is not None else None, + failure=failure_to_wire(result.failure) if result.failure is not None else None, actuation_kind=result.actuation_kind, ) diff --git a/apps/api/src/cora/operation/features/conduct_procedure/tool.py b/apps/api/src/cora/operation/features/conduct_procedure/tool.py index 41363cf85d..2baa0c9160 100644 --- a/apps/api/src/cora/operation/features/conduct_procedure/tool.py +++ b/apps/api/src/cora/operation/features/conduct_procedure/tool.py @@ -16,13 +16,13 @@ from cora.infrastructure.mcp_principal import get_mcp_principal_id from cora.infrastructure.observability import current_correlation_id from cora.infrastructure.routing import get_mcp_surface_id +from cora.operation._conduct_wire import step_from_wire from cora.operation.features.conduct_procedure.command import ConductProcedure from cora.operation.features.conduct_procedure.handler import Handler from cora.operation.features.conduct_procedure.route import ( ConductProcedureRequest, ConductProcedureResponse, result_to_wire, - step_from_wire, ) diff --git a/apps/api/src/cora/operation/features/try_conduct_procedure/__init__.py b/apps/api/src/cora/operation/features/try_conduct_procedure/__init__.py new file mode 100644 index 0000000000..fdcb64653c --- /dev/null +++ b/apps/api/src/cora/operation/features/try_conduct_procedure/__init__.py @@ -0,0 +1,39 @@ +"""Vertical slice for the `TryConductProcedure` command. + +Pause-capable conduct: the conduct verb-family's third member (conduct = +run-to-terminal, reconduct = resume-and-replay, try-conduct = +pause-to-Held-on-recoverable-failure). 
Hands control to the `Conductor` +runtime which, on a recoverable step failure, pauses the Procedure to `Held` +instead of aborting it, so an operator can `reconduct` from the pinned +resolved steps. Returns a structured `TryConductProcedureResult` whose `held` +flag distinguishes a paused (resumable) outcome from a terminal one. + + from cora.operation.features import try_conduct_procedure + + cmd = try_conduct_procedure.TryConductProcedure(procedure_id=..., steps=(...)) + handler = try_conduct_procedure.bind(deps, conductor=conductor, expansion_port=...) + result = await handler(cmd, principal_id=..., correlation_id=...) +""" + +from cora.operation.features.try_conduct_procedure import tool +from cora.operation.features.try_conduct_procedure.command import ( + TryConductProcedure, + TryConductProcedureResult, +) +from cora.operation.features.try_conduct_procedure.handler import Handler, bind +from cora.operation.features.try_conduct_procedure.route import ( + TryConductProcedureRequest, + TryConductProcedureResponse, + router, +) + +__all__ = [ + "Handler", + "TryConductProcedure", + "TryConductProcedureRequest", + "TryConductProcedureResponse", + "TryConductProcedureResult", + "bind", + "router", + "tool", +] diff --git a/apps/api/src/cora/operation/features/try_conduct_procedure/command.py b/apps/api/src/cora/operation/features/try_conduct_procedure/command.py new file mode 100644 index 0000000000..07ea1d1563 --- /dev/null +++ b/apps/api/src/cora/operation/features/try_conduct_procedure/command.py @@ -0,0 +1,47 @@ +"""The `TryConductProcedure` command -- pause-capable conduct entry point. + +Like `ConductProcedure`, hands control to the `Conductor` runtime; the one +difference is the failure posture. On a RECOVERABLE step failure (a setpoint +or check: re-drivable / re-runnable on resume) the Conductor PAUSES the +Procedure to `Held` instead of aborting it, so the operator can fix the cause +and `reconduct` from the pinned resolved steps. 
A NON-recoverable failure (an +action: an interrupted acquisition), a lifecycle failure, and a mid-execute +cancellation keep `conduct`'s abort posture. + +`steps` is the caller-supplied sequence the Conductor walks (same wire shape +as `ConductProcedure`). +""" + +from collections.abc import Sequence +from dataclasses import dataclass +from uuid import UUID + +from cora.operation.conductor import ConductorFailure, Step + + +@dataclass(frozen=True) +class TryConductProcedure: + """Conduct a Procedure, pausing to Held on a recoverable step failure.""" + + procedure_id: UUID + steps: Sequence[Step] + + +@dataclass(frozen=True) +class TryConductProcedureResult: + """Summary of a `TryConductProcedure` invocation. + + Mirrors `ConductProcedureResult` plus `held`: True iff a recoverable step + failure paused the Procedure to `Held` AND the pause transition itself + succeeded. `held` is what distinguishes a resumable outcome from a + terminal `Aborted` one: both carry `succeeded=False` + `failure`, but only + a `held` Procedure can be `reconduct`-ed. A Procedure whose hold + transition failed (left Running) reports `held=False`. + """ + + procedure_id: UUID + completed_count: int + succeeded: bool + held: bool = False + failure: ConductorFailure | None = None + actuation_kind: str | None = None diff --git a/apps/api/src/cora/operation/features/try_conduct_procedure/handler.py b/apps/api/src/cora/operation/features/try_conduct_procedure/handler.py new file mode 100644 index 0000000000..c4558404d6 --- /dev/null +++ b/apps/api/src/cora/operation/features/try_conduct_procedure/handler.py @@ -0,0 +1,166 @@ +"""Application handler for the `try_conduct_procedure` slice. + +Pause-capable conduct.
A thin orchestrator that delegates to +`Conductor.try_conduct()` (the pause-to-Held twin of `Conductor.conduct()`): +on a recoverable step failure the Conductor pauses the Procedure to `Held` +rather than aborting it, so the operator can `reconduct` from the pinned +resolved steps. This is the Tier-1 producer that makes a Held + pinned-steps +state reachable so the `reconduct` resume path has something to resume. + +Shares the pre-Conductor pipeline (recipe re-expansion + pseudoaxis + +resolved-steps pin) with `conduct_procedure` via the BC-level +`resolve_and_pin_conduct_steps`, and the HTTP/MCP wire shapes via +`_conduct_wire`. It imports NO sibling slice: the cross-slice-independence +fitness forbids that, and the shared seams live outside `features/`. + +## Why no `_decider` + +Like `conduct_procedure`, records no new events directly: the wrapped +start / append / complete / abort / hold handlers (on the Conductor) write. +An orchestration entry point, not an aggregate-state-mutating decider. + +## Authorization scope + +`TryConductProcedure` is authz-checked as its own command. The wrapped +transition handlers each authz internally with their OWN command names; an +operator authorized to call `TryConductProcedure` is NOT automatically +authorized for those individually. Same layering as `conduct_procedure`. 
+""" + +from typing import Protocol +from uuid import UUID + +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.logging import get_logger +from cora.infrastructure.ports import Deny +from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation._conduct_preparation import resolve_and_pin_conduct_steps +from cora.operation.aggregates.procedure import ( + ProcedureNotFoundError, + load_procedure_with_events, +) +from cora.operation.conductor import Conductor +from cora.operation.errors import UnauthorizedError +from cora.operation.features.try_conduct_procedure.command import ( + TryConductProcedure, + TryConductProcedureResult, +) +from cora.operation.ports.recipe_expander import RecipeExpander + +_COMMAND_NAME = "TryConductProcedure" + +_log = get_logger(__name__) + + +class Handler(Protocol): + """Callable interface every try_conduct_procedure handler implements.""" + + async def __call__( + self, + command: TryConductProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> TryConductProcedureResult: ... + + +def bind( + deps: Kernel, + *, + conductor: Conductor, + expansion_port: RecipeExpander, +) -> Handler: + """Build a try_conduct_procedure handler closed over deps + Conductor + port. + + `conductor` is the same BC-internal Conductor `conduct_procedure` uses; it + carries the start / complete / abort / hold handlers (wired at app + composition) that `Conductor.try_conduct` composes. `expansion_port` is + the same instance wired for `register_procedure_from_recipe` + conduct. 
+ """ + + async def handler( + command: TryConductProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, + ) -> TryConductProcedureResult: + _log.info( + "try_conduct_procedure.start", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + step_count=len(command.steps), + principal_id=str(principal_id), + correlation_id=str(correlation_id), + causation_id=str(causation_id) if causation_id is not None else None, + ) + + authz = await deps.authz.authorize( + principal_id=principal_id, + command_name=_COMMAND_NAME, + conduit_id=NIL_SENTINEL_ID, + surface_id=surface_id, + ) + if isinstance(authz, Deny): + _log.info( + "try_conduct_procedure.denied", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + principal_id=str(principal_id), + correlation_id=str(correlation_id), + reason=authz.reason, + ) + raise UnauthorizedError(authz.reason) + + procedure, stored_events = await load_procedure_with_events( + deps.event_store, command.procedure_id + ) + if procedure is None: + raise ProcedureNotFoundError(command.procedure_id) + + steps = await resolve_and_pin_conduct_steps( + deps, + command_name=_COMMAND_NAME, + procedure=procedure, + stored_events=stored_events, + caller_steps=command.steps, + expansion_port=expansion_port, + principal_id=principal_id, + correlation_id=correlation_id, + causation_id=causation_id, + ) + + result = await conductor.try_conduct( + procedure_id=command.procedure_id, + principal_id=principal_id, + correlation_id=correlation_id, + causation_id=causation_id, + surface_id=surface_id, + steps=steps, + ) + + _log.info( + "try_conduct_procedure.success", + command_name=_COMMAND_NAME, + procedure_id=str(command.procedure_id), + completed_count=result.completed_count, + succeeded=result.succeeded, + held=result.held, + failure_class=(result.failure.error_class if result.failure is not None else None), + ) + + return 
TryConductProcedureResult( + procedure_id=result.procedure_id, + completed_count=result.completed_count, + succeeded=result.succeeded, + held=result.held, + failure=result.failure, + actuation_kind=( + result.actuation_kind.value if result.actuation_kind is not None else None + ), + ) + + return handler diff --git a/apps/api/src/cora/operation/features/try_conduct_procedure/route.py b/apps/api/src/cora/operation/features/try_conduct_procedure/route.py new file mode 100644 index 0000000000..ed7c14319c --- /dev/null +++ b/apps/api/src/cora/operation/features/try_conduct_procedure/route.py @@ -0,0 +1,151 @@ +"""HTTP route for the `try_conduct_procedure` slice. + +`POST /procedures/{procedure_id}/try-conduct` accepts the same step-list body +as conduct, but on a RECOVERABLE step failure (a setpoint / check) the +Procedure is PAUSED to `Held` (resumable via `reconduct`) instead of aborted. + +## Response code: always 200, failures in body + +Like `conduct`, this is an orchestration endpoint: step-level failures + the +pause-to-Held outcome are NORMAL operational results that land in the response +body, not HTTP 4xx / 5xx. `held` distinguishes a paused (resumable) outcome +from a terminal `Aborted` one (both carry `succeeded=False` + `failure`). +Only true protocol / auth / validation faults map to HTTP error codes (422 +for malformed JSON, 403 for authz deny). + +## Pydantic wire types + +The shared step-list body + per-step failure shape live in the BC-level +`cora.operation._conduct_wire` module (shared with `conduct_procedure`). This +slice owns only the try-conduct-specific request/response envelope, which adds +the `held` discriminator. 
+""" + +from typing import Annotated +from uuid import UUID + +from fastapi import APIRouter, Depends, Path, Request, status +from pydantic import BaseModel, Field + +from cora.infrastructure.routing import ( + ErrorResponse, + get_correlation_id, + get_principal_id, + get_surface_id, +) +from cora.operation._conduct_wire import ( + STEP_BATCH_MAX, + ConductorFailureResponse, + StepRequest, + failure_to_wire, + step_from_wire, +) +from cora.operation.features.try_conduct_procedure.command import ( + TryConductProcedure, + TryConductProcedureResult, +) +from cora.operation.features.try_conduct_procedure.handler import Handler + + +class TryConductProcedureRequest(BaseModel): + """Body for `POST /procedures/{procedure_id}/try-conduct`.""" + + steps: list[StepRequest] = Field( + default_factory=list[StepRequest], + max_length=STEP_BATCH_MAX, + description=( + f"Steps the Conductor walks in order (0-{STEP_BATCH_MAX}). " + "Empty list is valid: start + complete fire with no steps." + ), + ) + + model_config = {"extra": "forbid"} + + +class TryConductProcedureResponse(BaseModel): + """Response body for the try_conduct_procedure slice. + + `succeeded` is the canonical pass/fail bit; `failure` is non-null iff + `succeeded` is False. `held` is True iff a recoverable step failure paused + the Procedure to `Held` (resumable via `reconduct`); a terminal `Aborted` + outcome carries `succeeded=False` + `failure` + `held=False`. + + `actuation_kind` is the raw `ActuationKind` value the Conductor observed, + or None when nothing instrumented was actuated. Read-only operator + visibility; the gate that consumes it reads the value server-side off the + Procedure stream, never back from this response. 
+ """ + + procedure_id: UUID + completed_count: int + succeeded: bool + held: bool = False + failure: ConductorFailureResponse | None = None + actuation_kind: str | None = None + + +def result_to_wire(result: TryConductProcedureResult) -> TryConductProcedureResponse: + """Build a `TryConductProcedureResponse` from the slice's result. + + Public because `tool.py` calls it too. + """ + return TryConductProcedureResponse( + procedure_id=result.procedure_id, + completed_count=result.completed_count, + succeeded=result.succeeded, + held=result.held, + failure=failure_to_wire(result.failure) if result.failure is not None else None, + actuation_kind=result.actuation_kind, + ) + + +def _get_handler(request: Request) -> Handler: + handler: Handler = request.app.state.operation.try_conduct_procedure + return handler + + +router = APIRouter(tags=["operation"]) + + +@router.post( + "/procedures/{procedure_id}/try-conduct", + status_code=status.HTTP_200_OK, + response_model=TryConductProcedureResponse, + responses={ + status.HTTP_403_FORBIDDEN: { + "model": ErrorResponse, + "description": "Authorize port denied the command.", + }, + status.HTTP_422_UNPROCESSABLE_CONTENT: { + "description": ( + "Request body failed schema validation: unknown step kind, " + "missing required field, batch over cap, invalid criterion shape." + ), + }, + }, + summary=( + "Conduct a Procedure, pausing to Held on a recoverable failure: " + "start -> walk steps -> complete (success) / pause to Held " + "(recoverable setpoint or check failure) / abort (acquisition failure)." 
+ ), +) +async def post_procedures_try_conduct( + procedure_id: Annotated[UUID, Path(description="Target procedure's id.")], + body: TryConductProcedureRequest, + handler: Annotated[Handler, Depends(_get_handler)], + cid: Annotated[UUID, Depends(get_correlation_id)], + principal_id: Annotated[UUID, Depends(get_principal_id)], + surface_id: Annotated[UUID, Depends(get_surface_id)], +) -> TryConductProcedureResponse: + """Conduct a Procedure, pausing to Held on a recoverable failure.""" + command = TryConductProcedure( + procedure_id=procedure_id, + steps=tuple(step_from_wire(s) for s in body.steps), + ) + result = await handler( + command, + principal_id=principal_id, + correlation_id=cid, + surface_id=surface_id, + ) + return result_to_wire(result) diff --git a/apps/api/src/cora/operation/features/try_conduct_procedure/tool.py b/apps/api/src/cora/operation/features/try_conduct_procedure/tool.py new file mode 100644 index 0000000000..a9f1da08eb --- /dev/null +++ b/apps/api/src/cora/operation/features/try_conduct_procedure/tool.py @@ -0,0 +1,86 @@ +"""MCP tool for the `try_conduct_procedure` slice. + +Mirrors the REST route: accepts a discriminated step list, returns a +structured summary. On a recoverable step failure the Procedure is PAUSED to +`Held` (resumable) instead of aborted; `held` in the return value flags that. +Failures land in the return value (not raised); the LLM caller inspects +`succeeded` + `held` + `failure` to decide reconduct / abort / escalation. 
+""" + +from collections.abc import Callable +from typing import Annotated, Any +from uuid import UUID + +from mcp.server.fastmcp import Context, FastMCP +from pydantic import BaseModel, Field + +from cora.infrastructure.mcp_principal import get_mcp_principal_id +from cora.infrastructure.observability import current_correlation_id +from cora.infrastructure.routing import get_mcp_surface_id +from cora.operation._conduct_wire import step_from_wire +from cora.operation.features.try_conduct_procedure.command import TryConductProcedure +from cora.operation.features.try_conduct_procedure.handler import Handler +from cora.operation.features.try_conduct_procedure.route import ( + TryConductProcedureRequest, + TryConductProcedureResponse, + result_to_wire, +) + + +class _ToolResult(BaseModel): + """MCP-shape mirror of `TryConductProcedureResponse` for tool-output validation.""" + + procedure_id: UUID + completed_count: int + succeeded: bool + held: bool = False + failure: dict[str, Any] | None = None + actuation_kind: str | None = None + + +def register(mcp: FastMCP, *, get_handler: Callable[[], Handler]) -> None: + """Register the `try_conduct_procedure` tool on the given MCP server.""" + + @mcp.tool( + name="try_conduct_procedure", + description=( + "Conduct an existing Procedure end-to-end like conduct_procedure, " + "but on a RECOVERABLE step failure (a setpoint write or read-back " + "check) PAUSE the Procedure to Held (resumable via " + "reconduct_procedure) instead of aborting it. An acquisition " + "(action) failure still aborts. Returns a structured summary; " + "`held` is True when the Procedure was paused (resumable). " + "Failures DO NOT raise." 
+ ), + ) + async def try_conduct_procedure_tool( # pyright: ignore[reportUnusedFunction] + ctx: Context[Any, Any, Any], + procedure_id: Annotated[ + UUID, + Field(description="Target procedure's id."), + ], + body: Annotated[ + TryConductProcedureRequest, + Field(description="Step list the Conductor walks in order."), + ], + ) -> _ToolResult: + handler = get_handler() + command = TryConductProcedure( + procedure_id=procedure_id, + steps=tuple(step_from_wire(s) for s in body.steps), + ) + result = await handler( + command, + principal_id=get_mcp_principal_id(ctx), + correlation_id=current_correlation_id(), + surface_id=get_mcp_surface_id(), + ) + wire: TryConductProcedureResponse = result_to_wire(result) + return _ToolResult( + procedure_id=wire.procedure_id, + completed_count=wire.completed_count, + succeeded=wire.succeeded, + held=wire.held, + failure=wire.failure.model_dump() if wire.failure is not None else None, + actuation_kind=wire.actuation_kind, + ) diff --git a/apps/api/src/cora/operation/routes.py b/apps/api/src/cora/operation/routes.py index cbb28d7983..9c71460360 100644 --- a/apps/api/src/cora/operation/routes.py +++ b/apps/api/src/cora/operation/routes.py @@ -104,6 +104,7 @@ start_iteration, start_procedure, truncate_procedure, + try_conduct_procedure, ) @@ -248,6 +249,7 @@ def register_operation_routes(app: FastAPI) -> None: app.include_router(list_procedures.router) app.include_router(list_procedure_iterations.router) app.include_router(conduct_procedure.router) + app.include_router(try_conduct_procedure.router) for validation_cls in ( InvalidProcedureNameError, InvalidProcedureKindError, diff --git a/apps/api/src/cora/operation/tools.py b/apps/api/src/cora/operation/tools.py index 3024ff69b5..04c4e76560 100644 --- a/apps/api/src/cora/operation/tools.py +++ b/apps/api/src/cora/operation/tools.py @@ -31,6 +31,7 @@ from cora.operation.features.start_iteration import tool as start_iteration_tool from cora.operation.features.start_procedure import tool 
as start_procedure_tool from cora.operation.features.truncate_procedure import tool as truncate_procedure_tool +from cora.operation.features.try_conduct_procedure import tool as try_conduct_procedure_tool from cora.operation.wire import OperationHandlers @@ -104,3 +105,7 @@ def register_operation_tools( mcp, get_handler=lambda: get_handlers().conduct_procedure, ) + try_conduct_procedure_tool.register( + mcp, + get_handler=lambda: get_handlers().try_conduct_procedure, + ) diff --git a/apps/api/src/cora/operation/wire.py b/apps/api/src/cora/operation/wire.py index c9dc2bd293..ed2c664c58 100644 --- a/apps/api/src/cora/operation/wire.py +++ b/apps/api/src/cora/operation/wire.py @@ -84,6 +84,7 @@ start_iteration, start_procedure, truncate_procedure, + try_conduct_procedure, ) from cora.operation.ports.control_port import ControlPort @@ -116,6 +117,7 @@ class OperationHandlers: list_procedures: list_procedures.Handler list_procedure_iterations: list_procedure_iterations.Handler conduct_procedure: conduct_procedure.Handler + try_conduct_procedure: try_conduct_procedure.Handler control_port: ControlPort """The ControlPort the Conductor talks to. Surfaced on the bundle so the FastAPI lifespan's teardown can call `aclose()` on it @@ -191,6 +193,14 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> command_name="ResumeProcedure", bc=_BC, ) + # Hoisted likewise so the bundle field AND the Conductor share ONE + # post-tracing hold handler instance; Conductor.try_conduct composes it + # to pause-to-Held on a recoverable conduct failure. 
+ hold_handler = with_tracing( + hold_procedure.bind(deps), + command_name="HoldProcedure", + bc=_BC, + ) append_step_handler = with_tracing( append_activities.bind(deps, step_store=step_store), command_name="AppendProcedureActivities", @@ -214,6 +224,7 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> complete_procedure=complete_handler, abort_procedure=abort_handler, resume_procedure=resume_handler, + hold_procedure=hold_handler, ) # Resume-and-replay orchestration: a thin slice handler over # Conductor.reconduct (which composes resume + execute_from + @@ -223,6 +234,14 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> command_name="ReconductProcedure", bc=_BC, ) + # Pause-capable conduct: a thin slice handler over Conductor.try_conduct + # (which composes start + execute + complete/hold/abort). Reuses the same + # conductor + recipe expander as conduct; no sibling-slice imports. + try_conduct_handler = with_tracing( + try_conduct_procedure.bind(deps, conductor=conductor, expansion_port=recipe_expander), + command_name="TryConductProcedure", + bc=_BC, + ) return OperationHandlers( register_procedure=with_tracing( with_idempotency( @@ -258,11 +277,7 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> command_name="TruncateProcedure", bc=_BC, ), - hold_procedure=with_tracing( - hold_procedure.bind(deps), - command_name="HoldProcedure", - bc=_BC, - ), + hold_procedure=hold_handler, resume_procedure=resume_handler, reconduct_procedure=reconduct_handler, start_iteration=with_tracing( @@ -299,5 +314,6 @@ def wire_operation(deps: Kernel, *, control_port: ControlPort | None = None) -> command_name="ConductProcedure", bc=_BC, ), + try_conduct_procedure=try_conduct_handler, control_port=control_port, ) diff --git a/apps/api/tests/architecture/test_slice_contract.py b/apps/api/tests/architecture/test_slice_contract.py index 6a7cac218c..5d45ee3178 100644 --- 
a/apps/api/tests/architecture/test_slice_contract.py +++ b/apps/api/tests/architecture/test_slice_contract.py @@ -60,6 +60,10 @@ # Conductor.execute_from + complete/abort; no direct event emission. # See [[project_resumable_conduct_design]]. "cora.operation.features.reconduct_procedure", + # Pause-capable conduct entry: delegates Conductor.try_conduct + # (start + execute + complete/hold/abort); no direct event emission. + # See [[project_resumable_conduct_design]]. + "cora.operation.features.try_conduct_procedure", # Bulk-mint sweep: enumerates Assets missing a persistent id and # delegates each to the assign_asset_persistent_id handler; no direct # event emission. See [[project_asset_persistent_id_design]]. diff --git a/apps/api/tests/contract/test_reconduct_procedure_endpoint.py b/apps/api/tests/contract/test_reconduct_procedure_endpoint.py index e3e3756a13..ab6044aaa4 100644 --- a/apps/api/tests/contract/test_reconduct_procedure_endpoint.py +++ b/apps/api/tests/contract/test_reconduct_procedure_endpoint.py @@ -4,17 +4,13 @@ step-list tail. 200 with replay outcomes in body; 404/409/422/500 for protocol / guard / corruption faults. -Note on coverage: the 200 happy path (a clean replay that auto-completes) -requires a `Held` Procedure that carries a PINNED `ResolvedStepsRecorded` -resolved steps. The synchronous conduct flow today pins the resolved steps then runs -to a terminal state (Completed / Aborted) in one call, so there is no -API-reachable `Held + resolved-steps state yet (producing it -- a conduct that -pauses to Held instead of aborting on a halt, or a mid-conduct -cooperative hold -- is a follow-up slice). The clean / halt / step-failure -replay paths are covered end-to-end in -`tests/unit/operation/test_reconduct_procedure_handler.py` against a -seeded Held+resolved steps state. These contract tests cover the -API-reachable guard / fault surfaces. 
+The 200 happy paths are now API-reachable via `try_conduct_procedure`: it +conducts a Procedure that pauses to `Held` on a recoverable step failure, +leaving the pinned `ResolvedStepsRecorded` for `reconduct` to replay. The +test wire-up uses `InMemoryControlPort` with no pre-connected addresses, so a +setpoint fails (recoverable -> Held); reconduct then replays the pinned tail +from the operator's boundary (an empty tail completes; a tail starting with an +acquisition halts-for-operator). """ from typing import Any @@ -31,6 +27,57 @@ def _register(client: TestClient) -> UUID: return UUID(client.post("/procedures", json=body).json()["procedure_id"]) +def _try_conduct_to_held(client: TestClient, steps: list[dict[str, Any]]) -> UUID: + """Register + try-conduct a Procedure to Held (the recoverable setpoint at + index 0 fails on the unconnected port), leaving a pinned resolved-step list + `reconduct` can replay. Returns the Held Procedure's id.""" + pid = _register(client) + held = client.post(f"/procedures/{pid}/try-conduct", json={"steps": steps}) + assert held.status_code == 200 + assert held.json()["held"] is True + return pid + + +@pytest.mark.contract +def test_post_reconduct_completes_held_procedure_with_empty_tail() -> None: + """Reconduct a Held Procedure past the end of its resolved steps (empty + tail): nothing to replay, so it auto-completes (200, succeeded).""" + with TestClient(create_app()) as client: + pid = _try_conduct_to_held( + client, [{"kind": "setpoint", "address": "2bma:x", "value": 1.0}] + ) + # boundary == len(resolved steps): the replayed tail is empty. 
+ response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 1} + ) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is True + assert body["acquisition_halt"] is False + + +@pytest.mark.contract +def test_post_reconduct_halts_on_acquisition_in_replayed_tail() -> None: + """Reconduct replaying a tail that starts with an acquisition halts for the + operator (200, acquisition_halt=True), leaving the Procedure Running.""" + with TestClient(create_app()) as client: + pid = _try_conduct_to_held( + client, + [ + {"kind": "setpoint", "address": "2bma:x", "value": 1.0}, + {"kind": "action", "name": "collect"}, + ], + ) + # boundary == 1 skips the prefix setpoint; the tail starts with the action. + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 1} + ) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is False + assert body["acquisition_halt"] is True + + @pytest.mark.contract def test_post_reconduct_returns_404_for_unknown_id() -> None: with TestClient(create_app()) as client: diff --git a/apps/api/tests/contract/test_try_conduct_procedure_endpoint.py b/apps/api/tests/contract/test_try_conduct_procedure_endpoint.py new file mode 100644 index 0000000000..ee403945ff --- /dev/null +++ b/apps/api/tests/contract/test_try_conduct_procedure_endpoint.py @@ -0,0 +1,86 @@ +"""Contract tests for `POST /procedures/{procedure_id}/try-conduct`. + +Pause-capable conduct: like conduct, but a RECOVERABLE step failure (setpoint +/ check) PAUSES the Procedure to Held (resumable via reconduct) instead of +aborting. Always 200 with the outcome in the body; `held` flags the pause. +404 for an unknown procedure, 422 for a malformed body. + +The test wire-up uses `InMemoryControlPort` with no pre-connected addresses, +so a setpoint to any address fails with ControlNotConnectedError: that is the +recoverable failure this slice pauses on. 
+""" + +from typing import Any +from uuid import UUID, uuid4 + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app + + +def _register(client: TestClient) -> UUID: + body: dict[str, Any] = {"name": "Vessel-A bakeout", "kind": "bakeout"} + return UUID(client.post("/procedures", json=body).json()["procedure_id"]) + + +@pytest.mark.contract +def test_post_try_conduct_empty_steps_completes() -> None: + """An empty step list starts + completes the Procedure (no failure to pause on).""" + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post(f"/procedures/{pid}/try-conduct", json={"steps": []}) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is True + assert body["held"] is False + + +@pytest.mark.contract +def test_post_try_conduct_recoverable_setpoint_pauses_to_held() -> None: + """A setpoint to an unconnected address is recoverable: pause to Held, not abort.""" + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post( + f"/procedures/{pid}/try-conduct", + json={"steps": [{"kind": "setpoint", "address": "2bma:x", "value": 1.0}]}, + ) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is False + assert body["held"] is True + assert body["failure"]["source_kind"] == "setpoint" + + +@pytest.mark.contract +def test_post_try_conduct_action_failure_aborts_not_held() -> None: + """An unregistered action is an acquisition failure: abort (not held).""" + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post( + f"/procedures/{pid}/try-conduct", + json={"steps": [{"kind": "action", "name": "unregistered"}]}, + ) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is False + assert body["held"] is False + assert body["failure"]["source_kind"] == "action" + + +@pytest.mark.contract +def 
test_post_try_conduct_returns_404_for_unknown_id() -> None: + with TestClient(create_app()) as client: + response = client.post(f"/procedures/{uuid4()}/try-conduct", json={"steps": []}) + assert response.status_code == 404 + + +@pytest.mark.contract +def test_post_try_conduct_returns_422_for_unknown_step_kind() -> None: + with TestClient(create_app()) as client: + pid = _register(client) + response = client.post( + f"/procedures/{pid}/try-conduct", + json={"steps": [{"kind": "teleport", "address": "x", "value": 1}]}, + ) + assert response.status_code == 422 diff --git a/apps/api/tests/contract/test_try_conduct_procedure_mcp_tool.py b/apps/api/tests/contract/test_try_conduct_procedure_mcp_tool.py new file mode 100644 index 0000000000..6fb1f452ff --- /dev/null +++ b/apps/api/tests/contract/test_try_conduct_procedure_mcp_tool.py @@ -0,0 +1,72 @@ +"""Contract tests for the `try_conduct_procedure` MCP tool.""" + +from uuid import UUID + +import pytest +from fastapi.testclient import TestClient + +from cora.api.main import create_app +from tests.contract._mcp_helpers import open_session, parse_sse_data + + +def _register_via_mcp(client: TestClient, headers: dict[str, str]) -> UUID: + reg = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 1, + "method": "tools/call", + "params": { + "name": "register_procedure", + "arguments": {"name": "Vessel-A bakeout", "kind": "bakeout"}, + }, + }, + headers=headers, + ) + return UUID(parse_sse_data(reg.text)["result"]["structuredContent"]["procedure_id"]) + + +@pytest.mark.contract +def test_mcp_lists_try_conduct_procedure_tool() -> None: + with TestClient(create_app()) as client: + headers = open_session(client) + response = client.post( + "/mcp", + json={"jsonrpc": "2.0", "id": 99, "method": "tools/list"}, + headers=headers, + ) + body = parse_sse_data(response.text) + tool_names = [t["name"] for t in body["result"]["tools"]] + assert "try_conduct_procedure" in tool_names + + +@pytest.mark.contract +def 
test_mcp_try_conduct_procedure_pauses_to_held() -> None: + """A recoverable setpoint failure pauses the Procedure to Held via the tool; + the structured output carries held=True (the tool wiring is exercised + end-to-end).""" + with TestClient(create_app()) as client: + headers = open_session(client) + pid = _register_via_mcp(client, headers) + response = client.post( + "/mcp", + json={ + "jsonrpc": "2.0", + "id": 3, + "method": "tools/call", + "params": { + "name": "try_conduct_procedure", + "arguments": { + "procedure_id": str(pid), + "body": { + "steps": [{"kind": "setpoint", "address": "2bma:x", "value": 1.0}] + }, + }, + }, + }, + headers=headers, + ) + body = parse_sse_data(response.text) + structured = body["result"]["structuredContent"] + assert structured["held"] is True + assert structured["succeeded"] is False diff --git a/apps/api/tests/unit/operation/test_conduct_procedure_handler.py b/apps/api/tests/unit/operation/test_conduct_procedure_handler.py index 49ad04a5ab..8d25a4dd45 100644 --- a/apps/api/tests/unit/operation/test_conduct_procedure_handler.py +++ b/apps/api/tests/unit/operation/test_conduct_procedure_handler.py @@ -29,6 +29,7 @@ from cora.infrastructure.ports.clock import FakeClock from cora.infrastructure.ports.id_generator import UUIDv7Generator from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation._conduct_wire import criterion_from_wire, step_from_wire from cora.operation.adapters.in_memory_recipe_expander import ( InMemoryRecipeExpander, ) @@ -58,9 +59,7 @@ from cora.operation.features.conduct_procedure.handler import bind from cora.operation.features.conduct_procedure.route import ( ConductProcedureRequest, - criterion_from_wire, result_to_wire, - step_from_wire, ) _NOW = datetime(2026, 6, 2, 12, 0, 0, tzinfo=UTC) diff --git a/apps/api/tests/unit/operation/test_record_resolved_steps.py b/apps/api/tests/unit/operation/test_record_resolved_steps.py index 92b1b97f05..8540e824d8 100644 --- 
a/apps/api/tests/unit/operation/test_record_resolved_steps.py +++ b/apps/api/tests/unit/operation/test_record_resolved_steps.py @@ -16,6 +16,7 @@ import pytest +from cora.operation._conduct_preparation import decide_resolved_steps_recorded from cora.operation.aggregates.procedure import ( ProcedureRegistered, ProcedureStarted, @@ -31,9 +32,6 @@ WithinToleranceCriterion, step_to_payload, ) -from cora.operation.features.conduct_procedure.resolved_steps import ( - decide_resolved_steps_recorded, -) from cora.operation.features.conduct_procedure.route import ( ConductProcedureRequest, step_from_wire, diff --git a/apps/api/tests/unit/operation/test_try_conduct_procedure_handler.py b/apps/api/tests/unit/operation/test_try_conduct_procedure_handler.py new file mode 100644 index 0000000000..5735feb911 --- /dev/null +++ b/apps/api/tests/unit/operation/test_try_conduct_procedure_handler.py @@ -0,0 +1,366 @@ +"""Application-handler tests for `try_conduct_procedure` (pause-to-Held conduct). + +Orchestration handler delegating to `Conductor.try_conduct`. 
Pins the +hold-vs-abort branch + the guards against a real Conductor + real +start/complete/abort/hold handlers over an in-memory store: + + - recoverable setpoint failure -> start + pause to Held (held=True), manifest pinned + - recoverable check failure -> start + pause to Held + - action (acquisition) failure -> start + abort (held=False, Aborted) + - clean run -> start + complete (Completed) + - hold itself fails -> left Running, original failure surfaced + - authz deny -> UnauthorizedError + - unknown procedure -> ProcedureNotFoundError +""" + +from collections.abc import Sequence +from dataclasses import dataclass +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import pytest + +from cora.infrastructure.adapters.in_memory_event_store import InMemoryEventStore +from cora.infrastructure.event_envelope import to_new_event +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.routing import NIL_SENTINEL_ID +from cora.operation.adapters.in_memory_control_port import InMemoryControlPort +from cora.operation.adapters.in_memory_recipe_expander import InMemoryRecipeExpander +from cora.operation.aggregates.procedure import ( + InMemoryActivityStore, + ProcedureNotFoundError, + ProcedureRegistered, + ProcedureStarted, + ProcedureStatus, + event_type_name, + load_procedure, + to_payload, +) +from cora.operation.conductor import ( + ActionStep, + CheckStep, + Conductor, + EqualsCriterion, + SetpointStep, + Step, +) +from cora.operation.errors import UnauthorizedError +from cora.operation.features import ( + abort_procedure, + append_activities, + complete_procedure, + hold_procedure, + start_procedure, + try_conduct_procedure, +) +from cora.operation.features.complete_procedure.command import CompleteProcedure +from cora.operation.features.hold_procedure.command import HoldProcedure +from cora.operation.features.try_conduct_procedure import ( + Handler as TryConductHandler, +) +from cora.operation.features.try_conduct_procedure 
import ( + TryConductProcedure, + TryConductProcedureResult, +) +from tests.unit._helpers import build_deps as _build_deps_shared + +_NOW = datetime(2026, 6, 21, 12, 0, 0, tzinfo=UTC) +_PROCEDURE_ID = UUID("01900000-0000-7000-8000-0000000d0b01") +_PRINCIPAL_ID = UUID("01900000-0000-7000-8000-000000000099") +_CORRELATION_ID = UUID("01900000-0000-7000-8000-0000000000aa") + + +@dataclass +class _LenientIds: + """Conductor id_generator that never exhausts (markers double appends).""" + + def new_id(self) -> UUID: + return uuid4() + + +async def _raising_hold( + command: HoldProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, +) -> None: + _ = (command, principal_id, correlation_id, causation_id, surface_id) + msg = "hold backend unavailable" + raise RuntimeError(msg) + + +async def _raising_complete( + command: CompleteProcedure, + *, + principal_id: UUID, + correlation_id: UUID, + causation_id: UUID | None = None, + surface_id: UUID = NIL_SENTINEL_ID, +) -> None: + _ = (command, principal_id, correlation_id, causation_id, surface_id) + msg = "complete backend unavailable" + raise RuntimeError(msg) + + +def _deps(store: InMemoryEventStore, *, deny: bool = False) -> Kernel: + return _build_deps_shared( + ids=[uuid4() for _ in range(30)], now=_NOW, event_store=store, deny=deny + ) + + +def _make_try_conduct( + deps: Kernel, + port: InMemoryControlPort, + *, + hold_fails: bool = False, + complete_fails: bool = False, +) -> TryConductHandler: + conductor = Conductor( + control_port=port, + append_step=append_activities.bind(deps, step_store=InMemoryActivityStore()), + clock=deps.clock, + id_generator=_LenientIds(), + start_procedure=start_procedure.bind(deps), + complete_procedure=_raising_complete if complete_fails else complete_procedure.bind(deps), + abort_procedure=abort_procedure.bind(deps), + hold_procedure=_raising_hold if hold_fails else hold_procedure.bind(deps), + ) + return 
try_conduct_procedure.bind( + deps, conductor=conductor, expansion_port=InMemoryRecipeExpander() + ) + + +async def _seed_defined(store: InMemoryEventStore) -> None: + """Seed a standalone Defined Procedure (no recipe, no parent Run).""" + event = ProcedureRegistered( + procedure_id=_PROCEDURE_ID, + name="alignment", + kind="alignment", + target_asset_ids=(), + parent_run_id=None, + occurred_at=_NOW, + ) + await store.append( + stream_type="Procedure", + stream_id=_PROCEDURE_ID, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(event), + payload=to_payload(event), + occurred_at=event.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + ], + ) + + +async def _seed_running(store: InMemoryEventStore) -> None: + """Seed a Registered + Started (Running) Procedure so try_conduct's + start_procedure rejects it (Defined-only) as a lifecycle failure.""" + events = [ + ProcedureRegistered( + procedure_id=_PROCEDURE_ID, + name="alignment", + kind="alignment", + target_asset_ids=(), + parent_run_id=None, + occurred_at=_NOW, + ), + ProcedureStarted(procedure_id=_PROCEDURE_ID, occurred_at=_NOW), + ] + await store.append( + stream_type="Procedure", + stream_id=_PROCEDURE_ID, + expected_version=0, + events=[ + to_new_event( + event_type=event_type_name(e), + payload=to_payload(e), + occurred_at=e.occurred_at, + event_id=uuid4(), + command_name="seed", + correlation_id=_CORRELATION_ID, + principal_id=_PRINCIPAL_ID, + ) + for e in events + ], + ) + + +async def _status(store: InMemoryEventStore) -> ProcedureStatus: + state = await load_procedure(store, _PROCEDURE_ID) + assert state is not None + return state.status + + +async def _event_types(store: InMemoryEventStore) -> list[str]: + events, _ = await store.load("Procedure", _PROCEDURE_ID) + return [e.event_type for e in events] + + +async def _call(handler: TryConductHandler, steps: Sequence[Step]) -> TryConductProcedureResult: + 
return await handler( + TryConductProcedure(procedure_id=_PROCEDURE_ID, steps=steps), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + +@pytest.mark.unit +async def test_recoverable_setpoint_failure_pauses_to_held() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() # 2bma:a NOT connected -> write fails (recoverable) + await _seed_defined(store) + result = await _call( + _make_try_conduct(_deps(store), port), + (SetpointStep(address="2bma:a", value=1.0),), + ) + + assert result.succeeded is False + assert result.held is True + assert result.failure is not None + assert result.failure.error_class == "ControlNotConnectedError" + assert await _status(store) is ProcedureStatus.HELD + types = await _event_types(store) + assert "ResolvedStepsRecorded" in types # manifest pinned -> reconduct-ready + assert "ProcedureHeld" in types + assert "ProcedureAborted" not in types + assert "ProcedureCompleted" not in types + + +@pytest.mark.unit +async def test_recoverable_check_failure_pauses_to_held() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() # read of unconnected address fails (recoverable) + await _seed_defined(store) + result = await _call( + _make_try_conduct(_deps(store), port), + (CheckStep(address="2bma:a", criterion=EqualsCriterion(expected=1.0)),), + ) + + assert result.held is True + assert result.failure is not None + assert result.failure.source_kind == "check" + assert await _status(store) is ProcedureStatus.HELD + + +@pytest.mark.unit +async def test_action_failure_aborts_not_held() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() + await _seed_defined(store) + # An unregistered action -> UnknownActionError (source_kind=action), which + # is NOT recoverable: an interrupted acquisition aborts rather than pausing. 
+ result = await _call(_make_try_conduct(_deps(store), port), (ActionStep(name="unregistered"),)) + + assert result.succeeded is False + assert result.held is False + assert result.failure is not None + assert result.failure.source_kind == "action" + assert await _status(store) is ProcedureStatus.ABORTED + + +@pytest.mark.unit +async def test_clean_run_completes() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() + port.simulate_connect("2bma:a") + await _seed_defined(store) + result = await _call( + _make_try_conduct(_deps(store), port), + (SetpointStep(address="2bma:a", value=1.0),), + ) + + assert result.succeeded is True + assert result.held is False + assert result.completed_count == 1 + assert await _status(store) is ProcedureStatus.COMPLETED + assert (await port.read("2bma:a")).value == 1.0 + + +@pytest.mark.unit +async def test_empty_step_list_completes() -> None: + store = InMemoryEventStore() + await _seed_defined(store) + result = await _call(_make_try_conduct(_deps(store), InMemoryControlPort()), ()) + + assert result.succeeded is True + assert result.held is False + assert await _status(store) is ProcedureStatus.COMPLETED + + +@pytest.mark.unit +async def test_hold_itself_failing_leaves_running() -> None: + store = InMemoryEventStore() + port = InMemoryControlPort() # 2bma:a not connected -> recoverable failure + await _seed_defined(store) + result = await _call( + _make_try_conduct(_deps(store), port, hold_fails=True), + (SetpointStep(address="2bma:a", value=1.0),), + ) + + # Recoverable failure, but the hold transition itself failed: leave the + # Procedure Running and surface the original step failure (held=False). 
+ assert result.held is False + assert result.succeeded is False + assert result.failure is not None + assert result.failure.error_class == "ControlNotConnectedError" + assert await _status(store) is ProcedureStatus.RUNNING + types = await _event_types(store) + assert "ProcedureHeld" not in types + assert "ProcedureAborted" not in types + + +@pytest.mark.unit +async def test_raises_unauthorized_on_deny() -> None: + store = InMemoryEventStore() + await _seed_defined(store) + deps = _deps(store, deny=True) + with pytest.raises(UnauthorizedError): + await _call(_make_try_conduct(deps, InMemoryControlPort()), ()) + + +@pytest.mark.unit +async def test_try_conduct_raises_not_found_when_procedure_absent() -> None: + store = InMemoryEventStore() + with pytest.raises(ProcedureNotFoundError): + await _call(_make_try_conduct(_deps(store), InMemoryControlPort()), ()) + + +@pytest.mark.unit +async def test_start_rejected_records_lifecycle_failure() -> None: + """An already-Running Procedure cannot start: a lifecycle failure lands in + the result (not held, not a step failure), and no step runs.""" + store = InMemoryEventStore() + await _seed_running(store) + result = await _call(_make_try_conduct(_deps(store), InMemoryControlPort()), ()) + + assert result.succeeded is False + assert result.held is False + assert result.failure is not None + assert result.failure.source_kind == "lifecycle" + assert result.failure.target == "start" + assert await _status(store) is ProcedureStatus.RUNNING + + +@pytest.mark.unit +async def test_complete_rejected_records_lifecycle_failure() -> None: + """A clean run whose complete transition itself fails records a lifecycle + failure (target=complete), not held.""" + store = InMemoryEventStore() + await _seed_defined(store) + result = await _call( + _make_try_conduct(_deps(store), InMemoryControlPort(), complete_fails=True), () + ) + + assert result.succeeded is False + assert result.held is False + assert result.failure is not None + assert 
result.failure.source_kind == "lifecycle" + assert result.failure.target == "complete" From 390d6adc530a0612d28ed32e9055656450a941f5 Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 19:00:29 +0300 Subject: [PATCH 10/12] feat(operation): surface Held in the procedure read model Resumable conduct landed the Held/Resumed FSM and try_conduct made a Held Procedure operator-reachable, but the summary read model still showed it as Running: the projection was left unsubscribed because the status CHECK admitted only the 5 non-Held values. This completes the read-model story. - Migration 20260621020000 widens the proj_operation_procedure_summary status CHECK to admit 'Held' (drop + re-add the inline-named constraint; loosening a CHECK is non-destructive, so no backfill and no data-safety opt-out). Resumed maps back to 'Running', so 'Held' is the only new persisted value. - ProcedureSummaryProjection subscribes ProcedureHeld -> status='Held' + last_status_reason (the hold reason, like Aborted) and ProcedureResumed -> status='Running' clearing last_status_reason (Running is not reason-bearing). - ListProcedures status filter widened to admit 'Held' (the read-path Literal had the same wrong "locked at 5 day one" assumption the projection did). The projection's old docstring claim that the CHECK was "locked with the full enum day one (5 statuses) so no future migration is needed even if Held/Resumed land" was internally inconsistent (Held was never among those 5) and is corrected. The "hoist at the 5th status arm" note is re-evaluated and the explicit per-event SQL constants are kept: the arms are non-uniform (Truncated sets interrupted_at, Resumed clears the reason), so a parameterized SQL would read worse. 
Coverage: 2 new projection unit tests (Held sets status+reason; Resumed clears reason) + the flipped subscription test, and a real-Postgres integration test that writes 'Held' through the projection and round-trips it via list_procedures -- the only place the widened CHECK is actually exercised (unit tests mock the connection). Contract status-filter test extended to all 6 statuses. Gate reviewed (migration constraint name + safety, projection arg positions, filter/read-path consistency, no other 5-status assumptions). Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/api/openapi.json | 5 +- .../features/list_procedures/query.py | 12 +- .../features/list_procedures/route.py | 2 +- .../cora/operation/projections/procedure.py | 63 ++++++++-- .../contract/test_list_procedures_endpoint.py | 8 +- .../test_held_status_projection_postgres.py | 113 ++++++++++++++++++ .../test_procedure_summary_projection.py | 61 ++++++++-- ...1020000_proc_summary_status_admit_held.sql | 24 ++++ infra/atlas/migrations/atlas.sum | 3 +- 9 files changed, 256 insertions(+), 35 deletions(-) create mode 100644 apps/api/tests/integration/test_held_status_projection_postgres.py create mode 100644 infra/atlas/migrations/20260621020000_proc_summary_status_admit_held.sql diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 37ba084f09..6d30dff3cb 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -36079,7 +36079,7 @@ } }, { - "description": "Optional status filter (one of: Defined, Running, Completed, Aborted, Truncated). Omit to return all statuses.", + "description": "Optional status filter (one of: Defined, Running, Held, Completed, Aborted, Truncated). 
Omit to return all statuses.", "in": "query", "name": "status", "required": false, @@ -36089,6 +36089,7 @@ "enum": [ "Defined", "Running", + "Held", "Completed", "Aborted", "Truncated" @@ -36099,7 +36100,7 @@ "type": "null" } ], - "description": "Optional status filter (one of: Defined, Running, Completed, Aborted, Truncated). Omit to return all statuses.", + "description": "Optional status filter (one of: Defined, Running, Held, Completed, Aborted, Truncated). Omit to return all statuses.", "title": "Status" } }, diff --git a/apps/api/src/cora/operation/features/list_procedures/query.py b/apps/api/src/cora/operation/features/list_procedures/query.py index 02fd8467b8..2e283c8220 100644 --- a/apps/api/src/cora/operation/features/list_procedures/query.py +++ b/apps/api/src/cora/operation/features/list_procedures/query.py @@ -1,15 +1,16 @@ """The `ListProcedures` query: intent dataclass for keyset-paginated list of procedures from the projection. -Four optional filters: status (one of the 5 ProcedureStatus values), +Four optional filters: status (one of the 6 ProcedureStatus values), kind (free-form bare-str discriminator, exact match), parent_run_id (UUID for Phase-of-Run filtering), target_asset_id (UUID for "procedures targeting this Asset" via the GIN index on the target_asset_ids UUID[] column). -`ProcedureStatusFilter` is locked at the full enum width day one -(Defined / Running / Completed / Aborted / Truncated). Same forward- -compat motivation as ListSupplies's SupplyStatusFilter. +`ProcedureStatusFilter` mirrors the full `ProcedureStatus` enum +(Defined / Running / Held / Completed / Aborted / Truncated). `Held` +was added when resumable conduct surfaced it in the read model (the +projection folds ProcedureHeld -> status='Held'). 
Cursor encodes (registered_at, procedure_id) -- `registered_at` is set once at ProcedureRegistered (immutable), so it's a stable keyset @@ -23,6 +24,7 @@ ProcedureStatusFilter = Literal[ "Defined", "Running", + "Held", "Completed", "Aborted", "Truncated", @@ -40,7 +42,7 @@ class ListProcedures: """Page size cap. Default 50, max 100 (route enforces).""" status: ProcedureStatusFilter | None = None - """Optional status filter (one of the five ProcedureStatus values).""" + """Optional status filter (one of the six ProcedureStatus values).""" kind: str | None = None """Optional kind filter (free-form, exact match; for example 'bakeout').""" diff --git a/apps/api/src/cora/operation/features/list_procedures/route.py b/apps/api/src/cora/operation/features/list_procedures/route.py index c630499542..b294571fb7 100644 --- a/apps/api/src/cora/operation/features/list_procedures/route.py +++ b/apps/api/src/cora/operation/features/list_procedures/route.py @@ -99,7 +99,7 @@ async def list_procedures( Query( alias="status", description=( - "Optional status filter (one of: Defined, Running, " + "Optional status filter (one of: Defined, Running, Held, " "Completed, Aborted, Truncated). Omit to return all statuses." 
), ), diff --git a/apps/api/src/cora/operation/projections/procedure.py b/apps/api/src/cora/operation/projections/procedure.py index 78b8e4ed66..c20d883672 100644 --- a/apps/api/src/cora/operation/projections/procedure.py +++ b/apps/api/src/cora/operation/projections/procedure.py @@ -12,6 +12,11 @@ - ProcedureTruncated -> UPDATE status='Truncated' + status-change ts + last_status_reason + interrupted_at + - ProcedureHeld -> UPDATE status='Held' + status-change ts + + last_status_reason + - ProcedureResumed -> UPDATE status='Running' + status-change ts + (clears last_status_reason: + Running is not reason-bearing) - ProcedureActivitiesLogbookOpened -> UPDATE activity_logbook_id (status NOT touched; logbook is orthogonal to lifecycle) @@ -26,16 +31,19 @@ ordered per-stream delivery; equals the count because the start decider enforces strict-successor indexing). -The 4 status-change UPDATEs share the same SQL shape (status literal + -status-change timestamp + optional reason); per-event arms differ only -in which status string + which payload fields they pull. A future -parameterized `_UPDATE_STATUS_SQL` hoist (mirroring proj_supply_summary's -later cleanup) becomes worthwhile when a 5th status-change arm -lands -- today the 4 arms keep the dispatch readable. +The 6 status-change UPDATEs (Started / Completed / Aborted / Truncated / +Held / Resumed) keep per-event SQL constants rather than a parameterized +`_UPDATE_STATUS_SQL`. The "hoist at the 5th arm" note from the 4-arm era +was re-evaluated when Held/Resumed landed: the arms are NOT uniform +(Truncated also sets interrupted_at, Resumed CLEARS last_status_reason +rather than setting it), so a single parameterized SQL would need +conditional columns and read worse than the explicit constants. Revisit +only if a future arm restores uniformity. -All branches idempotent. 
The CHECK constraint on `status` is locked -with the full enum values day one (5 statuses) so no future migration -is needed even if Held/Resumed land later. +All branches idempotent. The status CHECK was widened to admit 'Held' in +migration `20260621020000_proc_summary_status_admit_held` (Resumed maps +back to 'Running', so 'Held' is the only new persisted value). See +[[project_resumable_conduct_design]]. """ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false @@ -91,6 +99,24 @@ WHERE procedure_id = $1 """ +_UPDATE_HELD_SQL = """ +UPDATE proj_operation_procedure_summary +SET status = 'Held', + last_status_changed_at = $2, + last_status_reason = $3, + updated_at = now() +WHERE procedure_id = $1 +""" + +_UPDATE_RESUMED_SQL = """ +UPDATE proj_operation_procedure_summary +SET status = 'Running', + last_status_changed_at = $2, + last_status_reason = NULL, + updated_at = now() +WHERE procedure_id = $1 +""" + _UPDATE_STEPS_LOGBOOK_OPENED_SQL = """ UPDATE proj_operation_procedure_summary SET activity_logbook_id = $2, @@ -117,6 +143,8 @@ class ProcedureSummaryProjection: "ProcedureCompleted", "ProcedureAborted", "ProcedureTruncated", + "ProcedureHeld", + "ProcedureResumed", "ProcedureActivitiesLogbookOpened", "ProcedureIterationStarted", } @@ -191,6 +219,23 @@ async def apply( ) return + if event.event_type == "ProcedureHeld": + await conn.execute( + _UPDATE_HELD_SQL, + UUID(event.payload["procedure_id"]), + datetime.fromisoformat(event.payload["occurred_at"]), + event.payload["reason"], + ) + return + + if event.event_type == "ProcedureResumed": + await conn.execute( + _UPDATE_RESUMED_SQL, + UUID(event.payload["procedure_id"]), + datetime.fromisoformat(event.payload["occurred_at"]), + ) + return + if event.event_type == "ProcedureActivitiesLogbookOpened": await conn.execute( _UPDATE_STEPS_LOGBOOK_OPENED_SQL, diff --git a/apps/api/tests/contract/test_list_procedures_endpoint.py 
b/apps/api/tests/contract/test_list_procedures_endpoint.py index 42dde43c0c..ac26c01504 100644 --- a/apps/api/tests/contract/test_list_procedures_endpoint.py +++ b/apps/api/tests/contract/test_list_procedures_endpoint.py @@ -35,12 +35,10 @@ def test_get_procedures_returns_empty_page_with_no_data(client: TestClient) -> N @pytest.mark.contract @pytest.mark.parametrize( "status_value", - ["Defined", "Running", "Completed", "Aborted", "Truncated"], + ["Defined", "Running", "Held", "Completed", "Aborted", "Truncated"], ) -def test_get_procedures_accepts_each_status_locked_day_one( - client: TestClient, status_value: str -) -> None: - """All 5 statuses accepted; the Literal is locked at the full FSM.""" +def test_get_procedures_accepts_each_status(client: TestClient, status_value: str) -> None: + """All 6 ProcedureStatus values are accepted by the status filter.""" with client: response = client.get(f"/procedures?status={status_value}") assert response.status_code == 200 diff --git a/apps/api/tests/integration/test_held_status_projection_postgres.py b/apps/api/tests/integration/test_held_status_projection_postgres.py new file mode 100644 index 0000000000..763d36b8b9 --- /dev/null +++ b/apps/api/tests/integration/test_held_status_projection_postgres.py @@ -0,0 +1,113 @@ +"""End-to-end: the Held/Resumed FSM drives `status` in the +`proj_operation_procedure_summary` read model against real Postgres. + +This is the only place the widened status CHECK (migration +20260621020000, admitting 'Held') is actually exercised: the projection +unit tests use a mocked connection, so the column constraint is enforced +only here. + +Pins: + - ProcedureHeld folds to status='Held' + last_status_reason (the hold + reason), proving the CHECK admits 'Held'. + - ProcedureResumed folds back to status='Running' and clears + last_status_reason (Running is not reason-bearing). + - The list_procedures read path surfaces + filters on status='Held'. 
+""" + +# pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false + +from datetime import UTC, datetime +from uuid import UUID, uuid4 + +import asyncpg +import pytest + +from cora.infrastructure.kernel import Kernel +from cora.infrastructure.projection import ProjectionRegistry, drain_projections +from cora.operation._projections import register_operation_projections +from cora.operation.features.hold_procedure import HoldProcedure +from cora.operation.features.hold_procedure import bind as bind_hold +from cora.operation.features.list_procedures import ListProcedures +from cora.operation.features.list_procedures import bind as bind_list +from cora.operation.features.register_procedure import RegisterProcedure +from cora.operation.features.register_procedure import bind as bind_register +from cora.operation.features.resume_procedure import ResumeProcedure +from cora.operation.features.resume_procedure import bind as bind_resume +from cora.operation.features.start_procedure import StartProcedure +from cora.operation.features.start_procedure import bind as bind_start +from tests.integration._helpers import build_postgres_deps + +_NOW = datetime(2026, 6, 21, 12, 0, 0, tzinfo=UTC) +_PRINCIPAL_ID = UUID("01900000-0000-7000-8000-000000000099") +_CORRELATION_ID = UUID("01900000-0000-7000-8000-0000000000aa") + + +def _build_deps(db_pool: asyncpg.Pool, ids: list[UUID]) -> Kernel: + return build_postgres_deps(db_pool, now=_NOW, ids=ids) + + +async def _drain(db_pool: asyncpg.Pool) -> None: + registry = ProjectionRegistry() + register_operation_projections(registry) + await drain_projections(db_pool, registry, deadline_seconds=2.0) + + +async def _status_row(db_pool: asyncpg.Pool, proc_id: UUID) -> asyncpg.Record: + async with db_pool.acquire() as conn: + row = await conn.fetchrow( + "SELECT status, last_status_reason FROM proj_operation_procedure_summary " + "WHERE procedure_id = $1", + proc_id, + ) + assert row is not None + 
return row + + +@pytest.mark.integration +async def test_hold_then_resume_drives_status_in_read_model(db_pool: asyncpg.Pool) -> None: + proc_id = uuid4() + deps = _build_deps(db_pool, [proc_id, *[uuid4() for _ in range(8)]]) + + await bind_register(deps)( + RegisterProcedure(name="2-BM center alignment", kind="center_alignment"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + await bind_start(deps)( + StartProcedure(procedure_id=proc_id), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + # Hold: the projection writes status='Held' (the CHECK must admit it) + + # the hold reason. + await bind_hold(deps)( + HoldProcedure(procedure_id=proc_id, reason="beam dropped"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + await _drain(db_pool) + held = await _status_row(db_pool, proc_id) + assert held["status"] == "Held" + assert held["last_status_reason"] == "beam dropped" + + # The list read path surfaces + filters on the new status. + page = await bind_list(deps)( + ListProcedures(status="Held"), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + item = next(i for i in page.items if i.procedure_id == proc_id) + assert item.status == "Held" + assert item.last_status_reason == "beam dropped" + + # Resume: back to Running, hold reason cleared. 
+ await bind_resume(deps)( + ResumeProcedure(procedure_id=proc_id, re_establishment_boundary=0), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + await _drain(db_pool) + resumed = await _status_row(db_pool, proc_id) + assert resumed["status"] == "Running" + assert resumed["last_status_reason"] is None diff --git a/apps/api/tests/unit/operation/test_procedure_summary_projection.py b/apps/api/tests/unit/operation/test_procedure_summary_projection.py index 54e2f5f575..54f381ba86 100644 --- a/apps/api/tests/unit/operation/test_procedure_summary_projection.py +++ b/apps/api/tests/unit/operation/test_procedure_summary_projection.py @@ -49,6 +49,8 @@ def test_projection_metadata() -> None: "ProcedureCompleted", "ProcedureAborted", "ProcedureTruncated", + "ProcedureHeld", + "ProcedureResumed", "ProcedureActivitiesLogbookOpened", "ProcedureIterationStarted", } @@ -71,19 +73,14 @@ def test_projection_does_not_subscribe_to_iteration_ended() -> None: @pytest.mark.unit -def test_projection_does_not_subscribe_to_hold_resume() -> None: - """Tier-1 resumable conduct deliberately leaves the summary read model - untouched: the `status` CHECK constraint admits only the 5 non-Held - statuses, so subscribing to ProcedureHeld/Resumed would write a value - the column rejects. A held Procedure therefore shows its last - subscribed status (Running) in `list_procedures`; terminal states are - still captured because abort/truncate/complete require resuming to - Running first (and those events ARE subscribed). Surfacing `Held` in - the read model is a follow-up that needs a forward-only migration to - widen the CHECK. See [[project_resumable_conduct_design]].""" +def test_projection_subscribes_to_hold_resume() -> None: + """Resumable conduct now surfaces Held in the read model: migration + 20260621020000 widened the `status` CHECK to admit 'Held', so the + projection folds ProcedureHeld -> status='Held' and ProcedureResumed -> + status='Running'. 
See [[project_resumable_conduct_design]].""" proj = ProcedureSummaryProjection() - assert "ProcedureHeld" not in proj.subscribed_event_types - assert "ProcedureResumed" not in proj.subscribed_event_types + assert "ProcedureHeld" in proj.subscribed_event_types + assert "ProcedureResumed" in proj.subscribed_event_types @pytest.mark.unit @@ -229,6 +226,46 @@ async def test_procedure_truncated_handles_null_interrupted_at() -> None: assert conn.execute.call_args.args[4] is None +@pytest.mark.unit +async def test_procedure_held_updates_status_and_reason() -> None: + proj = ProcedureSummaryProjection() + conn = AsyncMock() + event = _stored( + "ProcedureHeld", + { + "procedure_id": str(_PROCEDURE_ID), + "reason": "beam dropped", + "occurred_at": _NOW.isoformat(), + }, + ) + await proj.apply(event, conn) + sql = conn.execute.call_args.args[0] + assert "SET status = 'Held'" in sql + assert conn.execute.call_args.args[1] == _PROCEDURE_ID + assert conn.execute.call_args.args[2] == _NOW + assert conn.execute.call_args.args[3] == "beam dropped" + + +@pytest.mark.unit +async def test_procedure_resumed_updates_status_to_running_and_clears_reason() -> None: + proj = ProcedureSummaryProjection() + conn = AsyncMock() + event = _stored( + "ProcedureResumed", + { + "procedure_id": str(_PROCEDURE_ID), + "re_establishment_boundary": 0, + "occurred_at": _NOW.isoformat(), + }, + ) + await proj.apply(event, conn) + sql = conn.execute.call_args.args[0] + assert "SET status = 'Running'" in sql + assert "last_status_reason = NULL" in sql + assert conn.execute.call_args.args[1] == _PROCEDURE_ID + assert conn.execute.call_args.args[2] == _NOW + + @pytest.mark.unit async def test_procedure_steps_logbook_opened_updates_logbook_id() -> None: proj = ProcedureSummaryProjection() diff --git a/infra/atlas/migrations/20260621020000_proc_summary_status_admit_held.sql b/infra/atlas/migrations/20260621020000_proc_summary_status_admit_held.sql new file mode 100644 index 0000000000..74d4f056f1 --- /dev/null 
+++ b/infra/atlas/migrations/20260621020000_proc_summary_status_admit_held.sql @@ -0,0 +1,24 @@ +-- Procedure summary projection: admit 'Held' in the status CHECK. +-- +-- Tier-1 resumable conduct landed the Held/Resumed FSM (ProcedureHeld / +-- ProcedureResumed) and try_conduct_procedure makes a Held Procedure +-- operator-reachable. The summary read model can now surface it: widen the +-- status CHECK so the ProcedureSummaryProjection can fold ProcedureHeld into +-- status='Held'. ProcedureResumed maps back to 'Running', so 'Held' is the +-- only new persisted status value. +-- +-- The init migration declared the CHECK inline on the column, so Postgres +-- auto-named it proj_operation_procedure_summary_status_check. Drop + re-add +-- with the widened value set. Loosening a CHECK is non-destructive: no +-- existing row (one of the 5 prior statuses) can violate the wider set, so +-- this needs no backfill and no data-safety opt-out. +-- +-- Forward-only: a rollback is a new compensating migration. Mutable read +-- model; cora_app keeps its existing DML grants. 
+ +ALTER TABLE proj_operation_procedure_summary + DROP CONSTRAINT proj_operation_procedure_summary_status_check; + +ALTER TABLE proj_operation_procedure_summary + ADD CONSTRAINT proj_operation_procedure_summary_status_check + CHECK (status IN ('Defined', 'Running', 'Held', 'Completed', 'Aborted', 'Truncated')); diff --git a/infra/atlas/migrations/atlas.sum b/infra/atlas/migrations/atlas.sum index 458d498ea9..6f70d10131 100644 --- a/infra/atlas/migrations/atlas.sum +++ b/infra/atlas/migrations/atlas.sum @@ -1,4 +1,4 @@ -h1:3NR+ptNTFbVBMGyQigNPTFWzWcPDcL9JS+/6qQjJU9A= +h1:4bfwJFPssyTQNRq4W47pIWYikEfRN77XrVVRf3emup4= 20260509120000_init_events.sql h1:GmgCZKfaqXu1m96/cKAks2vhaLWTdEaHTLkFtUo9FXg= 20260509170000_init_idempotency.sql h1:Nbu8DIE4Sv1WiHw3G22+tYffPhKc5Jryw3PMK8wB2zY= 20260510010000_add_event_id.sql h1:RbtYP6uMnOB20zhJ9dNXUi4YVqbmlEzf562pmygnRW8= @@ -145,3 +145,4 @@ h1:3NR+ptNTFbVBMGyQigNPTFWzWcPDcL9JS+/6qQjJU9A= 20260615000000_drop_proj_equipment_assembly_summary_presents_as_family_id.sql h1:dFkJ+S5IkEBbvs3F6kJfpDJC89mc6j0oxN4QyfAptt0= 20260621000000_add_proj_safety_clearance_summary_status_keyset_idx.sql h1:WOKLOYRXatOGUF6/ws3VYcXV7TKojlz9G+iMqZRWlGo= 20260621010000_add_proj_run_summary_running_since.sql h1:P4ERA96MJ3CO4Mnl1WEe1NrE3Emg27U6Zad94QoG+Zs= +20260621020000_proc_summary_status_admit_held.sql h1:TLQQGnfNSe5iKZGt4H067EHfn9NDztzPfUD3riYmujI= From 73512cced3ad0ad42432a6cb7b91b9e6e4d6a725 Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 20:33:07 +0300 Subject: [PATCH 11/12] fix(operation): close gate-review findings on resumable conduct (provenance + cleanup) Addresses the independent multi-axis gate review of PR #276. One confirmed MAJOR plus the verified MINOR/NIT cleanup. MAJOR -- actuation_kind provenance survives hold->resume. 
A conduct that touched a SIMULATED route before pausing to Held, then reconducted from a boundary past that prefix over a physical tail, completed as Physical and slipped past the promote_dataset Simulated/Hybrid gate. Fix: - ProcedureHeld (+ HoldProcedure) carry the conduct's observed-so-far actuation_kind (additive; legacy folds via payload.get -> None). - Conductor.try_conduct passes the observed kind to the hold. - the evolver Held arm MERGES it into Procedure.actuation_kind (merge, not set, so a later manual hold with no kind cannot wipe a prior conduct's kind); Resumed carries it forward. New pure merge_actuation_kinds helper on the aggregate (string-based: the aggregate owns no ActuationKind import, the cross-BC snapshot seam). Carry-forward fitness gains ProcedureHeld as an actuation_kind writer. - reconduct merges the folded prefix kind (passed by the handler) with the replay-tail kind before BOTH complete and abort, and reports the merged kind on the result so the response matches the terminal event. Regression test: Simulated prefix + physical tail -> Completed as Hybrid (not Physical), so the gate still bites. Two residuals documented (boundary past the failure index; acquisition-halt tail kind unpersisted), both narrower than the bug and aligned with the Tier-2 deferral. MINOR: - reconduct rejects re_establishment_boundary past the pinned step count (the resume decider only floors at 0; the bound lives where the manifest is known) -> 400; boundary == count stays valid (complete-with-nothing). - reconduct reuses the shared _conduct_wire.ConductorFailureResponse instead of a private duplicate, so the OpenAPI carries one canonical schema. - fix a stale docstring pointing at the removed resolved_steps.py. NIT: refresh the stale "hoist at the 5th status arm" projection-test note; add reconduct contract tests (boundary-past-count 400, genuine-step-failure abort) + try_conduct/reconduct missing-handler RuntimeError tests. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/api/openapi.json | 53 ++++------------- .../cora/operation/_resolved_steps_replay.py | 2 +- .../aggregates/procedure/__init__.py | 2 + .../operation/aggregates/procedure/events.py | 15 +++++ .../operation/aggregates/procedure/evolver.py | 17 ++++-- .../operation/aggregates/procedure/state.py | 31 ++++++++++ apps/api/src/cora/operation/conductor.py | 38 ++++++++++--- .../features/hold_procedure/command.py | 5 ++ .../features/hold_procedure/decider.py | 1 + .../features/reconduct_procedure/handler.py | 15 +++++ .../features/reconduct_procedure/route.py | 30 +++------- .../test_procedure_evolver_carry_forward.py | 7 ++- .../test_reconduct_procedure_endpoint.py | 34 +++++++++++ .../tests/unit/operation/test_conductor.py | 29 ++++++++++ .../operation/test_hold_procedure_handler.py | 2 + .../unit/operation/test_procedure_events.py | 2 + .../test_procedure_summary_projection.py | 11 ++-- .../test_reconduct_procedure_handler.py | 57 ++++++++++++++++++- 18 files changed, 263 insertions(+), 88 deletions(-) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 6d30dff3cb..9db33a6101 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -10366,7 +10366,7 @@ "failure": { "anyOf": [ { - "$ref": "#/components/schemas/_ConductorFailureResponse" + "$ref": "#/components/schemas/ConductorFailureResponse" }, { "type": "null" @@ -15504,47 +15504,6 @@ "title": "_CheckStepRequest", "type": "object" }, - "_ConductorFailureResponse": { - "description": "JSON wire shape for `ConductorFailure`.", - "properties": { - "error_class": { - "title": "Error Class", - "type": "string" - }, - "message": { - "title": "Message", - "type": "string" - }, - "source_kind": { - "title": "Source Kind", - "type": "string" - }, - "step_index": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ], - "title": "Step Index" - }, - "target": { - "title": "Target", - "type": "string" - } - }, - "required": [ - 
"step_index", - "source_kind", - "target", - "error_class", - "message" - ], - "title": "_ConductorFailureResponse", - "type": "object" - }, "_EqualsCriterion": { "additionalProperties": false, "description": "JSON wire shape for an `EqualsCriterion`.", @@ -37291,6 +37250,16 @@ }, "description": "Successful Response" }, + "400": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + }, + "description": "re_establishment_boundary is past the pinned resolved step count." + }, "403": { "content": { "application/json": { diff --git a/apps/api/src/cora/operation/_resolved_steps_replay.py b/apps/api/src/cora/operation/_resolved_steps_replay.py index 77f6aa8b2a..b91f3064f8 100644 --- a/apps/api/src/cora/operation/_resolved_steps_replay.py +++ b/apps/api/src/cora/operation/_resolved_steps_replay.py @@ -3,7 +3,7 @@ The resume path replays a halted conduct from PINNED resolved steps rather than re-deriving the step list. This module locates the `ResolvedStepsRecorded` provenance event (pinned once at conduct start by -`conduct_procedure/handler.py` + `resolved_steps.py`) in a Procedure stream so +`_conduct_preparation.resolve_and_pin_conduct_steps`) in a Procedure stream so the handler can parse `resolved_steps` back into `Step`s via `conductor.steps_from_payload` and hand them to `Conductor.execute_from`. 
diff --git a/apps/api/src/cora/operation/aggregates/procedure/__init__.py b/apps/api/src/cora/operation/aggregates/procedure/__init__.py index 8b7553473b..c43a31c3fc 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/__init__.py +++ b/apps/api/src/cora/operation/aggregates/procedure/__init__.py @@ -93,6 +93,7 @@ RecipeExpansionReplayMismatchError, ResolvedStepsRecordNotFoundError, StepKind, + merge_actuation_kinds, ) __all__ = [ @@ -172,5 +173,6 @@ "from_stored", "load_procedure", "load_procedure_with_events", + "merge_actuation_kinds", "to_payload", ] diff --git a/apps/api/src/cora/operation/aggregates/procedure/events.py b/apps/api/src/cora/operation/aggregates/procedure/events.py index 397a1c251f..de64a56df3 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/events.py +++ b/apps/api/src/cora/operation/aggregates/procedure/events.py @@ -347,6 +347,16 @@ class ProcedureHeld: hold. NO existence check per the cross-BC eventual-consistency stance. Forward-compat via `payload.get("decided_by_decision_id")` -> None. + `actuation_kind` is the raw `ActuationKind` value the Conductor observed + in the conduct UP TO this pause (None for an operator hold issued outside + a conduct). It is carried so a later resume can fold the pre-hold + provenance with the replay tail's: without it, a `reconduct` from a + boundary past a simulated prefix would complete as `Physical` and slip + past the `promote_dataset` Simulated/Hybrid gate. The evolver merges it + into `Procedure.actuation_kind` (via `merge_actuation_kinds`); + `ProcedureResumed` then carries it forward. Additive: legacy streams fold + via `payload.get("actuation_kind")` -> None. + Status is NOT carried (the event type encodes the transition); the evolver maps `ProcedureHeld -> HELD`. 
""" @@ -355,6 +365,7 @@ class ProcedureHeld: reason: str occurred_at: datetime decided_by_decision_id: UUID | None = None + actuation_kind: str | None = None @dataclass(frozen=True) @@ -586,6 +597,7 @@ def to_payload(event: ProcedureEvent) -> dict[str, Any]: reason=reason, occurred_at=occurred_at, decided_by_decision_id=decided_by_decision_id, + actuation_kind=actuation_kind, ): return { "procedure_id": str(procedure_id), @@ -594,6 +606,7 @@ def to_payload(event: ProcedureEvent) -> dict[str, Any]: str(decided_by_decision_id) if decided_by_decision_id is not None else None ), "occurred_at": occurred_at.isoformat(), + "actuation_kind": actuation_kind, } case ProcedureResumed( procedure_id=procedure_id, @@ -796,6 +809,8 @@ def _build_held() -> ProcedureHeld: UUID(raw_decided_by) if raw_decided_by is not None else None ), occurred_at=datetime.fromisoformat(payload["occurred_at"]), + # Additive: pre-activation streams omit the key -> None. + actuation_kind=payload.get("actuation_kind"), ) return deserialize_or_raise("ProcedureHeld", _build_held) diff --git a/apps/api/src/cora/operation/aggregates/procedure/evolver.py b/apps/api/src/cora/operation/aggregates/procedure/evolver.py index d8f2b56238..7a4d50e81e 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/evolver.py +++ b/apps/api/src/cora/operation/aggregates/procedure/evolver.py @@ -83,6 +83,7 @@ Procedure, ProcedureName, ProcedureStatus, + merge_actuation_kinds, ) @@ -197,10 +198,16 @@ def evolve(state: Procedure | None, event: ProcedureEvent) -> Procedure: ), actuation_kind=prior.actuation_kind, ) - case ProcedureHeld(): - # Operator-pause transition (Running -> Held). Status-only - # change; every non-status field carries verbatim from prior - # (especially the iteration denorms). Mirrors RunHeld. + case ProcedureHeld(actuation_kind=held_actuation_kind): + # Operator-pause transition (Running -> Held). 
Status-only change; + # every non-status field carries verbatim from prior (especially + # the iteration denorms). Mirrors RunHeld. EXCEPT actuation_kind: + # the conduct's observed-so-far kind rides ProcedureHeld and is + # MERGED into state so it survives the hold->resume boundary (a + # reconduct from a boundary past a simulated prefix would otherwise + # complete as Physical and bypass the promote_dataset gate). Merge, + # not set, so a manual operator hold (actuation_kind=None) cannot + # wipe a prior conduct's recorded kind. prior = require_state(state, "ProcedureHeld") return Procedure( id=prior.id, @@ -218,7 +225,7 @@ def evolve(state: Procedure | None, event: ProcedureEvent) -> Procedure: max_consecutive_unconverged_iterations=( prior.max_consecutive_unconverged_iterations ), - actuation_kind=prior.actuation_kind, + actuation_kind=merge_actuation_kinds(prior.actuation_kind, held_actuation_kind), ) case ProcedureResumed(): # Resume transition (Held -> Running). Status-only change; every diff --git a/apps/api/src/cora/operation/aggregates/procedure/state.py b/apps/api/src/cora/operation/aggregates/procedure/state.py index dab6bdc831..cbeca2b2d0 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/state.py +++ b/apps/api/src/cora/operation/aggregates/procedure/state.py @@ -1423,3 +1423,34 @@ class Procedure: enum; state stores the raw string (cross-BC string-snapshot seam, mirroring how the Data BC stores it). Additive-state default None: legacy + pre-activation streams fold cleanly.""" + + +def merge_actuation_kinds(first: str | None, second: str | None) -> str | None: + """Combine two observed actuation-kind values into the honest aggregate kind. + + Mirrors the Conductor `_ActuationObserver`'s flag collapse, but over the + persisted raw string values (an `ActuationKind` value or None) so a resume + can fold the PRE-HOLD conduct's observed kind (carried on `ProcedureHeld`) + with the replay tail's kind before the terminal event. 
Without this, a + reconduct from a boundary past a simulated prefix would complete as + `Physical` and slip past the `promote_dataset` Simulated/Hybrid gate. None + contributes nothing; a `Physical` + `Simulated` mix (or either with + `Hybrid`) collapses to `Hybrid`. Pure + no `ActuationKind` import: the + aggregate stores the raw string by design (the cross-BC snapshot seam).""" + simulated_seen = False + physical_seen = False + for kind in (first, second): + if kind == "Simulated": + simulated_seen = True + elif kind == "Physical": + physical_seen = True + elif kind == "Hybrid": + simulated_seen = True + physical_seen = True + if simulated_seen and physical_seen: + return "Hybrid" + if simulated_seen: + return "Simulated" + if physical_seen: + return "Physical" + return None diff --git a/apps/api/src/cora/operation/conductor.py b/apps/api/src/cora/operation/conductor.py index 7c06b03f1e..8e9197f8e1 100644 --- a/apps/api/src/cora/operation/conductor.py +++ b/apps/api/src/cora/operation/conductor.py @@ -113,7 +113,7 @@ import asyncio import contextlib from collections.abc import AsyncIterator, Awaitable, Callable, Mapping, Sequence -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from enum import StrEnum from typing import Any, Protocol, cast from uuid import UUID @@ -123,7 +123,7 @@ from cora.infrastructure.ports.id_generator import IdGenerator from cora.infrastructure.routing import NIL_SENTINEL_ID from cora.operation._control_dispatch_context import with_dispatch_correlation_id -from cora.operation.aggregates.procedure import ProcedureNotFoundError +from cora.operation.aggregates.procedure import ProcedureNotFoundError, merge_actuation_kinds from cora.operation.errors import CheckFailedError, UnauthorizedError, UnknownActionError from cora.operation.features.abort_procedure.command import AbortProcedure from cora.operation.features.abort_procedure.handler import Handler as AbortProcedureHandler @@ -1035,6 +1035,9 @@ 
async def try_conduct( HoldProcedure( procedure_id=procedure_id, reason=_derive_failure_reason(failure), + # Carry the observed-so-far kind so a later reconduct + # folds the pre-hold provenance with the replay tail. + actuation_kind=actuation_kind, ), **envelope_kwargs, ) @@ -1070,6 +1073,7 @@ async def reconduct( correlation_id: UUID, steps: Sequence[Step], boundary: int, + prior_actuation_kind: str | None = None, causation_id: UUID | None = None, surface_id: UUID = NIL_SENTINEL_ID, ) -> ConductorResult: @@ -1142,10 +1146,24 @@ async def reconduct( causation_id=causation_id, surface_id=surface_id, ) - actuation_kind = result.actuation_kind.value if result.actuation_kind is not None else None + # Fold the pre-hold conduct's kind (carried on the Held procedure, + # passed in by the handler) with the replay tail's observed kind, so a + # boundary>0 resume past a simulated prefix does not complete as + # Physical and bypass the promote_dataset gate. boundary=0 re-observes + # everything, so the merge is a no-op there. + tail_actuation_kind = ( + result.actuation_kind.value if result.actuation_kind is not None else None + ) + actuation_kind = merge_actuation_kinds(prior_actuation_kind, tail_actuation_kind) + # Report the merged kind on the result too, so the response body matches + # the kind threaded onto the terminal event (not just the replay tail). + merged_result = replace( + result, + actuation_kind=(ActuationKind(actuation_kind) if actuation_kind is not None else None), + ) if result.succeeded: # Clean tail (incl. empty tail): auto-complete, threading the - # observed kind onto ProcedureCompleted (Data BC gate carrier). + # merged observed kind onto ProcedureCompleted (Data BC gate carrier). 
try: await self._complete_procedure( CompleteProcedure(procedure_id=procedure_id, actuation_kind=actuation_kind), @@ -1165,10 +1183,16 @@ async def reconduct( message=str(exc), ), ) - return result + return merged_result if is_acquisition_halt(result.failure): # Halt-for-operator: leave the Procedure Running; no transition. - return result + # RESIDUAL: the replay tail's observed kind is NOT persisted here + # (no terminal event), so a later manual complete/abort -- which + # SETs actuation_kind from the command, not merges -- could stamp + # over a tail simulator touch. Narrower than the hold->resume gap + # this method closes; the design-memo second-writer hazard, aligned + # with the Tier-2 acquisition-decomposition deferral. + return merged_result # Genuine step failure: best-effort abort (if abort itself fails, the # original step failure is what surfaces). Mirrors conduct(). failure = result.failure @@ -1182,7 +1206,7 @@ async def reconduct( ), **envelope_kwargs, ) - return result + return merged_result async def _dispatch( self, diff --git a/apps/api/src/cora/operation/features/hold_procedure/command.py b/apps/api/src/cora/operation/features/hold_procedure/command.py index 356c6e972c..861194ca32 100644 --- a/apps/api/src/cora/operation/features/hold_procedure/command.py +++ b/apps/api/src/cora/operation/features/hold_procedure/command.py @@ -27,3 +27,8 @@ class HoldProcedure: procedure_id: UUID reason: str decided_by_decision_id: UUID | None = None + actuation_kind: str | None = None + """The raw `ActuationKind` value the Conductor observed in the conduct up + to this pause. 
`Conductor.try_conduct` sets it so the pre-hold provenance + survives the hold->resume boundary (see `ProcedureHeld.actuation_kind`); an + operator hold issued outside a conduct leaves it None.""" diff --git a/apps/api/src/cora/operation/features/hold_procedure/decider.py b/apps/api/src/cora/operation/features/hold_procedure/decider.py index 136804a286..41b249faf4 100644 --- a/apps/api/src/cora/operation/features/hold_procedure/decider.py +++ b/apps/api/src/cora/operation/features/hold_procedure/decider.py @@ -53,5 +53,6 @@ def decide( reason=reason.value, decided_by_decision_id=command.decided_by_decision_id, occurred_at=now, + actuation_kind=command.actuation_kind, ) ] diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/handler.py b/apps/api/src/cora/operation/features/reconduct_procedure/handler.py index ab0bb6c2ba..2f6c415ded 100644 --- a/apps/api/src/cora/operation/features/reconduct_procedure/handler.py +++ b/apps/api/src/cora/operation/features/reconduct_procedure/handler.py @@ -57,6 +57,7 @@ from cora.infrastructure.routing import NIL_SENTINEL_ID from cora.operation._resolved_steps_replay import find_resolved_steps_record from cora.operation.aggregates.procedure import ( + InvalidProcedureReEstablishmentBoundaryError, ProcedureCannotResumeError, ProcedureNotFoundError, ProcedureStatus, @@ -157,12 +158,26 @@ async def handler( raise ResolvedStepsRecordNotFoundError(command.procedure_id) steps = steps_from_payload(record.payload["resolved_steps"]) + # Upper-bound guard: a boundary PAST the pinned step count would replay + # an empty tail and silently auto-complete with nothing re-driven. The + # resume decider only floors at 0 (it has no manifest to size against); + # the bound lives here, where the manifest is known. `boundary == + # len(steps)` is allowed (a deliberate "everything already done, + # complete" resume); only strictly-past is rejected. 
+ if command.re_establishment_boundary > len(steps): + raise InvalidProcedureReEstablishmentBoundaryError(command.re_establishment_boundary) + result = await conductor.reconduct( procedure_id=command.procedure_id, principal_id=principal_id, correlation_id=correlation_id, steps=steps, boundary=command.re_establishment_boundary, + # The pre-hold conduct's observed kind (folded onto the Held + # Procedure) so the terminal event reflects the FULL provenance, + # not just the replay tail -- guards the promote_dataset gate + # against a boundary>0 resume past a simulated prefix. + prior_actuation_kind=procedure.actuation_kind, causation_id=causation_id, surface_id=surface_id, ) diff --git a/apps/api/src/cora/operation/features/reconduct_procedure/route.py b/apps/api/src/cora/operation/features/reconduct_procedure/route.py index f343b732f9..2e854cb30e 100644 --- a/apps/api/src/cora/operation/features/reconduct_procedure/route.py +++ b/apps/api/src/cora/operation/features/reconduct_procedure/route.py @@ -26,7 +26,7 @@ get_principal_id, get_surface_id, ) -from cora.operation.conductor import ConductorFailure +from cora.operation._conduct_wire import ConductorFailureResponse, failure_to_wire from cora.operation.features.reconduct_procedure.command import ( ReconductProcedure, ReconductProcedureResult, @@ -50,16 +50,6 @@ class ReconductProcedureRequest(BaseModel): model_config = {"extra": "forbid"} -class _ConductorFailureResponse(BaseModel): - """JSON wire shape for `ConductorFailure`.""" - - step_index: int | None - source_kind: str - target: str - error_class: str - message: str - - class ReconductProcedureResponse(BaseModel): """Response body for the reconduct_procedure slice. 
@@ -74,20 +64,10 @@ class ReconductProcedureResponse(BaseModel): succeeded: bool re_establishment_boundary: int acquisition_halt: bool - failure: _ConductorFailureResponse | None = None + failure: ConductorFailureResponse | None = None actuation_kind: str | None = None -def _failure_to_wire(failure: ConductorFailure) -> _ConductorFailureResponse: - return _ConductorFailureResponse( - step_index=failure.step_index, - source_kind=failure.source_kind, - target=failure.target, - error_class=failure.error_class, - message=failure.message, - ) - - def result_to_wire(result: ReconductProcedureResult) -> ReconductProcedureResponse: """Build a `ReconductProcedureResponse` from the slice result. @@ -99,7 +79,7 @@ def result_to_wire(result: ReconductProcedureResult) -> ReconductProcedureRespon succeeded=result.succeeded, re_establishment_boundary=result.re_establishment_boundary, acquisition_halt=result.acquisition_halt, - failure=_failure_to_wire(result.failure) if result.failure is not None else None, + failure=failure_to_wire(result.failure) if result.failure is not None else None, actuation_kind=result.actuation_kind, ) @@ -117,6 +97,10 @@ def _get_handler(request: Request) -> Handler: status_code=status.HTTP_200_OK, response_model=ReconductProcedureResponse, responses={ + status.HTTP_400_BAD_REQUEST: { + "model": ErrorResponse, + "description": "re_establishment_boundary is past the pinned resolved step count.", + }, status.HTTP_403_FORBIDDEN: { "model": ErrorResponse, "description": "Authorize port denied the command.", diff --git a/apps/api/tests/architecture/test_procedure_evolver_carry_forward.py b/apps/api/tests/architecture/test_procedure_evolver_carry_forward.py index 9ff5afa02f..2dde1ec3f6 100644 --- a/apps/api/tests/architecture/test_procedure_evolver_carry_forward.py +++ b/apps/api/tests/architecture/test_procedure_evolver_carry_forward.py @@ -69,8 +69,11 @@ "iteration_count": frozenset({"ProcedureIterationStarted"}), "consecutive_unconverged_iterations": 
frozenset({"ProcedureIterationEnded"}), "max_consecutive_unconverged_iterations": frozenset(), - # Terminal arms snapshot the Conductor's observed kind from the event. - "actuation_kind": frozenset({"ProcedureCompleted", "ProcedureAborted"}), + # Terminal arms snapshot the Conductor's observed kind from the event; + # ProcedureHeld MERGES the conduct's observed-so-far kind into state (via + # merge_actuation_kinds) so the pre-hold provenance survives the + # hold->resume boundary. + "actuation_kind": frozenset({"ProcedureCompleted", "ProcedureAborted", "ProcedureHeld"}), } diff --git a/apps/api/tests/contract/test_reconduct_procedure_endpoint.py b/apps/api/tests/contract/test_reconduct_procedure_endpoint.py index ab6044aaa4..6c218d4005 100644 --- a/apps/api/tests/contract/test_reconduct_procedure_endpoint.py +++ b/apps/api/tests/contract/test_reconduct_procedure_endpoint.py @@ -156,3 +156,37 @@ def test_post_reconduct_returns_422_for_malformed_id() -> None: "/procedures/not-a-uuid/reconduct", json={"re_establishment_boundary": 0} ) assert response.status_code == 422 + + +@pytest.mark.contract +def test_post_reconduct_returns_400_for_boundary_past_step_count() -> None: + """A boundary strictly past the pinned step count is rejected (it would + replay an empty tail and silently auto-complete).""" + with TestClient(create_app()) as client: + pid = _try_conduct_to_held( + client, [{"kind": "setpoint", "address": "2bma:x", "value": 1.0}] + ) + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 2} + ) + assert response.status_code == 400 + + +@pytest.mark.contract +def test_post_reconduct_aborts_on_a_genuine_step_failure() -> None: + """Replaying a tail whose setpoint still fails (unconnected address) aborts: + 200 with succeeded=False + acquisition_halt=False (a genuine step failure, + not an acquisition halt).""" + with TestClient(create_app()) as client: + pid = _try_conduct_to_held( + client, [{"kind": "setpoint", "address": 
"2bma:x", "value": 1.0}] + ) + # boundary 0 re-drives the still-unconnected setpoint -> it fails again. + response = client.post( + f"/procedures/{pid}/reconduct", json={"re_establishment_boundary": 0} + ) + assert response.status_code == 200 + body = response.json() + assert body["succeeded"] is False + assert body["acquisition_halt"] is False + assert body["failure"]["source_kind"] == "setpoint" diff --git a/apps/api/tests/unit/operation/test_conductor.py b/apps/api/tests/unit/operation/test_conductor.py index f38ee0fc7e..967e5bc435 100644 --- a/apps/api/tests/unit/operation/test_conductor.py +++ b/apps/api/tests/unit/operation/test_conductor.py @@ -1023,6 +1023,35 @@ async def test_conduct_without_lifecycle_handlers_raises_runtime_error() -> None ) +@pytest.mark.unit +async def test_try_conduct_without_handlers_raises_runtime_error() -> None: + """try_conduct() requires start + complete + abort + hold; a missing one is + a wiring bug, not a runtime failure, so it propagates.""" + conductor = _conductor(InMemoryControlPort(), _FakeAppendStep()) # no FSM handlers + with pytest.raises(RuntimeError, match="try_conduct"): + await conductor.try_conduct( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=(), + ) + + +@pytest.mark.unit +async def test_reconduct_without_handlers_raises_runtime_error() -> None: + """reconduct() requires resume + complete + abort; a missing one is a + wiring bug, so it propagates.""" + conductor = _conductor(InMemoryControlPort(), _FakeAppendStep()) # no FSM handlers + with pytest.raises(RuntimeError, match="reconduct"): + await conductor.reconduct( + procedure_id=uuid4(), + principal_id=uuid4(), + correlation_id=uuid4(), + steps=(), + boundary=0, + ) + + @pytest.mark.unit async def test_conduct_start_failure_records_lifecycle_failure_without_execute() -> None: """start_procedure rejection -> lifecycle failure; no steps attempted.""" diff --git a/apps/api/tests/unit/operation/test_hold_procedure_handler.py 
b/apps/api/tests/unit/operation/test_hold_procedure_handler.py index 6dc7ef999e..1378ea1025 100644 --- a/apps/api/tests/unit/operation/test_hold_procedure_handler.py +++ b/apps/api/tests/unit/operation/test_hold_procedure_handler.py @@ -61,6 +61,8 @@ async def test_handler_appends_procedure_held_event_with_trimmed_reason() -> Non "reason": "beam dropped", "decided_by_decision_id": None, "occurred_at": _NOW.isoformat(), + # Operator hold (no conduct observer) leaves actuation_kind None. + "actuation_kind": None, } diff --git a/apps/api/tests/unit/operation/test_procedure_events.py b/apps/api/tests/unit/operation/test_procedure_events.py index 09b5870524..9e93702c17 100644 --- a/apps/api/tests/unit/operation/test_procedure_events.py +++ b/apps/api/tests/unit/operation/test_procedure_events.py @@ -769,6 +769,7 @@ def test_to_payload_serializes_procedure_held() -> None: reason="beam dropped", decided_by_decision_id=decision_id, occurred_at=_NOW, + actuation_kind="Simulated", ) ) assert payload == { @@ -776,6 +777,7 @@ def test_to_payload_serializes_procedure_held() -> None: "reason": "beam dropped", "decided_by_decision_id": str(decision_id), "occurred_at": _NOW.isoformat(), + "actuation_kind": "Simulated", } diff --git a/apps/api/tests/unit/operation/test_procedure_summary_projection.py b/apps/api/tests/unit/operation/test_procedure_summary_projection.py index 54f381ba86..6c99e8f5eb 100644 --- a/apps/api/tests/unit/operation/test_procedure_summary_projection.py +++ b/apps/api/tests/unit/operation/test_procedure_summary_projection.py @@ -135,11 +135,12 @@ async def test_procedure_registered_handles_null_parent_run() -> None: assert args[5] is None -# NOTE: the 4 status-change UPDATE arms (Started/Completed/Aborted/Truncated) -# use literal status strings in SQL today (per-event SQL constants in the -# projection). 
When `_UPDATE_STATUS_SQL` parameterized hoist lands (trigger: -# 5th status-change arm), flip these substring assertions to `"SET status = $5"` -# in lockstep with the projection refactor. +# NOTE: the 6 status-change UPDATE arms (Started/Completed/Aborted/Truncated/ +# Held/Resumed) use literal status strings in SQL via per-event SQL constants. +# The old "hoist to a parameterized `_UPDATE_STATUS_SQL` at the 5th arm" plan +# was re-evaluated when Held/Resumed landed and dropped: the arms are NOT +# uniform (Truncated also sets interrupted_at, Resumed CLEARS the reason), so a +# single parameterized SQL reads worse. These substring assertions stay. @pytest.mark.unit diff --git a/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py b/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py index c958787d2e..bd7a7f9b07 100644 --- a/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py +++ b/apps/api/tests/unit/operation/test_reconduct_procedure_handler.py @@ -21,9 +21,11 @@ from cora.infrastructure.adapters.in_memory_event_store import InMemoryEventStore from cora.infrastructure.event_envelope import to_new_event from cora.infrastructure.kernel import Kernel +from cora.operation.adapters.control_port_registry import ControlPortRegistry from cora.operation.adapters.in_memory_control_port import InMemoryControlPort from cora.operation.aggregates.procedure import ( InMemoryActivityStore, + InvalidProcedureReEstablishmentBoundaryError, ProcedureCannotResumeError, ProcedureHeld, ProcedureNotFoundError, @@ -52,6 +54,7 @@ ReconductProcedure, ReconductProcedureResult, ) +from cora.operation.ports.control_port import ActuationKind, ControlPort from cora.run.aggregates.run import RunHeld, RunStarted from cora.run.aggregates.run import event_type_name as run_event_type_name from cora.run.aggregates.run import to_payload as run_to_payload @@ -80,7 +83,7 @@ def _deps(store: InMemoryEventStore, *, deny: bool = False) -> Kernel: ) -def 
_make_reconduct(deps: Kernel, port: InMemoryControlPort) -> ReconductHandler: +def _make_reconduct(deps: Kernel, port: ControlPort) -> ReconductHandler: conductor = Conductor( control_port=port, append_step=append_activities.bind(deps, step_store=InMemoryActivityStore()), @@ -99,9 +102,11 @@ async def _seed_held_with_steps( steps: Sequence[Step], procedure_id: UUID = _PROCEDURE_ID, parent_run_id: UUID | None = None, + held_actuation_kind: str | None = None, ) -> None: """Land a conducted-then-Held Procedure: Registered + ResolvedStepsRecorded - (the pinned resolved steps) + Started + Held.""" + (the pinned resolved steps) + Started + Held. `held_actuation_kind` is the + kind the pre-hold conduct observed (carried on ProcedureHeld).""" resolved = tuple(step_to_payload(s) for s in steps) events = [ ProcedureRegistered( @@ -119,7 +124,12 @@ async def _seed_held_with_steps( occurred_at=_PRIOR, ), ProcedureStarted(procedure_id=procedure_id, occurred_at=_PRIOR), - ProcedureHeld(procedure_id=procedure_id, reason="beam dropped", occurred_at=_PRIOR), + ProcedureHeld( + procedure_id=procedure_id, + reason="beam dropped", + occurred_at=_PRIOR, + actuation_kind=held_actuation_kind, + ), ] await store.append( stream_type="Procedure", @@ -385,3 +395,44 @@ async def test_raises_unauthorized_on_deny() -> None: deps = _deps(store, deny=True) with pytest.raises(UnauthorizedError): await _call(_make_reconduct(deps, InMemoryControlPort()), 0) + + +@pytest.mark.unit +async def test_raises_when_boundary_past_step_count() -> None: + """A boundary strictly past the pinned step count is rejected (it would + replay an empty tail and silently auto-complete). 
boundary == count is + allowed (a deliberate complete-with-nothing resume).""" + store = InMemoryEventStore() + await _seed_held_with_steps(store, steps=(SetpointStep(address="2bma:a", value=1.0),)) + deps = _deps(store) + with pytest.raises(InvalidProcedureReEstablishmentBoundaryError): + await _call(_make_reconduct(deps, InMemoryControlPort()), 2) # only 1 step pinned + + +@pytest.mark.unit +async def test_reconduct_folds_pre_hold_actuation_kind_into_completion() -> None: + """Regression (provenance gate): a conduct that touched a SIMULATED route + before the hold must not complete as Physical when reconducted over a + physical tail. The pre-hold kind carried on ProcedureHeld is merged with + the replay-tail kind, so the terminal event reports Hybrid and the + promote_dataset Simulated/Hybrid gate still bites.""" + store = InMemoryEventStore() + inner = InMemoryControlPort() + inner.simulate_connect("real:a") + registry = ControlPortRegistry() + registry.register("real:", inner, is_simulated=False) # the replay tail is physical + await _seed_held_with_steps( + store, + steps=(SetpointStep(address="real:a", value=1.0),), + held_actuation_kind="Simulated", # the pre-hold prefix touched a simulator + ) + deps = _deps(store) + result = await _call(_make_reconduct(deps, registry), 0) + + assert result.succeeded is True + # Merged, NOT the tail-only Physical -> the response + the terminal event agree. 
+ assert result.actuation_kind == ActuationKind.HYBRID.value + state = await load_procedure(store, _PROCEDURE_ID) + assert state is not None + assert state.status is ProcedureStatus.COMPLETED + assert state.actuation_kind == ActuationKind.HYBRID.value From 9efb1bd4c5bc2b19bee0e209c7bfb573a49088eb Mon Sep 17 00:00:00 2001 From: Doga Gursoy Date: Sun, 21 Jun 2026 22:43:21 +0300 Subject: [PATCH 12/12] fix(operation): store activity payload as real jsonb, not a double-encoded scalar The pool registers a jsonb codec (pool.py set_type_codec encoder=json.dumps), so the event store passes event.payload as a raw dict and it round-trips as a real jsonb OBJECT. The activity store instead did an EXTRA json.dumps before binding, so the codec serialized the already-serialized string AGAIN, storing a double-encoded jsonb SCALAR string. Server-side `payload->>'key'` then returns NULL, which silently turned the conductor's pre-effect in-flight-marker filters (`payload->>'result' IS DISTINCT FROM 'in_flight'`) into no-ops across 17 integration/scenario test files. 16 tolerated the leaked marker rows; the 3 acquisition softioc tests that read exact rows did not -- the long-standing red on CI's (non-required) DB shard 1 that the local gate could not catch (softioc is CI-only). Fix: pass `row.payload` (the dict), matching the event store; the codec encodes it once into a real jsonb object. Reads now decode to dicts (like the event store), so the activity-payload `json.loads(row["payload"])` in 8 test files becomes direct dict access. Adds a real-Postgres regression test asserting `payload->>'result'` extracts the value (not NULL) so the marker filter actually excludes the marker -- the check that would have caught this locally. No production reader of the activities payload exists, so the read-shape change is test-only. 
The decision_inferences adapter has the same json.dumps-into-jsonb pattern; left as-is (nothing queries its jsonb server-side) and called out in the entries.py comment as a latent follow-up. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../operation/aggregates/procedure/entries.py | 18 +++--- .../scenarios/test_2bm_alignment_center.py | 5 +- .../scenarios/test_2bm_alignment_pitch.py | 3 +- .../scenarios/test_2bm_alignment_roll.py | 3 +- .../test_2bm_detector_z_rail_alignment.py | 3 +- .../scenarios/test_2bm_slit_centering.py | 3 +- ...t_acquisitions_against_softioc_postgres.py | 7 +-- ...test_append_activities_handler_postgres.py | 63 +++++++++++++++++-- ...test_conductor_against_softioc_postgres.py | 3 +- 9 files changed, 79 insertions(+), 29 deletions(-) diff --git a/apps/api/src/cora/operation/aggregates/procedure/entries.py b/apps/api/src/cora/operation/aggregates/procedure/entries.py index 2111fe85c0..4b72db4d9a 100644 --- a/apps/api/src/cora/operation/aggregates/procedure/entries.py +++ b/apps/api/src/cora/operation/aggregates/procedure/entries.py @@ -77,7 +77,6 @@ # adapter class. The dataclass + Protocol stay strictly typed for # every caller above the boundary. -import json from dataclasses import dataclass from datetime import datetime from typing import Any, Protocol @@ -170,12 +169,17 @@ async def append(self, rows: list[Activity]) -> None: row.actor_id, row.command_name, row.step_kind, - # asyncpg encodes Python dict to jsonb when the - # column is jsonb-typed; explicit json.dumps - # keeps the contract obvious and matches the - # decision_reasonings adapter's posture (which - # also has a JSON body column). - json.dumps(row.payload), + # Pass the dict; the pool's jsonb codec (pool.py + # set_type_codec encoder=json.dumps) serializes it ONCE + # into a real jsonb OBJECT, exactly like the event store + # passes event.payload. 
An EXTRA json.dumps here (the + # former code) double-encoded it into a jsonb SCALAR + # string, which made server-side `payload->>'key'` + # return NULL and silently no-op'd the conductor's + # in-flight-marker filters. (The decision_inferences + # adapter still json.dumps-es into jsonb; harmless only + # while nothing queries its jsonb server-side.) + row.payload, row.sampled_at, row.occurred_at, row.correlation_id, diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py index b463f22b21..313cfde7bc 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_center.py @@ -85,7 +85,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, datetime from typing import Any from uuid import UUID, uuid4 @@ -929,7 +928,7 @@ def t(seconds: int) -> datetime: # The final setpoint records the calibrated rotation-axis pixel position -- # the artifact a downstream science scan will read. - final_setpoint_payload = json.loads(step_rows[-1]["payload"]) + final_setpoint_payload = step_rows[-1]["payload"] assert final_setpoint_payload["channel"] == "RotationCenter" assert final_setpoint_payload["target_value"] == 1024.5 assert final_setpoint_payload["units"] == "px" @@ -938,7 +937,7 @@ def t(seconds: int) -> datetime: # judgment + supporting evidence. Iteration is no longer encoded here # (no `evidence['iteration']`); it is first-class, asserted via the # per-iteration read model below. 
- convergence_check_payload = json.loads(step_rows[11]["payload"]) + convergence_check_payload = step_rows[11]["payload"] assert convergence_check_payload["passed"] is True assert convergence_check_payload["source"] == "live_tomostream_centroid" assert convergence_check_payload["evidence"]["offset_px"] == 0.5 diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py index 6d15ad966d..fcdf0a7583 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_pitch.py @@ -102,7 +102,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, datetime from typing import Any from uuid import UUID, uuid4 @@ -689,7 +688,7 @@ async def test_pitch_alignment_plays_out_end_to_end( # The convergence Check (iteration 2's 180° check) records the sharpness # delta. Iteration is no longer encoded here (no `evidence['iteration']`); # it is first-class, asserted via the per-iteration read model below. 
- convergence_check_payload = json.loads(rows[12]["payload"]) + convergence_check_payload = rows[12]["payload"] assert convergence_check_payload["passed"] is True assert convergence_check_payload["evidence"]["delta_sharpness"] == 0.02 assert "iteration" not in convergence_check_payload["evidence"] diff --git a/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py b/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py index e25dc682bb..728dcadee7 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py +++ b/apps/api/tests/integration/scenarios/test_2bm_alignment_roll.py @@ -100,7 +100,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, datetime from typing import Any from uuid import UUID, uuid4 @@ -678,7 +677,7 @@ async def test_roll_alignment_plays_out_end_to_end( # judgment + supporting evidence. Iteration is no longer encoded here # (no `evidence['iteration']`); it is first-class, asserted via the # per-iteration read model below. 
- convergence_check_payload = json.loads(rows[12]["payload"]) + convergence_check_payload = rows[12]["payload"] assert convergence_check_payload["passed"] is True assert convergence_check_payload["evidence"]["delta_y_px"] == 0.3 assert "iteration" not in convergence_check_payload["evidence"] diff --git a/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py b/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py index aee487af5e..120d047734 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py +++ b/apps/api/tests/integration/scenarios/test_2bm_detector_z_rail_alignment.py @@ -518,9 +518,8 @@ def t(seconds: int) -> datetime: "setpoint", "setpoint", # finalize: converged AX + AY ] - import json - last = json.loads(step_rows[-1]["payload"]) + last = step_rows[-1]["payload"] assert last["channel"] == "DetectorTable.AY" assert last["units"] == "deg" diff --git a/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py b/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py index da1cf587d0..5bda2a3ed1 100644 --- a/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py +++ b/apps/api/tests/integration/scenarios/test_2bm_slit_centering.py @@ -22,7 +22,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, datetime from typing import Any from uuid import UUID, uuid4 @@ -370,7 +369,7 @@ def t(seconds: int) -> datetime: "action", "check", # close ] - close_size = json.loads(step_rows[4]["payload"]) + close_size = step_rows[4]["payload"] assert close_size["channel"] == "Hsize" assert close_size["target_value"] == 0.5 diff --git a/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py b/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py index 607acd7007..524455f51d 100644 --- 
a/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py +++ b/apps/api/tests/integration/test_acquisitions_against_softioc_postgres.py @@ -23,7 +23,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, datetime from uuid import UUID @@ -186,7 +185,7 @@ async def test_conductor_runs_collect_action_against_real_softioc_and_postgres( procedure_id, ) assert [r["step_kind"] for r in rows] == ["action"] - payload = json.loads(rows[0]["payload"]) + payload = rows[0]["payload"] assert payload["name"] == "collect" assert payload["result"] == "ok" result_data = payload["result_data"] @@ -277,7 +276,7 @@ async def test_conductor_runs_discrete_action_walks_axis_with_per_point_collects """, procedure_id, ) - payload = json.loads(rows[0]["payload"]) + payload = rows[0]["payload"] assert payload["name"] == "discrete" assert payload["result"] == "ok" result_data = payload["result_data"] @@ -366,7 +365,7 @@ async def test_conductor_runs_continuous_action_with_axis_sweep_against_softioc( """, procedure_id, ) - payload = json.loads(rows[0]["payload"]) + payload = rows[0]["payload"] assert payload["name"] == "continuous" assert payload["result"] == "ok" result_data = payload["result_data"] diff --git a/apps/api/tests/integration/test_append_activities_handler_postgres.py b/apps/api/tests/integration/test_append_activities_handler_postgres.py index 997d354695..1ca26d607e 100644 --- a/apps/api/tests/integration/test_append_activities_handler_postgres.py +++ b/apps/api/tests/integration/test_append_activities_handler_postgres.py @@ -12,7 +12,6 @@ # pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false -import json from datetime import UTC, datetime from uuid import UUID, uuid4 @@ -207,7 +206,7 @@ async def test_append_activities_lazy_open_and_polymorphic_round_trip( # recorded_at is DEFAULT now() at the DB layer; must come 
AFTER occurred_at. assert setpoint_row["recorded_at"] >= setpoint_row["occurred_at"] # asyncpg returns jsonb as a JSON string for plain SELECT; decode it. - setpoint_payload = json.loads(setpoint_row["payload"]) + setpoint_payload = setpoint_row["payload"] assert setpoint_payload == { "channel": "T_oven", "target_value": 423.0, @@ -215,10 +214,10 @@ async def test_append_activities_lazy_open_and_polymorphic_round_trip( "ramp_rate": 5.0, } - action_payload = json.loads(by_kind["action"]["payload"]) + action_payload = by_kind["action"]["payload"] assert action_payload == {"action_name": "open_valve", "params": {"valve": "V12"}} - check_payload = json.loads(by_kind["check"]["payload"]) + check_payload = by_kind["check"]["payload"] assert check_payload["passed"] is True assert check_payload["actual"] == 422.8 @@ -308,7 +307,7 @@ async def test_append_activities_dedups_on_event_id_in_postgres( rows = await _read_steps_for_procedure(db_pool, procedure_id) assert len(rows) == 1 assert rows[0]["step_kind"] == "setpoint" - payload = json.loads(rows[0]["payload"]) + payload = rows[0]["payload"] assert payload == {"channel": "X", "target_value": 1.0} @@ -317,3 +316,57 @@ async def test_postgres_step_store_handles_empty_batch(db_pool: asyncpg.Pool) -> """Empty batch is a no-op at the adapter layer (early return).""" store = PostgresActivityStore(db_pool) await store.append([]) # No exception, no rows touched. + + +@pytest.mark.integration +async def test_payload_stores_as_real_jsonb_so_server_side_filters_work( + db_pool: asyncpg.Pool, +) -> None: + """Regression: payload must persist as a real jsonb OBJECT (not a double- + encoded jsonb scalar string), so server-side `payload->>'key'` works. 
When + payload was double-encoded (json.dumps bound to a jsonb column with no + `::jsonb` cast), `payload->>'result'` returned NULL and the conductor's + in-flight-marker filters (`payload->>'result' IS DISTINCT FROM 'in_flight'`) + silently no-op'd, leaking marker rows into assertions.""" + procedure_id = UUID("01900000-0000-7000-8000-0000010c0d01") + logbook_id = UUID("01900000-0000-7000-8000-0000010c0d02") + open_event_id = UUID("01900000-0000-7000-8000-0000010c0d03") + deps = build_postgres_deps(db_pool, now=_NOW, ids=[logbook_id, open_event_id]) + step_store = PostgresActivityStore(db_pool) + await _seed_running_procedure(deps.event_store, procedure_id) + + handler = bind_append(deps, step_store=step_store) + await handler( + AppendProcedureActivities( + procedure_id=procedure_id, + entries=( + _entry( + event_id=UUID("01900000-0000-7000-8000-0000010c0e01"), + step_kind="setpoint", + payload={"address": "2bma:x", "result": "in_flight"}, + sampled_at=datetime(2026, 5, 15, 12, 0, 1, tzinfo=UTC), + ), + _entry( + event_id=UUID("01900000-0000-7000-8000-0000010c0e02"), + step_kind="setpoint", + payload={"address": "2bma:x", "result": "ok"}, + sampled_at=datetime(2026, 5, 15, 12, 0, 2, tzinfo=UTC), + ), + ), + ), + principal_id=_PRINCIPAL_ID, + correlation_id=_CORRELATION_ID, + ) + + async with db_pool.acquire() as conn: + # Server-side extraction returns the actual value (not NULL), so the + # marker filter excludes the in_flight row and keeps only the outcome. 
+ rows = await conn.fetch( + """ + SELECT payload->>'result' AS result + FROM entries_operation_procedure_activities + WHERE procedure_id = $1 AND payload->>'result' IS DISTINCT FROM 'in_flight' + """, + procedure_id, + ) + assert [r["result"] for r in rows] == ["ok"] diff --git a/apps/api/tests/integration/test_conductor_against_softioc_postgres.py b/apps/api/tests/integration/test_conductor_against_softioc_postgres.py index 83b191d2b4..88c24c0439 100644 --- a/apps/api/tests/integration/test_conductor_against_softioc_postgres.py +++ b/apps/api/tests/integration/test_conductor_against_softioc_postgres.py @@ -169,9 +169,8 @@ async def test_conductor_runs_setpoint_check_against_real_softioc_and_postgres( """, procedure_id, ) - import json - parsed = [(r["step_kind"], json.loads(r["payload"])) for r in rows] + parsed = [(r["step_kind"], r["payload"]) for r in rows] # The setpoint is side-effecting: it records a pre-effect in-flight # marker then the `ok` outcome, both round-tripping into Postgres. The # check (pure read) records only its outcome -- no marker.