Skip to content

Commit 9b818ff

Browse files
author
Mateusz
committed
fix(acp): address review findings in stale kill implementation
- Mirror DISABLE_STALE_ACP_AGENT_KILLS to os.environ in server_applicator - Add top-level exception handler in _run() to prevent silent task failures - Return True (proceed with kill) when psutil is unavailable but PID matches - Document psutil as required dependency with defensive ImportError fallback
1 parent 609ecbf commit 9b818ff

7 files changed

Lines changed: 95 additions & 40 deletions

File tree

docs/user_guide/backends/overview.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ To change the idle delay:
6161
- Environment: `STALE_ACP_AGENT_KILL_IDLE_SECONDS=<seconds>`
6262
- Configuration file: `stale_acp_agent_kill_idle_seconds: <seconds>`
6363

64-
When **psutil** is installed (runtime dependency), before terminating a child the proxy checks that the OS process is still the same one it spawned (creation time and, when available, executable path), so an unrelated process that reused the PID is not killed. If psutil is missing, idle-kill falls back to the subprocess handle only (weaker).
64+
**psutil** is a required runtime dependency (declared in `pyproject.toml`). Before terminating a child, the proxy uses it to verify the OS process is still the same one it spawned (creation time and, when available, executable path), so an unrelated process that reused the PID is not killed. The code also has a defensive import fallback: if `psutil` cannot be imported at runtime, idle-kill falls back to the subprocess handle only (weaker).
6565

6666
Precedence: **CLI** overrides **environment** overrides **configuration file**. INFO-level logs describe when a kill is scheduled, cancelled, or executed.
6767

src/connectors/acp_core/acp_subprocess_identity.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def stale_kill_still_same_os_process(
9191
return False
9292

9393
if _psutil is None:
94-
return False
94+
return True
9595

9696
try:
9797
current = _psutil.Process(identity.pid)

src/connectors/acp_core/base_connector.py

Lines changed: 48 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -246,52 +246,67 @@ async def _schedule_stale_kill_after_turn(self, runtime: ACPProcessRuntime) -> N
246246

247247
async def _run() -> None:
248248
try:
249-
await asyncio.sleep(delay)
250-
except asyncio.CancelledError:
251-
return
252-
if not connector._stale_acp_kill_enabled():
253-
return
254-
proc = runtime_ref.process
255-
if proc is None or proc.poll() is not None:
256-
return
257-
ident = runtime_ref.acp_subprocess_identity
258-
if ident is not None and not stale_kill_still_same_os_process(proc, ident):
259-
if logger.isEnabledFor(logging.INFO):
260-
logger.info(
261-
"Stale ACP kill skipped: OS process identity mismatch "
262-
"(backend=%s pid=%s project=%s)",
249+
try:
250+
await asyncio.sleep(delay)
251+
except asyncio.CancelledError:
252+
return
253+
if not connector._stale_acp_kill_enabled():
254+
return
255+
proc = runtime_ref.process
256+
if proc is None or proc.poll() is not None:
257+
return
258+
ident = runtime_ref.acp_subprocess_identity
259+
if ident is not None and not stale_kill_still_same_os_process(
260+
proc, ident
261+
):
262+
if logger.isEnabledFor(logging.INFO):
263+
logger.info(
264+
"Stale ACP kill skipped: OS process identity mismatch "
265+
"(backend=%s pid=%s project=%s)",
266+
connector.backend_type,
267+
getattr(proc, "pid", None),
268+
runtime_ref.project_dir,
269+
)
270+
return
271+
if ident is None and logger.isEnabledFor(logging.DEBUG):
272+
logger.debug(
273+
"ACP stale kill has no subprocess identity fingerprint "
274+
"(backend=%s pid=%s); relying on subprocess handle state only",
263275
connector.backend_type,
264276
getattr(proc, "pid", None),
265-
runtime_ref.project_dir,
266277
)
267-
return
268-
if ident is None and logger.isEnabledFor(logging.DEBUG):
269-
logger.debug(
270-
"ACP stale kill has no subprocess identity fingerprint "
271-
"(backend=%s pid=%s); relying on subprocess handle state only",
272-
connector.backend_type,
273-
getattr(proc, "pid", None),
274-
)
275-
req_lock = runtime_ref.request_lock
276-
if req_lock is not None and req_lock.locked():
278+
req_lock = runtime_ref.request_lock
279+
if req_lock is not None and req_lock.locked():
280+
if logger.isEnabledFor(logging.INFO):
281+
logger.info(
282+
"Stale ACP kill skipped: request in progress "
283+
"(backend=%s pid=%s)",
284+
connector.backend_type,
285+
getattr(proc, "pid", None),
286+
)
287+
return
277288
if logger.isEnabledFor(logging.INFO):
278289
logger.info(
279-
"Stale ACP kill skipped: request in progress (backend=%s pid=%s)",
290+
"Stale ACP agent idle timeout: terminating process (backend=%s "
291+
"pid=%s project=%s model=%s client_session=%s)",
280292
connector.backend_type,
281293
getattr(proc, "pid", None),
294+
runtime_ref.project_dir,
295+
runtime_ref.model,
296+
runtime_ref.client_session_id,
282297
)
283-
return
284-
if logger.isEnabledFor(logging.INFO):
285-
logger.info(
286-
"Stale ACP agent idle timeout: terminating process (backend=%s "
287-
"pid=%s project=%s model=%s client_session=%s)",
298+
await connector._kill_runtime(runtime_ref)
299+
except asyncio.CancelledError:
300+
raise
301+
except Exception:
302+
logger.exception(
303+
"Stale ACP agent kill task failed (backend=%s project=%s model=%s "
304+
"client_session=%s)",
288305
connector.backend_type,
289-
getattr(proc, "pid", None),
290306
runtime_ref.project_dir,
291307
runtime_ref.model,
292308
runtime_ref.client_session_id,
293309
)
294-
await connector._kill_runtime(runtime_ref)
295310

296311
runtime.stale_kill_task = asyncio.create_task(_run())
297312

src/core/cli_support/applicators/server_applicator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,7 @@ def _apply_disable_stale_acp_agent_kills(
258258
if raw is not True:
259259
return
260260
overrides["disable_stale_acp_agent_kills"] = True
261+
os.environ["DISABLE_STALE_ACP_AGENT_KILLS"] = "true"
261262
resolution.record(
262263
"disable_stale_acp_agent_kills",
263264
True,

tests/unit/connectors/acp_core/test_acp_stale_kill_timer.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,35 @@ async def test_stale_kill_task_calls_kill_runtime_after_delay(
109109
mock_kill.assert_awaited_once_with(runtime)
110110

111111

112+
@pytest.mark.asyncio
113+
async def test_stale_kill_task_logs_exception_when_kill_runtime_raises(
114+
connector: DummyAcpConnector,
115+
runtime: ACPProcessRuntime,
116+
) -> None:
117+
mock_proc = MagicMock()
118+
mock_proc.poll.return_value = None
119+
mock_proc.pid = 4242
120+
runtime.process = mock_proc
121+
with (
122+
patch.object(
123+
connector,
124+
"_stale_acp_kill_delay_seconds",
125+
return_value=0.01,
126+
),
127+
patch.object(
128+
connector,
129+
"_kill_runtime",
130+
new_callable=AsyncMock,
131+
side_effect=RuntimeError("boom"),
132+
),
133+
patch("src.connectors.acp_core.base_connector.logger") as mock_logger,
134+
):
135+
await connector._schedule_stale_kill_after_turn(runtime)
136+
assert runtime.stale_kill_task is not None
137+
await asyncio.wait_for(runtime.stale_kill_task, timeout=2.0)
138+
mock_logger.exception.assert_called_once()
139+
140+
112141
@pytest.mark.asyncio
113142
async def test_stale_kill_skipped_when_identity_mismatch(
114143
connector: DummyAcpConnector,

tests/unit/connectors/acp_core/test_acp_subprocess_identity.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,11 @@ def test_stale_kill_rejects_exe_mismatch(mock_popen: MagicMock) -> None:
9292

9393
def test_stale_kill_false_when_identity_none(mock_popen: MagicMock) -> None:
9494
assert stale_kill_still_same_os_process(mock_popen, None) is False
95+
96+
97+
def test_stale_kill_true_when_psutil_unavailable_but_identity_present(
98+
mock_popen: MagicMock,
99+
) -> None:
100+
ident = AcpSubprocessIdentity(pid=9001, create_time=1.0, exe_key="a.exe")
101+
with patch.object(sid_mod, "_psutil", None):
102+
assert stale_kill_still_same_os_process(mock_popen, ident) is True

tests/unit/core/cli_support/applicators/test_server_applicator.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -249,12 +249,14 @@ def test_apply_disable_stale_acp_agent_kills(
249249
) -> None:
250250
"""Test that --disable-stale-acp-agent-kills is applied."""
251251
empty_args.disable_stale_acp_agent_kills = True
252-
applicator.apply(empty_args, overrides, resolution)
252+
with mock.patch.dict(os.environ, {}, clear=True):
253+
applicator.apply(empty_args, overrides, resolution)
253254

254-
assert overrides.get("disable_stale_acp_agent_kills") is True
255-
assert resolution.is_set("disable_stale_acp_agent_kills")
256-
cli_params = resolution.latest_by_source(ParameterSource.CLI)
257-
assert "disable_stale_acp_agent_kills" in cli_params
255+
assert overrides.get("disable_stale_acp_agent_kills") is True
256+
assert os.environ.get("DISABLE_STALE_ACP_AGENT_KILLS") == "true"
257+
assert resolution.is_set("disable_stale_acp_agent_kills")
258+
cli_params = resolution.latest_by_source(ParameterSource.CLI)
259+
assert "disable_stale_acp_agent_kills" in cli_params
258260

259261
def test_apply_stale_acp_agent_kill_idle_seconds(
260262
self,

0 commit comments

Comments
 (0)