From 7ebd2ef75a061174c096f52d857cc7a9b4a73a12 Mon Sep 17 00:00:00 2001
From: Joaquin Hui Gomez
Date: Tue, 28 Apr 2026 19:47:05 +0100
Subject: [PATCH] transport: reap MCP grandchildren on close via process group
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Query.close() only waited on and signalled the CLI subprocess itself,
so MCP servers spawned by the CLI reparented to PID 1 after close and
persisted for the lifetime of the parent Python process. Long-running
daemons saw one MCP server leak per configured server per query(),
eventually exhausting file descriptors / RAM.

Spawn the CLI in a new POSIX session (start_new_session=True) so its
descendants inherit the same process group, and on close signal the
whole group via os.killpg — once after graceful CLI exit, and during
the SIGTERM/SIGKILL escalation paths.

The group is signalled by the CLI's pid directly (a session leader's
pgid equals its pid) rather than via os.getpgid(), which raises
ProcessLookupError once the CLI has been reaped and would skip the
sweep after a graceful exit.

Windows behavior is unchanged: the non-POSIX path still calls
Process.terminate() / Process.kill() and never references
signal.SIGKILL, which is not defined there.

Fixes #889
---
 .../_internal/transport/subprocess_cli.py     | 66 +++++++++++++++++--
 1 file changed, 60 insertions(+), 6 deletions(-)

diff --git a/src/claude_agent_sdk/_internal/transport/subprocess_cli.py b/src/claude_agent_sdk/_internal/transport/subprocess_cli.py
index 9a1d7458..85657711 100644
--- a/src/claude_agent_sdk/_internal/transport/subprocess_cli.py
+++ b/src/claude_agent_sdk/_internal/transport/subprocess_cli.py
@@ -6,6 +6,7 @@
 import platform
 import re
 import shutil
+import signal
 from collections.abc import AsyncIterable, AsyncIterator
 from contextlib import suppress
 from pathlib import Path
@@ -26,6 +27,33 @@
 logger = logging.getLogger(__name__)
 
 _DEFAULT_MAX_BUFFER_SIZE = 1024 * 1024  # 1MB buffer limit
+
+
+def _terminate_group(pid: int, sig: signal.Signals) -> None:
+    """Send ``sig`` to the process group whose id is ``pid``.
+
+    The CLI is spawned with ``start_new_session=True`` on POSIX, which
+    makes it a process-group leader, so its descendants (e.g. MCP
+    servers) share a group id equal to the CLI's pid. Signalling that
+    group reaches the whole tree in one syscall; without it,
+    descendants reparent to PID 1 on close and leak across repeated
+    query() / ClaudeSDKClient usage.
+
+    The group is addressed by ``pid`` directly instead of through
+    ``os.getpgid(pid)``: once the CLI has exited and been reaped,
+    ``getpgid`` raises ``ProcessLookupError`` even while descendants
+    are still members of the group, which would make the sweep that
+    ``close()`` runs after CLI exit a no-op.
+
+    Does nothing on non-POSIX platforms.
+    """
+    if os.name != "posix":
+        return
+    with suppress(ProcessLookupError, PermissionError):
+        # Group already gone or out of our control; nothing to clean up.
+        os.killpg(pid, sig)
+
+
 MINIMUM_CLAUDE_CODE_VERSION = "2.0.0"
 
 
@@ -447,6 +475,14 @@ async def connect(self) -> None:
             # Pipe stderr only when the caller registered a callback.
             stderr_dest = PIPE if self._options.stderr is not None else None
 
+            # Spawn the CLI in a new POSIX session so its descendants
+            # (e.g. MCP servers it starts) inherit the same process group
+            # and can be reaped together via os.killpg in close(). Without
+            # this, MCP grandchildren reparent to PID 1 on close and leak
+            # across repeated query() / ClaudeSDKClient usage.
+            spawn_kwargs: dict[str, Any] = {}
+            if os.name == "posix":
+                spawn_kwargs["start_new_session"] = True
             self._process = await anyio.open_process(
                 cmd,
                 stdin=PIPE,
@@ -455,6 +491,7 @@ async def connect(self) -> None:
                 cwd=self._cwd,
                 env=process_env,
                 user=self._options.user,
+                **spawn_kwargs,
             )
 
             if self._process.stdout:
@@ -539,24 +576,41 @@ async def close(self) -> None:
         # The subprocess needs time to flush its session file after receiving
         # EOF on stdin. Without this grace period, SIGTERM can interrupt the
         # write and cause the last assistant message to be lost (see #625).
+        cli_pid = self._process.pid
         if self._process.returncode is None:
             try:
                 with anyio.fail_after(5):
                     await self._process.wait()
             except TimeoutError:
-                # Graceful shutdown timed out — force terminate
-                with suppress(ProcessLookupError):
-                    self._process.terminate()
+                # Graceful shutdown timed out — force terminate. On POSIX
+                # signal the entire process group (CLI + MCP servers +
+                # other grandchildren) so descendants don't reparent to
+                # PID 1; elsewhere fall back to terminating the CLI only.
+                if os.name == "posix":
+                    _terminate_group(cli_pid, signal.SIGTERM)
+                else:
+                    with suppress(ProcessLookupError):
+                        self._process.terminate()
                 try:
                     with anyio.fail_after(5):
                         await self._process.wait()
                 except TimeoutError:
-                    # SIGTERM handler blocked — force kill (SIGKILL)
-                    with suppress(ProcessLookupError):
-                        self._process.kill()
+                    # SIGTERM handler blocked — force kill. signal.SIGKILL
+                    # is not defined on Windows, so it is only referenced
+                    # on the POSIX branch.
+                    if os.name == "posix":
+                        _terminate_group(cli_pid, signal.SIGKILL)
+                    else:
+                        with suppress(ProcessLookupError):
+                            self._process.kill()
                     with suppress(Exception):
                         await self._process.wait()
 
+        # Belt-and-suspenders: even on graceful CLI exit, signal the
+        # process group so descendants still in it (e.g. MCP servers)
+        # are told to exit instead of outliving the close call.
+        _terminate_group(cli_pid, signal.SIGTERM)
+
         self._process = None
         self._stdout_stream = None
         self._stdin_stream = None