From 64c6d1165e53456d89e9e49201b5dc1924dd00fc Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Tue, 19 May 2026 17:35:05 -0500 Subject: [PATCH 1/3] fix: generate MAD_MULTI_NODE_RUNNER for Docker local deployment Docker local mode had no mechanism to set MAD_MULTI_NODE_RUNNER, unlike SLURM (job.sh.j2) and K8s (kubernetes_launcher_mixin.py) which generate it in their deployment layers. This caused train_7b.sh (Megatron-LM) to fail with 'Error: MULTI_NODE_RUNNER is not defined' on local runs. Add _generate_local_launcher_command() to ContainerRunner that generates the appropriate single-node distributed process launcher command, and call it in run_container() after GPU resolution. Reuses the already-resolved launcher variable (lines 327-372) to stay consistent with existing launcher parsing conventions. Defaults to torchrun for backward compatibility. Supports torchrun, megatron-lm, torchtitan, deepspeed, vllm, sglang, and primus launchers. The env var is simply unused for models that hardcode their own launcher (e.g. HuggingFace scripts). Co-Authored-By: Claude Sonnet 4 --- src/madengine/execution/container_runner.py | 49 ++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index e0488411..a1fb7648 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -657,6 +657,31 @@ def get_cpu_arg(self) -> str: cpus = self.context.ctx["docker_cpus"].replace(" ", "") return f"--cpuset-cpus {cpus} " + def _generate_local_launcher_command(self, launcher_type: str, nproc_per_node: int) -> str: + """Generate distributed process launcher command for Docker local deployment. + + Docker local is always single-node. This parallels + SlurmDeployer._generate_launcher_command() and + KubernetesLauncherMixin._generate_torchrun_command() for the + Docker local path. + + Args: + launcher_type: Distributed launcher (torchrun, megatron, deepspeed, etc.) + nproc_per_node: Number of GPUs (processes) per node. + + Returns: + Launcher command string, or empty string for launchers that + manage their own process spawning (vllm, sglang). + """ + if launcher_type in ("torchrun", "megatron", "megatron-lm", "torchtitan"): + return f"torchrun --standalone --nproc_per_node={nproc_per_node}" + elif launcher_type == "deepspeed": + return f"deepspeed --num_gpus={nproc_per_node}" + elif launcher_type in ("vllm", "sglang", "sglang-disagg", "primus"): + return "" + else: + return f"torchrun --standalone --nproc_per_node={nproc_per_node}" + _ENV_KEY_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") def get_env_arg(self, run_env: typing.Dict) -> str: @@ -1089,7 +1114,29 @@ def run_container( resolved_gpu_count = resolve_runtime_gpus(model_info, self.additional_context) docker_options += self.get_gpu_arg(str(resolved_gpu_count)) docker_options += self.get_cpu_arg() - + + # Generate MAD_MULTI_NODE_RUNNER for Docker local deployment. + # SLURM and K8s generate this in their deployment layers (slurm.py, + # kubernetes_launcher_mixin.py). Docker local has no such layer, so + # we generate it here after GPU resolution provides MAD_RUNTIME_NGPUS. + # Reuses the `launcher` variable already resolved at lines 327-372. + # Defaults to torchrun when launcher is a deployment-level value + # ("docker", "native") rather than a distributed launcher. + # For models that hardcode their own launcher (e.g. HuggingFace scripts + # calling torchrun directly), this env var is simply unused. + if "MAD_MULTI_NODE_RUNNER" not in self.context.ctx["docker_env_vars"]: + dist_launcher = launcher if launcher in ( + "torchrun", "megatron", "megatron-lm", "torchtitan", + "deepspeed", "vllm", "sglang", "sglang-disagg", "primus", + ) else "torchrun" + runtime_ngpus = int(self.context.ctx["docker_env_vars"].get( + "MAD_RUNTIME_NGPUS", str(resolved_gpu_count))) + launcher_cmd = self._generate_local_launcher_command(dist_launcher, runtime_ngpus) + if launcher_cmd: + self.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] = launcher_cmd + print(f"ℹ️ Set MAD_MULTI_NODE_RUNNER for local deployment " + f"(launcher={dist_launcher}): {launcher_cmd}") + # Filter out MIOPEN_USER_DB_PATH from run_env if it exists # It should be passed via docker_env_vars in context instead if "MIOPEN_USER_DB_PATH" in run_env: From b5834016863ecc12e67141057a3d53265bd93fca Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 22 May 2026 15:03:02 +0000 Subject: [PATCH 2/3] fix: resolve launcher in run_container scope to fix NameError MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MAD_MULTI_NODE_RUNNER generation block in run_container() referenced `launcher` from create_run_details_dict(), a different method's local scope. Resolve the launcher inline using the same priority chain (additional_context → model_info → MAD_LAUNCHER env). Co-Authored-By: Claude Opus 4 --- src/madengine/execution/container_runner.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index a1fb7648..e49d9647 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -1119,12 +1119,18 @@ def run_container( # SLURM and K8s generate this in their deployment layers (slurm.py, # kubernetes_launcher_mixin.py). Docker local has no such layer, so # we generate it here after GPU resolution provides MAD_RUNTIME_NGPUS. - # Reuses the `launcher` variable already resolved at lines 327-372. # Defaults to torchrun when launcher is a deployment-level value # ("docker", "native") rather than a distributed launcher. # For models that hardcode their own launcher (e.g. HuggingFace scripts # calling torchrun directly), this env var is simply unused. if "MAD_MULTI_NODE_RUNNER" not in self.context.ctx["docker_env_vars"]: + launcher = "" + if self.additional_context: + launcher = self.additional_context.get("distributed", {}).get("launcher", "") + if not launcher and model_info.get("distributed"): + launcher = model_info["distributed"].get("launcher", "") + if not launcher: + launcher = os.environ.get("MAD_LAUNCHER", "") dist_launcher = launcher if launcher in ( "torchrun", "megatron", "megatron-lm", "torchtitan", "deepspeed", "vllm", "sglang", "sglang-disagg", "primus", From 2cd4f071fb411870fb45772bcc2e988b20bc33ea Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Tue, 26 May 2026 20:13:10 -0400 Subject: [PATCH 3/3] Updated CHANGELOG --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e1e6bdeb..70c52e38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to madengine will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [2.0.3] - 2026-05-19 +## [2.0.3] - 2026-05-26 ### Added @@ -41,6 +41,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Deployment monitor infinite loop on cancelled jobs**: `BaseDeployment._monitor_job` treated only `COMPLETED`/`FAILED` as terminal, so a `CANCELLED` job (manual `scancel`, K8s job deletion, etc.) would loop forever waiting for a state that never arrived. `CANCELLED` is now in the terminal-state set in `deployment/base.py`. +- **Docker local: missing `MAD_MULTI_NODE_RUNNER`**: SLURM (`job.sh.j2`) and Kubernetes (`kubernetes_launcher_mixin.py`) already export `MAD_MULTI_NODE_RUNNER` with the appropriate distributed launcher command, but local Docker runs had no equivalent. Models that delegate process spawning to `$MAD_MULTI_NODE_RUNNER` (e.g. Megatron-LM `train_7b.sh`) failed on `madengine run` with `MULTI_NODE_RUNNER is not defined`. `ContainerRunner` now resolves the launcher from `--additional-context` → model `distributed.launcher` → `MAD_LAUNCHER` (same priority chain as elsewhere), treats deployment-level values (`docker`, `native`) as `torchrun`, and sets `MAD_MULTI_NODE_RUNNER` via `_generate_local_launcher_command()` after GPU resolution (`MAD_RUNTIME_NGPUS`). Supports torchrun, megatron-lm, torchtitan, deepspeed, vllm, sglang, and primus; models that hardcode their own launcher (e.g. HuggingFace scripts) simply ignore the variable. Skipped when `MAD_MULTI_NODE_RUNNER` is already set in `docker_env_vars`. + ### Security - **Shell injection hardening (extended)**: `shlex.quote()` is now applied to every shell interpolation of a user-controlled value across `core/docker.py`, `execution/container_runner.py`, `execution/docker_builder.py`, and `orchestration/run_orchestrator.py` (image names, paths, container names, build-args). A follow-up pass closed the last remaining sites in `docker_builder.py` (`grep`, `docker manifest inspect`, `docker tag`, `docker push`, `head`). This is a defence-in-depth extension of the v2.0.2 build-arg quoting work — values that flow through `--additional-context`, model configs, or registry credentials can no longer break out of the shell command they are embedded in.