From 64c6d1165e53456d89e9e49201b5dc1924dd00fc Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Tue, 19 May 2026 17:35:05 -0500
Subject: [PATCH 1/3] fix: generate MAD_MULTI_NODE_RUNNER for Docker local
 deployment

Docker local mode had no mechanism to set MAD_MULTI_NODE_RUNNER, unlike
SLURM (job.sh.j2) and K8s (kubernetes_launcher_mixin.py) which generate
it in their deployment layers. This caused train_7b.sh (Megatron-LM) to
fail with 'Error: MULTI_NODE_RUNNER is not defined' on local runs.

Add _generate_local_launcher_command() to ContainerRunner that generates
the appropriate single-node distributed process launcher command, and call
it in run_container() after GPU resolution. Reuses the already-resolved
launcher variable (lines 327-372) to stay consistent with existing launcher
parsing conventions. Defaults to torchrun for backward compatibility.
Supports torchrun, megatron-lm, torchtitan, deepspeed, vllm, sglang, and
primus launchers. The env var is simply unused for models that hardcode
their own launcher (e.g. HuggingFace scripts).

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
---
 src/madengine/execution/container_runner.py | 49 ++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py
index e0488411..a1fb7648 100644
--- a/src/madengine/execution/container_runner.py
+++ b/src/madengine/execution/container_runner.py
@@ -657,6 +657,31 @@ def get_cpu_arg(self) -> str:
         cpus = self.context.ctx["docker_cpus"].replace(" ", "")
         return f"--cpuset-cpus {cpus} "
 
+    def _generate_local_launcher_command(self, launcher_type: str, nproc_per_node: int) -> str:
+        """Generate distributed process launcher command for Docker local deployment.
+
+        Docker local is always single-node. This parallels
+        SlurmDeployer._generate_launcher_command() and
+        KubernetesLauncherMixin._generate_torchrun_command() for the
+        Docker local path.
+
+        Args:
+            launcher_type: Distributed launcher (torchrun, megatron, deepspeed, etc.)
+            nproc_per_node: Number of GPUs (processes) per node.
+
+        Returns:
+            Launcher command string, or empty string for launchers that
+            manage their own process spawning (vllm, sglang).
+        """
+        if launcher_type in ("torchrun", "megatron", "megatron-lm", "torchtitan"):
+            return f"torchrun --standalone --nproc_per_node={nproc_per_node}"
+        elif launcher_type == "deepspeed":
+            return f"deepspeed --num_gpus={nproc_per_node}"
+        elif launcher_type in ("vllm", "sglang", "sglang-disagg", "primus"):
+            return ""
+        else:
+            return f"torchrun --standalone --nproc_per_node={nproc_per_node}"
+
     _ENV_KEY_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
 
     def get_env_arg(self, run_env: typing.Dict) -> str:
@@ -1089,7 +1114,29 @@ def run_container(
         resolved_gpu_count = resolve_runtime_gpus(model_info, self.additional_context)
         docker_options += self.get_gpu_arg(str(resolved_gpu_count))
         docker_options += self.get_cpu_arg()
-        
+
+        # Generate MAD_MULTI_NODE_RUNNER for Docker local deployment.
+        # SLURM and K8s generate this in their deployment layers (slurm.py,
+        # kubernetes_launcher_mixin.py). Docker local has no such layer, so
+        # we generate it here after GPU resolution provides MAD_RUNTIME_NGPUS.
+        # Reuses the `launcher` variable already resolved at lines 327-372.
+        # Defaults to torchrun when launcher is a deployment-level value
+        # ("docker", "native") rather than a distributed launcher.
+        # For models that hardcode their own launcher (e.g. HuggingFace scripts
+        # calling torchrun directly), this env var is simply unused.
+        if "MAD_MULTI_NODE_RUNNER" not in self.context.ctx["docker_env_vars"]:
+            dist_launcher = launcher if launcher in (
+                "torchrun", "megatron", "megatron-lm", "torchtitan",
+                "deepspeed", "vllm", "sglang", "sglang-disagg", "primus",
+            ) else "torchrun"
+            runtime_ngpus = int(self.context.ctx["docker_env_vars"].get(
+                "MAD_RUNTIME_NGPUS", str(resolved_gpu_count)))
+            launcher_cmd = self._generate_local_launcher_command(dist_launcher, runtime_ngpus)
+            if launcher_cmd:
+                self.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] = launcher_cmd
+                print(f"ℹ️  Set MAD_MULTI_NODE_RUNNER for local deployment "
+                      f"(launcher={dist_launcher}): {launcher_cmd}")
+
         # Filter out MIOPEN_USER_DB_PATH from run_env if it exists
         # It should be passed via docker_env_vars in context instead
         if "MIOPEN_USER_DB_PATH" in run_env:

From b5834016863ecc12e67141057a3d53265bd93fca Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Fri, 22 May 2026 15:03:02 +0000
Subject: [PATCH 2/3] fix: resolve launcher in run_container scope to fix
 NameError
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MAD_MULTI_NODE_RUNNER generation block in run_container() referenced
`launcher` from create_run_details_dict(), a different method's local
scope. Resolve the launcher inline using the same priority chain
(additional_context → model_info → MAD_LAUNCHER env).

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
---
 src/madengine/execution/container_runner.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py
index a1fb7648..e49d9647 100644
--- a/src/madengine/execution/container_runner.py
+++ b/src/madengine/execution/container_runner.py
@@ -1119,12 +1119,18 @@ def run_container(
         # SLURM and K8s generate this in their deployment layers (slurm.py,
         # kubernetes_launcher_mixin.py). Docker local has no such layer, so
         # we generate it here after GPU resolution provides MAD_RUNTIME_NGPUS.
-        # Reuses the `launcher` variable already resolved at lines 327-372.
         # Defaults to torchrun when launcher is a deployment-level value
         # ("docker", "native") rather than a distributed launcher.
         # For models that hardcode their own launcher (e.g. HuggingFace scripts
         # calling torchrun directly), this env var is simply unused.
         if "MAD_MULTI_NODE_RUNNER" not in self.context.ctx["docker_env_vars"]:
+            launcher = ""
+            if self.additional_context:
+                launcher = self.additional_context.get("distributed", {}).get("launcher", "")
+            if not launcher and model_info.get("distributed"):
+                launcher = model_info["distributed"].get("launcher", "")
+            if not launcher:
+                launcher = os.environ.get("MAD_LAUNCHER", "")
             dist_launcher = launcher if launcher in (
                 "torchrun", "megatron", "megatron-lm", "torchtitan",
                 "deepspeed", "vllm", "sglang", "sglang-disagg", "primus",

From 2cd4f071fb411870fb45772bcc2e988b20bc33ea Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Tue, 26 May 2026 20:13:10 -0400
Subject: [PATCH 3/3] Updated CHANGELOG

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e1e6bdeb..70c52e38 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to madengine will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [2.0.3] - 2026-05-19
+## [2.0.3] - 2026-05-26
 
 ### Added
 
@@ -41,6 +41,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - **Deployment monitor infinite loop on cancelled jobs**: `BaseDeployment._monitor_job` treated only `COMPLETED`/`FAILED` as terminal, so a `CANCELLED` job (manual `scancel`, K8s job deletion, etc.) would loop forever waiting for a state that never arrived. `CANCELLED` is now in the terminal-state set in `deployment/base.py`.
 
+- **Docker local: missing `MAD_MULTI_NODE_RUNNER`**: SLURM (`job.sh.j2`) and Kubernetes (`kubernetes_launcher_mixin.py`) already export `MAD_MULTI_NODE_RUNNER` with the appropriate distributed launcher command, but local Docker runs had no equivalent. Models that delegate process spawning to `$MAD_MULTI_NODE_RUNNER` (e.g. Megatron-LM `train_7b.sh`) failed on `madengine run` with `MULTI_NODE_RUNNER is not defined`. `ContainerRunner` now resolves the launcher from `--additional-context` → model `distributed.launcher` → `MAD_LAUNCHER` (same priority chain as elsewhere), treats deployment-level values (`docker`, `native`) as `torchrun`, and sets `MAD_MULTI_NODE_RUNNER` via `_generate_local_launcher_command()` after GPU resolution (`MAD_RUNTIME_NGPUS`). Supports torchrun, megatron-lm, torchtitan, deepspeed, vllm, sglang, and primus; models that hardcode their own launcher (e.g. HuggingFace scripts) simply ignore the variable. Skipped when `MAD_MULTI_NODE_RUNNER` is already set in `docker_env_vars`.
+
 ### Security
 
 - **Shell injection hardening (extended)**: `shlex.quote()` is now applied to every shell interpolation of a user-controlled value across `core/docker.py`, `execution/container_runner.py`, `execution/docker_builder.py`, and `orchestration/run_orchestrator.py` (image names, paths, container names, build-args). A follow-up pass closed the last remaining sites in `docker_builder.py` (`grep`, `docker manifest inspect`, `docker tag`, `docker push`, `head`). This is a defence-in-depth extension of the v2.0.2 build-arg quoting work — values that flow through `--additional-context`, model configs, or registry credentials can no longer break out of the shell command they are embedded in.