diff --git a/CLAUDE.md b/CLAUDE.md index 085b8997..298d0ba5 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,6 +2,72 @@ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. +## Behavioral Guidelines + +Behavioral guidelines to reduce common LLM coding mistakes. Merge with project-specific instructions as needed. + +**Tradeoff:** These guidelines bias toward caution over speed. For trivial tasks, use judgment. + +## 1. Think Before Coding + +**Don't assume. Don't hide confusion. Surface tradeoffs.** + +Before implementing: +- State your assumptions explicitly. If uncertain, ask. +- If multiple interpretations exist, present them - don't pick silently. +- If a simpler approach exists, say so. Push back when warranted. +- If something is unclear, stop. Name what's confusing. Ask. + +## 2. Simplicity First + +**Minimum code that solves the problem. Nothing speculative.** + +- No features beyond what was asked. +- No abstractions for single-use code. +- No "flexibility" or "configurability" that wasn't requested. +- No error handling for impossible scenarios. +- If you write 200 lines and it could be 50, rewrite it. + +Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify. + +## 3. Surgical Changes + +**Touch only what you must. Clean up only your own mess.** + +When editing existing code: +- Don't "improve" adjacent code, comments, or formatting. +- Don't refactor things that aren't broken. +- Match existing style, even if you'd do it differently. +- If you notice unrelated dead code, mention it - don't delete it. + +When your changes create orphans: +- Remove imports/variables/functions that YOUR changes made unused. +- Don't remove pre-existing dead code unless asked. + +The test: Every changed line should trace directly to the user's request. + +## 4. Goal-Driven Execution + +**Define success criteria. Loop until verified.** + +Transform tasks into verifiable goals: +- "Add validation" → "Write tests for invalid inputs, then make them pass" +- "Fix the bug" → "Write a test that reproduces it, then make it pass" +- "Refactor X" → "Ensure tests pass before and after" + +For multi-step tasks, state a brief plan: +``` +1. [Step] → verify: [check] +2. [Step] → verify: [check] +3. [Step] → verify: [check] +``` + +Strong success criteria let you loop independently. Weak criteria ("make it work") require constant clarification. + +--- + +**These guidelines are working if:** fewer unnecessary changes in diffs, fewer rewrites due to overcomplication, and clarifying questions come before implementation rather than after mistakes. + ## Development Setup ```bash diff --git a/src/madengine/deployment/common.py b/src/madengine/deployment/common.py index 4231854f..13657246 100644 --- a/src/madengine/deployment/common.py +++ b/src/madengine/deployment/common.py @@ -25,6 +25,24 @@ "slurm_multi", ] +# Alternate spellings for distributed launcher values → canonical form. +# Add new aliases here only; do not branch on alternate spellings at dispatch sites. +_LAUNCHER_ALIASES: Dict[str, str] = { + "sglang_disagg": "sglang-disagg", +} + + +def canonicalize_distributed_launcher(launcher: Optional[str]) -> Optional[str]: + """Normalize alternate launcher spellings to their canonical form. + + Resolves aliases (e.g. ``sglang_disagg`` → ``sglang-disagg``). Unknown or + empty values are returned unchanged; callers do their own validation. + """ + if not launcher: + return launcher + return _LAUNCHER_ALIASES.get(launcher, launcher) + + # Tool names that use rocprof / rocprofv3 wrapping and need MPI-aware rocprofv3 on multi-node. _ROCPROF_FAMILY_TOOL_NAMES = frozenset( { diff --git a/src/madengine/deployment/k8s_template_context.py b/src/madengine/deployment/k8s_template_context.py index bdb43d0e..e38b251a 100644 --- a/src/madengine/deployment/k8s_template_context.py +++ b/src/madengine/deployment/k8s_template_context.py @@ -13,7 +13,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional -from .common import configure_multi_node_profiling +from .common import canonicalize_distributed_launcher, configure_multi_node_profiling from .k8s_names import sanitize_k8s_container_name, sanitize_k8s_label_value from .k8s_secrets import ( CONFIGMAP_MAX_BYTES, @@ -339,7 +339,7 @@ def _prepare_template_context( model_args=model_info.get("args", ""), ) - elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg": + elif canonicalize_distributed_launcher(launcher_type) == "sglang-disagg": if nnodes < 3: raise ValueError( f"SGLang Disaggregated requires minimum 3 nodes " diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 4efc5855..fdbabda0 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -20,7 +20,12 @@ from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus, create_jinja_env from .primus_backend import infer_primus_backend_from_model_name, merged_primus_config -from .common import configure_multi_node_profiling, is_self_managed_launcher, normalize_launcher +from .common import ( + canonicalize_distributed_launcher, + configure_multi_node_profiling, + is_self_managed_launcher, + normalize_launcher, +) from .config_loader import ConfigLoader, apply_deployment_config from .slurm_node_selector import SlurmNodeSelector from madengine.utils.gpu_config import resolve_runtime_gpus @@ -606,6 +611,9 @@ def _prepare_template_context(self, model_info: Dict) -> Dict[str, Any]: # Extract launcher configuration launcher_type = self.distributed_config.get("launcher", "torchrun") # Default to torchrun + # Canonicalize aliases before validity check so e.g. sglang_disagg → sglang-disagg + # passes through normalize_launcher instead of being mapped to "docker". + launcher_type = canonicalize_distributed_launcher(launcher_type) or launcher_type # Normalize launcher based on deployment type and validity launcher_type = normalize_launcher(launcher_type, "slurm") @@ -719,7 +727,7 @@ def _generate_launcher_command( return self._generate_vllm_command(nnodes, nproc_per_node, master_port) elif launcher_type == "sglang": return self._generate_sglang_command(nnodes, nproc_per_node, master_port) - elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg": + elif canonicalize_distributed_launcher(launcher_type) == "sglang-disagg": return self._generate_sglang_disagg_command(nnodes, nproc_per_node, master_port) elif launcher_type == "deepspeed": return self._generate_deepspeed_command(nnodes, nproc_per_node, master_port) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 26110c41..dbb57ad2 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -31,6 +31,7 @@ ) from madengine.reporting.update_perf_super import update_perf_super_json, update_perf_super_csv from madengine.utils.gpu_config import resolve_runtime_gpus +from madengine.deployment.common import canonicalize_distributed_launcher from madengine.utils.config_parser import ConfigParser from madengine.utils.path_utils import scripts_base_dir_from from madengine.utils.run_details import get_build_number, get_pipeline @@ -662,7 +663,7 @@ def _generate_local_launcher_command(self, launcher_type: str, nproc_per_node: i """Generate distributed process launcher command for Docker local deployment. Docker local is always single-node. This parallels - SlurmDeployer._generate_launcher_command() and + SlurmDeployment._generate_launcher_command() and KubernetesLauncherMixin._generate_torchrun_command() for the Docker local path. @@ -683,6 +684,46 @@ def _generate_local_launcher_command(self, launcher_type: str, nproc_per_node: i else: return f"torchrun --standalone --nproc_per_node={nproc_per_node}" + def _resolve_local_multi_node_runner_env( + self, model_info: typing.Dict, resolved_gpu_count: int + ) -> None: + """Set ``docker_env_vars["MAD_MULTI_NODE_RUNNER"]`` for local Docker runs. + + No-op if the env var is already set. Resolves launcher from + ``additional_context.distributed.launcher``, then ``model_info.distributed.launcher``, + then ``MAD_LAUNCHER``; falls back to ``torchrun`` for unknown values. + Self-managing launchers (vllm/sglang/sglang-disagg/primus) set the var + to an empty string so downstream scripts under ``set -u`` don't fail. + """ + if "MAD_MULTI_NODE_RUNNER" in self.context.ctx["docker_env_vars"]: + return + launcher = "" + if self.additional_context: + launcher = self.additional_context.get("distributed", {}).get("launcher", "") + if not launcher and model_info.get("distributed"): + launcher = model_info["distributed"].get("launcher", "") + if not launcher: + launcher = os.environ.get("MAD_LAUNCHER", "") + canonical_launcher = canonicalize_distributed_launcher(launcher) + valid_local_launchers = ( + "torchrun", "megatron", "megatron-lm", "torchtitan", + "deepspeed", "vllm", "sglang", "sglang-disagg", "primus", + ) + if canonical_launcher in valid_local_launchers: + dist_launcher = canonical_launcher + else: + if launcher: + print(f"⚠️ Unrecognized launcher '{launcher}'; " + f"defaulting to torchrun for local deployment") + dist_launcher = "torchrun" + runtime_ngpus = int(self.context.ctx["docker_env_vars"].get( + "MAD_RUNTIME_NGPUS", str(resolved_gpu_count))) + launcher_cmd = self._generate_local_launcher_command(dist_launcher, runtime_ngpus) + self.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] = launcher_cmd + if launcher_cmd: + print(f"ℹ️ Set MAD_MULTI_NODE_RUNNER for local deployment " + f"(launcher={dist_launcher}): {launcher_cmd}") + _ENV_KEY_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") def get_env_arg(self, run_env: typing.Dict) -> str: @@ -1279,29 +1320,7 @@ def run_container( # SLURM and K8s generate this in their deployment layers (slurm.py, # kubernetes_launcher_mixin.py). Docker local has no such layer, so # we generate it here after GPU resolution provides MAD_RUNTIME_NGPUS. - # Defaults to torchrun when launcher is a deployment-level value - # ("docker", "native") rather than a distributed launcher. - # For models that hardcode their own launcher (e.g. HuggingFace scripts - # calling torchrun directly), this env var is simply unused. - if "MAD_MULTI_NODE_RUNNER" not in self.context.ctx["docker_env_vars"]: - launcher = "" - if self.additional_context: - launcher = self.additional_context.get("distributed", {}).get("launcher", "") - if not launcher and model_info.get("distributed"): - launcher = model_info["distributed"].get("launcher", "") - if not launcher: - launcher = os.environ.get("MAD_LAUNCHER", "") - dist_launcher = launcher if launcher in ( - "torchrun", "megatron", "megatron-lm", "torchtitan", - "deepspeed", "vllm", "sglang", "sglang-disagg", "primus", - ) else "torchrun" - runtime_ngpus = int(self.context.ctx["docker_env_vars"].get( - "MAD_RUNTIME_NGPUS", str(resolved_gpu_count))) - launcher_cmd = self._generate_local_launcher_command(dist_launcher, runtime_ngpus) - if launcher_cmd: - self.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] = launcher_cmd - print(f"ℹ️ Set MAD_MULTI_NODE_RUNNER for local deployment " - f"(launcher={dist_launcher}): {launcher_cmd}") + self._resolve_local_multi_node_runner_env(model_info, resolved_gpu_count) # Filter out MIOPEN_USER_DB_PATH from run_env if it exists # It should be passed via docker_env_vars in context instead diff --git a/tests/unit/test_container_runner.py b/tests/unit/test_container_runner.py index 7eb7fe88..1c8f1d70 100644 --- a/tests/unit/test_container_runner.py +++ b/tests/unit/test_container_runner.py @@ -291,3 +291,113 @@ def test_invalid_mode_falls_back_to_lite(self): runner.gather_system_env_details(pep, "my_model") args = pep["pre_scripts"][0]["args"] assert args == "my_model_env lite UBUNTU" + + +class TestGenerateLocalLauncherCommand: + """_generate_local_launcher_command: launcher_type → MAD_MULTI_NODE_RUNNER command.""" + + def _runner(self): + return ContainerRunner.__new__(ContainerRunner) + + @pytest.mark.parametrize("launcher", ["torchrun", "megatron", "megatron-lm", "torchtitan"]) + def test_torchrun_family_emits_torchrun_standalone(self, launcher): + cmd = self._runner()._generate_local_launcher_command(launcher, nproc_per_node=8) + assert cmd == "torchrun --standalone --nproc_per_node=8" + + def test_deepspeed_emits_deepspeed_command(self): + cmd = self._runner()._generate_local_launcher_command("deepspeed", nproc_per_node=4) + assert cmd == "deepspeed --num_gpus=4" + + @pytest.mark.parametrize("launcher", ["vllm", "sglang", "sglang-disagg", "primus"]) + def test_self_managed_launchers_emit_empty(self, launcher): + assert self._runner()._generate_local_launcher_command(launcher, nproc_per_node=8) == "" + + def test_unknown_launcher_falls_back_to_torchrun(self): + cmd = self._runner()._generate_local_launcher_command("bogus", nproc_per_node=2) + assert cmd == "torchrun --standalone --nproc_per_node=2" + + +class TestResolveLocalMultiNodeRunnerEnv: + """_resolve_local_multi_node_runner_env: wires launcher → docker_env_vars.""" + + def _runner(self, docker_env_vars=None, additional_context=None): + runner = ContainerRunner.__new__(ContainerRunner) + runner.context = MagicMock() + runner.context.ctx = {"docker_env_vars": dict(docker_env_vars or {})} + runner.additional_context = additional_context + return runner + + @pytest.mark.parametrize( + "launcher,expected", + [ + ("torchrun", "torchrun --standalone --nproc_per_node=8"), + ("megatron", "torchrun --standalone --nproc_per_node=8"), + ("megatron-lm", "torchrun --standalone --nproc_per_node=8"), + ("torchtitan", "torchrun --standalone --nproc_per_node=8"), + ("deepspeed", "deepspeed --num_gpus=8"), + ], + ) + def test_sets_expected_command_from_additional_context(self, launcher, expected): + runner = self._runner( + additional_context={"distributed": {"launcher": launcher}}, + ) + runner._resolve_local_multi_node_runner_env({}, resolved_gpu_count=8) + assert runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] == expected + + def test_does_not_override_user_provided_value(self): + runner = self._runner( + docker_env_vars={"MAD_MULTI_NODE_RUNNER": "custom-runner --foo"}, + additional_context={"distributed": {"launcher": "torchrun"}}, + ) + runner._resolve_local_multi_node_runner_env({}, resolved_gpu_count=8) + assert ( + runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] + == "custom-runner --foo" + ) + + @pytest.mark.parametrize( + "launcher", + ["vllm", "sglang", "sglang-disagg", "sglang_disagg", "primus"], + ) + def test_self_managed_launchers_set_empty_string(self, launcher): + """Self-managing launchers set the var to "" (defined but empty), + so downstream scripts under set -u don't fail referencing it. + Covers the ``sglang_disagg`` underscore alias to lock in the + canonicalize_distributed_launcher() routing.""" + runner = self._runner( + additional_context={"distributed": {"launcher": launcher}}, + ) + runner._resolve_local_multi_node_runner_env({}, resolved_gpu_count=4) + assert "MAD_MULTI_NODE_RUNNER" in runner.context.ctx["docker_env_vars"] + assert runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] == "" + + def test_falls_back_to_model_info_launcher(self): + runner = self._runner(additional_context=None) + runner._resolve_local_multi_node_runner_env( + {"distributed": {"launcher": "deepspeed"}}, resolved_gpu_count=2 + ) + assert ( + runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] + == "deepspeed --num_gpus=2" + ) + + def test_unknown_launcher_defaults_to_torchrun(self): + runner = self._runner( + additional_context={"distributed": {"launcher": "docker"}}, + ) + runner._resolve_local_multi_node_runner_env({}, resolved_gpu_count=1) + assert ( + runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] + == "torchrun --standalone --nproc_per_node=1" + ) + + def test_uses_mad_runtime_ngpus_when_set(self): + runner = self._runner( + docker_env_vars={"MAD_RUNTIME_NGPUS": "4"}, + additional_context={"distributed": {"launcher": "torchrun"}}, + ) + runner._resolve_local_multi_node_runner_env({}, resolved_gpu_count=8) + assert ( + runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] + == "torchrun --standalone --nproc_per_node=4" + ) diff --git a/tests/unit/test_deployment.py b/tests/unit/test_deployment.py index 4d5ad4dd..e054554e 100644 --- a/tests/unit/test_deployment.py +++ b/tests/unit/test_deployment.py @@ -9,6 +9,7 @@ from madengine.deployment.base import BaseDeployment, DeploymentConfig, create_jinja_env from madengine.deployment.common import ( VALID_LAUNCHERS, + canonicalize_distributed_launcher, configure_multi_node_profiling, is_rocprofv3_available, normalize_launcher, @@ -70,6 +71,24 @@ def test_invalid_or_missing_launcher_non_k8s_returns_docker(self, deployment): assert normalize_launcher(None, deployment) == "docker" +class TestCanonicalizeDistributedLauncher: + """canonicalize_distributed_launcher resolves alternate spellings.""" + + def test_underscore_form_maps_to_canonical_hyphen_form(self): + assert canonicalize_distributed_launcher("sglang_disagg") == "sglang-disagg" + + def test_canonical_form_passthrough(self): + assert canonicalize_distributed_launcher("sglang-disagg") == "sglang-disagg" + assert canonicalize_distributed_launcher("torchrun") == "torchrun" + + @pytest.mark.parametrize("value", [None, ""]) + def test_empty_returned_unchanged(self, value): + assert canonicalize_distributed_launcher(value) == value + + def test_unknown_value_returned_unchanged(self): + assert canonicalize_distributed_launcher("bogus_launcher") == "bogus_launcher" + + class TestToolsIncludeRocprofFamily: """tools_include_rocprof_family.""" diff --git a/tests/unit/test_slurm_multi.py b/tests/unit/test_slurm_multi.py index 872450c3..b57d602d 100644 --- a/tests/unit/test_slurm_multi.py +++ b/tests/unit/test_slurm_multi.py @@ -19,6 +19,7 @@ """ import json +import shlex from pathlib import Path from unittest.mock import MagicMock, patch @@ -199,7 +200,7 @@ def test_wrapper_exports_all_model_env_vars(self, slurm_deployment): # xP/yD are overridden by distributed.sglang_disagg.{prefill,decode}_nodes # in _prepare_slurm_multi_script; values match in this fixture so the # assertion still holds, but the source of truth is the model card. - expected = f'export {key}="{value}"' + expected = f"export {key}={shlex.quote(str(value))}" assert expected in script_text, f"missing export for {key!r}: expected {expected!r}" def test_wrapper_is_slurm_multi_flag_set(self, slurm_deployment): @@ -435,10 +436,10 @@ def test_model_card_xp_yd_wins(self, slurm_deployment): slurm_deployment.prepare() script_text = Path(slurm_deployment.script_path).read_text() - assert 'export xP="2"' in script_text, "model card xP=2 must not be overwritten" - assert 'export yD="3"' in script_text, "model card yD=3 must not be overwritten" - assert 'export xP="1"' not in script_text, "distributed.sglang_disagg xP=1 must not win" - assert 'export yD="1"' not in script_text, "distributed.sglang_disagg yD=1 must not win" + assert f"export xP={shlex.quote('2')}" in script_text, "model card xP=2 must not be overwritten" + assert f"export yD={shlex.quote('3')}" in script_text, "model card yD=3 must not be overwritten" + assert f"export xP={shlex.quote('1')}" not in script_text, "distributed.sglang_disagg xP=1 must not win" + assert f"export yD={shlex.quote('1')}" not in script_text, "distributed.sglang_disagg yD=1 must not win" # ---------------------------------------------------------------------------