Fix wandb run reuse across models

Kovbo · Kovbo · commit 0db8ff8411ce · 2026-03-16T16:30:09.000-07:00
diff --git a/src/art/model.py b/src/art/model.py
@@ -470,6 +470,7 @@ def _get_wandb_run(self) -> Optional["Run"]:
                 id=self.name,
                 config=self._wandb_config or None,
                 resume="allow",
+                reinit="create_new",
                 settings=wandb.Settings(
                     x_stats_open_metrics_endpoints={
                         "vllm": "http://localhost:8000/metrics",
@@ -492,18 +493,18 @@ def _get_wandb_run(self) -> Optional["Run"]:
 
             # Define training_step as the x-axis for all metrics.
             # This allows out-of-order logging (e.g., async validation for previous steps).
-            wandb.define_metric("training_step")
-            wandb.define_metric("time/wall_clock_sec")
-            wandb.define_metric("reward/*", step_metric="training_step")
-            wandb.define_metric("loss/*", step_metric="training_step")
-            wandb.define_metric("throughput/*", step_metric="training_step")
-            wandb.define_metric("costs/*", step_metric="training_step")
-            wandb.define_metric("time/*", step_metric="training_step")
-            wandb.define_metric("data/*", step_metric="training_step")
-            wandb.define_metric("train/*", step_metric="training_step")
-            wandb.define_metric("val/*", step_metric="training_step")
-            wandb.define_metric("test/*", step_metric="training_step")
-            wandb.define_metric("discarded/*", step_metric="training_step")
+            run.define_metric("training_step")
+            run.define_metric("time/wall_clock_sec")
+            run.define_metric("reward/*", step_metric="training_step")
+            run.define_metric("loss/*", step_metric="training_step")
+            run.define_metric("throughput/*", step_metric="training_step")
+            run.define_metric("costs/*", step_metric="training_step")
+            run.define_metric("time/*", step_metric="training_step")
+            run.define_metric("data/*", step_metric="training_step")
+            run.define_metric("train/*", step_metric="training_step")
+            run.define_metric("val/*", step_metric="training_step")
+            run.define_metric("test/*", step_metric="training_step")
+            run.define_metric("discarded/*", step_metric="training_step")
             self._sync_wandb_config(run)
         return self._wandb_run
 
@@ -562,14 +563,16 @@ def _log_metrics(
                 run.log(prefixed)
 
     def _define_wandb_step_metrics(self, keys: Iterable[str]) -> None:
-        import wandb
+        run = self._wandb_run  # this model's run; replaces the module-level wandb.define_metric call
+        if run is None or run._is_finished:  # no run yet, or the run is flagged finished
+            return  # nothing to define; keys are left unrecorded
 
         for key in keys:
             if not key.startswith("costs/"):
                 continue
             if key in self._wandb_defined_metrics:
                 continue
-            wandb.define_metric(key, step_metric="training_step")
+            run.define_metric(key, step_metric="training_step")  # define the key on this run with training_step as its step metric
             self._wandb_defined_metrics.add(key)
 
     def _route_metrics_and_collect_non_costs(
diff --git a/tests/unit/test_wandb_multi_run.py b/tests/unit/test_wandb_multi_run.py
@@ -0,0 +1,92 @@
+import os
+import sys
+from pathlib import Path
+from unittest.mock import patch
+
+from art import Model
+
+
+def test_wandb_creates_separate_runs_per_model(tmp_path: Path):
+    class FakeRun:  # stand-in run object that records every define_metric call
+        def __init__(self, name: str):
+            self.name = name
+            self.id = name
+            self._is_finished = False  # read by Model._define_wandb_step_metrics
+            self.defined_metrics: list[tuple[str, str | None]] = []
+
+        def define_metric(self, name: str, *, step_metric: str | None = None) -> None:
+            self.defined_metrics.append((name, step_metric))
+
+    class FakeWandb:  # stand-in for the wandb module; records init kwargs and created runs
+        def __init__(self):
+            self.init_calls: list[dict] = []
+            self.runs: list[FakeRun] = []
+
+        @staticmethod
+        def Settings(**kwargs):
+            return kwargs  # echo the settings kwargs back as a plain dict
+
+        def init(self, **kwargs):
+            self.init_calls.append(kwargs)
+            run = FakeRun(kwargs["name"])  # one fresh fake run per init call
+            self.runs.append(run)
+            return run
+
+        def define_metric(self, *args, **kwargs) -> None:  # module-level path must never be used
+            raise AssertionError("Model should define metrics on the run object")
+
+    fake_wandb = FakeWandb()
+    model_one = Model(
+        name="run-one",
+        project="test-project",
+        base_path=str(tmp_path),
+    )
+    model_two = Model(
+        name="run-two",
+        project="test-project",
+        base_path=str(tmp_path),
+    )
+
+    with patch.dict(os.environ, {"WANDB_API_KEY": "test-key"}):  # presumably required for run creation; confirm in Model._get_wandb_run
+        with patch.dict(sys.modules, {"wandb": fake_wandb}):  # "import wandb" resolves to the fake inside this block
+            run_one = model_one._get_wandb_run()
+            run_two = model_two._get_wandb_run()
+            model_one._define_wandb_step_metrics(["costs/train/custom"])  # extra key for model_one only
+
+    assert run_one is not None
+    assert run_two is not None
+    assert run_one is not run_two  # each model got its own run object
+    assert [call["name"] for call in fake_wandb.init_calls] == [
+        "run-one",
+        "run-two",
+    ]
+    assert all(call["reinit"] == "create_new" for call in fake_wandb.init_calls)  # every init passes reinit="create_new"
+    assert run_one.defined_metrics == [  # default definitions followed by the custom costs key
+        ("training_step", None),
+        ("time/wall_clock_sec", None),
+        ("reward/*", "training_step"),
+        ("loss/*", "training_step"),
+        ("throughput/*", "training_step"),
+        ("costs/*", "training_step"),
+        ("time/*", "training_step"),
+        ("data/*", "training_step"),
+        ("train/*", "training_step"),
+        ("val/*", "training_step"),
+        ("test/*", "training_step"),
+        ("discarded/*", "training_step"),
+        ("costs/train/custom", "training_step"),
+    ]
+    assert run_two.defined_metrics == [  # defaults only; model_one's custom key did not land here
+        ("training_step", None),
+        ("time/wall_clock_sec", None),
+        ("reward/*", "training_step"),
+        ("loss/*", "training_step"),
+        ("throughput/*", "training_step"),
+        ("costs/*", "training_step"),
+        ("time/*", "training_step"),
+        ("data/*", "training_step"),
+        ("train/*", "training_step"),
+        ("val/*", "training_step"),
+        ("test/*", "training_step"),
+        ("discarded/*", "training_step"),
+    ]