@@ -352,35 +352,54 @@ def _zero_contribution_inputs(template: PackedTensors) -> PackedTensors:
     return dummy


+def resolve_global_grad_accumulation_sequences(
+    global_grad_accumulation_sequences: int | None,
+) -> int:
+    dp_world_size = ps.get_data_parallel_world_size()
+    if global_grad_accumulation_sequences is None:
+        return dp_world_size
+    return global_grad_accumulation_sequences
+
+
 def resolve_local_grad_accumulation_sequences(
-    global_grad_accumulation_sequences: int,
+    global_grad_accumulation_sequences: int | None,
 ) -> int:
+    resolved_global_grad_accumulation_sequences = (
+        resolve_global_grad_accumulation_sequences(
+            global_grad_accumulation_sequences=global_grad_accumulation_sequences
+        )
+    )
     dp_world_size = ps.get_data_parallel_world_size()
     if (
-        global_grad_accumulation_sequences <= 0
-        or global_grad_accumulation_sequences % dp_world_size != 0
+        resolved_global_grad_accumulation_sequences <= 0
+        or resolved_global_grad_accumulation_sequences % dp_world_size != 0
     ):
         raise RuntimeError(
             "Invalid global grad accumulation / DP world size combination: "
-            f"global_grad_accumulation_sequences={global_grad_accumulation_sequences}, "
+            f"global_grad_accumulation_sequences={resolved_global_grad_accumulation_sequences}, "
             f"dp_world_size={dp_world_size}"
         )
-    return global_grad_accumulation_sequences // dp_world_size
+    return resolved_global_grad_accumulation_sequences // dp_world_size


 def build_micro_sample_indices(
     step_index: int,
     num_sequences: int,
-    global_grad_accumulation_sequences: int,
+    global_grad_accumulation_sequences: int | None,
 ) -> list[int | None]:
     dp_rank = ps.get_data_parallel_rank()
+    resolved_global_grad_accumulation_sequences = (
+        resolve_global_grad_accumulation_sequences(
+            global_grad_accumulation_sequences=global_grad_accumulation_sequences
+        )
+    )
     dp_world_size = ps.get_data_parallel_world_size()
     local_grad_accumulation_sequences = resolve_local_grad_accumulation_sequences(
-        global_grad_accumulation_sequences=global_grad_accumulation_sequences,
+        global_grad_accumulation_sequences=resolved_global_grad_accumulation_sequences,
     )
-    base_global_sample_index = step_index * global_grad_accumulation_sequences
+    base_global_sample_index = step_index * resolved_global_grad_accumulation_sequences
     global_step_indices: list[int | None] = []
-    for offset in range(global_grad_accumulation_sequences):
+    for offset in range(resolved_global_grad_accumulation_sequences):
         global_sample_index = base_global_sample_index + offset
         global_step_indices.append(
             global_sample_index if global_sample_index < num_sequences else None
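To make the new resolution semantics concrete, here is a minimal, self-contained sketch, not part of the PR: the `_StubPS` class and the shortened function names are stand-ins for illustration only.

# Minimal sketch of the resolution semantics introduced above. `_StubPS`
# stands in for the real `ps` parallel-state module; names are hypothetical.
class _StubPS:
    @staticmethod
    def get_data_parallel_world_size() -> int:
        return 4

ps = _StubPS()

def resolve_global(n: int | None) -> int:
    # None falls back to one sequence per data-parallel rank.
    return ps.get_data_parallel_world_size() if n is None else n

def resolve_local(n: int | None) -> int:
    resolved = resolve_global(n)
    dp = ps.get_data_parallel_world_size()
    if resolved <= 0 or resolved % dp != 0:
        raise RuntimeError(f"invalid: global={resolved}, dp_world_size={dp}")
    return resolved // dp

assert resolve_global(None) == 4   # default: dp_world_size
assert resolve_local(8) == 2       # 8 sequences over 4 ranks -> 2 per rank
try:
    resolve_local(6)               # 6 % 4 != 0 -> rejected
except RuntimeError:
    pass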
@@ -479,10 +498,15 @@ def run_training_step(
         micro_sample_indices = [sample_index]

     if moe_routing_replay_controller is not None:
+        resolved_global_grad_accumulation_sequences = (
+            resolve_global_grad_accumulation_sequences(
+                config.grad_accumulation_sequences
+            )
+        )
         moe_routing_replay_controller.set_step(
             step_index=step_index,
             sample_index=micro_sample_indices,
-            global_grad_accumulation_sequences=config.grad_accumulation_sequences,
+            global_grad_accumulation_sequences=resolved_global_grad_accumulation_sequences,
         )

     device = next(model_chunks[0].parameters()).device
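The call-site change above means the controller never sees None. A hypothetical stand-alone illustration; `Config`, `DummyController`, and `resolve` are stand-ins, not part of this PR:

# Hypothetical illustration of the call-site change: with
# grad_accumulation_sequences=None in the config, the controller now
# receives the resolved dp_world_size rather than None.
from dataclasses import dataclass

@dataclass
class Config:
    grad_accumulation_sequences: int | None = None

class DummyController:
    def set_step(self, step_index, sample_index, global_grad_accumulation_sequences):
        assert global_grad_accumulation_sequences is not None

def resolve(n: int | None, dp_world_size: int = 4) -> int:
    return dp_world_size if n is None else n

DummyController().set_step(
    step_index=0,
    sample_index=[0],
    global_grad_accumulation_sequences=resolve(Config().grad_accumulation_sequences),
)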
@@ -532,6 +556,7 @@ def run_training_step(
     if new_logprobs is None or raw_loss_sum is None:
         raise RuntimeError("run_training_step did not produce outputs")

+    # num_tokens is reduced in place across ranks by finalize_model_grads().
     finalize_model_grads_extended(model_chunks, num_tokens=num_tokens)
     update_successful, grad_norm, num_zeros_in_grad = _optimizer_step(
         optimizer,
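For readers unfamiliar with the in-place reduction the new comment refers to, here is a minimal sketch of the general pattern using torch.distributed; it is an illustrative assumption, not the actual body of finalize_model_grads().

# Sketch of an in-place cross-rank reduction like the one the comment
# describes: after all_reduce, every rank's num_tokens tensor holds the
# global sum. Illustrative only.
import torch
import torch.distributed as dist

def reduce_num_tokens_in_place(num_tokens: torch.Tensor) -> None:
    if dist.is_initialized():
        dist.all_reduce(num_tokens, op=dist.ReduceOp.SUM)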