Commit 3d8d1f5

Fix Megatron row-parallel LoRA grad sync

1 parent f6cd445

1 file changed: src/art/megatron/lora.py (6 additions, 2 deletions)
@@ -398,7 +398,9 @@ def __init__(
             update={
                 "sharded": False,
                 "shard_dim": None,
-                "grad_sync_op": GRAD_SYNC_OP_SUM,  # sum replicated TP contributions
+                # Row-parallel output uses TP collectives whose backward already gives
+                # replicated B the full output gradient on each TP rank.
+                "grad_sync_op": GRAD_SYNC_OP_NONE,
             }
         )
         self.lora = LoRA(
@@ -689,7 +691,9 @@ def __init__(
                 "sharded": False,
                 "shard_dim": None,
                 "grad_sync_domain": EXPERT_TP_GRAD_SYNC_DOMAIN,
-                "grad_sync_op": GRAD_SYNC_OP_SUM,  # we handle this with extended finalize_grads
+                # Expert row-parallel output follows the same pattern: replicated B
+                # already sees the full gradient from the backward TP collective.
+                "grad_sync_op": GRAD_SYNC_OP_NONE,
             }
         )
         self.lora = LoRA(
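
Why GRAD_SYNC_OP_NONE is the right setting here: in a row-parallel layer the output is formed by a tensor-parallel all-reduce, and the backward of that all-reduce hands every TP rank the full output gradient. A replicated LoRA B therefore already accumulates its complete gradient locally, and an extra SUM across TP ranks would scale it by the TP world size. The single-process sketch below emulates a TP group of two ranks to illustrate this. It is illustrative only, not the repo's code: the names tp, X_shards, A_shards, and dB_per_rank are invented here, and an explicit Python sum stands in for the forward all-reduce.

    # Minimal sketch (assumed names, single process): why a replicated LoRA B on a
    # row-parallel layer needs no gradient all-reduce across TP ranks.
    import torch

    torch.manual_seed(0)
    tp = 2                   # emulated tensor-parallel world size
    X = torch.randn(4, 8)    # full activation (batch 4, hidden 8)
    A = torch.randn(8, 2)    # LoRA A, row-sharded across TP ranks like the base weight
    B0 = torch.randn(2, 6)   # LoRA B, replicated on every rank

    X_shards = X.chunk(tp, dim=1)   # each rank sees a slice of the hidden dim
    A_shards = A.chunk(tp, dim=0)   # and the matching slice of A's rows

    dB_per_rank = []
    for rank in range(tp):
        B = B0.clone().requires_grad_(True)
        # Forward: each rank computes a partial X_i @ A_i; the all-reduce (emulated
        # by the explicit sum) leaves the full low-rank activation Z on every rank.
        Z = sum(X_shards[i] @ A_shards[i] for i in range(tp))
        Y = Z @ B
        # Backward of the output all-reduce replicates the full dY on every rank.
        Y.backward(torch.ones_like(Y))
        dB_per_rank.append(B.grad)

    # Reference gradient computed without any sharding.
    B = B0.clone().requires_grad_(True)
    Y_ref = (X @ A) @ B
    Y_ref.backward(torch.ones_like(Y_ref))
    dB_full = B.grad

    # Every emulated rank already holds the complete gradient for B, so a further
    # SUM over TP ranks would multiply it by tp.
    assert torch.allclose(dB_per_rank[0], dB_per_rank[1])
    assert torch.allclose(dB_per_rank[0], dB_full, atol=1e-5)

Under this layout the previous GRAD_SYNC_OP_SUM would have multiplied B's gradient by the TP size, which is the mismatch the commit removes.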
