diff --git a/deepspeed/runtime/precision_config.py b/deepspeed/runtime/precision_config.py index efec5c9d00c8..c2bb16b04e6d 100644 --- a/deepspeed/runtime/precision_config.py +++ b/deepspeed/runtime/precision_config.py @@ -24,7 +24,7 @@ "bf16": { "enabled": true, "immediate_grad_update": false, - "check_grad_overflow": false + "check_grad_overflow": true } ''' BFLOAT16 = "bf16" @@ -53,9 +53,20 @@ class DeepSpeedBF16Config(DeepSpeedConfigModel): Apply gradient updates immediately rather than delayed. """ - check_grad_overflow: bool = False - """ - Check for gradient overflows and underflows + check_grad_overflow: bool = True + """ + Detect gradient overflow/underflow before the optimizer step and skip the step + when detected. Defaults to True (matching the fp16 default) because bf16 partition-flat + gradient accumulation in ZeRO-2 with heterogeneous per-sample loss masks (e.g. + Mixture-of-Transformers + per-sample validity dropout) can produce a bf16 element + that overflows to +inf in averaged_gradients[i]. Without this check, Adam.step + computes inf/sqrt(inf)=NaN inside a fused kernel, simultaneously corrupting + thousands of parameter tensors and ending the training run with no useful + diagnostic. Set to False only if you have measured this check to be too expensive + and have separately confirmed your bf16 path cannot overflow. + See: + - https://github.com/deepspeedai/DeepSpeed/issues/5242 + - https://github.com/deepspeedai/DeepSpeed/pull/6976 (introduced the option) """ bf16_master_weights_and_grads: bool = False