From 2dcef3bbdf3833e593bde38f278a050a9931022a Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Mon, 25 May 2026 11:06:29 +0200 Subject: [PATCH 1/3] [PyTorch Debug] Fix scale_inv_min always returning 0 for MXFP8/NVFP4 MXFP8/NVFP4 quantizers pad scale_inv to multiples of [128, 4] (or [4, 128] columnwise) with zeros, so a plain .min() over the whole tensor was always returning 0. Mask zeros out before computing the minimum. Fixes #2628 Signed-off-by: Pawel Gadzinski --- .../debug/features/utils/stats_computation.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/transformer_engine/debug/features/utils/stats_computation.py b/transformer_engine/debug/features/utils/stats_computation.py index b0002ffee6..79f043714b 100644 --- a/transformer_engine/debug/features/utils/stats_computation.py +++ b/transformer_engine/debug/features/utils/stats_computation.py @@ -348,6 +348,16 @@ def get_scale_inv(quantized_tensor, columnwise): return getattr(quantized_tensor, "_columnwise_scale_inv") return getattr(quantized_tensor, "_rowwise_scale_inv") + def nonzero_min(scale_inv): + # MXFP8/NVFP4 scale_inv is padded to a multiple of [128, 4] (or [4, 128] + # for columnwise) with zeros, so a plain .min() always returns 0. Mask + # those padding zeros out; if everything is zero (degenerate case) fall + # back to 0 so the buffer aggregation stays well-defined. + nz = scale_inv[scale_inv != 0] + if nz.numel() == 0: + return scale_inv.new_zeros(()) + return nz.min() + columnwise_suffix = "_columnwise" if columnwise else "" # Prepare stat names. stat_name_min = ( @@ -363,7 +373,7 @@ def get_scale_inv(quantized_tensor, columnwise): # Capture the attribute name inside lambdas via default args to avoid late binding. STATS[stat_name_min] = ( - lambda x, aux_dict, _col=columnwise: get_scale_inv(aux_dict[recipe_name], _col).min(), + lambda x, aux_dict, _col=columnwise: nonzero_min(get_scale_inv(aux_dict[recipe_name], _col)), lambda buffers, _sn=stat_name_min: min(_get(buffers, _sn)), ) STATS[stat_name_max] = ( From 9bd7c6f241f6a93b129b6cea0ad841d8ad639e7e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 May 2026 09:18:56 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/debug/features/utils/stats_computation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transformer_engine/debug/features/utils/stats_computation.py b/transformer_engine/debug/features/utils/stats_computation.py index 79f043714b..b512aa604b 100644 --- a/transformer_engine/debug/features/utils/stats_computation.py +++ b/transformer_engine/debug/features/utils/stats_computation.py @@ -373,7 +373,9 @@ def nonzero_min(scale_inv): # Capture the attribute name inside lambdas via default args to avoid late binding. STATS[stat_name_min] = ( - lambda x, aux_dict, _col=columnwise: nonzero_min(get_scale_inv(aux_dict[recipe_name], _col)), + lambda x, aux_dict, _col=columnwise: nonzero_min( + get_scale_inv(aux_dict[recipe_name], _col) + ), lambda buffers, _sn=stat_name_min: min(_get(buffers, _sn)), ) STATS[stat_name_max] = ( From d1ea951b69b0af4b12b8da726eb0729e5aff7004 Mon Sep 17 00:00:00 2001 From: Pawel Gadzinski Date: Mon, 25 May 2026 11:25:16 +0200 Subject: [PATCH 3/3] Clarify scale_inv padding comment The previous wording said the padding was always [128, 4] / [4, 128], which is true for MXFP8 but inaccurate for NVFP4 columnwise (padded to [128, 4], not [4, 128]). Also note that scale_inv is never naturally 0 (compute_scale_from_amax returns 1.0 for all-zero blocks), so masking zeros is exact rather than heuristic. Signed-off-by: Pawel Gadzinski --- .../debug/features/utils/stats_computation.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/transformer_engine/debug/features/utils/stats_computation.py b/transformer_engine/debug/features/utils/stats_computation.py index b512aa604b..6668400017 100644 --- a/transformer_engine/debug/features/utils/stats_computation.py +++ b/transformer_engine/debug/features/utils/stats_computation.py @@ -349,10 +349,15 @@ def get_scale_inv(quantized_tensor, columnwise): return getattr(quantized_tensor, "_rowwise_scale_inv") def nonzero_min(scale_inv): - # MXFP8/NVFP4 scale_inv is padded to a multiple of [128, 4] (or [4, 128] - # for columnwise) with zeros, so a plain .min() always returns 0. Mask - # those padding zeros out; if everything is zero (degenerate case) fall - # back to 0 so the buffer aggregation stays well-defined. + # MXFP8/NVFP4 quantizers round the scale_inv shape up to multiples of + # 128 along one axis and 4 along the other and fill the extra slots + # with zeros (via torch.nn.functional.pad with the default value=0), + # so a plain .min() always returns 0 for shapes that needed padding. + # A real scale_inv entry is never 0: compute_scale_from_amax returns + # scale=1.0 for all-zero blocks and clamps the inf case to a finite + # fallback, so zeros uniquely identify padding and masking them out + # gives the true minimum. The empty-after-mask branch is a safety + # net for the (in practice unreachable) all-zero tensor. nz = scale_inv[scale_inv != 0] if nz.numel() == 0: return scale_inv.new_zeros(())