From 2dcef3bbdf3833e593bde38f278a050a9931022a Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski <pgadzinski@nvidia.com>
Date: Mon, 25 May 2026 11:06:29 +0200
Subject: [PATCH 1/3] [PyTorch Debug] Fix scale_inv_min always returning 0 for
 MXFP8/NVFP4

MXFP8/NVFP4 quantizers zero-pad scale_inv so that its shape is a multiple
of 128 along one axis and 4 along the other. A plain .min() over the
whole tensor therefore returned 0 whenever padding was added. Mask the
zeros out before computing the minimum.

Fixes #2628

Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>
---
 .../debug/features/utils/stats_computation.py        | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/transformer_engine/debug/features/utils/stats_computation.py b/transformer_engine/debug/features/utils/stats_computation.py
index b0002ffee6..79f043714b 100644
--- a/transformer_engine/debug/features/utils/stats_computation.py
+++ b/transformer_engine/debug/features/utils/stats_computation.py
@@ -348,6 +348,16 @@ def get_scale_inv(quantized_tensor, columnwise):
             return getattr(quantized_tensor, "_columnwise_scale_inv")
         return getattr(quantized_tensor, "_rowwise_scale_inv")
 
+    def nonzero_min(scale_inv):
+        # MXFP8/NVFP4 scale_inv is padded to a multiple of [128, 4] (or [4, 128]
+        # for columnwise) with zeros, so a plain .min() always returns 0. Mask
+        # those padding zeros out; if everything is zero (degenerate case) fall
+        # back to 0 so the buffer aggregation stays well-defined.
+        nz = scale_inv[scale_inv != 0]
+        if nz.numel() == 0:
+            return scale_inv.new_zeros(())
+        return nz.min()
+
     columnwise_suffix = "_columnwise" if columnwise else ""
     # Prepare stat names.
     stat_name_min = (
@@ -363,7 +373,7 @@ def get_scale_inv(quantized_tensor, columnwise):
 
     # Capture the attribute name inside lambdas via default args to avoid late binding.
     STATS[stat_name_min] = (
-        lambda x, aux_dict, _col=columnwise: get_scale_inv(aux_dict[recipe_name], _col).min(),
+        lambda x, aux_dict, _col=columnwise: nonzero_min(get_scale_inv(aux_dict[recipe_name], _col)),
         lambda buffers, _sn=stat_name_min: min(_get(buffers, _sn)),
     )
     STATS[stat_name_max] = (

From 9bd7c6f241f6a93b129b6cea0ad841d8ad639e7e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 09:18:56 +0000
Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 transformer_engine/debug/features/utils/stats_computation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/transformer_engine/debug/features/utils/stats_computation.py b/transformer_engine/debug/features/utils/stats_computation.py
index 79f043714b..b512aa604b 100644
--- a/transformer_engine/debug/features/utils/stats_computation.py
+++ b/transformer_engine/debug/features/utils/stats_computation.py
@@ -373,7 +373,9 @@ def nonzero_min(scale_inv):
 
     # Capture the attribute name inside lambdas via default args to avoid late binding.
     STATS[stat_name_min] = (
-        lambda x, aux_dict, _col=columnwise: nonzero_min(get_scale_inv(aux_dict[recipe_name], _col)),
+        lambda x, aux_dict, _col=columnwise: nonzero_min(
+            get_scale_inv(aux_dict[recipe_name], _col)
+        ),
         lambda buffers, _sn=stat_name_min: min(_get(buffers, _sn)),
     )
     STATS[stat_name_max] = (

From d1ea951b69b0af4b12b8da726eb0729e5aff7004 Mon Sep 17 00:00:00 2001
From: Pawel Gadzinski <pgadzinski@nvidia.com>
Date: Mon, 25 May 2026 11:25:16 +0200
Subject: [PATCH 3/3] Clarify scale_inv padding comment

The previous comment said scale_inv is always padded to multiples of
[128, 4] (or [4, 128] columnwise). That is true for MXFP8 but inaccurate
for NVFP4 columnwise, which is padded to multiples of [128, 4], not
[4, 128]. Also note that a real scale_inv entry is never 0
(compute_scale_from_amax returns scale=1.0 for all-zero blocks), so
masking zeros is exact rather than heuristic.

Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>
---
 .../debug/features/utils/stats_computation.py       | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/transformer_engine/debug/features/utils/stats_computation.py b/transformer_engine/debug/features/utils/stats_computation.py
index b512aa604b..6668400017 100644
--- a/transformer_engine/debug/features/utils/stats_computation.py
+++ b/transformer_engine/debug/features/utils/stats_computation.py
@@ -349,10 +349,15 @@ def get_scale_inv(quantized_tensor, columnwise):
         return getattr(quantized_tensor, "_rowwise_scale_inv")
 
     def nonzero_min(scale_inv):
-        # MXFP8/NVFP4 scale_inv is padded to a multiple of [128, 4] (or [4, 128]
-        # for columnwise) with zeros, so a plain .min() always returns 0. Mask
-        # those padding zeros out; if everything is zero (degenerate case) fall
-        # back to 0 so the buffer aggregation stays well-defined.
+        # MXFP8/NVFP4 quantizers round the scale_inv shape up to multiples of
+        # 128 along one axis and 4 along the other and fill the extra slots
+        # with zeros (via torch.nn.functional.pad with the default value=0),
+        # so a plain .min() always returns 0 for shapes that needed padding.
+        # A real scale_inv entry is never 0: compute_scale_from_amax returns
+        # scale=1.0 for all-zero blocks and clamps the inf case to a finite
+        # fallback, so zeros uniquely identify padding and masking them out
+        # gives the true minimum. The empty-after-mask branch is a safety
+        # net for the (in practice unreachable) all-zero tensor.
         nz = scale_inv[scale_inv != 0]
         if nz.numel() == 0:
             return scale_inv.new_zeros(())