From c873e4644c95f77572a3a14fa2663173ddcbb1b6 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Tue, 19 May 2026 22:25:50 +0000
Subject: [PATCH 1/7] add production GEMM tests

---
 tests/cpp/operator/CMakeLists.txt        |   1 +
 tests/cpp/operator/test_gemm_prodgemm.cu | 396 +++++++++++++++++++++++
 2 files changed, 397 insertions(+)
 create mode 100644 tests/cpp/operator/test_gemm_prodgemm.cu
diff --git a/tests/cpp/operator/CMakeLists.txt b/tests/cpp/operator/CMakeLists.txt
index 0ebd7fdfe..0eded7219 100644
--- a/tests/cpp/operator/CMakeLists.txt
+++ b/tests/cpp/operator/CMakeLists.txt
@@ -39,6 +39,7 @@ if(USE_CUDA)
 else()
   list(APPEND test_cuda_sources
               test_cublaslt_gemm.cu
+              test_gemm_prodgemm.cu
               test_cast_mxfp4_transpose.cu)
 endif()
 
diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
new file mode 100644
index 000000000..2a086ddea
--- /dev/null
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -0,0 +1,396 @@
+/*************************************************************************
+ * Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * License for AMD contributions = MIT. See LICENSE for more information
+ ************************************************************************/
+
+/*
+ * MXFP8 GEMM correctness tests for production LLM shapes.
+ *
+ * Tests forward, dgrad, and wgrad passes with appropriate FP8 type combos:
+ *   Forward:  E4M3 x E4M3 -> BF16
+ *   Dgrad:    E5M2 x E4M3 -> BF16
+ *   Wgrad:    E4M3 x E5M2 -> BF16
+ *
+ * Each shape is tested with 3 transpose configs (TN, NN, NT) and
+ * 3 micro-batch sizes (MBS = 1, 2, 4 -> tokens = 4096, 8192, 16384).
+ */
+
+#ifdef __HIP_PLATFORM_AMD__
+
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <cuda_bf16.h>
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <transformer_engine/cast.h>
+#include <transformer_engine/gemm.h>
+#include <transformer_engine/swizzle.h>
+#include <transformer_engine/transformer_engine.h>
+#include "../test_common.h"
+
+using namespace transformer_engine;
+using namespace test;
+
+namespace {
+
+using fp32 = float;
+using fp8  = fp8e4m3;
+using bf8  = fp8e5m2;
+
+using TShape = std::vector<size_t>;
+using Layout = std::pair<bool, bool>;  // {transa, transb}
+
+static const Layout kTN{true, false};
+static const Layout kNN{false, false};
+static const Layout kNT{false, true};
+static const std::vector<Layout> kLayouts = {kTN, kNN, kNT};
+
+// ============================================================================
+// GemmPass: determines A/B FP8 type combination
+//   FWD:   fp8  x fp8  (E4M3 x E4M3)
+//   DGRAD: bf8  x fp8  (E5M2 x E4M3)
+//   WGRAD: fp8  x bf8  (E4M3 x E5M2)
+// ============================================================================
+
+enum class GemmPass { FWD, DGRAD, WGRAD };
+
+// ============================================================================
+// Shape definition: describes a GEMM from the model architecture.
+//
+//   Forward / Dgrad: M = tokens, dim1 = N, dim2 = K
+//   Wgrad:           K = tokens, dim1 = M, dim2 = N
+// ============================================================================
+
+struct ShapeDef {
+    const char* label;
+    size_t dim1;
+    size_t dim2;
+    GemmPass pass;
+};
+
+// LLM1 (hidden=7168, MLA, seq=4096)
+
+static const ShapeDef llm1_shapes[] = {
+    // Forward (M=tokens, N, K)
+    {"LLM1_Linear0_fwd",           1536,  7168, GemmPass::FWD},
+    {"LLM1_Linear1_fwd",            576,  7168, GemmPass::FWD},
+    {"LLM1_LNLinear0_fwd",        24576,  1536, GemmPass::FWD},
+    {"LLM1_LNLinear1_fwd",        32768,   512, GemmPass::FWD},
+    {"LLM1_Linear_attn_fwd",       7168, 16384, GemmPass::FWD},
+    {"LLM1_LNMLP_gateup_fwd",     36864,  7168, GemmPass::FWD},
+    {"LLM1_LNMLP_down_fwd",        7168, 18432, GemmPass::FWD},
+    {"LLM1_SharedExp_gu_fwd",      4096,  7168, GemmPass::FWD},
+    {"LLM1_SharedExp_dn_fwd",      7168,  2048, GemmPass::FWD},
+    {"LLM1_TopKRouter_fwd",         256,  7168, GemmPass::FWD},
+    // Dgrad (M=tokens, N, K)
+    {"LLM1_attn_dgrad",           16384,  7168, GemmPass::DGRAD},
+    {"LLM1_LNLinear1_dgrad",        512, 32768, GemmPass::DGRAD},
+    {"LLM1_LNLinear0_dgrad",       1536, 24576, GemmPass::DGRAD},
+    {"LLM1_SharedExp_dn_dgrad",    2048,  7168, GemmPass::DGRAD},
+    {"LLM1_SharedExp_gu_dgrad",    7168,  4096, GemmPass::DGRAD},
+    {"LLM1_TopKRouter_dgrad",      7168,   256, GemmPass::DGRAD},
+    {"LLM1_MLP_post_dgrad",        7168, 14336, GemmPass::DGRAD},
+    // Wgrad (M, N, K=tokens)
+    {"LLM1_attn_wgrad",           16384,  7168, GemmPass::WGRAD},
+    {"LLM1_LNLinear1_wgrad",        512, 32768, GemmPass::WGRAD},
+    {"LLM1_LNLinear0_wgrad",       1536, 24576, GemmPass::WGRAD},
+    {"LLM1_SharedExp_dn_wgrad",    2048,  7168, GemmPass::WGRAD},
+    {"LLM1_SharedExp_gu_wgrad",    7168,  4096, GemmPass::WGRAD},
+    {"LLM1_TopKRouter_wgrad",      7168,   256, GemmPass::WGRAD},
+};
+
+// LLM1 LM Head (large N, memory-intensive)
+static const ShapeDef llm1_lm_head_shapes[] = {
+    {"LLM1_LMHead_fwd",          129280,  7168, GemmPass::FWD},
+    {"LLM1_LMHead_dgrad",          7168,129280, GemmPass::DGRAD},
+    {"LLM1_LMHead_wgrad",          7168,129280, GemmPass::WGRAD},
+};
+
+// LLM2 (hidden=4096, GQA, seq=4096)
+
+static const ShapeDef llm2_shapes[] = {
+    // Forward (M=tokens, N, K)
+    {"LLM2_LNLinear_QKV_fwd",   9216,  4096, GemmPass::FWD},
+    {"LLM2_Linear_attn_fwd",    4096,  8192, GemmPass::FWD},
+    {"LLM2_Router_fwd",          128,  4096, GemmPass::FWD},
+    // Dgrad (M=tokens, N, K)
+    {"LLM2_Router_dgrad",       4096,   128, GemmPass::DGRAD},
+    {"LLM2_Linear_attn_dgrad",  8192,  4096, GemmPass::DGRAD},
+    {"LLM2_LNLinear_dgrad",     4096,  9216, GemmPass::DGRAD},
+    // Wgrad (M, N, K=tokens)
+    {"LLM2_Router_wgrad",       4096,   128, GemmPass::WGRAD},
+    {"LLM2_Linear_attn_wgrad",  8192,  4096, GemmPass::WGRAD},
+    {"LLM2_LNLinear_wgrad",     4096,  9216, GemmPass::WGRAD},
+};
+
+// LLM2 LM Head (large N, memory-intensive)
+static const ShapeDef llm2_lm_head_shapes[] = {
+    {"LLM2_LMHead_fwd",        151936,  4096, GemmPass::FWD},
+    {"LLM2_LMHead_dgrad",        4096,151936, GemmPass::DGRAD},
+    {"LLM2_LMHead_wgrad",        4096,151936, GemmPass::WGRAD},
+};
+
+// ============================================================================
+// Test case: a concrete (M, K, N) shape with pass info, ready for execution
+// ============================================================================
+
+struct ProdGemmTestCase {
+    std::string label;
+    size_t m, k, n;
+    GemmPass pass;
+};
+
+std::ostream& operator<<(std::ostream& os, const ProdGemmTestCase& tc) {
+    return os << tc.label;
+}
+
+static std::vector<ProdGemmTestCase> expand_shapes(const ShapeDef* defs, size_t count) {
+    std::vector<ProdGemmTestCase> cases;
+    for (size_t i = 0; i < count; ++i) {
+        const auto& s = defs[i];
+        for (size_t mbs : {1, 2, 4}) {
+            size_t tokens = mbs * 4096;
+            ProdGemmTestCase tc;
+            tc.label = std::string(s.label) + "_mbs" + std::to_string(mbs);
+            tc.pass = s.pass;
+            switch (s.pass) {
+                case GemmPass::FWD:
+                case GemmPass::DGRAD:
+                    tc.m = tokens;
+                    tc.n = s.dim1;
+                    tc.k = s.dim2;
+                    break;
+                case GemmPass::WGRAD:
+                    tc.m = s.dim1;
+                    tc.n = s.dim2;
+                    tc.k = tokens;
+                    break;
+            }
+            cases.push_back(std::move(tc));
+        }
+    }
+    return cases;
+}
+
+static std::vector<ProdGemmTestCase> generate_model_test_cases() {
+    auto v1   = expand_shapes(llm1_shapes,   std::size(llm1_shapes));
+    auto v2   = expand_shapes(llm2_shapes,   std::size(llm2_shapes));
+    v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
+                        std::make_move_iterator(v2.end()));
+    return v1;
+}
+
+static std::vector<ProdGemmTestCase> generate_lm_head_test_cases() {
+    auto v1   = expand_shapes(llm1_lm_head_shapes,   std::size(llm1_lm_head_shapes));
+    auto v2   = expand_shapes(llm2_lm_head_shapes,   std::size(llm2_lm_head_shapes));
+    v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
+                        std::make_move_iterator(v2.end()));
+    return v1;
+}
+
+// ============================================================================
+// Swizzle helper for gfx1250 MXFP8 scales (same as test_cublaslt_gemm.cu)
+// ============================================================================
+
+static void swizzle_mxfp8_scales(test::Tensor& t, bool rowwise) {
+    void* scale_ptr = rowwise ? t.rowwise_scale_inv_dptr()
+                              : t.columnwise_scale_inv_dptr();
+    if (!scale_ptr) return;
+
+    const NVTEShape scale_shape = rowwise ? t.rowwise_scale_inv_shape()
+                                          : t.columnwise_scale_inv_shape();
+    const NVTEShape data_shape  = rowwise ? t.rowwise_shape()
+                                          : t.columnwise_shape();
+
+    size_t num_scales = 1;
+    for (size_t d = 0; d < scale_shape.ndim; d++) num_scales *= scale_shape.data[d];
+
+    uint8_t* d_tmp = nullptr;
+    NVTE_CHECK_CUDA(cudaMalloc(&d_tmp, num_scales));
+
+    TensorWrapper input_tw(NVTE_MXFP8_1D_SCALING);
+    TensorWrapper output_tw(NVTE_MXFP8_1D_SCALING);
+    output_tw.set_with_gemm_swizzled_scales(true);
+
+    if (rowwise) {
+        input_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
+        input_tw.set_rowwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
+        output_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
+        output_tw.set_rowwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+    } else {
+        input_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
+        input_tw.set_columnwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
+        output_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
+        output_tw.set_columnwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+    }
+
+    nvte_swizzle_scaling_factors(input_tw.data(), output_tw.data(), 0);
+    NVTE_CHECK_CUDA(cudaDeviceSynchronize());
+    NVTE_CHECK_CUDA(cudaMemcpy(scale_ptr, d_tmp, num_scales, cudaMemcpyDeviceToDevice));
+    NVTE_CHECK_CUDA(cudaFree(d_tmp));
+}
+
+// ============================================================================
+// MXFP8 dequantize-based GEMM correctness test
+//
+// 1. Create random source matrices A_src, B_src in D_Type (bf16)
+// 2. Quantize:   A_src -> A_fp8, B_src -> B_fp8  (MXFP8 block scaling)
+// 3. Dequantize: A_fp8 -> A_ref, B_fp8 -> B_ref  (back to D_Type)
+// 4. Swizzle scales for gfx1250 (if needed)
+// 5. MXFP8 GEMM:     D     = A_fp8 * B_fp8
+// 6. Non-FP8 GEMM:   D_ref = A_ref * B_ref
+// 7. Compare D vs D_ref
+// ============================================================================
+
+template <typename A_Type, typename B_Type, typename D_Type>
+void performMxfp8DqTest(size_t m, size_t k, size_t n, bool transa, bool transb) {
+    DType atype = TypeInfo<A_Type>::dtype;
+    DType btype = TypeInfo<B_Type>::dtype;
+    DType dtype = TypeInfo<D_Type>::dtype;
+
+    ASSERT_TRUE(isFp8Type(atype) && isFp8Type(btype)) << "FP8/BF8 input types expected";
+    ASSERT_FALSE(isFp8Type(dtype)) << "Non-FP8 output type expected";
+
+    if (m % 16 || n % 16) {
+        GTEST_SKIP() << "MXFP8 requires M & N to be multiples of 16";
+    }
+    if (k % 128) {
+        GTEST_SKIP() << "MXFP8 requires K to be a multiple of 128";
+    }
+
+    cudaDeviceProp prop;
+    (void)cudaGetDeviceProperties(&prop, 0);
+
+    bool mxfp8_supported = (prop.major == 9 && prop.minor >= 5) || prop.major >= 12;
+    if (!mxfp8_supported) {
+        GTEST_SKIP() << "MXFP8 is not supported on this GPU";
+    }
+
+    TShape a_shape = transa ? TShape{m, k} : TShape{k, m};
+    TShape b_shape = transb ? TShape{k, n} : TShape{n, k};
+
+    // 1. Create random source matrices
+    Tensor A_src("A_src", a_shape, dtype);
+    Tensor B_src("B_src", b_shape, dtype);
+    fillUniform(&A_src);
+    fillUniform(&B_src);
+
+    // 2. Quantize to FP8 with MXFP8 scaling
+    Tensor A_fp8("A_fp8", a_shape, atype, transa, !transa,
+                 NVTEScalingMode::NVTE_MXFP8_1D_SCALING);
+    Tensor B_fp8("B_fp8", b_shape, btype, !transb, transb,
+                 NVTEScalingMode::NVTE_MXFP8_1D_SCALING);
+    nvte_quantize(A_src.data(), A_fp8.data(), 0);
+    nvte_quantize(B_src.data(), B_fp8.data(), 0);
+
+    // 3. Dequantize back to reference type
+    Tensor A_ref("A_ref", a_shape, dtype);
+    Tensor B_ref("B_ref", b_shape, dtype);
+    nvte_dequantize(A_fp8.data(), A_ref.data(), 0);
+    nvte_dequantize(B_fp8.data(), B_ref.data(), 0);
+
+    // 4. Swizzle scales for gfx1250
+    if (prop.major == 12) {
+        const bool a_colwise = !transa;
+        const bool b_colwise = transb;
+        if (!a_colwise) swizzle_mxfp8_scales(A_fp8, true);
+        if (a_colwise)  swizzle_mxfp8_scales(A_fp8, false);
+        if (!b_colwise) swizzle_mxfp8_scales(B_fp8, true);
+        if (b_colwise)  swizzle_mxfp8_scales(B_fp8, false);
+    }
+
+    Tensor bias;
+    Tensor pre_gelu_out;
+
+    size_t workspace_size = 67108864;  // 64 MB
+    Tensor Workspace("Workspace", TShape{workspace_size}, DType::kByte);
+
+    // 5. MXFP8 GEMM
+    Tensor D("D", TShape{n, m}, dtype);
+    nvte_cublas_gemm(A_fp8.data(), B_fp8.data(), D.data(),
+                     bias.data(), pre_gelu_out.data(),
+                     transa, transb, false,
+                     Workspace.data(), false, false,
+                     prop.multiProcessorCount, 0);
+    D.to_cpu();
+
+    // 6. Non-FP8 reference GEMM
+    Tensor D_ref("D_ref", TShape{n, m}, dtype);
+    nvte_cublas_gemm(A_ref.data(), B_ref.data(), D_ref.data(),
+                     bias.data(), pre_gelu_out.data(),
+                     transa, transb, false,
+                     Workspace.data(), false, false,
+                     prop.multiProcessorCount, 0);
+    D_ref.to_cpu();
+
+    // Check for CUDA errors
+    (void)cudaDeviceSynchronize();
+    auto err = cudaGetLastError();
+    ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
+
+    // 7. Compare results
+    auto [atol, rtol] = getTolerances(dtype);
+    atol = std::max(atol, 5e-4);
+    rtol = std::max(rtol, 1e-3);
+    compareResults("D", D, D_ref.rowwise_cpu_dptr<D_Type>(), true, atol, rtol);
+}
+
+// ============================================================================
+// Test suite
+// ============================================================================
+
+using ProdGemmParam = std::tuple<ProdGemmTestCase, Layout>;
+
+class ProdGemmTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
+
+TEST_P(ProdGemmTestSuite, TestMxfp8Dq) {
+    const auto& tc = std::get<0>(GetParam());
+    const auto& layout = std::get<1>(GetParam());
+    bool transa = layout.first;
+    bool transb = layout.second;
+
+    switch (tc.pass) {
+        case GemmPass::FWD:
+            performMxfp8DqTest<fp8, fp8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            break;
+        case GemmPass::DGRAD:
+            performMxfp8DqTest<bf8, fp8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            break;
+        case GemmPass::WGRAD:
+            performMxfp8DqTest<fp8, bf8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            break;
+    }
+}
+
+static inline std::string TN(const Layout& layout) {
+    static const char* map[2][2] = {{"NN", "NT"}, {"TN", "TT"}};
+    return map[layout.first][layout.second];
+}
+
+// Regular model shapes (excluding LM Head)
+INSTANTIATE_TEST_SUITE_P(
+    ProdGemmModel, ProdGemmTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(generate_model_test_cases()),
+        ::testing::ValuesIn(kLayouts)),
+    [](const testing::TestParamInfo<ProdGemmParam>& info) {
+        return std::get<0>(info.param).label + "_" + TN(std::get<1>(info.param));
+    });
+
+// LM Head shapes (very large N, memory-intensive)
+INSTANTIATE_TEST_SUITE_P(
+    ProdGemmLMHead, ProdGemmTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(generate_lm_head_test_cases()),
+        ::testing::ValuesIn(kLayouts)),
+    [](const testing::TestParamInfo<ProdGemmParam>& info) {
+        return std::get<0>(info.param).label + "_" + TN(std::get<1>(info.param));
+    });
+
+}  // namespace
+
+#endif  // __HIP_PLATFORM_AMD__

From c4c2ea53fb19e030111e165ac60ca44fc0394f15 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Tue, 26 May 2026 10:55:28 -0500
Subject: [PATCH 2/7] rename

---
 tests/cpp/operator/test_gemm_prodgemm.cu | 101 +++++++++++------------
 1 file changed, 50 insertions(+), 51 deletions(-)

diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
index 2a086ddea..863ec8bb1 100644
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -37,7 +37,6 @@ using namespace test;
 
 namespace {
 
-using fp32 = float;
 using fp8  = fp8e4m3;
 using bf8  = fp8e5m2;
 
@@ -72,66 +71,66 @@ struct ShapeDef {
     GemmPass pass;
 };
 
-// LLM1 (hidden=7168, MLA, seq=4096)
+// DeepSeek3 (hidden=7168, MLA, seq=4096)
 
-static const ShapeDef llm1_shapes[] = {
+static const ShapeDef deepseek3_shapes[] = {
     // Forward (M=tokens, N, K)
-    {"LLM1_Linear0_fwd",           1536,  7168, GemmPass::FWD},
-    {"LLM1_Linear1_fwd",            576,  7168, GemmPass::FWD},
-    {"LLM1_LNLinear0_fwd",        24576,  1536, GemmPass::FWD},
-    {"LLM1_LNLinear1_fwd",        32768,   512, GemmPass::FWD},
-    {"LLM1_Linear_attn_fwd",       7168, 16384, GemmPass::FWD},
-    {"LLM1_LNMLP_gateup_fwd",     36864,  7168, GemmPass::FWD},
-    {"LLM1_LNMLP_down_fwd",        7168, 18432, GemmPass::FWD},
-    {"LLM1_SharedExp_gu_fwd",      4096,  7168, GemmPass::FWD},
-    {"LLM1_SharedExp_dn_fwd",      7168,  2048, GemmPass::FWD},
-    {"LLM1_TopKRouter_fwd",         256,  7168, GemmPass::FWD},
+    {"DeepSeek3_Linear0_fwd",        1536,  7168, GemmPass::FWD},
+    {"DeepSeek3_Linear1_fwd",         576,  7168, GemmPass::FWD},
+    {"DeepSeek3_LNLinear0_fwd",     24576,  1536, GemmPass::FWD},
+    {"DeepSeek3_LNLinear1_fwd",     32768,   512, GemmPass::FWD},
+    {"DeepSeek3_Linear_attn_fwd",    7168, 16384, GemmPass::FWD},
+    {"DeepSeek3_LNMLP_gateup_fwd",  36864,  7168, GemmPass::FWD},
+    {"DeepSeek3_LNMLP_down_fwd",     7168, 18432, GemmPass::FWD},
+    {"DeepSeek3_SharedExp_gu_fwd",   4096,  7168, GemmPass::FWD},
+    {"DeepSeek3_SharedExp_dn_fwd",   7168,  2048, GemmPass::FWD},
+    {"DeepSeek3_TopKRouter_fwd",      256,  7168, GemmPass::FWD},
     // Dgrad (M=tokens, N, K)
-    {"LLM1_attn_dgrad",           16384,  7168, GemmPass::DGRAD},
-    {"LLM1_LNLinear1_dgrad",        512, 32768, GemmPass::DGRAD},
-    {"LLM1_LNLinear0_dgrad",       1536, 24576, GemmPass::DGRAD},
-    {"LLM1_SharedExp_dn_dgrad",    2048,  7168, GemmPass::DGRAD},
-    {"LLM1_SharedExp_gu_dgrad",    7168,  4096, GemmPass::DGRAD},
-    {"LLM1_TopKRouter_dgrad",      7168,   256, GemmPass::DGRAD},
-    {"LLM1_MLP_post_dgrad",        7168, 14336, GemmPass::DGRAD},
+    {"DeepSeek3_attn_dgrad",        16384,  7168, GemmPass::DGRAD},
+    {"DeepSeek3_LNLinear1_dgrad",     512, 32768, GemmPass::DGRAD},
+    {"DeepSeek3_LNLinear0_dgrad",    1536, 24576, GemmPass::DGRAD},
+    {"DeepSeek3_SharedExp_dn_dgrad", 2048,  7168, GemmPass::DGRAD},
+    {"DeepSeek3_SharedExp_gu_dgrad", 7168,  4096, GemmPass::DGRAD},
+    {"DeepSeek3_TopKRouter_dgrad",   7168,   256, GemmPass::DGRAD},
+    {"DeepSeek3_MLP_post_dgrad",     7168, 14336, GemmPass::DGRAD},
     // Wgrad (M, N, K=tokens)
-    {"LLM1_attn_wgrad",           16384,  7168, GemmPass::WGRAD},
-    {"LLM1_LNLinear1_wgrad",        512, 32768, GemmPass::WGRAD},
-    {"LLM1_LNLinear0_wgrad",       1536, 24576, GemmPass::WGRAD},
-    {"LLM1_SharedExp_dn_wgrad",    2048,  7168, GemmPass::WGRAD},
-    {"LLM1_SharedExp_gu_wgrad",    7168,  4096, GemmPass::WGRAD},
-    {"LLM1_TopKRouter_wgrad",      7168,   256, GemmPass::WGRAD},
+    {"DeepSeek3_attn_wgrad",        16384,  7168, GemmPass::WGRAD},
+    {"DeepSeek3_LNLinear1_wgrad",     512, 32768, GemmPass::WGRAD},
+    {"DeepSeek3_LNLinear0_wgrad",    1536, 24576, GemmPass::WGRAD},
+    {"DeepSeek3_SharedExp_dn_wgrad", 2048,  7168, GemmPass::WGRAD},
+    {"DeepSeek3_SharedExp_gu_wgrad", 7168,  4096, GemmPass::WGRAD},
+    {"DeepSeek3_TopKRouter_wgrad",   7168,   256, GemmPass::WGRAD},
 };
 
-// LLM1 LM Head (large N, memory-intensive)
-static const ShapeDef llm1_lm_head_shapes[] = {
-    {"LLM1_LMHead_fwd",          129280,  7168, GemmPass::FWD},
-    {"LLM1_LMHead_dgrad",          7168,129280, GemmPass::DGRAD},
-    {"LLM1_LMHead_wgrad",          7168,129280, GemmPass::WGRAD},
+// DeepSeek3 LM Head (large N, memory-intensive)
+static const ShapeDef deepseek3_lm_head_shapes[] = {
+    {"DeepSeek3_LMHead_fwd",     129280,   7168, GemmPass::FWD},
+    {"DeepSeek3_LMHead_dgrad",     7168, 129280, GemmPass::DGRAD},
+    {"DeepSeek3_LMHead_wgrad",     7168, 129280, GemmPass::WGRAD},
 };
 
-// LLM2 (hidden=4096, GQA, seq=4096)
+// Qwen3 (hidden=4096, GQA, seq=4096)
 
-static const ShapeDef llm2_shapes[] = {
+static const ShapeDef qwen3_shapes[] = {
     // Forward (M=tokens, N, K)
-    {"LLM2_LNLinear_QKV_fwd",   9216,  4096, GemmPass::FWD},
-    {"LLM2_Linear_attn_fwd",    4096,  8192, GemmPass::FWD},
-    {"LLM2_Router_fwd",          128,  4096, GemmPass::FWD},
+    {"Qwen3_LNLinear_QKV_fwd",  9216,  4096, GemmPass::FWD},
+    {"Qwen3_Linear_attn_fwd",   4096,  8192, GemmPass::FWD},
+    {"Qwen3_Router_fwd",         128,  4096, GemmPass::FWD},
     // Dgrad (M=tokens, N, K)
-    {"LLM2_Router_dgrad",       4096,   128, GemmPass::DGRAD},
-    {"LLM2_Linear_attn_dgrad",  8192,  4096, GemmPass::DGRAD},
-    {"LLM2_LNLinear_dgrad",     4096,  9216, GemmPass::DGRAD},
+    {"Qwen3_Router_dgrad",      4096,   128, GemmPass::DGRAD},
+    {"Qwen3_Linear_attn_dgrad", 8192,  4096, GemmPass::DGRAD},
+    {"Qwen3_LNLinear_dgrad",    4096,  9216, GemmPass::DGRAD},
     // Wgrad (M, N, K=tokens)
-    {"LLM2_Router_wgrad",       4096,   128, GemmPass::WGRAD},
-    {"LLM2_Linear_attn_wgrad",  8192,  4096, GemmPass::WGRAD},
-    {"LLM2_LNLinear_wgrad",     4096,  9216, GemmPass::WGRAD},
+    {"Qwen3_Router_wgrad",      4096,   128, GemmPass::WGRAD},
+    {"Qwen3_Linear_attn_wgrad", 8192,  4096, GemmPass::WGRAD},
+    {"Qwen3_LNLinear_wgrad",    4096,  9216, GemmPass::WGRAD},
 };
 
-// LLM2 LM Head (large N, memory-intensive)
-static const ShapeDef llm2_lm_head_shapes[] = {
-    {"LLM2_LMHead_fwd",        151936,  4096, GemmPass::FWD},
-    {"LLM2_LMHead_dgrad",        4096,151936, GemmPass::DGRAD},
-    {"LLM2_LMHead_wgrad",        4096,151936, GemmPass::WGRAD},
+// Qwen3 LM Head (large N, memory-intensive)
+static const ShapeDef qwen3_lm_head_shapes[] = {
+    {"Qwen3_LMHead_fwd",       151936,   4096, GemmPass::FWD},
+    {"Qwen3_LMHead_dgrad",       4096, 151936, GemmPass::DGRAD},
+    {"Qwen3_LMHead_wgrad",       4096, 151936, GemmPass::WGRAD},
 };
 
 // ============================================================================
@@ -177,16 +176,16 @@ static std::vector<ProdGemmTestCase> expand_shapes(const ShapeDef* defs, size_t
 }
 
 static std::vector<ProdGemmTestCase> generate_model_test_cases() {
-    auto v1   = expand_shapes(llm1_shapes,   std::size(llm1_shapes));
-    auto v2   = expand_shapes(llm2_shapes,   std::size(llm2_shapes));
+    auto v1   = expand_shapes(deepseek3_shapes,   std::size(deepseek3_shapes));
+    auto v2   = expand_shapes(qwen3_shapes,       std::size(qwen3_shapes));
     v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
                         std::make_move_iterator(v2.end()));
     return v1;
 }
 
 static std::vector<ProdGemmTestCase> generate_lm_head_test_cases() {
-    auto v1   = expand_shapes(llm1_lm_head_shapes,   std::size(llm1_lm_head_shapes));
-    auto v2   = expand_shapes(llm2_lm_head_shapes,   std::size(llm2_lm_head_shapes));
+    auto v1   = expand_shapes(deepseek3_lm_head_shapes, std::size(deepseek3_lm_head_shapes));
+    auto v2   = expand_shapes(qwen3_lm_head_shapes,     std::size(qwen3_lm_head_shapes));
     v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
                         std::make_move_iterator(v2.end()));
     return v1;

From 8eaf06d4516a553b2c1006758b956dc7ab0b88f7 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Thu, 28 May 2026 14:03:52 -0500
Subject: [PATCH 3/7] restructure based on review comments

---
 tests/cpp/operator/test_gemm_prodgemm.cu | 174 ++++++++---------------
 tests/cpp/test_common.cu                 |  43 ++++++
 tests/cpp/test_common.h                  |   4 +
 3 files changed, 107 insertions(+), 114 deletions(-)

diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
index 863ec8bb1..db29b4030 100644
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -28,7 +28,6 @@
 #include <gtest/gtest.h>
 #include <transformer_engine/cast.h>
 #include <transformer_engine/gemm.h>
-#include <transformer_engine/swizzle.h>
 #include <transformer_engine/transformer_engine.h>
 #include "../test_common.h"
 
@@ -133,104 +132,24 @@ static const ShapeDef qwen3_lm_head_shapes[] = {
     {"Qwen3_LMHead_wgrad",       4096, 151936, GemmPass::WGRAD},
 };
 
-// ============================================================================
-// Test case: a concrete (M, K, N) shape with pass info, ready for execution
-// ============================================================================
-
-struct ProdGemmTestCase {
-    std::string label;
-    size_t m, k, n;
-    GemmPass pass;
-};
-
-std::ostream& operator<<(std::ostream& os, const ProdGemmTestCase& tc) {
-    return os << tc.label;
-}
+// ====================================================
+// Test case: a concrete (M, K, N) shape with pass info
+// ====================================================
 
-static std::vector<ProdGemmTestCase> expand_shapes(const ShapeDef* defs, size_t count) {
-    std::vector<ProdGemmTestCase> cases;
-    for (size_t i = 0; i < count; ++i) {
-        const auto& s = defs[i];
-        for (size_t mbs : {1, 2, 4}) {
-            size_t tokens = mbs * 4096;
-            ProdGemmTestCase tc;
-            tc.label = std::string(s.label) + "_mbs" + std::to_string(mbs);
-            tc.pass = s.pass;
-            switch (s.pass) {
-                case GemmPass::FWD:
-                case GemmPass::DGRAD:
-                    tc.m = tokens;
-                    tc.n = s.dim1;
-                    tc.k = s.dim2;
-                    break;
-                case GemmPass::WGRAD:
-                    tc.m = s.dim1;
-                    tc.n = s.dim2;
-                    tc.k = tokens;
-                    break;
-            }
-            cases.push_back(std::move(tc));
-        }
-    }
-    return cases;
-}
-
-static std::vector<ProdGemmTestCase> generate_model_test_cases() {
-    auto v1   = expand_shapes(deepseek3_shapes,   std::size(deepseek3_shapes));
-    auto v2   = expand_shapes(qwen3_shapes,       std::size(qwen3_shapes));
-    v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
-                        std::make_move_iterator(v2.end()));
-    return v1;
-}
-
-static std::vector<ProdGemmTestCase> generate_lm_head_test_cases() {
-    auto v1   = expand_shapes(deepseek3_lm_head_shapes, std::size(deepseek3_lm_head_shapes));
-    auto v2   = expand_shapes(qwen3_lm_head_shapes,     std::size(qwen3_lm_head_shapes));
-    v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
-                        std::make_move_iterator(v2.end()));
-    return v1;
+std::ostream& operator<<(std::ostream& os, const ShapeDef& s) {
+    return os << s.label;
 }
 
-// ============================================================================
-// Swizzle helper for gfx1250 MXFP8 scales (same as test_cublaslt_gemm.cu)
-// ============================================================================
-
-static void swizzle_mxfp8_scales(test::Tensor& t, bool rowwise) {
-    void* scale_ptr = rowwise ? t.rowwise_scale_inv_dptr()
-                              : t.columnwise_scale_inv_dptr();
-    if (!scale_ptr) return;
-
-    const NVTEShape scale_shape = rowwise ? t.rowwise_scale_inv_shape()
-                                          : t.columnwise_scale_inv_shape();
-    const NVTEShape data_shape  = rowwise ? t.rowwise_shape()
-                                          : t.columnwise_shape();
-
-    size_t num_scales = 1;
-    for (size_t d = 0; d < scale_shape.ndim; d++) num_scales *= scale_shape.data[d];
-
-    uint8_t* d_tmp = nullptr;
-    NVTE_CHECK_CUDA(cudaMalloc(&d_tmp, num_scales));
-
-    TensorWrapper input_tw(NVTE_MXFP8_1D_SCALING);
-    TensorWrapper output_tw(NVTE_MXFP8_1D_SCALING);
-    output_tw.set_with_gemm_swizzled_scales(true);
-
-    if (rowwise) {
-        input_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
-        input_tw.set_rowwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
-        output_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
-        output_tw.set_rowwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
-    } else {
-        input_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
-        input_tw.set_columnwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
-        output_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
-        output_tw.set_columnwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+static void resolve_mkn(const ShapeDef& s, size_t mbs,
+                         size_t& m, size_t& k, size_t& n) {
+    size_t tokens = mbs * 4096;
+    switch (s.pass) {
+        case GemmPass::FWD:
+        case GemmPass::DGRAD:
+            m = tokens; n = s.dim1; k = s.dim2; break;
+        case GemmPass::WGRAD:
+            m = s.dim1; n = s.dim2; k = tokens; break;
     }
-
-    nvte_swizzle_scaling_factors(input_tw.data(), output_tw.data(), 0);
-    NVTE_CHECK_CUDA(cudaDeviceSynchronize());
-    NVTE_CHECK_CUDA(cudaMemcpy(scale_ptr, d_tmp, num_scales, cudaMemcpyDeviceToDevice));
-    NVTE_CHECK_CUDA(cudaFree(d_tmp));
 }
 
 // ============================================================================
@@ -342,25 +261,29 @@ void performMxfp8DqTest(size_t m, size_t k, size_t n, bool transa, bool transb)
 // Test suite
 // ============================================================================
 
-using ProdGemmParam = std::tuple<ProdGemmTestCase, Layout>;
+using ProdGemmParam = std::tuple<ShapeDef, size_t, Layout>;
 
 class ProdGemmTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
 
 TEST_P(ProdGemmTestSuite, TestMxfp8Dq) {
-    const auto& tc = std::get<0>(GetParam());
-    const auto& layout = std::get<1>(GetParam());
+    const auto& shape = std::get<0>(GetParam());
+    size_t mbs = std::get<1>(GetParam());
+    const auto& layout = std::get<2>(GetParam());
     bool transa = layout.first;
     bool transb = layout.second;
 
-    switch (tc.pass) {
+    size_t m, k, n;
+    resolve_mkn(shape, mbs, m, k, n);
+
+    switch (shape.pass) {
         case GemmPass::FWD:
-            performMxfp8DqTest<fp8, fp8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            performMxfp8DqTest<fp8, fp8, bf16>(m, k, n, transa, transb);
             break;
         case GemmPass::DGRAD:
-            performMxfp8DqTest<bf8, fp8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            performMxfp8DqTest<bf8, fp8, bf16>(m, k, n, transa, transb);
             break;
         case GemmPass::WGRAD:
-            performMxfp8DqTest<fp8, bf8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            performMxfp8DqTest<fp8, bf8, bf16>(m, k, n, transa, transb);
             break;
     }
 }
@@ -370,25 +293,48 @@ static inline std::string TN(const Layout& layout) {
     return map[layout.first][layout.second];
 }
 
-// Regular model shapes (excluding LM Head)
+static inline auto testName(const testing::TestParamInfo<ProdGemmParam>& info) {
+    const auto& shape = std::get<0>(info.param);
+    size_t mbs = std::get<1>(info.param);
+    const auto& layout = std::get<2>(info.param);
+    return std::string(shape.label) + "_mbs" + std::to_string(mbs) + "_" + TN(layout);
+}
+
+// DeepSeek3 model shapes
+INSTANTIATE_TEST_SUITE_P(
+    ProdGemmDeepSeek3, ProdGemmTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(deepseek3_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
+        ::testing::ValuesIn(kLayouts)),
+    testName);
+
+// Qwen3 model shapes
+INSTANTIATE_TEST_SUITE_P(
+    ProdGemmQwen3, ProdGemmTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(qwen3_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
+        ::testing::ValuesIn(kLayouts)),
+    testName);
+
+// DeepSeek3 LM Head shapes (very large N, memory-intensive)
 INSTANTIATE_TEST_SUITE_P(
-    ProdGemmModel, ProdGemmTestSuite,
+    ProdGemmDeepSeek3LMHead, ProdGemmTestSuite,
     ::testing::Combine(
-        ::testing::ValuesIn(generate_model_test_cases()),
+        ::testing::ValuesIn(deepseek3_lm_head_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
         ::testing::ValuesIn(kLayouts)),
-    [](const testing::TestParamInfo<ProdGemmParam>& info) {
-        return std::get<0>(info.param).label + "_" + TN(std::get<1>(info.param));
-    });
+    testName);
 
-// LM Head shapes (very large N, memory-intensive)
+// Qwen3 LM Head shapes (very large N, memory-intensive)
 INSTANTIATE_TEST_SUITE_P(
-    ProdGemmLMHead, ProdGemmTestSuite,
+    ProdGemmQwen3LMHead, ProdGemmTestSuite,
     ::testing::Combine(
-        ::testing::ValuesIn(generate_lm_head_test_cases()),
+        ::testing::ValuesIn(qwen3_lm_head_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
         ::testing::ValuesIn(kLayouts)),
-    [](const testing::TestParamInfo<ProdGemmParam>& info) {
-        return std::get<0>(info.param).label + "_" + TN(std::get<1>(info.param));
-    });
+    testName);
 
 }  // namespace
 
diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index fbcfdf89d..392e641d5 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -22,6 +22,9 @@
 #endif
 #include <omp.h>
 
+#ifdef __HIP_PLATFORM_AMD__
+#include <transformer_engine/swizzle.h>
+#endif
 #include <transformer_engine/transformer_engine.h>
 #include "util/logging.h"
 
@@ -1314,4 +1317,44 @@ std::array<size_t, 4> get_scale_tensor_dims(const size_t rows,
     return {unpadded_blocks_Y, unpadded_blocks_X, blocks_Y, blocks_X};
 }
 
+#ifdef __HIP_PLATFORM_AMD__
+void swizzle_mxfp8_scales(Tensor& t, bool rowwise) {
+    void* scale_ptr = rowwise ? t.rowwise_scale_inv_dptr()
+                              : t.columnwise_scale_inv_dptr();
+    if (!scale_ptr) return;
+
+    const NVTEShape scale_shape = rowwise ? t.rowwise_scale_inv_shape()
+                                          : t.columnwise_scale_inv_shape();
+    const NVTEShape data_shape  = rowwise ? t.rowwise_shape()
+                                          : t.columnwise_shape();
+
+    size_t num_scales = 1;
+    for (size_t d = 0; d < scale_shape.ndim; d++) num_scales *= scale_shape.data[d];
+
+    uint8_t* d_tmp = nullptr;
+    NVTE_CHECK_CUDA(cudaMalloc(&d_tmp, num_scales));
+
+    TensorWrapper input_tw(NVTE_MXFP8_1D_SCALING);
+    TensorWrapper output_tw(NVTE_MXFP8_1D_SCALING);
+    output_tw.set_with_gemm_swizzled_scales(true);
+
+    if (rowwise) {
+        input_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
+        input_tw.set_rowwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
+        output_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
+        output_tw.set_rowwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+    } else {
+        input_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
+        input_tw.set_columnwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
+        output_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
+        output_tw.set_columnwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+    }
+
+    nvte_swizzle_scaling_factors(input_tw.data(), output_tw.data(), 0);
+    NVTE_CHECK_CUDA(cudaDeviceSynchronize());
+    NVTE_CHECK_CUDA(cudaMemcpy(scale_ptr, d_tmp, num_scales, cudaMemcpyDeviceToDevice));
+    NVTE_CHECK_CUDA(cudaFree(d_tmp));
+}
+#endif // #ifdef __HIP_PLATFORM_AMD__
+
 }  // namespace test
diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index a25b7b61e..6c37ccc57 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -581,6 +581,10 @@ int32_t getDeviceComputeCapability();
 constexpr int32_t hopperComputeCapability = 90;
 constexpr int32_t blackwellComputeCapability = 100;
 
+#ifdef USE_ROCM
+void swizzle_mxfp8_scales(Tensor& t, bool rowwise);
+#endif
+
 }  // namespace test
 
 #if FP4_TYPE_SUPPORTED

From 77f1c4535f341a9410e313a1829b242457e71409 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Thu, 28 May 2026 14:15:43 -0500
Subject: [PATCH 4/7] clarify switch

---
 tests/cpp/operator/test_gemm_prodgemm.cu | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
index db29b4030..97410d182 100644
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -144,11 +144,17 @@ static void resolve_mkn(const ShapeDef& s, size_t mbs,
                          size_t& m, size_t& k, size_t& n) {
     size_t tokens = mbs * 4096;
     switch (s.pass) {
-        case GemmPass::FWD:
+        case GemmPass::FWD: // Fallthrough, same as DGRAD
         case GemmPass::DGRAD:
-            m = tokens; n = s.dim1; k = s.dim2; break;
+            m = tokens;
+            n = s.dim1;
+            k = s.dim2;
+            break;
         case GemmPass::WGRAD:
-            m = s.dim1; n = s.dim2; k = tokens; break;
+            m = s.dim1;
+            n = s.dim2;
+            k = tokens;
+            break;
     }
 }
 

From 76c8d9894d1039428e2dfa95a96ccfbcc124beb5 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Thu, 28 May 2026 17:14:36 -0500
Subject: [PATCH 5/7] skip known-bad tests

---
 tests/cpp/operator/test_gemm_prodgemm.cu | 49 ++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
index 97410d182..02695c387 100644
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -21,6 +21,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <iostream>
+#include <set>
 #include <string>
 #include <vector>
 #include <cuda_bf16.h>
@@ -271,6 +272,47 @@ using ProdGemmParam = std::tuple<ShapeDef, size_t, Layout>;
 
 class ProdGemmTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
 
+// Known-failing GEMM shapes on gfx950
+static const std::set<std::string> kMI355XSkips = {
+    // N=576 + NT: rocroller LDS stride mismatch
+    "DeepSeek3_Linear1_fwd_mbs1_NT",
+    "DeepSeek3_Linear1_fwd_mbs2_NT",
+    "DeepSeek3_Linear1_fwd_mbs4_NT",
+    // Sporadic kernel failures
+    "DeepSeek3_LNLinear0_fwd_mbs2_NT",
+    "DeepSeek3_Linear_attn_fwd_mbs4_NT",
+    "DeepSeek3_LNMLP_gateup_fwd_mbs4_NN",
+    "DeepSeek3_LNMLP_down_fwd_mbs2_NN",
+    "DeepSeek3_attn_wgrad_mbs2_NN",
+    "DeepSeek3_LNLinear0_dgrad_mbs2_NN",
+    "DeepSeek3_LNLinear0_wgrad_mbs4_NN",
+    "DeepSeek3_SharedExp_dn_wgrad_mbs4_NT",
+    // K=128 (minimum for MXFP8): unreliable across layouts
+    "Qwen3_Router_fwd_mbs1_NN",
+    "Qwen3_Router_dgrad_mbs1_NN",
+    "Qwen3_Router_dgrad_mbs1_NT",
+    "Qwen3_Router_dgrad_mbs2_NT",
+    "Qwen3_Router_dgrad_mbs4_TN",
+    "Qwen3_Router_dgrad_mbs4_NT",
+    // Other failures
+    "Qwen3_Linear_attn_wgrad_mbs2_NT",
+    "DeepSeek3_LMHead_fwd_mbs1_NT",
+    "DeepSeek3_LMHead_fwd_mbs4_NN",
+    // Qwen3 LM Head dgrad (N=151936): nearly all combos fail
+    "Qwen3_LMHead_dgrad_mbs1_NN",
+    "Qwen3_LMHead_dgrad_mbs1_NT",
+    "Qwen3_LMHead_dgrad_mbs2_TN",
+    "Qwen3_LMHead_dgrad_mbs2_NN",
+    "Qwen3_LMHead_dgrad_mbs2_NT",
+    "Qwen3_LMHead_dgrad_mbs4_TN",
+    "Qwen3_LMHead_dgrad_mbs4_NN",
+    "Qwen3_LMHead_dgrad_mbs4_NT",
+    // Crash (likely OOM / kernel fault)
+    "Qwen3_LMHead_fwd_mbs4_TN",
+    "Qwen3_LMHead_fwd_mbs4_NN",
+    "Qwen3_LMHead_fwd_mbs4_NT",
+};
+
 TEST_P(ProdGemmTestSuite, TestMxfp8Dq) {
     const auto& shape = std::get<0>(GetParam());
     size_t mbs = std::get<1>(GetParam());
@@ -278,6 +320,13 @@ TEST_P(ProdGemmTestSuite, TestMxfp8Dq) {
     bool transa = layout.first;
     bool transb = layout.second;
 
+    static const char* tn_map[2][2] = {{"NN", "NT"}, {"TN", "TT"}};
+    std::string name = std::string(shape.label) + "_mbs" + std::to_string(mbs)
+                       + "_" + tn_map[transa][transb];
+    if (kMI355XSkips.count(name)) {
+        GTEST_SKIP() << "Known MI355X hipBLASLt failure: " << name;
+    }
+
     size_t m, k, n;
     resolve_mkn(shape, mbs, m, k, n);
 

From db3123fb2a111fea7f09a7eba8e299d403ad4a9a Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Thu, 28 May 2026 17:47:35 -0500
Subject: [PATCH 6/7] loosen tolerances a bit

---
 tests/cpp/operator/test_gemm_prodgemm.cu | 37 ++++--------------------
 1 file changed, 5 insertions(+), 32 deletions(-)

diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
index 02695c387..b25d67f35 100644
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -258,9 +258,11 @@ void performMxfp8DqTest(size_t m, size_t k, size_t n, bool transa, bool transb)
     ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
 
     // 7. Compare results
+    // The MXFP8 GEMM and bf16 reference GEMM use different internal accumulation
+    // paths, so results can differ by up to 1 ULP in bf16 (~1.5-2% relative).
     auto [atol, rtol] = getTolerances(dtype);
-    atol = std::max(atol, 5e-4);
-    rtol = std::max(rtol, 1e-3);
+    atol = std::max(atol, 1e-3);
+    rtol = std::max(rtol, 2e-2);
     compareResults("D", D, D_ref.rowwise_cpu_dptr<D_Type>(), true, atol, rtol);
 }
 
@@ -274,39 +276,10 @@ class ProdGemmTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
 
 // Known-failing GEMM shapes on gfx950
 static const std::set<std::string> kMI355XSkips = {
-    // N=576 + NT: rocroller LDS stride mismatch
+    // N=576 + NT: rocroller LDS stride mismatch (all elements wrong, ~100x off)
     "DeepSeek3_Linear1_fwd_mbs1_NT",
     "DeepSeek3_Linear1_fwd_mbs2_NT",
     "DeepSeek3_Linear1_fwd_mbs4_NT",
-    // Sporadic kernel failures
-    "DeepSeek3_LNLinear0_fwd_mbs2_NT",
-    "DeepSeek3_Linear_attn_fwd_mbs4_NT",
-    "DeepSeek3_LNMLP_gateup_fwd_mbs4_NN",
-    "DeepSeek3_LNMLP_down_fwd_mbs2_NN",
-    "DeepSeek3_attn_wgrad_mbs2_NN",
-    "DeepSeek3_LNLinear0_dgrad_mbs2_NN",
-    "DeepSeek3_LNLinear0_wgrad_mbs4_NN",
-    "DeepSeek3_SharedExp_dn_wgrad_mbs4_NT",
-    // K=128 (minimum for MXFP8): unreliable across layouts
-    "Qwen3_Router_fwd_mbs1_NN",
-    "Qwen3_Router_dgrad_mbs1_NN",
-    "Qwen3_Router_dgrad_mbs1_NT",
-    "Qwen3_Router_dgrad_mbs2_NT",
-    "Qwen3_Router_dgrad_mbs4_TN",
-    "Qwen3_Router_dgrad_mbs4_NT",
-    // Other failures
-    "Qwen3_Linear_attn_wgrad_mbs2_NT",
-    "DeepSeek3_LMHead_fwd_mbs1_NT",
-    "DeepSeek3_LMHead_fwd_mbs4_NN",
-    // Qwen3 LM Head dgrad (N=151936): nearly all combos fail
-    "Qwen3_LMHead_dgrad_mbs1_NN",
-    "Qwen3_LMHead_dgrad_mbs1_NT",
-    "Qwen3_LMHead_dgrad_mbs2_TN",
-    "Qwen3_LMHead_dgrad_mbs2_NN",
-    "Qwen3_LMHead_dgrad_mbs2_NT",
-    "Qwen3_LMHead_dgrad_mbs4_TN",
-    "Qwen3_LMHead_dgrad_mbs4_NN",
-    "Qwen3_LMHead_dgrad_mbs4_NT",
     // Crash (likely OOM / kernel fault)
     "Qwen3_LMHead_fwd_mbs4_TN",
     "Qwen3_LMHead_fwd_mbs4_NN",

From c6cc59f810dfca622de2dd5bd1425b11969be4d2 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Mon, 1 Jun 2026 10:56:20 -0500
Subject: [PATCH 7/7] restructure tests into test_cublaslt_gemm

---
 tests/cpp/operator/CMakeLists.txt        |   1 -
 tests/cpp/operator/test_cublaslt_gemm.cu | 185 +++++++++++-
 tests/cpp/operator/test_gemm_prodgemm.cu | 369 -----------------------
 tests/cpp/test_common.cu                 |  43 ---
 tests/cpp/test_common.h                  |   4 -
 5 files changed, 184 insertions(+), 418 deletions(-)
 delete mode 100644 tests/cpp/operator/test_gemm_prodgemm.cu

diff --git a/tests/cpp/operator/CMakeLists.txt b/tests/cpp/operator/CMakeLists.txt
index 0eded7219..0ebd7fdfe 100644
--- a/tests/cpp/operator/CMakeLists.txt
+++ b/tests/cpp/operator/CMakeLists.txt
@@ -39,7 +39,6 @@ if(USE_CUDA)
 else()
   list(APPEND test_cuda_sources
               test_cublaslt_gemm.cu
-              test_gemm_prodgemm.cu
               test_cast_mxfp4_transpose.cu)
 endif()
 
diff --git a/tests/cpp/operator/test_cublaslt_gemm.cu b/tests/cpp/operator/test_cublaslt_gemm.cu
index 85f183bf7..669238baf 100644
--- a/tests/cpp/operator/test_cublaslt_gemm.cu
+++ b/tests/cpp/operator/test_cublaslt_gemm.cu
@@ -5,6 +5,8 @@
  ************************************************************************/
 #include <cmath>
 #include <iostream>
+#include <optional>
+#include <set>
 #include <string>
 #include <cuda_bf16.h>
 #include <cuda_runtime.h>
@@ -33,6 +35,98 @@ std::vector<std::tuple<size_t, size_t, size_t>> test_case_sizes_mxfp8 = {
   {768, 3072, 4096},
 };
 
+// ============================================================================
+// Production LLM shapes for MXFP8 GEMM testing.
+//
+// Each shape is tested with 3 micro-batch sizes (MBS = 1, 2, 4)
+// yielding tokens = 4096, 8192, 16384, and 3 layouts (TN, NN, NT)
+// via ::testing::Combine.
+//
+// GemmPass selects the FP8 type combination:
+//   FWD:   E4M3 x E4M3 -> BF16
+//   DGRAD: E5M2 x E4M3 -> BF16
+//   WGRAD: E4M3 x E5M2 -> BF16
+// ============================================================================
+
+enum class GemmPass { FWD, DGRAD, WGRAD };
+
+struct ShapeDef {
+  const char* label;
+  size_t dim1;   // FWD/DGRAD: N, WGRAD: M
+  size_t dim2;   // FWD/DGRAD: K, WGRAD: N
+  GemmPass pass;
+};
+
+std::ostream& operator<<(std::ostream& os, const ShapeDef& s) {
+  return os << s.label;
+}
+
+static void resolve_mkn(const ShapeDef& s, size_t mbs,
+                         size_t& m, size_t& k, size_t& n) {
+  size_t tokens = mbs * 4096;
+  switch (s.pass) {
+    case GemmPass::FWD:
+    case GemmPass::DGRAD:
+      m = tokens; n = s.dim1; k = s.dim2;
+      break;
+    case GemmPass::WGRAD:
+      m = s.dim1; n = s.dim2; k = tokens;
+      break;
+  }
+}
+
+// DeepSeek3 (hidden=7168, MLA, seq=4096, incl. LM Head)
+static const ShapeDef deepseek3_shapes[] = {
+    // Forward (M=tokens, N, K)
+    {"DeepSeek3_Linear0_fwd",        1536,  7168, GemmPass::FWD},
+    {"DeepSeek3_Linear1_fwd",         576,  7168, GemmPass::FWD},
+    {"DeepSeek3_LNLinear0_fwd",     24576,  1536, GemmPass::FWD},
+    {"DeepSeek3_LNLinear1_fwd",     32768,   512, GemmPass::FWD},
+    {"DeepSeek3_Linear_attn_fwd",    7168, 16384, GemmPass::FWD},
+    {"DeepSeek3_LNMLP_gateup_fwd",  36864,  7168, GemmPass::FWD},
+    {"DeepSeek3_LNMLP_down_fwd",     7168, 18432, GemmPass::FWD},
+    {"DeepSeek3_SharedExp_gu_fwd",   4096,  7168, GemmPass::FWD},
+    {"DeepSeek3_SharedExp_dn_fwd",   7168,  2048, GemmPass::FWD},
+    {"DeepSeek3_TopKRouter_fwd",      256,  7168, GemmPass::FWD},
+    {"DeepSeek3_LMHead_fwd",       129280,  7168, GemmPass::FWD},
+    // Dgrad (M=tokens, N, K)
+    {"DeepSeek3_attn_dgrad",        16384,   7168, GemmPass::DGRAD},
+    {"DeepSeek3_LNLinear1_dgrad",     512,  32768, GemmPass::DGRAD},
+    {"DeepSeek3_LNLinear0_dgrad",    1536,  24576, GemmPass::DGRAD},
+    {"DeepSeek3_SharedExp_dn_dgrad", 2048,   7168, GemmPass::DGRAD},
+    {"DeepSeek3_SharedExp_gu_dgrad", 7168,   4096, GemmPass::DGRAD},
+    {"DeepSeek3_TopKRouter_dgrad",   7168,    256, GemmPass::DGRAD},
+    {"DeepSeek3_MLP_post_dgrad",     7168,  14336, GemmPass::DGRAD},
+    {"DeepSeek3_LMHead_dgrad",       7168, 129280, GemmPass::DGRAD},
+    // Wgrad (M, N, K=tokens)
+    {"DeepSeek3_attn_wgrad",        16384,   7168, GemmPass::WGRAD},
+    {"DeepSeek3_LNLinear1_wgrad",     512,  32768, GemmPass::WGRAD},
+    {"DeepSeek3_LNLinear0_wgrad",    1536,  24576, GemmPass::WGRAD},
+    {"DeepSeek3_SharedExp_dn_wgrad", 2048,   7168, GemmPass::WGRAD},
+    {"DeepSeek3_SharedExp_gu_wgrad", 7168,   4096, GemmPass::WGRAD},
+    {"DeepSeek3_TopKRouter_wgrad",   7168,    256, GemmPass::WGRAD},
+    {"DeepSeek3_LMHead_wgrad",       7168, 129280, GemmPass::WGRAD},
+};
+
+// Qwen3 (hidden=4096, GQA, seq=4096, incl. LM Head)
+static const ShapeDef qwen3_shapes[] = {
+    // Forward (M=tokens, N, K)
+    {"Qwen3_LNLinear_QKV_fwd",  9216,   4096, GemmPass::FWD},
+    {"Qwen3_Linear_attn_fwd",   4096,   8192, GemmPass::FWD},
+    {"Qwen3_Router_fwd",         128,   4096, GemmPass::FWD},
+    {"Qwen3_LMHead_fwd",      151936,   4096, GemmPass::FWD},
+    // Dgrad (M=tokens, N, K)
+    {"Qwen3_Router_dgrad",      4096,    128, GemmPass::DGRAD},
+    {"Qwen3_Linear_attn_dgrad", 8192,   4096, GemmPass::DGRAD},
+    {"Qwen3_LNLinear_dgrad",    4096,   9216, GemmPass::DGRAD},
+    {"Qwen3_LMHead_dgrad",      4096, 151936, GemmPass::DGRAD},
+    // Wgrad (M, N, K=tokens)
+    {"Qwen3_Router_wgrad",      4096,    128, GemmPass::WGRAD},
+    {"Qwen3_Linear_attn_wgrad", 8192,   4096, GemmPass::WGRAD},
+    {"Qwen3_LNLinear_wgrad",    4096,   9216, GemmPass::WGRAD},
+    {"Qwen3_LMHead_wgrad",      4096, 151936, GemmPass::WGRAD},
+};
+
 //  A, B, Bias, Gelu, D
 //  Bias type choose as bf16 in use_fp8, D_type otherwise
 //  Gelu type the same as Bias_Type
@@ -559,7 +653,9 @@ void performTest(const TestParams& params) {
 
 #ifdef __HIP_PLATFORM_AMD__
 template <typename A_Type, typename B_Type, typename D_Type>
-void performDqTest(const TestParams &params) {
+void performDqTest(const TestParams &params,
+                   std::optional<double> atol_override = std::nullopt,
+                   std::optional<double> rtol_override = std::nullopt) {
   DType atype = TypeInfo<A_Type>::dtype;
   DType btype = TypeInfo<B_Type>::dtype;
   DType dtype = TypeInfo<D_Type>::dtype;
@@ -633,6 +729,10 @@ void performDqTest(const TestParams &params) {
 
   //compare results
   auto [atol, rtol] = getTestTolerances(dtype, true, true);
+  if (atol_override)
+    atol = *atol_override;
+  if (rtol_override)
+    rtol = *rtol_override;
   compareResults("D", D, D_ref.rowwise_cpu_dptr<D_Type>(), true, atol, rtol);
 }
 #endif // __HIP_PLATFORM_AMD__
@@ -751,6 +851,89 @@ INSTANTIATE_TEST_SUITE_P(OperatorTest, DqGEMMTestSuite,
                            return MKN(std::get<0>(info.param)) + "x" + TN(std::get<3>(info.param));
                          });
 
+// ============================================================================
+// Production GEMM shape instantiations (run with --gtest_filter='ProdGemm*')
+// ============================================================================
+
+// Known-failing GEMM shapes on gfx950
+static const std::set<std::string> kGfx950Skips = {
+    "DeepSeek3_Linear1_fwd_mbs1_NT",
+    "DeepSeek3_Linear1_fwd_mbs2_NT",
+    "DeepSeek3_Linear1_fwd_mbs4_NT",
+    "DeepSeek3_LNLinear0_fwd_mbs4_NN",
+    "DeepSeek3_LNLinear0_fwd_mbs4_NT",
+    "DeepSeek3_attn_wgrad_mbs1_NN",
+    "Qwen3_LMHead_fwd_mbs2_NN",
+    "Qwen3_Router_fwd_mbs2_NT",
+    "Qwen3_LMHead_fwd_mbs4_TN",
+    "Qwen3_LMHead_fwd_mbs4_NN",
+    "Qwen3_LMHead_fwd_mbs4_NT",
+};
+
+// Production GEMM test suite using ShapeDef x MBS x Layout via testing::Combine.
+using ProdGemmParam = std::tuple<ShapeDef, size_t, Layout>;
+
+class ProdDqGEMMTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
+
+TEST_P(ProdDqGEMMTestSuite, TestMxfp8Dq) {
+  const auto& shape = std::get<0>(GetParam());
+  size_t mbs = std::get<1>(GetParam());
+  const auto& layout = std::get<2>(GetParam());
+
+  std::string name = std::string(shape.label) + "_mbs" + std::to_string(mbs)
+                     + "_" + TN(layout);
+  if (kGfx950Skips.count(name)) {
+    GTEST_SKIP() << "Known gfx950 hipBLASLt failure: " << name;
+  }
+
+  size_t m, k, n;
+  resolve_mkn(shape, mbs, m, k, n);
+
+  TestParams params = {.m = m, .k = k, .n = n,
+                       .use_bias = false, .use_gelu = false,
+                       .transa = layout.first, .transb = layout.second,
+                       .scaling_mode = NVTEScalingMode::NVTE_MXFP8_1D_SCALING};
+
+  // Production shapes use looser tolerances: the MXFP8 and bf16 reference
+  // GEMM use different internal accumulation paths, so results can differ
+  // by up to 1 ULP in bf16 (~1.5-2% relative).
+  const double prod_atol = 1e-3;
+  const double prod_rtol = 2e-2;
+
+  switch (shape.pass) {
+    case GemmPass::FWD:
+      performDqTest<fp8, fp8, bf16>(params, prod_atol, prod_rtol);
+      break;
+    case GemmPass::DGRAD:
+      performDqTest<bf8, fp8, bf16>(params, prod_atol, prod_rtol);
+      break;
+    case GemmPass::WGRAD:
+      performDqTest<fp8, bf8, bf16>(params, prod_atol, prod_rtol);
+      break;
+  }
+}
+
+static auto prodTestName = [](const testing::TestParamInfo<ProdGemmParam>& info) {
+  const auto& shape = std::get<0>(info.param);
+  size_t mbs = std::get<1>(info.param);
+  const auto& layout = std::get<2>(info.param);
+  return std::string(shape.label) + "_mbs" + std::to_string(mbs) + "_" + TN(layout);
+};
+
+INSTANTIATE_TEST_SUITE_P(ProdGemmDeepSeek3, ProdDqGEMMTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(deepseek3_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
+        ::testing::ValuesIn(kLayouts)),
+    prodTestName);
+
+INSTANTIATE_TEST_SUITE_P(ProdGemmQwen3, ProdDqGEMMTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(qwen3_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
+        ::testing::ValuesIn(kLayouts)),
+    prodTestName);
+
 TEST(InputGenTest, FillUniform_DoesNotGetOverwrittenByFromCpu) {
   const size_t rows = 128;
   const size_t cols = 256;
diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
deleted file mode 100644
index b25d67f35..000000000
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ /dev/null
@@ -1,369 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
- *
- * License for AMD contributions = MIT. See LICENSE for more information
- ************************************************************************/
-
-/*
- * MXFP8 GEMM correctness tests for production LLM shapes.
- *
- * Tests forward, dgrad, and wgrad passes with appropriate FP8 type combos:
- *   Forward:  E4M3 x E4M3 -> BF16
- *   Dgrad:    E5M2 x E4M3 -> BF16
- *   Wgrad:    E4M3 x E5M2 -> BF16
- *
- * Each shape is tested with 3 transpose configs (TN, NN, NT) and
- * 3 micro-batch sizes (MBS = 1, 2, 4 -> tokens = 4096, 8192, 16384).
- */
-
-#ifdef __HIP_PLATFORM_AMD__
-
-#include <cmath>
-#include <cstdlib>
-#include <iostream>
-#include <set>
-#include <string>
-#include <vector>
-#include <cuda_bf16.h>
-#include <cuda_runtime.h>
-#include <gtest/gtest.h>
-#include <transformer_engine/cast.h>
-#include <transformer_engine/gemm.h>
-#include <transformer_engine/transformer_engine.h>
-#include "../test_common.h"
-
-using namespace transformer_engine;
-using namespace test;
-
-namespace {
-
-using fp8  = fp8e4m3;
-using bf8  = fp8e5m2;
-
-using TShape = std::vector<size_t>;
-using Layout = std::pair<bool, bool>;  // {transa, transb}
-
-static const Layout kTN{true, false};
-static const Layout kNN{false, false};
-static const Layout kNT{false, true};
-static const std::vector<Layout> kLayouts = {kTN, kNN, kNT};
-
-// ============================================================================
-// GemmPass: determines A/B FP8 type combination
-//   FWD:   fp8  x fp8  (E4M3 x E4M3)
-//   DGRAD: bf8  x fp8  (E5M2 x E4M3)
-//   WGRAD: fp8  x bf8  (E4M3 x E5M2)
-// ============================================================================
-
-enum class GemmPass { FWD, DGRAD, WGRAD };
-
-// ============================================================================
-// Shape definition: describes a GEMM from the model architecture.
-//
-//   Forward / Dgrad: M = tokens, dim1 = N, dim2 = K
-//   Wgrad:           K = tokens, dim1 = M, dim2 = N
-// ============================================================================
-
-struct ShapeDef {
-    const char* label;
-    size_t dim1;
-    size_t dim2;
-    GemmPass pass;
-};
-
-// DeepSeek3 (hidden=7168, MLA, seq=4096)
-
-static const ShapeDef deepseek3_shapes[] = {
-    // Forward (M=tokens, N, K)
-    {"DeepSeek3_Linear0_fwd",        1536,  7168, GemmPass::FWD},
-    {"DeepSeek3_Linear1_fwd",         576,  7168, GemmPass::FWD},
-    {"DeepSeek3_LNLinear0_fwd",     24576,  1536, GemmPass::FWD},
-    {"DeepSeek3_LNLinear1_fwd",     32768,   512, GemmPass::FWD},
-    {"DeepSeek3_Linear_attn_fwd",    7168, 16384, GemmPass::FWD},
-    {"DeepSeek3_LNMLP_gateup_fwd",  36864,  7168, GemmPass::FWD},
-    {"DeepSeek3_LNMLP_down_fwd",     7168, 18432, GemmPass::FWD},
-    {"DeepSeek3_SharedExp_gu_fwd",   4096,  7168, GemmPass::FWD},
-    {"DeepSeek3_SharedExp_dn_fwd",   7168,  2048, GemmPass::FWD},
-    {"DeepSeek3_TopKRouter_fwd",      256,  7168, GemmPass::FWD},
-    // Dgrad (M=tokens, N, K)
-    {"DeepSeek3_attn_dgrad",        16384,  7168, GemmPass::DGRAD},
-    {"DeepSeek3_LNLinear1_dgrad",     512, 32768, GemmPass::DGRAD},
-    {"DeepSeek3_LNLinear0_dgrad",    1536, 24576, GemmPass::DGRAD},
-    {"DeepSeek3_SharedExp_dn_dgrad", 2048,  7168, GemmPass::DGRAD},
-    {"DeepSeek3_SharedExp_gu_dgrad", 7168,  4096, GemmPass::DGRAD},
-    {"DeepSeek3_TopKRouter_dgrad",   7168,   256, GemmPass::DGRAD},
-    {"DeepSeek3_MLP_post_dgrad",     7168, 14336, GemmPass::DGRAD},
-    // Wgrad (M, N, K=tokens)
-    {"DeepSeek3_attn_wgrad",        16384,  7168, GemmPass::WGRAD},
-    {"DeepSeek3_LNLinear1_wgrad",     512, 32768, GemmPass::WGRAD},
-    {"DeepSeek3_LNLinear0_wgrad",    1536, 24576, GemmPass::WGRAD},
-    {"DeepSeek3_SharedExp_dn_wgrad", 2048,  7168, GemmPass::WGRAD},
-    {"DeepSeek3_SharedExp_gu_wgrad", 7168,  4096, GemmPass::WGRAD},
-    {"DeepSeek3_TopKRouter_wgrad",   7168,   256, GemmPass::WGRAD},
-};
-
-// DeepSeek3 LM Head (large N, memory-intensive)
-static const ShapeDef deepseek3_lm_head_shapes[] = {
-    {"DeepSeek3_LMHead_fwd",     129280,   7168, GemmPass::FWD},
-    {"DeepSeek3_LMHead_dgrad",     7168, 129280, GemmPass::DGRAD},
-    {"DeepSeek3_LMHead_wgrad",     7168, 129280, GemmPass::WGRAD},
-};
-
-// Qwen3 (hidden=4096, GQA, seq=4096)
-
-static const ShapeDef qwen3_shapes[] = {
-    // Forward (M=tokens, N, K)
-    {"Qwen3_LNLinear_QKV_fwd",  9216,  4096, GemmPass::FWD},
-    {"Qwen3_Linear_attn_fwd",   4096,  8192, GemmPass::FWD},
-    {"Qwen3_Router_fwd",         128,  4096, GemmPass::FWD},
-    // Dgrad (M=tokens, N, K)
-    {"Qwen3_Router_dgrad",      4096,   128, GemmPass::DGRAD},
-    {"Qwen3_Linear_attn_dgrad", 8192,  4096, GemmPass::DGRAD},
-    {"Qwen3_LNLinear_dgrad",    4096,  9216, GemmPass::DGRAD},
-    // Wgrad (M, N, K=tokens)
-    {"Qwen3_Router_wgrad",      4096,   128, GemmPass::WGRAD},
-    {"Qwen3_Linear_attn_wgrad", 8192,  4096, GemmPass::WGRAD},
-    {"Qwen3_LNLinear_wgrad",    4096,  9216, GemmPass::WGRAD},
-};
-
-// Qwen3 LM Head (large N, memory-intensive)
-static const ShapeDef qwen3_lm_head_shapes[] = {
-    {"Qwen3_LMHead_fwd",       151936,   4096, GemmPass::FWD},
-    {"Qwen3_LMHead_dgrad",       4096, 151936, GemmPass::DGRAD},
-    {"Qwen3_LMHead_wgrad",       4096, 151936, GemmPass::WGRAD},
-};
-
-// ====================================================
-// Test case: a concrete (M, K, N) shape with pass info
-// ====================================================
-
-std::ostream& operator<<(std::ostream& os, const ShapeDef& s) {
-    return os << s.label;
-}
-
-static void resolve_mkn(const ShapeDef& s, size_t mbs,
-                         size_t& m, size_t& k, size_t& n) {
-    size_t tokens = mbs * 4096;
-    switch (s.pass) {
-        case GemmPass::FWD: // Fallthrough, same as DGRAD
-        case GemmPass::DGRAD:
-            m = tokens;
-            n = s.dim1;
-            k = s.dim2;
-            break;
-        case GemmPass::WGRAD:
-            m = s.dim1;
-            n = s.dim2;
-            k = tokens;
-            break;
-    }
-}
-
-// ============================================================================
-// MXFP8 dequantize-based GEMM correctness test
-//
-// 1. Create random source matrices A_src, B_src in D_Type (bf16)
-// 2. Quantize:   A_src -> A_fp8, B_src -> B_fp8  (MXFP8 block scaling)
-// 3. Dequantize: A_fp8 -> A_ref, B_fp8 -> B_ref  (back to D_Type)
-// 4. Swizzle scales for gfx1250 (if needed)
-// 5. MXFP8 GEMM:     D     = A_fp8 * B_fp8
-// 6. Non-FP8 GEMM:   D_ref = A_ref * B_ref
-// 7. Compare D vs D_ref
-// ============================================================================
-
-template <typename A_Type, typename B_Type, typename D_Type>
-void performMxfp8DqTest(size_t m, size_t k, size_t n, bool transa, bool transb) {
-    DType atype = TypeInfo<A_Type>::dtype;
-    DType btype = TypeInfo<B_Type>::dtype;
-    DType dtype = TypeInfo<D_Type>::dtype;
-
-    ASSERT_TRUE(isFp8Type(atype) && isFp8Type(btype)) << "FP8/BF8 input types expected";
-    ASSERT_FALSE(isFp8Type(dtype)) << "Non-FP8 output type expected";
-
-    if (m % 16 || n % 16) {
-        GTEST_SKIP() << "MXFP8 requires M & N to be multiples of 16";
-    }
-    if (k % 128) {
-        GTEST_SKIP() << "MXFP8 requires K to be a multiple of 128";
-    }
-
-    cudaDeviceProp prop;
-    (void)cudaGetDeviceProperties(&prop, 0);
-
-    bool mxfp8_supported = (prop.major == 9 && prop.minor >= 5) || prop.major >= 12;
-    if (!mxfp8_supported) {
-        GTEST_SKIP() << "MXFP8 is not supported on this GPU";
-    }
-
-    TShape a_shape = transa ? TShape{m, k} : TShape{k, m};
-    TShape b_shape = transb ? TShape{k, n} : TShape{n, k};
-
-    // 1. Create random source matrices
-    Tensor A_src("A_src", a_shape, dtype);
-    Tensor B_src("B_src", b_shape, dtype);
-    fillUniform(&A_src);
-    fillUniform(&B_src);
-
-    // 2. Quantize to FP8 with MXFP8 scaling
-    Tensor A_fp8("A_fp8", a_shape, atype, transa, !transa,
-                 NVTEScalingMode::NVTE_MXFP8_1D_SCALING);
-    Tensor B_fp8("B_fp8", b_shape, btype, !transb, transb,
-                 NVTEScalingMode::NVTE_MXFP8_1D_SCALING);
-    nvte_quantize(A_src.data(), A_fp8.data(), 0);
-    nvte_quantize(B_src.data(), B_fp8.data(), 0);
-
-    // 3. Dequantize back to reference type
-    Tensor A_ref("A_ref", a_shape, dtype);
-    Tensor B_ref("B_ref", b_shape, dtype);
-    nvte_dequantize(A_fp8.data(), A_ref.data(), 0);
-    nvte_dequantize(B_fp8.data(), B_ref.data(), 0);
-
-    // 4. Swizzle scales for gfx1250
-    if (prop.major == 12) {
-        const bool a_colwise = !transa;
-        const bool b_colwise = transb;
-        if (!a_colwise) swizzle_mxfp8_scales(A_fp8, true);
-        if (a_colwise)  swizzle_mxfp8_scales(A_fp8, false);
-        if (!b_colwise) swizzle_mxfp8_scales(B_fp8, true);
-        if (b_colwise)  swizzle_mxfp8_scales(B_fp8, false);
-    }
-
-    Tensor bias;
-    Tensor pre_gelu_out;
-
-    size_t workspace_size = 67108864;  // 64 MB
-    Tensor Workspace("Workspace", TShape{workspace_size}, DType::kByte);
-
-    // 5. MXFP8 GEMM
-    Tensor D("D", TShape{n, m}, dtype);
-    nvte_cublas_gemm(A_fp8.data(), B_fp8.data(), D.data(),
-                     bias.data(), pre_gelu_out.data(),
-                     transa, transb, false,
-                     Workspace.data(), false, false,
-                     prop.multiProcessorCount, 0);
-    D.to_cpu();
-
-    // 6. Non-FP8 reference GEMM
-    Tensor D_ref("D_ref", TShape{n, m}, dtype);
-    nvte_cublas_gemm(A_ref.data(), B_ref.data(), D_ref.data(),
-                     bias.data(), pre_gelu_out.data(),
-                     transa, transb, false,
-                     Workspace.data(), false, false,
-                     prop.multiProcessorCount, 0);
-    D_ref.to_cpu();
-
-    // Check for CUDA errors
-    (void)cudaDeviceSynchronize();
-    auto err = cudaGetLastError();
-    ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
-
-    // 7. Compare results
-    // The MXFP8 GEMM and bf16 reference GEMM use different internal accumulation
-    // paths, so results can differ by up to 1 ULP in bf16 (~1.5-2% relative).
-    auto [atol, rtol] = getTolerances(dtype);
-    atol = std::max(atol, 1e-3);
-    rtol = std::max(rtol, 2e-2);
-    compareResults("D", D, D_ref.rowwise_cpu_dptr<D_Type>(), true, atol, rtol);
-}
-
-// ============================================================================
-// Test suite
-// ============================================================================
-
-using ProdGemmParam = std::tuple<ShapeDef, size_t, Layout>;
-
-class ProdGemmTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
-
-// Known-failing GEMM shapes on gfx950
-static const std::set<std::string> kMI355XSkips = {
-    // N=576 + NT: rocroller LDS stride mismatch (all elements wrong, ~100x off)
-    "DeepSeek3_Linear1_fwd_mbs1_NT",
-    "DeepSeek3_Linear1_fwd_mbs2_NT",
-    "DeepSeek3_Linear1_fwd_mbs4_NT",
-    // Crash (likely OOM / kernel fault)
-    "Qwen3_LMHead_fwd_mbs4_TN",
-    "Qwen3_LMHead_fwd_mbs4_NN",
-    "Qwen3_LMHead_fwd_mbs4_NT",
-};
-
-TEST_P(ProdGemmTestSuite, TestMxfp8Dq) {
-    const auto& shape = std::get<0>(GetParam());
-    size_t mbs = std::get<1>(GetParam());
-    const auto& layout = std::get<2>(GetParam());
-    bool transa = layout.first;
-    bool transb = layout.second;
-
-    static const char* tn_map[2][2] = {{"NN", "NT"}, {"TN", "TT"}};
-    std::string name = std::string(shape.label) + "_mbs" + std::to_string(mbs)
-                       + "_" + tn_map[transa][transb];
-    if (kMI355XSkips.count(name)) {
-        GTEST_SKIP() << "Known MI355X hipBLASLt failure: " << name;
-    }
-
-    size_t m, k, n;
-    resolve_mkn(shape, mbs, m, k, n);
-
-    switch (shape.pass) {
-        case GemmPass::FWD:
-            performMxfp8DqTest<fp8, fp8, bf16>(m, k, n, transa, transb);
-            break;
-        case GemmPass::DGRAD:
-            performMxfp8DqTest<bf8, fp8, bf16>(m, k, n, transa, transb);
-            break;
-        case GemmPass::WGRAD:
-            performMxfp8DqTest<fp8, bf8, bf16>(m, k, n, transa, transb);
-            break;
-    }
-}
-
-static inline std::string TN(const Layout& layout) {
-    static const char* map[2][2] = {{"NN", "NT"}, {"TN", "TT"}};
-    return map[layout.first][layout.second];
-}
-
-static inline auto testName(const testing::TestParamInfo<ProdGemmParam>& info) {
-    const auto& shape = std::get<0>(info.param);
-    size_t mbs = std::get<1>(info.param);
-    const auto& layout = std::get<2>(info.param);
-    return std::string(shape.label) + "_mbs" + std::to_string(mbs) + "_" + TN(layout);
-}
-
-// DeepSeek3 model shapes
-INSTANTIATE_TEST_SUITE_P(
-    ProdGemmDeepSeek3, ProdGemmTestSuite,
-    ::testing::Combine(
-        ::testing::ValuesIn(deepseek3_shapes),
-        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
-        ::testing::ValuesIn(kLayouts)),
-    testName);
-
-// Qwen3 model shapes
-INSTANTIATE_TEST_SUITE_P(
-    ProdGemmQwen3, ProdGemmTestSuite,
-    ::testing::Combine(
-        ::testing::ValuesIn(qwen3_shapes),
-        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
-        ::testing::ValuesIn(kLayouts)),
-    testName);
-
-// DeepSeek3 LM Head shapes (very large N, memory-intensive)
-INSTANTIATE_TEST_SUITE_P(
-    ProdGemmDeepSeek3LMHead, ProdGemmTestSuite,
-    ::testing::Combine(
-        ::testing::ValuesIn(deepseek3_lm_head_shapes),
-        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
-        ::testing::ValuesIn(kLayouts)),
-    testName);
-
-// Qwen3 LM Head shapes (very large N, memory-intensive)
-INSTANTIATE_TEST_SUITE_P(
-    ProdGemmQwen3LMHead, ProdGemmTestSuite,
-    ::testing::Combine(
-        ::testing::ValuesIn(qwen3_lm_head_shapes),
-        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
-        ::testing::ValuesIn(kLayouts)),
-    testName);
-
-}  // namespace
-
-#endif  // __HIP_PLATFORM_AMD__
diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index 392e641d5..fbcfdf89d 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -22,9 +22,6 @@
 #endif
 #include <omp.h>
 
-#ifdef __HIP_PLATFORM_AMD__
-#include <transformer_engine/swizzle.h>
-#endif
 #include <transformer_engine/transformer_engine.h>
 #include "util/logging.h"
 
@@ -1317,44 +1314,4 @@ std::array<size_t, 4> get_scale_tensor_dims(const size_t rows,
     return {unpadded_blocks_Y, unpadded_blocks_X, blocks_Y, blocks_X};
 }
 
-#ifdef __HIP_PLATFORM_AMD__
-void swizzle_mxfp8_scales(Tensor& t, bool rowwise) {
-    void* scale_ptr = rowwise ? t.rowwise_scale_inv_dptr()
-                              : t.columnwise_scale_inv_dptr();
-    if (!scale_ptr) return;
-
-    const NVTEShape scale_shape = rowwise ? t.rowwise_scale_inv_shape()
-                                          : t.columnwise_scale_inv_shape();
-    const NVTEShape data_shape  = rowwise ? t.rowwise_shape()
-                                          : t.columnwise_shape();
-
-    size_t num_scales = 1;
-    for (size_t d = 0; d < scale_shape.ndim; d++) num_scales *= scale_shape.data[d];
-
-    uint8_t* d_tmp = nullptr;
-    NVTE_CHECK_CUDA(cudaMalloc(&d_tmp, num_scales));
-
-    TensorWrapper input_tw(NVTE_MXFP8_1D_SCALING);
-    TensorWrapper output_tw(NVTE_MXFP8_1D_SCALING);
-    output_tw.set_with_gemm_swizzled_scales(true);
-
-    if (rowwise) {
-        input_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
-        input_tw.set_rowwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
-        output_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
-        output_tw.set_rowwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
-    } else {
-        input_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
-        input_tw.set_columnwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
-        output_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
-        output_tw.set_columnwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
-    }
-
-    nvte_swizzle_scaling_factors(input_tw.data(), output_tw.data(), 0);
-    NVTE_CHECK_CUDA(cudaDeviceSynchronize());
-    NVTE_CHECK_CUDA(cudaMemcpy(scale_ptr, d_tmp, num_scales, cudaMemcpyDeviceToDevice));
-    NVTE_CHECK_CUDA(cudaFree(d_tmp));
-}
-#endif // #ifdef __HIP_PLATFORM_AMD__
-
 }  // namespace test
diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index 6c37ccc57..a25b7b61e 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -581,10 +581,6 @@ int32_t getDeviceComputeCapability();
 constexpr int32_t hopperComputeCapability = 90;
 constexpr int32_t blackwellComputeCapability = 100;
 
-#ifdef USE_ROCM
-void swizzle_mxfp8_scales(Tensor& t, bool rowwise);
-#endif
-
 }  // namespace test
 
 #if FP4_TYPE_SUPPORTED