drivenets
diff --git a/aiter/fused_moe_bf16_asm.py b/aiter/fused_moe_bf16_asm.py
Lines changed: 18 additions & 6 deletions
diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json
Lines changed: 2 additions & 1 deletion
diff --git a/aiter/ops/quant.py b/aiter/ops/quant.py
Lines changed: 2 additions & 0 deletions
diff --git a/csrc/include/quant.h b/csrc/include/quant.h
Lines changed: 7 additions & 5 deletions
diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp
Lines changed: 6 additions & 4 deletions
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 
 import torch
 import torch.nn.functional as F
@@ -68,6 +68,7 @@ def asm_moe(
     block_shape=None,
     expert_mask=None,
     activation=ActivationType.Silu,
+    local_expert_hash=None,
 ):
     E, model_dim, inter_dim = w2.shape
     global_E = E
@@ -187,14 +188,25 @@ def asm_moe(
             a8_scale = torch.empty((topk * M), dtype=dtypes.fp32, device=device)
 
             # moe_smoothquant_fwd need topk_ids which contains local_expert_id
-            if expert_mask is not None:
+            if expert_mask is not None and local_expert_hash is None:
                 local_expert_hash = expert_mask.cumsum(0, dtype=dtypes.i32)
                 local_expert_hash[local_expert_hash > 0] -= 1
-                topk_ids = local_expert_hash[topk_ids]
-
-            aiter.moe_smoothquant_fwd(
-                a8, hidden_states, fc1_smooth_scale, topk_ids, a8_scale
+                local_expert_hash[expert_mask == 0] = -1
+            #     topk_ids = local_expert_hash[topk_ids]
+
+            # aiter.moe_smoothquant_fwd(
+            #     a8, hidden_states, fc1_smooth_scale, topk_ids, a8_scale
+            # )
+            aiter.smooth_per_token_scaled_quant(
+                a8.view(topk, M, model_dim).transpose(0, 1),
+                hidden_states.view(M, 1, model_dim).expand(-1, topk, -1),
+                a8_scale,
+                fc1_smooth_scale,
+                topk_ids,
+                smooth_scale_map_hash=local_expert_hash,
+                enable_ps=True,
             )
+            a8 = a8.view(-1, model_dim)
         else:
             if (
                 w1.dtype == dtypes.fp8
 
@@ -738,7 +738,8 @@
         ],
         "extra_ldflags": "None",
         "extra_include": [
-            "f'{AITER_CSRC_DIR}/include/ck_tile'"
+            "f'{AITER_CSRC_DIR}/include/ck_tile'",
+            "f'{AITER_CSRC_DIR}/include/opus'"
         ],
         "verbose": "False",
         "blob_gen_cmd": "''"
 
@@ -447,6 +447,8 @@ def smooth_per_token_scaled_quant(
     shuffle_scale: bool = False,
     num_rows: Optional[torch.Tensor] = None,
     num_rows_factor: int = 1,
+    smooth_scale_map_hash: Optional[torch.Tensor] = None,
+    enable_ps: bool = True,
 ) -> None: ...
 
 
 
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
 #pragma once
 
 #include <torch/extension.h>
@@ -35,10 +35,12 @@ void smooth_per_token_scaled_quant(
     torch::Tensor const& input, // [..., d]
     torch::Tensor& scales,
     torch::Tensor const& smooth_scale,
-    std::optional<torch::Tensor> const& smooth_scale_map = std::nullopt,
-    bool shuffle_scale                                   = false,
-    std::optional<torch::Tensor> const& num_rows         = std::nullopt,
-    int num_rows_factor                                  = 1);
+    std::optional<torch::Tensor> const& smooth_scale_map      = std::nullopt,
+    bool shuffle_scale                                        = false,
+    std::optional<torch::Tensor> const& num_rows              = std::nullopt,
+    int num_rows_factor                                       = 1,
+    std::optional<torch::Tensor> const& smooth_scale_map_hash = std::nullopt,
+    bool enable_ps                                            = true);
 
 void partial_transpose(torch::Tensor& out,         // [rows, d]
                        torch::Tensor const& input, // [rows, d]
 
@@ -1386,10 +1386,12 @@ namespace py = pybind11;
           py::arg("input"),                                              \
           py::arg("scales"),                                             \
           py::arg("smooth_scale"),                                       \
-          py::arg("smooth_scale_map") = std::nullopt,                    \
-          py::arg("shuffle_scale")    = false,                           \
-          py::arg("num_rows")         = std::nullopt,                    \
-          py::arg("num_rows_factor")  = 1);                               \
+          py::arg("smooth_scale_map")      = std::nullopt,               \
+          py::arg("shuffle_scale")         = false,                      \
+          py::arg("num_rows")              = std::nullopt,               \
+          py::arg("num_rows_factor")       = 1,                          \
+          py::arg("smooth_scale_map_hash") = std::nullopt,               \
+          py::arg("enable_ps")             = true);                                  \
     m.def("partial_transpose",                                           \
           &aiter::partial_transpose,                                     \
           py::arg("out"),                                                \