Skip to content

Commit 4833eb9

Browse files
author
zhangyue
committed
feat(ascend): add GEMM kernel, NPU test infra, and example integration
- Add Ascend GEMM specialization using `aclnnAddmm`/`aclnnBaddbmm`. - Add `get_npu_stream()` helper and NPU device detection in test utils. - Add `skip_unsupported_dtype` fixture for Ascend in conftest. - Update `runtime_api.h` with Ascend backend entry.
1 parent 88a4379 commit 4833eb9

5 files changed

Lines changed: 135 additions & 11 deletions

File tree

examples/runtime_api.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
#elif WITH_MOORE
2020
#include "moore/gemm/mublas.h"
2121
#include "moore/runtime_.h"
22+
#elif WITH_ASCEND
23+
#include "ascend/gemm/kernel.h"
24+
#include "ascend/runtime_.h"
2225
#elif WITH_CPU
2326
#include "cpu/gemm/gemm.h"
2427
#include "cpu/runtime_.h"
@@ -38,6 +41,8 @@ using DefaultRuntimeUtils = Runtime<Device::Type::kMetax>;
3841
using DefaultRuntimeUtils = Runtime<Device::Type::kCambricon>;
3942
#elif WITH_MOORE
4043
using DefaultRuntimeUtils = Runtime<Device::Type::kMoore>;
44+
#elif WITH_ASCEND
45+
using DefaultRuntimeUtils = Runtime<Device::Type::kAscend>;
4146
#elif WITH_CPU
4247
using DefaultRuntimeUtils = Runtime<Device::Type::kCpu>;
4348
#endif

src/ascend/gemm/kernel.h

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#ifndef INFINI_OPS_ASCEND_GEMM_KERNEL_H_
2+
#define INFINI_OPS_ASCEND_GEMM_KERNEL_H_
3+
4+
#include "acl/acl.h"
5+
#include "aclnn/aclnn_base.h"
6+
#include "aclnnop/aclnn_addmm.h"
7+
#include "aclnnop/aclnn_baddbmm.h"
8+
#include "ascend/common.h"
9+
#include "ascend/workspace_pool_.h"
10+
#include "base/gemm.h"
11+
#include "operator.h"
12+
13+
namespace infini::ops {
14+
15+
template <>
16+
class Operator<Gemm, Device::Type::kAscend> : public Gemm {
17+
public:
18+
Operator(const Tensor a, const Tensor b, std::optional<float> alpha,
19+
std::optional<float> beta, std::optional<int> trans_a,
20+
std::optional<int> trans_b, Tensor c)
21+
: Gemm(a, b, alpha, beta, trans_a, trans_b, c),
22+
batched_{batch_count_ > 1},
23+
alpha_val_{alpha.value_or(1.0f)},
24+
beta_val_{beta.value_or(1.0f)} {
25+
alpha_scalar_ = aclCreateScalar(&alpha_val_, ACL_FLOAT);
26+
beta_scalar_ = aclCreateScalar(&beta_val_, ACL_FLOAT);
27+
}
28+
29+
~Operator() {
30+
aclDestroyScalar(alpha_scalar_);
31+
aclDestroyScalar(beta_scalar_);
32+
}
33+
34+
void operator()(const Tensor a, const Tensor b, std::optional<float> alpha,
35+
std::optional<float> beta, std::optional<int> trans_a,
36+
std::optional<int> trans_b, Tensor c) const override {
37+
auto stream = static_cast<aclrtStream>(stream_);
38+
39+
auto t_self = ascend::buildAclTensor(c);
40+
auto t_a = ascend::buildAclTensor(a, trans_a_);
41+
auto t_b = ascend::buildAclTensor(b, trans_b_);
42+
auto t_out = ascend::buildAclTensor(c);
43+
44+
uint64_t ws_needed = 0;
45+
aclOpExecutor* executor = nullptr;
46+
47+
if (batched_) {
48+
aclnnBaddbmmGetWorkspaceSize(t_self, t_a, t_b, beta_scalar_,
49+
alpha_scalar_, t_out, 0, &ws_needed,
50+
&executor);
51+
} else {
52+
aclnnAddmmGetWorkspaceSize(t_self, t_a, t_b, beta_scalar_, alpha_scalar_,
53+
t_out, 0, &ws_needed, &executor);
54+
}
55+
56+
auto& arena = ascend::workspacePool().ensure(stream, ws_needed);
57+
58+
if (batched_) {
59+
aclnnBaddbmm(arena.buf, ws_needed, executor, stream);
60+
} else {
61+
aclnnAddmm(arena.buf, ws_needed, executor, stream);
62+
}
63+
64+
aclDestroyTensor(t_self);
65+
aclDestroyTensor(t_a);
66+
aclDestroyTensor(t_b);
67+
aclDestroyTensor(t_out);
68+
}
69+
70+
private:
71+
bool batched_;
72+
float alpha_val_;
73+
float beta_val_;
74+
aclScalar* alpha_scalar_ = nullptr;
75+
aclScalar* beta_scalar_ = nullptr;
76+
};
77+
78+
} // namespace infini::ops
79+
80+
#endif

tests/conftest.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,25 @@ def set_seed_per_test(request):
3838
_set_random_seed(_hash(_test_case_path_from_request(request)))
3939

4040

41+
_NPU_UNSUPPORTED_DTYPES = {torch.float64}
42+
43+
# torch_npu does not implement random number generation for uint16/uint32/uint64.
44+
for _bits in (16, 32, 64):
45+
_t = getattr(torch, f"uint{_bits}", None)
46+
if _t is not None:
47+
_NPU_UNSUPPORTED_DTYPES.add(_t)
48+
49+
50+
@pytest.fixture(autouse=True)
def skip_unsupported_dtype(request):
    """Auto-skip parametrized cases whose dtype the Ascend NPU cannot run."""
    callspec = getattr(request.node, "callspec", None)
    if callspec is None:
        # Test is not parametrized; nothing to inspect.
        return

    device = callspec.params.get("device")
    dtype = callspec.params.get("dtype")
    if device == "npu" and dtype in _NPU_UNSUPPORTED_DTYPES:
        pytest.skip(f"{dtype} not supported on Ascend 910B")
4160
def _set_random_seed(seed):
4261
random.seed(seed)
4362
torch.manual_seed(seed)

tests/test_gemm.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import pytest
33
import torch
44

5-
from tests.utils import Payload, randn_strided
5+
from tests.utils import Payload, get_npu_stream, randn_strided
66

77

88
@pytest.mark.auto_act_and_assert
@@ -84,16 +84,22 @@ def test_gemm(
8484

8585

8686
def _gemm(a, b, alpha, beta, trans_a, trans_b, c, implementation_index=0):
    """Invoke infini.ops.gemm on the given operands and return `c`.

    NPU tensors additionally need the current torch_npu stream handle passed
    explicitly so the kernel launches on the stream PyTorch is using.
    """
    if a.device.type == "npu":
        # Forward implementation_index here too — the original NPU branch
        # silently dropped it, which would make implementation-parametrized
        # tests vacuous on Ascend. Default 0 preserves existing behavior.
        infini.ops.gemm(
            a, b, alpha, beta, trans_a, trans_b, c,
            stream=get_npu_stream(a),
            implementation_index=implementation_index,
        )
    else:
        infini.ops.gemm(
            a,
            b,
            alpha,
            beta,
            trans_a,
            trans_b,
            c,
            implementation_index=implementation_index,
        )
    return c
99105

tests/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,18 @@ def get_available_devices():
3232
if hasattr(torch, "musa") and torch.musa.is_available():
3333
devices.append("musa")
3434

35+
if hasattr(torch, "npu") and torch.npu.is_available():
36+
devices.append("npu")
37+
3538
return tuple(devices)
3639

3740

3841
with contextlib.suppress(ImportError, ModuleNotFoundError):
3942
import torch_mlu # noqa: F401
4043

44+
with contextlib.suppress(ImportError, ModuleNotFoundError):
45+
import torch_npu # noqa: F401
46+
4147

4248
def empty_strided(shape, strides, *, dtype=None, device=None):
4349
if strides is None:
@@ -76,6 +82,14 @@ def randint_strided(low, high, shape, strides, *, dtype=None, device=None):
7682
return output
7783

7884

85+
def get_npu_stream(tensor):
86+
"""Return the current NPU stream handle for `tensor`, or 0 on other devices."""
87+
if tensor.device.type != "npu":
88+
return 0
89+
90+
return torch.npu.current_stream().npu_stream
91+
92+
7993
def clone_strided(input):
8094
output = empty_strided(
8195
input.size(), input.stride(), dtype=input.dtype, device=input.device

0 commit comments

Comments
 (0)