Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated files
build/
generated/
.worktrees/

# Prerequisites
*.d
Expand Down
27 changes: 24 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ option(WITH_ILUVATAR "Enable Iluvatar GPU backend" OFF)
option(WITH_METAX "Enable MetaX backend" OFF)
option(WITH_CAMBRICON "Enable Cambricon backend" OFF)
option(WITH_MOORE "Enable Moore backend" OFF)
option(WITH_ASCEND "Enable Ascend backend" OFF)

option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF)
Expand Down Expand Up @@ -71,20 +72,25 @@ if(AUTO_DETECT_DEVICES)
set(WITH_MOORE OFF)
set(WITH_MOORE OFF CACHE BOOL "Enable Moore backend" FORCE)
endif()

if(DEFINED ENV{ASCEND_HOME_PATH} OR EXISTS "/dev/davinci0")
  # Either the CANN toolkit env var is set or an NPU device node is present.
  # Update both the normal variable and the cache entry, matching the pattern
  # used by the other backends in this file (see the WITH_MOORE handling
  # above): without the CACHE update the cached WITH_ASCEND stays stale and
  # tools reading the cache (ccmake, cmake-gui, presets) see the wrong value.
  set(WITH_ASCEND ON)
  set(WITH_ASCEND ON CACHE BOOL "Enable Ascend backend" FORCE)
  message(STATUS "Auto-detected Ascend environment.")
endif()
endif()

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)

# Only one CUDA-like GPU backend can be enabled at a time.
set(_gpu_backend_count 0)
foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_METAX WITH_MOORE)
foreach(_gpu_backend WITH_NVIDIA WITH_ILUVATAR WITH_METAX WITH_MOORE WITH_ASCEND)
if(${_gpu_backend})
math(EXPR _gpu_backend_count "${_gpu_backend_count} + 1")
endif()
endforeach()

if(_gpu_backend_count GREATER 1)
message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_METAX`, and `WITH_MOORE` are mutually exclusive. Build one GPU backend at a time.")
message(FATAL_ERROR "`WITH_NVIDIA`, `WITH_ILUVATAR`, `WITH_METAX`, `WITH_MOORE`, and `WITH_ASCEND` are mutually exclusive. Build one GPU backend at a time.")
endif()

if(WITH_NVIDIA)
Expand Down Expand Up @@ -178,8 +184,23 @@ if(WITH_CAMBRICON)
find_library(CAMBRICON_PAPI_LIB NAMES cnpapi HINTS "${NEUWARE_HOME}/lib64" REQUIRED)
endif()

if(WITH_ASCEND)
  # Expose WITH_ASCEND=1 to every target in the build; headers such as
  # examples/runtime_api.h select the backend via this macro.
  add_compile_definitions(WITH_ASCEND=1)
  # Resolve the CANN toolkit root once and cache it. Precedence:
  #   1. user-provided ASCEND_HOME (cache or -D),
  #   2. non-empty ASCEND_HOME_PATH environment variable,
  #   3. conventional default install location.
  # NOTE: $ENV{...} is read at configure time only; re-run cmake after
  # changing the environment.
  if(NOT DEFINED ASCEND_HOME)
    if(DEFINED ENV{ASCEND_HOME_PATH} AND NOT "$ENV{ASCEND_HOME_PATH}" STREQUAL "")
      set(ASCEND_HOME "$ENV{ASCEND_HOME_PATH}" CACHE PATH "Ascend toolkit root")
    else()
      set(ASCEND_HOME "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "Ascend toolkit root")
    endif()
  endif()
  # Fail fast with an actionable message when the toolkit directory is absent.
  if(NOT EXISTS "${ASCEND_HOME}")
    message(FATAL_ERROR "`WITH_ASCEND` is ON but `${ASCEND_HOME}` was not found. Set ASCEND_HOME_PATH.")
  endif()
  message(STATUS "Using Ascend from `${ASCEND_HOME}`.")
endif()

# If all other platforms are not enabled, CPU is enabled by default.
if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_MOORE AND NOT WITH_CAMBRICON)
if(NOT WITH_NVIDIA AND NOT WITH_ILUVATAR AND NOT WITH_METAX AND NOT WITH_MOORE AND NOT WITH_CAMBRICON AND NOT WITH_ASCEND)
add_compile_definitions(WITH_CPU=1)
endif()

Expand Down
5 changes: 5 additions & 0 deletions examples/runtime_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
#elif WITH_MOORE
#include "moore/gemm/mublas.h"
#include "moore/runtime_.h"
#elif WITH_ASCEND
#include "ascend/gemm/kernel.h"
#include "ascend/runtime_.h"
#elif WITH_CPU
#include "cpu/gemm/gemm.h"
#include "cpu/runtime_.h"
Expand All @@ -38,6 +41,8 @@ using DefaultRuntimeUtils = Runtime<Device::Type::kMetax>;
using DefaultRuntimeUtils = Runtime<Device::Type::kCambricon>;
#elif WITH_MOORE
using DefaultRuntimeUtils = Runtime<Device::Type::kMoore>;
#elif WITH_ASCEND
using DefaultRuntimeUtils = Runtime<Device::Type::kAscend>;
#elif WITH_CPU
using DefaultRuntimeUtils = Runtime<Device::Type::kCpu>;
#endif
Expand Down
84 changes: 61 additions & 23 deletions scripts/generate_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,26 +91,56 @@ def __init__(self, name, constructors, calls):
self.calls = calls


def _find_optional_tensor_params(op_name):
"""Return a set of parameter names declared as `std::optional<Tensor>` in
the base header. libclang resolves the type to ``int`` when the STL
headers are not fully available, so we fall back to a regex scan of the
source text.
"""
import re

source = (_BASE_DIR / f"{op_name}.h").read_text()
return set(re.findall(r"std::optional<Tensor>\s+(\w+)", source))


def _generate_pybind11(operator):
optional_tensor_params = _find_optional_tensor_params(operator.name)

def _is_optional_tensor(arg):
if arg.spelling in optional_tensor_params:
return True
return "std::optional" in arg.type.spelling and "Tensor" in arg.type.spelling

def _generate_params(node):
return (
", ".join(
f"{arg.type.spelling} {arg.spelling}"
for arg in node.get_arguments()
if arg.spelling != "stream"
)
.replace("const Tensor", "py::object")
.replace("Tensor", "py::object")
)
parts = []
for arg in node.get_arguments():
if arg.spelling == "stream":
continue
if _is_optional_tensor(arg):
parts.append(f"std::optional<py::object> {arg.spelling}")
else:
param = (
arg.type.spelling
.replace("const Tensor", "py::object")
.replace("Tensor", "py::object")
)
parts.append(f"{param} {arg.spelling}")
return ", ".join(parts)

def _generate_arguments(node):
return ", ".join(
f"TensorFromPybind11Handle({arg.spelling})"
if "Tensor" in arg.type.spelling
else arg.spelling
for arg in node.get_arguments()
if arg.spelling != "stream"
)
args = []
for arg in node.get_arguments():
if arg.spelling == "stream":
continue
if _is_optional_tensor(arg):
args.append(
f"OptionalTensorFromPybind11Handle({arg.spelling})"
)
elif "Tensor" in arg.type.spelling:
args.append(f"TensorFromPybind11Handle({arg.spelling})")
else:
args.append(arg.spelling)
return ", ".join(args)

op_name = operator.name

Expand All @@ -134,18 +164,24 @@ def _generate_call(op_name, call, method=True):

if not method:
params = (
f"{call_params}, std::size_t implementation_index"
f"{call_params}, std::size_t implementation_index, std::uintptr_t stream"
if call_params
else "std::size_t implementation_index"
else "std::size_t implementation_index, std::uintptr_t stream"
)
py_args = _generate_py_args(call)
py_args_str = f"{py_args}, " if py_args else ""

return f""" m.def("{op_name}", []({params}) {{
Config config;
config.set_implementation_index(implementation_index);
return Self::call({{}}, config, {call_args});
}}, {py_args_str}py::kw_only(), py::arg("implementation_index") = 0);"""
return (
f' m.def("{op_name}", []({params}) {{\n'
f" Config config;\n"
f" config.set_implementation_index(implementation_index);\n"
f" Handle handle;\n"
f" if (stream) {{\n"
f" handle.set_stream(reinterpret_cast<void*>(stream));\n"
f" }}\n"
f" return Self::call(handle, config, {call_args});\n"
f" }}, {py_args_str}py::kw_only(), py::arg(\"implementation_index\") = 0, py::arg(\"stream\") = 0);"
)

return f""" .def("__call__", [](const Self& self, {call_params}) {{
return static_cast<const Operator<Self>&>(self)({call_args});
Expand All @@ -169,6 +205,8 @@ def _generate_call(op_name, call, method=True):

#include "base/{op_name}.h"
#include "config.h"
#include "handle.h"
#include "operator.h"
#include "pybind11_utils.h"

namespace py = pybind11;
Expand Down
50 changes: 50 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -172,10 +172,60 @@ if(WITH_CAMBRICON)
list(APPEND DEVICE_LIST "cambricon")
endif()

if(WITH_ASCEND)
  # ASCEND_HOME is set by the top-level CMakeLists.txt.
  # Glob host-side sources; CONFIGURE_DEPENDS re-checks the glob on every
  # build so newly added files are picked up (at a per-build stat cost).
  file(GLOB_RECURSE ASCEND_SOURCES CONFIGURE_DEPENDS
    "ascend/*.cc"
    "ascend/*.cpp"
  )
  # Exclude kernel_impl.cpp — AscendC device code, not compiled by the host C++ compiler.
  list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*kernel_impl\\.cpp$")

  target_compile_definitions(infiniops PUBLIC WITH_ASCEND=1)
  target_sources(infiniops PRIVATE ${ASCEND_SOURCES})

  # Resolve the driver lib dir two levels above the toolkit root.
  get_filename_component(ASCEND_ROOT "${ASCEND_HOME}/../.." ABSOLUTE)

  # Prefer the real driver HAL; fall back to the toolkit stub for build-only
  # environments (e.g., Docker CI images without hardware drivers installed).
  # Layouts probed, in order:
  #   real driver:  <root>/driver/lib64/driver/
  #   CANN <= 8.0:  <toolkit>/runtime/lib64/stub/
  #   CANN >= 8.5:  <toolkit>/<arch>-linux/devlib/
  # NOTE(review): confirm the CANN version boundaries for the stub vs. devlib
  # layouts against the installed toolkit documentation.
  set(ASCEND_HAL_REAL "${ASCEND_ROOT}/driver/lib64/driver/libascend_hal.so")
  set(ASCEND_HAL_STUB "${ASCEND_HOME}/runtime/lib64/stub/libascend_hal.so")
  set(ASCEND_HAL_DEVLIB "${ASCEND_HOME}/${CMAKE_SYSTEM_PROCESSOR}-linux/devlib/libascend_hal.so")
  if(EXISTS "${ASCEND_HAL_REAL}")
    set(ASCEND_HAL_LIB "${ASCEND_HAL_REAL}")
  elseif(EXISTS "${ASCEND_HAL_STUB}")
    set(ASCEND_HAL_LIB "${ASCEND_HAL_STUB}")
    message(STATUS "ascend_hal: driver not found, using stub for linking")
  elseif(EXISTS "${ASCEND_HAL_DEVLIB}")
    set(ASCEND_HAL_LIB "${ASCEND_HAL_DEVLIB}")
    message(STATUS "ascend_hal: driver not found, using devlib for linking")
  else()
    message(FATAL_ERROR "libascend_hal.so not found (tried ${ASCEND_HAL_REAL}, ${ASCEND_HAL_STUB}, and ${ASCEND_HAL_DEVLIB})")
  endif()

  # PUBLIC so consumers of infiniops inherit the ACL headers and libraries.
  target_include_directories(infiniops PUBLIC
    "${ASCEND_HOME}/include"
    "${ASCEND_HOME}/include/aclnn"
    "${ASCEND_HOME}/include/aclnnop")
  target_link_libraries(infiniops PUBLIC
    "${ASCEND_HOME}/lib64/libascendcl.so"
    "${ASCEND_HOME}/lib64/libnnopbase.so"
    "${ASCEND_HOME}/lib64/libopapi.so"
    "${ASCEND_HAL_LIB}")

  list(APPEND DEVICE_LIST "ascend")
endif()

target_include_directories(infiniops PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

if(GENERATE_PYTHON_BINDINGS)
find_package(Python COMPONENTS Interpreter REQUIRED)
# Always regenerate bindings so the included kernel headers match the
# active device list. Stale generated files (e.g., committed for one
# platform) would omit specializations for other enabled backends,
# causing link-time or runtime failures.
execute_process(
COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/scripts/generate_wrappers.py --devices ${DEVICE_LIST}
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
Expand Down
56 changes: 56 additions & 0 deletions src/ascend/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#ifndef INFINI_OPS_ASCEND_COMMON_H_
#define INFINI_OPS_ASCEND_COMMON_H_

#include <cstdint>
#include <vector>

#include "acl/acl.h"
#include "aclnn/acl_meta.h"
#include "ascend/data_type_.h"
#include "tensor.h"

namespace infini::ops::ascend {

// Build an aclTensor descriptor from an InfiniOps Tensor.
//
// When `transpose_last2` is true the last two dimensions are swapped in the
// descriptor (shape and strides) without copying data. This is used by GEMM
// and Matmul to express a transpose via the view.
//
// NOTE(review): `storageOffset` is hard-coded to 0 and non-positive strides
// are skipped in the storage-size computation below, so views with negative
// strides (e.g. flipped tensors) are presumably not supported — confirm
// against callers.
inline aclTensor* buildAclTensor(const Tensor& t,
                                 bool transpose_last2 = false) {
  // Copy shape/strides into int64_t vectors, the element type aclCreateTensor
  // expects.
  std::vector<int64_t> shape(t.shape().begin(), t.shape().end());
  std::vector<int64_t> strides(t.strides().begin(), t.strides().end());

  // Express the transpose purely in the view: swap the trailing two entries
  // of both shape and strides; the underlying buffer is untouched.
  if (transpose_last2 && shape.size() >= 2) {
    auto n = shape.size();
    std::swap(shape[n - 2], shape[n - 1]);
    std::swap(strides[n - 2], strides[n - 1]);
  }

  // Compute the minimum physical storage needed for this strided view.
  // For contiguous tensors this equals numel(); for non-contiguous (gapped)
  // tensors it may be larger; for broadcast (stride-0) tensors it may be
  // smaller. Passing the view shape as the storage shape causes
  // "ViewShape overlap" errors in ACLNN for non-contiguous inputs.
  int64_t storage_elems = 1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == 0) {
      // Any zero-length dimension means the view covers no elements at all.
      storage_elems = 0;
      break;
    }
    if (strides[i] > 0 && shape[i] > 1) {
      // Distance (in elements) to the farthest element along dimension i.
      storage_elems += static_cast<int64_t>(shape[i] - 1) * strides[i];
    }
  }
  // The storage is described to ACL as a flat 1-D buffer of that extent.
  std::vector<int64_t> storage_shape = {storage_elems};

  // NOTE(review): assumes aclCreateTensor copies the shape/stride arrays
  // (they are stack-local here) — confirm with the CANN ACL API reference.
  return aclCreateTensor(
      shape.data(), static_cast<int64_t>(shape.size()), toAclDtype(t.dtype()),
      strides.data(),
      /*storageOffset=*/0, ACL_FORMAT_ND, storage_shape.data(),
      static_cast<int64_t>(storage_shape.size()), const_cast<void*>(t.data()));
}

}  // namespace infini::ops::ascend

#endif  // INFINI_OPS_ASCEND_COMMON_H_
61 changes: 61 additions & 0 deletions src/ascend/data_type_.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#ifndef INFINI_OPS_ASCEND_DATA_TYPE__H_
#define INFINI_OPS_ASCEND_DATA_TYPE__H_

#include <cassert>

#include "acl/acl.h"
#include "ascend/device_.h"
#include "data_type.h"

namespace infini::ops::ascend {

// Map an InfiniOps DataType to the corresponding ACL dtype enumerator.
//
// NOTE(review): `assert` is compiled out under NDEBUG, so release builds
// silently return ACL_DT_UNDEFINED for unsupported dtypes — confirm that
// callers handle that value (or fail fast upstream).
inline aclDataType toAclDtype(DataType dt) {
  switch (dt) {
    case DataType::kFloat16:
      return ACL_FLOAT16;
    case DataType::kBFloat16:
      return ACL_BF16;
    case DataType::kFloat32:
      return ACL_FLOAT;
    case DataType::kInt8:
      return ACL_INT8;
    case DataType::kInt16:
      return ACL_INT16;
    case DataType::kInt32:
      return ACL_INT32;
    case DataType::kInt64:
      return ACL_INT64;
    case DataType::kUInt8:
      return ACL_UINT8;
    case DataType::kUInt16:
      return ACL_UINT16;
    case DataType::kUInt32:
      return ACL_UINT32;
    case DataType::kUInt64:
      return ACL_UINT64;
    default:
      assert(false && "unsupported dtype for Ascend backend");
      return ACL_DT_UNDEFINED;
  }
}

// Reports whether `dt` is one of the fixed-width integer dtypes, signed or
// unsigned. Floating-point and any other values yield false.
inline bool isIntegerDtype(DataType dt) {
  // Flat membership test over the eight integer enumerators.
  return dt == DataType::kInt8 || dt == DataType::kInt16 ||
         dt == DataType::kInt32 || dt == DataType::kInt64 ||
         dt == DataType::kUInt8 || dt == DataType::kUInt16 ||
         dt == DataType::kUInt32 || dt == DataType::kUInt64;
}

} // namespace infini::ops::ascend

#endif
16 changes: 16 additions & 0 deletions src/ascend/device_.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef INFINI_OPS_ASCEND_DEVICE__H_
#define INFINI_OPS_ASCEND_DEVICE__H_

// NOTE: Cannot use `#include "device.h"` here — GCC resolves quoted includes
// relative to the current file first, and `src/ascend/` used to contain a
// `device.h`. Use `data_type.h` which transitively pulls in `src/device.h`.
#include "data_type.h"

namespace infini::ops {

// Compile-time switch marking the Ascend backend as compiled in.
// NOTE(review): presumably the primary DeviceEnabled template defaults to
// std::false_type and each backend opts in via a specialization like this —
// confirm in src/device.h.
template <>
struct DeviceEnabled<Device::Type::kAscend> : std::true_type {};

}  // namespace infini::ops

#endif  // INFINI_OPS_ASCEND_DEVICE__H_
Loading