Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .ci/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`/`cnm
| `--stage` | Run only the specified stage |
| `--image-tag` | Override image tag |
| `--gpu-id` | Override GPU device IDs (nvidia via `--gpus`, others via `CUDA_VISIBLE_DEVICES`) |
| `--test` | Override pytest test path (e.g., `tests/test_gemm.py::test_gemm`) |
| `--test` | Replace stage command entirely (e.g., `pytest tests/test_add.py -v`) |
| `--results-dir` | Host directory mounted to `/workspace/results` inside the container |
| `--local` | Mount current directory (read-only) instead of cloning from git |
| `--dry-run` | Print docker command without executing |
Expand Down Expand Up @@ -195,7 +195,7 @@ Proxy vars are forwarded from the host. Test results are written to `--results-d
| MetaX | `--privileged` | `none` | `maca-pytorch:3.2.1.4-...` | `mx-smi` |
| Moore | `--privileged` | `none` | `vllm_musa:20251112_hygon` | `mthreads-gmi` |
| Cambricon | `--privileged` | `mlu` | `cambricon/pytorch:v1.25.3` | `cnmon` |
| Ascend | TODO | — | `ascend-pytorch:24.0.0` | |
| Ascend | `--privileged` + device mounts | `npu` | `ascend-pytorch:24.0.RC3-A2-2.1.0` | `npu-smi` |

`gpu_style` controls the Docker device injection mechanism: `nvidia` uses `--gpus`, `none` uses `CUDA_VISIBLE_DEVICES` (or skips injection for Moore), `mlu` uses `MLU_VISIBLE_DEVICES`, and `npu` uses `ASCEND_VISIBLE_DEVICES`.

Expand Down
72 changes: 72 additions & 0 deletions .ci/ci_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
GPU_STYLE_NVIDIA = "nvidia"
GPU_STYLE_NONE = "none"
GPU_STYLE_MLU = "mlu"
GPU_STYLE_NPU = "npu"


@dataclass
Expand Down Expand Up @@ -44,6 +45,7 @@ class ResourcePool:
"metax": "mx-smi",
"moore": "mthreads-gmi",
"cambricon": "cnmon",
"ascend": "npu-smi",
}

def __init__(self, platform, utilization_threshold=10):
Expand Down Expand Up @@ -72,6 +74,9 @@ def detect_gpus(self) -> list[GpuInfo]:
if self._platform == "cambricon":
return self._detect_gpus_cambricon()

if self._platform == "ascend":
return self._detect_gpus_ascend()

tool = self.GPU_QUERY_TOOLS.get(self._platform)

if not tool:
Expand Down Expand Up @@ -325,6 +330,73 @@ def _detect_gpus_cambricon(self) -> list[GpuInfo]:

return sorted(gpus, key=operator.attrgetter("index"))

def _detect_gpus_ascend(self) -> list[GpuInfo]:
"""Parse npu-smi info output for Huawei Ascend NPUs.

Output format (pipe-delimited table, two rows per NPU):
| 0 910B4 | OK | 86.5 41 ...
| 0 | 0000:C1:00.0 | 0 0 / 0 2789 / 32768 |
Row 1: index, name, health, power, temp, hugepages.
Row 2: chip_id, bus_id, aicore_util, memory_usage, hbm_usage.
"""
try:
result = subprocess.run(
["npu-smi", "info"],
capture_output=True,
text=True,
timeout=10,
)
except (FileNotFoundError, subprocess.TimeoutExpired):
return []

if result.returncode != 0:
return []

gpus = []
lines = result.stdout.splitlines()
i = 0

while i < len(lines):
line = lines[i]
# Match row 1: "| {index} {name} ..."
m1 = re.match(r"^\|\s+(\d+)\s+", line)

if m1 and i + 1 < len(lines):
try:
npu_index = int(m1.group(1))
aicore_m = re.match(
r"^\|\s+\d+\s+\|\s+[\da-f:.]+\s+\|\s*([\d.]+)\s", lines[i + 1]
)

util_pct = float(aicore_m.group(1)) if aicore_m else 0.0

# Parse HBM usage from row 2: "{used} / {total}".
hbm_m = re.search(r"([\d.]+)\s*/\s*([\d.]+)", lines[i + 1])

if hbm_m:
used_mb = float(hbm_m.group(1))
total_mb = float(hbm_m.group(2))
else:
used_mb, total_mb = 0.0, 0.0

gpus.append(
GpuInfo(
index=npu_index,
memory_used_mb=used_mb,
memory_total_mb=total_mb,
utilization_pct=util_pct,
)
)
except (ValueError, AttributeError):
pass

i += 2
continue

i += 1

return sorted(gpus, key=operator.attrgetter("index"))

def detect_system_resources(self) -> SystemResources:
"""Read system memory from /proc/meminfo and CPU count."""
total_mb = 0.0
Expand Down
32 changes: 28 additions & 4 deletions .ci/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,34 @@ platforms:
- name: test
run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml

ascend: # TODO: Ascend image is not ready yet
ascend:
image:
dockerfile: .ci/images/ascend/
build_args:
BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0
private_sdk:
source_env: PRIVATE_SDK_URL
BASE_IMAGE: quay.io/ascend/vllm-ascend:v0.18.0rc1-openeuler
PIP_INDEX_URL: https://pypi.org/simple
docker_args:
- "--runtime=runc"
- "--privileged"
- "--device=/dev/davinci0"
- "--device=/dev/davinci_manager"
- "--device=/dev/devmm_svm"
- "--device=/dev/hisi_hdc"
volumes:
- /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro
- /usr/local/dcmi:/usr/local/dcmi:ro
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro
env:
ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest
setup: pip install .[dev] --no-build-isolation
jobs:
npu:
resources:
gpu_ids: "0"
gpu_style: npu
memory: 32GB
shm_size: 16g
timeout: 3600
stages:
- name: test
run: pytest tests/ -n 1 -k npu -v --tb=short --junitxml=/workspace/results/test-results.xml
34 changes: 13 additions & 21 deletions .ci/images/ascend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

ENV DEBIAN_FRONTEND=noninteractive
USER root

ARG HTTP_PROXY
ARG HTTPS_PROXY
Expand All @@ -10,30 +10,22 @@ ARG http_proxy
ARG https_proxy
ARG no_proxy

RUN apt-get update && \
apt-get install -y --no-install-recommends \
git \
cmake \
ninja-build \
coreutils \
curl \
libclang-dev \
&& rm -rf /var/lib/apt/lists/*

ARG PRIVATE_SDK_URL
RUN if [ -n "$PRIVATE_SDK_URL" ]; then \
curl -fSL "$PRIVATE_SDK_URL" -o /tmp/sdk.run && \
chmod +x /tmp/sdk.run && /tmp/sdk.run --quiet && \
rm /tmp/sdk.run; \
fi

RUN pip install --no-cache-dir \
ARG PIP_INDEX_URL
RUN pip install --no-cache-dir --progress-bar off \
${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \
libclang \
ninja \
scikit-build-core \
pybind11 \
libclang \
pytest \
pytest-cov \
pytest-xdist \
pyyaml
ruff==0.15.7

# Pin pre-installed torch to prevent pip from replacing it.
RUN pip show torch >/dev/null 2>&1 && \
echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \
touch /etc/pip-constraints.txt
ENV PIP_CONSTRAINT=/etc/pip-constraints.txt

WORKDIR /workspace
45 changes: 10 additions & 35 deletions .ci/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,47 +13,19 @@
GPU_STYLE_NVIDIA,
GPU_STYLE_NONE,
GPU_STYLE_MLU,
GPU_STYLE_NPU,
ResourcePool,
detect_platform,
)
from utils import get_git_commit, load_config

# Flags that consume the next token as their value (e.g. -n 4, -k expr).
_PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"}
def apply_test_override(run_cmd, test_cmd):
"""Replace a stage command with *test_cmd*.


def apply_test_override(run_cmd, test_path):
"""Replace positional test path(s) in a pytest stage command.

For example: ``pytest tests/ -n 4 ...`` becomes
``pytest tests/test_gemm.py -n 4 ...`` when ``test_path`` is
``tests/test_gemm.py``.
``--test`` always replaces the entire stage command regardless of whether
the original is pytest or something else.
"""
parts = shlex.split(run_cmd)

if not parts or parts[0] != "pytest":
return run_cmd

result = ["pytest", test_path]
skip_next = False

for p in parts[1:]:
if skip_next:
result.append(p)
skip_next = False
continue

if p.startswith("-"):
result.append(p)
if p in _PYTEST_VALUE_FLAGS:
skip_next = True
continue

# Skip existing test paths; the override is already in result[1].
if not ("/" in p or p.endswith(".py") or "::" in p):
result.append(p)

return shlex.join(result)
return test_cmd


def build_results_dir(base, platform, stages, commit):
Expand Down Expand Up @@ -212,6 +184,9 @@ def build_docker_args(
# For Cambricon MLU platforms that use --privileged,
# control visible devices via MLU_VISIBLE_DEVICES.
args.extend(["-e", f"MLU_VISIBLE_DEVICES={gpu_id}"])
elif gpu_style == GPU_STYLE_NPU and gpu_id and gpu_id != "all":
# Ascend: control visible NPU via ASCEND_VISIBLE_DEVICES.
args.extend(["-e", f"ASCEND_VISIBLE_DEVICES={gpu_id}"])

memory = resources.get("memory")

Expand Down Expand Up @@ -315,7 +290,7 @@ def main():
parser.add_argument(
"--test",
type=str,
help='Override pytest test path, e.g. "tests/test_gemm.py" or "tests/test_gemm.py::test_gemm"',
help='Replace stage command with this (e.g. "pytest tests/test_add.py -v")',
)
parser.add_argument(
"--local",
Expand Down
1 change: 1 addition & 0 deletions .ci/tests/test_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def test_detect_system_resources(monkeypatch, tmp_path):
"MemAvailable: 20000000 kB\n"
)


_real_open = open

def fake_open(path, **kw):
Expand Down
33 changes: 33 additions & 0 deletions .ci/tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,3 +296,36 @@ def test_build_results_dir_under_base():
stages = [{"name": "test", "run": "pytest"}]
d = run.build_results_dir("/tmp/my-results", "ascend", stages, "def5678")
assert d.parent == Path("/tmp/my-results")


# ---------------------------------------------------------------------------
# Tests for `apply_test_override`.
# ---------------------------------------------------------------------------


def test_apply_test_override_replaces_pytest_command():
    # A pytest stage command is replaced wholesale by the override.
    override = "pytest tests/test_add.py"
    assert run.apply_test_override("pytest tests/ -v", override) == override


def test_apply_test_override_replaces_non_pytest_command():
    # Non-pytest stage commands are replaced just the same.
    result = run.apply_test_override("ruff check .", "python docs/repro.py")
    assert result == "python docs/repro.py"


def test_apply_test_override_replaces_empty_command():
    # Even an empty stage command is swapped for the override.
    replaced = run.apply_test_override("", "bash script.sh")
    assert replaced == "bash script.sh"


def test_apply_test_override_preserves_user_flags():
    # Flags supplied by the user survive untouched in the override.
    override = "pytest tests/test_gemm.py -n 1 -v --tb=short"
    assert run.apply_test_override("pytest tests/ -n 4", override) == override


def test_apply_test_override_with_shell_command():
    # Compound shell commands (cd && python ...) pass through verbatim.
    replacement = "cd /tmp && python repro.py"
    assert run.apply_test_override("pytest tests/", replacement) == replacement
58 changes: 58 additions & 0 deletions src/ascend/add/kernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#ifndef INFINI_OPS_ASCEND_ADD_KERNEL_H_
#define INFINI_OPS_ASCEND_ADD_KERNEL_H_

#include "acl/acl.h"
#include "aclnn/aclnn_base.h"
#include "aclnn_add.h"
#include "ascend/common.h"
#include "ascend/workspace_pool_.h"
#include "base/add.h"
#include "data_type.h"
#include "operator.h"

namespace infini::ops {

template <>
class Operator<Add, Device::Type::kAscend> : public Add {
 public:
  // Builds the aclnn "alpha" scalar used by aclnnAdd; alpha is fixed at 1 so
  // the launch computes a plain element-wise add.
  //
  // aclCreateScalar stores the pointer rather than copying the value, so
  // alpha_storage_* must remain alive for the lifetime of alpha_.
  // The alpha scalar type must match the tensor dtype: use int64 for integer
  // dtypes and float for floating-point dtypes.
  Operator(const Tensor input, const Tensor other, Tensor out)
      : Add(input, other, out) {
    if (ascend::isIntegerDtype(input.dtype())) {
      alpha_ = aclCreateScalar(&alpha_int_storage_, ACL_INT64);
    } else {
      alpha_ = aclCreateScalar(&alpha_float_storage_, ACL_FLOAT);
    }
  }

  // Rule of five: the destructor releases alpha_, so a copied or moved
  // instance would double-destroy the scalar — and the copy's alpha_ would
  // keep pointing into the *other* object's storage members. Forbid both.
  Operator(const Operator&) = delete;
  Operator& operator=(const Operator&) = delete;
  Operator(Operator&&) = delete;
  Operator& operator=(Operator&&) = delete;

  ~Operator() { aclDestroyScalar(alpha_); }

  // Enqueues aclnnAdd on stream_ for out = input + other, sizing the
  // workspace via the shared workspace pool.
  void operator()(const Tensor input, const Tensor other,
                  Tensor out) const override {
    auto stream = static_cast<aclrtStream>(stream_);
    auto t_in = ascend::buildAclTensor(input);
    auto t_oth = ascend::buildAclTensor(other);
    auto t_out = ascend::buildAclTensor(out);
    uint64_t ws_needed = 0;
    aclOpExecutor* executor = nullptr;
    // NOTE(review): aclnn return codes are ignored here — confirm the
    // project's error-propagation convention before adding checks.
    aclnnAddGetWorkspaceSize(t_in, t_oth, alpha_, t_out, &ws_needed, &executor);
    auto& arena = ascend::workspacePool().ensure(stream, ws_needed);
    aclnnAdd(arena.buf, ws_needed, executor, stream);
    aclDestroyTensor(t_in);
    aclDestroyTensor(t_oth);
    aclDestroyTensor(t_out);
  }

 private:
  float alpha_float_storage_ =
      1.0f;  // stable address for aclCreateScalar (float)
  int64_t alpha_int_storage_ = 1;  // stable address for aclCreateScalar (int)
  aclScalar* alpha_ = nullptr;
};

} // namespace infini::ops

#endif
Loading