Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .ci/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`/`cnm
| `--stage` | Run only the specified stage |
| `--image-tag` | Override image tag |
| `--gpu-id` | Override GPU device IDs (nvidia via `--gpus`, others via `CUDA_VISIBLE_DEVICES`) |
| `--test` | Override pytest test path (e.g., `tests/test_gemm.py::test_gemm`) |
| `--test` | Replace stage command entirely (e.g., `pytest tests/test_add.py -v`) |
| `--results-dir` | Host directory mounted to `/workspace/results` inside the container |
| `--local` | Mount current directory (read-only) instead of cloning from git |
| `--dry-run` | Print docker command without executing |
Expand Down Expand Up @@ -195,7 +195,7 @@ Proxy vars are forwarded from the host. Test results are written to `--results-d
| MetaX | `--privileged` | `none` | `maca-pytorch:3.2.1.4-...` | `mx-smi` |
| Moore | `--privileged` | `none` | `vllm_musa:20251112_hygon` | `mthreads-gmi` |
| Cambricon | `--privileged` | `mlu` | `cambricon/pytorch:v1.25.3` | `cnmon` |
| Ascend | TODO | — | `ascend-pytorch:24.0.0` | |
| Ascend | `--privileged` + device mounts | `npu` | `ascend-pytorch:24.0.RC3-A2-2.1.0` | `npu-smi` |

`gpu_style` controls the Docker device injection mechanism: `nvidia` uses `--gpus`, `none` uses `CUDA_VISIBLE_DEVICES` (or skips injection for Moore), `mlu` uses `MLU_VISIBLE_DEVICES`, and `npu` uses `ASCEND_VISIBLE_DEVICES`.

Expand Down
72 changes: 72 additions & 0 deletions .ci/ci_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
GPU_STYLE_NVIDIA = "nvidia"
GPU_STYLE_NONE = "none"
GPU_STYLE_MLU = "mlu"
GPU_STYLE_NPU = "npu"


@dataclass
Expand Down Expand Up @@ -44,6 +45,7 @@ class ResourcePool:
"metax": "mx-smi",
"moore": "mthreads-gmi",
"cambricon": "cnmon",
"ascend": "npu-smi",
}

def __init__(self, platform, utilization_threshold=10):
Expand Down Expand Up @@ -72,6 +74,9 @@ def detect_gpus(self) -> list[GpuInfo]:
if self._platform == "cambricon":
return self._detect_gpus_cambricon()

if self._platform == "ascend":
return self._detect_gpus_ascend()

tool = self.GPU_QUERY_TOOLS.get(self._platform)

if not tool:
Expand Down Expand Up @@ -325,6 +330,73 @@ def _detect_gpus_cambricon(self) -> list[GpuInfo]:

return sorted(gpus, key=operator.attrgetter("index"))

def _detect_gpus_ascend(self) -> list[GpuInfo]:
"""Parse npu-smi info output for Huawei Ascend NPUs.

Output format (pipe-delimited table, two rows per NPU):
| 0 910B4 | OK | 86.5 41 ...
| 0 | 0000:C1:00.0 | 0 0 / 0 2789 / 32768 |
Row 1: index, name, health, power, temp, hugepages.
Row 2: chip_id, bus_id, aicore_util, memory_usage, hbm_usage.
"""
try:
result = subprocess.run(
["npu-smi", "info"],
capture_output=True,
text=True,
timeout=10,
)
except (FileNotFoundError, subprocess.TimeoutExpired):
return []

if result.returncode != 0:
return []

gpus = []
lines = result.stdout.splitlines()
i = 0

while i < len(lines):
line = lines[i]
# Match row 1: "| {index} {name} ..."
m1 = re.match(r"^\|\s+(\d+)\s+", line)

if m1 and i + 1 < len(lines):
try:
npu_index = int(m1.group(1))
aicore_m = re.match(
r"^\|\s+\d+\s+\|\s+[\da-f:.]+\s+\|\s*([\d.]+)\s", lines[i + 1]
)

util_pct = float(aicore_m.group(1)) if aicore_m else 0.0

# Parse HBM usage from row 2: "{used} / {total}".
hbm_m = re.search(r"([\d.]+)\s*/\s*([\d.]+)", lines[i + 1])

if hbm_m:
used_mb = float(hbm_m.group(1))
total_mb = float(hbm_m.group(2))
else:
used_mb, total_mb = 0.0, 0.0

gpus.append(
GpuInfo(
index=npu_index,
memory_used_mb=used_mb,
memory_total_mb=total_mb,
utilization_pct=util_pct,
)
)
except (ValueError, AttributeError):
pass

i += 2
continue

i += 1

return sorted(gpus, key=operator.attrgetter("index"))

def detect_system_resources(self) -> SystemResources:
"""Read system memory from /proc/meminfo and CPU count."""
total_mb = 0.0
Expand Down
32 changes: 28 additions & 4 deletions .ci/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,34 @@ platforms:
- name: test
run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml

ascend: # TODO: Ascend image is not ready yet
ascend:
image:
dockerfile: .ci/images/ascend/
build_args:
BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0
private_sdk:
source_env: PRIVATE_SDK_URL
BASE_IMAGE: quay.io/ascend/vllm-ascend:v0.18.0rc1-openeuler
PIP_INDEX_URL: https://pypi.org/simple
docker_args:
- "--runtime=runc"
- "--privileged"
- "--device=/dev/davinci0"
- "--device=/dev/davinci_manager"
- "--device=/dev/devmm_svm"
- "--device=/dev/hisi_hdc"
volumes:
- /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro
- /usr/local/dcmi:/usr/local/dcmi:ro
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro
env:
ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest
setup: pip install .[dev] --no-build-isolation
jobs:
npu:
resources:
gpu_ids: "0"
gpu_style: npu
memory: 32GB
shm_size: 16g
timeout: 3600
stages:
- name: test
run: pytest tests/ -n 1 -k npu -v --tb=short --junitxml=/workspace/results/test-results.xml
34 changes: 13 additions & 21 deletions .ci/images/ascend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

ENV DEBIAN_FRONTEND=noninteractive
USER root

ARG HTTP_PROXY
ARG HTTPS_PROXY
Expand All @@ -10,30 +10,22 @@ ARG http_proxy
ARG https_proxy
ARG no_proxy

RUN apt-get update && \
apt-get install -y --no-install-recommends \
git \
cmake \
ninja-build \
coreutils \
curl \
libclang-dev \
&& rm -rf /var/lib/apt/lists/*

ARG PRIVATE_SDK_URL
RUN if [ -n "$PRIVATE_SDK_URL" ]; then \
curl -fSL "$PRIVATE_SDK_URL" -o /tmp/sdk.run && \
chmod +x /tmp/sdk.run && /tmp/sdk.run --quiet && \
rm /tmp/sdk.run; \
fi

RUN pip install --no-cache-dir \
ARG PIP_INDEX_URL
RUN pip install --no-cache-dir --progress-bar off \
${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \
libclang \
ninja \
scikit-build-core \
pybind11 \
libclang \
pytest \
pytest-cov \
pytest-xdist \
pyyaml
ruff==0.15.7

# Pin pre-installed torch to prevent pip from replacing it.
RUN pip show torch >/dev/null 2>&1 && \
echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \
touch /etc/pip-constraints.txt
ENV PIP_CONSTRAINT=/etc/pip-constraints.txt

WORKDIR /workspace
45 changes: 10 additions & 35 deletions .ci/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,47 +13,19 @@
GPU_STYLE_NVIDIA,
GPU_STYLE_NONE,
GPU_STYLE_MLU,
GPU_STYLE_NPU,
ResourcePool,
detect_platform,
)
from utils import get_git_commit, load_config

# Flags that consume the next token as their value (e.g. -n 4, -k expr).
_PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"}
def apply_test_override(run_cmd, test_cmd):
"""Replace a stage command with *test_cmd*.


def apply_test_override(run_cmd, test_path):
"""Replace positional test path(s) in a pytest stage command.

For example: ``pytest tests/ -n 4 ...`` becomes
``pytest tests/test_gemm.py -n 4 ...`` when ``test_path`` is
``tests/test_gemm.py``.
``--test`` always replaces the entire stage command regardless of whether
the original is pytest or something else.
"""
parts = shlex.split(run_cmd)

if not parts or parts[0] != "pytest":
return run_cmd

result = ["pytest", test_path]
skip_next = False

for p in parts[1:]:
if skip_next:
result.append(p)
skip_next = False
continue

if p.startswith("-"):
result.append(p)
if p in _PYTEST_VALUE_FLAGS:
skip_next = True
continue

# Skip existing test paths; the override is already in result[1].
if not ("/" in p or p.endswith(".py") or "::" in p):
result.append(p)

return shlex.join(result)
return test_cmd


def build_results_dir(base, platform, stages, commit):
Expand Down Expand Up @@ -212,6 +184,9 @@ def build_docker_args(
# For Cambricon MLU platforms that use --privileged,
# control visible devices via MLU_VISIBLE_DEVICES.
args.extend(["-e", f"MLU_VISIBLE_DEVICES={gpu_id}"])
elif gpu_style == GPU_STYLE_NPU and gpu_id and gpu_id != "all":
# Ascend: control visible NPU via ASCEND_VISIBLE_DEVICES.
args.extend(["-e", f"ASCEND_VISIBLE_DEVICES={gpu_id}"])

memory = resources.get("memory")

Expand Down Expand Up @@ -315,7 +290,7 @@ def main():
parser.add_argument(
"--test",
type=str,
help='Override pytest test path, e.g. "tests/test_gemm.py" or "tests/test_gemm.py::test_gemm"',
help='Replace stage command with this (e.g. "pytest tests/test_add.py -v")',
)
parser.add_argument(
"--local",
Expand Down
1 change: 1 addition & 0 deletions .ci/tests/test_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def test_detect_system_resources(monkeypatch, tmp_path):
"MemAvailable: 20000000 kB\n"
)


_real_open = open

def fake_open(path, **kw):
Expand Down
33 changes: 33 additions & 0 deletions .ci/tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,3 +296,36 @@ def test_build_results_dir_under_base():
stages = [{"name": "test", "run": "pytest"}]
d = run.build_results_dir("/tmp/my-results", "ascend", stages, "def5678")
assert d.parent == Path("/tmp/my-results")


# ---------------------------------------------------------------------------
# Tests for `apply_test_override`.
# ---------------------------------------------------------------------------


def test_apply_test_override_replaces_pytest_command():
    # A pytest stage command is replaced wholesale by the override.
    override = "pytest tests/test_add.py"
    assert run.apply_test_override("pytest tests/ -v", override) == override


def test_apply_test_override_replaces_non_pytest_command():
    # Non-pytest stage commands are replaced just the same.
    result = run.apply_test_override("ruff check .", "python docs/repro.py")
    assert result == "python docs/repro.py"


def test_apply_test_override_replaces_empty_command():
    # Even an empty stage command is swapped for the override.
    replaced = run.apply_test_override("", "bash script.sh")
    assert replaced == "bash script.sh"


def test_apply_test_override_preserves_user_flags():
    # Flags supplied by the user survive untouched in the override.
    override = "pytest tests/test_gemm.py -n 1 -v --tb=short"
    assert run.apply_test_override("pytest tests/ -n 4", override) == override


def test_apply_test_override_with_shell_command():
    # Compound shell commands (cd && python ...) pass through verbatim.
    replacement = "cd /tmp && python repro.py"
    assert run.apply_test_override("pytest tests/", replacement) == replacement
58 changes: 58 additions & 0 deletions src/ascend/add/kernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#ifndef INFINI_OPS_ASCEND_ADD_KERNEL_H_
#define INFINI_OPS_ASCEND_ADD_KERNEL_H_

#include "acl/acl.h"
#include "aclnn/aclnn_base.h"
#include "aclnn_add.h"
#include "ascend/common.h"
#include "ascend/workspace_pool_.h"
#include "base/add.h"
#include "data_type.h"
#include "operator.h"

namespace infini::ops {

template <>
class Operator<Add, Device::Type::kAscend> : public Add {
 public:
  // Builds the aclnn "alpha" scalar used by aclnnAdd; alpha is fixed at 1 so
  // the launch computes a plain element-wise add.
  //
  // aclCreateScalar stores the pointer rather than copying the value, so
  // alpha_storage_* must remain alive for the lifetime of alpha_.
  // The alpha scalar type must match the tensor dtype: use int64 for integer
  // dtypes and float for floating-point dtypes.
  Operator(const Tensor input, const Tensor other, Tensor out)
      : Add(input, other, out) {
    if (ascend::isIntegerDtype(input.dtype())) {
      alpha_ = aclCreateScalar(&alpha_int_storage_, ACL_INT64);
    } else {
      alpha_ = aclCreateScalar(&alpha_float_storage_, ACL_FLOAT);
    }
  }

  // Rule of five: the destructor releases alpha_, so a copied or moved
  // instance would double-destroy the scalar — and the copy's alpha_ would
  // keep pointing into the *other* object's storage members. Forbid both.
  Operator(const Operator&) = delete;
  Operator& operator=(const Operator&) = delete;
  Operator(Operator&&) = delete;
  Operator& operator=(Operator&&) = delete;

  ~Operator() { aclDestroyScalar(alpha_); }

  // Enqueues aclnnAdd on stream_ for out = input + other, sizing the
  // workspace via the shared workspace pool.
  void operator()(const Tensor input, const Tensor other,
                  Tensor out) const override {
    auto stream = static_cast<aclrtStream>(stream_);
    auto t_in = ascend::buildAclTensor(input);
    auto t_oth = ascend::buildAclTensor(other);
    auto t_out = ascend::buildAclTensor(out);
    uint64_t ws_needed = 0;
    aclOpExecutor* executor = nullptr;
    // NOTE(review): aclnn return codes are ignored here — confirm the
    // project's error-propagation convention before adding checks.
    aclnnAddGetWorkspaceSize(t_in, t_oth, alpha_, t_out, &ws_needed, &executor);
    auto& arena = ascend::workspacePool().ensure(stream, ws_needed);
    aclnnAdd(arena.buf, ws_needed, executor, stream);
    aclDestroyTensor(t_in);
    aclDestroyTensor(t_oth);
    aclDestroyTensor(t_out);
  }

 private:
  float alpha_float_storage_ =
      1.0f;  // stable address for aclCreateScalar (float)
  int64_t alpha_int_storage_ = 1;  // stable address for aclCreateScalar (int)
  aclScalar* alpha_ = nullptr;
};

} // namespace infini::ops

#endif
Loading