# Workflow file for run: "test: add qwen3 decode A3/A5 PTO cases" (#1450)
name: CI

on:
  push:
  pull_request:
  # Nightly remote-board validation (GitHub cron is UTC).
  # 22:00 CST (UTC+8) == 14:00 UTC.
  schedule:
    - cron: "0 14 * * *"
  workflow_dispatch:
    inputs:
      stage:
        description: "Validation stage (build|run)"
        type: choice
        options: [build, run]
        default: run
      run_mode:
        description: "Run mode passed to generated CMake (sim|npu)"
        type: choice
        options: [npu, sim]
        default: npu
      soc_version:
        description: "Ascend SoC version (e.g. Ascend910B1)"
        type: string
        default: Ascend910
      device_id:
        description: "aclrtSetDevice device id"
        type: string
        # NOTE: On our shared remote NPU host, device 0/1 may be unstable or
        # occupied. Default to a higher id to reduce flakiness for scheduled
        # runs. Override in workflow_dispatch if needed.
        default: "2"
      skip_cases:
        description: "Comma/space separated testcase names to skip (e.g. scatter,mrgsort)"
        type: string
        default: "mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp"
      run_only_cases:
        description: "Comma/space separated testcase names to run (empty = run all)"
        type: string
        default: ""
      pto_isa_repo:
        description: "pto-isa repo URL on remote"
        type: string
        default: "https://gitcode.com/cann/pto-isa.git"
      pto_isa_commit:
        description: "pto-isa ref (commit/tag/branch; empty = repo-pinned weekly commit)"
        type: string
        # NOTE: Pin a known-good GitCode commit for deterministic runs.
        # Quoted so an all-digit SHA can never be read as a number.
        default: "662d7f2a916d6bbde3109ce4a16ed5c28f5d900a"
      remote_host:
        description: "SSH host/IP for the NPU machine"
        type: string
        default: "101.245.68.6"
      remote_user:
        description: "SSH user for the NPU machine"
        type: string
        default: zhongxuan
      remote_port:
        description: "SSH port"
        type: string
        default: "22"

# Least-privilege token: `actions: write` is needed for artifact handling
# between the build and remote-validation jobs.
permissions:
  contents: read
  actions: write
  pull-requests: read
jobs:
  # Fast lint gate for pushes/PRs: verifies license headers on changed files.
  license-header-check:
    if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }}
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # For PRs, check out the head repo/sha so fork branches are checked.
          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
          ref: ${{ github.event.pull_request.head.sha || github.sha }}
          # Full history: the checker diffs base..head to find changed files.
          fetch-depth: 0
          persist-credentials: false
      - name: Check PR386 license headers
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          python3 .github/scripts/check_license_headers.py \
            --repo "${{ github.repository }}" \
            --event-name "${{ github.event_name }}" \
            --pr-number "${{ github.event.pull_request.number || '' }}" \
            --base-sha "${{ github.event.pull_request.base.sha || github.event.before || '' }}" \
            --head-sha "${{ github.event.pull_request.head.sha || github.sha }}" \
            --github-token "${GITHUB_TOKEN}"
build-and-test:
runs-on: ubuntu-22.04
env:
LLVM_COMMIT: cd708029e0b2869e80abe31ddb175f7c35361f90
LLVM_DIR: ${{ github.workspace }}/llvm-project/llvm/build-shared
PTO_INSTALL_DIR: ${{ github.workspace }}/install
MLIR_PYTHONPATH: ${{ github.workspace }}/llvm-project/llvm/build-shared/tools/mlir/python_packages/mlir_core
steps:
- name: Checkout
uses: actions/checkout@v4
with:
repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
ref: ${{ github.event.pull_request.head.sha || github.sha }}
fetch-depth: 1
persist-credentials: false
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y \
cmake git ninja-build \
python3 python3-pip python3-venv \
clang lld \
libedit-dev zlib1g-dev libxml2-dev libzstd-dev
python3 -m pip install --upgrade pip
# LLVM/MLIR Python bindings are not yet compatible with pybind11 3.x.
python3 -m pip install 'pybind11<3' numpy
- name: Define payload paths
shell: bash
run: |
set -euo pipefail
# NOTE: Some GitHub/GHES versions don't allow using `${{ runner.temp }}` in
# `jobs.<job>.env`. Use the runtime env var instead.
tmp_root="${RUNNER_TEMP:-${GITHUB_WORKSPACE}/.tmp}"
echo "PAYLOAD_DIR=${tmp_root}/ptoas_payload" >> "${GITHUB_ENV}"
echo "PAYLOAD_TGZ=${tmp_root}/ptoas_payload.tgz" >> "${GITHUB_ENV}"
- name: Prepare payload dir
shell: bash
run: |
set -euo pipefail
rm -rf "${PAYLOAD_DIR}" "${PAYLOAD_TGZ}"
mkdir -p "${PAYLOAD_DIR}/test/samples"
mkdir -p "${PAYLOAD_DIR}/test/npu_validation/scripts"
mkdir -p "${PAYLOAD_DIR}/test/npu_validation/templates"
# 先恢复 LLVM build 缓存
- name: Restore LLVM build cache
id: cache-llvm
uses: actions/cache/restore@v4
with:
path: |
llvm-project/llvm/build-shared
key: llvm-${{ runner.os }}-${{ env.LLVM_COMMIT }}-shared-mlirpy
- name: Prepare LLVM source (no rebuild)
run: |
mkdir -p llvm-project
cd llvm-project
# 如果 restore 只带来了 build-shared,这里补一个 git repo
if [ ! -d .git ]; then
git init
git remote add origin https://github.com/llvm/llvm-project.git
fi
git fetch --depth 1 origin tag llvmorg-19.1.7
git checkout "${LLVM_COMMIT}"
- name: Build LLVM/MLIR (only if cache miss)
if: steps.cache-llvm.outputs.cache-hit != 'true'
run: |
cd llvm-project
cmake -G Ninja -S llvm -B llvm/build-shared \
-DLLVM_ENABLE_PROJECTS="mlir;clang" \
-DBUILD_SHARED_LIBS=ON \
-DMLIR_ENABLE_BINDINGS_PYTHON=ON \
-DPython3_EXECUTABLE=python3 \
-DCMAKE_BUILD_TYPE=Release \
-DLLVM_TARGETS_TO_BUILD="host"
ninja -C llvm/build-shared
# LLVM build 完成后立即保存缓存,避免后续测试影响缓存内容
- name: Save LLVM build cache
if: steps.cache-llvm.outputs.cache-hit != 'true'
uses: actions/cache/save@v4
with:
path: |
llvm-project/llvm/build-shared
key: llvm-${{ runner.os }}-${{ env.LLVM_COMMIT }}-shared-mlirpy
- name: Build PTOAS
run: |
export PYBIND11_CMAKE_DIR="$(python3 -m pybind11 --cmakedir)"
cmake -G Ninja -S . -B build \
-DLLVM_DIR="${LLVM_DIR}/lib/cmake/llvm" \
-DMLIR_DIR="${LLVM_DIR}/lib/cmake/mlir" \
-DPython3_EXECUTABLE=python3 \
-DPython3_FIND_STRATEGY=LOCATION \
-Dpybind11_DIR="${PYBIND11_CMAKE_DIR}" \
-DMLIR_ENABLE_BINDINGS_PYTHON=ON \
-DMLIR_PYTHON_PACKAGE_DIR="${LLVM_DIR}/tools/mlir/python_packages/mlir_core" \
-DCMAKE_INSTALL_PREFIX="${PTO_INSTALL_DIR}" \
-DCMAKE_BUILD_TYPE=Release
ninja -C build ptoas
ninja -C build ptobc
ninja -C build install
- name: Run sample tests (py -> pto -> cpp)
shell: bash
env:
PTOAS_BIN: ${{ github.workspace }}/build/tools/ptoas/ptoas
PYTHON_BIN: /usr/bin/python3
MLIR_PYTHON_ROOT: ${{ env.MLIR_PYTHONPATH }}
PTO_PYTHON_ROOT: ${{ env.PTO_INSTALL_DIR }}/
run: |
set -euo pipefail
export PYTHONPATH="${MLIR_PYTHON_ROOT}:${PTO_PYTHON_ROOT}:${PYTHONPATH:-}"
export LD_LIBRARY_PATH="${LLVM_DIR}/lib:${PTO_INSTALL_DIR}/lib:${LD_LIBRARY_PATH:-}"
export PTOAS_OUT_DIR="${PAYLOAD_DIR}/test/samples"
bash test/samples/runop.sh --enablebc all
- name: Build payload artifact
if: >-
${{
github.event_name == 'workflow_dispatch' ||
github.event_name == 'schedule'
}}
shell: bash
run: |
set -euo pipefail
cp test/npu_validation/scripts/generate_testcase.py "${PAYLOAD_DIR}/test/npu_validation/scripts/"
cp test/npu_validation/scripts/run_remote_npu_validation.sh "${PAYLOAD_DIR}/test/npu_validation/scripts/"
cp test/npu_validation/templates/* "${PAYLOAD_DIR}/test/npu_validation/templates/"
chmod +x "${PAYLOAD_DIR}/test/npu_validation/scripts/run_remote_npu_validation.sh"
tar -czf "${PAYLOAD_TGZ}" -C "${PAYLOAD_DIR}" .
- name: Upload payload artifact
if: >-
${{
github.event_name == 'workflow_dispatch' ||
github.event_name == 'schedule'
}}
uses: actions/upload-artifact@v4
with:
name: ptoas_payload
path: ${{ env.PAYLOAD_TGZ }}
if-no-files-found: error
remote-npu-validation:
needs: build-and-test
runs-on: ubuntu-22.04
timeout-minutes: 180
concurrency:
group: remote-npu-validation
cancel-in-progress: false
# Ordering: `needs: build-and-test` enforces "CI -> remote".
if: >-
${{
(github.event_name == 'workflow_dispatch' ||
github.event_name == 'schedule')
}}
env:
PAYLOAD_DOWNLOAD_DIR: ${{ github.workspace }}/_payload
PAYLOAD_TGZ: ${{ github.workspace }}/_payload/ptoas_payload.tgz
# Temporary CI gate: skip cases that still error/flap on the remote NPU.
# Update this list as we fix the underlying issues.
DEFAULT_SKIP_CASES: >-
mix_kernel,vadd_validshape,vadd_validshape_dynamic,print,storefp,Gemvmx
steps:
- name: Resolve validation parameters
shell: bash
env:
STAGE: ${{ github.event.inputs.stage || 'run' }}
RUN_MODE: ${{ github.event.inputs.run_mode || 'npu' }}
SOC_VERSION: ${{ github.event.inputs.soc_version || 'Ascend910' }}
DEVICE_ID: ${{ github.event.inputs.device_id || '2' }}
SKIP_CASES: ${{ github.event.inputs.skip_cases || '' }}
RUN_ONLY_CASES: ${{ github.event.inputs.run_only_cases || '' }}
PTO_ISA_REPO: ${{ github.event.inputs.pto_isa_repo || 'https://gitcode.com/cann/pto-isa.git' }}
PTO_ISA_COMMIT: ${{ github.event.inputs.pto_isa_commit || '662d7f2a916d6bbde3109ce4a16ed5c28f5d900a' }}
REMOTE_HOST: ${{ github.event.inputs.remote_host || '101.245.68.6' }}
REMOTE_USER: ${{ github.event.inputs.remote_user || 'zhongxuan' }}
REMOTE_PORT: ${{ github.event.inputs.remote_port || '22' }}
run: |
set -euo pipefail
# For scheduled runs, default to DEFAULT_SKIP_CASES (known-bad/flaky).
# For workflow_dispatch runs, honor the user's input (the UI default
# is pre-filled but can be edited to run everything).
if [[ "${GITHUB_EVENT_NAME}" != "workflow_dispatch" ]]; then
if [[ -z "${SKIP_CASES}" ]]; then
SKIP_CASES="${DEFAULT_SKIP_CASES}"
fi
fi
# Some validation samples are arch-specific due to stricter pto-isa
# static checks and A5-only tile layouts. Always skip the
# non-matching variant based on SOC_VERSION, even for explicit
# RUN_ONLY_CASES requests, so remote validation does not try to force
# A5-only cases through an A3 flow or vice versa.
A3_ONLY_CASES="partition5d,partition5d_dynamic,mrgsort,tmatmulk_autosync"
QWEN3_TILELET_A5_ONLY_CASES="$(printf 'qwen3_decode_layer_incore_%s,' {0..19})"
QWEN3_TILELET_A5_ONLY_CASES="${QWEN3_TILELET_A5_ONLY_CASES%,}"
A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5,tpack,${QWEN3_TILELET_A5_ONLY_CASES}"
sv_lc="$(printf '%s' "${SOC_VERSION}" | tr '[:upper:]' '[:lower:]')"
is_a5=0
if [[ "${sv_lc}" == *"950"* || "${sv_lc}" == *"a5"* ]]; then
is_a5=1
fi
if [[ ${is_a5} -eq 1 ]]; then
SKIP_CASES="${SKIP_CASES:+${SKIP_CASES},}${A3_ONLY_CASES}"
else
SKIP_CASES="${SKIP_CASES:+${SKIP_CASES},}${A5_ONLY_CASES}"
fi
echo "STAGE=${STAGE}" >> "${GITHUB_ENV}"
echo "RUN_MODE=${RUN_MODE}" >> "${GITHUB_ENV}"
echo "SOC_VERSION=${SOC_VERSION}" >> "${GITHUB_ENV}"
echo "DEVICE_ID=${DEVICE_ID}" >> "${GITHUB_ENV}"
echo "SKIP_CASES=${SKIP_CASES}" >> "${GITHUB_ENV}"
echo "RUN_ONLY_CASES=${RUN_ONLY_CASES}" >> "${GITHUB_ENV}"
echo "PTO_ISA_REPO=${PTO_ISA_REPO}" >> "${GITHUB_ENV}"
echo "PTO_ISA_COMMIT=${PTO_ISA_COMMIT}" >> "${GITHUB_ENV}"
echo "REMOTE_HOST=${REMOTE_HOST}" >> "${GITHUB_ENV}"
echo "REMOTE_USER=${REMOTE_USER}" >> "${GITHUB_ENV}"
echo "REMOTE_PORT=${REMOTE_PORT}" >> "${GITHUB_ENV}"
- name: Setup SSH
shell: bash
env:
SSH_KEY: ${{ secrets.SSH_KEY }}
SSH_KNOWN_HOSTS: ${{ secrets.SSH_KNOWN_HOSTS }}
run: |
set -euo pipefail
if [[ -z "${SSH_KEY}" ]]; then
echo "ERROR: secrets.SSH_KEY is not set"
exit 1
fi
if [[ -z "${SSH_KNOWN_HOSTS}" ]]; then
echo "ERROR: secrets.SSH_KNOWN_HOSTS is not set"
exit 1
fi
mkdir -p ~/.ssh
chmod 700 ~/.ssh
printf '%s\n' "${SSH_KEY}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
printf '%s\n' "${SSH_KNOWN_HOSTS}" > ~/.ssh/known_hosts
chmod 644 ~/.ssh/known_hosts
- name: Download payload artifact
uses: actions/download-artifact@v4
with:
name: ptoas_payload
path: ${{ env.PAYLOAD_DOWNLOAD_DIR }}
- name: Vendor pto-isa into payload (offline remote)
shell: bash
run: |
set -euo pipefail
tmp_root="${RUNNER_TEMP:-${GITHUB_WORKSPACE}/.tmp}"
work="$(mktemp -d "${tmp_root}/ptoas_payload_unpack.XXXXXX")"
tar -xzf "${PAYLOAD_TGZ}" -C "${work}"
rm -rf "${work}/pto-isa"
git clone "${PTO_ISA_REPO}" "${work}/pto-isa"
if [[ -n "${PTO_ISA_COMMIT}" ]]; then
git -C "${work}/pto-isa" checkout -f "${PTO_ISA_COMMIT}"
else
git -C "${work}/pto-isa" checkout -f origin/HEAD || true
fi
# Ship a working tree only; remote should not need outbound network.
rm -rf "${work}/pto-isa/.git"
tar -czf "${PAYLOAD_TGZ}" -C "${work}" .
- name: Copy payload to remote
shell: bash
run: |
set -euo pipefail
if [[ ! -f "${PAYLOAD_TGZ}" ]]; then
echo "ERROR: payload tarball not found: ${PAYLOAD_TGZ}"
exit 1
fi
REMOTE_DIR="/tmp/ptoas_npu_validation/${GITHUB_REPOSITORY}/${GITHUB_RUN_ID}"
echo "REMOTE_DIR=${REMOTE_DIR}" >> "${GITHUB_ENV}"
ssh -p "${REMOTE_PORT}" "${REMOTE_USER}@${REMOTE_HOST}" "rm -rf '${REMOTE_DIR}' && mkdir -p '${REMOTE_DIR}'"
scp -P "${REMOTE_PORT}" "${PAYLOAD_TGZ}" "${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}/payload.tgz"
- name: Run remote validation
shell: bash
run: |
set -euo pipefail
ssh -p "${REMOTE_PORT}" "${REMOTE_USER}@${REMOTE_HOST}" \
"set -euo pipefail; \
cd '${REMOTE_DIR}'; \
tar -xzf payload.tgz; \
STAGE='${STAGE}' RUN_MODE='${RUN_MODE}' SOC_VERSION='${SOC_VERSION}' PTO_ISA_REPO='${PTO_ISA_REPO}' PTO_ISA_COMMIT='${PTO_ISA_COMMIT}' DEVICE_ID='${DEVICE_ID}' SKIP_CASES='${SKIP_CASES}' RUN_ONLY_CASES='${RUN_ONLY_CASES}' \
bash ./test/npu_validation/scripts/run_remote_npu_validation.sh"
- name: Fetch results
if: always()
shell: bash
run: |
set -euo pipefail
if [[ -z "${REMOTE_DIR:-}" ]]; then
echo "REMOTE_DIR is not set; skipping results fetch."
exit 0
fi
scp -P "${REMOTE_PORT}" "${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}/remote_npu_validation_results.tsv" ./remote_npu_validation_results.tsv || true
- name: Upload results artifact
if: always()
uses: actions/upload-artifact@v4
with:
name: remote_npu_validation_results
path: remote_npu_validation_results.tsv
if-no-files-found: warn
- name: Cleanup remote
if: always()
shell: bash
run: |
set -euo pipefail
if [[ -n "${REMOTE_DIR:-}" ]]; then
ssh -p "${REMOTE_PORT}" "${REMOTE_USER}@${REMOTE_HOST}" "rm -rf '${REMOTE_DIR}'" || true
fi