hw-native-sys · zhangstevenunity · Apr 16, 2026 · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -204,6 +204,8 @@ jobs:
       - name: Run sample tests (py -> pto -> cpp)
         shell: bash
         env:
+          CI_EVENT_NAME: ${{ github.event_name }}
+          WORKFLOW_SOC_VERSION: ${{ github.event.inputs.soc_version || 'Ascend910' }}
           PTOAS_BIN: ${{ github.workspace }}/build/tools/ptoas/ptoas
           PYTHON_BIN: /usr/bin/python3
           MLIR_PYTHON_ROOT: ${{ env.MLIR_PYTHONPATH }}
@@ -213,7 +215,15 @@ jobs:
           export PYTHONPATH="${MLIR_PYTHON_ROOT}:${PTO_PYTHON_ROOT}:${PYTHONPATH:-}"
           export LD_LIBRARY_PATH="${LLVM_DIR}/lib:${PTO_INSTALL_DIR}/lib:${LD_LIBRARY_PATH:-}"
           export PTOAS_OUT_DIR="${PAYLOAD_DIR}/test/samples"
-          bash test/samples/runop.sh --enablebc all
+          if [[ "${CI_EVENT_NAME}" == "workflow_dispatch" || "${CI_EVENT_NAME}" == "schedule" ]]; then
+            # Board-validation payloads must contain only the direct .pto
+            # samples that match the target arch. Some A3/A5 qwen decode cases
+            # intentionally share a testcase basename, so SKIP_CASES cannot
+            # tell them apart once the payload has been built.
+            SOC_VERSION="${WORKFLOW_SOC_VERSION}" bash test/samples/runop.sh --enablebc all
+          else
+            bash test/samples/runop.sh --enablebc all
+          fi
 
       - name: Build payload artifact
         if: >-
@@ -294,9 +304,7 @@ jobs:
           # RUN_ONLY_CASES requests, so remote validation does not try to force
           # A5-only cases through an A3 flow or vice versa.
           A3_ONLY_CASES="partition5d,partition5d_dynamic,mrgsort,tmatmulk_autosync"
-          QWEN3_TILELET_A5_ONLY_CASES="$(printf 'qwen3_decode_layer_incore_%s,' {0..19})"
-          QWEN3_TILELET_A5_ONLY_CASES="${QWEN3_TILELET_A5_ONLY_CASES%,}"
-          A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5,tpack,${QWEN3_TILELET_A5_ONLY_CASES}"
+          A5_ONLY_CASES="partition5d_a5,partition5d_dynamic_a5,mrgsort_a5,tmatmulk_autosync_a5,tpack"
 
           sv_lc="$(printf '%s' "${SOC_VERSION}" | tr '[:upper:]' '[:lower:]')"
           is_a5=0

diff --git a/test/npu_validation/scripts/generate_testcase.py b/test/npu_validation/scripts/generate_testcase.py
@@ -86,25 +86,56 @@
 })
 
 CASE_INT_SCALAR_DEFAULTS = {
-    "qwen3_decode_layer_incore_13": {
-        "v7": 64,
+    "qwen3_decode_incore_4": {
+        "v11": 1,
+        "v12": 0,
+        "v13": 1,
     },
-    "qwen3_decode_layer_incore_14": {
+    "qwen3_decode_incore_5": {
+        "v4": 1,
+        "v5": 1,
+        "v6": 1,
+        "v7": 0,
+    },
+    "qwen3_decode_incore_6": {
+        "v5": 1,
+        "v6": 1,
+        "v7": 0,
+    },
+    "qwen3_decode_incore_7": {
+        "v4": 1,
+        "v5": 1,
+        "v6": 1,
+        "v7": 0,
+    },
+    "qwen3_decode_incore_8": {
+        "v5": 2,
+        "v6": 1,
+    },
+    "qwen3_decode_incore_9": {
+        "v4": 1,
+        "v5": 64,
+    },
+    "qwen3_decode_incore_10": {
         "v4": 1,
         "v5": 64,
     },
+    "qwen3_decode_incore_12": {
+        "v4": 256,
+    },
+    "qwen3_decode_incore_13": {
+        "v4": 256,
+    },
+    "qwen3_decode_incore_15": {
+        "v4": 128,
+    },
+    "qwen3_decode_incore_16": {
+        "v4": 1,
+        "v5": 128,
+    },
 }
 
 CASE_POINTER_COUNT_MINIMUMS = {
-    "qwen3_decode_layer_incore_13": {
-        "v2": 20480,
-        "v4": 131046528,
-        "v5": 131046528,
-    },
-    "qwen3_decode_layer_incore_14": {
-        "v1": 16384,
-        "v3": 651264,
-    },
 }
 
 
@@ -878,6 +909,11 @@ def _copy_asset_if_needed(src: Path, dst: Path):
     shutil.copy2(src, dst)
 
 
+def _copy_custom_golden_helpers(sample_root: Path, output_dir: Path):
+    for helper in sample_root.glob("*_golden_*.py"):  # direct children of sample_root only (no recursion)
+        _copy_asset_if_needed(helper, output_dir / helper.name)  # destination keeps the helper's file name
+
+
 def _replace_includes(text: str) -> str:
     if "#include \"common/pto_instr.hpp\"" in text:
         return text.replace("#include \"common/pto_instr.hpp\"", INCLUDE_REPLACEMENT.rstrip())
@@ -1937,8 +1973,10 @@ def generate_testcase(
     else:
         golden_py = golden_template.replace("@INPUT_GENERATE@", "\n".join(input_generate))
         golden_dst.write_text(golden_py, encoding="utf-8")
-    if (custom_golden is not None or custom_compare is not None) and shared_validation_runtime.is_file():
-        _copy_asset_if_needed(shared_validation_runtime, output_dir / "validation_runtime.py")
+    if custom_golden is not None or custom_compare is not None:
+        _copy_custom_golden_helpers(sample_root, output_dir)
+        if shared_validation_runtime.is_file():
+            _copy_asset_if_needed(shared_validation_runtime, output_dir / "validation_runtime.py")
 
     # Emit the kernel source, optionally injecting a packed-predicate preload to
     # make TCMP/TCMPS outputs deterministic for byte-wise compares.

diff --git a/test/samples/Qwen3DecodeA3/README.md b/test/samples/Qwen3DecodeA3/README.md
@@ -0,0 +1,11 @@
+Qwen3 decode PTO kernels for A3, generated from `pypto-lib/examples/models/qwen3/qwen3_32b_decode.py`.
+
+Scope:
+- compile-regression inputs for `ptoas`
+- board-validation inputs with a per-case custom golden script
+
+Notes:
+- This directory vendors the 17 emitted `qwen3_decode_incore_*.pto` fragments for the A3 lowering.
+- `runop.sh` defaults these cases to `--pto-level=level3`.
+- `runop.sh` skips this directory on A5 / Ascend950 targets.
+- Each case has a sibling `<case>_golden.py`; shared reference logic lives in `qwen3_decode_golden_lib.py`.