From b97cff1ca3ffe2498ab7df0d5be8cec99233cca6 Mon Sep 17 00:00:00 2001
From: Prachi Gupta <pracgupt@amd.com>
Date: Mon, 27 Oct 2025 13:51:15 -0500
Subject: [PATCH 01/43] Add github workflows to automate IFU  (#2688) (#2748)

(cherry picked from commit a66eeda0423ae7eeeff445438d4dedda549524af)

Fixes #ISSUE_NUMBER

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
---
 .github/workflows/create_ifu_tag.yml | 116 +++++++++++++++++++++
 .github/workflows/pytorch_ifu.yml    | 145 +++++++++++++++++++++++++++
 2 files changed, 261 insertions(+)
 create mode 100644 .github/workflows/create_ifu_tag.yml
 create mode 100644 .github/workflows/pytorch_ifu.yml

diff --git a/.github/workflows/create_ifu_tag.yml b/.github/workflows/create_ifu_tag.yml
new file mode 100644
index 0000000000000..e54bb35e6982c
--- /dev/null
+++ b/.github/workflows/create_ifu_tag.yml
@@ -0,0 +1,116 @@
+name: Create git tags for IFU PRs
+
+on:
+  pull_request:
+    types: [closed]
+
+permissions:
+  contents: write        # create/push tags
+  pull-requests: write   # edit PR body
+
+jobs:
+  tag-ifu:
+    # Only proceed if: merged AND title has both markers
+    if: >
+      github.event.pull_request.merged == true &&
+      contains(github.event.pull_request.title, '[AUTOGENERATED]') &&
+      contains(github.event.pull_request.title, 'IFU')
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout base repo (full history)
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.base.ref }}
+          fetch-depth: 0
+
+      - name: Configure Git user
+        run: |
+          git config user.name  "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+
+      - name: Derive key SHAs (rocm base, upstream main, merge)
+        id: shas
+        shell: bash
+        run: |
+          set -euo pipefail
+          
+          PR_NUM="${{ github.event.pull_request.number }}"
+          BASE_REF="${{ github.event.pull_request.base.ref }}"
+          HEAD_SHA="${{ github.event.pull_request.head.sha }}"
+          MERGE_SHA="${{ github.event.pull_request.merge_commit_sha }}"
+
+          # The ROCm base commit is the first parent of the merge commit that landed the PR
+          # (i.e., the base branch tip BEFORE this PR merged).
+          ROCM_BASE_SHA=$(git rev-parse "${MERGE_SHA}^1")
+
+          # Add and fetch upstream to identify the upstream/main commit that HEAD integrated.
+          git remote add upstream "https://github.com/pytorch/pytorch.git"
+          git fetch upstream "$BASE_REF"
+
+          # Heuristic: the upstream commit integrated by the PR's head is the merge-base
+          # between the PR head commit and upstream/main as fetched now.
+          # This gives you the exact upstream commit (or the best common ancestor) that HEAD included.
+          UPSTREAM_MAIN_SHA=$(git merge-base "${HEAD_SHA}" "upstream/$BASE_REF")
+          echo "PR_NUM=$PR_NUM"                           
+          echo "BASE_REF=$BASE_REF"                       
+          echo "HEAD_SHA=$HEAD_SHA"                       
+          echo "MERGE_SHA=$MERGE_SHA"                     
+          echo "ROCM_BASE_SHA=$ROCM_BASE_SHA"            
+          echo "UPSTREAM_MAIN_SHA=$UPSTREAM_MAIN_SHA"     
+
+          
+          echo "PR_NUM=$PR_NUM"                           >> "$GITHUB_OUTPUT"
+          echo "BASE_REF=$BASE_REF"                       >> "$GITHUB_OUTPUT"
+          echo "HEAD_SHA=$HEAD_SHA"                       >> "$GITHUB_OUTPUT"
+          echo "MERGE_SHA=$MERGE_SHA"                     >> "$GITHUB_OUTPUT"
+          echo "ROCM_BASE_SHA=$ROCM_BASE_SHA"             >> "$GITHUB_OUTPUT"
+          echo "UPSTREAM_MAIN_SHA=$UPSTREAM_MAIN_SHA"     >> "$GITHUB_OUTPUT"
+
+      - name: Extract tag base from PR title
+        id: tagname
+        env:
+          # The title is user-controlled: pass it via env, never inline it into the script.
+          PR_TITLE: ${{ github.event.pull_request.title }}
+        run: |
+          # Strip a leading "[AUTOGENERATED]" marker plus any whitespace after it.
+          BASE_TAG=$(printf '%s\n' "$PR_TITLE" | sed -E 's/^\[AUTOGENERATED\][[:space:]]*//')
+          # Log the derived tag names and publish them as step outputs.
+          {
+            echo "BASE_TAG=$BASE_TAG"
+            echo "PRE_TAG=${BASE_TAG}_pre"
+            echo "POST_TAG=${BASE_TAG}_post"
+          } | tee -a "$GITHUB_OUTPUT"
+
+      - name: Create pre/post tags 
+        shell: bash
+        run: |
+          set -euo pipefail
+          echo "Tagging:"
+          echo "  ${{ steps.tagname.outputs.PRE_TAG }}  @ ${{ steps.shas.outputs.ROCM_BASE_SHA }}"
+          echo "  ${{ steps.tagname.outputs.POST_TAG }} @ ${{ steps.shas.outputs.MERGE_SHA }}"
+
+          git tag -a "${{ steps.tagname.outputs.PRE_TAG }}"  -m "IFU pre (PR #${{ steps.shas.outputs.PR_NUM }})"  "${{ steps.shas.outputs.ROCM_BASE_SHA }}" 
+          git tag -a "${{ steps.tagname.outputs.POST_TAG }}" -m "IFU post (PR #${{ steps.shas.outputs.PR_NUM }})" "${{ steps.shas.outputs.MERGE_SHA }}"
+
+          # Force-push so that an existing tag of the same name is moved (re-tagged) if another PR lands with it.
+          git push origin "refs/tags/${{ steps.tagname.outputs.PRE_TAG }}" -f
+          git push origin "refs/tags/${{ steps.tagname.outputs.POST_TAG  }}" -f
+
+      - name: Append rocm_base & upstream_main to PR body
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        shell: bash
+        run: |
+          set -euo pipefail
+          # Read current body
+          PR="${{ steps.shas.outputs.PR_NUM }}"
+          CURR=$(gh api repos/${{ github.repository }}/pulls/$PR --jq .body)
+          APPEND=$'\n'"rocm_base: ${{ steps.shas.outputs.ROCM_BASE_SHA }}"$'\n'"upstream_main: ${{ steps.shas.outputs.UPSTREAM_MAIN_SHA }}"$'\n'
+          NEW_BODY="${CURR}${APPEND}"
+
+          # Write to a temp file and update PR body
+          printf '%s' "$NEW_BODY" > body.txt
+          gh api --method PATCH -H  "Accept: application/vnd.github+json" \
+            repos/${{ github.repository }}/pulls/$PR -F body=@body.txt
+
diff --git a/.github/workflows/pytorch_ifu.yml b/.github/workflows/pytorch_ifu.yml
new file mode 100644
index 0000000000000..fe7439e2e7475
--- /dev/null
+++ b/.github/workflows/pytorch_ifu.yml
@@ -0,0 +1,145 @@
+name: PyTorch IFU (Sync with upstream)
+
+on:
+  workflow_dispatch:
+    inputs:
+      ifu_target_repo:
+        description: "Target repo for IFU"
+        required: false
+        default: "ROCm/pytorch"
+        type: string
+      ifu_target_branch:
+        description: "Target branch for IFU"
+        required: true
+        default: "rocm7.1_internal_testing"
+        type: string
+      ifu_source_repo:
+        description: "Source repo for IFU"
+        required: false
+        default: "pytorch/pytorch"
+        type: string
+      ifu_source_branch:
+        description: "Source branch for IFU"
+        required: false
+        default: "main"
+        type: string
+  # schedule:
+  #   # Runs at 09:00 UTC (04:00 CDT) on days 1, 15 and 29 of each month, i.e. roughly every 14 days
+  #   - cron: "0 9 */14 * *"
+
+permissions:
+  contents: write        # push branches/tags
+  pull-requests: write   # create PRs
+
+concurrency:
+  group: ifu
+  # If two jobs are running simultaneously, we will queue them (not cancel the one running)
+  cancel-in-progress: false
+
+jobs:
+  ifu:
+    runs-on: ubuntu-latest
+    env:
+      UPSTREAM_REMOTE: upstream                              # IFU source remote name
+      UPSTREAM_REPO: ${{ inputs.ifu_source_repo }}           # source repo for IFU
+      UPSTREAM_BRANCH: ${{ inputs.ifu_source_branch }}       # source branch for IFU
+      DOWNSTREAM_REMOTE: origin                              # IFU target remote name
+      DOWNSTREAM_REPO: ${{ inputs.ifu_target_repo }}         # target repo for IFU (fork); actions/checkout sets this to origin
+      DOWNSTREAM_BRANCH: ${{ inputs.ifu_target_branch }}     # target branch for IFU
+      GH_TOKEN: ${{ secrets.IFU_GITHUB_TOKEN }}              # used by gh and actions/checkout; read from the IFU_GITHUB_TOKEN secret
+    steps:
+      - name: Checkout repository (${{ env.DOWNSTREAM_REPO }}) (full history)
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ env.DOWNSTREAM_REPO }}
+          path: ${{ env.DOWNSTREAM_REPO }}
+          ref: ${{ env.DOWNSTREAM_BRANCH }}
+          token: ${{ env.GH_TOKEN }}
+          fetch-depth: 0 # need full history for merges/tags
+          submodules: recursive
+
+      - name: Add upstream remote (${{ env.UPSTREAM_REPO }})
+        working-directory: ${{ env.DOWNSTREAM_REPO }}
+        run: |
+          if ! git remote get-url ${UPSTREAM_REMOTE} >/dev/null 2>&1; then
+            git remote add ${UPSTREAM_REMOTE} https://github.com/${UPSTREAM_REPO}.git
+          fi
+          # Confirm remotes
+          git remote -v
+
+      - name: Configure Git user
+        working-directory: ${{ env.DOWNSTREAM_REPO }}
+        run: |
+          git config user.name  "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+
+      - name: Fetch upstream and local branch
+        working-directory: ${{ env.DOWNSTREAM_REPO }}
+        run: |
+          git fetch ${UPSTREAM_REMOTE} ${UPSTREAM_BRANCH}
+          git fetch ${DOWNSTREAM_REMOTE} ${DOWNSTREAM_BRANCH}
+
+      - name: Compute date tag and create working branch
+        working-directory: ${{ env.DOWNSTREAM_REPO }}
+        id: tag
+        shell: bash
+        run: |
+          DATE="$(date +"%Y-%m-%d")"
+          TAG="${DOWNSTREAM_BRANCH}_IFU_${DATE}"
+          echo "TAG=${TAG}" >> $GITHUB_OUTPUT
+          # Start from rocm branch
+          git checkout -b "$TAG" "${DOWNSTREAM_REMOTE}/${DOWNSTREAM_BRANCH}"
+
+      - name: Save ROCm base commit
+        working-directory: ${{ env.DOWNSTREAM_REPO }}
+        id: rocm_base
+        run: |
+          base_commit=`git rev-parse --short HEAD`
+          echo "ROCM_BASE_COMMIT=$base_commit" >> $GITHUB_OUTPUT
+                  
+      - name: Merge upstream into working branch (non-interactive)
+        working-directory: ${{ env.DOWNSTREAM_REPO }}
+        id: merge
+        run: |
+          if git merge "${UPSTREAM_REMOTE}/${UPSTREAM_BRANCH}" --no-edit; then
+            echo "merge_status=clean" >> $GITHUB_OUTPUT
+          else
+            echo "Merge conflicts detected. Committing current resolution snapshot."
+            git submodule sync
+            git submodule update --init --recursive
+            git add -A
+            git status
+            git commit --no-edit 
+            echo "merge_status=conflict" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Push branch & tag to fork
+        working-directory: ${{ env.DOWNSTREAM_REPO }}
+        run: |
+          git push ${DOWNSTREAM_REMOTE} "${{ steps.tag.outputs.TAG }}"
+
+      - name: Authenticate gh (non-interactive)
+        working-directory: ${{ env.DOWNSTREAM_REPO }}
+        run: |
+          # The GitHub-hosted runner has gh preinstalled.
+          gh auth status || echo "$GH_TOKEN" | gh auth login --with-token
+          gh repo set-default "${{ env.DOWNSTREAM_REPO }}"
+
+      - name: Create Pull Request with gh
+        working-directory: ${{ env.DOWNSTREAM_REPO }}
+        run: |
+          BASE="${DOWNSTREAM_BRANCH}"
+          HEAD="${{ steps.tag.outputs.TAG }}"
+          TITLE="[AUTOGENERATED] $HEAD"
+          BODY="rocm_base: ${{ steps.rocm_base.outputs.ROCM_BASE_COMMIT }}"
+
+          # If a PR for this head already exists, skip creating a new one
+          if gh pr list --head "$HEAD" --base "$BASE" --state all --json number | grep -q '[0-9]'; then
+            echo "PR already exists for $HEAD -> $BASE. Skipping creation."
+          else
+            gh pr create --base "$BASE" --head "$HEAD" --title "$TITLE" --body "$BODY"
+          fi
+
+      - name: Summarize
+        run: |
+          echo "::notice title=IFU Completed::Branch ${{ steps.tag.outputs.TAG }} pushed. PR created (or already existed). Merge status: ${{ steps.merge.outputs.merge_status }}"

From db39cf7cdae734957d044aa5a3dfc1afb8fd721d Mon Sep 17 00:00:00 2001
From: Pruthvi Madugundu <pruthvigithub@gmail.com>
Date: Fri, 8 Mar 2024 11:29:12 -0800
Subject: [PATCH 02/43] CONSOLIDATED COMMITS: Triton build updates

==========================================

Triton build conditionalized on ROCM_VERSION

Include the ROCm version in triton version

(cherry picked from commit 7d33910198d20c755e40f30de6c9b9e2e03b1a8c)
(cherry picked from commit 0412eb4eed668567d394615aa644eacc2741dcd3)

Update triton-rocm.txt to triton.txt

(cherry picked from commit 0ce9f6ea5f6a654e05fa098f1693c05f3c33d6ec)

Use ROCm/triton for install_triton.sh

(cherry picked from commit 6e9714bbfbe8f003643252ec58f7b42a3c9c364c)

update triton commit

Revert "Use ROCm/triton for install_triton.sh"

This reverts commit 81b0cbc8435122030044049c661f252ee8aa7ae5.

change triton repo

Update triton.txt to use release/internal/3.3.x branch

Use ROCm/triton

Use ROCm/triton for install_triton.sh

(cherry picked from commit 0036db5ab1a6fcf13662cd8ccd99f021422fe547)
---
 .ci/docker/common/install_triton.sh   |  2 +-
 .github/scripts/build_triton_wheel.py | 37 +++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh
index 1b68e3c247839..b2fdebdcc4747 100755
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@@ -21,7 +21,7 @@ elif [ -n "${TRITON_CPU}" ]; then
   TRITON_REPO="https://github.com/triton-lang/triton-cpu"
   TRITON_TEXT_FILE="triton-cpu"
 else
-  TRITON_REPO="https://github.com/triton-lang/triton"
+  TRITON_REPO="https://github.com/ROCm/triton"
   TRITON_TEXT_FILE="triton"
 fi
 
diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py
index 11fa8404273d3..b12d7bfbeb1ee 100644
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 import os
+import re
 import shutil
 import sys
 from pathlib import Path
@@ -50,6 +51,31 @@ def patch_init_py(
     with open(path, "w") as f:
         f.write(orig)
 
+def get_rocm_version() -> str:
+    """Return the installed ROCm version as "major.minor.patch".
+
+    The ROCm root is taken from ROCM_HOME, then ROCM_PATH, then /opt/rocm.
+    "0.0.0" is returned when no rocm_version.h can be found under that root.
+    """
+    rocm_path = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') or "/opt/rocm"
+    rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h"
+    if not os.path.isfile(rocm_version_h):
+        rocm_version_h = f"{rocm_path}/include/rocm_version.h"
+
+    # The file could be missing due to 1) ROCm version < 5.2, or 2) no ROCm install.
+    if not os.path.isfile(rocm_version_h):
+        return "0.0.0"
+
+    # Components whose #define is absent from the header stay at 0.
+    parts = {"MAJOR": 0, "MINOR": 0, "PATCH": 0}
+    define_re = re.compile(r"#define\s+ROCM_VERSION_(MAJOR|MINOR|PATCH)\s+(\d+)")
+    # Read through a context manager so the header's file handle is always closed.
+    with open(rocm_version_h) as header:
+        for line in header:
+            match = define_re.search(line)
+            if match:
+                parts[match.group(1)] = int(match.group(2))
+    return f"{parts['MAJOR']}.{parts['MINOR']}.{parts['PATCH']}"
 
 def build_triton(
     *,
@@ -65,6 +91,14 @@ def build_triton(
         max_jobs = os.cpu_count() or 1
         env["MAX_JOBS"] = str(max_jobs)
 
+    version_suffix = ""
+    if not release:
+        # Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8
+        # while release build should only include the version, i.e. 2.1.0
+        rocm_version = get_rocm_version()
+        version_suffix = f"+rocm{rocm_version}_{commit_hash[:10]}"
+        version += version_suffix
+
     with TemporaryDirectory() as tmpdir:
         triton_basedir = Path(tmpdir) / "triton"
         triton_pythondir = triton_basedir / "python"
@@ -72,6 +106,7 @@ def build_triton(
         triton_repo = "https://github.com/openai/triton"
         if device == "rocm":
             triton_pkg_name = "pytorch-triton-rocm"
+            triton_repo = "https://github.com/ROCm/triton/"
         elif device == "xpu":
             triton_pkg_name = "pytorch-triton-xpu"
             triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton"
@@ -104,6 +139,8 @@ def build_triton(
                 cwd=triton_basedir,
                 shell=True,
             )
+            cur_rocm_ver = get_rocm_version()
+            check_call(["scripts/amd/setup_rocm_libs.sh", cur_rocm_ver], cwd=triton_basedir)
             print("ROCm libraries setup for triton installation...")
 
         # old triton versions have setup.py in the python/ dir,

From d28eca7356755d9f9b50259c8a80656955091e66 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Tue, 12 Aug 2025 11:40:53 -0700
Subject: [PATCH 03/43] [rocm7.1_internal_testing] Change pytorch-triton-rocm
 to pytorch-triton (#2482)

Related to https://github.com/ROCm/builder/pull/90/files

http://rocm-ci.amd.com/job/mainline-pytorch_internal-manylinux-wheels/305/

PyTorch wheel installs successfully when building torchvision/torchaudio

(cherry picked from commit c1ee54d9804886c5de9cb8eff295185afffafd1d)
---
 .circleci/scripts/binary_populate_env.sh | 4 ++--
 .github/scripts/build_triton_wheel.py    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh
index 11f9678579935..f876ac8efcf7f 100755
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@@ -84,10 +84,10 @@ fi
 
 # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
-    TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
+    TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
     if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
         TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
-        TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
+        TRITON_REQUIREMENT="triton==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
     fi
     if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
         export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py
index b12d7bfbeb1ee..3b5877b6a987c 100644
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@@ -105,8 +105,8 @@ def build_triton(
 
         triton_repo = "https://github.com/openai/triton"
         if device == "rocm":
-            triton_pkg_name = "pytorch-triton-rocm"
-            triton_repo = "https://github.com/ROCm/triton/"
+            triton_pkg_name = "triton"
+            triton_repo = "https://github.com/ROCm/triton"
         elif device == "xpu":
             triton_pkg_name = "pytorch-triton-xpu"
             triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton"

From 6b3a1411412461fa31c96861e35c411439c528b2 Mon Sep 17 00:00:00 2001
From: Prachi Gupta <pracgupt@amd.com>
Date: Fri, 22 Aug 2025 18:35:35 -0500
Subject: [PATCH 04/43] [rocm7.1_internal_testing] Add triton_kernels wheel
 generation (#2566)

Fixes #ISSUE_NUMBER

(cherry picked from commit 0ea0592f5fd18fd105995af578ce8156de71d57c)
---
 .github/scripts/build_triton_wheel.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py
index 3b5877b6a987c..16eb6a7851974 100644
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@@ -164,6 +164,13 @@ def build_triton(
                 cwd=triton_basedir,
             )
 
+        # For gpt-oss models, triton requires this extra triton_kernels wheel
+        # triton_kernels came after pytorch release/2.8
+        triton_kernels_dir = Path(f"{triton_basedir}/python/triton_kernels")
+        check_call([sys.executable, "-m", "build", "--wheel"], cwd=triton_kernels_dir, env=env)
+        kernels_whl_path = next(iter((triton_kernels_dir / "dist").glob("*.whl")))
+        shutil.copy(kernels_whl_path, Path.cwd())
+
         return Path.cwd() / whl_path.name
 
 

From c440bc6c9885ae3fc86245d1e6e20e471549bf59 Mon Sep 17 00:00:00 2001
From: Bo Li <110066325+BLOrange-AMD@users.noreply.github.com>
Date: Thu, 18 Apr 2024 11:12:30 -0500
Subject: [PATCH 05/43] CONSOLIDATED COMMITS: Implementation of PyTorch ut
 parsing script - QA helper functions

=======================================================================================

Implementation of PyTorch ut parsing script - QA helper function (#1386)

* Initial implementation of PyTorch ut parsing script

* Extracted path variables

* Use nested dict to save results

* Fixes typo

* Cleanup

* Fixes several issues

* Minor name change

* Update run_pytorch_unit_tests.py

* Added file banners

* Supported running from API

* Added more help info

* Consistent naming

* Format help text

---------

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Co-authored-by: Jithun Nair <jithun.nair@amd.com>

Print consolidated log file for pytorch unit test automation scripts (#1433)

* Print consolidated log file for pytorch uts

* Update run_entire_tests subprocess call as well

* lint

* Add ERROR string

[SWDEV-466849] Enhancements for PyTorch UT helper scripts (#1491)

* Check that >1 GPUs are visible when running TEST_CONFIG=distributed

* Add EXECUTION_TIME to file-level and aggregate statistics

PyTorch unit test helper scripts enhancements (#1517)

* Fail earlier for distributed-on-1-GPU scenario
* print cmd in consolidated log with prettier formatting
* python->python3

Fixes https://ontrack-internal.amd.com/browse/SWDEV-477264

---------

Co-authored-by: blorange-amd <bo.li2@amd.com>

Several issues fix of QA helper script (#1564)

Fixes SWDEV-475071: https://ontrack-internal.amd.com/browse/SWDEV-475071

Removed args inside function (#1595)

Fixes SWDEV-475071

(cherry picked from commit 041aa1b47978154de63edc6b7ffcdea218a847a3)

QA script - Added multi gpu check with priority_tests (#1604)

Fixes SWDEV-487907. Verified throwing exception for distributed is
working correctly on single gpu with command: python
.automation_scripts/run_pytorch_unit_tests.py --priority_test

(cherry picked from commit 57cc742271cbf4547f9213710e57f6444bbc983e)
(cherry picked from commit 6d5c3dcae5d02196749df11b57e6d0d5e4e2212e)
(cherry picked from commit 2ee3aa2de081680756002bbfc4006b5f68e68c16)
---
 .automation_scripts/parse_xml_results.py      | 178 ++++++
 .automation_scripts/run_pytorch_unit_tests.py | 518 ++++++++++++++++++
 2 files changed, 696 insertions(+)
 create mode 100644 .automation_scripts/parse_xml_results.py
 create mode 100644 .automation_scripts/run_pytorch_unit_tests.py

diff --git a/.automation_scripts/parse_xml_results.py b/.automation_scripts/parse_xml_results.py
new file mode 100644
index 0000000000000..7db2e1ce9233c
--- /dev/null
+++ b/.automation_scripts/parse_xml_results.py
@@ -0,0 +1,178 @@
+""" The Python PyTorch testing script.
+##
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+"""
+
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+# Backends list
+BACKENDS_LIST = [
+    "dist-gloo",
+    "dist-nccl"
+]
+
+TARGET_WORKFLOW = "--rerun-disabled-tests"
+
+def get_job_id(report: Path) -> int:
+    """Return the job id that ends the first path component of `report`.
+
+    E.g. `unzipped-test-reports-foo_5596745227/test/...` gives 5596745227;
+    -1 is returned when the text after the last "_" is not an integer."""
+    _, _, suffix = report.parts[0].rpartition("_")
+    try:
+        return int(suffix)
+    except ValueError:
+        return -1
+
+def is_rerun_disabled_tests(root: ET.ElementTree) -> bool:
+    """
+    Tell whether a parsed report was produced by the rerun_disabled_tests
+    workflow, judging by the message of the first <skipped> element found.
+    """
+    first_skipped = root.find(".//*skipped")
+    # An Element with no children is falsy, so test against None explicitly.
+    if first_skipped is not None:
+        text = first_skipped.attrib.get("message", "")
+        return TARGET_WORKFLOW in text or "num_red" in text
+    return False
+
+def parse_xml_report(
+    tag: str,
+    report: Path,
+    workflow_id: int,
+    workflow_run_attempt: int,
+    work_flow_name: str
+) -> Dict[Tuple[str], Dict[str, Any]]:
+    """Parse every `tag` element of a test report xml file into a dict of records.
+
+    `tag` is expected to be 'testcase' or 'testsuite'; elements of any other
+    tag are parsed but not recorded. Keys are tuples identifying the record:
+    (invoking_file, classname, name, work_flow_name) for a 'testcase' and
+    (invoking_file, invoking_xml, work_flow_name) for a 'testsuite'.
+    """
+    print(f"Parsing {tag}s for test report: {report}")
+
+    job_id = get_job_id(report)
+    print(f"Found job id: {job_id}")
+    parsed: Dict[Tuple[str], Dict[str, Any]] = {}
+    tree = ET.parse(report)
+    # TODO: unlike unittest, pytest-flakefinder used by rerun disabled tests for test_ops
+    # includes skipped messages multiple times (50 times by default). This slows down
+    # this script too much (O(n)) because it tries to gather all the stats. This should
+    # be fixed later in the way we use pytest-flakefinder. A zipped test report from rerun
+    # disabled test is only few MB, but will balloon up to a much bigger XML file after
+    # extracting from a dozen to few hundred MB
+    if is_rerun_disabled_tests(tree):
+        return parsed
+
+    # [invoking file]
+    # The name of the file that the test is located in is not necessarily
+    # the same as the name of the file that invoked the test.
+    # For example, `test_jit.py` calls into multiple other test files (e.g.
+    # jit/test_dce.py). For sharding/test selection purposes, we want to
+    # record the file that invoked the test.
+    #
+    # To do this, we leverage an implementation detail of how we write out
+    # tests (https://bit.ly/3ajEV1M), which is that reports are created
+    # under a folder with the same name as the invoking file.
+    # The first backend found among the path components is appended as a suffix.
+    backend = next((b for b in BACKENDS_LIST if b in report.parts), None)
+    invoking_file = report.parent.name
+    if backend is not None:
+        invoking_file = invoking_file + "_" + backend
+
+    for element in tree.iter(tag):
+        record = process_xml_element(element)
+        if tag == 'testcase':
+            record["workflow_id"] = workflow_id
+            record["workflow_run_attempt"] = workflow_run_attempt
+            record["job_id"] = job_id
+            record["work_flow_name"] = work_flow_name
+            record["invoking_file"] = invoking_file
+            key = (invoking_file, record["classname"], record["name"], work_flow_name)
+            parsed[key] = record
+        elif tag == 'testsuite':
+            record["work_flow_name"] = work_flow_name
+            record["invoking_xml"] = report.name
+            record["running_time_xml"] = record["time"]
+            record["invoking_file"] = invoking_file
+            key = (invoking_file, report.name, work_flow_name)
+            parsed[key] = record
+
+    return parsed
+
+def process_xml_element(element: ET.Element) -> Dict[str, Any]:
+    """Convert an XML element (and, recursively, its children) into a JSON-serializable dict."""
+    ret: Dict[str, Any] = {}
+
+    # Convert attributes directly into dict elements.
+    # e.g.
+    #     <testcase name="test_foo" classname="test_bar"></testcase>
+    # becomes:
+    #     {"name": "test_foo", "classname": "test_bar"}
+    ret.update(element.attrib)
+
+    # The XML format encodes all values as strings. Convert to ints/floats if
+    # possible to make aggregation possible in Rockset.
+    for k, v in ret.items():
+        try:
+            ret[k] = int(v)
+        except ValueError:
+            # Try float only when int() fails, so integral values stay ints.
+            try:
+                ret[k] = float(v)
+            except ValueError:
+                pass
+
+    # Convert inner and outer text into special dict elements.
+    # e.g.
+    #     <testcase>my_inner_text</testcase> my_tail
+    # becomes:
+    #     {"text": "my_inner_text", "tail": " my_tail"}
+    if element.text and element.text.strip():
+        ret["text"] = element.text
+    if element.tail and element.tail.strip():
+        ret["tail"] = element.tail
+
+    # Convert child elements recursively, placing them at a key:
+    # e.g.
+    #     <testcase>
+    #       <foo>hello</foo>
+    #       <foo>world</foo>
+    #       <bar>another</bar>
+    #     </testcase>
+    # becomes
+    #    {
+    #       "foo": [{"text": "hello"}, {"text": "world"}],
+    #       "bar": {"text": "another"}
+    #    }
+    for child in element:
+        if child.tag not in ret:
+            ret[child.tag] = process_xml_element(child)
+        else:
+            # If there are multiple tags with the same name, they should be
+            # coalesced into a list.
+            if not isinstance(ret[child.tag], list):
+                ret[child.tag] = [ret[child.tag]]
+            ret[child.tag].append(process_xml_element(child))
+    return ret
\ No newline at end of file
diff --git a/.automation_scripts/run_pytorch_unit_tests.py b/.automation_scripts/run_pytorch_unit_tests.py
new file mode 100644
index 0000000000000..514afd19624c3
--- /dev/null
+++ b/.automation_scripts/run_pytorch_unit_tests.py
@@ -0,0 +1,518 @@
+#!/usr/bin/env python3
+
+""" The Python PyTorch testing script.
+##
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+"""
+
+import argparse
+import os
+import shutil
+import subprocess
+from subprocess import STDOUT, CalledProcessError
+
+from collections import namedtuple
+from datetime import datetime
+from pathlib import Path
+from parse_xml_results import (
+        parse_xml_report
+)
+from pprint import pprint
+from typing import Any, Dict, List
+
+# Status labels for a unit test. NOTE(review): not referenced anywhere else in this script.
+UT_STATUS_LIST = [
+    "PASSED",
+    "MISSED",
+    "SKIPPED",
+    "FAILED",
+    "XFAILED",
+    "ERROR"
+]
+# Core suites that run_priority_tests passes to run_test.py for the "default" workflow.
+DEFAULT_CORE_TESTS = [
+    "test_nn",
+    "test_torch",
+    "test_cuda",
+    "test_ops",
+    "test_unary_ufuncs",
+    "test_autograd",
+    "inductor/test_torchinductor"
+]
+# Core suites that run_priority_tests passes to run_test.py for the "distributed" workflow.
+DISTRIBUTED_CORE_TESTS = [
+    "distributed/test_c10d_common",
+    "distributed/test_c10d_nccl",
+    "distributed/test_distributed_spawn"
+]
+# Name of the consolidated log file created inside the log directory of each run.
+CONSOLIDATED_LOG_FILE_NAME="pytorch_unit_tests.log"
+
+def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, workflow_name, path="."):
+    """Merge the parse_xml_report() results of every XML report found below `path`.
+
+    Only the sub-directories of `path` are searched (recursively); XML files
+    that sit directly in `path` are not picked up.
+    """
+    merged = {}
+    for entry in os.listdir(path):
+        sub_dir = path + '/' + entry + '/'
+        if not os.path.isdir(sub_dir):
+            continue
+        for report in Path(sub_dir).glob("**/*.xml"):
+            parsed = parse_xml_report(
+                tag, report, workflow_run_id, workflow_run_attempt, workflow_name
+            )
+            merged.update(parsed)
+    return merged
+
+def get_test_status(test_case):
+    """Classify a parsed test case as XFAILED, SKIPPED, FAILED, ERROR or PASSED."""
+    # Precedence: skipped (which includes xfail), then failure, then error.
+    skip_info = test_case.get("skipped")
+    if skip_info:
+        # a skip record whose type is "pytest.xfail" marks an expected failure
+        is_xfail = 'type' in skip_info and skip_info['type'] == "pytest.xfail"
+        return "XFAILED" if is_xfail else "SKIPPED"
+    if test_case.get("failure"):
+        return "FAILED"
+    if test_case.get("error"):
+        return "ERROR"
+    # none of the records is present (or all of them are empty)
+    return "PASSED"
+
+def get_test_message(test_case, status=None):
+    """Return the skipped/failure/error record of `test_case` that fits `status`.
+
+    SKIPPED, FAILED and ERROR select their own record ("" when it is absent).
+    Any other status, including None, selects the first record present among
+    "skipped", "failure" and "error", or "" when there is none.
+    """
+    record_for_status = {"SKIPPED": "skipped", "FAILED": "failure", "ERROR": "error"}
+    if status in record_for_status:
+        return test_case.get(record_for_status[status], "")
+
+    # no dedicated record for this status: fall back to whichever one exists
+    for record in ("skipped", "failure", "error"):
+        if record in test_case:
+            return test_case[record]
+    return ""
+
+def get_test_file_running_time(test_suite):
+    # Reported "time" of a test suite; 0 when the entry is missing.
+    has_time = 'time' in test_suite
+    return test_suite["time"] if has_time else 0
+
+def get_test_running_time(test_case):
+    # Reported "time" of a test case; empty string when the entry is missing.
+    has_time = 'time' in test_case
+    return test_case["time"] if has_time else ""
+
+def summarize_xml_files(path, workflow_name):
+    """Summarize the XML test reports found below `path` for one workflow.
+
+    Args:
+        path: directory whose sub-directories hold the XML reports.
+        workflow_name: workflow label; passed on to the report parser and
+            used to name the aggregate entry (workflow_name + "_aggregate").
+
+    Returns:
+        A dict keyed by test_file_and_status(file_name, status) namedtuples:
+          * PASSED / SKIPPED / XFAILED: {test id: running time as a string}
+          * FAILED / ERROR: {test id: failure or error record}
+          * STATISTICS: counters of that file plus its EXECUTION_TIME
+        plus one STATISTICS entry with the counters over all files. A test
+        id has the form "file_name::class_name::test_name".
+
+    Raises:
+        KeyError: if a test case belongs to a file that has no test suite.
+    """
+    test_file_and_status = namedtuple("test_file_and_status", ["file_name", "status"])
+    status_names = ["PASSED", "SKIPPED", "XFAILED", "FAILED", "ERROR"]
+    # statuses that record the failure/error message instead of the running time
+    message_statuses = ("FAILED", "ERROR")
+
+    def new_counters():
+        # fresh set of counters, used for every test file and for the aggregate
+        return {
+            'TOTAL': 0,
+            'PASSED': 0,
+            'SKIPPED': 0,
+            'XFAILED': 0,
+            'FAILED': 0,
+            'ERROR': 0,
+            'EXECUTION_TIME': 0,
+        }
+
+    # parse the xml files
+    test_cases = parse_xml_reports_as_dict(-1, -1, 'testcase', workflow_name, path)
+    test_suites = parse_xml_reports_as_dict(-1, -1, 'testsuite', workflow_name, path)
+
+    # results dict
+    res = {}
+    # workflow-wide counters, stored under the aggregate key at the very end
+    totals = new_counters()
+
+    # Pass 1: create the buckets of every test file and add up the suite times.
+    for suite_key, suite in test_suites.items():
+        file_name = suite_key[0]
+        stats_key = test_file_and_status(file_name, "STATISTICS")
+        if stats_key not in res:
+            # first suite of this file: one empty dict per status, then the counters
+            for status_name in status_names:
+                res[test_file_and_status(file_name, status_name)] = {}
+            res[stats_key] = new_counters()
+
+        suite_time = get_test_file_running_time(suite)
+        res[stats_key]["EXECUTION_TIME"] += suite_time
+        totals["EXECUTION_TIME"] += suite_time
+
+    # Pass 2: file every test case under its status and update the counters.
+    for case_key, case in test_cases.items():
+        file_name = case_key[0]
+        class_name = case_key[1]
+        test_name = case_key[2]
+        test_id = file_name + "::" + class_name + "::" + test_name
+
+        status = get_test_status(case)
+        running_time = get_test_running_time(case)
+        message = get_test_message(case, status)
+
+        status_key = test_file_and_status(file_name, status)
+        stats_key = test_file_and_status(file_name, "STATISTICS")
+        file_counters = res[stats_key]
+        totals["TOTAL"] += 1
+        file_counters["TOTAL"] += 1
+
+        if status in message_statuses:
+            # failed and errored tests keep their message
+            info = message
+        else:
+            # passed, skipped and xfailed tests keep their running time
+            info = str(running_time)
+        res[status_key][test_id] = info
+        file_counters[status] += 1
+        totals[status] += 1
+
+    # the aggregate over all files is filed under "<workflow_name>_aggregate"
+    aggregate_key = test_file_and_status(workflow_name + "_aggregate", "STATISTICS")
+    res[aggregate_key] = totals
+    return res
+
+def run_command_and_capture_output(cmd):
+    """Run `cmd` through the shell, appending its stdout/stderr to the consolidated log file."""
+    print(f"Running command '{cmd}'")
+    with open(CONSOLIDATED_LOG_FILE_PATH, "a+") as output_file:
+        print("========================================", file=output_file, flush=True)
+        print(f"[RUN_PYTORCH_UNIT_TESTS] Running command '{cmd}'", file=output_file, flush=True) # send to consolidated file as well
+        print("========================================", file=output_file, flush=True)
+        p = subprocess.run(cmd, shell=True, stdout=output_file, stderr=STDOUT, text=True)
+    if p.returncode != 0:  # run() without check=True never raises CalledProcessError
+        print(f"ERROR: Cmd {cmd} failed with return code: {p.returncode}!")
+
+def run_entire_tests(workflow_name, test_shell_path, overall_logs_path_current_run, test_reports_src):
+    """Run the complete test.sh suite for one workflow and summarize its XML reports.
+
+    The report directory `test_reports_src` is recreated empty, TEST_CONFIG is
+    set for the "default", "distributed" and "inductor" workflows, and the
+    reports are copied to "<workflow>_xml_results_entire_tests/" under
+    `overall_logs_path_current_run` before they are summarized.
+    """
+    # start from an empty report directory
+    if os.path.exists(test_reports_src):
+        shutil.rmtree(test_reports_src)
+    os.mkdir(test_reports_src)
+
+    destination = ""
+    if workflow_name in ("default", "distributed", "inductor"):
+        os.environ['TEST_CONFIG'] = workflow_name
+        destination = overall_logs_path_current_run + workflow_name + "_xml_results_entire_tests/"
+    # use test.sh for tests execution
+    run_command_and_capture_output(test_shell_path)
+    return summarize_xml_files(shutil.copytree(test_reports_src, destination), workflow_name)
+
+def run_priority_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src):
+    """Run the core ("priority") suites of one workflow through run_test.py.
+
+    Handles the "default" workflow (DEFAULT_CORE_TESTS, HIP_VISIBLE_DEVICES=0)
+    and the "distributed" workflow (DISTRIBUTED_CORE_TESTS, HIP_VISIBLE_DEVICES=0,1),
+    then copies the XML reports into the log directory of the run and summarizes them.
+    """
+    # start from an empty report directory
+    if os.path.exists(test_reports_src):
+        shutil.rmtree(test_reports_src)
+    os.mkdir(test_reports_src)
+
+    # workflow -> (visible devices, core suites, extra run_test.py flags)
+    settings = {
+        "default": ('0', DEFAULT_CORE_TESTS, " --exclude-jit-executor --exclude-distributed-tests --verbose"),
+        "distributed": ('0,1', DISTRIBUTED_CORE_TESTS, " --distributed-tests --verbose"),
+    }
+    destination = ""
+    if workflow_name in settings:
+        devices, suites, flags = settings[workflow_name]
+        os.environ['TEST_CONFIG'] = workflow_name
+        os.environ['HIP_VISIBLE_DEVICES'] = devices
+        destination = overall_logs_path_current_run + workflow_name + "_xml_results_priority_tests/"
+        # use run_test.py for tests execution
+        run_command_and_capture_output("python3 " + test_run_test_path + " --include " + " ".join(suites) + flags)
+        del os.environ['HIP_VISIBLE_DEVICES']
+
+    return summarize_xml_files(shutil.copytree(test_reports_src, destination), workflow_name)
+
+def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src, selected_list):
+    """Run the suites of `selected_list` for one workflow through run_test.py.
+
+    "default" and "distributed" run the whole list in a single run_test.py
+    invocation while HIP_VISIBLE_DEVICES is temporarily set. "inductor" splits
+    the list: entries containing "inductor/" run as they are, all other
+    entries run with the --inductor flag. Afterwards the XML reports are
+    copied into the log directory of the run and summarized.
+    """
+    # start from an empty report directory
+    if os.path.exists(test_reports_src):
+        shutil.rmtree(test_reports_src)
+    os.mkdir(test_reports_src)
+
+    # workflow -> (visible devices, extra run_test.py flags)
+    gpu_settings = {
+        "default": ('0', " --exclude-jit-executor --exclude-distributed-tests --verbose"),
+        "distributed": ('0,1', " --distributed-tests --verbose"),
+    }
+    destination = ""
+    if workflow_name in gpu_settings:
+        devices, flags = gpu_settings[workflow_name]
+        os.environ['TEST_CONFIG'] = workflow_name
+        os.environ['HIP_VISIBLE_DEVICES'] = devices
+        destination = overall_logs_path_current_run + workflow_name + "_xml_results_selected_tests/"
+        # use run_test.py for tests execution
+        command = "python3 " + test_run_test_path + " --include " + " ".join(selected_list) + flags
+        run_command_and_capture_output(command)
+        del os.environ['HIP_VISIBLE_DEVICES']
+    elif workflow_name == "inductor":
+        os.environ['TEST_CONFIG'] = 'inductor'
+        destination = overall_logs_path_current_run + "inductor_xml_results_selected_tests/"
+        # entries containing "inductor/" versus everything else
+        inductor_suites, other_suites = [], []
+        for item in selected_list:
+            (inductor_suites if "inductor/" in item else other_suites).append(item)
+        if inductor_suites:
+            command = "python3 " + test_run_test_path + " --include " + " ".join(inductor_suites) + " --verbose"
+            run_command_and_capture_output(command)
+        if other_suites:
+            # entries without "inductor/" are run with the --inductor flag
+            command = "python3 " + test_run_test_path + " --inductor --include " + " ".join(other_suites) + " --verbose"
+            run_command_and_capture_output(command)
+
+    # keep a copy of the reports with the logs of this run and summarize that copy
+    copied_reports = shutil.copytree(test_reports_src, destination)
+    summary = summarize_xml_files(copied_reports, workflow_name)
+    return summary
+
+def run_test_and_summarize_results(
+    pytorch_root_dir: str,
+    priority_tests: bool,
+    test_config: List[str],
+    default_list: List[str],
+    distributed_list: List[str],
+    inductor_list: List[str],
+    skip_rerun: bool) -> Dict[str, Any]:
+    """Run the requested unit tests and return {workflow name: summary dict}.
+
+    Without test lists, the workflows named in `test_config` (all of them when
+    it is empty) run either entirely through test.sh or, with `priority_tests`,
+    only their core suites through run_test.py. With test lists, only the
+    listed suites run; combining lists with `test_config` or `priority_tests`
+    raises Exception. `skip_rerun` temporarily rewrites "--reruns=2" to
+    "--reruns=0" in test/run_test.py. The process environment and run_test.py
+    are restored on exit, also when a step raises.
+    """
+    global CONSOLIDATED_LOG_FILE_PATH
+
+    # copy current environment variables
+    _environ = dict(os.environ)
+
+    # modify path; made absolute because the paths are used after os.chdir() below
+    pytorch_root_dir = os.path.abspath(pytorch_root_dir)
+    test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh"
+    test_run_test_path = pytorch_root_dir + "/test/run_test.py"
+    repo_test_log_folder_path = pytorch_root_dir + "/.automation_logs/"
+    test_reports_src = pytorch_root_dir + "/test/test-reports/"
+    run_test_python_file = pytorch_root_dir + "/test/run_test.py"
+
+    # change directory to pytorch root
+    os.chdir(pytorch_root_dir)
+
+    # all test results dict
+    res_all_tests_dict = {}
+
+    # patterns
+    search_text = "--reruns=2"
+    replace_text = "--reruns=0"
+    # unmodified content of run_test.py, kept so that it can be written back verbatim
+    original_run_test = None
+
+    # create logs folder
+    if not os.path.exists(repo_test_log_folder_path):
+        os.mkdir(repo_test_log_folder_path)
+
+    # the environment and run_test.py are modified from here on; the finally
+    # block below undoes both, whether the run completes or raises
+    try:
+        # Set common environment variables for all scenarios
+        os.environ['CI'] = '1'
+        os.environ['PYTORCH_TEST_WITH_ROCM'] = '1'
+        os.environ['HSA_FORCE_FINE_GRAIN_PCIE'] = '1'
+        os.environ['PYTORCH_TESTING_DEVICE_ONLY_FOR'] = 'cuda'
+        os.environ['CONTINUE_THROUGH_ERROR'] = 'True'
+        if skip_rerun:
+            # modify run_test.py in-place
+            with open(run_test_python_file, 'r') as file:
+                original_run_test = file.read()
+            with open(run_test_python_file, 'w') as file:
+                file.write(original_run_test.replace(search_text, replace_text))
+
+        # Time stamp
+        current_datetime = datetime.now().strftime("%Y%m%d_%H-%M-%S")
+        print("Current date & time : ", current_datetime)
+        # performed as Job ID
+        str_current_datetime = str(current_datetime)
+        overall_logs_path_current_run = repo_test_log_folder_path + str_current_datetime + "/"
+        os.mkdir(overall_logs_path_current_run)
+
+        CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME
+
+        # Check multi gpu availability if distributed tests are enabled
+        if ("distributed" in test_config) or len(distributed_list) != 0:
+            check_num_gpus_for_distributed()
+
+        # Install test requirements
+        command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt"
+        run_command_and_capture_output(command)
+
+        # Run entire tests for each workflow
+        if not priority_tests and not default_list and not distributed_list and not inductor_list:
+            # run entire tests for default, distributed and inductor workflows → use test.sh
+            if not test_config:
+                check_num_gpus_for_distributed()
+                # default test process
+                res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
+                res_all_tests_dict["default"] = res_default_all
+                # distributed test process
+                res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
+                res_all_tests_dict["distributed"] = res_distributed_all
+                # inductor test process
+                res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)
+                res_all_tests_dict["inductor"] = res_inductor_all
+            else:
+                # run only the workflows named in test_config
+                if "default" in test_config:
+                    res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
+                    res_all_tests_dict["default"] = res_default_all
+                if "distributed" in test_config:
+                    res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
+                    res_all_tests_dict["distributed"] = res_distributed_all
+                if "inductor" in test_config:
+                    res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)
+                    res_all_tests_dict["inductor"] = res_inductor_all
+        # Run priority test for each workflow
+        elif priority_tests and not default_list and not distributed_list and not inductor_list:
+            if not test_config:
+                check_num_gpus_for_distributed()
+                # default test process
+                res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src)
+                res_all_tests_dict["default"] = res_default_priority
+                # distributed test process
+                res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
+                res_all_tests_dict["distributed"] = res_distributed_priority
+                # will not run inductor priority tests
+                print("Inductor priority tests cannot run since no core tests defined with inductor workflow.")
+            else:
+                # run only the workflows named in test_config
+                if "default" in test_config:
+                    res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src)
+                    res_all_tests_dict["default"] = res_default_priority
+                if "distributed" in test_config:
+                    res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
+                    res_all_tests_dict["distributed"] = res_distributed_priority
+                if "inductor" in test_config:
+                    print("Inductor priority tests cannot run since no core tests defined with inductor workflow.")
+        # Run specified tests for each workflow
+        elif (default_list or distributed_list or inductor_list) and not test_config and not priority_tests:
+            if default_list:
+                res_default_selected = run_selected_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src, list(default_list))
+                res_all_tests_dict["default"] = res_default_selected
+            if distributed_list:
+                res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, list(distributed_list))
+                res_all_tests_dict["distributed"] = res_distributed_selected
+            if inductor_list:
+                res_inductor_selected = run_selected_tests("inductor", test_run_test_path, overall_logs_path_current_run, test_reports_src, list(inductor_list))
+                res_all_tests_dict["inductor"] = res_inductor_selected
+        else:
+            raise Exception("Invalid test configurations!")
+    finally:
+        # restore environment variables
+        os.environ.clear()
+        os.environ.update(_environ)
+
+        # restore files
+        if original_run_test is not None:
+            # write back the saved content instead of reversing the replacement,
+            # which would also rewrite any "--reruns=0" that was already in the file
+            with open(run_test_python_file, 'w') as file:
+                file.write(original_run_test)
+
+    return res_all_tests_dict
+
+def parse_args():
+    """Build the argument parser of this script and parse the command line."""
+    cli = argparse.ArgumentParser(description='Run PyTorch unit tests and generate xml results summary', formatter_class=argparse.RawTextHelpFormatter)
+    cli.add_argument('--test_config', nargs='+', default=[], type=str, help="space-separated list of test workflows to be executed eg. 'default distributed'")
+    cli.add_argument('--priority_tests', action='store_true', help="run priority tests only")
+    for workflow, example in (('default', 'test_weak test_dlpack'), ('distributed', 'distributed/test_c10d_common distributed/test_c10d_nccl'), ('inductor', 'inductor/test_torchinductor test_ops')):
+        cli.add_argument('--' + workflow + '_list', nargs='+', default=[], help="space-separated list of '" + workflow + "' config test suites/files to be executed eg. '" + example + "'")
+    cli.add_argument('--pytorch_root', default='.', type=str, help="PyTorch root directory")
+    cli.add_argument('--skip_rerun', action='store_true', help="skip rerun process")
+    cli.add_argument('--example_output', type=str, help="{'workflow_name': {\n"
+                                                        "  test_file_and_status(file_name='workflow_aggregate', status='STATISTICS'): {}, \n"
+                                                        "  test_file_and_status(file_name='test_file_name_1', status='ERROR'): {}, \n"
+                                                        "  test_file_and_status(file_name='test_file_name_1', status='FAILED'): {}, \n"
+                                                        "  test_file_and_status(file_name='test_file_name_1', status='PASSED'): {}, \n"
+                                                        "  test_file_and_status(file_name='test_file_name_1', status='SKIPPED'): {}, \n"
+                                                        "  test_file_and_status(file_name='test_file_name_1', status='STATISTICS'): {} \n"
+                                                        "}}\n")
+    cli.add_argument('--example_usages', type=str, help="RUN ALL TESTS: python3 run_pytorch_unit_tests.py \n"
+                                                        "RUN PRIORITY TESTS: python3 run_pytorch_unit_tests.py --test_config distributed --priority_test \n"
+                                                        "RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor")
+    return cli.parse_args()
+
+def check_num_gpus_for_distributed():
+    # Counts the rocminfo lines matching "Name:\s+gfx"; raw string because "\s" is not a valid Python escape.
+    p = subprocess.run(r"rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True)
+    assert int(p.stdout) > 1, "Number of visible GPUs should be >1 to run distributed unit tests"
+
+def main():
+    # Parse the command line, run the requested tests and pretty-print the summaries.
+    opts = parse_args()
+    pprint(dict(run_test_and_summarize_results(opts.pytorch_root, opts.priority_tests, opts.test_config, opts.default_list, opts.distributed_list, opts.inductor_list, opts.skip_rerun)))
+
+if __name__ == "__main__":
+    main()

From 25a18ab2e13e4fc9fe5141af60b49173b93c12ae Mon Sep 17 00:00:00 2001
From: Ethan Wee <158101733+ethanwee1@users.noreply.github.com>
Date: Thu, 20 Feb 2025 14:54:08 -0800
Subject: [PATCH 06/43] [rocm6.4_internal_testing] Enable wheels (#1884)
 (#1907)

* Use triton commit same as that used for release/2.6 branch since both
are triton version 3.2.0, so assuming they're compatible.

Relates to:
https://github.com/ROCm/rocAutomation/pull/660/files
https://github.com/ROCm/builder/pull/70/files

Validation

http://ml-ci-internal.amd.com:8080/job/pytorch/job/manylinux_rocm_wheels/568/

---------

Co-authored-by: Jithun Nair <jithun.nair@amd.com>
Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
(cherry picked from commit 14c1417d4b13abb77db5fcba521791a9b6086737)
(cherry picked from commit c20a8f8632a447adc3ed369a5f85ea87a2b518f0)
---
 .circleci/scripts/binary_populate_env.sh | 42 ------------------------
 .github/scripts/build_triton_wheel.py    |  5 ++-
 CMakeLists.txt                           |  6 ++--
 3 files changed, 5 insertions(+), 48 deletions(-)

diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh
index f876ac8efcf7f..e64a690af1d6a 100755
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@@ -69,48 +69,6 @@ fi
 
 export PYTORCH_BUILD_NUMBER=1
 
-# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
-TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
-TRITON_CONSTRAINT="platform_system == 'Linux'"
-
-if [[ "$PACKAGE_TYPE" =~ .*wheel.* &&  -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
-  TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
-  if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
-      TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
-      TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
-  fi
-  export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
-fi
-
-# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
-if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
-    TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
-    if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
-        TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
-        TRITON_REQUIREMENT="triton==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
-    fi
-    if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
-        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
-    else
-        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
-    fi
-fi
-
-# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package
-if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
-    TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_xpu_version.txt)
-    TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}"
-    if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
-        TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt)
-        TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+git${TRITON_SHORTHASH}"
-    fi
-    if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
-        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
-    else
-        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
-    fi
-fi
-
 USE_GLOO_WITH_OPENSSL="ON"
 if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then
   USE_GLOO_WITH_OPENSSL="OFF"
diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py
index 16eb6a7851974..e541e7a86f653 100644
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@@ -96,7 +96,7 @@ def build_triton(
         # Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8
         # while release build should only include the version, i.e. 2.1.0
         rocm_version = get_rocm_version()
-        version_suffix = f"+rocm{rocm_version}_{commit_hash[:10]}"
+        version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}"
         version += version_suffix
 
     with TemporaryDirectory() as tmpdir:
@@ -124,6 +124,7 @@ def build_triton(
 
         # change built wheel name and version
         env["TRITON_WHEEL_NAME"] = triton_pkg_name
+        env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix
         if with_clang_ldd:
             env["TRITON_BUILD_WITH_CLANG_LLD"] = "1"
 
@@ -139,8 +140,6 @@ def build_triton(
                 cwd=triton_basedir,
                 shell=True,
             )
-            cur_rocm_ver = get_rocm_version()
-            check_call(["scripts/amd/setup_rocm_libs.sh", cur_rocm_ver], cwd=triton_basedir)
             print("ROCm libraries setup for triton installation...")
 
         # old triton versions have setup.py in the python/ dir,
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b88247df27a5..991ea336a175b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,11 +56,11 @@ set(CMAKE_C_STANDARD
 # ---[ Utils
 include(cmake/public/utils.cmake)
 
-# --- [ Check that minimal gcc version is 9.3+
-if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.3)
+# --- [ Check that minimal gcc version is 9.2+
+if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.2)
   message(
     FATAL_ERROR
-      "GCC-9.3 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}"
+      "GCC-9.2 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}"
   )
 endif()
 

From 70a5f9fa212250a2703c1b6cfdd474fb05269fb5 Mon Sep 17 00:00:00 2001
From: Jagadish Krishnamoorthy <jagadish.krishnamoorthy@amd.com>
Date: Fri, 18 Apr 2025 18:56:37 -0700
Subject: [PATCH 07/43] ROCm: Enable tf32 testing on test_nn (#55)

* Add trailing comma for consistency in gfx architecture list

Signed-off-by: Jagadish Krishnamoorthy <jagadish.krishnamoorthy@amd.com>

* ROCm: Enable tf32 testing on test_nn

Signed-off-by: Jagadish Krishnamoorthy <jagadish.krishnamoorthy@amd.com>

---------

Signed-off-by: Jagadish Krishnamoorthy <jagadish.krishnamoorthy@amd.com>
(cherry picked from commit c113e1482c68c9bcf11b43351e9187003747804e)
---
 torch/testing/_internal/common_cuda.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py
index 74dfe0c56c232..3f475bd6823b5 100644
--- a/torch/testing/_internal/common_cuda.py
+++ b/torch/testing/_internal/common_cuda.py
@@ -192,6 +192,9 @@ def tf32_off():
 
 @contextlib.contextmanager
 def tf32_on(self, tf32_precision=1e-5):
+    if torch.version.hip:
+        hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None)
+        os.environ["HIPBLASLT_ALLOW_TF32"] = "1"
     old_allow_tf32_matmul = torch.backends.cuda.matmul.allow_tf32
     old_precision = self.precision
     try:
@@ -200,6 +203,11 @@ def tf32_on(self, tf32_precision=1e-5):
         with torch.backends.cudnn.flags(enabled=None, benchmark=None, deterministic=None, allow_tf32=True):
             yield
     finally:
+        if torch.version.hip:
+            if hip_allow_tf32 is not None:
+                os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32
+            else:
+                del os.environ["HIPBLASLT_ALLOW_TF32"]
         torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32_matmul
         self.precision = old_precision
 

From d14e5a9ee5356c4dd64e485a0e6db33235752a9a Mon Sep 17 00:00:00 2001
From: omkar kakarparthi <75638701+okakarpa@users.noreply.github.com>
Date: Tue, 13 May 2025 13:14:36 -0500
Subject: [PATCH 08/43] [AUTOGENERATED] [rocm6.5_internal_testing] Remove
 --no-index and --no-deps flags (#2121)

Cherry-pick of https://github.com/ROCm/pytorch/pull/2103

Co-authored-by: Ethan Wee <Ethan.Wee@amd.com>
(cherry picked from commit 1dea6e825b91409af44d873d025eae9d6f578e7c)
---
 .ci/pytorch/common_utils.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh
index ff9d8ad41cc92..9c9d223777466 100644
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@@ -67,13 +67,13 @@ function pip_install_whl() {
     # Loop through each path and install individually
     for path in "${paths[@]}"; do
       echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
+      python3 -mpip install "$path"
     done
   else
     # Loop through each argument and install individually
     for path in "${args[@]}"; do
       echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
+      python3 -mpip install "$path"
     done
   fi
 }

From b296de5aa10132f97dbbe8f6741df07253c49885 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Tue, 10 Jun 2025 07:08:01 -0700
Subject: [PATCH 09/43] [rocm7.0_internal_testing] upgrading numpy (#2256)

Relates to: https://github.com/ROCm/builder/pull/82

Validation:
http://rocm-ci.amd.com/job/mainline-pytorch_internal-manylinux-wheels/98/

Using
`registry-sc-harbor.amd.com/framework/compute-rocm-dkms-no-npi-hipclang:16180_ubuntu24.04_py3.12_pytorch_lw_rocm7.0_IT_upgrade_numpy_452f3df6`:
```
root@d92befdbb2a6:/# pip list | egrep "numpy|pandas"
numpy                   2.1.2
pandas                  2.2.3
root@d92befdbb2a6:/# python3
Python 3.12.3 (main, Feb  4 2025, 14:48:35) [GCC 13.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import pandas
>>> import torch
>>> import numpy
>>> exit()
root@d92befdbb2a6:/data/pytorch-micro-benchmarking# HIP_VISIBLE_DEVICES=1 python3 micro_benchmarking_pytorch.py --network resnet50
INFO: running forward and backward for warmup.
INFO: running the benchmark..
OK: finished running benchmark..
--------------------SUMMARY--------------------------
Microbenchmark for network : resnet50
Num devices: 1
Dtype: FP32
Mini batch size [img] : 64
Time per mini-batch : 0.11369450092315674
Throughput [img/sec] : 562.9120096428937
```

---------

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
(cherry picked from commit cf324795568f35f91a78fae9cb8f0efc8d955351)
---
 .ci/docker/requirements-ci.txt | 14 ++++++--------
 requirements-build.txt         |  2 +-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index d44dfb1ed67ae..47c9a44a775bb 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -117,10 +117,10 @@ ninja==1.11.1.4
 #Pinned versions: 1.11.1.4
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
 
-numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
-numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
+numba==0.60.0 ; python_version == "3.9"
+numba==0.61.2 ; python_version > "3.9"
 #Description: Just-In-Time Compiler for Numerical Functions
-#Pinned versions: 0.55.2, 0.60.0
+#Pinned versions: 0.61.2, 0.60.0
 #test that import: test_numba_integration.py
 #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
 
@@ -136,12 +136,10 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
-numpy==1.22.4; python_version == "3.10"
-numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
-numpy==2.1.2; python_version >= "3.13"
+numpy==2.0.2; python_version == "3.9"
+numpy==2.1.2; python_version >= "3.9"
 
-pandas==2.0.3; python_version < "3.13"
-pandas==2.2.3; python_version >= "3.13"
+pandas==2.2.3
 
 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
diff --git a/requirements-build.txt b/requirements-build.txt
index 85923ae39cbdb..170868e57db1a 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -2,7 +2,7 @@
 setuptools>=70.1.0
 cmake>=3.27
 ninja
-numpy
+numpy==2.1.2
 packaging
 pyyaml
 requests

From a016608019aac6fa721d57523f548d2d1f199778 Mon Sep 17 00:00:00 2001
From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Date: Thu, 12 Jun 2025 23:32:43 -0500
Subject: [PATCH 10/43] [rocm7.0_internal_testing] Use different package
 versions for py3.9 (#2269)

Fixes SWDEV-536456

Fixes the following error, introduced by https://github.com/ROCm/pytorch/pull/2256:
```
00:12:44.248  #22 155.3 ERROR: Ignored the following versions that require a different python version: 0.52.0 Requires-Python >=3.6,<3.9; 0.52.0rc3 Requires-Python >=3.6,<3.9; 0.61.0 Requires-Python >=3.10; 0.61.0rc1 Requires-Python >=3.10; 0.61.0rc2 Requires-Python >=3.10; 0.61.1rc1 Requires-Python >=3.10; 0.61.2 Requires-Python >=3.10; 3.3 Requires-Python >=3.10; 3.3rc0 Requires-Python >=3.10; 3.4 Requires-Python >=3.10; 3.4.1 Requires-Python >=3.10; 3.4.2 Requires-Python >=3.10; 3.4rc0 Requires-Python >=3.10; 3.5 Requires-Python >=3.11; 3.5rc0 Requires-Python >=3.11; 8.2.0 Requires-Python >=3.10; 8.2.1 Requires-Python >=3.10
00:12:44.248  #22 155.3 ERROR: Could not find a version that satisfies the requirement numba==0.61.2 (from versions: 0.1, 0.2, 0.3, 0.5.0, 0.6.0, 0.7.0, 0.7.1, 0.7.2, 0.8.0, 0.8.1, 0.9.0, 0.10.0, 0.10.1, 0.11.0, 0.12.0, 0.12.1, 0.12.2, 0.13.0, 0.13.2, 0.13.3, 0.13.4, 0.14.0, 0.15.1, 0.16.0, 0.17.0, 0.18.1, 0.18.2, 0.19.1, 0.19.2, 0.20.0, 0.21.0, 0.22.0, 0.22.1, 0.23.0, 0.23.1, 0.24.0, 0.25.0, 0.26.0, 0.27.0, 0.28.1, 0.29.0, 0.30.0, 0.30.1, 0.31.0, 0.32.0, 0.33.0, 0.34.0, 0.35.0, 0.36.1, 0.36.2, 0.37.0, 0.38.0, 0.38.1, 0.39.0, 0.40.0, 0.40.1, 0.41.0, 0.42.0, 0.42.1, 0.43.0, 0.43.1, 0.44.0, 0.44.1, 0.45.0, 0.45.1, 0.46.0, 0.47.0, 0.48.0, 0.49.0, 0.49.1rc1, 0.49.1, 0.50.0rc1, 0.50.0, 0.50.1, 0.51.0rc1, 0.51.0, 0.51.1, 0.51.2, 0.52.0rc2, 0.53.0rc1.post1, 0.53.0rc2, 0.53.0rc3, 0.53.0, 0.53.1, 0.54.0rc2, 0.54.0rc3, 0.54.0, 0.54.1rc1, 0.54.1, 0.55.0rc1, 0.55.0, 0.55.1, 0.55.2, 0.56.0rc1, 0.56.0, 0.56.2, 0.56.3, 0.56.4, 0.57.0rc1, 0.57.0, 0.57.1rc1, 0.57.1, 0.58.0rc1, 0.58.0rc2, 0.58.0, 0.58.1, 0.59.0rc1, 0.59.0, 0.59.1, 0.60.0rc1, 0.60.0)
00:12:44.248  #22 155.3 ERROR: No matching distribution found for numba==0.61.2
```

Validation:
* Docker image:
http://rocm-ci.amd.com/job/mainline-framework-pytorch-internal-cs9-ci/132
* Wheels:
http://rocm-ci.amd.com/job/mainline-pytorch_internal-manylinux-wheels/102/

From
`registry-sc-harbor.amd.com/framework/compute-rocm-dkms-no-npi-hipclang:16180_ubuntu22.04_py3.9_pytorch_lw_rocm7.0_IT_py3.9_a11d94ad`:
```
root@f43861a0a856:/# pip list | egrep "numpy|pandas"
numpy                   2.0.2
pandas                  2.2.3
root@f43861a0a856:/# python
Python 3.9.23 (main, Jun  4 2025, 08:55:38)
[GCC 11.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> import numpy
>>> import pandas
root@f43861a0a856:/data/pytorch-micro-benchmarking# HIP_VISIBLE_DEVICES=1 python3 micro_benchmarking_pytorch.py --network resnet50
INFO: running forward and backward for warmup.
INFO: running the benchmark..
OK: finished running benchmark..
--------------------SUMMARY--------------------------
Microbenchmark for network : resnet50
Num devices: 1
Dtype: FP32
Mini batch size [img] : 64
Time per mini-batch : 0.11354223489761353
Throughput [img/sec] : 563.6669038416574
```

(cherry picked from commit a0a9d816537b921a3d82edf77d103fbe0fbb5fe1)
---
 .ci/docker/requirements-ci.txt | 10 +++++-----
 requirements-build.txt         |  3 ++-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 47c9a44a775bb..0081762c320d3 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -120,7 +120,7 @@ ninja==1.11.1.4
 numba==0.60.0 ; python_version == "3.9"
 numba==0.61.2 ; python_version > "3.9"
 #Description: Just-In-Time Compiler for Numerical Functions
-#Pinned versions: 0.61.2, 0.60.0
+#Pinned versions: 0.60.0, 0.61.2
 #test that import: test_numba_integration.py
 #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
 
@@ -136,8 +136,8 @@ numba==0.61.2 ; python_version > "3.9"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
-numpy==2.0.2; python_version == "3.9"
-numpy==2.1.2; python_version >= "3.9"
+numpy==2.0.2 ; python_version == "3.9"
+numpy==2.1.2 ; python_version > "3.9"
 
 pandas==2.2.3
 
@@ -249,8 +249,8 @@ scikit-image==0.22.0
 #Pinned versions: 0.20.3
 #test that import:
 
-scipy==1.10.1 ; python_version <= "3.11"
-scipy==1.14.1 ; python_version >= "3.12"
+scipy==1.13.1 ; python_version == "3.9"
+scipy==1.14.1 ; python_version > "3.9"
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
 #Pinned versions: 1.10.1
diff --git a/requirements-build.txt b/requirements-build.txt
index 170868e57db1a..25f1e47a14968 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -2,7 +2,8 @@
 setuptools>=70.1.0
 cmake>=3.27
 ninja
-numpy==2.1.2
+numpy==2.0.2 ; python_version == "3.9""
+numpy==2.1.2 ; python_version > "3.9"
 packaging
 pyyaml
 requests

From 451567ee3cbc55cd65a4c140e66853a0557c6549 Mon Sep 17 00:00:00 2001
From: Dmitry Nikolaev <139769634+dnikolaev-amd@users.noreply.github.com>
Date: Mon, 30 Jun 2025 19:01:04 +0200
Subject: [PATCH 11/43] [rocm7.0_internal_testing] fix enabling sparse tests
 fp16/bf16 for rocm7.0/7.1 (#2239)

Revamped version of #2108

PR to:
- enable complex data types for sparse matmul on ROCm
- fix sparse addmm/baddbmm on ROCm
- fix sparse hipification for ROCm
- fix/enable sparse tests on ROCm (~50 tests total for non-fp16/bf16; see the list below)
- enable fp16/bf16 sparse path for rocm7.0
- enable fp16/bf16 sparse tests for rocm7.0/7.1
```
test_sparse_csr.py::TestSparseCSRCUDA::test_bmm_cuda_*
test_sparse.py::TestSparseCUDA::test_sparse_matmul_cuda_*
test_sparse_csr.py::TestSparseCSRCUDA::test_mm_cuda_float64
test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_all_sparse_csr_SparseCS*
test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_sizes_all_sparse_csr_*
test_sparse_csr.py::TestSparseCSRCUDA::test_sparse_addmm_cuda_float16
```

(cherry picked from commit cc2a69c6b6c35e6024f40189ceebe530c1968810)
---
 .../ATen/native/sparse/cuda/SparseMatMul.cu   | 20 +++++++++++++++++++
 test/test_sparse.py                           |  6 ++++++
 test/test_sparse_csr.py                       |  7 ++++---
 torch/utils/hipify/cuda_to_hip_mappings.py    |  8 ++++++++
 4 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
index 49bea10c65104..8402555a5c340 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
@@ -40,7 +40,27 @@
 #include <thrust/iterator/discard_iterator.h>
 
 
+#if defined(__CUDACC__) && (defined(CUSPARSE_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60300))
+#define IS_CUSPARSE11_AVAILABLE() 1
+#else
+#define IS_CUSPARSE11_AVAILABLE() 0
+#endif
+
+#if defined(USE_ROCM) && (ROCM_VERSION >= 70000)
+#define HIPSPARSE_FP16_SUPPORT 1
+#else
+#define HIPSPARSE_FP16_SUPPORT 0
+#endif
+
+#if defined(USE_ROCM) && (ROCM_VERSION >= 70100)
+#define HIPSPARSE_FP16_BF16_SUPPORT 1
+#else
+#define HIPSPARSE_FP16_BF16_SUPPORT 0
+#endif
+
+#if IS_CUSPARSE11_AVAILABLE()
 #include <library_types.h>
+#endif
 
 namespace at::native {
 
diff --git a/test/test_sparse.py b/test/test_sparse.py
index 5150dab4b7cf1..eb6877b419d0b 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -69,6 +69,12 @@ def _op_supports_any_sparse(op):
 ) or (not IS_WINDOWS and not TEST_WITH_ROCM)
 
 HIPSPARSE_SPMM_COMPLEX128_SUPPORTED = torch.version.hip and version.parse(torch.version.hip.split("-")[0]) >= version.parse("6.0")
+HIPSPARSE_FP16_SUPPORTED = torch.version.hip and version.parse(torch.version.hip.split("-")[0]) >= version.parse("7.0")
+HIPSPARSE_BF16_SUPPORTED = torch.version.hip and version.parse(torch.version.hip.split("-")[0]) >= version.parse("7.1")
+
+SPARSE_COMPLEX128_SUPPORTED = CUSPARSE_SPMM_COMPLEX128_SUPPORTED or HIPSPARSE_SPMM_COMPLEX128_SUPPORTED
+SPARSE_FLOAT16_SUPPORTED = (SM53OrLater and torch.version.cuda) or (HIPSPARSE_FP16_SUPPORTED)
+SPARSE_BFLOAT16_SUPPORTED = (SM80OrLater and torch.version.cuda) or (HIPSPARSE_BF16_SUPPORTED)
 
 def all_sparse_layouts(test_name='layout', include_strided=False):
     return parametrize(test_name, [
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index f84adcc7bd262..e1bfd3f146991 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -25,7 +25,8 @@
     all_types_and_complex, floating_and_complex_types_and)
 from torch.testing._internal.opinfo.definitions.linalg import sample_inputs_linalg_solve
 from torch.testing._internal.opinfo.definitions.sparse import validate_sample_input_sparse
-from test_sparse import CUSPARSE_SPMM_COMPLEX128_SUPPORTED, HIPSPARSE_SPMM_COMPLEX128_SUPPORTED
+from test_sparse import HIPSPARSE_BF16_SUPPORTED, HIPSPARSE_FP16_SUPPORTED, \
+    SPARSE_FLOAT16_SUPPORTED, SPARSE_BFLOAT16_SUPPORTED, SPARSE_COMPLEX128_SUPPORTED
 import operator
 
 if TEST_SCIPY:
@@ -1940,8 +1941,8 @@ def test_shape(d1, d2, d3, nnz, transposed, index_dtype):
 
     @dtypes(*floating_and_complex_types())
     @dtypesIfCUDA(*floating_and_complex_types_and(
-                  *[torch.half] if SM53OrLater and TEST_CUSPARSE_GENERIC else [],
-                  *[torch.bfloat16] if SM80OrLater and TEST_CUSPARSE_GENERIC else []))
+                  *[torch.half] if SPARSE_FLOAT16_SUPPORTED else [],
+                  *[torch.bfloat16] if SPARSE_BFLOAT16_SUPPORTED else []))
     @precisionOverride({torch.bfloat16: 3.5e-2, torch.float16: 1e-2})
     def test_sparse_addmm(self, device, dtype):
         def test_shape(m, n, p, nnz, broadcast, index_dtype, alpha_beta=None):
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index 82547c8e28540..12e1a1209c2cd 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -8593,6 +8593,14 @@
             "CUSPARSE_STATUS_ZERO_PIVOT",
             ("HIPSPARSE_STATUS_ZERO_PIVOT", CONV_NUMERIC_LITERAL, API_SPECIAL),
         ),
+        (
+            "CUSPARSE_STATUS_NOT_SUPPORTED",
+            ("HIPSPARSE_STATUS_NOT_SUPPORTED", CONV_NUMERIC_LITERAL, API_SPECIAL),
+        ),
+        (
+            "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES",
+            ("HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES", CONV_NUMERIC_LITERAL, API_SPECIAL),
+        ),
         (
             "CUSPARSE_OPERATION_TRANSPOSE",
             ("HIPSPARSE_OPERATION_TRANSPOSE", CONV_NUMERIC_LITERAL, API_SPECIAL),

From c2d4e9983457b2aa14b832ed441d0776be706a0c Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Wed, 9 Jul 2025 11:59:44 -0700
Subject: [PATCH 12/43] [rocm7.0_internal_testing] upgrade tensorboard
 compatible with numpy 2 (#2326)

Fixes https://ontrack-internal.amd.com/browse/SWDEV-541809

Upgrade tensorboard to 2.18.0 so that it is compatible with NumPy 2 after the numpy upgrade.
Validated in
**registry-sc-harbor.amd.com/framework/compute-rocm-dkms-no-npi-hipclang:16381_ubuntu24.04_py3.12_pytorch_lw_rocm7.0_internal_testing_afe8b782**

```
    7  git checkout rocm7.0_IT_upgrade_tensorboard
    8  pip install .ci/docker/requirements-ci.txt
    9  pip install -r .ci/docker/requirements-ci.txt
   10  PYTORCH_TEST_WITH_ROCM=1 python test/test_monitor.py TestMonitorTensorboard.test_event_handler

root@ubb4-rack-22:/var/lib/jenkins/pytorch# PYTORCH_TEST_WITH_ROCM=1 python test/test_monitor.py TestMonitorTensorboard.test_event_handler
/opt/venv/lib/python3.12/site-packages/google/protobuf/internal/well_known_types.py:91: DeprecationWarning: datetime.datetime.utcfromtimestamp() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.fromtimestamp(timestamp, datetime.UTC).
  _EPOCH_DATETIME_NAIVE = datetime.datetime.utcfromtimestamp(0)
.
----------------------------------------------------------------------
Ran 1 test in 0.327s

OK
root@ubb4-rack-22:/var/lib/jenkins/pytorch#

```

(cherry picked from commit c7f61f42059a9a56956190eef5ef4e5f6f127038)
---
 .ci/docker/requirements-ci.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 0081762c320d3..93d32b803b199 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -309,8 +309,7 @@ z3-solver==4.15.1.0 ; platform_machine != "s390x"
 #Pinned versions:
 #test that import:
 
-tensorboard==2.13.0 ; python_version < "3.13"
-tensorboard==2.18.0 ; python_version >= "3.13"
+tensorboard==2.18.0
 #Description: Also included in .ci/docker/requirements-docs.txt
 #Pinned versions:
 #test that import: test_tensorboard

From 30a23df8a5bac99b5778a6d45a8d81b59aa1630f Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Fri, 15 Aug 2025 13:28:53 -0700
Subject: [PATCH 13/43] [rocm7.1_internal_testing] Pin requirements (#2526)

Tested successfully locally:
```
root@rocm-framework-47:/var/lib/jenkins/pytorch# pip install -r requirements.txt
Ignoring numpy: markers 'python_version == "3.9"' don't match your environment
Requirement already satisfied: setuptools<80.0,>=70.1.0 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 2)) (79.0.1)
Requirement already satisfied: cmake>=3.31.4 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 3)) (4.0.0)
Requirement already satisfied: ninja==1.11.1.3 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 4)) (1.11.1.3)
Requirement already satisfied: numpy==2.1.2 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 5)) (2.1.2)
Requirement already satisfied: packaging==25.0 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 6)) (25.0)
Requirement already satisfied: pyyaml==6.0.2 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 7)) (6.0.2)
Requirement already satisfied: requests==2.32.4 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 8)) (2.32.4)
Requirement already satisfied: six==1.17.0 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 9)) (1.17.0)
Requirement already satisfied: typing-extensions==4.14.1 in /opt/venv/lib/python3.10/site-packages (from -r /var/lib/jenkins/pytorch/requirements-build.txt (line 10)) (4.14.1)
Requirement already satisfied: expecttest==0.3.0 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 8)) (0.3.0)
Requirement already satisfied: filelock==3.18.0 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 9)) (3.18.0)
Requirement already satisfied: fsspec==2025.7.0 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 10)) (2025.7.0)
Requirement already satisfied: hypothesis==5.35.1 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 11)) (5.35.1)
Requirement already satisfied: jinja2==3.1.6 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 12)) (3.1.6)
Requirement already satisfied: lintrunner==0.12.7 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 13)) (0.12.7)
Requirement already satisfied: networkx==2.8.8 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 14)) (2.8.8)
Requirement already satisfied: optree==0.13.0 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 18)) (0.13.0)
Requirement already satisfied: psutil==7.0.0 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 19)) (7.0.0)
Requirement already satisfied: sympy==1.13.3 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 20)) (1.13.3)
Requirement already satisfied: wheel==0.45.1 in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 22)) (0.45.1)
Requirement already satisfied: build[uv] in /opt/venv/lib/python3.10/site-packages (from -r requirements.txt (line 7)) (1.3.0)
Requirement already satisfied: charset_normalizer<4,>=2 in /opt/venv/lib/python3.10/site-packages (from requests==2.32.4->-r /var/lib/jenkins/pytorch/requirements-build.txt (line 8)) (3.4.3)
Requirement already satisfied: idna<4,>=2.5 in /opt/venv/lib/python3.10/site-packages (from requests==2.32.4->-r /var/lib/jenkins/pytorch/requirements-build.txt (line 8)) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/venv/lib/python3.10/site-packages (from requests==2.32.4->-r /var/lib/jenkins/pytorch/requirements-build.txt (line 8)) (2.5.0)
Requirement already satisfied: certifi>=2017.4.17 in /opt/venv/lib/python3.10/site-packages (from requests==2.32.4->-r /var/lib/jenkins/pytorch/requirements-build.txt (line 8)) (2025.8.3)
Requirement already satisfied: attrs>=19.2.0 in /opt/venv/lib/python3.10/site-packages (from hypothesis==5.35.1->-r requirements.txt (line 11)) (25.3.0)
Requirement already satisfied: sortedcontainers<3.0.0,>=2.1.0 in /opt/venv/lib/python3.10/site-packages (from hypothesis==5.35.1->-r requirements.txt (line 11)) (2.4.0)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/venv/lib/python3.10/site-packages (from jinja2==3.1.6->-r requirements.txt (line 12)) (3.0.2)
Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/venv/lib/python3.10/site-packages (from sympy==1.13.3->-r requirements.txt (line 20)) (1.3.0)
Requirement already satisfied: pyproject_hooks in /opt/venv/lib/python3.10/site-packages (from build[uv]->-r requirements.txt (line 7)) (1.2.0)
Requirement already satisfied: tomli>=1.1.0 in /opt/venv/lib/python3.10/site-packages (from build[uv]->-r requirements.txt (line 7)) (2.2.1)
Requirement already satisfied: uv>=0.1.18 in /opt/venv/lib/python3.10/site-packages (from build[uv]->-r requirements.txt (line 7)) (0.8.10)
root@rocm-framework-47:/var/lib/jenkins/pytorch# pip install -r requirements-build.txt

```

(cherry picked from commit 6e6e45424f4ba643f9f642f69206f6d1df4e9192)
---
 requirements-build.txt | 20 ++++++++++----------
 requirements.txt       | 27 +++++++++++++++------------
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/requirements-build.txt b/requirements-build.txt
index 25f1e47a14968..f2edf387fb97a 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -1,12 +1,12 @@
 # Build System requirements
-setuptools>=70.1.0
-cmake>=3.27
-ninja
-numpy==2.0.2 ; python_version == "3.9""
-numpy==2.1.2 ; python_version > "3.9"
-packaging
-pyyaml
-requests
-six  # dependency chain: NNPACK -> PeachPy -> six
-typing-extensions>=4.10.0
 pip  # not technically needed, but this makes setup.py invocation work
+setuptools>=70.1.0,<80.0  # setuptools develop deprecated on 80.0
+cmake>=3.31.4
+ninja==1.11.1.3
+numpy==2.0.2 ; python_version == "3.9"
+numpy==2.1.2 ; python_version > "3.9"
+packaging==25.0
+pyyaml==6.0.2
+requests==2.32.4
+six==1.17.0  # dependency chain: NNPACK -> PeachPy -> six
+typing-extensions==4.14.1
diff --git a/requirements.txt b/requirements.txt
index fc4b53dfd49ea..090a733726658 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,15 +5,18 @@
 
 # Install / Development extra requirements
 build[uv]  # for building sdist and wheel
-expecttest>=0.3.0
-filelock
-fsspec>=0.8.5
-hypothesis
-jinja2
-lintrunner ; platform_machine != "s390x" and platform_machine != "riscv64"
-networkx>=2.5.1
-optree>=0.13.0
-psutil
-sympy>=1.13.3
-typing-extensions>=4.13.2
-wheel
+expecttest==0.3.0
+filelock==3.18.0
+fsspec==2025.7.0
+hypothesis==5.35.1
+jinja2==3.1.6
+lintrunner==0.12.7 ; platform_machine != "s390x"
+networkx==2.8.8
+ninja==1.11.1.3
+numpy==2.0.2 ; python_version == "3.9"
+numpy==2.1.2 ; python_version > "3.9"
+optree==0.13.0
+psutil==7.0.0
+sympy==1.13.3
+typing-extensions==4.14.1
+wheel==0.45.1

From 11ca2d04427c6b863da3d06fa35f40e8bbddf995 Mon Sep 17 00:00:00 2001
From: Xinya Zhang <Xinya.Zhang@amd.com>
Date: Wed, 20 Dec 2023 16:26:17 -0600
Subject: [PATCH 14/43] Enable gesvda for ROCM >= 6.1 (#1339)

This also fixes a problem in the gesvd driver when U and V are not needed (compute_uv is false).

(cherry picked from commit 4ce57ecb3665796ff737d23cccba314c7ea12e92)
(cherry picked from commit 167b4c1c24abc61968b776a30db6794065de5625)
---
 .../native/cuda/linalg/BatchLinearAlgebraLib.cpp   | 14 +++++++-------
 aten/src/ATen/native/cuda/linalg/CUDASolver.cpp    |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
index 267d1f5acea52..5b28cc6eccf01 100644
--- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
+++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
@@ -332,11 +332,11 @@ static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S
   // gesvd just knows how to handle m >= n, so in the other case we need to transpose A
   const auto not_A_H = A.size(-2) >= A.size(-1);
   Tensor Vcopy = V; // Shallow copy
-#ifdef USE_ROCM
+#ifdef ROCM_VERSION
   // Similar to the case in svd_magma(), experiments have shown Vh tensor is
   // not guaranteed to be column major on ROCM, we have to create a copy to
   // deal with this
-  if (!not_A_H) {
+  if (compute_uv && !not_A_H) {
     Vcopy = at::empty_like(V.mT(),
                            V.options()
                            .device(V.device())
@@ -351,8 +351,8 @@ static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S
                                        infos,
                                        full_matrices, compute_uv, calculate_all_batches, batches);
   });
-#ifdef USE_ROCM
-  if (!not_A_H) {
+#ifdef ROCM_VERSION
+  if (compute_uv && !not_A_H) {
     V.copy_(Vcopy);
   }
 #endif
@@ -526,8 +526,8 @@ static void svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const T
 template<typename scalar_t>
 static void apply_svd_cusolver_gesvdaStridedBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv) {
-#ifndef CUDART_VERSION
-  TORCH_CHECK(false, "gesvda: Batched version is supported only with cuBLAS backend.")
+#if defined(CUDART_VERSION) || defined(USE_ROCM) && ROCM_VERSION < 60100
+  TORCH_CHECK(false, "gesvda: Batched version is supported only with cuBLAS backend or ROCM >= 5.7.0.")
 #else
   using value_t = typename c10::scalar_value_type<scalar_t>::type;
   int m = cuda_int_cast(A.size(-2), "m");
@@ -665,7 +665,7 @@ void svd_cusolver(const Tensor& A,
   static constexpr const char* check_svd_doc = "Check doc at https://pytorch.org/docs/stable/generated/torch.linalg.svd.html";
 
   // The default heuristic is to use gesvdj driver
-#ifdef USE_ROCM
+#if defined(ROCM_VERSION) && ROCM_VERSION < 60100
   const auto driver_v = std::string_view("gesvdj");
 #else
   const auto driver_v = driver.value_or("gesvdj");
diff --git a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp
index 99c38077611d6..af183038bb8e4 100644
--- a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp
+++ b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp
@@ -470,8 +470,8 @@ void gesvdjBatched<c10::complex<double>>(
 }
 
 
-// ROCM does not implement gesdva yet
-#ifdef CUDART_VERSION
+// ROCM does not implement gesvda correctly before 6.1
+#if defined(CUDART_VERSION) || defined(ROCM_VERSION) && ROCM_VERSION >= 60100
 template<>
 void gesvdaStridedBatched_buffersize<float>(
     cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, float *A, int lda, long long int strideA,

From 629e82400d76d7701c10debc8a893db691d0f57d Mon Sep 17 00:00:00 2001
From: Pruthvi Madugundu <pruthvigithub@gmail.com>
Date: Tue, 12 Mar 2024 09:56:58 -0700
Subject: [PATCH 15/43] Remove ROCmloops specific test

(cherry picked from commit d6879fa6b482188ccfc54a8ea8be272a35fc7cdf)
(cherry picked from commit 123a16486958b171b538126ef4fe0045464a505e)
---
 aten/src/ATen/test/cuda_vectorized_test.cu | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/aten/src/ATen/test/cuda_vectorized_test.cu b/aten/src/ATen/test/cuda_vectorized_test.cu
index e4c18102526ac..1b3ed4dc4ac42 100644
--- a/aten/src/ATen/test/cuda_vectorized_test.cu
+++ b/aten/src/ATen/test/cuda_vectorized_test.cu
@@ -32,23 +32,6 @@ void reset_buffers() {
   }
 }
 
-#if defined(USE_ROCM) && !defined(_WIN32)
-TEST(TestLoops, HasSameArgTypes) {
-  // This is a compile-time unit test. If this file compiles without error,
-  // then the test passes and during runtime, we just need to return.
-  using namespace at::native::modern::detail;
-  using func1_t = int (*)(float, float);
-  using func2_t = int (*)(bool, float, float);
-  using func3_t = int (*)(float);
-  using func4_t = int (*)();
-  static_assert(has_same_arg_types<func1_t>::value, "func1_t has the same argument types");
-  static_assert(!has_same_arg_types<func2_t>::value, "func2_t does not have the same argument types");
-  static_assert(has_same_arg_types<func3_t>::value, "func3_t has the same argument types");
-  static_assert(has_same_arg_types<func4_t>::value, "func4_t has the same argument types");
-  return;
-}
-#endif
-
 TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
   char *ptr = reinterpret_cast<char *>(buffer1);
 

From ab4714d0aa0e18ebf4c3f54f762df66cca9d5afe Mon Sep 17 00:00:00 2001
From: Prachi Gupta <prachi.gupta@amd.com>
Date: Wed, 29 Oct 2025 17:41:21 +0000
Subject: [PATCH 16/43] Bump triton to 3.5.x and update related_commits

---
 .ci/docker/ci_commit_pins/triton.txt |  2 +-
 related_commits                      | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 related_commits

diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt
index 10f1207e60e6c..d893bdd32ab34 100644
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@@ -1 +1 @@
-7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
+ac80c4190aa0321f761a08af97e1e1eee41f01d9
diff --git a/related_commits b/related_commits
new file mode 100644
index 0000000000000..ee36e55601d0f
--- /dev/null
+++ b/related_commits
@@ -0,0 +1,10 @@
+ubuntu|pytorch|apex|master|2190fbaeb88384ed792373adbb83c182af117ca0|https://github.com/ROCm/apex
+centos|pytorch|apex|master|2190fbaeb88384ed792373adbb83c182af117ca0|https://github.com/ROCm/apex
+ubuntu|pytorch|torchvision|main|218d2ab791d437309f91e0486eb9fa7f00badc17|https://github.com/pytorch/vision
+centos|pytorch|torchvision|main|218d2ab791d437309f91e0486eb9fa7f00badc17|https://github.com/pytorch/vision
+ubuntu|pytorch|torchdata|main|92950795e0790eb74df995daf40b658e85fd2c9f|https://github.com/pytorch/data
+centos|pytorch|torchdata|main|92950795e0790eb74df995daf40b658e85fd2c9f|https://github.com/pytorch/data
+ubuntu|pytorch|torchaudio|main|3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2|https://github.com/pytorch/audio
+centos|pytorch|torchaudio|main|3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2|https://github.com/pytorch/audio
+ubuntu|pytorch|ao|main|3577306c8b32517afe8eb6eb7e84335601180598|https://github.com/pytorch/ao
+centos|pytorch|ao|main|3577306c8b32517afe8eb6eb7e84335601180598|https://github.com/pytorch/ao

From 25366318c78a52a27e346e706827c57f8adaec90 Mon Sep 17 00:00:00 2001
From: Jagadish Krishnamoorthy <jagadish.krishnamoorthy@amd.com>
Date: Tue, 1 Apr 2025 09:19:40 -0700
Subject: [PATCH 17/43] Revert to prev sccache by ROCm

Signed-off-by: Jagadish Krishnamoorthy <jagadish.krishnamoorthy@amd.com>

(cherry picked from commit 1ad5bb95d796283d5f56ac1edd16f1731d24a49d)
(cherry picked from commit 519160d466782f5a62365be051fcb3ef90fa0b00)
---
 .ci/docker/common/install_cache.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.ci/docker/common/install_cache.sh b/.ci/docker/common/install_cache.sh
index f38cb3d06d88b..80839990e4e6f 100644
--- a/.ci/docker/common/install_cache.sh
+++ b/.ci/docker/common/install_cache.sh
@@ -36,7 +36,12 @@ sed -e 's|PATH="\(.*\)"|PATH="/opt/cache/bin:\1"|g' -i /etc/environment
 export PATH="/opt/cache/bin:$PATH"
 
 # Setup compiler cache
-install_ubuntu
+if [ -n "$ROCM_VERSION" ]; then
+  curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache
+else
+  install_ubuntu
+fi
+
 chmod a+x /opt/cache/bin/sccache
 
 function write_sccache_stub() {

From 777e73cd5d77edaf14d9f3834e9540e29d17a274 Mon Sep 17 00:00:00 2001
From: Prachi Gupta <pracgupt@amd.com>
Date: Mon, 3 Nov 2025 18:13:06 -0500
Subject: [PATCH 18/43] pytorch_ifu.yml: Change date format (#2776)

Fixes #ISSUE_NUMBER
---
 .github/workflows/pytorch_ifu.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pytorch_ifu.yml b/.github/workflows/pytorch_ifu.yml
index fe7439e2e7475..a06c567a61dcb 100644
--- a/.github/workflows/pytorch_ifu.yml
+++ b/.github/workflows/pytorch_ifu.yml
@@ -84,7 +84,7 @@ jobs:
         id: tag
         shell: bash
         run: |
-          DATE="$(date +"%Y-%m-%d")"
+          DATE="$(date +"%Y%m%d")"
           TAG="${DOWNSTREAM_BRANCH}_IFU_${DATE}"
           echo "TAG=${TAG}" >> $GITHUB_OUTPUT
           # Start from rocm branch

From 56002f4c285600e8d1232d9506d88de35d629b03 Mon Sep 17 00:00:00 2001
From: Prachi Gupta <pracgupt@amd.com>
Date: Tue, 4 Nov 2025 09:44:36 -0500
Subject: [PATCH 19/43] create_ifu_tag: updates from testing on rocm fork

- Need to use upstream/main for rocm/pytorch's develop branch. For
  release branches, `github.event.pull_request.base.ref` should work as
  is.

- Need to remove any trailing whitespace in the PR title so the branch
  name can be formed correctly

Fixes #ISSUE_NUMBER
---
 .github/workflows/create_ifu_tag.yml | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/create_ifu_tag.yml b/.github/workflows/create_ifu_tag.yml
index e54bb35e6982c..422b6d1c5ba67 100644
--- a/.github/workflows/create_ifu_tag.yml
+++ b/.github/workflows/create_ifu_tag.yml
@@ -40,6 +40,17 @@ jobs:
           HEAD_SHA="${{ github.event.pull_request.head.sha }}"
           MERGE_SHA="${{ github.event.pull_request.merge_commit_sha }}"
 
+          # For release branches, local branch is same as remote branch. But, for rocm/pytorch's
+          # develop branch, we want to use main branch upstream
+          if [ "$BASE_REF" == "develop" ]; then
+            BASE_REF="main"
+          fi
+          
+          echo "PR_NUM=$PR_NUM"                           
+          echo "BASE_REF=$BASE_REF"                       
+          echo "HEAD_SHA=$HEAD_SHA"                       
+          echo "MERGE_SHA=$MERGE_SHA"                     
+    
           # The ROCm base commit is the first parent of the merge commit that landed the PR
           # (i.e., the base branch tip BEFORE this PR merged).
           ROCM_BASE_SHA=$(git rev-parse "${MERGE_SHA}^1")
@@ -52,10 +63,6 @@ jobs:
           # between the PR head commit and upstream/main as fetched now.
           # This gives you the exact upstream commit (or the best common ancestor) that HEAD included.
           UPSTREAM_MAIN_SHA=$(git merge-base "${HEAD_SHA}" "upstream/$BASE_REF")
-          echo "PR_NUM=$PR_NUM"                           
-          echo "BASE_REF=$BASE_REF"                       
-          echo "HEAD_SHA=$HEAD_SHA"                       
-          echo "MERGE_SHA=$MERGE_SHA"                     
           echo "ROCM_BASE_SHA=$ROCM_BASE_SHA"            
           echo "UPSTREAM_MAIN_SHA=$UPSTREAM_MAIN_SHA"     
 
@@ -72,7 +79,8 @@ jobs:
         run: |
           TITLE="${{ github.event.pull_request.title }}"
           # Remove everything up to and including "[AUTOGENERATED]"
-          BASE_TAG=$(echo "$TITLE" | sed -E 's/^\[AUTOGENERATED\][[:space:]]*//')
+          # Remove trailing whitespace
+          BASE_TAG=$(echo "$TITLE" | sed -E 's/^\[AUTOGENERATED\][[:space:]]*//' | sed -E 's/[[:space:]]+$//')
 
           echo "BASE_TAG=$BASE_TAG" 
           echo "PRE_TAG=${BASE_TAG}_pre" 

From b4c1e1e712f007a80716b552319654f24435839b Mon Sep 17 00:00:00 2001
From: Prachi Gupta <prachi.gupta@amd.com>
Date: Tue, 4 Nov 2025 15:40:16 +0000
Subject: [PATCH 20/43] Fix merge conflict

---
 .ci/docker/requirements-ci.txt | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 9ed8345f0fa31..bdc34b4864cd7 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -136,21 +136,11 @@ numba==0.61.2 ; python_version > "3.9"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
-<<<<<<< HEAD
-numpy==2.0.2 ; python_version == "3.9"
-numpy==2.1.2 ; python_version > "3.9"
-
-pandas==2.2.3
-=======
-numpy==1.22.4; python_version == "3.10"
-numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
-numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
+numpy==2.1.2; python_version > "3.9" and python_version < "3.14"
 numpy==2.3.4; python_version >= "3.14"
 
-pandas==2.0.3; python_version < "3.13"
-pandas==2.2.3; python_version >= "3.13" and python_version < "3.14"
+pandas==2.2.3; python_version >= "3.9" and python_version < "3.14"
 pandas==2.3.3; python_version >= "3.14"
->>>>>>> upstream/main
 
 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
@@ -261,14 +251,8 @@ scikit-image==0.22.0
 #Pinned versions: 0.20.3
 #test that import:
 
-<<<<<<< HEAD
-scipy==1.13.1 ; python_version == "3.9"
-scipy==1.14.1 ; python_version > "3.9"
-=======
-scipy==1.10.1 ; python_version <= "3.11"
-scipy==1.14.1 ; python_version > "3.11" and python_version < "3.14"
+scipy==1.14.1 ; python_version > "3.9" and python_version < "3.14"
 scipy==1.16.2 ; python_version >= "3.14"
->>>>>>> upstream/main
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
 #Pinned versions: 1.10.1

From a3c49a95de48914e369aa08899a683c2db88ed5f Mon Sep 17 00:00:00 2001
From: AMD <amd@amd.com>
Date: Wed, 19 Nov 2025 02:54:48 +0000
Subject: [PATCH 21/43] Fix conflicts and move triton ver to 3.5.0

To keep triton version consistent with what is in rocm/triton's
release/internal/3.5.x branch, we need to keep triton_version.txt at
3.5.0 and move triton hash to ToT of that branch.
---
 .ci/docker/ci_commit_pins/triton.txt |  6 +-----
 .ci/docker/triton_version.txt        |  2 +-
 requirements.txt                     | 18 ------------------
 3 files changed, 2 insertions(+), 24 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt
index 2bc3043f3008f..8fcbc3de469f4 100644
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@@ -1,5 +1 @@
-<<<<<<< HEAD
-ac80c4190aa0321f761a08af97e1e1eee41f01d9
-=======
-bfeb066872bc1e8b2d2bc0a3b295b99dd77206e7
->>>>>>> upstream/main
+5df9c723de8c23508773b07fe16dd34e4c444541
diff --git a/.ci/docker/triton_version.txt b/.ci/docker/triton_version.txt
index d5c0c99142898..1545d966571dc 100644
--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@@ -1 +1 @@
-3.5.1
+3.5.0
diff --git a/requirements.txt b/requirements.txt
index 39fd9ff2c3067..e9b5d4482bc5c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,23 +5,6 @@
 
 # Install / Development extra requirements
 build[uv]  # for building sdist and wheel
-<<<<<<< HEAD
-expecttest==0.3.0
-filelock==3.18.0
-fsspec==2025.7.0
-hypothesis==5.35.1
-jinja2==3.1.6
-lintrunner==0.12.7 ; platform_machine != "s390x"
-networkx==2.8.8
-ninja==1.11.1.3
-numpy==2.0.2 ; python_version == "3.9"
-numpy==2.1.2 ; python_version > "3.9"
-optree==0.13.0
-psutil==7.0.0
-sympy==1.13.3
-typing-extensions==4.14.1
-wheel==0.45.1
-=======
 expecttest>=0.3.0
 filelock
 fsspec>=0.8.5
@@ -35,4 +18,3 @@ spin
 sympy>=1.13.3
 typing-extensions>=4.13.2
 wheel
->>>>>>> upstream/main

From 7e17fb9a935e2a5939a3b9fb3834330352bc9e4b Mon Sep 17 00:00:00 2001
From: Prachi Gupta <prachi.gupta@amd.com>
Date: Mon, 19 Jan 2026 18:02:29 +0000
Subject: [PATCH 22/43] Fix merge conflicts + bump triton to 3.6.x branch

---
 .ci/docker/ci_commit_pins/triton.txt     | 6 +-----
 .ci/docker/requirements-ci.txt           | 9 ---------
 .ci/docker/triton_version.txt            | 4 ----
 .circleci/scripts/binary_populate_env.sh | 3 ---
 .github/scripts/build_triton_wheel.py    | 4 ----
 test/test_sparse_csr.py                  | 5 -----
 6 files changed, 1 insertion(+), 30 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt
index b06e0c0c7e656..3d17e9c0de64b 100644
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@@ -1,5 +1 @@
-<<<<<<< HEAD
-5df9c723de8c23508773b07fe16dd34e4c444541
-=======
-9844da955a9db14ec69c9aac828ee9803085e288
->>>>>>> upstream/main
+ba5c1517e6f5906761cf5783036efb587026208d
diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 6a8cfdc62c0ec..a1cde5541b3db 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -117,13 +117,8 @@ ninja==1.11.1.4
 #Pinned versions: 1.11.1.4
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
 
-<<<<<<< HEAD
-numba==0.60.0 ; python_version == "3.9"
-numba==0.61.2 ; python_version > "3.9"
-=======
 numba==0.57.1 ; python_version == "3.10" and platform_machine != "s390x"
 numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
->>>>>>> upstream/main
 #Description: Just-In-Time Compiler for Numerical Functions
 #Pinned versions: 0.54.1, 0.49.0, <=0.49.1
 #test that import: test_numba_integration.py
@@ -141,13 +136,9 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
-<<<<<<< HEAD
-numpy==2.1.2; python_version > "3.9" and python_version < "3.14"
-=======
 numpy==1.23.2; python_version == "3.10"
 numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
 numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
->>>>>>> upstream/main
 numpy==2.3.4; python_version >= "3.14"
 
 pandas==2.2.3; python_version >= "3.9" and python_version < "3.14"
diff --git a/.ci/docker/triton_version.txt b/.ci/docker/triton_version.txt
index db2f0be12db3a..40c341bdcdbe8 100644
--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@@ -1,5 +1 @@
-<<<<<<< HEAD
-3.5.0
-=======
 3.6.0
->>>>>>> upstream/main
diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh
index 7d283f4588694..74ad225db933b 100755
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@@ -69,8 +69,6 @@ fi
 
 export PYTORCH_BUILD_NUMBER=1
 
-<<<<<<< HEAD
-=======
 # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
 TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
 TRITON_CONSTRAINT="platform_system == 'Linux'"
@@ -113,7 +111,6 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
     fi
 fi
 
->>>>>>> upstream/main
 USE_GLOO_WITH_OPENSSL="ON"
 if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then
   USE_GLOO_WITH_OPENSSL="OFF"
diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py
index d388013b8ccb2..64fc21fba445c 100644
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@@ -105,12 +105,8 @@ def build_triton(
 
         triton_repo = "https://github.com/openai/triton"
         if device == "rocm":
-<<<<<<< HEAD
             triton_pkg_name = "triton"
             triton_repo = "https://github.com/ROCm/triton"
-=======
-            triton_pkg_name = "triton-rocm"
->>>>>>> upstream/main
         elif device == "xpu":
             triton_pkg_name = "triton-xpu"
             triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton"
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index ad6979599b5fe..a22f196e8c869 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -1941,13 +1941,8 @@ def test_shape(d1, d2, d3, nnz, transposed, index_dtype):
 
     @dtypes(*floating_and_complex_types())
     @dtypesIfCUDA(*floating_and_complex_types_and(
-<<<<<<< HEAD
-                  *[torch.half] if SPARSE_FLOAT16_SUPPORTED else [],
-                  *[torch.bfloat16] if SPARSE_BFLOAT16_SUPPORTED else []))
-=======
                   *[torch.half] if not TEST_WITH_ROCM else [],
                   *[torch.bfloat16] if SM80OrLater and TEST_CUSPARSE_GENERIC else []))
->>>>>>> upstream/main
     @precisionOverride({torch.bfloat16: 3.5e-2, torch.float16: 1e-2})
     def test_sparse_addmm(self, device, dtype):
         def test_shape(m, n, p, nnz, broadcast, index_dtype, alpha_beta=None):

From 4d67363992f299b48068fc26a6cd0cd2fc770761 Mon Sep 17 00:00:00 2001
From: AMD <amd@amd.com>
Date: Tue, 20 Jan 2026 11:58:09 +0000
Subject: [PATCH 23/43] Remove stale opentelemetry-cpp submodule

---
 third_party/opentelemetry-cpp | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 third_party/opentelemetry-cpp

diff --git a/third_party/opentelemetry-cpp b/third_party/opentelemetry-cpp
deleted file mode 160000
index a799f4aed9c94..0000000000000
--- a/third_party/opentelemetry-cpp
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit a799f4aed9c94b765dcdaabaeab7d5e7e2310878

From 3ee04a9830bea722779f6591ffb9a2386afcfc14 Mon Sep 17 00:00:00 2001
From: Prachi Gupta <prachi.gupta@amd.com>
Date: Thu, 12 Feb 2026 03:50:13 +0000
Subject: [PATCH 24/43] Fix merge conflicts

---
 .ci/docker/requirements-ci.txt             |   5 +-
 requirements-build.txt                     |  13 -
 torch/utils/hipify/cuda_to_hip_mappings.py | 345 ---------------------
 3 files changed, 1 insertion(+), 362 deletions(-)

diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 758b5ff852586..8416dac24b1cf 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -141,12 +141,9 @@ numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
 numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
 numpy==2.3.4; python_version >= "3.14"
 
-<<<<<<< HEAD
-pandas==2.2.3; python_version >= "3.9" and python_version < "3.14"
-=======
+
 pandas==2.0.3; python_version < "3.12"
 pandas==2.2.3; python_version >= "3.12" and python_version < "3.14"
->>>>>>> upstream/main
 pandas==2.3.3; python_version >= "3.14"
 
 #onnxruntime
diff --git a/requirements-build.txt b/requirements-build.txt
index 982002a447065..863bc9f921d8d 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -1,6 +1,4 @@
 # Build System requirements
-<<<<<<< HEAD
-=======
 setuptools>=70.1.0,<82
 cmake>=3.27
 ninja
@@ -10,15 +8,4 @@ pyyaml
 requests
 six  # dependency chain: NNPACK -> PeachPy -> six
 typing-extensions>=4.15.0
->>>>>>> upstream/main
 pip  # not technically needed, but this makes setup.py invocation work
-setuptools>=70.1.0,<80.0  # setuptools develop deprecated on 80.0
-cmake>=3.31.4
-ninja==1.11.1.3
-numpy==2.0.2 ; python_version == "3.9"
-numpy==2.1.2 ; python_version > "3.9"
-packaging==25.0
-pyyaml==6.0.2
-requests==2.32.4
-six==1.17.0  # dependency chain: NNPACK -> PeachPy -> six
-typing-extensions==4.14.1
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index 6b3409c8daf0a..3dff877946cea 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -3396,350 +3396,6 @@
     ("CUDNN_CHECK", "MIOPEN_CHECK"),
 ])
 
-<<<<<<< HEAD
-CUDA_SPECIAL_MAP = collections.OrderedDict(
-    [
-        # SPARSE
-        ("cusparseStatus_t", ("hipsparseStatus_t", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseHandle_t", ("hipsparseHandle_t", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cuComplex", ("hipComplex", CONV_TYPE, API_SPECIAL)),
-        ("cuDoubleComplex", ("hipDoubleComplex", CONV_TYPE, API_SPECIAL)),
-        (
-            "CUSPARSE_POINTER_MODE_HOST",
-            ("HIPSPARSE_POINTER_MODE_HOST", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        ("cusparseOperation_t", ("hipsparseOperation_t", CONV_TYPE, API_SPECIAL)),
-        (
-            "cusparseCreateMatDescr",
-            ("hipsparseCreateMatDescr", CONV_MATH_FUNC, API_SPECIAL),
-        ),
-        ("cusparseCreate", ("hipsparseCreate", CONV_MATH_FUNC, API_SPECIAL)),
-        (
-            "cusparseDestroyMatDescr",
-            ("hipsparseDestroyMatDescr", CONV_MATH_FUNC, API_SPECIAL),
-        ),
-        ("cusparseDestroy", ("hipsparseDestroy", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseXcoo2csr", ("hipsparseXcoo2csr", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseMatDescr_t", ("hipsparseMatDescr_t", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDiagType_t", ("hipsparseDiagType_t", CONV_TYPE, API_SPECIAL)),
-        ("CUSPARSE_DIAG_TYPE_UNIT", ("HIPSPARSE_DIAG_TYPE_UNIT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_DIAG_TYPE_NON_UNIT", ("HIPSPARSE_DIAG_TYPE_NON_UNIT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("cusparseSetMatDiagType", ("hipsparseSetMatDiagType", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseFillMode_t", ("hipsparseFillMode_t", CONV_TYPE, API_SPECIAL)),
-        ("CUSPARSE_FILL_MODE_UPPER", ("HIPSPARSE_FILL_MODE_UPPER", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_FILL_MODE_LOWER", ("HIPSPARSE_FILL_MODE_LOWER", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("cusparseSetMatFillMode", ("hipsparseSetMatFillMode", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDirection_t", ("hipsparseDirection_t", CONV_TYPE, API_SPECIAL)),
-        ("CUSPARSE_DIRECTION_ROW", ("HIPSPARSE_DIRECTION_ROW", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_DIRECTION_COLUMN", ("HIPSPARSE_DIRECTION_COLUMN", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("cusparseSolvePolicy_t", ("hipsparseSolvePolicy_t", CONV_TYPE, API_SPECIAL)),
-        ("CUSPARSE_SOLVE_POLICY_NO_LEVEL", ("HIPSPARSE_SOLVE_POLICY_NO_LEVEL", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SOLVE_POLICY_USE_LEVEL", ("HIPSPARSE_SOLVE_POLICY_USE_LEVEL", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("cusparseCreateBsrsv2Info", ("hipsparseCreateBsrsv2Info", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCreateBsrsm2Info", ("hipsparseCreateBsrsm2Info", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDestroyBsrsv2Info", ("hipsparseDestroyBsrsv2Info", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDestroyBsrsm2Info", ("hipsparseDestroyBsrsm2Info", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSbsrmm", ("hipsparseSbsrmm", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDbsrmm", ("hipsparseDbsrmm", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCbsrmm", ("hipsparseCbsrmm", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseZbsrmm", ("hipsparseZbsrmm", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSbsrmv", ("hipsparseSbsrmv", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDbsrmv", ("hipsparseDbsrmv", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCbsrmv", ("hipsparseCbsrmv", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseZbsrmv", ("hipsparseZbsrmv", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSbsrsv2_bufferSize", ("hipsparseSbsrsv2_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDbsrsv2_bufferSize", ("hipsparseDbsrsv2_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCbsrsv2_bufferSize", ("hipsparseCbsrsv2_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseZbsrsv2_bufferSize", ("hipsparseZbsrsv2_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSbsrsv2_analysis", ("hipsparseSbsrsv2_analysis", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDbsrsv2_analysis", ("hipsparseDbsrsv2_analysis", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCbsrsv2_analysis", ("hipsparseCbsrsv2_analysis", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseZbsrsv2_analysis", ("hipsparseZbsrsv2_analysis", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSbsrsv2_solve", ("hipsparseSbsrsv2_solve", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDbsrsv2_solve", ("hipsparseDbsrsv2_solve", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCbsrsv2_solve", ("hipsparseCbsrsv2_solve", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseZbsrsv2_solve", ("hipsparseZbsrsv2_solve", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSbsrsm2_bufferSize", ("hipsparseSbsrsm2_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDbsrsm2_bufferSize", ("hipsparseDbsrsm2_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCbsrsm2_bufferSize", ("hipsparseCbsrsm2_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseZbsrsm2_bufferSize", ("hipsparseZbsrsm2_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSbsrsm2_analysis", ("hipsparseSbsrsm2_analysis", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDbsrsm2_analysis", ("hipsparseDbsrsm2_analysis", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCbsrsm2_analysis", ("hipsparseCbsrsm2_analysis", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseZbsrsm2_analysis", ("hipsparseZbsrsm2_analysis", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSbsrsm2_solve", ("hipsparseSbsrsm2_solve", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDbsrsm2_solve", ("hipsparseDbsrsm2_solve", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCbsrsm2_solve", ("hipsparseCbsrsm2_solve", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseZbsrsm2_solve", ("hipsparseZbsrsm2_solve", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseScsrmm2", ("hipsparseScsrmm2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDcsrmm2", ("hipsparseDcsrmm2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCcsrmm2", ("hipsparseCcsrmm2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseZcsrmm2", ("hipsparseZcsrmm2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseScsrmm", ("hipsparseScsrmm", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDcsrmm", ("hipsparseDcsrmm", CONV_MATH_FUNC, API_SPECIAL)),
-        (
-            "cusparseXcsrsort_bufferSizeExt",
-            ("hipsparseXcsrsort_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL),
-        ),
-        ("cusparseCreateCsrgemm2Info", ("hipsparseCreateCsrgemm2Info", CONV_MATH_FUNC, API_SPECIAL)),
-        (
-            "cusparseDestroyCsrgemm2Info",
-            ("hipsparseDestroyCsrgemm2Info", CONV_MATH_FUNC, API_SPECIAL),
-        ),
-        ("cusparseXcsrgemm2Nnz", ("hipsparseXcsrgemm2Nnz", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDcsrgemm2_bufferSizeExt", ("hipsparseDcsrgemm2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseScsrgemm2_bufferSizeExt", ("hipsparseScsrgemm2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDcsrgemm2", ("hipsparseDcsrgemm2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseScsrgemm2", ("hipsparseScsrgemm2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSetPointerMode", ("hipsparseSetPointerMode", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseXcsrgeam2Nnz", ("hipsparseXcsrgeam2Nnz", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseScsrgeam2_bufferSizeExt", ("hipsparseScsrgeam2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDcsrgeam2_bufferSizeExt", ("hipsparseDcsrgeam2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCcsrgeam2_bufferSizeExt", ("hipsparseCcsrgeam2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseZcsrgeam2_bufferSizeExt", ("hipsparseZcsrgeam2_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseScsrgeam2", ("hipsparseScsrgeam2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDcsrgeam2", ("hipsparseDcsrgeam2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCcsrgeam2", ("hipsparseCcsrgeam2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseZcsrgeam2", ("hipsparseZcsrgeam2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseXcsrsort", ("hipsparseXcsrsort", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseXbsrsm2_zeroPivot", ("hipsparseXbsrsm2_zeroPivot", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseXbsrsv2_zeroPivot", ("hipsparseXbsrsv2_zeroPivot", CONV_MATH_FUNC, API_SPECIAL)),
-        (
-            "cusparseXcoosort_bufferSizeExt",
-            ("hipsparseXcoosort_bufferSizeExt", CONV_MATH_FUNC, API_SPECIAL),
-        ),
-        (
-            "cusparseXcoosortByRow",
-            ("hipsparseXcoosortByRow", CONV_MATH_FUNC, API_SPECIAL),
-        ),
-        ("cusparseSetStream", ("hipsparseSetStream", CONV_MATH_FUNC, API_SPECIAL)),
-        (
-            "cusparseCreateIdentityPermutation",
-            ("hipsparseCreateIdentityPermutation", CONV_MATH_FUNC, API_SPECIAL),
-        ),
-        (
-            "cusparseSetMatIndexBase",
-            ("hipsparseSetMatIndexBase", CONV_MATH_FUNC, API_SPECIAL),
-        ),
-        ("cusparseSetMatType", ("hipsparseSetMatType", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpMV", ("hipsparseSpMV", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpMV_bufferSize", ("hipsparseSpMV_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpMM", ("hipsparseSpMM", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpMM_bufferSize", ("hipsparseSpMM_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCreateDnMat", ("hipsparseCreateDnMat", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDnMatSetStridedBatch", ("hipsparseDnMatSetStridedBatch", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCsrSetStridedBatch", ("hipsparseCsrSetStridedBatch", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCreateDnVec", ("hipsparseCreateDnVec", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCreateCsr", ("hipsparseCreateCsr", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDestroyDnMat", ("hipsparseDestroyDnMat", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDestroyDnVec", ("hipsparseDestroyDnVec", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDestroySpMat", ("hipsparseDestroySpMat", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpGEMM_destroyDescr", ("hipsparseSpGEMM_destroyDescr", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCreateCoo", ("hipsparseCreateCoo", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCreateCsr", ("hipsparseCreateCsr", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpGEMM_createDescr", ("hipsparseSpGEMM_createDescr", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseDnMatSetStridedBatch", ("hipsparseDnMatSetStridedBatch", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpGEMM_copy", ("hipsparseSpGEMM_copy", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSDDMM_bufferSize", ("hipsparseSDDMM_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSDDMM_preprocess", ("hipsparseSDDMM_preprocess", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSDDMM", ("hipsparseSDDMM", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpGEMM_compute", ("hipsparseSpGEMM_compute", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpGEMM_workEstimation", ("hipsparseSpGEMM_workEstimation", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpMatGetSize", ("hipsparseSpMatGetSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpMatSetAttribute", ("hipsparseSpMatSetAttribute", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseCsrSetPointers", ("hipsparseCsrSetPointers", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpSM_createDescr", ("hipsparseSpSM_createDescr", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpSM_destroyDescr", ("hipsparseSpSM_destroyDescr", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpSM_bufferSize", ("hipsparseSpSM_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpSM_analysis", ("hipsparseSpSM_analysis", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpSM_solve", ("hipsparseSpSM_solve", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpSV_createDescr", ("hipsparseSpSV_createDescr", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpSV_destroyDescr", ("hipsparseSpSV_destroyDescr", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpSV_bufferSize", ("hipsparseSpSV_bufferSize", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpSV_analysis", ("hipsparseSpSV_analysis", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpSV_solve", ("hipsparseSpSV_solve", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseSpMVAlg_t", ("hipsparseSpMVAlg_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseSpMMAlg_t", ("hipsparseSpMMAlg_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseSpSMAlg_t", ("hipsparseSpSMAlg_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseSpSVAlg_t", ("hipsparseSpSVAlg_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseIndexType_t", ("hipsparseIndexType_t", CONV_TYPE, API_SPECIAL)),
-        # Unsupported ("cusparseMatDescr", ("hipsparseMatDescr", CONV_TYPE, API_SPECIAL)),
-        # Unsupported ("cusparseDnMatDescr", ("hipsparseDnMatDescr", CONV_TYPE, API_SPECIAL)),
-        # Unsupported ("cusparseDnVecDescr", ("hipsparseDnVecDescr", CONV_TYPE, API_SPECIAL)),
-        # Unsupported ("cusparseSpMatDescr", ("hipsparseSpMatDescr", CONV_TYPE, API_SPECIAL)),
-        # Unsupported ("cusparseSpGEMMDescr", ("hipsparseSpGEMMDescr", CONV_TYPE, API_SPECIAL)),
-        ("cusparseDnMatDescr_t", ("hipsparseDnMatDescr_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseDnVecDescr_t", ("hipsparseDnVecDescr_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseSpMatDescr_t", ("hipsparseSpMatDescr_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseSpGEMMDescr_t", ("hipsparseSpGEMMDescr_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseSpSMDescr_t", ("hipsparseSpSMDescr_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseSpSVDescr_t", ("hipsparseSpSVDescr_t", CONV_TYPE, API_SPECIAL)),
-        ("CUSPARSE_INDEX_32I", ("HIPSPARSE_INDEX_32I", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_INDEX_64I", ("HIPSPARSE_INDEX_64I", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_ORDER_COL", ("HIPSPARSE_ORDER_COL", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_ORDER_ROW", ("HIPSPARSE_ORDER_ROW", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_MV_ALG_DEFAULT", ("HIPSPARSE_MV_ALG_DEFAULT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_MM_ALG_DEFAULT", ("HIPSPARSE_MM_ALG_DEFAULT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SPMM_COO_ALG1", ("HIPSPARSE_SPMM_COO_ALG1", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SPMM_COO_ALG2", ("HIPSPARSE_SPMM_COO_ALG2", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SPMM_CSR_ALG1", ("HIPSPARSE_SPMM_CSR_ALG1", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SPMM_CSR_ALG2", ("HIPSPARSE_SPMM_CSR_ALG2", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SPMM_CSR_ALG3", ("HIPSPARSE_SPMM_CSR_ALG3", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_COOMV_ALG", ("HIPSPARSE_COOMV_ALG", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SPMM_CSR_ALG1", ("HIPSPARSE_CSRMM_ALG1", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SPGEMM_DEFAULT", ("HIPSPARSE_SPGEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SDDMM_ALG_DEFAULT", ("HIPSPARSE_SDDMM_ALG_DEFAULT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SPSM_ALG_DEFAULT", ("HIPSPARSE_SPSM_ALG_DEFAULT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SPSV_ALG_DEFAULT", ("HIPSPARSE_SPSV_ALG_DEFAULT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SPMAT_FILL_MODE", ("HIPSPARSE_SPMAT_FILL_MODE", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_SPMAT_DIAG_TYPE", ("HIPSPARSE_SPMAT_DIAG_TYPE", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        (
-            "CUSPARSE_STATUS_SUCCESS",
-            ("HIPSPARSE_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_STATUS_NOT_INITIALIZED",
-            ("HIPSPARSE_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_STATUS_ALLOC_FAILED",
-            ("HIPSPARSE_STATUS_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_STATUS_INVALID_VALUE",
-            ("HIPSPARSE_STATUS_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_STATUS_MAPPING_ERROR",
-            ("HIPSPARSE_STATUS_MAPPING_ERROR", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_STATUS_EXECUTION_FAILED",
-            ("HIPSPARSE_STATUS_EXECUTION_FAILED", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_STATUS_INTERNAL_ERROR",
-            ("HIPSPARSE_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED",
-            (
-                "HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED",
-                CONV_NUMERIC_LITERAL,
-                API_SPECIAL,
-            ),
-        ),
-        (
-            "CUSPARSE_STATUS_ARCH_MISMATCH",
-            ("HIPSPARSE_STATUS_ARCH_MISMATCH", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_STATUS_ZERO_PIVOT",
-            ("HIPSPARSE_STATUS_ZERO_PIVOT", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_STATUS_NOT_SUPPORTED",
-            ("HIPSPARSE_STATUS_NOT_SUPPORTED", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES",
-            ("HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_OPERATION_TRANSPOSE",
-            ("HIPSPARSE_OPERATION_TRANSPOSE", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_OPERATION_NON_TRANSPOSE",
-            ("HIPSPARSE_OPERATION_NON_TRANSPOSE", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE",
-            (
-                "HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE",
-                CONV_NUMERIC_LITERAL,
-                API_SPECIAL,
-            ),
-        ),
-        (
-            "CUSPARSE_INDEX_BASE_ZERO",
-            ("HIPSPARSE_INDEX_BASE_ZERO", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_INDEX_BASE_ONE",
-            ("HIPSPARSE_INDEX_BASE_ONE", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUSPARSE_MATRIX_TYPE_GENERAL",
-            ("HIPSPARSE_MATRIX_TYPE_GENERAL", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        # SparseLt
-        ("cuSPARSELt", ("hipSPARSELt", CONV_TYPE, API_SPECIAL)),
-        ("AT_CUSPARSELT_ENABLED", ("AT_HIPSPARSELT_ENABLED", CONV_TYPE, API_SPECIAL)),
-        ("CUSPARSE_ORDER_ROW", ("HIPSPARSE_ORDER_ROW", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_ORDER_COL", ("HIPSPARSE_ORDER_COL", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSELT_SPARSITY_50_PERCENT", ("HIPSPARSELT_SPARSITY_50_PERCENT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("cusparseComputeType", ("hipsparseLtComputetype_t", CONV_TYPE, API_SPECIAL)),
-        ("CUSPARSE_COMPUTE_32F", ("HIPSPARSELT_COMPUTE_32F", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_COMPUTE_16F", ("HIPSPARSELT_COMPUTE_16F", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_COMPUTE_32I", ("HIPSPARSELT_COMPUTE_32I", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSE_COMPUTE_TF32", ("HIPSPARSELT_COMPUTE_TF32", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSELT_MATMUL_ALG_CONFIG_ID", ("HIPSPARSELT_MATMUL_ALG_CONFIG_ID", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSELT_MATMUL_ALG_CONFIG_MAX_ID", ("HIPSPARSELT_MATMUL_ALG_CONFIG_MAX_ID", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSELT_MATMUL_BIAS_POINTER", ("HIPSPARSELT_MATMUL_BIAS_POINTER", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSELT_MATMUL_ALG_DEFAULT", ("HIPSPARSELT_MATMUL_ALG_DEFAULT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSELT_MATMUL_ALG_CONFIG_ID", ("HIPSPARSELT_MATMUL_ALG_CONFIG_ID", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSELT_MATMUL_ALPHA_VECTOR_SCALING", ("HIPSPARSELT_MATMUL_ALPHA_VECTOR_SCALING", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSELT_MATMUL_SPLIT_K", ("HIPSPARSELT_MATMUL_SPLIT_K", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUSPARSELT_MATMUL_SPLIT_K_MODE", ("HIPSPARSELT_MATMUL_SPLIT_K_MODE", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("cusparseLtHandle_t", ("hipsparseLtHandle_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseLtMatDescriptor_t", ("hipsparseLtMatDescriptor_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseLtInit", ("hipsparseLtInit", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtStructuredDescriptorInit", ("hipsparseLtStructuredDescriptorInit", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtSplitKMode_t", ("hipsparseLtSplitKMode_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseLtSpMMACompressedSize2", ("hipsparseLtSpMMACompressedSize2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtSpMMACompress2", ("hipsparseLtSpMMACompress2", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatmulDescriptor_t", ("hipsparseLtMatmulDescriptor_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseLtMatmulPlan_t", ("hipsparseLtMatmulPlan_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseLtMatmulAlgSelection_t", ("hipsparseLtMatmulAlgSelection_t", CONV_TYPE, API_SPECIAL)),
-        ("cusparseLtStructuredDescriptorInit", ("hipsparseLtStructuredDescriptorInit", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtDenseDescriptorInit", ("hipsparseLtDenseDescriptorInit", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatmulDescriptorInit", ("hipsparseLtMatmulDescriptorInit", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatmulDescSetAttribute", ("hipsparseLtMatmulDescSetAttribute", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatmulAlgSelectionInit", ("hipsparseLtMatmulAlgSelectionInit", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatmulAlgSetAttribute", ("hipsparseLtMatmulAlgSetAttribute", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatmulPlanInit", ("hipsparseLtMatmulPlanInit", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatmulGetWorkspace", ("hipsparseLtMatmulGetWorkspace", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatmulSearch", ("hipsparseLtMatmulSearch", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatmulAlgGetAttribute", ("hipsparseLtMatmulAlgGetAttribute", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatmul", ("hipsparseLtMatmul", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatDescriptorDestroy", ("hipsparseLtMatDescriptorDestroy", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseLtMatmulPlanDestroy", ("hipsparseLtMatmulPlanDestroy", CONV_MATH_FUNC, API_SPECIAL)),
-        ("cusparseGetErrorString", ("hipsparseGetErrorString", CONV_MATH_FUNC, API_SPECIAL)),
-        # SOLVER
-        ("cublasOperation_t", ("hipsolverOperation_t", CONV_TYPE, API_SPECIAL)),
-        ("CUBLAS_OP_N", ("HIPSOLVER_OP_N", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        (
-            "CUBLAS_OP_T",
-            ("HIPSOLVER_OP_T", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUBLAS_OP_C",
-            ("HIPSOLVER_OP_C", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        ("cublasFillMode_t", ("hipsolverFillMode_t", CONV_TYPE, API_SPECIAL)),
-        (
-            "CUBLAS_FILL_MODE_LOWER",
-            ("HIPSOLVER_FILL_MODE_LOWER", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        (
-            "CUBLAS_FILL_MODE_UPPER",
-            ("HIPSOLVER_FILL_MODE_UPPER", CONV_NUMERIC_LITERAL, API_SPECIAL),
-        ),
-        ("cublasSideMode_t", ("hipsolverSideMode_t", CONV_TYPE, API_SPECIAL)),
-        ("CUBLAS_SIDE_LEFT", ("HIPSOLVER_SIDE_LEFT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-        ("CUBLAS_SIDE_RIGHT", ("HIPSOLVER_SIDE_RIGHT", CONV_NUMERIC_LITERAL, API_SPECIAL)),
-=======
 C10_MAPPINGS = collections.OrderedDict([
     ("CUDA_VERSION", "TORCH_HIP_VERSION"),
     ("CUDA_LAUNCH_BLOCKING=1", "AMD_SERIALIZE_KERNEL=3"),
@@ -3765,7 +3421,6 @@
     # TODO: Remove these. They were necessary for Meta-internal builds.
     ("c10::hip::c10_hip_check_implementation", "c10::cuda::c10_cuda_check_implementation"),
 ])
->>>>>>> upstream/main
 
 # TODO: Remove CAFFE2_SPECIFIC_MAPPINGS. They were necessary for Meta-internal builds.
 # CAFFE2 mappings for simple filename patterns (no path separators)

From 02bdd61c0baed346e16245dcb5596c0eb69133ee Mon Sep 17 00:00:00 2001
From: Chinmay Dattanand Kuchinad
 <40351312+chinmaydk99@users.noreply.github.com>
Date: Fri, 27 Feb 2026 13:49:52 -0600
Subject: [PATCH 25/43] Automate IFU issue creation and author assignment
 (#2997)

Adds workflow automation so IFU merges generate issues for commits in
range and assign them to commit authors. Includes cold-start handling
for first IFU on a branch, normal case when previous IFU tags exist, and
dedupe logic to prevent duplicate issues on reruns.
---
 .github/workflows/create_ifu_issues.yml | 336 ++++++++++++++++++++++++
 .github/workflows/create_ifu_tag.yml    | 257 ++++++++++++++++--
 2 files changed, 565 insertions(+), 28 deletions(-)
 create mode 100644 .github/workflows/create_ifu_issues.yml

diff --git a/.github/workflows/create_ifu_issues.yml b/.github/workflows/create_ifu_issues.yml
new file mode 100644
index 0000000000000..d639171de0018
--- /dev/null
+++ b/.github/workflows/create_ifu_issues.yml
@@ -0,0 +1,336 @@
+name: Create issues for ROCm commits
+
+on:
+  # Manual trigger for testing. NOTE(review): target_repo/project_* defaults below point at a personal fork - confirm.
+  workflow_dispatch:
+    inputs:
+      prev_post_tag:
+        description: "Issue range start ref (previous IFU post tag or cold-start SHA)"
+        required: true
+        type: string
+      curr_pre_tag:
+        description: "Current IFU pre tag"
+        required: true
+        type: string
+      target_repo:
+        description: "Target repo for issue creation"
+        required: false
+        default: "chinmaydk99/pytorch"
+        type: string
+      project_number:
+        description: "GitHub Project number"
+        required: false
+        default: "7"
+        type: string
+      project_owner:
+        description: "Project owner"
+        required: false
+        default: "chinmaydk99"
+        type: string
+
+  # Called by create_ifu_tag.yml after tagging. NOTE(review): callers omitting target_repo/project_* get personal-fork defaults - confirm.
+  workflow_call:
+    inputs:
+      prev_post_tag:
+        description: "Issue range start ref (previous IFU post tag or cold-start SHA)"
+        required: true
+        type: string
+      curr_pre_tag:
+        description: "Current IFU pre tag"
+        required: true
+        type: string
+      target_repo:
+        description: "Target repo for issue creation"
+        required: false
+        default: "chinmaydk99/pytorch"
+        type: string
+      project_number:
+        description: "GitHub Project number"
+        required: false
+        default: "7"
+        type: string
+      project_owner:
+        description: "Project owner"
+        required: false
+        default: "chinmaydk99"
+        type: string
+    secrets:
+      IFU_GITHUB_TOKEN:
+        required: true
+
+permissions:
+  contents: read
+  issues: write
+
+jobs:
+  create-issues:
+    runs-on: ubuntu-latest
+    env:
+      # Use passed secret for workflow_call, direct secret for workflow_dispatch
+      GH_TOKEN: ${{ secrets.IFU_GITHUB_TOKEN }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Fetch tags
+        run: git fetch origin --tags --force
+
+      - name: Extract branch from tag  # branch = curr_pre_tag with its trailing "_IFU_*" suffix removed
+        id: parse
+        env:
+          CURR_PRE_TAG: ${{ inputs.curr_pre_tag }}
+        run: |
+          branch_name="${CURR_PRE_TAG%_IFU_*}"
+          printf 'Branch: %s\n' "${branch_name}"
+          printf 'branch=%s\n' "${branch_name}" >> "$GITHUB_OUTPUT"
+
+      - name: List commits in range  # inputs are passed via env so they are never spliced into the script text
+        env: { PREV_POST_TAG: "${{ inputs.prev_post_tag }}", CURR_PRE_TAG: "${{ inputs.curr_pre_tag }}" }
+        run: |
+          echo "Commits between start ref ${PREV_POST_TAG} and end ref ${CURR_PRE_TAG}:"; git log "${PREV_POST_TAG}..${CURR_PRE_TAG}" --oneline --no-merges
+
+      - name: Get or create project fields
+        id: project_fields
+        if: ${{ inputs.project_number != '' }}
+        env:
+          PROJECT_NUMBER: ${{ inputs.project_number }}
+          PROJECT_OWNER: ${{ inputs.project_owner }}
+        run: |
+          echo "Getting project information..."
+
+          # Get project node ID and existing fields (user-owned first; '|| true' lets the org fallback below run under 'bash -e')
+          project_data=$(gh api graphql -f query='
+            query($owner: String!, $number: Int!) {
+              user(login: $owner) {
+                projectV2(number: $number) {
+                  id
+                  fields(first: 50) {
+                    nodes {
+                      ... on ProjectV2Field {
+                        id
+                        name
+                        dataType
+                      }
+                      ... on ProjectV2SingleSelectField {
+                        id
+                        name
+                        dataType
+                      }
+                    }
+                  }
+                }
+              }
+            }' -f owner="${PROJECT_OWNER}" -F number="${PROJECT_NUMBER}" || true)
+
+          project_id=$(echo "$project_data" | jq -r '.data.user.projectV2.id')
+          echo "Project ID: $project_id"
+
+          if [[ "$project_id" == "null" || -z "$project_id" ]]; then
+            echo "Error: Could not find project. Trying organization query..."
+            project_data=$(gh api graphql -f query='
+              query($owner: String!, $number: Int!) {
+                organization(login: $owner) {
+                  projectV2(number: $number) {
+                    id
+                    fields(first: 50) {
+                      nodes {
+                        ... on ProjectV2Field {
+                          id
+                          name
+                          dataType
+                        }
+                        ... on ProjectV2SingleSelectField {
+                          id
+                          name
+                          dataType
+                        }
+                      }
+                    }
+                  }
+                }
+              }' -f owner="${PROJECT_OWNER}" -F number="${PROJECT_NUMBER}")
+
+            project_id=$(echo "$project_data" | jq -r '.data.organization.projectV2.id')
+            fields_json=$(echo "$project_data" | jq -r '.data.organization.projectV2.fields.nodes')
+          else
+            fields_json=$(echo "$project_data" | jq -r '.data.user.projectV2.fields.nodes')
+          fi
+
+          echo "Project ID: $project_id"
+          echo "project_id=$project_id" >> $GITHUB_OUTPUT
+
+          # Find or create 'branch' field
+          branch_field_id=$(echo "$fields_json" | jq -r '.[] | select(.name == "branch") | .id')
+          if [[ -z "$branch_field_id" || "$branch_field_id" == "null" ]]; then
+            echo "Creating 'branch' field..."
+            branch_field_id=$(gh api graphql -f query='
+              mutation($projectId: ID!, $name: String!) {
+                createProjectV2Field(input: {projectId: $projectId, dataType: TEXT, name: $name}) {
+                  projectV2Field {
+                    ... on ProjectV2Field {
+                      id
+                    }
+                  }
+                }
+              }' -f projectId="$project_id" -f name="branch" --jq '.data.createProjectV2Field.projectV2Field.id')
+            echo "Created 'branch' field: $branch_field_id"
+          else
+            echo "Found existing 'branch' field: $branch_field_id"
+          fi
+          echo "branch_field_id=$branch_field_id" >> $GITHUB_OUTPUT
+
+          # Find or create 'commit_hash' field
+          commit_hash_field_id=$(echo "$fields_json" | jq -r '.[] | select(.name == "commit_hash") | .id')
+          if [[ -z "$commit_hash_field_id" || "$commit_hash_field_id" == "null" ]]; then
+            echo "Creating 'commit_hash' field..."
+            commit_hash_field_id=$(gh api graphql -f query='
+              mutation($projectId: ID!, $name: String!) {
+                createProjectV2Field(input: {projectId: $projectId, dataType: TEXT, name: $name}) {
+                  projectV2Field {
+                    ... on ProjectV2Field {
+                      id
+                    }
+                  }
+                }
+              }' -f projectId="$project_id" -f name="commit_hash" --jq '.data.createProjectV2Field.projectV2Field.id')
+            echo "Created 'commit_hash' field: $commit_hash_field_id"
+          else
+            echo "Found existing 'commit_hash' field: $commit_hash_field_id"
+          fi
+          echo "commit_hash_field_id=$commit_hash_field_id" >> $GITHUB_OUTPUT
+
+      - name: Create issues for commits
+        env:
+          PREV_POST_TAG: ${{ inputs.prev_post_tag }}
+          CURR_PRE_TAG: ${{ inputs.curr_pre_tag }}
+          TARGET_REPO: ${{ inputs.target_repo }}
+          PROJECT_NUMBER: ${{ inputs.project_number }}
+          PROJECT_OWNER: ${{ inputs.project_owner }}
+          REPO_NAME: ${{ github.repository }}
+          BRANCH: ${{ steps.parse.outputs.branch }}
+          PROJECT_ID: ${{ steps.project_fields.outputs.project_id }}
+          BRANCH_FIELD_ID: ${{ steps.project_fields.outputs.branch_field_id }}
+          COMMIT_HASH_FIELD_ID: ${{ steps.project_fields.outputs.commit_hash_field_id }}
+        run: |
+          echo "Creating issues for commits..."
+
+          commit_count=$(git rev-list --count --no-merges "${PREV_POST_TAG}..${CURR_PRE_TAG}")
+          if [[ "${commit_count}" -eq 0 ]]; then
+            echo "No non-merge commits in range ${PREV_POST_TAG}..${CURR_PRE_TAG}; no issues to create."
+            exit 0
+          fi
+
+          echo "Found ${commit_count} non-merge commits to process."
+
+          git log "${PREV_POST_TAG}..${CURR_PRE_TAG}" --format="%H" --no-merges | while read hash; do
+            short_hash="${hash:0:5}"
+            subject=$(git log -1 --format="%s" "$hash")
+            author=$(git log -1 --format="%an" "$hash")
+            email=$(git log -1 --format="%ae" "$hash")
+
+            echo "Processing ${short_hash}: ${subject}"
+
+            # Try to get GitHub username via API first
+            gh_username=""
+            gh_username=$(gh api "repos/${REPO_NAME}/commits/${hash}" --jq '.author.login // empty' 2>/dev/null || true)
+
+            if [[ -z "${gh_username}" ]]; then
+              # Fallback: try to extract from noreply email
+              if [[ "$email" =~ ^[0-9]+\+([^@]+)@users\.noreply\.github\.com$ ]]; then
+                gh_username="${BASH_REMATCH[1]}"
+                echo "  Extracted username from email: ${gh_username}"
+              fi
+            else
+              echo "  Found GitHub username via API: ${gh_username}"
+            fi
+
+            # Dedupe by commit hash marker in issue body across all issue states.
+            existing_issue_url=$(gh issue list \
+              --repo "${TARGET_REPO}" \
+              --state all \
+              --search "\"${hash}\" in:body" \
+              --limit 20 \
+              --json url,body \
+              | jq -r --arg hash "$hash" '.[] | select((.body // "") | contains("**Commit:** " + $hash)) | .url' \
+              | head -n 1 || true)
+            if [[ -n "${existing_issue_url}" ]]; then
+              echo "  Existing issue found for commit ${short_hash}: ${existing_issue_url}"
+              echo "  Skipping duplicate issue creation."
+              continue
+            fi
+
+            body="**Commit:** ${hash}"$'\n'"**Author:** ${author} (${email})"$'\n'"**Branch:** ${BRANCH}"$'\n'"**Link:** [View commit](https://github.com/${REPO_NAME}/commit/${hash})"
+
+            issue_url=$(gh issue create \
+              --repo "${TARGET_REPO}" \
+              --title "${subject}" \
+              --body "${body}")
+
+            echo "  Created: ${issue_url}"
+
+            # Try to assign the issue
+            if [[ -n "${gh_username}" ]]; then
+              echo "  Trying to assign to @${gh_username}..."
+              if gh issue edit "${issue_url}" --add-assignee "${gh_username}" 2>/dev/null; then
+                echo "  Successfully assigned issue"
+              else
+                echo "  Could not assign, adding comment instead"
+                gh issue comment "${issue_url}" --body "cc @${gh_username} - you authored this commit" || true
+              fi
+            fi
+
+            # Add to project and set field values
+            if [[ -n "${PROJECT_NUMBER}" && -n "${PROJECT_ID}" ]]; then
+              echo "  Adding to project..."
+              item_id=$(gh project item-add "${PROJECT_NUMBER}" --owner "${PROJECT_OWNER}" --url "${issue_url}" --format json 2>/dev/null | jq -r '.id' || true)
+
+              if [[ -n "${item_id}" && "${item_id}" != "null" ]]; then
+                echo "  Project item ID: ${item_id}"
+
+                # Set branch field
+                if [[ -n "${BRANCH_FIELD_ID}" ]]; then
+                  echo "  Setting branch field to: ${BRANCH}"
+                  gh api graphql -f query='
+                    mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: String!) {
+                      updateProjectV2ItemFieldValue(input: {
+                        projectId: $projectId
+                        itemId: $itemId
+                        fieldId: $fieldId
+                        value: {text: $value}
+                      }) {
+                        projectV2Item {
+                          id
+                        }
+                      }
+                    }' -f projectId="${PROJECT_ID}" -f itemId="${item_id}" -f fieldId="${BRANCH_FIELD_ID}" -f value="${BRANCH}" || echo "  Warning: Failed to set branch field"
+                fi
+
+                # Set commit_hash field
+                if [[ -n "${COMMIT_HASH_FIELD_ID}" ]]; then
+                  echo "  Setting commit_hash field to: ${hash}"
+                  gh api graphql -f query='
+                    mutation($projectId: ID!, $itemId: ID!, $fieldId: ID!, $value: String!) {
+                      updateProjectV2ItemFieldValue(input: {
+                        projectId: $projectId
+                        itemId: $itemId
+                        fieldId: $fieldId
+                        value: {text: $value}
+                      }) {
+                        projectV2Item {
+                          id
+                        }
+                      }
+                    }' -f projectId="${PROJECT_ID}" -f itemId="${item_id}" -f fieldId="${COMMIT_HASH_FIELD_ID}" -f value="${hash}" || echo "  Warning: Failed to set commit_hash field"
+                fi
+              else
+                echo "  Warning: Could not get project item ID"
+              fi
+            fi
+
+            sleep 1
+          done
+
+          echo "Done creating issues!"
diff --git a/.github/workflows/create_ifu_tag.yml b/.github/workflows/create_ifu_tag.yml
index 422b6d1c5ba67..fae8140011cf0 100644
--- a/.github/workflows/create_ifu_tag.yml
+++ b/.github/workflows/create_ifu_tag.yml
@@ -1,29 +1,81 @@
 name: Create git tags for IFU PRs
 
 on:
+  # Triggered when an IFU PR is merged
   pull_request:
     types: [closed]
 
+  # Test harness - manually trigger to test without a real PR merge
+  workflow_dispatch:
+    inputs:
+      test_branch:
+        description: "Branch name to test (e.g., rocm7.1_internal_testing)"
+        required: true
+        type: string
+      test_curr_pre_tag:
+        description: "Pre tag to use as curr_pre_tag (required for full chain test)"
+        required: false
+        type: string
+      test_issue_prev_ref:
+        description: "Optional issue range start ref for cold-start full-chain test (tag or SHA)"
+        required: false
+        type: string
+      run_full_chain:
+        description: "Run full chain - actually call create_ifu_issues.yml (will create real issues!)"
+        required: false
+        default: false
+        type: boolean
+
 permissions:
   contents: write        # create/push tags
   pull-requests: write   # edit PR body
+  issues: write          # needed for create_ifu_issues.yml when called
 
 jobs:
   tag-ifu:
-    # Only proceed if: merged AND title has both markers
+    # Run for workflow_dispatch (test mode) OR for real PR merges
     if: >
-      github.event.pull_request.merged == true &&
+      github.event_name == 'workflow_dispatch' ||
+      (github.event.pull_request.merged == true &&
       contains(github.event.pull_request.title, '[AUTOGENERATED]') &&
-      contains(github.event.pull_request.title, 'IFU')
+      contains(github.event.pull_request.title, 'IFU'))
     runs-on: ubuntu-latest
 
+    # Export values so the create-issues job can use them
+    outputs:
+      prev_post_tag: ${{ steps.prev_tag.outputs.prev_post_tag }}
+      curr_pre_tag: ${{ github.event_name == 'workflow_dispatch' && inputs.test_curr_pre_tag || steps.tagname.outputs.PRE_TAG }}
+      has_prev_tag: ${{ steps.prev_tag.outputs.has_prev_tag }}
+      issue_prev_ref: ${{ steps.prev_ref.outputs.issue_prev_ref }}
+      can_create_issues: ${{ steps.prev_ref.outputs.can_create_issues }}
+
     steps:
+      - name: Validate test inputs
+        if: github.event_name == 'workflow_dispatch' && inputs.run_full_chain == true
+        run: |
+          if [[ -z "${{ inputs.test_curr_pre_tag }}" ]]; then
+            echo "ERROR: test_curr_pre_tag is required when run_full_chain is enabled"
+            echo "Please provide an existing pre tag (e.g., rocm7.1_internal_testing_IFU_2025-10-29_pre)"
+            exit 1
+          fi
+          echo "Full chain test enabled with:"
+          echo "  test_branch: ${{ inputs.test_branch }}"
+          echo "  test_curr_pre_tag: ${{ inputs.test_curr_pre_tag }}"
+          if [[ -n "${{ inputs.test_issue_prev_ref }}" ]]; then
+            echo "  test_issue_prev_ref: ${{ inputs.test_issue_prev_ref }}"
+          fi
+
       - name: Checkout base repo (full history)
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.event.pull_request.base.ref }}
+          # Use current branch for test mode, PR base branch for real merges
+          ref: ${{ github.event_name == 'workflow_dispatch' && github.ref || github.event.pull_request.base.ref }}
           fetch-depth: 0
 
+      # Fetch all tags so we can find the previous post tag
+      - name: Fetch all tags
+        run: git fetch origin --tags --force
+
       - name: Configure Git user
         run: |
           git config user.name  "github-actions[bot]"
@@ -31,44 +83,59 @@ jobs:
 
       - name: Derive key SHAs (rocm base, upstream main, merge)
         id: shas
+        # Skip in test mode - we don't have a real PR
+        if: github.event_name != 'workflow_dispatch'
         shell: bash
         run: |
           set -euo pipefail
-          
+
           PR_NUM="${{ github.event.pull_request.number }}"
           BASE_REF="${{ github.event.pull_request.base.ref }}"
           HEAD_SHA="${{ github.event.pull_request.head.sha }}"
           MERGE_SHA="${{ github.event.pull_request.merge_commit_sha }}"
 
-          # For release branches, local branch is same as remote branch. But, for rocm/pytorch's
-          # develop branch, we want to use main branch upstream
-          if [ "$BASE_REF" == "develop" ]; then
-            BASE_REF="main"
+          # Upstream ref is usually the same as base branch. For rocm/pytorch's
+          # develop branch, compare against upstream/main.
+          UPSTREAM_REF="$BASE_REF"
+          if [ "$UPSTREAM_REF" == "develop" ]; then
+            UPSTREAM_REF="main"
           fi
-          
-          echo "PR_NUM=$PR_NUM"                           
-          echo "BASE_REF=$BASE_REF"                       
-          echo "HEAD_SHA=$HEAD_SHA"                       
-          echo "MERGE_SHA=$MERGE_SHA"                     
-    
+
+          echo "PR_NUM=$PR_NUM"
+          echo "BASE_REF=$BASE_REF"
+          echo "UPSTREAM_REF=$UPSTREAM_REF"
+          echo "HEAD_SHA=$HEAD_SHA"
+          echo "MERGE_SHA=$MERGE_SHA"
+
           # The ROCm base commit is the first parent of the merge commit that landed the PR
           # (i.e., the base branch tip BEFORE this PR merged).
           ROCM_BASE_SHA=$(git rev-parse "${MERGE_SHA}^1")
 
-          # Add and fetch upstream to identify the upstream/main commit that HEAD integrated.
-          git remote add upstream "https://github.com/pytorch/pytorch.git"
-          git fetch upstream "$BASE_REF"
+          # Add upstream if missing.
+          if ! git remote get-url upstream >/dev/null 2>&1; then
+            git remote add upstream "https://github.com/pytorch/pytorch.git"
+          fi
+
+          # Some IFU base branches may not exist in upstream (e.g., fork-only/test branches).
+          # In that case, fall back to upstream/main.
+          if ! git ls-remote --exit-code --heads upstream "$UPSTREAM_REF" >/dev/null 2>&1; then
+            echo "Upstream branch '$UPSTREAM_REF' not found; falling back to upstream/main"
+            UPSTREAM_REF="main"
+          fi
+          git fetch upstream "$UPSTREAM_REF"
 
           # Heuristic: the upstream commit integrated by the PR's head is the merge-base
           # between the PR head commit and upstream/main as fetched now.
           # This gives you the exact upstream commit (or the best common ancestor) that HEAD included.
-          UPSTREAM_MAIN_SHA=$(git merge-base "${HEAD_SHA}" "upstream/$BASE_REF")
-          echo "ROCM_BASE_SHA=$ROCM_BASE_SHA"            
-          echo "UPSTREAM_MAIN_SHA=$UPSTREAM_MAIN_SHA"     
+          UPSTREAM_MAIN_SHA=$(git merge-base "${HEAD_SHA}" "upstream/$UPSTREAM_REF")
+          echo "ROCM_BASE_SHA=$ROCM_BASE_SHA"
+          echo "UPSTREAM_MAIN_SHA=$UPSTREAM_MAIN_SHA"
+          echo "UPSTREAM_REF_USED=$UPSTREAM_REF"
+
 
-          
           echo "PR_NUM=$PR_NUM"                           >> "$GITHUB_OUTPUT"
           echo "BASE_REF=$BASE_REF"                       >> "$GITHUB_OUTPUT"
+          echo "UPSTREAM_REF_USED=$UPSTREAM_REF"          >> "$GITHUB_OUTPUT"
           echo "HEAD_SHA=$HEAD_SHA"                       >> "$GITHUB_OUTPUT"
           echo "MERGE_SHA=$MERGE_SHA"                     >> "$GITHUB_OUTPUT"
           echo "ROCM_BASE_SHA=$ROCM_BASE_SHA"             >> "$GITHUB_OUTPUT"
@@ -76,21 +143,138 @@ jobs:
 
       - name: Extract tag base from PR title
         id: tagname
+        # Skip in test mode
+        if: github.event_name != 'workflow_dispatch'
         run: |
           TITLE="${{ github.event.pull_request.title }}"
           # Remove everything up to and including "[AUTOGENERATED]"
           # Remove trailing whitespace
           BASE_TAG=$(echo "$TITLE" | sed -E 's/^\[AUTOGENERATED\][[:space:]]*//' | sed -E 's/[[:space:]]+$//')
 
-          echo "BASE_TAG=$BASE_TAG" 
-          echo "PRE_TAG=${BASE_TAG}_pre" 
-          echo "POST_TAG=${BASE_TAG}_post" 
-          
+          echo "BASE_TAG=$BASE_TAG"
+          echo "PRE_TAG=${BASE_TAG}_pre"
+          echo "POST_TAG=${BASE_TAG}_post"
+
+          # Extract branch name from BASE_TAG (everything before _IFU_)
+          BRANCH="${BASE_TAG%_IFU_*}"
+          echo "BRANCH=$BRANCH"
+
           echo "BASE_TAG=$BASE_TAG" >> $GITHUB_OUTPUT
           echo "PRE_TAG=${BASE_TAG}_pre" >> $GITHUB_OUTPUT
           echo "POST_TAG=${BASE_TAG}_post" >> $GITHUB_OUTPUT
+          echo "BRANCH=$BRANCH" >> $GITHUB_OUTPUT
+
+      # Find the most recent post tag for this branch
+      # This is needed to know the range of commits for issue creation
+      - name: Find previous post tag
+        id: prev_tag
+        env:
+          # Use test_branch input for test mode, extracted BRANCH for real merges
+          BRANCH: ${{ github.event_name == 'workflow_dispatch' && inputs.test_branch || steps.tagname.outputs.BRANCH }}
+        run: |
+          echo "Finding previous post tag for branch: ${BRANCH}"
+
+          # List all post tags for this branch, sorted by version (date in tag name)
+          echo "All post tags for ${BRANCH}:"
+          git tag --list "${BRANCH}_IFU_*_post" --sort=-version:refname
+
+          # Get the most recent post tag
+          prev_post_tag=$(git tag --list "${BRANCH}_IFU_*_post" --sort=-version:refname | head -n 1)
+
+          if [[ -z "$prev_post_tag" ]]; then
+            echo "WARNING: No previous post tag found for branch ${BRANCH}"
+            echo "This might be the first IFU for this branch"
+            echo "prev_post_tag=" >> $GITHUB_OUTPUT
+            echo "has_prev_tag=false" >> $GITHUB_OUTPUT
+          else
+            echo "Found previous post tag: $prev_post_tag"
+            echo "prev_post_tag=$prev_post_tag" >> $GITHUB_OUTPUT
+            echo "has_prev_tag=true" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Validate full-chain test start ref
+        if: github.event_name == 'workflow_dispatch' && inputs.run_full_chain == true
+        env:
+          HAS_PREV_TAG: ${{ steps.prev_tag.outputs.has_prev_tag }}
+          TEST_ISSUE_PREV_REF: ${{ inputs.test_issue_prev_ref }}
+        run: |
+          if [[ "${HAS_PREV_TAG}" != "true" && -z "${TEST_ISSUE_PREV_REF}" ]]; then
+            echo "ERROR: No previous post tag found for this branch."
+            echo "For cold-start full-chain tests, provide test_issue_prev_ref (tag or SHA)."
+            exit 1
+          fi
 
-      - name: Create pre/post tags 
+      # In test mode, print a summary of what was found
+      - name: Test mode summary
+        if: github.event_name == 'workflow_dispatch'
+        env:
+          BRANCH: ${{ inputs.test_branch }}
+          PREV_POST_TAG: ${{ steps.prev_tag.outputs.prev_post_tag }}
+          HAS_PREV_TAG: ${{ steps.prev_tag.outputs.has_prev_tag }}
+          TEST_CURR_PRE_TAG: ${{ inputs.test_curr_pre_tag }}
+          TEST_ISSUE_PREV_REF: ${{ inputs.test_issue_prev_ref }}
+          RUN_FULL_CHAIN: ${{ inputs.run_full_chain }}
+        run: |
+          echo "=========================================="
+          echo "TEST MODE SUMMARY"
+          echo "=========================================="
+          echo "Branch: ${BRANCH}"
+          echo "Has previous post tag: ${HAS_PREV_TAG}"
+          echo "Previous post tag: ${PREV_POST_TAG:-'(none)'}"
+          echo ""
+          if [[ "${RUN_FULL_CHAIN}" == "true" ]]; then
+            echo " FULL CHAIN TEST ENABLED"
+            echo "Will call create_ifu_issues.yml with:"
+            echo "   - prev_post_tag: ${PREV_POST_TAG}"
+            echo "   - curr_pre_tag: ${TEST_CURR_PRE_TAG}"
+            if [[ -n "${TEST_ISSUE_PREV_REF}" ]]; then
+              echo "   - test_issue_prev_ref override: ${TEST_ISSUE_PREV_REF}"
+            fi
+            echo ""
+            echo " WARNING: This will create REAL issues!"
+          else
+            echo " Full chain test NOT enabled"
+            echo "To test issue creation, re-run with:"
+            echo "   - run_full_chain: true"
+            echo "   - test_curr_pre_tag: (an existing pre tag)"
+          fi
+          echo "=========================================="
+
+      # Determine the start reference for issue creation.
+      # Priority:
+      #   1) previous IFU post tag (normal path)
+      #   2) test_issue_prev_ref (cold-start fallback for workflow_dispatch test path)
+      #   3) UPSTREAM_MAIN_SHA (cold-start fallback on real merge path only)
+      - name: Resolve issue range start reference
+        id: prev_ref
+        env:
+          HAS_PREV_TAG: ${{ steps.prev_tag.outputs.has_prev_tag }}
+          PREV_POST_TAG: ${{ steps.prev_tag.outputs.prev_post_tag }}
+          TEST_ISSUE_PREV_REF: ${{ inputs.test_issue_prev_ref }}
+          EVENT_NAME: ${{ github.event_name }}
+          UPSTREAM_MAIN_SHA: ${{ steps.shas.outputs.UPSTREAM_MAIN_SHA }}
+        run: |
+          if [[ "${HAS_PREV_TAG}" == "true" && -n "${PREV_POST_TAG}" ]]; then
+            echo "Using previous IFU post tag for issue range start: ${PREV_POST_TAG}"
+            echo "issue_prev_ref=${PREV_POST_TAG}" >> "$GITHUB_OUTPUT"
+            echo "can_create_issues=true" >> "$GITHUB_OUTPUT"
+          elif [[ "${EVENT_NAME}" == "workflow_dispatch" && -n "${TEST_ISSUE_PREV_REF}" ]]; then
+            echo "Using test override for issue range start: ${TEST_ISSUE_PREV_REF}"
+            echo "issue_prev_ref=${TEST_ISSUE_PREV_REF}" >> "$GITHUB_OUTPUT"
+            echo "can_create_issues=true" >> "$GITHUB_OUTPUT"
+          elif [[ "${EVENT_NAME}" != "workflow_dispatch" && -n "${UPSTREAM_MAIN_SHA:-}" ]]; then
+            echo "No previous IFU post tag found; using cold-start fallback UPSTREAM_MAIN_SHA: ${UPSTREAM_MAIN_SHA}"
+            echo "issue_prev_ref=${UPSTREAM_MAIN_SHA}" >> "$GITHUB_OUTPUT"
+            echo "can_create_issues=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "Could not determine issue range start reference."
+            echo "issue_prev_ref=" >> "$GITHUB_OUTPUT"
+            echo "can_create_issues=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Create pre/post tags
+        # Skip in test mode
+        if: github.event_name != 'workflow_dispatch'
         shell: bash
         run: |
           set -euo pipefail
@@ -98,7 +282,7 @@ jobs:
           echo "  ${{ steps.tagname.outputs.PRE_TAG }}  @ ${{ steps.shas.outputs.ROCM_BASE_SHA }}"
           echo "  ${{ steps.tagname.outputs.POST_TAG }} @ ${{ steps.shas.outputs.MERGE_SHA }}"
 
-          git tag -a "${{ steps.tagname.outputs.PRE_TAG }}"  -m "IFU pre (PR #${{ steps.shas.outputs.PR_NUM }})"  "${{ steps.shas.outputs.ROCM_BASE_SHA }}" 
+          git tag -a "${{ steps.tagname.outputs.PRE_TAG }}"  -m "IFU pre (PR #${{ steps.shas.outputs.PR_NUM }})"  "${{ steps.shas.outputs.ROCM_BASE_SHA }}"
           git tag -a "${{ steps.tagname.outputs.POST_TAG }}" -m "IFU post (PR #${{ steps.shas.outputs.PR_NUM }})" "${{ steps.shas.outputs.MERGE_SHA }}"
 
           #Force pushing is safe. If we land a new PR, we'd wanna retag a commit if we have to.
@@ -106,6 +290,8 @@ jobs:
           git push origin "refs/tags/${{ steps.tagname.outputs.POST_TAG  }}" -f
 
       - name: Append rocm_base & upstream_main to PR body
+        # Skip in test mode
+        if: github.event_name != 'workflow_dispatch'
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         shell: bash
@@ -122,3 +308,18 @@ jobs:
           gh api --method PATCH -H  "Accept: application/vnd.github+json" \
             repos/${{ github.repository }}/pulls/$PR -F body=@body.txt
 
+  # Calls create_ifu_issues.yml after tagging
+  # Runs for:
+  #   - Real PR merges (when a start reference can be resolved)
+  #   - Test mode with run_full_chain=true (when a start reference can be resolved)
+  create-issues:
+    needs: tag-ifu
+    if: >
+      needs.tag-ifu.outputs.can_create_issues == 'true' &&
+      (github.event_name != 'workflow_dispatch' || inputs.run_full_chain == true)
+    uses: ./.github/workflows/create_ifu_issues.yml
+    with:
+      prev_post_tag: ${{ needs.tag-ifu.outputs.issue_prev_ref }}
+      curr_pre_tag: ${{ needs.tag-ifu.outputs.curr_pre_tag }}
+    secrets:
+      IFU_GITHUB_TOKEN: ${{ secrets.IFU_GITHUB_TOKEN }}

From af4f7f5de13a26687ede71ad9522fef63bdc827e Mon Sep 17 00:00:00 2001
From: Chinmay Dattanand Kuchinad
 <40351312+chinmaydk99@users.noreply.github.com>
Date: Mon, 2 Mar 2026 10:54:37 -0600
Subject: [PATCH 26/43] Fix Issue creation workflow to filter ROCM-only commits
 (#3017)

---
 .github/workflows/create_ifu_issues.yml | 31 ++++++++++++++-----------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/create_ifu_issues.yml b/.github/workflows/create_ifu_issues.yml
index d639171de0018..c05f51fc36822 100644
--- a/.github/workflows/create_ifu_issues.yml
+++ b/.github/workflows/create_ifu_issues.yml
@@ -15,17 +15,17 @@ on:
       target_repo:
         description: "Target repo for issue creation"
         required: false
-        default: "chinmaydk99/pytorch"
+        default: "ROCm/pytorch"
         type: string
       project_number:
         description: "GitHub Project number"
         required: false
-        default: "7"
+        default: "114"
         type: string
       project_owner:
         description: "Project owner"
         required: false
-        default: "chinmaydk99"
+        default: "ROCm"
         type: string
 
   # Called by create_ifu_tag.yml after tagging
@@ -42,17 +42,17 @@ on:
       target_repo:
         description: "Target repo for issue creation"
         required: false
-        default: "chinmaydk99/pytorch"
+        default: "ROCm/pytorch"
         type: string
       project_number:
         description: "GitHub Project number"
         required: false
-        default: "7"
+        default: "114"
         type: string
       project_owner:
         description: "Project owner"
         required: false
-        default: "chinmaydk99"
+        default: "ROCm"
         type: string
     secrets:
       IFU_GITHUB_TOKEN:
@@ -87,10 +87,15 @@ jobs:
           echo "Branch: $branch"
           echo "branch=$branch" >> $GITHUB_OUTPUT
 
+      - name: Fetch upstream
+        run: |
+          git remote add upstream https://github.com/pytorch/pytorch.git 2>/dev/null || true
+          git fetch upstream main --force
+
       - name: List commits in range
         run: |
-          echo "Commits between start ref ${{ inputs.prev_post_tag }} and end ref ${{ inputs.curr_pre_tag }}:"
-          git log ${{ inputs.prev_post_tag }}..${{ inputs.curr_pre_tag }} --oneline --no-merges
+          echo "ROCm-only commits between ${{ inputs.prev_post_tag }} and ${{ inputs.curr_pre_tag }}:"
+          git log ${{ inputs.prev_post_tag }}..${{ inputs.curr_pre_tag }} --oneline --no-merges --not upstream/main
 
       - name: Get or create project fields
         id: project_fields
@@ -129,7 +134,7 @@ jobs:
           echo "Project ID: $project_id"
 
           if [[ "$project_id" == "null" || -z "$project_id" ]]; then
-            echo "Error: Could not find project. Trying organization query..."
+            echo "User project not found. Trying organization query..."
             project_data=$(gh api graphql -f query='
               query($owner: String!, $number: Int!) {
                 organization(login: $owner) {
@@ -217,15 +222,15 @@ jobs:
         run: |
           echo "Creating issues for commits..."
 
-          commit_count=$(git rev-list --count --no-merges "${PREV_POST_TAG}..${CURR_PRE_TAG}")
+          commit_count=$(git rev-list --count --no-merges "${PREV_POST_TAG}..${CURR_PRE_TAG}" --not upstream/main)
           if [[ "${commit_count}" -eq 0 ]]; then
-            echo "No non-merge commits in range ${PREV_POST_TAG}..${CURR_PRE_TAG}; no issues to create."
+            echo "No ROCm-only commits in range ${PREV_POST_TAG}..${CURR_PRE_TAG}; nothing to create."
             exit 0
           fi
 
-          echo "Found ${commit_count} non-merge commits to process."
+          echo "Found ${commit_count} ROCm-only commits to process."
 
-          git log "${PREV_POST_TAG}..${CURR_PRE_TAG}" --format="%H" --no-merges | while read hash; do
+          git log "${PREV_POST_TAG}..${CURR_PRE_TAG}" --format="%H" --no-merges --not upstream/main | while read hash; do
             short_hash="${hash:0:5}"
             subject=$(git log -1 --format="%s" "$hash")
             author=$(git log -1 --format="%an" "$hash")

From 7735e5bd08b8d130b163f926a3c099c1e3774092 Mon Sep 17 00:00:00 2001
From: Chinmay Dattanand Kuchinad
 <40351312+chinmaydk99@users.noreply.github.com>
Date: Mon, 2 Mar 2026 14:00:16 -0600
Subject: [PATCH 27/43] Fix automatic issue creation workflow to filter
 ROCM-only commits (#3018)

---
 .github/workflows/create_ifu_issues.yml | 33 ++++++++++++++++---------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/create_ifu_issues.yml b/.github/workflows/create_ifu_issues.yml
index c05f51fc36822..8e2e7da07ab43 100644
--- a/.github/workflows/create_ifu_issues.yml
+++ b/.github/workflows/create_ifu_issues.yml
@@ -106,7 +106,7 @@ jobs:
         run: |
           echo "Getting project information..."
 
-          # Get project node ID and existing fields
+          # Try user-owned project first.
           project_data=$(gh api graphql -f query='
             query($owner: String!, $number: Int!) {
               user(login: $owner) {
@@ -128,13 +128,13 @@ jobs:
                   }
                 }
               }
-            }' -f owner="${PROJECT_OWNER}" -F number="${PROJECT_NUMBER}")
+            }' -f owner="${PROJECT_OWNER}" -F number="${PROJECT_NUMBER}" 2>/dev/null || true)
 
-          project_id=$(echo "$project_data" | jq -r '.data.user.projectV2.id')
-          echo "Project ID: $project_id"
+          project_id=$(echo "$project_data" | jq -r '.data.user.projectV2.id // empty' 2>/dev/null || true)
+          echo "User project ID: ${project_id:-'(none)'}"
 
-          if [[ "$project_id" == "null" || -z "$project_id" ]]; then
-            echo "User project not found. Trying organization query..."
+          if [[ -z "$project_id" ]]; then
+            echo "User project not found (or owner is an org). Trying organization query..."
             project_data=$(gh api graphql -f query='
               query($owner: String!, $number: Int!) {
                 organization(login: $owner) {
@@ -156,12 +156,18 @@ jobs:
                     }
                   }
                 }
-              }' -f owner="${PROJECT_OWNER}" -F number="${PROJECT_NUMBER}")
+              }' -f owner="${PROJECT_OWNER}" -F number="${PROJECT_NUMBER}" 2>/dev/null || true)
 
-            project_id=$(echo "$project_data" | jq -r '.data.organization.projectV2.id')
-            fields_json=$(echo "$project_data" | jq -r '.data.organization.projectV2.fields.nodes')
+            project_id=$(echo "$project_data" | jq -r '.data.organization.projectV2.id // empty' 2>/dev/null || true)
+            fields_json=$(echo "$project_data" | jq -r '.data.organization.projectV2.fields.nodes // empty' 2>/dev/null || true)
           else
-            fields_json=$(echo "$project_data" | jq -r '.data.user.projectV2.fields.nodes')
+            fields_json=$(echo "$project_data" | jq -r '.data.user.projectV2.fields.nodes // empty' 2>/dev/null || true)
+          fi
+
+          if [[ -z "$project_id" || -z "$fields_json" ]]; then
+            echo "Error: Could not resolve project owner '${PROJECT_OWNER}' project #${PROJECT_NUMBER}."
+            echo "If PROJECT_OWNER is an organization, ensure PROJECT_OWNER is exactly the org login and token has org access."
+            exit 1
           fi
 
           echo "Project ID: $project_id"
@@ -272,7 +278,12 @@ jobs:
             issue_url=$(gh issue create \
               --repo "${TARGET_REPO}" \
               --title "${subject}" \
-              --body "${body}")
+              --body "${body}" 2>/dev/null || true)
+
+            if [[ -z "${issue_url}" ]]; then
+              echo "  ERROR: Failed to create issue for ${short_hash}. Skipping."
+              continue
+            fi
 
             echo "  Created: ${issue_url}"
 

From 0168e75a09cee10c3821ab4111e1a4669dbf81c2 Mon Sep 17 00:00:00 2001
From: Prachi Gupta <prachi.gupta@amd.com>
Date: Mon, 16 Mar 2026 20:52:52 +0000
Subject: [PATCH 28/43] Fix merge conflicts

---
 CMakeLists.txt | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a74e80af756b1..fdb5062824815 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,19 +56,11 @@ set(CMAKE_C_STANDARD
 # ---[ Utils
 include(cmake/public/utils.cmake)
 
-<<<<<<< HEAD
-# --- [ Check that minimal gcc version is 9.2+
-if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.2)
-  message(
-    FATAL_ERROR
-      "GCC-9.2 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}"
-=======
 # --- [ Check that minimal gcc version is 11.3+
 if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.3)
   message(
     FATAL_ERROR
       "GCC-11.3 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}"
->>>>>>> upstream/main
   )
 endif()
 

From fb6e4ef45355a0845db438946fd9f96279317baf Mon Sep 17 00:00:00 2001
From: Prachi Gupta <pracgupt@amd.com>
Date: Tue, 17 Mar 2026 11:42:10 -0500
Subject: [PATCH 29/43] [develop] Update create_ifu_tag to run via
 workflow_dispatch and PR num (#3076)

If this GitHub workflow fails when it is triggered by the merge of an IFU
PR, we want to be able to re-run it manually to debug the failure and still
create the tags and issues correctly. To support that, the workflow can now
be dispatched manually on a rocm/pytorch branch with the number of the
merged PR (`pr_num`), and it runs the entire pipeline for that PR.

Action Running: https://github.com/ROCm/pytorch/actions/runs/23174239617
IFU PR: https://github.com/ROCm/pytorch/pull/3069
---
 .github/workflows/create_ifu_tag.yml | 71 +++++++++++++++++++---------
 1 file changed, 49 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/create_ifu_tag.yml b/.github/workflows/create_ifu_tag.yml
index fae8140011cf0..7dc766cd06b0a 100644
--- a/.github/workflows/create_ifu_tag.yml
+++ b/.github/workflows/create_ifu_tag.yml
@@ -25,6 +25,11 @@ on:
         required: false
         default: false
         type: boolean
+      pr_num:
+        description: "Merged IFU PR number — runs full pipeline (tags, PR body, create_issues) as if that PR just merged"
+        required: false
+        default: 0
+        type: number
 
 permissions:
   contents: write        # create/push tags
@@ -44,7 +49,7 @@ jobs:
     # Export values so the create-issues job can use them
     outputs:
       prev_post_tag: ${{ steps.prev_tag.outputs.prev_post_tag }}
-      curr_pre_tag: ${{ github.event_name == 'workflow_dispatch' && inputs.test_curr_pre_tag || steps.tagname.outputs.PRE_TAG }}
+      curr_pre_tag: ${{ (github.event_name == 'workflow_dispatch' && inputs.pr_num == 0 && inputs.test_curr_pre_tag) || steps.tagname.outputs.PRE_TAG }}
       has_prev_tag: ${{ steps.prev_tag.outputs.has_prev_tag }}
       issue_prev_ref: ${{ steps.prev_ref.outputs.issue_prev_ref }}
       can_create_issues: ${{ steps.prev_ref.outputs.can_create_issues }}
@@ -65,12 +70,37 @@ jobs:
             echo "  test_issue_prev_ref: ${{ inputs.test_issue_prev_ref }}"
           fi
 
+      # When dispatch + pr_num: fetch PR via API so we have base.ref, head.sha, merge_commit_sha, title (no event.pull_request in dispatch).
+      - name: Get PR details
+        id: get_pr
+        if: github.event_name == 'workflow_dispatch' && inputs.pr_num != 0
+        env:
+          GH_TOKEN: ${{ secrets.IFU_GITHUB_TOKEN }}
+        run: |
+          set -euo pipefail
+          PR_JSON=$(gh api "repos/${{ github.repository }}/pulls/${{ inputs.pr_num }}")
+          MERGE_SHA=$(echo "$PR_JSON" | jq -r .merge_commit_sha)
+          if [[ "$MERGE_SHA" == "null" || -z "$MERGE_SHA" ]]; then
+            echo "ERROR: PR #${{ inputs.pr_num }} is not merged yet. Use a merged IFU PR number."
+            exit 1
+          fi
+          echo "base_ref=$(echo "$PR_JSON" | jq -r .base.ref)" >> "$GITHUB_OUTPUT"
+          echo "head_sha=$(echo "$PR_JSON" | jq -r .head.sha)" >> "$GITHUB_OUTPUT"
+          echo "merge_sha=$MERGE_SHA" >> "$GITHUB_OUTPUT"
+          echo "title=$(echo "$PR_JSON" | jq -r .title)" >> "$GITHUB_OUTPUT"
+          echo "pr_num=${{ inputs.pr_num }}" >> "$GITHUB_OUTPUT"
+          echo "Fetched PR #${{ inputs.pr_num }}: base=$(echo "$PR_JSON" | jq -r .base.ref), merge_sha=$MERGE_SHA"
+
       - name: Checkout base repo (full history)
         uses: actions/checkout@v4
         with:
-          # Use current branch for test mode, PR base branch for real merges
-          ref: ${{ github.event_name == 'workflow_dispatch' && github.ref || github.event.pull_request.base.ref }}
+          # workflow_dispatch:
+          #   pr_num != 0 -> base ref of that PR, as fetched by the get_pr step
+          #   pr_num == 0 -> the ref the workflow was dispatched on (github.ref)
+          # PR merge event -> github.event.pull_request.base.ref
+          ref: ${{ (github.event_name == 'workflow_dispatch' && inputs.pr_num != 0 && steps.get_pr.outputs.base_ref) || (github.event_name == 'workflow_dispatch' && github.ref) || github.event.pull_request.base.ref }}
           fetch-depth: 0
+          token: ${{ secrets.IFU_GITHUB_TOKEN }}
 
       # Fetch all tags so we can find the previous post tag
       - name: Fetch all tags
@@ -83,17 +113,16 @@ jobs:
 
       - name: Derive key SHAs (rocm base, upstream main, merge)
         id: shas
-        # Skip in test mode - we don't have a real PR
-        if: github.event_name != 'workflow_dispatch'
+        if: (github.event_name == 'workflow_dispatch' && inputs.pr_num != 0) || (github.event_name != 'workflow_dispatch')
+        env:
+          PR_NUM: ${{ steps.get_pr.outputs.pr_num || github.event.pull_request.number }}
+          BASE_REF: ${{ steps.get_pr.outputs.base_ref || github.event.pull_request.base.ref }}
+          HEAD_SHA: ${{ steps.get_pr.outputs.head_sha || github.event.pull_request.head.sha }}
+          MERGE_SHA: ${{ steps.get_pr.outputs.merge_sha || github.event.pull_request.merge_commit_sha }}
         shell: bash
         run: |
           set -euo pipefail
 
-          PR_NUM="${{ github.event.pull_request.number }}"
-          BASE_REF="${{ github.event.pull_request.base.ref }}"
-          HEAD_SHA="${{ github.event.pull_request.head.sha }}"
-          MERGE_SHA="${{ github.event.pull_request.merge_commit_sha }}"
-
           # Upstream ref is usually the same as base branch. For rocm/pytorch's
           # develop branch, compare against upstream/main.
           UPSTREAM_REF="$BASE_REF"
@@ -143,10 +172,10 @@ jobs:
 
       - name: Extract tag base from PR title
         id: tagname
-        # Skip in test mode
-        if: github.event_name != 'workflow_dispatch'
+        if: (github.event_name == 'workflow_dispatch' && inputs.pr_num != 0) || (github.event_name != 'workflow_dispatch')
+        env:
+          TITLE: ${{ steps.get_pr.outputs.title || github.event.pull_request.title }}
         run: |
-          TITLE="${{ github.event.pull_request.title }}"
           # Remove everything up to and including "[AUTOGENERATED]"
           # Remove trailing whitespace
           BASE_TAG=$(echo "$TITLE" | sed -E 's/^\[AUTOGENERATED\][[:space:]]*//' | sed -E 's/[[:space:]]+$//')
@@ -169,8 +198,8 @@ jobs:
       - name: Find previous post tag
         id: prev_tag
         env:
-          # Use test_branch input for test mode, extracted BRANCH for real merges
-          BRANCH: ${{ github.event_name == 'workflow_dispatch' && inputs.test_branch || steps.tagname.outputs.BRANCH }}
+          # Dispatch without pr_num: test_branch; dispatch+pr_num or PR merge: from tagname
+          BRANCH: ${{ (github.event_name == 'workflow_dispatch' && inputs.pr_num == 0 && inputs.test_branch) || steps.tagname.outputs.BRANCH }}
         run: |
           echo "Finding previous post tag for branch: ${BRANCH}"
 
@@ -273,8 +302,7 @@ jobs:
           fi
 
       - name: Create pre/post tags
-        # Skip in test mode
-        if: github.event_name != 'workflow_dispatch'
+        if: (github.event_name == 'pull_request') || (github.event_name == 'workflow_dispatch' && inputs.pr_num != 0)
         shell: bash
         run: |
           set -euo pipefail
@@ -284,16 +312,15 @@ jobs:
 
           git tag -a "${{ steps.tagname.outputs.PRE_TAG }}"  -m "IFU pre (PR #${{ steps.shas.outputs.PR_NUM }})"  "${{ steps.shas.outputs.ROCM_BASE_SHA }}"
           git tag -a "${{ steps.tagname.outputs.POST_TAG }}" -m "IFU post (PR #${{ steps.shas.outputs.PR_NUM }})" "${{ steps.shas.outputs.MERGE_SHA }}"
-
+          
           #Force pushing is safe. If we land a new PR, we'd wanna retag a commit if we have to.
           git push origin "refs/tags/${{ steps.tagname.outputs.PRE_TAG }}" -f
           git push origin "refs/tags/${{ steps.tagname.outputs.POST_TAG  }}" -f
 
       - name: Append rocm_base & upstream_main to PR body
-        # Skip in test mode
-        if: github.event_name != 'workflow_dispatch'
+        if: (github.event_name == 'pull_request') || (github.event_name == 'workflow_dispatch' && inputs.pr_num != 0)
         env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GH_TOKEN: ${{ secrets.IFU_GITHUB_TOKEN }}
         shell: bash
         run: |
           set -euo pipefail
@@ -316,7 +343,7 @@ jobs:
     needs: tag-ifu
     if: >
       needs.tag-ifu.outputs.can_create_issues == 'true' &&
-      (github.event_name != 'workflow_dispatch' || inputs.run_full_chain == true)
+      (github.event_name != 'workflow_dispatch' || inputs.run_full_chain == true || inputs.pr_num != 0)
     uses: ./.github/workflows/create_ifu_issues.yml
     with:
       prev_post_tag: ${{ needs.tag-ifu.outputs.issue_prev_ref }}

From 79e88771ce581c09b352918bb2fe2f5d4f0f98aa Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Fri, 10 Apr 2026 07:53:22 -0700
Subject: [PATCH 30/43] [CI]Add parity report scripts and workflow (#3094)

## Summary
- Add `pytorch-unit-test-scripts/` directory with all parity scripts
(download_testlogs, summarize_xml_testreports, parity.sh, and supporting
utilities)
- Add `parity.yml` GitHub Actions workflow that can be manually
triggered to download CI artifacts and generate parity CSVs
- All `download_testlogs` and `summarize_xml_testreports.py` flags are
exposed as workflow inputs (SHA, PR ID, arch, exclude flags, filter, set
names, etc.)
- Architectures are configurable via comma-separated input (default:
mi200,mi300,mi355)
- Generated CSVs and logs are uploaded as downloadable workflow
artifacts

## Setup
Requires these repository secrets:

- [x] - `IFU_GITHUB_TOKEN` (already exists)
- [x] - `AWS_ACCESS_KEY_ID`
- [x] - `AWS_SECRET_ACCESS_KEY`

## Test plan
- [x] Trigger workflow via Actions tab or `gh workflow run parity.yml
--ref add-parity-scripts-dashboard`
- [x] Verify artifacts download and CSVs generate for each architecture
- [x] Verify CSV artifacts are downloadable from the workflow run
https://github.com/ethanwee1/pytorch/actions/runs/23413634454

---------

Co-authored-by: Jithun Nair <jithun.nair@amd.com>
---
 .../auto_classify_skip_reasons.py             | 1027 +++++++++++++++++
 .../download_testlogs                         |  922 +++++++++++++++
 .../generate_summary.py                       |  427 +++++++
 .../requirements.txt                          |    4 +
 .../summarize_xml_testreports.py              |  717 ++++++++++++
 .../upload_stats_lib.py                       |  187 +++
 .../upload_test_stats.py                      |  394 +++++++
 .github/workflows/parity.yml                  |  369 ++++++
 8 files changed, 4047 insertions(+)
 create mode 100644 .automation_scripts/pytorch-unit-test-scripts/auto_classify_skip_reasons.py
 create mode 100755 .automation_scripts/pytorch-unit-test-scripts/download_testlogs
 create mode 100644 .automation_scripts/pytorch-unit-test-scripts/generate_summary.py
 create mode 100644 .automation_scripts/pytorch-unit-test-scripts/requirements.txt
 create mode 100755 .automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py
 create mode 100644 .automation_scripts/pytorch-unit-test-scripts/upload_stats_lib.py
 create mode 100644 .automation_scripts/pytorch-unit-test-scripts/upload_test_stats.py
 create mode 100644 .github/workflows/parity.yml

diff --git a/.automation_scripts/pytorch-unit-test-scripts/auto_classify_skip_reasons.py b/.automation_scripts/pytorch-unit-test-scripts/auto_classify_skip_reasons.py
new file mode 100644
index 0000000000000..d9d14deefb268
--- /dev/null
+++ b/.automation_scripts/pytorch-unit-test-scripts/auto_classify_skip_reasons.py
@@ -0,0 +1,1027 @@
+#!/usr/bin/env python3
+"""
+Auto-classify skip reasons for ROCm parity CSV tests.
+
+Takes a parity CSV (output of summarize_xml_testreports.py) and automatically
+assigns skip_reason categories to tests where ROCm=SKIPPED/MISSED and CUDA=PASSED
+based on patterns in:
+  - The skip message (message_rocm column)
+  - The test file name
+  - The test class name
+  - The test name
+
+Rules are ordered by specificity: combined match rules first, then message-based,
+then file+class combos, then file-only fallbacks. First matching rule wins.
+
+Usage:
+  python auto_classify_skip_reasons.py -i input.csv -o output.csv [--report]
+  python auto_classify_skip_reasons.py -i input.csv -o output.csv --tsv-out updated_skip_reasons.tsv
+  python auto_classify_skip_reasons.py -i input.csv --dry-run --report
+"""
+
+import argparse
+import ast
+import csv
+import re
+import sys
+from collections import Counter, defaultdict
+
+
+# ---------------------------------------------------------------------------
+# Rules are evaluated top-to-bottom; first match wins.
+# Each rule is a dict with:
+#   reason:   the skip_reason category string
+#   msg:      (optional) regex to match against the skip message
+#   file:     (optional) regex to match against test_file
+#   cls:      (optional) regex to match against test_class
+#   name:     (optional) regex to match against test_name
+#   workflow: (optional) one of "default", "distributed", "inductor"
+#
+# All provided fields must match (AND logic). Omitted fields match anything.
+# msg="" matches empty messages; omitting msg matches anything.
+# ---------------------------------------------------------------------------
+
+RULES = [
+    # ==================================================================
+    # TIER 1: High-specificity combined rules (message + file/class)
+    # ==================================================================
+
+    # --- bfloat16_SDPA_ME: dropout mask in test_transformers with bfloat16 in TEST NAME ---
+    # Must be before generic SDPA_ME rule
+    {"reason": "bfloat16_SDPA_ME",
+     "msg": r"_fill_mem_eff_dropout_mask",
+     "file": r"^test_transformers$",
+     "name": r"(?i)bfloat16|bf16"},
+
+    # --- GEMMS: test_mm_bmm in test_matmul_cuda with accuracy regression ---
+    # Must be before generic hipblas rule
+    {"reason": "GEMMS",
+     "msg": r"accuracy regression in hipblas",
+     "file": r"^test_matmul_cuda$",
+     "name": r"test_mm_bmm"},
+
+    # --- hipblas hipblaslt: test_addmm/test_cublas/other in test_matmul_cuda ---
+    {"reason": "hipblas hipblaslt",
+     "msg": r"accuracy regression in hipblas",
+     "file": r"^test_matmul_cuda$"},
+    {"reason": "hipblas hipblaslt",
+     "msg": r"skipIfRocm.*doesn't currently work",
+     "file": r"^test_matmul_cuda$"},
+    {"reason": "hipblas hipblaslt",
+     "file": r"^test_matmul_cuda$",
+     "msg": r"Green contexts are not supported"},
+
+    # --- Expected to work: skipCUDAIfRocm in test_meta for ldl_solve ops ---
+    {"reason": "Expected to work",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_meta$",
+     "name": r"(?i)ldl_solve"},
+
+    # --- Linalg: skipCUDAIfRocm in test_meta for other linalg ops ---
+    {"reason": "Linalg",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_meta$"},
+
+    # --- Linalg: skipCUDAIfRocm in test_ops/test_linalg/test_meta/test_ops_fwd_gradients/test_ops_gradients ---
+    # These are ops like linalg.svd, linalg.eigh, etc.
+    {"reason": "Linalg",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_linalg$"},
+    {"reason": "Linalg",
+     "msg": r"_convert_weight_to_int4pack_cuda.*(supported only for|is supported only for) CDNA"},
+    {"reason": "Linalg",
+     "msg": r"bfloat16 NCHW train failed"},
+    {"reason": "Linalg",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_ops$",
+     "name": r"(?i)linalg|svd|eig[hs]?|cholesky|lstsq|solve|inv|det|qr|lu|pinv|matrix_rank|cross|norm|cond|householder|ormqr|geqrf|triangular|vecdot|multi_dot"},
+    {"reason": "Linalg",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_ops_fwd_gradients$"},
+    {"reason": "Linalg",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_ops_gradients$",
+     "name": r"(?i)linalg|svd|eig[hs]?|cholesky|lstsq|solve|inv|det|qr|lu|pinv|householder|ormqr|geqrf|triangular"},
+    {"reason": "Linalg",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_meta$",
+     "name": r"(?i)linalg|svd|eig[hs]?|cholesky|lstsq|solve|inv|det|qr|lu|pinv|householder|ormqr|geqrf|triangular"},
+    {"reason": "Linalg",
+     "file": r"^test_nn$",
+     "msg": r"skipIfRocm.*doesn't currently work"},
+
+    # --- hipSolver/Magma: skipCUDAIfRocm in test_ops for ldl_solve, scaled_dot_product, conv_transpose3d ---
+    {"reason": "hipSolver/Magma",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_ops$",
+     "name": r"(?i)ldl_solve|scaled_dot_product|conv_transpose3d"},
+    {"reason": "hipSolver/Magma",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_ops_jit$"},
+    {"reason": "hipSolver/Magma",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_decomp$"},
+    {"reason": "hipSolver/Magma",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_schema_check$"},
+    {"reason": "hipSolver/Magma",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_testing$"},
+    {"reason": "hipSolver/Magma",
+     "msg": r"Skipped for ROCm!"},
+    {"reason": "hipSolver/Magma",
+     "msg": r"test_cow_input does not work with efficient attention on ROCM"},
+
+    # --- Compiler issue: "Skipped!" in test_ops for specific compiler-related tests ---
+    {"reason": "Compiler issue",
+     "msg": r"^Skipped!$",
+     "file": r"^test_ops$",
+     "name": r"(?i)special_hermite_polynomial_h|special_laguerre"},
+
+    # --- non-standard bool: "Skipped!" in test_ops for bool-related tests ---
+    {"reason": "non-standard bool",
+     "msg": r"^Skipped!$",
+     "file": r"^test_ops$",
+     "name": r"(?i)bool"},
+
+    # --- pow: "Skipped!" in test_ops/test_decomp for pow tests ---
+    {"reason": "pow",
+     "msg": r"^Skipped!$",
+     "file": r"^test_ops$|^test_decomp$",
+     "name": r"(?i)^pow$|_pow_|float_power"},
+
+    # --- fft: "Skipped!" or "Skipped on ROCm" in test_ops for fft tests ---
+    {"reason": "fft",
+     "msg": r"^Skipped(!| on ROCm)$",
+     "file": r"^test_ops$",
+     "name": r"(?i)fft"},
+
+    # --- NHWC: "Skipped!" in test_modules for NHWC tests ---
+    {"reason": "NHWC",
+     "msg": r"^Skipped!$",
+     "file": r"^test_modules$"},
+
+    # (FakeTensor removed — "Requires CUDA" messages are explicit NVIDIA test per policy)
+
+    # --- hermite_polynomial_h: custom_mask_type in test_ops for hermite ---
+    {"reason": "hermite_polynomial_h",
+     "msg": r"Efficient attention on ROCM doesn't support custom_mask_type",
+     "file": r"^test_ops$",
+     "name": r"(?i)hermite"},
+
+    # --- fake_crossref: skipCUDAIfRocm in test_ops for crossref tests ---
+    {"reason": "fake_crossref",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work",
+     "file": r"^test_ops$",
+     "name": r"(?i)crossref|fake_crossref"},
+
+    # --- Jit: Tensor-likes not close in test_jit_fuser ---
+    {"reason": "Jit",
+     "msg": r"Tensor-likes are not close",
+     "file": r"test_jit_fuser"},
+
+    # --- Memory allocation: TestBlockStateAbsorption in test_cuda ---
+    {"reason": "Memory allocation",
+     "file": r"^test_cuda$",
+     "cls": r"^TestBlockStateAbsorption$"},
+
+    # --- cuda allocator: TestCudaAllocator in test_cuda ---
+    {"reason": "cuda allocator",
+     "file": r"^test_cuda$",
+     "cls": r"^TestCudaAllocator$"},
+
+    # --- hipGraph/cudaGraph: CudaGraph-related classes in test_cuda ---
+    {"reason": "hipGraph/cudaGraph",
+     "file": r"^test_cuda$",
+     "cls": r"CachingHostAllocatorCudaGraph|GreenContext"},
+
+    # --- Memory allocation: TestMemPool in test_cuda ---
+    {"reason": "Memory allocation",
+     "file": r"^test_cuda$",
+     "cls": r"^TestMemPool$"},
+
+    # --- Profiler: TestFXMemoryProfiler in test_cuda ---
+    {"reason": "Profiler",
+     "file": r"^test_cuda$",
+     "cls": r"FXMemoryProfiler"},
+
+    # --- compiled optimizer: ROCm numerical behavior in inductor.test_compiled_optimizers ---
+    {"reason": "compiled optimizer",
+     "msg": r"ROCm may have different numerical behavior",
+     "file": r"inductor\.test_compiled_optimizers"},
+
+    # --- functorch: FuncTorch classes in inductor.test_compiled_autograd ---
+    {"reason": "functorch",
+     "file": r"^inductor\.test_compiled_autograd$",
+     "cls": r"FuncTorch"},
+
+    # --- PT2.0 - Distributed: DTensor classes in inductor.test_compiled_autograd ---
+    {"reason": "PT2.0 - Distributed",
+     "file": r"^inductor\.test_compiled_autograd$",
+     "cls": r"DTensor"},
+
+    # --- hipdnn: cudnn Attention messages ---
+    {"reason": "hipdnn",
+     "msg": r"[Cc]u[Dd][Nn][Nn] Attention is not supported"},
+    {"reason": "hipdnn",
+     "msg": r"Efficient or cuDNN Attention was not built"},
+
+    # --- Will not be supported on ROCm: test_transformers with (no message) ---
+    {"reason": "Will not be supported on ROCm",
+     "file": r"^test_transformers$",
+     "cls": r"SDPA.*CUDA",
+     "msg": r"^$"},
+
+    # --- transformers: test_transformers / test_flop_counter with misc messages ---
+    {"reason": "transformers",
+     "file": r"^test_transformers$",
+     "msg": r"Does not support all SDPA backends"},
+    {"reason": "transformers",
+     "file": r"^test_flop_counter$"},
+
+    # --- bfloat16: test_sparse_csr with (no message) ---
+    {"reason": "bfloat16",
+     "file": r"^test_sparse_csr$",
+     "cls": r"[Bb]float16|bf16"},
+    {"reason": "bfloat16",
+     "file": r"^test_sparse$",
+     "cls": r"[Bb]float16|bf16"},
+    {"reason": "bfloat16",
+     "file": r"^test_matmul_cuda$",
+     "msg": r"ROCm doesn't support CUTLASS"},
+
+    # --- explicit NVIDIA test: test_sparse_semi_structured with cutlass in NAME ---
+    {"reason": "explicit NVIDIA test",
+     "file": r"^test_sparse_semi_structured$",
+     "name": r"(?i)cutlass"},
+
+    # --- cusparselt: everything else in test_sparse_semi_structured ---
+    {"reason": "cusparselt",
+     "file": r"^test_sparse_semi_structured$"},
+
+    # --- Quantization: distributed quantization tests ---
+    {"reason": "Quantization",
+     "msg": r"Test skipped for ROCm",
+     "file": r"distributed\.algorithms\.quantization"},
+
+    # --- Process Group: distributed spawn/c10d with "Test skipped for ROCm" ---
+    {"reason": "Process Group",
+     "msg": r"Test skipped for ROCm",
+     "file": r"distributed\.test_distributed_spawn.*nccl"},
+
+    # ==================================================================
+    # TIER 2: Message-based rules (strong signal from skip message)
+    # ==================================================================
+
+    # SDPA_ME
+    {"reason": "SDPA_ME",
+     "msg": r"_fill_mem_eff_dropout_mask"},
+    {"reason": "SDPA_ME",
+     "msg": r"Efficient attention on ROCM doesn't support custom_mask_type"},
+    {"reason": "SDPA_ME",
+     "msg": r"Efficient Attention on ROCM does not support head_dim"},
+
+    # SDPA_FA
+    {"reason": "SDPA_FA",
+     "msg": r"Large numerical errors on ROCM"},
+    {"reason": "SDPA_FA",
+     "msg": r"flash attention not supported"},
+
+    # Will not be supported on ROCm
+    {"reason": "Will not be supported on ROCm",
+     "msg": r"head_dim != head_dim_v unsupported on ROCm"},
+
+    # Triton 3.7 bump
+    {"reason": "triton 3.7 bump",
+     "msg": r"skipIfRocm.*Fails with Triton 3\.7"},
+
+    # MIOpen
+    {"reason": "MIOpen Convolutions",
+     "msg": r"Marked as skipped for MIOpen"},
+
+    # Static CUDA launcher
+    {"reason": "static cuda launcher",
+     "msg": r"Static cuda launcher doesn't work with ROCM"},
+
+    # NUMBA
+    {"reason": "NUMBA",
+     "msg": r"No numba\.cuda"},
+
+    # int4
+    {"reason": "int4",
+     "msg": r"_int4_mm is supported only for CDNA"},
+
+    # FP8
+    {"reason": "FP8",
+     "msg": r"cuBLAS blockwise scaling"},
+
+    # variable length attention
+    {"reason": "variable length attention",
+     "msg": r"ROCm does not support seqused_k"},
+
+    # CUDA IPC
+    {"reason": "Pass with unskip or minor mod",
+     "msg": r"CUDA IPC not available"},
+
+    # Python version
+    {"reason": "Python version",
+     "msg": r"Not supported in Python 3\.1[0-9]+"},
+
+    # cpp_test / CUDA not found
+    {"reason": "cpp_test",
+     "msg": r"CUDA not found"},
+    {"reason": "cpp_test",
+     "msg": r"CUDA_HOME not set"},
+
+    # Foreach
+    {"reason": "Foreach",
+     "msg": r"failed starting on ROCm"},
+
+    # CUTLASS
+    {"reason": "cutlass",
+     "msg": r"ROCm doesn't support CUTLASS|CUTLASS backend is not supported on HIP|ROCm and Windows doesn't support CUTLASS"},
+
+    # Transformers dependency
+    {"reason": "transformers",
+     "msg": r"No transformers"},
+
+    # hipGraph / cudaGraph (but NOT in functorch files -- those stay functorch)
+    {"reason": "hipGraph/cudaGraph",
+     "msg": r"Green contexts are not supported"},
+    {"reason": "functorch",
+     "msg": r"CUDA 12\.4 or greater is required for CUDA Graphs",
+     "file": r"^functorch\."},
+    {"reason": "hipGraph/cudaGraph",
+     "msg": r"CUDA 12\.4 or greater is required for CUDA Graphs"},
+    {"reason": "hipGraph/cudaGraph",
+     "msg": r"ROCM >= 5\.3 required for graphs.*cuda-bindings"},
+
+    # TMA / Blackwell
+    {"reason": "Will not be supported on ROCm",
+     "msg": r"Need.*TMA support"},
+    {"reason": "Will not be supported on ROCm",
+     "msg": r"Need Blackwell"},
+
+    # CUDA SM requirements
+    {"reason": "explicit NVIDIA test",
+     "msg": r"Requires CUDA SM >= [0-9]"},
+    {"reason": "explicit NVIDIA test",
+     "msg": r"Requires CUDA with SM >= [0-9]"},
+    {"reason": "explicit NVIDIA test",
+     "msg": r"Test is only supported on CUDA 1[0-9]"},
+    {"reason": "explicit NVIDIA test",
+     "msg": r"Requires NCCL version greater than"},
+    {"reason": "explicit NVIDIA test",
+     "msg": r"Excluded from CUDA tests"},
+
+    # FP8 — MI300+ / H100+ only
+    {"reason": "FP8",
+     "msg": r"FP8 is only supported on H100\+|FP8 is not supported on this platform|FP8 requires H100\+"},
+    {"reason": "FP8",
+     "msg": r"requires gpu with fp8 support"},
+
+    # Symmetric memory
+    {"reason": "Symmetric memory",
+     "msg": r"SymmMem is not supported on this ROCm arch"},
+
+    # Python version / 3.12+
+    {"reason": "Python version",
+     "msg": r"Failing on python 3\.12\+|torch\.compile is not supported on python 3\.12\+|complex flaky in 3\.12"},
+
+    # Greater than 4 GPU (distributed)
+    {"reason": "Greater than 4 GPU",
+     "msg": r"Need at least 4 CUDA devices"},
+    {"reason": "Greater than 4 GPU",
+     "msg": r"Test requires.*world size of 4"},
+    {"reason": "Greater than 4 GPU",
+     "msg": r"requires [34] GPUs, found [12]"},
+
+    # tensor_parallel — architecture-specific skip
+    {"reason": "tensor_parallel",
+     "msg": r"test only runs on \('gfx942'"},
+
+    # Sharded Tensor: subprocess level skip in _shard
+    # Must be before the generic Process Group rule (same message, no file filter)
+    {"reason": "Sharded Tensor",
+     "msg": r"Test skipped at subprocess level",
+     "file": r"distributed\._shard"},
+    # Process Group: subprocess level skip (any other file)
+    {"reason": "Process Group",
+     "msg": r"Test skipped at subprocess level"},
+
+    # Process Group: NCCL version / device assert
+    {"reason": "Process Group",
+     "msg": r"NCCL test requires 2\+ GPUs"},
+
+    # Misc: ROCm preserves subnormals
+    {"reason": "Misc",
+     "msg": r"ROCm preserves subnormals"},
+
+    # Misc: GCC codegen
+    {"reason": "Misc",
+     "msg": r"Fails under GCC 1[0-9] due to vector codegen"},
+
+    # Misc: Skipped on ROCm due to hang
+    {"reason": "Misc",
+     "msg": r"Skipped on ROCm due to hang"},
+
+    # Misc: Test skipped for ROCm (generic distributed)
+    {"reason": "Misc",
+     "msg": r"Test skipped for ROCm"},
+
+    # Misc: architecture-specific skips
+    {"reason": "Misc",
+     "msg": r"test skipped on \('gfx"},
+
+    # cuFFT-specific
+    {"reason": "Misc",
+     "msg": r"cuFFT-specific"},
+
+    # ROCTracer profiler
+    {"reason": "Memory allocation",
+     "msg": r"ROCTracer does not capture"},
+
+    # expandable_segments-related messages
+    {"reason": "expandable_segments",
+     "msg": r"expandable_segments mode is not supported on ROCm"},
+    {"reason": "expandable_segments",
+     "msg": r"CUDA >= 11\.0 required for external events in cuda graphs.*rocm"},
+
+    # not enabled by default on rocm
+    {"reason": "expandable_segments",
+     "msg": r"not enabled by default on rocm"},
+
+    # HIP runtime context
+    {"reason": "Misc",
+     "msg": r"HIP runtime doesn't create context"},
+
+    # ==================================================================
+    # TIER 3: File + class based rules (for empty/generic messages)
+    # ==================================================================
+
+    # --- test_cuda class-based disambiguation ---
+    {"reason": "Misc",
+     "file": r"^test_cuda$",
+     "cls": r"^TestCuda$"},
+    {"reason": "compiled optimizer",
+     "file": r"^test_cuda$",
+     "cls": r"TestCudaOptims"},
+    {"reason": "Misc",
+     "file": r"^test_cuda$",
+     "cls": r"TestCudaAutocast"},
+    {"reason": "cpp_test",
+     "file": r"^test_cuda$",
+     "cls": r"TestCompileKernel"},
+
+    # --- test_nn: tf32 for "Test is disabled" (must precede the catch-all); else Misc (MI200-specific skips, no message) ---
+    {"reason": "tf32", "file": r"^test_nn$", "msg": r"Test is disabled"},
+    {"reason": "Misc", "file": r"^test_nn$"},
+
+    # --- inductor.test_fp8 ---
+    {"reason": "FP8",
+     "file": r"^inductor\.test_fp8$"},
+
+    # --- test_scaled_matmul_cuda ---
+    {"reason": "FP8",
+     "file": r"^test_scaled_matmul_cuda$"},
+
+    # --- inductor.test_torchinductor_strided_blocks ---
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_torchinductor_strided_blocks$"},
+
+    # --- inductor.test_flex_decoding ---
+    {"reason": "flex_decoding",
+     "file": r"^inductor\.test_flex_decoding$"},
+
+    # --- inductor.test_loop_ordering ---
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_loop_ordering$"},
+
+    # --- torch_np / numpy tests ---
+    {"reason": "NumPy",
+     "file": r"^torch_np\."},
+
+    # --- test_binary_ufuncs ---
+    {"reason": "Misc",
+     "file": r"^test_binary_ufuncs$"},
+
+    # --- test_fx ---
+    {"reason": "FX",
+     "file": r"^test_fx$"},
+
+    # --- profiler.test_execution_trace ---
+    {"reason": "Profiler",
+     "file": r"^profiler\.test_execution_trace$"},
+
+    # --- test_cpp_api_parity ---
+    {"reason": "cpp_test",
+     "file": r"^test_cpp_api_parity$"},
+
+    # --- test_expanded_weights ---
+    {"reason": "Misc",
+     "file": r"^test_expanded_weights$"},
+
+    # --- test_linalg (arch-specific skips) ---
+    {"reason": "Linalg",
+     "file": r"^test_linalg$"},
+
+    # --- test_torch (arch-specific skips) ---
+    {"reason": "Misc",
+     "file": r"^test_torch$"},
+
+    # --- nn.test_convolution (arch-specific) ---
+    {"reason": "MIOpen Convolutions",
+     "file": r"^nn\.test_convolution$"},
+
+    # --- inductor.test_aot_inductor_arrayref ---
+    {"reason": "PT2.0 - AOTInductor",
+     "file": r"^inductor\.test_aot_inductor_arrayref$"},
+
+    # --- distributed.test_symmetric_memory ---
+    {"reason": "Symmetric memory",
+     "file": r"^distributed\.test_symmetric_memory$"},
+
+    # --- inductor.test_compiled_autograd HigherOrderOp (MI300 has more classes) ---
+    {"reason": "functorch",
+     "file": r"^inductor\.test_compiled_autograd$",
+     "cls": r"HigherOrderOp"},
+
+    # --- explicit NVIDIA test in various files ---
+    {"reason": "explicit NVIDIA test",
+     "file": r"^test_cuda_nvml_based_avail$"},
+    {"reason": "explicit NVIDIA test",
+     "file": r"^test_cpp_extensions_aot"},
+
+    # --- hipGraph/cudaGraph: only test_graph_* (NOT test_cuda_graph_*) in test_cuda_expandable_segments ---
+    {"reason": "hipGraph/cudaGraph",
+     "file": r"^test_cuda_expandable_segments$",
+     "name": r"^test_graph_"},
+
+    # --- expandable_segments (everything else in test_cuda_expandable_segments) ---
+    {"reason": "expandable_segments",
+     "file": r"^test_cuda_expandable_segments$"},
+
+    # --- Profiler ---
+    {"reason": "Profiler",
+     "file": r"^profiler\.test_profiler$"},
+
+    # --- serialization ---
+    {"reason": "serialization",
+     "file": r"^test_serialization$"},
+
+    # --- dataloader ---
+    {"reason": "dataloader",
+     "file": r"^test_dataloader$"},
+
+    # --- Multi-Processing ---
+    {"reason": "Multi-Processing",
+     "file": r"^test_multiprocessing_spawn$"},
+    {"reason": "Multi-Processing",
+     "file": r"^test_multiprocessing$"},
+
+    # --- hipSparse ---
+    {"reason": "hipSparse",
+     "file": r"^test_sparse_csr$"},
+    {"reason": "hipSparse",
+     "file": r"^test_sparse$",
+     "msg": r"^$"},
+
+    # --- nested tensor ---
+    {"reason": "nested tensor",
+     "file": r"^test_nestedtensor$"},
+
+    # --- asm_elementwise ---
+    {"reason": "asm_elementwise",
+     "file": r"higher_order_ops\.test_inline_asm_elementwise"},
+
+    # --- torchinductor_opinfo_properties ---
+    {"reason": "torchinductor_opinfo_properties",
+     "file": r"^inductor\.test_torchinductor_opinfo_properties$"},
+
+    # --- flex_attention ---
+    {"reason": "flex_attention",
+     "file": r"^inductor\.test_flex_attention$"},
+
+    # --- compiled optimizer ---
+    {"reason": "compiled optimizer",
+     "file": r"^inductor\.test_compiled_optimizers$"},
+
+    # --- inductor combo_kernels ---
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_combo_kernels$"},
+
+    # --- inductor compiled_autograd (remaining after FuncTorch/DTensor class rules) ---
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_compiled_autograd$"},
+
+    # --- Foreach (inductor) ---
+    {"reason": "Foreach",
+     "file": r"^inductor\.test_foreach$"},
+
+    # --- inductor codecache / cudacodecache ---
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_codecache$"},
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_cudacodecache$"},
+
+    # --- inductor GPU cpp wrapper ---
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_gpu_cpp_wrapper$"},
+
+    # --- inductor torchinductor variants ---
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_torchinductor$"},
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_torchinductor_dynamic_shapes$"},
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_torchinductor_codegen_dynamic_shapes$"},
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_torchinductor_opinfo$"},
+
+    # --- inductor compile subprocess ---
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_compile_subprocess$"},
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_compile_worker$"},
+
+    # --- inductor cpu/cuda repro ---
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_cpu_repro$"},
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_cuda_repro$"},
+
+    # --- inductor custom lowering / minifier ---
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_custom_lowering$"},
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_minifier"},
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^inductor\.test_mix_order"},
+
+    # --- inductor aot_inductor ---
+    {"reason": "PT2.0 - AOTInductor",
+     "file": r"^inductor\.test_aot_inductor"},
+
+    # --- functorch ---
+    {"reason": "functorch",
+     "file": r"^functorch\."},
+
+    # --- dynamo ---
+    {"reason": "PT2.0 - Dynamo",
+     "file": r"^dynamo\."},
+
+    # --- export ---
+    {"reason": "PT2.0 - Inductor",
+     "file": r"^export\."},
+
+    # --- tf32: test_nn with "Test is disabled" ---
+    {"reason": "tf32",
+     "file": r"^test_nn$",
+     "msg": r"Test is disabled"},
+
+    # --- MIOpen Convolutions ---
+    {"reason": "MIOpen Convolutions",
+     "file": r"^nn\.test_convolution$"},
+
+    # --- test_stateless ---
+    {"reason": "Misc",
+     "file": r"^test_stateless$"},
+
+    # --- test_cuda_primary_ctx ---
+    {"reason": "Misc",
+     "file": r"^test_cuda_primary_ctx$"},
+
+    # --- test_torchfuzz ---
+    {"reason": "Misc",
+     "file": r"^test_torchfuzz"},
+
+    # ==================================================================
+    # TIER 4: Distributed file-based rules
+    # ==================================================================
+
+    # Sharded Tensor
+    {"reason": "Sharded Tensor",
+     "file": r"^distributed\._shard\."},
+    {"reason": "Sharded Tensor",
+     "file": r"^distributed\._composable\.fsdp\.test_fully_shard_training$"},
+    {"reason": "Sharded Tensor",
+     "file": r"^distributed\._composable\.fsdp\.test_fully_shard_clip_grad"},
+
+    # tensor_parallel
+    {"reason": "tensor_parallel",
+     "file": r"^distributed\.tensor\.parallel\."},
+
+    # pipeline_parallel
+    {"reason": "pipeline_parallel",
+     "file": r"^distributed\.pipelining\."},
+
+    # FSDP
+    {"reason": "FSDP",
+     "file": r"^distributed\.fsdp\."},
+    {"reason": "FSDP",
+     "file": r"^distributed\._composable\.fsdp\."},
+
+    # 2D FSDP / composability
+    {"reason": "2D FSDP",
+     "file": r"^distributed\._composable\.test_composability"},
+
+    # DDP / replicate
+    {"reason": "DDP",
+     "file": r"^distributed\._composable\.test_replicate"},
+
+    # Process Group / c10d
+    {"reason": "Process Group",
+     "file": r"^distributed\.test_c10d_"},
+
+    # PT2.0 - Distributed (dynamo_distributed)
+    {"reason": "PT2.0 - Distributed",
+     "file": r"^distributed\.test_dynamo_distributed$"},
+
+    # Collectives (tensor ops, composability, nccl)
+    {"reason": "Collectives",
+     "file": r"^distributed\.tensor\.test_"},
+    {"reason": "Collectives",
+     "file": r"^distributed\.test_composability$"},
+    {"reason": "Collectives",
+     "file": r"^distributed\.test_nccl$"},
+
+    # Distributed tools
+    {"reason": "Misc",
+     "file": r"^distributed\._tools\."},
+
+    # Distributed elastic
+    {"reason": "elastic",
+     "file": r"^distributed\.elastic\."},
+
+    # Distributed quantization
+    {"reason": "Quantization",
+     "file": r"^distributed\.algorithms\.quantization"},
+
+    # Distributed rpc
+    {"reason": "Misc",
+     "file": r"^distributed\.rpc\."},
+
+    # Distributed spawn
+    {"reason": "Misc",
+     "file": r"^distributed\.test_distributed_spawn"},
+
+    # Distributed (generic catch-all)
+    {"reason": "Misc",
+     "file": r"^distributed\."},
+
+    # ==================================================================
+    # TIER 5: Generic message fallbacks
+    # ==================================================================
+
+    # "Test is disabled" messages
+    {"reason": "Misc",
+     "msg": r"Test is disabled because an issue exists disabling it"},
+
+    # Generic skipIfRocm / skipCUDAIfRocm
+    {"reason": "Misc",
+     "msg": r"skipIfRocm.*doesn't currently work on the ROCm stack"},
+    {"reason": "Misc",
+     "msg": r"skipCUDAIfRocm.*doesn't currently work on the ROCm stack"},
+
+    # "Skipped!" / "Skipped"
+    {"reason": "Misc",
+     "msg": r"^Skipped!?$"},
+
+    # "Skipped on ROCm"
+    {"reason": "Misc",
+     "msg": r"^Skipped on ROCm$"},
+
+    # Not supported on ROCm (generic)
+    {"reason": "Will not be supported on ROCm",
+     "msg": r"Not supported on ROCm"},
+
+    # ==================================================================
+    # TIER 6: Catch-all for remaining test_cuda (no message, generic class)
+    # ==================================================================
+    {"reason": "Misc",
+     "file": r"^test_cuda$"},
+]
+
+
+def extract_message(raw_msg: str) -> str:
+    """Return the skip message of a raw CSV cell: a dict literal's 'message' value, else the stripped text."""
+    text = (raw_msg or '').strip()
+    try:
+        parsed = ast.literal_eval(text) if text else None
+    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):  # all documented literal_eval failures
+        parsed = None
+    if isinstance(parsed, dict):
+        message = parsed.get('message', str(parsed))  # no 'message' key: fall back to the dict's repr
+        return '' if message is None else str(message)  # always hand callers a str for re.search()
+    return text
+
+
+def classify_test(msg: str, test_file: str, test_class: str, test_name: str,
+                  workflow: str = '') -> str | None:
+    """Walk RULES in order and return the reason of the first rule whose fields all match.
+
+    A field missing from a rule matches anything. ``msg`` patterns are searched with
+    re.IGNORECASE; ``file``/``cls``/``name`` patterns are searched without extra flags.
+    A rule's ``workflow`` only rejects a test whose own workflow is non-empty and
+    different. Returns None when no rule matches.
+    """
+    candidates = (
+        ('msg', msg, re.IGNORECASE),
+        ('file', test_file, 0),
+        ('cls', test_class, 0),
+        ('name', test_name, 0),
+    )
+    for entry in RULES:
+        if any(key in entry and not re.search(entry[key], text, flags)
+               for key, text, flags in candidates):
+            continue
+        if 'workflow' in entry and workflow and workflow != entry['workflow']:
+            continue
+        return entry['reason']
+    return None
+
+
+def parse_args():
+    """Build the CLI parser and parse the process arguments; only -i/--input is mandatory."""
+    cli = argparse.ArgumentParser(description='Auto-classify skip reasons for ROCm parity CSVs')
+    cli.add_argument('-i', '--input', help='Input parity CSV file', required=True)
+    cli.add_argument('-o', '--output', help='Output CSV with auto-classified skip_reason column')
+    # The help text is split in two literals only to keep the source line short.
+    tsv_help = ('Also write a TSV file in skip_reasons format '
+                '(compatible with --skip_reasons in summarize_xml_testreports.py)')
+    cli.add_argument('--tsv-out', help=tsv_help)
+    # Boolean switches: each is False unless given on the command line.
+    switches = (
+        ('--only-unclassified', 'Only classify tests that have no skip_reason (default)'),
+        ('--reclassify-all', 'Re-classify all tests, overwriting existing skip_reason'),
+        ('--report', 'Print classification report to stderr'),
+        ('--dry-run', 'Print report but do not write output files'),
+    )
+    for flag, text in switches:
+        cli.add_argument(flag, action='store_true', help=text)
+    return cli.parse_args()
+
+
+def detect_columns(fieldnames):
+    """Return the (first status, second status, first message) column names used by this CSV."""
+    known_layouts = (('status_rocm', 'status_cuda', 'message_rocm'),
+                     ('status_set1', 'status_set2', 'message_set1'))
+    for layout in known_layouts:
+        if layout[0] in fieldnames:  # the first status column identifies the layout
+            return layout
+    raise ValueError(f"Cannot detect status columns. Available: {fieldnames}")
+
+
+def main():
+    """Assign a skip_reason to rows SKIPPED/MISSED in the first status column but PASSED in the second."""
+    args = parse_args()
+
+    with open(args.input, newline='') as f:
+        reader = csv.DictReader(f)
+        # fieldnames is None for an empty file; detect_columns() then reports it as a ValueError
+        fieldnames = list(reader.fieldnames or [])
+        rows = list(reader)
+
+    col_rocm, col_cuda, col_msg = detect_columns(fieldnames)
+
+    for col in ('skip_reason', 'assignee', 'comments'):
+        if col not in fieldnames:
+            fieldnames.append(col)
+
+    classified_count = 0
+    already_had_count = 0
+    unclassified_count = 0
+    overwritten_count = 0
+    auto_reasons = Counter()
+    unclassified_msgs = Counter()
+    unclassified_files = Counter()
+    unclassified_details = []
+
+    tsv_entries = []
+
+    for row in rows:
+        status_rocm = row.get(col_rocm, '')
+        status_cuda = row.get(col_cuda, '')
+        existing_reason = (row.get('skip_reason') or '').strip()  # short rows yield None, not ''
+
+        needs_reason = (
+            status_rocm in ('SKIPPED', 'MISSED')
+            and status_cuda == 'PASSED'
+        )
+
+        if not needs_reason:
+            continue
+
+        raw_msg = row.get(col_msg) or ''
+        msg = extract_message(raw_msg)
+        test_file = row.get('test_file') or ''
+        test_class = row.get('test_class') or ''
+        test_name = row.get('test_name') or ''
+        workflow = row.get('work_flow_name') or ''
+
+        if existing_reason and not args.reclassify_all:
+            already_had_count += 1
+            tsv_entries.append({
+                'test_file': test_file,
+                'test_name': test_name,
+                'test_class': test_class,
+                'skip_reason': existing_reason,
+                'assignee': row.get('assignee', ' '),
+                'comments': row.get('comments', ' '),
+            })
+            continue
+
+        reason = classify_test(msg, test_file, test_class, test_name, workflow)
+
+        if reason:
+            if existing_reason and existing_reason != reason:
+                overwritten_count += 1
+            row['skip_reason'] = reason
+            row.setdefault('assignee', '')
+            row.setdefault('comments', 'auto-classified')
+            classified_count += 1
+            auto_reasons[reason] += 1
+            tsv_entries.append({
+                'test_file': test_file,
+                'test_name': test_name,
+                'test_class': test_class,
+                'skip_reason': reason,
+                'assignee': row.get('assignee', ' ') if not args.reclassify_all else ' ',
+                'comments': 'auto-classified',
+            })
+        else:
+            unclassified_count += 1
+            display_msg = msg[:100] if msg else '(no message)'
+            unclassified_msgs[display_msg] += 1
+            unclassified_files[test_file] += 1
+            unclassified_details.append(
+                f"  {test_file:55s} {test_class:45s} {test_name[:40]:42s} {display_msg[:50]}")
+
+    if args.report or args.dry_run:
+        total = already_had_count + classified_count + unclassified_count
+        print(f"\n{'='*60}", file=sys.stderr)
+        print(f"AUTO-CLASSIFICATION REPORT", file=sys.stderr)
+        print(f"{'='*60}", file=sys.stderr)
+        print(f"Already had skip_reason:  {already_had_count}", file=sys.stderr)
+        print(f"Auto-classified:          {classified_count}", file=sys.stderr)
+        if overwritten_count:
+            print(f"  (overwritten existing:  {overwritten_count})", file=sys.stderr)
+        print(f"Still unclassified:       {unclassified_count}", file=sys.stderr)
+        if total:
+            pct = (already_had_count + classified_count) / total * 100
+            print(f"Coverage:                 {pct:.1f}%", file=sys.stderr)
+        print(f"Total target tests:       {total}", file=sys.stderr)
+
+        if auto_reasons:
+            print(f"\nAuto-classified by category:", file=sys.stderr)
+            for reason, cnt in auto_reasons.most_common():
+                print(f"  {cnt:5d}  {reason}", file=sys.stderr)
+
+        if unclassified_msgs:
+            print(f"\nUnclassified — top messages:", file=sys.stderr)
+            for msg_key, cnt in unclassified_msgs.most_common(15):
+                print(f"  {cnt:5d}  {msg_key}", file=sys.stderr)
+
+        if unclassified_files:
+            print(f"\nUnclassified — top files:", file=sys.stderr)
+            for f, cnt in unclassified_files.most_common(15):
+                print(f"  {cnt:5d}  {f}", file=sys.stderr)
+
+        if unclassified_details and len(unclassified_details) <= 50:
+            print(f"\nUnclassified tests:", file=sys.stderr)
+            for d in unclassified_details:
+                print(d, file=sys.stderr)
+
+    if args.dry_run:
+        return
+
+    if not args.output:
+        print("No --output specified; use --dry-run for report-only mode.",
+              file=sys.stderr)
+        sys.exit(1)
+
+    with open(args.output, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
+        writer.writeheader()
+        for row in rows:
+            writer.writerow(row)
+
+    if args.tsv_out and tsv_entries:
+        with open(args.tsv_out, 'w', newline='') as f:
+            writer = csv.DictWriter(
+                f,
+                fieldnames=['test_file', 'test_name', 'test_class',
+                            'skip_reason', 'assignee', 'comments'],
+                delimiter='\t',
+            )
+            writer.writeheader()
+            for entry in tsv_entries:
+                writer.writerow(entry)
+        print(f"\nWrote TSV with {len(tsv_entries)} entries to {args.tsv_out}",
+              file=sys.stderr)
+
+    print(f"Wrote {len(rows)} rows to {args.output}", file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
new file mode 100755
index 0000000000000..6368590567f04
--- /dev/null
+++ b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
@@ -0,0 +1,922 @@
+#!/usr/bin/env python3
+
+
+try:
+    import sys  # first: the except branch calls sys.exit() even when a later import fails
+    import os
+    import json
+    import argparse
+    import re
+    import requests
+    from upload_stats_lib import unzip
+    from upload_test_stats import download_gha_artifacts, download_s3_artifacts
+except ImportError:  # a dependency is missing: install requirements.txt, then ask for a rerun
+    import subprocess
+    result = subprocess.run(["pip3", "install", "-U", "-r", "requirements.txt"], capture_output=True, text=True)
+    print(result.stdout)
+    print("Please rerun the download_testlogs script")
+    sys.exit(1)
+
+
+# Stop right away if any required credential variable is unset or empty
+required_env_vars = ['GITHUB_TOKEN', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY']
+
+missing_vars = [var for var in required_env_vars if not os.getenv(var)]
+if missing_vars:
+    print(f"ERROR: Please set these environment variables: {', '.join(missing_vars)}")
+    sys.exit(1)
+
+
+# Module-level state shared by the helper functions below
+error_msgs = []  # non-fatal download problems; the helpers append to this list
+# Workflow names mapped to TEST_CONFIG values in PyTorch CI
+# The ROCm mapping is filled in by main() according to the --arch argument
+ROCmWorkflowNames = {}
+CUDAWorkflowNames = {"default": "trunk",
+                    # Same as default, so not used for now
+                    # "distributed": "pull",
+                    "inductor": "inductor"}
+
+authentication_headers = None  # replaced in main() by the GitHub Authorization header dict
+
+def get_commit_hashes(pr_id, token):
+    """Fetch every commit object of pytorch/pytorch PR `pr_id`, page by page until a page is empty.
+
+    A non-200 response prints a message and returns whatever was collected so far."""
+    commits_url = f"https://api.github.com/repos/pytorch/pytorch/pulls/{pr_id}/commits"
+    request_headers = {
+        "Authorization": f"token {token}",
+        "Accept": "application/vnd.github.v3+json"
+    }
+    collected = []
+    page_no = 1
+    while True:
+        reply = requests.get(commits_url, headers=request_headers, params={'page': page_no})
+        if reply.status_code != 200:
+            print(f"Failed to fetch commits: {reply.status_code}")
+            break
+        batch = reply.json()
+        if not batch:
+            break
+        collected.extend(batch)
+        page_no += 1
+    return collected
+
+def get_latest_commit_sha(pr_id, token):
+    """Return the SHA of the last commit listed for the PR; exit with status 1 if none were fetched."""
+    pr_commits = get_commit_hashes(pr_id, token)
+    if not pr_commits:
+        print("No commits found for the given pull request.")
+        sys.exit(1)
+    return pr_commits[-1]['sha']
+
+def write_test_log_to_file(filename, test_key, jobs, sha):
+    """Save the raw log of the first job matching test_key; problems go to error_msgs (logs are non-fatal)."""
+    js = [j for j in jobs if test_key in j['name']]
+    if not js:
+        error_msg = f"Error: TEST KEY: {test_key} DOES NOT EXIST IN JOBS.\nCheck url - https://hud.pytorch.org/hud/pytorch/pytorch/{sha}/1?per_page=50 - for job name"
+        print(error_msg)
+        error_msgs.append(error_msg)
+        return
+    if len(js) > 1:
+        print(f"WARNING: Found multiple jobs with key: '{test_key}', selecting first one\n" + "\n".join(j['name'] for j in js))
+    print(f"key: {test_key}, job Name: {js[0]['name']}, job ID: {js[0]['id']}, Downloading to {filename}")
+    response = requests.get("https://ossci-raw-job-status.s3.amazonaws.com/log/" + str(js[0]['id']))
+    if response.status_code != 200:  # do not save an error page as if it were the test log
+        error_msg = f"Error: could not download log for TEST KEY: {test_key} (HTTP {response.status_code})"
+        print(error_msg)
+        error_msgs.append(error_msg)
+        return
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(response.text)
+
+def get_workflow_jobs(wf):
+    """Return the job list of workflow run `wf`, fetching every page from wf['jobs_url']."""
+    if wf is None:
+        raise Exception("wf is None!")
+    per_page = 100  # largest page size the GitHub API accepts
+    first_page = requests.get(wf['jobs_url'], headers=authentication_headers, params={'per_page': per_page}).json()
+    all_jobs = first_page["jobs"]
+    total = first_page['total_count']
+    if total > per_page:
+        import math
+        last_page = math.ceil(total / per_page)
+        for page_no in range(2, last_page + 1):
+            extra = requests.get(wf['jobs_url'], headers=authentication_headers, params={'per_page': per_page, 'page': page_no})
+            all_jobs += extra.json()["jobs"]
+    return all_jobs
+
+def get_job_ids_by_prefix(wf, prefix):
+    """Return the IDs, as strings, of the workflow run's jobs whose name contains `prefix`."""
+    matching = filter(lambda job: prefix in job['name'], get_workflow_jobs(wf))
+    return [str(job['id']) for job in matching]
+
+def download_logs(wf, test_log_list, test_folder):
+    """Write each [file name, job key] entry of test_log_list into test_folder as a raw job log."""
+    if wf is None:
+        raise Exception("wf is None!")
+
+    run_jobs = get_workflow_jobs(wf)
+    for entry in test_log_list:
+        destination = "/".join((test_folder, entry[0]))
+        write_test_log_to_file(destination, entry[1], run_jobs, wf['head_sha'])
+
+def download_gha_artifacts_filtered(workflow_run_id, workflow_run_attempt, prefixes=[], allowed_substrings=None):
+    """Download GHA test-report artifacts of a run that match prefixes and optional substring filters.
+
+    GHA artifact names include run attempt info, e.g.:
+      test-reports-runattempt1-test-default-3-6-linux.rocm.gpu.gfx942.1_68425162477.zip
+    while S3 prefixes look like test-reports-test-default-3-6, so the runattemptN-
+    portion is stripped before prefixes are matched. Matching artifacts are grouped per
+    shard and only the one with the highest run attempt number is downloaded for each.
+
+    workflow_run_attempt is accepted but not used by this function. allowed_substrings,
+    when given, must have at least one entry contained in the artifact name.
+    Returns the list of Paths of the zip files written to the current directory.
+    """
+    from pathlib import Path
+    from collections import defaultdict  # NOTE(review): not referenced in this function
+    artifact_paths = []
+    response = requests.get(
+        f"https://api.github.com/repos/pytorch/pytorch/actions/runs/{workflow_run_id}/artifacts?per_page=100",
+        headers=authentication_headers,
+    )
+    artifacts = response.json().get("artifacts", [])
+    while "next" in response.links:
+        response = requests.get(response.links["next"]["url"], headers=authentication_headers)
+        artifacts.extend(response.json().get("artifacts", []))
+
+    # Group matching artifacts by shard key, keeping the highest run attempt
+    # best_per_shard maps shard key -> (attempt number, artifact name, download URL)
+    best_per_shard = {}
+    for artifact in artifacts:
+        name = artifact["name"]
+        if not name.startswith("test-reports-"):
+            continue
+        if "rerun_disabled" in name:
+            continue
+        normalized = re.sub(r'runattempt\d+-', '', name)
+        if not any(normalized.startswith(pfx) for pfx in prefixes):
+            continue
+        if allowed_substrings and not any(sub in name for sub in allowed_substrings):
+            continue
+        # Extract run attempt number (0 when the name carries no runattemptN marker)
+        attempt_match = re.search(r'runattempt(\d+)', name)
+        attempt_num = int(attempt_match.group(1)) if attempt_match else 0
+        # Shard key: normalized name cut at the first "-<lowercase letters>." (e.g. test-reports-test-default-3-6)
+        shard_key = re.sub(r'-[a-z]+\..*$', '', normalized)
+        if shard_key not in best_per_shard or attempt_num > best_per_shard[shard_key][0]:
+            best_per_shard[shard_key] = (attempt_num, name, artifact["archive_download_url"])
+
+    for shard_key, (attempt_num, name, url) in best_per_shard.items():
+        print(f"Downloading GHA artifact: {name}")
+        dl_response = requests.get(url, headers=authentication_headers)
+        if dl_response.status_code != 200:
+            print(f"  WARNING: Failed to download (HTTP {dl_response.status_code})")
+            continue
+        p = Path(name if name.endswith(".zip") else name + ".zip")
+        with open(p, "wb") as f:
+            f.write(dl_response.content)
+        artifact_paths.append(p)
+
+    return artifact_paths
+
+def _shorten_unzipped_dirs():
+    """Give unzipped-* directories in the current directory short names (Windows MAX_PATH).
+
+    For example
+      unzipped-test-reports-runattempt1-test-default-1-6-linux.rocm.gpu.gfx942.1_68613413431.zip
+    becomes
+      test-default-1-6
+
+    The 'test-<config>-<n>-<m>' part is kept so summarize_xml_testreports.py can still
+    detect the workflow type by substring; an existing target is never overwritten.
+    """
+    from pathlib import Path
+    shard_pattern = re.compile(r'(test-\w+-\d+-\d+)')
+    for unzipped in sorted(Path(".").glob("unzipped-*")):
+        found = shard_pattern.search(unzipped.name) if unzipped.is_dir() else None
+        if found is None:
+            continue
+        short_name = found.group(1)
+        if Path(short_name).exists():
+            print(f"  WARNING: {short_name} already exists, keeping {unzipped.name}")
+            continue
+        unzipped.rename(short_name)
+        print(f"  Renamed {unzipped.name} -> {short_name}")
+
+def download_xml_files(workflow_run_id, workflow_run_attempts, prefixes=[], allowed_substrings=None):
+    """Download, unzip and tidy up the test-report artifacts of a run: S3 first, GHA as fallback."""
+    artifact_paths = []
+    for prefix in prefixes:
+        print("Trying to download S3 artifacts for workflow_run_attempt {} with prefix {}".format(workflow_run_attempts, prefix))
+        artifact_paths += download_s3_artifacts(
+            prefix,
+            workflow_run_id,
+            workflow_run_attempts,
+            allowed_substrings=allowed_substrings,
+        )
+
+    # Filter out rerun_disabled_tests artifacts (same prefix, different job)
+    before = len(artifact_paths)
+    artifact_paths = [p for p in artifact_paths if "rerun_disabled" not in p.name]
+    if before != len(artifact_paths):
+        print(f"  Filtered out {before - len(artifact_paths)} rerun_disabled artifacts")
+
+    # Fall back to GHA artifacts if S3 returned nothing
+    if len(artifact_paths) == 0:
+        print(f"No S3 artifacts found, trying GHA artifacts as fallback...")
+        artifact_paths = download_gha_artifacts_filtered(
+            workflow_run_id,
+            workflow_run_attempts,
+            prefixes=prefixes,
+            allowed_substrings=allowed_substrings,
+        )
+
+    if len(artifact_paths) == 0:
+        error_msg = f"WARNING: workflow run id: {workflow_run_id} - no artifacts found (S3 or GHA) for prefixes: {prefixes}"
+        print(error_msg)
+        error_msgs.append(error_msg)
+        return
+
+    for path in artifact_paths:
+        unzip(path)
+
+    _shorten_unzipped_dirs()
+
+    # Delete raw zip files now that contents are extracted
+    for path in artifact_paths:
+        try:
+            path.unlink()
+            print(f"  Deleted {path}")
+        except Exception as exc:
+            print(f"  WARNING: could not delete {path}: {exc}")  # best effort: report and keep going
+
+def download_artifacts(wf, prefixes=[], test_folder=".", allowed_substrings=None):
+    """Download and unpack the XML artifacts of run `wf` from inside test_folder.
+
+    The working directory is changed to test_folder for the download and then moved
+    up exactly one level ("..") afterwards.
+    """
+    os.chdir(test_folder)
+    run_attempt = wf.get('run_attempt', 1)  # attempt 1 when the run record has no such key
+    download_xml_files(wf['id'], run_attempt, prefixes, allowed_substrings=allowed_substrings)
+    os.chdir("..")
+# for older runs, add 'created':'<=YYYY-MM-DD'. see https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax#query-for-dates
+def download_workflow_run(created=None, max_pages=10, workflow=None, sha=None, ignore_status=False, status='success', error_msg='Error downloading workflow runs'):
+    """Return the first listed run of `{workflow}.yml` in pytorch/pytorch that matches the filters.
+    Args:
+        created: optional GitHub date query (e.g. '<=YYYY-MM-DD') on the run's creation date.
+        max_pages: upper bound on the number of result pages (30 runs each) to scan.
+        workflow: workflow file name without the .yml suffix; required.
+        sha: head commit to look for; when omitted, runs on branch "main" are listed.
+        ignore_status: when true, do not filter runs by status.
+        status: run status to filter on unless ignore_status is set.
+        error_msg: text of the exception raised when no run is found.
+    Raises:
+        Exception: if workflow is missing, the reply is not a run listing, or nothing matches.
+    """
+    if not workflow:
+        raise Exception("Workflow must be specified")
+    runs_url = "https://api.github.com/repos/pytorch/pytorch/actions/workflows/{}.yml/runs".format(workflow)
+    for page in range(1, max_pages + 1):  # GitHub result pages are 1-based
+        params = {'per_page': 30, 'page': page}
+        if status and not ignore_status:
+            params['status'] = status
+        if created:
+            params['created'] = created
+        if sha:
+            params['head_sha'] = sha
+        else:
+            params['branch'] = "main"
+        print(".")
+        response = requests.get(runs_url, headers=authentication_headers, params=params)
+        try:
+            workflow_runs = response.json()['workflow_runs']
+        except (ValueError, KeyError, TypeError):
+            raise Exception(response.text)
+        for wf in workflow_runs:  # the URL already scopes to one workflow file; only the SHA needs checking
+            if not sha or wf["head_sha"] == sha:
+                return wf
+        if not workflow_runs:
+            break  # an empty page means there is nothing further to scan
+    raise Exception(error_msg)
+
+def create_test_folder(wf):
+    """Create '<date>_<head_sha>' plus its cuda_xml and rocm_xml sub-folders; return the three paths.
+
+    <date> is wf['created_at'] with ':' and '-' removed and the 'T...Z' part dropped,
+    e.g. '2025-01-02T03:04:05Z' gives '20250102'.
+    Directories that already exist are left untouched.
+    """
+    if wf is None:
+        raise Exception("wf is None!")
+    date_part = re.sub('T.*Z', '', wf['created_at'].replace(":", "").replace("-", ""))
+    test_folder = date_part + "_" + wf['head_sha']
+    folders = [test_folder, test_folder + "/cuda_xml", test_folder + "/rocm_xml"]
+    for folder in folders:
+        if not os.path.exists(folder):
+            os.mkdir(folder)
+    return folders
+
+_first_folder = None
+
+def get_or_create_test_folder(wf):
+    """Return [test_folder, cuda_xml, rocm_xml], always rooted at the first folder ever created.
+
+    Workflows for the same SHA can carry different created_at dates (e.g. around
+    midnight), and create_test_folder derives the folder name from that date. To keep
+    all artifacts together, only the first call uses create_test_folder(wf); every
+    later call reuses that first folder name.
+    """
+    global _first_folder
+    if _first_folder is None:
+        # First call: let create_test_folder pick the name and remember it.
+        created = create_test_folder(wf)
+        _first_folder = created[0]
+        return created
+    # Later calls: wf is not consulted; make sure both sub-folders still exist.
+    reused = [_first_folder, _first_folder + "/cuda_xml", _first_folder + "/rocm_xml"]
+    for sub_folder in reused[1:]:
+        os.makedirs(sub_folder, exist_ok=True)
+    return reused
+
+def parse_args():  # Define and parse the command-line options; returns the argparse namespace
+    parser = argparse.ArgumentParser(description='Download pytorch unit test logs')
+    parser.add_argument('--created', const=None, help='eg., \'<=YYYY-MM-DD\'. See https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax#query-for-dates')
+    parser.add_argument('--max_pages', type=int, default=10, help='eg., 100')  # main() raises values below 1 to 1
+    parser.add_argument('--sha1', const=None, help='eg., 3dcd67a1b374faea01f4d2e17beb6bb1fff76d76')
+    parser.add_argument('--exclude_distributed', action='store_true')
+    parser.add_argument('--exclude_inductor', action='store_true')
+    parser.add_argument('--exclude_default', action='store_true')
+    parser.add_argument('--ignore_status', action='store_true')
+    parser.add_argument('--artifacts_only', action='store_true')
+    parser.add_argument('--no_rocm', action='store_true')
+    parser.add_argument('--no_cuda', action='store_true')
+    parser.add_argument('--pr_id', type=int, help='The pull request ID')  # main() rejects --pr_id together with --sha1
+    parser.add_argument('--arch', type=str, choices=['mi200', 'mi300', 'mi355', 'navi31', 'nightly'], default='mi355', help='ROCm GPU architecture (mi200, mi300, mi355, navi31, or nightly, default: mi355)')
+    parser.add_argument('--include_inductor_periodic', action='store_true', help='Also download inductor-periodic benchmark artifacts (into a separate directory, not included in parity CSV)')
+    parser.add_argument('--baseline_sha', type=str, help='Baseline commit SHA to compare against. Downloads the same ROCm workflows for this commit into baseline_xml/.')  # main() sets no_cuda when this is given
+    return parser.parse_args()
+
+# Rate-limit issues
+# Authenticated users get 5000 requests/hour
+# Check rate-limit without penalty: curl -H "Authorization: token $GITHUB_TOKEN" -I https://api.github.com/users/octocat
+
+def main():
+    global args
+    args = parse_args()
+    if args.max_pages < 1:
+        args.max_pages=1
+
+    # Set ROCm workflow names based on architecture
+    global ROCmWorkflowNames
+    arch = args.arch  # 'mi200', 'mi300', 'mi355', 'navi31', or 'nightly'
+    if arch == 'nightly':
+        ROCmWorkflowNames = {
+            "default": "rocm-nightly",
+            "distributed": "rocm-nightly",
+            "inductor": "rocm-nightly",
+        }
+    elif arch == 'mi355':
+        ROCmWorkflowNames = {
+            "default": "trunk",
+            "distributed": "trunk",
+            "inductor": "inductor-rocm-mi355"
+        }
+    elif arch == 'mi200':
+        ROCmWorkflowNames = {
+            "default": "trunk-rocm-sandbox",
+            "distributed": "trunk-rocm-sandbox",
+            "inductor": "trunk-rocm-sandbox"
+        }
+    else:
+        # MI300 and navi31 use dedicated ROCm workflows
+        ROCmWorkflowNames = {
+            "default": f"rocm-{arch}",
+            "distributed": f"periodic-rocm-{arch}",
+            "inductor": f"inductor-rocm-{arch}"
+        }
+    # Job key prefix for log downloads - architecture specific
+    # MI200 uses older jammy/py3.10 config, MI300 uses noble/py3.12
+    # Inductor jobs have a different naming format
+    rocm_job_prefixes = {
+        "nightly": {
+            "default": "linux-noble-rocm-nightly-py3.12-gfx942",
+            "distributed": "linux-noble-rocm-nightly-py3.12-gfx942",
+            "inductor": "linux-noble-rocm-nightly-py3.12-gfx942",
+        },
+        "mi200": {
+            "default": "linux-jammy-rocm-py3.10",
+            "distributed": "linux-jammy-rocm-py3.10",
+            "inductor": "linux-jammy-rocm-py3.10"
+        },
+        "mi300": {
+            "default": "linux-noble-rocm-py3.12-mi300",
+            "distributed": "linux-noble-rocm-py3.12-mi300",
+            "inductor": "linux-noble-rocm-py3.12-mi300"
+        },
+        "mi355": {
+            "default": "linux-jammy-rocm-py3.10-mi355",
+            "distributed": "linux-jammy-rocm-py3.10-mi355",
+            "inductor": "linux-noble-rocm-py3.12-mi355"
+        },
+        "navi31": {
+            "default": "linux-jammy-rocm-py3.10-navi31",
+            "distributed": "linux-jammy-rocm-py3.10-navi31",
+            "inductor": "linux-jammy-rocm-py3.10-navi31"
+        }
+    }
+    # Architecture-specific shard counts
+    rocm_shard_counts = {
+        "nightly": {"default": 6, "distributed": 3, "inductor": 2},
+        "mi200": {"default": 6, "distributed": 3, "inductor": 2},
+        "mi300": {"default": 6, "distributed": 3, "inductor": 2},
+        "mi355": {"default": 6, "distributed": 3, "inductor": 2},
+        "navi31": {"default": 2, "distributed": 3, "inductor": 2},
+    }
+    rocm_job_prefix = rocm_job_prefixes[arch]
+    rocm_shards = rocm_shard_counts[arch]
+    rocm_artifact_substrings = ["rocm.gpu"] if arch in ("mi355", "nightly") else None
+    # navi31 only has default tests (no distributed/inductor workflows)
+    if arch in ("navi31",):
+        if not args.exclude_distributed:
+            print(f"NOTE: {arch} has no distributed workflow, auto-excluding distributed")
+            args.exclude_distributed = True
+        if not args.exclude_inductor:
+            print(f"NOTE: {arch} has no inductor workflow, auto-excluding inductor")
+            args.exclude_inductor = True
+    if args.baseline_sha and not args.no_cuda:
+        print("NOTE: baseline_sha provided, auto-skipping CUDA (commit-vs-commit comparison)")
+        args.no_cuda = True
+
+    print(f"Using ROCm architecture: {arch}")
+    print(f"Using ROCm job prefixes: {rocm_job_prefix}")
+    print(f"Using ROCm shard counts: {rocm_shards}")
+
+    token = os.getenv('GITHUB_TOKEN', '...')
+    global authentication_headers
+    authentication_headers = {'Authorization': f'token {token}'}
+    if args.pr_id and args.sha1:
+        error_msg = "Error: Please provide either pr_id or sha!"
+        print(error_msg)
+        sys.exit(1)
+    if args.pr_id:
+        pr_id = args.pr_id
+        sha = get_latest_commit_sha(pr_id, token)        
+    else:
+        sha = args.sha1
+        pr_id = None
+    status = "success"
+    print(sha)
+
+    # When comparing two commits, prefix log filenames with short SHAs
+    if args.baseline_sha:
+        current_prefix = sha[:8] + "_"
+        baseline_prefix = args.baseline_sha[:8] + "_"
+    else:
+        current_prefix = ""
+        baseline_prefix = "baseline_"
+
+    if not args.exclude_distributed and not args.no_rocm:
+        periodic_sha = sha
+        print("==============================================")
+        print(f"Finding ROCm tests in periodic workflow by sha: {sha}")
+        print("==============================================")
+        # find distributed test in periodic workflow with success status
+        error_msg="Error: Periodic workflow not found in scanned workflow runs."
+        #https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository
+        periodic_fallback_used = False
+        try:
+            periodic_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["distributed"], sha=periodic_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
+        except (IndexError, Exception):
+            periodic_wf = None
+        periodic_fallbacks = {
+            "mi355": ("periodic-rocm-mi355", "linux-noble-rocm-py3.12-mi355"),
+            "mi200": ("periodic-rocm-mi200", "linux-jammy-rocm-py3.10"),
+        }
+        if periodic_wf is None and arch in periodic_fallbacks:
+            fallback_wf, fallback_prefix = periodic_fallbacks[arch]
+            print(f"Distributed not found in {ROCmWorkflowNames['distributed']}, falling back to {fallback_wf}")
+            periodic_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=fallback_wf, sha=periodic_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
+            periodic_fallback_used = True
+        if periodic_wf is None:
+            raise Exception(error_msg)
+        print(f"Using workflow with id:{periodic_wf['id']} as periodic_wf")
+
+        if periodic_fallback_used and arch in periodic_fallbacks:
+            dist_job_prefix = periodic_fallbacks[arch][1]
+        else:
+            dist_job_prefix = rocm_job_prefix['distributed']
+
+        folder_list = get_or_create_test_folder(periodic_wf)
+
+        # Download logs
+        # If the ROCm distributed logs aren't found you might want to check the HUD for the correct tags
+        # HUD link: https://hud.pytorch.org/hud/pytorch/pytorch/main/1?per_page=50&name_filter=rocm
+        # Make sure "Hide unstable jobs" is unselected, in case ROCm jobs are marked as unstable
+
+        if not args.artifacts_only:
+            dist_shards = rocm_shards["distributed"]
+            test_log_list_rocm_distributed = [
+                [f"{current_prefix}rocm_dist{i}.txt", f"{dist_job_prefix} / test (distributed, {i}, {dist_shards}"]
+                for i in range(1, dist_shards + 1)
+            ]
+            download_logs(periodic_wf, test_log_list_rocm_distributed, folder_list[0])
+
+        # Download artifacts
+        dist_shards = rocm_shards["distributed"]
+        test_artifacts_list_rocm_distributed = [
+            f"test-reports-test-distributed-{i}-{dist_shards}"
+            for i in range(1, dist_shards + 1)
+        ]
+        download_artifacts(
+            periodic_wf,
+            test_artifacts_list_rocm_distributed,
+            folder_list[2],
+            allowed_substrings=rocm_artifact_substrings,
+        )
+        os.chdir("..")
+
+    # Download ROCm default rocm_wf when ROCm is enabled
+    if not args.no_rocm and not args.exclude_default:
+        rocm_sha = sha
+        print("===========================================")
+        print(f"Finding ROCm tests in rocm workflow by sha: {rocm_sha}")
+        print("===========================================")
+        # find tests in rocm workflow with given sha and success status
+        #https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository
+        error_msg="Error: rocm workflow not found in scanned workflow runs. Try increasing max_pages."
+        default_fallback_used = False
+        try:
+            rocm_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["default"], sha=rocm_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
+        except (IndexError, Exception):
+            rocm_wf = None
+        default_fallbacks = {
+            "mi355": ("rocm-mi355", "linux-noble-rocm-py3.12-mi355"),
+        }
+        if rocm_wf is None and arch in default_fallbacks:
+            fallback_wf, fallback_prefix = default_fallbacks[arch]
+            print(f"Default not found in {ROCmWorkflowNames['default']}, falling back to {fallback_wf}")
+            rocm_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=fallback_wf, sha=rocm_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
+            default_fallback_used = True
+            rocm_job_prefix['default'] = fallback_prefix
+        if rocm_wf is None:
+            raise Exception(error_msg)
+        print(f"Using workflow with id:{rocm_wf['id']} as rocm_wf{' (fallback)' if default_fallback_used else ''}")
+
+        folder_list = get_or_create_test_folder(rocm_wf)
+
+        # Download logs
+        # If logs aren't found you might want to check the HUD for the correct tags
+        # HUD link: https://hud.pytorch.org/hud/pytorch/pytorch/main/1?per_page=50&name_filter=rocm
+        if not args.artifacts_only:
+            default_shards = rocm_shards["default"]
+            test_log_list_rocm_default = [
+              [f"{current_prefix}rocm{i}.txt", f"{rocm_job_prefix['default']} / test (default, {i}, {default_shards}"]
+              for i in range(1, default_shards + 1)
+            ]
+            download_logs(rocm_wf, test_log_list_rocm_default, folder_list[0])
+
+        # Download artifacts
+        default_shards = rocm_shards["default"]
+        test_artifacts_list_rocm_default = [
+          f"test-reports-test-default-{i}-{default_shards}"
+          for i in range(1, default_shards + 1)
+        ]
+        if not args.exclude_default:
+            download_artifacts(
+                rocm_wf,
+                test_artifacts_list_rocm_default,
+                test_folder=folder_list[2],
+                allowed_substrings=rocm_artifact_substrings,
+            )
+        os.chdir("..")
+
+    # add new inductor workflow downloading for ROCm
+    if not args.no_rocm and not args.exclude_inductor:
+        inductor_rocm_sha = sha
+        # find tests in inductor workflow with given sha and success status
+        #https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository
+        print("===========================================")
+        print(f"Finding ROCm tests in inductor-rocm workflow by sha: {inductor_rocm_sha}")
+        print("===========================================")
+        error_msg="Error: inductor workflow not found in scanned workflow runs. Try increasing max_pages."
+        inductor_wf_rocm = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["inductor"], sha=inductor_rocm_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
+        print(f"Using workflow with id:{inductor_wf_rocm['id']} as inductor_wf_rocm")
+
+        folder_list = get_or_create_test_folder(inductor_wf_rocm)
+
+        # Download logs
+        if not args.artifacts_only:
+          inductor_shards = rocm_shards["inductor"]
+          test_log_list_rocm_inductor = [
+            [f"{current_prefix}rocm_inductor{i}.txt", f"{rocm_job_prefix['inductor']} / test (inductor, {i}, {inductor_shards}"]
+            for i in range(1, inductor_shards + 1)
+          ]
+          download_logs(inductor_wf_rocm, test_log_list_rocm_inductor, folder_list[0])
+
+        #Download artifacts
+        inductor_shards = rocm_shards["inductor"]
+        test_artifacts_list_rocm_inductor = [
+          f"test-reports-test-inductor-{i}-{inductor_shards}"
+          for i in range(1, inductor_shards + 1)
+        ]
+        download_artifacts(
+            inductor_wf_rocm,
+            test_artifacts_list_rocm_inductor,
+            test_folder=folder_list[2],
+            allowed_substrings=rocm_artifact_substrings,
+        )
+        os.chdir("..")
+
+    if not args.no_cuda:
+        cuda_job_prefix = "linux-jammy-cuda13.0-py3.10-gcc11"
+        pull_sha = sha
+        print("==========================================")
+        print(f"Finding CUDA tests in pull workflow by sha: {pull_sha}")
+        print("==========================================")
+        # find tests in pull workflow with given sha and success status
+        #https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository
+        error_msg="Error: Pull workflow not found in scanned workflow runs. Try increasing max_pages."
+        pull_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=CUDAWorkflowNames["default"], sha=pull_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
+        print(f"Using workflow with id:{pull_wf['id']} as pull_wf")
+
+        # Get job IDs for the target CUDA version to filter S3 artifacts
+        cuda_job_ids = get_job_ids_by_prefix(pull_wf, cuda_job_prefix)
+        cuda_artifact_substrings = [f"_{jid}" for jid in cuda_job_ids] if cuda_job_ids else ["nvidia.gpu"]
+        print(f"Using CUDA job prefix: {cuda_job_prefix}")
+        print(f"Found {len(cuda_job_ids)} CUDA jobs matching prefix")
+
+        folder_list = get_or_create_test_folder(pull_wf)
+
+        # Download logs
+        # If the cuda logs aren't found you might want to check the HUD for the correct tags
+        # Link to HUD: https://hud.pytorch.org/hud/pytorch/pytorch/main/1?per_page=50&name_filter=cuda
+        if not args.artifacts_only:
+            test_log_list_cuda_default = [
+              ["cuda1.txt", f"{cuda_job_prefix} / test (default, 1, 5"],
+              ["cuda2.txt", f"{cuda_job_prefix} / test (default, 2, 5"],
+              ["cuda3.txt", f"{cuda_job_prefix} / test (default, 3, 5"],
+              ["cuda4.txt", f"{cuda_job_prefix} / test (default, 4, 5"],
+              ["cuda5.txt", f"{cuda_job_prefix} / test (default, 5, 5"],
+            ]
+            test_log_list_cuda = test_log_list_cuda_default
+            if not args.exclude_distributed:
+                test_log_list_cuda_distributed = [
+                  ["cuda_dist1.txt", f"{cuda_job_prefix} / test (distributed, 1, 3"],
+                  ["cuda_dist2.txt", f"{cuda_job_prefix} / test (distributed, 2, 3"],
+                  ["cuda_dist3.txt", f"{cuda_job_prefix} / test (distributed, 3, 3"],
+                ]
+                test_log_list_cuda += test_log_list_cuda_distributed
+
+            download_logs(pull_wf, test_log_list_cuda, folder_list[0])
+
+        # Download artifacts
+        test_artifacts_list_cuda_default = [
+          "test-reports-test-default-1-5",
+          "test-reports-test-default-2-5",
+          "test-reports-test-default-3-5",
+          "test-reports-test-default-4-5",
+          "test-reports-test-default-5-5",
+        ]
+
+        test_artifacts_list_cuda = []
+        if not args.exclude_default:
+            test_artifacts_list_cuda += test_artifacts_list_cuda_default
+
+        if not args.exclude_distributed:
+            test_artifacts_list_cuda_distributed = [
+              "test-reports-test-distributed-1-3",
+              "test-reports-test-distributed-2-3",
+              "test-reports-test-distributed-3-3",
+            ]
+            test_artifacts_list_cuda += test_artifacts_list_cuda_distributed
+
+        if test_artifacts_list_cuda:
+            download_artifacts(
+                pull_wf,
+                test_artifacts_list_cuda,
+                test_folder=folder_list[1],
+                allowed_substrings=cuda_artifact_substrings,
+            )
+        os.chdir("..")
+
+        # add new inductor workflow downloading for CUDA
+        if not args.exclude_inductor:
+            inductor_sha = sha
+            print("==========================================")
+            print(f"Finding CUDA tests in inductor workflow by sha: {inductor_sha}")
+            print("==========================================")
+            # find tests in inductor workflow with given sha and success status
+            #https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository
+            error_msg="Error: inductor workflow not found in scanned workflow runs. Try increasing max_pages."
+            inductor_wf_cuda = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=CUDAWorkflowNames["inductor"], sha=inductor_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
+            print(f"Using workflow with id:{inductor_wf_cuda['id']} as inductor_wf_cuda")
+
+            folder_list = get_or_create_test_folder(inductor_wf_cuda)
+
+            # Download logs
+            if not args.artifacts_only:
+              test_log_list_cuda_inductor = [
+                ["cuda_inductor1.txt", "unit-test / inductor-test / test (inductor, 1, 2"],
+                ["cuda_inductor2.txt", "unit-test / inductor-test / test (inductor, 2, 2"],
+              ]
+              download_logs(inductor_wf_cuda, test_log_list_cuda_inductor, folder_list[0])
+
+            test_artifacts_list_cuda_inductor = [
+              "test-reports-test-inductor-1-2",
+              "test-reports-test-inductor-2-2"
+            ]
+            # Inductor workflow is separate, use nvidia.gpu filter (no duplicate CUDA versions)
+            download_artifacts(
+                inductor_wf_cuda,
+                test_artifacts_list_cuda_inductor,
+                test_folder=folder_list[1],
+                allowed_substrings=["nvidia.gpu"],
+            )
+            os.chdir("..")
+
+    # Download baseline commit artifacts for commit-vs-commit comparison
+    if args.baseline_sha and not args.no_rocm:
+        baseline_sha = args.baseline_sha
+        print("==============================================")
+        print(f"Downloading BASELINE ROCm artifacts for sha: {baseline_sha}")
+        print("==============================================")
+
+        import glob
+        existing_folders = sorted(glob.glob("[0-9]*_[0-9a-f]*"), key=os.path.getmtime, reverse=True)
+        if existing_folders:
+            test_folder = existing_folders[0]
+        else:
+            raise Exception("No output folder found from primary downloads")
+
+        baseline_xml_dir = os.path.join(test_folder, "baseline_xml")
+        os.makedirs(baseline_xml_dir, exist_ok=True)
+
+        if not args.exclude_default:
+            try:
+                baseline_default_wf = download_workflow_run(
+                    created=args.created, max_pages=args.max_pages,
+                    workflow=ROCmWorkflowNames["default"], sha=baseline_sha,
+                    ignore_status=args.ignore_status, status=status,
+                    error_msg=f"Baseline default workflow not found for {baseline_sha}",
+                )
+                print(f"Baseline default workflow id: {baseline_default_wf['id']}")
+                default_shards = rocm_shards["default"]
+
+                if not args.artifacts_only:
+                    baseline_default_logs = [
+                        [f"{baseline_prefix}rocm{i}.txt", f"{rocm_job_prefix['default']} / test (default, {i}, {default_shards}"]
+                        for i in range(1, default_shards + 1)
+                    ]
+                    download_logs(baseline_default_wf, baseline_default_logs, test_folder)
+
+                baseline_default_prefixes = [
+                    f"test-reports-test-default-{i}-{default_shards}"
+                    for i in range(1, default_shards + 1)
+                ]
+                download_artifacts(
+                    baseline_default_wf,
+                    baseline_default_prefixes,
+                    test_folder=baseline_xml_dir,
+                    allowed_substrings=rocm_artifact_substrings,
+                )
+                os.chdir("..")
+            except Exception as e:
+                print(f"WARNING: Could not download baseline default artifacts: {e}")
+
+        if not args.exclude_distributed and "distributed" in ROCmWorkflowNames:
+            try:
+                baseline_dist_wf = download_workflow_run(
+                    created=args.created, max_pages=args.max_pages,
+                    workflow=ROCmWorkflowNames["distributed"], sha=baseline_sha,
+                    ignore_status=args.ignore_status, status=status,
+                    error_msg=f"Baseline distributed workflow not found for {baseline_sha}",
+                )
+                print(f"Baseline distributed workflow id: {baseline_dist_wf['id']}")
+                dist_shards = rocm_shards["distributed"]
+
+                if not args.artifacts_only:
+                    baseline_dist_logs = [
+                        [f"{baseline_prefix}rocm_dist{i}.txt", f"{rocm_job_prefix['distributed']} / test (distributed, {i}, {dist_shards}"]
+                        for i in range(1, dist_shards + 1)
+                    ]
+                    download_logs(baseline_dist_wf, baseline_dist_logs, test_folder)
+
+                baseline_dist_prefixes = [
+                    f"test-reports-test-distributed-{i}-{dist_shards}"
+                    for i in range(1, dist_shards + 1)
+                ]
+                download_artifacts(
+                    baseline_dist_wf,
+                    baseline_dist_prefixes,
+                    test_folder=baseline_xml_dir,
+                    allowed_substrings=rocm_artifact_substrings,
+                )
+                os.chdir("..")
+            except Exception as e:
+                print(f"WARNING: Could not download baseline distributed artifacts: {e}")
+
+        if not args.exclude_inductor and "inductor" in ROCmWorkflowNames:
+            try:
+                baseline_inductor_wf = download_workflow_run(
+                    created=args.created, max_pages=args.max_pages,
+                    workflow=ROCmWorkflowNames["inductor"], sha=baseline_sha,
+                    ignore_status=args.ignore_status, status=status,
+                    error_msg=f"Baseline inductor workflow not found for {baseline_sha}",
+                )
+                print(f"Baseline inductor workflow id: {baseline_inductor_wf['id']}")
+                inductor_shards = rocm_shards["inductor"]
+
+                if not args.artifacts_only:
+                    baseline_inductor_logs = [
+                        [f"{baseline_prefix}rocm_inductor{i}.txt", f"{rocm_job_prefix['inductor']} / test (inductor, {i}, {inductor_shards}"]
+                        for i in range(1, inductor_shards + 1)
+                    ]
+                    download_logs(baseline_inductor_wf, baseline_inductor_logs, test_folder)
+
+                baseline_inductor_prefixes = [
+                    f"test-reports-test-inductor-{i}-{inductor_shards}"
+                    for i in range(1, inductor_shards + 1)
+                ]
+                download_artifacts(
+                    baseline_inductor_wf,
+                    baseline_inductor_prefixes,
+                    test_folder=baseline_xml_dir,
+                    allowed_substrings=rocm_artifact_substrings,
+                )
+                os.chdir("..")
+            except Exception as e:
+                print(f"WARNING: Could not download baseline inductor artifacts: {e}")
+
+        print(f"Baseline artifacts saved to: {baseline_xml_dir}")
+
+    # Download inductor-periodic benchmark artifacts (separate from parity CSV)
+    if args.include_inductor_periodic:
+        print("==============================================")
+        print(f"Finding inductor-periodic workflow by sha: {sha}")
+        print("==============================================")
+        error_msg = "Error: inductor-periodic workflow not found for this SHA. It may not have run on this commit."
+        try:
+            inductor_periodic_wf = download_workflow_run(
+                created=args.created, max_pages=args.max_pages,
+                workflow="inductor-periodic", sha=sha,
+                ignore_status=args.ignore_status, status=status,
+                error_msg=error_msg,
+            )
+        except (IndexError, Exception) as e:
+            print(f"WARNING: {e}")
+            inductor_periodic_wf = None
+
+        if inductor_periodic_wf:
+            print(f"Using workflow with id:{inductor_periodic_wf['id']} as inductor_periodic_wf")
+
+            folder_list = get_or_create_test_folder(inductor_periodic_wf)
+            test_folder = folder_list[0]
+
+            rocm_periodic_dir = os.path.join(test_folder, "inductor_periodic_rocm_dir")
+            cuda_periodic_dir = os.path.join(test_folder, "inductor_periodic_cuda_dir")
+            os.makedirs(rocm_periodic_dir, exist_ok=True)
+            os.makedirs(cuda_periodic_dir, exist_ok=True)
+
+            if not args.no_rocm:
+                print("Downloading inductor-periodic ROCm artifacts...")
+                download_artifacts(
+                    inductor_periodic_wf,
+                    ["test-reports-"],
+                    test_folder=rocm_periodic_dir,
+                    allowed_substrings=["rocm.gpu"],
+                )
+                os.chdir("..")
+
+            if not args.no_cuda:
+                print("Downloading inductor-periodic CUDA artifacts...")
+                cuda_periodic_job_ids = get_job_ids_by_prefix(inductor_periodic_wf, "linux.g5")
+                cuda_periodic_substrings = (
+                    [f"_{jid}" for jid in cuda_periodic_job_ids]
+                    if cuda_periodic_job_ids
+                    else ["nvidia.gpu"]
+                )
+                download_artifacts(
+                    inductor_periodic_wf,
+                    ["test-reports-"],
+                    test_folder=cuda_periodic_dir,
+                    allowed_substrings=cuda_periodic_substrings,
+                )
+                os.chdir("..")
+
+            print(f"Inductor-periodic artifacts saved to:")
+            print(f"  ROCm: {rocm_periodic_dir}")
+            print(f"  CUDA: {cuda_periodic_dir}")
+        else:
+            print("Skipping inductor-periodic download (workflow run not found)")
+
+    return
+
+if __name__ == "__main__":
+    main()
+    if error_msgs:  # non-empty error_msgs: print each entry, then exit with status 1
+        for msg in error_msgs:
+            print(msg)
+        raise SystemExit(1)  # exit() is injected by the site module and is absent under `python -S`
diff --git a/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
new file mode 100644
index 0000000000000..db1773317a91a
--- /dev/null
+++ b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
@@ -0,0 +1,427 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import sys
+
+
+# Workflow identifiers, matched against the 'work_flow_name' CSV column, in report order.
+WORKFLOWS = ['default', 'distributed', 'inductor']
+# Section title shown for each workflow: 'TEST ' followed by the upper-cased identifier.
+WORKFLOW_DISPLAY = {
+    wf: f'TEST {wf.upper()}' for wf in WORKFLOWS
+}
+
+
+def parse_args():
+    """Parse the command-line options of the parity summary generator.
+
+    Returns the argparse.Namespace holding csv, arch, sha, pr_id, set1_name,
+    set2_name and output.
+    """
+    parser = argparse.ArgumentParser(
+        description='Generate a parity summary from per-architecture test status CSVs'
+    )
+    # (flag, add_argument keyword arguments), registered in this order.
+    option_specs = [
+        ('--csv', dict(nargs='+', required=True,
+                       help='CSV file(s) to summarize (one per architecture, same order as --arch)')),
+        ('--arch', dict(nargs='+', required=True,
+                        help='Architecture labels matching --csv order (e.g. mi200 mi300 mi355)')),
+        ('--sha', dict(type=str, default='', help='Commit SHA')),
+        ('--pr_id', dict(type=str, default='', help='Pull request ID')),
+        ('--set1_name', dict(type=str, default='set1',
+                             help='Name used for set1 in CSV column headers (default: set1)')),
+        ('--set2_name', dict(type=str, default='set2',
+                             help='Name used for set2 in CSV column headers (default: set2)')),
+        ('--output', dict(type=str, default='parity_summary',
+                          help='Output path prefix (produces .csv and .md)')),
+    ]
+    for flag, options in option_specs:
+        parser.add_argument(flag, **options)
+    return parser.parse_args()
+
+
+def load_csv(filepath):  # returns the data rows of filepath as a list of dicts keyed by the header row
+    with open(filepath, newline='') as f:  # newline='' leaves line-ending handling to the csv module
+        return list(csv.DictReader(f, restval=''))  # restval='' fills cells missing from short rows (default is None)
+
+
+def detect_columns(headers, set1_name, set2_name):
+    """Return the (set1 status, set2 status, set1 time, set2 time) column names.
+
+    Falls back to the generic set1/set2 names when 'status_{set1_name}' is not in headers.
+    """
+    if f'status_{set1_name}' not in headers:
+        set1_name, set2_name = 'set1', 'set2'
+    return (
+        f'status_{set1_name}', f'status_{set2_name}',
+        f'running_time_{set1_name}', f'running_time_{set2_name}',
+    )
+
+
+def workflow_stats_keys(s1_name, s2_name, has_set2=True):
+    """List the per-workflow metric labels in the order they are reported.
+
+    With has_set2 the ten comparison labels are returned, otherwise the five
+    single-set labels. Some labels use the upper-cased set names.
+    """
+    upper1, upper2 = s1_name.upper(), s2_name.upper()
+    if has_set2:
+        return [
+            f'SKIPPED (on {s1_name}, but not on {s2_name})',
+            f'SKIPPED (on {s1_name})',
+            f'SKIPPED (on {s2_name})',
+            f'MISSED (MISSED on {s1_name}, NOT SKIPPED on {s2_name})',
+            f'{upper1}ONLY (PASSED on {upper1}, NOT PASSED on {upper2})',
+            upper2,
+            upper1,
+            'SKIPPED + MISSED',
+            f'{upper2} - (SKIPPED + MISSED)',
+            f'DISAGREE [(SKIPPED+MISSED)/{upper2}] %',
+        ]
+    # Single-set report: one label per status, then the total.
+    single = [f'{status} ({s1_name})' for status in ('PASSED', 'SKIPPED', 'FAILED', 'MISSED')]
+    return single + [f'TOTAL {upper1}']
+
+
+def compute_workflow_stats(rows, s1_col, s2_col, s1_name, s2_name, has_set2=True):
+    """Compute one workflow's metrics from its CSV rows.
+
+    rows are the row dicts of a single workflow; s1_col / s2_col name the
+    status columns of the two sets. Returns a dict keyed by the labels of
+    workflow_stats_keys(), inserted in that same order.
+    """
+    labels = workflow_stats_keys(s1_name, s2_name, has_set2=has_set2)
+
+    def tally(matches):
+        """Count the rows for which matches(row) is truthy."""
+        return sum(1 for row in rows if matches(row))
+
+    def executed(column):
+        """Count rows whose status in column is neither blank nor MISSED."""
+        return tally(lambda row: row[column].strip() and row[column].strip() != 'MISSED')
+
+    if not has_set2:
+        # Single-set report: one count per status, then every row with a non-blank status.
+        counts = [
+            tally(lambda row, wanted=status: row[s1_col] == wanted)
+            for status in ('PASSED', 'SKIPPED', 'FAILED', 'MISSED')
+        ]
+        counts.append(tally(lambda row: row[s1_col].strip()))
+        return dict(zip(labels, counts))
+
+    # Disagreements: tests set1 skipped or missed although set2 did not skip them.
+    skipped_s1_only = tally(lambda row: row[s1_col] == 'SKIPPED' and row[s2_col] != 'SKIPPED')
+    missed_s1 = tally(lambda row: row[s1_col] == 'MISSED' and row[s2_col] != 'SKIPPED')
+    executed_s2 = executed(s2_col)
+    disagreements = skipped_s1_only + missed_s1
+    # Disagreements as a share of set2's executed tests; 0 when set2 executed nothing.
+    percent = (disagreements / executed_s2 * 100) if executed_s2 else 0
+
+    # Same order as the two-set labels returned by workflow_stats_keys().
+    values = [
+        skipped_s1_only,
+        tally(lambda row: row[s1_col] == 'SKIPPED'),
+        tally(lambda row: row[s2_col] == 'SKIPPED'),
+        missed_s1,
+        tally(lambda row: row[s1_col] == 'PASSED' and row[s2_col] != 'PASSED'),
+        executed_s2,
+        executed(s1_col),
+        disagreements,
+        executed_s2 - disagreements,
+        f'{percent:.2f}%',
+    ]
+    return dict(zip(labels, values))
+
+
+def overall_stats_keys(s1_name, s2_name, has_set2=True):
+    """List the labels of the OVERALL section in the order they are reported.
+
+    Per-status labels use the set names as given; totals use them upper-cased.
+    Without set2 only the set1 status counts, total and running time appear.
+    """
+    statuses = ('PASSED', 'SKIPPED', 'FAILED', 'XFAILED')
+    upper1, upper2 = s1_name.upper(), s2_name.upper()
+    if not has_set2:
+        return [f'{status}({s1_name})' for status in statuses] + [
+            f'TOTAL {upper1}',
+            f'TOTAL {upper1} RUNNING TIME',
+        ]
+
+    labels = ['Overall DISAGREE%', 'Overall AGREE%']
+    # Interleave the sets: each status contributes its set1 label, then its set2 label.
+    labels.extend(
+        f'{status}({name})' for status in statuses for name in (s1_name, s2_name)
+    )
+    labels.extend([
+        f'TOTAL {upper2}',
+        f'TOTAL {upper1}',
+        f'TOTAL {upper1} RUNNING TIME',
+        f'TOTAL {upper2} RUNNING TIME',
+    ])
+    return labels
+
+
+def compute_overall_stats(rows, s1_col, s2_col, s1_time_col, s2_time_col, s1_name, s2_name, has_set2=True):
+    """Compute the OVERALL section metrics for one architecture.
+
+    rows are all CSV row dicts of the architecture. Returns a dict keyed by
+    the labels of overall_stats_keys(), inserted in that same order.
+    """
+    statuses = ('PASSED', 'SKIPPED', 'FAILED', 'XFAILED')
+    labels = overall_stats_keys(s1_name, s2_name, has_set2=has_set2)
+
+    def to_number(raw):
+        """Convert a running-time cell to float; cells float() rejects count as 0.0."""
+        try:
+            return float(raw)
+        except (ValueError, TypeError):
+            return 0.0
+
+    def total_time(column):
+        """Sum the running times in column, formatted with two decimals."""
+        return f'{sum(to_number(row[column]) for row in rows):.2f}'
+
+    def with_status(column, status):
+        """Count rows whose cell in column equals status exactly."""
+        return sum(1 for row in rows if row[column] == status)
+
+    def executed(column, subset):
+        """Count rows of subset whose status in column is neither blank nor MISSED."""
+        return sum(1 for row in subset if row[column].strip() and row[column].strip() != 'MISSED')
+
+    if not has_set2:
+        # Single-set report: status counts, rows with a non-blank status, total running time.
+        values = [with_status(s1_col, status) for status in statuses]
+        values.append(sum(1 for row in rows if row[s1_col].strip()))
+        values.append(total_time(s1_time_col))
+        return dict(zip(labels, values))
+
+    # The disagreement rate only looks at rows whose workflow is listed in WORKFLOWS.
+    disagreements = 0
+    executed_s2 = 0
+    for wf in WORKFLOWS:
+        wf_rows = [row for row in rows if row['work_flow_name'] == wf]
+        disagreements += sum(
+            1 for row in wf_rows
+            if row[s1_col] in ('SKIPPED', 'MISSED') and row[s2_col] != 'SKIPPED'
+        )
+        executed_s2 += executed(s2_col, wf_rows)
+
+    # Avoid dividing by zero when set2 executed nothing in those workflows.
+    disagree_pct = (disagreements / executed_s2 * 100) if executed_s2 else 0
+
+    # Values are appended in the order of the two-set labels.
+    values = [f'{disagree_pct:.2f}%', f'{100 - disagree_pct:.2f}%']
+    for status in statuses:
+        values += [with_status(s1_col, status), with_status(s2_col, status)]
+    values += [
+        executed(s2_col, rows),
+        executed(s1_col, rows),
+        total_time(s1_time_col),
+        total_time(s2_time_col),
+    ]
+    return dict(zip(labels, values))
+
+
+def collect_failed_tests(arch_data, archs, s1_name, s2_name):
+    """Gather every test that FAILED in either set, across all architectures.
+
+    Each entry holds arch, test identification, workflow and the stripped status per set.
+    """
+    failures = []
+    for arch in archs:
+        info = arch_data[arch]
+        status1_col, status2_col, _, _ = info['cols']
+        compare = info.get('has_set2', True)
+        for row in info['rows']:
+            status1 = row[status1_col].strip()
+            status2 = row[status2_col].strip() if compare else ''
+            if 'FAILED' not in (status1, status2):
+                continue
+            record = {'arch': arch}
+            record.update((field, row.get(field, '')) for field in ('test_file', 'test_class', 'test_name'))
+            record['workflow'] = row.get('work_flow_name', '')
+            record[f'status_{s1_name}'] = status1
+            if compare:
+                record[f'status_{s2_name}'] = status2
+            failures.append(record)
+    return failures
+
+
+def fmt_val(v):
+    """Render ints with thousands separators; anything else through str()."""
+    # bool is an int subclass, so True/False take the int branch as well.
+    return format(v, ',') if isinstance(v, int) else str(v)
+
+
+def build_rows(args, archs, arch_data):
+    """Return (label, payload) tuples: '__header__'/'__section__' markers carry a string, metric rows a list with one value per arch."""
+    out = []
+    any_has_set2 = any(d.get('has_set2', True) for d in arch_data.values())
+
+    if args.sha:
+        out.append(('__header__', f'Commit SHA: {args.sha}'))
+    if args.pr_id:
+        out.append(('__header__', f'PR ID: {args.pr_id}'))
+    # One label layout serves all archs; an arch computed in the other layout may lack a label, so show 'N/A' instead of raising KeyError.
+    wf_keys = workflow_stats_keys(args.set1_name, args.set2_name, has_set2=any_has_set2)
+    for wf in WORKFLOWS:
+        out.append(('__section__', WORKFLOW_DISPLAY[wf]))
+        arch_stats = {}
+        for arch in archs:
+            d = arch_data[arch]
+            s1_col, s2_col, _, _ = d['cols']
+            has_set2 = d.get('has_set2', True)
+            wf_rows = [r for r in d['rows'] if r['work_flow_name'] == wf]
+            arch_stats[arch] = compute_workflow_stats(
+                wf_rows, s1_col, s2_col, args.set1_name, args.set2_name,
+                has_set2=has_set2,
+            )
+        for key in wf_keys:
+            out.append((key, [arch_stats[a].get(key, 'N/A') for a in archs]))
+
+    out.append(('__section__', 'OVERALL'))
+    ov_keys = overall_stats_keys(args.set1_name, args.set2_name, has_set2=any_has_set2)
+    arch_overall = {}
+    for arch in archs:
+        d = arch_data[arch]
+        s1_col, s2_col, s1_time, s2_time = d['cols']
+        has_set2 = d.get('has_set2', True)
+        arch_overall[arch] = compute_overall_stats(
+            d['rows'], s1_col, s2_col, s1_time, s2_time,
+            args.set1_name, args.set2_name, has_set2=has_set2,
+        )
+    for key in ov_keys:
+        out.append((key, [arch_overall[a].get(key, 'N/A') for a in archs]))
+    return out
+
+
+def write_csv(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_name='set2', has_set2=True):
+    """Write the summary to output_path as CSV.
+
+    rows are (label, payload) tuples as produced by build_rows(). A FAILED TESTS
+    table follows when failed_tests is non-empty, with a set2 status column only if has_set2.
+    """
+    table = [[''] + list(archs)]
+    for label, payload in rows:
+        if label == '__section__':
+            table += [[], [payload]]  # blank separator row, then the section title
+        elif label == '__header__':
+            table.append([payload])
+        else:
+            table.append([label, *payload])
+    table.append([])
+
+    if failed_tests:
+        titles = ['Arch', 'Workflow', 'Test File', 'Test Class', 'Test Name', f'Status ({s1_name})']
+        if has_set2:
+            titles.append(f'Status ({s2_name})')
+        table += [['FAILED TESTS'], titles]
+        for test in failed_tests:
+            record = [test[key] for key in ('arch', 'workflow', 'test_file', 'test_class', 'test_name')]
+            record.append(test[f'status_{s1_name}'])
+            if has_set2:
+                record.append(test.get(f'status_{s2_name}', ''))
+            table.append(record)
+        table.append([])
+
+    with open(output_path, 'w', newline='') as handle:
+        csv.writer(handle).writerows(table)
+    print(f'CSV written to {output_path}')
+
+
+def write_markdown(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_name='set2', has_set2=True):
+    """Render the summary as Markdown, write it to output_path and return the text.
+
+    Consecutive metric rows become one table with a column per architecture;
+    '__header__' rows become bold lines and '__section__' rows '###' headings.
+    A FAILED TESTS section always closes the document.
+    """
+    doc = []
+    pending = []  # metric rows waiting to be emitted as a single table
+
+    def emit_table():
+        """Append the buffered metric rows to doc as one table, then empty the buffer."""
+        if pending:
+            heading = '| Metric | ' + ' | '.join(archs) + ' |'
+            divider = '| :--- | ' + ' | '.join(['---:'] * len(archs)) + ' |'
+            doc.extend([heading, divider])
+            for name, cells in pending:
+                doc.append(f'| {name} | ' + ' | '.join(fmt_val(cell) for cell in cells) + ' |')
+            doc.append('')
+            pending.clear()
+
+    # Markers close the table in progress; metric rows are buffered until the next marker.
+    for label, payload in rows:
+        if label == '__section__':
+            emit_table()
+            doc += [f'### {payload}', '']
+        elif label == '__header__':
+            emit_table()
+            doc += [f'**{payload}**', '']
+        else:
+            pending.append((label, payload))
+    emit_table()
+
+    # The FAILED TESTS heading is written whether or not anything failed.
+    doc += ['### FAILED TESTS', '']
+    if failed_tests:
+        titles = ['Arch', 'Workflow', 'Test File', 'Test Class', 'Test Name',
+                  f'Status ({s1_name})']
+        if has_set2:
+            titles.append(f'Status ({s2_name})')
+        doc.append('| ' + ' | '.join(titles) + ' |')
+        doc.append('| ' + ' | '.join(['---'] * len(titles)) + ' |')
+        for test in failed_tests:
+            cells = [test[key] for key in
+                     ('arch', 'workflow', 'test_file', 'test_class', 'test_name')]
+            cells.append(test[f'status_{s1_name}'])
+            if has_set2:
+                cells.append(test.get(f'status_{s2_name}', ''))
+            # format(cell) matches how an f-string would render each value.
+            doc.append('| ' + ' | '.join(format(cell) for cell in cells) + ' |')
+        doc.append('')
+    else:
+        doc += ['No failed tests found.', '']
+
+    # Lines are joined with newlines; the trailing '' entry ends the text with a newline.
+    text = '\n'.join(doc)
+    with open(output_path, 'w') as handle:
+        handle.write(text)
+    print(f'Markdown written to {output_path}')
+    return text
+
+
+def main():
+    """Summarize one CSV per architecture into OUTPUT.csv and OUTPUT.md (OUTPUT being the --output prefix)."""
+    args = parse_args()
+    if len(args.csv) != len(args.arch):
+        print('Error: --csv and --arch must have the same number of values')
+        sys.exit(1)
+    if len(set(args.arch)) != len(args.arch):  # arch_data is keyed by label: a repeat would overwrite an earlier CSV
+        print('Error: --arch labels must be unique')
+        sys.exit(1)
+
+    archs = args.arch
+    arch_data = {}
+    for csv_path, arch in zip(args.csv, archs):
+        rows = load_csv(csv_path)
+        headers = set(rows[0].keys()) if rows else set()
+        cols = detect_columns(headers, args.set1_name, args.set2_name)
+        has_set2 = any((r.get(cols[1]) or '').strip() for r in rows)  # absent or None set2 cells count as blank
+        arch_data[arch] = {'rows': rows, 'cols': cols, 'has_set2': has_set2}
+
+    data_rows = build_rows(args, archs, arch_data)
+    failed = collect_failed_tests(arch_data, archs, args.set1_name, args.set2_name)
+    any_has_set2 = any(d.get('has_set2', True) for d in arch_data.values())
+    output_base = args.output  # a trailing .csv/.md is stripped so both suffixes can be appended below
+    if output_base.endswith('.csv') or output_base.endswith('.md'):
+        output_base = output_base.rsplit('.', 1)[0]
+    write_csv(data_rows, archs, f'{output_base}.csv', failed, args.set1_name, args.set2_name, has_set2=any_has_set2)
+    write_markdown(data_rows, archs, f'{output_base}.md', failed, args.set1_name, args.set2_name, has_set2=any_has_set2)
+
+
+if __name__ == '__main__':  # run the CLI only when executed as a script, not when imported
+    main()
diff --git a/.automation_scripts/pytorch-unit-test-scripts/requirements.txt b/.automation_scripts/pytorch-unit-test-scripts/requirements.txt
new file mode 100644
index 0000000000000..9ee33b404d9cd
--- /dev/null
+++ b/.automation_scripts/pytorch-unit-test-scripts/requirements.txt
@@ -0,0 +1,4 @@
+pandas
+rockset
+boto3
+requests
diff --git a/.automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py b/.automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py
new file mode 100755
index 0000000000000..ad59b5dea49d9
--- /dev/null
+++ b/.automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py
@@ -0,0 +1,717 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import os
+import pandas as pd
+from enum import Enum
+from itertools import chain
+from pathlib import Path
+from upload_test_stats import (
+        parse_xml_report,
+        get_pytest_parallel_times,
+        summarize_test_cases,
+)
+
+# All unit test statuses; also the universe used to expand "NOT_<status>" filter terms
+UT_STATUS_LIST = [
+    "PASSED",
+    "MISSED",
+    "SKIPPED",
+    "FAILED",
+    "XFAILED",
+    "ERROR"
+]
+
+# Test files (first element of a test key) dropped when two sets are compared
+EXCLUDED_TEST_SUITES = [
+    "_nvfuser.test_dynamo",
+    "_nvfuser.test_python_frontend",
+    "_nvfuser.test_torchscript",
+    "test_jit_cuda_fuser",
+    "test_nvfuser_dynamo",
+    "test_nvfuser_frontend"
+]
+
+# Test classes (second element of a test key) dropped when two sets are compared
+EXCLUDED_TEST_CLASSES = [
+    "nvfuser_tests",
+    "TensorPipeCudaDdpComparisonTest",
+    "TensorPipeCudaDistAutogradTest",
+    "TensorPipeCudaRemoteModuleTest",
+    "TensorPipeCudaRpcTest",
+    "TensorPipeTensorPipeAgentCudaRpcTest",
+    "TensorPipeTensorPipeCudaDistAutogradTest",
+    "test_cpp_rpc"
+]
+# Individual (test_file, test_class, test_name) tuples dropped when two sets are compared
+EXCLUDED_TESTS = [
+]
+
+# Workflow names; picked per report directory from its path (test-default / test-distributed / test-inductor)
+WorkflowName = Enum('WorkflowName', ['default', 'distributed', 'inductor'])
+
+def _status_priority(test_case):
+    """Rank a test case by status so duplicate entries of a retried test can be
+    resolved: the higher rank wins. Order: PASSED > XFAILED > SKIPPED >
+    FAILED = ERROR > MISSED, so a passing retry (flaky test) counts as passing."""
+    ranks = {"PASSED": 4, "XFAILED": 3, "SKIPPED": 2, "FAILED": 1, "ERROR": 1, "MISSED": 0}
+    return ranks.get(get_test_status(test_case), 0)
+
+def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, path="."):
+    """Parse all XML reports under the sub-directories of `path` into one dict.
+    When several reports yield the same key (e.g. retries), the case with the
+    highest _status_priority is kept. Unparseable reports are skipped."""
+    test_cases = {}
+    for entry in os.listdir(path):
+        new_dir = path + '/' + entry + '/'
+        if not os.path.isdir(new_dir):
+            continue
+        # Reset per directory: an unrecognised directory must not inherit the
+        # workflow name of whichever directory happened to be listed before it.
+        work_flow_name = ""
+        if "test-default" in new_dir:
+            work_flow_name = WorkflowName.default.name
+        elif "test-distributed" in new_dir:
+            work_flow_name = WorkflowName.distributed.name
+        elif "test-inductor" in new_dir:
+            work_flow_name = WorkflowName.inductor.name
+        for xml_report in Path(new_dir).glob("**/*.xml"):
+            try:
+                new_cases = parse_xml_report(tag, xml_report, workflow_run_id, workflow_run_attempt, work_flow_name)
+            except Exception as e:
+                print(f"WARNING: Skipping malformed XML {xml_report}: {e}")
+                continue
+            # Retried tests show up more than once; keep the best-ranked result.
+            for key, case in new_cases.items():
+                existing = test_cases.get(key)
+                if existing is None or _status_priority(case) > _status_priority(existing):
+                    test_cases[key] = case
+    return test_cases
+
+def get_test_status(test_case):
+    """Map a parsed test-case dict to a status string.
+    Precedence: empty -> MISSED, skipped -> XFAILED or SKIPPED, failure -> FAILED,
+    error -> ERROR, otherwise PASSED."""
+    if not test_case:
+        return "MISSED"
+    skip_info = test_case.get("skipped")
+    if skip_info:
+        # A skip entry typed "pytest.xfail" is reported as XFAILED, not SKIPPED.
+        is_xfail = 'type' in skip_info and skip_info['type'] == "pytest.xfail"
+        return "XFAILED" if is_xfail else "SKIPPED"
+    if test_case.get("failure"):
+        return "FAILED"
+    if test_case.get("error"):
+        return "ERROR"
+    return "PASSED"
+
+def get_test_message(test_case, status=None):
+    """Return the skipped/failure/error entry of a test case, or "" if absent.
+
+    For status SKIPPED, FAILED or ERROR only the matching entry is looked up;
+    for any other status the first entry present wins, tried in the order
+    skipped, failure, error.
+    """
+    entry_for_status = (("SKIPPED", "skipped"), ("FAILED", "failure"), ("ERROR", "error"))
+    for wanted_status, entry in entry_for_status:
+        if status == wanted_status:
+            return test_case.get(entry, "")
+    # No specific status requested: fall back to whichever entry exists.
+    for _, entry in entry_for_status:
+        if entry in test_case:
+            return test_case[entry]
+    return ""
+
+def get_running_time(test_case):
+  """Return the case's recorded "time" value, or "" when it has none."""
+  if "time" in test_case:
+    return test_case["time"]
+  return ""
+
+def check_time_valid(time):
+  """Tell whether a running-time value is present (i.e. not "")."""
+  # get_running_time() uses "" as its "no time recorded" marker.
+  return time != ""
+
+def summarize_xml_files(args):
+    """Compare the XML test reports under args.set1 (and, optionally, args.set2).
+    Writes the per-test CSV (args.output_csv) and the per-test-file running-time CSV
+    (args.test_file_running_time_output_csv), then prints summary statistics.
+    `args` is the namespace produced by parse_args(); nothing is returned."""
+    set1_path = args.set1 if args.set1 else "."
+    set2_path = args.set2
+    set1_name = args.set1_name
+    set2_name = args.set2_name
+
+    # statistics
+    SKIPPED_DEFAULT = 0
+    MISSED_DEFAULT = 0
+    CUDA_DEFAULT = 0
+    ROCM_DEFAULT = 0
+    ROCMONLY_DEFAULT = 0
+
+    SKIPPED_DISTRIBUTED = 0
+    MISSED_DISTRIBUTED = 0
+    CUDA_DISTRIBUTED = 0
+    ROCM_DISTRIBUTED = 0
+    ROCMONLY_DISTRIBUTED = 0
+
+    SKIPPED_INDUCTOR = 0
+    MISSED_INDUCTOR = 0
+    CUDA_INDUCTOR = 0
+    ROCM_INDUCTOR = 0
+    ROCMONLY_INDUCTOR = 0
+
+    TOTAL_CUDA_RUNNING_TIME = 0.0
+    TOTAL_ROCM_RUNNING_TIME = 0.0
+
+    # filter example: --filter SKIPPED-PASSED-MISSED-PASSED (tuples: set1 status1 - set2 status1, set1 status2 - set2 status2)
+    list_of_status = args.filter.split('-') if args.filter else []
+    # assertion: should be an even number length
+    assert len(list_of_status) % 2 == 0
+    list_status_set1 = []
+    list_status_set2 = []
+
+    index = 0
+    while index < len(list_of_status):
+        # special handling for status-NOT_status scenario
+        if "NOT" in list_of_status[index] or "NOT" in list_of_status[index+1]:
+            if "NOT" in list_of_status[index]:
+                items = list_of_status[index].split('_')
+                not_item = items[1]
+                for ind in range(len(UT_STATUS_LIST)):
+                    if UT_STATUS_LIST[ind] != not_item:
+                        list_status_set1.append(UT_STATUS_LIST[ind])
+                        list_status_set2.append(list_of_status[index+1])
+            else:
+                items = list_of_status[index+1].split('_')
+                not_item = items[1]
+                for ind in range(len(UT_STATUS_LIST)):
+                    if UT_STATUS_LIST[ind] != not_item:
+                        list_status_set2.append(UT_STATUS_LIST[ind])
+                        list_status_set1.append(list_of_status[index])
+            index += 2
+        else:
+            list_status_set1.append(list_of_status[index])
+            index += 1
+            list_status_set2.append(list_of_status[index])
+            index += 1
+
+    assert len(list_status_set1) == len(list_status_set2), \
+            "status_list not specified correctly, should be in pairs of two"
+    len_status_filter = len(list_status_set1)
+
+    # define column list
+    column_list = ['set1', 'set2', 'skip_reason', 'assignee', 'comments']
+
+    # function location pattern
+    pattern = "at 0x"
+
+    #parse the xml files
+    test_cases_set1_running_time = parse_xml_reports_as_dict(-1, -1, 'testsuite', set1_path)
+    # TODO: Does it matter what the workflow_run_attempt is set to below??
+    # test_cases is dict of dicts, with keys as tuple of test_file, test_class, test_name and test workflow
+    test_cases_set1 = parse_xml_reports_as_dict(-1, -1, 'testcase', set1_path)
+    for (k,v) in list(test_cases_set1.items()):
+        if v['work_flow_name'] == WorkflowName.default.name:
+            ROCM_DEFAULT += 1
+        elif v['work_flow_name'] == WorkflowName.distributed.name:
+            ROCM_DISTRIBUTED += 1
+        elif v['work_flow_name'] == WorkflowName.inductor.name:
+            ROCM_INDUCTOR += 1
+
+    # start with creating empty dicts for set2 for each test tuple
+    # for rocm/cuda comparison(with valid set2_path), sometimes parity sheet has inaccurate results due to different function string but with same test names,
+    # such as test_np_argmin_argmax_keepdims_size_(1, 2, 3, 4)_axis_-4_method_<function argmax at 0x7f1e411e6a70>
+    test_cases_set1_new: Dict[Tuple[str], Dict[str, Any]] = {}
+    if set2_path:
+      for (k,v) in list(test_cases_set1.items()):
+        if pattern in k[2]:
+          values = list(k)
+          index = k[2].find(pattern)
+          values[2] = k[2][0 : index]
+          k_new = tuple(values)
+          test_cases_set1_new[k_new] = v
+          del test_cases_set1[k]
+      #combine two dict
+      test_cases_set1_combined = {**test_cases_set1, **test_cases_set1_new}
+      test_cases = { k:[v, {}] for (k,v) in test_cases_set1_combined.items() }
+    else:
+      test_cases = { k:[v, {}] for (k,v) in test_cases_set1.items() }
+
+    test_cases_set2_running_time = {}
+    if set2_path:
+      assert set2_path != set1_path, \
+              "set2 path not specified correctly, should be different from set1 path"
+      test_cases_set2_running_time = parse_xml_reports_as_dict(-1, -1, 'testsuite', set2_path)
+      test_cases_set2 = parse_xml_reports_as_dict(-1, -1, 'testcase', set2_path)
+      for (k,v) in list(test_cases_set2.items()):
+          if v['work_flow_name'] == WorkflowName.default.name:
+              CUDA_DEFAULT += 1
+          elif v['work_flow_name'] == WorkflowName.distributed.name:
+              CUDA_DISTRIBUTED += 1
+          elif v['work_flow_name'] == WorkflowName.inductor.name:
+              CUDA_INDUCTOR += 1
+
+      # for rocm/cuda comparison, sometimes parity sheet has inaccurate results due to different function string but with same test names,
+      # such as test_np_argmin_argmax_keepdims_size_(1, 2, 3, 4)_axis_-4_method_<function argmax at 0x7f1e411e6a70>
+      test_cases_set2_new: Dict[Tuple[str], Dict[str, Any]] = {}
+      for (k,v) in list(test_cases_set2.items()):
+        if pattern in k[2]:
+          values = list(k)
+          index = k[2].find(pattern)
+          values[2] = k[2][0 : index]
+          k_new = tuple(values)
+          test_cases_set2_new[k_new] = v
+          del test_cases_set2[k]
+      #combine two dict
+      test_cases_set2_combined = {**test_cases_set2, **test_cases_set2_new}
+
+      # repopulate set2 dicts for test_tuples from test_cases_set2,
+      # creating empty dicts for set1 if test_tuple doesn't exist in test_cases
+      for test_case in test_cases_set2_combined:
+        test_cases[test_case] = [test_cases_set1_combined[test_case] if test_case in test_cases_set1_combined else {}, test_cases_set2_combined[test_case]]
+
+    # expand with skip_reason, assignee and comments
+    for (k,v) in list(test_cases.items()):
+        # set1, set2, skip_reason, assignee and comments
+        while len(v) < len(column_list):
+            v.append('')
+
+    # get running time statistics before any exclusion and filter since they are only for comparison
+    # total running time: ROCm and CUDA
+    for (k,v) in list(test_cases_set1_running_time.items()):
+          TOTAL_ROCM_RUNNING_TIME += v["running_time_xml"]
+    for (k,v) in list(test_cases_set2_running_time.items()):
+          TOTAL_CUDA_RUNNING_TIME += v["running_time_xml"]
+
+    # test file level running time: ROCm and CUDA
+    test_file_level_ROCm: Dict[Tuple[str], float] = {}
+    test_file_level_CUDA: Dict[Tuple[str], float] = {}
+    for (k,v) in list(test_cases_set1_running_time.items()):
+          test_file_name = k[0]
+          test_workflow_name = k[2]
+          tar_tup_rocm = (test_file_name, test_workflow_name,)
+          if test_file_level_ROCm.get(tar_tup_rocm) is None:
+              test_file_level_ROCm[ ( test_file_name, test_workflow_name ) ] = v["running_time_xml"]
+          else:
+              test_file_level_ROCm[ ( test_file_name, test_workflow_name ) ] += v["running_time_xml"]
+    for (k,v) in list(test_cases_set2_running_time.items()):
+          test_file_name = k[0]
+          test_workflow_name = k[2]
+          tar_tup_cuda = (test_file_name, test_workflow_name)
+          if test_file_level_CUDA.get(tar_tup_cuda) is None:
+              test_file_level_CUDA[ ( test_file_name, test_workflow_name ) ] = v["running_time_xml"]
+          else:
+              test_file_level_CUDA[ ( test_file_name, test_workflow_name ) ] += v["running_time_xml"]
+
+    # test file level counts: ROCm tests run, passed, skipped, missed; CUDA tests run
+    test_file_counts_ROCm: Dict[Tuple[str], Dict[str, int]] = {}
+    test_file_counts_CUDA: Dict[Tuple[str], int] = {}
+    for (k,v) in list(test_cases_set1.items()):
+        test_file_name = k[0]
+        test_workflow_name = v['work_flow_name']
+        tar_tup = (test_file_name, test_workflow_name)
+        if tar_tup not in test_file_counts_ROCm:
+            test_file_counts_ROCm[tar_tup] = {'tests_run': 0, 'passed': 0, 'skipped': 0, 'missed': 0}
+        test_file_counts_ROCm[tar_tup]['tests_run'] += 1
+        status = get_test_status(v)
+        if status == "PASSED":
+            test_file_counts_ROCm[tar_tup]['passed'] += 1
+        elif status == "SKIPPED":
+            test_file_counts_ROCm[tar_tup]['skipped'] += 1
+        elif status == "MISSED":
+            test_file_counts_ROCm[tar_tup]['missed'] += 1
+    for (k,v) in list(test_cases_set2.items()) if set2_path else []:
+        test_file_name = k[0]
+        test_workflow_name = v['work_flow_name']
+        tar_tup = (test_file_name, test_workflow_name)
+        if tar_tup not in test_file_counts_CUDA:
+            test_file_counts_CUDA[tar_tup] = 0
+        test_file_counts_CUDA[tar_tup] += 1
+
+    # exclude certain tests for comparison
+    if set2_path:
+      for (k,v) in list(test_cases.items()):
+          if k[0] in EXCLUDED_TEST_SUITES:
+              test_cases.pop(k)
+          elif k[1] in EXCLUDED_TEST_CLASSES:
+              test_cases.pop(k)
+          elif (k[0], k[1], k[2]) in EXCLUDED_TESTS:
+              test_cases.pop(k)
+
+    # remove unmatched items if user specified ut status filters
+    if len_status_filter > 0:
+        for (k,v) in list(test_cases.items()):
+            case_matched = False
+            status_set_1 = get_test_status(v[0])
+            status_set_2 = get_test_status(v[1]) if set2_path else ""
+            for index in range(len_status_filter):
+                if status_set_1 == list_status_set1[index] and status_set_2 == list_status_set2[index]:
+                    case_matched = True
+                    break
+
+            if not case_matched:
+                test_cases.pop(k)
+
+    # insert skip_reason, assignee and comments info for the cases that: rocm-missed+cuda-passed OR rocm-skipped+cuda-passed
+    # To do: assume set1 is ROCm currently. Should insert another arg for ROCm and CUDA order?
+    skip_reasons_stat_default = dict()
+    skip_reasons_stat_distributed = dict()
+    skip_reasons_stat_inductor = dict()
+    if args.skip_reasons:
+        # read skip reasons csv file
+        known_skips = pd.read_csv(args.skip_reasons, sep='\t')
+        known_skips = known_skips.to_dict(orient="records")
+
+    # Load previous week's CSV to check if tests existed and get skip reasons
+    prev_week_tests = set()
+    prev_week_skip_reasons = {}  # Maps (test_file, test_class, test_name) -> (skip_reason, assignee, comments)
+    if args.prev_week_csv:
+        prev_week_df = pd.read_csv(args.prev_week_csv)
+        for _, row in prev_week_df.iterrows():
+            test_key = (row['test_file'], row['test_class'], row['test_name'])
+            prev_week_tests.add(test_key)
+            # Also extract skip_reason, assignee, comments if they exist
+            skip_reason = row.get('skip_reason', '') if 'skip_reason' in row and not pd.isna(row.get('skip_reason', '')) else ''
+            assignee = row.get('assignee', '') if 'assignee' in row and not pd.isna(row.get('assignee', '')) else ''
+            comments = row.get('comments', '') if 'comments' in row and not pd.isna(row.get('comments', '')) else ''
+            if skip_reason or assignee or comments:
+                prev_week_skip_reasons[test_key] = (skip_reason, assignee, comments)
+
+    for (k,v) in list(test_cases.items()):
+        status_set_1 = get_test_status(v[0])
+        status_set_2 = get_test_status(v[1]) if set2_path else ""
+        test_file_name = k[0]
+        test_info = v[0]
+        test_info_set2 = []
+        if status_set_1 == "SKIPPED" and status_set_2 != "SKIPPED":
+            if test_info['work_flow_name'] == WorkflowName.default.name:
+                SKIPPED_DEFAULT += 1
+            elif test_info['work_flow_name'] == WorkflowName.distributed.name:
+                SKIPPED_DISTRIBUTED += 1
+            elif test_info['work_flow_name'] == WorkflowName.inductor.name:
+                SKIPPED_INDUCTOR += 1
+        elif set2_path:
+            test_info_set2 = v[1]
+            if status_set_1 == "MISSED" and status_set_2 != "MISSED":
+              if test_info_set2['work_flow_name'] == WorkflowName.default.name:
+                MISSED_DEFAULT += 1
+              elif test_info_set2['work_flow_name'] == WorkflowName.distributed.name:
+                MISSED_DISTRIBUTED += 1
+              elif test_info_set2['work_flow_name'] == WorkflowName.inductor.name:
+                MISSED_INDUCTOR += 1
+
+        if args.skip_reasons:
+            if (status_set_1 == "SKIPPED" and status_set_2 != "SKIPPED") or status_set_1 == "MISSED":
+              for known_skip in known_skips:
+                  if test_file_name == known_skip['test_file'] and k[1] == known_skip['test_class'] and k[2] == known_skip['test_name']:
+                      v[2] = known_skip['skip_reason'] if known_skip.__contains__('skip_reason') and not pd.isna(known_skip['skip_reason']) else ' '
+                      if (test_info.__contains__('work_flow_name') and test_info['work_flow_name'] == WorkflowName.default.name) or (test_info_set2.__contains__('work_flow_name') and test_info_set2['work_flow_name'] == WorkflowName.default.name):
+                          if not skip_reasons_stat_default.__contains__(v[2]):
+                              skip_reasons_stat_default[v[2]] = 1
+                          else:
+                              skip_reasons_stat_default[v[2]] += 1
+                      elif (test_info.__contains__('work_flow_name') and test_info['work_flow_name'] == WorkflowName.distributed.name) or (test_info_set2.__contains__('work_flow_name') and test_info_set2['work_flow_name'] == WorkflowName.distributed.name):
+                          if not skip_reasons_stat_distributed.__contains__(v[2]):
+                              skip_reasons_stat_distributed[v[2]] = 1
+                          else:
+                              skip_reasons_stat_distributed[v[2]] += 1
+                      elif (test_info.__contains__('work_flow_name') and test_info['work_flow_name'] == WorkflowName.inductor.name) or (test_info_set2.__contains__('work_flow_name') and test_info_set2['work_flow_name'] == WorkflowName.inductor.name):
+                          if not skip_reasons_stat_inductor.__contains__(v[2]):
+                              skip_reasons_stat_inductor[v[2]] = 1
+                          else:
+                              skip_reasons_stat_inductor[v[2]] += 1
+                      v[3] = known_skip['assignee'] if known_skip.__contains__('assignee') and not pd.isna(known_skip['assignee']) else ' '
+                      v[4] = known_skip['comments'] if known_skip.__contains__('comments') and not pd.isna(known_skip['comments']) else ' '
+                      break
+
+        if status_set_1 == "PASSED" and status_set_2 != "PASSED" and set2_path:
+            if test_info['work_flow_name'] == WorkflowName.default.name:
+                ROCMONLY_DEFAULT += 1
+            elif test_info['work_flow_name'] == WorkflowName.distributed.name:
+                ROCMONLY_DISTRIBUTED += 1
+            elif test_info['work_flow_name'] == WorkflowName.inductor.name:
+                ROCMONLY_INDUCTOR += 1
+
+    # ' ' is the placeholder stored above when a known skip has no reason; keep it out of every summary.
+    skip_reasons_stat_default.pop(' ', None)
+    skip_reasons_stat_distributed.pop(' ', None)
+    skip_reasons_stat_inductor.pop(' ', None)
+
+    test_cases_for_csv = {}
+    # k is test_tuple, v is list of rocm and cuda info for that test_tuple
+    skip_reason_file_specified = bool(args.skip_reasons)
+    for (k,v) in test_cases.items():
+        item_values = {}
+        item_values["test_file"] = k[0]
+        item_values["test_class"] = k[1]
+        item_values["test_name"] = k[2]
+        item_values[f"status_{set1_name}"] = get_test_status(v[0])
+        item_values[f"status_{set2_name}"] = get_test_status(v[1]) if set2_path else ""
+        # get workflow info
+        v_values = v[0]
+        v1_values = v[1] if set2_path else []
+        workflow_name = ""
+        item_values["work_flow_name"] = ""
+        if item_values[f"status_{set1_name}"] != "MISSED":
+            workflow_name = v_values['work_flow_name']
+        elif item_values[f"status_{set2_name}"] != "MISSED" and item_values[f"status_{set2_name}"] != "":
+            workflow_name = v1_values['work_flow_name']
+        item_values["work_flow_name"] = workflow_name
+        # get test related info
+        item_values[f"message_{set1_name}"] = get_test_message(v[0])
+        item_values[f"message_{set2_name}"] = get_test_message(v[1]) if set2_path else ""
+        # Get skip_reason, assignee, comments from --skip_reasons file if specified
+        if skip_reason_file_specified:
+            item_values["skip_reason"] = v[2]
+            item_values["assignee"] = v[3]
+            item_values["comments"] = v[4]
+        # Check if test existed in previous week's CSV and get skip reasons from there
+        if args.prev_week_csv:
+            test_key = (k[0], k[1], k[2])  # (test_file, test_class, test_name)
+            item_values["existed_last_week"] = "yes" if test_key in prev_week_tests else "no"
+            # If skip_reason not set by --skip_reasons, try to get from prev_week_csv
+            if not skip_reason_file_specified:
+                if test_key in prev_week_skip_reasons:
+                    prev_skip_reason, prev_assignee, prev_comments = prev_week_skip_reasons[test_key]
+                    item_values["skip_reason"] = prev_skip_reason
+                    item_values["assignee"] = prev_assignee
+                    item_values["comments"] = prev_comments
+                else:
+                    item_values["skip_reason"] = ""
+                    item_values["assignee"] = ""
+                    item_values["comments"] = ""
+        if not skip_reason_file_specified and not args.prev_week_csv:
+            item_values["skip_reason"] = ""
+            item_values["assignee"] = ""
+            item_values["comments"] = ""
+        running_time1 = get_running_time(v[0])
+        item_values[f"running_time_{set1_name}"] = running_time1
+        running_time2 = get_running_time(v[1])
+        item_values[f"running_time_{set2_name}"] = running_time2
+        item_values["abs_time_diff"] = ""
+        item_values["relative_time_diff"] = ""
+        if check_time_valid(running_time1) and check_time_valid(running_time2):
+          item_values["abs_time_diff"] = running_time1 - running_time2
+          if get_running_time(v[1]) != 0.0:
+            item_values["relative_time_diff"] = 100 * (running_time1 - running_time2) / running_time2
+        test_cases_for_csv[k] = item_values
+
+    test_cases_for_csv = dict(sorted(test_cases_for_csv.items()))
+
+    #store test_cases in csv
+    tests_from_xml_filename = args.output_csv
+    keys_list = list(set(chain.from_iterable(sub.keys() for sub in test_cases_for_csv.values())))
+
+    def sorting_key(e):
+        if e == "invoking_file":
+          return 0
+        elif e == "test_file":
+          return 1
+        elif e == "test_class":
+          return 2
+        elif e == "test_name":
+          return 3
+        elif e == "work_flow_name":
+          return 4
+        elif e == "skip_reason":
+          return 5
+        elif e == "assignee":
+          return 6
+        elif e == "comments":
+          return 7
+        elif e == f"status_{set1_name}":
+          return 8
+        elif e == f"message_{set1_name}":
+          return 9
+        elif e == f"running_time_{set1_name}":
+          return 10
+        elif e == f"status_{set2_name}":
+          return 11
+        elif e == f"message_{set2_name}":
+          return 12
+        elif e == f"running_time_{set2_name}":
+          return 13
+        elif e == "abs_time_diff":
+          return 14
+        elif e == "relative_time_diff":
+          return 15
+        elif e == "skipped":
+          return 16
+        elif e == "failure":
+          return 17
+        elif e == "error":
+          return 18
+        elif e == "system-out":
+          return 19
+        elif e == "existed_last_week":
+          return 20
+        elif e == "workflow_run_attempt" or e == "job_id":
+          return 1000
+        else:
+          return 100
+
+    keys_list.sort(key=sorting_key)
+
+    with open(tests_from_xml_filename, "w") as outfile:
+         writer = csv.DictWriter(outfile, fieldnames = keys_list)
+         writer.writeheader()
+         writer.writerows(test_cases_for_csv.values())
+    ## TODO - usage yet to be identified
+    #pytest_parallel_times = get_pytest_parallel_times()
+    ##extract test cases summary and save them in csv file
+    #test_cases_summary = summarize_test_cases(test_cases)
+    #testcases_summary_filename = "testcases_summary.csv"
+    #keys_list = list(set(chain.from_iterable(sub.keys() for sub in test_cases_summary)))
+    #with open(testcases_summary_filename, "w") as outfile:
+    #     writer = csv.DictWriter(outfile, fieldnames = keys_list)
+    #     writer.writeheader()
+    #     writer.writerows(test_cases_summary)
+
+    # write test file running time to file
+    test_file_running_time_for_csv = {}
+    for key_rocm in test_file_level_ROCm.keys():
+        item_values = {}
+        item_values["test_file"] = key_rocm[0]
+        item_values["test_workflow"] = key_rocm[1]
+        item_values["rocm_running_time"] = test_file_level_ROCm[key_rocm]
+        item_values["cuda_running_time"] = 0.0
+        if key_rocm in test_file_level_CUDA.keys():
+            item_values["cuda_running_time"] = test_file_level_CUDA[key_rocm]
+        item_values["abs_time_diff"] = item_values["rocm_running_time"] - item_values["cuda_running_time"]
+        item_values["relative_time_diff"] = 0.0
+        if item_values["cuda_running_time"] != 0.0:
+            item_values["relative_time_diff"] = 100 * (item_values["rocm_running_time"] - item_values["cuda_running_time"]) / item_values["cuda_running_time"]
+        # Add test counts
+        item_values["rocm_tests_run"] = test_file_counts_ROCm.get(key_rocm, {}).get('tests_run', 0)
+        item_values["cuda_tests_run"] = test_file_counts_CUDA.get(key_rocm, 0)
+        item_values["rocm_passed"] = test_file_counts_ROCm.get(key_rocm, {}).get('passed', 0)
+        item_values["rocm_skipped"] = test_file_counts_ROCm.get(key_rocm, {}).get('skipped', 0)
+        item_values["rocm_missed"] = test_file_counts_ROCm.get(key_rocm, {}).get('missed', 0)
+        test_file_running_time_for_csv[key_rocm] = item_values
+
+    for key_cuda in test_file_level_CUDA.keys():
+        if not key_cuda in test_file_level_ROCm.keys():
+            item_values = {}
+            item_values["test_file"] = key_cuda[0]
+            item_values["test_workflow"] = key_cuda[1]
+            item_values["rocm_running_time"] = 0.0
+            item_values["cuda_running_time"] = test_file_level_CUDA[key_cuda]
+            item_values["abs_time_diff"] = item_values["rocm_running_time"] - item_values["cuda_running_time"]
+            item_values["relative_time_diff"] = 0.0
+            if item_values["cuda_running_time"] != 0.0:
+                item_values["relative_time_diff"] = 100 * (item_values["rocm_running_time"] - item_values["cuda_running_time"]) / item_values["cuda_running_time"]
+            # Add test counts
+            item_values["rocm_tests_run"] = test_file_counts_ROCm.get(key_cuda, {}).get('tests_run', 0)
+            item_values["cuda_tests_run"] = test_file_counts_CUDA.get(key_cuda, 0)
+            item_values["rocm_passed"] = test_file_counts_ROCm.get(key_cuda, {}).get('passed', 0)
+            item_values["rocm_skipped"] = test_file_counts_ROCm.get(key_cuda, {}).get('skipped', 0)
+            item_values["rocm_missed"] = test_file_counts_ROCm.get(key_cuda, {}).get('missed', 0)
+            test_file_running_time_for_csv[key_cuda] = item_values
+
+    test_file_running_time_for_csv = dict(sorted(test_file_running_time_for_csv.items()))
+    keys_list_running_time = list(set(chain.from_iterable(sub.keys() for sub in test_file_running_time_for_csv.values())))
+    def sorting_key_running_time(e):
+        if e == "test_file":
+          return 0
+        elif e == "test_workflow":
+          return 1
+        elif e == "rocm_running_time":
+          return 2
+        elif e == "cuda_running_time":
+          return 3
+        elif e == "abs_time_diff":
+          return 4
+        elif e == "relative_time_diff":
+          return 5
+        elif e == "rocm_tests_run":
+          return 6
+        elif e == "cuda_tests_run":
+          return 7
+        elif e == "rocm_passed":
+          return 8
+        elif e == "rocm_skipped":
+          return 9
+        elif e == "rocm_missed":
+          return 10
+        else:
+          return 100
+
+    keys_list_running_time.sort(key=sorting_key_running_time)
+    tests_from_xml_file_running_time = args.test_file_running_time_output_csv
+    with open(tests_from_xml_file_running_time, "w") as outfile:
+         writer = csv.DictWriter(outfile, fieldnames = keys_list_running_time)
+         writer.writeheader()
+         writer.writerows(test_file_running_time_for_csv.values())
+
+    # print summary
+    print( " " )
+    print( "_____________________________________" )
+    print( "Test-results" )
+    print( " " )
+    print( "=====Single GPU Number=====" )
+    print( "SKIPPED_DEFAULT, MISSED_DEFAULT, ROCMONLY_DEFAULT, CUDA_DEFAULT, ROCM_DEFAULT" )
+    print( str(SKIPPED_DEFAULT) + ", " + str(MISSED_DEFAULT) + ", " + str(ROCMONLY_DEFAULT) + ", " + str(CUDA_DEFAULT) + ", " + str(ROCM_DEFAULT) )
+    print( " " )
+    print( "=====Distributed GPU Number=====" )
+    print( "SKIPPED_DISTRIBUTED, MISSED_DISTRIBUTED, ROCMONLY_DISTRIBUTED, CUDA_DISTRIBUTED, ROCM_DISTRIBUTED" )
+    print( str(SKIPPED_DISTRIBUTED) + ", " + str(MISSED_DISTRIBUTED) + ", " + str(ROCMONLY_DISTRIBUTED) + ", " + str(CUDA_DISTRIBUTED) + ", " + str(ROCM_DISTRIBUTED) )
+    print( " " )
+    print( "=====Inductor GPU Number=====" )
+    print( "SKIPPED_INDUCTOR, MISSED_INDUCTOR, ROCMONLY_INDUCTOR, CUDA_INDUCTOR, ROCM_INDUCTOR" )
+    print( str(SKIPPED_INDUCTOR) + ", " + str(MISSED_INDUCTOR) + ", " + str(ROCMONLY_INDUCTOR) + ", " + str(CUDA_INDUCTOR) + ", " + str(ROCM_INDUCTOR) )
+    print( " " )
+    print( "SELECTED CAUSES SUMMARY" )
+    print( " " )
+    print( "=====================" )
+    print( "Single GPU test" )
+    sorted_skip_reasons_statistics_default = sorted(skip_reasons_stat_default.keys(), key = lambda x : x.lower())
+    for skip_reason_entry in sorted_skip_reasons_statistics_default:
+        print( skip_reason_entry, ": ", skip_reasons_stat_default[skip_reason_entry] )
+    print( " " )
+    print( "=====================" )
+    print( "Distributed test" )
+    sorted_skip_reasons_distributed_statistics = sorted(skip_reasons_stat_distributed.keys(), key = lambda x : x.lower())
+    for skip_reason_entry in sorted_skip_reasons_distributed_statistics:
+        print( skip_reason_entry, ": ", skip_reasons_stat_distributed[skip_reason_entry] )
+    print( " " )
+    print( "=====================" )
+    print( "Inductor test" )
+    sorted_skip_reasons_statistics_inductor = sorted(skip_reasons_stat_inductor.keys(), key = lambda x : x.lower())
+    for skip_reason_entry in sorted_skip_reasons_statistics_inductor:
+        print( skip_reason_entry, ": ", skip_reasons_stat_inductor[skip_reason_entry] )
+    print( " " )
+    print( "=====================" )
+    print( "Time statistics" )
+    print( "ROCM_RUNNING_TIME, CUDA_RUNNING_TIME" )
+    print( str(TOTAL_ROCM_RUNNING_TIME) + ", " + str(TOTAL_CUDA_RUNNING_TIME) )
+    #print( "ROCm test file level time statistics" )
+    #for (k,v) in list(test_file_level_ROCm.items()):
+      #print( k[0] + ", " + k[1] + ", " + k[2] + ", " + str(v) )
+    #print( "CUDA test file level time statistics" )
+    #for (k,v) in list(test_file_level_CUDA.items()):
+      #print( k[0] + ", " + k[1] + ", " + k[2] + ", " + str(v) )
+
+def parse_args():
+    cli = argparse.ArgumentParser(description='Parse xml test-reports')  # every option below is an optional string
+    cli.add_argument("--set1", type=str, help="absolute or relative path to first test-reports dir")
+    cli.add_argument("--set2", type=str, help="absolute or relative path to second test-reports dir")
+    cli.add_argument("--set1_name", type=str, default="set1", help="display name for set1 in CSV column headers (default: set1)")
+    cli.add_argument("--set2_name", type=str, default="set2", help="display name for set2 in CSV column headers (default: set2)")
+    cli.add_argument("--output_csv", type=str, default="tests_from_xml.csv", help="output csv filename")
+    cli.add_argument("--filter", type=str, help="ut status filter flag")
+    cli.add_argument("--skip_reasons", type=str, help='skip reasons file')
+    cli.add_argument("--test_file_running_time_output_csv", type=str, default="file_running_time_output.csv", help="file running time output csv filename")
+    cli.add_argument("--prev_week_csv", type=str, help="previous week's all tests status CSV file to check if tests existed")
+    return cli.parse_args()
+
+def main():
+    global args  # NOTE(review): nothing in this script reads the module-level `args`; the global looks unnecessary - confirm before removing
+    args = parse_args()
+    summarize_xml_files(args)  # parse the CLI, then build the CSVs and print the summary
+
+if __name__ == "__main__":
+    main()
+
diff --git a/.automation_scripts/pytorch-unit-test-scripts/upload_stats_lib.py b/.automation_scripts/pytorch-unit-test-scripts/upload_stats_lib.py
new file mode 100644
index 0000000000000..218e35768ef2c
--- /dev/null
+++ b/.automation_scripts/pytorch-unit-test-scripts/upload_stats_lib.py
@@ -0,0 +1,187 @@
+import gzip
+import io
+import json
+import os
+import xml.etree.ElementTree as ET
+import zipfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import boto3  # type: ignore[import]
+import requests
+import rockset  # type: ignore[import]
+
+PYTORCH_REPO = "https://api.github.com/repos/pytorch/pytorch"
+S3_RESOURCE = boto3.resource("s3")
+TARGET_WORKFLOW = "--rerun-disabled-tests"
+
+
+def _get_request_headers() -> Dict[str, str]:
+    return {
+        "Accept": "application/vnd.github.v3+json",
+        "Authorization": "token " + os.environ["GITHUB_TOKEN"],
+    }
+
+
+def _get_artifact_urls(prefix: str, workflow_run_id: int) -> Dict[Path, str]:
+    """Map each artifact of the run whose name starts with `prefix` to its download URL."""
+    # Send the auth headers on the first page as well, matching the follow-up pages.
+    first_page = f"{PYTORCH_REPO}/actions/runs/{workflow_run_id}/artifacts?per_page=100"
+    response = requests.get(first_page, headers=_get_request_headers())
+    artifacts = response.json()["artifacts"]
+    while "next" in response.links.keys():
+        response = requests.get(
+            response.links["next"]["url"], headers=_get_request_headers()
+        )
+        artifacts.extend(response.json()["artifacts"])
+
+    artifact_urls = {}
+    for artifact in artifacts:
+        if artifact["name"].startswith(prefix):
+            artifact_urls[Path(artifact["name"])] = artifact["archive_download_url"]
+    return artifact_urls
+
+
+def _download_artifact(
+    artifact_name: Path, artifact_url: str, workflow_run_attempt: int
+) -> Path:
+    # [Artifact run attempt]
+    # All artifacts on a workflow share a single namespace. However, we can
+    # re-run a workflow and produce a new set of artifacts. To avoid name
+    # collisions, we add `-runattempt1<run #>-` somewhere in the artifact name.
+    #
+    # This code parses out the run attempt number from the artifact name. On a
+    # mismatch it only logs; NOTE(review): the download below still happens.
+    atoms = str(artifact_name).split("-")
+    for atom in atoms:
+        if atom.startswith("runattempt"):
+            found_run_attempt = int(atom[len("runattempt") :])
+            if workflow_run_attempt != found_run_attempt:
+                print(
+                    f"Skipping {artifact_name} as it is an invalid run attempt. "
+                    f"Expected {workflow_run_attempt}, found {found_run_attempt}."
+                )
+
+    print(f"Downloading {artifact_name}")
+
+    response = requests.get(artifact_url, headers=_get_request_headers())
+    with open(artifact_name, "wb") as f:
+        f.write(response.content)
+    return artifact_name
+
+
+def download_s3_artifacts(
+    prefix: str,
+    workflow_run_id: int,
+    workflow_run_attempt: int,
+    allowed_substrings: Optional[List[str]] = None,
+) -> List[Path]:
+    bucket = S3_RESOURCE.Bucket("gha-artifacts")
+    objs = bucket.objects.filter(
+        Prefix=f"pytorch/pytorch/{workflow_run_id}/{workflow_run_attempt}/artifact/{prefix}"
+    )
+
+    found_one = False
+    paths = []
+    for obj in objs:
+        p = Path(Path(obj.key).name)
+        if allowed_substrings and not any(sub in p.name for sub in allowed_substrings):
+            continue
+        found_one = True
+        print(f"Downloading {p}")
+        with open(p, "wb") as f:
+            f.write(obj.get()["Body"].read())
+        paths.append(p)
+
+    if not found_one:
+        print(
+            "::warning title=s3 artifacts not found::"
+            "Didn't find any test reports in s3, there might be a bug!"
+        )
+    return paths
+
+
+def download_gha_artifacts(
+    prefix: str, workflow_run_id: int, workflow_run_attempt: int
+) -> List[Path]:
+    artifact_urls = _get_artifact_urls(prefix, workflow_run_id)
+    paths = []
+    for name, url in artifact_urls.items():
+        paths.append(_download_artifact(Path(name), url, workflow_run_attempt))
+    return paths
+
+
+def upload_to_rockset(collection: str, docs: List[Any]) -> None:
+    print(f"Writing {len(docs)} documents to Rockset")
+    client = rockset.Client(
+        api_server="api.rs2.usw2.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
+    )
+    client.Collection.retrieve(collection).add_docs(docs)
+    print("Done!")
+
+
+def upload_to_s3(
+    workflow_run_id: int,
+    workflow_run_attempt: int,
+    collection: str,
+    docs: List[Dict[str, Any]],
+) -> None:
+    print(f"Writing {len(docs)} documents to S3")
+    body = io.StringIO()
+    for doc in docs:
+        json.dump(doc, body)
+        body.write("\n")
+
+    S3_RESOURCE.Object(
+        "ossci-raw-job-status",
+        f"{collection}/{workflow_run_id}/{workflow_run_attempt}",
+    ).put(
+        Body=gzip.compress(body.getvalue().encode()),
+        ContentEncoding="gzip",
+        ContentType="application/json",
+    )
+    print("Done!")
+
+
+def upload_file_to_s3(
+    file_name: str,
+    bucket: str,
+    key: str,
+) -> None:
+    """
+    Upload a local file to S3
+    """
+    print(f"Upload {file_name} to s3://{bucket}/{key}")
+    boto3.client("s3").upload_file(
+        file_name,
+        bucket,
+        key,
+    )
+
+
+def unzip(p: Path) -> None:
+    """Unzip the provided zipfile to a similarly-named directory.
+
+    Returns None. `p` must be an existing file (asserted) that zipfile can open.
+
+    Looks like: /tmp/test-reports.zip -> /tmp/unzipped-test-reports/
+    """
+    assert p.is_file()
+    unzipped_dir = p.with_name("unzipped-" + p.stem)
+    print(f"Extracting {p} to {unzipped_dir}")
+
+    with zipfile.ZipFile(p, "r") as zip:
+        zip.extractall(unzipped_dir)
+
+
+def is_rerun_disabled_tests(root: ET.ElementTree) -> bool:
+    """
+    Check if the test report is coming from rerun_disabled_tests workflow
+    """
+    skipped = root.find(".//*skipped")
+    # Need to check against None here, if not skipped doesn't work as expected
+    if skipped is None:
+        return False
+
+    message = skipped.attrib.get("message", "")
+    return TARGET_WORKFLOW in message or "num_red" in message
diff --git a/.automation_scripts/pytorch-unit-test-scripts/upload_test_stats.py b/.automation_scripts/pytorch-unit-test-scripts/upload_test_stats.py
new file mode 100644
index 0000000000000..8121e8d16928d
--- /dev/null
+++ b/.automation_scripts/pytorch-unit-test-scripts/upload_test_stats.py
@@ -0,0 +1,394 @@
+import argparse
+import os
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Any, Dict, List, Tuple
+
+from upload_stats_lib import (
+    download_gha_artifacts,
+    download_s3_artifacts,
+    is_rerun_disabled_tests,
+    unzip,
+    upload_to_s3,
+)
+
+
+# Backends list
+BACKENDS_LIST = [
+    "dist-gloo",
+    "dist-nccl"
+]
+
+def get_job_id(report: Path) -> int:
+    # [Job id in artifacts]
+    # Retrieve the job id from the report path. In our GHA workflows, we append
+    # the job id to the end of the report name, so `report` looks like:
+    #     unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml
+    # and we want to get `5596745227` out of it.
+    try:
+        return int(report.parts[0].rpartition("_")[2])
+    except ValueError:
+        return -1
+
+
+def parse_xml_report(
+    tag: str,
+    report: Path,
+    workflow_id: int,
+    workflow_run_attempt: int,
+    work_flow_name: str
+) -> Dict[Tuple[str], Dict[str, Any]]:
+    """Convert a test report xml file into a dict of JSON-serializable records, keyed by tuple."""
+    #print(f"Parsing {tag}s for test report: {report}")
+    print(".", end="", flush=True)
+
+    job_id = get_job_id(report)
+    #print(f"Found job id: {job_id}")
+
+    test_cases: Dict[Tuple[str], Dict[str, Any]] = {}
+
+    root = ET.parse(report)
+    # TODO: unlike unittest, pytest-flakefinder used by rerun disabled tests for test_ops
+    # includes skipped messages multiple times (50 times by default). This slows down
+    # this script too much (O(n)) because it tries to gather all the stats. This should
+    # be fixed later in the way we use pytest-flakefinder. A zipped test report from rerun
+    # disabled test is only few MB, but will balloon up to a much bigger XML file after
+    # extracting from a dozen to few hundred MB
+    if is_rerun_disabled_tests(root):
+        return test_cases
+
+    for test_case in root.iter(tag):
+        case = process_xml_element(test_case)
+        if tag == 'testcase':
+            case["workflow_id"] = workflow_id
+            case["workflow_run_attempt"] = workflow_run_attempt
+            case["job_id"] = job_id
+            case["work_flow_name"] = work_flow_name
+
+            # [invoking file]
+            # The name of the file that the test is located in is not necessarily
+            # the same as the name of the file that invoked the test.
+            # For example, `test_jit.py` calls into multiple other test files (e.g.
+            # jit/test_dce.py). For sharding/test selection purposes, we want to
+            # record the file that invoked the test.
+            #
+            # To do this, we leverage an implementation detail of how we write out
+            # tests (https://bit.ly/3ajEV1M), which is that reports are created
+            # under a folder with the same name as the invoking file.
+            case_name = report.parent.name
+            for part in report.parts:
+                for backend in BACKENDS_LIST:
+                    if backend in part:
+                        case_name = case_name + "_" + part
+                        break
+                else:
+                    continue
+                break
+            case["invoking_file"] = case_name
+            test_cases[ ( case["invoking_file"], case["classname"], case["name"], case["work_flow_name"] ) ] = case
+        elif tag == 'testsuite':
+            case["work_flow_name"] = work_flow_name
+            case["invoking_xml"] = report.name
+            case["running_time_xml"] = case["time"]
+            case_name = report.parent.name
+            for part in report.parts:
+                for backend in BACKENDS_LIST:
+                    if backend in part:
+                        case_name = case_name + "_" + part
+                        break
+                else:
+                    continue
+                break
+            case["invoking_file"] = case_name
+            test_cases[ ( case["invoking_file"], case["invoking_xml"], case["work_flow_name"] ) ] = case
+
+    return test_cases
+
+
+def process_xml_element(element: ET.Element) -> Dict[str, Any]:
+    """Convert an XML element (attributes, text, tail, children) into a JSON-serializable dict."""
+    ret: Dict[str, Any] = {}
+
+    # Convert attributes directly into dict elements.
+    # e.g.
+    #     <testcase name="test_foo" classname="test_bar"></testcase>
+    # becomes:
+    #     {"name": "test_foo", "classname": "test_bar"}
+    ret.update(element.attrib)
+
+    # The XML format encodes all values as strings. Convert to ints/floats if
+    # possible to make aggregation possible in Rockset. int is tried first and
+    # wins when it succeeds, so "3" stays 3 instead of being re-parsed as 3.0.
+    # Reassigning existing keys while iterating is safe (the key set is fixed).
+    for k, v in ret.items():
+        for convert in (int, float):
+            try:
+                ret[k] = convert(v)
+                break
+            except ValueError:
+                pass
+
+    # Convert inner and outer text into special dict elements.
+    # e.g.
+    #     <testcase>my_inner_text</testcase> my_tail
+    # becomes:
+    #     {"text": "my_inner_text", "tail": " my_tail"}
+    if element.text and element.text.strip():
+        ret["text"] = element.text
+    if element.tail and element.tail.strip():
+        ret["tail"] = element.tail
+
+    # Convert child elements recursively, placing them at a key:
+    # e.g.
+    #     <testcase>
+    #       <foo>hello</foo>
+    #       <foo>world</foo>
+    #       <bar>another</bar>
+    #     </testcase>
+    # becomes
+    #    {
+    #       "foo": [{"text": "hello"}, {"text": "world"}],
+    #       "bar": {"text": "another"}
+    #    }
+    for child in element:
+        if child.tag not in ret:
+            ret[child.tag] = process_xml_element(child)
+        else:
+            # If there are multiple tags with the same name, they should be
+            # coalesced into a list.
+            if not isinstance(ret[child.tag], list):
+                ret[child.tag] = [ret[child.tag]]
+            ret[child.tag].append(process_xml_element(child))
+    return ret
+
+
+def get_pytest_parallel_times() -> Dict[Any, Any]:
+    pytest_parallel_times: Dict[Any, Any] = {}
+    for report in Path(".").glob("**/python-pytest/**/*.xml"):
+        invoking_file = report.parent.name
+
+        root = ET.parse(report)
+        # TODO: Skip test reports from rerun disabled tests, same reason as mentioned
+        # above
+        if is_rerun_disabled_tests(root):
+            continue
+
+        assert len(list(root.iter("testsuite"))) == 1
+        for test_suite in root.iter("testsuite"):
+            pytest_parallel_times[
+                (invoking_file, get_job_id(report))
+            ] = test_suite.attrib["time"]
+    return pytest_parallel_times
+
+
+def get_tests(
+    workflow_run_id: int, workflow_run_attempt: int
+) -> Tuple[List[Dict[str, Any]], Dict[Any, Any]]:
+    """Download, unzip and parse all test reports of a run inside a temp dir."""
+    with TemporaryDirectory() as temp_dir:
+        print("Using temporary directory:", temp_dir)
+        os.chdir(temp_dir)
+
+        # Download and extract all the reports (both GHA and S3)
+        s3_paths = download_s3_artifacts(
+            "test-report", workflow_run_id, workflow_run_attempt
+        )
+        for path in s3_paths:
+            unzip(path)
+
+        artifact_paths = download_gha_artifacts(
+            "test-report", workflow_run_id, workflow_run_attempt
+        )
+        for path in artifact_paths:
+            unzip(path)
+
+        # Parse the reports and transform them to JSON
+        test_cases = []
+        for xml_report in Path(".").glob("**/*.xml"):
+            # parse_xml_report needs a work_flow_name ("" here: none is known) and
+            # returns a dict keyed by tuples; keep only the per-test-case dicts.
+            test_cases.extend(
+                parse_xml_report(
+                    "testcase", xml_report, workflow_run_id, workflow_run_attempt, ""
+                ).values()
+            )
+
+        pytest_parallel_times = get_pytest_parallel_times()
+
+        return test_cases, pytest_parallel_times
+
+
+def get_tests_for_circleci(
+    workflow_run_id: int, workflow_run_attempt: int
+) -> Tuple[List[Dict[str, Any]], Dict[Any, Any]]:
+    """Parse reports already on disk under **/test/test-reports/ (no download)."""
+    test_cases = []
+    for xml_report in Path(".").glob("**/test/test-reports/**/*.xml"):
+        # work_flow_name is required ("" here: none known); collect the dict's values.
+        parsed = parse_xml_report(
+            "testcase", xml_report, workflow_run_id, workflow_run_attempt, ""
+        )
+        test_cases.extend(parsed.values())
+
+    pytest_parallel_times = get_pytest_parallel_times()
+
+    return test_cases, pytest_parallel_times
+
+
+def get_invoking_file_times(
+    test_case_summaries: List[Dict[str, Any]], pytest_parallel_times: Dict[Any, Any]
+) -> List[Dict[str, Any]]:
+    def get_key(summary: Dict[str, Any]) -> Any:
+        return (
+            summary["invoking_file"],
+            summary["job_id"],
+        )
+
+    def init_value(summary: Dict[str, Any]) -> Any:
+        return {
+            "job_id": summary["job_id"],
+            "workflow_id": summary["workflow_id"],
+            "workflow_run_attempt": summary["workflow_run_attempt"],
+            "invoking_file": summary["invoking_file"],
+            "time": 0.0,
+        }
+
+    ret = {}
+    for summary in test_case_summaries:
+        key = get_key(summary)
+        if key not in ret:
+            ret[key] = init_value(summary)
+        ret[key]["time"] += summary["time"]
+
+    for key, val in ret.items():
+        # when running in parallel in pytest, adding the test times will not give the correct
+        # time used to run the file, which will make the sharding incorrect, so if the test is
+        # run in parallel, we take the time reported by the testsuite
+        if key in pytest_parallel_times:
+            val["time"] = pytest_parallel_times[key]
+
+    return list(ret.values())
+
+
+def summarize_test_cases(test_cases: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Group test cases by classname, file, and job_id. We perform the aggregation
+    manually instead of using the `test-suite` XML tag because xmlrunner does
+    not produce reliable output for it.
+    """
+
+    def get_key(test_case: Dict[str, Any]) -> Any:
+        return (
+            test_case.get("file"),
+            test_case.get("classname"),
+            test_case["job_id"],
+            test_case["workflow_id"],
+            test_case["workflow_run_attempt"],
+            # [see: invoking file]
+            test_case["invoking_file"],
+        )
+
+    def init_value(test_case: Dict[str, Any]) -> Dict[str, Any]:
+        return {
+            "file": test_case.get("file"),
+            "classname": test_case.get("classname"),
+            "job_id": test_case["job_id"],
+            "workflow_id": test_case["workflow_id"],
+            "workflow_run_attempt": test_case["workflow_run_attempt"],
+            # [see: invoking file]
+            "invoking_file": test_case["invoking_file"],
+            "tests": 0,
+            "failures": 0,
+            "errors": 0,
+            "skipped": 0,
+            "successes": 0,
+            "time": 0.0,
+        }
+
+    ret = {}
+    for test_case in test_cases:
+        key = get_key(test_case)
+        if key not in ret:
+            ret[key] = init_value(test_case)
+
+        ret[key]["tests"] += 1
+
+        if "failure" in test_case:
+            ret[key]["failures"] += 1
+        elif "error" in test_case:
+            ret[key]["errors"] += 1
+        elif "skipped" in test_case:
+            ret[key]["skipped"] += 1
+        else:
+            ret[key]["successes"] += 1
+
+        ret[key]["time"] += test_case["time"]
+    return list(ret.values())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Upload test stats to Rockset")
+    parser.add_argument(
+        "--workflow-run-id",
+        required=True,
+        help="id of the workflow to get artifacts from",
+    )
+    parser.add_argument(
+        "--workflow-run-attempt",
+        type=int,
+        required=True,
+        help="which retry of the workflow this is",
+    )
+    parser.add_argument(
+        "--head-branch",
+        required=True,
+        help="Head branch of the workflow",
+    )
+    parser.add_argument(
+        "--circleci",
+        action="store_true",
+        help="If this is being run through circleci",
+    )
+    args = parser.parse_args()
+
+    print(f"Workflow id is: {args.workflow_run_id}")
+
+    if args.circleci:
+        test_cases, pytest_parallel_times = get_tests_for_circleci(
+            args.workflow_run_id, args.workflow_run_attempt
+        )
+    else:
+        test_cases, pytest_parallel_times = get_tests(
+            args.workflow_run_id, args.workflow_run_attempt
+        )
+
+    # Flush stdout so that any errors in rockset upload show up last in the logs.
+    sys.stdout.flush()
+
+    # For PRs, only upload a summary of test_runs. This helps lower the
+    # volume of writes we do to Rockset.
+    test_case_summary = summarize_test_cases(test_cases)
+    invoking_file_times = get_invoking_file_times(
+        test_case_summary, pytest_parallel_times
+    )
+
+    upload_to_s3(
+        args.workflow_run_id,
+        args.workflow_run_attempt,
+        "test_run_summary",
+        test_case_summary,
+    )
+
+    upload_to_s3(
+        args.workflow_run_id,
+        args.workflow_run_attempt,
+        "invoking_file_times",
+        invoking_file_times,
+    )
+
+    if args.head_branch == "master":
+        # For master jobs, upload everything.
+        upload_to_s3(
+            args.workflow_run_id, args.workflow_run_attempt, "test_run", test_cases
+        )
diff --git a/.github/workflows/parity.yml b/.github/workflows/parity.yml
new file mode 100644
index 0000000000000..3dae16d3c6982
--- /dev/null
+++ b/.github/workflows/parity.yml
@@ -0,0 +1,369 @@
+name: Parity Report
+run-name: "${{ inputs.baseline_sha && format('{0} vs {1}', inputs.sha || 'latest', inputs.baseline_sha) || inputs.csv_name || inputs.pr_id && format('PR {0}', inputs.pr_id) || inputs.sha || 'latest' }} · ${{ inputs.arch || 'mi355, mi300, mi200' }}"
+
+on:
+  workflow_dispatch:
+    inputs:
+      # download_testlogs flags
+      sha:
+        description: 'Commit SHA to pull test results for. Example: 67f1ccf46a966e75f37facd497a03f7d1bd72982. Leave empty for latest green on main.'
+        required: false
+        type: string
+      baseline_sha:
+        description: 'Baseline commit SHA to compare against (same workflow/arch). Produces a commit-vs-commit report instead of ROCm-vs-CUDA.'
+        required: false
+        type: string
+      pr_id:
+        description: 'Pull request number (alternative to SHA, uses latest commit). Example: 176306'
+        required: false
+        type: string
+      arch:
+        description: 'ROCm architectures, comma or space separated. Options: mi355, mi300, mi200, nightly, navi31. Example: "nightly, mi355" or "mi300"'
+        required: false
+        default: 'mi355, mi300, mi200'
+        type: string
+      exclude_distributed:
+        description: 'Exclude distributed tests (auto-excluded for navi31)'
+        required: false
+        default: false
+        type: boolean
+      exclude_inductor:
+        description: 'Exclude inductor tests (auto-excluded for navi31)'
+        required: false
+        default: false
+        type: boolean
+      exclude_default:
+        description: 'Exclude default tests'
+        required: false
+        default: false
+        type: boolean
+      include_logs:
+        description: 'Download and include CI log files (.txt) in artifact zip'
+        required: false
+        default: true
+        type: boolean
+      skip_rocm:
+        description: 'Skip downloading ROCm test results (generate CUDA-only report)'
+        required: false
+        default: false
+        type: boolean
+      skip_cuda:
+        description: 'Skip downloading CUDA test results (generate ROCm-only report)'
+        required: false
+        default: false
+        type: boolean
+      # summarize_xml_testreports flags
+      set1_name:
+        description: 'Label for ROCm columns in output CSV. Examples: rocm, nightly, mi300. Default: rocm'
+        required: false
+        default: 'rocm'
+        type: string
+      set2_name:
+        description: 'Label for CUDA columns in output CSV. Examples: cuda, trunk. Default: cuda'
+        required: false
+        default: 'cuda'
+        type: string
+      csv_name:
+        description: 'Custom prefix for output filenames and artifacts. Default: YYYYMMDD_all_tests_status'
+        required: false
+        type: string
+      include_inductor_periodic:
+        description: 'Download inductor-periodic benchmark artifacts (separate from parity CSV)'
+        required: false
+        default: false
+        type: boolean
+      include_xml:
+        description: 'Include raw XML test reports in artifact zip (WARNING: drastically increases artifact size ~10x)'
+        required: false
+        default: false
+        type: boolean
+      auto_classify:
+        description: 'Auto-classify skip reasons for SKIPPED/MISSED tests in the output CSV'
+        required: false
+        default: false
+        type: boolean
+
+jobs:
+  setup-matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      arch-matrix: ${{ steps.parse.outputs.matrix }}
+      prefix: ${{ steps.parse.outputs.prefix }}
+    steps:
+      - name: Parse arch input into matrix
+        id: parse
+        run: |
+          ARCHS="${{ inputs.arch }}"
+          ARCHS=$(echo "$ARCHS" | tr ',[:space:]' '\n' | sed '/^$/d' | tr '\n' ' ')
+          JSON=$(echo "$ARCHS" | tr ' ' '\n' | sed '/^$/d' | sed 's/^/"/;s/$/"/' | paste -sd',' | sed 's/^/[/;s/$/]/')
+          echo "matrix=$JSON" >> "$GITHUB_OUTPUT"
+          echo "Architectures: $JSON"
+
+          if [ -n "${{ inputs.csv_name }}" ]; then
+            PREFIX="${{ inputs.csv_name }}"
+          elif [ -n "${{ inputs.sha }}" ]; then
+            PREFIX="${{ inputs.sha }}"
+          elif [ -n "${{ inputs.pr_id }}" ]; then
+            PREFIX="${{ inputs.pr_id }}"
+          else
+            PREFIX="parity"
+          fi
+          PREFIX=$(echo "$PREFIX" | xargs)
+          echo "prefix=$PREFIX" >> "$GITHUB_OUTPUT"
+          echo "Artifact prefix: $PREFIX"
+
+  generate-parity:
+    needs: setup-matrix
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: ${{ fromJson(needs.setup-matrix.outputs.arch-matrix) }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        working-directory: .automation_scripts/pytorch-unit-test-scripts
+        run: pip install -r requirements.txt
+
+      - name: Download artifacts
+        working-directory: .automation_scripts/pytorch-unit-test-scripts
+        env:
+          GITHUB_TOKEN: ${{ secrets.IFU_GITHUB_TOKEN }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+        run: |
+          ARGS="--arch ${{ matrix.arch }}"
+
+          if [ -n "${{ inputs.sha }}" ]; then
+            ARGS="$ARGS --sha1 ${{ inputs.sha }}"
+          fi
+          if [ -n "${{ inputs.pr_id }}" ]; then
+            ARGS="$ARGS --pr_id ${{ inputs.pr_id }}"
+          fi
+          if [ "${{ inputs.exclude_distributed }}" = "true" ]; then
+            ARGS="$ARGS --exclude_distributed"
+          fi
+          if [ "${{ inputs.exclude_inductor }}" = "true" ]; then
+            ARGS="$ARGS --exclude_inductor"
+          fi
+          if [ "${{ inputs.exclude_default }}" = "true" ]; then
+            ARGS="$ARGS --exclude_default"
+          fi
+          ARGS="$ARGS --ignore_status"
+          if [ "${{ inputs.include_logs }}" != "true" ]; then
+            ARGS="$ARGS --artifacts_only"
+          fi
+          if [ "${{ inputs.skip_rocm }}" = "true" ]; then
+            ARGS="$ARGS --no_rocm"
+          fi
+          if [ "${{ inputs.skip_cuda }}" = "true" ]; then
+            ARGS="$ARGS --no_cuda"
+          fi
+          if [ "${{ inputs.include_inductor_periodic }}" = "true" ]; then
+            ARGS="$ARGS --include_inductor_periodic"
+          fi
+          if [ -n "${{ inputs.baseline_sha }}" ]; then
+            ARGS="$ARGS --baseline_sha ${{ inputs.baseline_sha }}"
+          fi
+
+          echo "Running: python3 ./download_testlogs $ARGS"
+          python3 ./download_testlogs $ARGS 2>&1 | tee download_${{ matrix.arch }}.log
+
+      - name: Identify output folder
+        id: folder
+        working-directory: .automation_scripts/pytorch-unit-test-scripts
+        run: |
+          FOLDER=$(ls -dt [0-9]*_[0-9a-f]*/ 2>/dev/null | head -1 | sed 's:/$::')
+          if [ -z "$FOLDER" ]; then
+            echo "ERROR: No output folder found"
+            exit 1
+          fi
+          echo "folder=$FOLDER" >> "$GITHUB_OUTPUT"
+          SHA=$(echo "$FOLDER" | grep -oP '[0-9a-f]{40}')
+          echo "sha=$SHA" >> "$GITHUB_OUTPUT"
+          DATE=$(TZ='America/Los_Angeles' date '+%Y%m%d')
+          echo "date=$DATE" >> "$GITHUB_OUTPUT"
+          mv download_${{ matrix.arch }}.log "$FOLDER/" 2>/dev/null || true
+          echo "Output folder: $FOLDER, SHA: $SHA, Date: $DATE"
+
+      - name: Generate CSV
+        working-directory: .automation_scripts/pytorch-unit-test-scripts
+        run: |
+          FOLDER="${{ steps.folder.outputs.folder }}"
+          DATE="${{ steps.folder.outputs.date }}"
+          ARCH="${{ matrix.arch }}"
+
+          if [ -n "${{ inputs.csv_name }}" ]; then
+            CSV_NAME="${{ inputs.csv_name }}_${ARCH}"
+          else
+            CSV_NAME="${DATE}_all_tests_status_${ARCH}"
+          fi
+
+          ARGS="--set1 $FOLDER/rocm_xml"
+          if [ -n "${{ inputs.baseline_sha }}" ]; then
+            ARGS="$ARGS --set2 $FOLDER/baseline_xml"
+            CURRENT_SHORT=$(echo "${{ steps.folder.outputs.sha }}" | cut -c1-8)
+            BASELINE_SHORT=$(echo "${{ inputs.baseline_sha }}" | cut -c1-8)
+            ARGS="$ARGS --set1_name ${CURRENT_SHORT}"
+            ARGS="$ARGS --set2_name ${BASELINE_SHORT}"
+          else
+            if [ "${{ inputs.skip_cuda }}" != "true" ]; then
+              ARGS="$ARGS --set2 $FOLDER/cuda_xml"
+            fi
+            ARGS="$ARGS --set1_name ${{ inputs.set1_name }}"
+            ARGS="$ARGS --set2_name ${{ inputs.set2_name }}"
+          fi
+          ARGS="$ARGS --output_csv $FOLDER/${CSV_NAME}.csv"
+          SHORT_ARCH=$(echo "$ARCH" | sed 's/^mi//')
+          if [ -n "${{ inputs.csv_name }}" ]; then
+            RT_NAME="${{ inputs.csv_name }}_running_time_${SHORT_ARCH}"
+          else
+            RT_NAME="${DATE}_running_time_${SHORT_ARCH}"
+          fi
+          ARGS="$ARGS --test_file_running_time_output_csv $FOLDER/${RT_NAME}.csv"
+
+          echo "Running: python3 -u summarize_xml_testreports.py $ARGS"
+          python3 -u summarize_xml_testreports.py $ARGS 2>&1 | tee "$FOLDER/xml_processing_${DATE}.log"
+
+      - name: Auto-classify skip reasons
+        if: ${{ inputs.auto_classify }}
+        working-directory: .automation_scripts/pytorch-unit-test-scripts
+        run: |
+          FOLDER="${{ steps.folder.outputs.folder }}"
+          CSV=$(find "$FOLDER" -maxdepth 1 -name "*.csv" ! -name "*_running_time*" | head -1)
+          if [ -n "$CSV" ]; then
+            echo "Auto-classifying skip reasons in $CSV"
+            python3 auto_classify_skip_reasons.py -i "$CSV" -o "$CSV" --report 2>&1
+          else
+            echo "No parity CSV found in $FOLDER, skipping auto-classify"
+          fi
+
+      - name: Collect upload paths
+        id: upload-paths
+        run: |
+          FOLDER=".automation_scripts/pytorch-unit-test-scripts/${{ steps.folder.outputs.folder }}"
+          PATHS="${FOLDER}/*.csv
+          ${FOLDER}/*.log
+          ${FOLDER}/*.txt
+          ${FOLDER}/inductor_periodic_rocm_dir/
+          ${FOLDER}/inductor_periodic_cuda_dir/"
+          if [ "${{ inputs.include_xml }}" = "true" ]; then
+            PATHS="${PATHS}
+          ${FOLDER}/rocm_xml/
+          ${FOLDER}/cuda_xml/
+          ${FOLDER}/baseline_xml/"
+          fi
+          echo "paths<<EOF" >> "$GITHUB_OUTPUT"
+          echo "$PATHS" >> "$GITHUB_OUTPUT"
+          echo "EOF" >> "$GITHUB_OUTPUT"
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ needs.setup-matrix.outputs.prefix }}-results-${{ matrix.arch }}
+          retention-days: 1
+          path: ${{ steps.upload-paths.outputs.paths }}
+
+  summarize:
+    needs: [setup-matrix, generate-parity]
+    if: ${{ !cancelled() }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Download all per-arch CSV artifacts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: ${{ needs.setup-matrix.outputs.prefix }}-results-*
+          path: artifacts
+
+      - name: Build parity report
+        working-directory: .automation_scripts/pytorch-unit-test-scripts
+        run: |
+          ARCHS="${{ inputs.arch }}"
+          SHA="${{ inputs.sha }}"
+          PR_ID="${{ inputs.pr_id }}"
+          BASELINE_SHA="${{ inputs.baseline_sha }}"
+          if [ -n "$BASELINE_SHA" ]; then
+            SET1=$(echo "$SHA" | cut -c1-8)
+            SET2=$(echo "$BASELINE_SHA" | cut -c1-8)
+          else
+            SET1="${{ inputs.set1_name }}"
+            SET2="${{ inputs.set2_name }}"
+          fi
+
+          ARCHS=$(echo "$ARCHS" | tr ',[:space:]' ' ')
+          PREFIX=$(echo "${{ needs.setup-matrix.outputs.prefix }}" | xargs)
+          CSV_ARGS=()
+          ARCH_ARGS=()
+          for ARCH in $ARCHS; do
+            ARTIFACT_DIR="$GITHUB_WORKSPACE/artifacts/${PREFIX}-results-${ARCH}"  # artifacts/ is at the workspace root; this step runs two levels below it
+            CSV=$(find "$ARTIFACT_DIR"/ -maxdepth 2 -name "*.csv" ! -name "*_running_time*" ! -name "*_summary*" 2>/dev/null | head -1)
+            if [ -z "$CSV" ]; then
+              echo "WARNING: No CSV found for $ARCH, skipping"
+              continue
+            fi
+            echo "Found CSV for $ARCH: $CSV"
+            CSV_ARGS+=("$CSV")
+            ARCH_ARGS+=("$ARCH")
+          done
+
+          if [ ${#CSV_ARGS[@]} -eq 0 ]; then
+            echo "::warning::No CSVs found for any architecture — some or all generate-parity jobs may have failed"
+            echo "## ⚠ No CSVs produced" >> "$GITHUB_STEP_SUMMARY"
+            echo "No parity CSVs were found. Check the generate-parity job logs for errors." >> "$GITHUB_STEP_SUMMARY"
+            exit 0
+          fi
+
+          ARGS=(--csv "${CSV_ARGS[@]}" --arch "${ARCH_ARGS[@]}")
+          ARGS+=(--set1_name "$SET1" --set2_name "$SET2")
+
+          if [ -n "$SHA" ]; then
+            ARGS+=(--sha "$SHA")
+          else
+            DETECTED_SHA=$(basename "$(find "$GITHUB_WORKSPACE/artifacts/" -name '*.csv' | head -1)" | grep -oP '[0-9a-f]{40}' || true)
+            if [ -n "$DETECTED_SHA" ]; then
+              ARGS+=(--sha "$DETECTED_SHA")
+            fi
+          fi
+          if [ -n "$PR_ID" ]; then
+            ARGS+=(--pr_id "$PR_ID")
+          fi
+
+          OUTPUT="${PREFIX}_summary"
+          ARGS+=(--output "$OUTPUT")
+
+          echo "Running: python3 generate_summary.py ${ARGS[*]}"
+          python3 generate_summary.py "${ARGS[@]}"
+
+          cat "${OUTPUT}.md" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Add artifact links to summary
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          ARTIFACTS_JSON=$(gh api repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/artifacts --paginate -q '.artifacts[] | {name, id}')
+          RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}/artifacts"
+
+          {
+            echo ""
+            echo "### ARTIFACTS"
+            echo ""
+            echo "| Artifact | Link |"
+            echo "| --- | --- |"
+            echo "$ARTIFACTS_JSON" | jq -r '"| \(.name) | [Download]('"${RUN_URL}"'/\(.id)) |"'
+            echo ""
+          } >> "$GITHUB_STEP_SUMMARY"

From 16e431d853730bfe16fe83097cd94af41c71b4f9 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Mon, 13 Apr 2026 08:53:25 -0700
Subject: [PATCH 31/43] [CI] Improve summary output and add log based failure
 detection for parity workflow (#3147)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Adds log-based failure detection to the parity workflow. Tests that
timeout (exit code 124), crash (SIGIOT, SIGSEGV), hit Fatal Python
errors, or OOM never produce JUnit XML output, so they are invisible to
the existing XML-based parity report. This PR closes that gap.

### Changes

- **New script: `detect_log_failures.py`** — Parses raw CI `.txt` log
files to detect test failures not captured in XML reports. Classifies
failures as TIMEOUT, CRASH, CONSISTENT_FAILURE, or NON_ZERO_EXIT.
Outputs a CSV with platform, workflow, test file, category, and reason.
- **`generate_summary.py`** — Adds `--log-failures` argument to accept
CSV(s) from `detect_log_failures.py`. Appends a "LOG-BASED FAILURES (not
in XML)" section to both CSV and markdown output.
- **`parity.yml`** — Adds a "Detect log-based failures" step after XML
processing (runs when `include_logs` is enabled). Wires the resulting
CSV into the summarize job via `--log-failures`.
- Adds shard information to the reported results.
- `download_testlogs` now also prints which workflow each download
comes from.

### How it works

1. `detect_log_failures.py` scans `.txt` log files for patterns like:
   - `Got exit code 124` (timeout)
   - `Segmentation fault`, `SIGSEGV`, `SIGIOT`, `Fatal Python error`
     (crash)
   - `FAILED CONSISTENTLY`
   - `OutOfMemoryError`, `bad_alloc` (OOM)
2. Results are saved as `log_failures_<arch>.csv` and uploaded as part
of the per-arch artifact
3. The summarize job collects all log failure CSVs and passes them to
`generate_summary.py`
4. The final parity report includes a dedicated section listing these
failures

## Test plan

- [x] Syntax-checked both Python files (`py_compile`)
- [x] Validated `parity.yml` YAML syntax
- [x] Tested `detect_log_failures.py` against actual CI log files from
parity runs
- [x] Verified all files match fork/main (with correct
`.automation_scripts/` paths)
- [x] Run parity workflow with `include_logs: true` to verify end-to-end
Validation:
https://github.com/ethanwee1/pytorch/actions/runs/24352395766

---------

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
---
 .../auto_classify_skip_reasons.py             |   2 +-
 .../detect_log_failures.py                    | 353 ++++++++++++++++++
 .../download_testlogs                         | 162 ++++++--
 .../generate_summary.py                       | 125 +++++--
 .../summarize_xml_testreports.py              | 112 +++---
 .../upload_test_stats.py                      |  10 +-
 .github/workflows/parity.yml                  |  31 +-
 7 files changed, 672 insertions(+), 123 deletions(-)
 create mode 100755 .automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py

diff --git a/.automation_scripts/pytorch-unit-test-scripts/auto_classify_skip_reasons.py b/.automation_scripts/pytorch-unit-test-scripts/auto_classify_skip_reasons.py
index d9d14deefb268..cf948495ec04e 100644
--- a/.automation_scripts/pytorch-unit-test-scripts/auto_classify_skip_reasons.py
+++ b/.automation_scripts/pytorch-unit-test-scripts/auto_classify_skip_reasons.py
@@ -917,7 +917,7 @@ def main():
         test_file = row.get('test_file', '')
         test_class = row.get('test_class', '')
         test_name = row.get('test_name', '')
-        workflow = row.get('work_flow_name', '')
+        workflow = row.get('test_config', '')
 
         if existing_reason and not args.reclassify_all:
             already_had_count += 1
diff --git a/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py b/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py
new file mode 100755
index 0000000000000..57c813790c9ed
--- /dev/null
+++ b/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py
@@ -0,0 +1,353 @@
+#!/usr/bin/env python3
+"""Scan CI log files (.txt) for test failures not captured in XML reports.
+
+Tests that timeout (exit code 124), crash (SIGIOT, SIGSEGV, Fatal Python error),
+or are killed (SIGKILL, OOM) never produce JUnit XML output. This script detects
+those failures from the raw log files and outputs a CSV/summary.
+
+Usage:
+    python detect_log_failures.py --logs-dir <folder> [--output <path.csv>]
+"""
+
+import argparse
+import csv
+import os
+import re
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+
+RE_RUNNING = re.compile(  # start-of-shard line: "Running <test_file> <shard>/<total> ..."
+    r"Running (?P<test_file>\S+) (?P<shard>\d+)/(?P<total>\d+) \.\.\."
+)
+RE_SUCCESS = re.compile(  # "<test_file> <shard>/<total> was successful"
+    r"(?P<test_file>\S+) (?P<shard>\d+)/(?P<total>\d+) was successful"
+)
+RE_FAILED = re.compile(  # "<test_file> <shard>/<total> failed!" plus any trailing reason text
+    r"(?P<test_file>\S+) (?P<shard>\d+)/(?P<total>\d+) failed!(?P<reason>.*)"
+)
+RE_EXIT_CODE = re.compile(r"Got exit code (?P<code>\d+)")  # scan_logs treats code 124 as a timeout
+RE_TIMEOUT = re.compile(r"Command took >(\d+)min, returning 124")
+RE_FAILED_CONSISTENTLY = re.compile(
+    r"FAILED CONSISTENTLY: (?P<test_path>\S+)"
+)
+RE_STEPCURRENT = re.compile(  # rerun marker; an optional leading "test/" is left out of test_path
+    r"stepcurrent:.*Running only (?:test/)?(?P<test_path>\S+)"
+)
+RE_INDIVIDUAL_TEST = re.compile(  # pytest node id of the form "<path>.py::<Class>::<method>"
+    r"(?P<test_path>\S+\.py::(?P<cls>\w+)::(?P<method>\w+))"
+)
+
+CRASH_PATTERNS = [  # (pattern, label) pairs; parse_log_file records the label of every pattern that matches a line
+    (re.compile(r"Segmentation fault", re.IGNORECASE), "SEGFAULT"),
+    (re.compile(r"SIGSEGV"), "SIGSEGV"),
+    (re.compile(r"SIGIOT"), "SIGIOT"),
+    (re.compile(r"SIGABRT"), "SIGABRT"),
+    (re.compile(r"SIGKILL"), "SIGKILL"),
+    (re.compile(r"Fatal Python error", re.IGNORECASE), "FATAL_PYTHON"),
+    (re.compile(r"core dumped", re.IGNORECASE), "CORE_DUMP"),
+    (re.compile(r"Aborted \(core dumped\)", re.IGNORECASE), "ABORTED"),  # such lines also match CORE_DUMP above
+    (re.compile(r"torch\.cuda\.OutOfMemoryError"), "CUDA_OOM"),
+    (re.compile(r"std::bad_alloc"), "BAD_ALLOC"),
+]
+
+LOG_FILE_MAP = {  # log filename prefix -> (platform, test_config); classify_log_file expects a shard number after the prefix
+    "rocm": ("rocm", "default"),
+    "rocm_dist": ("rocm", "distributed"),
+    "rocm_inductor": ("rocm", "inductor"),
+    "cuda": ("cuda", "default"),
+    "cuda_dist": ("cuda", "distributed"),
+    "cuda_inductor": ("cuda", "inductor"),
+    "baseline": ("baseline", "default"),
+}
+
+
+def classify_log_file(filename):
+    """Map a log filename such as rocm3.txt to (platform, test_config, shard_num)."""
+    name = Path(filename).stem
+    # Try longer prefixes first so e.g. "rocm_dist" is tested before "rocm".
+    for prefix in sorted(LOG_FILE_MAP, key=len, reverse=True):
+        shard_part = name[len(prefix):]
+        if name.startswith(prefix) and shard_part.isdigit():
+            return (*LOG_FILE_MAP[prefix], int(shard_part))
+    return None, None, None
+
+
+RE_TIMESTAMP = re.compile(r"^\d{4}-\d{2}-\d{2}T[\d:.]+Z\s*")
+
+
+def parse_log_file(filepath):
+    """Parse one CI log file.
+
+    Returns (results, consistent_failures): results maps "<test_file> <shard>/<total>"
+    to a dict with status (RUNNING/PASSED/FAILED), reason, exit_codes, crashes,
+    crash_tests and last_test; consistent_failures lists the test paths reported
+    on "FAILED CONSISTENTLY" lines.
+    """
+    results = {}
+    current_test = None
+    last_failed_test = None
+    consistent_failures = []
+
+    with open(filepath, "r", errors="replace") as f:
+        for line in f:
+            # Lightweight tracking of individual pytest test lines.
+            # These are very frequent (~37% of lines) so we extract the
+            # test name directly without timestamp stripping.
+            if ".py::" in line:
+                m_ind = RE_INDIVIDUAL_TEST.search(line)
+                if m_ind:
+                    active = current_test or last_failed_test
+                    if active and active in results:
+                        # Only update if the pytest path belongs to this shard's test file,
+                        # otherwise rerun output from earlier shards contaminates later ones.
+                        shard_file = results[active]["test_file"]
+                        if shard_file + ".py" in m_ind.group("test_path"):
+                            results[active]["last_test"] = f"{m_ind.group('cls')}::{m_ind.group('method')}"
+
+            # Cheap substring pre-filter: skip lines that none of the patterns below can match.
+            if " ... [" not in line and "was successful" not in line \
+               and "failed!" not in line and "Got exit code" not in line \
+               and "returning 124" not in line and "FAILED CONSISTENTLY" not in line \
+               and "Retrying" not in line \
+               and "Segmentation fault" not in line and "SIGIOT" not in line \
+               and "SIGSEGV" not in line and "SIGABRT" not in line \
+               and "SIGKILL" not in line \
+               and "Fatal Python error" not in line and "core dumped" not in line \
+               and "Aborted (core dumped)" not in line \
+               and "OutOfMemoryError" not in line \
+               and "bad_alloc" not in line \
+               and "stepcurrent" not in line:
+                continue
+
+            stripped = RE_TIMESTAMP.sub("", line).rstrip()
+
+            m = RE_RUNNING.search(stripped)
+            if m:
+                key = f"{m.group('test_file')} {m.group('shard')}/{m.group('total')}"
+                current_test = key
+                if key not in results:
+                    results[key] = {
+                        "test_file": m.group("test_file"),
+                        "shard": int(m.group("shard")),
+                        "total": int(m.group("total")),
+                        "status": "RUNNING",
+                        "reason": "",
+                        "exit_codes": [],
+                        "crashes": [],
+                        "crash_tests": [],
+                        "last_test": "",
+                    }
+                continue
+
+            m = RE_SUCCESS.search(stripped)
+            if m:
+                key = f"{m.group('test_file')} {m.group('shard')}/{m.group('total')}"
+                if key in results:
+                    results[key]["status"] = "PASSED"
+                current_test = None
+                last_failed_test = None
+                continue
+
+            m = RE_FAILED.search(stripped)
+            if m:
+                key = f"{m.group('test_file')} {m.group('shard')}/{m.group('total')}"
+                reason = m.group("reason").strip()
+                if key in results:
+                    results[key]["status"] = "FAILED"
+                    if reason:
+                        results[key]["reason"] = reason
+                last_failed_test = key
+                current_test = key
+                continue
+
+            active = current_test or last_failed_test
+
+            # Track stepcurrent rerun lines — identifies crash-causing test
+            m = RE_STEPCURRENT.search(stripped)
+            if m:
+                test_path = m.group("test_path")
+                parts = test_path.split("::")
+                if len(parts) >= 3:
+                    crash_id = f"{parts[1]}::{parts[2]}"
+                elif len(parts) == 2:
+                    crash_id = parts[1]
+                else:
+                    crash_id = None
+                if crash_id and active and active in results:
+                    shard_file = results[active]["test_file"]
+                    if shard_file in test_path:
+                        if crash_id not in results[active]["crash_tests"]:
+                            results[active]["crash_tests"].append(crash_id)
+                continue
+
+            m = RE_EXIT_CODE.search(stripped)
+            if m:
+                code = int(m.group("code"))
+                if active and active in results:
+                    results[active]["exit_codes"].append(code)
+
+            m = RE_TIMEOUT.search(stripped)
+            if m and active and active in results:
+                if "TIMEOUT" not in results[active]["crashes"]:
+                    results[active]["crashes"].append("TIMEOUT")
+
+            m = RE_FAILED_CONSISTENTLY.search(stripped)
+            if m:
+                consistent_failures.append(m.group("test_path"))
+
+            if active and active in results:
+                for pattern, label in CRASH_PATTERNS:
+                    if pattern.search(stripped):
+                        if label not in results[active]["crashes"]:
+                            results[active]["crashes"].append(label)
+
+    return results, consistent_failures
+
+
+def scan_logs(logs_dir):
+    """Scan every classifiable .txt log in logs_dir for non-passing test files.
+
+    Returns a list of row dicts (log_file, platform, test_config, test_file, shard, status,
+    category, reason, exit_codes): one per shard with failure evidence or consistent failure.
+    """
+    all_failures = []
+
+    for fname in sorted(os.listdir(logs_dir)):
+        if not fname.endswith(".txt"):
+            continue
+
+        platform, test_config, shard_num = classify_log_file(fname)
+        if platform is None:
+            continue
+
+        filepath = os.path.join(logs_dir, fname)
+        results, consistent_failures = parse_log_file(filepath)
+
+        for key, info in results.items():
+            if info["status"] == "PASSED":
+                continue
+
+            categories = []
+            if 124 in info["exit_codes"] or "TIMEOUT" in info["crashes"]:
+                categories.append("TIMEOUT")
+            for c in info["crashes"]:
+                if c != "TIMEOUT":
+                    categories.append(c)
+            if not categories:
+                # Skip tests stuck in RUNNING with no evidence of failure —
+                # these are typically from multi-shard logs where a different
+                # shard's "Running ..." line appeared but the result was elsewhere.
+                if info["status"] != "FAILED":
+                    continue
+                categories.append("FAILED")
+
+            reason = info["reason"]
+            # Populate reason with identified crash/timeout test name
+            crash_tests = info.get("crash_tests", [])
+            last_test = info.get("last_test", "")
+            identified_test = ""
+            if crash_tests:
+                identified_test = crash_tests[0]
+            elif last_test:
+                identified_test = last_test
+
+            if identified_test and "::" in identified_test:
+                if not reason:
+                    reason = identified_test
+                elif "::" not in reason:
+                    reason = f"{identified_test} | {reason}"
+
+            all_failures.append({
+                "log_file": fname,
+                "platform": platform,
+                "test_config": test_config,
+                "test_file": info["test_file"],
+                "shard": f"{info['shard']}/{info['total']}",
+                "status": info["status"],
+                "category": "+".join(categories),
+                "reason": reason,
+                "exit_codes": ",".join(str(c) for c in info["exit_codes"]),
+            })
+
+        for test_path in consistent_failures:
+            parts = test_path.split("::")
+            # Strip only a leading "test/" and a trailing ".py"; leave the rest of the path intact.
+            file_part = re.sub(r"^test/|\.py$", "", parts[0])
+            test_class = parts[1] if len(parts) > 1 else ""
+            test_name = parts[2] if len(parts) > 2 else ""
+
+            all_failures.append({
+                "log_file": fname,
+                "platform": platform,
+                "test_config": test_config,
+                "test_file": file_part,
+                "shard": "",
+                "status": "FAILED_CONSISTENTLY",
+                "category": "CONSISTENT_FAILURE",
+                "reason": f"{test_class}::{test_name}" if test_class else "",
+                "exit_codes": "",
+            })
+
+    return all_failures
+
+
+def write_csv_report(failures, output_path):
+    """Write the failure rows to output_path as CSV (header first) and print a one-line recap."""
+    columns = ("log_file platform test_config test_file shard "
+               "status category reason exit_codes").split()
+    with open(output_path, "w", newline="") as out:
+        report = csv.DictWriter(out, fieldnames=columns)
+        report.writeheader()
+        for row in failures:
+            report.writerow(row)
+    print(f"Log failure report: {output_path} ({len(failures)} entries)")
+
+
+def print_summary(failures):
+    """Print failures grouped by category (sorted by name); reasons are cut to 120 chars."""
+    if not failures:
+        print("No log-based failures detected.")
+        return
+    grouped = defaultdict(list)
+    for entry in failures:
+        grouped[entry["category"]].append(entry)
+
+    print(f"\n{'='*60}")
+    print("LOG FAILURE DETECTION SUMMARY")
+    print(f"{'='*60}")
+    print(f"Total failures detected: {len(failures)}")
+    print()
+    for category in sorted(grouped):
+        members = grouped[category]
+        print(f"  {category}: {len(members)}")
+        for entry in members:
+            print(f"    - {entry['test_file']} ({entry['platform']}/{entry['test_config']}) [{entry['log_file']}]")
+            if entry["reason"]:
+                print(f"      Reason: {entry['reason'][:120]}")
+    print()
+
+
+def main():
+    """CLI entry point: scan --logs-dir, print a summary, write the CSV.
+
+    Returns 1 when any failure was detected and 0 otherwise; the caller
+    passes this value to sys.exit.
+    """
+    cli = argparse.ArgumentParser(
+        description="Detect test failures from CI log files not captured in XML reports"
+    )
+    cli.add_argument("--logs-dir", required=True,
+                     help="Directory containing .txt log files")
+    cli.add_argument("--output", default="log_failures.csv",
+                     help="Output CSV path (default: log_failures.csv)")
+    opts = cli.parse_args()
+    found = scan_logs(opts.logs_dir)
+    print_summary(found)
+    write_csv_report(found, opts.output)
+    return 1 if found else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
index 6368590567f04..d391a7c7c10c5 100755
--- a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
+++ b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
@@ -106,16 +106,40 @@ def get_workflow_jobs(wf):
             jobs += response.json()["jobs"]
     return jobs
 
+def get_check_runs_for_commit(sha, prefix):
+    """Return the commit's check runs whose name contains `prefix`.
+
+    Fallback for jobs the workflow jobs API does not list (reusable
+    workflows invoked via workflow_call): the check-runs API reports
+    every job regardless of workflow nesting. Pages are requested 100
+    at a time until a short page comes back.
+    """
+    url = f"https://api.github.com/repos/pytorch/pytorch/commits/{sha}/check-runs"
+    page_size = 100
+    matching = []
+    page = 1
+    while True:
+        query = {'per_page': page_size, 'page': page}
+        response = requests.get(url, headers=authentication_headers, params=query)
+        payload = response.json()
+        batch = payload.get('check_runs', [])
+        matching += [run for run in batch if prefix in run.get('name', '')]
+        # A short (or empty) page means there is nothing left to fetch.
+        if len(batch) < page_size:
+            return matching
+        page += 1
+
 def get_job_ids_by_prefix(wf, prefix):
     """Get job IDs (as strings) for jobs whose name contains the given prefix."""
     jobs = get_workflow_jobs(wf)
     return [str(j['id']) for j in jobs if prefix in j['name']]
 
-def download_logs(wf, test_log_list, test_folder):
+def download_logs(wf, test_log_list, test_folder, jobs=None):
     if wf is None: 
         raise Exception("wf is None!")
     
-    jobs = get_workflow_jobs(wf)
+    if jobs is None:
+        jobs = get_workflow_jobs(wf)
 
     for test_log in test_log_list:
         write_out_file = test_folder + "/" + test_log[0]
@@ -288,16 +312,17 @@ def download_workflow_run(created=None, max_pages=10, workflow=None, sha=None, i
         workflow_runs = None
         try:
             workflow_runs = response.json()['workflow_runs']
-            #print(workflow_runs)
         except:
             raise Exception(response.text)
+        if not workflow_runs:
+            continue
+        # Prefer completed runs over in-progress ones. When multiple
+        # runs exist for the same SHA, the most recent may still be
+        # running and have no artifacts yet.
+        completed = [wf for wf in workflow_runs if wf.get('status') == 'completed']
+        if completed:
+            return completed[0]
         return workflow_runs[0]
-        for wf in workflow_runs:
-            wf_name = wf["name"]
-            if not sha and (wf_name == workflow):
-                return wf
-            if sha and (wf_name == workflow) and (wf["head_sha"] == sha):
-                return wf
 
     # Should not reach here ideally
     raise Exception(error_msg)
@@ -381,7 +406,7 @@ def main():
     elif arch == 'mi355':
         ROCmWorkflowNames = {
             "default": "trunk",
-            "distributed": "trunk",
+            "distributed": "periodic-rocm-mi355",
             "inductor": "inductor-rocm-mi355"
         }
     elif arch == 'mi200':
@@ -418,7 +443,7 @@ def main():
         },
         "mi355": {
             "default": "linux-jammy-rocm-py3.10-mi355",
-            "distributed": "linux-jammy-rocm-py3.10-mi355",
+            "distributed": "linux-noble-rocm-py3.12-mi355",
             "inductor": "linux-noble-rocm-py3.12-mi355"
         },
         "navi31": {
@@ -451,6 +476,9 @@ def main():
         args.no_cuda = True
 
     print(f"Using ROCm architecture: {arch}")
+    print(f"Using ROCm workflows: {ROCmWorkflowNames}")
+    if not args.no_cuda:
+        print(f"Using CUDA workflows: {CUDAWorkflowNames}")
     print(f"Using ROCm job prefixes: {rocm_job_prefix}")
     print(f"Using ROCm shard counts: {rocm_shards}")
 
@@ -481,7 +509,7 @@ def main():
     if not args.exclude_distributed and not args.no_rocm:
         periodic_sha = sha
         print("==============================================")
-        print(f"Finding ROCm tests in periodic workflow by sha: {sha}")
+        print(f"Finding ROCm distributed tests in workflow '{ROCmWorkflowNames['distributed']}' by sha: {sha}")
         print("==============================================")
         # find distributed test in periodic workflow with success status
         error_msg="Error: Periodic workflow not found in scanned workflow runs."
@@ -492,7 +520,7 @@ def main():
         except (IndexError, Exception):
             periodic_wf = None
         periodic_fallbacks = {
-            "mi355": ("periodic-rocm-mi355", "linux-noble-rocm-py3.12-mi355"),
+            "mi355": ("trunk", "linux-jammy-rocm-py3.10-mi355"),
             "mi200": ("periodic-rocm-mi200", "linux-jammy-rocm-py3.10"),
         }
         if periodic_wf is None and arch in periodic_fallbacks:
@@ -502,7 +530,8 @@ def main():
             periodic_fallback_used = True
         if periodic_wf is None:
             raise Exception(error_msg)
-        print(f"Using workflow with id:{periodic_wf['id']} as periodic_wf")
+        dist_wf_name = ROCmWorkflowNames['distributed'] if not periodic_fallback_used else periodic_fallbacks[arch][0]
+        print(f"Using workflow '{dist_wf_name}' with id:{periodic_wf['id']} for ROCm distributed")
 
         if periodic_fallback_used and arch in periodic_fallbacks:
             dist_job_prefix = periodic_fallbacks[arch][1]
@@ -542,7 +571,7 @@ def main():
     if not args.no_rocm and not args.exclude_default:
         rocm_sha = sha
         print("===========================================")
-        print(f"Finding ROCm tests in rocm workflow by sha: {rocm_sha}")
+        print(f"Finding ROCm default tests in workflow '{ROCmWorkflowNames['default']}' by sha: {rocm_sha}")
         print("===========================================")
         # find tests in rocm workflow with given sha and success status
         #https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository
@@ -563,7 +592,8 @@ def main():
             rocm_job_prefix['default'] = fallback_prefix
         if rocm_wf is None:
             raise Exception(error_msg)
-        print(f"Using workflow with id:{rocm_wf['id']} as rocm_wf{' (fallback)' if default_fallback_used else ''}")
+        default_wf_name = ROCmWorkflowNames['default'] if not default_fallback_used else default_fallbacks[arch][0]
+        print(f"Using workflow '{default_wf_name}' with id:{rocm_wf['id']} for ROCm default{' (fallback)' if default_fallback_used else ''}")
 
         folder_list = get_or_create_test_folder(rocm_wf)
 
@@ -599,11 +629,11 @@ def main():
         # find tests in inductor workflow with given sha and success status
         #https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository
         print("===========================================")
-        print(f"Finding ROCm tests in inductor-rocm workflow by sha: {inductor_rocm_sha}")
+        print(f"Finding ROCm inductor tests in workflow '{ROCmWorkflowNames['inductor']}' by sha: {inductor_rocm_sha}")
         print("===========================================")
         error_msg="Error: inductor workflow not found in scanned workflow runs. Try increasing max_pages."
         inductor_wf_rocm = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["inductor"], sha=inductor_rocm_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
-        print(f"Using workflow with id:{inductor_wf_rocm['id']} as inductor_wf_rocm")
+        print(f"Using workflow '{ROCmWorkflowNames['inductor']}' with id:{inductor_wf_rocm['id']} for ROCm inductor")
 
         folder_list = get_or_create_test_folder(inductor_wf_rocm)
 
@@ -632,27 +662,79 @@ def main():
 
     if not args.no_cuda:
         cuda_job_prefix = "linux-jammy-cuda13.0-py3.10-gcc11"
-        pull_sha = sha
         print("==========================================")
-        print(f"Finding CUDA tests in pull workflow by sha: {pull_sha}")
+        print(f"Finding CUDA tests in workflow '{CUDAWorkflowNames['default']}' by sha: {sha}")
         print("==========================================")
-        # find tests in pull workflow with given sha and success status
-        #https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository
-        error_msg="Error: Pull workflow not found in scanned workflow runs. Try increasing max_pages."
-        pull_wf = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=CUDAWorkflowNames["default"], sha=pull_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
-        print(f"Using workflow with id:{pull_wf['id']} as pull_wf")
 
-        # Get job IDs for the target CUDA version to filter S3 artifacts
-        cuda_job_ids = get_job_ids_by_prefix(pull_wf, cuda_job_prefix)
+        # There can be multiple trunk runs for the same SHA. Find the one
+        # that actually contains CUDA test jobs by checking each run's jobs
+        # list, falling back to check-runs API to resolve the correct run.
+        trunk_wf = None
+        all_cuda_jobs = []
+        cuda_test_jobs = []
+
+        trunk_runs = []
+        params = {'per_page': 10}
+        if not args.ignore_status:
+            params['status'] = status
+        params['head_sha'] = sha
+        resp = requests.get(
+            f"https://api.github.com/repos/pytorch/pytorch/actions/workflows/{CUDAWorkflowNames['default']}.yml/runs",
+            headers=authentication_headers, params=params,
+        )
+        trunk_runs = resp.json().get('workflow_runs', [])
+
+        for run in trunk_runs:
+            jobs = get_workflow_jobs(run)
+            test_jobs = [j for j in jobs if cuda_job_prefix in j['name'] and '/ test' in j['name']]
+            if test_jobs:
+                trunk_wf = run
+                all_cuda_jobs = jobs
+                cuda_test_jobs = test_jobs
+                print(f"Found CUDA test jobs in trunk run {run['id']}")
+                break
+
+        if not cuda_test_jobs and trunk_runs:
+            # CUDA test jobs may be in a different run than the one returned
+            # by the jobs API. Use check-runs API to find the actual run.
+            print("No CUDA test jobs in any trunk run's jobs API, trying check-runs API...")
+            check_runs = get_check_runs_for_commit(sha, cuda_job_prefix)
+            cuda_test_jobs = [cr for cr in check_runs if '/ test' in cr['name']]
+            if cuda_test_jobs:
+                # Extract the actual workflow run ID from the check-run details URL
+                import re as _re
+                run_match = _re.search(r'/runs/(\d+)/', cuda_test_jobs[0].get('details_url', ''))
+                if run_match:
+                    actual_run_id = int(run_match.group(1))
+                    # Find or fetch the correct workflow run
+                    for run in trunk_runs:
+                        if run['id'] == actual_run_id:
+                            trunk_wf = run
+                            break
+                    if trunk_wf is None:
+                        resp = requests.get(
+                            f"https://api.github.com/repos/pytorch/pytorch/actions/runs/{actual_run_id}",
+                            headers=authentication_headers,
+                        )
+                        trunk_wf = resp.json()
+                    print(f"CUDA test jobs are in trunk run {trunk_wf['id']} (found via check-runs)")
+                all_cuda_jobs = list(cuda_test_jobs)
+
+        if trunk_wf is None:
+            trunk_wf = trunk_runs[0] if trunk_runs else None
+        if trunk_wf is None:
+            raise Exception("Error: No trunk workflow run found for CUDA tests")
+
+        print(f"Using workflow '{CUDAWorkflowNames['default']}' with id:{trunk_wf['id']} for CUDA default")
+
+        cuda_job_ids = [str(j['id']) for j in cuda_test_jobs]
         cuda_artifact_substrings = [f"_{jid}" for jid in cuda_job_ids] if cuda_job_ids else ["nvidia.gpu"]
         print(f"Using CUDA job prefix: {cuda_job_prefix}")
-        print(f"Found {len(cuda_job_ids)} CUDA jobs matching prefix")
+        print(f"Found {len(cuda_test_jobs)} CUDA test jobs matching prefix")
 
-        folder_list = get_or_create_test_folder(pull_wf)
+        folder_list = get_or_create_test_folder(trunk_wf)
 
         # Download logs
-        # If the cuda logs aren't found you might want to check the HUD for the correct tags
-        # Link to HUD: https://hud.pytorch.org/hud/pytorch/pytorch/main/1?per_page=50&name_filter=cuda
         if not args.artifacts_only:
             test_log_list_cuda_default = [
               ["cuda1.txt", f"{cuda_job_prefix} / test (default, 1, 5"],
@@ -670,7 +752,7 @@ def main():
                 ]
                 test_log_list_cuda += test_log_list_cuda_distributed
 
-            download_logs(pull_wf, test_log_list_cuda, folder_list[0])
+            download_logs(trunk_wf, test_log_list_cuda, folder_list[0], jobs=all_cuda_jobs)
 
         # Download artifacts
         test_artifacts_list_cuda_default = [
@@ -695,7 +777,7 @@ def main():
 
         if test_artifacts_list_cuda:
             download_artifacts(
-                pull_wf,
+                trunk_wf,
                 test_artifacts_list_cuda,
                 test_folder=folder_list[1],
                 allowed_substrings=cuda_artifact_substrings,
@@ -706,13 +788,13 @@ def main():
         if not args.exclude_inductor:
             inductor_sha = sha
             print("==========================================")
-            print(f"Finding CUDA tests in inductor workflow by sha: {inductor_sha}")
+            print(f"Finding CUDA inductor tests in workflow '{CUDAWorkflowNames['inductor']}' by sha: {inductor_sha}")
             print("==========================================")
             # find tests in inductor workflow with given sha and success status
             #https://docs.github.com/en/rest/actions/workflow-runs#list-workflow-runs-for-a-repository
             error_msg="Error: inductor workflow not found in scanned workflow runs. Try increasing max_pages."
             inductor_wf_cuda = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=CUDAWorkflowNames["inductor"], sha=inductor_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
-            print(f"Using workflow with id:{inductor_wf_cuda['id']} as inductor_wf_cuda")
+            print(f"Using workflow '{CUDAWorkflowNames['inductor']}' with id:{inductor_wf_cuda['id']} for CUDA inductor")
 
             folder_list = get_or_create_test_folder(inductor_wf_cuda)
 
@@ -762,7 +844,7 @@ def main():
                     ignore_status=args.ignore_status, status=status,
                     error_msg=f"Baseline default workflow not found for {baseline_sha}",
                 )
-                print(f"Baseline default workflow id: {baseline_default_wf['id']}")
+                print(f"Baseline default workflow '{ROCmWorkflowNames['default']}' id: {baseline_default_wf['id']}")
                 default_shards = rocm_shards["default"]
 
                 if not args.artifacts_only:
@@ -794,7 +876,7 @@ def main():
                     ignore_status=args.ignore_status, status=status,
                     error_msg=f"Baseline distributed workflow not found for {baseline_sha}",
                 )
-                print(f"Baseline distributed workflow id: {baseline_dist_wf['id']}")
+                print(f"Baseline distributed workflow '{ROCmWorkflowNames['distributed']}' id: {baseline_dist_wf['id']}")
                 dist_shards = rocm_shards["distributed"]
 
                 if not args.artifacts_only:
@@ -826,7 +908,7 @@ def main():
                     ignore_status=args.ignore_status, status=status,
                     error_msg=f"Baseline inductor workflow not found for {baseline_sha}",
                 )
-                print(f"Baseline inductor workflow id: {baseline_inductor_wf['id']}")
+                print(f"Baseline inductor workflow '{ROCmWorkflowNames['inductor']}' id: {baseline_inductor_wf['id']}")
                 inductor_shards = rocm_shards["inductor"]
 
                 if not args.artifacts_only:
@@ -855,7 +937,7 @@ def main():
     # Download inductor-periodic benchmark artifacts (separate from parity CSV)
     if args.include_inductor_periodic:
         print("==============================================")
-        print(f"Finding inductor-periodic workflow by sha: {sha}")
+        print(f"Finding inductor-periodic tests in workflow 'inductor-periodic' by sha: {sha}")
         print("==============================================")
         error_msg = "Error: inductor-periodic workflow not found for this SHA. It may not have run on this commit."
         try:
@@ -870,7 +952,7 @@ def main():
             inductor_periodic_wf = None
 
         if inductor_periodic_wf:
-            print(f"Using workflow with id:{inductor_periodic_wf['id']} as inductor_periodic_wf")
+            print(f"Using workflow 'inductor-periodic' with id:{inductor_periodic_wf['id']} for inductor-periodic")
 
             folder_list = get_or_create_test_folder(inductor_periodic_wf)
             test_folder = folder_list[0]
diff --git a/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
index db1773317a91a..077a203a44b30 100644
--- a/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
+++ b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
@@ -2,11 +2,12 @@
 
 import argparse
 import csv
+import os
 import sys
 
 
-WORKFLOWS = ['default', 'distributed', 'inductor']
-WORKFLOW_DISPLAY = {
+TEST_CONFIGS = ['default', 'distributed', 'inductor']
+TEST_CONFIG_DISPLAY = {
     'default': 'TEST DEFAULT',
     'distributed': 'TEST DISTRIBUTED',
     'inductor': 'TEST INDUCTOR',
@@ -39,6 +40,10 @@ def parse_args():
         '--output', type=str, default='parity_summary',
         help='Output path prefix (produces .csv and .md)'
     )
+    parser.add_argument(
+        '--log-failures', nargs='*', default=[],
+        help='CSV file(s) from detect_log_failures.py to include in summary'
+    )
     return parser.parse_args()
 
 
@@ -60,7 +65,7 @@ def detect_columns(headers, set1_name, set2_name):
     return s1_status, s2_status, s1_time, s2_time
 
 
-def workflow_stats_keys(s1_name, s2_name, has_set2=True):
+def test_config_stats_keys(s1_name, s2_name, has_set2=True):
     s1 = s1_name.upper()
     s2 = s2_name.upper()
     if not has_set2:
@@ -85,13 +90,13 @@ def workflow_stats_keys(s1_name, s2_name, has_set2=True):
     ]
 
 
-def compute_workflow_stats(rows, s1_col, s2_col, s1_name, s2_name, has_set2=True):
+def compute_test_config_stats(rows, s1_col, s2_col, s1_name, s2_name, has_set2=True):
     s1 = s1_name.upper()
     s2 = s2_name.upper()
 
     if not has_set2:
         vals = {}
-        keys = workflow_stats_keys(s1_name, s2_name, has_set2=False)
+        keys = test_config_stats_keys(s1_name, s2_name, has_set2=False)
         vals[keys[0]] = sum(1 for r in rows if r[s1_col] == 'PASSED')
         vals[keys[1]] = sum(1 for r in rows if r[s1_col] == 'SKIPPED')
         vals[keys[2]] = sum(1 for r in rows if r[s1_col] == 'FAILED')
@@ -121,7 +126,7 @@ def compute_workflow_stats(rows, s1_col, s2_col, s1_name, s2_name, has_set2=True
     pct = (skip_miss / total_s2 * 100) if total_s2 else 0
 
     vals = {}
-    keys = workflow_stats_keys(s1_name, s2_name)
+    keys = test_config_stats_keys(s1_name, s2_name)
     vals[keys[0]] = s1_skip_not_s2
     vals[keys[1]] = s1_skip
     vals[keys[2]] = s2_skip
@@ -187,8 +192,8 @@ def safe_float(v):
 
     total_disagree = 0
     total_s2 = 0
-    for wf in WORKFLOWS:
-        wf_rows = [r for r in rows if r['work_flow_name'] == wf]
+    for wf in TEST_CONFIGS:
+        wf_rows = [r for r in rows if r['test_config'] == wf]
         s1_skip_not_s2 = sum(
             1 for r in wf_rows
             if r[s1_col] == 'SKIPPED' and r[s2_col] != 'SKIPPED'
@@ -236,12 +241,14 @@ def collect_failed_tests(arch_data, archs, s1_name, s2_name):
             s1 = r[s1_col].strip()
             s2 = r[s2_col].strip() if has_set2 else ''
             if s1 == 'FAILED' or s2 == 'FAILED':
+                shard = r.get(f'shard_{s1_name}', '') if s1 == 'FAILED' else r.get(f'shard_{s2_name}', '')
                 entry = {
                     'arch': arch,
                     'test_file': r.get('test_file', ''),
                     'test_class': r.get('test_class', ''),
                     'test_name': r.get('test_name', ''),
-                    'workflow': r.get('work_flow_name', ''),
+                    'test_config': r.get('test_config', ''),
+                    'shard': shard,
                     f'status_{s1_name}': s1,
                 }
                 if has_set2:
@@ -250,6 +257,26 @@ def collect_failed_tests(arch_data, archs, s1_name, s2_name):
     return failed
 
 
+def load_log_failures(filepaths):
+    """Load rows from detect_log_failures.py CSVs, tagging each with 'arch'.
+
+    The arch comes from the filename (log_failures_mi355.csv -> mi355, else '').
+    Paths that are not regular files are skipped with a warning on stderr.
+    """
+    prefix, suffix = 'log_failures_', '.csv'
+    entries = []
+    for fp in filepaths:
+        if not os.path.isfile(fp):
+            print(f'WARNING: skipping missing log failure file {fp}', file=sys.stderr)
+            continue
+        name = os.path.basename(fp)
+        arch = name[len(prefix):-len(suffix)] if name.startswith(prefix) and name.endswith(suffix) else ''
+        with open(fp, newline='') as f:
+            for row in csv.DictReader(f):
+                entries.append({**row, 'arch': arch})
+    return entries
+
+
 def fmt_val(v):
     if isinstance(v, int):
         return f'{v:,}'
@@ -266,16 +293,16 @@ def build_rows(args, archs, arch_data):
     if args.pr_id:
         out.append(('__header__', f'PR ID: {args.pr_id}'))
 
-    wf_keys = workflow_stats_keys(args.set1_name, args.set2_name, has_set2=any_has_set2)
-    for wf in WORKFLOWS:
-        out.append(('__section__', WORKFLOW_DISPLAY[wf]))
+    wf_keys = test_config_stats_keys(args.set1_name, args.set2_name, has_set2=any_has_set2)
+    for wf in TEST_CONFIGS:
+        out.append(('__section__', TEST_CONFIG_DISPLAY[wf]))
         arch_stats = {}
         for arch in archs:
             d = arch_data[arch]
             s1_col, s2_col, _, _ = d['cols']
             has_set2 = d.get('has_set2', True)
-            wf_rows = [r for r in d['rows'] if r['work_flow_name'] == wf]
-            arch_stats[arch] = compute_workflow_stats(
+            wf_rows = [r for r in d['rows'] if r['test_config'] == wf]
+            arch_stats[arch] = compute_test_config_stats(
                 wf_rows, s1_col, s2_col, args.set1_name, args.set2_name,
                 has_set2=has_set2,
             )
@@ -298,7 +325,21 @@ def build_rows(args, archs, arch_data):
     return out
 
 
-def write_csv(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_name='set2', has_set2=True):
+def _parse_log_failure_names(lf):
+    """Return (test_class, test_name) parsed from a log failure's 'reason' field.
+
+    Accepts 'TestClass::test_method', optionally followed by ' | extra text'.
+    Returns ('', '') when the reason is missing/None or the part before
+    ' | ' holds no '::' (previously an IndexError for 'text | a::b').
+    """
+    # DictReader fills short rows with None, so normalise before string ops.
+    test_part = (lf.get('reason') or '').split(' | ', 1)[0]
+    if '::' not in test_part:
+        return '', ''
+    return tuple(test_part.split('::', 1))
+
+
+def write_csv(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_name='set2', has_set2=True, log_failures=None):
     csv_rows = []
     csv_rows.append([''] + list(archs))
     for label, vals in rows:
@@ -313,26 +354,39 @@ def write_csv(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_na
 
     if failed_tests:
         csv_rows.append(['FAILED TESTS'])
-        header = ['Arch', 'Workflow', 'Test File', 'Test Class',
-                  'Test Name', f'Status ({s1_name})']
+        header = ['Arch', 'Test Config', 'Test File', 'Test Class',
+                  'Test Name', 'Shard', f'Status ({s1_name})']
         if has_set2:
             header.append(f'Status ({s2_name})')
         csv_rows.append(header)
         for t in failed_tests:
-            row = [t['arch'], t['workflow'], t['test_file'],
-                   t['test_class'], t['test_name'],
+            row = [t['arch'], t['test_config'], t['test_file'],
+                   t['test_class'], t['test_name'], t.get('shard', ''),
                    t[f'status_{s1_name}']]
             if has_set2:
                 row.append(t.get(f'status_{s2_name}', ''))
             csv_rows.append(row)
         csv_rows.append([])
 
+    if log_failures:
+        csv_rows.append(['LOG-BASED FAILURES (not in XML)'])
+        csv_rows.append(['Arch', 'Platform', 'Test Config', 'Test File', 'Test Class', 'Test Name', 'Shard', 'Category', 'Log File'])
+        for lf in log_failures:
+            test_class, test_name = _parse_log_failure_names(lf)
+            csv_rows.append([
+                lf.get('arch', ''), lf.get('platform', ''), lf.get('test_config', ''),
+                lf.get('test_file', ''), test_class, test_name,
+                lf.get('shard', ''), lf.get('category', ''),
+                lf.get('log_file', ''),
+            ])
+        csv_rows.append([])
+
     with open(output_path, 'w', newline='') as f:
         csv.writer(f).writerows(csv_rows)
     print(f'CSV written to {output_path}')
 
 
-def write_markdown(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_name='set2', has_set2=True):
+def write_markdown(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_name='set2', has_set2=True, log_failures=None):
     lines = []
     current_section = []
 
@@ -366,16 +420,16 @@ def flush_table():
     if failed_tests:
         lines.append('### FAILED TESTS')
         lines.append('')
-        cols = ['Arch', 'Workflow', 'Test File', 'Test Class', 'Test Name',
-                f'Status ({s1_name})']
+        cols = ['Arch', 'Test Config', 'Test File', 'Test Class', 'Test Name',
+                'Shard', f'Status ({s1_name})']
         if has_set2:
             cols.append(f'Status ({s2_name})')
         lines.append('| ' + ' | '.join(cols) + ' |')
         lines.append('| ' + ' | '.join(['---'] * len(cols)) + ' |')
         for t in failed_tests:
-            line = (f"| {t['arch']} | {t['workflow']} | {t['test_file']} "
+            line = (f"| {t['arch']} | {t['test_config']} | {t['test_file']} "
                     f"| {t['test_class']} | {t['test_name']} "
-                    f"| {t[f'status_{s1_name}']}")
+                    f"| {t.get('shard', '')} | {t[f'status_{s1_name}']}")
             if has_set2:
                 line += f" | {t.get(f'status_{s2_name}', '')}"
             line += ' |'
@@ -387,6 +441,24 @@ def flush_table():
         lines.append('No failed tests found.')
         lines.append('')
 
+    if log_failures:
+        lines.append('### LOG-BASED FAILURES (not in XML)')
+        lines.append('')
+        lines.append('These test failures were detected from CI log files but have no XML report')
+        lines.append('(typically due to timeouts, crashes, or process kills).')
+        lines.append('')
+        lines.append('| Arch | Platform | Test Config | Test File | Test Class | Test Name | Shard | Category |')
+        lines.append('| --- | --- | --- | --- | --- | --- | --- | --- |')
+        for lf in log_failures:
+            test_class, test_name = _parse_log_failure_names(lf)
+            lines.append(
+                f"| {lf.get('arch', '')} | {lf.get('platform', '')} | {lf.get('test_config', '')} "
+                f"| {lf.get('test_file', '')} | {test_class} "
+                f"| {test_name} | {lf.get('shard', '')} "
+                f"| {lf.get('category', '')} |"
+            )
+        lines.append('')
+
     md = '\n'.join(lines)
     with open(output_path, 'w') as f:
         f.write(md)
@@ -414,13 +486,14 @@ def main():
     data_rows = build_rows(args, archs, arch_data)
     failed = collect_failed_tests(arch_data, archs, args.set1_name, args.set2_name)
     any_has_set2 = any(d.get('has_set2', True) for d in arch_data.values())
+    log_failures = load_log_failures(args.log_failures) if args.log_failures else []
 
     output_base = args.output
     if output_base.endswith('.csv') or output_base.endswith('.md'):
         output_base = output_base.rsplit('.', 1)[0]
 
-    write_csv(data_rows, archs, f'{output_base}.csv', failed, args.set1_name, args.set2_name, has_set2=any_has_set2)
-    write_markdown(data_rows, archs, f'{output_base}.md', failed, args.set1_name, args.set2_name, has_set2=any_has_set2)
+    write_csv(data_rows, archs, f'{output_base}.csv', failed, args.set1_name, args.set2_name, has_set2=any_has_set2, log_failures=log_failures)
+    write_markdown(data_rows, archs, f'{output_base}.md', failed, args.set1_name, args.set2_name, has_set2=any_has_set2, log_failures=log_failures)
 
 
 if __name__ == '__main__':
diff --git a/.automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py b/.automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py
index ad59b5dea49d9..72e587bbf54bd 100755
--- a/.automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py
+++ b/.automation_scripts/pytorch-unit-test-scripts/summarize_xml_testreports.py
@@ -3,6 +3,7 @@
 import argparse
 import csv
 import os
+import re
 import pandas as pd
 from enum import Enum
 from itertools import chain
@@ -48,8 +49,8 @@
 ]
 
 
-# Workflow names
-WorkflowName = Enum('WorkflowName', ['default', 'distributed', 'inductor'])
+# Test config names
+TestConfigName = Enum('TestConfigName', ['default', 'distributed', 'inductor'])
 
 def _status_priority(test_case):
     """Return a numeric priority for deduplication of retried tests.
@@ -58,19 +59,27 @@ def _status_priority(test_case):
     status = get_test_status(test_case)
     return {"PASSED": 4, "XFAILED": 3, "SKIPPED": 2, "FAILED": 1, "ERROR": 1, "MISSED": 0}.get(status, 0)
 
+def _extract_shard(dirname):
+    """Return 'index/total' for names starting like 'test-default-3-6', else ''."""
+    shard_match = re.match(r'test-\w+-(\d+)-(\d+)', dirname)
+    if shard_match is None:
+        return ""
+    return "/".join(shard_match.groups())
+
 def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, path="."):
-    work_flow_name = ""
+    test_config = ""
     test_cases = {}
     items_list = os.listdir(path)
     for dir in items_list:
         new_dir = path + '/' + dir + '/'
         if os.path.isdir(new_dir):
             if "test-default" in new_dir:
-                work_flow_name = WorkflowName.default.name
+                test_config = TestConfigName.default.name
             elif "test-distributed" in new_dir:
-                work_flow_name = WorkflowName.distributed.name
+                test_config = TestConfigName.distributed.name
             elif "test-inductor" in new_dir:
-                work_flow_name = WorkflowName.inductor.name
+                test_config = TestConfigName.inductor.name
+            shard = _extract_shard(dir)
             for xml_report in Path(new_dir).glob("**/*.xml"):
                 try:
                     new_cases = parse_xml_report(
@@ -78,12 +87,13 @@ def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, path="
                         xml_report,
                         workflow_run_id,
                         workflow_run_attempt,
-                        work_flow_name
+                        test_config
                     )
                 except Exception as e:
                     print(f"WARNING: Skipping malformed XML {xml_report}: {e}")
                     continue
                 for key, case in new_cases.items():
+                    case["shard"] = shard
                     existing = test_cases.get(key)
                     if existing is None or _status_priority(case) > _status_priority(existing):
                         test_cases[key] = case
@@ -209,14 +219,14 @@ def summarize_xml_files(args):
     #parse the xml files
     test_cases_set1_running_time = parse_xml_reports_as_dict(-1, -1, 'testsuite', set1_path)
     # TODO: Does it matter what the workflow_run_attempt is set to below??
-    # test_cases is dict of dicts, with keys as tuple of test_file, test_class, test_name and test workflow
+    # test_cases is dict of dicts, with keys as tuple of test_file, test_class, test_name and test_config
     test_cases_set1 = parse_xml_reports_as_dict(-1, -1, 'testcase', set1_path)
     for (k,v) in list(test_cases_set1.items()):
-        if v['work_flow_name'] == WorkflowName.default.name:
+        if v['test_config'] == TestConfigName.default.name:
             ROCM_DEFAULT += 1
-        elif v['work_flow_name'] == WorkflowName.distributed.name:
+        elif v['test_config'] == TestConfigName.distributed.name:
             ROCM_DISTRIBUTED += 1
-        elif v['work_flow_name'] == WorkflowName.inductor.name:
+        elif v['test_config'] == TestConfigName.inductor.name:
             ROCM_INDUCTOR += 1
 
     # start with creating empty dicts for set2 for each test tuple
@@ -245,11 +255,11 @@ def summarize_xml_files(args):
       test_cases_set2_running_time = parse_xml_reports_as_dict(-1, -1, 'testsuite', set2_path)
       test_cases_set2 = parse_xml_reports_as_dict(-1, -1, 'testcase', set2_path)
       for (k,v) in list(test_cases_set2.items()):
-          if v['work_flow_name'] == WorkflowName.default.name:
+          if v['test_config'] == TestConfigName.default.name:
               CUDA_DEFAULT += 1
-          elif v['work_flow_name'] == WorkflowName.distributed.name:
+          elif v['test_config'] == TestConfigName.distributed.name:
               CUDA_DISTRIBUTED += 1
-          elif v['work_flow_name'] == WorkflowName.inductor.name:
+          elif v['test_config'] == TestConfigName.inductor.name:
               CUDA_INDUCTOR += 1
 
       # for rocm/cuda comparison, sometimes parity sheet has inaccurate resutls due to different function string but with same test names,
@@ -289,28 +299,28 @@ def summarize_xml_files(args):
     test_file_level_CUDA: Dict[Tuple[str], float] = {}
     for (k,v) in list(test_cases_set1_running_time.items()):
           test_file_name = k[0]
-          test_workflow_name = k[2]
-          tar_tup_rocm = (test_file_name, test_workflow_name,)
+          test_config_name = k[2]
+          tar_tup_rocm = (test_file_name, test_config_name,)
           if test_file_level_ROCm.get(tar_tup_rocm) == None:
-              test_file_level_ROCm[ ( test_file_name, test_workflow_name ) ] = v["running_time_xml"]
+              test_file_level_ROCm[ ( test_file_name, test_config_name ) ] = v["running_time_xml"]
           else:
-              test_file_level_ROCm[ ( test_file_name, test_workflow_name ) ] += v["running_time_xml"]
+              test_file_level_ROCm[ ( test_file_name, test_config_name ) ] += v["running_time_xml"]
     for (k,v) in list(test_cases_set2_running_time.items()):
           test_file_name = k[0]
-          test_workflow_name = k[2]
-          tar_tup_cuda = (test_file_name, test_workflow_name)
+          test_config_name = k[2]
+          tar_tup_cuda = (test_file_name, test_config_name)
           if test_file_level_CUDA.get(tar_tup_cuda) == None:
-              test_file_level_CUDA[ ( test_file_name, test_workflow_name ) ] = v["running_time_xml"]
+              test_file_level_CUDA[ ( test_file_name, test_config_name ) ] = v["running_time_xml"]
           else:
-              test_file_level_CUDA[ ( test_file_name, test_workflow_name ) ] += v["running_time_xml"]
+              test_file_level_CUDA[ ( test_file_name, test_config_name ) ] += v["running_time_xml"]
 
     # test file level counts: ROCm tests run, passed, skipped, missed; CUDA tests run
     test_file_counts_ROCm: Dict[Tuple[str], Dict[str, int]] = {}
     test_file_counts_CUDA: Dict[Tuple[str], int] = {}
     for (k,v) in list(test_cases_set1.items()):
         test_file_name = k[0]
-        test_workflow_name = v['work_flow_name']
-        tar_tup = (test_file_name, test_workflow_name)
+        test_config_name = v['test_config']
+        tar_tup = (test_file_name, test_config_name)
         if tar_tup not in test_file_counts_ROCm:
             test_file_counts_ROCm[tar_tup] = {'tests_run': 0, 'passed': 0, 'skipped': 0, 'missed': 0}
         test_file_counts_ROCm[tar_tup]['tests_run'] += 1
@@ -323,8 +333,8 @@ def summarize_xml_files(args):
             test_file_counts_ROCm[tar_tup]['missed'] += 1
     for (k,v) in list(test_cases_set2.items()) if set2_path else []:
         test_file_name = k[0]
-        test_workflow_name = v['work_flow_name']
-        tar_tup = (test_file_name, test_workflow_name)
+        test_config_name = v['test_config']
+        tar_tup = (test_file_name, test_config_name)
         if tar_tup not in test_file_counts_CUDA:
             test_file_counts_CUDA[tar_tup] = 0
         test_file_counts_CUDA[tar_tup] += 1
@@ -386,20 +396,20 @@ def summarize_xml_files(args):
         test_info = v[0]
         test_info_set2 = []
         if status_set_1 == "SKIPPED" and status_set_2 != "SKIPPED":
-            if test_info['work_flow_name'] == WorkflowName.default.name:
+            if test_info['test_config'] == TestConfigName.default.name:
                 SKIPPED_DEFAULT += 1
-            elif test_info['work_flow_name'] == WorkflowName.distributed.name:
+            elif test_info['test_config'] == TestConfigName.distributed.name:
                 SKIPPED_DISTRIBUTED += 1
-            elif test_info['work_flow_name'] == WorkflowName.inductor.name:
+            elif test_info['test_config'] == TestConfigName.inductor.name:
                 SKIPPED_INDUCTOR += 1
         elif set2_path:
             test_info_set2 = v[1]
             if status_set_1 == "MISSED" and status_set_2 != "MISSED":
-              if test_info_set2['work_flow_name'] == WorkflowName.default.name:
+              if test_info_set2['test_config'] == TestConfigName.default.name:
                 MISSED_DEFAULT += 1
-              elif test_info_set2['work_flow_name'] == WorkflowName.distributed.name:
+              elif test_info_set2['test_config'] == TestConfigName.distributed.name:
                 MISSED_DISTRIBUTED += 1
-              elif test_info_set2['work_flow_name'] == WorkflowName.inductor.name:
+              elif test_info_set2['test_config'] == TestConfigName.inductor.name:
                 MISSED_INDUCTOR += 1
 
 
@@ -408,17 +418,17 @@ def summarize_xml_files(args):
               for known_skip in known_skips:
                   if test_file_name == known_skip['test_file'] and k[1] == known_skip['test_class'] and k[2] == known_skip['test_name']:
                       v[2] = known_skip['skip_reason'] if known_skip.__contains__('skip_reason') and not pd.isna(known_skip['skip_reason']) else ' '
-                      if (test_info.__contains__('work_flow_name') and test_info['work_flow_name'] == WorkflowName.default.name) or (test_info_set2.__contains__('work_flow_name') and test_info_set2['work_flow_name'] == WorkflowName.default.name):
+                      if (test_info.__contains__('test_config') and test_info['test_config'] == TestConfigName.default.name) or (test_info_set2.__contains__('test_config') and test_info_set2['test_config'] == TestConfigName.default.name):
                           if not skip_reasons_stat_default.__contains__(v[2]):
                               skip_reasons_stat_default[v[2]] = 1
                           else:
                               skip_reasons_stat_default[v[2]] += 1
-                      elif (test_info.__contains__('work_flow_name') and test_info['work_flow_name'] == WorkflowName.distributed.name) or (test_info_set2.__contains__('work_flow_name') and test_info_set2['work_flow_name'] == WorkflowName.distributed.name):
+                      elif (test_info.__contains__('test_config') and test_info['test_config'] == TestConfigName.distributed.name) or (test_info_set2.__contains__('test_config') and test_info_set2['test_config'] == TestConfigName.distributed.name):
                           if not skip_reasons_stat_distributed.__contains__(v[2]):
                               skip_reasons_stat_distributed[v[2]] = 1
                           else:
                               skip_reasons_stat_distributed[v[2]] += 1
-                      elif (test_info.__contains__('work_flow_name') and test_info['work_flow_name'] == WorkflowName.inductor.name) or (test_info_set2.__contains__('work_flow_name') and test_info_set2['work_flow_name'] == WorkflowName.inductor.name):
+                      elif (test_info.__contains__('test_config') and test_info['test_config'] == TestConfigName.inductor.name) or (test_info_set2.__contains__('test_config') and test_info_set2['test_config'] == TestConfigName.inductor.name):
                           if not skip_reasons_stat_inductor.__contains__(v[2]):
                               skip_reasons_stat_inductor[v[2]] = 1
                           else:
@@ -428,11 +438,11 @@ def summarize_xml_files(args):
                       break
 
         if status_set_1 == "PASSED" and status_set_2 != "PASSED" and set2_path:
-            if test_info['work_flow_name'] == WorkflowName.default.name:
+            if test_info['test_config'] == TestConfigName.default.name:
                 ROCMONLY_DEFAULT += 1
-            elif test_info['work_flow_name'] == WorkflowName.distributed.name:
+            elif test_info['test_config'] == TestConfigName.distributed.name:
                 ROCMONLY_DISTRIBUTED += 1
-            elif test_info['work_flow_name'] == WorkflowName.inductor.name:
+            elif test_info['test_config'] == TestConfigName.inductor.name:
                 ROCMONLY_INDUCTOR += 1
 
     skip_reasons_stat_default.pop(' ', None)
@@ -450,16 +460,18 @@ def summarize_xml_files(args):
         item_values["test_name"] = k[2]
         item_values[f"status_{set1_name}"] = get_test_status(v[0])
         item_values[f"status_{set2_name}"] = get_test_status(v[1]) if set2_path else ""
-        # get workflow info
+        # get test config info
         v_values = v[0]
         v1_values = v[1] if set2_path else []
-        workflow_name = ""
-        item_values["work_flow_name"] = ""
+        config_name = ""
+        item_values["test_config"] = ""
         if item_values[f"status_{set1_name}"] != "MISSED":
-            workflow_name = v_values['work_flow_name']
+            config_name = v_values['test_config']
         elif item_values[f"status_{set2_name}"] != "MISSED" and item_values[f"status_{set2_name}"] != "":
-            workflow_name = v1_values['work_flow_name']
-        item_values["work_flow_name"] = workflow_name
+            config_name = v1_values['test_config']
+        item_values["test_config"] = config_name
+        item_values[f"shard_{set1_name}"] = v_values.get('shard', '') if v_values else ''
+        item_values[f"shard_{set2_name}"] = v1_values.get('shard', '') if v1_values else ''
         # get test related info
         item_values[f"message_{set1_name}"] = get_test_message(v[0])
         item_values[f"message_{set2_name}"] = get_test_message(v[1]) if set2_path else ""
@@ -514,7 +526,7 @@ def sorting_key(e):
           return 2
         elif e == "test_name":
           return 3
-        elif e == "work_flow_name":
+        elif e == "test_config":
           return 4
         elif e == "skip_reason":
           return 5
@@ -548,6 +560,10 @@ def sorting_key(e):
           return 19
         elif e == "existed_last_week":
           return 20
+        elif e == f"shard_{set1_name}":
+          return 21
+        elif e == f"shard_{set2_name}":
+          return 22
         elif e == "workflow_run_attempt" or e == "job_id":
           return 1000
         else:
@@ -575,7 +591,7 @@ def sorting_key(e):
     for key_rocm in test_file_level_ROCm.keys():
         item_values = {}
         item_values["test_file"] = key_rocm[0]
-        item_values["test_workflow"] = key_rocm[1]
+        item_values["test_config"] = key_rocm[1]
         item_values["rocm_running_time"] = test_file_level_ROCm[key_rocm]
         item_values["cuda_running_time"] = 0.0
         if key_rocm in test_file_level_CUDA.keys():
@@ -596,7 +612,7 @@ def sorting_key(e):
         if not key_cuda in test_file_level_ROCm.keys():
             item_values = {}
             item_values["test_file"] = key_cuda[0]
-            item_values["test_workflow"] = key_cuda[1]
+            item_values["test_config"] = key_cuda[1]
             item_values["rocm_running_time"] = 0.0
             item_values["cuda_running_time"] = test_file_level_CUDA[key_cuda]
             item_values["abs_time_diff"] = item_values["rocm_running_time"] - item_values["cuda_running_time"]
@@ -616,7 +632,7 @@ def sorting_key(e):
     def sorting_key_running_time(e):
         if e == "test_file":
           return 0
-        elif e == "test_workflow":
+        elif e == "test_config":
           return 1
         elif e == "rocm_running_time":
           return 2
diff --git a/.automation_scripts/pytorch-unit-test-scripts/upload_test_stats.py b/.automation_scripts/pytorch-unit-test-scripts/upload_test_stats.py
index 8121e8d16928d..29384d3bd0b41 100644
--- a/.automation_scripts/pytorch-unit-test-scripts/upload_test_stats.py
+++ b/.automation_scripts/pytorch-unit-test-scripts/upload_test_stats.py
@@ -38,7 +38,7 @@ def parse_xml_report(
     report: Path,
     workflow_id: int,
     workflow_run_attempt: int,
-    work_flow_name: str
+    test_config: str
 ) -> Dict[Tuple[str], Dict[str, Any]]:
     """Convert a test report xml file into a JSON-serializable list of test cases."""
     #print(f"Parsing {tag}s for test report: {report}")
@@ -65,7 +65,7 @@ def parse_xml_report(
             case["workflow_id"] = workflow_id
             case["workflow_run_attempt"] = workflow_run_attempt
             case["job_id"] = job_id
-            case["work_flow_name"] = work_flow_name
+            case["test_config"] = test_config
 
             # [invoking file]
             # The name of the file that the test is located in is not necessarily
@@ -87,9 +87,9 @@ def parse_xml_report(
                     continue
                 break
             case["invoking_file"] = case_name
-            test_cases[ ( case["invoking_file"], case["classname"], case["name"], case["work_flow_name"] ) ] = case
+            test_cases[ ( case["invoking_file"], case["classname"], case["name"], case["test_config"] ) ] = case
         elif tag == 'testsuite':
-            case["work_flow_name"] = work_flow_name
+            case["test_config"] = test_config
             case["invoking_xml"] = report.name
             case["running_time_xml"] = case["time"]
             case_name = report.parent.name
@@ -102,7 +102,7 @@ def parse_xml_report(
                     continue
                 break
             case["invoking_file"] = case_name
-            test_cases[ ( case["invoking_file"], case["invoking_xml"], case["work_flow_name"] ) ] = case
+            test_cases[ ( case["invoking_file"], case["invoking_xml"], case["test_config"] ) ] = case
 
     return test_cases
 
diff --git a/.github/workflows/parity.yml b/.github/workflows/parity.yml
index 3dae16d3c6982..0e4147f25c5b7 100644
--- a/.github/workflows/parity.yml
+++ b/.github/workflows/parity.yml
@@ -245,6 +245,17 @@ jobs:
             echo "No parity CSV found in $FOLDER, skipping auto-classify"
           fi
 
+      - name: Detect log-based failures (timeouts, crashes)
+        if: ${{ inputs.include_logs }}
+        working-directory: .automation_scripts/pytorch-unit-test-scripts
+        run: |
+          FOLDER="${{ steps.folder.outputs.folder }}"
+          if ls "$FOLDER"/*.txt 1>/dev/null 2>&1; then
+            python3 detect_log_failures.py --logs-dir "$FOLDER" --output "$FOLDER/log_failures_${{ matrix.arch }}.csv" 2>&1 || true
+          else
+            echo "No log files found in $FOLDER, skipping log failure detection"
+          fi
+
       - name: Collect upload paths
         id: upload-paths
         run: |
@@ -310,8 +321,8 @@ jobs:
           CSV_ARGS=()
           ARCH_ARGS=()
           for ARCH in $ARCHS; do
-            ARTIFACT_DIR="../artifacts/${PREFIX}-results-${ARCH}"
-            CSV=$(find "$ARTIFACT_DIR"/ -maxdepth 2 -name "*.csv" ! -name "*_running_time*" ! -name "*_summary*" 2>/dev/null | head -1)
+            ARTIFACT_DIR="../../artifacts/${PREFIX}-results-${ARCH}"
+            CSV=$(find "$ARTIFACT_DIR"/ -maxdepth 2 -name "*.csv" ! -name "*_running_time*" ! -name "*_summary*" ! -name "log_failures_*" 2>/dev/null | head -1)
             if [ -z "$CSV" ]; then
               echo "WARNING: No CSV found for $ARCH, skipping"
               continue
@@ -334,7 +345,7 @@ jobs:
           if [ -n "$SHA" ]; then
             ARGS+=(--sha "$SHA")
           else
-            DETECTED_SHA=$(basename "$(find ../artifacts/ -name '*.csv' | head -1)" | grep -oP '[0-9a-f]{40}' || true)
+            DETECTED_SHA=$(basename "$(find ../../artifacts/ -name '*.csv' | head -1)" | grep -oP '[0-9a-f]{40}' || true)
             if [ -n "$DETECTED_SHA" ]; then
               ARGS+=(--sha "$DETECTED_SHA")
             fi
@@ -343,6 +354,20 @@ jobs:
             ARGS+=(--pr_id "$PR_ID")
           fi
 
+          # Collect log failure CSVs if they exist
+          LOG_FAIL_ARGS=()
+          for ARCH in $ARCHS; do
+            LF="../../artifacts/${PREFIX}-results-${ARCH}"
+            LF_CSV=$(find "$LF"/ -maxdepth 2 -name "log_failures_*.csv" 2>/dev/null | head -1)
+            if [ -n "$LF_CSV" ]; then
+              LOG_FAIL_ARGS+=("$LF_CSV")
+              echo "Found log failures for $ARCH: $LF_CSV"
+            fi
+          done
+          if [ ${#LOG_FAIL_ARGS[@]} -gt 0 ]; then
+            ARGS+=(--log-failures "${LOG_FAIL_ARGS[@]}")
+          fi
+
           OUTPUT="${PREFIX}_summary"
           ARGS+=(--output "$OUTPUT")
 

From 1b685db51a1c23a4d1585a84d20956af376bc131 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Wed, 15 Apr 2026 14:39:48 -0700
Subject: [PATCH 32/43] Add Docker build workflow using public TheRock wheels
 (#3149)

Copied from
https://github.com/AMD-ROCm-Internal/rocm-npi-dev/actions/workflows/build_portable_linux_pytorch_dockers.yml

Latest run and docker generated:
docker.io/rocm/pytorch-private:pytorch-nightly-f8d08404-rocm7.13.0a20260413-ubuntu24.04-py3.12-gfx950-dcgpu
https://github.com/ethanwee1/pytorch/actions/runs/24441876981
---
 .github/scripts/install_pytorch_wheels.py     | 306 +++++++++++++
 .github/scripts/install_rocm_deps.sh          | 114 +++++
 .../build_portable_linux_pytorch_dockers.yml  | 433 ++++++++++++++++++
 dockerfiles/Dockerfile                        | 159 +++++++
 4 files changed, 1012 insertions(+)
 create mode 100644 .github/scripts/install_pytorch_wheels.py
 create mode 100644 .github/scripts/install_rocm_deps.sh
 create mode 100644 .github/workflows/build_portable_linux_pytorch_dockers.yml
 create mode 100644 dockerfiles/Dockerfile

diff --git a/.github/scripts/install_pytorch_wheels.py b/.github/scripts/install_pytorch_wheels.py
new file mode 100644
index 0000000000000..cf8dc5eccc0c6
--- /dev/null
+++ b/.github/scripts/install_pytorch_wheels.py
@@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+"""
+install_pytorch_wheels.py
+
+Installs PyTorch wheels from a pip index URL.
+
+Usage (from repo root):
+    python .github/scripts/install_pytorch_wheels.py --index-url <URL> --amdgpu-family <FAMILY> [OPTIONS]
+
+Examples:
+    # Install latest versions
+    python .github/scripts/install_pytorch_wheels.py \
+        --index-url <BASE_INDEX_URL>/whl \
+        --amdgpu-family gfx1250
+
+    # Install specific versions (matching ROCm builds)
+    python .github/scripts/install_pytorch_wheels.py \
+        --index-url <BASE_INDEX_URL>/whl \
+        --amdgpu-family gfx1250 \
+        --torch-version "2.10.0+devrocm7.12.0.dev0.849eec43b..." \
+        --torchaudio-version "2.11.0a0+devrocm7.12.0.dev0.849eec43b..." \
+        --torchvision-version "0.25.0a0+devrocm7.12.0.dev0.849eec43b..."
+"""
+
+import argparse
+import re
+import subprocess
+import sys
+import urllib.parse
+import urllib.request
+
+
+# Package configuration: package name -> always_install (False = only installed when an explicit version is requested)
+PACKAGES = {
+    "torch": True,
+    "torchaudio": True,
+    "torchvision": True,
+    "triton": False,
+    "rocm[devel]": True,
+}
+PYTORCH_PKGS = ["torch", "torchaudio", "torchvision", "triton"]
+
+
+def print_banner(title: str) -> None:
+    """Print ``title`` framed above and below by a 50-character ``=`` rule."""
+    rule = "=" * 50
+    # One call, three lines: rule, title, rule.
+    print(rule, title, rule, sep="\n")
+
+
+def build_package_spec(name: str, version: str | None) -> str:
+    """Return the pip requirement for ``name``: ``name==version`` when a version is given, else just ``name``."""
+    return name if not version else f"{name}=={version}"
+
+def get_latest_package_version_for_rocm(
+    index_url: str, package_name: str, rocm_version: str, required: bool = True,
+    version_prefix: str | None = None,
+) -> str | None:
+    """Return the newest ``package_name`` version on the index built for ``rocm_version``.
+
+    Args:
+        index_url: Per-GPU-family index root; the page ``<index_url>/<package_name>/`` is fetched.
+        package_name: Distribution whose wheel links are scanned (e.g. ``torch``).
+        rocm_version: Only versions containing ``rocm<rocm_version>`` are candidates.
+        required: Exit with status 1 when the index is unreachable or nothing matches;
+            when False, return None in those cases so the caller can skip the package.
+        version_prefix: Optional release prefix matched on whole components of the part
+            before ``+`` (``"2.1"`` accepts ``2.1.0`` and ``2.1.0a0`` but not ``2.10.0``).
+
+    Returns:
+        The selected version string with ``%2B`` decoded to ``+``, or None (see ``required``).
+    """
+    rocm_tag = f"rocm{rocm_version}"
+    url = f"{index_url.rstrip('/')}/{package_name}/"
+    try:
+        with urllib.request.urlopen(url, timeout=30) as resp:
+            html = resp.read().decode("utf-8", errors="ignore")
+    except Exception as e:
+        # Covers HTTP errors (e.g. 404) and timeouts alike; only fatal for required packages.
+        level = "Error" if required else "Warning"
+        print(f"{level}: failed to fetch index for {package_name}: {e}", file=sys.stderr)
+        if required:
+            sys.exit(1)
+        return None
+    # Wheel links look like package-VERSION-...whl (e.g. torch-2.10.0%2Brocm7.12...-cp312-....whl).
+    # VERSION is the first "-"-separated field after the package name; "+" may be URL-encoded as %2B.
+    pattern = re.compile(re.escape(package_name) + r"-(.+?)\.whl", re.IGNORECASE)
+    candidates = (m.group(1).strip().split("-")[0] for m in pattern.finditer(html))
+    # Keep only versions carrying the requested ROCm tag (e.g. rocm7.12.0a20260224).
+    matching = [urllib.parse.unquote(ver) for ver in candidates if rocm_tag in ver]
+    if version_prefix:
+        # (?!\d) stops the prefix at a component boundary so "2.1" cannot swallow "2.10.x".
+        prefix_re = re.compile(re.escape(version_prefix.rstrip(".")) + r"(?!\d)")
+        matching = [v for v in matching if prefix_re.match(v.split("+")[0])]
+    if not matching:
+        if required:
+            msg = f"Error: no wheel found for {package_name} with ROCm {rocm_version}"
+            if version_prefix:
+                msg += f" and version prefix {version_prefix}"
+            print(msg, file=sys.stderr)
+            sys.exit(1)
+        return None
+
+    def _key(v: str) -> tuple[int, ...]:
+        # Order by every purely numeric field (release numbers plus the ROCm build date).
+        return tuple(int(x) for x in re.split(r"[.\-a+]", v) if x.isdecimal())
+    return max(matching, key=_key)
+
+
+def run_pip_install(
+    index_url: str, packages: list[str], break_system_packages: bool = True
+) -> None:
+    """Run ``pip install --index-url <index_url>`` for ``packages``; exit with pip's code on failure."""
+    cmd = [sys.executable, "-m", "pip", "install", "--index-url", index_url]
+
+    if break_system_packages:
+        cmd.append("--break-system-packages")
+
+    cmd.extend(packages)
+
+    # Flush so this line is emitted before the child pip process writes to a piped log.
+    print(f"Running: {' '.join(cmd)}", flush=True)
+    result = subprocess.run(cmd, check=False)
+    if result.returncode != 0:
+        print(f"Error: pip install failed with return code {result.returncode}", file=sys.stderr)
+        sys.exit(result.returncode)
+
+
+def check_package(name: str) -> tuple[bool, str | None]:
+    """Import ``name``; return ``(True, version)``, or ``(False, None)`` when the import fails."""
+    try:
+        imported = __import__(name)
+    except ImportError:
+        return False, None
+    return True, getattr(imported, "__version__", "unknown")
+
+
+def verify_installation() -> bool:
+    """Print installed versions; return False (error on stderr) only if torch cannot be imported."""
+    print_banner("Verifying Installation")
+
+    # torch is checked separately: it is mandatory and also exposes the HIP version.
+    try:
+        import torch as _torch
+    except ImportError as e:
+        print(
+            f"Error: torch import failed ({e!r}). If wheels are installed, run rocm-sdk init first.",
+            file=sys.stderr,
+        )
+        return False
+
+    print(f"torch: {getattr(_torch, '__version__', 'unknown')}")
+
+    hip_version = _torch.version.hip
+    print(f"ROCm/HIP: {hip_version or 'not available'}")
+    print(f"Built with ROCm: {hip_version is not None}")
+
+    # Companion packages are informational only; a missing one is not a failure.
+    for name in ["torchaudio", "torchvision", "triton", "rocm"]:
+        installed, pkg_version = check_package(name)
+        print(f"{name}: {pkg_version if installed else 'not installed'}")
+
+    return True
+
+
+def list_installed_packages() -> None:
+    """Print the ``pip list`` rows that mention torch, triton or rocm (nothing if pip fails)."""
+    print("\nInstalled PyTorch packages:")
+    pip_list = subprocess.run(
+        [sys.executable, "-m", "pip", "list"],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    if pip_list.returncode != 0:
+        return
+    for row in pip_list.stdout.splitlines():
+        lowered = row.lower()
+        if "torch" in lowered or "triton" in lowered or "rocm" in lowered:
+            print(f"  {row}")
+
+
+def main() -> int:
+    """Parse CLI options, install the requested wheels, optionally verify; returns the exit code."""
+    parser = argparse.ArgumentParser(
+        description="Install PyTorch wheels from a pip index URL",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+
+    parser.add_argument(
+        "--index-url", required=True, help="Base URL for PyTorch wheels index"
+    )
+    parser.add_argument(
+        "--amdgpu-family", required=True, help="AMD GPU family (e.g., gfx1250)"
+    )
+    parser.add_argument(
+        "--rocm-version",
+        help="Optional. ROCm version (e.g. 7.12.0a20260126). When set without --torch-version: discovers and installs latest torch/torchaudio/torchvision/triton built for this ROCm. ",
+    )
+    parser.add_argument(
+        "--torch-version", help="Specific torch version (default: latest)"
+    )
+    parser.add_argument(
+        "--torch-version-prefix",
+        help="Torch version prefix for discovery (e.g. '2.9' matches 2.9.x). "
+             "Only used in auto-discovery mode (--rocm-version without --torch-version).",
+    )
+    parser.add_argument(
+        "--torchaudio-version", help="Specific torchaudio version (default: latest)"
+    )
+    parser.add_argument(
+        "--torchvision-version", help="Specific torchvision version (default: latest)"
+    )
+    parser.add_argument(
+        "--triton-version",
+        help="Specific triton version (default: from torch dependency)",
+    )
+    parser.add_argument(
+        "--no-break-system-packages",
+        action="store_true",
+        help="Don't use --break-system-packages",
+    )
+    parser.add_argument(
+        "--skip-verify", action="store_true", help="Skip verification step"
+    )
+
+    args = parser.parse_args()
+
+    # Build the full index URL
+    index_url = f"{args.index_url.rstrip('/')}/{args.amdgpu_family}/"
+
+    rocm = args.rocm_version
+    rocm_only = bool(rocm and not args.torch_version)
+    torch_prefix = args.torch_version_prefix if rocm_only else None
+    break_sys = not args.no_break_system_packages
+
+    if rocm_only:
+        # Two-pass install:
+        #   Pass 1: torch (pinned) + rocm[devel] (pinned), plus triton when --triton-version is given
+        #   Pass 2: torchaudio, torchvision (pinned when requested; otherwise pip resolves compatibility)
+        torch_version = get_latest_package_version_for_rocm(
+            index_url, "torch", rocm, required=True, version_prefix=torch_prefix,
+        )
+
+        print_banner("PyTorch Wheels Installation")
+        print(f"Index URL:      {index_url}")
+        print(f"AMDGPU Family:  {args.amdgpu_family}")
+        print(f"Python:         {sys.version_info.major}.{sys.version_info.minor}")
+        print(f"torch:          {torch_version}")
+        print(f"rocm[devel]:    {rocm}")
+        print(f"torchaudio:     {args.torchaudio_version or '(pip resolves)'}")
+        print(f"torchvision:    {args.torchvision_version or '(pip resolves)'}")
+        print(f"triton:         {args.triton_version or '(torch dependency)'}")
+        print("=" * 50)
+
+        # Pass 1: install torch + rocm[devel] with exact versions.
+        # Unless --triton-version pins it, torch's declared dependency on triton
+        # pulls in the correct build.
+        primary = [build_package_spec("torch", torch_version), build_package_spec("rocm[devel]", rocm)]
+        if args.triton_version:
+            primary.append(build_package_spec("triton", args.triton_version))
+        print_banner("Pass 1: torch + rocm[devel]")
+        print(f"Installing: {', '.join(primary)}")
+        run_pip_install(index_url, primary, break_sys)
+
+        # Pass 2: install torchaudio/torchvision, honoring explicit pins; otherwise pip
+        # picks versions compatible with the torch that's already installed
+        companions = [build_package_spec(p, getattr(args, f"{p}_version")) for p in ("torchaudio", "torchvision")]
+        print_banner("Pass 2: torchaudio, torchvision")
+        print(f"Installing: {', '.join(companions)}")
+        run_pip_install(index_url, companions, break_sys)
+    else:
+        # Explicit versions mode — install everything in one shot
+        arg_attrs = ["torch_version", "torchaudio_version", "torchvision_version", "triton_version"]
+        versions = {p: getattr(args, a) for p, a in zip(PYTORCH_PKGS, arg_attrs)}
+        versions["rocm[devel]"] = rocm if rocm else None
+
+        print_banner("PyTorch Wheels Installation")
+        print(f"Index URL:      {index_url}")
+        print(f"AMDGPU Family:  {args.amdgpu_family}")
+        print(f"Python:         {sys.version_info.major}.{sys.version_info.minor}")
+        for name, version in versions.items():
+            print(f"{name:14}: {version or 'latest'}")
+        print("=" * 50)
+
+        packages = []
+        for name, always_install in PACKAGES.items():
+            version = versions.get(name)
+            if always_install or version:
+                packages.append(build_package_spec(name, version))
+
+        print(f"Installing: {', '.join(packages)}")
+        run_pip_install(index_url, packages, break_sys)
+
+    # Verify
+    if not args.skip_verify and not verify_installation():
+        return 1
+
+    list_installed_packages()
+    print_banner("Installation complete")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/.github/scripts/install_rocm_deps.sh b/.github/scripts/install_rocm_deps.sh
new file mode 100644
index 0000000000000..e4c0fd91a1066
--- /dev/null
+++ b/.github/scripts/install_rocm_deps.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+# install_rocm_deps.sh
+#
+# Installs runtime dependencies for ROCm on various Linux distributions.
+# Automatically detects the distribution and uses the appropriate package manager.
+#
+# Supported distributions:
+#   - Ubuntu 22.04, 24.04 (apt)
+#   - AlmaLinux 8 (dnf)
+#   - Azure Linux 3 (tdnf)
+
+set -e
+
+# Detect distribution type from /etc/os-release
+detect_distro() {
+    # Print the ID field from /etc/os-release, or "unknown" when that file is absent.
+    # Guard clause first so the happy path reads straight through.
+    [ -f /etc/os-release ] || { echo "unknown"; return; }
+
+    . /etc/os-release
+    echo "$ID"
+}
+
+DISTRO=$(detect_distro)
+echo "Detected distribution: $DISTRO"
+
+case "$DISTRO" in
+    ubuntu)
+        echo "Installing dependencies using apt..."
+        apt-get update
+        DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+            ca-certificates \
+            curl \
+            build-essential \
+            libelf1 \
+            libnuma1 \
+            libunwind8 \
+            libncurses6 \
+            perl \
+            file \
+            nano \
+            git \
+            python3 \
+            python3-dev \
+            python3-pip \
+            python3-venv \
+            kmod \
+            pkg-config \
+            liblzma-dev \
+            libdrm-dev
+        # libdw: libdw1t64 for Ubuntu 24.04+, libdw1 for older versions (best effort; failure is tolerated)
+        DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends libdw1t64 2>/dev/null || \
+            DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends libdw1 || true
+        # libssl: libssl3 for Ubuntu 22.04+, libssl1.1 for older versions (best effort; failure is tolerated)
+        DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends libssl3 2>/dev/null || \
+            DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends libssl1.1 || true
+        rm -rf /var/lib/apt/lists/*
+        ;;
+
+    almalinux)
+        echo "Installing dependencies using dnf..."
+        # Fix AlmaLinux repo to use direct baseurl instead of mirrorlist
+        if [ -f /etc/yum.repos.d/almalinux.repo ]; then
+            sed -i 's/^mirrorlist=/#mirrorlist=/g' /etc/yum.repos.d/almalinux.repo
+            sed -i 's/^# baseurl=/baseurl=/g' /etc/yum.repos.d/almalinux.repo
+        fi
+        dnf install -y --setopt=install_weak_deps=False \
+            ca-certificates \
+            curl \
+            libatomic \
+            elfutils-libelf \
+            elfutils-libs \
+            numactl-libs \
+            ncurses-libs \
+            openssl-libs \
+            perl \
+            file \
+            python3 \
+            python3-devel \
+            python3-pip \
+            kmod
+        dnf clean all
+        ;;
+
+    azurelinux)
+        echo "Installing dependencies using tdnf..."
+        tdnf install -y \
+            ca-certificates \
+            curl \
+            tar \
+            libatomic \
+            elfutils-libelf \
+            elfutils-libs \
+            numactl-libs \
+            libunwind \
+            ncurses-libs \
+            openssl-libs \
+            perl \
+            file \
+            python3 \
+            python3-devel \
+            python3-pip \
+            kmod
+        tdnf clean all
+        ;;
+
+    *)
+        echo "Error: Unsupported distribution: $DISTRO" >&2
+        echo "Supported distributions: ubuntu, almalinux, azurelinux" >&2
+        exit 1
+        ;;
+esac
+
+echo "Dependencies installed successfully for $DISTRO"
diff --git a/.github/workflows/build_portable_linux_pytorch_dockers.yml b/.github/workflows/build_portable_linux_pytorch_dockers.yml
new file mode 100644
index 0000000000000..0bd34fb2bc5b7
--- /dev/null
+++ b/.github/workflows/build_portable_linux_pytorch_dockers.yml
@@ -0,0 +1,433 @@
+name: Build Portable Linux PyTorch Dockers
+
+on:
+  schedule:
+    - cron: "0 6 * * *"   # daily at 06:00 UTC
+  workflow_dispatch:
+    inputs:
+      pytorch_repo:
+        description: "GitHub repo to clone into the image (e.g. 'pytorch/pytorch' or 'ROCm/pytorch')"
+        type: string
+        default: "pytorch/pytorch"
+      pytorch_branch:
+        description: "Branch to clone. Default 'nightly' matches theRock wheel builds. For releases use ROCm/pytorch with 'release/2.11', 'release/2.10', etc."
+        type: string
+        default: "nightly"
+      python_version:
+        type: choice
+        options:
+          - "3.12"
+          - "3.10"
+          - "3.11"
+          - "3.13"
+          - "3.14"
+        default: "3.12"
+      amdgpu_family:
+        type: choice
+        options:
+          - gfx950-dcgpu
+          - gfx94X-dcgpu
+          - gfx90X-dcgpu
+          - gfx120X-all
+          - gfx110X-all
+          - gfx110X-dgpu
+          - gfx103X-dgpu
+          - gfx101X-dgpu
+        default: gfx950-dcgpu
+      rocm_version:
+        description: "ROCm version (e.g. '7.13.0a20260413'). Leave empty to auto-discover from the latest available torch wheel."
+        type: string
+      index_url:
+        description: Base URL for PyTorch wheels index
+        type: string
+        default: "https://rocm.nightlies.amd.com/v2-staging"
+
+permissions:
+  contents: read
+
+run-name: >-
+  ${{ github.event_name == 'schedule' && 'Nightly Docker builds' ||
+      format('Build PyTorch Docker ({0}, {1}/{2}, ROCm {3})',
+             inputs.amdgpu_family || 'gfx950-dcgpu',
+             inputs.pytorch_repo || 'pytorch/pytorch',
+             inputs.pytorch_branch || 'nightly',
+             inputs.rocm_version || 'auto') }}
+
+env:
+  REGISTRY: docker.io
+  IMAGE_NAME: rocm/pytorch-private
+  DEFAULT_AMDGPU_FAMILY: gfx950-dcgpu
+  DEFAULT_PYTHON_VERSION: "3.12"
+  DEFAULT_INDEX_URL: "https://rocm.nightlies.amd.com/v2-staging"
+  DEFAULT_BASE_IMAGE: "ubuntu:24.04"
+
+jobs:
+  # ── Nightly matrix build (schedule only) ─────────────────────────────────
+  nightly-matrix:
+    if: github.event_name == 'schedule'
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - pytorch_repo: pytorch/pytorch
+            pytorch_branch: nightly
+            label: nightly
+          - pytorch_repo: ROCm/pytorch
+            pytorch_branch: release/2.11
+            label: "2.11"
+          - pytorch_repo: ROCm/pytorch
+            pytorch_branch: release/2.10
+            label: "2.10"
+          - pytorch_repo: ROCm/pytorch
+            pytorch_branch: release/2.9
+            label: "2.9"
+    name: "Nightly | torch ${{ matrix.label }} | MI355"
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout workflow files
+        uses: actions/checkout@v4
+
+      - name: Checkout PyTorch source
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ matrix.pytorch_repo }}
+          ref: ${{ matrix.pytorch_branch }}
+          path: pytorch-src
+          fetch-depth: 1
+
+      - name: Derive torch version prefix from branch
+        id: prefix
+        run: |
+          BRANCH="${{ matrix.pytorch_branch }}"
+          if [[ "$BRANCH" =~ ^release/([0-9]+\.[0-9]+) ]]; then
+            echo "value=${BASH_REMATCH[1]}" >> $GITHUB_OUTPUT
+            echo "Derived torch prefix: ${BASH_REMATCH[1]}"
+          else
+            echo "value=" >> $GITHUB_OUTPUT
+            echo "No prefix (nightly/main branch)"
+          fi
+
+      - name: Discover ROCm version from index
+        id: discover
+        run: |
+          python3 - "${{ env.DEFAULT_INDEX_URL }}" "${{ env.DEFAULT_AMDGPU_FAMILY }}" "${{ steps.prefix.outputs.value }}" <<'PYEOF'
+          import os, re, sys, urllib.parse, urllib.request
+
+          index_url, gpu_family = sys.argv[1], sys.argv[2]
+          prefix = sys.argv[3] if len(sys.argv) > 3 else ""
+
+          url = f"{index_url.rstrip('/')}/{gpu_family}/torch/"
+          print(f"Fetching torch index: {url}")
+          html = urllib.request.urlopen(url, timeout=60).read().decode()
+
+          pattern = re.compile(r"torch-(.+?)\.whl", re.IGNORECASE)
+          versions = []
+          for m in pattern.finditer(html):
+              ver = urllib.parse.unquote(m.group(1).split("-")[0])
+              if "+rocm" in ver:
+                  versions.append(ver)
+
+          if prefix:
+              # (?!\d) ends the prefix on a component boundary: "2.1" must not match "2.10.x".
+              boundary = re.compile(re.escape(prefix.rstrip(".")) + r"(?!\d)")
+              versions = [v for v in versions if boundary.match(v.split("+")[0])]
+
+          if not versions:
+              print(f"::error::No torch wheels found (prefix={prefix!r})")
+              sys.exit(1)
+
+          # Sort key: every purely numeric field of the version, including the ROCm build date.
+          def key(v):
+              return tuple(int(x) for x in re.split(r"[.\-a+]", v) if x.isdecimal())
+
+          latest = max(versions, key=key)
+          rocm_ver = re.search(r"\+rocm(.+)", latest).group(1)
+
+          print(f"Latest torch wheel: {latest}")
+          print(f"Discovered ROCm version: {rocm_ver}")
+
+          # Publish both values as step outputs (rocm_version, torch_wheel_version).
+          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
+              f.write(f"rocm_version={rocm_ver}\n")
+              f.write(f"torch_wheel_version={latest}\n")
+          PYEOF
+
+      - name: Resolve config
+        id: cfg
+        run: |
+          echo "amdgpu_family=${{ env.DEFAULT_AMDGPU_FAMILY }}" >> $GITHUB_OUTPUT
+          echo "python_version=${{ env.DEFAULT_PYTHON_VERSION }}" >> $GITHUB_OUTPUT
+          echo "rocm_version=${{ steps.discover.outputs.rocm_version }}" >> $GITHUB_OUTPUT
+          echo "index_url=${{ env.DEFAULT_INDEX_URL }}" >> $GITHUB_OUTPUT
+          echo "base_image=${{ env.DEFAULT_BASE_IMAGE }}" >> $GITHUB_OUTPUT
+          echo "torch_prefix=${{ steps.prefix.outputs.value }}" >> $GITHUB_OUTPUT
+          echo "pytorch_repo=${{ matrix.pytorch_repo }}" >> $GITHUB_OUTPUT
+          echo "pytorch_branch=${{ matrix.pytorch_branch }}" >> $GITHUB_OUTPUT
+
+          COMMIT="$(cd pytorch-src && git rev-parse --short=8 HEAD)"
+          echo "pytorch_commit=${COMMIT}" >> $GITHUB_OUTPUT
+
+      - name: Generate Docker image tag
+        id: docker-tag
+        run: |
+          BRANCH="${{ matrix.pytorch_branch }}"
+          BRANCH_SAFE="${BRANCH//\//-}"
+          COMMIT="${{ steps.cfg.outputs.pytorch_commit }}"
+          ROCM_VERSION="${{ steps.cfg.outputs.rocm_version }}"
+          PYTHON_VERSION="${{ steps.cfg.outputs.python_version }}"
+          GFX="${{ steps.cfg.outputs.amdgpu_family }}"
+          BASE_IMAGE="${{ steps.cfg.outputs.base_image }}"
+          OS=$(echo "${BASE_IMAGE}" | tr -d ':' | tr '/' '-')
+
+          IMAGE_TAG="pytorch-${BRANCH_SAFE}-${COMMIT}-rocm${ROCM_VERSION}-${OS}-py${PYTHON_VERSION}-${GFX}"
+          IMAGE_TAG="${IMAGE_TAG//+/-}"
+          echo "tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
+          echo "Generated image tag: ${IMAGE_TAG}"
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          # ROCm/pytorch: use DOCKERUSERNAME / DOCKERTOKEN
+          # username: ${{ secrets.DOCKERUSERNAME }}
+          # password: ${{ secrets.DOCKERTOKEN }}
+          username: ${{ secrets.DOCKERUSERNAME }}
+          password: ${{ secrets.DCKRPAT }}
+
+      - name: Prepare build context
+        run: |
+          cp dockerfiles/Dockerfile pytorch-src/
+          mkdir -p pytorch-src/.github/scripts
+          cp .github/scripts/install_rocm_deps.sh pytorch-src/.github/scripts/
+          cp .github/scripts/install_pytorch_wheels.py pytorch-src/.github/scripts/
+
+      - name: Build Docker image
+        run: |
+          IMAGE="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}"
+
+          docker build \
+            --file pytorch-src/Dockerfile \
+            --tag "${IMAGE}" \
+            --label "pytorch.repo=${{ matrix.pytorch_repo }}" \
+            --label "pytorch.branch=${{ matrix.pytorch_branch }}" \
+            --label "pytorch.commit=${{ steps.cfg.outputs.pytorch_commit }}" \
+            --build-arg "BASE_IMAGE=${{ steps.cfg.outputs.base_image }}" \
+            --build-arg "ROCM_VERSION=${{ steps.cfg.outputs.rocm_version }}" \
+            --build-arg "AMDGPU_FAMILY=${{ steps.cfg.outputs.amdgpu_family }}" \
+            --build-arg "PYTHON_VERSION=${{ steps.cfg.outputs.python_version }}" \
+            --build-arg "INDEX_URL=${{ steps.cfg.outputs.index_url }}" \
+            --build-arg "TORCH_VERSION_PREFIX=${{ steps.prefix.outputs.value }}" \
+            pytorch-src
+
+          echo "Docker image built successfully: ${IMAGE}"
+
+      - name: Get ROCm packages info
+        id: rocm-packages
+        run: |
+          IMAGE="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}"
+          ROCM_PACKAGES=$(docker run --rm "${IMAGE}" pip freeze | grep -i rocm || echo "No ROCm packages found")
+          echo "rocm_packages<<EOF" >> $GITHUB_OUTPUT
+          echo "${ROCM_PACKAGES}" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+          echo "ROCm packages:"
+          echo "${ROCM_PACKAGES}"
+
+      - name: Push Docker image
+        run: |
+          docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}
+          echo "Docker image pushed successfully"
+
+      - name: Post-build summary
+        run: |
+          IMAGE="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}"
+          echo "## PyTorch Docker Build Summary — ${{ matrix.label }}" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "| Parameter | Value |" >> $GITHUB_STEP_SUMMARY
+          echo "|-----------|-------|" >> $GITHUB_STEP_SUMMARY
+          echo "| Image | \`${IMAGE}\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| Torch Wheel | ${{ steps.discover.outputs.torch_wheel_version }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| PyTorch Repo | ${{ matrix.pytorch_repo }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| PyTorch Branch | ${{ matrix.pytorch_branch }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| PyTorch Commit | ${{ steps.cfg.outputs.pytorch_commit }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| AMDGPU Family | ${{ steps.cfg.outputs.amdgpu_family }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| Python | ${{ steps.cfg.outputs.python_version }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| ROCm (discovered) | ${{ steps.cfg.outputs.rocm_version }} |" >> $GITHUB_STEP_SUMMARY
+
+  # ── Single image build (manual dispatch) ──────────────────────────────────
+  build-docker:
+    if: github.event_name == 'workflow_dispatch'
+    name: "Build | ${{ inputs.amdgpu_family }} | ${{ inputs.pytorch_repo || 'pytorch/pytorch' }}@${{ inputs.pytorch_branch || 'nightly' }}"
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout workflow files
+        uses: actions/checkout@v4
+
+      - name: Checkout PyTorch source
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ inputs.pytorch_repo || 'pytorch/pytorch' }}
+          ref: ${{ inputs.pytorch_branch || 'nightly' }}
+          path: pytorch-src
+          fetch-depth: 1
+
+      - name: Derive torch version prefix from branch
+        id: prefix
+        run: |
+          BRANCH="${{ inputs.pytorch_branch || 'nightly' }}"
+          if [[ "$BRANCH" =~ ^release/([0-9]+\.[0-9]+) ]]; then
+            echo "value=${BASH_REMATCH[1]}" >> $GITHUB_OUTPUT
+            echo "Derived torch prefix: ${BASH_REMATCH[1]}"
+          else
+            echo "value=" >> $GITHUB_OUTPUT
+            echo "No prefix (nightly/main branch)"
+          fi
+
+      - name: Discover ROCm version from index
+        id: discover
+        if: ${{ !inputs.rocm_version }}
+        run: |
+          python3 - "${{ inputs.index_url || env.DEFAULT_INDEX_URL }}" "${{ inputs.amdgpu_family || env.DEFAULT_AMDGPU_FAMILY }}" "${{ steps.prefix.outputs.value }}" <<'PYEOF'
+          import re, sys, urllib.request, urllib.parse
+
+          index_url, gpu_family = sys.argv[1], sys.argv[2]
+          prefix = sys.argv[3] if len(sys.argv) > 3 else ""
+
+          url = f"{index_url.rstrip('/')}/{gpu_family}/torch/"
+          print(f"Fetching torch index: {url}")
+          html = urllib.request.urlopen(url, timeout=60).read().decode()
+
+          pattern = re.compile(r"torch-(.+?)\.whl", re.IGNORECASE)
+          versions = []
+          for m in pattern.finditer(html):
+              ver = urllib.parse.unquote(m.group(1).split("-")[0])
+              if "+rocm" in ver:
+                  versions.append(ver)
+
+          if prefix:
+              # Match whole components so "2.1" does not also select "2.10.x".
+              versions = [v for v in versions if (v.split("+")[0] + ".").startswith(prefix + ".")]
+
+          if not versions:
+              print(f"::error::No torch wheels found (prefix={prefix!r})")
+              sys.exit(1)
+
+          def key(v):
+              # Rank by every numeric run; findall keeps the ROCm major, which a
+              # split on "[.\-a+]" leaves glued to "rocm" and then discards.
+              return tuple(int(x) for x in re.findall(r"\d+", v))
+
+          latest = max(versions, key=key)
+          rocm_ver = re.search(r"\+rocm(.+)", latest).group(1)
+
+          print(f"Latest torch wheel: {latest}")
+          print(f"Discovered ROCm version: {rocm_ver}")
+
+          import os
+          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
+              f.write(f"rocm_version={rocm_ver}\n")
+              f.write(f"torch_wheel_version={latest}\n")
+          PYEOF
+
+      - name: Resolve inputs with defaults
+        id: cfg
+        run: |
+          echo "amdgpu_family=${{ inputs.amdgpu_family || env.DEFAULT_AMDGPU_FAMILY }}" >> $GITHUB_OUTPUT
+          echo "python_version=${{ inputs.python_version || env.DEFAULT_PYTHON_VERSION }}" >> $GITHUB_OUTPUT
+
+          # Use explicit rocm_version if provided, otherwise use discovered version
+          ROCM="${{ inputs.rocm_version || steps.discover.outputs.rocm_version }}"
+          echo "rocm_version=${ROCM}" >> $GITHUB_OUTPUT
+
+          echo "index_url=${{ inputs.index_url || env.DEFAULT_INDEX_URL }}" >> $GITHUB_OUTPUT
+          echo "base_image=${{ env.DEFAULT_BASE_IMAGE }}" >> $GITHUB_OUTPUT
+          echo "torch_prefix=${{ steps.prefix.outputs.value }}" >> $GITHUB_OUTPUT
+          echo "pytorch_repo=${{ inputs.pytorch_repo || 'pytorch/pytorch' }}" >> $GITHUB_OUTPUT
+          echo "pytorch_branch=${{ inputs.pytorch_branch || 'nightly' }}" >> $GITHUB_OUTPUT
+
+          COMMIT="$(cd pytorch-src && git rev-parse --short=8 HEAD)"
+          echo "pytorch_commit=${COMMIT}" >> $GITHUB_OUTPUT
+
+      - name: Generate Docker image tag
+        id: docker-tag
+        run: |
+          BRANCH="${{ steps.cfg.outputs.pytorch_branch }}"
+          BRANCH_SAFE="${BRANCH//\//-}"
+          COMMIT="${{ steps.cfg.outputs.pytorch_commit }}"
+          ROCM_VERSION="${{ steps.cfg.outputs.rocm_version }}"
+          PYTHON_VERSION="${{ steps.cfg.outputs.python_version }}"
+          GFX="${{ steps.cfg.outputs.amdgpu_family }}"
+          BASE_IMAGE="${{ steps.cfg.outputs.base_image }}"
+          OS=$(echo "${BASE_IMAGE}" | tr -d ':' | tr '/' '-')
+
+          IMAGE_TAG="pytorch-${BRANCH_SAFE}-${COMMIT}-rocm${ROCM_VERSION}-${OS}-py${PYTHON_VERSION}-${GFX}"
+          IMAGE_TAG="${IMAGE_TAG//+/-}"
+          echo "tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
+          echo "Generated image tag: ${IMAGE_TAG}"
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          # ROCm/pytorch: use DOCKERUSERNAME / DOCKERTOKEN
+          # username: ${{ secrets.DOCKERUSERNAME }}
+          # password: ${{ secrets.DOCKERTOKEN }}
+          username: ${{ secrets.DOCKERUSERNAME }}
+          password: ${{ secrets.DCKRPAT }}
+
+      - name: Prepare build context
+        run: |
+          cp dockerfiles/Dockerfile pytorch-src/
+          mkdir -p pytorch-src/.github/scripts
+          cp .github/scripts/install_rocm_deps.sh pytorch-src/.github/scripts/
+          cp .github/scripts/install_pytorch_wheels.py pytorch-src/.github/scripts/
+
+      - name: Build Docker image
+        run: |
+          IMAGE="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}"
+
+          docker build \
+            --file pytorch-src/Dockerfile \
+            --tag "${IMAGE}" \
+            --label "pytorch.repo=${{ steps.cfg.outputs.pytorch_repo }}" \
+            --label "pytorch.branch=${{ steps.cfg.outputs.pytorch_branch }}" \
+            --label "pytorch.commit=${{ steps.cfg.outputs.pytorch_commit }}" \
+            --build-arg "BASE_IMAGE=${{ steps.cfg.outputs.base_image }}" \
+            --build-arg "ROCM_VERSION=${{ steps.cfg.outputs.rocm_version }}" \
+            --build-arg "AMDGPU_FAMILY=${{ steps.cfg.outputs.amdgpu_family }}" \
+            --build-arg "PYTHON_VERSION=${{ steps.cfg.outputs.python_version }}" \
+            --build-arg "INDEX_URL=${{ steps.cfg.outputs.index_url }}" \
+            --build-arg "TORCH_VERSION_PREFIX=${{ steps.cfg.outputs.torch_prefix }}" \
+            pytorch-src
+
+          echo "Docker image built successfully: ${IMAGE}"
+
+      - name: Get ROCm packages info
+        id: rocm-packages
+        run: |
+          IMAGE="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}"
+          ROCM_PACKAGES=$(docker run --rm "${IMAGE}" pip freeze | grep -i rocm || echo "No ROCm packages found")
+          echo "rocm_packages<<EOF" >> $GITHUB_OUTPUT
+          echo "${ROCM_PACKAGES}" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+          echo "ROCm packages:"
+          echo "${ROCM_PACKAGES}"
+
+      - name: Push Docker image
+        run: |
+          docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}
+          echo "Docker image pushed successfully"
+
+      - name: Post-build summary
+        run: |
+          IMAGE="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}"
+          echo "## PyTorch Docker Build Summary" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "| Parameter | Value |" >> $GITHUB_STEP_SUMMARY
+          echo "|-----------|-------|" >> $GITHUB_STEP_SUMMARY
+          echo "| Image | \`${IMAGE}\` |" >> $GITHUB_STEP_SUMMARY
+          echo "| PyTorch Repo | ${{ steps.cfg.outputs.pytorch_repo }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| PyTorch Branch | ${{ steps.cfg.outputs.pytorch_branch }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| PyTorch Commit | ${{ steps.cfg.outputs.pytorch_commit }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| AMDGPU Family | ${{ steps.cfg.outputs.amdgpu_family }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| Python | ${{ steps.cfg.outputs.python_version }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| ROCm | ${{ steps.cfg.outputs.rocm_version }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| Torch Version Prefix | ${{ steps.cfg.outputs.torch_prefix || 'latest' }} |" >> $GITHUB_STEP_SUMMARY
+          echo "| Index URL | ${{ steps.cfg.outputs.index_url }} |" >> $GITHUB_STEP_SUMMARY
diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile
new file mode 100644
index 0000000000000..361d0219eceef
--- /dev/null
+++ b/dockerfiles/Dockerfile
@@ -0,0 +1,159 @@
+# Dockerfile
+#
+# PyTorch + ROCm image built from TheRock portable wheels.
+# Does NOT include kernel drivers — the host must provide compatible
+# AMDGPU/ROCm kernel components and device access.
+#
+# Recommended docker run flags (mirrors TheRock CI container options):
+#   docker run \
+#     --shm-size=10g \
+#     --cap-add=SYS_PTRACE \
+#     --group-add video \
+#     --device /dev/kfd \
+#     --device /dev/dri \
+#     <image>
+#
+# Supported base images (examples)
+# - ubuntu:24.04
+# - almalinux:8
+# - mcr.microsoft.com/azurelinux/base/core:3.0
+#
+# Build arguments
+# - BASE_IMAGE       : Base Docker image (default: ubuntu:24.04)
+# - ROCM_VERSION     : Full ROCm version string. Supported formats:
+#                      - Nightly: 7.13.0a20260413
+#                      - Dev: 7.12.0.dev0+849eec43b2075459511b9a9ffe3bf1948490e9ee
+# - AMDGPU_FAMILY    : AMD GPU family (e.g., gfx94X-dcgpu, gfx90X-dcgpu, gfx950-dcgpu)
+# - PYTHON_VERSION   : Python version for PyTorch (default: 3.12)
+# - INDEX_URL        : (Required) Base URL for PyTorch wheels index
+# - TORCH_VERSION      : Optional specific PyTorch version. If not set, installs latest.
+# - TORCH_VERSION_PREFIX: Optional torch version prefix (e.g. 2.10), forwarded as --torch-version-prefix.
+# - TORCHAUDIO_VERSION / TORCHVISION_VERSION: Optional specific versions. If not set, installs latest.
+# - TRITON_VERSION     : Optional specific triton version. If not set, uses torch's dependency.
+#
+# Note: The PyTorch source is included at /workspace/pytorch (from the repo root).
+#
+# Build example (run from repo root):
+#
+#   docker build \
+#     --build-arg BASE_IMAGE=ubuntu:24.04 \
+#     --build-arg ROCM_VERSION=7.13.0a20260413 \
+#     --build-arg AMDGPU_FAMILY=gfx94X-dcgpu \
+#     --build-arg PYTHON_VERSION=3.12 \
+#     --build-arg INDEX_URL=https://rocm.nightlies.amd.com/v2-staging \
+#     -f dockerfiles/Dockerfile \
+#     -t pytorch-rocm:ubuntu24.04-gfx94X-dcgpu-7.13.0a20260413 \
+#     .
+#
+
+# Base image selection
+ARG BASE_IMAGE=ubuntu:24.04
+FROM ${BASE_IMAGE}
+
+# ROCm configuration arguments
+ARG ROCM_VERSION
+ARG AMDGPU_FAMILY
+ARG RELEASE_TYPE=nightly
+
+# PyTorch configuration arguments
+ARG PYTHON_VERSION=3.12
+ARG INDEX_URL
+ARG TORCH_VERSION
+ARG TORCH_VERSION_PREFIX
+ARG TORCHAUDIO_VERSION
+ARG TORCHVISION_VERSION
+ARG TRITON_VERSION
+
+# Copy installation scripts
+COPY .github/scripts/install_rocm_deps.sh /tmp/
+COPY .github/scripts/install_pytorch_wheels.py /tmp/
+
+# Copy PyTorch source from the repo root
+COPY . /workspace/pytorch
+
+# Install system dependencies
+RUN chmod +x /tmp/install_rocm_deps.sh && \
+    /tmp/install_rocm_deps.sh
+
+# Install the requested Python version if not already available.
+# Ubuntu 24.04 ships with 3.12; other versions come from deadsnakes PPA.
+RUN if ! command -v python${PYTHON_VERSION} >/dev/null 2>&1; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends software-properties-common && \
+        add-apt-repository -y ppa:deadsnakes/ppa && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            python${PYTHON_VERSION} \
+            python${PYTHON_VERSION}-dev \
+            python${PYTHON_VERSION}-venv && \
+        rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# Create Python virtual environment and upgrade pip/setuptools
+RUN python${PYTHON_VERSION} -m venv /opt/venv && \
+    /opt/venv/bin/python -m pip install --upgrade pip && \
+    /opt/venv/bin/python -m pip install --upgrade setuptools
+
+ENV PATH="/opt/venv/bin:${PATH}"
+
+# Install PyTorch wheels from the public nightlies index.
+RUN /opt/venv/bin/python /tmp/install_pytorch_wheels.py \
+        --no-break-system-packages \
+        --skip-verify \
+        --index-url "${INDEX_URL}" \
+        --amdgpu-family "${AMDGPU_FAMILY}" \
+        ${ROCM_VERSION:+--rocm-version "${ROCM_VERSION}"} \
+        ${TORCH_VERSION:+--torch-version "${TORCH_VERSION}"} \
+        ${TORCH_VERSION_PREFIX:+--torch-version-prefix "${TORCH_VERSION_PREFIX}"} \
+        ${TORCHAUDIO_VERSION:+--torchaudio-version "${TORCHAUDIO_VERSION}"} \
+        ${TORCHVISION_VERSION:+--torchvision-version "${TORCHVISION_VERSION}"} \
+        ${TRITON_VERSION:+--triton-version "${TRITON_VERSION}"}
+
+# Run rocm-sdk init to make rocm buildable
+RUN rocm-sdk init
+
+# ROCm environment variables (mirrors TheRock CI setup in
+# test_pytorch_wheels_full.yml "Initialize ROCm SDK and configure environment").
+# All paths derive from ROCM_HOME which is the rocm-sdk install location.
+ENV ROCM_HOME="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_devel" \
+    ROCM_PATH="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_devel" \
+    ROCM_SOURCE_DIR="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_devel" \
+    ROCM_BIN="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_devel/bin" \
+    ROCM_CMAKE="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_devel/lib/cmake" \
+    PYTORCH_ROCM_ARCH="${AMDGPU_FAMILY}" \
+    VIRTUAL_ENV=/opt/venv \
+    USE_MSLK=0
+
+ENV CMAKE_PREFIX_PATH="${ROCM_CMAKE}" \
+    HIP_DEVICE_LIB_PATH="${ROCM_HOME}/lib/llvm/amdgcn/bitcode" \
+    ROCM_DEVICE_LIB_PATH="${ROCM_HOME}/lib/llvm/amdgcn/bitcode" \
+    ROCM_SYSDEPS_INCLUDE="${ROCM_HOME}/lib/rocm_sysdeps/include" \
+    CPLUS_INCLUDE_PATH="${ROCM_HOME}/lib/rocm_sysdeps/include" \
+    C_INCLUDE_PATH="${ROCM_HOME}/lib/rocm_sysdeps/include" \
+    PKG_CONFIG_PATH="${ROCM_HOME}/lib/rocm_sysdeps/lib/pkgconfig" \
+    LD_LIBRARY_PATH="${ROCM_HOME}/lib/host-math/lib:${ROCM_HOME}/lib/rocm_sysdeps/lib" \
+    LIBRARY_PATH="${ROCM_HOME}/lib/host-math/lib:${ROCM_HOME}/lib/rocm_sysdeps/lib" \
+    CC="${ROCM_HOME}/lib/llvm/bin/clang" \
+    CXX="${ROCM_HOME}/lib/llvm/bin/clang++" \
+    PATH="${ROCM_BIN}:${PATH}"
+
+# Verify PyTorch imports and environment
+RUN python <<'PYEOF'
+import os, torch
+print('torch', torch.__version__)
+print('ROCm/HIP', torch.version.hip)
+print(f'ROCM_HOME={os.environ.get("ROCM_HOME", "NOT SET")}')
+print(f'CC={os.environ.get("CC", "NOT SET")}')
+print(f'CXX={os.environ.get("CXX", "NOT SET")}')
+for mod in ['torchaudio', 'torchvision', 'triton']:
+    try:
+        m = __import__(mod)
+        print(f'{mod} {m.__version__}')
+    except Exception as e:
+        print(f'{mod}: skipped ({e})')
+PYEOF
+
+# Clean up installation scripts
+RUN rm -f /tmp/install_rocm_deps.sh /tmp/install_pytorch_wheels.py
+
+WORKDIR /workspace/pytorch

From e0f6b16e86716e3f4a45f028d72a59f34381d196 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Wed, 15 Apr 2026 14:52:35 -0700
Subject: [PATCH 33/43] Fix Docker Hub secret names for
 build_portable_linux_pytorch_dockers (#3159)

Docker credentials were using the ones from my fork and not rocm/pytorch
credentials:
https://github.com/ROCm/pytorch/actions/runs/24479854145/job/71541505148
Latest build
https://github.com/ROCm/pytorch/actions/runs/24480169722/job/71542549933
---
 .../workflows/build_portable_linux_pytorch_dockers.yml | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build_portable_linux_pytorch_dockers.yml b/.github/workflows/build_portable_linux_pytorch_dockers.yml
index 0bd34fb2bc5b7..a40fb35b88a11 100644
--- a/.github/workflows/build_portable_linux_pytorch_dockers.yml
+++ b/.github/workflows/build_portable_linux_pytorch_dockers.yml
@@ -187,11 +187,8 @@ jobs:
       - name: Log in to Docker Hub
         uses: docker/login-action@v3
         with:
-          # ROCm/pytorch: use DOCKERUSERNAME / DOCKERTOKEN
-          # username: ${{ secrets.DOCKERUSERNAME }}
-          # password: ${{ secrets.DOCKERTOKEN }}
           username: ${{ secrets.DOCKERUSERNAME }}
-          password: ${{ secrets.DCKRPAT }}
+          password: ${{ secrets.DOCKERTOKEN }}
 
       - name: Prepare build context
         run: |
@@ -366,11 +363,8 @@ jobs:
       - name: Log in to Docker Hub
         uses: docker/login-action@v3
         with:
-          # ROCm/pytorch: use DOCKERUSERNAME / DOCKERTOKEN
-          # username: ${{ secrets.DOCKERUSERNAME }}
-          # password: ${{ secrets.DOCKERTOKEN }}
           username: ${{ secrets.DOCKERUSERNAME }}
-          password: ${{ secrets.DCKRPAT }}
+          password: ${{ secrets.DOCKERTOKEN }}
 
       - name: Prepare build context
         run: |

From 293ee53014ef320069d1af2629f3075dd33cdea8 Mon Sep 17 00:00:00 2001
From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Date: Thu, 16 Apr 2026 09:30:57 -0500
Subject: [PATCH 34/43] Make gfx94X-dcgpu the default since TheRock CI
 currently runs full testing on that arch

---
 .github/workflows/build_portable_linux_pytorch_dockers.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_portable_linux_pytorch_dockers.yml b/.github/workflows/build_portable_linux_pytorch_dockers.yml
index a40fb35b88a11..d5c9a94c3b1ad 100644
--- a/.github/workflows/build_portable_linux_pytorch_dockers.yml
+++ b/.github/workflows/build_portable_linux_pytorch_dockers.yml
@@ -33,7 +33,7 @@ on:
           - gfx110X-dgpu
           - gfx103X-dgpu
           - gfx101X-dgpu
-        default: gfx950-dcgpu
+        default: gfx94X-dcgpu
       rocm_version:
         description: "ROCm version (e.g. '7.13.0a20260413'). Leave empty to auto-discover from the latest available torch wheel."
         type: string

From 9f8ad3e96fa9a397cf7e24cf505b35c019aef903 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Thu, 23 Apr 2026 07:35:39 -0700
Subject: [PATCH 35/43] [CI] Show only ROCm failures in parity summary and add
 cross-arch column (#3153)

## Summary
- Only display tests where ROCm status is FAILED in the summary (CUDA
status shown as a context column alongside). Previously both ROCm and
CUDA failures were shown.
- Add "Also Failing In" column that shows which other architectures have
the same test tuple (test_file, test_class, test_name) failing, making
it easy to distinguish all-ROCm issues from architecture-specific ones.
- Includes count of failed tests in the section header.
- Add job-level and test-level shard info to "LOG-BASED FAILURES (not in
XML)" and "FAILED TESTS" section
- Includes flaky tests in "LOG-BASED FAILURES (not in XML)" section for
any tests that pass when run in new process

## Test plan

- [x] Cross-arch detection confirmed: tests failing on all 3 archs show
the other 2 in "Also Failing In"; single-arch failures show empty
- [x] CSV and Markdown output both updated consistently
Latest run https://github.com/ROCm/pytorch/actions/runs/24798004968
Run without this PR on the same commit:
https://github.com/ROCm/pytorch/actions/runs/24796654604
---
 .../detect_log_failures.py                    | 189 ++++++++-
 .../generate_summary.py                       | 365 +++++++++++++++---
 .github/workflows/parity.yml                  |   2 +-
 3 files changed, 498 insertions(+), 58 deletions(-)

diff --git a/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py b/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py
index 57c813790c9ed..0156624c35973 100755
--- a/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py
+++ b/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py
@@ -38,6 +38,10 @@
 RE_INDIVIDUAL_TEST = re.compile(
     r"(?P<test_path>\S+\.py::(?P<cls>\w+)::(?P<method>\w+))"
 )
+RE_INDIV_PASSED = re.compile(
+    r"(?:test/)?(?P<file>\S+\.py)::(?P<cls>\w+)::(?P<method>\S+?)\s+PASSED"
+)
+RE_NEW_PROCESS_SUCCESS = re.compile(r"Test succeeded in new process")
 
 CRASH_PATTERNS = [
     (re.compile(r"Segmentation fault", re.IGNORECASE), "SEGFAULT"),
@@ -78,11 +82,20 @@ def classify_log_file(filename):
 
 
 def parse_log_file(filepath):
-    """Parse a single log file and return test file results and consistent failures."""
+    """Parse a single log file and return test file results, consistent failures,
+    and flaky tests.
+
+    A flaky test is one that failed in its normal-process run but PASSED when the
+    CI harness re-ran it alone in a new subprocess (indicated by a PASSED line
+    for the specific test::class::method, followed by 'Test succeeded in new
+    process, continuing with the rest of the tests').
+    """
     results = {}
     current_test = None
     last_failed_test = None
     consistent_failures = []
+    flaky_tests = []
+    last_passed_individual = None
 
     with open(filepath, "r", errors="replace") as f:
         for line in f:
@@ -111,7 +124,9 @@ def parse_log_file(filepath):
                and "Aborted (core dumped)" not in line \
                and "OutOfMemoryError" not in line \
                and "bad_alloc" not in line \
-               and "stepcurrent" not in line:
+               and "stepcurrent" not in line \
+               and "PASSED" not in line \
+               and "new process" not in line:
                 continue
 
             stripped = RE_TIMESTAMP.sub("", line).rstrip()
@@ -195,7 +210,40 @@ def parse_log_file(filepath):
 
             m = RE_FAILED_CONSISTENTLY.search(stripped)
             if m:
-                consistent_failures.append(m.group("test_path"))
+                shard_str = ""
+                if active and active in results:
+                    info = results[active]
+                    shard_str = f"{info['shard']}/{info['total']}"
+                consistent_failures.append((m.group("test_path"), shard_str))
+
+            # Detect individual PASSED lines for flaky-rerun tracking.
+            m = RE_INDIV_PASSED.search(stripped)
+            if m:
+                last_passed_individual = {
+                    "file": m.group("file"),
+                    "cls": m.group("cls"),
+                    "method": m.group("method"),
+                    "active": active,
+                }
+
+            # When we see 'Test succeeded in new process' after a PASSED
+            # individual test, that test was originally failing in the main
+            # process (CI only falls back to rerun-in-new-process for tests
+            # that crashed or failed) but passed on retry -> flaky.
+            if RE_NEW_PROCESS_SUCCESS.search(stripped) and last_passed_individual:
+                lp = last_passed_individual
+                lp_active = lp.get("active")
+                test_shard = ""
+                if lp_active and lp_active in results:
+                    info = results[lp_active]
+                    test_shard = f"{info['shard']}/{info['total']}"
+                flaky_tests.append({
+                    "file": lp["file"],
+                    "cls": lp["cls"],
+                    "method": lp["method"],
+                    "test_shard": test_shard,
+                })
+                last_passed_individual = None
 
             if active and active in results:
                 for pattern, label in CRASH_PATTERNS:
@@ -203,12 +251,35 @@ def parse_log_file(filepath):
                         if label not in results[active]["crashes"]:
                             results[active]["crashes"].append(label)
 
-    return results, consistent_failures
+    return results, consistent_failures, flaky_tests
 
 
 def scan_logs(logs_dir):
-    """Scan all log files and return all non-passing test file results."""
+    """Scan all log files and return non-passing test file results plus a
+    test-level shard inventory.
+
+    Returns (all_failures, shard_inventory, all_flaky). shard_inventory has
+    one dict per (platform, test_config, job_shard, test_file) combination
+    seen in the logs, with a sorted comma-separated list of the test-level
+    shards observed (e.g. "1/1" or "1/15,2/15,...,15/15"), so consumers can
+    look up the test-level shard for an XML-based failure that only carries
+    the job-level shard. all_flaky lists tests that passed on re-run."""
     all_failures = []
+    all_flaky = []
+    shard_map = defaultdict(set)
+
+    # Pre-compute job-level shard totals per (platform, test_config) by
+    # counting how many log files belong to each group. Log files are
+    # 1-indexed (e.g. rocm1.txt..rocm6.txt for a 6-way sharded job), so
+    # the count == total shards for that CI job.
+    shard_totals = defaultdict(int)
+    for fname in os.listdir(logs_dir):
+        if not fname.endswith(".txt"):
+            continue
+        platform, test_config, shard_num = classify_log_file(fname)
+        if platform is None:
+            continue
+        shard_totals[(platform, test_config)] += 1
 
     for fname in sorted(os.listdir(logs_dir)):
         if not fname.endswith(".txt"):
@@ -218,8 +289,31 @@ def scan_logs(logs_dir):
         if platform is None:
             continue
 
+        job_total = shard_totals.get((platform, test_config), 0)
+        job_shard_str = f"{shard_num}/{job_total}" if job_total else str(shard_num)
+
         filepath = os.path.join(logs_dir, fname)
-        results, consistent_failures = parse_log_file(filepath)
+        results, consistent_failures, flaky_tests = parse_log_file(filepath)
+
+        for ft in flaky_tests:
+            file_part = ft["file"].replace("test/", "").replace(".py", "")
+            all_flaky.append({
+                "log_file": fname,
+                "platform": platform,
+                "test_config": test_config,
+                "test_file": file_part,
+                "test_class": ft["cls"],
+                "test_name": ft["method"],
+                "job_shard": job_shard_str,
+                "test_shard": ft["test_shard"],
+            })
+
+        # Record every (test_file, test_shard) observed in this log file,
+        # including PASSED ones, so the inventory covers the full run.
+        for info in results.values():
+            shard_map[(platform, test_config, job_shard_str, info["test_file"])].add(
+                f"{info['shard']}/{info['total']}"
+            )
 
         for key, info in results.items():
             if info["status"] == "PASSED":
@@ -265,14 +359,15 @@ def scan_logs(logs_dir):
                 "platform": platform,
                 "test_config": test_config,
                 "test_file": info["test_file"],
-                "shard": f"{info['shard']}/{info['total']}",
+                "job_shard": job_shard_str,
+                "test_shard": f"{info['shard']}/{info['total']}",
                 "status": info["status"],
                 "category": "+".join(categories),
                 "reason": reason,
                 "exit_codes": ",".join(str(c) for c in info["exit_codes"]),
             })
 
-        for test_path in consistent_failures:
+        for test_path, shard_str in consistent_failures:
             parts = test_path.split("::")
             file_part = parts[0].replace("test/", "").replace(".py", "")
             test_class = parts[1] if len(parts) > 1 else ""
@@ -283,19 +378,47 @@ def scan_logs(logs_dir):
                 "platform": platform,
                 "test_config": test_config,
                 "test_file": file_part,
-                "shard": "",
+                "job_shard": job_shard_str,
+                "test_shard": shard_str,
                 "status": "FAILED_CONSISTENTLY",
                 "category": "CONSISTENT_FAILURE",
                 "reason": f"{test_class}::{test_name}" if test_class else "",
                 "exit_codes": "",
             })
 
-    return all_failures
+    def _sort_shards(vals):
+        def key(v):
+            try:
+                a, b = v.split("/", 1)
+                return (int(b), int(a))
+            except (ValueError, AttributeError):
+                return (0, 0)
+        return sorted(vals, key=key)
+
+    shard_inventory = [
+        {
+            "platform": platform,
+            "test_config": test_config,
+            "job_shard": job_shard_str,
+            "test_file": test_file,
+            "test_shards": ",".join(_sort_shards(shards)),
+        }
+        for (platform, test_config, job_shard_str, test_file), shards in shard_map.items()
+    ]
+    shard_inventory.sort(key=lambda r: (r["platform"], r["test_config"],
+                                        r["job_shard"], r["test_file"]))
+
+    all_flaky.sort(key=lambda r: (r["platform"], r["test_config"],
+                                  r["job_shard"], r["test_file"],
+                                  r["test_class"], r["test_name"]))
+
+    return all_failures, shard_inventory, all_flaky
 
 
 def write_csv_report(failures, output_path):
     fieldnames = [
-        "log_file", "platform", "test_config", "test_file", "shard",
+        "log_file", "platform", "test_config", "test_file",
+        "job_shard", "test_shard",
         "status", "category", "reason", "exit_codes",
     ]
     with open(output_path, "w", newline="") as f:
@@ -305,6 +428,46 @@ def write_csv_report(failures, output_path):
     print(f"Log failure report: {output_path} ({len(failures)} entries)")
 
 
+def write_shards_report(inventory, output_path):
+    fieldnames = ["platform", "test_config", "job_shard", "test_file", "test_shards"]
+    with open(output_path, "w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(inventory)
+    print(f"Log shard inventory: {output_path} ({len(inventory)} entries)")
+
+
+def write_flaky_report(flaky, output_path):
+    fieldnames = [
+        "log_file", "platform", "test_config", "test_file",
+        "test_class", "test_name", "job_shard", "test_shard",
+    ]
+    with open(output_path, "w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(flaky)
+    print(f"Flaky test report: {output_path} ({len(flaky)} entries)")
+
+
+def _derive_sibling_path(output_path, new_prefix):
+    """Given an output path like '.../log_failures_mi355.csv' and
+    new_prefix='log_shards', return '.../log_shards_mi355.csv'. Otherwise
+    insert '.{new_prefix}' before the extension ('.csv' if there is none)."""
+    d, base = os.path.split(output_path)
+    if base.startswith("log_failures"):
+        return os.path.join(d, new_prefix + base[len("log_failures"):])
+    stem, ext = os.path.splitext(base)
+    return os.path.join(d, f"{stem}.{new_prefix}{ext or '.csv'}")
+
+
+def _derive_shards_path(output_path):
+    return _derive_sibling_path(output_path, "log_shards")
+
+
+def _derive_flaky_path(output_path):
+    return _derive_sibling_path(output_path, "flaky_tests")
+
+
 def print_summary(failures):
     if not failures:
         print("No log-based failures detected.")
@@ -343,9 +506,11 @@ def main():
     )
     args = parser.parse_args()
 
-    failures = scan_logs(args.logs_dir)
+    failures, shard_inventory, flaky_tests = scan_logs(args.logs_dir)
     print_summary(failures)
     write_csv_report(failures, args.output)
+    write_shards_report(shard_inventory, _derive_shards_path(args.output))
+    write_flaky_report(flaky_tests, _derive_flaky_path(args.output))
     return 0 if not failures else 1
 
 
diff --git a/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
index 077a203a44b30..bf100e20d8127 100644
--- a/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
+++ b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
@@ -231,7 +231,13 @@ def safe_float(v):
 
 
 def collect_failed_tests(arch_data, archs, s1_name, s2_name):
-    """Return a list of failed test rows across all architectures."""
+    """Return a list of failed test rows across all architectures.
+
+    Only collects tests where s1 (ROCm) is FAILED. Each entry records shards
+    for both s1 and s2 so the reviewer can look up the failure in either CI
+    job. 'also_failing_in' is populated later once log failures are known so
+    CUDA log-only failures can be included.
+    """
     failed = []
     for arch in archs:
         d = arch_data[arch]
@@ -240,23 +246,85 @@ def collect_failed_tests(arch_data, archs, s1_name, s2_name):
         for r in d['rows']:
             s1 = r[s1_col].strip()
             s2 = r[s2_col].strip() if has_set2 else ''
-            if s1 == 'FAILED' or s2 == 'FAILED':
-                shard = r.get(f'shard_{s1_name}', '') if s1 == 'FAILED' else r.get(f'shard_{s2_name}', '')
+            if s1 == 'FAILED':
                 entry = {
                     'arch': arch,
                     'test_file': r.get('test_file', ''),
                     'test_class': r.get('test_class', ''),
                     'test_name': r.get('test_name', ''),
                     'test_config': r.get('test_config', ''),
-                    'shard': shard,
+                    f'shard_{s1_name}': r.get(f'shard_{s1_name}', ''),
                     f'status_{s1_name}': s1,
                 }
                 if has_set2:
+                    entry[f'shard_{s2_name}'] = r.get(f'shard_{s2_name}', '')
                     entry[f'status_{s2_name}'] = s2
                 failed.append(entry)
+
     return failed
 
 
+def _add_cross_arch_info(failed_tests, log_failures, s2_name):
+    """Populate 'also_failing_in' on each XML failure entry (mutated in place).
+
+    Adds the other archs whose XML results fail the same test, plus s2_name if an
+    s2 log failure matches; test_file is normalized so 'a.b.c' matches 'a/b/c'.
+    """
+    from collections import defaultdict
+    by_tuple = defaultdict(set)
+    for t in failed_tests:
+        key = (_norm_test_file(t['test_file']), t['test_class'], t['test_name'])
+        by_tuple[key].add(t['arch'])
+
+    cuda_log_tuples = set()
+    for lf in log_failures or []:
+        if lf.get('platform', '') == s2_name:
+            test_class, test_name = _parse_log_failure_names(lf)
+            cuda_log_tuples.add((_norm_test_file(lf.get('test_file', '')), test_class, test_name))
+
+    for t in failed_tests:
+        key = (_norm_test_file(t['test_file']), t['test_class'], t['test_name'])
+        others = sorted(a for a in by_tuple[key] if a != t['arch'])
+        if key in cuda_log_tuples and s2_name not in others:
+            others.append(s2_name)
+        t['also_failing_in'] = ', '.join(others)
+
+
+def _add_log_failure_cross_arch(log_failures, failed_tests, s1_name, s2_name):
+    """Populate 'also_failing_in' on each log failure entry (mutated in place).
+
+    Cross-references other archs where the same test fails (as an s1 log failure
+    or an XML failure), plus s2_name if an s2 log failure matches. test_file is
+    normalized so XML-style 'a.b.c' and log-style 'a/b/c' keys compare equal.
+    """
+    from collections import defaultdict
+    by_tuple_archs = defaultdict(set)
+
+    for lf in log_failures or []:
+        if lf.get('platform', '') == s1_name:
+            test_class, test_name = _parse_log_failure_names(lf)
+            key = (_norm_test_file(lf.get('test_file', '')), test_class, test_name)
+            by_tuple_archs[key].add(lf.get('arch', ''))
+    for t in failed_tests or []:
+        key = (_norm_test_file(t['test_file']), t['test_class'], t['test_name'])
+        by_tuple_archs[key].add(t['arch'])
+
+    cuda_log_tuples = set()
+    for lf in log_failures or []:
+        if lf.get('platform', '') == s2_name:
+            test_class, test_name = _parse_log_failure_names(lf)
+            cuda_log_tuples.add((_norm_test_file(lf.get('test_file', '')), test_class, test_name))
+
+    for lf in log_failures or []:
+        test_class, test_name = _parse_log_failure_names(lf)
+        key = (_norm_test_file(lf.get('test_file', '')), test_class, test_name)
+        arch = lf.get('arch', '')
+        others = sorted(a for a in by_tuple_archs[key] if a and a != arch)
+        if key in cuda_log_tuples and s2_name not in others:
+            others.append(s2_name)
+        lf['also_failing_in'] = ', '.join(others)
+
+
 def load_log_failures(filepaths):
     """Load log failure CSVs from detect_log_failures.py.
 
@@ -277,6 +345,110 @@ def load_log_failures(filepaths):
     return entries
 
 
+def load_flaky_tests_as_log_failures(filepaths):
+    """Load flaky_tests_<arch>.csv and return entries shaped like log-failure rows.
+
+    Each returned dict has the same schema as the entries produced by
+    load_log_failures, with category='FLAKY' and reason='<test_class>::<test_name>',
+    so they can be appended to the log_failures list and surfaced in the
+    LOG-BASED FAILURES table alongside crashes/timeouts/etc.
+    """
+    entries = []
+    for fp in filepaths or []:
+        if not fp:
+            continue
+        basename = os.path.basename(fp)
+        if not (basename.startswith('log_failures_') and basename.endswith('.csv')):
+            continue
+        arch = basename[len('log_failures_'):-len('.csv')]
+        flaky_path = os.path.join(
+            os.path.dirname(fp),
+            'flaky_tests_' + basename[len('log_failures_'):],
+        )
+        if not os.path.isfile(flaky_path):
+            continue
+        with open(flaky_path, newline='') as f:
+            for row in csv.DictReader(f):
+                test_class = row.get('test_class', '')
+                test_name = row.get('test_name', '')
+                entries.append({
+                    'arch': arch,
+                    'log_file': row.get('log_file', ''),
+                    'platform': row.get('platform', ''),
+                    'test_config': row.get('test_config', ''),
+                    'test_file': row.get('test_file', ''),
+                    'job_shard': row.get('job_shard', ''),
+                    'test_shard': row.get('test_shard', ''),
+                    'status': 'FLAKY',
+                    'category': 'FLAKY',
+                    'reason': f'{test_class}::{test_name}' if test_class else test_name,
+                    'exit_codes': '',
+                })
+    return entries
+
+
+def load_log_shards(filepaths):
+    """Load log shard inventory CSVs written alongside log_failures files.
+
+    For each log_failures_<arch>.csv, looks for a sibling log_shards_<arch>.csv
+    and returns a lookup dict:
+        (arch, platform, test_config, job_shard, normalized_test_file) -> test_shards_str
+
+    The CSV is produced by detect_log_failures.py and records every
+    (test_file, test_shard) pair observed per job-level shard. If an XML-based
+    failure's key matches, we can back-fill the test-level shard value.
+    """
+    lookup = {}
+    for fp in filepaths:
+        if not fp:
+            continue
+        basename = os.path.basename(fp)
+        arch = ''
+        if basename.startswith('log_failures_') and basename.endswith('.csv'):
+            arch = basename[len('log_failures_'):-len('.csv')]
+            shards_path = os.path.join(
+                os.path.dirname(fp),
+                'log_shards_' + basename[len('log_failures_'):],
+            )
+        else:
+            continue
+        if not os.path.isfile(shards_path):
+            continue
+        with open(shards_path, newline='') as f:
+            for row in csv.DictReader(f):
+                key = (arch, row.get('platform', ''), row.get('test_config', ''),
+                       row.get('job_shard', ''),
+                       _norm_test_file(row.get('test_file', '')))
+                lookup[key] = row.get('test_shards', '')
+    return lookup
+
+
+def _format_test_shards(shards_str):
+    """Collapse a test_shards inventory string into a compact display value.
+
+    - '' -> ''
+    - '1/1' -> '1/1'
+    - '3/14' -> '3/14'
+    - '1/14,6/14,12/14' -> '1,6,12/14' (multiple test-level shards observed)
+    - mixed totals fall back to the raw string."""
+    if not shards_str:
+        return ''
+    parts = [p for p in shards_str.split(',') if p]
+    if len(parts) == 1:
+        return parts[0]
+    totals = set()
+    nums = []
+    for p in parts:
+        if '/' not in p:
+            return shards_str
+        a, b = p.split('/', 1)
+        totals.add(b)
+        nums.append(a)
+    if len(totals) == 1:
+        return f"{','.join(nums)}/{totals.pop()}"
+    return shards_str
+
+
 def fmt_val(v):
     if isinstance(v, int):
         return f'{v:,}'
@@ -325,6 +497,17 @@ def build_rows(args, archs, arch_data):
     return out
 
 
+def _norm_test_file(path):
+    """Normalize a test_file string so XML-sourced ('a.b.c') and log-sourced
+    ('a/b/c') forms compare equal. Also strips a trailing .py if present."""
+    if not path:
+        return ''
+    s = path.replace('/', '.')
+    if s.endswith('.py'):
+        s = s[:-3]
+    return s
+
+
 def _parse_log_failure_names(lf):
     """Extract test_class and test_name from a log failure's reason field.
 
@@ -339,7 +522,7 @@ def _parse_log_failure_names(lf):
     return parts[0], parts[1]
 
 
-def write_csv(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_name='set2', has_set2=True, log_failures=None):
+def write_csv(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_name='set2', has_set2=True, log_failures=None, shard_lookup=None):
     csv_rows = []
     csv_rows.append([''] + list(archs))
     for label, vals in rows:
@@ -352,41 +535,87 @@ def write_csv(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_na
             csv_rows.append([label] + list(vals))
     csv_rows.append([])
 
-    if failed_tests:
+    s1_failed = [t for t in (failed_tests or []) if t.get(f'status_{s1_name}') == 'FAILED']
+
+    shard_lookup = shard_lookup or {}
+
+    def _xml_test_shard(t, platform):
+        key = (t.get('arch', ''), platform, t.get('test_config', ''),
+               t.get(f'shard_{platform}', ''),
+               _norm_test_file(t.get('test_file', '')))
+        return _format_test_shards(shard_lookup.get(key, ''))
+
+    if s1_failed:
         csv_rows.append(['FAILED TESTS'])
         header = ['Arch', 'Test Config', 'Test File', 'Test Class',
-                  'Test Name', 'Shard', f'Status ({s1_name})']
+                  'Test Name',
+                  f'Job-Level Shard ({s1_name})',
+                  f'Test-Level Shard ({s1_name})']
+        if has_set2:
+            header.append(f'Job-Level Shard ({s2_name})')
+            header.append(f'Test-Level Shard ({s2_name})')
+        header.append(f'Status ({s1_name})')
         if has_set2:
             header.append(f'Status ({s2_name})')
+        header.append('Also Failing In')
         csv_rows.append(header)
-        for t in failed_tests:
+        for t in s1_failed:
             row = [t['arch'], t['test_config'], t['test_file'],
-                   t['test_class'], t['test_name'], t.get('shard', ''),
-                   t[f'status_{s1_name}']]
+                   t['test_class'], t['test_name'],
+                   t.get(f'shard_{s1_name}', ''),
+                   _xml_test_shard(t, s1_name)]
+            if has_set2:
+                row.append(t.get(f'shard_{s2_name}', ''))
+                row.append(_xml_test_shard(t, s2_name))
+            row.append(t[f'status_{s1_name}'])
             if has_set2:
                 row.append(t.get(f'status_{s2_name}', ''))
+            row.append(t.get('also_failing_in', ''))
             csv_rows.append(row)
         csv_rows.append([])
 
     if log_failures:
-        csv_rows.append(['LOG-BASED FAILURES (not in XML)'])
-        csv_rows.append(['Arch', 'Platform', 'Test Config', 'Test File', 'Test Class', 'Test Name', 'Shard', 'Category', 'Log File'])
+        xml_failed_keys = {
+            (t['arch'], _norm_test_file(t['test_file']), t['test_class'], t['test_name'])
+            for t in (failed_tests or [])
+        }
+        rocm_log_failures = []
         for lf in log_failures:
+            if lf.get('platform', '') != s1_name:
+                continue
             test_class, test_name = _parse_log_failure_names(lf)
-            csv_rows.append([
-                lf.get('arch', ''), lf.get('platform', ''), lf.get('test_config', ''),
-                lf.get('test_file', ''), test_class, test_name,
-                lf.get('shard', ''), lf.get('category', ''),
-                lf.get('log_file', ''),
-            ])
-        csv_rows.append([])
+            key = (lf.get('arch', ''), _norm_test_file(lf.get('test_file', '')),
+                   test_class, test_name)
+            # Skip entries already present in the XML-based FAILED TESTS table
+            # to avoid double-counting the same failure, except for FLAKY
+            # entries which represent an independent signal (a rerun passed).
+            if key in xml_failed_keys and lf.get('category', '') != 'FLAKY':
+                continue
+            rocm_log_failures.append(lf)
+        if rocm_log_failures:
+            csv_rows.append(['LOG-BASED FAILURES (not in XML)'])
+            csv_rows.append(['Arch', 'Platform', 'Test Config', 'Test File', 'Test Class',
+                             'Test Name', 'Job-Level Shard', 'Test-Level Shard',
+                             'Category', 'Also Failing In', 'Log File'])
+            for lf in rocm_log_failures:
+                test_class, test_name = _parse_log_failure_names(lf)
+                csv_rows.append([
+                    lf.get('arch', ''), lf.get('platform', ''), lf.get('test_config', ''),
+                    lf.get('test_file', ''), test_class, test_name,
+                    lf.get('job_shard', ''),
+                    lf.get('test_shard', lf.get('shard', '')),
+                    lf.get('category', ''),
+                    lf.get('also_failing_in', ''),
+                    lf.get('log_file', ''),
+                ])
+            csv_rows.append([])
 
     with open(output_path, 'w', newline='') as f:
         csv.writer(f).writerows(csv_rows)
     print(f'CSV written to {output_path}')
 
 
-def write_markdown(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_name='set2', has_set2=True, log_failures=None):
+def write_markdown(rows, archs, output_path, failed_tests=None, s1_name='set1', s2_name='set2', has_set2=True, log_failures=None, shard_lookup=None):
     lines = []
     current_section = []
 
@@ -417,22 +646,44 @@ def flush_table():
 
     flush_table()
 
-    if failed_tests:
-        lines.append('### FAILED TESTS')
+    s1_failed = [t for t in (failed_tests or []) if t.get(f'status_{s1_name}') == 'FAILED']
+
+    shard_lookup = shard_lookup or {}
+
+    def _xml_test_shard(t, platform):
+        key = (t.get('arch', ''), platform, t.get('test_config', ''),
+               t.get(f'shard_{platform}', ''),
+               _norm_test_file(t.get('test_file', '')))
+        return _format_test_shards(shard_lookup.get(key, ''))
+
+    cols = ['Arch', 'Test Config', 'Test File', 'Test Class', 'Test Name',
+            f'Job-Level Shard ({s1_name})',
+            f'Test-Level Shard ({s1_name})']
+    if has_set2:
+        cols.append(f'Job-Level Shard ({s2_name})')
+        cols.append(f'Test-Level Shard ({s2_name})')
+    cols.append(f'Status ({s1_name})')
+    if has_set2:
+        cols.append(f'Status ({s2_name})')
+    cols.append('Also Failing In')
+
+    if s1_failed:
+        lines.append(f'### FAILED TESTS ({len(s1_failed)})')
         lines.append('')
-        cols = ['Arch', 'Test Config', 'Test File', 'Test Class', 'Test Name',
-                'Shard', f'Status ({s1_name})']
-        if has_set2:
-            cols.append(f'Status ({s2_name})')
         lines.append('| ' + ' | '.join(cols) + ' |')
         lines.append('| ' + ' | '.join(['---'] * len(cols)) + ' |')
-        for t in failed_tests:
+        for t in s1_failed:
             line = (f"| {t['arch']} | {t['test_config']} | {t['test_file']} "
                     f"| {t['test_class']} | {t['test_name']} "
-                    f"| {t.get('shard', '')} | {t[f'status_{s1_name}']}")
+                    f"| {t.get(f'shard_{s1_name}', '')} "
+                    f"| {_xml_test_shard(t, s1_name)}")
+            if has_set2:
+                line += f" | {t.get(f'shard_{s2_name}', '')}"
+                line += f" | {_xml_test_shard(t, s2_name)}"
+            line += f" | {t[f'status_{s1_name}']}"
             if has_set2:
                 line += f" | {t.get(f'status_{s2_name}', '')}"
-            line += ' |'
+            line += f" | {t.get('also_failing_in', '')} |"
             lines.append(line)
         lines.append('')
     else:
@@ -442,22 +693,40 @@ def flush_table():
         lines.append('')
 
     if log_failures:
-        lines.append('### LOG-BASED FAILURES (not in XML)')
-        lines.append('')
-        lines.append('These test failures were detected from CI log files but have no XML report')
-        lines.append('(typically due to timeouts, crashes, or process kills).')
-        lines.append('')
-        lines.append('| Arch | Platform | Test Config | Test File | Test Class | Test Name | Shard | Category |')
-        lines.append('| --- | --- | --- | --- | --- | --- | --- | --- |')
+        xml_failed_keys = {
+            (t['arch'], _norm_test_file(t['test_file']), t['test_class'], t['test_name'])
+            for t in (failed_tests or [])
+        }
+        rocm_log_failures = []
         for lf in log_failures:
+            if lf.get('platform', '') != s1_name:
+                continue
             test_class, test_name = _parse_log_failure_names(lf)
-            lines.append(
-                f"| {lf.get('arch', '')} | {lf.get('platform', '')} | {lf.get('test_config', '')} "
-                f"| {lf.get('test_file', '')} | {test_class} "
-                f"| {test_name} | {lf.get('shard', '')} "
-                f"| {lf.get('category', '')} |"
-            )
-        lines.append('')
+            key = (lf.get('arch', ''), _norm_test_file(lf.get('test_file', '')),
+                   test_class, test_name)
+            if key in xml_failed_keys and lf.get('category', '') != 'FLAKY':
+                continue
+            rocm_log_failures.append(lf)
+        if rocm_log_failures:
+            lines.append(f'### LOG-BASED FAILURES (not in XML) ({len(rocm_log_failures)})')
+            lines.append('')
+            lines.append('These test failures were detected from CI log files but have no XML report')
+            lines.append('(typically due to timeouts, crashes, or process kills).')
+            lines.append('')
+            lines.append('| Arch | Platform | Test Config | Test File | Test Class | Test Name | Job-Level Shard | Test-Level Shard | Category | Also Failing In |')
+            lines.append('| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |')
+            for lf in rocm_log_failures:
+                test_class, test_name = _parse_log_failure_names(lf)
+                lines.append(
+                    f"| {lf.get('arch', '')} | {lf.get('platform', '')} | {lf.get('test_config', '')} "
+                    f"| {lf.get('test_file', '')} | {test_class} "
+                    f"| {test_name} "
+                    f"| {lf.get('job_shard', '')} "
+                    f"| {lf.get('test_shard', lf.get('shard', ''))} "
+                    f"| {lf.get('category', '')} "
+                    f"| {lf.get('also_failing_in', '')} |"
+                )
+            lines.append('')
 
     md = '\n'.join(lines)
     with open(output_path, 'w') as f:
@@ -487,13 +756,19 @@ def main():
     failed = collect_failed_tests(arch_data, archs, args.set1_name, args.set2_name)
     any_has_set2 = any(d.get('has_set2', True) for d in arch_data.values())
     log_failures = load_log_failures(args.log_failures) if args.log_failures else []
+    if args.log_failures:
+        log_failures.extend(load_flaky_tests_as_log_failures(args.log_failures))
+    shard_lookup = load_log_shards(args.log_failures) if args.log_failures else {}
+
+    _add_cross_arch_info(failed, log_failures, args.set2_name)
+    _add_log_failure_cross_arch(log_failures, failed, args.set1_name, args.set2_name)
 
     output_base = args.output
     if output_base.endswith('.csv') or output_base.endswith('.md'):
         output_base = output_base.rsplit('.', 1)[0]
 
-    write_csv(data_rows, archs, f'{output_base}.csv', failed, args.set1_name, args.set2_name, has_set2=any_has_set2, log_failures=log_failures)
-    write_markdown(data_rows, archs, f'{output_base}.md', failed, args.set1_name, args.set2_name, has_set2=any_has_set2, log_failures=log_failures)
+    write_csv(data_rows, archs, f'{output_base}.csv', failed, args.set1_name, args.set2_name, has_set2=any_has_set2, log_failures=log_failures, shard_lookup=shard_lookup)
+    write_markdown(data_rows, archs, f'{output_base}.md', failed, args.set1_name, args.set2_name, has_set2=any_has_set2, log_failures=log_failures, shard_lookup=shard_lookup)
 
 
 if __name__ == '__main__':
diff --git a/.github/workflows/parity.yml b/.github/workflows/parity.yml
index 0e4147f25c5b7..b47049ca03f16 100644
--- a/.github/workflows/parity.yml
+++ b/.github/workflows/parity.yml
@@ -322,7 +322,7 @@ jobs:
           ARCH_ARGS=()
           for ARCH in $ARCHS; do
             ARTIFACT_DIR="../../artifacts/${PREFIX}-results-${ARCH}"
-            CSV=$(find "$ARTIFACT_DIR"/ -maxdepth 2 -name "*.csv" ! -name "*_running_time*" ! -name "*_summary*" ! -name "log_failures_*" 2>/dev/null | head -1)
+            CSV=$(find "$ARTIFACT_DIR"/ -maxdepth 2 -name "*.csv" ! -name "*_running_time*" ! -name "*_summary*" ! -name "log_failures_*" ! -name "log_shards_*" ! -name "flaky_tests_*" 2>/dev/null | head -1)
             if [ -z "$CSV" ]; then
               echo "WARNING: No CSV found for $ARCH, skipping"
               continue

From f40195474a39221ee6bc39ede4994429c41e1aa9 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Thu, 30 Apr 2026 02:32:11 +0000
Subject: [PATCH 36/43] Ensure one of pr_id/sha1 is provided to
 download_testlogs

---
 .../pytorch-unit-test-scripts/download_testlogs               | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
index d391a7c7c10c5..617a2df53ccf5 100755
--- a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
+++ b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
@@ -485,7 +485,7 @@ def main():
     token = os.getenv('GITHUB_TOKEN', '...')
     global authentication_headers
     authentication_headers = {'Authorization': f'token {token}'}
-    if args.pr_id and args.sha1:
+    if (args.pr_id and args.sha1) or (not args.pr_id and not args.sha1):
         error_msg = "Error: Please provide either pr_id or sha!"
         print(error_msg)
         sys.exit(1)
@@ -496,7 +496,7 @@ def main():
         sha = args.sha1
         pr_id = None
     status = "success"
-    print(sha)
+    print(f"sha: {sha}")
 
     # When comparing two commits, prefix log filenames with short SHAs
     if args.baseline_sha:

From 2479e0edd45613d4920666eeb08ca388cc250bb7 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Mon, 4 May 2026 14:15:36 -0700
Subject: [PATCH 37/43] download_testlogs: use CUDA OSDC test names (#3199)

Repro job without this PR's change:
https://github.com/ROCm/pytorch/actions/runs/25342470426/job/74303089638

Validation run with this PR's change:
https://github.com/ROCm/pytorch/actions/runs/25342235984

Current issue: existing testing is not able to pick up the CUDA
artifacts because the CUDA job and artifact names changed from `test` to
`test-osdc` for default and distributed shards.

Repro inputs: `sha=b1b5b61ddb689ea65aab0915ecfac5cc459b92fb`,
`arch=mi355`, `skip_rocm=false`, `csv_name=pr3199-pre-change-repro`.

CUDA job names now use `test-osdc` for default and distributed shards,
for example:

`linux-jammy-cuda13.0-py3.10-gcc11 / test-osdc (default, 1, 5, ...)`
`linux-jammy-cuda13.0-py3.10-gcc11 / test-osdc (distributed, 1, 3, ...)`

CUDA artifact names now look like:

`test-reports-test-osdc-default-1-5`
`test-reports-test-osdc-distributed-1-3`
---
 .../download_testlogs                         | 46 +++++++++++--------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
index 617a2df53ccf5..b40c4e5f2578c 100755
--- a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
+++ b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
@@ -210,8 +210,10 @@ def _shorten_unzipped_dirs():
 
     Converts names like:
       unzipped-test-reports-runattempt1-test-default-1-6-linux.rocm.gpu.gfx942.1_68613413431.zip
+      unzipped-test-reports-runattempt1-test-osdc-default-1-5-mt-l-x86aavx2-29-113-l4_73385044118.zip
     to:
       test-default-1-6
+      test-default-1-5
 
     Preserves the 'test-<config>' prefix so that summarize_xml_testreports.py
     can still detect workflow type via substring matching.
@@ -220,9 +222,9 @@ def _shorten_unzipped_dirs():
     for d in sorted(Path(".").glob("unzipped-*")):
         if not d.is_dir():
             continue
-        m = re.search(r'(test-\w+-\d+-\d+)', d.name)
+        m = re.search(r'test-(?:osdc-)?(default|distributed|inductor)-(\d+)-(\d+)', d.name)
         if m:
-            short_name = m.group(1)
+            short_name = f"test-{m.group(1)}-{m.group(2)}-{m.group(3)}"
             if not Path(short_name).exists():
                 d.rename(short_name)
                 print(f"  Renamed {d.name} -> {short_name}")
@@ -662,6 +664,7 @@ def main():
 
     if not args.no_cuda:
         cuda_job_prefix = "linux-jammy-cuda13.0-py3.10-gcc11"
+        cuda_test_job_kind = "test-osdc"
         print("==========================================")
         print(f"Finding CUDA tests in workflow '{CUDAWorkflowNames['default']}' by sha: {sha}")
         print("==========================================")
@@ -686,7 +689,10 @@ def main():
 
         for run in trunk_runs:
             jobs = get_workflow_jobs(run)
-            test_jobs = [j for j in jobs if cuda_job_prefix in j['name'] and '/ test' in j['name']]
+            test_jobs = [
+                j for j in jobs
+                if cuda_job_prefix in j['name'] and f'/ {cuda_test_job_kind} (' in j['name']
+            ]
             if test_jobs:
                 trunk_wf = run
                 all_cuda_jobs = jobs
@@ -699,7 +705,7 @@ def main():
             # by the jobs API. Use check-runs API to find the actual run.
             print("No CUDA test jobs in any trunk run's jobs API, trying check-runs API...")
             check_runs = get_check_runs_for_commit(sha, cuda_job_prefix)
-            cuda_test_jobs = [cr for cr in check_runs if '/ test' in cr['name']]
+            cuda_test_jobs = [cr for cr in check_runs if f'/ {cuda_test_job_kind} (' in cr['name']]
             if cuda_test_jobs:
                 # Extract the actual workflow run ID from the check-run details URL
                 import re as _re
@@ -737,18 +743,18 @@ def main():
         # Download logs
         if not args.artifacts_only:
             test_log_list_cuda_default = [
-              ["cuda1.txt", f"{cuda_job_prefix} / test (default, 1, 5"],
-              ["cuda2.txt", f"{cuda_job_prefix} / test (default, 2, 5"],
-              ["cuda3.txt", f"{cuda_job_prefix} / test (default, 3, 5"],
-              ["cuda4.txt", f"{cuda_job_prefix} / test (default, 4, 5"],
-              ["cuda5.txt", f"{cuda_job_prefix} / test (default, 5, 5"],
+              ["cuda1.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (default, 1, 5"],
+              ["cuda2.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (default, 2, 5"],
+              ["cuda3.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (default, 3, 5"],
+              ["cuda4.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (default, 4, 5"],
+              ["cuda5.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (default, 5, 5"],
             ]
             test_log_list_cuda = test_log_list_cuda_default
             if not args.exclude_distributed:
                 test_log_list_cuda_distributed = [
-                  ["cuda_dist1.txt", f"{cuda_job_prefix} / test (distributed, 1, 3"],
-                  ["cuda_dist2.txt", f"{cuda_job_prefix} / test (distributed, 2, 3"],
-                  ["cuda_dist3.txt", f"{cuda_job_prefix} / test (distributed, 3, 3"],
+                  ["cuda_dist1.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (distributed, 1, 3"],
+                  ["cuda_dist2.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (distributed, 2, 3"],
+                  ["cuda_dist3.txt", f"{cuda_job_prefix} / {cuda_test_job_kind} (distributed, 3, 3"],
                 ]
                 test_log_list_cuda += test_log_list_cuda_distributed
 
@@ -756,11 +762,11 @@ def main():
 
         # Download artifacts
         test_artifacts_list_cuda_default = [
-          "test-reports-test-default-1-5",
-          "test-reports-test-default-2-5",
-          "test-reports-test-default-3-5",
-          "test-reports-test-default-4-5",
-          "test-reports-test-default-5-5",
+          "test-reports-test-osdc-default-1-5",
+          "test-reports-test-osdc-default-2-5",
+          "test-reports-test-osdc-default-3-5",
+          "test-reports-test-osdc-default-4-5",
+          "test-reports-test-osdc-default-5-5",
         ]
 
         test_artifacts_list_cuda = []
@@ -769,9 +775,9 @@ def main():
 
         if not args.exclude_distributed:
             test_artifacts_list_cuda_distributed = [
-              "test-reports-test-distributed-1-3",
-              "test-reports-test-distributed-2-3",
-              "test-reports-test-distributed-3-3",
+              "test-reports-test-osdc-distributed-1-3",
+              "test-reports-test-osdc-distributed-2-3",
+              "test-reports-test-osdc-distributed-3-3",
             ]
             test_artifacts_list_cuda += test_artifacts_list_cuda_distributed
 

From 467069a6d9202d6218436879025a1a3470c7401e Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Tue, 12 May 2026 12:16:34 -0700
Subject: [PATCH 38/43] [CI] Update MI355 parity shard counts (#3213)

## Summary
- Update MI355 parity report shard counts to match current CI artifacts.
- Change default shards from 6 to 10 and distributed shards from 3 to 4.

## Validation
* Combined parity workflow for
`5b9a4786ea4b1a6170c6e5a4878269e7f591224b` on `mi300, mi355`:
<https://github.com/ROCm/pytorch/actions/runs/25738157290>

---------

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
---
 .../download_testlogs                         | 51 +++++++++++++------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
index b40c4e5f2578c..480336a4e9aeb 100755
--- a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
+++ b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
@@ -459,7 +459,7 @@ def main():
         "nightly": {"default": 6, "distributed": 3, "inductor": 2},
         "mi200": {"default": 6, "distributed": 3, "inductor": 2},
         "mi300": {"default": 6, "distributed": 3, "inductor": 2},
-        "mi355": {"default": 6, "distributed": 3, "inductor": 2},
+        "mi355": {"default": 10, "distributed": 4, "inductor": 2}, #trunk
         "navi31": {"default": 2, "distributed": 3, "inductor": 2},
     }
     rocm_job_prefix = rocm_job_prefixes[arch]
@@ -482,7 +482,7 @@ def main():
     if not args.no_cuda:
         print(f"Using CUDA workflows: {CUDAWorkflowNames}")
     print(f"Using ROCm job prefixes: {rocm_job_prefix}")
-    print(f"Using ROCm shard counts: {rocm_shards}")
+    print(f"Using initial ROCm shard counts (may be updated based on actual workflow used): {rocm_shards}")
 
     token = os.getenv('GITHUB_TOKEN', '...')
     global authentication_headers
@@ -546,9 +546,14 @@ def main():
         # If the ROCm distributed logs aren't found you might want to check the HUD for the correct tags
         # HUD link: https://hud.pytorch.org/hud/pytorch/pytorch/main/1?per_page=50&name_filter=rocm
         # Make sure "Hide unstable jobs" is unselected, in case ROCm jobs are marked as unstable
+    
+        if arch == "mi355":
+            dist_shards = 3 if not periodic_fallback_used else rocm_shards["distributed"]
+        else:
+            dist_shards = rocm_shards["distributed"]
+        print(f"Using final ROCm shard count {dist_shards} for distributed")
 
         if not args.artifacts_only:
-            dist_shards = rocm_shards["distributed"]
             test_log_list_rocm_distributed = [
                 [f"{current_prefix}rocm_dist{i}.txt", f"{dist_job_prefix} / test (distributed, {i}, {dist_shards}"]
                 for i in range(1, dist_shards + 1)
@@ -556,7 +561,6 @@ def main():
             download_logs(periodic_wf, test_log_list_rocm_distributed, folder_list[0])
 
         # Download artifacts
-        dist_shards = rocm_shards["distributed"]
         test_artifacts_list_rocm_distributed = [
             f"test-reports-test-distributed-{i}-{dist_shards}"
             for i in range(1, dist_shards + 1)
@@ -602,8 +606,13 @@ def main():
         # Download logs
         # If logs aren't found you might want to check the HUD for the correct tags
         # HUD link: https://hud.pytorch.org/hud/pytorch/pytorch/main/1?per_page=50&name_filter=rocm
-        if not args.artifacts_only:
+        if arch == "mi355":
+            default_shards = 6 if default_fallback_used else rocm_shards["default"]
+        else:
             default_shards = rocm_shards["default"]
+        print(f"Using final ROCm shard count {default_shards} for default")
+
+        if not args.artifacts_only:
             test_log_list_rocm_default = [
               [f"{current_prefix}rocm{i}.txt", f"{rocm_job_prefix['default']} / test (default, {i}, {default_shards}"]
               for i in range(1, default_shards + 1)
@@ -611,7 +620,6 @@ def main():
             download_logs(rocm_wf, test_log_list_rocm_default, folder_list[0])
 
         # Download artifacts
-        default_shards = rocm_shards["default"]
         test_artifacts_list_rocm_default = [
           f"test-reports-test-default-{i}-{default_shards}"
           for i in range(1, default_shards + 1)
@@ -639,9 +647,11 @@ def main():
 
         folder_list = get_or_create_test_folder(inductor_wf_rocm)
 
+        inductor_shards = rocm_shards["inductor"]
+        print(f"Using final ROCm shard count {inductor_shards} for inductor")
+    
         # Download logs
         if not args.artifacts_only:
-          inductor_shards = rocm_shards["inductor"]
           test_log_list_rocm_inductor = [
             [f"{current_prefix}rocm_inductor{i}.txt", f"{rocm_job_prefix['inductor']} / test (inductor, {i}, {inductor_shards}"]
             for i in range(1, inductor_shards + 1)
@@ -649,7 +659,6 @@ def main():
           download_logs(inductor_wf_rocm, test_log_list_rocm_inductor, folder_list[0])
 
         #Download artifacts
-        inductor_shards = rocm_shards["inductor"]
         test_artifacts_list_rocm_inductor = [
           f"test-reports-test-inductor-{i}-{inductor_shards}"
           for i in range(1, inductor_shards + 1)
@@ -802,26 +811,38 @@ def main():
             inductor_wf_cuda = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=CUDAWorkflowNames["inductor"], sha=inductor_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
             print(f"Using workflow '{CUDAWorkflowNames['inductor']}' with id:{inductor_wf_cuda['id']} for CUDA inductor")
 
+            inductor_cuda_jobs = get_workflow_jobs(inductor_wf_cuda)
+            cuda_inductor_test_jobs = [
+                j for j in inductor_cuda_jobs
+                if "unit-test / inductor-test / test-osdc (inductor," in j['name']
+            ]
+            cuda_inductor_job_ids = [str(j['id']) for j in cuda_inductor_test_jobs]
+            cuda_inductor_artifact_substrings = (
+                [f"_{jid}" for jid in cuda_inductor_job_ids]
+                if cuda_inductor_job_ids
+                else None
+            )
+            print(f"Found {len(cuda_inductor_test_jobs)} CUDA inductor OSDC test jobs")
+
             folder_list = get_or_create_test_folder(inductor_wf_cuda)
 
             # Download logs
             if not args.artifacts_only:
               test_log_list_cuda_inductor = [
-                ["cuda_inductor1.txt", "unit-test / inductor-test / test (inductor, 1, 2"],
-                ["cuda_inductor2.txt", "unit-test / inductor-test / test (inductor, 2, 2"],
+                ["cuda_inductor1.txt", "unit-test / inductor-test / test-osdc (inductor, 1, 2"],
+                ["cuda_inductor2.txt", "unit-test / inductor-test / test-osdc (inductor, 2, 2"],
               ]
-              download_logs(inductor_wf_cuda, test_log_list_cuda_inductor, folder_list[0])
+              download_logs(inductor_wf_cuda, test_log_list_cuda_inductor, folder_list[0], jobs=inductor_cuda_jobs)
 
             test_artifacts_list_cuda_inductor = [
-              "test-reports-test-inductor-1-2",
-              "test-reports-test-inductor-2-2"
+              "test-reports-test-osdc-inductor-1-2",
+              "test-reports-test-osdc-inductor-2-2"
             ]
-            # Inductor workflow is separate, use nvidia.gpu filter (no duplicate CUDA versions)
             download_artifacts(
                 inductor_wf_cuda,
                 test_artifacts_list_cuda_inductor,
                 test_folder=folder_list[1],
-                allowed_substrings=["nvidia.gpu"],
+                allowed_substrings=cuda_inductor_artifact_substrings,
             )
             os.chdir("..")
 

From 73d3e635542b32e7bc9df3b033d2eaa5be5cb4f7 Mon Sep 17 00:00:00 2001
From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Date: Thu, 14 May 2026 06:23:17 -0500
Subject: [PATCH 39/43] Update Github token used for parity.yml (#3224)

## Motivation

The old IFU_GITHUB_TOKEN [seems to have
expired](https://github.com/ROCm/pytorch/actions/runs/25856299592/job/75974982737).

## Technical Details

Replace with PARITY_GITHUB_TOKEN (meant specifically for this workflow)

## Test Plan

Run parity.yml with this PR branch and check whether it still gives a
credential error.

## Test Result

"Download artifacts" step succeeded in
https://github.com/ROCm/pytorch/actions/runs/25857211908/job/75978008711

## Submission Checklist

- [x] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
---
 .github/workflows/parity.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/parity.yml b/.github/workflows/parity.yml
index b47049ca03f16..5f88548712818 100644
--- a/.github/workflows/parity.yml
+++ b/.github/workflows/parity.yml
@@ -135,7 +135,7 @@ jobs:
       - name: Download artifacts
         working-directory: .automation_scripts/pytorch-unit-test-scripts
         env:
-          GITHUB_TOKEN: ${{ secrets.IFU_GITHUB_TOKEN }}
+          GITHUB_TOKEN: ${{ secrets.PARITY_GITHUB_TOKEN }}
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
         run: |

From 1d7407964415e2d9f5ebf4feb98dd9a2bf0efd31 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Thu, 14 May 2026 15:53:30 -0700
Subject: [PATCH 40/43] download_testlogs: select CUDA test artifact kind
 dynamically (#3226)

## Summary
- Select the CUDA test artifact kind from the jobs present for the
target SHA.
- Detect whether the target SHA uses test-osdc or legacy test CUDA jobs,
then use the detected kind when building log keys and artifact prefixes.
- Apply the same dynamic selection to CUDA inductor jobs.
- Treat missing per-arch summary buckets as zero so mixed ROCm/CUDA
coverage does not crash report generation.

## Validation
- PR/ciflow case: dispatched `Parity Report` on this branch with
`sha=386f38175e3aaee2dadb36b5c364deff0869664d` and `arch=mi355, mi300,
mi200, navi31`. CUDA default/distributed and inductor selected `test`.
  - Run: https://github.com/ROCm/pytorch/actions/runs/25866762885
- Main branch case: dispatched `Parity Report` on this branch with
`sha=f38b1ec280bafa2ad11f6e767558e73e9eb508a6`, `arch=mi300`,
`skip_rocm=true`, and `exclude_distributed=true`. CUDA default and
inductor selected `test-osdc`.
  - Run: https://github.com/ROCm/pytorch/actions/runs/25867046276
- Local syntax check: `python3 -m py_compile
.automation_scripts/pytorch-unit-test-scripts/download_testlogs
.automation_scripts/pytorch-unit-test-scripts/generate_summary.py`.
---
 .../download_testlogs                         | 69 ++++++++++++-------
 .../generate_summary.py                       |  4 +-
 2 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
index 480336a4e9aeb..f31b679bfe634 100755
--- a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
+++ b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
@@ -134,6 +134,33 @@ def get_job_ids_by_prefix(wf, prefix):
     jobs = get_workflow_jobs(wf)
     return [str(j['id']) for j in jobs if prefix in j['name']]
 
+def matches_job_prefix(job_name, prefix):
+    """True when job_name begins with "<prefix> / ", so a longer family such as "<prefix>-debug" is rejected."""
+    return job_name.startswith("{} / ".format(prefix))
+
+def get_cuda_test_jobs(jobs, cuda_job_prefix):
+    """Return (kind, jobs) for the first of "test-osdc", "test" with jobs under cuda_job_prefix, else ("test-osdc", [])."""
+    for kind in ("test-osdc", "test"):
+        marker = f"/ {kind} ("
+        hits = [
+            job for job in jobs
+            if matches_job_prefix(job['name'], cuda_job_prefix) and marker in job['name']
+        ]
+        if hits:
+            return kind, hits
+    return "test-osdc", []
+
+def get_cuda_inductor_test_jobs(jobs):
+    """Detect the CUDA inductor test job kind in use; returns (kind, matching_jobs)."""
+    # "test-osdc" is checked before plain "test"; the first kind with at least one job wins.
+    for kind in ("test-osdc", "test"):
+        needle = f"unit-test / inductor-test / {kind} (inductor,"
+        hits = [job for job in jobs if needle in job['name']]
+        if hits:
+            return kind, hits
+    # No inductor test job of either kind: report the default kind with an empty job list.
+    return "test-osdc", []
+
 def download_logs(wf, test_log_list, test_folder, jobs=None):
     if wf is None: 
         raise Exception("wf is None!")
@@ -673,7 +700,6 @@ def main():
 
     if not args.no_cuda:
         cuda_job_prefix = "linux-jammy-cuda13.0-py3.10-gcc11"
-        cuda_test_job_kind = "test-osdc"
         print("==========================================")
         print(f"Finding CUDA tests in workflow '{CUDAWorkflowNames['default']}' by sha: {sha}")
         print("==========================================")
@@ -698,14 +724,12 @@ def main():
 
         for run in trunk_runs:
             jobs = get_workflow_jobs(run)
-            test_jobs = [
-                j for j in jobs
-                if cuda_job_prefix in j['name'] and f'/ {cuda_test_job_kind} (' in j['name']
-            ]
+            test_kind, test_jobs = get_cuda_test_jobs(jobs, cuda_job_prefix)
             if test_jobs:
                 trunk_wf = run
                 all_cuda_jobs = jobs
                 cuda_test_jobs = test_jobs
+                cuda_test_job_kind = test_kind
                 print(f"Found CUDA test jobs in trunk run {run['id']}")
                 break
 
@@ -714,7 +738,7 @@ def main():
             # by the jobs API. Use check-runs API to find the actual run.
             print("No CUDA test jobs in any trunk run's jobs API, trying check-runs API...")
             check_runs = get_check_runs_for_commit(sha, cuda_job_prefix)
-            cuda_test_jobs = [cr for cr in check_runs if f'/ {cuda_test_job_kind} (' in cr['name']]
+            cuda_test_job_kind, cuda_test_jobs = get_cuda_test_jobs(check_runs, cuda_job_prefix)
             if cuda_test_jobs:
                 # Extract the actual workflow run ID from the check-run details URL
                 import re as _re
@@ -745,6 +769,7 @@ def main():
         cuda_job_ids = [str(j['id']) for j in cuda_test_jobs]
         cuda_artifact_substrings = [f"_{jid}" for jid in cuda_job_ids] if cuda_job_ids else ["nvidia.gpu"]
         print(f"Using CUDA job prefix: {cuda_job_prefix}")
+        print(f"Using CUDA test job kind: {cuda_test_job_kind}")
         print(f"Found {len(cuda_test_jobs)} CUDA test jobs matching prefix")
 
         folder_list = get_or_create_test_folder(trunk_wf)
@@ -771,11 +796,11 @@ def main():
 
         # Download artifacts
         test_artifacts_list_cuda_default = [
-          "test-reports-test-osdc-default-1-5",
-          "test-reports-test-osdc-default-2-5",
-          "test-reports-test-osdc-default-3-5",
-          "test-reports-test-osdc-default-4-5",
-          "test-reports-test-osdc-default-5-5",
+          f"test-reports-{cuda_test_job_kind}-default-1-5",
+          f"test-reports-{cuda_test_job_kind}-default-2-5",
+          f"test-reports-{cuda_test_job_kind}-default-3-5",
+          f"test-reports-{cuda_test_job_kind}-default-4-5",
+          f"test-reports-{cuda_test_job_kind}-default-5-5",
         ]
 
         test_artifacts_list_cuda = []
@@ -784,9 +809,9 @@ def main():
 
         if not args.exclude_distributed:
             test_artifacts_list_cuda_distributed = [
-              "test-reports-test-osdc-distributed-1-3",
-              "test-reports-test-osdc-distributed-2-3",
-              "test-reports-test-osdc-distributed-3-3",
+              f"test-reports-{cuda_test_job_kind}-distributed-1-3",
+              f"test-reports-{cuda_test_job_kind}-distributed-2-3",
+              f"test-reports-{cuda_test_job_kind}-distributed-3-3",
             ]
             test_artifacts_list_cuda += test_artifacts_list_cuda_distributed
 
@@ -812,31 +837,29 @@ def main():
             print(f"Using workflow '{CUDAWorkflowNames['inductor']}' with id:{inductor_wf_cuda['id']} for CUDA inductor")
 
             inductor_cuda_jobs = get_workflow_jobs(inductor_wf_cuda)
-            cuda_inductor_test_jobs = [
-                j for j in inductor_cuda_jobs
-                if "unit-test / inductor-test / test-osdc (inductor," in j['name']
-            ]
+            cuda_inductor_test_job_kind, cuda_inductor_test_jobs = get_cuda_inductor_test_jobs(inductor_cuda_jobs)
             cuda_inductor_job_ids = [str(j['id']) for j in cuda_inductor_test_jobs]
             cuda_inductor_artifact_substrings = (
                 [f"_{jid}" for jid in cuda_inductor_job_ids]
                 if cuda_inductor_job_ids
                 else None
             )
-            print(f"Found {len(cuda_inductor_test_jobs)} CUDA inductor OSDC test jobs")
+            print(f"Using CUDA inductor test job kind: {cuda_inductor_test_job_kind}")
+            print(f"Found {len(cuda_inductor_test_jobs)} CUDA inductor test jobs")
 
             folder_list = get_or_create_test_folder(inductor_wf_cuda)
 
             # Download logs
             if not args.artifacts_only:
               test_log_list_cuda_inductor = [
-                ["cuda_inductor1.txt", "unit-test / inductor-test / test-osdc (inductor, 1, 2"],
-                ["cuda_inductor2.txt", "unit-test / inductor-test / test-osdc (inductor, 2, 2"],
+                ["cuda_inductor1.txt", f"unit-test / inductor-test / {cuda_inductor_test_job_kind} (inductor, 1, 2"],
+                ["cuda_inductor2.txt", f"unit-test / inductor-test / {cuda_inductor_test_job_kind} (inductor, 2, 2"],
               ]
               download_logs(inductor_wf_cuda, test_log_list_cuda_inductor, folder_list[0], jobs=inductor_cuda_jobs)
 
             test_artifacts_list_cuda_inductor = [
-              "test-reports-test-osdc-inductor-1-2",
-              "test-reports-test-osdc-inductor-2-2"
+              f"test-reports-{cuda_inductor_test_job_kind}-inductor-1-2",
+              f"test-reports-{cuda_inductor_test_job_kind}-inductor-2-2"
             ]
             download_artifacts(
                 inductor_wf_cuda,
diff --git a/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
index bf100e20d8127..de141de4229b7 100644
--- a/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
+++ b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
@@ -479,7 +479,7 @@ def build_rows(args, archs, arch_data):
                 has_set2=has_set2,
             )
         for key in wf_keys:
-            out.append((key, [arch_stats[a][key] for a in archs]))
+            out.append((key, [arch_stats[a].get(key, 0) for a in archs]))
 
     out.append(('__section__', 'OVERALL'))
     ov_keys = overall_stats_keys(args.set1_name, args.set2_name, has_set2=any_has_set2)
@@ -493,7 +493,7 @@ def build_rows(args, archs, arch_data):
             args.set1_name, args.set2_name, has_set2=has_set2,
         )
     for key in ov_keys:
-        out.append((key, [arch_overall[a][key] for a in archs]))
+        out.append((key, [arch_overall[a].get(key, 0) for a in archs]))
     return out
 
 

From 69b53d4213125b02dd98bb27e3b4bff3a8c16a56 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Mon, 18 May 2026 08:20:39 -0700
Subject: [PATCH 41/43] download_testlogs: use mi200-specific parity jobs
 (#3229)

## Summary
- Prefer the arch-specific MI200 workflows in `download_testlogs`:
`rocm-mi200`, `periodic-rocm-mi200`, and `inductor-rocm-mi200`.
- Match arch-specific MI200 test jobs with the
`linux-jammy-rocm-py3.10-mi200` prefix for default, distributed, and
inductor shards.
- Keep `trunk-rocm-sandbox` as the fallback workflow for older SHAs that
do not have the MI200-specific workflows, using the legacy
`linux-jammy-rocm-py3.10` prefix in that fallback path.

## Motivation
A parity run for `50d07a990e33f9822ae4d48bed2d7f06c96522d0` tried to
collect MI200 distributed jobs with:

`linux-jammy-rocm-py3.10 / test (distributed, ...)`

The upstream jobs for this SHA are arch-specific and include `-mi200`,
so the log lookup missed all three shards and XML artifact collection
fell through to empty results. The script should look for the
MI200-specific workflows first, then fall back to `trunk-rocm-sandbox`
for older commits.

## Validation
- `python3 -m py_compile
.automation_scripts/pytorch-unit-test-scripts/download_testlogs`
- Confirmed the fixed prefix matches upstream jobs for
`50d07a990e33f9822ae4d48bed2d7f06c96522d0`:
  - `rocm-mi200`: 6 default shard matches
  - `periodic-rocm-mi200`: 3 distributed shard matches
  - `inductor-rocm-mi200`: 2 inductor shard matches
- Dispatched `Parity Report` on this branch with
`sha=50d07a990e33f9822ae4d48bed2d7f06c96522d0`, `arch=mi200`, and
`skip_cuda=true` to validate collection end-to-end.
- Initial run before fallback commit:
https://github.com/ROCm/pytorch/actions/runs/25920564353 (success)
- Current branch run after fallback commit:
https://github.com/ROCm/pytorch/actions/runs/25920808611 (queued)

Made with [Cursor](https://cursor.com)
---
 .../download_testlogs                         | 40 ++++++++++++++-----
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
index f31b679bfe634..ac4214f99fecd 100755
--- a/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
+++ b/.automation_scripts/pytorch-unit-test-scripts/download_testlogs
@@ -440,9 +440,9 @@ def main():
         }
     elif arch == 'mi200':
         ROCmWorkflowNames = {
-            "default": "trunk-rocm-sandbox",
-            "distributed": "trunk-rocm-sandbox",
-            "inductor": "trunk-rocm-sandbox"
+            "default": "rocm-mi200",
+            "distributed": "periodic-rocm-mi200",
+            "inductor": "inductor-rocm-mi200"
         }
     else:
         # MI300 and navi31 use dedicated ROCm workflows
@@ -461,9 +461,9 @@ def main():
             "inductor": "linux-noble-rocm-nightly-py3.12-gfx942",
         },
         "mi200": {
-            "default": "linux-jammy-rocm-py3.10",
-            "distributed": "linux-jammy-rocm-py3.10",
-            "inductor": "linux-jammy-rocm-py3.10"
+            "default": "linux-jammy-rocm-py3.10-mi200",
+            "distributed": "linux-jammy-rocm-py3.10-mi200",
+            "inductor": "linux-jammy-rocm-py3.10-mi200"
         },
         "mi300": {
             "default": "linux-noble-rocm-py3.12-mi300",
@@ -550,7 +550,7 @@ def main():
             periodic_wf = None
         periodic_fallbacks = {
             "mi355": ("trunk", "linux-jammy-rocm-py3.10-mi355"),
-            "mi200": ("periodic-rocm-mi200", "linux-jammy-rocm-py3.10"),
+            "mi200": ("trunk-rocm-sandbox", "linux-jammy-rocm-py3.10"),
         }
         if periodic_wf is None and arch in periodic_fallbacks:
             fallback_wf, fallback_prefix = periodic_fallbacks[arch]
@@ -616,6 +616,7 @@ def main():
             rocm_wf = None
         default_fallbacks = {
             "mi355": ("rocm-mi355", "linux-noble-rocm-py3.12-mi355"),
+            "mi200": ("trunk-rocm-sandbox", "linux-jammy-rocm-py3.10"),
         }
         if rocm_wf is None and arch in default_fallbacks:
             fallback_wf, fallback_prefix = default_fallbacks[arch]
@@ -669,18 +670,37 @@ def main():
         print(f"Finding ROCm inductor tests in workflow '{ROCmWorkflowNames['inductor']}' by sha: {inductor_rocm_sha}")
         print("===========================================")
         error_msg="Error: inductor workflow not found in scanned workflow runs. Try increasing max_pages."
-        inductor_wf_rocm = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["inductor"], sha=inductor_rocm_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
-        print(f"Using workflow '{ROCmWorkflowNames['inductor']}' with id:{inductor_wf_rocm['id']} for ROCm inductor")
+        inductor_fallback_used = False
+        try:
+            inductor_wf_rocm = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=ROCmWorkflowNames["inductor"], sha=inductor_rocm_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
+        except (IndexError, Exception):
+            inductor_wf_rocm = None
+        inductor_fallbacks = {
+            "mi200": ("trunk-rocm-sandbox", "linux-jammy-rocm-py3.10"),
+        }
+        if inductor_wf_rocm is None and arch in inductor_fallbacks:
+            fallback_wf, fallback_prefix = inductor_fallbacks[arch]
+            print(f"Inductor not found in {ROCmWorkflowNames['inductor']}, falling back to {fallback_wf}")
+            inductor_wf_rocm = download_workflow_run(created=args.created, max_pages=args.max_pages, workflow=fallback_wf, sha=inductor_rocm_sha, ignore_status=args.ignore_status, status=status, error_msg=error_msg)
+            inductor_fallback_used = True
+        if inductor_wf_rocm is None:
+            raise Exception(error_msg)
+        inductor_wf_name = ROCmWorkflowNames['inductor'] if not inductor_fallback_used else inductor_fallbacks[arch][0]
+        print(f"Using workflow '{inductor_wf_name}' with id:{inductor_wf_rocm['id']} for ROCm inductor")
 
         folder_list = get_or_create_test_folder(inductor_wf_rocm)
 
         inductor_shards = rocm_shards["inductor"]
         print(f"Using final ROCm shard count {inductor_shards} for inductor")
+        if inductor_fallback_used and arch in inductor_fallbacks:
+            inductor_job_prefix = inductor_fallbacks[arch][1]
+        else:
+            inductor_job_prefix = rocm_job_prefix['inductor']
     
         # Download logs
         if not args.artifacts_only:
           test_log_list_rocm_inductor = [
-            [f"{current_prefix}rocm_inductor{i}.txt", f"{rocm_job_prefix['inductor']} / test (inductor, {i}, {inductor_shards}"]
+            [f"{current_prefix}rocm_inductor{i}.txt", f"{inductor_job_prefix} / test (inductor, {i}, {inductor_shards}"]
             for i in range(1, inductor_shards + 1)
           ]
           download_logs(inductor_wf_rocm, test_log_list_rocm_inductor, folder_list[0])

From 029ac706696257adb1f18f98a9921e1400a448c9 Mon Sep 17 00:00:00 2001
From: Ethan Wee <Ethan.Wee@amd.com>
Date: Wed, 20 May 2026 10:23:59 -0700
Subject: [PATCH 42/43] [CI] Handle large parity summary CSV fields (#3240)

## Summary
- Raise the Python CSV parser field limit in `generate_summary.py` so
large parity CSV diagnostic fields can be read.
- Truncate oversized diagnostic text fields while loading rows so long
failure/skip messages do not make summary generation or output unwieldy.
- Preserve test identity, status, timing, and shard fields used by the
parity report tables.

## Root Cause
A parity run failed in the `summarize` job when Python's default CSV
field limit rejected a generated-code assertion message larger than
131,072 bytes:
https://github.com/ROCm/pytorch/actions/runs/26168276671/job/76979094769

The first offending row was
`inductor.test_torchinductor_codegen_dynamic_shapes::DynamicShapesCodegenGPUTests::test_vmap_dot_decomposes_bmm_dynamic_shapes_cuda`,
where `message_rocm` was 145,748 bytes.

## Test plan
- `python3 -m py_compile
.automation_scripts/pytorch-unit-test-scripts/generate_summary.py`
- Re-ran `generate_summary.py` locally against the artifact from the
failed run:
  - Input: `20260520_all_tests_status_mi355.csv` from run `26168276671`
- Output: summary CSV and markdown generated successfully instead of
failing with `_csv.Error: field larger than field limit (131072)`.
- Triggered `parity.yml` on this branch with the same upstream commit
and arch as the failing run:
  - SHA: `27f2e80e30fb950bc455c777a5e8079e9657a157`
  - Arch: `mi355`
- Validation run:
https://github.com/ROCm/pytorch/actions/runs/26175417191
- Result: `setup-matrix`, `generate-parity (mi355)`, and `summarize` all
completed successfully.
- The summarize log shows `CSV written to
27f2e80e30fb950bc455c777a5e8079e9657a157_summary.csv` and `Markdown
written to 27f2e80e30fb950bc455c777a5e8079e9657a157_summary.md`.
---
 .../generate_summary.py                       | 37 ++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
index de141de4229b7..406f4b49b78cc 100644
--- a/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
+++ b/.automation_scripts/pytorch-unit-test-scripts/generate_summary.py
@@ -12,6 +12,29 @@
     'distributed': 'TEST DISTRIBUTED',
     'inductor': 'TEST INDUCTOR',
 }
+MAX_DIAGNOSTIC_FIELD_CHARS = 20_000  # characters kept per diagnostic field before the value is truncated
+DIAGNOSTIC_FIELDS = {  # CSV columns whose values _truncate_diagnostic_fields() may shorten
+    'comments',
+    'message_cuda',
+    'message_rocm',
+    'message_set1',
+    'message_set2',
+    'reason',
+    'skip_reason',
+}
+
+
+def _configure_csv_field_limit():
+    """Raise csv's field size limit: try sys.maxsize, dividing by 10 after each OverflowError."""
+    candidate, accepted = sys.maxsize, False
+    while not accepted:
+        try:
+            accepted = csv.field_size_limit(candidate) is not None  # returns the previous limit on success
+        except OverflowError:
+            candidate //= 10
+
+
+_configure_csv_field_limit()
 
 
 def parse_args():
@@ -49,7 +72,19 @@ def parse_args():
 
 def load_csv(filepath):
     with open(filepath, newline='') as f:
-        return list(csv.DictReader(f))
+        return [_truncate_diagnostic_fields(row) for row in csv.DictReader(f)]  # cap oversized diagnostic text per row
+
+
+def _truncate_diagnostic_fields(row):
+    """Cap each DIAGNOSTIC_FIELDS value in `row` at MAX_DIAGNOSTIC_FIELD_CHARS (in place); return `row`."""
+    for field in DIAGNOSTIC_FIELDS:
+        # csv.DictReader fills columns missing from a short row with None, so guard before len().
+        value = row.get(field) or ''
+        omitted = len(value) - MAX_DIAGNOSTIC_FIELD_CHARS
+        if omitted > 0:
+            kept = value[:MAX_DIAGNOSTIC_FIELD_CHARS]
+            row[field] = f'{kept}\n...[truncated {omitted:,} chars by generate_summary.py]'
+    return row
 
 
 def detect_columns(headers, set1_name, set2_name):

From 2a10123010ff117f03ba3c6b0a9d616633ab9b17 Mon Sep 17 00:00:00 2001
From: srinivamd <52507740+srinivamd@users.noreply.github.com>
Date: Mon, 25 May 2026 20:35:02 -0700
Subject: [PATCH 43/43] [ROCm] Fix TorchScript JIT BF16 HIPRTC overload
 conflict (ROCM-23829)

On ROCm >= 7.13 (rocm-systems PR #4727), HIPRTC headers now bundle
amd_hip_bf16.h which defines __float2bfloat16(float) returning
__hip_bfloat16. PyTorch's TorchScript JIT fuser emits its own inline
__float2bfloat16(const float) returning __nv_bfloat16 into every
JIT-generated kernel. These two definitions differ only in return type,
causing a fatal HIPRTC compile error:

  "functions that differ only in their return type cannot be overloaded"

This breaks all Megatron-DeepSpeed / BF16 JIT fusion workloads
(bias_gelu warmup) at training startup on MI300X/MI350X.

Fix: detect the HIP bf16 header guard
(_HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_) in the emitted JIT string.
When present, typedef __nv_bfloat16 to the native __hip_bfloat16 type
and skip inline intrinsic definitions. When absent (older ROCm),
preserve existing inline definitions for backward compatibility.

Co-Authored-By: Claude Opus 4 (1M context) <noreply@anthropic.com>
---
 torch/csrc/jit/codegen/fuser/cuda/resource_strings.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h
index 16ccc5002f9ab..71695657c1ebb 100644
--- a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h
+++ b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h
@@ -274,6 +274,11 @@ constexpr auto bfloat16_support_literal =
 #define __align__(x) __attribute__((aligned(x)))
 #endif
 )" BF16_UINT32_DEF R"(
+#if defined(_HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_)
+typedef __hip_bfloat16 __nv_bfloat16;
+typedef struct __align__(2) { unsigned short x; } __nv_bfloat16_raw;
+#else
+
 typedef struct __align__(2) {
   unsigned short x;
 }
@@ -333,6 +338,7 @@ __device__ float __bfloat162float(const __nv_bfloat16 a) {
   return u.fp32;
 }
 #endif /* defined(__cplusplus) */
+#endif /* !defined(_HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_) */
 )";
 #else
 constexpr auto bfloat16_support_literal =