Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@
)
# "Got exit code <digits>" -- the digits are captured in the named group "code".
RE_EXIT_CODE = re.compile(r"Got exit code (?P<code>\d+)")

# "Command took ><digits>min, returning 124" -- group 1 holds the digits.
RE_TIMEOUT = re.compile(r"Command took >(\d+)min, returning 124")

# Any one of the listed messages, matched case-insensitively. The alternatives
# are joined with "|" inside a single non-capturing group.
RE_JOB_TIMEOUT = re.compile(
    "(?:"
    + "|".join(
        (
            r"The job running on runner .* has exceeded the maximum execution time",
            r"The action has timed out",
            r"operation was canceled",
            r"timed out after \d+",
        )
    )
    + ")",
    flags=re.IGNORECASE,
)

# "FAILED CONSISTENTLY: <token>" -- the non-whitespace token that follows the
# marker is captured in the named group "test_path".
RE_FAILED_CONSISTENTLY = re.compile(r"FAILED CONSISTENTLY: (?P<test_path>\S+)")
Expand Down Expand Up @@ -96,6 +105,9 @@ def parse_log_file(filepath):
consistent_failures = []
flaky_tests = []
last_passed_individual = None
last_individual_test = None
pending_keyboard_interrupt = None
inline_failures = []

with open(filepath, "r", errors="replace") as f:
for line in f:
Expand All @@ -105,6 +117,11 @@ def parse_log_file(filepath):
if ".py::" in line:
m_ind = RE_INDIVIDUAL_TEST.search(line)
if m_ind:
last_individual_test = {
"file": m_ind.group("test_path").split("::", 1)[0],
"cls": m_ind.group("cls"),
"method": m_ind.group("method"),
}
active = current_test or last_failed_test
if active and active in results:
# Only update if the pytest path belongs to this shard's test file,
Expand All @@ -116,6 +133,9 @@ def parse_log_file(filepath):
if " ... [" not in line and "was successful" not in line \
and "failed!" not in line and "Got exit code" not in line \
and "returning 124" not in line and "FAILED CONSISTENTLY" not in line \
and "timed out" not in line and "Timed out" not in line \
and "operation was canceled" not in line \
and "exceeded the maximum execution time" not in line \
and "Retrying" not in line \
and "Segmentation fault" not in line and "SIGIOT" not in line \
and "SIGSEGV" not in line and "SIGABRT" not in line \
Expand All @@ -124,6 +144,7 @@ def parse_log_file(filepath):
and "Aborted (core dumped)" not in line \
and "OutOfMemoryError" not in line \
and "bad_alloc" not in line \
and "KeyboardInterrupt" not in line \
and "stepcurrent" not in line \
and "PASSED" not in line \
and "new process" not in line:
Expand Down Expand Up @@ -202,11 +223,45 @@ def parse_log_file(filepath):
code = int(m.group("code"))
if active and active in results:
results[active]["exit_codes"].append(code)
elif pending_keyboard_interrupt and code in (2, 124):
inline_failures.append({
"file": pending_keyboard_interrupt["file"],
"cls": pending_keyboard_interrupt["cls"],
"method": pending_keyboard_interrupt["method"],
"category": "TIMEOUT",
"status": "FAILED",
"reason": (
f"{pending_keyboard_interrupt['cls']}::"
f"{pending_keyboard_interrupt['method']}"
),
"exit_codes": str(code),
})
pending_keyboard_interrupt = None

m = RE_TIMEOUT.search(stripped)
if m and active and active in results:
if "TIMEOUT" not in results[active]["crashes"]:
results[active]["crashes"].append("TIMEOUT")
elif m and last_individual_test:
inline_failures.append({
"file": last_individual_test["file"],
"cls": last_individual_test["cls"],
"method": last_individual_test["method"],
"category": "TIMEOUT",
"status": "FAILED",
"reason": (
f"{last_individual_test['cls']}::"
f"{last_individual_test['method']}"
),
"exit_codes": "124",
})

if "KeyboardInterrupt" in stripped and last_individual_test:
pending_keyboard_interrupt = last_individual_test

if RE_JOB_TIMEOUT.search(stripped) and active and active in results:
if "TIMEOUT" not in results[active]["crashes"]:
results[active]["crashes"].append("TIMEOUT")

m = RE_FAILED_CONSISTENTLY.search(stripped)
if m:
Expand Down Expand Up @@ -251,7 +306,7 @@ def parse_log_file(filepath):
if label not in results[active]["crashes"]:
results[active]["crashes"].append(label)

return results, consistent_failures, flaky_tests
return results, consistent_failures, flaky_tests, inline_failures


def scan_logs(logs_dir):
Expand Down Expand Up @@ -293,7 +348,7 @@ def scan_logs(logs_dir):
job_shard_str = f"{shard_num}/{job_total}" if job_total else str(shard_num)

filepath = os.path.join(logs_dir, fname)
results, consistent_failures, flaky_tests = parse_log_file(filepath)
results, consistent_failures, flaky_tests, inline_failures = parse_log_file(filepath)

for ft in flaky_tests:
file_part = ft["file"].replace("test/", "").replace(".py", "")
Expand All @@ -308,6 +363,21 @@ def scan_logs(logs_dir):
"test_shard": ft["test_shard"],
})

for failure in inline_failures:
file_part = failure["file"].replace("test/", "").replace(".py", "")
all_failures.append({
"log_file": fname,
"platform": platform,
"test_config": test_config,
"test_file": file_part,
"job_shard": job_shard_str,
"test_shard": "",
"status": failure["status"],
"category": failure["category"],
"reason": failure["reason"],
"exit_codes": failure["exit_codes"],
})

# Record every (test_file, test_shard) observed in this log file,
# including PASSED ones, so the inventory covers the full run.
for info in results.values():
Expand Down
56 changes: 52 additions & 4 deletions .github/workflows/build_portable_linux_pytorch_dockers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,14 @@ jobs:
repository: ${{ matrix.pytorch_repo }}
ref: ${{ matrix.pytorch_branch }}
path: pytorch-src
fetch-depth: 0
submodules: recursive

- name: Checkout TheRock AMDGPU metadata
uses: actions/checkout@v4
with:
repository: ROCm/TheRock
path: therock-src
fetch-depth: 1

- name: Derive torch version prefix from branch
Expand Down Expand Up @@ -164,6 +172,10 @@ jobs:
echo "pytorch_repo=${{ matrix.pytorch_repo }}" >> $GITHUB_OUTPUT
echo "pytorch_branch=${{ matrix.pytorch_branch }}" >> $GITHUB_OUTPUT

GFX="${{ env.DEFAULT_AMDGPU_FAMILY }}"
PYTORCH_ROCM_ARCH="$(python3 therock-src/build_tools/github_actions/expand_amdgpu_families.py --amdgpu-families "${GFX}")"
echo "pytorch_rocm_arch=${PYTORCH_ROCM_ARCH}" >> $GITHUB_OUTPUT
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's see if we can use https://github.com/ROCm/TheRock/blob/main/build_tools/github_actions/expand_amdgpu_families.py to get the most up-to-date mapping at any point in time.


COMMIT="$(cd pytorch-src && git rev-parse --short=8 HEAD)"
echo "pytorch_commit=${COMMIT}" >> $GITHUB_OUTPUT

Expand All @@ -187,8 +199,8 @@ jobs:
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERUSERNAME }}
password: ${{ secrets.DOCKERTOKEN }}
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PAT }}

- name: Prepare build context
run: |
Expand All @@ -213,6 +225,7 @@ jobs:
--build-arg "PYTHON_VERSION=${{ steps.cfg.outputs.python_version }}" \
--build-arg "INDEX_URL=${{ steps.cfg.outputs.index_url }}" \
--build-arg "TORCH_VERSION_PREFIX=${{ steps.prefix.outputs.value }}" \
--build-arg "PYTORCH_ROCM_ARCH=${{ steps.cfg.outputs.pytorch_rocm_arch }}" \
pytorch-src

echo "Docker image built successfully: ${IMAGE}"
Expand All @@ -228,6 +241,17 @@ jobs:
echo "ROCm packages:"
echo "${ROCM_PACKAGES}"

- name: Scan image for vulnerabilities
uses: aquasecurity/trivy-action@v0.36.0
with:
image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}
format: table
severity: CRITICAL
ignore-unfixed: true
exit-code: "1"
scanners: vuln
timeout: 30m

- name: Push Docker image
run: |
docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}
Expand Down Expand Up @@ -264,6 +288,14 @@ jobs:
repository: ${{ inputs.pytorch_repo || 'pytorch/pytorch' }}
ref: ${{ inputs.pytorch_branch || 'nightly' }}
path: pytorch-src
fetch-depth: 0
submodules: recursive

- name: Checkout TheRock AMDGPU metadata
uses: actions/checkout@v4
with:
repository: ROCm/TheRock
path: therock-src
fetch-depth: 1

- name: Derive torch version prefix from branch
Expand Down Expand Up @@ -340,6 +372,10 @@ jobs:
echo "pytorch_repo=${{ inputs.pytorch_repo || 'pytorch/pytorch' }}" >> $GITHUB_OUTPUT
echo "pytorch_branch=${{ inputs.pytorch_branch || 'nightly' }}" >> $GITHUB_OUTPUT

GFX="${{ inputs.amdgpu_family || env.DEFAULT_AMDGPU_FAMILY }}"
PYTORCH_ROCM_ARCH="$(python3 therock-src/build_tools/github_actions/expand_amdgpu_families.py --amdgpu-families "${GFX}")"
echo "pytorch_rocm_arch=${PYTORCH_ROCM_ARCH}" >> $GITHUB_OUTPUT

COMMIT="$(cd pytorch-src && git rev-parse --short=8 HEAD)"
echo "pytorch_commit=${COMMIT}" >> $GITHUB_OUTPUT

Expand All @@ -363,8 +399,8 @@ jobs:
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERUSERNAME }}
password: ${{ secrets.DOCKERTOKEN }}
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PAT }}

- name: Prepare build context
run: |
Expand All @@ -389,6 +425,7 @@ jobs:
--build-arg "PYTHON_VERSION=${{ steps.cfg.outputs.python_version }}" \
--build-arg "INDEX_URL=${{ steps.cfg.outputs.index_url }}" \
--build-arg "TORCH_VERSION_PREFIX=${{ steps.cfg.outputs.torch_prefix }}" \
--build-arg "PYTORCH_ROCM_ARCH=${{ steps.cfg.outputs.pytorch_rocm_arch }}" \
pytorch-src

echo "Docker image built successfully: ${IMAGE}"
Expand All @@ -404,6 +441,17 @@ jobs:
echo "ROCm packages:"
echo "${ROCM_PACKAGES}"

- name: Scan image for vulnerabilities
uses: aquasecurity/trivy-action@v0.36.0
with:
image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}
format: table
severity: CRITICAL
ignore-unfixed: true
exit-code: "1"
scanners: vuln
timeout: 30m

- name: Push Docker image
run: |
docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }}
Expand Down
11 changes: 5 additions & 6 deletions .github/workflows/parity.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ on:
default: false
type: boolean
include_logs:
description: 'Download and include CI log files (.txt) in artifact zip'
description: 'Include raw CI log files (.txt) in artifact zip; logs are still scanned for failures'
required: false
default: true
type: boolean
Expand Down Expand Up @@ -157,9 +157,6 @@ jobs:
ARGS="$ARGS --exclude_default"
fi
ARGS="$ARGS --ignore_status"
if [ "${{ inputs.include_logs }}" != "true" ]; then
ARGS="$ARGS --artifacts_only"
fi
if [ "${{ inputs.skip_rocm }}" = "true" ]; then
ARGS="$ARGS --no_rocm"
fi
Expand Down Expand Up @@ -246,7 +243,6 @@ jobs:
fi

- name: Detect log-based failures (timeouts, crashes)
if: ${{ inputs.include_logs }}
working-directory: .automation_scripts/pytorch-unit-test-scripts
run: |
FOLDER="${{ steps.folder.outputs.folder }}"
Expand All @@ -262,9 +258,12 @@ jobs:
FOLDER=".automation_scripts/pytorch-unit-test-scripts/${{ steps.folder.outputs.folder }}"
PATHS="${FOLDER}/*.csv
${FOLDER}/*.log
${FOLDER}/*.txt
${FOLDER}/inductor_periodic_rocm_dir/
${FOLDER}/inductor_periodic_cuda_dir/"
if [ "${{ inputs.include_logs }}" = "true" ]; then
PATHS="${PATHS}
${FOLDER}/*.txt"
fi
if [ "${{ inputs.include_xml }}" = "true" ]; then
PATHS="${PATHS}
${FOLDER}/rocm_xml/
Expand Down
25 changes: 19 additions & 6 deletions dockerfiles/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@
# - AMDGPU_FAMILY : AMD GPU family (e.g., gfx94X-dcgpu, gfx90X-dcgpu, gfx950-dcgpu)
# - PYTHON_VERSION : Python version for PyTorch (default: 3.12)
# - INDEX_URL : (Required) Base URL for PyTorch wheels index
# - PYTORCH_ROCM_ARCH: GPU arch target for source builds / local rebuilds
# - TORCH_VERSION : Optional specific PyTorch version. If not set, installs latest.
# - TORCHAUDIO_VERSION : Optional specific torchaudio version. If not set, installs latest.
# - TORCHVISION_VERSION: Optional specific torchvision version. If not set, installs latest.
# - TRITON_VERSION : Optional specific triton version. If not set, uses torch's dependency.
#
# Note: The PyTorch source is included at /workspace/pytorch (from the repo root).
# Note: The PyTorch source is included at /tmp/pytorch (from the repo root).
#
# Build example (run from repo root):
#
Expand All @@ -58,6 +59,7 @@ ARG RELEASE_TYPE=nightly
# PyTorch configuration arguments
ARG PYTHON_VERSION=3.12
ARG INDEX_URL
ARG PYTORCH_ROCM_ARCH
ARG TORCH_VERSION
ARG TORCH_VERSION_PREFIX
ARG TORCHAUDIO_VERSION
Expand All @@ -69,12 +71,18 @@ COPY .github/scripts/install_rocm_deps.sh /tmp/
COPY .github/scripts/install_pytorch_wheels.py /tmp/

# Copy PyTorch source from the repo root
COPY . /workspace/pytorch
COPY . /tmp/pytorch

# Install system dependencies
RUN chmod +x /tmp/install_rocm_deps.sh && \
/tmp/install_rocm_deps.sh

# Install ccache for PyTorch source rebuilds in the image.
# The install is skipped when a ccache binary is already on PATH. The two
# apt-get steps are chained with && so that a failed `apt-get update` fails
# the build instead of being ignored before the install runs.
RUN if ! command -v ccache >/dev/null 2>&1; then \
        apt-get update && \
        apt-get install -y ccache; \
    fi

# Install the requested Python version if not already available.
# Ubuntu 24.04 ships with 3.12; other versions come from deadsnakes PPA.
RUN if ! command -v python${PYTHON_VERSION} >/dev/null 2>&1; then \
Expand Down Expand Up @@ -120,7 +128,6 @@ ENV ROCM_HOME="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_dev
ROCM_SOURCE_DIR="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_devel" \
ROCM_BIN="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_devel/bin" \
ROCM_CMAKE="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_devel/lib/cmake" \
PYTORCH_ROCM_ARCH="${AMDGPU_FAMILY}" \
VIRTUAL_ENV=/opt/venv \
USE_MSLK=0

Expand All @@ -133,8 +140,13 @@ ENV CMAKE_PREFIX_PATH="${ROCM_CMAKE}" \
PKG_CONFIG_PATH="${ROCM_HOME}/lib/rocm_sysdeps/lib/pkgconfig" \
LD_LIBRARY_PATH="${ROCM_HOME}/lib/host-math/lib:${ROCM_HOME}/lib/rocm_sysdeps/lib" \
LIBRARY_PATH="${ROCM_HOME}/lib/host-math/lib:${ROCM_HOME}/lib/rocm_sysdeps/lib" \
CC="${ROCM_HOME}/lib/llvm/bin/clang" \
CXX="${ROCM_HOME}/lib/llvm/bin/clang++" \
CC=/usr/bin/gcc \
CXX=/usr/bin/g++ \
CMAKE_C_COMPILER_LAUNCHER=ccache \
CMAKE_CXX_COMPILER_LAUNCHER=ccache \
CMAKE_CUDA_COMPILER_LAUNCHER=ccache \
CMAKE_HIP_COMPILER_LAUNCHER=ccache \
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH}" \
PATH="${ROCM_BIN}:${PATH}"

# Verify PyTorch imports and environment
Expand All @@ -145,6 +157,7 @@ print('ROCm/HIP', torch.version.hip)
print(f'ROCM_HOME={os.environ.get("ROCM_HOME", "NOT SET")}')
print(f'CC={os.environ.get("CC", "NOT SET")}')
print(f'CXX={os.environ.get("CXX", "NOT SET")}')
print(f'PYTORCH_ROCM_ARCH={os.environ.get("PYTORCH_ROCM_ARCH", "NOT SET")}')
for mod in ['torchaudio', 'torchvision', 'triton']:
try:
m = __import__(mod)
Expand All @@ -156,4 +169,4 @@ PYEOF
# Clean up installation scripts
RUN rm -f /tmp/install_rocm_deps.sh /tmp/install_pytorch_wheels.py

WORKDIR /workspace/pytorch
WORKDIR /tmp/pytorch
Loading