diff --git a/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py b/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py index 0156624c35973..913f1ad16203e 100755 --- a/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py +++ b/.automation_scripts/pytorch-unit-test-scripts/detect_log_failures.py @@ -29,6 +29,15 @@ ) RE_EXIT_CODE = re.compile(r"Got exit code (?P\d+)") RE_TIMEOUT = re.compile(r"Command took >(\d+)min, returning 124") +RE_JOB_TIMEOUT = re.compile( + r"(?:" + r"The job running on runner .* has exceeded the maximum execution time|" + r"The action has timed out|" + r"operation was canceled|" + r"timed out after \d+" + r")", + re.IGNORECASE, +) RE_FAILED_CONSISTENTLY = re.compile( r"FAILED CONSISTENTLY: (?P\S+)" ) @@ -96,6 +105,9 @@ def parse_log_file(filepath): consistent_failures = [] flaky_tests = [] last_passed_individual = None + last_individual_test = None + pending_keyboard_interrupt = None + inline_failures = [] with open(filepath, "r", errors="replace") as f: for line in f: @@ -105,6 +117,11 @@ def parse_log_file(filepath): if ".py::" in line: m_ind = RE_INDIVIDUAL_TEST.search(line) if m_ind: + last_individual_test = { + "file": m_ind.group("test_path").split("::", 1)[0], + "cls": m_ind.group("cls"), + "method": m_ind.group("method"), + } active = current_test or last_failed_test if active and active in results: # Only update if the pytest path belongs to this shard's test file, @@ -116,6 +133,9 @@ def parse_log_file(filepath): if " ... [" not in line and "was successful" not in line \ and "failed!" not in line and "Got exit code" not in line \ and "returning 124" not in line and "FAILED CONSISTENTLY" not in line \ + and "timed out" not in line and "Timed out" not in line \ + and "operation was canceled" not in line \ + and "exceeded the maximum execution time" not in line \ and "Retrying" not in line \ and "Segmentation fault" not in line and "SIGIOT" not in line \ and "SIGSEGV" not in line and "SIGABRT" not in line \ @@ -124,6 +144,7 @@ def parse_log_file(filepath): and "Aborted (core dumped)" not in line \ and "OutOfMemoryError" not in line \ and "bad_alloc" not in line \ + and "KeyboardInterrupt" not in line \ and "stepcurrent" not in line \ and "PASSED" not in line \ and "new process" not in line: @@ -202,11 +223,45 @@ def parse_log_file(filepath): code = int(m.group("code")) if active and active in results: results[active]["exit_codes"].append(code) + elif pending_keyboard_interrupt and code in (2, 124): + inline_failures.append({ + "file": pending_keyboard_interrupt["file"], + "cls": pending_keyboard_interrupt["cls"], + "method": pending_keyboard_interrupt["method"], + "category": "TIMEOUT", + "status": "FAILED", + "reason": ( + f"{pending_keyboard_interrupt['cls']}::" + f"{pending_keyboard_interrupt['method']}" + ), + "exit_codes": str(code), + }) + pending_keyboard_interrupt = None m = RE_TIMEOUT.search(stripped) if m and active and active in results: if "TIMEOUT" not in results[active]["crashes"]: results[active]["crashes"].append("TIMEOUT") + elif m and last_individual_test: + inline_failures.append({ + "file": last_individual_test["file"], + "cls": last_individual_test["cls"], + "method": last_individual_test["method"], + "category": "TIMEOUT", + "status": "FAILED", + "reason": ( + f"{last_individual_test['cls']}::" + f"{last_individual_test['method']}" + ), + "exit_codes": "124", + }) + + if "KeyboardInterrupt" in stripped and last_individual_test: + pending_keyboard_interrupt = last_individual_test + + if RE_JOB_TIMEOUT.search(stripped) and active and active in results: + if "TIMEOUT" not in results[active]["crashes"]: + results[active]["crashes"].append("TIMEOUT") m = RE_FAILED_CONSISTENTLY.search(stripped) if m: @@ -251,7 +306,7 @@ def parse_log_file(filepath): if label not in results[active]["crashes"]: results[active]["crashes"].append(label) - return results, consistent_failures, flaky_tests + return results, consistent_failures, flaky_tests, inline_failures def scan_logs(logs_dir): @@ -293,7 +348,7 @@ def scan_logs(logs_dir): job_shard_str = f"{shard_num}/{job_total}" if job_total else str(shard_num) filepath = os.path.join(logs_dir, fname) - results, consistent_failures, flaky_tests = parse_log_file(filepath) + results, consistent_failures, flaky_tests, inline_failures = parse_log_file(filepath) for ft in flaky_tests: file_part = ft["file"].replace("test/", "").replace(".py", "") @@ -308,6 +363,21 @@ def scan_logs(logs_dir): "test_shard": ft["test_shard"], }) + for failure in inline_failures: + file_part = failure["file"].replace("test/", "").replace(".py", "") + all_failures.append({ + "log_file": fname, + "platform": platform, + "test_config": test_config, + "test_file": file_part, + "job_shard": job_shard_str, + "test_shard": "", + "status": failure["status"], + "category": failure["category"], + "reason": failure["reason"], + "exit_codes": failure["exit_codes"], + }) + # Record every (test_file, test_shard) observed in this log file, # including PASSED ones, so the inventory covers the full run. for info in results.values(): diff --git a/.github/workflows/build_portable_linux_pytorch_dockers.yml b/.github/workflows/build_portable_linux_pytorch_dockers.yml index d5c9a94c3b1ad..f9c2d6a93970e 100644 --- a/.github/workflows/build_portable_linux_pytorch_dockers.yml +++ b/.github/workflows/build_portable_linux_pytorch_dockers.yml @@ -93,6 +93,14 @@ jobs: repository: ${{ matrix.pytorch_repo }} ref: ${{ matrix.pytorch_branch }} path: pytorch-src + fetch-depth: 0 + submodules: recursive + + - name: Checkout TheRock AMDGPU metadata + uses: actions/checkout@v4 + with: + repository: ROCm/TheRock + path: therock-src fetch-depth: 1 - name: Derive torch version prefix from branch @@ -164,6 +172,10 @@ jobs: echo "pytorch_repo=${{ matrix.pytorch_repo }}" >> $GITHUB_OUTPUT echo "pytorch_branch=${{ matrix.pytorch_branch }}" >> $GITHUB_OUTPUT + GFX="${{ env.DEFAULT_AMDGPU_FAMILY }}" + PYTORCH_ROCM_ARCH="$(python3 therock-src/build_tools/github_actions/expand_amdgpu_families.py --amdgpu-families "${GFX}")" + echo "pytorch_rocm_arch=${PYTORCH_ROCM_ARCH}" >> $GITHUB_OUTPUT + COMMIT="$(cd pytorch-src && git rev-parse --short=8 HEAD)" echo "pytorch_commit=${COMMIT}" >> $GITHUB_OUTPUT @@ -187,8 +199,8 @@ jobs: - name: Log in to Docker Hub uses: docker/login-action@v3 with: - username: ${{ secrets.DOCKERUSERNAME }} - password: ${{ secrets.DOCKERTOKEN }} + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PAT }} - name: Prepare build context run: | @@ -213,6 +225,7 @@ jobs: --build-arg "PYTHON_VERSION=${{ steps.cfg.outputs.python_version }}" \ --build-arg "INDEX_URL=${{ steps.cfg.outputs.index_url }}" \ --build-arg "TORCH_VERSION_PREFIX=${{ steps.prefix.outputs.value }}" \ + --build-arg "PYTORCH_ROCM_ARCH=${{ steps.cfg.outputs.pytorch_rocm_arch }}" \ pytorch-src echo "Docker image built successfully: ${IMAGE}" @@ -228,6 +241,17 @@ jobs: echo "ROCm packages:" echo "${ROCM_PACKAGES}" + - name: Scan image for vulnerabilities + uses: aquasecurity/trivy-action@v0.36.0 + with: + image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }} + format: table + severity: CRITICAL + ignore-unfixed: true + exit-code: "1" + scanners: vuln + timeout: 30m + - name: Push Docker image run: | docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }} @@ -264,6 +288,14 @@ jobs: repository: ${{ inputs.pytorch_repo || 'pytorch/pytorch' }} ref: ${{ inputs.pytorch_branch || 'nightly' }} path: pytorch-src + fetch-depth: 0 + submodules: recursive + + - name: Checkout TheRock AMDGPU metadata + uses: actions/checkout@v4 + with: + repository: ROCm/TheRock + path: therock-src fetch-depth: 1 - name: Derive torch version prefix from branch @@ -340,6 +372,10 @@ jobs: echo "pytorch_repo=${{ inputs.pytorch_repo || 'pytorch/pytorch' }}" >> $GITHUB_OUTPUT echo "pytorch_branch=${{ inputs.pytorch_branch || 'nightly' }}" >> $GITHUB_OUTPUT + GFX="${{ inputs.amdgpu_family || env.DEFAULT_AMDGPU_FAMILY }}" + PYTORCH_ROCM_ARCH="$(python3 therock-src/build_tools/github_actions/expand_amdgpu_families.py --amdgpu-families "${GFX}")" + echo "pytorch_rocm_arch=${PYTORCH_ROCM_ARCH}" >> $GITHUB_OUTPUT + COMMIT="$(cd pytorch-src && git rev-parse --short=8 HEAD)" echo "pytorch_commit=${COMMIT}" >> $GITHUB_OUTPUT @@ -363,8 +399,8 @@ jobs: - name: Log in to Docker Hub uses: docker/login-action@v3 with: - username: ${{ secrets.DOCKERUSERNAME }} - password: ${{ secrets.DOCKERTOKEN }} + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PAT }} - name: Prepare build context run: | @@ -389,6 +425,7 @@ jobs: --build-arg "PYTHON_VERSION=${{ steps.cfg.outputs.python_version }}" \ --build-arg "INDEX_URL=${{ steps.cfg.outputs.index_url }}" \ --build-arg "TORCH_VERSION_PREFIX=${{ steps.cfg.outputs.torch_prefix }}" \ + --build-arg "PYTORCH_ROCM_ARCH=${{ steps.cfg.outputs.pytorch_rocm_arch }}" \ pytorch-src echo "Docker image built successfully: ${IMAGE}" @@ -404,6 +441,17 @@ jobs: echo "ROCm packages:" echo "${ROCM_PACKAGES}" + - name: Scan image for vulnerabilities + uses: aquasecurity/trivy-action@v0.36.0 + with: + image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }} + format: table + severity: CRITICAL + ignore-unfixed: true + exit-code: "1" + scanners: vuln + timeout: 30m + - name: Push Docker image run: | docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.docker-tag.outputs.tag }} diff --git a/.github/workflows/parity.yml b/.github/workflows/parity.yml index b47049ca03f16..ae05210ae70be 100644 --- a/.github/workflows/parity.yml +++ b/.github/workflows/parity.yml @@ -38,7 +38,7 @@ on: default: false type: boolean include_logs: - description: 'Download and include CI log files (.txt) in artifact zip' + description: 'Include raw CI log files (.txt) in artifact zip; logs are still scanned for failures' required: false default: true type: boolean @@ -157,9 +157,6 @@ jobs: ARGS="$ARGS --exclude_default" fi ARGS="$ARGS --ignore_status" - if [ "${{ inputs.include_logs }}" != "true" ]; then - ARGS="$ARGS --artifacts_only" - fi if [ "${{ inputs.skip_rocm }}" = "true" ]; then ARGS="$ARGS --no_rocm" fi @@ -246,7 +243,6 @@ jobs: fi - name: Detect log-based failures (timeouts, crashes) - if: ${{ inputs.include_logs }} working-directory: .automation_scripts/pytorch-unit-test-scripts run: | FOLDER="${{ steps.folder.outputs.folder }}" @@ -262,9 +258,12 @@ jobs: FOLDER=".automation_scripts/pytorch-unit-test-scripts/${{ steps.folder.outputs.folder }}" PATHS="${FOLDER}/*.csv ${FOLDER}/*.log - ${FOLDER}/*.txt ${FOLDER}/inductor_periodic_rocm_dir/ ${FOLDER}/inductor_periodic_cuda_dir/" + if [ "${{ inputs.include_logs }}" = "true" ]; then + PATHS="${PATHS} + ${FOLDER}/*.txt" + fi if [ "${{ inputs.include_xml }}" = "true" ]; then PATHS="${PATHS} ${FOLDER}/rocm_xml/ diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile index 361d0219eceef..f97f89de464e6 100644 --- a/dockerfiles/Dockerfile +++ b/dockerfiles/Dockerfile @@ -26,12 +26,13 @@ # - AMDGPU_FAMILY : AMD GPU family (e.g., gfx94X-dcgpu, gfx90X-dcgpu, gfx950-dcgpu) # - PYTHON_VERSION : Python version for PyTorch (default: 3.12) # - INDEX_URL : (Required) Base URL for PyTorch wheels index +# - PYTORCH_ROCM_ARCH: GPU arch target for source builds / local rebuilds # - TORCH_VERSION : Optional specific PyTorch version. If not set, installs latest. # - TORCHAUDIO_VERSION : Optional specific torchaudio version. If not set, installs latest. # - TORCHVISION_VERSION: Optional specific torchvision version. If not set, installs latest. # - TRITON_VERSION : Optional specific triton version. If not set, uses torch's dependency. # -# Note: The PyTorch source is included at /workspace/pytorch (from the repo root). +# Note: The PyTorch source is included at /tmp/pytorch (from the repo root). # # Build example (run from repo root): # @@ -58,6 +59,7 @@ ARG RELEASE_TYPE=nightly # PyTorch configuration arguments ARG PYTHON_VERSION=3.12 ARG INDEX_URL +ARG PYTORCH_ROCM_ARCH ARG TORCH_VERSION ARG TORCH_VERSION_PREFIX ARG TORCHAUDIO_VERSION @@ -69,12 +71,18 @@ COPY .github/scripts/install_rocm_deps.sh /tmp/ COPY .github/scripts/install_pytorch_wheels.py /tmp/ # Copy PyTorch source from the repo root -COPY . /workspace/pytorch +COPY . /tmp/pytorch # Install system dependencies RUN chmod +x /tmp/install_rocm_deps.sh && \ /tmp/install_rocm_deps.sh +# Install ccache for PyTorch source rebuilds in the image. +RUN if ! command -v ccache >/dev/null 2>&1; then \ + apt-get update; \ + apt-get install -y ccache; \ + fi + # Install the requested Python version if not already available. # Ubuntu 24.04 ships with 3.12; other versions come from deadsnakes PPA. RUN if ! command -v python${PYTHON_VERSION} >/dev/null 2>&1; then \ @@ -120,7 +128,6 @@ ENV ROCM_HOME="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_dev ROCM_SOURCE_DIR="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_devel" \ ROCM_BIN="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_devel/bin" \ ROCM_CMAKE="/opt/venv/lib/python${PYTHON_VERSION}/site-packages/_rocm_sdk_devel/lib/cmake" \ - PYTORCH_ROCM_ARCH="${AMDGPU_FAMILY}" \ VIRTUAL_ENV=/opt/venv \ USE_MSLK=0 @@ -133,8 +140,13 @@ ENV CMAKE_PREFIX_PATH="${ROCM_CMAKE}" \ PKG_CONFIG_PATH="${ROCM_HOME}/lib/rocm_sysdeps/lib/pkgconfig" \ LD_LIBRARY_PATH="${ROCM_HOME}/lib/host-math/lib:${ROCM_HOME}/lib/rocm_sysdeps/lib" \ LIBRARY_PATH="${ROCM_HOME}/lib/host-math/lib:${ROCM_HOME}/lib/rocm_sysdeps/lib" \ - CC="${ROCM_HOME}/lib/llvm/bin/clang" \ - CXX="${ROCM_HOME}/lib/llvm/bin/clang++" \ + CC=/usr/bin/gcc \ + CXX=/usr/bin/g++ \ + CMAKE_C_COMPILER_LAUNCHER=ccache \ + CMAKE_CXX_COMPILER_LAUNCHER=ccache \ + CMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + CMAKE_HIP_COMPILER_LAUNCHER=ccache \ + PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH}" \ PATH="${ROCM_BIN}:${PATH}" # Verify PyTorch imports and environment @@ -145,6 +157,7 @@ print('ROCm/HIP', torch.version.hip) print(f'ROCM_HOME={os.environ.get("ROCM_HOME", "NOT SET")}') print(f'CC={os.environ.get("CC", "NOT SET")}') print(f'CXX={os.environ.get("CXX", "NOT SET")}') +print(f'PYTORCH_ROCM_ARCH={os.environ.get("PYTORCH_ROCM_ARCH", "NOT SET")}') for mod in ['torchaudio', 'torchvision', 'triton']: try: m = __import__(mod) @@ -156,4 +169,4 @@ PYEOF # Clean up installation scripts RUN rm -f /tmp/install_rocm_deps.sh /tmp/install_pytorch_wheels.py -WORKDIR /workspace/pytorch +WORKDIR /tmp/pytorch