Skip to content

Commit afc724e

Browse files
authored
Add --deps-only flag - fix cray flakiness (#1336)
1 parent ace3632 commit afc724e

11 files changed

Lines changed: 148 additions & 92 deletions

File tree

.github/scripts/run_case_optimization.sh

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,6 @@ if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then
1313
ngpus=1
1414
fi
1515

16-
# Verify the venv Python interpreter exists (created by ./mfc.sh build)
17-
if [ ! -x build/venv/bin/python3 ]; then
18-
echo "ERROR: build/venv/bin/python3 not found."
19-
echo "The MFC build venv may not have been created. Was the pre-build step successful?"
20-
exit 1
21-
fi
22-
2316
benchmarks=(
2417
benchmarks/5eq_rk3_weno3_hllc/case.py
2518
benchmarks/viscous_weno5_sgb_acoustic/case.py
@@ -28,6 +21,30 @@ benchmarks=(
2821
benchmarks/igr/case.py
2922
)
3023

24+
# For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only;
25+
# build case-optimized binaries here on the compute node before running.
26+
# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job.
27+
#
28+
# Clean stale MFC target staging before building. On self-hosted CI runners,
29+
# corrupted intermediate files from a prior failed build (e.g. CCE optcg crash)
30+
# can persist and poison subsequent builds. Each case-opt config gets its own
31+
# hash-named staging dir, but install dirs and other artifacts may be stale.
32+
if [ "$job_cluster" != "phoenix" ]; then
33+
# Clean stale MFC target dirs (hash-named) from prior builds, but
34+
# preserve dependency dirs (hipfort, fftw, etc.) since the compute
35+
# node has no internet to re-fetch them.
36+
echo "=== Cleaning stale MFC target staging/install ==="
37+
find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
38+
find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true
39+
40+
echo "=== Building case-optimized binaries on compute node ==="
41+
for case in "${benchmarks[@]}"; do
42+
echo "--- Building: $case ---"
43+
./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
44+
done
45+
echo "=== All case-optimized binaries built ==="
46+
fi
47+
3148
passed=0
3249
failed=0
3350
failed_cases=""
@@ -44,7 +61,7 @@ for case in "${benchmarks[@]}"; do
4461
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"
4562

4663
# Build + run with --case-optimization, small grid, 10 timesteps
47-
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then
64+
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -c "$job_cluster" -- --gbpp 1 --steps 10; then
4865
# Validate output
4966
if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
5067
echo "PASS: $case_name"

.github/workflows/bench.yml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,42 +68,47 @@ jobs:
6868
flag: f
6969
device: gpu
7070
interface: acc
71-
build_script: "bash .github/workflows/frontier/build.sh gpu acc bench"
71+
build_script: "bash .github/workflows/frontier/build.sh gpu acc"
7272
- cluster: frontier
7373
name: Oak Ridge | Frontier (CCE)
7474
group: phoenix
7575
labels: frontier
7676
flag: f
7777
device: gpu
7878
interface: omp
79-
build_script: "bash .github/workflows/frontier/build.sh gpu omp bench"
79+
build_script: "bash .github/workflows/frontier/build.sh gpu omp"
8080
- cluster: frontier_amd
8181
name: Oak Ridge | Frontier (AMD)
8282
group: phoenix
8383
labels: frontier
8484
flag: famd
8585
device: gpu
8686
interface: omp
87-
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
87+
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp"
8888
continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }}
8989
runs-on:
9090
group: ${{ matrix.group }}
9191
labels: ${{ matrix.labels }}
9292
timeout-minutes: 480
9393
steps:
94+
- name: Clean stale output files
95+
run: rm -f *.out
96+
9497
- name: Clone - PR
9598
uses: actions/checkout@v4
9699
with:
97100
path: pr
101+
clean: false
98102

99103
- name: Clone - Master
100104
uses: actions/checkout@v4
101105
with:
102106
repository: MFlowCode/MFC
103107
ref: master
104108
path: master
109+
clean: false
105110

106-
- name: Setup & Build
111+
- name: Fetch Dependencies
107112
if: matrix.build_script != ''
108113
timeout-minutes: 150
109114
run: |

.github/workflows/common/bench.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,18 @@ if [ "$job_cluster" = "phoenix" ]; then
2121
trap 'rm -rf "$currentdir" || true' EXIT
2222
fi
2323

24-
# --- Build (if not pre-built on login node) ---
25-
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
24+
# --- Build ---
25+
# Phoenix builds everything inside SLURM (no login-node build step).
26+
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
27+
# source code is built here on the compute node.
2628
# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
2729
if [ "$job_cluster" = "phoenix" ]; then
2830
source .github/scripts/clean-build.sh
2931
clean_build
3032
fi
3133

32-
if [ ! -d "build" ]; then
33-
source .github/scripts/retry-build.sh
34-
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
35-
fi
34+
source .github/scripts/retry-build.sh
35+
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
3636

3737
# --- Bench cluster flag ---
3838
if [ "$job_cluster" = "phoenix" ]; then

.github/workflows/common/build.sh

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/bin/bash
2+
# Build-only script for all clusters.
3+
# Runs inside a SLURM job via submit-slurm-job.sh.
4+
# Builds MFC without running tests (--dry-run).
5+
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster
6+
7+
set -euo pipefail
8+
9+
source .github/scripts/gpu-opts.sh
10+
build_opts="$gpu_opts"
11+
12+
# --- Phoenix TMPDIR setup ---
13+
if [ "$job_cluster" = "phoenix" ]; then
14+
tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
15+
currentdir=$tmpbuild/run-$(( RANDOM % 9000 ))
16+
mkdir -p $tmpbuild
17+
mkdir -p $currentdir
18+
export TMPDIR=$currentdir
19+
trap 'rm -rf "$currentdir" || true' EXIT
20+
fi
21+
22+
# --- Build ---
23+
# Phoenix builds everything inside SLURM (no login-node build step).
24+
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
25+
# source code is built here on the compute node.
26+
# Phoenix: always start fresh to avoid SIGILL from stale binaries compiled
27+
# on a different microarchitecture.
28+
if [ "$job_cluster" = "phoenix" ]; then
29+
source .github/scripts/clean-build.sh
30+
clean_build
31+
fi
32+
33+
source .github/scripts/retry-build.sh
34+
35+
# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
36+
# (SIGILL from binaries compiled on a different compute node).
37+
validate_cmd=""
38+
if [ "$job_cluster" = "phoenix" ]; then
39+
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
40+
fi
41+
42+
RETRY_VALIDATE_CMD="$validate_cmd" \
43+
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1

.github/workflows/common/test.sh

Lines changed: 3 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/bin/bash
2-
# Unified test script for all clusters.
2+
# Test-only script for all clusters.
33
# Runs inside a SLURM job via submit-slurm-job.sh.
4+
# Assumes MFC is already built (by a prior build.sh SLURM job).
45
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster
56

67
set -euo pipefail
@@ -9,9 +10,6 @@ source .github/scripts/gpu-opts.sh
910
build_opts="$gpu_opts"
1011

1112
# --- Phoenix TMPDIR setup ---
12-
# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each
13-
# spawning MPI processes, it fills up and ORTE session dir creation fails.
14-
# Redirect TMPDIR to project storage, same as bench.sh.
1513
if [ "$job_cluster" = "phoenix" ]; then
1614
tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
1715
currentdir=$tmpbuild/run-$(( RANDOM % 9000 ))
@@ -21,29 +19,6 @@ if [ "$job_cluster" = "phoenix" ]; then
2119
trap 'rm -rf "$currentdir" || true' EXIT
2220
fi
2321

24-
# --- Build (if not pre-built on login node) ---
25-
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
26-
# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
27-
# to avoid SIGILL from stale binaries compiled on a different microarchitecture.
28-
if [ "$job_cluster" = "phoenix" ]; then
29-
source .github/scripts/clean-build.sh
30-
clean_build
31-
fi
32-
33-
if [ ! -d "build" ]; then
34-
source .github/scripts/retry-build.sh
35-
36-
# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
37-
# (SIGILL from binaries compiled on a different compute node).
38-
validate_cmd=""
39-
if [ "$job_cluster" = "phoenix" ]; then
40-
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
41-
fi
42-
43-
RETRY_VALIDATE_CMD="$validate_cmd" \
44-
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1
45-
fi
46-
4722
# --- GPU detection and thread count ---
4823
device_opts=""
4924
rdma_opts=""
@@ -88,4 +63,4 @@ if [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ]; then
8863
prune_flag="--only-changes"
8964
fi
9065

91-
./mfc.sh test -v --max-attempts 3 $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster
66+
./mfc.sh test -v --max-attempts 3 --no-build $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster

.github/workflows/frontier/build.sh

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ esac
1414

1515
job_device=$1
1616
job_interface=$2
17-
run_bench=$3
1817
source .github/scripts/gpu-opts.sh
1918
build_opts="$gpu_opts"
2019

@@ -24,8 +23,4 @@ source .github/scripts/clean-build.sh
2423
clean_build
2524

2625
source .github/scripts/retry-build.sh
27-
if [ "$run_bench" == "bench" ]; then
28-
retry_build ./mfc.sh build -j 8 $build_opts || exit 1
29-
else
30-
retry_build ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts || exit 1
31-
fi
26+
retry_build ./mfc.sh build --deps-only -j 8 $build_opts || exit 1

.github/workflows/test.yml

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -400,11 +400,14 @@ jobs:
400400
echo "Coverage cache: none available — full test suite will run"
401401
fi
402402
403-
- name: Build (login node)
403+
- name: Fetch Dependencies
404404
if: matrix.cluster != 'phoenix'
405405
timeout-minutes: 60
406406
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
407407

408+
- name: Build
409+
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/build.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}
410+
408411
- name: Test
409412
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}
410413

@@ -421,23 +424,29 @@ jobs:
421424
if: always()
422425
id: log
423426
run: |
424-
SLUG="test-${{ matrix.device }}-${{ matrix.interface }}"
427+
SHARD_SUFFIX=""
425428
SHARD="${{ matrix.shard }}"
426429
if [ -n "$SHARD" ]; then
427-
SLUG="${SLUG}-$(echo "$SHARD" | sed 's|/|-of-|')"
430+
SHARD_SUFFIX="-$(echo "$SHARD" | sed 's|/|-of-|')"
428431
fi
429-
echo "slug=${SLUG}" >> "$GITHUB_OUTPUT"
432+
echo "build_slug=build-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"
433+
echo "test_slug=test-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT"
430434
431435
- name: Print Logs
432436
if: always()
433-
run: cat ${{ steps.log.outputs.slug }}.out
437+
run: |
438+
for f in ${{ steps.log.outputs.build_slug }}.out ${{ steps.log.outputs.test_slug }}.out; do
439+
# Use a real `if` so a missing log file leaves exit status 0; a bare `[ -f ] && ...` list would make it the step's exit status.
if [ -f "$f" ]; then echo "=== $f ==="; cat "$f"; fi
440+
done
434441
435442
- name: Archive Logs
436443
uses: actions/upload-artifact@v4
437444
if: matrix.cluster != 'phoenix'
438445
with:
439-
name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.slug }}
440-
path: ${{ steps.log.outputs.slug }}.out
446+
name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.test_slug }}
447+
path: |
448+
${{ steps.log.outputs.build_slug }}.out
449+
${{ steps.log.outputs.test_slug }}.out
441450
442451
case-optimization:
443452
name: "Case Opt | ${{ matrix.cluster_name }} (${{ matrix.device }}-${{ matrix.interface }})"
@@ -486,15 +495,20 @@ jobs:
486495
- name: Clean stale output files
487496
run: rm -f *.out
488497

498+
- name: Fetch Dependencies
499+
if: matrix.cluster != 'phoenix'
500+
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}
501+
489502
- name: Pre-Build (SLURM)
490503
if: matrix.cluster == 'phoenix'
491504
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}
492505

493-
- name: Pre-Build (login node)
506+
- name: Build & Run Case-Optimization Tests
494507
if: matrix.cluster != 'phoenix'
495-
run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
508+
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
496509

497510
- name: Run Case-Optimization Tests
511+
if: matrix.cluster == 'phoenix'
498512
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
499513

500514
- name: Cancel SLURM Jobs

CMakeLists.txt

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
176176
endif()
177177
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
178178
add_compile_options(
179-
"SHELL:-M 296,878,1391,1069,5025"
179+
"SHELL:-M 296,878,1391,1069,990,5025,7208,7212,7242"
180180
"SHELL:-h static" "SHELL:-h keepfiles"
181181
"SHELL:-h acc_model=auto_async_none"
182182
"SHELL: -h acc_model=no_fast_addr"
@@ -190,9 +190,9 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
190190
add_compile_options(
191191
"SHELL:-h acc_model=auto_async_none"
192192
"SHELL: -h acc_model=no_fast_addr"
193-
"SHELL: -K trap=fp" "SHELL: -G2"
193+
"SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0"
194194
)
195-
add_link_options("SHELL: -K trap=fp" "SHELL: -G2")
195+
add_link_options("SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0")
196196
endif()
197197

198198
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang")
@@ -665,10 +665,9 @@ if (MFC_SIMULATION)
665665
# Disabling IPA per-file avoids the crashes while preserving IPA for
666666
# the rest of simulation (needed for thermochem INLINEALWAYS inlining).
667667
# Applied to Cray+OpenACC and Cray CPU, but NOT Cray+OpenMP: on OpenMP,
668-
# m_thermochem uses !DIR$ INLINEALWAYS (requires IPA), so disabling IPA
669-
# for these files breaks thermochem on-device calls. On OpenACC the
670-
# pyrometheus patch emits !$acc routine seq instead (no IPA needed).
671-
# See PR #1286.
668+
# m_thermochem's !DIR$ INLINEALWAYS inlining requires IPA.
# CCE 19.0.0 IPA workaround: disable interprocedural analysis for files
669+
# that trigger compiler SIGSEGV during IPA (Bug 3: m_phase_change,
670+
# Bug 4: m_bubbles_EL). See PR #1286.
672671
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray" AND NOT MFC_OpenMP)
673672
set_source_files_properties(
674673
"${CMAKE_BINARY_DIR}/fypp/simulation/m_bubbles_EL.fpp.f90"

toolchain/mfc/build.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,12 @@ def __build_target(target: typing.Union[MFCTarget, str], case: input.MFCInputFil
552552

553553
history.add(target.name)
554554

555+
# Dependencies are pinned to fixed versions. If already configured
556+
# (built & installed by a prior --deps-only step), skip entirely
557+
# to avoid re-entering the superbuild (which may access the network).
558+
if target.isDependency and target.is_configured(case):
559+
return
560+
555561
for dep in target.requires.compute():
556562
# If we have already built and installed this target,
557563
# do not do so again. This can be inferred by whether
@@ -594,6 +600,25 @@ def build(targets=None, case: input.MFCInputFile = None, history: typing.Set[str
594600
case = case or input.load(ARG("input"), ARG("--"), {})
595601
case.validate_params()
596602

603+
if ARG("deps_only", False) and len(history) == 0:
604+
all_deps = set()
605+
for t in targets:
606+
resolved = get_target(t)
607+
for dep in resolved.requires.compute():
608+
all_deps.add(dep)
609+
610+
cons.print(f"[bold]Fetch Dependencies | {format_list_to_string([d.name for d in all_deps], 'magenta', 'None')}[/bold]")
611+
cons.print(no_indent=True)
612+
613+
if not all_deps:
614+
cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]")
615+
return
616+
617+
for dep in all_deps:
618+
__build_target(dep, case, history)
619+
620+
return
621+
597622
if len(history) == 0:
598623
cons.print(__generate_header(case, targets))
599624
cons.print(no_indent=True)

0 commit comments

Comments
 (0)