Skip to content

Commit 4682f71

Browse files
Chamberlain0w0kilinchange
authored and committed
feat: organize test cases by group
1 parent 791c75e commit 4682f71

5 files changed

Lines changed: 560 additions & 347 deletions

File tree

scripts/compare_loss.py

Lines changed: 32 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,24 @@
1010
from pathlib import Path
1111
from argparse import ArgumentParser
1212

13+
14+
def collect_log_files(base_dir):
    """Collect comparable training logs under ``base_dir``, keyed by basename.

    The directory is searched recursively for ``*.log`` files. Entries whose
    name starts with ``build`` or ends with ``_profile.log`` are skipped.

    Args:
        base_dir: ``pathlib.Path`` of the directory to search.

    Returns:
        A ``(files, duplicates)`` tuple. ``files`` maps each basename to the
        first path found with that name. ``duplicates`` maps every basename
        that occurs more than once to the list of all paths sharing it,
        first occurrence included.
    """
    files = {}
    duplicates = {}

    # Sort so that "first occurrence" and the duplicate report do not depend
    # on filesystem enumeration order.
    for path in sorted(base_dir.rglob('*.log')):
        # rglob matches on the name only, so a directory called "x.log"
        # would otherwise be returned as if it were a log file.
        if not path.is_file():
            continue

        name = path.name
        if name.startswith('build') or name.endswith('_profile.log'):
            continue

        if name in files:
            duplicates.setdefault(name, [files[name]]).append(path)
        else:
            files[name] = path

    return files, duplicates
30+
1331
def get_dtype_from_filename(filename):
    """Determine the dtype encoded in a log file name.

    Only the final path component is inspected, so a parent directory whose
    name happens to contain ``_bfloat16`` does not affect the result.

    Args:
        filename: Log file name or path, as ``str`` or path-like object.

    Returns:
        ``'bfloat16'`` if the basename contains ``_bfloat16``, else ``'fp32'``.
    """
    # Path() accepts both str and path-like input; a bare `in` test on a
    # Path object would raise TypeError.
    basename = Path(filename).name
    if '_bfloat16' in basename:
        return 'bfloat16'
    return 'fp32'
@@ -62,8 +80,20 @@ def main():
6280
args.threshold_fp32 = args.threshold
6381
args.threshold_bf16 = args.threshold
6482

65-
files1 = {f.name: f for f in args.dir1.glob('*.log') if not f.name.startswith('build')}
66-
files2 = {f.name: f for f in args.dir2.glob('*.log') if not f.name.startswith('build')}
83+
files1, duplicates1 = collect_log_files(args.dir1)
84+
files2, duplicates2 = collect_log_files(args.dir2)
85+
86+
if duplicates1:
87+
print(f"Found duplicate log basenames in {args.dir1.resolve()}, cannot compare safely:")
88+
for name, paths in sorted(duplicates1.items()):
89+
print(f" {name}: {', '.join(str(p.relative_to(args.dir1)) for p in paths)}")
90+
sys.exit(1)
91+
92+
if duplicates2:
93+
print(f"Found duplicate log basenames in {args.dir2.resolve()}, cannot compare safely:")
94+
for name, paths in sorted(duplicates2.items()):
95+
print(f" {name}: {', '.join(str(p.relative_to(args.dir2)) for p in paths)}")
96+
sys.exit(1)
6797

6898
only_in_1 = set(files1.keys()) - set(files2.keys())
6999
only_in_2 = set(files2.keys()) - set(files1.keys())

scripts/compare_tps.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,24 @@
1010
from pathlib import Path
1111
from argparse import ArgumentParser
1212

13+
14+
def collect_log_files(base_dir):
    """Collect comparable training logs under ``base_dir``, keyed by basename.

    The directory is searched recursively for ``*.log`` files. Entries whose
    name starts with ``build`` or ends with ``_profile.log`` are skipped.

    Args:
        base_dir: ``pathlib.Path`` of the directory to search.

    Returns:
        A ``(files, duplicates)`` tuple. ``files`` maps each basename to the
        first path found with that name. ``duplicates`` maps every basename
        that occurs more than once to the list of all paths sharing it,
        first occurrence included.
    """
    files = {}
    duplicates = {}

    # Sort so that "first occurrence" and the duplicate report do not depend
    # on filesystem enumeration order.
    for path in sorted(base_dir.rglob('*.log')):
        # rglob matches on the name only, so a directory called "x.log"
        # would otherwise be returned as if it were a log file.
        if not path.is_file():
            continue

        name = path.name
        if name.startswith('build') or name.endswith('_profile.log'):
            continue

        if name in files:
            duplicates.setdefault(name, [files[name]]).append(path)
        else:
            files[name] = path

    return files, duplicates
30+
1331
def parse_log(file_path):
1432
"""Extract step -> tok/s mapping from log file."""
1533
pattern = re.compile(r'step\s+(\d+)/\d+.*?\|\s+(\d+)\s+tok/s')
@@ -55,8 +73,20 @@ def main():
5573
parser.add_argument('--verbose', action='store_true', help='Print detailed output for all files, including passed ones')
5674
args = parser.parse_args()
5775

58-
files1 = {f.name: f for f in args.dir1.glob('*.log') if not f.name.startswith('build')}
59-
files2 = {f.name: f for f in args.dir2.glob('*.log') if not f.name.startswith('build')}
76+
files1, duplicates1 = collect_log_files(args.dir1)
77+
files2, duplicates2 = collect_log_files(args.dir2)
78+
79+
if duplicates1:
80+
print(f"Found duplicate log basenames in {args.dir1.resolve()}, cannot compare safely:")
81+
for name, paths in sorted(duplicates1.items()):
82+
print(f" {name}: {', '.join(str(p.relative_to(args.dir1)) for p in paths)}")
83+
sys.exit(1)
84+
85+
if duplicates2:
86+
print(f"Found duplicate log basenames in {args.dir2.resolve()}, cannot compare safely:")
87+
for name, paths in sorted(duplicates2.items()):
88+
print(f" {name}: {', '.join(str(p.relative_to(args.dir2)) for p in paths)}")
89+
sys.exit(1)
6090

6191
only_in_1 = set(files1.keys()) - set(files2.keys())
6292
only_in_2 = set(files2.keys()) - set(files1.keys())

scripts/run_models_and_profile.bash

Lines changed: 119 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,52 @@
33
set -e
44
set -o pipefail
55

6-
CONFIG_FILE="${1:-test_config.json}"
6+
# Print the command-line help text (usage line and option list) to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally with no
# variable or command expansion.
usage() {
7+
cat <<'EOF'
8+
Usage: run_models_and_profile.bash [config_file] [--only-run tag1,tag2]
9+
10+
Options:
11+
--only-run TAGS Only run the specified tag groups, separated by commas.
12+
-h, --help Show this help message.
13+
EOF
14+
}
15+
16+
# Defaults; overridden by the command-line arguments parsed below.
CONFIG_FILE="test_config.json"
ONLY_RUN_TAGS=""
CONFIG_FILE_SET="no"

# Accepted arguments: one optional positional config file,
# --only-run TAGS / --only-run=TAGS, and -h / --help.
# Any other dash-prefixed argument is rejected.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --only-run|--only-run=*)
            if [[ "$1" == --only-run=* ]]; then
                ONLY_RUN_TAGS="${1#*=}"
                shift
            else
                if [[ $# -lt 2 ]]; then
                    echo "Error: --only-run requires a comma-separated tag list."
                    exit 1
                fi
                ONLY_RUN_TAGS="$2"
                shift 2
            fi
            # An empty value would look the same as "option not given" to the
            # rest of the script and silently run every test group.
            if [[ -z "$ONLY_RUN_TAGS" ]]; then
                echo "Error: --only-run requires a comma-separated tag list."
                exit 1
            fi
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        -*)
            echo "Error: Unknown option: $1"
            usage
            exit 1
            ;;
        *)
            # Only a single positional argument (the config file) is allowed.
            if [[ "$CONFIG_FILE_SET" == "yes" ]]; then
                echo "Error: Multiple config files provided."
                usage
                exit 1
            fi
            CONFIG_FILE="$1"
            CONFIG_FILE_SET="yes"
            shift
            ;;
    esac
done
752

853
# Dependencies check
954
if ! command -v jq >/dev/null 2>&1; then
@@ -33,6 +78,28 @@ done < <(jq -r '.variables | to_entries[] | "\(.key)=\(.value)"' "$CONFIG_FILE")
3378

3479
# Global variable to save the last cmake command
3580
LAST_CMAKE_CMD=""
81+
declare -A SELECTED_TAGS=()
82+
83+
# Print $1 with leading and trailing whitespace removed (no trailing newline).
normalize_tag() {
    local value="$1"

    # Everything before the first non-space character is the leading run.
    local leading="${value%%[![:space:]]*}"
    value="${value#"$leading"}"

    # Everything after the last non-space character is the trailing run.
    local trailing="${value##*[![:space:]]}"
    value="${value%"$trailing"}"

    printf '%s' "$value"
}
89+
90+
# Fill SELECTED_TAGS from the comma-separated --only-run list, ignoring
# entries that are empty after trimming. Nothing happens when the option
# was not given.
if [[ -n "$ONLY_RUN_TAGS" ]]; then
    IFS=',' read -r -a requested_tags <<< "$ONLY_RUN_TAGS"

    for raw_tag in "${requested_tags[@]}"; do
        tag="$(normalize_tag "$raw_tag")"
        if [[ -n "$tag" ]]; then
            SELECTED_TAGS["$tag"]=1
        fi
    done

    # A list such as "," or " , " yields no usable tag at all.
    if (( ${#SELECTED_TAGS[@]} == 0 )); then
        echo "Error: --only-run did not contain any valid tags."
        exit 1
    fi
fi
36103

37104
# Clean the build directory
38105
clean_build_dir() {
@@ -46,9 +113,12 @@ run_and_log() {
46113
local cmd="$1"
47114
local log_name="$2"
48115
local is_profile="$3"
116+
local tag="${4:-basic}"
49117
local timestamp
50118
timestamp=$(date '+%Y-%m-%d %H:%M:%S')
51-
local log_path="$(realpath "${LOG_DIR}/${log_name}.log")"
119+
local tag_log_dir="${LOG_DIR}/${tag}"
120+
mkdir -p "$tag_log_dir"
121+
local log_path="$(realpath "${tag_log_dir}/${log_name}.log")"
52122

53123
echo -e "\033[1;32m============================================================\033[0m"
54124
echo -e "\033[1;36m[$timestamp] [Running] ${log_name}\033[0m"
@@ -99,22 +169,25 @@ run_and_log() {
99169

100170
# If profiling is enabled, move profiling files to the target directory
101171
if [[ "$is_profile" == "yes" ]]; then
102-
move_profile_logs "$log_name"
172+
move_profile_logs "$log_name" "$tag"
103173
fi
104174
}
105175

106176

107177
# Move profiling output logs
108178
move_profile_logs() {
109179
local prefix="$1"
180+
local tag="${2:-basic}"
181+
local tag_profile_dir="${PROFILE_LOG_DIR}/${tag}"
182+
mkdir -p "$tag_profile_dir"
110183

111184
# Move *.report.rankN files
112185
for report_file in "${BUILD_DIR}"/*.report.rank*; do
113186
if [[ -f "$report_file" ]]; then
114187
local base_name
115188
base_name=$(basename "$report_file")
116-
mv "$report_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}"
117-
echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}"
189+
mv "$report_file" "${tag_profile_dir}/${prefix}_${base_name}"
190+
echo "Moved $base_name to ${tag_profile_dir}/${prefix}_${base_name}"
118191
fi
119192
done
120193

@@ -123,25 +196,39 @@ move_profile_logs() {
123196
if [[ -f "$record_file" ]]; then
124197
local base_name
125198
base_name=$(basename "$record_file")
126-
mv "$record_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}"
127-
echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}"
199+
mv "$record_file" "${tag_profile_dir}/${prefix}_${base_name}"
200+
echo "Moved $base_name to ${tag_profile_dir}/${prefix}_${base_name}"
128201
fi
129202
done
130203
}
131204

132-
# Build "--key value" arg string from tests[i].args (shell-escaped)
205+
# Build "--key value" arg string from test_groups[gi].tests[ti].args (shell-escaped)
133206
args_string_for_test() {
134-
local idx="$1"
135-
jq -r --argjson i "$idx" '
136-
.tests[$i].args
207+
local group_idx="$1"
208+
local test_idx="$2"
209+
jq -r --argjson g "$group_idx" --argjson t "$test_idx" '
210+
.test_groups[$g].tests[$t].args
137211
| to_entries[]
138212
| "--\(.key) \(.value|tostring)"
139213
' "$CONFIG_FILE" | paste -sd' ' -
140214
}
141215

142216
# Run tests
143217
num_builds=$(jq '.builds | length' "$CONFIG_FILE")
144-
num_tests=$(jq '.tests | length' "$CONFIG_FILE")
218+
num_groups=$(jq '.test_groups | length' "$CONFIG_FILE")
219+
220+
# Count the test groups that will run: every group when no tag filter is
# active, otherwise only the groups whose tag is in SELECTED_TAGS.
selected_group_count=0
for ((gi = 0; gi < num_groups; ++gi)); do
    group_tag=$(jq -r ".test_groups[$gi].tag" "$CONFIG_FILE")
    if [[ ${#SELECTED_TAGS[@]} -gt 0 && -z "${SELECTED_TAGS[$group_tag]}" ]]; then
        continue
    fi
    selected_group_count=$((selected_group_count + 1))
done

# Stop before any build starts if the filter matched nothing.
if (( selected_group_count == 0 )); then
    echo "Error: No matching test groups found for --only-run=${ONLY_RUN_TAGS}"
    exit 1
fi
145232

146233
for ((id=0; id<num_builds; ++id)); do
147234
build_id=$(jq -r ".builds[$id].id" "$CONFIG_FILE")
@@ -152,7 +239,7 @@ for ((id=0; id<num_builds; ++id)); do
152239

153240
# always clean before another build
154241
clean_build_dir
155-
run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no"
242+
run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no" "build"
156243

157244
# profile flag for runs
158245
profile_flag="no"
@@ -162,17 +249,27 @@ for ((id=0; id<num_builds; ++id)); do
162249
log_suffix="_profile"
163250
fi
164251

165-
for ((ti=0; ti<num_tests; ++ti)); do
166-
test_id=$(jq -r ".tests[$ti].id" "$CONFIG_FILE")
167-
arg_str="$(args_string_for_test "$ti")"
252+
for ((gi=0; gi<num_groups; ++gi)); do
253+
group_tag=$(jq -r ".test_groups[$gi].tag" "$CONFIG_FILE")
254+
if [[ ${#SELECTED_TAGS[@]} -gt 0 && -z "${SELECTED_TAGS[$group_tag]}" ]]; then
255+
continue
256+
fi
257+
258+
num_tests=$(jq ".test_groups[$gi].tests | length" "$CONFIG_FILE")
259+
echo -e "\033[1;36m[TEST GROUP] tag=${group_tag}, cases=${num_tests}\033[0m"
260+
261+
for ((ti=0; ti<num_tests; ++ti)); do
262+
test_id=$(jq -r ".test_groups[$gi].tests[$ti].id" "$CONFIG_FILE")
263+
arg_str="$(args_string_for_test "$gi" "$ti")"
168264

169-
# gpt2
170-
gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
171-
run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag"
265+
# gpt2
266+
gpt2_cmd="${prefix}./gpt2 --input_bin ${GPT2_INPUT_BIN} --llmc_filepath ${GPT2_LLMC_FILEPATH} --device cuda ${arg_str}"
267+
run_and_log "$gpt2_cmd" "gpt2_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
172268

173-
# llama3
174-
llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
175-
run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag"
269+
# llama3
270+
llama3_cmd="${prefix}./llama3 --input_bin ${LLAMA3_INPUT_BIN} --llmc_filepath ${LLAMA3_LLMC_FILEPATH} --device cuda ${arg_str}"
271+
run_and_log "$llama3_cmd" "llama3_${test_id}${log_suffix}" "$profile_flag" "$group_tag"
272+
done
176273
done
177274
done
178275

0 commit comments

Comments
 (0)