diff --git a/python/scripts/nvbench_compare.py b/python/scripts/nvbench_compare.py index c6370332..99d64854 100644 --- a/python/scripts/nvbench_compare.py +++ b/python/scripts/nvbench_compare.py @@ -1,10 +1,15 @@ #!/usr/bin/env python +# +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import argparse import math import os import sys -from enum import StrEnum +from collections import Counter +from dataclasses import dataclass +from enum import Enum import jsondiff import tabulate @@ -23,15 +28,162 @@ def version_tuple(v): tabulate_version = version_tuple(tabulate.__version__) -all_ref_devices = [] -all_cmp_devices = [] +all_ref_devices: list[dict] = [] +all_cmp_devices: list[dict] = [] config_count = 0 unknown_count = 0 -failure_count = 0 +improvement_count = 0 +regression_count = 0 pass_count = 0 +GPU_TIME_MIN_TAG = "nv/cold/time/gpu/min" +GPU_TIME_MAX_TAG = "nv/cold/time/gpu/max" +GPU_TIME_MEAN_TAG = "nv/cold/time/gpu/mean" +GPU_TIME_STDEV_TAG = "nv/cold/time/gpu/stdev/absolute" +GPU_TIME_STDEV_RELATIVE_TAG = "nv/cold/time/gpu/stdev/relative" +GPU_TIME_MEDIAN_TAG = "nv/cold/time/gpu/median" +GPU_TIME_IR_TAG = "nv/cold/time/gpu/ir/absolute" +GPU_TIME_IR_RELATIVE_TAG = "nv/cold/time/gpu/ir/relative" -class Emoji(StrEnum): +# These dataclasses are treated as parsed value objects. frozen=True prevents +# accidental field reassignment but does not imply deep immutability. + + +@dataclass(frozen=True) +class GpuTimeSummary: + minimum: float | None + maximum: float | None + mean: float | None + stdev: float | None + stdev_relative: float | None + median: float | None + interquartile_range: float | None + interquartile_range_relative: float | None + + +@dataclass(frozen=True) +class TimeEstimate: + center: float | None + relative_dispersion: float | None + + +@dataclass(frozen=True) +class BenchmarkFilterScope: + benchmark_name: str + axis_filters: list[dict] + + +@dataclass(frozen=True) +class BenchmarkFilterPlan: + global_axis_filters: list[dict] + benchmark_scopes: list[BenchmarkFilterScope] + + +class OrderedBenchmarkFilterAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + actions = getattr(namespace, self.dest, None) + actions = [] if actions is None else list(actions) + action_kind = "axis" if option_string in {"-a", "--axis"} else "benchmark" + actions.append((action_kind, values)) + setattr(namespace, self.dest, actions) + + +def state_match_key(state): + device_prefix = f"Device={state['device']}" + state_name = state["name"] + if state_name == device_prefix: + return "" + if state_name.startswith(f"{device_prefix} "): + return state_name[len(device_prefix) + 1 :] + return state_name + + +def group_states_by_match_key(states): + grouped = {} + for state in states: + grouped.setdefault(state_match_key(state), []).append(state) + return grouped + + +def state_group_counts(grouped_states): + return Counter( + {state_name: len(states) for state_name, states in grouped_states.items()} + ) + + +def format_device_ids(device_ids): + return ", ".join(str(device_id) for device_id in device_ids) + + +def parse_device_filter(device_arg, option_name): + device_arg = device_arg.strip() + if device_arg.lower() == "all": + return None + + values = [value.strip() for value in device_arg.split(",")] + if not all(values): + raise ValueError( + f"{option_name} must be 'all', a non-negative integer, " + "or comma-separated non-negative integers" + ) + + try: + device_ids = [int(value) for value in values] + except ValueError as exc: + raise ValueError( + f"{option_name} must be 'all', a non-negative integer, " + "or comma-separated non-negative integers" + ) from exc + if any(device_id < 0 for device_id in device_ids): + raise ValueError( + f"{option_name} must be 'all', a non-negative integer, " + "or comma-separated non-negative integers" + ) + return device_ids + + +def select_devices(all_devices, device_filter, option_name): + if device_filter is None: + return list(all_devices) + + devices_by_id = {device["id"]: device for device in all_devices} + missing_ids = [ + device_id for device_id in device_filter if device_id not in devices_by_id + ] + if missing_ids: + raise ValueError( + f"{option_name} requested device id(s) not present in input: " + f"{format_device_ids(missing_ids)}" + ) + + return [devices_by_id[device_id] for device_id in device_filter] + + +def resolve_benchmark_device_ids(bench, device_filter, option_name): + if device_filter is None: + return list(bench["devices"]) + + benchmark_device_ids = set(bench["devices"]) + missing_ids = [ + device_id + for device_id in device_filter + if device_id not in benchmark_device_ids + ] + if missing_ids: + raise ValueError( + f"benchmark {bench['name']!r} does not contain {option_name} " + f"device id(s): {format_device_ids(missing_ids)}" + ) + + return device_filter + + +def require_matching_device_sections(reference_device_filter, compare_device_filter): + return reference_device_filter is None and compare_device_filter is None + + +# TODO(opavlyk): replace with Emoji(StrEnum) after EOL of Python 3.10 +class Emoji(str, Enum): YELLOW = "\U0001f7e1" BLUE = "\U0001f535" GREEN = "\U0001f7e2" @@ -42,13 +194,153 @@ class Emoji(StrEnum): def colorize(msg: str, fore: Fore, emoji: Emoji, no_color: bool) -> str: if no_color: prefix = "" - if emoji_s := str(emoji): + if emoji_s := emoji.value: prefix = f"{emoji_s} " return f"{prefix}{msg}" else: return f"{fore}{msg}{Fore.RESET}" +def lookup_summary(summaries, tag): + return next((summary for summary in summaries if summary["tag"] == tag), None) + + +def extract_summary_value(summary): + summary_tag = summary.get("tag", "") + for value_data in summary.get("data", []): + if value_data.get("name") != "value": + continue + + value_type = value_data.get("type") + if value_type != "float64": + raise ValueError( + f"summary {summary_tag!r} field 'value' has type " + f"{value_type!r}; expected 'float64'" + ) + if "value" not in value_data: + raise ValueError(f"summary {summary_tag!r} field 'value' is missing value") + return value_data["value"] + + raise ValueError(f"summary {summary_tag!r} is missing field 'value'") + + +def normalize_float_value(value, *, null_value=None): + if value is None: + return null_value + return float(value) + + +def extract_summary_float(summaries, tag, *, null_value=None): + summary = lookup_summary(summaries, tag) + if summary is None: + return None + return normalize_float_value(extract_summary_value(summary), null_value=null_value) + + +def extract_gpu_time_summary(summaries): + return GpuTimeSummary( + minimum=extract_summary_float(summaries, GPU_TIME_MIN_TAG), + maximum=extract_summary_float(summaries, GPU_TIME_MAX_TAG), + mean=extract_summary_float(summaries, GPU_TIME_MEAN_TAG), + stdev=extract_summary_float(summaries, GPU_TIME_STDEV_TAG, null_value=math.inf), + stdev_relative=extract_summary_float( + summaries, GPU_TIME_STDEV_RELATIVE_TAG, null_value=math.inf + ), + median=extract_summary_float(summaries, GPU_TIME_MEDIAN_TAG), + interquartile_range=extract_summary_float( + summaries, GPU_TIME_IR_TAG, null_value=math.inf + ), + interquartile_range_relative=extract_summary_float( + summaries, GPU_TIME_IR_RELATIVE_TAG, null_value=math.inf + ), + ) + + +def compute_relative_dispersion(dispersion, center): + if ( + dispersion is None + or center is None + or center <= 0 + or not math.isfinite(center) + or dispersion < 0 + or math.isnan(dispersion) + ): + return None + return dispersion / center + + +def has_robust_estimate(summary): + return summary.median is not None and ( + summary.interquartile_range_relative is not None + or summary.interquartile_range is not None + ) + + +def has_mean_estimate(summary): + return summary.mean is not None and ( + summary.stdev_relative is not None or summary.stdev is not None + ) + + +def select_relative_dispersion(relative_dispersion, absolute_dispersion, center): + if relative_dispersion is not None: + return relative_dispersion + return compute_relative_dispersion(absolute_dispersion, center) + + +def compute_common_time_estimates(ref_summary, cmp_summary): + if has_robust_estimate(ref_summary) and has_robust_estimate(cmp_summary): + return ( + TimeEstimate( + center=ref_summary.median, + relative_dispersion=select_relative_dispersion( + ref_summary.interquartile_range_relative, + ref_summary.interquartile_range, + ref_summary.median, + ), + ), + TimeEstimate( + center=cmp_summary.median, + relative_dispersion=select_relative_dispersion( + cmp_summary.interquartile_range_relative, + cmp_summary.interquartile_range, + cmp_summary.median, + ), + ), + ) + + if has_mean_estimate(ref_summary) and has_mean_estimate(cmp_summary): + return ( + TimeEstimate( + center=ref_summary.mean, + relative_dispersion=select_relative_dispersion( + ref_summary.stdev_relative, ref_summary.stdev, ref_summary.mean + ), + ), + TimeEstimate( + center=cmp_summary.mean, + relative_dispersion=select_relative_dispersion( + cmp_summary.stdev_relative, cmp_summary.stdev, cmp_summary.mean + ), + ), + ) + + return ( + TimeEstimate( + center=ref_summary.mean, + relative_dispersion=compute_relative_dispersion( + ref_summary.stdev, ref_summary.mean + ), + ), + TimeEstimate( + center=cmp_summary.mean, + relative_dispersion=compute_relative_dispersion( + cmp_summary.stdev, cmp_summary.mean + ), + ), + ) + + def find_matching_bench(needle, haystack): for hay in haystack: if hay["name"] == needle["name"]: @@ -69,8 +361,8 @@ def format_int64_axis_value(axis_name, axis_value, axes): value = int(axis_value["value"]) if axis_flags == "pow2": value = math.log2(value) - return "2^%d" % value - return "%d" % value + return f"2^{value:.0f}" + return f"{value:d}" def format_float64_axis_value(axis_name, axis_value, axes): @@ -78,11 +370,11 @@ def format_float64_axis_value(axis_name, axis_value, axes): def format_type_axis_value(axis_name, axis_value, axes): - return "%s" % axis_value["value"] + return f"{axis_value['value']}" def format_string_axis_value(axis_name, axis_value, axes): - return "%s" % axis_value["value"] + return f"{axis_value['value']}" def format_axis_value(axis_name, axis_value, axes): @@ -98,10 +390,10 @@ def format_axis_value(axis_name, axis_value, axes): return format_string_axis_value(axis_name, axis_value, axes) -def make_display(name: str, display_values: [list[str]]) -> str: +def make_display(name: str, display_values: list[str]) -> str: open_bracket, close_bracket = ("[", "]") if len(display_values) > 1 else ("", "") - display_values = ",".join(display_values) - return f"{name}={open_bracket}{display_values}{close_bracket}" + joined_values = ",".join(display_values) + return f"{name}={open_bracket}{joined_values}{close_bracket}" def parse_axis_filters(axis_args): @@ -152,6 +444,53 @@ def parse_axis_filters(axis_args): return filters +def build_benchmark_filter_plan(filter_actions): + global_axis_args = [] + benchmark_scopes = [] + current_scope = None + + for action_kind, action_value in filter_actions or []: + if action_kind == "benchmark": + current_scope = {"benchmark_name": action_value, "axis_args": []} + benchmark_scopes.append(current_scope) + elif current_scope is None: + global_axis_args.append(action_value) + else: + current_scope["axis_args"].append(action_value) + + return BenchmarkFilterPlan( + global_axis_filters=parse_axis_filters(global_axis_args), + benchmark_scopes=[ + BenchmarkFilterScope( + benchmark_name=scope["benchmark_name"], + axis_filters=parse_axis_filters(scope["axis_args"]), + ) + for scope in benchmark_scopes + ], + ) + + +def benchmark_is_selected(benchmark_name, filter_plan): + return not filter_plan.benchmark_scopes or any( + scope.benchmark_name == benchmark_name for scope in filter_plan.benchmark_scopes + ) + + +def axis_filter_groups_for_benchmark(benchmark_name, filter_plan): + if not filter_plan.benchmark_scopes: + return [filter_plan.global_axis_filters] + + matching_scopes = [ + scope + for scope in filter_plan.benchmark_scopes + if scope.benchmark_name == benchmark_name + ] + return [ + filter_plan.global_axis_filters + scope.axis_filters + for scope in matching_scopes + ] + + def matches_axis_filters(state, axis_filters): if not axis_filters: return True @@ -175,6 +514,23 @@ def matches_axis_filters(state, axis_filters): return True +def matches_axis_filter_groups(state, axis_filter_groups): + return any( + matches_axis_filters(state, axis_filters) for axis_filters in axis_filter_groups + ) + + +def matching_axis_filters(state, axis_filter_groups): + return next( + ( + axis_filters + for axis_filters in axis_filter_groups + if matches_axis_filters(state, axis_filters) + ), + [], + ) + + def format_duration(seconds): if seconds >= 1: multiplier = 1.0 @@ -188,16 +544,21 @@ def format_duration(seconds): else: multiplier = 1e6 units = "us" - return "%0.3f %s" % (seconds * multiplier, units) + return f"{seconds * multiplier:0.3f} {units}" def format_percentage(percentage): - # When there aren't enough samples for a meaningful noise measurement, - # the noise is recorded as infinity. Unfortunately, JSON spec doesn't - # allow for inf, so these get turned into null. if percentage is None: + return "n/a" + if math.isnan(percentage): + return "n/a" + if math.isinf(percentage): return "inf" - return "%0.2f%%" % (percentage * 100.0) + return f"{percentage * 100.0:0.2f}%" + + +def has_finite_noise(noise): + return noise is not None and math.isfinite(noise) def format_axis_values(axis_values, axes, axis_filters=None): @@ -298,9 +659,10 @@ def compare_benches( plot_along, plot, dark, - axis_filters, - benchmark_filters, + filter_plan, no_color, + reference_device_filter=None, + compare_device_filter=None, ): if plot_along: import matplotlib.pyplot as plt @@ -314,12 +676,28 @@ def compare_benches( ref_bench = find_matching_bench(cmp_bench, ref_benches) if not ref_bench: continue - if benchmark_filters and cmp_bench["name"] not in benchmark_filters: + if not benchmark_is_selected(cmp_bench["name"], filter_plan): continue + axis_filter_groups = axis_filter_groups_for_benchmark( + cmp_bench["name"], filter_plan + ) + + cmp_device_ids = resolve_benchmark_device_ids( + cmp_bench, compare_device_filter, "--compare-devices" + ) + ref_device_ids = resolve_benchmark_device_ids( + ref_bench, reference_device_filter, "--reference-devices" + ) + if len(cmp_device_ids) != len(ref_device_ids): + raise ValueError( + f"benchmark {cmp_bench['name']!r} has {len(ref_device_ids)} " + f"reference device(s) but {len(cmp_device_ids)} compare device(s); " + "nvbench_compare pairs devices by position, so each compared " + "benchmark must contain the same number of devices" + ) print(f"""# {cmp_bench["name"]}\n""") - cmp_device_ids = cmp_bench["devices"] axes = cmp_bench["axes"] ref_states = ref_bench["states"] cmp_states = cmp_bench["states"] @@ -344,20 +722,43 @@ def compare_benches( headers.append("Status") colalign.append("center") - for cmp_device_id in cmp_device_ids: - rows = [] - plot_data = {"cmp": {}, "ref": {}, "cmp_noise": {}, "ref_noise": {}} - - for cmp_state in cmp_states: - cmp_state_name = cmp_state["name"] - ref_state = next( - filter(lambda st: st["name"] == cmp_state_name, ref_states), None + for cmp_device_index, cmp_device_id in enumerate(cmp_device_ids): + ref_device_id = ref_device_ids[cmp_device_index] + ref_device_states = [ + state + for state in ref_states + if state["device"] == ref_device_id + and matches_axis_filter_groups(state, axis_filter_groups) + ] + cmp_device_states = [ + state + for state in cmp_states + if state["device"] == cmp_device_id + and matches_axis_filter_groups(state, axis_filter_groups) + ] + ref_states_by_name = group_states_by_match_key(ref_device_states) + cmp_states_by_name = group_states_by_match_key(cmp_device_states) + ref_state_counts = state_group_counts(ref_states_by_name) + cmp_state_counts = state_group_counts(cmp_states_by_name) + if ref_state_counts != cmp_state_counts: + raise ValueError( + f"benchmark {cmp_bench['name']!r} device pair " + f"ref={ref_device_id} cmp={cmp_device_id} has mismatched " + f"state occurrences: ref={dict(ref_state_counts)}, " + f"cmp={dict(cmp_state_counts)}" ) - if not ref_state: - continue - if not matches_axis_filters(cmp_state, axis_filters): - continue + rows = [] + plot_data = {"cmp": {}, "ref": {}, "cmp_noise": {}, "ref_noise": {}} + counters = {} + + for cmp_state in cmp_device_states: + cmp_state_name = state_match_key(cmp_state) + occurrence = counters.get(cmp_state_name, 0) + counters[cmp_state_name] = occurrence + 1 + # Duplicate state names are matched by occurrence order within + # the filtered device section. + ref_state = ref_states_by_name[cmp_state_name][occurrence] axis_values = cmp_state["axis_values"] if not axis_values: axis_values = [] @@ -373,112 +774,85 @@ def compare_benches( if not ref_summaries or not cmp_summaries: continue - def lookup_summary(summaries, tag): - return next(filter(lambda s: s["tag"] == tag, summaries), None) - - cmp_time_summary = lookup_summary( - cmp_summaries, "nv/cold/time/gpu/mean" - ) - ref_time_summary = lookup_summary( - ref_summaries, "nv/cold/time/gpu/mean" - ) - cmp_noise_summary = lookup_summary( - cmp_summaries, "nv/cold/time/gpu/stdev/relative" - ) - ref_noise_summary = lookup_summary( - ref_summaries, "nv/cold/time/gpu/stdev/relative" - ) - # TODO: Use other timings, too. Maybe multiple rows, with a # "Timing" column + values "CPU/GPU/Batch"? - if not all( - [ - cmp_time_summary, - ref_time_summary, - cmp_noise_summary, - ref_noise_summary, - ] - ): + cmp_gpu_time = extract_gpu_time_summary(cmp_summaries) + ref_gpu_time = extract_gpu_time_summary(ref_summaries) + ref_estimate, cmp_estimate = compute_common_time_estimates( + ref_gpu_time, cmp_gpu_time + ) + + cmp_time = cmp_estimate.center + ref_time = ref_estimate.center + + if cmp_time is None or ref_time is None: continue - def extract_value(summary): - summary_data = summary["data"] - value_data = next( - filter(lambda v: v["name"] == "value", summary_data) - ) - assert value_data["type"] == "float64" - return value_data["value"] + if not math.isfinite(cmp_time) or not math.isfinite(ref_time): + continue - cmp_time = extract_value(cmp_time_summary) - ref_time = extract_value(ref_time_summary) - cmp_noise = extract_value(cmp_noise_summary) - ref_noise = extract_value(ref_noise_summary) + if cmp_time <= 0.0 or ref_time <= 0.0: + continue - # Convert string encoding to expected numerics: - cmp_time = float(cmp_time) - ref_time = float(ref_time) + cmp_noise = cmp_estimate.relative_dispersion + ref_noise = ref_estimate.relative_dispersion diff = cmp_time - ref_time frac_diff = diff / ref_time - if ref_noise and cmp_noise: - ref_noise = float(ref_noise) - cmp_noise = float(cmp_noise) - min_noise = min(ref_noise, cmp_noise) - elif ref_noise: - ref_noise = float(ref_noise) - min_noise = ref_noise - elif cmp_noise: - cmp_noise = float(cmp_noise) - min_noise = cmp_noise + if not has_finite_noise(ref_noise) or not has_finite_noise(cmp_noise): + max_noise = None else: - min_noise = None # Noise is inf + max_noise = max(ref_noise, cmp_noise) if plot_along: axis_name = [] - axis_value = "--" + axis_value = None for av in axis_values: if av["name"] != plot_along: axis_name.append(f"""{av["name"]} = {av["value"]}""") else: axis_value = float(av["value"]) - axis_name = ", ".join(axis_name) + if axis_value is not None: + axis_name = ", ".join(axis_name) - if axis_name not in plot_data["cmp"]: - plot_data["cmp"][axis_name] = {} - plot_data["ref"][axis_name] = {} - plot_data["cmp_noise"][axis_name] = {} - plot_data["ref_noise"][axis_name] = {} + if axis_name not in plot_data["cmp"]: + plot_data["cmp"][axis_name] = {} + plot_data["ref"][axis_name] = {} + plot_data["cmp_noise"][axis_name] = {} + plot_data["ref_noise"][axis_name] = {} - plot_data["cmp"][axis_name][axis_value] = cmp_time - plot_data["ref"][axis_name][axis_value] = ref_time - plot_data["cmp_noise"][axis_name][axis_value] = cmp_noise - plot_data["ref_noise"][axis_name][axis_value] = ref_noise + plot_data["cmp"][axis_name][axis_value] = cmp_time + plot_data["ref"][axis_name][axis_value] = ref_time + plot_data["cmp_noise"][axis_name][axis_value] = cmp_noise + plot_data["ref_noise"][axis_name][axis_value] = ref_noise global config_count global unknown_count global pass_count - global failure_count + global improvement_count + global regression_count config_count += 1 - if not min_noise: + if max_noise is None: unknown_count += 1 status_label = "????" status = colorize(status_label, Fore.YELLOW, Emoji.YELLOW, no_color) - elif abs(frac_diff) <= min_noise: + elif abs(frac_diff) <= max_noise: pass_count += 1 status_label = "SAME" status = colorize(status_label, Fore.BLUE, Emoji.BLUE, no_color) elif diff < 0: - failure_count += 1 + improvement_count += 1 status_label = "FAST" status = colorize(status_label, Fore.GREEN, Emoji.GREEN, no_color) else: - failure_count += 1 + regression_count += 1 status_label = "SLOW" status = colorize(status_label, Fore.RED, Emoji.RED, no_color) if abs(frac_diff) >= threshold: + axis_filters = matching_axis_filters(cmp_state, axis_filter_groups) row.append(format_duration(ref_time)) row.append(format_percentage(ref_noise)) row.append(format_duration(cmp_time)) @@ -507,19 +881,19 @@ def extract_value(summary): continue cmp_device = find_device_by_id(cmp_device_id, all_cmp_devices) - ref_device = find_device_by_id(ref_state["device"], all_ref_devices) + ref_device = find_device_by_id(ref_device_id, all_ref_devices) + if ref_device is None or cmp_device is None: + raise ValueError( + f"benchmark {cmp_bench['name']!r} references device pair " + f"ref={ref_device_id} cmp={cmp_device_id}, but device metadata is missing" + ) if cmp_device == ref_device: - print("## [%d] %s\n" % (cmp_device["id"], cmp_device["name"])) + print(f"## [{cmp_device['id']}] {cmp_device['name']}\n") else: print( - "## [%d] %s vs. [%d] %s\n" - % ( - ref_device["id"], - ref_device["name"], - cmp_device["id"], - cmp_device["name"], - ) + f"## [{ref_device['id']}] {ref_device['name']} vs. " + f"[{cmp_device['id']}] {cmp_device['name']}\n" ) # colalign and github format require tabulate 0.8.3 if tabulate_version >= (0, 8, 3): @@ -534,39 +908,84 @@ def extract_value(summary): print("") if plot_along: - plt.xscale("log") - plt.yscale("log") - plt.xlabel(plot_along) - plt.ylabel("time [s]") - plt.title(cmp_device["name"]) - - def plot_line(key, shape, label): - x = [float(x) for x in plot_data[key][axis].keys()] - y = list(plot_data[key][axis].values()) - - noise = list(plot_data[key + "_noise"][axis].values()) - - top = [y[i] + y[i] * noise[i] for i in range(len(x))] - bottom = [y[i] - y[i] * noise[i] for i in range(len(x))] - - p = plt.plot(x, y, shape, marker="o", label=label) - plt.fill_between(x, bottom, top, color=p[0].get_color(), alpha=0.1) - - for axis in plot_data["cmp"].keys(): - plot_line("cmp", "-", axis) - plot_line("ref", "--", axis + " ref") - - plt.legend() - plt.show() + fig = plt.figure() + try: + plt.xscale("log") + plt.yscale("log") + plt.xlabel(plot_along) + plt.ylabel("time [s]") + plt.title(cmp_device["name"]) + + def plot_line(key, shape, label, data_axis, data=plot_data): + axis_times = data[key][data_axis] + if not axis_times: + return + axis_noise = data[key + "_noise"][data_axis] + series = sorted( + ( + ( + float(axis_value), + axis_times[axis_value], + axis_noise[axis_value], + ) + for axis_value in axis_times + ), + key=lambda item: item[0], + ) + x, y, noise = map(list, zip(*series, strict=True)) + + p = plt.plot(x, y, shape, marker="o", label=label) + + def plot_confidence_band(first, last): + if last - first < 2: + return + + band_x = x[first:last] + band_y = y[first:last] + band_noise = noise[first:last] + top = [ + band_y[i] + band_y[i] * band_noise[i] + for i in range(len(band_x)) + ] + bottom = [ + max( + band_y[i] - band_y[i] * band_noise[i], + band_y[i] * 0.001, + ) + for i in range(len(band_x)) + ] + plt.fill_between( + band_x, bottom, top, color=p[0].get_color(), alpha=0.1 + ) + + start = None + for i, noise_value in enumerate(noise): + if has_finite_noise(noise_value) and start is None: + start = i + if not has_finite_noise(noise_value) and start is not None: + plot_confidence_band(start, i) + start = None + + if start is not None: + plot_confidence_band(start, len(x)) + + for axis in plot_data["cmp"].keys(): + plot_line("cmp", "-", axis, axis) + plot_line("ref", "--", axis + " ref", axis) + + plt.legend() + plt.show() + finally: + plt.close(fig) if plot: title = "%SOL Bandwidth change" if len(comparison_device_names) == 1: title = f"{title} - {next(iter(comparison_device_names))}" - if axis_filters: + if filter_plan.global_axis_filters: axis_label = ", ".join( axis_filter["display"] - for axis_filter in axis_filters + for axis_filter in filter_plan.global_axis_filters if len(axis_filter["values"]) == 1 ) if axis_label: @@ -574,7 +993,14 @@ def plot_line(key, shape, label): plot_comparison_entries(comparison_entries, title=title, dark=dark) -def main(): +def main() -> int: + """ + Returns a process exit code. + - 0 means the comparison completed successfully. + - 1 signals an error has occurred. + + The number of detected regressions is reported in the summary output. + """ help_text = "%(prog)s [reference.json compare.json | reference_dir/ compare_dir/]" parser = argparse.ArgumentParser(prog="nvbench_compare", usage=help_text) parser.add_argument( @@ -612,32 +1038,51 @@ def main(): action="store_true", help="Use emoji instead of ANSI color codes (useful for GitHub issues/PRs)", ) + parser.add_argument( + "--reference-devices", + default="all", + help="Reference devices to compare: all, a non-negative integer id, or comma-separated ids", + ) + parser.add_argument( + "--compare-devices", + default="all", + help="Compare devices to compare: all, a non-negative integer id, or comma-separated ids", + ) parser.add_argument( "-a", "--axis", - action="append", - default=[], - help="Filter on axis value, e.g. -a Elements{io}=2^20 (can repeat)", + dest="filter_actions", + action=OrderedBenchmarkFilterAction, + help=( + "Filter on axis value, e.g. -a Elements{io}=2^20. Applies to the " + "most recent --benchmark, or all benchmarks if specified before any " + "--benchmark arguments." + ), ) parser.add_argument( "-b", "--benchmark", - action="append", - default=[], + dest="filter_actions", + action=OrderedBenchmarkFilterAction, help="Filter by benchmark name (can repeat)", ) args, files_or_dirs = parser.parse_known_args() - print(files_or_dirs) try: - axis_filters = parse_axis_filters(args.axis) + filter_plan = build_benchmark_filter_plan(args.filter_actions) + reference_device_filter = parse_device_filter( + args.reference_devices, "--reference-devices" + ) + compare_device_filter = parse_device_filter( + args.compare_devices, "--compare-devices" + ) except ValueError as exc: print(str(exc)) - sys.exit(1) + return 1 if len(files_or_dirs) != 2: parser.print_help() - sys.exit(1) + return 1 # if provided two directories, find all the exactly named files # in both and treat them as the reference and compare @@ -664,41 +1109,60 @@ def main(): global all_ref_devices global all_cmp_devices - all_ref_devices = ref_root["devices"] - all_cmp_devices = cmp_root["devices"] + try: + all_ref_devices = select_devices( + ref_root["devices"], reference_device_filter, "--reference-devices" + ) + all_cmp_devices = select_devices( + cmp_root["devices"], compare_device_filter, "--compare-devices" + ) + except ValueError as exc: + print(str(exc)) + return 1 - if ref_root["devices"] != cmp_root["devices"]: + if len(all_ref_devices) != len(all_cmp_devices): + print( + f"--reference-devices selected {len(all_ref_devices)} device(s), " + f"but --compare-devices selected {len(all_cmp_devices)} device(s)" + ) + return 1 + + if all_ref_devices != all_cmp_devices: warn_fore = Fore.YELLOW if args.ignore_devices else Fore.RED msg_text = "Device sections do not match" print(colorize(msg_text, warn_fore, Emoji.NONE, args.no_color), end="") print(": ", end="") - print( - jsondiff.diff( - ref_root["devices"], cmp_root["devices"], syntax="symmetric" - ) + print(jsondiff.diff(all_ref_devices, all_cmp_devices, syntax="symmetric")) + if not args.ignore_devices and require_matching_device_sections( + reference_device_filter, compare_device_filter + ): + return 1 + + try: + compare_benches( + ref_root["benchmarks"], + cmp_root["benchmarks"], + args.threshold, + args.plot_along, + args.plot, + args.dark, + filter_plan, + args.no_color, + reference_device_filter, + compare_device_filter, ) - if not args.ignore_devices: - sys.exit(1) - - compare_benches( - ref_root["benchmarks"], - cmp_root["benchmarks"], - args.threshold, - args.plot_along, - args.plot, - args.dark, - axis_filters, - args.benchmark, - args.no_color, - ) + except ValueError as exc: + print(str(exc)) + return 1 print("# Summary\n") - print("- Total Matches: %d" % config_count) - print(" - Pass (diff <= min_noise): %d" % pass_count) - print(" - Unknown (infinite noise): %d" % unknown_count) - print(" - Failure (diff > min_noise): %d" % failure_count) - return failure_count + print(f"- Total Matches: {config_count}") + print(f" - Pass (abs(%Diff) <= max_noise): {pass_count}") + print(f" - Improvement (abs(%Diff) > max_noise, %Diff < 0): {improvement_count}") + print(f" - Regression (abs(%Diff) > max_noise, %Diff > 0): {regression_count}") + print(f" - Unknown (infinite or unavailable noise): {unknown_count}") + return 0 if __name__ == "__main__": diff --git a/python/test/test_nvbench_compare.py b/python/test/test_nvbench_compare.py new file mode 100644 index 00000000..c6d8c147 --- /dev/null +++ b/python/test/test_nvbench_compare.py @@ -0,0 +1,593 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import importlib.util +import sys +import types +from pathlib import Path + +import pytest + + +@pytest.fixture +def nvbench_compare(monkeypatch): + class DummyLine: + def get_color(self): + return "black" + + pyplot = types.ModuleType("matplotlib.pyplot") + pyplot.figure = lambda *args, **kwargs: None + pyplot.xscale = lambda *args, **kwargs: None + pyplot.yscale = lambda *args, **kwargs: None + pyplot.xlabel = lambda *args, **kwargs: None + pyplot.ylabel = lambda *args, **kwargs: None + pyplot.title = lambda *args, **kwargs: None + pyplot.plot = lambda *args, **kwargs: [DummyLine()] + pyplot.fill_between = lambda *args, **kwargs: None + pyplot.legend = lambda *args, **kwargs: None + pyplot.show = lambda *args, **kwargs: None + pyplot.close = lambda *args, **kwargs: None + + matplotlib = types.ModuleType("matplotlib") + matplotlib.pyplot = pyplot + monkeypatch.setitem(sys.modules, "matplotlib", matplotlib) + monkeypatch.setitem(sys.modules, "matplotlib.pyplot", pyplot) + monkeypatch.setitem( + sys.modules, + "seaborn", + types.SimpleNamespace(set_theme=lambda *args, **kwargs: None), + ) + monkeypatch.setitem( + sys.modules, "jsondiff", types.SimpleNamespace(diff=lambda *args, **kwargs: {}) + ) + monkeypatch.setitem( + sys.modules, + "tabulate", + types.SimpleNamespace( + __version__="0.8.10", tabulate=lambda *args, **kwargs: "" + ), + ) + monkeypatch.setitem( + sys.modules, + "colorama", + types.SimpleNamespace( + Fore=types.SimpleNamespace( + BLUE="", + GREEN="", + RED="", + RESET="", + YELLOW="", + ) + ), + ) + + module_path = Path(__file__).resolve().parents[1] / "scripts" / "nvbench_compare.py" + spec = importlib.util.spec_from_file_location("nvbench_compare", module_path) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def make_state( + nvbench_compare, name, *, mean="1.0", noise="0.01", axis_value=None, device=0 +): + return { + "name": name, + "device": device, + "axis_values": [] + if axis_value is None + else [{"name": "A", "type": "int64", "value": axis_value}], + "summaries": [ + { + "tag": nvbench_compare.GPU_TIME_MEAN_TAG, + "data": [{"name": "value", "type": "float64", "value": mean}], + }, + { + "tag": nvbench_compare.GPU_TIME_STDEV_RELATIVE_TAG, + "data": [{"name": "value", "type": "float64", "value": noise}], + }, + ], + } + + +def make_summary(nvbench_compare, tag, value): + return { + "tag": getattr(nvbench_compare, tag), + "data": [{"name": "value", "type": "float64", "value": value}], + } + + +def make_benchmark(states, *, name="bench"): + devices = [] + for state in states: + if state["device"] not in devices: + devices.append(state["device"]) + + return { + "name": name, + "devices": devices, + "axes": [{"name": "A", "type": "int64", "flags": ""}] + if any(state["axis_values"] for state in states) + else [], + "states": states, + } + + +def set_test_devices(monkeypatch, nvbench_compare, ref_devices=None, cmp_devices=None): + devices = [{"id": 0, "name": "Test GPU"}] + monkeypatch.setattr( + nvbench_compare, + "all_ref_devices", + devices if ref_devices is None else ref_devices, + ) + monkeypatch.setattr( + nvbench_compare, + "all_cmp_devices", + devices if cmp_devices is None else cmp_devices, + ) + monkeypatch.setattr(nvbench_compare, "config_count", 0) + monkeypatch.setattr(nvbench_compare, "pass_count", 0) + monkeypatch.setattr(nvbench_compare, "improvement_count", 0) + monkeypatch.setattr(nvbench_compare, "regression_count", 0) + monkeypatch.setattr(nvbench_compare, "unknown_count", 0) + + +def make_filter_plan(nvbench_compare, filter_actions=None): + return nvbench_compare.build_benchmark_filter_plan(filter_actions or []) + + +def test_compare_benches_accepts_matching_duplicate_state_counts( + monkeypatch, nvbench_compare +): + set_test_devices(monkeypatch, nvbench_compare) + + ref_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "state1"), + make_state(nvbench_compare, "state1"), + make_state(nvbench_compare, "state2"), + ] + ) + ] + cmp_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "state1", mean="1.005"), + make_state(nvbench_compare, "state1", mean="1.005"), + make_state(nvbench_compare, "state2", mean="1.005"), + ] + ) + ] + + nvbench_compare.compare_benches( + ref_benches, + cmp_benches, + threshold=0.0, + plot_along=None, + plot=False, + dark=False, + filter_plan=make_filter_plan(nvbench_compare), + no_color=True, + ) + + assert nvbench_compare.config_count == 3 + assert nvbench_compare.pass_count == 3 + assert nvbench_compare.improvement_count == 0 + assert nvbench_compare.regression_count == 0 + assert nvbench_compare.unknown_count == 0 + + +def test_compare_benches_rejects_swapped_duplicate_state_counts( + monkeypatch, nvbench_compare +): + set_test_devices(monkeypatch, nvbench_compare) + + ref_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "state1"), + make_state(nvbench_compare, "state1"), + make_state(nvbench_compare, "state1"), + make_state(nvbench_compare, "state2"), + make_state(nvbench_compare, "state2"), + ] + ) + ] + cmp_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "state1"), + make_state(nvbench_compare, "state1"), + make_state(nvbench_compare, "state2"), + make_state(nvbench_compare, "state2"), + make_state(nvbench_compare, "state2"), + ] + ) + ] + + with pytest.raises(ValueError, match="mismatched state occurrences"): + nvbench_compare.compare_benches( + ref_benches, + cmp_benches, + threshold=0.0, + plot_along=None, + plot=False, + dark=False, + filter_plan=make_filter_plan(nvbench_compare), + no_color=True, + ) + + +def test_compare_benches_matches_duplicate_states_after_axis_filter( + monkeypatch, nvbench_compare +): + set_test_devices(monkeypatch, nvbench_compare) + + ref_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "state", mean="1.0", axis_value=1), + make_state(nvbench_compare, "state", mean="2.0", axis_value=2), + ] + ) + ] + cmp_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "state", mean="2.0", axis_value=2), + make_state(nvbench_compare, "state", mean="1.0", axis_value=1), + ] + ) + ] + + nvbench_compare.compare_benches( + ref_benches, + cmp_benches, + threshold=0.0, + plot_along=None, + plot=False, + dark=False, + filter_plan=make_filter_plan(nvbench_compare, [("axis", "A=2")]), + no_color=True, + ) + + assert nvbench_compare.config_count == 1 + assert nvbench_compare.pass_count == 1 + assert nvbench_compare.improvement_count == 0 + assert nvbench_compare.regression_count == 0 + assert nvbench_compare.unknown_count == 0 + + +def test_compare_benches_skips_non_finite_centers(monkeypatch, nvbench_compare): + set_test_devices(monkeypatch, nvbench_compare) + + ref_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "finite", mean="1.0"), + make_state(nvbench_compare, "nan", mean="nan"), + make_state(nvbench_compare, "inf", mean="inf"), + ] + ) + ] + cmp_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "finite", mean="1.0"), + make_state(nvbench_compare, "nan", mean="1.0"), + make_state(nvbench_compare, "inf", mean="1.0"), + ] + ) + ] + + nvbench_compare.compare_benches( + ref_benches, + cmp_benches, + threshold=0.0, + plot_along=None, + plot=False, + dark=False, + filter_plan=make_filter_plan(nvbench_compare), + no_color=True, + ) + + assert nvbench_compare.config_count == 1 + assert nvbench_compare.pass_count == 1 + assert nvbench_compare.improvement_count == 0 + assert nvbench_compare.regression_count == 0 + assert nvbench_compare.unknown_count == 0 + + +def test_compare_benches_prefers_median_and_iqr_when_available( + monkeypatch, nvbench_compare +): + set_test_devices(monkeypatch, nvbench_compare) + + ref_state = make_state(nvbench_compare, "state", mean="1.0", noise="0.01") + ref_state["summaries"].extend( + [ + make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.0"), + make_summary(nvbench_compare, "GPU_TIME_IR_RELATIVE_TAG", "0.01"), + ] + ) + cmp_state = make_state(nvbench_compare, "state", mean="1.0", noise="0.01") + cmp_state["summaries"].extend( + [ + make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.2"), + make_summary(nvbench_compare, "GPU_TIME_IR_RELATIVE_TAG", "0.01"), + ] + ) + + nvbench_compare.compare_benches( + [make_benchmark([ref_state])], + [make_benchmark([cmp_state])], + threshold=0.0, + plot_along=None, + plot=False, + dark=False, + filter_plan=make_filter_plan(nvbench_compare), + no_color=True, + ) + + assert nvbench_compare.config_count == 1 + assert nvbench_compare.pass_count == 0 + assert nvbench_compare.improvement_count == 0 + assert nvbench_compare.regression_count == 1 + assert nvbench_compare.unknown_count == 0 + + +def test_compare_benches_marks_unavailable_noise_unknown(monkeypatch, nvbench_compare): + set_test_devices(monkeypatch, nvbench_compare) + + missing_noise_ref = make_state(nvbench_compare, "missing_noise") + missing_noise_ref["summaries"] = [ + make_summary(nvbench_compare, "GPU_TIME_MEAN_TAG", "1.0") + ] + missing_noise_cmp = make_state(nvbench_compare, "missing_noise") + missing_noise_cmp["summaries"] = [ + make_summary(nvbench_compare, "GPU_TIME_MEAN_TAG", "1.001") + ] + + null_noise_ref = make_state(nvbench_compare, "null_noise") + null_noise_ref["summaries"] = [ + make_summary(nvbench_compare, "GPU_TIME_MEAN_TAG", "1.0"), + make_summary(nvbench_compare, "GPU_TIME_STDEV_RELATIVE_TAG", None), + ] + null_noise_cmp = make_state(nvbench_compare, "null_noise") + null_noise_cmp["summaries"] = [ + make_summary(nvbench_compare, "GPU_TIME_MEAN_TAG", "1.001"), + make_summary(nvbench_compare, "GPU_TIME_STDEV_RELATIVE_TAG", None), + ] + + nvbench_compare.compare_benches( + [make_benchmark([missing_noise_ref, null_noise_ref])], + [make_benchmark([missing_noise_cmp, null_noise_cmp])], + threshold=0.0, + plot_along=None, + plot=False, + dark=False, + filter_plan=make_filter_plan(nvbench_compare), + no_color=True, + ) + + assert nvbench_compare.config_count == 2 + assert nvbench_compare.pass_count == 0 + assert nvbench_compare.improvement_count == 0 + assert nvbench_compare.regression_count == 0 + assert nvbench_compare.unknown_count == 2 + + +def test_plot_along_skips_states_without_selected_axis(monkeypatch, nvbench_compare): + set_test_devices(monkeypatch, nvbench_compare) + + ref_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "with_axis", axis_value=1), + make_state(nvbench_compare, "without_axis"), + ] + ) + ] + cmp_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "with_axis", axis_value=1), + make_state(nvbench_compare, "without_axis"), + ] + ) + ] + + nvbench_compare.compare_benches( + ref_benches, + cmp_benches, + threshold=0.0, + plot_along="A", + plot=False, + dark=False, + filter_plan=make_filter_plan(nvbench_compare), + no_color=True, + ) + + assert nvbench_compare.config_count == 2 + assert nvbench_compare.pass_count == 2 + assert nvbench_compare.improvement_count == 0 + assert nvbench_compare.regression_count == 0 + assert nvbench_compare.unknown_count == 0 + + +def test_device_filter_parser_accepts_all_and_duplicate_ids(nvbench_compare): + assert nvbench_compare.parse_device_filter(" all ", "--reference-devices") is None + assert nvbench_compare.parse_device_filter("0", "--reference-devices") == [0] + assert nvbench_compare.parse_device_filter("0, 2,0", "--reference-devices") == [ + 0, + 2, + 0, + ] + + +@pytest.mark.parametrize( + "device_arg", + [ + "", + " ", + "gpu", + "-1", + "0,gpu", + "0,-1", + "0,", + ",0", + ], +) +def test_device_filter_parser_rejects_invalid_values(nvbench_compare, device_arg): + with pytest.raises(ValueError, match="must be 'all'"): + nvbench_compare.parse_device_filter(device_arg, "--reference-devices") + + +def test_explicit_device_filters_downgrade_device_mismatch_to_warning(nvbench_compare): + assert nvbench_compare.require_matching_device_sections(None, None) + assert not nvbench_compare.require_matching_device_sections([0], None) + assert not nvbench_compare.require_matching_device_sections(None, [1]) + assert not nvbench_compare.require_matching_device_sections([0], [1]) + + +def test_compare_benches_pairs_filtered_devices_by_position( + monkeypatch, nvbench_compare +): + set_test_devices( + monkeypatch, + nvbench_compare, + ref_devices=[ + {"id": 0, "name": "Reference GPU 0"}, + {"id": 1, "name": "Reference GPU 1"}, + ], + cmp_devices=[ + {"id": 0, "name": "Compare GPU 0"}, + {"id": 1, "name": "Compare GPU 1"}, + ], + ) + + ref_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "Device=0", mean="1.0", device=0), + make_state(nvbench_compare, "Device=1", mean="9.0", device=1), + ] + ) + ] + cmp_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "Device=0", mean="9.0", device=0), + make_state(nvbench_compare, "Device=1", mean="1.0", device=1), + ] + ) + ] + + nvbench_compare.compare_benches( + ref_benches, + cmp_benches, + threshold=0.0, + plot_along=None, + plot=False, + dark=False, + filter_plan=make_filter_plan(nvbench_compare), + no_color=True, + reference_device_filter=[0], + compare_device_filter=[1], + ) + + assert nvbench_compare.config_count == 1 + assert nvbench_compare.pass_count == 1 + assert nvbench_compare.improvement_count == 0 + assert nvbench_compare.regression_count == 0 + assert nvbench_compare.unknown_count == 0 + + +def test_axis_filter_applies_to_most_recent_benchmark(monkeypatch, nvbench_compare): + set_test_devices(monkeypatch, nvbench_compare) + + ref_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "state", mean="1.0", axis_value=1), + make_state(nvbench_compare, "state", mean="2.0", axis_value=2), + ], + name="bench1", + ), + make_benchmark( + [ + make_state(nvbench_compare, "state", mean="3.0", axis_value=1), + make_state(nvbench_compare, "state", mean="4.0", axis_value=2), + ], + name="bench2", + ), + ] + cmp_benches = [ + make_benchmark( + [ + make_state(nvbench_compare, "state", mean="1.0", axis_value=1), + make_state(nvbench_compare, "state", mean="2.0", axis_value=2), + ], + name="bench1", + ), + make_benchmark( + [ + make_state(nvbench_compare, "state", mean="3.0", axis_value=1), + make_state(nvbench_compare, "state", mean="4.0", axis_value=2), + ], + name="bench2", + ), + ] + + nvbench_compare.compare_benches( + ref_benches, + cmp_benches, + threshold=0.0, + plot_along=None, + plot=False, + dark=False, + filter_plan=make_filter_plan( + nvbench_compare, + [("benchmark", "bench1"), ("axis", "A=2"), ("benchmark", "bench2")], + ), + no_color=True, + ) + + assert nvbench_compare.config_count == 3 + assert nvbench_compare.pass_count == 3 + assert nvbench_compare.improvement_count == 0 + assert nvbench_compare.regression_count == 0 + assert nvbench_compare.unknown_count == 0 + + +def test_main_returns_success_exit_code_when_regressions_are_detected( + monkeypatch, capsys, nvbench_compare +): + devices = [{"id": 0, "name": "Test GPU"}] + ref_root = { + "devices": devices, + "benchmarks": [ + make_benchmark([make_state(nvbench_compare, "state", mean="1.0")]) + ], + } + cmp_root = { + "devices": devices, + "benchmarks": [ + make_benchmark([make_state(nvbench_compare, "state", mean="1.2")]) + ], + } + + def read_file(path): + return ref_root if path == "ref.json" else cmp_root + + monkeypatch.setattr(nvbench_compare.reader, "read_file", read_file) + monkeypatch.setattr(sys, "argv", ["nvbench_compare", "ref.json", "cmp.json"]) + + assert nvbench_compare.main() == 0 + assert nvbench_compare.regression_count == 1 + assert ( + "Regression (abs(%Diff) > max_noise, %Diff > 0): 1" in capsys.readouterr().out + )