diff --git a/.github/reports/build-stats-report.html b/.github/reports/build-stats-report.html new file mode 100644 index 000000000..84c01d511 --- /dev/null +++ b/.github/reports/build-stats-report.html @@ -0,0 +1,196 @@ + + + + + DiskANN Release Build Stats + + + + + + +
+

DiskANN Release Build Statistics

+

Generated: {{ data.generated }} — Last 30 days ({{ data.dates.length }} runs)

+ +

Total Build Time Trend

+
+ +

Build Time by Crate (Top 15)

+
+ +

Total Binary Size Trend

+
+ +

Binary Size per Binary

+
+ +

Latest Cargo Bloat (diskann-benchmark)

+
{{ data.latest_cargo_bloat || 'No cargo bloat data available.' }}
+ +

Latest LLVM Lines (diskann-benchmark)

+
{{ data.latest_cargo_llvm_lines || 'No cargo llvm-lines data available.' }}
+ + +
+ + + + + diff --git a/.github/scripts/build-stats-report-data.py b/.github/scripts/build-stats-report-data.py new file mode 100644 index 000000000..3155d959b --- /dev/null +++ b/.github/scripts/build-stats-report-data.py @@ -0,0 +1,165 @@ +"""Aggregate build-stats artifacts into a JS data file for the HTML report. + +Reads from: + collected/runs.tsv — tab-separated: run_id, created_at, head_sha + collected// — contains cargo-timing.html, build-stats-size.json, cargo-bloat.txt + +Usage: python build-stats-report-data.py +""" +import json +import re +import sys +from datetime import datetime, timezone +from pathlib import Path + + +def parse_cargo_timing(html_path: Path) -> dict: + """Parse build times from a cargo-timing.html file.""" + if not html_path.exists(): + return {} + + html = html_path.read_text() + + m = re.search(r"DURATION\s*=\s*(\d+(?:\.\d+)?)", html) + total_s = float(m.group(1)) if m else 0 + + m2 = re.search(r"Total time:([^<]+)", html) + total_display = m2.group(1).strip() if m2 else f"{total_s:.1f}s" + + m = re.search(r"const UNIT_DATA\s*=\s*(\[.*?\]);", html, re.DOTALL) + if not m: + return {"total_wall_time_s": total_s, "total_time_display": total_display, "units": []} + + units = json.loads(m.group(1)) + units_sorted = sorted(units, key=lambda u: u.get("duration", 0), reverse=True) + + return { + "total_wall_time_s": total_s, + "total_time_display": total_display, + "units": [{"name": u["name"], "version": u.get("version", ""), "duration": u.get("duration", 0)} for u in units_sorted], + } + + +def main(): + collected_dir = Path(sys.argv[1]) + output_dir = Path(sys.argv[2]) + output_dir.mkdir(parents=True, exist_ok=True) + + runs_tsv = collected_dir / "runs.tsv" + + # Parse runs.tsv and load per-run artifacts + runs = [] + for line in runs_tsv.read_text().strip().splitlines(): + parts = line.split("\t") + if len(parts) < 3: + raise ValueError(f"Malformed line in runs.tsv: {line!r}") + run_id, created_at, head_sha = parts[0], parts[1], parts[2] 
+ run_dir = collected_dir / run_id + + timing_path = run_dir / "target/cargo-timings/cargo-timing.html" + bs_path = run_dir / "build-stats-size.json" + cb_path = run_dir / "cargo-bloat.txt" + ll_path = run_dir / "cargo-llvm-lines.txt" + + if not timing_path.exists(): # artifact expired or download skipped — skip this run instead of aborting the whole report + print(f"::warning::Skipping run {run_id}: missing cargo-timing.html in {run_dir}"); continue + + runs.append({ + "run_id": run_id, + "created_at": created_at, + "head_sha": head_sha, + "build_times": parse_cargo_timing(timing_path), + "binary_sizes": json.loads(bs_path.read_text()) if bs_path.exists() else [], + "cargo_bloat": cb_path.read_text() if cb_path.exists() else "", + "cargo_llvm_lines": ll_path.read_text() if ll_path.exists() else "", + }) + + runs.sort(key=lambda r: r["created_at"]) + + dates = [] + total_build_times = [] + crate_times: dict[str, list] = {} + total_binary_sizes = [] + per_binary: dict[str, list] = {} + + for run in runs: + dt_str = run.get("created_at", "") + dates.append(dt_str[:10] if dt_str else "?") + + bt = run.get("build_times", {}) + total_build_times.append(bt.get("total_wall_time_s", 0)) + + # Per-crate build times + units = bt.get("units", []) + seen = set() + for u in units: + name = u.get("name", "") + if name not in crate_times: + crate_times[name] = [None] * (len(dates) - 1) + crate_times[name].append(u.get("duration", 0)) + seen.add(name) + for name in crate_times: + if name not in seen: + crate_times[name].append(None) + + # Binary sizes + bs = run.get("binary_sizes", []) + total_binary_sizes.append(sum(b.get("bytes", 0) for b in bs)) + + seen_bins = set() + for b in bs: + bname = b.get("name", "") + if bname not in per_binary: + per_binary[bname] = [None] * (len(dates) - 1) + per_binary[bname].append(b.get("bytes", 0)) + seen_bins.add(bname) + for bname in per_binary: + if bname not in seen_bins: + per_binary[bname].append(None) + + # Top 15 crates by average duration + def avg(lst): + vals = [v for v in lst if v is not None] + return sum(vals) / len(vals) if vals 
else 0 + + top_crates = sorted(crate_times.keys(), key=lambda c: avg(crate_times[c]), reverse=True)[:15] + + # Latest cargo bloat and llvm-lines + latest_bloat = next((r["cargo_bloat"] for r in reversed(runs) if r.get("cargo_bloat")), "") + latest_llvm_lines = next((r["cargo_llvm_lines"] for r in reversed(runs) if r.get("cargo_llvm_lines")), "") + + # Latest run details + latest_run = None + if runs: + last = runs[-1] + bt = last.get("build_times", {}) + latest_run = { + "created_at": last.get("created_at", "?"), + "head_sha": last.get("head_sha", "?")[:12], + "total_time_display": bt.get("total_time_display", "?"), + "units": bt.get("units", []), + "binary_sizes": last.get("binary_sizes", []), + } + + build_data = { + "generated": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"), + "dates": dates, + "total_build_times": total_build_times, + "total_binary_sizes_mib": [s / 1048576 for s in total_binary_sizes], + "crate_datasets": [{"label": name, "data": crate_times[name]} for name in top_crates], + "binary_datasets": [ + {"label": name, "data": [b / 1048576 if b is not None else None for b in per_binary[name]]} + for name in sorted(per_binary.keys()) + ], + "latest_cargo_bloat": latest_bloat, + "latest_cargo_llvm_lines": latest_llvm_lines, + "latest_run": latest_run, + } + + js_path = output_dir / "build-stats-report.js" + js_path.write_text(f"const BUILD_DATA = {json.dumps(build_data, indent=2)};\n") + print(f"Generated {js_path} ({len(runs)} runs)") + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/build-stats-report.sh b/.github/scripts/build-stats-report.sh new file mode 100644 index 000000000..d1617ef6d --- /dev/null +++ b/.github/scripts/build-stats-report.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +# Collect build-stats artifacts from recent CI runs and generate an HTML report. 
+# +# Usage: build-stats-report.sh [github_repository] [collected_dir] [report_dir] +# +# Examples: +# build-stats-report.sh +# build-stats-report.sh microsoft/DiskANN collected report +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +GITHUB_REPOSITORY="${1:-microsoft/DiskANN}" + +if [ -z "${2:-}" ]; then + WORK_DIR=$(mktemp -d) + COLLECTED_DIR="$WORK_DIR/collected" + REPORT_DIR="$WORK_DIR/report" + mkdir -p "$COLLECTED_DIR" "$REPORT_DIR" +else + COLLECTED_DIR="$2" + REPORT_DIR="${3:?report_dir is required when collected_dir is provided}" +fi + +# GNU date: Linux/WSL. BSD date (macOS): use 'date -u -v-30d' instead. +if date -u -d '30 days ago' '+%Y' >/dev/null 2>&1; then + SINCE=$(date -u -d '30 days ago' '+%Y-%m-%dT%H:%M:%SZ') +else + SINCE=$(date -u -v-30d '+%Y-%m-%dT%H:%M:%SZ') +fi + +gh api --paginate \ + "repos/$GITHUB_REPOSITORY/actions/workflows/build-stats.yml/runs?status=success&created=>=$SINCE&per_page=100" \ + --jq '.workflow_runs[] | [.id, .created_at, .head_sha] | @tsv' \ + > "$COLLECTED_DIR/runs.tsv" || true + +if [ ! 
-s "$COLLECTED_DIR/runs.tsv" ]; then + echo "::error::No successful build-stats runs found in the last 30 days" + exit 1 +fi + +echo "Found $(wc -l < "$COLLECTED_DIR/runs.tsv") runs" + +while IFS=$'\t' read -r run_id created_at head_sha; do + gh run download "$run_id" --repo "$GITHUB_REPOSITORY" \ + --name build-stats --dir "$COLLECTED_DIR/$run_id" 2>/dev/null \ + || echo "::warning::Skipping run $run_id (artifact expired)" +done < "$COLLECTED_DIR/runs.tsv" + +python3 "$SCRIPT_DIR/build-stats-report-data.py" "$COLLECTED_DIR" "$REPORT_DIR" +cp "$SCRIPT_DIR/../reports/build-stats-report.html" "$REPORT_DIR/" + +echo "" +echo "Report: $REPORT_DIR/build-stats-report.html" diff --git a/.github/scripts/build-stats-size.py b/.github/scripts/build-stats-size.py new file mode 100644 index 000000000..5788830f6 --- /dev/null +++ b/.github/scripts/build-stats-size.py @@ -0,0 +1,21 @@ +"""Scan a release directory for executable binaries and write a JSON size report. + +Usage: python build-stats-size.py <release_dir> <output_json> +""" +import json +import os +import sys +from pathlib import Path + +release_dir = Path(sys.argv[1]) +output_file = Path(sys.argv[2]) + +binaries = [] +for p in sorted(release_dir.iterdir()): + if not p.is_file() or p.suffix in (".d", ".rlib", ".rmeta", ".o", ".dwp"): + continue + if not os.access(p, os.X_OK) or p.stat().st_size < 1024: + continue + binaries.append({"name": p.name, "bytes": p.stat().st_size}) + +output_file.write_text(json.dumps(binaries, indent=2)) diff --git a/.github/workflows/build-stats-report.yml b/.github/workflows/build-stats-report.yml new file mode 100644 index 000000000..48f7975be --- /dev/null +++ b/.github/workflows/build-stats-report.yml @@ -0,0 +1,47 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +# Aggregates release build statistics from the last 30 days of +# `build-stats.yml` runs and produces an HTML dashboard with trend charts. +# +# Runs daily at 08:00 UTC so the dashboard stays fresh; intended for weekly review. 
+ +on: + schedule: + - cron: "0 8 * * *" + workflow_dispatch: # allows manual triggering from the Actions UI + +name: Build Stats Report + +defaults: + run: + shell: bash + +permissions: + contents: read + actions: read + +env: + GH_TOKEN: ${{ github.token }} + +jobs: + report: + name: generate report + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Generate report + run: | + set -euo pipefail + mkdir -p collected + mkdir -p report + bash .github/scripts/build-stats-report.sh "${{ github.repository }}" collected report + + - name: Upload report + uses: actions/upload-artifact@v4 + with: + name: build-stats-report + path: report/ + retention-days: 90 + diff --git a/.github/workflows/build-stats.yml b/.github/workflows/build-stats.yml new file mode 100644 index 000000000..955e215a8 --- /dev/null +++ b/.github/workflows/build-stats.yml @@ -0,0 +1,63 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +# Build statistics workflow. +# +# Runs on every push to main to capture release build timings, binary sizes, +# and cargo-bloat analysis. The data is uploaded as structured JSON artifacts +# so the companion `build-stats-report.yml` workflow can aggregate trends. 
+ +on: + push: + branches: ["main"] + workflow_dispatch: # allows manual triggering from the Actions UI + +name: Build Stats + +env: + CARGO_TERM_COLOR: always + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + build-release: + name: release build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + run: rustup show + + - name: Install tools + uses: taiki-e/install-action@v2 + with: + tool: cargo-bloat,cargo-llvm-lines + + - name: Build release with timings + run: cargo clean && cargo build --workspace --release --locked --timings + + - name: Collect binary sizes + run: python3 .github/scripts/build-stats-size.py target/release build-stats-size.json + + - name: Run cargo bloat + run: cargo bloat --release --package diskann-benchmark -n 100 | tee cargo-bloat.txt + + - name: Run cargo llvm-lines + run: cargo llvm-lines --release --package diskann-benchmark | head -100 | tee cargo-llvm-lines.txt + + - name: Upload build stats + uses: actions/upload-artifact@v4 + with: + name: build-stats + path: | + target/cargo-timings/cargo-timing.html + build-stats-size.json + cargo-bloat.txt + cargo-llvm-lines.txt + retention-days: 90