From 1cc48fb071b8a9478d2a851086a4e5396e680555 Mon Sep 17 00:00:00 2001 From: "Alex Razumov (from Dev Box)" Date: Thu, 7 May 2026 16:57:11 -0700 Subject: [PATCH 01/30] Create two workflows --- .github/workflows/build-release.yml | 172 ++++++++++ .github/workflows/produce-build-stats.yml | 395 ++++++++++++++++++++++ 2 files changed, 567 insertions(+) create mode 100644 .github/workflows/build-release.yml create mode 100644 .github/workflows/produce-build-stats.yml diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml new file mode 100644 index 000000000..c9cd2126d --- /dev/null +++ b/.github/workflows/build-release.yml @@ -0,0 +1,172 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +# Release build statistics workflow. +# +# Runs on every push to main to capture release build timings, binary sizes, +# and cargo-bloat analysis. The data is uploaded as structured JSON artifacts +# so the companion `produce-build-stats.yml` workflow can aggregate trends. + +on: + push: + branches: ["main"] + +name: Release Build Stats + +concurrency: + group: ${{ github.workflow }}-${{ github.sha }} + cancel-in-progress: true + +env: + CARGO_TERM_COLOR: always + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + build-release: + name: release build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + run: rustup show + + - uses: Swatinem/rust-cache@v2 + with: + # Only cache registry/git — we cargo clean for cold build timings. + cache-targets: false + + - name: Install cargo-bloat + uses: taiki-e/install-action@v2 + with: + tool: cargo-bloat + + - name: Build release with timings + run: | + set -euxo pipefail + cargo clean + cargo build --workspace --release --locked --timings + + - name: Upload cargo-timing report + uses: actions/upload-artifact@v4 + with: + name: cargo-timing + path: | + target/cargo-timings/cargo-timing.html + retention-days: 90 + + - name: Parse and display build times + run: | + python3 - <<'PYEOF' + import re, json, sys + from pathlib import Path + + html_path = Path("target/cargo-timings/cargo-timing.html") + if not html_path.exists(): + print("::warning::cargo-timing.html not found") + sys.exit(0) + + html = html_path.read_text() + + # Extract wall-clock duration (seconds) + m = re.search(r'DURATION\s*=\s*(\d+(?:\.\d+)?)', html) + total_s = float(m.group(1)) if m else 0 + + # Extract human-readable total time from summary table + m2 = re.search(r'Total time:([^<]+)', html) + total_display = m2.group(1).strip() if m2 else f"{total_s:.1f}s" + + # Extract per-unit data from the embedded JSON + m = re.search(r'const UNIT_DATA\s*=\s*(\[.*?\]);', html, re.DOTALL) + if not m: + print("::warning::Could not parse UNIT_DATA from timing report") + sys.exit(0) + + units = json.loads(m.group(1)) + + # Sort by duration descending + units_sorted = sorted(units, key=lambda u: u.get("duration", 0), reverse=True) + + # Print markdown table to console + print(f"\n### Release Build Times (Total wall time: {total_display})\n") + print("| # | Crate | Version | Duration |") + print("|---|-------|---------|----------|") + for i, u in enumerate(units_sorted, 1): + name = u.get("name", "?") + version = u.get("version", "?") + duration = u.get("duration", 0) + print(f"| {i} | {name} | {version} | {duration:.1f}s |") + + # Write structured JSON artifact + stats = { + "total_wall_time_s": total_s, + "total_time_display": total_display, + "units": [ + { + "name": 
u["name"], + "version": u.get("version", ""), + "duration": u.get("duration", 0), + } + for u in units_sorted + ], + } + Path("build-times.json").write_text(json.dumps(stats, indent=2)) + PYEOF + + - name: Log binary sizes + run: | + set +x + echo "" + echo "### Release Binary Sizes" + echo "" + echo "| Binary | Size (bytes) | Size |" + echo "|--------|-------------|------|" + + python3 - <<'PYEOF' + import json, os + from pathlib import Path + + binaries = [] + release_dir = Path("target/release") + for p in sorted(release_dir.iterdir()): + if not p.is_file(): + continue + # On Linux, check executable bit and skip non-ELF extensions + if p.suffix in (".d", ".rlib", ".rmeta", ".o", ".dwp"): + continue + if not os.access(p, os.X_OK): + continue + size = p.stat().st_size + if size < 1024: + continue # skip tiny files (build scripts, etc.) + if size > 1048576: + human = f"{size / 1048576:.1f} MiB" + elif size > 1024: + human = f"{size / 1024:.1f} KiB" + else: + human = f"{size} B" + print(f"| {p.name} | {size} | {human} |") + binaries.append({"name": p.name, "bytes": size}) + + Path("binary-sizes.json").write_text(json.dumps(binaries, indent=2)) + PYEOF + + - name: Run cargo bloat + run: | + cargo bloat --release --package diskann-benchmark -n 100 | tee cargo-bloat.txt + + - name: Upload build stats + uses: actions/upload-artifact@v4 + with: + name: build-stats + path: | + build-times.json + binary-sizes.json + cargo-bloat.txt + retention-days: 90 diff --git a/.github/workflows/produce-build-stats.yml b/.github/workflows/produce-build-stats.yml new file mode 100644 index 000000000..63c5a3ea8 --- /dev/null +++ b/.github/workflows/produce-build-stats.yml @@ -0,0 +1,395 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +# Aggregates release build statistics from the last 30 days of +# `build-release.yml` runs and produces an HTML dashboard with trend charts. +# +# Intended for weekly scrum review. + +on: + schedule: + - cron: "0 8 * * *" + workflow_dispatch: + +name: Produce Build Stats Report + +defaults: + run: + shell: bash + +permissions: + contents: read + actions: read + +env: + GH_TOKEN: ${{ github.token }} + +jobs: + report: + name: generate report + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Collect artifacts from recent runs + run: | + set -euo pipefail + mkdir -p collected + + # List successful runs of build-release.yml from the last 30 days + SINCE=$(date -u -d '30 days ago' '+%Y-%m-%dT%H:%M:%SZ') + + echo "Fetching runs since $SINCE ..." + + gh api --paginate \ + "repos/${{ github.repository }}/actions/workflows/build-release.yml/runs?status=success&created=>=$SINCE&per_page=100" \ + --jq '.workflow_runs[] | [.id, .created_at, .head_sha] | @tsv' \ + > runs.tsv || true + + if [ ! -s runs.tsv ]; then + echo "::warning::No successful build-release runs found in the last 30 days" + echo '[]' > collected/all_runs.json + exit 0 + fi + + echo "Found $(wc -l < runs.tsv) runs" + + # For each run, download the build-stats artifact and build metadata + echo '[]' > collected/all_runs.json + + while IFS=$'\t' read -r run_id created_at head_sha; do + echo "Processing run $run_id ($created_at) ..." + ARTIFACT_DIR="collected/$run_id" + mkdir -p "$ARTIFACT_DIR" + + # Download build-stats artifact + if ! 
gh run download "$run_id" \ + --repo "${{ github.repository }}" \ + --name build-stats \ + --dir "$ARTIFACT_DIR/build-stats" 2>/dev/null; then + echo " ::warning::Could not download build-stats for run $run_id (artifact may have expired)" + rm -rf "$ARTIFACT_DIR" + continue + fi + echo " Downloaded build-stats for run $run_id" + + # Build a metadata record and append to collected data + python3 -c " + import json, sys + from pathlib import Path + + base = Path(sys.argv[1]) / 'build-stats' + record = { + 'run_id': int(sys.argv[2]), + 'created_at': sys.argv[3], + 'head_sha': sys.argv[4], + } + + bt = base / 'build-times.json' + if bt.exists(): + record['build_times'] = json.loads(bt.read_text()) + + bs = base / 'binary-sizes.json' + if bs.exists(): + record['binary_sizes'] = json.loads(bs.read_text()) + + cb = base / 'cargo-bloat.txt' + if cb.exists(): + record['cargo_bloat'] = cb.read_text() + + all_file = Path('collected/all_runs.json') + data = json.loads(all_file.read_text()) + data.append(record) + all_file.write_text(json.dumps(data)) + " "$ARTIFACT_DIR" "$run_id" "$created_at" "$head_sha" + + done < runs.tsv + + COUNT=$(python3 -c "import json; print(len(json.loads(open('collected/all_runs.json').read())))") + echo "Collected data for $COUNT runs" + + - name: Generate HTML report + run: | + python3 - <<'PYEOF' + import json + from pathlib import Path + from datetime import datetime + + data = json.loads(Path("collected/all_runs.json").read_text()) + + # Sort by date ascending + data.sort(key=lambda r: r.get("created_at", "")) + + # --- Prepare chart data --- + + dates = [] + total_build_times = [] + # crate_times: { crate_name: [time_per_run...] } + crate_times = {} + total_binary_sizes = [] + # per_binary: { binary_name: [size_per_run...] } + per_binary = {} + + for run in data: + dt_str = run.get("created_at", "") + dates.append(dt_str[:10] if dt_str else "?") + + bt = run.get("build_times", {}) + total_build_times.append(bt.get("total_wall_time_s", 0)) + + # Per-crate build times + units = bt.get("units", []) + seen_crates = set() + for u in units: + name = u.get("name", "") + if name not in crate_times: + crate_times[name] = [None] * (len(dates) - 1) + crate_times[name].append(u.get("duration", 0)) + seen_crates.add(name) + for name in crate_times: + if name not in seen_crates: + crate_times[name].append(None) + + # Binary sizes + bs = run.get("binary_sizes", []) + run_total_size = sum(b.get("bytes", 0) for b in bs) + total_binary_sizes.append(run_total_size) + + seen_bins = set() + for b in bs: + bname = b.get("name", "") + if bname not in per_binary: + per_binary[bname] = [None] * (len(dates) - 1) + per_binary[bname].append(b.get("bytes", 0)) + seen_bins.add(bname) + for bname in per_binary: + if bname not in seen_bins: + per_binary[bname].append(None) + + # Filter to top 15 crates by average duration + def avg_non_none(lst): + vals = [v for v in lst if v is not None] + return sum(vals) / len(vals) if vals else 0 + + top_crates = sorted(crate_times.keys(), key=lambda c: avg_non_none(crate_times[c]), reverse=True)[:15] + + # Latest cargo bloat + latest_bloat = "" + for run in reversed(data): + if run.get("cargo_bloat"): + latest_bloat = run["cargo_bloat"] + break + + # --- Color palette --- + COLORS = [ + "#4e79a7", "#f28e2b", "#e15759", "#76b7b2", "#59a14f", + "#edc948", "#b07aa1", "#ff9da7", "#9c755f", "#bab0ac", + "#86bcb6", "#8cd17d", "#b6992d", "#499894", "#d37295", + ] + + def js_array(lst): + return json.dumps(lst) + + def js_datasets_crates(): + datasets = [] + for i, 
name in enumerate(top_crates):
+                  color = COLORS[i % len(COLORS)]
+                  datasets.append({
+                      "label": name,
+                      "data": crate_times[name],
+                      "borderColor": color,
+                      "backgroundColor": color + "33",
+                      "tension": 0.3,
+                      "spanGaps": True,
+                  })
+              return json.dumps(datasets)
+
+          def js_datasets_binaries():
+              datasets = []
+              for i, name in enumerate(sorted(per_binary.keys())):
+                  color = COLORS[i % len(COLORS)]
+                  datasets.append({
+                      "label": name,
+                      "data": [b / 1048576 if b is not None else None for b in per_binary[name]],
+                      "borderColor": color,
+                      "backgroundColor": color + "33",
+                      "tension": 0.3,
+                      "spanGaps": True,
+                  })
+              return json.dumps(datasets)
+
+          # --- Generate HTML ---
+          now = datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC")
+
+          html = f"""<!DOCTYPE html>
+          <html>
+          <head>
+          <meta charset="utf-8">
+          <title>DiskANN Release Build Stats</title>
+          <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
+          </head>
+          <body>
+          <h1>DiskANN Release Build Statistics</h1>
+
+          <p>Generated: {now} — Last 30 days ({len(data)} runs)</p>
+
+          <h2>Total Build Time Trend</h2>
+          <canvas id="totalBuildTime"></canvas>
+
+          <h2>Build Time by Crate (Top 15)</h2>
+          <canvas id="crateBuildTime"></canvas>
+
+          <h2>Total Binary Size Trend</h2>
+          <canvas id="totalBinarySize"></canvas>
+
+          <h2>Binary Size per Binary</h2>
+          <canvas id="perBinarySize"></canvas>
+
+          <h2>Latest Cargo Bloat (diskann-benchmark)</h2>
+          <pre>{latest_bloat if latest_bloat else "No cargo bloat data available."}</pre>
+
+          <h2>Latest Build Details</h2>
+          """
+
+          # Add latest run details table
+          if data:
+              latest = data[-1]
+              bt = latest.get("build_times", {})
+              units = bt.get("units", [])
+              html += f"""
+          <p>Run: {latest.get('created_at', '?')} —
+          Commit: {latest.get('head_sha', '?')[:12]} —
+          Total wall time: {bt.get('total_time_display', '?')}</p>
+          <table>
+          <tr><th>#</th><th>Crate</th><th>Version</th><th>Duration</th></tr>
+          """
+              for i, u in enumerate(units, 1):
+                  html += f"<tr><td>{i}</td><td>{u.get('name','?')}</td><td>{u.get('version','?')}</td><td>{u.get('duration',0):.1f}s</td></tr>\n"
+              html += "</table>\n"
+
+              bs = latest.get("binary_sizes", [])
+              if bs:
+                  html += """
+          <h2>Binary Sizes</h2>
+          <table>
+          <tr><th>Binary</th><th>Size (bytes)</th><th>Size</th></tr>
+          """
+                  for b in bs:
+                      size = b.get("bytes", 0)
+                      human = f"{size / 1048576:.1f} MiB" if size > 1048576 else f"{size / 1024:.1f} KiB"
+                      html += f"<tr><td>{b.get('name','?')}</td><td>{size:,}</td><td>{human}</td></tr>
\n" + + html += f""" + + + + """ + + Path("build-stats-report.html").write_text(html) + print(f"Report generated: build-stats-report.html ({len(data)} runs)") + PYEOF + + - name: Upload report + uses: actions/upload-artifact@v4 + with: + name: build-stats-report + path: build-stats-report.html + retention-days: 90 From 9cda4e6f380467e020c479b47b368eeeab7f3f95 Mon Sep 17 00:00:00 2001 From: "Alex Razumov (from Dev Box)" Date: Thu, 7 May 2026 17:00:01 -0700 Subject: [PATCH 02/30] Add workflow dispatch --- .github/workflows/build-release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index c9cd2126d..fe66974df 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -10,6 +10,7 @@ on: push: branches: ["main"] + workflow_dispatch: name: Release Build Stats From 9d4301d80ec452945b2f91fde7a6c04a72caf0a2 Mon Sep 17 00:00:00 2001 From: "Alex Razumov (from Dev Box)" Date: Thu, 7 May 2026 17:08:37 -0700 Subject: [PATCH 03/30] Enable on push --- .github/workflows/build-release.yml | 4 +++- .github/workflows/produce-build-stats.yml | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index fe66974df..d7d07ca42 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -9,7 +9,9 @@ on: push: - branches: ["main"] + branches: + - "main" + - "u/arrayka/release_builds_ci" # temporary: remove before merge workflow_dispatch: name: Release Build Stats diff --git a/.github/workflows/produce-build-stats.yml b/.github/workflows/produce-build-stats.yml index 63c5a3ea8..8425fbd0b 100644 --- a/.github/workflows/produce-build-stats.yml +++ b/.github/workflows/produce-build-stats.yml @@ -7,6 +7,9 @@ # Intended for weekly scrum review. on: + push: + branches: + - "u/arrayka/release_builds_ci" # temporary: remove before merge schedule: - cron: "0 8 * * *" workflow_dispatch: From f2baa290bdc2ec3038fa16551a621e80265a1b41 Mon Sep 17 00:00:00 2001 From: "Alex Razumov (from Dev Box)" Date: Thu, 7 May 2026 18:02:40 -0700 Subject: [PATCH 04/30] Extract build stats HTML --- .github/scripts/build-stats-report.html | 176 +++++++++++++++ .github/workflows/produce-build-stats.yml | 264 ++++------------------ 2 files changed, 221 insertions(+), 219 deletions(-) create mode 100644 .github/scripts/build-stats-report.html diff --git a/.github/scripts/build-stats-report.html b/.github/scripts/build-stats-report.html new file mode 100644 index 000000000..61bf2c9c6 --- /dev/null +++ b/.github/scripts/build-stats-report.html @@ -0,0 +1,176 @@ + + + + + DiskANN Release Build Stats + + + + + +

+    <h1>DiskANN Release Build Statistics</h1>
+
+    <p id="meta"></p>
+
+    <h2>Total Build Time Trend</h2>
+    <canvas id="totalBuildTime"></canvas>
+
+    <h2>Build Time by Crate (Top 15)</h2>
+    <canvas id="crateBuildTime"></canvas>
+
+    <h2>Total Binary Size Trend</h2>
+    <canvas id="totalBinarySize"></canvas>
+
+    <h2>Binary Size per Binary</h2>
+    <canvas id="perBinarySize"></canvas>
+
+    <h2>Latest Cargo Bloat (diskann-benchmark)</h2>
+    <pre id="cargo-bloat">
+    </pre>
+
+    <h2>Latest Build Details</h2>
+    <div id="latest-details"></div>
+ + + + diff --git a/.github/workflows/produce-build-stats.yml b/.github/workflows/produce-build-stats.yml index 8425fbd0b..f0cc254c9 100644 --- a/.github/workflows/produce-build-stats.yml +++ b/.github/workflows/produce-build-stats.yml @@ -111,26 +111,21 @@ jobs: COUNT=$(python3 -c "import json; print(len(json.loads(open('collected/all_runs.json').read())))") echo "Collected data for $COUNT runs" - - name: Generate HTML report + - name: Generate data file for report run: | + mkdir -p report python3 - <<'PYEOF' import json from pathlib import Path from datetime import datetime data = json.loads(Path("collected/all_runs.json").read_text()) - - # Sort by date ascending data.sort(key=lambda r: r.get("created_at", "")) - # --- Prepare chart data --- - dates = [] total_build_times = [] - # crate_times: { crate_name: [time_per_run...] } crate_times = {} total_binary_sizes = [] - # per_binary: { binary_name: [size_per_run...] } per_binary = {} for run in data: @@ -140,7 +135,6 @@ jobs: bt = run.get("build_times", {}) total_build_times.append(bt.get("total_wall_time_s", 0)) - # Per-crate build times units = bt.get("units", []) seen_crates = set() for u in units: @@ -153,10 +147,8 @@ jobs: if name not in seen_crates: crate_times[name].append(None) - # Binary sizes bs = run.get("binary_sizes", []) - run_total_size = sum(b.get("bytes", 0) for b in bs) - total_binary_sizes.append(run_total_size) + total_binary_sizes.append(sum(b.get("bytes", 0) for b in bs)) seen_bins = set() for b in bs: @@ -169,12 +161,12 @@ jobs: if bname not in seen_bins: per_binary[bname].append(None) - # Filter to top 15 crates by average duration - def avg_non_none(lst): + # Top 15 crates by average duration + def avg(lst): vals = [v for v in lst if v is not None] return sum(vals) / len(vals) if vals else 0 - top_crates = sorted(crate_times.keys(), key=lambda c: avg_non_none(crate_times[c]), reverse=True)[:15] + top_crates = sorted(crate_times.keys(), key=lambda c: avg(crate_times[c]), reverse=True)[:15] # Latest cargo bloat latest_bloat = "" @@ -183,216 +175,50 @@ jobs: latest_bloat = run["cargo_bloat"] break - # --- Color palette --- - COLORS = [ - "#4e79a7", "#f28e2b", "#e15759", "#76b7b2", "#59a14f", - "#edc948", "#b07aa1", "#ff9da7", "#9c755f", "#bab0ac", - "#86bcb6", "#8cd17d", "#b6992d", "#499894", "#d37295", - ] - - def js_array(lst): - return json.dumps(lst) - - def js_datasets_crates(): - datasets = [] - for i, name in enumerate(top_crates): - color = COLORS[i % len(COLORS)] - datasets.append({ - "label": name, - "data": crate_times[name], - "borderColor": color, - "backgroundColor": color + "33", - "tension": 0.3, - "spanGaps": True, - }) - return json.dumps(datasets) - - def js_datasets_binaries(): - datasets = [] - for i, name in enumerate(sorted(per_binary.keys())): - color = COLORS[i % len(COLORS)] - datasets.append({ - "label": name, - "data": [b / 1048576 if b is not None else None for b in per_binary[name]], - "borderColor": color, - "backgroundColor": color + "33", - "tension": 0.3, - "spanGaps": True, - }) - return json.dumps(datasets) - - # --- Generate HTML --- - now = datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC") - - html = f""" - - - - DiskANN Release Build Stats - - - - -

-          <h1>DiskANN Release Build Statistics</h1>
-
-          <p>Generated: {now} — Last 30 days ({len(data)} runs)</p>
-
-          <h2>Total Build Time Trend</h2>
-          <canvas id="totalBuildTime"></canvas>
-
-          <h2>Build Time by Crate (Top 15)</h2>
-          <canvas id="crateBuildTime"></canvas>
-
-          <h2>Total Binary Size Trend</h2>
-          <canvas id="totalBinarySize"></canvas>
-
-          <h2>Binary Size per Binary</h2>
-          <canvas id="perBinarySize"></canvas>
-
-          <h2>Latest Cargo Bloat (diskann-benchmark)</h2>
-          <pre>{latest_bloat if latest_bloat else "No cargo bloat data available."}</pre>
-
-          <h2>Latest Build Details</h2>
-          """
-
-          # Add latest run details table
-          if data:
-              latest = data[-1]
-              bt = latest.get("build_times", {})
-              units = bt.get("units", [])
-              html += f"""
-          <p>Run: {latest.get('created_at', '?')} —
-          Commit: {latest.get('head_sha', '?')[:12]} —
-          Total wall time: {bt.get('total_time_display', '?')}</p>
-          <table>
-          <tr><th>#</th><th>Crate</th><th>Version</th><th>Duration</th></tr>
-          """
-              for i, u in enumerate(units, 1):
-                  html += f"<tr><td>{i}</td><td>{u.get('name','?')}</td><td>{u.get('version','?')}</td><td>{u.get('duration',0):.1f}s</td></tr>\n"
-              html += "</table>\n"
-
-              bs = latest.get("binary_sizes", [])
-              if bs:
-                  html += """
-          <h2>Binary Sizes</h2>
-          <table>
-          <tr><th>Binary</th><th>Size (bytes)</th><th>Size</th></tr>
-          """
-                  for b in bs:
-                      size = b.get("bytes", 0)
-                      human = f"{size / 1048576:.1f} MiB" if size > 1048576 else f"{size / 1024:.1f} KiB"
-                      html += f"<tr><td>{b.get('name','?')}</td><td>{size:,}</td><td>{human}</td></tr>
\n" - - html += f""" - - - - """ - - Path("build-stats-report.html").write_text(html) - print(f"Report generated: build-stats-report.html ({len(data)} runs)") + last = data[-1] + bt = last.get("build_times", {}) + latest_run = { + "created_at": last.get("created_at", "?"), + "head_sha": last.get("head_sha", "?")[:12], + "total_time_display": bt.get("total_time_display", "?"), + "units": bt.get("units", []), + "binary_sizes": last.get("binary_sizes", []), + } + + # Assemble the data object + build_data = { + "generated": datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC"), + "dates": dates, + "total_build_times": total_build_times, + "total_binary_sizes_mib": [s / 1048576 for s in total_binary_sizes], + "crate_datasets": [ + {"label": name, "data": crate_times[name]} + for name in top_crates + ], + "binary_datasets": [ + {"label": name, "data": [b / 1048576 if b is not None else None for b in per_binary[name]]} + for name in sorted(per_binary.keys()) + ], + "latest_cargo_bloat": latest_bloat, + "latest_run": latest_run, + } + + # Write as a JS file that assigns to a global constant + js_content = f"const BUILD_DATA = {json.dumps(build_data, indent=2)};\n" + Path("report/build-stats-data.js").write_text(js_content) + print(f"Generated build-stats-data.js ({len(data)} runs)") PYEOF + - name: Assemble report + run: | + cp .github/scripts/build-stats-report.html report/build-stats-report.html + - name: Upload report uses: actions/upload-artifact@v4 with: name: build-stats-report - path: build-stats-report.html + path: report/ retention-days: 90 From b25c600c8078b4ff845c253ca1804dfe804811cb Mon Sep 17 00:00:00 2001 From: "Alex Razumov (from Dev Box)" Date: Thu, 7 May 2026 18:07:59 -0700 Subject: [PATCH 05/30] Extract scripts --- .github/scripts/generate-stats-data.py | 100 +++++++++++++ .github/scripts/parse-build-stats.py | 67 +++++++++ .github/workflows/build-release.yml | 115 +-------------- .github/workflows/produce-build-stats.yml | 168 +++------------------- 4 files changed, 190 insertions(+), 260 deletions(-) create mode 100644 .github/scripts/generate-stats-data.py create mode 100644 .github/scripts/parse-build-stats.py diff --git a/.github/scripts/generate-stats-data.py b/.github/scripts/generate-stats-data.py new file mode 100644 index 000000000..c1b4338a9 --- /dev/null +++ b/.github/scripts/generate-stats-data.py @@ -0,0 +1,100 @@ +"""Aggregate build-stats artifacts into a JS data file for the HTML report.""" +import json +import sys +from datetime import datetime, timezone +from pathlib import Path + + +def main(): + collected_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("collected") + output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("report") + output_dir.mkdir(parents=True, exist_ok=True) + + data = json.loads((collected_dir / "all_runs.json").read_text()) + data.sort(key=lambda r: r.get("created_at", "")) + + dates = [] + total_build_times = [] + crate_times: dict[str, list] = {} + total_binary_sizes = [] + per_binary: dict[str, list] = {} + + for run in data: + dt_str = run.get("created_at", "") + dates.append(dt_str[:10] if dt_str else "?") + + bt = run.get("build_times", {}) + total_build_times.append(bt.get("total_wall_time_s", 0)) + + # Per-crate build times + units = bt.get("units", []) + seen = set() + for u in units: + name = u.get("name", "") + if name not in crate_times: + crate_times[name] = [None] * (len(dates) - 1) + crate_times[name].append(u.get("duration", 0)) + seen.add(name) + for name in crate_times: + if name not in seen: + 
crate_times[name].append(None) + + # Binary sizes + bs = run.get("binary_sizes", []) + total_binary_sizes.append(sum(b.get("bytes", 0) for b in bs)) + + seen_bins = set() + for b in bs: + bname = b.get("name", "") + if bname not in per_binary: + per_binary[bname] = [None] * (len(dates) - 1) + per_binary[bname].append(b.get("bytes", 0)) + seen_bins.add(bname) + for bname in per_binary: + if bname not in seen_bins: + per_binary[bname].append(None) + + # Top 15 crates by average duration + def avg(lst): + vals = [v for v in lst if v is not None] + return sum(vals) / len(vals) if vals else 0 + + top_crates = sorted(crate_times.keys(), key=lambda c: avg(crate_times[c]), reverse=True)[:15] + + # Latest cargo bloat + latest_bloat = next((r["cargo_bloat"] for r in reversed(data) if r.get("cargo_bloat")), "") + + # Latest run details + latest_run = None + if data: + last = data[-1] + bt = last.get("build_times", {}) + latest_run = { + "created_at": last.get("created_at", "?"), + "head_sha": last.get("head_sha", "?")[:12], + "total_time_display": bt.get("total_time_display", "?"), + "units": bt.get("units", []), + "binary_sizes": last.get("binary_sizes", []), + } + + build_data = { + "generated": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"), + "dates": dates, + "total_build_times": total_build_times, + "total_binary_sizes_mib": [s / 1048576 for s in total_binary_sizes], + "crate_datasets": [{"label": name, "data": crate_times[name]} for name in top_crates], + "binary_datasets": [ + {"label": name, "data": [b / 1048576 if b is not None else None for b in per_binary[name]]} + for name in sorted(per_binary.keys()) + ], + "latest_cargo_bloat": latest_bloat, + "latest_run": latest_run, + } + + js_path = output_dir / "build-stats-data.js" + js_path.write_text(f"const BUILD_DATA = {json.dumps(build_data, indent=2)};\n") + print(f"Generated {js_path} ({len(data)} runs)") + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/parse-build-stats.py b/.github/scripts/parse-build-stats.py new file mode 100644 index 000000000..53aa08976 --- /dev/null +++ b/.github/scripts/parse-build-stats.py @@ -0,0 +1,67 @@ +"""Parse cargo-timing.html and target/release binaries into JSON artifacts.""" +import json +import os +import re +import sys +from pathlib import Path + +html_path = Path("target/cargo-timings/cargo-timing.html") +if not html_path.exists(): + print("::warning::cargo-timing.html not found") + sys.exit(0) + +html = html_path.read_text() + +# --- Build times --- +m = re.search(r"DURATION\s*=\s*(\d+(?:\.\d+)?)", html) +total_s = float(m.group(1)) if m else 0 + +m2 = re.search(r"Total time:([^<]+)", html) +total_display = m2.group(1).strip() if m2 else f"{total_s:.1f}s" + +m = re.search(r"const UNIT_DATA\s*=\s*(\[.*?\]);", html, re.DOTALL) +if not m: + print("::warning::Could not parse UNIT_DATA from timing report") + sys.exit(0) + +units = json.loads(m.group(1)) +units_sorted = sorted(units, key=lambda u: u.get("duration", 0), reverse=True) + +# Print markdown table +print(f"\n### Release Build Times (Total wall time: {total_display})\n") +print("| # | Crate | Version | Duration |") +print("|---|-------|---------|----------|") +for i, u in enumerate(units_sorted, 1): + print(f"| {i} | {u.get('name', '?')} | {u.get('version', '?')} | {u.get('duration', 0):.1f}s |") + +Path("build-times.json").write_text(json.dumps({ + "total_wall_time_s": total_s, + "total_time_display": total_display, + "units": [{"name": u["name"], "version": u.get("version", ""), "duration": u.get("duration", 0)} 
for u in units_sorted], +}, indent=2)) + +# --- Binary sizes --- +print("\n### Release Binary Sizes\n") +print("| Binary | Size (bytes) | Size |") +print("|--------|-------------|------|") + +binaries = [] +release_dir = Path("target/release") +for p in sorted(release_dir.iterdir()): + if not p.is_file(): + continue + if p.suffix in (".d", ".rlib", ".rmeta", ".o", ".dwp"): + continue + if not os.access(p, os.X_OK): + continue + size = p.stat().st_size + if size < 1024: + continue + if size > 1048576: + human = f"{size / 1048576:.1f} MiB" + else: + human = f"{size / 1024:.1f} KiB" + print(f"| {p.name} | {size} | {human} |") + binaries.append({"name": p.name, "bytes": size}) + +Path("binary-sizes.json").write_text(json.dumps(binaries, indent=2)) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index d7d07ca42..2162ed2f0 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -42,7 +42,6 @@ jobs: - uses: Swatinem/rust-cache@v2 with: - # Only cache registry/git — we cargo clean for cold build timings. cache-targets: false - name: Install cargo-bloat @@ -51,124 +50,20 @@ jobs: tool: cargo-bloat - name: Build release with timings - run: | - set -euxo pipefail - cargo clean - cargo build --workspace --release --locked --timings + run: cargo clean && cargo build --workspace --release --locked --timings - - name: Upload cargo-timing report - uses: actions/upload-artifact@v4 - with: - name: cargo-timing - path: | - target/cargo-timings/cargo-timing.html - retention-days: 90 - - - name: Parse and display build times - run: | - python3 - <<'PYEOF' - import re, json, sys - from pathlib import Path - - html_path = Path("target/cargo-timings/cargo-timing.html") - if not html_path.exists(): - print("::warning::cargo-timing.html not found") - sys.exit(0) - - html = html_path.read_text() - - # Extract wall-clock duration (seconds) - m = re.search(r'DURATION\s*=\s*(\d+(?:\.\d+)?)', html) - total_s = float(m.group(1)) if m else 0 - - # Extract human-readable total time from summary table - m2 = re.search(r'Total time:([^<]+)', html) - total_display = m2.group(1).strip() if m2 else f"{total_s:.1f}s" - - # Extract per-unit data from the embedded JSON - m = re.search(r'const UNIT_DATA\s*=\s*(\[.*?\]);', html, re.DOTALL) - if not m: - print("::warning::Could not parse UNIT_DATA from timing report") - sys.exit(0) - - units = json.loads(m.group(1)) - - # Sort by duration descending - units_sorted = sorted(units, key=lambda u: u.get("duration", 0), reverse=True) - - # Print markdown table to console - print(f"\n### Release Build Times (Total wall time: {total_display})\n") - print("| # | Crate | Version | Duration |") - print("|---|-------|---------|----------|") - for i, u in enumerate(units_sorted, 1): - name = u.get("name", "?") - version = u.get("version", "?") - duration = u.get("duration", 0) - print(f"| {i} | {name} | {version} | {duration:.1f}s |") - - # Write structured JSON artifact - stats = { - "total_wall_time_s": total_s, - "total_time_display": total_display, - "units": [ - { - "name": u["name"], - "version": u.get("version", ""), - "duration": u.get("duration", 0), - } - for u in units_sorted - ], - } - Path("build-times.json").write_text(json.dumps(stats, indent=2)) - PYEOF - - - name: Log binary sizes - run: | - set +x - echo "" - echo "### Release Binary Sizes" - echo "" - echo "| Binary | Size (bytes) | Size |" - echo "|--------|-------------|------|" - - python3 - <<'PYEOF' - import json, os - from pathlib import Path - 
- binaries = [] - release_dir = Path("target/release") - for p in sorted(release_dir.iterdir()): - if not p.is_file(): - continue - # On Linux, check executable bit and skip non-ELF extensions - if p.suffix in (".d", ".rlib", ".rmeta", ".o", ".dwp"): - continue - if not os.access(p, os.X_OK): - continue - size = p.stat().st_size - if size < 1024: - continue # skip tiny files (build scripts, etc.) - if size > 1048576: - human = f"{size / 1048576:.1f} MiB" - elif size > 1024: - human = f"{size / 1024:.1f} KiB" - else: - human = f"{size} B" - print(f"| {p.name} | {size} | {human} |") - binaries.append({"name": p.name, "bytes": size}) - - Path("binary-sizes.json").write_text(json.dumps(binaries, indent=2)) - PYEOF + - name: Parse and display build stats + run: python3 .github/scripts/parse-build-stats.py - name: Run cargo bloat - run: | - cargo bloat --release --package diskann-benchmark -n 100 | tee cargo-bloat.txt + run: cargo bloat --release --package diskann-benchmark -n 100 | tee cargo-bloat.txt - name: Upload build stats uses: actions/upload-artifact@v4 with: name: build-stats path: | + target/cargo-timings/cargo-timing.html build-times.json binary-sizes.json cargo-bloat.txt diff --git a/.github/workflows/produce-build-stats.yml b/.github/workflows/produce-build-stats.yml index f0cc254c9..a5481383d 100644 --- a/.github/workflows/produce-build-stats.yml +++ b/.github/workflows/produce-build-stats.yml @@ -38,12 +38,8 @@ jobs: run: | set -euo pipefail mkdir -p collected - - # List successful runs of build-release.yml from the last 30 days SINCE=$(date -u -d '30 days ago' '+%Y-%m-%dT%H:%M:%SZ') - echo "Fetching runs since $SINCE ..." - gh api --paginate \ "repos/${{ github.repository }}/actions/workflows/build-release.yml/runs?status=success&created=>=$SINCE&per_page=100" \ --jq '.workflow_runs[] | [.id, .created_at, .head_sha] | @tsv' \ @@ -56,165 +52,36 @@ jobs: fi echo "Found $(wc -l < runs.tsv) runs" - - # For each run, download the build-stats artifact and build metadata echo '[]' > collected/all_runs.json while IFS=$'\t' read -r run_id created_at head_sha; do - echo "Processing run $run_id ($created_at) ..." - ARTIFACT_DIR="collected/$run_id" - mkdir -p "$ARTIFACT_DIR" - - # Download build-stats artifact - if ! gh run download "$run_id" \ - --repo "${{ github.repository }}" \ - --name build-stats \ - --dir "$ARTIFACT_DIR/build-stats" 2>/dev/null; then - echo " ::warning::Could not download build-stats for run $run_id (artifact may have expired)" - rm -rf "$ARTIFACT_DIR" + dir="collected/$run_id/build-stats" + if ! 
gh run download "$run_id" --repo "${{ github.repository }}" \ + --name build-stats --dir "$dir" 2>/dev/null; then + echo "::warning::Skipping run $run_id (artifact expired)" continue fi - echo " Downloaded build-stats for run $run_id" - - # Build a metadata record and append to collected data python3 -c " import json, sys from pathlib import Path - - base = Path(sys.argv[1]) / 'build-stats' - record = { - 'run_id': int(sys.argv[2]), - 'created_at': sys.argv[3], - 'head_sha': sys.argv[4], - } - - bt = base / 'build-times.json' - if bt.exists(): - record['build_times'] = json.loads(bt.read_text()) - - bs = base / 'binary-sizes.json' - if bs.exists(): - record['binary_sizes'] = json.loads(bs.read_text()) - - cb = base / 'cargo-bloat.txt' - if cb.exists(): - record['cargo_bloat'] = cb.read_text() - - all_file = Path('collected/all_runs.json') - data = json.loads(all_file.read_text()) + base = Path(sys.argv[1]) + record = {'run_id': int(sys.argv[2]), 'created_at': sys.argv[3], 'head_sha': sys.argv[4]} + for name, key in [('build-times.json','build_times'),('binary-sizes.json','binary_sizes'),('cargo-bloat.txt','cargo_bloat')]: + p = base / name + if p.exists(): + record[key] = json.loads(p.read_text()) if name.endswith('.json') else p.read_text() + all_f = Path('collected/all_runs.json') + data = json.loads(all_f.read_text()) data.append(record) - all_file.write_text(json.dumps(data)) - " "$ARTIFACT_DIR" "$run_id" "$created_at" "$head_sha" - + all_f.write_text(json.dumps(data)) + " "$dir" "$run_id" "$created_at" "$head_sha" done < runs.tsv - COUNT=$(python3 -c "import json; print(len(json.loads(open('collected/all_runs.json').read())))") - echo "Collected data for $COUNT runs" - - - name: Generate data file for report + - name: Generate report run: | mkdir -p report - python3 - <<'PYEOF' - import json - from pathlib import Path - from datetime import datetime - - data = json.loads(Path("collected/all_runs.json").read_text()) - data.sort(key=lambda r: r.get("created_at", "")) - - dates = [] - total_build_times = [] - crate_times = {} - total_binary_sizes = [] - per_binary = {} - - for run in data: - dt_str = run.get("created_at", "") - dates.append(dt_str[:10] if dt_str else "?") - - bt = run.get("build_times", {}) - total_build_times.append(bt.get("total_wall_time_s", 0)) - - units = bt.get("units", []) - seen_crates = set() - for u in units: - name = u.get("name", "") - if name not in crate_times: - crate_times[name] = [None] * (len(dates) - 1) - crate_times[name].append(u.get("duration", 0)) - seen_crates.add(name) - for name in crate_times: - if name not in seen_crates: - crate_times[name].append(None) - - bs = run.get("binary_sizes", []) - total_binary_sizes.append(sum(b.get("bytes", 0) for b in bs)) - - seen_bins = set() - for b in bs: - bname = b.get("name", "") - if bname not in per_binary: - per_binary[bname] = [None] * (len(dates) - 1) - per_binary[bname].append(b.get("bytes", 0)) - seen_bins.add(bname) - for bname in per_binary: - if bname not in seen_bins: - per_binary[bname].append(None) - - # Top 15 crates by average duration - def avg(lst): - vals = [v for v in lst if v is not None] - return sum(vals) / len(vals) if vals else 0 - - top_crates = sorted(crate_times.keys(), key=lambda c: avg(crate_times[c]), reverse=True)[:15] - - # Latest cargo bloat - latest_bloat = "" - for run in reversed(data): - if run.get("cargo_bloat"): - latest_bloat = run["cargo_bloat"] - break - - # Latest run details - latest_run = None - if data: - last = data[-1] - bt = last.get("build_times", {}) - 
latest_run = { - "created_at": last.get("created_at", "?"), - "head_sha": last.get("head_sha", "?")[:12], - "total_time_display": bt.get("total_time_display", "?"), - "units": bt.get("units", []), - "binary_sizes": last.get("binary_sizes", []), - } - - # Assemble the data object - build_data = { - "generated": datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC"), - "dates": dates, - "total_build_times": total_build_times, - "total_binary_sizes_mib": [s / 1048576 for s in total_binary_sizes], - "crate_datasets": [ - {"label": name, "data": crate_times[name]} - for name in top_crates - ], - "binary_datasets": [ - {"label": name, "data": [b / 1048576 if b is not None else None for b in per_binary[name]]} - for name in sorted(per_binary.keys()) - ], - "latest_cargo_bloat": latest_bloat, - "latest_run": latest_run, - } - - # Write as a JS file that assigns to a global constant - js_content = f"const BUILD_DATA = {json.dumps(build_data, indent=2)};\n" - Path("report/build-stats-data.js").write_text(js_content) - print(f"Generated build-stats-data.js ({len(data)} runs)") - PYEOF - - - name: Assemble report - run: | - cp .github/scripts/build-stats-report.html report/build-stats-report.html + python3 .github/scripts/generate-stats-data.py collected report + cp .github/scripts/build-stats-report.html report/ - name: Upload report uses: actions/upload-artifact@v4 @@ -222,3 +89,4 @@ jobs: name: build-stats-report path: report/ retention-days: 90 + From 50e9d08e8a9d322b96eded2178d1d8c098341916 Mon Sep 17 00:00:00 2001 From: "Alex Razumov (from Dev Box)" Date: Thu, 7 May 2026 18:17:31 -0700 Subject: [PATCH 06/30] Improve scripts --- .github/scripts/generate-stats-data.py | 49 +++++++++++++++++++---- .github/workflows/produce-build-stats.yml | 26 ++---------- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/.github/scripts/generate-stats-data.py b/.github/scripts/generate-stats-data.py index c1b4338a9..8739b363f 100644 --- a/.github/scripts/generate-stats-data.py +++ b/.github/scripts/generate-stats-data.py @@ -1,4 +1,11 @@ -"""Aggregate build-stats artifacts into a JS data file for the HTML report.""" +"""Aggregate build-stats artifacts into a JS data file for the HTML report. 
+ +Reads from: + collected/runs.tsv — tab-separated: run_id, created_at, head_sha + collected// — contains build-times.json, binary-sizes.json, cargo-bloat.txt + +Usage: python generate-stats-data.py +""" import json import sys from datetime import datetime, timezone @@ -10,8 +17,34 @@ def main(): output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("report") output_dir.mkdir(parents=True, exist_ok=True) - data = json.loads((collected_dir / "all_runs.json").read_text()) - data.sort(key=lambda r: r.get("created_at", "")) + runs_tsv = collected_dir / "runs.tsv" + + # Parse runs.tsv and load per-run artifacts + runs = [] + for line in runs_tsv.read_text().strip().splitlines(): + parts = line.split("\t") + if len(parts) < 3: + continue + run_id, created_at, head_sha = parts[0], parts[1], parts[2] + run_dir = collected_dir / run_id + + bt_path = run_dir / "build-times.json" + bs_path = run_dir / "binary-sizes.json" + cb_path = run_dir / "cargo-bloat.txt" + + if not bt_path.exists(): + continue # skip runs without data + + runs.append({ + "run_id": run_id, + "created_at": created_at, + "head_sha": head_sha, + "build_times": json.loads(bt_path.read_text()) if bt_path.exists() else {}, + "binary_sizes": json.loads(bs_path.read_text()) if bs_path.exists() else [], + "cargo_bloat": cb_path.read_text() if cb_path.exists() else "", + }) + + runs.sort(key=lambda r: r["created_at"]) dates = [] total_build_times = [] @@ -19,7 +52,7 @@ def main(): total_binary_sizes = [] per_binary: dict[str, list] = {} - for run in data: + for run in runs: dt_str = run.get("created_at", "") dates.append(dt_str[:10] if dt_str else "?") @@ -62,12 +95,12 @@ def avg(lst): top_crates = sorted(crate_times.keys(), key=lambda c: avg(crate_times[c]), reverse=True)[:15] # Latest cargo bloat - latest_bloat = next((r["cargo_bloat"] for r in reversed(data) if r.get("cargo_bloat")), "") + latest_bloat = next((r["cargo_bloat"] for r in reversed(runs) if r.get("cargo_bloat")), "") # Latest run details latest_run = None - if data: - last = data[-1] + if runs: + last = runs[-1] bt = last.get("build_times", {}) latest_run = { "created_at": last.get("created_at", "?"), @@ -93,7 +126,7 @@ def avg(lst): js_path = output_dir / "build-stats-data.js" js_path.write_text(f"const BUILD_DATA = {json.dumps(build_data, indent=2)};\n") - print(f"Generated {js_path} ({len(data)} runs)") + print(f"Generated {js_path} ({len(runs)} runs)") if __name__ == "__main__": diff --git a/.github/workflows/produce-build-stats.yml b/.github/workflows/produce-build-stats.yml index a5481383d..f8e9e24b3 100644 --- a/.github/workflows/produce-build-stats.yml +++ b/.github/workflows/produce-build-stats.yml @@ -47,34 +47,16 @@ jobs: if [ ! -s runs.tsv ]; then echo "::warning::No successful build-release runs found in the last 30 days" - echo '[]' > collected/all_runs.json exit 0 fi echo "Found $(wc -l < runs.tsv) runs" - echo '[]' > collected/all_runs.json + cp runs.tsv collected/runs.tsv while IFS=$'\t' read -r run_id created_at head_sha; do - dir="collected/$run_id/build-stats" - if ! 
gh run download "$run_id" --repo "${{ github.repository }}" \ - --name build-stats --dir "$dir" 2>/dev/null; then - echo "::warning::Skipping run $run_id (artifact expired)" - continue - fi - python3 -c " - import json, sys - from pathlib import Path - base = Path(sys.argv[1]) - record = {'run_id': int(sys.argv[2]), 'created_at': sys.argv[3], 'head_sha': sys.argv[4]} - for name, key in [('build-times.json','build_times'),('binary-sizes.json','binary_sizes'),('cargo-bloat.txt','cargo_bloat')]: - p = base / name - if p.exists(): - record[key] = json.loads(p.read_text()) if name.endswith('.json') else p.read_text() - all_f = Path('collected/all_runs.json') - data = json.loads(all_f.read_text()) - data.append(record) - all_f.write_text(json.dumps(data)) - " "$dir" "$run_id" "$created_at" "$head_sha" + gh run download "$run_id" --repo "${{ github.repository }}" \ + --name build-stats --dir "collected/$run_id" 2>/dev/null \ + || echo "::warning::Skipping run $run_id (artifact expired)" done < runs.tsv - name: Generate report From 4858e4eaf151b84e3fe3db346b3fa03b1b6962b3 Mon Sep 17 00:00:00 2001 From: "Alex Razumov (from Dev Box)" Date: Thu, 7 May 2026 18:22:01 -0700 Subject: [PATCH 07/30] Switched template to Vue 3 --- .github/scripts/build-stats-report.html | 275 +++++++++++++----------- 1 file changed, 146 insertions(+), 129 deletions(-) diff --git a/.github/scripts/build-stats-report.html b/.github/scripts/build-stats-report.html index 61bf2c9c6..55a24c119 100644 --- a/.github/scripts/build-stats-report.html +++ b/.github/scripts/build-stats-report.html @@ -3,6 +3,7 @@ DiskANN Release Build Stats + +

     <h1>DiskANN Release Build Statistics</h1>

-    <p id="meta"></p>
+    <p>Generated: {{ data.generated }} — Last 30 days ({{ data.dates.length }} runs)</p>

     <h2>Total Build Time Trend</h2>
-    <canvas id="totalBuildTime"></canvas>
+    <canvas ref="totalBuildTime"></canvas>

     <h2>Build Time by Crate (Top 15)</h2>
-    <canvas id="crateBuildTime"></canvas>
+    <canvas ref="crateBuildTime"></canvas>

     <h2>Total Binary Size Trend</h2>
-    <canvas id="totalBinarySize"></canvas>
+    <canvas ref="totalBinarySize"></canvas>

     <h2>Binary Size per Binary</h2>
-    <canvas id="perBinarySize"></canvas>
+    <canvas ref="perBinarySize"></canvas>

     <h2>Latest Cargo Bloat (diskann-benchmark)</h2>
-    <pre id="cargo-bloat">
-    </pre>
+    <pre>{{ data.latest_cargo_bloat || 'No cargo bloat data available.' }}</pre>

     <h2>Latest Build Details</h2>
-    <div id="latest-details"></div>
- - + return { data, formatSize, totalBuildTime, crateBuildTime, totalBinarySize, perBinarySize }; + } +}).mount('#app'); + + From cc38f5bb5cb91f3db610a49c673ee1053cc43ea0 Mon Sep 17 00:00:00 2001 From: "Alex Razumov (from Dev Box)" Date: Thu, 7 May 2026 18:33:15 -0700 Subject: [PATCH 08/30] Latest Build Details (Top 20 Crates) --- .github/scripts/build-stats-report.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/build-stats-report.html b/.github/scripts/build-stats-report.html index 55a24c119..65caa5707 100644 --- a/.github/scripts/build-stats-report.html +++ b/.github/scripts/build-stats-report.html @@ -50,7 +50,7 @@

     <h2>Latest Cargo Bloat (diskann-benchmark)</h2>
     <pre>{{ data.latest_cargo_bloat || 'No cargo bloat data available.' }}</pre>