diff --git a/.github/reports/build-stats-report.html b/.github/reports/build-stats-report.html
new file mode 100644
index 000000000..84c01d511
--- /dev/null
+++ b/.github/reports/build-stats-report.html
@@ -0,0 +1,196 @@
+
+
+
+
+ DiskANN Release Build Stats
+
+
+
+
+
+
+
+
DiskANN Release Build Statistics
+
Generated: {{ data.generated }} — Last 30 days ({{ data.dates.length }} runs)
+
+
Total Build Time Trend
+
+
+
Build Time by Crate (Top 15)
+
+
+
Total Binary Size Trend
+
+
+
Binary Size per Binary
+
+
+
Latest Cargo Bloat (diskann-benchmark)
+
{{ data.latest_cargo_bloat || 'No cargo bloat data available.' }}
+
+
Latest LLVM Lines (diskann-benchmark)
+
{{ data.latest_cargo_llvm_lines || 'No cargo llvm-lines data available.' }}
+
+
+ Latest Build Details (Top 20 Crates)
+
+ Run: {{ data.latest_run.created_at }} —
+ Commit: {{ data.latest_run.head_sha }} —
+ Total wall time: {{ data.latest_run.total_time_display }}
+
+
+
+ | # | Crate | Version | Duration |
+
+
+ | {{ i + 1 }} |
+ {{ u.name }} |
+ {{ u.version }} |
+ {{ u.duration.toFixed(1) }}s |
+
+
+
+
+
+ Binary Sizes
+
+ | Binary | Size (bytes) | Size |
+
+
+ | {{ b.name }} |
+ {{ b.bytes.toLocaleString() }} |
+ {{ formatSize(b.bytes) }} |
+
+
+
+
+
+
+
+
+
+
+
diff --git a/.github/scripts/build-stats-report-data.py b/.github/scripts/build-stats-report-data.py
new file mode 100644
index 000000000..3155d959b
--- /dev/null
+++ b/.github/scripts/build-stats-report-data.py
@@ -0,0 +1,165 @@
+"""Aggregate build-stats artifacts into a JS data file for the HTML report.
+
+Reads from:
+ collected/runs.tsv — tab-separated: run_id, created_at, head_sha
+    collected/<run_id>/ — contains cargo-timing.html, build-stats-size.json, cargo-bloat.txt
+
+Usage: python build-stats-report-data.py <collected_dir> <output_dir>
+"""
+import json
+import re
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+
def parse_cargo_timing(html_path: Path) -> dict:
    """Parse build times from a cargo-timing.html file.

    Returns a dict with:
      total_wall_time_s   — overall wall-clock build time in seconds
      total_time_display  — human-readable total (falls back to seconds)
      units               — [{name, version, duration}] sorted longest-first
    Returns {} when the file does not exist.
    """
    if not html_path.exists():
        return {}

    html = html_path.read_text()

    # Overall duration is embedded as `DURATION = <seconds>` in inline JS.
    m = re.search(r"DURATION\s*=\s*(\d+(?:\.\d+)?)", html)
    total_s = float(m.group(1)) if m else 0

    # BUG FIX: the previous pattern `Total time:([^<]+) | ` had a stray
    # alternation whose right branch was a lone space, so the search matched
    # the first space in the document with group(1) == None and then crashed
    # on .strip(). Match only the labelled value, and fall back to the
    # numeric duration when the label is missing or empty.
    m2 = re.search(r"Total time:\s*([^<]+)", html)
    total_display = m2.group(1).strip() if m2 else f"{total_s:.1f}s"
    if not total_display:
        total_display = f"{total_s:.1f}s"

    # Per-unit timings are embedded as a JSON array in `const UNIT_DATA = [...]`.
    m = re.search(r"const UNIT_DATA\s*=\s*(\[.*?\]);", html, re.DOTALL)
    if not m:
        return {"total_wall_time_s": total_s, "total_time_display": total_display, "units": []}

    units = json.loads(m.group(1))
    units_sorted = sorted(units, key=lambda u: u.get("duration", 0), reverse=True)

    return {
        "total_wall_time_s": total_s,
        "total_time_display": total_display,
        "units": [
            {"name": u["name"], "version": u.get("version", ""), "duration": u.get("duration", 0)}
            for u in units_sorted
        ],
    }
+
+
def main():
    """Aggregate collected build-stats artifacts into a JS data file.

    CLI: build-stats-report-data.py <collected_dir> <output_dir>

    Reads <collected_dir>/runs.tsv plus one artifact directory per run,
    builds 30-day trend series (total build time, per-crate times, total
    and per-binary sizes), and writes <output_dir>/build-stats-report.js
    defining a single `BUILD_DATA` constant consumed by the HTML report.
    """
    # Fail with a usage message instead of a cryptic IndexError on missing args.
    if len(sys.argv) < 3:
        sys.exit("Usage: build-stats-report-data.py <collected_dir> <output_dir>")
    collected_dir = Path(sys.argv[1])
    output_dir = Path(sys.argv[2])
    output_dir.mkdir(parents=True, exist_ok=True)

    runs_tsv = collected_dir / "runs.tsv"

    # Parse runs.tsv (run_id \t created_at \t head_sha) and load per-run artifacts.
    runs = []
    for line in runs_tsv.read_text().strip().splitlines():
        parts = line.split("\t")
        if len(parts) < 3:
            raise ValueError(f"Malformed line in runs.tsv: {line!r}")
        run_id, created_at, head_sha = parts[0], parts[1], parts[2]
        run_dir = collected_dir / run_id

        timing_path = run_dir / "target/cargo-timings/cargo-timing.html"
        bs_path = run_dir / "build-stats-size.json"
        cb_path = run_dir / "cargo-bloat.txt"
        ll_path = run_dir / "cargo-llvm-lines.txt"

        # cargo-timing.html is the one mandatory artifact per run.
        if not timing_path.exists():
            raise FileNotFoundError(f"Missing cargo-timing.html for run {run_id} in {run_dir}")

        runs.append({
            "run_id": run_id,
            "created_at": created_at,
            "head_sha": head_sha,
            "build_times": parse_cargo_timing(timing_path),
            "binary_sizes": json.loads(bs_path.read_text()) if bs_path.exists() else [],
            "cargo_bloat": cb_path.read_text() if cb_path.exists() else "",
            "cargo_llvm_lines": ll_path.read_text() if ll_path.exists() else "",
        })

    # Chronological order; ISO-8601 timestamps sort lexicographically.
    runs.sort(key=lambda r: r["created_at"])

    dates = []
    total_build_times = []
    crate_times: dict[str, list] = {}   # crate name -> per-run durations (None = absent)
    total_binary_sizes = []
    per_binary: dict[str, list] = {}    # binary name -> per-run byte sizes (None = absent)

    for run in runs:
        dt_str = run.get("created_at", "")
        dates.append(dt_str[:10] if dt_str else "?")

        bt = run.get("build_times", {})
        total_build_times.append(bt.get("total_wall_time_s", 0))

        # Per-crate build times. Every series is kept aligned with `dates`
        # by back-filling None for runs where a crate does not appear.
        units = bt.get("units", [])
        seen = set()
        for u in units:
            name = u.get("name", "")
            if name in seen:
                # BUG FIX: a unit name can repeat within one run (e.g. the
                # same crate at two versions — TODO confirm against real
                # cargo-timing output). Appending twice would misalign the
                # series; keep only the first occurrence, which is the
                # longest since units arrive sorted by duration.
                continue
            if name not in crate_times:
                crate_times[name] = [None] * (len(dates) - 1)
            crate_times[name].append(u.get("duration", 0))
            seen.add(name)
        for name in crate_times:
            if name not in seen:
                crate_times[name].append(None)

        # Binary sizes (same None back-filling scheme).
        bs = run.get("binary_sizes", [])
        total_binary_sizes.append(sum(b.get("bytes", 0) for b in bs))

        seen_bins = set()
        for b in bs:
            bname = b.get("name", "")
            if bname in seen_bins:
                continue  # duplicate names would misalign the series
            if bname not in per_binary:
                per_binary[bname] = [None] * (len(dates) - 1)
            per_binary[bname].append(b.get("bytes", 0))
            seen_bins.add(bname)
        for bname in per_binary:
            if bname not in seen_bins:
                per_binary[bname].append(None)

    # Top 15 crates ranked by average duration across runs where they appear.
    def avg(lst):
        vals = [v for v in lst if v is not None]
        return sum(vals) / len(vals) if vals else 0

    top_crates = sorted(crate_times.keys(), key=lambda c: avg(crate_times[c]), reverse=True)[:15]

    # Most recent non-empty cargo-bloat / llvm-lines snapshots.
    latest_bloat = next((r["cargo_bloat"] for r in reversed(runs) if r.get("cargo_bloat")), "")
    latest_llvm_lines = next((r["cargo_llvm_lines"] for r in reversed(runs) if r.get("cargo_llvm_lines")), "")

    # Detail panel for the most recent run.
    latest_run = None
    if runs:
        last = runs[-1]
        bt = last.get("build_times", {})
        latest_run = {
            "created_at": last.get("created_at", "?"),
            "head_sha": last.get("head_sha", "?")[:12],
            "total_time_display": bt.get("total_time_display", "?"),
            "units": bt.get("units", []),
            "binary_sizes": last.get("binary_sizes", []),
        }

    build_data = {
        "generated": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC"),
        "dates": dates,
        "total_build_times": total_build_times,
        "total_binary_sizes_mib": [s / 1048576 for s in total_binary_sizes],
        "crate_datasets": [{"label": name, "data": crate_times[name]} for name in top_crates],
        "binary_datasets": [
            {"label": name, "data": [b / 1048576 if b is not None else None for b in per_binary[name]]}
            for name in sorted(per_binary.keys())
        ],
        "latest_cargo_bloat": latest_bloat,
        "latest_cargo_llvm_lines": latest_llvm_lines,
        "latest_run": latest_run,
    }

    js_path = output_dir / "build-stats-report.js"
    js_path.write_text(f"const BUILD_DATA = {json.dumps(build_data, indent=2)};\n")
    print(f"Generated {js_path} ({len(runs)} runs)")


if __name__ == "__main__":
    main()
diff --git a/.github/scripts/build-stats-report.sh b/.github/scripts/build-stats-report.sh
new file mode 100644
index 000000000..d1617ef6d
--- /dev/null
+++ b/.github/scripts/build-stats-report.sh
@@ -0,0 +1,56 @@
#!/usr/bin/env bash
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

# Collect build-stats artifacts from recent CI runs and generate an HTML report.
#
# Usage: build-stats-report.sh [github_repository] [collected_dir] [report_dir]
#
# Examples:
#   build-stats-report.sh
#   build-stats-report.sh microsoft/DiskANN collected report

# BUG FIX: strict mode was missing even though the script's own `${2:-}`
# and `${3:?...}` expansions assume nounset, and the deliberate `|| true`
# below only makes sense under errexit. Without -e a failed download or
# python step was silently ignored.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

GITHUB_REPOSITORY="${1:-microsoft/DiskANN}"

# When no collected_dir is given, work in a throwaway temp directory.
if [ -z "${2:-}" ]; then
  WORK_DIR=$(mktemp -d)
  COLLECTED_DIR="$WORK_DIR/collected"
  REPORT_DIR="$WORK_DIR/report"
  mkdir -p "$COLLECTED_DIR" "$REPORT_DIR"
else
  COLLECTED_DIR="$2"
  REPORT_DIR="${3:?report_dir is required when collected_dir is provided}"
fi

# GNU date: Linux/WSL. BSD date (macOS): use 'date -u -v-30d' instead.
if date -u -d '30 days ago' '+%Y' >/dev/null 2>&1; then
  SINCE=$(date -u -d '30 days ago' '+%Y-%m-%dT%H:%M:%SZ')
else
  SINCE=$(date -u -v-30d '+%Y-%m-%dT%H:%M:%SZ')
fi

# List successful build-stats runs in the window. '|| true' keeps an empty
# result from aborting under errexit; the -s check below handles it.
gh api --paginate \
  "repos/$GITHUB_REPOSITORY/actions/workflows/build-stats.yml/runs?status=success&created=>=$SINCE&per_page=100" \
  --jq '.workflow_runs[] | [.id, .created_at, .head_sha] | @tsv' \
  > "$COLLECTED_DIR/runs.tsv" || true

if [ ! -s "$COLLECTED_DIR/runs.tsv" ]; then
  echo "::warning::No successful build-stats runs found in the last 30 days"
  exit 1
fi

echo "Found $(wc -l < "$COLLECTED_DIR/runs.tsv") runs"

# Download each run's artifact; expired artifacts are skipped with a warning.
while IFS=$'\t' read -r run_id created_at head_sha; do
  gh run download "$run_id" --repo "$GITHUB_REPOSITORY" \
    --name build-stats --dir "$COLLECTED_DIR/$run_id" 2>/dev/null \
    || echo "::warning::Skipping run $run_id (artifact expired)"
done < "$COLLECTED_DIR/runs.tsv"

python3 "$SCRIPT_DIR/build-stats-report-data.py" "$COLLECTED_DIR" "$REPORT_DIR"
cp "$SCRIPT_DIR/../reports/build-stats-report.html" "$REPORT_DIR/"

echo ""
echo "Report: $REPORT_DIR/build-stats-report.html"
diff --git a/.github/scripts/build-stats-size.py b/.github/scripts/build-stats-size.py
new file mode 100644
index 000000000..5788830f6
--- /dev/null
+++ b/.github/scripts/build-stats-size.py
@@ -0,0 +1,21 @@
"""Scan a release directory for executable binaries and write a JSON size report.

Usage: python build-stats-size.py <release_dir> <output_file>
"""
import json
import os
import sys
from pathlib import Path

# Intermediate build artifacts that are never shippable binaries.
_SKIP_SUFFIXES = (".d", ".rlib", ".rmeta", ".o", ".dwp")


def collect_binary_sizes(release_dir: Path) -> list:
    """Return [{"name", "bytes"}] for each executable file in release_dir.

    Skips intermediate artifacts (by suffix), non-executable files, and
    tiny files (< 1 KiB) that are unlikely to be real binaries. Sorted by
    file name for stable output.
    """
    binaries = []
    for p in sorted(release_dir.iterdir()):
        if not p.is_file() or p.suffix in _SKIP_SUFFIXES:
            continue
        size = p.stat().st_size  # stat once; reused for the filter and the report
        if not os.access(p, os.X_OK) or size < 1024:
            continue
        binaries.append({"name": p.name, "bytes": size})
    return binaries


def main() -> None:
    # Fail with a usage message instead of an IndexError on missing args.
    if len(sys.argv) < 3:
        sys.exit("Usage: build-stats-size.py <release_dir> <output_file>")
    release_dir = Path(sys.argv[1])
    output_file = Path(sys.argv[2])
    output_file.write_text(json.dumps(collect_binary_sizes(release_dir), indent=2))


if __name__ == "__main__":
    main()
diff --git a/.github/workflows/build-stats-report.yml b/.github/workflows/build-stats-report.yml
new file mode 100644
index 000000000..48f7975be
--- /dev/null
+++ b/.github/workflows/build-stats-report.yml
@@ -0,0 +1,47 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+# Aggregates release build statistics from the last 30 days of
+# `build-stats.yml` runs and produces an HTML dashboard with trend charts.
+#
+# Runs daily; the aggregated trends are intended for weekly review.
+
+on:
+ schedule:
+ - cron: "0 8 * * *"
+ workflow_dispatch: # allows manual triggering from the Actions UI
+
+name: Build Stats Report
+
+defaults:
+ run:
+ shell: bash
+
+permissions:
+ contents: read
+ actions: read
+
+env:
+ GH_TOKEN: ${{ github.token }}
+
+jobs:
+ report:
+ name: generate report
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Generate report
+ run: |
+ set -euo pipefail
+ mkdir -p collected
+ mkdir -p report
+ bash .github/scripts/build-stats-report.sh "${{ github.repository }}" collected report
+
+ - name: Upload report
+ uses: actions/upload-artifact@v4
+ with:
+ name: build-stats-report
+ path: report/
+ retention-days: 90
+
diff --git a/.github/workflows/build-stats.yml b/.github/workflows/build-stats.yml
new file mode 100644
index 000000000..955e215a8
--- /dev/null
+++ b/.github/workflows/build-stats.yml
@@ -0,0 +1,63 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT license.
+
+# Build statistics workflow.
+#
+# Runs on every push to main to capture release build timings, binary sizes,
+# and cargo-bloat analysis. The data is uploaded as structured JSON artifacts
+# so the companion `build-stats-report.yml` workflow can aggregate trends.
+
+on:
+ push:
+ branches: ["main"]
+ workflow_dispatch: # allows manual triggering from the Actions UI
+
+name: Build Stats
+
+env:
+ CARGO_TERM_COLOR: always
+
+defaults:
+ run:
+ shell: bash
+
+permissions:
+ contents: read
+
+jobs:
+ build-release:
+ name: release build
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Install Rust
+ run: rustup show
+
+ - name: Install tools
+ uses: taiki-e/install-action@v2
+ with:
+ tool: cargo-bloat,cargo-llvm-lines
+
+ - name: Build release with timings
+ run: cargo clean && cargo build --workspace --release --locked --timings
+
+ - name: Collect binary sizes
+ run: python3 .github/scripts/build-stats-size.py target/release build-stats-size.json
+
+ - name: Run cargo bloat
+ run: cargo bloat --release --package diskann-benchmark -n 100 | tee cargo-bloat.txt
+
+ - name: Run cargo llvm-lines
+ run: cargo llvm-lines --release --package diskann-benchmark | head -100 | tee cargo-llvm-lines.txt
+
+ - name: Upload build stats
+ uses: actions/upload-artifact@v4
+ with:
+ name: build-stats
+ path: |
+ target/cargo-timings/cargo-timing.html
+ build-stats-size.json
+ cargo-bloat.txt
+ cargo-llvm-lines.txt
+ retention-days: 90