PyPSA · FBumann · May 27, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -12,3 +12,30 @@ updates:
     github-actions:
       patterns:
       - '*'
+
+# pip ecosystem: watches the ``==``-pinned ``[benchmarks]`` extra in
+# pyproject.toml. Ungrouped bumps each arrive as their own PR, so the
+# CodSpeed CI run on that PR attributes any perf delta to that one bump.
+# The pins keep the cross-version ``sweep`` baseline stable, while
+# upstream perf changes still surface per-PR for eyes-open review.
+# Loose ``[project.dependencies]`` (numpy, scipy, ...) have no version
+# specifier, so only the ``==`` pins in ``[benchmarks]`` produce PRs.
+- package-ecosystem: pip
+  directory: "/"
+  schedule:
+    interval: monthly
+  open-pull-requests-limit: 5
+  groups:
+    # Measurement scaffolding + CLI/notebook tooling. None of it moves
+    # the CodSpeed signal, so it is batched into one PR to cut review
+    # noise. Perf-relevant deps (numpy, xarray, highspy, …) match no
+    # group and therefore each get their own attributed CodSpeed delta.
+    benchmark-tooling:
+      patterns:
+      - pytest
+      - pytest-benchmark
+      - pytest-memray
+      - pytest-codspeed
+      - nbconvert
+      - typer
+      - plotly
diff --git a/.github/workflows/benchmark-smoke.yml b/.github/workflows/benchmark-smoke.yml
@@ -0,0 +1,43 @@
+name: Benchmark smoke
+
+# Builds every spec and fires every phase once under --quick
+# --benchmark-disable: a "did a refactor break a spec?" check, not timing.
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ '**' ]  # '*' stops at '/', so PRs into e.g. release/x were skipped
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  smoke:
+    name: Benchmark smoke (quick)
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read  # least privilege: actions/checkout is all this job needs
+
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        fetch-depth: 0  # setuptools_scm
+
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+
+    - name: Install package and benchmark dependencies
+      run: |
+        python -m pip install uv
+        uv pip install --system -e ".[dev,benchmarks]"
+
+    - name: Run benchmark smoke
+      run: python -m benchmarks smoke
+
+    - name: Execute walkthrough notebook
+      # Catches doc rot: the walkthrough must stay runnable end-to-end.
+      run: python -m benchmarks notebook
diff --git a/.github/workflows/codspeed-macro.yml b/.github/workflows/codspeed-macro.yml
@@ -0,0 +1,62 @@
+name: CodSpeed (walltime macro)
+
+# Wall-clock benchmarks on CodSpeed's dedicated bare-metal macro runners — the
+# mode that reflects the real cost of dense-vs-sparse work (cache, allocation,
+# native numpy/scipy), which instruction counting under-weights.
+#
+# Master push (updates the walltime baseline) + manual dispatch + opt-in per-PR
+# via the ``trigger:benchmark`` label. Off every *unlabelled* PR: macro-runner
+# minutes are metered (600/month free), and self-hosted bare-metal shouldn't run
+# arbitrary PR code — the label is a maintainer-controlled gate, and the job's
+# ``if`` also rejects fork PRs, so only trusted (same-repo) PRs ever run.
+#
+# Requires the repo under a GitHub org (macro runners are org-only) with the
+# CodSpeed app connected to the repo (OIDC auth — no token secret needed).
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    types: [ labeled, synchronize ]
+    branches: [ master ]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  macro:
+    name: CodSpeed walltime (macro runner)
+    # Always on master push / dispatch; on PRs only when labelled AND same-repo.
+    if: >-
+      ${{ github.event_name != 'pull_request' ||
+      (contains(github.event.pull_request.labels.*.name, 'trigger:benchmark') &&
+      github.event.pull_request.head.repo.full_name == github.repository) }}
+    runs-on: codspeed-macro
+    # Non-gating until the CodSpeed app is connected to the repo (OIDC auth).
+    continue-on-error: true
+    permissions:
+      contents: read   # actions/checkout
+      id-token: write  # OIDC auth with CodSpeed — no token secret
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        fetch-depth: 0  # setuptools_scm
+
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+
+    - name: Install pinned benchmark environment
+      # Pinned ``[benchmarks]`` extra so Dependabot bumps → one CodSpeed delta each.
+      run: |
+        python -m pip install uv
+        uv pip install --system -e ".[dev,benchmarks]"
+
+    - name: Run benchmarks under CodSpeed (walltime)
+      uses: CodSpeedHQ/action@v4
+      with:
+        mode: walltime
+        run: pytest benchmarks/ --quick --codspeed
diff --git a/.github/workflows/codspeed-memory.yml b/.github/workflows/codspeed-memory.yml
@@ -0,0 +1,48 @@
+name: CodSpeed (memory)
+
+# Heap-allocation tracking — the always-on signal for this sparsity/memory fork.
+# Cheap enough (~2 min, free GitHub runner) to run on master (baseline) and on
+# every PR into master. It is the only instrument on ubuntu, so its one upload
+# per (commit, env) never clashes with the walltime run's bare-metal environment.
+
+on:
+  workflow_dispatch:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+concurrency:
+  cancel-in-progress: true
+  group: ${{ github.workflow }}-${{ github.ref }}
+
+jobs:
+  memory:
+    name: CodSpeed memory
+    runs-on: ubuntu-latest
+    # Informational only: this job is non-gating and never blocks a merge.
+    continue-on-error: true
+    permissions:
+      id-token: write  # OIDC auth with CodSpeed — no token secret
+      contents: read   # actions/checkout
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        fetch-depth: 0  # setuptools_scm
+
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: '3.12'
+
+    - name: Install pinned benchmark environment
+      run: |
+        python -m pip install uv
+        uv pip install --system -e ".[dev,benchmarks]"
+
+    - name: Run benchmarks under CodSpeed (memory)
+      uses: CodSpeedHQ/action@v4
+      with:
+        mode: memory
+        run: |
+          pytest benchmarks/ --quick --codspeed
diff --git a/.gitignore b/.gitignore
@@ -45,6 +45,10 @@ benchmark/scripts/__pycache__
 benchmark/scripts/benchmarks-pypsa-eur/__pycache__
 benchmark/scripts/leftovers/
 
+# Benchmarks (internal suite): regenerable .ipynb viewing artifacts
+benchmarks/walkthrough.ipynb
+benchmarks/.ipynb_checkpoints/
+
 # IDE
 .idea/
 

diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -1,94 +1,69 @@
 # Internal Performance Benchmarks
 
-Measures linopy's own performance (build time, LP write speed, memory usage) across problem sizes using [pytest-benchmark](https://pytest-benchmark.readthedocs.io/) and [pytest-memray](https://pytest-memray.readthedocs.io/). Use these to check whether a code change introduces a regression or improvement.
+End-to-end performance tracking for `linopy` — build → solver handoff
+→ netCDF (de)serialization → fixed PyPSA model. Solver algorithm
+runtime is out of scope.
 
-> **Note:** The `benchmark/` directory (singular) contains *external* benchmarks comparing linopy against other modeling frameworks. This directory (`benchmarks/`) is for *internal* performance tracking only.
+**The walkthrough is load-bearing.** Phase coverage, CLI introspection,
+the two-snapshot regression workflow with inline Plotly views, and
+how to extend the suite live in [`walkthrough.md`](walkthrough.md).
+This README only covers install and how to open the walkthrough.
 
-## Setup
+> `benchmark/` (singular) is the legacy external-framework suite.
+> `benchmarks/` (plural) is this internal suite.
 
-```bash
-pip install -e ".[benchmarks]"
-```
+## Models vs patterns
 
-## Running benchmarks
+Two kinds of benchmark spec, same harness (time *or* peak memory — a
+`run`/`sweep` `--metric` flag, same phases), distinguished by their sweep axis:
 
-```bash
-# Quick smoke test (small sizes only)
-pytest benchmarks/ --quick
+- **Models** (`models/`, `REGISTRY`) — whole `linopy.Model`s swept over
+  `size` (axis `n`): "how does cost scale with the problem?"
+- **Patterns** (`patterns/`, `PATTERNS`) — fragments of realistic modelling
+  code (a balance constraint, a KVL contraction) swept over `severity`
+  (0–100, axis `severity`): "how does cost respond as one data shape goes
+  from benign to pathological?" Each `PatternSpec.description` documents what
+  its dial means (`"0: …, 100: …"`).
 
-# Full timing benchmarks
-pytest benchmarks/test_build.py benchmarks/test_lp_write.py benchmarks/test_matrices.py
+Both kinds build a complete `linopy.Model`, so both run the **same phases** and
+share the phase drivers (`test_build.py`, `test_matrices.py`, …) and `memory`
+grid — they're just more `(spec, value)` rows, tagged by `axis`. There is no
+separate pattern driver. Running a pattern through `build` *and* `lp_write`
+shows whether a dense-`_term` blow-up propagates to export or collapses.
 
-# Run a specific model
-pytest benchmarks/test_build.py -k basic
-```
+Patterns target the operations where the dense-`_term` representation forces
+materialisation — `groupby().sum()` padding, sparse `@` densification — so a
+`severity` sweep draws the cost cliff, and a cross-version `compare` shows a
+kernel change bending it. Adding either is one file: drop it in `models/` or
+`patterns/`, call `register(...)` / `register_pattern(...)`.
 
-## Comparing timing between branches
+## Install
 
 ```bash
-# Save baseline results on master
-git checkout master
-pytest benchmarks/test_build.py --benchmark-save=master
-
-# Switch to feature branch and compare
-git checkout my-feature
-pytest benchmarks/test_build.py --benchmark-save=my-feature --benchmark-compare=0001_master
-
-# Compare saved results without re-running
-pytest-benchmark compare 0001_master 0002_my-feature --columns=median,iqr
+uv sync --extra dev --extra benchmarks
+source .venv/bin/activate
 ```
 
-Results are stored in `.benchmarks/` (gitignored).
-
-## Memory benchmarks
+`pypsa` is optional — `pypsa_scigrid` and
+`test_pypsa_carbon_management.py` skip gracefully without it. Install
+it when you need them: `uv pip install pypsa`.
 
-`memory.py` runs each test in a separate process with pytest-memray to get accurate per-test peak memory (including C/numpy allocations). Results are saved as JSON and can be compared across branches.
+The `[benchmarks]` extra in `pyproject.toml` pins every direct dep that
+affects measurement (`numpy`, `scipy`, `xarray`, `pandas`, `polars`,
+`dask`, etc.). `sweep` installs these into each per-version venv, so
+"same deps, only linopy varies" comes for free without a separate
+lockfile — bump the pins in pyproject and the next sweep picks them up.
 
-By default, only the build phase (`test_build.py`) is measured. Unlike timing benchmarks where `benchmark()` isolates the measured function, memray tracks all allocations within a test — including model construction in setup. This means LP write and matrix tests would report build + phase memory combined, making the phase-specific contribution impossible to isolate. Since model construction dominates memory usage, measuring build alone gives the most actionable numbers.
+## Open the walkthrough
 
 ```bash
-# Save baseline on master
-git checkout master
-python benchmarks/memory.py save master
-
-# Save feature branch
-git checkout my-feature
-python benchmarks/memory.py save my-feature
-
-# Compare
-python benchmarks/memory.py compare master my-feature
-
-# Quick mode (smaller sizes, faster)
-python benchmarks/memory.py save master --quick
-
-# Measure a specific phase (includes build overhead)
-python benchmarks/memory.py save master --test-path benchmarks/test_lp_write.py
+python -m benchmarks notebook --build       # (re)generate walkthrough.ipynb
+jupyter lab benchmarks/walkthrough.ipynb    # ...or PyCharm / VSCode
 ```
 
-Results are stored in `.benchmarks/memory/` (gitignored). Requires Linux or macOS (memray is not available on Windows).
-
-> **Note:** Small tests (~5 MiB) are near the import-overhead floor and may show noise of ~1 MiB between runs. Focus on larger tests for meaningful memory comparisons. Do not combine `--memray` with timing benchmarks — memray adds ~2x overhead that invalidates timing results.
-
-## Models
-
-| Model | Description | Sizes |
-|-------|-------------|-------|
-| `basic` | Dense N*N model, 2*N^2 vars/cons | 10 — 1600 |
-| `knapsack` | N binary variables, 1 constraint | 100 — 1M |
-| `expression_arithmetic` | Broadcasting, scaling, summation across dims | 10 — 1000 |
-| `sparse_network` | Ring network with mismatched bus/line coords | 10 — 1000 |
-| `pypsa_scigrid` | Real power system (requires `pypsa`) | 10 — 200 snapshots |
-
-## Phases
-
-| Phase | File | What it measures |
-|-------|------|------------------|
-| Build | `test_build.py` | Model construction (add_variables, add_constraints, add_objective) |
-| LP write | `test_lp_write.py` | Writing the model to an LP file |
-| Matrices | `test_matrices.py` | Generating sparse matrices (A, b, c, bounds) from the model |
-
-## Adding a new model
+The `.md` is the source of truth; the `.ipynb` is a disposable,
+gitignored build artifact. Edit the `.md`, re-run `--build`, re-open.
+Same workflow in any editor.
 
-1. Create `benchmarks/models/my_model.py` with a `build_my_model(n)` function and a `SIZES` list
-2. Add parametrized tests in the relevant `test_*.py` files
-3. Add a quick threshold in `conftest.py`
+CI executes the walkthrough end-to-end on every PR
+(`python -m benchmarks notebook`) so the examples can't silently rot.