Skip to content

Commit 2f537ab

Browse files
rich7420 and ryankert01 authored
feat: add SVHN Quantum Kernel SVM benchmark (#1175)
* feat: add SVHN Quantum Kernel SVM benchmark * move to right side * chore: address reviewer comments --------- Co-authored-by: Ryan Huang <hcr@apache.org>
1 parent 34cd796 commit 2f537ab

2 files changed

Lines changed: 529 additions & 0 deletions

File tree

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
1+
#!/usr/bin/env python3
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one or more
4+
# contributor license agreements. See the NOTICE file distributed with
5+
# this work for additional information regarding copyright ownership.
6+
# The ASF licenses this file to You under the Apache License, Version 2.0
7+
# (the "License"); you may not use this file except in compliance with
8+
# the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
"""
19+
Quantum Kernel SVM — CPU baseline (CPU encoding) — SVHN dataset.
20+
21+
Pipeline:
22+
SVHN (32×32×3) → Flatten (3072) → StandardScaler → L2-norm + zero-pad (4096, 12 qubits)
23+
→ Quantum Kernel K[i,j] = (encoded[i] · encoded[j])² → sklearn SVM
24+
25+
Encoding: CPU NumPy (L2-normalise + zero-pad to 2^12 = 4096).
26+
Kernel: Precomputed squared inner product of amplitude-encoded state vectors.
27+
Classifier: sklearn.svm.SVC(kernel='precomputed').
28+
29+
Each pipeline step is timed separately to show the encoding fraction.
30+
"""
31+
32+
from __future__ import annotations
33+
34+
import argparse
35+
import os
36+
import time
37+
import urllib.request
38+
39+
import numpy as np
40+
41+
try:
42+
from sklearn.preprocessing import StandardScaler
43+
from sklearn.svm import SVC
44+
except ImportError as e:
45+
raise SystemExit(
46+
"scikit-learn is required. Install with: uv sync --group benchmark"
47+
) from e
48+
49+
try:
50+
from scipy.io import loadmat
51+
except ImportError as e:
52+
raise SystemExit("scipy is required. Install with: pip install scipy") from e
53+
54+
55+
# ---------------------------------------------------------------------------
56+
# SVHN data loading
57+
# ---------------------------------------------------------------------------
58+
59+
# Download locations of the two SVHN splits used by load_svhn(), keyed by
# split name ("train" / "test").
SVHN_URLS = dict(
    train="http://ufldl.stanford.edu/housenumbers/train_32x32.mat",
    test="http://ufldl.stanford.edu/housenumbers/test_32x32.mat",
)
63+
64+
65+
def _download_if_needed(url: str, dest: str) -> str:
66+
if not os.path.exists(dest):
67+
os.makedirs(os.path.dirname(dest), exist_ok=True)
68+
print(f" Downloading {url} ...")
69+
urllib.request.urlretrieve(url, dest)
70+
print(f" Saved to {dest}")
71+
return dest
72+
73+
74+
def load_svhn(
    data_home: str | None = None,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Fetch (if not cached) and parse the SVHN train and test ``.mat`` files.

    Args:
        data_home: Directory holding ``train_32x32.mat`` / ``test_32x32.mat``.
            Defaults to ``~/scikit_learn_data/svhn``.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)``. Each ``X`` is a float64 array
        of shape ``(n, 3072)`` holding the stored pixel values divided by 255;
        each ``Y`` is a flat integer array of the stored labels taken modulo
        10.
    """
    cache_dir = data_home
    if cache_dir is None:
        cache_dir = os.path.join(os.path.expanduser("~"), "scikit_learn_data", "svhn")

    # Make sure both files are present (train first, then test) before any
    # parsing starts.
    mat_paths = {
        split: _download_if_needed(
            SVHN_URLS[split], os.path.join(cache_dir, f"{split}_32x32.mat")
        )
        for split in ("train", "test")
    }
    mats = {split: loadmat(path) for split, path in mat_paths.items()}

    def _pixels(mat) -> np.ndarray:
        # Move the last axis of "X" to the front so each sample becomes one
        # row of 3072 values, then divide by 255.
        # NOTE(review): presumably "X" is uint8 (32, 32, 3, n) — confirm.
        per_sample = mat["X"].transpose(3, 0, 1, 2)
        return per_sample.reshape(-1, 3072).astype(np.float64) / 255.0

    def _labels(mat) -> np.ndarray:
        # Modulo 10 folds a stored label of 10 onto 0.
        # NOTE(review): presumably SVHN stores digit 0 as 10 — confirm.
        return mat["y"].ravel().astype(int) % 10

    X_train = _pixels(mats["train"])
    X_test = _pixels(mats["test"])
    Y_train = _labels(mats["train"])
    Y_test = _labels(mats["test"])
    return X_train, X_test, Y_train, Y_test
102+
103+
104+
# ---------------------------------------------------------------------------
105+
# Encoding & kernel
106+
# ---------------------------------------------------------------------------
107+
108+
# Width of the encoded state vector: 2**NUM_QUBITS entries per sample.
NUM_QUBITS = 12
STATE_DIM = 1 << NUM_QUBITS  # 4096

# The two digits of the binary task. _filter_binary() relabels CLASS_POS as +1
# and CLASS_NEG as -1.
CLASS_POS, CLASS_NEG = 1, 7
112+
113+
114+
def _filter_binary(X, Y):
    """Keep only the CLASS_POS / CLASS_NEG samples and relabel them.

    Returns the selected rows of ``X`` together with a label array that is
    ``1`` where the original label equals ``CLASS_POS`` and ``-1`` otherwise
    (i.e. for ``CLASS_NEG``).
    """
    keep = np.logical_or(Y == CLASS_POS, Y == CLASS_NEG)
    kept_labels = Y[keep]
    signed_labels = np.where(kept_labels == CLASS_POS, 1, -1)
    return X[keep], signed_labels
117+
118+
119+
def encode_cpu(X: np.ndarray, state_dim: int | None = None) -> np.ndarray:
    """L2-normalise each row and zero-pad it to ``state_dim`` entries.

    Args:
        X: Array of shape ``(n, d)`` with ``d <= state_dim``.
        state_dim: Width of the encoded vectors. Defaults to the module-level
            ``STATE_DIM``.

    Returns:
        A new float64 array of shape ``(n, state_dim)``. Every row with a
        non-zero norm has unit L2 norm; all-zero rows stay all-zero.

    Raises:
        ValueError: If ``X`` is not 2-D, or has more than ``state_dim``
            columns (it could not be represented without truncation).
    """
    if state_dim is None:
        state_dim = STATE_DIM

    X = np.asarray(X, dtype=np.float64)
    if X.ndim != 2:
        raise ValueError(f"expected a 2-D array of shape (n, d), got ndim={X.ndim}")

    n_samples, n_features = X.shape
    if n_features > state_dim:
        raise ValueError(
            f"feature dimension {n_features} exceeds state dimension {state_dim}"
        )

    norms = np.linalg.norm(X, axis=1, keepdims=True)
    # Avoid 0/0 for all-zero rows: dividing by 1 leaves them as zeros.
    norms[norms == 0] = 1.0

    # Write the normalised features into a zero-initialised buffer so the
    # padding columns need no separate concatenate.
    encoded = np.zeros((n_samples, state_dim), dtype=np.float64)
    encoded[:, :n_features] = X / norms
    return encoded
130+
131+
132+
def compute_kernel(X1: np.ndarray, X2: np.ndarray) -> np.ndarray:
    """Return the squared-overlap kernel between the rows of ``X1`` and ``X2``.

    ``K[i, j]`` is the square of the inner product of ``X1[i]`` and ``X2[j]``.
    For real amplitude-encoded states this is |⟨ψ(x_j)|ψ(x_i)⟩|².
    """
    overlaps = X1 @ X2.T
    return np.square(overlaps)
135+
136+
137+
# ---------------------------------------------------------------------------
138+
# Main
139+
# ---------------------------------------------------------------------------
140+
141+
142+
def _parse_args() -> argparse.Namespace:
    """Parse the command line and reject values the pipeline cannot run with."""
    parser = argparse.ArgumentParser(
        description="Quantum Kernel SVM — CPU baseline (CPU) — SVHN (12 qubits)"
    )
    parser.add_argument(
        "--n-samples",
        type=int,
        default=5000,
        help="Total samples for CV (default: 5000)",
    )
    parser.add_argument("--folds", type=int, default=5, help="CV folds (default: 5)")
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed (default: 42)"
    )
    parser.add_argument(
        "--svm-c",
        type=float,
        default=100.0,
        help="SVM regularisation C (default: 100.0)",
    )
    parser.add_argument("--data-home", type=str, default=None, help="Data cache dir")
    args = parser.parse_args()

    # Fail early with a usage message rather than a traceback from deep inside
    # NumPy / scikit-learn.
    if args.n_samples < 1:
        parser.error("--n-samples must be a positive integer")
    if args.folds < 2:
        parser.error("--folds must be at least 2")
    if args.n_samples < args.folds:
        parser.error("--n-samples must be at least --folds")
    if args.svm_c <= 0:
        parser.error("--svm-c must be positive")
    return args


def main() -> None:
    """Run the SVHN quantum-kernel SVM benchmark and print per-step timings.

    Steps:
      1. Load SVHN, keep the CLASS_POS / CLASS_NEG digits, optionally
         subsample to ``--n-samples``.
      2. Standard-scale and encode all selected samples once (timed).
      3. Compute the full precomputed kernel matrix once (timed).
      4. Stratified k-fold cross-validation of an ``SVC(kernel="precomputed")``
         on slices of that matrix (fit and scoring timed separately).

    Finally prints the timing breakdown, the share of time spent in
    scale+encode, and mean ± std accuracy over the folds.
    """
    from sklearn.model_selection import StratifiedKFold

    args = _parse_args()

    print("Quantum Kernel SVM — CPU baseline — SVHN")
    print(
        f" {NUM_QUBITS} qubits, {STATE_DIM}-dim state, binary: digit {CLASS_POS} vs {CLASS_NEG}"
    )
    print(f" n_samples={args.n_samples}, {args.folds}-fold CV, C={args.svm_c}")
    print()

    # Load & filter
    print(" Loading SVHN ...")
    X_train_all, X_test_all, Y_train_all, Y_test_all = load_svhn(
        data_home=args.data_home
    )
    # Filter each split *before* concatenating: the result is identical to
    # filtering the concatenation (row order is preserved), but it avoids
    # materialising a second copy of the full float64 image matrix.
    X_train_bin, Y_train_bin = _filter_binary(X_train_all, Y_train_all)
    X_test_bin, Y_test_bin = _filter_binary(X_test_all, Y_test_all)
    del X_train_all, X_test_all, Y_train_all, Y_test_all
    X_bin = np.concatenate([X_train_bin, X_test_bin], axis=0)
    Y_bin = np.concatenate([Y_train_bin, Y_test_bin], axis=0)
    del X_train_bin, X_test_bin, Y_train_bin, Y_test_bin
    print(f" Binary filtered: {len(Y_bin):,} samples (pos={np.mean(Y_bin == 1):.2f})")

    rng = np.random.default_rng(args.seed)
    if args.n_samples < len(Y_bin):
        idx = rng.choice(len(Y_bin), size=args.n_samples, replace=False)
        X_bin, Y_bin = X_bin[idx], Y_bin[idx]
        print(f" Subsampled: {len(Y_bin):,} samples")
    print()

    # Step 1: StandardScaler + Encode (all data, once)
    # NOTE: the scaler is fitted on every selected sample, including those that
    # later serve as CV test folds.
    t0 = time.perf_counter()
    scaler = StandardScaler().fit(X_bin)
    X_scaled = scaler.transform(X_bin)
    X_encoded = encode_cpu(X_scaled)
    encode_sec = time.perf_counter() - t0
    print(
        f" Step 1: Scale+Encode ........ {encode_sec:.4f}s (n={len(Y_bin)}, dim={STATE_DIM})"
    )

    # Step 2: Full kernel matrix
    t0 = time.perf_counter()
    K_full = compute_kernel(X_encoded, X_encoded)
    kernel_sec = time.perf_counter() - t0
    print(
        f" Step 2: Kernel ........ {kernel_sec:.4f}s ({K_full.shape[0]}×{K_full.shape[1]})"
    )

    # Step 3: k-fold cross-validation
    skf = StratifiedKFold(n_splits=args.folds, shuffle=True, random_state=args.seed)

    fold_accs = []
    cv_fit_sec = 0.0
    cv_pred_sec = 0.0

    print(f"\n Step 3: {args.folds}-fold Cross-Validation")
    for fold, (train_idx, test_idx) in enumerate(skf.split(X_encoded, Y_bin), 1):
        # Precomputed-kernel SVC expects (train × train) for fitting and
        # (test × train) for scoring; slicing is kept outside the timers.
        K_train = K_full[np.ix_(train_idx, train_idx)]
        K_test = K_full[np.ix_(test_idx, train_idx)]

        t0 = time.perf_counter()
        svm = SVC(kernel="precomputed", C=args.svm_c)
        svm.fit(K_train, Y_bin[train_idx])
        cv_fit_sec += time.perf_counter() - t0

        t0 = time.perf_counter()
        acc = svm.score(K_test, Y_bin[test_idx])
        cv_pred_sec += time.perf_counter() - t0

        fold_accs.append(acc)
        n_sv = svm.n_support_.sum()
        print(
            f" Fold {fold}/{args.folds}: acc={acc:.4f} "
            f"(train={len(train_idx)}, test={len(test_idx)}, SVs={n_sv})"
        )

    mean_acc = np.mean(fold_accs)
    std_acc = np.std(fold_accs)

    total_sec = encode_sec + kernel_sec + cv_fit_sec + cv_pred_sec
    # Guard the ratio against a zero total (e.g. a coarse clock on tiny inputs).
    encode_pct = encode_sec / total_sec * 100 if total_sec > 0 else 0.0

    print(f"\n {'─' * 50}")
    print(f" Encode time: ........ {encode_sec:.4f}s")
    print(f" Kernel time: ........ {kernel_sec:.4f}s")
    print(f" CV fit time: ........ {cv_fit_sec:.4f}s ({args.folds} folds)")
    print(f" CV predict time: ........ {cv_pred_sec:.4f}s")
    print(f" Total: ........ {total_sec:.4f}s")
    print(f" Encoding fraction: ........ {encode_pct:.1f}%")
    print(f" Accuracy: ........ {mean_acc:.4f} ± {std_acc:.4f}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)