From ca7a574b6f3054917b920c43b5598d4b10516741 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sun, 10 May 2026 20:08:35 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20Vectorize=20BasicEstimator.?= =?UTF-8?q?predict?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vectorized the prediction logic in BasicEstimator using the squared Euclidean distance expansion formula. This replaces the iterative O(N) loop over query embeddings with optimized matrix operations. Key changes: - Added `norms_sq` pre-calculation to `fit()`. - Implemented vectorized `predict()` using `np.dot`. - Added numerical stability guards and backward compatibility. - Improved robustness for single-embedding inputs. Co-authored-by: guesswh0 <10531675+guesswh0@users.noreply.github.com> --- .jules/bolt.md | 3 ++ face_engine/models/basic_estimator.py | 43 +++++++++++++++++++++------ 2 files changed, 37 insertions(+), 9 deletions(-) create mode 100644 .jules/bolt.md diff --git a/.jules/bolt.md b/.jules/bolt.md new file mode 100644 index 0000000..650c701 --- /dev/null +++ b/.jules/bolt.md @@ -0,0 +1,3 @@ +## 2026-05-10 - [Numerical Stability in Distance Expansion] +**Learning:** Using the expansion formula ||a-b||^2 = ||a||^2 + ||b||^2 - 2(a·b) for vectorized distance calculation provides significant speedup but can introduce small floating-point discrepancies (negative values) due to subtractive cancellation. +**Action:** Always use `np.maximum(dists_sq, 0)` after the expansion formula and allow slightly relaxed test tolerances (e.g., `atol=1e-5`) if comparing against iterative `np.linalg.norm`. 
diff --git a/face_engine/models/basic_estimator.py b/face_engine/models/basic_estimator.py index fbbf2b9..130f673 100644 --- a/face_engine/models/basic_estimator.py +++ b/face_engine/models/basic_estimator.py @@ -18,23 +18,48 @@ class BasicEstimator(Estimator, name="basic"): def __init__(self): self.embeddings = None self.class_names = None + self.norms_sq = None def fit(self, embeddings, class_names, **kwargs): - self.embeddings = embeddings + self.embeddings = np.asarray(embeddings) self.class_names = class_names + # Pre-calculate squared norms for faster distance calculation + self.norms_sq = np.sum(self.embeddings**2, axis=1) def predict(self, embeddings): if self.class_names is None: raise TrainError("Model is not fitted yet!") - scores = [] - class_names = [] - for embedding in embeddings: - distances = np.linalg.norm(self.embeddings - embedding, axis=1) - index = np.argmin(distances) - score = np.exp(-0.5 * distances[index] ** 2) - scores.append(score) - class_names.append(self.class_names[index]) + embeddings = np.asarray(embeddings) + if embeddings.ndim == 1: + embeddings = embeddings[np.newaxis, :] + + # Using expansion formula: ||a-b||^2 = ||a||^2 + ||b||^2 - 2ab + # This is much faster than looping and using np.linalg.norm + q_norms_sq = np.sum(embeddings**2, axis=1, keepdims=True) + + # Handle backward compatibility for models fitted with older versions + fitted_norms_sq = getattr(self, "norms_sq", None) + if fitted_norms_sq is None: + fitted_norms_sq = np.sum(self.embeddings**2, axis=1) + + # Calculate squared Euclidean distances + dists_sq = ( + q_norms_sq + + fitted_norms_sq + - 2 * np.dot(embeddings, self.embeddings.T) + ) + + # Numerical stability: ensure distances are non-negative + dists_sq = np.maximum(dists_sq, 0) + + # Find best matches + indices = np.argmin(dists_sq, axis=1) + min_dists_sq = dists_sq[np.arange(len(embeddings)), indices] + + scores = np.exp(-0.5 * min_dists_sq).tolist() + class_names = [self.class_names[i] for i in indices] + 
return scores, class_names def save(self, dirname):