Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## 2025-05-15 - [Numerical Precision in Vectorized Distance Calculation]
**Learning:** Using the expansion formula ||a - b||^2 = ||a||^2 + ||b||^2 - 2<a, b> (where <a, b> is the dot product) for vectorized distance calculation provides a significant speedup (up to 10x) by leveraging BLAS via NumPy. However, it can produce small negative values when a and b are nearly identical, because the large terms cancel and floating-point rounding error dominates the result (subtractive cancellation).
**Action:** Always wrap the resulting distance matrix with `np.maximum(dists_sq, 0)` and use slightly relaxed tolerances (e.g., `atol=1e-5`) in unit tests when comparing against standard Euclidean distance.
39 changes: 30 additions & 9 deletions face_engine/models/basic_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,41 @@ class BasicEstimator(Estimator, name="basic"):
def __init__(self):
    """Create an unfitted estimator.

    All fitted state starts as ``None``; ``predict`` checks
    ``class_names`` against ``None`` to detect an unfitted model.
    """
    # Reference embeddings, their labels, and the cached squared norms
    # of the reference embeddings are all unset until ``fit`` is called.
    self.embeddings = self.class_names = self.fitted_norms_sq = None

def fit(self, embeddings, class_names, **kwargs):
    """Store the reference embeddings and labels, and cache squared norms.

    Args:
        embeddings: Reference vectors, one row per sample. Any 2-D
            array-like is accepted (a NumPy array or a list of lists).
        class_names: Labels, indexed in step with the rows of
            ``embeddings``.
        **kwargs: Accepted but not used by this method.
    """
    self.embeddings = embeddings
    self.class_names = class_names
    # Pre-calculate squared norms for faster distance computation.
    # Coerce only for this computation: ``list ** 2`` raises TypeError,
    # so a plain list of lists would otherwise make fit() fail, while
    # the stored ``self.embeddings`` keeps whatever type the caller gave.
    reference = np.asarray(embeddings)
    self.fitted_norms_sq = np.sum(reference**2, axis=1)

def predict(self, embeddings):
    """Label each query embedding with its nearest fitted embedding.

    Args:
        embeddings: Query vectors, one row per query (2-D array-like).

    Returns:
        A ``(scores, class_names)`` tuple of two lists with one entry
        per query. Each score is ``exp(-0.5 * d**2)``, where ``d`` is the
        Euclidean distance to the nearest fitted embedding. Both lists
        are empty when ``embeddings`` is empty.

    Raises:
        TrainError: If the estimator has not been fitted.
    """
    if self.class_names is None:
        raise TrainError("Model is not fitted yet!")

    queries = np.asarray(embeddings)
    if queries.size == 0:
        return [], []

    # The stored references may be a plain list (fit keeps the caller's
    # object), so coerce before using ``.T`` and ``**``.
    reference = np.asarray(self.embeddings)

    # The cached norms can be missing when the attributes were set
    # without going through fit() (e.g. state restored from an older
    # pickle); rebuild and cache them instead of failing.
    reference_norms_sq = getattr(self, "fitted_norms_sq", None)
    if reference_norms_sq is None:
        reference_norms_sq = np.sum(reference**2, axis=1)
        self.fitted_norms_sq = reference_norms_sq

    # Vectorized distance calculation using the expansion formula:
    #   ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * <a, b>
    # One matrix product replaces the per-query Python loop.
    query_norms_sq = np.sum(queries**2, axis=1, keepdims=True)
    dot_products = np.dot(queries, reference.T)
    dists_sq = query_norms_sq + reference_norms_sq - 2 * dot_products
    # Rounding error in the expansion can yield tiny negative values.
    dists_sq = np.maximum(dists_sq, 0)

    # Closest fitted embedding per query; the score uses the squared
    # distance directly, so no square root is needed.
    indices = np.argmin(dists_sq, axis=1)
    min_dists_sq = dists_sq[np.arange(len(queries)), indices]
    scores = np.exp(-0.5 * min_dists_sq)

    class_names = [self.class_names[i] for i in indices]
    return scores.tolist(), class_names

def save(self, dirname):
name = "%s.estimator.%s" % (self.name, "p")
def load(self, dirname):
    """Restore estimator state from a pickle file in ``dirname``.

    The file name is ``"<self.name>.estimator.p"``. The unpickled mapping
    is merged into the instance ``__dict__``.

    Args:
        dirname: Directory containing the pickled estimator state.
    """
    name = "%s.estimator.%s" % (self.name, "p")
    # NOTE(security): pickle.load can execute arbitrary code from the
    # file; only load estimator files from trusted sources.
    with open(os.path.join(dirname, name), "rb") as file:
        self.__dict__.update(pickle.load(file))

    # Reconstruct fitted_norms_sq if it's missing (for backward
    # compatibility with files that do not contain it). The stored
    # embeddings may be a plain list, so coerce before squaring.
    stored = getattr(self, "embeddings", None)
    if stored is not None and getattr(self, "fitted_norms_sq", None) is None:
        self.fitted_norms_sq = np.sum(np.asarray(stored) ** 2, axis=1)