Skip to content

Commit 4951c00

Browse files
Add Euclidean distance and similarity metrics to diff command
1 parent 2ea005b commit 4951c00

4 files changed

Lines changed: 97 additions & 16 deletions

File tree

setup.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,25 @@
11
from setuptools import setup, find_namespace_packages
22
import os
3+
import re
34

45
# Get the absolute path of the current directory
56
here = os.path.abspath(os.path.dirname(__file__))
67

8+
# Extract version from types.h
9+
def get_version():
    """Return the package version declared in ``src/core/types.h``.

    The C header is the single source of truth for the version: the value
    of its ``EB_VERSION_STR`` macro is extracted with a regular expression.

    Returns:
        The version string from the header, or ``"0.1.0"`` when the header
        cannot be read or does not define ``EB_VERSION_STR``.
    """
    default_version = "0.1.0"
    types_h_path = os.path.join(here, "src", "core", "types.h")

    try:
        # Explicit encoding keeps the result independent of the build
        # machine's locale; undecodable bytes elsewhere in the header must
        # not prevent reading the (ASCII) version macro.
        with open(types_h_path, "r", encoding="utf-8", errors="replace") as f:
            content = f.read()
    except OSError:
        # Missing or unreadable header: use the same fallback as a missing
        # macro instead of aborting the whole build.
        return default_version

    # Anchor at the start of a line so a commented-out define
    # (e.g. `// #define EB_VERSION_STR "..."`) is not picked up.
    match = re.search(
        r'^[ \t]*#[ \t]*define[ \t]+EB_VERSION_STR[ \t]+"([^"]+)"',
        content,
        re.MULTILINE,
    )
    if match:
        return match.group(1)
    return default_version
19+
720
setup(
821
name="embedding_bridge",
9-
version="0.1.0",
22+
version=get_version(),
1023
packages=find_namespace_packages(where="src/python", include=["embedding_bridge*"]),
1124
package_dir={"": "src/python"},
1225
install_requires=[

src/cli/diff.c

Lines changed: 64 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ static bool is_valid_hash(const char* str);
4848
static int ends_with(const char* str, const char* suffix);
4949
static int check_invalid_values(const float* embedding, size_t dims);
5050
static float cosine_similarity(const float* vec1, const float* vec2, size_t dims);
51+
static float euclidean_distance(const float* vec1, const float* vec2, size_t dims);
52+
static float euclidean_similarity(const float* vec1, const float* vec2, size_t dims);
5153

5254
static const char* DIFF_USAGE =
5355
"Usage: eb diff <input1> <input2>\n"
@@ -105,6 +107,49 @@ static float cosine_similarity(const float *vec1, const float *vec2, size_t dims
105107
return similarity;
106108
}
107109

110+
/*
 * Euclidean (L2) distance between vec1 and vec2 over their first `dims`
 * elements.
 *
 * Squared differences are accumulated in double precision and the square
 * root is narrowed to float on return. If any component of either vector
 * is NaN or infinite, the scan stops and INFINITY is returned.
 */
static float euclidean_distance(const float *vec1, const float *vec2, size_t dims)
{
    double squared_total = 0.0;
    size_t idx = 0;

    DEBUG_PRINT("Calculating Euclidean distance for %zu dimensions", dims);

    while (idx < dims) {
        const float a = vec1[idx];
        const float b = vec2[idx];

        /* Reject NaN and +/-Inf components before they poison the sum */
        if (!isfinite(a) || !isfinite(b)) {
            DEBUG_PRINT("Invalid value detected at index %zu: vec1=%f, vec2=%f",
                        idx, a, b);
            return INFINITY;
        }

        const double delta = (double)a - (double)b;
        squared_total += delta * delta;
        idx++;
    }

    DEBUG_PRINT("Euclidean distance squared: %f", squared_total);

    return (float)sqrt(squared_total);
}
134+
135+
/*
 * Euclidean similarity on a 0-to-1 scale, where 1 means the vectors are
 * identical.
 *
 * The distance from euclidean_distance() is mapped through
 * 1 / (1 + distance). A non-finite distance (INFINITY or NaN) yields 0.
 */
static float euclidean_similarity(const float *vec1, const float *vec2, size_t dims)
{
    const float dist = euclidean_distance(vec1, vec2, dims);
    float score;

    /* A non-finite distance cannot be normalized */
    if (!isfinite(dist)) {
        return 0.0f;
    }

    score = 1.0f / (1.0f + dist);

    DEBUG_PRINT("Calculated normalized Euclidean similarity: %f", score);

    return score;
}
152+
108153
/* Load embedding from .npy file */
109154
static float* load_npy_embedding(const char *filepath, size_t *dims)
110155
{
@@ -436,7 +481,7 @@ int cmd_diff(int argc, char *argv[])
436481
const char *hash1, *hash2;
437482
float *emb1 = NULL, *emb2 = NULL;
438483
size_t dims1, dims2;
439-
float similarity;
484+
float cos_similarity, euc_distance, euc_similarity;
440485
int ret = 1; // Initialize to error state
441486
bool is_test = getenv("EB_TEST_MODE") != NULL;
442487

@@ -454,9 +499,11 @@ int cmd_diff(int argc, char *argv[])
454499
/* Quick check for identical inputs */
455500
if (strcmp(hash1, hash2) == 0) {
456501
if (is_test)
457-
printf("→ Similarity: 100%%");
502+
printf("→ Cosine Similarity: 100%%\n→ Euclidean Distance: 0.00\n→ Euclidean Similarity: 100%%");
458503
else
459-
printf(COLOR_BOLD_GREEN "→ Similarity: 100%%" COLOR_RESET "\n");
504+
printf(COLOR_BOLD_GREEN "→ Cosine Similarity: 100%%" COLOR_RESET "\n"
505+
COLOR_BOLD_GREEN "→ Euclidean Distance: 0.00" COLOR_RESET "\n"
506+
COLOR_BOLD_GREEN "→ Euclidean Similarity: 100%%" COLOR_RESET "\n");
460507
return 0;
461508
}
462509

@@ -481,16 +528,24 @@ int cmd_diff(int argc, char *argv[])
481528
goto cleanup;
482529
}
483530

484-
/* Calculate similarity */
485-
similarity = cosine_similarity(emb1, emb2, dims1);
486-
DEBUG_PRINT("Calculated raw similarity: %f", similarity);
531+
/* Calculate similarities */
532+
cos_similarity = cosine_similarity(emb1, emb2, dims1);
533+
euc_distance = euclidean_distance(emb1, emb2, dims1);
534+
euc_similarity = euclidean_similarity(emb1, emb2, dims1);
535+
536+
DEBUG_PRINT("Calculated raw cosine similarity: %f", cos_similarity);
537+
DEBUG_PRINT("Calculated raw Euclidean distance: %f", euc_distance);
538+
DEBUG_PRINT("Calculated raw Euclidean similarity: %f", euc_similarity);
487539

488540
/* Print result */
489541
if (is_test) {
490-
printf("→ Similarity: %.0f%%", similarity * 100);
542+
printf("→ Cosine Similarity: %.0f%%\n→ Euclidean Distance: %.2f\n→ Euclidean Similarity: %.0f%%",
543+
cos_similarity * 100, euc_distance, euc_similarity * 100);
491544
} else {
492-
printf(COLOR_BOLD_GREEN "→ Similarity: %.0f%%" COLOR_RESET "\n",
493-
similarity * 100);
545+
printf(COLOR_BOLD_GREEN "→ Cosine Similarity: %.0f%%" COLOR_RESET "\n"
546+
COLOR_BOLD_GREEN "→ Euclidean Distance: %.2f" COLOR_RESET "\n"
547+
COLOR_BOLD_GREEN "→ Euclidean Similarity: %.0f%%" COLOR_RESET "\n",
548+
cos_similarity * 100, euc_distance, euc_similarity * 100);
494549
}
495550
ret = 0;
496551

src/cli/main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ int main(int argc, char** argv) {
7676

7777
// Global help shows version
7878
if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) {
79-
printf("eb version %d\n", EB_VERSION);
79+
printf("eb version %s\n", EB_VERSION_STR);
8080
return 0;
8181
}
8282

src/core/types.h

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,26 @@
99
// Magic numbers for binary format
1010
#define EB_MAGIC_VECTOR 0x53564245 // "EBVS"
1111
#define EB_MAGIC_META 0x4D564245 // "EBVM"
12-
#define EB_VERSION 0x00000001
12+
// START OF SET THE VERSION HERE
13+
// Version components
14+
#define EB_VERSION_MAJOR 0
15+
#define EB_VERSION_MINOR 1
16+
#define EB_VERSION_PATCH 0
17+
18+
// Full version as string
19+
#define EB_VERSION_STR "0.1.0"
20+
// END OF SET THE VERSION HERE
21+
22+
23+
// Full version as integer for compatibility checks
24+
#define EB_VERSION ((EB_VERSION_MAJOR << 16) | (EB_VERSION_MINOR << 8) | EB_VERSION_PATCH)
1325

1426
// Version compatibility macros
15-
#define EB_VERSION_MAJOR(v) ((v) >> 16)
16-
#define EB_VERSION_MINOR(v) ((v) & 0xFFFF)
17-
#define EB_MAKE_VERSION(major, minor) (((major) << 16) | (minor))
18-
#define EB_VERSION_COMPATIBLE(v1, v2) (EB_VERSION_MAJOR(v1) == EB_VERSION_MAJOR(v2))
27+
#define EB_GET_VERSION_MAJOR(v) ((v) >> 16)
28+
#define EB_GET_VERSION_MINOR(v) (((v) >> 8) & 0xFF)
29+
#define EB_GET_VERSION_PATCH(v) ((v) & 0xFF)
30+
#define EB_MAKE_VERSION(major, minor, patch) (((major) << 16) | ((minor) << 8) | (patch))
31+
#define EB_VERSION_COMPATIBLE(v1, v2) (EB_GET_VERSION_MAJOR(v1) == EB_GET_VERSION_MAJOR(v2))
1932

2033
// Core data types
2134
typedef enum {

0 commit comments

Comments
 (0)