@@ -48,6 +48,8 @@ static bool is_valid_hash(const char* str);
4848static int ends_with (const char * str , const char * suffix );
4949static int check_invalid_values (const float * embedding , size_t dims );
5050static float cosine_similarity (const float * vec1 , const float * vec2 , size_t dims );
51+ static float euclidean_distance (const float * vec1 , const float * vec2 , size_t dims );
52+ static float euclidean_similarity (const float * vec1 , const float * vec2 , size_t dims );
5153
5254static const char * DIFF_USAGE =
5355 "Usage: eb diff <input1> <input2>\n"
@@ -105,6 +107,49 @@ static float cosine_similarity(const float *vec1, const float *vec2, size_t dims
105107 return similarity ;
106108}
107109
/*
 * Euclidean (L2) distance between two float vectors of length `dims`.
 *
 * Squared component differences are accumulated in double precision and the
 * square root is narrowed to float on return. If any component of either
 * vector is NaN or infinite, the scan stops at that index and INFINITY is
 * returned. With dims == 0 the loop body never runs and the result is 0.
 */
static float euclidean_distance(const float *vec1, const float *vec2, size_t dims)
{
    double sq_total = 0.0;
    size_t idx = 0;

    DEBUG_PRINT("Calculating Euclidean distance for %zu dimensions", dims);

    while (idx < dims) {
        const float a = vec1[idx];
        const float b = vec2[idx];

        /* isfinite() is false exactly for NaN and +/-infinity */
        if (!isfinite(a) || !isfinite(b)) {
            DEBUG_PRINT("Invalid value detected at index %zu: vec1=%f, vec2=%f",
                        idx, a, b);
            return INFINITY;
        }

        /* Widen before subtracting so the difference is formed in double */
        const double delta = (double)a - (double)b;
        sq_total += delta * delta;
        idx++;
    }

    DEBUG_PRINT("Euclidean distance squared: %f", sq_total);

    return (float)sqrt(sq_total);
}
134+
/*
 * Map the Euclidean distance between two vectors onto a similarity score.
 *
 * Computes 1 / (1 + d) with d = euclidean_distance(vec1, vec2, dims), so a
 * distance of 0 yields 1.0 and the score shrinks toward 0 as d grows.
 * Returns 0.0f when the distance is infinite or NaN (euclidean_distance
 * reports INFINITY when it meets a NaN/infinite component).
 */
static float euclidean_similarity(const float *vec1, const float *vec2, size_t dims)
{
    const float dist = euclidean_distance(vec1, vec2, dims);

    /* A non-finite distance carries no usable magnitude: report "no similarity" */
    if (!isfinite(dist)) {
        return 0.0f;
    }

    const float score = 1.0f / (1.0f + dist);

    DEBUG_PRINT("Calculated normalized Euclidean similarity: %f", score);

    return score;
}
152+
108153/* Load embedding from .npy file */
109154static float * load_npy_embedding (const char * filepath , size_t * dims )
110155{
@@ -436,7 +481,7 @@ int cmd_diff(int argc, char *argv[])
436481 const char * hash1 , * hash2 ;
437482 float * emb1 = NULL , * emb2 = NULL ;
438483 size_t dims1 , dims2 ;
439- float similarity ;
484+ float cos_similarity , euc_distance , euc_similarity ;
440485 int ret = 1 ; // Initialize to error state
441486 bool is_test = getenv ("EB_TEST_MODE" ) != NULL ;
442487
@@ -454,9 +499,11 @@ int cmd_diff(int argc, char *argv[])
454499 /* Quick check for identical inputs */
455500 if (strcmp (hash1 , hash2 ) == 0 ) {
456501 if (is_test )
457- printf ("→ Similarity: 100%%" );
502+ printf ("→ Cosine Similarity: 100%%\n→ Euclidean Distance: 0.00\n→ Euclidean Similarity: 100%%" );
458503 else
459- printf (COLOR_BOLD_GREEN "→ Similarity: 100%%" COLOR_RESET "\n" );
504+ printf (COLOR_BOLD_GREEN "→ Cosine Similarity: 100%%" COLOR_RESET "\n"
505+ COLOR_BOLD_GREEN "→ Euclidean Distance: 0.00" COLOR_RESET "\n"
506+ COLOR_BOLD_GREEN "→ Euclidean Similarity: 100%%" COLOR_RESET "\n" );
460507 return 0 ;
461508 }
462509
@@ -481,16 +528,24 @@ int cmd_diff(int argc, char *argv[])
481528 goto cleanup ;
482529 }
483530
484- /* Calculate similarity */
485- similarity = cosine_similarity (emb1 , emb2 , dims1 );
486- DEBUG_PRINT ("Calculated raw similarity: %f" , similarity );
531+ /* Calculate similarities */
532+ cos_similarity = cosine_similarity (emb1 , emb2 , dims1 );
533+ euc_distance = euclidean_distance (emb1 , emb2 , dims1 );
534+ euc_similarity = euclidean_similarity (emb1 , emb2 , dims1 );
535+
536+ DEBUG_PRINT ("Calculated raw cosine similarity: %f" , cos_similarity );
537+ DEBUG_PRINT ("Calculated raw Euclidean distance: %f" , euc_distance );
538+ DEBUG_PRINT ("Calculated raw Euclidean similarity: %f" , euc_similarity );
487539
488540 /* Print result */
489541 if (is_test ) {
490- printf ("→ Similarity: %.0f%%" , similarity * 100 );
542+ printf ("→ Cosine Similarity: %.0f%%\n→ Euclidean Distance: %.2f\n→ Euclidean Similarity: %.0f%%" ,
543+ cos_similarity * 100 , euc_distance , euc_similarity * 100 );
491544 } else {
492- printf (COLOR_BOLD_GREEN "→ Similarity: %.0f%%" COLOR_RESET "\n" ,
493- similarity * 100 );
545+ printf (COLOR_BOLD_GREEN "→ Cosine Similarity: %.0f%%" COLOR_RESET "\n"
546+ COLOR_BOLD_GREEN "→ Euclidean Distance: %.2f" COLOR_RESET "\n"
547+ COLOR_BOLD_GREEN "→ Euclidean Similarity: %.0f%%" COLOR_RESET "\n" ,
548+ cos_similarity * 100 , euc_distance , euc_similarity * 100 );
494549 }
495550 ret = 0 ;
496551
0 commit comments