fix: Address review feedback for embed_stream

fede-kamel · fede-kamel · commit 5c8a4ec41f0e · 2026-02-25T10:40:28.000-05:00
1. V2 embed_stream mishandles duplicate texts (High):
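   - Added used_batch_indices tracking, matching the approach in base_client
   - Each embedding is now assigned the index of the next unused occurrence
     of its text in the batch, so duplicate texts receive distinct indices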

2. Unused variable total_embeddings_yielded (Low):
   - Removed from both base_client.py and v2/client.py
diff --git a/src/cohere/base_client.py b/src/cohere/base_client.py
@@ -1207,7 +1207,6 @@ def embed_stream(
 
         # Process texts in batches
         texts_list = list(texts)
-        total_embeddings_yielded = 0
 
         for batch_start in range(0, len(texts_list), batch_size):
             batch_end = min(batch_start + batch_size, len(texts_list))
diff --git a/src/cohere/v2/client.py b/src/cohere/v2/client.py
@@ -583,7 +583,6 @@ def embed_stream(
 
         # Process texts in batches
         texts_list = list(texts)
-        total_embeddings_yielded = 0
 
         for batch_start in range(0, len(texts_list), batch_size):
             batch_end = min(batch_start + batch_size, len(texts_list))
@@ -600,15 +599,26 @@ def embed_stream(
                 truncate=truncate,
                 request_options=request_options,
             )
-            
+
             # Parse embeddings from response incrementally
             parser = StreamingEmbedParser(response._response, batch_texts)
+            # Track used indices to handle duplicate texts correctly
+            used_batch_indices: set[int] = set()
+
             for embedding in parser.iter_embeddings():
                 # The parser sets embedding.text correctly for multiple embedding types
                 # Adjust the global index based on text position in batch
                 if embedding.text and embedding.text in batch_texts:
-                    text_idx_in_batch = batch_texts.index(embedding.text)
-                    embedding.index = batch_start + text_idx_in_batch
+                    # Find the next unused occurrence of this text in the batch
+                    # This handles duplicate texts correctly
+                    text_idx_in_batch = None
+                    for idx, text in enumerate(batch_texts):
+                        if text == embedding.text and idx not in used_batch_indices:
+                            text_idx_in_batch = idx
+                            used_batch_indices.add(idx)
+                            break
+                    if text_idx_in_batch is not None:
+                        embedding.index = batch_start + text_idx_in_batch
                 yield embedding
 
     def rerank(