Skip to content

Commit e309699

Browse files
authored
Merge pull request #19 from brianmeyer/codex/rec-192-193-tag-followups
Harden generated media tags and batch tag merge
2 parents 0ebe474 + bf4ec6f commit e309699

4 files changed

Lines changed: 104 additions & 2 deletions

File tree

src/recallforge/search.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1530,6 +1530,18 @@ def search_batch(
15301530
if not batch_queries:
15311531
return []
15321532

1533+
def _merge_tags(items: List[Any]) -> Optional[List[str]]:
    """Merge the ``tags`` of several results into one de-duplicated list.

    Each tag is stringified, stripped of surrounding whitespace and
    lower-cased; blank tags are dropped and duplicates are kept only at
    their first occurrence, so the output order is first-seen order across
    ``items``.

    Args:
        items: Objects that may expose a ``tags`` attribute. A missing,
            ``None`` or empty ``tags`` contributes nothing.

    Returns:
        The merged, normalized tags, or ``None`` when no non-blank tag was
        found.
    """
    merged: List[str] = []
    seen: set[str] = set()
    for item in items:
        raw_tags = getattr(item, "tags", None) or []
        if isinstance(raw_tags, str):
            # A bare string is one tag; iterating it would otherwise emit
            # one "tag" per character.
            raw_tags = [raw_tags]
        for tag in raw_tags:
            cleaned = str(tag or "").strip().lower()
            if not cleaned or cleaned in seen:
                continue
            seen.add(cleaned)
            merged.append(cleaned)
    # Callers receive None rather than an empty list when nothing was merged.
    return merged or None
1544+
15331545
def run_single_query(q: BatchQuery) -> List[tuple]:
15341546
"""Run a single query and return (result, score) tuples."""
15351547
mode = q.mode or "hybrid"
@@ -1582,11 +1594,14 @@ def run_single_query(q: BatchQuery) -> List[tuple]:
15821594
if filepath not in merged:
15831595
merged[filepath] = {
15841596
'result': result,
1597+
'results': [result],
15851598
'rrf_score': 0.0,
15861599
'query_indices': set(),
15871600
'query_scores': {},
15881601
'best_score': 0.0,
15891602
}
1603+
else:
1604+
merged[filepath]['results'].append(result)
15901605

15911606
# RRF contribution: rank-based, not insertion-order-based
15921607
merged[filepath]['rrf_score'] += weight / (rrf_k + rank + 1)
@@ -1612,7 +1627,7 @@ def run_single_query(q: BatchQuery) -> List[tuple]:
16121627
score=data['rrf_score'],
16131628
source=','.join(str(i) for i in sorted(data['query_indices'])),
16141629
query_scores=data['query_scores'],
1615-
tags=getattr(result, "tags", None),
1630+
tags=_merge_tags(data['results']),
16161631
))
16171632

16181633
final_results.sort(key=lambda x: x.score, reverse=True)

src/recallforge/storage/indexing_ops.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,10 @@ def _parse_generated_media_tags(self, raw: str) -> List[str]:
132132
if not text:
133133
return []
134134

135+
fenced_match = re.match(r"^```(?:[A-Za-z0-9_+-]+)?\s*\n?(.*?)\n?```$", text, flags=re.DOTALL)
136+
if fenced_match:
137+
text = fenced_match.group(1).strip()
138+
135139
candidates: List[str] = []
136140
if text.startswith("[") and text.endswith("]"):
137141
try:

tests/test_search_batch.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,67 @@ def mock_search(self, query):
525525
self.assertIn(0, results[0].query_scores)
526526
self.assertIn(1, results[0].query_scores)
527527

528+
def test_same_document_merges_tags_deterministically(self):
    """Two queries hitting one file yield a single result with merged tags.

    The merged tag list must follow first-seen order across the queries.
    """

    def make_hit(score, tags):
        # Each hit is a throwaway class (not an instance) whose class
        # attributes stand in for the fields of a search result.
        return type('HybridResult', (), {
            'filepath': 'shared.md',
            'display_path': 'shared.md',
            'title': 'shared.md',
            'context': None,
            'hash': 'h1',
            'docid': 'd1',
            'collection': 'test',
            'modified_at': '2026-01-01',
            'body_length': 100,
            'body': 'shared content',
            'score': score,
            'source': 'hybrid',
            'tags': tags,
        })

    per_query_hits = [
        [make_hit(0.8, ['alpha', 'shared'])],
        [make_hit(0.9, ['shared', 'beta'])],
    ]
    calls_made = 0

    def mock_search(self, query):
        # Serve the canned hits in call order, one list per query.
        nonlocal calls_made
        hits = per_query_hits[calls_made]
        calls_made += 1
        return hits

    stub_backend = StubBackend()
    stub_storage = StubStorage()

    with patch.object(HybridSearcher, '__init__', lambda self, **kwargs: None), \
            patch.object(HybridSearcher, 'search', mock_search):
        results = search_batch(
            ["query one", "query two"],
            backend=stub_backend,
            storage=stub_storage,
            limit=10,
        )

    self.assertEqual(len(results), 1)
    self.assertEqual(results[0].tags, ["alpha", "shared", "beta"])
588+
528589

529590
if __name__ == "__main__":
530-
unittest.main()
591+
unittest.main()

tests/test_storage.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1340,6 +1340,28 @@ def test_memory_lookup_surfaces_media_tags(self):
13401340
["neural network", "diagram", "hidden layers"],
13411341
)
13421342

1343+
def test_generated_media_tags_strip_fenced_json(self):
    """Tags returned inside a fenced ```json block are stored as a plain list."""
    expected_tags = ["diagram", "hidden layers", "neural network"]

    def fenced_json(_prompt: str, max_tokens: int = 60) -> str:
        # Simulates a text generator that wraps its JSON answer in a fence.
        return '```json\n["diagram", "hidden layers", "neural network"]\n```'

    captioner = CaptioningEmbedder()
    captioner.generate_text = fenced_json

    self.backend.index_image(
        path=self.image_path,
        collection="test",
        embed_func=captioner,
        caption_media=True,
    )

    image_query = self.backend._embeddings_table.search().where("content_type = 'image'")
    image_rows = image_query.to_list()
    self.assertEqual(len(image_rows), 1)

    stored_tags = json.loads(image_rows[0].get("tags") or "[]")
    self.assertEqual(stored_tags, expected_tags)
1364+
13431365
def test_index_video_keeps_parent_memory_and_links_children(self):
13441366
embedder = CaptioningEmbedder()
13451367
logical_path = str(Path(self.video_path).expanduser().resolve())

0 commit comments

Comments
 (0)