From 970dd29b307526dee22314a5662b82c5fd1c6117 Mon Sep 17 00:00:00 2001 From: cosmos-dev Date: Fri, 26 Jun 2026 21:57:56 +0000 Subject: [PATCH 1/8] feat(cosmos-retriever): vendor the Cosmos Retriever Python service FastAPI service (POST /search, GET /health) wrapping CosmosRetriever, which runs a multi-turn retrieval agent over a Cosmos DB corpus. Pluggable inference backend: harmony_vllm (fine-tuned pat-jj/harness-1), openai_chat (any OpenAI-compatible chat model), or openai_responses (reasoning models such as gpt-5.4 on Azure AI Foundry). Includes tests for the server and agent loops. The .NET agentic_search tool calls this service over HTTP. --- cosmos-retriever/.env.example | 61 + cosmos-retriever/.github/workflows/ci.yml | 37 + cosmos-retriever/.gitignore | 37 + cosmos-retriever/.python-version | 1 + cosmos-retriever/LICENSE | 201 + cosmos-retriever/README.md | 149 + cosmos-retriever/corpus_registry.json | 17 + cosmos-retriever/pyproject.toml | 80 + cosmos-retriever/scripts/bench_browsecomp.py | 199 + cosmos-retriever/scripts/bench_erag.py | 169 + cosmos-retriever/scripts/diagnose_qst_0099.py | 155 + cosmos-retriever/scripts/erag_repeat.py | 52 + .../scripts/run_with_upstream_env.sh | 46 + .../src/cosmos_retriever/__init__.py | 12 + .../src/cosmos_retriever/__main__.py | 127 + .../src/cosmos_retriever/agent.py | 224 + .../src/cosmos_retriever/config.py | 461 ++ .../cosmos_retriever/datagen/BrowseComp-Plus | 1 + .../src/cosmos_retriever/datagen/README.md | 96 + .../src/cosmos_retriever/datagen/__init__.py | 0 .../datagen/generate_sft_rl_splits.py | 140 + .../datagen/search_dataset.py | 1333 +++++ .../datagen/splits/browsecompplus_splits.json | 846 +++ .../datagen/splits/patents_splits.json | 3252 ++++++++++++ .../datagen/splits/sec_splits.json | 4685 +++++++++++++++++ .../datagen/splits/summary.json | 34 + .../datagen/splits/web_splits.json | 2794 ++++++++++ .../src/cosmos_retriever/env_rl.py | 812 +++ .../cosmos_retriever/inference/__init__.py | 15 + 
.../src/cosmos_retriever/inference/base.py | 29 + .../inference/evaluate_harness1_vllm.py | 498 ++ .../cosmos_retriever/inference/openai_chat.py | 367 ++ .../src/cosmos_retriever/inference/vllm.py | 309 ++ .../src/cosmos_retriever/prompts.py | 90 + .../src/cosmos_retriever/rerank.py | 483 ++ .../src/cosmos_retriever/retriever.py | 497 ++ .../src/cosmos_retriever/server.py | 178 + .../src/cosmos_retriever/tasks.py | 422 ++ .../src/cosmos_retriever/tools.py | 878 +++ .../src/cosmos_retriever/trajectory.py | 491 ++ .../src/cosmos_retriever/ultra_core.py | 1951 +++++++ .../src/cosmos_retriever/utils.py | 28 + cosmos-retriever/tests/__init__.py | 1 + cosmos-retriever/tests/conftest.py | 26 + cosmos-retriever/tests/test_chat_agent.py | 271 + cosmos-retriever/tests/test_server.py | 99 + cosmos-retriever/tests/test_tools.py | 104 + cosmos-retriever/tests/test_trajectory.py | 134 + 48 files changed, 22892 insertions(+) create mode 100644 cosmos-retriever/.env.example create mode 100644 cosmos-retriever/.github/workflows/ci.yml create mode 100644 cosmos-retriever/.gitignore create mode 100644 cosmos-retriever/.python-version create mode 100644 cosmos-retriever/LICENSE create mode 100644 cosmos-retriever/README.md create mode 100644 cosmos-retriever/corpus_registry.json create mode 100644 cosmos-retriever/pyproject.toml create mode 100644 cosmos-retriever/scripts/bench_browsecomp.py create mode 100644 cosmos-retriever/scripts/bench_erag.py create mode 100644 cosmos-retriever/scripts/diagnose_qst_0099.py create mode 100644 cosmos-retriever/scripts/erag_repeat.py create mode 100755 cosmos-retriever/scripts/run_with_upstream_env.sh create mode 100644 cosmos-retriever/src/cosmos_retriever/__init__.py create mode 100644 cosmos-retriever/src/cosmos_retriever/__main__.py create mode 100644 cosmos-retriever/src/cosmos_retriever/agent.py create mode 100644 cosmos-retriever/src/cosmos_retriever/config.py create mode 120000 cosmos-retriever/src/cosmos_retriever/datagen/BrowseComp-Plus 
create mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/README.md create mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/__init__.py create mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/generate_sft_rl_splits.py create mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/search_dataset.py create mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/splits/browsecompplus_splits.json create mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/splits/patents_splits.json create mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/splits/sec_splits.json create mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/splits/summary.json create mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/splits/web_splits.json create mode 100644 cosmos-retriever/src/cosmos_retriever/env_rl.py create mode 100644 cosmos-retriever/src/cosmos_retriever/inference/__init__.py create mode 100644 cosmos-retriever/src/cosmos_retriever/inference/base.py create mode 100644 cosmos-retriever/src/cosmos_retriever/inference/evaluate_harness1_vllm.py create mode 100644 cosmos-retriever/src/cosmos_retriever/inference/openai_chat.py create mode 100644 cosmos-retriever/src/cosmos_retriever/inference/vllm.py create mode 100644 cosmos-retriever/src/cosmos_retriever/prompts.py create mode 100644 cosmos-retriever/src/cosmos_retriever/rerank.py create mode 100644 cosmos-retriever/src/cosmos_retriever/retriever.py create mode 100644 cosmos-retriever/src/cosmos_retriever/server.py create mode 100644 cosmos-retriever/src/cosmos_retriever/tasks.py create mode 100644 cosmos-retriever/src/cosmos_retriever/tools.py create mode 100644 cosmos-retriever/src/cosmos_retriever/trajectory.py create mode 100644 cosmos-retriever/src/cosmos_retriever/ultra_core.py create mode 100644 cosmos-retriever/src/cosmos_retriever/utils.py create mode 100644 cosmos-retriever/tests/__init__.py create mode 100644 cosmos-retriever/tests/conftest.py create mode 100644 
cosmos-retriever/tests/test_chat_agent.py create mode 100644 cosmos-retriever/tests/test_server.py create mode 100644 cosmos-retriever/tests/test_tools.py create mode 100644 cosmos-retriever/tests/test_trajectory.py diff --git a/cosmos-retriever/.env.example b/cosmos-retriever/.env.example new file mode 100644 index 0000000..8c9f8dc --- /dev/null +++ b/cosmos-retriever/.env.example @@ -0,0 +1,61 @@ +# ----- Inference backend ----- +# "harmony_vllm" (default): the fine-tuned pat-jj/harness-1 checkpoint served by +# vLLM, driven with raw Harmony token-IDs (set VLLM_* below). +# "openai_chat": ANY OpenAI-compatible chat model (Azure AI Foundry deployment, +# OpenAI, local server, ...) driven with standard function/tool calling +# (set CHAT_* below). +INFERENCE_BACKEND=harmony_vllm + +# ----- vLLM serving the Harness-1 model (harmony_vllm backend) ----- +# URL of the OpenAI-compatible vLLM endpoint that serves `pat-jj/harness-1`. +VLLM_BASE_URL=http://127.0.0.1:8000 +VLLM_MODEL_NAME=harness-1 +VLLM_TIMEOUT_S=900 + +# ----- Generic chat LLM endpoint (openai_chat backend) ----- +# Any OpenAI-compatible chat-completions endpoint. For Azure AI Foundry, +# CHAT_BASE_URL is the deployment's OpenAI-compatible URL and CHAT_MODEL is the +# deployment name. Set CHAT_API_VERSION to use the Azure OpenAI client instead. +# CHAT_BASE_URL=https://your-resource.services.ai.azure.com/openai/v1 +# CHAT_API_KEY= +# CHAT_MODEL=gpt-4o +# CHAT_API_VERSION= +# CHAT_TEMPERATURE=0.7 +# CHAT_MAX_TOKENS=4096 +# CHAT_MAX_TURNS=20 + +# ----- Cosmos DB (required) ----- +# The corpus container must already be ingested with `id`, `docid`, `chunk_idx`, +# `text`, and `embedding` fields per the Harness-1 schema. +ACCOUNT_URI=https://your-cosmos-account.documents.azure.com:443/ +COSMOS_DATABASE=your-database-name +COSMOS_CORPUS_CONTAINER=your-corpus-container +# Optional: leave unset to use AzureCliCredential / DefaultAzureCredential. 
+# COSMOS_KEY= + +# ----- Embeddings for SearchCorpusTool (required) ----- +# OpenAI by default; set AZURE_OPENAI_* to route through Azure OpenAI instead. +OPENAI_API_KEY=sk-... +OPENAI_EMBEDDING_MODEL=text-embedding-3-small +# AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com +# AZURE_OPENAI_API_KEY= +# Optional instruction prepended to embedding queries (used by some Qwen embedders). +# EMBED_QUERY_INSTRUCTION= + +# ----- Reranker (optional) ----- +# Pick at most one of these. Leave both unset to disable reranking. +# Baseten: +# BASETEN_API_KEY= +# BASETEN_MODEL_URL=https://model-xyz.api.baseten.co/environments/production/sync +# Local vLLM Qwen3-Reranker-8B (run on a separate port): +# VLLM_RERANKER_URL=http://127.0.0.1:8011 + +# ----- Retriever budget knobs (optional) ----- +# COSMOS_RETRIEVER_MAX_TURNS=35 +# COSMOS_RETRIEVER_THRESHOLD_BUDGET=16384 +# COSMOS_RETRIEVER_TOKEN_BUDGET=32268 + +# ----- HTTP server ----- +HOST=0.0.0.0 +PORT=9000 +LOG_LEVEL=info diff --git a/cosmos-retriever/.github/workflows/ci.yml b/cosmos-retriever/.github/workflows/ci.yml new file mode 100644 index 0000000..86d3c81 --- /dev/null +++ b/cosmos-retriever/.github/workflows/ci.yml @@ -0,0 +1,37 @@ +name: ci + +on: + push: + branches: [main] + pull_request: + +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint-and-test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + + - name: Install package with dev extras + run: uv pip install --system -e ".[dev]" + + - name: Ruff lint + run: ruff check src tests + + - name: Pytest + run: pytest -q diff --git a/cosmos-retriever/.gitignore b/cosmos-retriever/.gitignore new file mode 100644 index 0000000..8133b6c --- /dev/null +++ b/cosmos-retriever/.gitignore @@ 
-0,0 +1,37 @@ +# --- Python --- +__pycache__/ +*.py[cod] +*$py.class +*.egg-info/ +.eggs/ +build/ +dist/ +.coverage +.coverage.* +htmlcov/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ + +# --- Virtual envs --- +.venv/ +venv/ +env/ + +# --- IDE --- +.vscode/ +.idea/ +*.swp + +# --- Secrets / local config --- +.env +.env.local +.env.*.local + +# --- Logs / scratch --- +*.log +tmp/ +runs/ + +# --- Build artefacts --- +src/*.egg-info/ diff --git a/cosmos-retriever/.python-version b/cosmos-retriever/.python-version new file mode 100644 index 0000000..2c07333 --- /dev/null +++ b/cosmos-retriever/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/cosmos-retriever/LICENSE b/cosmos-retriever/LICENSE new file mode 100644 index 0000000..29f81d8 --- /dev/null +++ b/cosmos-retriever/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/cosmos-retriever/README.md b/cosmos-retriever/README.md new file mode 100644 index 0000000..467443d --- /dev/null +++ b/cosmos-retriever/README.md @@ -0,0 +1,149 @@ +# Cosmos Retriever (Python helper) + +A Python library + FastAPI service that runs the **Harness-1** multi-turn +search agent (`pat-jj/harness-1`, a fine-tuned `openai/gpt-oss-20b` served by +vLLM) against an Azure Cosmos DB corpus and returns the curated documents as +JSON. + +The [Azure Cosmos DB MCP Toolkit](../MCPToolKit/)'s `agentic_search` tool +calls this service's `POST /search` endpoint over HTTP. A one-shot CLI is also +provided for local testing. + +```text + Claude Desktop / AI Foundry / VS Code + │ + │ MCP streamable-HTTP + ▼ + Azure Cosmos DB MCP Toolkit (.NET) + ├─ list_databases / list_collections / ... 
(8 native tools) + └─ agentic_search ◀─── 9th tool + │ + │ HTTP: POST http://127.0.0.1:9000/search + ▼ + cosmos_retriever (this package, FastAPI + uvicorn) + ├─ TokenBudgetRetrievalSubagent + ├─ SearchCorpus / Grep / ReadDocument / PruneChunks tools + └─ VLLMHarmonyInferenceModel ──► vLLM /v1/completions (token-IDs) + Cosmos DB hybrid RRF + Azure OpenAI embeddings + Qwen3-Reranker (Baseten or local vLLM) +``` + +## Install + +```bash +cd cosmos-retriever +uv venv --python 3.11 .venv +uv pip install --python .venv/bin/python -e ".[dev]" +``` + +## HTTP service + +The MCP Toolkit talks to a long-lived FastAPI service. Start it with: + +```bash +python -m cosmos_retriever serve # binds HOST:PORT (default 0.0.0.0:9000) +``` + +Endpoints: + +| Method & path | Body / response | +|---|---| +| `GET /health` | `{"status": "ok"}` | +| `POST /search` | request `{"query": str, "maxDocuments": int, "database": str?, "container": str?}` → the JSON result below | + +```bash +curl -s http://127.0.0.1:9000/search \ + -H 'content-type: application/json' \ + -d '{"query": "Who discovered radium?", "maxDocuments": 5}' +``` + +## CLI + +A one-shot CLI for local testing. JSON goes to **stdout**, logs go to **stderr**. + +```bash +python -m cosmos_retriever search \ + --query "Who discovered radium?" \ + --max-documents 5 +``` + +Output (same schema returned by `POST /search`): +```json +{ + "query": "Who discovered radium?", + "num_turns": 5, + "elapsed_s": 32.3, + "documents": [ + { "id": "96308__3", "rank": 0, "justification": "...", "text": "..." } + ] +} +``` + +## Configuration + +All settings come from environment variables (or a `.env` / `.env.local` file +at the repo root). 
Required: + +| Variable | Purpose | +|---|---| +| `VLLM_BASE_URL` | OpenAI-compatible vLLM endpoint serving Harness-1 (used by the default `harmony_vllm` backend; the `openai_chat` backend is configured through `CHAT_*` instead, see below) | +| `ACCOUNT_URI` / `COSMOS_DATABASE` / `COSMOS_CORPUS_CONTAINER` | Cosmos target | +| `OPENAI_API_KEY` *(or `AZURE_OPENAI_*`)* | Embeddings backend | + +### Inference backend + +`INFERENCE_BACKEND` selects what drives the retrieval agent: + +| Value | Model | Endpoint vars | +|---|---|---| +| `harmony_vllm` *(default)* | The fine-tuned `pat-jj/harness-1` checkpoint, driven with raw Harmony token-IDs. | `VLLM_BASE_URL`, `VLLM_MODEL_NAME` | +| `openai_chat` | **Any** OpenAI-compatible chat model (Azure AI Foundry deployment, OpenAI, local server, ...), driven with standard function/tool calling. | `CHAT_BASE_URL`, `CHAT_API_KEY`, `CHAT_MODEL`, optional `CHAT_API_VERSION` | + +With `openai_chat` the agent uses the same Cosmos tools, so retrieval quality +depends on the chosen model's tool-use ability rather than the Harness-1 +checkpoint. Example (Azure AI Foundry): + +```bash +INFERENCE_BACKEND=openai_chat \ +CHAT_BASE_URL=https://your-resource.services.ai.azure.com/openai/v1 \ +CHAT_API_KEY=... \ +CHAT_MODEL=gpt-4o \ +python -m cosmos_retriever serve +``` + +Optional reranker (pick at most one): +- `BASETEN_API_KEY` + `BASETEN_MODEL_URL` — Baseten Qwen3-Reranker-8B classify +- `VLLM_RERANKER_URL` — local vLLM `/score` endpoint with Qwen3-Reranker-8B + +A bundled wrapper script reads `../harness-1/.env.local` (the upstream repo's +local config) and re-exports under our variable names: + +```bash +scripts/run_with_upstream_env.sh \ + python -m cosmos_retriever search --query "..." 
+``` + +## Layout + +```text +src/cosmos_retriever/ + __init__.py # CosmosRetriever, RetrievalResult, RetrievedDocument + __main__.py # `python -m cosmos_retriever {search,serve}` + server.py # FastAPI app: GET /health + POST /search + retriever.py # CosmosRetriever facade + agent.py # 3 agent classes + prune_chunks_from_trajectory + tools.py # SearchCorpus / Grep / ReadDocument / PruneChunks + trajectory.py # Action / Observation / Trajectory + Harmony rendering + rerank.py # Reranker ABC + Baseten + local-vLLM + inference/ + base.py # AgentInferenceModel ABC + vllm.py # VLLMHarmonyInferenceModel (httpx → /v1/completions) + prompts.py # retrieval subagent system prompt + config.py # RetrieverSettings (pydantic-settings) + utils.py +``` + +## License + +Apache 2.0. diff --git a/cosmos-retriever/corpus_registry.json b/cosmos-retriever/corpus_registry.json new file mode 100644 index 0000000..3f608fb --- /dev/null +++ b/cosmos-retriever/corpus_registry.json @@ -0,0 +1,17 @@ +{ + "browsecomp_corpus_container": { + "account_uri": "https://aryans-internship-cosmos.documents.azure.com:443/", + "database": "search_retrieval_database", + "embed_base_url": "https://embedding-west-us-resource.services.ai.azure.com/openai/v1", + "embed_api_key_env": "AZURE_OPENAI_EMBED_API_KEY", + "embed_model": "text-embedding-3-small" + }, + "enterprise_ragbench_corpus": { + "account_uri": "https://aryans-internship-cosmos-prov.documents.azure.com:443/", + "database": "search_retrieval_database", + "embed_base_url": "http://172.17.0.2:8002/v1", + "embed_api_key_env": null, + "embed_model": "qwen3-embed", + "embed_query_instruction": "Given a question, retrieve documents that answer it" + } +} diff --git a/cosmos-retriever/pyproject.toml b/cosmos-retriever/pyproject.toml new file mode 100644 index 0000000..172321c --- /dev/null +++ b/cosmos-retriever/pyproject.toml @@ -0,0 +1,80 @@ +[project] +name = "cosmos-retriever" +version = "0.1.0" +description = "Multi-turn search agent (Harness-1) as 
a Python library + CLI, designed to be invoked by the Azure Cosmos DB MCP Toolkit's `agentic_search` tool." +readme = "README.md" +requires-python = ">=3.11" +license = { text = "Apache-2.0" } +authors = [{ name = "Harness-1 Contributors" }] +keywords = ["retrieval", "rag", "cosmos-db", "vllm", "agent"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] +dependencies = [ + "anyio>=4.0,<5", + "azure-cosmos>=4.7,<5", + "azure-identity>=1.17,<2", + "fastapi>=0.110,<1", + "httpx>=0.27,<1", + "json-repair>=0.20,<1", + "openai>=1.40,<2", + "openai-harmony>=0.0.8,<1", + "pydantic>=2.7,<3", + "pydantic-settings>=2.4,<3", + "structlog>=24,<26", + "tenacity>=8.3,<10", + "tiktoken>=0.7,<1", + "uvicorn>=0.30,<1", +] + +[project.optional-dependencies] +baseten = ["baseten-performance-client>=0.4,<1"] +dev = [ + "mypy>=1.10,<2", + "pytest>=8,<9", + "pytest-asyncio>=0.23,<1", + "respx>=0.21,<1", + "ruff>=0.6,<1", +] + +[project.scripts] +cosmos-retriever = "cosmos_retriever.__main__:main" + +[project.urls] +Homepage = "https://github.com/your-org/cosmos-retriever" + +[build-system] +requires = ["hatchling>=1.24"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/cosmos_retriever"] + +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = ["E", "F", "I", "B", "UP", "SIM", "N"] +ignore = ["E501"] # line length handled by formatter + +[tool.ruff.lint.per-file-ignores] +"tests/*" = ["B", "N"] + +[tool.mypy] +python_version = "3.11" +strict = false +warn_unused_ignores = true +warn_redundant_casts = true +ignore_missing_imports = true +files = ["src/cosmos_retriever"] + +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] +addopts = "-q" diff --git 
a/cosmos-retriever/scripts/bench_browsecomp.py b/cosmos-retriever/scripts/bench_browsecomp.py new file mode 100644 index 0000000..7b450f0 --- /dev/null +++ b/cosmos-retriever/scripts/bench_browsecomp.py @@ -0,0 +1,199 @@ +"""Run an N-question slice of BrowseComp+ through the standalone retriever, +score recall@curated against gold docs, and save per-query records as JSONL. + +Usage:: + + # with reranker (default — VLLM_RERANKER_URL must be set in the env) + python scripts/bench_browsecomp.py \\ + --n 83 --seed 42 --parallel 4 \\ + --container browsecomp_corpus_container \\ + --output runs/bench_bc83_rerank.jsonl + + # without reranker + VLLM_RERANKER_URL= python scripts/bench_browsecomp.py \\ + --n 83 --seed 42 --parallel 4 \\ + --container browsecomp_corpus_container \\ + --output runs/bench_bc83_norerank.jsonl + +Records contain: query_id, query, union_pos, gold_pos, curated_docids, pool_docids, +num_curated, n_pool, recall, trajectory_recall, final_answer_recall, precision, +num_turns, elapsed_s, error (plus traceback when a query fails). +""" + +from __future__ import annotations + +import argparse +import json +import os +import random +import sys +import time +import traceback +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +from cosmos_retriever.config import RetrieverSettings, init_logging +from cosmos_retriever.retriever import CosmosRetriever + +DATASET = Path("/nvme/harness-1/external/BrowseComp-Plus/data/browsecomp_plus_decrypted.jsonl") +QREL_GOLD = Path("/nvme/harness-1/external/BrowseComp-Plus/topics-qrels/qrel_golds.txt") +QREL_EVIDENCE = Path("/nvme/harness-1/external/BrowseComp-Plus/topics-qrels/qrel_evidence.txt") + + +def load_qrels(path: Path) -> dict[str, set[str]]: + """TREC qrels: ``query_id Q0 doc_id relevance`` -> {qid: {docid, ...}}.""" + d: dict[str, set[str]] = defaultdict(set) + if not path.exists(): + return d + for line in path.open(): + parts = line.split() + if len(parts) == 4: + d[parts[0]].add(parts[2]) + return d + 
+ +_GOLD = load_qrels(QREL_GOLD) +_EVIDENCE = load_qrels(QREL_EVIDENCE) +# Reference positives for "Recall" = gold ∪ evidence (search_dataset.py BrowseCompPlusDataset). +_UNION: dict[str, set[str]] = defaultdict(set) +for _q in set(_GOLD) | set(_EVIDENCE): + _UNION[_q] = _GOLD.get(_q, set()) | _EVIDENCE.get(_q, set()) + + +def load_dataset(n: int, seed: int) -> list[dict]: + rows = [json.loads(l) for l in DATASET.open()] + rng = random.Random(seed) + rng.shuffle(rows) + return rows[:n] + + +def score(retrieved_chunk_ids: list[str], gold_docids: set[str]) -> tuple[float, float]: + if not gold_docids: + return 0.0, 0.0 + retrieved_docids = {cid.split("__")[0] for cid in retrieved_chunk_ids} + hit = retrieved_docids & gold_docids + recall = len(hit) / len(gold_docids) + precision = len(hit) / len(retrieved_docids) if retrieved_docids else 0.0 + return recall, precision + + +def _recall(found: set[str], positives: set[str]) -> float: + return len(found & positives) / len(positives) if positives else 0.0 + + +def run_one(retriever: CosmosRetriever, row: dict, max_docs: int) -> dict: + qid = row["query_id"] + query = row["query"] + gold_pos = _GOLD.get(str(qid), set()) # final-answer positives + union_pos = _UNION.get(str(qid), set()) # reference "Recall" positives = gold ∪ evidence + started = time.perf_counter() + try: + result = retriever.search(query=query, max_documents=max_docs) + elapsed = time.perf_counter() - started + curated_docids = {d.id.split("__")[0] for d in result.documents} + pool_docids = set(result.pool_doc_ids) + recall = _recall(curated_docids, union_pos) # Recall (curated set) + trajectory_recall = _recall(pool_docids, union_pos) # Trajectory Recall (pool) + final_answer_recall = _recall(curated_docids, gold_pos) # Final-Answer Recall (curated vs gold) + precision = ( + len(curated_docids & union_pos) / len(curated_docids) if curated_docids else 0.0 + ) + return { + "query_id": qid, + "query": query, + "union_pos": sorted(union_pos), + "gold_pos": 
def main() -> int:
    """Benchmark the retriever on a sampled dataset slice and stream JSONL records.

    Parses the CLI flags, builds one shared ``CosmosRetriever``, fans the
    sampled rows out over a thread pool via ``run_one`` and writes each
    finished record to ``--output`` as soon as it completes (completion order,
    not input order). Progress lines and the final averages go to stderr.

    Returns:
        ``0`` once every row has been processed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--n", type=int, default=83)
    cli.add_argument("--seed", type=int, default=42)
    cli.add_argument("--parallel", type=int, default=4)
    cli.add_argument("--container", default="browsecomp_corpus_container")
    cli.add_argument("--max-documents", type=int, default=30)
    cli.add_argument("--output", required=True)
    opts = cli.parse_args()

    init_logging()
    settings = RetrieverSettings()
    reranker_state = "ON" if settings.vllm_reranker_url else "OFF"
    print(
        f"[bench] reranker={reranker_state} "
        f"vllm={settings.vllm_base_url} container={opts.container} n={opts.n} parallel={opts.parallel}",
        file=sys.stderr,
    )

    rows = load_dataset(opts.n, opts.seed)
    retriever = CosmosRetriever(settings=settings, corpus_name=opts.container)

    out_path = Path(opts.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    total = len(rows)
    finished = 0
    failures = 0
    # Running sums of the three recall flavours, keyed by record field name.
    sums = {"recall": 0.0, "trajectory_recall": 0.0, "final_answer_recall": 0.0}
    with out_path.open("w") as sink, ThreadPoolExecutor(max_workers=opts.parallel) as pool:
        pending = [pool.submit(run_one, retriever, row, opts.max_documents) for row in rows]
        for future in as_completed(pending):
            rec = future.result()
            # Flush per record so a crash or Ctrl-C still leaves usable output.
            sink.write(json.dumps(rec) + "\n")
            sink.flush()
            finished += 1
            for metric in sums:
                sums[metric] += rec[metric]
            error_flag = "N"
            if rec["error"]:
                failures += 1
                error_flag = "Y"
            print(
                f"[bench] {finished}/{total} qid={rec['query_id']} "
                f"recall={rec['recall']:.2f} traj={rec['trajectory_recall']:.2f} fa={rec['final_answer_recall']:.2f} "
                f"n_cur={rec['num_curated']} n_pool={rec['n_pool']} "
                f"turns={rec['num_turns']} elapsed={rec['elapsed_s']}s "
                f"err={error_flag}",
                file=sys.stderr,
            )

    # Guard the averages against an empty run.
    denom = max(finished, 1)
    print(
        f"[bench] DONE n={finished} Recall={sums['recall']/denom:.3f} "
        f"Trajectory={sums['trajectory_recall']/denom:.3f} "
        f"Final-Answer={sums['final_answer_recall']/denom:.3f} errors={failures}",
        file=sys.stderr,
    )
    return 0
def score(retrieved_chunk_ids: list[str], gold_docids: set[str]) -> tuple[float, float]:
    """Compute document-level ``(recall, precision)`` for one query.

    Chunk ids are reduced to document ids by keeping the text before the
    first ``"__"``; duplicates collapse because the comparison is set-based.

    Args:
        retrieved_chunk_ids: Chunk ids returned by the retriever.
        gold_docids: Document ids considered relevant.

    Returns:
        ``(recall, precision)``. Both are ``0.0`` when ``gold_docids`` is
        empty; precision is ``0.0`` when nothing was retrieved.
    """
    if not gold_docids:
        return 0.0, 0.0

    found_docs = set()
    for chunk_id in retrieved_chunk_ids:
        found_docs.add(chunk_id.partition("__")[0])

    matches = len(found_docs & gold_docids)
    recall = matches / len(gold_docids)
    if not found_docs:
        return recall, 0.0
    return recall, matches / len(found_docs)
def main() -> int:
    """Benchmark the retriever on a sampled ERAG slice and stream JSONL records.

    Parses the CLI flags, builds one shared ``CosmosRetriever``, runs
    ``run_one`` for every sampled row on a thread pool and appends each
    finished record to ``--output`` in completion order. Progress and the
    final mean recall are printed to stderr.

    Returns:
        ``0`` once every row has been processed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--n", type=int, default=500)
    cli.add_argument("--seed", type=int, default=42)
    cli.add_argument("--parallel", type=int, default=4)
    cli.add_argument("--container", default="enterprise_ragbench_corpus")
    cli.add_argument("--max-documents", type=int, default=20)
    cli.add_argument("--output", required=True)
    opts = cli.parse_args()

    init_logging()
    settings = RetrieverSettings()
    reranker_state = "ON" if settings.vllm_reranker_url else "OFF"
    print(
        f"[bench] reranker={reranker_state} "
        f"vllm={settings.vllm_base_url} container={opts.container} n={opts.n} "
        f"parallel={opts.parallel} max_turns={settings.cosmos_retriever_max_turns} "
        f"threshold={settings.cosmos_retriever_threshold_budget} token={settings.cosmos_retriever_token_budget}",
        file=sys.stderr,
    )

    rows = load_dataset(opts.n, opts.seed)
    retriever = CosmosRetriever(settings=settings, corpus_name=opts.container)

    out_path = Path(opts.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    total = len(rows)
    finished = 0
    failures = 0
    recall_total = 0.0
    with out_path.open("w") as sink, ThreadPoolExecutor(max_workers=opts.parallel) as pool:
        pending = [pool.submit(run_one, retriever, row, opts.max_documents) for row in rows]
        for future in as_completed(pending):
            rec = future.result()
            # Flush per record so partial runs still leave usable output.
            sink.write(json.dumps(rec) + "\n")
            sink.flush()
            finished += 1
            recall_total += rec["recall"]
            error_flag = "N"
            if rec["error"]:
                failures += 1
                error_flag = "Y"
            print(
                f"[bench] {finished}/{total} qid={rec['query_id']} "
                f"recall={rec['recall']:.2f} n={rec['num_curated']} "
                f"turns={rec['num_turns']} elapsed={rec['elapsed_s']}s "
                f"err={error_flag}",
                file=sys.stderr,
            )

    # Guard the average against an empty run.
    mean_recall = recall_total / max(finished, 1)
    print(
        f"[bench] DONE n={finished} mean_recall={mean_recall:.3f} errors={failures}",
        file=sys.stderr,
    )
    return 0
def with_retry(label, fn, attempts=4, base_delay=1):
    """Call ``fn()`` until it succeeds, backing off exponentially between tries.

    Args:
        label: Short tag printed in front of every failure message.
        fn: Zero-argument callable to invoke.
        attempts: Maximum number of calls to ``fn``; must be at least 1.
        base_delay: Seconds to wait after the first failure; the wait doubles
            after each further failure (``base_delay * 2**k``).

    Returns:
        Whatever ``fn()`` returns on its first successful call.

    Raises:
        ValueError: If ``attempts`` is smaller than 1.
        Exception: The exception raised by the final failed call to ``fn``.
    """
    if attempts < 1:
        # Previously this fell through to ``raise None`` (a confusing TypeError).
        raise ValueError("attempts must be >= 1")

    for attempt in range(1, attempts + 1):
        try:
            return fn()
        except Exception as exc:  # noqa: BLE001 — diagnostic helper retries on any failure
            summary = f"{type(exc).__name__}: {str(exc)[:120]}"
            if attempt == attempts:
                # Out of attempts: re-raise right away instead of sleeping for
                # a retry that will never happen.
                print(f" [{label}] attempt {attempt} failed: {summary} — giving up")
                raise
            wait = base_delay * 2 ** (attempt - 1)
            print(f" [{label}] attempt {attempt} failed: {summary} — retry in {wait}s")
            time.sleep(wait)
RRF top-50 (no rerank) — embed_dim={len(emb)} ===") + terms = _tokenize_for_fts(QUERY) or [QUERY] + sql = ( + "SELECT TOP @k c.id, c.docid, c.chunk_idx FROM c\n" + "ORDER BY RANK RRF(" + "VectorDistance(c.embedding, @qVec), " + f"FullTextScore(c.text, {_fts_literal_args(terms)})" + ")" + ) + rrf_rows = with_retry( + "rrf", + lambda: _query_with_retry( + container, + sql, + [{"name": "@k", "value": 50}, {"name": "@qVec", "value": emb}], + ), + ) + gold_rank = None + for rank, r in enumerate(rrf_rows, 1): + if r["docid"] == GOLD: + gold_rank = rank + break + print(f" pool size = {len(rrf_rows)} gold_rank = {gold_rank}") + print(" top-10 ids:") + for rank, r in enumerate(rrf_rows[:10], 1): + marker = " GOLD ✓" if r["docid"] == GOLD else "" + print(f" rank={rank:>2} {r['id']}{marker}") + print() + + if gold_rank is None: + print("Gold doc not in top-50. Retrieval itself is missing it.") + return 0 + + if not settings.vllm_reranker_url: + print("=== 3. (no VLLM_RERANKER_URL set, skipping rerank check) ===") + return 0 + + from cosmos_retriever.rerank import VLLMReranker + + reranker = VLLMReranker(base_url=settings.vllm_reranker_url) + print("=== 3. 
Qwen3-Reranker reordering of those 50 ===") + + docs: list[str] = [] + for r in rrf_rows: + text_rows = with_retry( + f"fetch_{r['id']}", + lambda r=r: list( + container.query_items( + query="SELECT TOP 1 c.text FROM c WHERE c.id = @i", + parameters=[{"name": "@i", "value": r["id"]}], + partition_key=r["docid"], + ) + ), + attempts=3, + ) + docs.append(text_rows[0]["text"] if text_rows else "") + + reranked = reranker(QUERY, docs) + new_gold_rank = None + for new_rank, rr in enumerate(reranked, 1): + if rrf_rows[rr.original_index]["docid"] == GOLD: + new_gold_rank = new_rank + print( + f" rerank position = {new_rank} (was {gold_rank}) score={rr.score:.4f} GOLD ✓" + ) + break + print(" top-5 after rerank:") + for new_rank, rr in enumerate(reranked[:5], 1): + rid = rrf_rows[rr.original_index]["id"] + marker = " GOLD ✓" if rrf_rows[rr.original_index]["docid"] == GOLD else "" + print(f" rank={new_rank:>2} score={rr.score:.4f} {rid}{marker}") + if new_gold_rank is None: + print(" GOLD ABSENT in reranked list — reranker scored other docs higher.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/cosmos-retriever/scripts/erag_repeat.py b/cosmos-retriever/scripts/erag_repeat.py new file mode 100644 index 0000000..8d91ed1 --- /dev/null +++ b/cosmos-retriever/scripts/erag_repeat.py @@ -0,0 +1,52 @@ +"""Run agentic search 3x against ERAG and check whether gold doc surfaces.""" + +from __future__ import annotations + +import json +import os +import subprocess +import sys + +REPO = "/nvme/cosmos-retriever" +QUERY = ( + "What was the temporary mitigation applied to the internal load balancer " + "serving the gen-infer VIPs around 03:40 UTC that immediately reduced TCP " + "retransmits?" 
def run_once(idx: int) -> None:
    """Run the ``cosmos_retriever search`` CLI once and report whether GOLD surfaced.

    The CLI is executed as a subprocess from the virtualenv under ``REPO``
    with the module-level ``env``. On success one summary line is printed to
    stdout; on failure the diagnostics are printed to stderr and the function
    returns without raising, so the surrounding loop keeps going.

    Args:
        idx: 1-based run number, used only to label the output.
    """
    cmd = [
        f"{REPO}/.venv/bin/python",
        "-m",
        "cosmos_retriever",
        "search",
        "--container",
        "enterprise_ragbench_corpus",
        "--query",
        QUERY,
        "--max-documents",
        "5",
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True, env=env, check=False)
    if proc.returncode != 0:
        print(f"run {idx}: subprocess exit={proc.returncode}", file=sys.stderr)
        print(proc.stderr[-2000:], file=sys.stderr)
        # The CLI reports failures as a JSON ``{"error": ...}`` document on
        # stdout, so echo that too or the actual reason is lost.
        if proc.stdout.strip():
            print(proc.stdout[-2000:], file=sys.stderr)
        return

    try:
        data = json.loads(proc.stdout)
    except json.JSONDecodeError as exc:
        # Unexpected text on stdout must not abort the remaining runs.
        print(f"run {idx}: could not parse CLI output as JSON: {exc}", file=sys.stderr)
        print(proc.stdout[-2000:], file=sys.stderr)
        return

    ids = [doc["id"].split("__")[0] for doc in data["documents"]]
    # Convert to ``str`` before padding: a bool with a width spec is formatted
    # by ``int.__format__`` and would print ``1``/``0`` instead of True/False.
    gold_hit = str(GOLD in ids)
    print(
        f"run {idx}: turns={data['num_turns']:>2} "
        f"elapsed={data['elapsed_s']:>5.1f}s "
        f"gold_hit={gold_hit:>5} "
        f"ranked_ids={ids}"
    )
-r "${UPSTREAM_ENV}" ]]; then + echo "error: cannot read ${UPSTREAM_ENV}" >&2 + exit 1 +fi + +# shellcheck disable=SC1090 +set -a +source "${UPSTREAM_ENV}" +set +a + +# --- Map upstream var names to ours ----------------------------------------- +export OPENAI_EMBEDDING_MODEL="${AZURE_OPENAI_EMBED_DEPLOYMENT:-text-embedding-3-small}" +if [[ -n "${AZURE_OPENAI_EMBED_API_KEY:-}" ]]; then + # Our config reads AZURE_OPENAI_API_KEY for the embedding endpoint. + export AZURE_OPENAI_API_KEY="${AZURE_OPENAI_EMBED_API_KEY}" +fi + +# --- Point at the running vLLM in the pytorch container -------------------- +export VLLM_BASE_URL="${VLLM_BASE_URL:-http://172.17.0.2:8000}" +export VLLM_MODEL_NAME="${VLLM_MODEL_NAME:-harness-1}" +export VLLM_RERANKER_URL="${VLLM_RERANKER_URL:-http://172.17.0.2:8011}" + +# --- Sensible default timeouts / budgets so we don't wait forever ---------- +export VLLM_TIMEOUT_S="${VLLM_TIMEOUT_S:-600}" + +exec "$@" diff --git a/cosmos-retriever/src/cosmos_retriever/__init__.py b/cosmos-retriever/src/cosmos_retriever/__init__.py new file mode 100644 index 0000000..ee847cc --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/__init__.py @@ -0,0 +1,12 @@ +"""Cosmos Retriever: multi-turn search agent exposed as an MCP tool.""" + +from __future__ import annotations + +from cosmos_retriever.retriever import ( + CosmosRetriever, + RetrievalResult, + RetrievedDocument, +) + +__all__ = ["CosmosRetriever", "RetrievalResult", "RetrievedDocument"] +__version__ = "0.1.0" diff --git a/cosmos-retriever/src/cosmos_retriever/__main__.py b/cosmos-retriever/src/cosmos_retriever/__main__.py new file mode 100644 index 0000000..3d4516b --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/__main__.py @@ -0,0 +1,127 @@ +"""``python -m cosmos_retriever search ...`` — one-shot CLI. + +Designed to be invoked as a subprocess by the Azure Cosmos DB MCP Toolkit's +``agentic_search`` tool. 
Prints a single JSON document to **stdout**; +structured logs go to **stderr** so the consumer can pipe stdout straight +into ``json.loads`` / ``JsonDocument.Parse`` without filtering. + +Output schema:: + + { + "query": str, + "num_turns": int, + "elapsed_s": float, + "documents": [ + { "id": str, "text": str, "justification": str | null, "rank": int } + ] + } + +Errors are printed as ``{"error": ""}`` to stdout with non-zero exit. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from dataclasses import asdict + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="cosmos-retriever", + description="Run the Harness-1 multi-turn retrieval agent and emit JSON.", + ) + sub = parser.add_subparsers(dest="cmd", required=True) + + search = sub.add_parser("search", help="Run one search end-to-end.") + search.add_argument("--query", required=True) + search.add_argument("--max-documents", type=int, default=20) + search.add_argument( + "--database", + default=None, + help="Override Cosmos database name (else COSMOS_DATABASE env var).", + ) + search.add_argument( + "--container", + default=None, + help="Override Cosmos corpus container name (else COSMOS_CORPUS_CONTAINER env var).", + ) + + serve = sub.add_parser( + "serve", + help="Run the FastAPI HTTP service the MCP Toolkit calls into.", + ) + serve.add_argument( + "--host", + default=None, + help="Bind address (else HOST env var, default 0.0.0.0).", + ) + serve.add_argument( + "--port", + type=int, + default=None, + help="Bind port (else PORT env var, default 9000).", + ) + return parser + + +def _cmd_search(args: argparse.Namespace) -> int: + # Defer heavy imports so `--help` and bad CLI calls fail fast without + # initialising clients (or scanning .env). `get_settings()` calls + # `init_logging()` itself, so we don't need to call it explicitly. 
def _cmd_serve(args: argparse.Namespace) -> int:
    """Start the FastAPI service with uvicorn and block until it exits.

    Args:
        args: Parsed CLI namespace; ``args.host`` / ``args.port`` override the
            corresponding settings values when given.

    Returns:
        ``0`` after ``uvicorn.run`` returns.
    """
    # Defer heavy imports so `--help` stays fast and import errors surface here.
    import uvicorn  # noqa: PLC0415

    from cosmos_retriever.config import get_settings  # noqa: PLC0415
    from cosmos_retriever.server import create_app  # noqa: PLC0415

    settings = get_settings()
    host = args.host or settings.host
    # Compare against None rather than truthiness: `--port 0` (let the OS pick
    # a free port) is a valid explicit choice and must not be replaced by the
    # configured default.
    port = settings.port if args.port is None else args.port
    app = create_app(settings)
    uvicorn.run(app, host=host, port=port, log_level=settings.log_level.lower())
    return 0
class TinkerAgentInferenceModel:
    """Static helpers for parsing Harmony completion tokens into actions."""

    @staticmethod
    def _extract_first_json_object(s: str) -> Optional[str]:
        """Return the substring for the first balanced top-level JSON object/array.

        Walks the string tracking brace/bracket depth and string quoting so
        that trailing garbage (extra text, duplicate objects, ``[END]``
        markers, etc.) is silently discarded. Only the delimiter type that
        opens the value is counted. Returns ``None`` when no balanced object
        is found.
        """
        # Find the opening delimiter.
        start = next((i for i, ch in enumerate(s) if ch in ("{", "[")), -1)
        if start < 0:
            return None

        open_ch = s[start]
        close_ch = "}" if open_ch == "{" else "]"
        depth = 0
        in_str = False
        esc = False
        for i in range(start, len(s)):
            ch = s[i]
            if esc:
                # Character after a backslash inside a string: never structural.
                esc = False
                continue
            if in_str:
                if ch == "\\":
                    esc = True
                elif ch == '"':
                    in_str = False
                continue
            if ch == '"':
                in_str = True
            elif ch == open_ch:
                depth += 1
            elif ch == close_ch:
                depth -= 1
                if depth == 0:
                    return s[start : i + 1]
        return None

    @staticmethod
    def _repair_json_escapes(s: str) -> str:
        """Fix invalid backslash escapes that are illegal in JSON.

        Valid escapes (``\\"``, ``\\\\``, ``\\/``, ``\\b``, ``\\f``, ``\\n``,
        ``\\r``, ``\\t``, ``\\uXXXX``) are kept; any other backslash is
        doubled so it becomes a literal backslash. Control characters other
        than tab, newline and carriage return are removed.
        """
        # Consume valid escapes as a *pair* so the second backslash of an
        # already-escaped ``\\`` is never re-examined on its own. A plain
        # negative lookahead would double it and corrupt ``\\d`` into ``\\\d``.
        s = re.sub(
            r'\\(u[0-9a-fA-F]{4}|["\\/bfnrt])|\\',
            lambda m: m.group(0) if m.group(1) else "\\\\",
            s,
        )
        s = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', s)
        return s

    @staticmethod
    def _parse_json(json_string: str, strict_mode: bool = True) -> Any:
        """Parse JSON string with automatic fallback repairs.

        Repair pipeline (strict_mode=True):
          1. ``json.loads`` on the raw string.
          2. Extract the first balanced JSON object/array, discard trailing
             garbage, then ``json.loads`` again.
          3. Additionally fix invalid backslash escapes, then retry.
        Non-strict mode delegates to ``json_repair``.

        Raises:
            json.JSONDecodeError: In strict mode, the error from step 1 when
                no repair step produces valid JSON.
        """
        if not strict_mode:
            return json_repair.loads(json_string)

        # 1. Fast path – raw string parses cleanly
        try:
            return json.loads(json_string)
        except json.JSONDecodeError as exc:
            # Keep the error so the caller sees the failure on the raw input.
            original_error = exc

        # 2. Extract first JSON object, ignore trailing garbage
        first_obj = TinkerAgentInferenceModel._extract_first_json_object(json_string)
        if first_obj is not None:
            try:
                return json.loads(first_obj)
            except json.JSONDecodeError:
                pass

            # 3. Also fix bad escapes on the extracted object
            repaired = TinkerAgentInferenceModel._repair_json_escapes(first_obj)
            try:
                return json.loads(repaired)
            except json.JSONDecodeError:
                pass

        # Nothing worked – raise the original error for the caller to handle
        raise original_error

    @staticmethod
    def _resolve_tool(
        toolset: "ToolSet",
        raw_name: Optional[str],
        missing_message: str,
        empty_message: str,
    ) -> Any:
        """Look up the tool named by a Harmony tool name or recipient.

        Harmony formats tool names with a ``functions.`` prefix (and sometimes
        a ``<|constrain|>`` marker); both are removed before the lookup.

        Raises:
            ValueError: If the name is ``None`` (``missing_message``), empty
                after cleaning (``empty_message``), or unknown to ``toolset``.
        """
        if raw_name is None:
            raise ValueError(missing_message)
        parsed_tool_name = (
            (raw_name or "").replace("functions.", "").replace("<|constrain|>", "").strip()
        )
        if not parsed_tool_name:
            raise ValueError(empty_message)
        tool = toolset.get_tool(parsed_tool_name)
        if tool is None:
            raise ValueError(f"Tool not found: {parsed_tool_name}")
        return tool

    @staticmethod
    def handle_tool_message(
        message: "Message",
        toolset: "ToolSet",
        action_builder: "ActionBuilder",
        strict_mode: bool = True,
    ) -> None:
        """Translate one Harmony tool message into tool calls on ``action_builder``.

        A message addressed to ``functions.multi_tool_use`` carries a list of
        calls (either a bare list or a dict with a ``tool_calls`` key); any
        other recipient is a single call whose arguments are the message text.

        Raises:
            ValueError: On an unexpected payload shape or an unresolvable tool.
        """
        resolve = TinkerAgentInferenceModel._resolve_tool
        args = TinkerAgentInferenceModel._parse_json(message.content[0].text, strict_mode)

        if message.recipient != "functions.multi_tool_use":
            recipient = message.recipient
            tool = resolve(
                toolset,
                recipient,
                "Tool message has no recipient (malformed output)",
                "Tool name empty after parsing recipient",
            )
            source = (recipient or "") + "_" + uuid.uuid4().hex
            action_builder.add_tool_call(tool=tool, params=args, source=source)
            return

        tool_calls: List[Dict[str, Any]] = []
        if isinstance(args, list):
            tool_calls = args
        elif isinstance(args, dict):
            tool_calls = args["tool_calls"]
        else:
            raise ValueError(f"Invalid tool calls: {args}")
        for tool_call in tool_calls:
            tool = resolve(
                toolset,
                tool_call.get("tool_name"),
                "Tool call missing 'tool_name'",
                "Tool name empty after parsing",
            )
            tool_args = tool_call["parameters"]
            source = tool_call["tool_name"] + "_" + uuid.uuid4().hex
            action_builder.add_tool_call(tool=tool, params=tool_args, source=source)

    @staticmethod
    def tinker_tokens_to_harmony_format(
        encoding: "HarmonyEncoding", tokens: List[int]
    ) -> List["Message"]:
        """Decode sampled completion tokens into Harmony messages."""
        return encoding.parse_messages_from_completion_tokens(tokens)

    @staticmethod
    def harmony_tinker_tokens_to_action(
        encoding: "HarmonyEncoding",
        tokens: List[int],
        toolset: "ToolSet",
        strict_mode: bool = True,
    ) -> "Action":
        """Build an ``Action`` from sampled Harmony completion tokens.

        Each parsed message is routed by channel: ``analysis`` becomes
        reasoning (or a tool call when it names a recipient), ``commentary``
        becomes tool calls, ``final`` becomes a ``UserTextTool`` call, and a
        message without a channel is kept as reasoning when it has text.

        Raises:
            ValueError: If a message uses any other channel.
        """
        action_builder = ActionBuilder()
        parsed = TinkerAgentInferenceModel.tinker_tokens_to_harmony_format(
            encoding, tokens
        )
        # Plain iteration: the index was never used, and calling the
        # subscripted alias ``enumerate[Message](...)`` only obscured the loop.
        for message in parsed:
            if message.channel == "analysis":
                # NOTE: GPT oss 20b occasionally outputs a tool call on analysis, since built in tools are allowed to do so
                # we respect the call and redirect to commentary channel for now
                if message.recipient:
                    logger.warning(
                        "Output tool call on analysis channel, redirecting to commentary channel"
                    )
                    TinkerAgentInferenceModel.handle_tool_message(
                        message, toolset, action_builder, strict_mode
                    )
                else:
                    action_builder.add_reasoning(message.content[0].text)

            elif message.channel == "commentary":
                TinkerAgentInferenceModel.handle_tool_message(
                    message, toolset, action_builder, strict_mode
                )
            elif message.channel == "final":
                action_builder.add_tool_call(
                    tool=UserTextTool(),
                    params={"text": str(message.content[0].text)},
                    source="agent",
                )
            elif message.channel is None:
                # Handle messages with no channel - likely incomplete/malformed tokens
                # Try to extract any text content as reasoning if available
                if (
                    message.content
                    and hasattr(message.content[0], "text")
                    and message.content[0].text
                ):
                    logger.debug(
                        f"Message with None channel, treating as reasoning: {message.content[0].text[:100]}..."
                    )
                    action_builder.add_reasoning(message.content[0].text)
                else:
                    logger.debug(
                        "Skipping message with None channel and no usable content"
                    )
            else:
                raise ValueError(f"Unknown channel: {message.channel}")
        return action_builder.build()
def init_logging(
    app_level: int = logging.INFO,
    *,
    lib_level: int = logging.WARNING,
    colors: bool = True,
) -> None:
    """Set up stdlib logging and structlog, both writing to **stderr**.

    Keeping log output off stdout lets subprocess callers (e.g. the MCP
    Toolkit's ``agentic_search`` tool) treat stdout as the JSON result only.

    Args:
        app_level: Minimum level for structlog loggers.
        lib_level: Level applied to the stdlib root logger, so library log
            thresholds are not lowered.
        colors: Whether the console renderer emits colors.
    """
    # ``force=True`` replaces any handlers already attached to the root logger.
    logging.basicConfig(
        stream=sys.stderr,
        level=lib_level,
        format="%(message)s",
        force=True,
    )

    pipeline = [
        structlog.processors.TimeStamper(fmt="iso", utc=True),
        structlog.processors.add_log_level,
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.dev.ConsoleRenderer(colors=colors),
    ]
    bound_logger_cls = structlog.make_filtering_bound_logger(app_level)
    stderr_factory = structlog.PrintLoggerFactory(file=sys.stderr)
    structlog.configure_once(
        processors=pipeline,
        wrapper_class=bound_logger_cls,
        cache_logger_on_first_use=True,
        logger_factory=stderr_factory,
    )
+ # "openai_chat": ANY OpenAI-compatible chat model (Azure AI Foundry + # deployment, OpenAI, local server, ...) driven with standard + # /chat/completions function/tool calling. Set the CHAT_* vars below. + # "openai_responses": same, but via the /responses API (required by + # reasoning models like gpt-5.x that are only exposed there). + inference_backend: str = Field( + default="harmony_vllm", + description='Inference backend: "harmony_vllm", "openai_chat", or "openai_responses".', + ) + + # --- vLLM serving the Harness-1 model (harmony_vllm backend) ------------ + vllm_base_url: str = Field( + default="http://127.0.0.1:8000", + description="OpenAI-compatible vLLM endpoint serving pat-jj/harness-1.", + ) + vllm_model_name: str = Field(default="harness-1") + vllm_timeout_s: float = Field(default=900.0, ge=1.0) + + # --- Generic chat LLM endpoint (openai_chat backend) ------------------- + # Any OpenAI-compatible chat-completions endpoint. For Azure AI Foundry, + # CHAT_BASE_URL is the deployment's OpenAI-compatible URL (or set + # CHAT_API_VERSION to use the Azure OpenAI client) and CHAT_MODEL is the + # deployment name. + chat_base_url: str | None = Field( + default=None, + description="Base URL of an OpenAI-compatible chat-completions endpoint.", + ) + chat_api_key: SecretStr | None = None + chat_model: str | None = Field( + default=None, description="Chat model / Foundry deployment name." + ) + chat_api_version: str | None = Field( + default=None, + description="Set for Azure OpenAI-style endpoints (uses the AzureOpenAI client).", + ) + chat_temperature: float = Field(default=0.7, ge=0.0, le=2.0) + chat_max_tokens: int = Field(default=4096, ge=256) + chat_max_turns: int = Field(default=20, ge=1, le=200) + chat_reasoning_effort: str | None = Field( + default=None, + description='Reasoning effort for reasoning models on the responses API (e.g. 
"low", "medium", "high").', + ) + + # --- Cosmos DB (default / fallback corpus) ----------------------------- + account_uri: str = Field(description="Cosmos DB account URI (default corpus).") + cosmos_database: str + cosmos_corpus_container: str + cosmos_key: SecretStr | None = None + + # --- Embeddings (default corpus) -------------------------------------- + openai_api_key: SecretStr | None = None + openai_embedding_model: str | None = None + embed_endpoint: str | None = Field( + default=None, + description=( + "Embedding endpoint base URL. Leave unset to use plain OpenAI " + "(api.openai.com). For Azure pass https://.../openai/v1; " + "for a local server pass http://host:port/v1." + ), + ) + embed_query_instruction: str | None = None + + # --- Multi-corpus registry -------------------------------------------- + corpus_registry: str | None = Field( + default=None, + description="JSON string mapping container name -> CorpusConfig overrides.", + ) + corpus_registry_file: str | None = Field( + default=None, + description="Path to a JSON file holding the corpus registry.", + ) + + # --- Reranker (optional) ---------------------------------------------- + baseten_api_key: SecretStr | None = None + baseten_model_url: str | None = None + vllm_reranker_url: str | None = None # local vLLM /score endpoint + + # --- Retriever knobs -------------------------------------------------- + cosmos_retriever_max_turns: int = Field(default=35, ge=1, le=200, alias="COSMOS_RETRIEVER_MAX_TURNS") + cosmos_retriever_threshold_budget: int = Field( + default=16384, ge=1024, alias="COSMOS_RETRIEVER_THRESHOLD_BUDGET" + ) + cosmos_retriever_token_budget: int = Field( + default=32268, ge=4096, alias="COSMOS_RETRIEVER_TOKEN_BUDGET" + ) + cosmos_retriever_search_display_limit: int = Field(default=15, ge=1, le=50) + + # --- HTTP server ------------------------------------------------------ + host: str = Field(default="0.0.0.0") # noqa: S104 binding to all is intended for containers + port: int = 
def _load_registry(self) -> dict[str, dict[str, Any]]:
    """Return the parsed corpus registry (empty dict if none is configured).

    ``corpus_registry_file`` takes precedence over the inline
    ``corpus_registry`` JSON string; when neither is set the registry is
    empty.

    Returns:
        The decoded top-level JSON object, keyed by container name.

    Raises:
        FileNotFoundError: ``corpus_registry_file`` is set but is not a file.
        ValueError: The registry text is not valid JSON, or its top-level
            value is not a JSON object.  The message names which setting
            supplied the text so a bad file is not mistaken for a bad
            inline value.
    """

    if self.corpus_registry_file:
        path = Path(self.corpus_registry_file)
        if not path.is_file():
            raise FileNotFoundError(f"CORPUS_REGISTRY_FILE points at missing file: {path}")
        raw = path.read_text(encoding="utf-8")
        source = f"CORPUS_REGISTRY_FILE={path}"
    elif self.corpus_registry:
        raw = self.corpus_registry
        source = "CORPUS_REGISTRY"
    else:
        return {}

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as exc:
        # Keep the original message prefix and append where the text came from.
        raise ValueError(
            f"corpus_registry is not valid JSON (source: {source}): {exc}"
        ) from exc

    if not isinstance(data, dict):
        raise ValueError(
            "corpus_registry must be a JSON object {container_name: {...}}"
            + f" (source: {source})"
        )
    return data
def _cosmos_credential(self):
    """Pick the Azure credential object handed to the Cosmos SDK.

    :class:`AzureCliCredential` (i.e. whoever ran ``az login``) is the
    default rather than :class:`DefaultAzureCredential`: on Azure VMs the
    broader chain picks up the host's managed identity first, which often
    lives in a *different* AAD tenant from the Cosmos account and produces a
    misleading 401.  Setting ``COSMOS_USE_DEFAULT_CREDENTIAL`` to ``1``,
    ``true`` or ``yes`` (case-insensitive) opts in to the broader chain.
    """

    opt_in_flag = os.environ.get("COSMOS_USE_DEFAULT_CREDENTIAL", "").lower()
    wants_default_chain = opt_in_flag in {"1", "true", "yes"}
    credential_cls = DefaultAzureCredential if wants_default_chain else AzureCliCredential
    return credential_cls()
def build_chat_client(self) -> OpenAI:
    """Return an OpenAI-compatible client for the configured ``CHAT_*`` endpoint.

    Supports plain OpenAI / any OpenAI-compatible server via
    ``CHAT_BASE_URL``, and Azure OpenAI-style endpoints when
    ``CHAT_API_VERSION`` is set (uses :class:`openai.AzureOpenAI`).

    The ``CHAT_*`` settings are described in this class as serving both the
    ``openai_chat`` and ``openai_responses`` backends, so the error messages
    name both instead of only ``openai_chat``.

    Returns:
        An :class:`openai.AzureOpenAI` client when ``chat_api_version`` is
        set, otherwise a plain :class:`openai.OpenAI` client.

    Raises:
        ValueError: ``chat_base_url`` or ``chat_model`` is unset or empty.
    """

    if not self.chat_base_url:
        raise ValueError(
            "CHAT_BASE_URL must be set when INFERENCE_BACKEND=openai_chat or openai_responses."
        )
    if not self.chat_model:
        raise ValueError(
            "CHAT_MODEL (the deployment / model name) must be set when "
            "INFERENCE_BACKEND=openai_chat or openai_responses."
        )

    # Keyless servers still need *some* api_key string, hence the placeholder.
    api_key = (
        self.chat_api_key.get_secret_value() if self.chat_api_key is not None else "EMPTY"
    )
    if self.chat_api_version:
        from openai import AzureOpenAI  # noqa: PLC0415 — optional Azure path

        return AzureOpenAI(
            azure_endpoint=self.chat_base_url,
            api_key=api_key,
            api_version=self.chat_api_version,
        )
    return OpenAI(base_url=self.chat_base_url, api_key=api_key)
def _log_level_to_int(level: str) -> int:
    """Translate a log-level name (or numeric string) into a ``logging`` level.

    Args:
        level: A level name such as ``"info"`` or ``"WARNING"``
            (case-insensitive, surrounding whitespace ignored), or a string
            of decimal digits such as ``"10"``.

    Returns:
        The matching integer level, or ``logging.INFO`` when ``level`` is not
        recognised.
    """
    name = level.strip().upper()
    if name.isdigit():
        return int(name)

    candidate = getattr(logging, name, None)
    # The logging module also exposes non-level attributes under upper-case
    # names (``BASIC_FORMAT`` is a str); only genuine integer levels may be
    # returned, everything else falls back to INFO.
    if isinstance(candidate, int) and not isinstance(candidate, bool):
        return candidate
    return logging.INFO
+The evaluator expects the public BrowseComp+ query/answer files and qrels on +disk, plus a Chroma collection containing the corresponding BrowseComp+ corpus +chunks. + +### 1. Download BrowseComp+ + +Clone the public BrowseComp+ release and follow its instructions to obtain the +decrypted query/answer file: + +```bash +git clone https://github.com/texttron/BrowseComp-Plus external/BrowseComp-Plus +``` + +After setup, you should have files equivalent to: + +```text +external/BrowseComp-Plus/topics-qrels/queries.tsv +external/BrowseComp-Plus/topics-qrels/qrel_golds.txt +external/BrowseComp-Plus/topics-qrels/qrel_evidence.txt +external/BrowseComp-Plus/data/browsecomp_plus_decrypted.jsonl +``` + +### 2. Configure local paths + +Copy `.env.example` to `.env.local` and point these variables at the downloaded +files: + +```bash +BROWSECOMPPLUS_QUERIES_PATH=external/BrowseComp-Plus/topics-qrels/queries.tsv +BROWSECOMPPLUS_QRELS_GOLD_PATH=external/BrowseComp-Plus/topics-qrels/qrel_golds.txt +BROWSECOMPPLUS_QRELS_EVIDENCE_PATH=external/BrowseComp-Plus/topics-qrels/qrel_evidence.txt +BROWSECOMPPLUS_ANSWERS_PATH=external/BrowseComp-Plus/data/browsecomp_plus_decrypted.jsonl +``` + +### 3. Build or provide the BrowseComp+ retrieval collection + +The search harness retrieves from Chroma. For BrowseComp+, create a Chroma +collection named `browsecomp_plus_test` containing the BrowseComp+ corpus chunks, +with document IDs matching the qrel document IDs. Configure your Chroma access in +`.env.local`: + +```bash +CHROMA_API_KEY=... +CHROMA_DATABASE=... +``` + +At minimum, each indexed chunk should preserve: + +- the document/chunk ID used in the qrels, +- text content, +- any metadata your Chroma deployment requires for retrieval. + +The evaluator looks up the collection name from the dataset class, so keeping +the collection name `browsecomp_plus_test` is the least surprising path. + +### 4. 
Run a BrowseComp+ Harness-1 eval + +Set your checkpoint path privately in the environment, then run: + +```bash +set -a && source .env.local && set +a + +PYTHONPATH=. uv run python inference/evaluate_harness1.py \ + --dataset browsecompplus \ + --split test \ + --collection-split test \ + --max-turns 40 \ + --temperature 1.0 \ + --checkpoints harness1="$HARNESS1_TINKER_CHECKPOINT" \ + --output tmp/eval_harness1_browsecompplus.json +``` + +The released Hugging Face checkpoint can be used for model loading and serving, +but the full search evaluation still requires a configured retrieval backend and +the Harness-1 tool environment. + +## Other In-Domain Corpora + +The `web`, `sec`, and `patents` in-domain corpora used in the paper are not +distributed here as public ready-made datasets/indexes. To reproduce those +settings, construct the corresponding data and Chroma collections yourself. We +recommend using the Context-1 data-generation repository as the reference +pipeline: + +https://github.com/chroma-core/context-1-data-gen + +Once your corpora are indexed in Chroma with compatible collection names and +document IDs, the same Harness-1 evaluation scripts can target those datasets. diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/__init__.py b/cosmos-retriever/src/cosmos_retriever/datagen/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/generate_sft_rl_splits.py b/cosmos-retriever/src/cosmos_retriever/datagen/generate_sft_rl_splits.py new file mode 100644 index 0000000..9edd94d --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/datagen/generate_sft_rl_splits.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +"""Generate SFT/RL query splits for all datasets. + +Splits each dataset's train queries into: + - SFT: 30% of train queries + - RL: 70% of train queries + +Output: JSON files per dataset with query_id lists for each split. 
def generate_splits(dataset_name: str) -> dict:
    """Generate SFT/RL splits for a single dataset.

    Loads the dataset through ``get_dataset``, checks that the SFT and RL
    subsets are disjoint and together equal the train split, prints a short
    report, and returns the split description.

    Percentages in the report are shown as ``0.0`` when the denominator is
    empty (a dataset with no queries, or one too small to have any train
    queries) instead of raising ``ZeroDivisionError`` -- the same guard the
    returned ``sft_ratio`` / ``rl_ratio`` values already apply.

    Args:
        dataset_name: Name understood by ``get_dataset``.

    Returns:
        A dict with split info and sorted query_id lists.

    Raises:
        AssertionError: The SFT and RL subsets overlap or do not cover the
            train split.
    """

    def pct(part: int, whole: int) -> float:
        # Percentage of ``part`` in ``whole``; 0.0 for an empty ``whole``.
        return part / whole * 100 if whole else 0.0

    print(f"\n{'='*60}")
    print(f"Dataset: {dataset_name}")
    print(f"{'='*60}")

    ds = get_dataset(dataset_name)

    all_ids = ds.get_all_query_ids()
    train_ids = ds.get_train_query_ids()
    test_ids = ds.get_test_query_ids()
    sft_ids = ds.get_sft_query_ids()
    rl_ids = ds.get_rl_query_ids()

    # Verify no overlap
    sft_set = set(sft_ids)
    rl_set = set(rl_ids)
    train_set = set(train_ids)
    test_set = set(test_ids)

    assert sft_set.isdisjoint(rl_set), "SFT and RL sets overlap!"
    assert sft_set | rl_set == train_set, f"SFT + RL != train set! diff={train_set - (sft_set | rl_set)}"

    overlap = train_set & test_set
    if overlap:
        print(f" WARNING: {len(overlap)} query IDs overlap between train and test (pre-split dataset artifact)")
        # For pre-split datasets, train/test may share IDs — this is expected

    n_all, n_train, n_test = len(all_ids), len(train_ids), len(test_ids)
    n_sft, n_rl = len(sft_ids), len(rl_ids)

    print(f" Total queries: {n_all}")
    print(f" Train queries: {n_train} ({pct(n_train, n_all):.1f}%)")
    print(f" Test queries: {n_test} ({pct(n_test, n_all):.1f}%)")
    print(f" ── SFT queries: {n_sft} ({pct(n_sft, n_train):.1f}% of train, {pct(n_sft, n_all):.1f}% of total)")
    print(f" ── RL queries: {n_rl} ({pct(n_rl, n_train):.1f}% of train, {pct(n_rl, n_all):.1f}% of total)")

    return {
        "dataset": dataset_name,
        "total_queries": n_all,
        "train_queries": n_train,
        "test_queries": n_test,
        "sft_queries": n_sft,
        "rl_queries": n_rl,
        "sft_ratio": n_sft / n_train if train_ids else 0,
        "rl_ratio": n_rl / n_train if train_ids else 0,
        "sft_query_ids": sorted(sft_ids),
        "rl_query_ids": sorted(rl_ids),
        "test_query_ids": sorted(test_ids),
    }
summary.append({ + "dataset": ds_name, + "total": split_info["total_queries"], + "train": split_info["train_queries"], + "test": split_info["test_queries"], + "sft": split_info["sft_queries"], + "rl": split_info["rl_queries"], + }) + except Exception as e: + print(f" ERROR: {e}") + import traceback + traceback.print_exc() + continue + + # Save summary + summary_path = os.path.join(args.output_dir, "summary.json") + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + + # Print summary table + print(f"\n{'='*60}") + print("Summary") + print(f"{'='*60}") + print(f"{'Dataset':<18} {'Total':>6} {'Train':>6} {'Test':>6} {'SFT':>6} {'RL':>6}") + print(f"{'-'*18} {'-'*6} {'-'*6} {'-'*6} {'-'*6} {'-'*6}") + for s in summary: + print(f"{s['dataset']:<18} {s['total']:>6} {s['train']:>6} {s['test']:>6} {s['sft']:>6} {s['rl']:>6}") + print(f"\nAll splits saved to: {args.output_dir}") + + +if __name__ == "__main__": + main() + diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/search_dataset.py b/cosmos-retriever/src/cosmos_retriever/datagen/search_dataset.py new file mode 100644 index 0000000..277e697 --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/datagen/search_dataset.py @@ -0,0 +1,1333 @@ +from abc import ABC, abstractmethod +import ast +from collections import defaultdict +from enum import Enum +from typing import List, Literal, Optional, Set, Tuple +import datasets +import csv +import json +import random +from urllib.parse import urlsplit, urlunsplit +import cosmos_retriever.config as config +from cosmos_retriever.tasks import chunk_ids_to_doc_ids + + +SPLIT_SEED = 42 +TRAIN_RATIO = 0.8 + +# Within the train split, further divide into SFT and RL subsets +SFT_RL_SPLIT_SEED = 123 # Different seed from train/test split for independence +SFT_RATIO = 0.3 # 30% of train queries for SFT, 70% for RL + +# Type alias for fact-level document structure +FactItem = dict # {"fact": str, "chunk_ids": List[str], "is_final_answer": bool} + + +def 
def load_hf_dataset_first_available(
    hf_path: str,
    *,
    split_preferences: Tuple[str, ...] = ("test", "train", "validation"),
) -> datasets.Dataset:
    """Load a HuggingFace dataset and pick the first available preferred split.

    The access token is read defensively: the settings object returned by
    ``config.get_config()`` does not declare a ``huggingface_token`` field,
    so a plain attribute access would raise ``AttributeError``.  When the
    attribute is absent or empty the ``HUGGINGFACE_TOKEN`` / ``HF_TOKEN``
    environment variables are consulted, and ``None`` is passed if nothing
    is configured.

    Args:
        hf_path: Dataset path handed to ``datasets.load_dataset``.
        split_preferences: Split names tried in order; the first one that
            exists and is non-empty wins.

    Returns:
        The first non-empty preferred split, else the first non-empty split
        of any name, else the first split even though it is empty.

    Raises:
        ValueError: The loaded dataset exposes no splits at all.
    """
    import os  # local import: this module does not import os at file level

    cfg = config.get_config()
    token = getattr(cfg, "huggingface_token", None)
    if hasattr(token, "get_secret_value"):
        # Settings may wrap secrets (SecretStr-style); unwrap before use.
        token = token.get_secret_value()
    if not token:
        token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN") or None

    raw = datasets.load_dataset(hf_path, token=token)

    for split_name in split_preferences:
        if split_name in raw and len(raw[split_name]) > 0:
            return raw[split_name]

    # Fallback to first non-empty split, then first split if all are empty.
    split_names = list(raw.keys())
    for split_name in split_names:
        if len(raw[split_name]) > 0:
            return raw[split_name]

    if not split_names:
        raise ValueError(f"HuggingFace dataset {hf_path!r} has no splits")
    return raw[split_names[0]]
Prefer using get_dataset(name_str) directly.""" + BROWSECOMPPLUS = "browsecompplus" + BC_PLUS = "bc_plus" + EPSTEIN = "epstein" + LONGSEALQA = "longsealqa" + SEAL0QA = "seal0qa" + FRAMES = "frames" + HOTPOTQA_SUBSET = "hotpotqa_subset" + PODCASTS_TEST = "podcasts_test" + WEB = "web" + PATENTS = "patents" + SEC = "sec" + WEB_SIMPLE = "web_simple" + SEC_SIMPLE = "sec_simple" + DEEPSEARCH = "deepsearch" + GAIA = "gaia" + OTHER = "other" + + +# ============================================================================ +# Search Dataset Base Class +# ============================================================================ + + +class SearchDataset(ABC): + """ + Abstract base class for search datasets. + + A search dataset is a dataset of search queries and the documents that are required + to answer the query or that are relevant to the query. + + Subclasses must implement `_load_dataset()` to populate `_search_queries_dataset` + with a HuggingFace Dataset containing the following columns: + - query_id: The query id + - query: The search query + - document_ids: The documents that are required to answer the query or that are relevant to the query. + For document-level evaluation: List[str] of document/chunk IDs. + For fact-level evaluation: List[FactItem] where each FactItem has + {"fact": str, "chunk_ids": List[str], "is_final_answer": bool}. + - answer: The answer to the query + + Subclasses can override `evaluation_mode` property to change evaluation behavior: + - "document": Standard document/chunk-level evaluation (default) + - "fact": Fact-level evaluation where a fact is found if ANY of its chunk_ids are retrieved + + For final_answer_recall evaluation: + - Document-level datasets can override `_get_final_answer_document_ids()` to specify + which document IDs are "final answer" documents (e.g., gold vs evidence in BrowseCompPlus). + - Fact-level datasets automatically use facts where is_final_answer=True. 
+ """ + + _search_queries_dataset: datasets.Dataset + _query_index: dict # Maps query_id -> row dict for O(1) lookups + _train_query_ids: List[str] # Query IDs in the train split + _test_query_ids: List[str] # Query IDs in the test split + + # Cosmos container configuration - override in subclasses. + # A list (typically of length 1 in the Cosmos port; load balancing is + # handled server-side via RU/s rather than via per-request sharding). + COSMOS_CONTAINERS: List[str] = [] + # Optional split-specific containers (if not set, falls back to COSMOS_CONTAINERS) + COSMOS_CONTAINERS_TRAIN: Optional[List[str]] = None + COSMOS_CONTAINERS_TEST: Optional[List[str]] = None + + def __init__(self) -> None: + # Subclass loads dataset into self._search_queries_dataset + self._load_dataset() + + # Build common indices + self._build_query_index() + self._create_train_test_split() + + @abstractmethod + def _load_dataset(self) -> None: + """Load the dataset into self._search_queries_dataset. Implemented by subclasses.""" + pass + + @property + @abstractmethod + def name(self) -> str: + """Return the name identifier for this dataset.""" + pass + + @property + def evaluation_mode(self) -> Literal["document", "fact"]: + """Return the evaluation mode for this dataset. + + - "document": Standard document/chunk-level evaluation. document_ids is List[str]. + - "fact": Fact-level evaluation. document_ids is List[FactItem] where each fact + has chunk_ids. A fact counts as found if ANY of its chunk_ids are retrieved. + + Override this in subclasses that use fact-level evaluation. + """ + return "document" + + def get_cosmos_containers( + self, split: Optional[Literal["train", "test"]] = None + ) -> List[str]: + """Get the Cosmos container names that back this dataset. + + Args: + split: If provided, return containers specific to that split. + If None, returns the default containers. + + Returns: + A list of Cosmos container names. 
def _build_query_index(self) -> None:
    """Build ``self._query_index`` for O(1) lookups by query id.

    Each row of ``self._search_queries_dataset`` becomes one entry keyed by
    ``str(row["query_id"])`` and holding ``query_id``, ``query``,
    ``document_ids`` and ``answer``.

    Rows are consumed by iterating the dataset directly rather than by
    indexing ``range(len(...))``, and the evaluation mode is read once.  The
    index is assigned to ``self`` only after every row has been processed,
    so a failure part-way through does not leave a partial index behind.
    """
    # Document-level evaluation compares against model outputs, which are
    # strings, so IDs are stringified and normalized in that mode only.
    document_mode = self.evaluation_mode == "document"

    index: dict = {}
    for row in self._search_queries_dataset:
        # Handle document_ids that may be stored as string instead of list
        # TODO: We should fix this in the dataset itself.
        document_ids = row["document_ids"]
        if isinstance(document_ids, str):
            document_ids = ast.literal_eval(document_ids)
        if document_mode:
            document_ids = [
                normalize_document_id(str(doc_id)) for doc_id in document_ids
            ]
        # Ensure query_id is always a string
        query_id = str(row["query_id"])
        index[query_id] = {
            "query_id": query_id,
            "query": row["query"],
            "document_ids": document_ids,
            "answer": row["answer"],
        }

    self._query_index = index
def get_sft_query_ids(self) -> List[str]:
    """Return a copy of the SFT query ids (30% of the train split).

    The SFT/RL sub-split is computed lazily: it is created on the first call
    that finds no ``_sft_query_ids`` attribute on the instance.
    """
    split_is_missing = not hasattr(self, "_sft_query_ids")
    if split_is_missing:
        self._create_sft_rl_split()
    # Hand out a copy so callers cannot mutate the cached split.
    sft_ids = self._sft_query_ids
    return sft_ids.copy()
+ """ + if split == "train": + query_ids = self._train_query_ids + elif split == "test": + query_ids = self._test_query_ids + else: + query_ids = list(self._query_index.keys()) + + query_id = random.choice(query_ids) + return (query_id, self._query_index[query_id]["query"]) + + def get_all_query_ids( + self, split: Optional[Literal["train", "test", "sft", "rl"]] = None + ) -> List[str]: + """Return all query ids contained in the dataset. + + Args: + split: If provided, only return query ids from the specified split. + - "train": All train queries (80% of data) + - "test": All test queries (20% of data) + - "sft": SFT subset of train queries (30% of train = 24% of total) + - "rl": RL subset of train queries (70% of train = 56% of total) + - None: All query ids + """ + if split == "train": + return self._train_query_ids.copy() + elif split == "test": + return self._test_query_ids.copy() + elif split == "sft": + return self.get_sft_query_ids() + elif split == "rl": + return self.get_rl_query_ids() + return list(self._query_index.keys()) + + def get_expected_document_ids(self, query_id: str) -> List[str]: + """Get the expected document/chunk ids for a given query id. + + For document-level datasets: returns the document_ids list directly. + For fact-level datasets: returns a flattened list of all chunk_ids from all facts. + + Returns a list of document/chunk IDs. + """ + return list(self._get_all_relevant_chunk_ids(query_id)) + + def get_expected_facts(self, query_id: str) -> List[FactItem]: + """Get the expected facts for a given query id. + + Only meaningful for fact-level datasets (evaluation_mode == "fact"). + For document-level datasets, this returns an empty list. 
def get_expected_facts(self, query_id: str) -> List[FactItem]:
    """Return the fact annotations stored for ``query_id``.

    Facts exist only for fact-level datasets (``evaluation_mode == "fact"``).
    Calling this on any other dataset raises; it does not return an empty
    list.

    Returns:
        The stored list of fact objects, each with keys:
        - "fact": str - description of the fact
        - "chunk_ids": List[str] - chunk IDs containing this fact
        - "is_final_answer": bool - whether this fact is the final answer

    Raises:
        ValueError: If the dataset's evaluation mode is not ``"fact"``.
    """
    if self.evaluation_mode != "fact":
        raise ValueError(f"Dataset {self.name} is not a fact-level dataset")
    return self._query_index[query_id]["document_ids"]


def get_expected_answer(self, query_id: str) -> str:
    """Return the reference answer stored for ``query_id``."""
    return self._query_index[query_id]["answer"]


def get_query_by_id(self, query_id: str) -> Tuple[str, str]:
    """Look up a query by id.

    Returns:
        A ``(query_id, query_text)`` tuple taken from the indexed row.
    """
    row = self._query_index[query_id]
    return (row["query_id"], row["query"])


def _get_all_relevant_chunk_ids(self, query_id: str) -> Set[str]:
    """Return every relevant id for ``query_id`` as a set.

    Fact-level datasets store fact objects under ``document_ids``; their
    ``chunk_ids`` lists are flattened into one set. Any other evaluation
    mode stores a flat list of ids, which is returned as a set.
    """
    entries = self._query_index[query_id]["document_ids"]
    if self.evaluation_mode != "fact":
        return set(entries)
    return {chunk_id for fact in entries for chunk_id in fact["chunk_ids"]}
def _get_final_answer_document_ids(self, query_id: str) -> Set[str]:
    """Return the ids that count as "final answer" for ``query_id``.

    Fact-level datasets: the chunk ids of every fact flagged
    ``is_final_answer``. Other datasets: all stored ``document_ids``;
    subclasses override this to narrow the set to gold documents.
    """
    entries = self._query_index[query_id]["document_ids"]
    if self.evaluation_mode != "fact":
        # Document-level default: every listed document is a final-answer document.
        return set(entries)
    return {
        chunk_id
        for fact in entries
        if fact.get("is_final_answer", False)
        for chunk_id in fact["chunk_ids"]
    }


def _get_final_answer_facts(self, query_id: str) -> List[FactItem]:
    """Return the facts flagged ``is_final_answer`` for ``query_id``.

    Datasets that are not fact-level yield an empty list.
    """
    if self.evaluation_mode != "fact":
        return []
    final_facts = []
    for fact in self._query_index[query_id]["document_ids"]:
        if fact.get("is_final_answer", False):
            final_facts.append(fact)
    return final_facts
def evaluate_results_recall(
    self, query_id: str, retrieved_chunk_ids: List[str]
) -> float:
    """Compute recall of ``retrieved_chunk_ids`` for ``query_id``.

    Fact-level datasets: share of the query's facts that have at least one
    of their ``chunk_ids`` among the retrieved ids.

    Other datasets: retrieved chunk ids are mapped to document ids with
    ``chunk_ids_to_doc_ids``; recall is
    true positives / (true positives + false negatives) over document ids.

    Returns 0.0 when the query has no expected facts/documents.
    """
    retrieved = set(retrieved_chunk_ids)

    if self.evaluation_mode != "fact":
        retrieved_docs: Set[str] = chunk_ids_to_doc_ids(retrieved)
        relevant_docs: Set[str] = set(self._query_index[query_id]["document_ids"])
        hits = len(retrieved_docs.intersection(relevant_docs))
        missed = len(relevant_docs - retrieved_docs)
        total = hits + missed
        return 0.0 if total == 0 else hits / total

    facts = self._query_index[query_id]["document_ids"]
    if len(facts) == 0:
        return 0.0
    found = 0
    for fact in facts:
        # A fact counts as found when any one of its chunks was retrieved.
        if set(fact["chunk_ids"]).intersection(retrieved):
            found += 1
    return found / len(facts)
def evaluate_results_final_answer_recall(
    self, query_id: str, retrieved_chunk_ids: List[str]
) -> float:
    """Compute recall restricted to "final answer" (gold) items.

    Fact-level datasets: share of the facts returned by
    ``_get_final_answer_facts`` that have at least one of their
    ``chunk_ids`` among the retrieved ids.

    Other datasets: retrieved chunk ids are mapped to document ids with
    ``chunk_ids_to_doc_ids`` and compared against
    ``_get_final_answer_document_ids``, which subclasses may override to
    return only gold documents.

    Returns 0.0 when the query has no final-answer facts/documents.
    """
    retrieved = set(retrieved_chunk_ids)

    if self.evaluation_mode != "fact":
        retrieved_docs: Set[str] = chunk_ids_to_doc_ids(retrieved)
        gold_docs: Set[str] = self._get_final_answer_document_ids(query_id)
        if len(gold_docs) == 0:
            return 0.0
        return len(retrieved_docs.intersection(gold_docs)) / len(gold_docs)

    gold_facts = self._get_final_answer_facts(query_id)
    if len(gold_facts) == 0:
        return 0.0
    found = 0
    for fact in gold_facts:
        # Any single retrieved chunk is enough to count the fact as found.
        if set(fact["chunk_ids"]).intersection(retrieved):
            found += 1
    return found / len(gold_facts)
def evaluate_results_precision(
    self, query_id: str, retrieved_chunk_ids: List[str]
) -> float:
    """Compute precision of ``retrieved_chunk_ids`` for ``query_id``.

    Fact-level datasets: share of the distinct retrieved chunk ids that
    appear in any fact's ``chunk_ids``.

    Other datasets: retrieved chunk ids are mapped to document ids with
    ``chunk_ids_to_doc_ids``; precision is
    true positives / (true positives + false positives) over document ids.

    Returns 0.0 when nothing was retrieved.
    """
    retrieved = set(retrieved_chunk_ids)

    if self.evaluation_mode == "fact":
        if len(retrieved) == 0:
            return 0.0
        relevant_chunks = self._get_all_relevant_chunk_ids(query_id)
        return len(retrieved.intersection(relevant_chunks)) / len(retrieved)

    retrieved_docs: Set[str] = chunk_ids_to_doc_ids(retrieved)
    relevant_docs: Set[str] = set(self._query_index[query_id]["document_ids"])
    hits = len(retrieved_docs.intersection(relevant_docs))
    spurious = len(retrieved_docs - relevant_docs)
    total = hits + spurious
    return 0.0 if total == 0 else hits / total


def evaluate_results_f1_score(
    self, query_id: str, retrieved_chunk_ids: List[str]
) -> float:
    """Compute F1 = 2 * (precision * recall) / (precision + recall).

    Precision and recall come from ``evaluate_results_precision`` and
    ``evaluate_results_recall``, so both evaluation modes are supported.
    Returns 0.0 when precision and recall are both zero.
    """
    precision = self.evaluate_results_precision(query_id, retrieved_chunk_ids)
    recall = self.evaluate_results_recall(query_id, retrieved_chunk_ids)
    denominator = precision + recall
    return 0.0 if denominator == 0 else 2 * (precision * recall) / denominator


@classmethod
def from_known_dataset(cls, name: "SearchDatasetName") -> "SearchDataset":
    """Backward-compatible factory; new code should call ``get_dataset`` directly.

    Args:
        name: Enum-like member whose ``.value`` is the registry key.
    """
    dataset_key = name.value
    return get_dataset(dataset_key)
# ============================================================================
# Pre-Split Dataset Base Class
# ============================================================================


class PreSplitSearchDataset(SearchDataset):
    """Base class for datasets published as separate train and test HF repos.

    The two repos are loaded individually, their query ids are remembered as
    the canonical train/test partition, and the rows are concatenated into a
    single dataset for unified access.

    Subclasses must define:
    - HF_PATH_TRAIN: HuggingFace path for the train data
    - HF_PATH_TEST: HuggingFace path for the test data
    - the ``name`` property

    Subclasses may define:
    - HF_SPLIT_TRAIN: split name inside the train repo (default: "train")
    - HF_SPLIT_TEST: split name inside the test repo (default: "test")
    - ``_post_load_setup()`` for extra processing (e.g. gold_document_ids)
    """

    HF_PATH_TRAIN: str
    HF_PATH_TEST: str
    HF_SPLIT_TRAIN: str = "train"
    HF_SPLIT_TEST: str = "test"

    def _load_dataset(self) -> None:
        """Load both repos, record their query ids, and merge the rows."""
        hf_token = config.get_config().huggingface_token

        train_part = datasets.load_dataset(self.HF_PATH_TRAIN, token=hf_token)[
            self.HF_SPLIT_TRAIN
        ]
        test_part = datasets.load_dataset(self.HF_PATH_TEST, token=hf_token)[
            self.HF_SPLIT_TEST
        ]

        # Remember which ids came from which repo before the rows are merged.
        self._presplit_train_ids = list(map(str, train_part["query_id"]))
        self._presplit_test_ids = list(map(str, test_part["query_id"]))

        self._search_queries_dataset = datasets.concatenate_datasets(
            [train_part, test_part]
        )

        # Subclass hook; runs once the merged dataset is available.
        self._post_load_setup()

    def _post_load_setup(self) -> None:
        """Hook for subclass-specific setup after loading; no-op by default."""
        return None

    def _create_train_test_split(self) -> None:
        """Adopt the ids recorded at load time as the train/test partition."""
        self._train_query_ids = self._presplit_train_ids
        self._test_query_ids = self._presplit_test_ids


class SingleSplitSearchDataset(SearchDataset):
    """Helper for eval-only corpora that expose a single HF split.

    Every query id is exposed through both the train and the test
    partition, so sampling from either one covers the full set.
    """

    HF_PATH: str
    HF_SPLIT_PREFERENCES: Tuple[str, ...] = ("test", "train", "validation")

    def _load_dataset(self) -> None:
        """Load via ``load_hf_dataset_first_available`` using the split preferences."""
        self._search_queries_dataset = load_hf_dataset_first_available(
            self.HF_PATH, split_preferences=self.HF_SPLIT_PREFERENCES
        )

    def _create_train_test_split(self) -> None:
        """Assign the same sorted id list to both partitions."""
        every_id = sorted(self._query_index.keys())
        self._train_query_ids = every_id
        self._test_query_ids = every_id
# ============================================================================
# BrowseComp+ Dataset
# ============================================================================


class BrowseCompPlusDataset(SearchDataset):
    """BrowseComp+ search dataset built from local qrels, queries and answers files.

    File locations come from the config object (``browsecompplus_*_path``).
    Gold and evidence qrels are merged into ``document_ids`` for overall
    recall; the gold ids alone are kept for final-answer recall.
    """

    _gold_document_ids: dict[str, Set[str]]  # Maps query_id -> gold document IDs

    # Container name comes from config.cosmos_corpus_container (see get_cosmos_containers).
    COSMOS_CONTAINERS = ["browsecomp_corpus_container"]

    def get_cosmos_containers(self, split=None):
        """Return the configured corpus container; ``split`` is ignored."""
        return [config.get_config().cosmos_corpus_container]

    @property
    def name(self) -> str:
        return "browsecompplus"

    def _load_dataset(self) -> None:
        """Assemble the query dataset from the qrels, queries and answers files."""
        cfg = config.get_config()

        qrels_gold = self._load_qrels(cfg.browsecompplus_qrels_gold_path)
        qrels_evidence = self._load_qrels(cfg.browsecompplus_qrels_evidence_path)

        # Gold ids are kept separately for final_answer_recall.
        self._gold_document_ids = {
            query_id: set(doc_ids) for query_id, doc_ids in qrels_gold.items()
        }

        # Overall recall counts gold and evidence documents alike. Iterating
        # the per-query mapping yields its doc ids (the mapping's keys).
        qrels: dict[str, list] = defaultdict(list)
        for source in (qrels_gold, qrels_evidence):
            for query_id, doc_ids in source.items():
                qrels[query_id].extend(doc_ids)

        queries = self._load_queries(cfg.browsecompplus_queries_path)
        answers = self._load_decrypted_answers(cfg.browsecompplus_answers_path)

        query_ids = list(queries.keys())
        self._search_queries_dataset = datasets.Dataset.from_dict(
            {
                "query_id": query_ids,
                "query": [queries[query_id] for query_id in query_ids],
                "document_ids": [qrels[query_id] for query_id in query_ids],
                "answer": [answers[query_id] for query_id in query_ids],
            }
        )

    def _get_final_answer_document_ids(self, query_id: str) -> Set[str]:
        """Return only gold document IDs (excluding evidence documents)."""
        return self._gold_document_ids.get(query_id, set())

    @staticmethod
    def _load_qrels(path: str) -> dict[str, dict[str, int]]:
        """Load qrels from a whitespace-separated TREC-format file.

        Fields 1, 3 and 4 of each line are read as query id, doc id and
        integer relevance. Blank lines are skipped.

        Returns:
            Mapping of query id -> {doc id -> relevance}.
        """
        qrels: dict[str, dict[str, int]] = {}
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                if not parts:
                    # A blank (e.g. trailing) line used to raise IndexError.
                    continue
                query_id, doc_id, relevance = parts[0], parts[2], int(parts[3])
                qrels.setdefault(query_id, {})[doc_id] = relevance
        return qrels

    @staticmethod
    def _load_queries(path: str) -> dict[str, str]:
        """Load queries from a TSV file whose first two columns are id and text.

        Empty rows are skipped.
        """
        queries: dict[str, str] = {}
        # newline="" lets the csv module handle line endings itself, as its
        # documentation requires for file objects.
        with open(path, newline="", encoding="utf-8") as fd:
            for row in csv.reader(fd, delimiter="\t", quotechar='"'):
                if not row:
                    continue
                queries[row[0]] = row[1]
        return queries

    @staticmethod
    def _load_decrypted_answers(path: str) -> dict:
        """Load answers from a JSONL file with ``query_id`` and ``answer`` keys.

        Blank lines are skipped.
        """
        answers = {}
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                doc = json.loads(line)
                answers[doc["query_id"]] = doc["answer"]
        return answers
# ============================================================================
# Other Datasets
# ============================================================================


class WebDataset(SearchDataset):
    """Web search dataset (document-level evaluation).

    Test queries are loaded from ``HF_PATH_TEST`` and train queries from
    ``HF_PATH_TRAIN``. When both load, they are used as the train/test
    partition as published. When only one loads, that dataset is used alone
    and the base-class split (per the original comments, a random 80/20
    split) is applied to it. Loading fails only when neither is available.
    """

    HF_PATH_TRAIN = "kellyhongg/1_17_web_train"
    HF_PATH_TEST = "kellyhongg/1_17_web_test"
    COSMOS_CONTAINERS_TRAIN = [f"web_train_1_17_replica_{i}" for i in range(1, 45)]
    COSMOS_CONTAINERS_TEST = [f"web_test_1_17_replica_{i}" for i in range(1, 45)]

    _gold_document_ids: dict[str, Set[str]]
    _has_presplit: bool = False  # Whether we have separate train/test data

    @property
    def name(self) -> str:
        return "web"

    @property
    def evaluation_mode(self) -> Literal["document", "fact"]:
        return "document"

    def _load_dataset(self) -> None:
        """Load whichever of the train/test datasets is available.

        Raises:
            ValueError: If neither the train nor the test data could be loaded.
        """
        # Local import: the module-level import block is not visible here.
        import logging

        logger = logging.getLogger(__name__)
        cfg = config.get_config()
        token = cfg.huggingface_token

        test_ds = None
        train_ds = None

        # Loading is best-effort on purpose (see the fallbacks below), but a
        # failure is logged so that a silent fallback can be diagnosed.
        try:
            test_ds = datasets.load_dataset(self.HF_PATH_TEST, token=token)["test"]
        except Exception:
            logger.warning(
                "WebDataset: could not load test data from %s",
                self.HF_PATH_TEST,
                exc_info=True,
            )

        try:
            raw_train = datasets.load_dataset(self.HF_PATH_TRAIN, token=token)
            # Pick the first non-empty split.
            for split_name in ["train", "test"]:
                if split_name in raw_train and len(raw_train[split_name]) > 0:
                    train_ds = raw_train[split_name]
                    break
        except Exception:
            logger.warning(
                "WebDataset: could not load train data from %s",
                self.HF_PATH_TRAIN,
                exc_info=True,
            )

        if train_ds is not None and test_ds is not None:
            # Both available: use the published partition.
            self._has_presplit = True
            self._presplit_train_ids = [str(qid) for qid in train_ds["query_id"]]
            self._presplit_test_ids = [str(qid) for qid in test_ds["query_id"]]
            self._search_queries_dataset = datasets.concatenate_datasets(
                [train_ds, test_ds]
            )
        elif test_ds is not None:
            # Only test available: split it with the base-class logic.
            self._has_presplit = False
            self._search_queries_dataset = test_ds
        elif train_ds is not None:
            # Only train available: split it with the base-class logic.
            self._has_presplit = False
            self._search_queries_dataset = train_ds
        else:
            raise ValueError("Neither train nor test data could be loaded for WebDataset")

        # gold_document_ids may arrive as stringified Python lists.
        gold_document_ids = [
            ast.literal_eval(docids) if isinstance(docids, str) else docids
            for docids in self._search_queries_dataset["gold_document_ids"]
        ]
        self._gold_document_ids = {
            str(query_id): set(doc_ids)
            for query_id, doc_ids in zip(
                self._search_queries_dataset["query_id"], gold_document_ids
            )
        }

    def _create_train_test_split(self) -> None:
        """Use the published partition if both datasets loaded, else the base split."""
        if self._has_presplit:
            self._train_query_ids = self._presplit_train_ids
            self._test_query_ids = self._presplit_test_ids
        else:
            super()._create_train_test_split()

    def _get_final_answer_document_ids(self, query_id: str) -> Set[str]:
        """Return only gold document IDs (excluding evidence documents)."""
        return self._gold_document_ids.get(query_id, set())
class EpsteinDataset(SearchDataset):
    """Epstein search dataset, read from the "test" split of ``HF_PATH``."""

    HF_PATH = "kellyhongg/epstein_1_14"

    _gold_document_ids: dict[str, Set[str]]

    @property
    def name(self) -> str:
        return "epstein"

    @property
    def evaluation_mode(self) -> Literal["document", "fact"]:
        """Epstein uses document-level evaluation."""
        return "document"

    def _load_dataset(self) -> None:
        """Load the test split and index gold document ids by query id."""
        # Pass the configured token, as the other loaders in this file do.
        self._search_queries_dataset = datasets.load_dataset(
            self.HF_PATH, token=config.get_config().huggingface_token
        )["test"]
        # Values may be stringified lists or real lists; literal_eval accepts
        # only strings, so real lists are used as they are.
        gold_document_ids = [
            ast.literal_eval(docids) if isinstance(docids, str) else docids
            for docids in self._search_queries_dataset["gold_document_ids"]
        ]

        self._gold_document_ids = {
            str(query_id): set(doc_ids)
            for query_id, doc_ids in zip(
                self._search_queries_dataset["query_id"], gold_document_ids
            )
        }

    def _get_final_answer_document_ids(self, query_id: str) -> Set[str]:
        """Return only gold document IDs (excluding evidence documents)."""
        return self._gold_document_ids.get(query_id, set())


class PatentsDataset(PreSplitSearchDataset):
    """Patents search dataset with pre-split train/test HF paths."""

    HF_PATH_TRAIN = "kellyhongg/1_18_patents_train"
    HF_PATH_TEST = "kellyhongg/1_18_patents_test"
    HF_SPLIT_TRAIN = "train"
    HF_SPLIT_TEST = "test"
    COSMOS_CONTAINERS_TRAIN = [f"patents_train_1_18_replica_{i}" for i in range(1, 45)]
    COSMOS_CONTAINERS_TEST = [f"patents_test_1_18_replica_{i}" for i in range(1, 45)]

    _gold_document_ids: dict[str, Set[str]]

    @property
    def name(self) -> str:
        return "patents"

    @property
    def evaluation_mode(self) -> Literal["document", "fact"]:
        return "document"

    def _post_load_setup(self) -> None:
        """Index gold document ids by query id from the combined dataset."""
        gold_document_ids = [
            ast.literal_eval(docids) if isinstance(docids, str) else docids
            for docids in self._search_queries_dataset["gold_document_ids"]
        ]
        self._gold_document_ids = {
            str(query_id): set(doc_ids)
            for query_id, doc_ids in zip(
                self._search_queries_dataset["query_id"], gold_document_ids
            )
        }

    def _get_final_answer_document_ids(self, query_id: str) -> Set[str]:
        """Return only gold document IDs (excluding evidence documents)."""
        return self._gold_document_ids.get(query_id, set())
class SECDataset(PreSplitSearchDataset):
    """SEC Filings search dataset with pre-split train/test HF paths.

    Both splits retrieve from sec_1_4 (full combined corpus, ~2.1M chunks).
    The previous sec_train_1_14 collection was missing ~15% of GT chunk IDs
    for train queries. Test HF data uses kellyhongg/sec_test_new, which
    filters out tasks with overlapping chunks.
    """

    HF_PATH_TRAIN = "kellyhongg/1_18_sec_train"
    HF_PATH_TEST = "kellyhongg/sec_test_new"
    HF_SPLIT_TRAIN = "train"
    HF_SPLIT_TEST = "test"
    COSMOS_CONTAINERS_TRAIN = ["sec_1_4"]
    COSMOS_CONTAINERS_TEST = ["sec_1_4"]

    @property
    def name(self) -> str:
        return "sec"

    @property
    def evaluation_mode(self) -> Literal["document", "fact"]:
        """SEC Filings uses fact-level evaluation."""
        return "fact"


class PodcastsTestSet(SearchDataset):
    """Podcasts dataset, read from the "test" split of ``HF_PATH``."""

    HF_PATH = "kellyhongg/1_25_podcasts_test"

    @property
    def name(self) -> str:
        return "podcasts_test"

    @property
    def evaluation_mode(self) -> Literal["document", "fact"]:
        """Podcasts uses document-level evaluation."""
        return "document"

    def _load_dataset(self) -> None:
        """Load the test split and index stringified gold ids by query id."""
        hf_token = config.get_config().huggingface_token
        self._search_queries_dataset = datasets.load_dataset(
            self.HF_PATH, token=hf_token
        )["test"]
        gold_lists = [
            ast.literal_eval(raw_ids) if isinstance(raw_ids, str) else raw_ids
            for raw_ids in self._search_queries_dataset["gold_document_ids"]
        ]

        # Gold ids are stored as strings because model outputs are strings.
        query_ids = self._search_queries_dataset["query_id"]
        self._gold_document_ids = {
            str(query_id): {str(doc_id) for doc_id in doc_ids}
            for query_id, doc_ids in zip(query_ids, gold_lists)
        }

    def _get_final_answer_document_ids(self, query_id: str) -> Set[str]:
        """Return only gold document IDs (excluding evidence documents)."""
        return self._gold_document_ids.get(query_id, set())
class WebSimpleDataset(PreSplitSearchDataset):
    """Web Simple search dataset with pre-split train/test HF paths."""

    HF_PATH_TRAIN = "kellyhongg/1_25_web_simple_train"
    HF_PATH_TEST = "kellyhongg/1_25_web_simple_test"
    HF_SPLIT_TRAIN = "train"
    HF_SPLIT_TEST = "test"
    # Same as WebDataset
    COSMOS_CONTAINERS_TRAIN = [f"web_train_1_17_replica_{i}" for i in range(1, 45)]
    COSMOS_CONTAINERS_TEST = [f"web_test_1_17_replica_{i}" for i in range(1, 45)]

    _gold_document_ids: dict[str, Set[str]]

    @property
    def name(self) -> str:
        return "web_simple"

    @property
    def evaluation_mode(self) -> Literal["document", "fact"]:
        return "document"

    def _post_load_setup(self) -> None:
        """Index stringified gold document ids by query id from the combined dataset."""
        gold_lists = [
            ast.literal_eval(raw_ids) if isinstance(raw_ids, str) else raw_ids
            for raw_ids in self._search_queries_dataset["gold_document_ids"]
        ]
        query_ids = self._search_queries_dataset["query_id"]
        self._gold_document_ids = {
            str(query_id): {str(doc_id) for doc_id in doc_ids}
            for query_id, doc_ids in zip(query_ids, gold_lists)
        }

    def _get_final_answer_document_ids(self, query_id: str) -> Set[str]:
        """Return only gold document IDs (excluding evidence documents)."""
        return self._gold_document_ids.get(query_id, set())


class SECSimpleDataset(PreSplitSearchDataset):
    """SEC Simple search dataset with pre-split train/test HF paths."""

    HF_PATH_TRAIN = "kellyhongg/1_25_sec_simple_train"
    HF_PATH_TEST = "kellyhongg/1_25_sec_simple_test"
    HF_SPLIT_TRAIN = "train"
    HF_SPLIT_TEST = "test"
    COSMOS_CONTAINERS_TRAIN = ["sec_1_4"]
    COSMOS_CONTAINERS_TEST = ["sec_1_4"]

    @property
    def name(self) -> str:
        return "sec_simple"

    @property
    def evaluation_mode(self) -> Literal["document", "fact"]:
        """SEC Simple uses fact-level evaluation."""
        return "fact"
# ============================================================================
# Additional Benchmark Datasets (Kelly April 2026 refresh)
# ============================================================================


class BCPlusDataset(SingleSplitSearchDataset):
    """BrowseComp+ benchmark loaded directly from HuggingFace."""

    HF_PATH = "kellyhongg/bc_plus"
    HF_SPLIT_PREFERENCES = ("test", "train")
    COSMOS_CONTAINERS = ["browsecomp_corpus_container"]

    _gold_document_ids: dict[str, Set[str]]

    def get_cosmos_containers(self, split=None):
        """Return the configured corpus container; ``split`` is ignored."""
        return [config.get_config().cosmos_corpus_container]

    @property
    def name(self) -> str:
        return "bc_plus"

    def _load_dataset(self) -> None:
        """Load the first available split and index normalized gold ids by query id."""
        self._search_queries_dataset = load_hf_dataset_first_available(
            self.HF_PATH, split_preferences=self.HF_SPLIT_PREFERENCES
        )

        gold_lists = [
            ast.literal_eval(raw_ids) if isinstance(raw_ids, str) else raw_ids
            for raw_ids in self._search_queries_dataset["gold_document_ids"]
        ]
        query_ids = self._search_queries_dataset["query_id"]
        # Gold ids are stringified and passed through normalize_document_id.
        self._gold_document_ids = {
            str(query_id): {normalize_document_id(str(doc_id)) for doc_id in doc_ids}
            for query_id, doc_ids in zip(query_ids, gold_lists)
        }

    def _get_final_answer_document_ids(self, query_id: str) -> Set[str]:
        """Return the gold document ids recorded for ``query_id`` (empty if unknown)."""
        return self._gold_document_ids.get(query_id, set())


class LongSealQADataset(SingleSplitSearchDataset):
    """LongSealQA retrieval dataset backed by a Chroma collection."""

    # NOTE(review): the docstring mentions Chroma while the class declares a
    # Cosmos container name - confirm which backend is current.
    HF_PATH = "kellyhongg/longsealqa"
    HF_SPLIT_PREFERENCES = ("test", "train")
    COSMOS_CONTAINERS = ["longsealqa"]

    @property
    def name(self) -> str:
        return "longsealqa"


class Seal0QADataset(SingleSplitSearchDataset):
    """Seal0QA open-web retrieval dataset (web search based)."""

    HF_PATH = "kellyhongg/seal0qa"
    HF_SPLIT_PREFERENCES = ("test", "train")

    @property
    def name(self) -> str:
        return "seal0qa"


class FramesDataset(SingleSplitSearchDataset):
    """FRAMES benchmark for Wikipedia-focused retrieval."""

    HF_PATH = "kellyhongg/frames"
    HF_SPLIT_PREFERENCES = ("test", "train")

    @property
    def name(self) -> str:
        return "frames"


class HotpotQASubsetDataset(SingleSplitSearchDataset):
    """HotpotQA subset benchmark for Wikipedia-focused retrieval."""

    HF_PATH = "kellyhongg/hotpotqa_subset"
    HF_SPLIT_PREFERENCES = ("test", "train")

    @property
    def name(self) -> str:
        return "hotpotqa_subset"
# ============================================================================
# SEC Filings Dataset (legacy - uses HuggingFace kellyhongg/sec_filings)
# ============================================================================


class SECFilingsDataset(SearchDataset):
    """SEC Filings search dataset from HuggingFace.

    This dataset uses fact-level evaluation where document_ids contains a list of
    fact objects, each with chunk_ids. A fact is considered found if ANY of its
    chunk_ids are retrieved.
    """

    HF_PATH = "kellyhongg/sec_filings"
    COSMOS_CONTAINERS = [f"latest_sec_filings_replica_{i}" for i in range(46)]

    @property
    def name(self) -> str:
        return "sec_filings"

    @property
    def evaluation_mode(self) -> Literal["document", "fact"]:
        """SEC Filings uses fact-level evaluation."""
        return "fact"

    def _load_dataset(self) -> None:
        """Load the "test" split of ``HF_PATH``."""
        # Pass the configured token, as the other loaders in this file do.
        self._search_queries_dataset = datasets.load_dataset(
            self.HF_PATH, token=config.get_config().huggingface_token
        )["test"]

    def get_cosmos_containers(
        self, split: Optional[Literal["train", "test"]] = None
    ) -> List[str]:
        """Return the Cosmos container names for SEC Filings.

        ``split`` is ignored. A new list is returned so that callers cannot
        mutate the class-level ``COSMOS_CONTAINERS``.
        """
        return list(self.COSMOS_CONTAINERS)
# ============================================================================
# QA-Only Benchmark Datasets (no document_ids, answer-evaluation only)
# ============================================================================


class DeepSearchDataset(SearchDataset):
    """xbench/DeepSearch benchmark dataset.

    Answer-evaluation benchmark: there are no document_ids, so recall
    metrics do not apply. The raw HF rows are mapped onto the SearchDataset
    columns with an empty ``document_ids`` list per query.

    The upstream dataset is described as encrypted, with decrypt code in the
    xbench_evals GitHub repo.
    NOTE(review): no decryption happens in this loader - confirm whether
    ``prompt``/``answer`` need decrypting before use.

    HF columns: id, prompt, answer, reference_steps, canary
    """

    HF_PATH = "xbench/DeepSearch"

    @property
    def name(self) -> str:
        return "deepsearch"

    def _load_dataset(self) -> None:
        """Load the "train" split and map its rows to the SearchDataset columns."""
        raw_ds = datasets.load_dataset(self.HF_PATH, split="train")

        columns = {
            "query_id": [str(row["id"]) for row in raw_ds],
            "query": [row["prompt"] for row in raw_ds],
            "document_ids": [[] for _ in range(len(raw_ds))],
            "answer": [row["answer"] for row in raw_ds],
        }
        self._search_queries_dataset = datasets.Dataset.from_dict(columns)


class GAIADataset(SearchDataset):
    """GAIA benchmark dataset (gaia-benchmark/GAIA).

    Answer-evaluation benchmark for general AI assistants. The dataset is
    gated: the terms on the HF page must be accepted before loading:
    https://huggingface.co/datasets/gaia-benchmark/GAIA

    The '2023_all' config is loaded and its validation + test splits are
    combined.
    HF columns: task_id, Question, Level, Final answer, file_name, file_path,
                Annotator Metadata

    document_ids is left empty because GAIA provides no retrieval labels.
    """

    HF_PATH = "gaia-benchmark/GAIA"
    HF_CONFIG = "2023_all"

    @property
    def name(self) -> str:
        return "gaia"

    def _load_dataset(self) -> None:
        """Load the available GAIA splits and map them to the SearchDataset columns.

        Raises:
            ValueError: If neither a 'validation' nor a 'test' split exists.
        """
        hf_token = config.get_config().huggingface_token
        raw_ds = datasets.load_dataset(self.HF_PATH, self.HF_CONFIG, token=hf_token)

        # GAIA typically ships 'validation' and 'test'. 'test' answers are
        # hidden, so 'validation' is the primary data; every split that is
        # present is concatenated, validation first.
        splits_to_use = [
            raw_ds[split_name]
            for split_name in ("validation", "test")
            if split_name in raw_ds
        ]

        if not splits_to_use:
            raise ValueError(
                f"GAIA dataset has no usable splits. Available: {list(raw_ds.keys())}"
            )

        combined = datasets.concatenate_datasets(splits_to_use)

        columns = {
            "query_id": [str(row["task_id"]) for row in combined],
            "query": [row["Question"] for row in combined],
            "document_ids": [[] for _ in range(len(combined))],
            "answer": [row["Final answer"] for row in combined],
        }
        self._search_queries_dataset = datasets.Dataset.from_dict(columns)
# ============================================================================
# Enterprise RAG Bench Dataset (Onyx EnterpriseRAG-Bench)
# ============================================================================


class EnterpriseRagBenchDataset(SearchDataset):
    """Onyx EnterpriseRAG-Bench, 500-question leaderboard split.

    Document-level retrieval eval. 30 of the 500 questions
    (high_level + info_not_found) have empty `expected_doc_ids` — they are
    included by default but contribute recall=0.
    """

    _gold_document_ids: dict

    def get_cosmos_containers(self, split=None):
        """Return the container named by ENT_RAG_CONTAINER, or the default name."""
        import os

        container = os.environ.get("ENT_RAG_CONTAINER", "enterprise_ragbench_corpus")
        return [container]

    @property
    def name(self) -> str:
        return "enterprise_rag"

    def _load_dataset(self) -> None:
        """Load the "questions" config (test split) and build the query dataset.

        Rows without a usable ``question_id``/``qid`` are dropped.
        """
        ds = datasets.load_dataset(
            "onyx-dot-app/EnterpriseRAG-Bench", "questions", split="test"
        )
        rows = []
        for record in ds:
            qid = record.get("question_id") or record.get("qid")
            if qid is None:
                continue
            rows.append(
                {
                    "qid": str(qid),
                    "question": record.get("question", ""),
                    "expected_doc_ids": list(record.get("expected_doc_ids") or []),
                    "gold_answer": record.get("gold_answer", ""),
                }
            )

        self._gold_document_ids = {
            row["qid"]: set(row["expected_doc_ids"]) for row in rows
        }
        columns = {
            "query_id": [row["qid"] for row in rows],
            "query": [row["question"] for row in rows],
            "document_ids": [list(row["expected_doc_ids"]) for row in rows],
            "answer": [row["gold_answer"] for row in rows],
        }
        self._search_queries_dataset = datasets.Dataset.from_dict(columns)

    def _get_final_answer_document_ids(self, query_id):
        """Return the expected document ids for ``query_id`` (empty set if unknown)."""
        return self._gold_document_ids.get(query_id, set())

    def _create_train_test_split(self) -> None:
        """Expose every query id through both the train and the test partition."""
        every_id = sorted(self._query_index.keys())
        self._train_query_ids = every_id
        self._test_query_ids = every_id


# ============================================================================
# Dataset Registry & Factory
# ============================================================================

# Maps each public dataset name to the class that get_dataset() instantiates.
DATASET_REGISTRY: dict[str, type[SearchDataset]] = {
    "browsecompplus": BrowseCompPlusDataset,
    "enterprise_rag": EnterpriseRagBenchDataset,
    "bc_plus": BCPlusDataset,
    "epstein": EpsteinDataset,
    "longsealqa": LongSealQADataset,
    "seal0qa": Seal0QADataset,
    "frames": FramesDataset,
    "hotpotqa_subset": HotpotQASubsetDataset,
    "podcasts_test": PodcastsTestSet,
    "web": WebDataset,
    "patents": PatentsDataset,
    "sec": SECDataset,
    "web_simple": WebSimpleDataset,
    "sec_simple": SECSimpleDataset,
    "sec_filings": SECFilingsDataset,  # Legacy dataset - works with existing collections
    "deepsearch": DeepSearchDataset,
    "gaia": GAIADataset,
}
def get_dataset(name: str) -> SearchDataset:
    """Create a search dataset by name.

    Args:
        name: The dataset name; must be a key of ``DATASET_REGISTRY``:
            - "browsecompplus": BrowseComp+ dataset
            - "enterprise_rag": Onyx EnterpriseRAG-Bench dataset
            - "bc_plus": BrowseComp+ HF dataset variant (single collection)
            - "epstein": Epstein dataset
            - "longsealqa": LongSeal QA dataset
            - "seal0qa": Seal0 QA dataset (open-web)
            - "frames": FRAMES dataset
            - "hotpotqa_subset": HotpotQA subset dataset
            - "podcasts_test": Podcasts test dataset
            - "web": Web dataset (pre-split train/test)
            - "patents": Patents dataset (pre-split train/test)
            - "sec": SEC Filings dataset (pre-split train/test)
            - "web_simple": Web Simple dataset (pre-split train/test)
            - "sec_simple": SEC Simple dataset (pre-split train/test)
            - "sec_filings": legacy SEC Filings dataset
            - "deepsearch": xbench/DeepSearch benchmark (answer-eval only)
            - "gaia": GAIA benchmark (answer-eval only, gated)

    Returns:
        A new instance of the corresponding dataset class.

    Raises:
        ValueError: If the dataset name is not recognized.
    """
    if name not in DATASET_REGISTRY:
        available = ", ".join(DATASET_REGISTRY.keys())
        raise ValueError(f"Unknown dataset: {name}. Available datasets: {available}")
    return DATASET_REGISTRY[name]()
Available datasets: {available}") + return DATASET_REGISTRY[name]() diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/splits/browsecompplus_splits.json b/cosmos-retriever/src/cosmos_retriever/datagen/splits/browsecompplus_splits.json new file mode 100644 index 0000000..98e17a6 --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/datagen/splits/browsecompplus_splits.json @@ -0,0 +1,846 @@ +{ + "dataset": "browsecompplus", + "total_queries": 830, + "train_queries": 664, + "test_queries": 166, + "sft_queries": 199, + "rl_queries": 465, + "sft_ratio": 0.2996987951807229, + "rl_ratio": 0.7003012048192772, + "sft_query_ids": [ + "100", + "1010", + "1016", + "1022", + "1023", + "1026", + "1033", + "1046", + "1049", + "1055", + "1068", + "107", + "1071", + "1089", + "1090", + "1098", + "1108", + "1117", + "1118", + "1122", + "113", + "1134", + "1138", + "1139", + "1148", + "1158", + "116", + "1172", + "1176", + "1184", + "1188", + "120", + "1210", + "1218", + "122", + "1228", + "1233", + "1235", + "1237", + "124", + "1240", + "1243", + "1247", + "1249", + "125", + "1258", + "1260", + "1266", + "138", + "15", + "155", + "170", + "177", + "178", + "183", + "194", + "20", + "201", + "203", + "206", + "209", + "216", + "226", + "233", + "242", + "256", + "265", + "268", + "27", + "276", + "293", + "299", + "304", + "315", + "327", + "328", + "337", + "349", + "372", + "376", + "380", + "383", + "402", + "406", + "408", + "409", + "41", + "410", + "411", + "416", + "421", + "424", + "429", + "433", + "435", + "436", + "442", + "449", + "46", + "467", + "471", + "473", + "483", + "485", + "486", + "487", + "493", + "497", + "498", + "5", + "503", + "506", + "507", + "512", + "513", + "517", + "520", + "53", + "540", + "560", + "563", + "568", + "570", + "571", + "575", + "577", + "584", + "588", + "59", + "590", + "591", + "592", + "593", + "594", + "611", + "618", + "62", + "631", + "637", + "651", + "655", + "661", + "665", + "669", + "673", + "679", + "682", + 
"703", + "707", + "709", + "714", + "719", + "723", + "724", + "737", + "738", + "744", + "768", + "769", + "771", + "786", + "788", + "791", + "792", + "793", + "801", + "806", + "814", + "815", + "82", + "827", + "828", + "833", + "851", + "870", + "872", + "882", + "886", + "89", + "895", + "896", + "90", + "915", + "916", + "919", + "926", + "927", + "930", + "932", + "944", + "948", + "951", + "96", + "960", + "961", + "966", + "968", + "981", + "996" + ], + "rl_query_ids": [ + "1", + "10", + "1000", + "1002", + "1003", + "1005", + "1007", + "1008", + "1015", + "1018", + "1019", + "102", + "1020", + "1021", + "1025", + "1027", + "1028", + "103", + "1032", + "1035", + "1037", + "1038", + "1039", + "1040", + "1041", + "1042", + "1043", + "1044", + "1045", + "1047", + "1052", + "1057", + "1058", + "106", + "1060", + "1061", + "1062", + "1063", + "1066", + "1072", + "1073", + "1076", + "1078", + "1081", + "1082", + "1083", + "1091", + "1092", + "1093", + "1094", + "1095", + "1099", + "11", + "110", + "1101", + "1103", + "1105", + "1107", + "111", + "1119", + "1124", + "1126", + "1131", + "1133", + "1135", + "1142", + "1147", + "1150", + "1152", + "1153", + "1155", + "1161", + "1162", + "1163", + "1167", + "1169", + "1174", + "1177", + "1179", + "1182", + "1185", + "1187", + "119", + "1192", + "1193", + "1194", + "1196", + "1198", + "12", + "1200", + "1201", + "1203", + "1204", + "1206", + "1208", + "1209", + "1212", + "1214", + "1215", + "1219", + "1220", + "1221", + "1222", + "1223", + "1225", + "1226", + "1227", + "1230", + "1231", + "1232", + "1234", + "1236", + "1238", + "1239", + "1242", + "1246", + "1250", + "1252", + "1253", + "1254", + "1259", + "126", + "1262", + "1263", + "1264", + "1265", + "127", + "130", + "134", + "149", + "156", + "160", + "161", + "165", + "166", + "171", + "174", + "176", + "179", + "18", + "180", + "181", + "184", + "186", + "190", + "192", + "196", + "199", + "202", + "205", + "210", + "211", + "215", + "219", + "23", + "234", + 
"235", + "236", + "238", + "239", + "241", + "244", + "245", + "246", + "248", + "249", + "25", + "250", + "251", + "253", + "255", + "257", + "261", + "262", + "263", + "264", + "266", + "267", + "270", + "275", + "278", + "279", + "280", + "282", + "283", + "284", + "285", + "286", + "287", + "291", + "294", + "295", + "297", + "298", + "3", + "301", + "303", + "305", + "308", + "309", + "310", + "311", + "314", + "317", + "320", + "322", + "323", + "33", + "331", + "333", + "335", + "342", + "347", + "350", + "351", + "353", + "356", + "357", + "36", + "364", + "366", + "37", + "370", + "377", + "387", + "389", + "39", + "390", + "391", + "392", + "393", + "401", + "403", + "413", + "414", + "417", + "420", + "426", + "427", + "428", + "432", + "434", + "438", + "443", + "445", + "446", + "450", + "454", + "468", + "470", + "472", + "478", + "481", + "484", + "490", + "491", + "494", + "495", + "496", + "499", + "50", + "500", + "501", + "505", + "51", + "511", + "516", + "52", + "521", + "523", + "524", + "527", + "528", + "529", + "530", + "532", + "533", + "534", + "535", + "537", + "538", + "539", + "54", + "542", + "543", + "544", + "546", + "548", + "549", + "55", + "550", + "551", + "552", + "553", + "556", + "558", + "56", + "561", + "569", + "576", + "58", + "580", + "581", + "582", + "583", + "587", + "595", + "596", + "598", + "599", + "6", + "600", + "601", + "602", + "603", + "607", + "61", + "619", + "620", + "621", + "624", + "627", + "628", + "629", + "63", + "630", + "635", + "636", + "639", + "64", + "642", + "644", + "645", + "650", + "652", + "662", + "664", + "666", + "67", + "670", + "671", + "674", + "675", + "678", + "684", + "685", + "686", + "69", + "692", + "694", + "695", + "696", + "70", + "700", + "701", + "702", + "71", + "710", + "711", + "713", + "715", + "716", + "717", + "718", + "72", + "720", + "725", + "726", + "728", + "729", + "730", + "731", + "735", + "739", + "74", + "741", + "745", + "746", + "753", + "756", + "757", + 
"758", + "759", + "761", + "764", + "770", + "773", + "774", + "775", + "776", + "778", + "781", + "783", + "784", + "785", + "79", + "794", + "796", + "798", + "8", + "800", + "802", + "804", + "81", + "810", + "816", + "819", + "820", + "821", + "822", + "823", + "83", + "832", + "835", + "836", + "838", + "843", + "844", + "847", + "85", + "850", + "852", + "854", + "856", + "86", + "861", + "863", + "864", + "865", + "867", + "87", + "871", + "875", + "883", + "884", + "887", + "897", + "898", + "904", + "906", + "907", + "909", + "910", + "92", + "921", + "922", + "923", + "928", + "942", + "946", + "95", + "950", + "952", + "963", + "97", + "970", + "971", + "978", + "979", + "980", + "984", + "986", + "991", + "992", + "999" + ], + "test_query_ids": [ + "1004", + "1012", + "1029", + "1030", + "1034", + "1036", + "1048", + "105", + "1053", + "1065", + "1077", + "1079", + "1085", + "109", + "1096", + "1097", + "1106", + "1110", + "1111", + "1115", + "1121", + "1127", + "1128", + "1141", + "1144", + "1149", + "1164", + "1190", + "1191", + "1195", + "1207", + "121", + "1211", + "1213", + "1216", + "1217", + "1224", + "1248", + "1257", + "128", + "131", + "132", + "140", + "152", + "153", + "154", + "159", + "168", + "169", + "175", + "191", + "193", + "200", + "22", + "221", + "223", + "228", + "229", + "231", + "237", + "240", + "254", + "26", + "288", + "289", + "30", + "319", + "324", + "330", + "354", + "362", + "367", + "371", + "384", + "394", + "395", + "396", + "397", + "405", + "418", + "422", + "425", + "464", + "469", + "480", + "49", + "502", + "509", + "514", + "515", + "519", + "525", + "555", + "562", + "572", + "579", + "60", + "605", + "610", + "614", + "625", + "632", + "633", + "638", + "643", + "653", + "672", + "68", + "681", + "688", + "689", + "7", + "706", + "708", + "712", + "722", + "732", + "734", + "742", + "747", + "749", + "751", + "754", + "760", + "763", + "772", + "78", + "787", + "790", + "797", + "805", + "809", + "811", + 
"826", + "830", + "834", + "840", + "853", + "869", + "873", + "876", + "88", + "885", + "893", + "894", + "899", + "905", + "912", + "920", + "925", + "93", + "934", + "936", + "941", + "943", + "947", + "959", + "962", + "969", + "972", + "976", + "98", + "983", + "985", + "987", + "998" + ] +} \ No newline at end of file diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/splits/patents_splits.json b/cosmos-retriever/src/cosmos_retriever/datagen/splits/patents_splits.json new file mode 100644 index 0000000..119af23 --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/datagen/splits/patents_splits.json @@ -0,0 +1,3252 @@ +{ + "dataset": "patents", + "total_queries": 3107, + "train_queries": 2518, + "test_queries": 718, + "sft_queries": 755, + "rl_queries": 1763, + "sft_ratio": 0.2998411437648928, + "rl_ratio": 0.7001588562351072, + "sft_query_ids": [ + "0_2", + "0_5", + "0_9", + "100_2", + "100_4", + "100_5", + "100_6", + "100_7", + "101_2", + "101_5", + "102_1", + "102_2", + "102_3", + "103_1", + "103_2", + "103_4", + "103_9", + "104_3", + "104_8", + "106_9", + "107_1", + "107_3", + "112_0", + "112_1", + "116_10", + "116_12", + "11_0", + "11_13", + "11_8", + "122_10", + "122_8", + "123_0", + "123_1", + "123_5", + "123_6", + "125_0", + "129_0", + "130_1", + "130_12", + "130_4", + "130_6", + "130_7", + "130_9", + "131_2", + "131_3", + "143_0", + "143_1", + "143_10", + "144_1", + "144_10", + "144_13", + "144_2", + "144_5", + "144_6", + "144_7", + "144_8", + "146_11", + "146_6", + "146_9", + "151_11", + "151_14", + "151_16", + "152_0", + "152_10", + "152_11", + "152_13", + "152_14", + "152_17", + "152_2", + "152_20", + "152_26", + "152_27", + "152_9", + "154_11", + "156_2", + "156_3", + "157_3", + "158_4", + "158_6", + "159_0", + "159_1", + "159_10", + "159_22", + "159_23", + "159_7", + "159_9", + "163_12", + "163_3", + "163_4", + "164_11", + "164_12", + "164_15", + "164_18", + "164_7", + "164_8", + "166_7", + "166_8", + "170_8", + "181_15", + "181_3", + 
"181_4", + "181_8", + "183_2", + "183_5", + "183_6", + "184_2", + "184_3", + "184_4", + "184_5", + "184_7", + "184_8", + "186_1", + "186_12", + "186_17", + "186_5", + "186_7", + "187_10", + "187_19", + "187_3", + "187_4", + "187_5", + "187_9", + "189_2", + "189_5", + "189_8", + "191_13", + "191_16", + "191_2", + "191_26", + "193_0", + "193_12", + "193_16", + "193_20", + "193_4", + "193_5", + "193_9", + "198_9", + "199_1", + "199_14", + "199_17", + "199_6", + "199_7", + "19_14", + "19_20", + "19_21", + "19_23", + "19_28", + "1_1", + "1_3", + "203_12", + "203_5", + "203_8", + "204_0", + "204_24", + "204_25", + "204_28", + "204_3", + "204_7", + "205_11", + "205_13", + "205_15", + "205_19", + "206_0", + "206_1", + "206_12", + "206_2", + "206_4", + "206_9", + "207_2", + "207_5", + "209_1", + "211_0", + "211_1", + "211_2", + "211_3", + "213_10", + "213_11", + "213_2", + "213_3", + "213_4", + "213_9", + "215_1", + "215_19", + "215_24", + "215_25", + "215_26", + "215_3", + "215_4", + "215_8", + "217_1", + "21_27", + "21_7", + "220_1", + "220_11", + "220_15", + "220_17", + "220_18", + "220_26", + "222_10", + "223_3", + "224_13", + "224_14", + "224_23", + "225_0", + "22_11", + "22_12", + "22_2", + "22_3", + "22_5", + "22_6", + "22_8", + "230_0", + "230_2", + "231_10", + "231_14", + "231_17", + "231_21", + "231_4", + "231_7", + "231_8", + "232_3", + "233_5", + "233_8", + "234_4", + "235_4", + "235_6", + "236_0", + "236_2", + "236_4", + "236_6", + "237_10", + "237_15", + "237_8", + "239_2", + "239_5", + "240_0", + "240_1", + "240_12", + "240_3", + "240_4", + "240_5", + "240_6", + "240_7", + "241_0", + "241_1", + "241_4", + "241_6", + "243_0", + "243_11", + "243_16", + "243_18", + "243_9", + "246_0", + "246_1", + "246_5", + "248_6", + "249_11", + "249_13", + "249_2", + "249_4", + "249_6", + "249_7", + "249_9", + "24_11", + "24_3", + "250_13", + "250_18", + "250_22", + "250_24", + "250_25", + "250_28", + "250_4", + "252_0", + "252_1", + "252_12", + "252_19", + "252_3", + 
"252_4", + "253_0", + "253_1", + "254_1", + "254_13", + "254_14", + "254_3", + "254_4", + "257_0", + "257_12", + "257_2", + "259_0", + "262_11", + "262_7", + "265_0", + "266_1", + "267_0", + "267_1", + "267_4", + "267_9", + "269_0", + "269_4", + "269_5", + "273_0", + "273_1", + "276_0", + "276_1", + "278_6", + "278_7", + "278_8", + "279_12", + "279_13", + "27_10", + "281_2", + "281_3", + "281_4", + "286_1", + "286_11", + "286_2", + "288_16", + "288_5", + "28_18", + "28_19", + "291_0", + "291_1", + "291_2", + "291_3", + "291_6", + "291_7", + "298_12", + "298_3", + "298_9", + "299_5", + "299_6", + "299_7", + "299_8", + "299_9", + "2_0", + "2_1", + "300_1", + "300_10", + "300_17", + "300_7", + "301_0", + "301_10", + "301_11", + "301_13", + "301_15", + "301_17", + "301_4", + "301_8", + "302_12", + "302_18", + "302_19", + "302_2", + "302_7", + "302_8", + "303_1", + "303_13", + "303_14", + "303_2", + "303_6", + "303_9", + "305_11", + "305_12", + "305_13", + "305_15", + "305_17", + "305_3", + "305_4", + "305_8", + "308_0", + "310_15", + "310_17", + "310_2", + "311_3", + "311_7", + "317_0", + "318_1", + "318_12", + "318_17", + "318_8", + "319_2", + "31_12", + "31_15", + "31_2", + "31_5", + "31_8", + "323_0", + "323_12", + "323_2", + "323_6", + "323_8", + "323_9", + "324_17", + "326_5", + "327_1", + "327_3", + "331_12", + "331_24", + "332_0", + "332_1", + "332_5", + "335_1", + "335_6", + "335_9", + "336_12", + "336_7", + "337_1", + "337_11", + "339_17", + "339_4", + "341_16", + "341_9", + "344_0", + "344_1", + "345_1", + "348_2", + "355_2", + "356_14", + "356_16", + "356_20", + "356_21", + "356_23", + "356_24", + "356_26", + "356_8", + "361_14", + "361_3", + "361_4", + "363_12", + "363_15", + "363_2", + "363_3", + "363_6", + "363_7", + "364_11", + "364_13", + "364_5", + "366_0", + "366_3", + "372_11", + "372_14", + "372_8", + "373_11", + "373_14", + "373_15", + "373_3", + "373_4", + "373_6", + "373_7", + "373_8", + "374_0", + "374_2", + "374_4", + "374_6", + "381_2", + 
"388_0", + "388_11", + "388_13", + "388_14", + "388_5", + "389_6", + "395_18", + "397_0", + "398_0", + "398_5", + "398_7", + "399_14", + "399_15", + "399_16", + "39_15", + "39_18", + "39_19", + "39_2", + "39_7", + "402_6", + "406_0", + "406_11", + "406_4", + "406_8", + "407_16", + "407_6", + "407_8", + "407_9", + "411_0", + "411_6", + "412_1", + "412_12", + "412_15", + "412_19", + "412_2", + "412_3", + "412_8", + "413_3", + "413_6", + "413_7", + "416_3", + "416_4", + "420_0", + "420_11", + "420_2", + "420_6", + "420_7", + "420_8", + "421_16", + "421_2", + "421_21", + "421_5", + "422_1", + "422_11", + "422_12", + "422_15", + "422_16", + "422_17", + "422_23", + "422_24", + "422_4", + "422_6", + "425_13", + "425_17", + "425_21", + "425_22", + "425_26", + "425_27", + "425_28", + "426_10", + "426_11", + "426_21", + "426_25", + "438_0", + "438_2", + "43_1", + "43_2", + "43_3", + "43_4", + "440_9", + "445_3", + "445_4", + "448_15", + "448_18", + "448_2", + "448_4", + "448_8", + "449_1", + "452_10", + "452_12", + "452_14", + "452_3", + "452_4", + "452_5", + "452_8", + "453_14", + "453_9", + "454_2", + "454_3", + "454_5", + "456_13", + "459_33", + "459_36", + "460_0", + "460_12", + "460_14", + "460_17", + "460_23", + "460_29", + "460_3", + "460_4", + "460_7", + "460_8", + "460_9", + "465_9", + "467_4", + "469_9", + "470_0", + "470_3", + "471_15", + "471_16", + "471_4", + "471_7", + "471_9", + "472_0", + "472_11", + "473_0", + "474_1", + "482_12", + "482_15", + "482_17", + "482_5", + "482_7", + "485_0", + "485_5", + "485_6", + "486_15", + "486_16", + "486_3", + "486_7", + "488_1", + "488_4", + "493_0", + "493_11", + "493_4", + "493_5", + "493_7", + "494_0", + "494_2", + "494_4", + "494_9", + "499_1", + "499_6", + "499_8", + "499_9", + "49_15", + "49_19", + "49_2", + "49_6", + "49_7", + "49_9", + "508_2", + "514_0", + "514_1", + "514_14", + "514_4", + "514_5", + "515_11", + "515_12", + "515_13", + "515_5", + "515_9", + "517_0", + "517_1", + "518_12", + "518_17", + "518_35", + 
"518_37", + "518_39", + "518_42", + "518_43", + "518_7", + "518_9", + "521_0", + "521_4", + "521_5", + "521_9", + "52_16", + "530_1", + "530_2", + "530_5", + "530_8", + "530_9", + "54_0", + "55_1", + "55_15", + "55_20", + "55_21", + "55_4", + "58_3", + "5_0", + "5_1", + "5_10", + "5_14", + "5_3", + "5_5", + "5_7", + "5_8", + "60_16", + "60_24", + "60_25", + "60_27", + "60_6", + "60_9", + "62_1", + "62_13", + "62_14", + "65_0", + "65_12", + "65_13", + "65_3", + "65_4", + "65_8", + "66_0", + "66_14", + "66_17", + "66_18", + "68_10", + "68_13", + "68_14", + "68_7", + "68_8", + "69_1", + "69_10", + "69_2", + "69_3", + "69_4", + "69_6", + "70_1", + "70_6", + "70_8", + "70_9", + "76_1", + "76_13", + "76_20", + "76_6", + "78_1", + "78_16", + "78_18", + "78_19", + "78_22", + "78_24", + "78_26", + "78_6", + "78_7", + "78_8", + "78_9", + "81_17", + "82_2", + "82_4", + "83_1", + "83_10", + "83_11", + "83_2", + "83_3", + "83_4", + "83_5", + "83_9", + "84_1", + "85_1", + "85_13", + "85_14", + "85_15", + "85_16", + "85_17", + "85_19", + "85_20", + "85_3", + "85_7", + "86_0", + "86_13", + "86_17", + "86_20", + "86_22", + "86_23", + "86_3", + "87_0", + "87_13", + "87_16", + "87_5", + "87_9", + "90_5", + "92_10", + "92_3", + "98_0", + "98_1", + "98_13", + "98_15", + "98_16", + "98_5", + "98_6" + ], + "rl_query_ids": [ + "0_0", + "0_1", + "0_3", + "0_4", + "0_6", + "0_7", + "0_8", + "100_1", + "100_13", + "100_3", + "100_8", + "100_9", + "101_0", + "101_1", + "101_10", + "101_3", + "101_4", + "101_6", + "101_9", + "102_0", + "103_0", + "103_10", + "103_3", + "103_5", + "103_6", + "103_7", + "103_8", + "104_2", + "104_4", + "104_5", + "104_6", + "104_7", + "105_1", + "105_4", + "106_10", + "106_11", + "106_6", + "107_0", + "107_4", + "112_2", + "113_17", + "113_18", + "115_0", + "115_1", + "116_14", + "116_8", + "117_3", + "117_4", + "118_1", + "11_1", + "11_10", + "11_14", + "11_15", + "11_16", + "11_2", + "11_3", + "11_4", + "11_5", + "11_6", + "11_7", + "11_9", + "122_9", + 
"123_2", + "123_3", + "123_4", + "123_7", + "124_10", + "125_1", + "129_1", + "129_3", + "129_4", + "129_7", + "129_8", + "130_0", + "130_10", + "130_11", + "130_13", + "130_2", + "130_3", + "130_5", + "130_8", + "131_0", + "131_1", + "134_0", + "134_1", + "134_2", + "136_4", + "136_5", + "136_6", + "137_8", + "139_0", + "139_1", + "143_11", + "143_12", + "143_13", + "143_14", + "143_15", + "143_2", + "143_3", + "143_4", + "143_5", + "143_6", + "143_7", + "143_8", + "143_9", + "144_0", + "144_11", + "144_12", + "144_14", + "144_3", + "144_4", + "144_9", + "146_1", + "146_10", + "146_12", + "146_13", + "146_14", + "146_15", + "146_16", + "146_17", + "146_2", + "146_3", + "146_4", + "146_5", + "146_7", + "146_8", + "14_0", + "150_1", + "150_13", + "150_14", + "150_15", + "150_16", + "150_2", + "150_3", + "150_4", + "151_0", + "151_1", + "151_10", + "151_12", + "151_13", + "151_15", + "151_17", + "151_18", + "151_2", + "151_3", + "151_4", + "151_5", + "151_6", + "151_7", + "151_8", + "151_9", + "152_1", + "152_12", + "152_15", + "152_16", + "152_18", + "152_19", + "152_21", + "152_22", + "152_23", + "152_24", + "152_25", + "152_28", + "152_29", + "152_3", + "152_4", + "152_5", + "152_6", + "152_7", + "152_8", + "153_0", + "154_1", + "154_12", + "154_2", + "156_0", + "156_1", + "156_4", + "157_12", + "157_13", + "157_15", + "157_17", + "157_4", + "157_6", + "157_7", + "157_8", + "158_0", + "158_1", + "158_10", + "158_2", + "158_3", + "158_5", + "158_7", + "158_8", + "158_9", + "159_11", + "159_12", + "159_13", + "159_14", + "159_17", + "159_18", + "159_2", + "159_21", + "159_24", + "159_3", + "159_4", + "159_5", + "159_6", + "159_8", + "163_0", + "163_1", + "163_10", + "163_11", + "163_13", + "163_14", + "163_15", + "163_16", + "163_2", + "163_5", + "163_6", + "163_7", + "163_8", + "163_9", + "164_0", + "164_1", + "164_13", + "164_14", + "164_16", + "164_17", + "164_19", + "164_2", + "164_20", + "164_21", + "164_22", + "164_23", + "164_3", + "164_4", + "164_5", + 
"164_6", + "164_9", + "166_10", + "166_12", + "166_13", + "166_4", + "166_5", + "166_6", + "166_9", + "170_16", + "170_17", + "170_5", + "170_7", + "176_1", + "181_0", + "181_1", + "181_10", + "181_11", + "181_12", + "181_13", + "181_14", + "181_16", + "181_2", + "181_5", + "181_6", + "181_7", + "181_9", + "183_0", + "183_1", + "183_3", + "183_4", + "184_0", + "184_1", + "184_10", + "184_11", + "184_12", + "184_13", + "184_14", + "184_15", + "184_6", + "184_9", + "186_0", + "186_10", + "186_11", + "186_13", + "186_15", + "186_2", + "186_4", + "186_8", + "186_9", + "187_0", + "187_1", + "187_11", + "187_12", + "187_2", + "187_6", + "187_7", + "187_8", + "188_0", + "188_5", + "188_6", + "188_8", + "188_9", + "189_0", + "189_1", + "189_3", + "189_4", + "189_6", + "189_7", + "191_0", + "191_1", + "191_10", + "191_11", + "191_12", + "191_14", + "191_15", + "191_17", + "191_18", + "191_19", + "191_20", + "191_21", + "191_22", + "191_23", + "191_24", + "191_25", + "191_3", + "191_4", + "191_5", + "191_6", + "191_7", + "191_8", + "191_9", + "193_1", + "193_10", + "193_11", + "193_13", + "193_14", + "193_15", + "193_17", + "193_18", + "193_19", + "193_2", + "193_21", + "193_22", + "193_23", + "193_24", + "193_25", + "193_26", + "193_3", + "193_6", + "193_7", + "193_8", + "195_12", + "195_13", + "195_14", + "198_10", + "198_3", + "198_4", + "198_5", + "198_6", + "198_7", + "199_0", + "199_15", + "199_16", + "199_18", + "199_19", + "199_2", + "199_3", + "199_4", + "199_5", + "199_8", + "199_9", + "19_11", + "19_12", + "19_13", + "19_15", + "19_17", + "19_18", + "19_19", + "19_22", + "19_24", + "19_25", + "1_0", + "1_4", + "203_0", + "203_1", + "203_10", + "203_11", + "203_14", + "203_3", + "203_4", + "203_6", + "203_7", + "203_9", + "204_1", + "204_10", + "204_11", + "204_12", + "204_13", + "204_14", + "204_15", + "204_16", + "204_2", + "204_22", + "204_23", + "204_26", + "204_27", + "204_4", + "204_5", + "204_6", + "204_8", + "204_9", + "205_12", + "205_14", + "205_18", + 
"205_20", + "205_21", + "205_23", + "206_10", + "206_11", + "206_13", + "206_14", + "206_3", + "206_5", + "206_6", + "206_7", + "206_8", + "207_3", + "208_1", + "209_0", + "209_2", + "209_3", + "209_4", + "211_4", + "211_5", + "212_9", + "213_0", + "213_1", + "213_12", + "213_13", + "213_14", + "213_5", + "213_6", + "213_7", + "213_8", + "215_0", + "215_10", + "215_11", + "215_12", + "215_13", + "215_14", + "215_15", + "215_16", + "215_17", + "215_18", + "215_2", + "215_20", + "215_21", + "215_22", + "215_23", + "215_5", + "215_6", + "215_7", + "215_9", + "217_0", + "21_22", + "21_25", + "21_26", + "21_29", + "21_8", + "220_0", + "220_10", + "220_12", + "220_13", + "220_14", + "220_16", + "220_19", + "220_2", + "220_20", + "220_21", + "220_22", + "220_23", + "220_24", + "220_25", + "220_27", + "220_28", + "220_29", + "220_3", + "220_4", + "220_5", + "220_6", + "220_7", + "220_8", + "220_9", + "222_1", + "222_2", + "222_6", + "222_8", + "223_0", + "223_1", + "223_2", + "223_4", + "223_5", + "223_6", + "224_15", + "224_21", + "224_22", + "227_0", + "227_1", + "227_2", + "227_3", + "227_4", + "227_5", + "227_6", + "227_7", + "227_8", + "227_9", + "229_0", + "22_14", + "22_16", + "22_7", + "230_1", + "230_3", + "230_4", + "230_6", + "231_0", + "231_1", + "231_11", + "231_12", + "231_13", + "231_15", + "231_16", + "231_18", + "231_19", + "231_2", + "231_20", + "231_22", + "231_23", + "231_24", + "231_25", + "231_26", + "231_3", + "231_5", + "231_6", + "231_9", + "232_0", + "232_1", + "232_2", + "232_4", + "233_4", + "233_6", + "233_7", + "235_0", + "235_1", + "235_2", + "235_3", + "235_5", + "236_1", + "236_3", + "236_5", + "236_7", + "236_8", + "236_9", + "237_0", + "237_1", + "237_11", + "237_12", + "237_13", + "237_14", + "237_16", + "237_17", + "237_18", + "237_19", + "237_2", + "237_3", + "237_4", + "237_5", + "237_6", + "237_7", + "237_9", + "239_0", + "239_1", + "239_4", + "240_10", + "240_11", + "240_13", + "240_14", + "240_2", + "240_8", + "240_9", + "241_2", + 
"241_3", + "241_5", + "243_1", + "243_10", + "243_12", + "243_13", + "243_15", + "243_17", + "243_19", + "243_2", + "243_3", + "243_4", + "243_5", + "243_6", + "243_7", + "243_8", + "246_2", + "246_3", + "248_4", + "248_5", + "248_7", + "248_8", + "248_9", + "249_0", + "249_1", + "249_10", + "249_12", + "249_14", + "249_15", + "249_17", + "249_18", + "249_19", + "249_3", + "249_5", + "249_8", + "24_0", + "24_1", + "24_10", + "24_12", + "24_2", + "24_4", + "24_5", + "24_6", + "24_7", + "24_8", + "24_9", + "250_10", + "250_11", + "250_12", + "250_14", + "250_15", + "250_16", + "250_17", + "250_19", + "250_20", + "250_21", + "250_23", + "250_26", + "250_27", + "250_29", + "250_3", + "250_7", + "250_8", + "250_9", + "252_10", + "252_11", + "252_13", + "252_14", + "252_2", + "252_5", + "252_6", + "252_7", + "252_8", + "252_9", + "253_2", + "254_10", + "254_12", + "254_15", + "254_16", + "254_18", + "254_19", + "254_20", + "254_21", + "254_22", + "254_24", + "254_25", + "254_26", + "254_5", + "254_6", + "254_7", + "254_8", + "254_9", + "257_1", + "257_11", + "257_17", + "257_4", + "257_8", + "259_1", + "262_0", + "262_1", + "262_10", + "262_16", + "262_17", + "262_2", + "262_3", + "262_4", + "262_5", + "262_6", + "262_8", + "262_9", + "267_10", + "267_11", + "267_2", + "267_3", + "267_5", + "267_6", + "267_7", + "267_8", + "269_1", + "269_2", + "269_3", + "269_6", + "273_2", + "275_22", + "276_2", + "278_0", + "278_1", + "278_10", + "278_11", + "278_2", + "278_3", + "278_4", + "278_5", + "278_9", + "279_10", + "279_3", + "279_4", + "27_0", + "27_1", + "27_11", + "27_12", + "27_2", + "27_3", + "27_4", + "27_8", + "27_9", + "280_0", + "281_0", + "281_1", + "286_0", + "286_10", + "286_7", + "286_8", + "286_9", + "288_1", + "288_12", + "288_13", + "288_15", + "288_2", + "288_3", + "288_4", + "288_6", + "291_11", + "291_12", + "291_25", + "291_26", + "291_4", + "291_5", + "291_8", + "291_9", + "294_5", + "294_6", + "298_0", + "298_1", + "298_10", + "298_11", + "298_14", + 
"298_2", + "298_4", + "298_5", + "298_6", + "298_7", + "298_8", + "299_0", + "299_1", + "299_2", + "299_3", + "299_4", + "2_2", + "2_3", + "2_4", + "2_5", + "300_0", + "300_11", + "300_12", + "300_14", + "300_15", + "300_16", + "300_2", + "300_3", + "300_4", + "300_5", + "300_6", + "300_8", + "300_9", + "301_1", + "301_12", + "301_14", + "301_16", + "301_2", + "301_3", + "301_5", + "301_6", + "301_7", + "301_9", + "302_0", + "302_1", + "302_10", + "302_11", + "302_13", + "302_14", + "302_15", + "302_16", + "302_17", + "302_3", + "302_4", + "302_5", + "302_6", + "302_9", + "303_0", + "303_10", + "303_11", + "303_12", + "303_15", + "303_16", + "303_17", + "303_18", + "303_19", + "303_3", + "303_4", + "303_5", + "303_7", + "303_8", + "304_10", + "304_11", + "305_0", + "305_1", + "305_10", + "305_14", + "305_16", + "305_18", + "305_19", + "305_2", + "305_5", + "305_6", + "305_7", + "305_9", + "307_8", + "308_1", + "308_2", + "310_16", + "311_0", + "311_1", + "311_16", + "311_2", + "311_4", + "311_5", + "311_6", + "311_8", + "311_9", + "314_4", + "316_0", + "317_1", + "317_2", + "317_3", + "318_0", + "318_10", + "318_11", + "318_13", + "318_14", + "318_15", + "318_16", + "318_18", + "318_19", + "318_2", + "318_3", + "318_4", + "318_5", + "318_6", + "318_7", + "318_9", + "319_3", + "319_4", + "319_5", + "31_0", + "31_1", + "31_10", + "31_11", + "31_13", + "31_14", + "31_16", + "31_17", + "31_18", + "31_19", + "31_4", + "31_6", + "31_7", + "31_9", + "322_10", + "322_6", + "322_7", + "322_8", + "323_1", + "323_10", + "323_11", + "323_3", + "323_4", + "323_5", + "323_7", + "324_16", + "324_18", + "327_0", + "327_2", + "331_0", + "331_1", + "331_10", + "331_11", + "331_13", + "331_14", + "331_15", + "331_16", + "331_17", + "331_18", + "331_19", + "331_2", + "331_20", + "331_21", + "331_22", + "331_23", + "331_3", + "331_4", + "331_5", + "331_6", + "331_7", + "331_8", + "331_9", + "332_2", + "332_3", + "332_4", + "335_10", + "335_11", + "335_12", + "335_16", + "335_17", + 
"335_19", + "335_2", + "335_3", + "335_4", + "335_5", + "335_7", + "335_8", + "336_0", + "336_1", + "336_10", + "336_11", + "336_13", + "336_14", + "336_15", + "336_16", + "336_17", + "336_18", + "336_19", + "336_20", + "336_21", + "336_22", + "336_4", + "336_5", + "336_6", + "336_8", + "336_9", + "337_0", + "337_12", + "337_2", + "337_3", + "337_4", + "337_5", + "337_6", + "337_7", + "337_8", + "339_13", + "339_14", + "339_15", + "339_5", + "339_6", + "339_8", + "341_0", + "341_1", + "341_10", + "341_11", + "341_12", + "341_13", + "341_14", + "341_15", + "341_17", + "341_18", + "341_2", + "341_3", + "341_4", + "341_5", + "341_6", + "341_7", + "341_8", + "343_1", + "344_2", + "345_0", + "345_2", + "345_3", + "345_4", + "352_13", + "355_0", + "355_1", + "355_3", + "355_4", + "355_5", + "355_6", + "355_7", + "355_8", + "355_9", + "356_0", + "356_1", + "356_10", + "356_11", + "356_12", + "356_13", + "356_15", + "356_19", + "356_2", + "356_22", + "356_25", + "356_27", + "356_3", + "356_33", + "356_4", + "356_5", + "356_6", + "356_7", + "356_9", + "360_2", + "361_0", + "361_1", + "361_10", + "361_12", + "361_13", + "361_2", + "361_5", + "361_6", + "361_7", + "361_8", + "361_9", + "363_0", + "363_1", + "363_10", + "363_11", + "363_13", + "363_14", + "363_16", + "363_17", + "363_18", + "363_19", + "363_4", + "363_5", + "363_8", + "363_9", + "364_12", + "364_2", + "364_4", + "364_6", + "364_9", + "366_1", + "366_2", + "370_7", + "370_8", + "372_12", + "372_15", + "373_0", + "373_1", + "373_10", + "373_12", + "373_13", + "373_2", + "373_5", + "373_9", + "374_1", + "374_3", + "374_5", + "374_7", + "374_8", + "377_0", + "377_1", + "381_0", + "381_1", + "381_4", + "384_0", + "384_1", + "384_2", + "384_3", + "384_4", + "384_5", + "384_6", + "384_7", + "385_0", + "385_1", + "385_2", + "385_3", + "388_1", + "388_10", + "388_12", + "388_15", + "388_16", + "388_2", + "388_3", + "388_4", + "388_6", + "388_7", + "388_8", + "388_9", + "38_0", + "38_1", + "392_0", + "392_1", + "392_2", 
+ "392_3", + "392_4", + "392_5", + "392_6", + "392_7", + "394_0", + "394_1", + "394_2", + "395_15", + "397_2", + "398_1", + "398_2", + "398_3", + "398_4", + "398_6", + "399_10", + "399_11", + "399_12", + "399_13", + "399_17", + "399_18", + "399_19", + "39_0", + "39_1", + "39_13", + "39_14", + "39_3", + "39_5", + "39_8", + "400_12", + "402_5", + "406_1", + "406_10", + "406_12", + "406_13", + "406_14", + "406_2", + "406_3", + "406_5", + "406_6", + "406_7", + "406_9", + "407_10", + "407_11", + "407_12", + "407_15", + "407_17", + "407_5", + "407_7", + "408_0", + "408_1", + "411_1", + "411_2", + "411_3", + "411_4", + "411_5", + "412_0", + "412_10", + "412_11", + "412_13", + "412_14", + "412_16", + "412_17", + "412_18", + "412_4", + "412_5", + "412_6", + "412_7", + "412_9", + "413_0", + "413_1", + "413_2", + "413_4", + "413_5", + "413_8", + "416_0", + "416_1", + "416_2", + "41_11", + "420_1", + "420_10", + "420_12", + "420_13", + "420_14", + "420_15", + "420_16", + "420_17", + "420_18", + "420_19", + "420_3", + "420_4", + "420_5", + "420_9", + "421_0", + "421_1", + "421_10", + "421_11", + "421_12", + "421_13", + "421_14", + "421_15", + "421_17", + "421_18", + "421_19", + "421_20", + "421_22", + "421_23", + "421_3", + "421_4", + "421_6", + "421_7", + "421_8", + "421_9", + "422_0", + "422_10", + "422_13", + "422_14", + "422_18", + "422_19", + "422_2", + "422_20", + "422_21", + "422_22", + "422_25", + "422_3", + "422_5", + "422_7", + "422_8", + "422_9", + "424_1", + "425_0", + "425_1", + "425_14", + "425_15", + "425_16", + "425_18", + "425_19", + "425_2", + "425_20", + "425_23", + "425_24", + "425_25", + "425_3", + "425_4", + "425_7", + "426_12", + "426_13", + "426_15", + "426_17", + "426_18", + "426_22", + "426_24", + "426_29", + "426_4", + "426_6", + "438_1", + "43_5", + "43_6", + "43_7", + "43_8", + "43_9", + "440_10", + "445_1", + "445_2", + "445_6", + "445_7", + "445_8", + "448_12", + "448_13", + "448_14", + "448_3", + "448_5", + "449_0", + "449_2", + "449_3", + 
"449_4", + "449_5", + "449_6", + "449_7", + "449_8", + "449_9", + "452_0", + "452_1", + "452_11", + "452_13", + "452_15", + "452_16", + "452_17", + "452_18", + "452_19", + "452_2", + "452_6", + "452_7", + "452_9", + "453_0", + "453_1", + "453_10", + "453_11", + "453_13", + "453_15", + "453_16", + "453_17", + "453_2", + "453_3", + "453_4", + "453_5", + "453_6", + "453_7", + "453_8", + "454_0", + "454_1", + "455_16", + "455_18", + "459_32", + "459_34", + "459_35", + "459_37", + "459_38", + "459_39", + "459_40", + "460_1", + "460_10", + "460_11", + "460_13", + "460_15", + "460_16", + "460_18", + "460_19", + "460_2", + "460_20", + "460_21", + "460_22", + "460_24", + "460_25", + "460_26", + "460_27", + "460_28", + "460_5", + "460_6", + "465_12", + "465_5", + "467_13", + "467_5", + "467_6", + "467_7", + "467_8", + "469_1", + "469_10", + "469_11", + "469_12", + "469_13", + "469_14", + "469_15", + "469_16", + "469_17", + "469_18", + "469_19", + "469_2", + "469_4", + "469_5", + "469_7", + "469_8", + "470_1", + "470_5", + "471_0", + "471_1", + "471_11", + "471_13", + "471_14", + "471_17", + "471_19", + "471_2", + "471_6", + "471_8", + "472_1", + "472_10", + "472_12", + "472_14", + "472_15", + "472_2", + "472_4", + "472_5", + "472_7", + "472_8", + "472_9", + "473_1", + "473_2", + "473_3", + "474_0", + "476_4", + "47_0", + "482_0", + "482_1", + "482_10", + "482_11", + "482_13", + "482_14", + "482_19", + "482_2", + "482_20", + "482_22", + "482_3", + "482_4", + "482_6", + "482_8", + "482_9", + "483_11", + "483_12", + "483_13", + "483_15", + "483_16", + "483_17", + "485_1", + "485_2", + "485_3", + "485_4", + "486_0", + "486_10", + "486_2", + "486_4", + "486_8", + "486_9", + "487_0", + "487_1", + "487_2", + "487_3", + "487_4", + "488_0", + "488_2", + "488_3", + "488_5", + "488_6", + "488_7", + "490_18", + "490_22", + "490_25", + "490_27", + "490_3", + "490_6", + "490_8", + "490_9", + "492_19", + "492_20", + "493_1", + "493_10", + "493_12", + "493_2", + "493_3", + "493_6", + 
"493_9", + "494_1", + "494_3", + "494_7", + "499_0", + "499_2", + "499_3", + "499_4", + "499_5", + "499_7", + "49_0", + "49_1", + "49_10", + "49_11", + "49_12", + "49_13", + "49_14", + "49_16", + "49_17", + "49_18", + "49_3", + "49_4", + "49_5", + "49_8", + "508_10", + "508_13", + "508_19", + "508_9", + "514_10", + "514_11", + "514_13", + "514_2", + "514_3", + "514_6", + "515_0", + "515_1", + "515_10", + "515_14", + "515_15", + "515_2", + "515_3", + "515_4", + "515_6", + "515_7", + "515_8", + "517_2", + "517_3", + "517_4", + "518_10", + "518_11", + "518_13", + "518_14", + "518_15", + "518_16", + "518_2", + "518_3", + "518_34", + "518_36", + "518_38", + "518_4", + "518_40", + "518_41", + "518_5", + "518_6", + "518_8", + "521_1", + "521_10", + "521_11", + "521_12", + "521_2", + "521_3", + "521_6", + "521_7", + "521_8", + "524_0", + "528_0", + "528_1", + "530_11", + "530_12", + "530_13", + "530_14", + "530_3", + "530_4", + "530_6", + "530_7", + "53_0", + "53_1", + "53_2", + "54_1", + "54_2", + "55_11", + "55_12", + "55_14", + "55_16", + "55_2", + "55_22", + "55_5", + "55_6", + "58_10", + "58_4", + "59_0", + "5_11", + "5_12", + "5_13", + "5_2", + "5_4", + "5_6", + "5_9", + "60_0", + "60_1", + "60_10", + "60_11", + "60_12", + "60_13", + "60_14", + "60_15", + "60_17", + "60_18", + "60_19", + "60_20", + "60_21", + "60_22", + "60_26", + "60_3", + "60_4", + "60_7", + "62_0", + "62_10", + "62_11", + "62_12", + "62_2", + "62_3", + "62_4", + "62_5", + "62_6", + "62_7", + "62_8", + "62_9", + "64_5", + "65_1", + "65_10", + "65_11", + "65_14", + "65_15", + "65_16", + "65_2", + "65_5", + "65_6", + "65_7", + "65_9", + "66_1", + "66_10", + "66_11", + "66_12", + "66_13", + "66_15", + "66_16", + "66_2", + "66_3", + "66_4", + "66_5", + "66_6", + "66_7", + "66_8", + "66_9", + "68_11", + "68_12", + "68_15", + "68_5", + "68_6", + "68_9", + "69_0", + "69_11", + "69_12", + "69_5", + "69_7", + "69_8", + "69_9", + "70_0", + "70_10", + "70_11", + "70_12", + "70_13", + "70_14", + "70_2", + 
"70_3", + "70_4", + "70_5", + "70_7", + "73_0", + "76_15", + "78_0", + "78_10", + "78_11", + "78_12", + "78_13", + "78_14", + "78_15", + "78_17", + "78_2", + "78_20", + "78_21", + "78_23", + "78_25", + "78_3", + "78_4", + "78_5", + "81_16", + "81_24", + "81_3", + "81_7", + "82_1", + "82_10", + "82_3", + "82_5", + "82_6", + "82_7", + "82_8", + "82_9", + "83_0", + "83_12", + "83_6", + "83_7", + "83_8", + "84_2", + "85_0", + "85_10", + "85_11", + "85_12", + "85_18", + "85_2", + "85_21", + "85_22", + "85_23", + "85_4", + "85_5", + "85_6", + "85_8", + "85_9", + "86_1", + "86_10", + "86_12", + "86_14", + "86_15", + "86_16", + "86_18", + "86_2", + "86_21", + "86_25", + "86_26", + "86_4", + "86_5", + "86_6", + "86_8", + "86_9", + "87_1", + "87_10", + "87_11", + "87_12", + "87_14", + "87_15", + "87_17", + "87_18", + "87_2", + "87_3", + "87_4", + "87_6", + "87_7", + "87_8", + "8_1", + "90_4", + "92_0", + "92_1", + "92_11", + "92_12", + "92_2", + "92_4", + "92_5", + "92_6", + "92_7", + "92_8", + "92_9", + "97_2", + "98_10", + "98_11", + "98_12", + "98_14", + "98_17", + "98_18", + "98_2", + "98_3", + "98_4", + "98_7", + "98_8", + "98_9" + ], + "test_query_ids": [ + "0_0", + "0_1", + "0_10", + "0_11", + "0_12", + "0_13", + "0_14", + "0_15", + "0_16", + "0_17", + "0_18", + "0_19", + "0_2", + "0_20", + "0_21", + "0_22", + "0_23", + "0_24", + "0_25", + "0_26", + "0_27", + "0_28", + "0_29", + "0_3", + "0_30", + "0_31", + "0_4", + "0_5", + "0_6", + "0_7", + "0_8", + "0_9", + "100_5", + "100_6", + "100_7", + "104_0", + "104_1", + "104_5", + "104_6", + "106_0", + "106_1", + "106_10", + "106_11", + "106_12", + "106_2", + "106_3", + "106_4", + "106_5", + "106_6", + "106_7", + "106_8", + "106_9", + "113_0", + "113_1", + "113_2", + "113_4", + "120_0", + "120_1", + "120_10", + "120_11", + "120_12", + "120_13", + "120_14", + "120_15", + "120_16", + "120_17", + "120_2", + "120_3", + "120_4", + "120_5", + "120_6", + "120_7", + "120_8", + "120_9", + "121_0", + "121_1", + "121_2", + "121_3", + 
"121_4", + "121_5", + "121_6", + "121_7", + "121_8", + "123_0", + "123_1", + "123_10", + "123_11", + "123_14", + "123_15", + "123_16", + "123_17", + "123_18", + "123_2", + "123_3", + "123_4", + "123_5", + "123_6", + "123_7", + "123_8", + "123_9", + "125_12", + "125_13", + "125_14", + "125_15", + "125_16", + "125_20", + "125_21", + "125_22", + "125_23", + "125_24", + "125_3", + "125_4", + "125_5", + "125_6", + "125_7", + "127_1", + "127_11", + "127_12", + "127_2", + "127_4", + "128_0", + "128_1", + "128_10", + "128_11", + "128_12", + "128_13", + "128_14", + "128_15", + "128_16", + "128_17", + "128_18", + "128_19", + "128_2", + "128_20", + "128_21", + "128_22", + "128_23", + "128_24", + "128_25", + "128_26", + "128_27", + "128_3", + "128_4", + "128_5", + "128_6", + "128_7", + "128_8", + "128_9", + "129_1", + "129_2", + "129_3", + "130_13", + "130_15", + "130_16", + "130_17", + "131_0", + "131_1", + "131_2", + "131_3", + "131_4", + "131_5", + "131_6", + "14_0", + "14_1", + "14_10", + "14_11", + "14_12", + "14_13", + "14_14", + "14_15", + "14_16", + "14_17", + "14_2", + "14_3", + "14_4", + "14_5", + "14_6", + "14_7", + "14_8", + "14_9", + "17_0", + "17_1", + "17_10", + "17_11", + "17_12", + "17_13", + "17_14", + "17_15", + "17_16", + "17_17", + "17_18", + "17_2", + "17_3", + "17_4", + "17_5", + "17_6", + "17_7", + "17_8", + "17_9", + "18_11", + "18_12", + "19_0", + "19_1", + "19_2", + "19_3", + "19_4", + "19_5", + "19_6", + "19_7", + "19_8", + "19_9", + "23_10", + "23_15", + "23_16", + "23_19", + "23_2", + "23_21", + "23_22", + "23_23", + "23_24", + "23_25", + "23_27", + "23_3", + "23_4", + "23_5", + "23_6", + "23_7", + "23_8", + "26_0", + "26_1", + "26_10", + "26_11", + "26_12", + "26_2", + "26_3", + "26_4", + "26_5", + "26_6", + "26_7", + "26_8", + "26_9", + "27_0", + "28_0", + "28_1", + "28_2", + "28_4", + "28_5", + "2_0", + "2_1", + "2_10", + "2_11", + "2_12", + "2_13", + "2_14", + "2_17", + "2_18", + "2_19", + "2_2", + "2_3", + "2_4", + "2_5", + "2_6", + "2_7", + 
"2_8", + "2_9", + "30_0", + "30_1", + "30_10", + "30_11", + "30_12", + "30_13", + "30_14", + "30_2", + "30_3", + "30_4", + "30_5", + "30_6", + "30_7", + "30_8", + "30_9", + "31_0", + "31_1", + "31_10", + "31_11", + "31_2", + "31_3", + "31_4", + "31_5", + "31_6", + "31_7", + "31_8", + "31_9", + "36_0", + "36_1", + "36_10", + "36_11", + "36_12", + "36_13", + "36_14", + "36_15", + "36_16", + "36_17", + "36_18", + "36_19", + "36_2", + "36_20", + "36_21", + "36_22", + "36_23", + "36_24", + "36_25", + "36_26", + "36_28", + "36_29", + "36_3", + "36_4", + "36_5", + "36_6", + "36_7", + "36_8", + "36_9", + "37_10", + "37_8", + "37_9", + "3_0", + "3_1", + "3_2", + "3_3", + "3_4", + "3_5", + "3_6", + "3_8", + "3_9", + "41_0", + "41_1", + "41_2", + "41_3", + "41_4", + "42_10", + "42_11", + "42_12", + "42_18", + "42_20", + "42_3", + "42_8", + "42_9", + "44_0", + "44_1", + "44_10", + "44_11", + "44_12", + "44_13", + "44_14", + "44_15", + "44_16", + "44_18", + "44_19", + "44_2", + "44_20", + "44_22", + "44_23", + "44_24", + "44_25", + "44_3", + "44_4", + "44_5", + "44_6", + "44_7", + "44_8", + "44_9", + "47_0", + "47_1", + "47_10", + "47_11", + "47_12", + "47_13", + "47_14", + "47_15", + "47_16", + "47_17", + "47_18", + "47_19", + "47_2", + "47_20", + "47_3", + "47_4", + "47_5", + "47_6", + "47_7", + "47_8", + "47_9", + "49_0", + "49_1", + "49_10", + "49_11", + "49_13", + "49_2", + "49_3", + "49_4", + "49_5", + "49_6", + "49_7", + "49_8", + "49_9", + "4_0", + "4_1", + "4_10", + "4_11", + "4_12", + "4_13", + "4_14", + "4_15", + "4_16", + "4_17", + "4_18", + "4_19", + "4_2", + "4_3", + "4_4", + "4_5", + "4_6", + "4_7", + "4_8", + "4_9", + "50_0", + "50_1", + "50_2", + "50_3", + "50_4", + "50_5", + "50_6", + "50_7", + "50_8", + "50_9", + "52_2", + "55_0", + "55_1", + "55_2", + "55_3", + "57_4", + "5_0", + "5_1", + "5_2", + "61_0", + "61_1", + "61_10", + "61_11", + "61_12", + "61_13", + "61_14", + "61_15", + "61_16", + "61_17", + "61_18", + "61_19", + "61_2", + "61_20", + "61_21", + 
"61_22", + "61_23", + "61_24", + "61_3", + "61_4", + "61_5", + "61_6", + "61_7", + "61_8", + "61_9", + "62_0", + "62_1", + "62_2", + "62_3", + "62_4", + "62_5", + "62_6", + "62_7", + "62_8", + "62_9", + "65_0", + "66_6", + "67_10", + "67_11", + "67_12", + "67_13", + "67_4", + "67_5", + "67_6", + "67_9", + "68_3", + "68_4", + "68_5", + "68_6", + "68_7", + "69_1", + "69_10", + "69_14", + "69_4", + "69_6", + "6_0", + "6_1", + "6_2", + "6_3", + "6_4", + "6_5", + "6_6", + "6_7", + "6_8", + "6_9", + "70_1", + "70_10", + "70_11", + "70_13", + "70_14", + "70_15", + "70_16", + "70_17", + "70_18", + "70_19", + "70_2", + "70_21", + "70_3", + "70_4", + "70_5", + "70_6", + "70_7", + "70_8", + "70_9", + "71_0", + "71_1", + "71_10", + "71_11", + "71_12", + "71_13", + "71_15", + "71_16", + "71_17", + "71_18", + "71_19", + "71_2", + "71_3", + "71_4", + "71_5", + "71_6", + "71_7", + "71_8", + "71_9", + "72_10", + "72_11", + "72_12", + "72_13", + "72_14", + "72_15", + "73_0", + "73_1", + "73_2", + "73_3", + "73_4", + "73_5", + "73_6", + "73_7", + "73_8", + "73_9", + "74_0", + "74_1", + "74_10", + "74_2", + "74_3", + "74_4", + "74_5", + "74_6", + "74_7", + "74_8", + "74_9", + "75_1", + "75_3", + "75_4", + "75_5", + "75_6", + "75_7", + "75_8", + "77_0", + "77_1", + "77_10", + "77_2", + "77_3", + "77_4", + "77_5", + "77_6", + "77_7", + "77_8", + "79_0", + "79_1", + "79_10", + "79_11", + "79_2", + "79_3", + "79_4", + "79_5", + "79_6", + "79_7", + "79_9", + "80_0", + "80_1", + "80_10", + "80_11", + "80_12", + "80_13", + "80_14", + "80_18", + "80_2", + "80_24", + "80_3", + "80_4", + "80_5", + "80_6", + "80_7", + "80_8", + "80_9", + "81_0", + "81_1", + "82_0", + "86_1", + "88_0", + "88_1", + "88_10", + "88_2", + "88_4", + "88_5", + "88_6", + "88_7", + "88_8", + "88_9", + "89_0", + "89_1", + "89_10", + "89_11", + "89_12", + "89_13", + "89_14", + "89_15", + "89_16", + "89_17", + "89_18", + "89_2", + "89_3", + "89_4", + "89_5", + "89_6", + "89_7", + "89_8", + "89_9", + "8_10", + "8_14", + 
"90_17", + "90_18", + "91_0", + "91_1", + "91_2", + "91_3", + "91_4", + "91_5", + "91_6", + "91_7", + "92_0", + "92_1", + "92_10", + "92_11", + "92_12", + "92_13", + "92_14", + "92_2", + "92_3", + "92_4", + "92_5", + "92_6", + "92_7", + "92_8", + "92_9", + "95_0", + "95_1", + "95_10", + "95_11", + "95_12", + "95_13", + "95_14", + "95_15", + "95_16", + "95_17", + "95_18", + "95_19", + "95_2", + "95_3", + "95_4", + "95_5", + "95_6", + "95_7", + "95_8", + "95_9", + "96_0", + "96_1", + "96_2", + "96_3", + "96_4", + "96_5", + "96_6", + "97_0", + "97_1", + "97_2", + "97_3", + "98_0", + "98_1", + "98_2", + "98_3", + "98_4", + "98_5", + "98_6", + "98_7", + "98_8" + ] +} \ No newline at end of file diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/splits/sec_splits.json b/cosmos-retriever/src/cosmos_retriever/datagen/splits/sec_splits.json new file mode 100644 index 0000000..9f0937a --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/datagen/splits/sec_splits.json @@ -0,0 +1,4685 @@ +{ + "dataset": "sec", + "total_queries": 4084, + "train_queries": 3453, + "test_queries": 1216, + "sft_queries": 1035, + "rl_queries": 2418, + "sft_ratio": 0.2997393570807993, + "rl_ratio": 0.7002606429192006, + "sft_query_ids": [ + "1000_1", + "1001_0", + "1002_0", + "1004_1", + "1005_0", + "1006_1", + "1007_1", + "1007_2", + "1010_2", + "1011_0", + "1012_1", + "1013_0", + "1013_1", + "1014_0", + "1016_0", + "1018_2", + "101_0", + "1020_1", + "1020_2", + "1023_0", + "1023_1", + "1024_0", + "1025_2", + "1027_0", + "1027_2", + "1028_0", + "102_0", + "1033_0", + "1034_1", + "1036_0", + "1036_2", + "1042_0", + "1043_1", + "1045_0", + "1045_2", + "1047_0", + "1051_0", + "1052_0", + "1053_0", + "1053_1", + "1056_1", + "1056_2", + "1057_0", + "1063_0", + "1068_0", + "106_0", + "1071_0", + "1073_1", + "1074_1", + "1075_0", + "1077_0", + "1082_0", + "1082_1", + "1090_0", + "1090_1", + "1091_1", + "1092_0", + "1092_1", + "1097_1", + "1098_0", + "1099_0", + "10_0", + "1104_1", + "1105_0", 
+ "1106_1", + "1107_1", + "1111_0", + "1112_0", + "1113_0", + "1115_0", + "1115_1", + "1117_1", + "1120_0", + "1122_0", + "1124_1", + "1125_0", + "1127_1", + "1129_0", + "112_0", + "1132_0", + "1133_0", + "1135_1", + "1137_0", + "1143_0", + "1144_1", + "1145_1", + "1148_0", + "1149_0", + "1154_1", + "1157_0", + "1157_1", + "1161_0", + "1164_0", + "1164_1", + "1166_1", + "116_0", + "1170_0", + "1170_1", + "1173_0", + "1173_1", + "1176_0", + "1176_1", + "1178_0", + "117_0", + "1183_0", + "1183_1", + "1185_1", + "1186_0", + "1188_0", + "1190_0", + "1193_1", + "1196_1", + "119_0", + "11_0", + "1200_1", + "1205_0", + "1207_0", + "1208_0", + "1208_1", + "1209_1", + "1210_0", + "1213_1", + "1215_0", + "1216_1", + "1219_1", + "1221_1", + "1222_0", + "1225_0", + "1226_0", + "1226_1", + "1227_0", + "1229_1", + "1231_1", + "1233_0", + "1238_1", + "1243_0", + "1244_0", + "1245_0", + "1246_1", + "1247_0", + "1247_1", + "1248_1", + "1249_1", + "1250_0", + "1252_1", + "1253_0", + "1257_0", + "1258_0", + "1259_0", + "125_0", + "1260_0", + "1261_0", + "1261_1", + "1263_1", + "1266_1", + "1269_0", + "1269_1", + "1271_1", + "1272_1", + "1273_0", + "1275_0", + "1275_1", + "1280_1", + "1281_1", + "1283_0", + "1286_1", + "1287_0", + "1289_0", + "1290_1", + "1291_1", + "1293_1", + "1296_0", + "1296_1", + "1298_1", + "1300_1", + "1301_1", + "1306_1", + "1307_0", + "1308_1", + "1309_0", + "1313_1", + "1314_0", + "1317_0", + "1319_0", + "1320_1", + "1322_0", + "1322_1", + "1323_0", + "1325_0", + "1329_1", + "1337_1", + "1338_1", + "1343_1", + "1344_1", + "1346_0", + "1348_1", + "134_0", + "1350_1", + "1351_1", + "1353_0", + "1353_1", + "1356_1", + "1358_0", + "1360_0", + "1360_1", + "1365_1", + "1367_1", + "1369_0", + "136_0", + "1371_0", + "1380_0", + "1380_1", + "1382_0", + "1386_0", + "1391_0", + "1392_0", + "1393_0", + "1394_1", + "1396_0", + "1396_1", + "1399_1", + "1405_0", + "1405_1", + "1406_1", + "1409_0", + "140_0", + "1410_1", + "1413_0", + "1415_0", + "1417_0", + "1418_0", + 
"1418_1", + "1421_0", + "1423_0", + "1429_1", + "1430_0", + "143_0", + "1440_0", + "1447_0", + "1448_0", + "1451_1", + "1456_1", + "1457_1", + "1458_0", + "1460_1", + "1461_0", + "1464_0", + "1465_1", + "1467_0", + "1470_1", + "1471_1", + "1473_0", + "1475_0", + "1477_0", + "1478_1", + "1479_1", + "1481_0", + "1482_0", + "1484_0", + "1486_1", + "1490_0", + "1493_0", + "1493_1", + "1497_0", + "1497_1", + "1498_1", + "14_0", + "1500_0", + "1503_1", + "1509_0", + "1510_0", + "1511_1", + "1514_0", + "1514_1", + "1515_0", + "1516_0", + "1517_1", + "1519_1", + "1522_0", + "1523_1", + "1528_0", + "1529_1", + "152_0", + "1530_0", + "1531_1", + "1533_1", + "1535_0", + "1537_1", + "1541_0", + "1542_0", + "1543_0", + "1548_1", + "1553_0", + "1553_1", + "1558_1", + "1560_1", + "1568_0", + "1574_0", + "1577_1", + "1581_0", + "1583_0", + "1584_0", + "1585_0", + "1590_0", + "1591_0", + "1597_0", + "1597_1", + "159_0", + "1602_0", + "1603_1", + "1605_0", + "1607_0", + "1609_1", + "160_0", + "1610_0", + "1611_0", + "1612_1", + "1615_0", + "1615_1", + "1616_1", + "1619_1", + "1620_1", + "1624_0", + "1628_0", + "1628_1", + "1629_1", + "1633_0", + "1633_1", + "1635_1", + "1638_0", + "1645_0", + "1652_0", + "1655_0", + "1655_1", + "1657_0", + "1659_0", + "1661_0", + "1666_0", + "1667_1", + "1668_1", + "1670_0", + "1671_0", + "1672_0", + "1672_1", + "1675_0", + "1676_0", + "1677_1", + "1678_0", + "167_0", + "1683_0", + "1686_0", + "1688_0", + "1691_0", + "1691_1", + "1700_0", + "1702_0", + "1704_0", + "1705_0", + "170_0", + "1713_0", + "1714_0", + "1717_0", + "171_0", + "1720_0", + "1721_0", + "1727_0", + "1728_1", + "172_0", + "1730_0", + "1732_0", + "1734_1", + "1735_0", + "1737_1", + "1738_0", + "1739_1", + "1748_0", + "1750_0", + "1753_0", + "1754_0", + "1755_0", + "1760_1", + "1761_0", + "1765_0", + "1767_1", + "176_0", + "1770_0", + "1771_0", + "1772_0", + "1774_0", + "1775_0", + "1778_0", + "1784_1", + "1787_0", + "1787_1", + "1788_0", + "1789_0", + "1789_1", + "1790_0", + 
"1793_1", + "1794_0", + "1796_0", + "179_0", + "1802_0", + "1809_0", + "1813_0", + "1815_0", + "1815_1", + "1816_0", + "1819_0", + "1821_0", + "1823_0", + "1824_1", + "1829_0", + "1830_1", + "1839_0", + "1848_1", + "1850_1", + "1851_1", + "1853_0", + "1855_0", + "1856_0", + "1857_0", + "1864_0", + "1869_1", + "1871_0", + "1872_0", + "1873_1", + "1874_0", + "1881_0", + "1885_1", + "1889_0", + "188_0", + "1890_0", + "1892_0", + "1893_0", + "1895_0", + "1896_0", + "1897_0", + "1897_1", + "1904_1", + "1905_0", + "190_0", + "1910_1", + "1914_1", + "1915_1", + "1918_0", + "1923_1", + "1924_1", + "1926_0", + "1929_0", + "1930_0", + "1939_0", + "1942_0", + "1944_0", + "1947_0", + "1956_0", + "1957_1", + "1962_1", + "1963_0", + "1964_1", + "1965_0", + "1967_1", + "1968_1", + "1970_1", + "1971_0", + "1972_1", + "1973_0", + "1973_1", + "1975_0", + "1982_1", + "1994_0", + "1994_1", + "1996_1", + "1_0", + "2002_0", + "2003_1", + "2004_0", + "2004_1", + "2005_1", + "2006_0", + "2007_0", + "2010_0", + "2010_1", + "2013_0", + "2016_0", + "2016_1", + "2019_0", + "201_0", + "2020_0", + "2023_1", + "2025_1", + "2026_0", + "2026_1", + "2028_1", + "2029_1", + "2031_0", + "2036_0", + "2037_0", + "2037_1", + "2038_0", + "2038_1", + "2039_0", + "203_0", + "2041_1", + "2043_1", + "2046_1", + "2047_1", + "2048_0", + "2048_1", + "2053_0", + "2055_0", + "2057_1", + "2060_1", + "2063_1", + "2064_0", + "2069_0", + "2069_1", + "2073_0", + "2073_1", + "2075_0", + "2075_1", + "2076_0", + "2076_1", + "2080_0", + "2081_0", + "2082_1", + "2084_0", + "2085_0", + "2089_0", + "2092_1", + "2097_0", + "2098_0", + "2099_0", + "2105_1", + "2108_0", + "2112_1", + "2113_1", + "2114_0", + "2114_1", + "2115_1", + "2116_1", + "2118_1", + "211_0", + "2123_1", + "2126_1", + "2127_0", + "2129_0", + "2134_0", + "2138_0", + "213_0", + "2140_0", + "2142_1", + "2143_0", + "2144_0", + "2145_0", + "2145_1", + "2147_0", + "2149_0", + "214_0", + "2152_0", + "2152_1", + "2154_1", + "2161_1", + "2167_0", + "2170_1", + 
"2171_0", + "2174_0", + "2176_0", + "2177_0", + "2181_0", + "2181_1", + "2183_0", + "2187_0", + "21_0", + "2200_0", + "2202_1", + "2203_0", + "2215_0", + "2216_1", + "2217_0", + "221_0", + "2221_0", + "2223_0", + "2225_0", + "2230_0", + "2233_0", + "2234_0", + "2236_0", + "2238_0", + "2240_0", + "2242_0", + "2243_0", + "2250_0", + "2254_0", + "2255_0", + "2259_0", + "2260_0", + "2264_0", + "2269_0", + "2272_0", + "2274_0", + "2275_0", + "2278_0", + "2281_0", + "2282_0", + "2289_0", + "228_0", + "2291_0", + "2297_0", + "2300_0", + "2315_0", + "2318_0", + "2324_0", + "2327_0", + "2328_0", + "2331_0", + "2344_0", + "2347_0", + "234_1", + "2354_0", + "2357_0", + "2358_0", + "2360_0", + "2367_0", + "2370_0", + "2379_0", + "2380_0", + "2383_0", + "2385_0", + "2386_0", + "239_0", + "239_2", + "23_0", + "2400_0", + "2403_0", + "2405_0", + "2408_0", + "2411_0", + "2413_0", + "2418_0", + "241_0", + "2424_0", + "2431_0", + "2433_0", + "2437_0", + "2438_0", + "2443_0", + "2445_0", + "2447_0", + "2449_0", + "2463_0", + "2464_0", + "2470_0", + "2471_0", + "2472_0", + "2477_0", + "2478_0", + "2479_0", + "2483_0", + "2486_0", + "2487_0", + "24_0", + "2510_0", + "2511_0", + "2513_0", + "2521_0", + "2531_0", + "253_0", + "253_2", + "2544_0", + "254_0", + "2552_0", + "2557_0", + "2559_0", + "2560_0", + "2563_0", + "2567_0", + "2572_0", + "2575_0", + "2588_0", + "2590_0", + "2592_0", + "2593_0", + "259_1", + "25_0", + "265_0", + "265_2", + "266_0", + "268_0", + "269_0", + "271_0", + "272_0", + "279_0", + "27_0", + "280_0", + "286_0", + "28_0", + "290_1", + "292_0", + "297_0", + "303_0", + "305_0", + "30_0", + "311_0", + "313_0", + "314_1", + "316_0", + "327_1", + "329_0", + "331_0", + "333_0", + "333_1", + "336_0", + "337_0", + "338_0", + "339_0", + "339_1", + "340_0", + "343_0", + "348_1", + "348_2", + "349_0", + "34_0", + "350_0", + "351_0", + "355_0", + "356_2", + "365_1", + "367_0", + "36_0", + "379_0", + "380_1", + "382_0", + "382_1", + "382_2", + "389_1", + "391_0", + "392_0", + 
"395_1", + "39_0", + "404_0", + "407_0", + "40_0", + "412_0", + "413_0", + "426_0", + "427_0", + "427_1", + "428_0", + "430_0", + "430_1", + "431_1", + "431_2", + "432_0", + "432_1", + "432_2", + "436_0", + "437_1", + "438_0", + "439_0", + "43_0", + "440_0", + "440_1", + "442_0", + "442_2", + "443_0", + "449_0", + "449_1", + "452_2", + "453_2", + "454_0", + "459_0", + "45_0", + "461_0", + "467_0", + "473_1", + "476_0", + "479_0", + "482_1", + "482_2", + "495_0", + "495_2", + "499_0", + "4_0", + "500_0", + "501_1", + "505_0", + "509_1", + "510_0", + "510_1", + "511_0", + "514_0", + "516_0", + "517_0", + "518_0", + "51_0", + "525_0", + "527_0", + "529_0", + "52_0", + "533_0", + "536_0", + "539_1", + "539_2", + "540_0", + "542_0", + "553_0", + "557_0", + "55_0", + "563_1", + "570_0", + "573_0", + "576_1", + "576_2", + "57_0", + "580_0", + "580_1", + "586_1", + "589_0", + "58_0", + "590_0", + "591_0", + "592_0", + "596_0", + "598_1", + "599_0", + "59_0", + "603_0", + "612_0", + "613_0", + "615_0", + "618_2", + "619_0", + "619_2", + "620_0", + "622_0", + "627_0", + "629_0", + "62_0", + "635_1", + "637_1", + "639_0", + "639_1", + "644_0", + "649_0", + "651_0", + "652_0", + "653_0", + "653_2", + "654_1", + "654_2", + "658_0", + "660_0", + "664_0", + "665_0", + "666_2", + "667_0", + "667_2", + "668_1", + "669_0", + "675_0", + "678_1", + "680_0", + "681_2", + "684_0", + "686_0", + "688_0", + "690_0", + "696_0", + "697_2", + "698_0", + "6_0", + "700_0", + "704_0", + "707_0", + "709_1", + "714_0", + "715_0", + "717_0", + "71_0", + "721_1", + "725_0", + "729_2", + "738_0", + "73_0", + "742_2", + "743_0", + "743_1", + "743_2", + "744_2", + "745_0", + "749_0", + "74_0", + "751_1", + "753_2", + "754_1", + "755_0", + "756_0", + "756_1", + "762_2", + "763_0", + "763_1", + "764_0", + "764_1", + "765_1", + "768_0", + "76_0", + "770_1", + "771_0", + "772_0", + "774_0", + "774_1", + "774_2", + "775_2", + "776_0", + "777_0", + "778_0", + "779_0", + "779_1", + "77_2", + "780_0", + 
"781_2", + "782_0", + "783_0", + "783_2", + "786_0", + "796_2", + "800_0", + "802_0", + "806_0", + "806_1", + "809_0", + "80_0", + "80_1", + "814_0", + "816_0", + "816_1", + "818_0", + "820_0", + "820_1", + "820_2", + "825_0", + "826_0", + "828_0", + "831_1", + "836_0", + "845_0", + "847_0", + "84_0", + "857_0", + "857_1", + "858_0", + "863_2", + "866_0", + "867_0", + "868_0", + "869_0", + "86_0", + "874_0", + "876_0", + "876_1", + "876_2", + "878_0", + "879_0", + "879_1", + "882_0", + "884_0", + "887_0", + "887_1", + "890_0", + "891_1", + "892_0", + "893_2", + "894_1", + "899_0", + "904_0", + "909_0", + "913_0", + "916_1", + "917_1", + "919_2", + "921_0", + "924_2", + "925_0", + "927_0", + "929_0", + "930_2", + "931_0", + "932_0", + "934_0", + "936_0", + "936_2", + "937_1", + "939_0", + "940_1", + "942_1", + "942_2", + "943_0", + "944_1", + "945_0", + "946_1", + "94_0", + "950_0", + "952_1", + "952_2", + "954_0", + "955_1", + "957_2", + "959_1", + "959_2", + "95_0", + "962_1", + "963_1", + "967_0", + "968_2", + "969_1", + "972_1", + "973_1", + "973_2", + "977_0", + "977_2", + "982_1", + "983_0", + "984_2", + "985_2", + "987_1", + "988_2", + "990_1", + "995_1", + "995_2", + "996_1", + "997_0", + "998_0", + "998_1", + "99_0" + ], + "rl_query_ids": [ + "0_0", + "1000_0", + "1000_2", + "1001_1", + "1001_2", + "1002_1", + "1002_2", + "1003_0", + "1004_0", + "1004_2", + "1006_0", + "1006_2", + "1007_0", + "1009_0", + "100_0", + "1010_0", + "1010_1", + "1011_1", + "1011_2", + "1012_0", + "1012_2", + "1014_1", + "1014_2", + "1015_0", + "1015_1", + "1015_2", + "1018_0", + "1018_1", + "1019_0", + "1019_1", + "1020_0", + "1021_0", + "1022_0", + "1022_1", + "1022_2", + "1024_1", + "1024_2", + "1025_0", + "1025_1", + "1026_0", + "1026_1", + "1026_2", + "1027_1", + "1030_0", + "1030_1", + "1030_2", + "1031_0", + "1031_1", + "1032_0", + "1032_1", + "1032_2", + "1033_1", + "1033_2", + "1034_0", + "1035_0", + "1035_1", + "1036_1", + "1038_0", + "1038_1", + "1038_2", + "103_0", + 
"1040_0", + "1041_0", + "1041_1", + "1041_2", + "1043_0", + "1043_2", + "1045_1", + "1046_0", + "1047_1", + "1047_2", + "1048_0", + "1049_0", + "104_0", + "1050_0", + "1050_1", + "1050_2", + "1051_1", + "1051_2", + "1054_0", + "1054_1", + "1054_2", + "1056_0", + "1057_1", + "1057_2", + "1059_0", + "1059_1", + "1059_2", + "105_0", + "1060_0", + "1060_1", + "1061_0", + "1061_1", + "1061_2", + "1062_0", + "1062_1", + "1064_0", + "1064_1", + "1064_2", + "1066_0", + "1066_1", + "1067_0", + "1068_1", + "1068_2", + "1070_0", + "1070_1", + "1071_1", + "1072_0", + "1072_1", + "1073_0", + "1074_0", + "1076_0", + "1076_1", + "1077_1", + "1078_0", + "1078_1", + "107_0", + "1080_0", + "1080_1", + "1081_0", + "1081_1", + "1084_0", + "1085_0", + "1086_0", + "1087_0", + "1087_1", + "1089_0", + "1089_1", + "108_0", + "1091_0", + "1093_0", + "1093_1", + "1094_0", + "1094_1", + "1095_0", + "1095_1", + "1097_0", + "1098_1", + "109_0", + "1103_0", + "1103_1", + "1104_0", + "1105_1", + "1106_0", + "1107_0", + "1108_0", + "110_0", + "1110_0", + "1110_1", + "1111_1", + "1113_1", + "1114_0", + "1114_1", + "1117_0", + "1118_0", + "1118_1", + "1119_0", + "1119_1", + "111_0", + "1120_1", + "1121_0", + "1121_1", + "1123_0", + "1124_0", + "1126_0", + "1127_0", + "1128_0", + "1128_1", + "1129_1", + "1131_0", + "1131_1", + "1132_1", + "1133_1", + "1134_0", + "1134_1", + "1135_0", + "1136_0", + "1137_1", + "1138_0", + "1138_1", + "1139_0", + "1139_1", + "113_0", + "1140_0", + "1140_1", + "1141_0", + "1141_1", + "1142_0", + "1142_1", + "1144_0", + "1145_0", + "1147_0", + "1147_1", + "1148_1", + "114_0", + "1150_0", + "1151_0", + "1151_1", + "1153_0", + "1154_0", + "1155_0", + "1155_1", + "1156_0", + "1156_1", + "1158_0", + "1158_1", + "1159_0", + "115_0", + "1161_1", + "1163_0", + "1163_1", + "1166_0", + "1168_0", + "1172_0", + "1172_1", + "1175_0", + "1175_1", + "1177_0", + "1178_1", + "1179_0", + "1180_0", + "1180_1", + "1181_0", + "1181_1", + "1182_0", + "1182_1", + "1185_0", + "1186_1", + 
"1188_1", + "1189_0", + "1189_1", + "118_0", + "1190_1", + "1191_0", + "1192_0", + "1192_1", + "1193_0", + "1194_0", + "1194_1", + "1195_0", + "1196_0", + "1197_0", + "1197_1", + "1198_0", + "1198_1", + "1199_0", + "1199_1", + "1200_0", + "1201_0", + "1203_0", + "1203_1", + "1204_0", + "1204_1", + "1206_0", + "1206_1", + "1207_1", + "1209_0", + "120_0", + "1210_1", + "1213_0", + "1215_1", + "1216_0", + "1217_0", + "1217_1", + "1218_0", + "1218_1", + "1219_0", + "121_0", + "1221_0", + "1222_1", + "1224_0", + "1228_0", + "1228_1", + "1229_0", + "1230_0", + "1231_0", + "1232_0", + "1233_1", + "1234_0", + "1234_1", + "1235_0", + "1235_1", + "1236_0", + "1237_0", + "1237_1", + "1238_0", + "1239_0", + "123_0", + "1240_0", + "1243_1", + "1245_1", + "1246_0", + "1248_0", + "1249_0", + "124_0", + "1251_0", + "1251_1", + "1252_0", + "1255_0", + "1256_0", + "1256_1", + "1258_1", + "1259_1", + "1260_1", + "1263_0", + "1264_0", + "1264_1", + "1265_0", + "1265_1", + "1266_0", + "126_0", + "1270_0", + "1270_1", + "1271_0", + "1272_0", + "1273_1", + "1274_0", + "1276_0", + "1276_1", + "1277_0", + "1277_1", + "1279_0", + "1280_0", + "1281_0", + "1282_0", + "1282_1", + "1283_1", + "1284_0", + "1284_1", + "1285_0", + "1285_1", + "1286_0", + "1287_1", + "1289_1", + "128_0", + "1290_0", + "1291_0", + "1292_0", + "1292_1", + "1293_0", + "1294_0", + "1294_1", + "1295_0", + "1297_0", + "1297_1", + "1298_0", + "129_0", + "1300_0", + "1301_0", + "1302_0", + "1302_1", + "1303_0", + "1303_1", + "1304_0", + "1304_1", + "1305_0", + "1305_1", + "1306_0", + "1307_1", + "1308_0", + "130_0", + "1310_0", + "1310_1", + "1311_0", + "1311_1", + "1312_0", + "1312_1", + "1313_0", + "1314_1", + "1315_0", + "1315_1", + "1316_0", + "1316_1", + "1317_1", + "1318_0", + "1318_1", + "1319_1", + "131_0", + "1320_0", + "1321_0", + "1321_1", + "1323_1", + "1324_0", + "1325_1", + "1326_0", + "1326_1", + "1328_0", + "1329_0", + "1330_0", + "1330_1", + "1331_0", + "1331_1", + "1332_0", + "1332_1", + "1333_0", + 
"1334_0", + "1334_1", + "1335_0", + "1335_1", + "1336_0", + "1336_1", + "1337_0", + "1338_0", + "133_0", + "1340_0", + "1340_1", + "1341_0", + "1342_0", + "1343_0", + "1344_0", + "1346_1", + "1347_0", + "1347_1", + "1348_0", + "1350_0", + "1351_0", + "1352_0", + "1352_1", + "1354_0", + "1354_1", + "1356_0", + "1357_0", + "1358_1", + "135_0", + "1361_0", + "1361_1", + "1362_0", + "1362_1", + "1364_0", + "1364_1", + "1365_0", + "1366_0", + "1366_1", + "1367_0", + "1369_1", + "1370_0", + "1371_1", + "1372_0", + "1372_1", + "1374_0", + "1375_0", + "1375_1", + "1376_0", + "1376_1", + "1377_0", + "1377_1", + "137_0", + "1381_0", + "1381_1", + "1382_1", + "1384_0", + "1384_1", + "1385_0", + "1385_1", + "1386_1", + "1387_0", + "1389_0", + "138_0", + "1390_0", + "1391_1", + "1394_0", + "1395_0", + "1395_1", + "1397_0", + "1397_1", + "1399_0", + "13_0", + "1400_0", + "1401_0", + "1401_1", + "1402_0", + "1402_1", + "1403_0", + "1403_1", + "1404_0", + "1404_1", + "1406_0", + "1407_0", + "1407_1", + "1408_0", + "1408_1", + "1409_1", + "1410_0", + "1411_0", + "1411_1", + "1412_0", + "1415_1", + "1416_0", + "1417_1", + "141_0", + "1420_0", + "1422_0", + "1422_1", + "1424_0", + "1425_0", + "1425_1", + "1427_0", + "1427_1", + "1428_0", + "1428_1", + "1429_0", + "142_0", + "1430_1", + "1431_0", + "1431_1", + "1433_0", + "1433_1", + "1434_0", + "1434_1", + "1435_0", + "1435_1", + "1436_0", + "1436_1", + "1437_0", + "1438_0", + "1438_1", + "1439_0", + "1439_1", + "1441_0", + "1441_1", + "1442_0", + "1442_1", + "1443_0", + "1443_1", + "1444_0", + "1446_0", + "1446_1", + "1447_1", + "1448_1", + "1449_0", + "1449_1", + "144_0", + "1450_0", + "1450_1", + "1451_0", + "1452_0", + "1452_1", + "1453_0", + "1453_1", + "1454_0", + "1454_1", + "1455_0", + "1456_0", + "1457_0", + "1458_1", + "1459_0", + "145_0", + "1460_0", + "1461_1", + "1462_0", + "1462_1", + "1463_0", + "1464_1", + "1465_0", + "1466_0", + "1466_1", + "1467_1", + "1468_0", + "1468_1", + "1469_0", + "1470_0", + "1471_0", + 
"1472_0", + "1472_1", + "1475_1", + "1477_1", + "1478_0", + "1479_0", + "147_0", + "1480_0", + "1480_1", + "1481_1", + "1482_1", + "1484_1", + "1485_0", + "1486_0", + "1487_0", + "1488_0", + "1488_1", + "1489_0", + "1489_1", + "148_0", + "1490_1", + "1491_0", + "1491_1", + "1492_0", + "1492_1", + "1494_0", + "1494_1", + "1495_0", + "1495_1", + "1498_0", + "149_0", + "1500_1", + "1501_0", + "1501_1", + "1503_0", + "1504_0", + "1504_1", + "1507_0", + "1507_1", + "150_0", + "1510_1", + "1511_0", + "1512_0", + "1512_1", + "1513_0", + "1515_1", + "1516_1", + "1517_0", + "1519_0", + "1520_0", + "1520_1", + "1521_0", + "1521_1", + "1522_1", + "1523_0", + "1527_0", + "1528_1", + "1529_0", + "1530_1", + "1531_0", + "1533_0", + "1534_0", + "1534_1", + "1535_1", + "1536_0", + "1536_1", + "1537_0", + "1538_0", + "1538_1", + "1539_0", + "1539_1", + "153_0", + "1540_0", + "1540_1", + "1541_1", + "1542_1", + "1543_1", + "1545_0", + "1545_1", + "1546_0", + "1547_0", + "1547_1", + "1548_0", + "1549_0", + "1549_1", + "154_0", + "1550_0", + "1550_1", + "1554_0", + "1554_1", + "1555_0", + "1555_1", + "1556_0", + "1556_1", + "1557_0", + "1557_1", + "1558_0", + "1559_0", + "1559_1", + "155_0", + "1560_0", + "1561_0", + "1561_1", + "1562_0", + "1562_1", + "1563_0", + "1563_1", + "1564_0", + "1564_1", + "1566_0", + "1566_1", + "1567_0", + "1567_1", + "1568_1", + "1569_0", + "1569_1", + "156_0", + "1572_0", + "1572_1", + "1573_0", + "1573_1", + "1574_1", + "1575_0", + "1575_1", + "1577_0", + "1578_0", + "1578_1", + "1579_0", + "157_0", + "1580_0", + "1580_1", + "1581_1", + "1582_0", + "1583_1", + "1584_1", + "1585_1", + "1587_0", + "1587_1", + "1589_0", + "158_0", + "1592_0", + "1592_1", + "1593_0", + "1594_0", + "1594_1", + "1596_0", + "1596_1", + "1599_0", + "1599_1", + "1600_0", + "1600_1", + "1602_1", + "1603_0", + "1605_1", + "1606_0", + "1606_1", + "1607_1", + "1608_0", + "1608_1", + "1609_0", + "1610_1", + "1611_1", + "1612_0", + "1614_0", + "1614_1", + "1616_0", + "1617_0", + 
"1617_1", + "1619_0", + "161_0", + "1620_0", + "1621_0", + "1621_1", + "1622_0", + "1622_1", + "1623_0", + "1625_0", + "1625_1", + "1626_0", + "1626_1", + "1627_0", + "1627_1", + "1629_0", + "162_0", + "1630_0", + "1631_0", + "1632_0", + "1632_1", + "1635_0", + "1636_0", + "1636_1", + "1638_1", + "1639_0", + "163_0", + "1641_0", + "1642_0", + "1642_1", + "1643_0", + "1643_1", + "1644_0", + "1644_1", + "1645_1", + "1646_0", + "1646_1", + "1647_0", + "1647_1", + "1649_0", + "164_0", + "1650_0", + "1650_1", + "1651_0", + "1651_1", + "1652_1", + "1653_0", + "1654_0", + "1654_1", + "1656_0", + "1656_1", + "1657_1", + "1658_0", + "1658_1", + "1659_1", + "165_0", + "1660_0", + "1660_1", + "1661_1", + "1662_0", + "1662_1", + "1664_0", + "1664_1", + "1665_0", + "1665_1", + "1666_1", + "1667_0", + "1668_0", + "1669_0", + "1669_1", + "1673_0", + "1673_1", + "1674_0", + "1675_1", + "1677_0", + "1678_1", + "1679_0", + "1679_1", + "1681_0", + "1681_1", + "1682_0", + "1683_1", + "1684_0", + "1684_1", + "1685_0", + "1685_1", + "1686_1", + "1687_0", + "168_0", + "1690_0", + "1692_0", + "1692_1", + "1693_0", + "1693_1", + "1694_0", + "1695_0", + "1695_1", + "1696_0", + "1697_0", + "1697_1", + "1698_0", + "1699_0", + "1699_1", + "169_0", + "1704_1", + "1706_0", + "1706_1", + "1707_0", + "1708_0", + "1708_1", + "1709_0", + "1710_0", + "1710_1", + "1712_0", + "1716_0", + "1718_0", + "1719_0", + "1722_0", + "1725_0", + "1726_0", + "1727_1", + "1728_0", + "1729_0", + "1729_1", + "1731_0", + "1731_1", + "1733_0", + "1734_0", + "1735_1", + "1737_0", + "1738_1", + "1739_0", + "1740_0", + "1740_1", + "1741_0", + "1742_0", + "1742_1", + "1743_0", + "1743_1", + "1746_0", + "1746_1", + "1747_0", + "1747_1", + "1748_1", + "1749_0", + "1749_1", + "174_0", + "1750_1", + "1751_0", + "1751_1", + "1752_0", + "1753_1", + "1756_0", + "1757_0", + "1757_1", + "1759_0", + "175_0", + "1760_0", + "1765_1", + "1766_0", + "1766_1", + "1767_0", + "1768_0", + "1768_1", + "1769_0", + "1769_1", + "1772_1", + 
"1776_0", + "1779_0", + "1779_1", + "177_0", + "1780_0", + "1780_1", + "1782_0", + "1783_0", + "1784_0", + "1785_0", + "1785_1", + "1786_0", + "1786_1", + "1788_1", + "178_0", + "1790_1", + "1791_0", + "1792_0", + "1792_1", + "1793_0", + "1794_1", + "1796_1", + "1797_0", + "1797_1", + "1798_0", + "1798_1", + "1799_0", + "1800_0", + "1803_0", + "1804_0", + "1805_0", + "1807_0", + "1808_0", + "1809_1", + "180_0", + "1810_0", + "1811_0", + "1811_1", + "1812_0", + "1813_1", + "1816_1", + "1817_0", + "1818_0", + "1818_1", + "1819_1", + "181_0", + "1820_0", + "1820_1", + "1821_1", + "1822_0", + "1823_1", + "1824_0", + "1825_0", + "1825_1", + "1829_1", + "182_0", + "1830_0", + "1831_0", + "1831_1", + "1832_0", + "1832_1", + "1833_0", + "1833_1", + "1835_0", + "1835_1", + "1837_0", + "1838_0", + "183_0", + "1840_0", + "1842_0", + "1844_0", + "1845_0", + "1845_1", + "1846_0", + "1848_0", + "1849_0", + "1849_1", + "184_0", + "1850_0", + "1851_0", + "1852_0", + "1852_1", + "1853_1", + "1854_0", + "1854_1", + "1855_1", + "1856_1", + "1858_0", + "1859_0", + "185_0", + "1860_0", + "1861_0", + "1862_0", + "1863_0", + "1863_1", + "1865_0", + "1866_0", + "1866_1", + "1869_0", + "186_0", + "1870_0", + "1870_1", + "1871_1", + "1872_1", + "1873_0", + "1874_1", + "1875_0", + "1875_1", + "1877_0", + "1877_1", + "1878_0", + "1878_1", + "1879_0", + "187_0", + "1880_0", + "1880_1", + "1881_1", + "1882_0", + "1882_1", + "1883_0", + "1883_1", + "1885_0", + "1886_0", + "1887_0", + "1887_1", + "1891_0", + "1894_0", + "1894_1", + "1895_1", + "1899_0", + "189_0", + "18_0", + "1900_0", + "1901_0", + "1902_0", + "1902_1", + "1903_0", + "1903_1", + "1904_0", + "1906_0", + "1906_1", + "1907_0", + "1907_1", + "1908_0", + "1910_0", + "1911_0", + "1911_1", + "1914_0", + "1915_0", + "1916_0", + "191_0", + "1920_0", + "1921_0", + "1923_0", + "1924_0", + "1926_1", + "1927_0", + "1927_1", + "1928_0", + "1929_1", + "192_0", + "1931_0", + "1934_0", + "1935_0", + "1936_0", + "1938_0", + "193_0", + "1940_0", + 
"1941_0", + "1943_0", + "1946_0", + "1949_0", + "194_0", + "1950_0", + "1950_1", + "1951_0", + "1951_1", + "1952_0", + "1954_0", + "1954_1", + "1955_0", + "1955_1", + "1956_1", + "1957_0", + "1958_0", + "1959_0", + "195_0", + "1960_0", + "1960_1", + "1962_0", + "1963_1", + "1964_0", + "1965_1", + "1967_0", + "1968_0", + "1970_0", + "1971_1", + "1972_0", + "1974_0", + "1974_1", + "1975_1", + "1976_0", + "1976_1", + "1977_0", + "1977_1", + "1978_0", + "1979_0", + "197_0", + "1980_0", + "1980_1", + "1981_0", + "1981_1", + "1982_0", + "1983_0", + "1983_1", + "1984_0", + "1986_0", + "1986_1", + "1987_0", + "1987_1", + "1989_0", + "198_0", + "1991_0", + "1993_0", + "1993_1", + "1995_0", + "1995_1", + "1996_0", + "1997_0", + "1997_1", + "1998_0", + "1998_1", + "1999_0", + "1999_1", + "199_0", + "19_0", + "2000_0", + "2000_1", + "2001_0", + "2001_1", + "2002_1", + "2003_0", + "2005_0", + "2006_1", + "2007_1", + "2008_0", + "2008_1", + "200_0", + "2012_0", + "2013_1", + "2014_0", + "2014_1", + "2017_0", + "2017_1", + "2019_1", + "2020_1", + "2021_0", + "2021_1", + "2022_0", + "2022_1", + "2023_0", + "2025_0", + "2027_0", + "2027_1", + "2028_0", + "2029_0", + "202_0", + "2030_0", + "2030_1", + "2031_1", + "2032_0", + "2032_1", + "2033_0", + "2034_0", + "2034_1", + "2035_0", + "2035_1", + "2036_1", + "2039_1", + "2040_0", + "2040_1", + "2041_0", + "2042_0", + "2042_1", + "2043_0", + "2044_0", + "2044_1", + "2045_0", + "2046_0", + "2047_0", + "2049_0", + "2049_1", + "204_0", + "2050_0", + "2050_1", + "2051_0", + "2051_1", + "2053_1", + "2054_0", + "2054_1", + "2056_0", + "2056_1", + "2057_0", + "2058_0", + "205_0", + "2060_0", + "2061_0", + "2062_0", + "2062_1", + "2063_0", + "2064_1", + "2065_0", + "2066_0", + "2066_1", + "2067_0", + "2068_0", + "206_0", + "2070_0", + "2070_1", + "2074_0", + "2074_1", + "2077_0", + "2080_1", + "2081_1", + "2082_0", + "2084_1", + "2086_0", + "2086_1", + "2087_0", + "2087_1", + "2088_0", + "2088_1", + "2089_1", + "208_0", + "2090_0", + 
"2090_1", + "2091_0", + "2091_1", + "2092_0", + "2093_0", + "2093_1", + "2094_0", + "2094_1", + "2095_0", + "2095_1", + "2097_1", + "2098_1", + "2099_1", + "209_0", + "20_0", + "2100_0", + "2100_1", + "2101_0", + "2102_0", + "2102_1", + "2103_0", + "2104_0", + "2104_1", + "2105_0", + "2106_0", + "2106_1", + "2107_0", + "2107_1", + "2108_1", + "210_0", + "2110_0", + "2110_1", + "2111_0", + "2111_1", + "2112_0", + "2113_0", + "2115_0", + "2116_0", + "2117_0", + "2118_0", + "2119_0", + "2119_1", + "2120_0", + "2120_1", + "2121_0", + "2121_1", + "2122_0", + "2122_1", + "2123_0", + "2124_0", + "2126_0", + "2127_1", + "2128_0", + "2128_1", + "2129_1", + "212_0", + "2130_0", + "2130_1", + "2132_0", + "2132_1", + "2133_0", + "2133_1", + "2134_1", + "2135_0", + "2135_1", + "2136_0", + "2136_1", + "2137_0", + "2137_1", + "2138_1", + "2142_0", + "2143_1", + "2146_0", + "2146_1", + "2147_1", + "2148_0", + "2149_1", + "2153_0", + "2154_0", + "2155_0", + "2155_1", + "2156_0", + "2156_1", + "2157_0", + "2157_1", + "2159_0", + "215_0", + "2160_0", + "2161_0", + "2162_0", + "2162_1", + "2163_0", + "2164_0", + "2165_0", + "2168_0", + "2169_0", + "216_0", + "2170_0", + "2171_1", + "2172_0", + "2172_1", + "2173_0", + "2173_1", + "2174_1", + "2177_1", + "2178_0", + "2178_1", + "2179_0", + "217_0", + "2180_0", + "2180_1", + "2183_1", + "2184_0", + "2184_1", + "2185_0", + "2186_0", + "2187_1", + "2188_0", + "2189_0", + "2189_1", + "218_0", + "2190_0", + "2190_1", + "2191_0", + "2191_1", + "2192_0", + "2194_0", + "2195_0", + "2196_0", + "2197_0", + "2198_0", + "2199_0", + "2201_0", + "2202_0", + "2205_0", + "2205_1", + "2206_0", + "2207_0", + "2208_0", + "2208_1", + "220_0", + "2210_0", + "2211_0", + "2211_1", + "2212_0", + "2212_1", + "2213_0", + "2213_1", + "2216_0", + "2218_0", + "2219_0", + "2220_0", + "2222_0", + "2224_0", + "2226_0", + "2227_0", + "2228_0", + "2229_0", + "222_0", + "2232_0", + "2235_0", + "2239_0", + "223_0", + "2241_0", + "2244_0", + "2245_0", + "2246_0", + 
"2247_0", + "2249_0", + "224_0", + "2251_0", + "2252_0", + "2253_0", + "2257_0", + "2258_0", + "225_0", + "2262_0", + "2265_0", + "2266_0", + "2267_0", + "2270_0", + "2271_0", + "2273_0", + "2276_0", + "2277_0", + "2279_0", + "2280_0", + "2283_0", + "2284_0", + "2285_0", + "2286_0", + "2287_0", + "2288_0", + "2292_0", + "2294_0", + "2295_0", + "2296_0", + "2298_0", + "229_0", + "22_0", + "2301_0", + "2302_0", + "2304_0", + "2305_0", + "2306_0", + "2307_0", + "2308_0", + "2309_0", + "230_0", + "2310_0", + "2312_0", + "2313_0", + "2314_0", + "2317_0", + "2319_0", + "231_0", + "2320_0", + "2321_0", + "2322_0", + "2325_0", + "2326_0", + "2329_0", + "232_0", + "2330_0", + "2332_0", + "2333_0", + "2334_0", + "2335_0", + "2336_0", + "233_0", + "2340_0", + "2341_0", + "2342_0", + "2343_0", + "2345_0", + "2346_0", + "2348_0", + "2349_0", + "234_0", + "234_2", + "2350_0", + "2351_0", + "2352_0", + "2353_0", + "2355_0", + "2356_0", + "235_0", + "235_1", + "235_2", + "2361_0", + "2364_0", + "2365_0", + "2368_0", + "2369_0", + "236_0", + "2371_0", + "2372_0", + "2373_0", + "2374_0", + "2375_0", + "2376_0", + "2377_0", + "2378_0", + "237_0", + "2381_0", + "2384_0", + "2387_0", + "2388_0", + "2390_0", + "2392_0", + "2395_0", + "2397_0", + "2398_0", + "2399_0", + "239_1", + "2402_0", + "2404_0", + "2406_0", + "2407_0", + "2409_0", + "240_0", + "2410_0", + "2412_0", + "2416_0", + "2420_0", + "2421_0", + "2422_0", + "2423_0", + "2425_0", + "2426_0", + "2428_0", + "2430_0", + "2435_0", + "2436_0", + "2439_0", + "243_0", + "2440_0", + "2441_0", + "2442_0", + "2444_0", + "2446_0", + "2448_0", + "244_0", + "244_1", + "244_2", + "2450_0", + "2451_0", + "2452_0", + "2453_0", + "2454_0", + "2455_0", + "2457_0", + "2458_0", + "2459_0", + "245_0", + "2461_0", + "2462_0", + "2465_0", + "2466_0", + "2467_0", + "2468_0", + "2469_0", + "246_0", + "2474_0", + "2475_0", + "2476_0", + "2480_0", + "2481_0", + "2482_0", + "2484_0", + "2488_0", + "2489_0", + "248_0", + "2490_0", + "2491_0", + 
"2492_0", + "2493_0", + "2494_0", + "2495_0", + "2496_0", + "2497_0", + "2498_0", + "2499_0", + "249_0", + "2500_0", + "2501_0", + "2502_0", + "2503_0", + "2504_0", + "2505_0", + "2506_0", + "2507_0", + "2509_0", + "250_0", + "2512_0", + "2514_0", + "2515_0", + "2516_0", + "2517_0", + "2518_0", + "2519_0", + "251_0", + "2522_0", + "2523_0", + "2525_0", + "2526_0", + "2528_0", + "2529_0", + "252_0", + "2530_0", + "2532_0", + "2533_0", + "2536_0", + "2537_0", + "2538_0", + "253_1", + "2540_0", + "2541_0", + "2542_0", + "2543_0", + "2545_0", + "2547_0", + "2548_0", + "2549_0", + "2550_0", + "2551_0", + "2553_0", + "2554_0", + "2555_0", + "2556_0", + "255_0", + "2562_0", + "2564_0", + "2566_0", + "2568_0", + "2569_0", + "256_0", + "2571_0", + "2574_0", + "2577_0", + "2579_0", + "257_0", + "2580_0", + "2581_0", + "2582_0", + "2583_0", + "2584_0", + "2585_0", + "2586_0", + "2587_0", + "2589_0", + "258_0", + "2591_0", + "2596_0", + "259_0", + "259_2", + "260_0", + "262_0", + "263_0", + "264_0", + "265_1", + "267_0", + "270_0", + "273_0", + "274_0", + "275_0", + "276_0", + "277_0", + "278_0", + "281_0", + "282_0", + "283_0", + "284_0", + "285_0", + "287_0", + "288_0", + "289_0", + "290_0", + "290_2", + "291_0", + "293_0", + "295_0", + "298_0", + "29_0", + "300_0", + "301_0", + "302_0", + "304_0", + "308_0", + "309_0", + "310_0", + "312_0", + "314_0", + "314_2", + "317_0", + "318_0", + "318_1", + "318_2", + "319_0", + "31_0", + "320_0", + "321_0", + "322_0", + "323_0", + "324_0", + "325_0", + "326_0", + "327_0", + "329_1", + "329_2", + "32_0", + "330_0", + "332_0", + "334_0", + "33_0", + "341_0", + "342_0", + "344_0", + "345_0", + "346_0", + "348_0", + "352_0", + "353_0", + "354_0", + "355_1", + "355_2", + "356_0", + "356_1", + "357_0", + "358_0", + "360_0", + "361_0", + "364_0", + "365_0", + "368_0", + "369_0", + "370_0", + "371_0", + "372_0", + "373_0", + "374_0", + "375_0", + "378_0", + "37_0", + "380_0", + "380_2", + "383_0", + "384_0", + "385_0", + "386_0", + "387_0", 
+ "389_0", + "38_0", + "390_0", + "391_1", + "391_2", + "393_0", + "394_0", + "395_0", + "395_2", + "396_0", + "397_0", + "398_0", + "399_0", + "399_1", + "399_2", + "400_0", + "401_0", + "402_0", + "406_0", + "409_0", + "410_0", + "411_0", + "414_0", + "415_0", + "416_0", + "417_0", + "418_0", + "418_1", + "418_2", + "419_0", + "41_0", + "420_0", + "421_0", + "422_0", + "423_0", + "425_0", + "426_1", + "427_2", + "429_0", + "429_1", + "429_2", + "42_0", + "430_2", + "431_0", + "433_0", + "433_1", + "433_2", + "435_0", + "437_0", + "437_2", + "442_1", + "444_0", + "446_0", + "448_0", + "449_2", + "44_0", + "450_0", + "451_0", + "451_1", + "451_2", + "452_0", + "452_1", + "453_0", + "453_1", + "456_0", + "456_1", + "456_2", + "457_0", + "458_0", + "460_0", + "462_0", + "463_0", + "464_0", + "465_0", + "466_0", + "468_0", + "469_0", + "46_0", + "470_0", + "471_0", + "472_0", + "473_0", + "473_2", + "474_0", + "475_0", + "477_0", + "478_0", + "47_0", + "480_0", + "481_0", + "482_0", + "483_0", + "484_0", + "485_0", + "485_1", + "485_2", + "486_0", + "487_0", + "488_0", + "489_0", + "48_0", + "490_0", + "491_0", + "493_0", + "494_0", + "495_1", + "496_0", + "497_0", + "497_1", + "497_2", + "498_0", + "499_1", + "499_2", + "49_0", + "500_1", + "500_2", + "501_0", + "501_2", + "502_0", + "502_1", + "502_2", + "503_0", + "503_1", + "503_2", + "506_0", + "507_0", + "508_0", + "509_0", + "50_0", + "510_2", + "512_0", + "513_0", + "513_1", + "513_2", + "515_0", + "517_1", + "517_2", + "519_0", + "520_0", + "520_1", + "520_2", + "522_0", + "524_0", + "528_0", + "530_0", + "531_0", + "534_0", + "534_1", + "534_2", + "537_0", + "538_0", + "539_0", + "53_0", + "541_0", + "542_1", + "542_2", + "543_0", + "544_0", + "545_0", + "546_0", + "547_0", + "548_0", + "548_1", + "548_2", + "54_0", + "550_0", + "551_0", + "552_0", + "554_0", + "555_0", + "556_0", + "557_1", + "557_2", + "558_0", + "558_1", + "558_2", + "559_0", + "560_0", + "560_1", + "560_2", + "562_0", + "563_0", + 
"564_0", + "565_0", + "566_0", + "567_0", + "568_0", + "569_0", + "56_0", + "574_0", + "576_0", + "577_0", + "579_0", + "579_1", + "579_2", + "580_2", + "581_0", + "583_0", + "583_1", + "584_0", + "584_1", + "584_2", + "585_0", + "586_0", + "586_2", + "587_0", + "588_0", + "588_1", + "588_2", + "589_1", + "591_1", + "591_2", + "594_0", + "595_0", + "597_0", + "597_1", + "597_2", + "598_0", + "5_0", + "600_0", + "601_0", + "607_0", + "608_0", + "609_0", + "609_1", + "609_2", + "60_0", + "610_0", + "611_0", + "614_0", + "614_1", + "617_0", + "618_0", + "618_1", + "619_1", + "61_0", + "621_0", + "623_0", + "623_1", + "624_0", + "625_0", + "625_1", + "626_0", + "628_0", + "630_0", + "630_1", + "630_2", + "631_0", + "632_0", + "633_0", + "634_0", + "635_0", + "636_0", + "637_0", + "637_2", + "638_0", + "639_2", + "63_0", + "642_0", + "643_0", + "647_0", + "648_0", + "649_1", + "649_2", + "64_0", + "650_0", + "653_1", + "654_0", + "655_0", + "656_0", + "657_0", + "658_1", + "659_0", + "65_0", + "660_1", + "661_0", + "662_0", + "663_0", + "664_1", + "664_2", + "666_0", + "666_1", + "667_1", + "668_0", + "668_2", + "66_0", + "670_0", + "671_0", + "673_0", + "674_0", + "677_0", + "678_0", + "678_2", + "679_0", + "67_0", + "681_0", + "681_1", + "682_0", + "683_0", + "683_1", + "683_2", + "685_0", + "687_0", + "689_0", + "68_0", + "691_0", + "692_0", + "692_1", + "692_2", + "693_0", + "694_0", + "695_0", + "697_0", + "697_1", + "699_0", + "69_0", + "700_1", + "700_2", + "701_0", + "702_0", + "705_0", + "708_0", + "709_0", + "70_0", + "711_0", + "713_0", + "716_0", + "718_0", + "719_0", + "719_1", + "719_2", + "720_0", + "721_0", + "722_0", + "722_1", + "723_0", + "724_0", + "724_1", + "724_2", + "727_0", + "728_0", + "729_0", + "729_1", + "72_0", + "731_0", + "732_0", + "733_0", + "734_0", + "735_0", + "736_0", + "736_1", + "736_2", + "739_0", + "740_0", + "741_0", + "742_0", + "742_1", + "744_0", + "744_1", + "746_0", + "747_0", + "748_0", + "750_0", + "751_0", + "751_2", + 
"752_0", + "753_0", + "753_1", + "754_0", + "754_2", + "756_2", + "757_0", + "758_0", + "759_0", + "759_1", + "759_2", + "75_0", + "760_0", + "760_1", + "760_2", + "761_0", + "761_1", + "762_0", + "762_1", + "763_2", + "765_0", + "765_2", + "766_0", + "769_0", + "769_1", + "769_2", + "770_0", + "775_0", + "775_1", + "779_2", + "77_0", + "77_1", + "781_0", + "781_1", + "783_1", + "784_0", + "787_0", + "788_0", + "789_0", + "78_0", + "790_0", + "791_0", + "792_0", + "793_0", + "793_1", + "793_2", + "794_0", + "795_0", + "796_0", + "796_1", + "797_0", + "797_1", + "797_2", + "798_0", + "798_1", + "798_2", + "799_0", + "79_0", + "7_0", + "801_0", + "803_0", + "803_1", + "803_2", + "804_0", + "805_0", + "806_2", + "807_0", + "808_0", + "80_2", + "810_0", + "811_0", + "811_1", + "812_0", + "813_0", + "815_0", + "817_0", + "819_0", + "81_0", + "821_0", + "822_0", + "823_0", + "824_0", + "824_1", + "827_0", + "828_1", + "828_2", + "829_0", + "82_0", + "830_0", + "831_0", + "831_2", + "832_0", + "833_0", + "835_0", + "837_0", + "838_0", + "839_0", + "83_0", + "840_0", + "841_0", + "841_1", + "841_2", + "842_0", + "843_0", + "844_0", + "845_1", + "845_2", + "846_0", + "848_0", + "849_0", + "850_0", + "851_0", + "853_0", + "853_1", + "853_2", + "854_0", + "856_0", + "858_1", + "858_2", + "859_0", + "85_0", + "860_0", + "860_1", + "861_0", + "861_1", + "861_2", + "862_0", + "863_0", + "863_1", + "864_0", + "865_0", + "870_0", + "871_0", + "875_0", + "875_1", + "875_2", + "877_0", + "878_1", + "878_2", + "879_2", + "87_0", + "880_0", + "881_0", + "883_0", + "883_1", + "883_2", + "886_0", + "886_1", + "886_2", + "887_2", + "888_0", + "88_0", + "891_0", + "891_2", + "893_0", + "893_1", + "894_0", + "894_2", + "895_0", + "896_0", + "897_0", + "899_1", + "899_2", + "89_0", + "8_0", + "900_0", + "901_0", + "903_0", + "905_0", + "907_0", + "908_0", + "908_1", + "908_2", + "910_0", + "910_1", + "910_2", + "911_0", + "911_1", + "911_2", + "913_1", + "914_0", + "915_0", + "915_1", + 
"915_2", + "916_0", + "916_2", + "917_0", + "918_0", + "918_1", + "918_2", + "919_0", + "919_1", + "920_0", + "921_1", + "921_2", + "924_0", + "924_1", + "925_1", + "925_2", + "927_1", + "927_2", + "929_1", + "929_2", + "92_0", + "930_0", + "930_1", + "932_1", + "932_2", + "934_1", + "934_2", + "935_0", + "936_1", + "937_0", + "937_2", + "939_1", + "939_2", + "93_0", + "940_0", + "940_2", + "941_0", + "941_1", + "941_2", + "942_0", + "943_1", + "943_2", + "944_0", + "944_2", + "946_0", + "946_2", + "947_0", + "947_1", + "948_0", + "948_1", + "949_0", + "949_1", + "951_0", + "952_0", + "953_0", + "953_1", + "954_1", + "954_2", + "955_0", + "955_2", + "956_0", + "956_1", + "956_2", + "957_0", + "957_1", + "959_0", + "960_0", + "960_1", + "960_2", + "962_0", + "963_0", + "963_2", + "964_0", + "965_0", + "965_1", + "965_2", + "966_0", + "966_1", + "968_0", + "968_1", + "969_0", + "971_0", + "972_0", + "973_0", + "975_0", + "975_1", + "975_2", + "977_1", + "978_0", + "97_0", + "980_0", + "980_1", + "980_2", + "981_0", + "981_1", + "981_2", + "982_0", + "982_2", + "983_1", + "984_0", + "984_1", + "985_0", + "985_1", + "986_0", + "986_1", + "986_2", + "987_0", + "987_2", + "988_0", + "988_1", + "989_0", + "989_1", + "989_2", + "98_0", + "990_0", + "990_2", + "991_0", + "991_1", + "993_0", + "994_0", + "994_1", + "994_2", + "995_0", + "996_0", + "996_2", + "999_0", + "999_1", + "9_0" + ], + "test_query_ids": [ + "0_0", + "0_1", + "0_2", + "100_0", + "100_1", + "101_0", + "102_0", + "102_1", + "102_2", + "103_0", + "103_1", + "103_2", + "104_0", + "104_1", + "104_2", + "105_0", + "105_1", + "105_2", + "106_0", + "106_1", + "108_0", + "109_0", + "109_1", + "109_2", + "110_0", + "110_1", + "110_2", + "111_0", + "111_1", + "111_2", + "112_0", + "112_1", + "112_2", + "113_0", + "113_1", + "113_2", + "114_0", + "114_1", + "114_2", + "115_0", + "116_0", + "116_1", + "116_2", + "117_0", + "117_1", + "117_2", + "118_0", + "118_1", + "118_2", + "119_0", + "119_1", + "119_2", + 
"11_0", + "11_1", + "11_2", + "120_0", + "121_0", + "121_1", + "122_0", + "122_1", + "122_2", + "123_0", + "123_1", + "123_2", + "124_0", + "124_1", + "124_2", + "125_0", + "126_0", + "126_1", + "126_2", + "127_0", + "127_1", + "127_2", + "128_0", + "128_1", + "128_2", + "129_0", + "129_1", + "129_2", + "12_0", + "130_0", + "130_1", + "130_2", + "131_0", + "131_1", + "131_2", + "132_0", + "132_1", + "132_2", + "133_0", + "133_1", + "133_2", + "134_0", + "134_1", + "134_2", + "136_0", + "136_1", + "136_2", + "137_0", + "137_1", + "138_0", + "138_1", + "138_2", + "139_0", + "139_1", + "13_0", + "140_0", + "140_1", + "140_2", + "141_0", + "141_1", + "141_2", + "142_0", + "142_1", + "142_2", + "144_0", + "145_0", + "146_0", + "146_1", + "146_2", + "148_0", + "148_1", + "148_2", + "149_0", + "149_1", + "149_2", + "14_0", + "14_1", + "14_2", + "150_0", + "150_1", + "150_2", + "151_0", + "151_1", + "151_2", + "152_0", + "153_0", + "153_1", + "153_2", + "154_0", + "154_1", + "154_2", + "155_0", + "155_1", + "155_2", + "156_0", + "156_1", + "156_2", + "157_0", + "157_1", + "157_2", + "158_0", + "158_1", + "158_2", + "159_0", + "159_1", + "159_2", + "15_0", + "15_1", + "161_0", + "161_1", + "161_2", + "162_0", + "162_1", + "162_2", + "163_0", + "164_0", + "165_0", + "165_1", + "165_2", + "166_0", + "166_1", + "166_2", + "167_0", + "168_0", + "168_1", + "169_0", + "169_1", + "169_2", + "16_0", + "16_1", + "16_2", + "170_0", + "170_1", + "170_2", + "171_0", + "171_1", + "171_2", + "172_0", + "173_0", + "173_1", + "173_2", + "174_0", + "174_1", + "175_0", + "175_1", + "175_2", + "176_0", + "176_1", + "176_2", + "177_0", + "177_1", + "177_2", + "178_0", + "179_0", + "179_1", + "179_2", + "17_0", + "17_1", + "17_2", + "180_0", + "180_1", + "180_2", + "181_0", + "181_1", + "181_2", + "182_0", + "183_0", + "183_1", + "184_0", + "184_1", + "184_2", + "185_0", + "185_1", + "185_2", + "186_0", + "186_1", + "186_2", + "187_0", + "187_1", + "187_2", + "188_0", + "189_0", + "189_1", + 
"189_2", + "18_0", + "18_1", + "18_2", + "192_0", + "192_1", + "192_2", + "193_0", + "193_1", + "193_2", + "194_0", + "194_1", + "196_0", + "196_1", + "197_0", + "197_1", + "197_2", + "198_0", + "198_1", + "19_0", + "19_1", + "19_2", + "1_0", + "1_1", + "200_0", + "200_1", + "200_2", + "201_0", + "201_1", + "202_0", + "202_1", + "203_0", + "204_0", + "205_0", + "205_1", + "206_0", + "206_1", + "207_0", + "207_1", + "208_0", + "208_1", + "209_0", + "20_0", + "20_1", + "20_2", + "210_0", + "210_1", + "211_0", + "211_1", + "212_0", + "213_0", + "213_1", + "215_0", + "215_1", + "216_0", + "216_1", + "217_0", + "217_1", + "218_0", + "218_1", + "219_0", + "219_1", + "21_0", + "220_0", + "220_1", + "221_0", + "221_1", + "222_0", + "222_1", + "223_0", + "223_1", + "225_0", + "226_0", + "226_1", + "227_0", + "227_1", + "228_0", + "228_1", + "229_0", + "22_0", + "22_1", + "22_2", + "230_0", + "230_1", + "231_0", + "231_1", + "232_0", + "232_1", + "233_0", + "233_1", + "234_0", + "235_0", + "235_1", + "236_0", + "236_1", + "237_0", + "239_0", + "239_1", + "23_0", + "23_1", + "23_2", + "241_0", + "241_1", + "243_0", + "243_1", + "244_0", + "244_1", + "247_0", + "247_1", + "248_0", + "248_1", + "249_0", + "24_0", + "24_1", + "24_2", + "250_0", + "250_1", + "251_0", + "251_1", + "252_0", + "252_1", + "253_0", + "253_1", + "254_0", + "254_1", + "255_0", + "255_1", + "256_0", + "256_1", + "257_0", + "257_1", + "258_0", + "258_1", + "259_0", + "259_1", + "25_0", + "25_1", + "25_2", + "260_0", + "260_1", + "261_0", + "261_1", + "263_0", + "264_0", + "264_1", + "265_0", + "266_0", + "266_1", + "267_0", + "267_1", + "268_0", + "268_1", + "269_0", + "269_1", + "26_0", + "26_1", + "270_0", + "270_1", + "271_0", + "272_0", + "272_1", + "273_0", + "273_1", + "275_0", + "275_1", + "276_0", + "276_1", + "279_0", + "279_1", + "27_0", + "27_1", + "27_2", + "280_0", + "280_1", + "281_0", + "281_1", + "282_0", + "282_1", + "283_0", + "283_1", + "284_0", + "284_1", + "285_0", + "285_1", + 
"286_0", + "286_1", + "287_0", + "287_1", + "288_0", + "288_1", + "289_0", + "289_1", + "28_0", + "28_1", + "28_2", + "290_0", + "291_0", + "291_1", + "292_0", + "292_1", + "293_0", + "293_1", + "294_0", + "294_1", + "295_0", + "295_1", + "296_0", + "296_1", + "297_0", + "297_1", + "298_0", + "298_1", + "299_0", + "299_1", + "2_0", + "2_1", + "300_0", + "300_1", + "301_0", + "302_0", + "302_1", + "303_0", + "303_1", + "305_0", + "305_1", + "306_0", + "306_1", + "307_0", + "307_1", + "308_0", + "308_1", + "309_0", + "309_1", + "30_0", + "30_1", + "30_2", + "310_0", + "311_0", + "311_1", + "312_0", + "312_1", + "313_0", + "313_1", + "314_0", + "315_0", + "315_1", + "316_0", + "317_0", + "317_1", + "318_0", + "318_1", + "319_0", + "319_1", + "31_0", + "31_1", + "31_2", + "320_0", + "320_1", + "321_0", + "321_1", + "322_0", + "324_0", + "324_1", + "325_0", + "325_1", + "326_0", + "326_1", + "327_0", + "327_1", + "328_0", + "328_1", + "329_0", + "329_1", + "32_0", + "32_1", + "32_2", + "330_0", + "330_1", + "331_0", + "331_1", + "332_0", + "332_1", + "333_0", + "333_1", + "334_0", + "334_1", + "335_0", + "335_1", + "336_0", + "336_1", + "337_0", + "337_1", + "338_0", + "338_1", + "339_0", + "339_1", + "33_0", + "33_1", + "340_0", + "340_1", + "341_0", + "341_1", + "342_0", + "342_1", + "344_0", + "344_1", + "346_0", + "346_1", + "348_0", + "348_1", + "349_0", + "349_1", + "34_0", + "34_1", + "34_2", + "351_0", + "352_0", + "353_0", + "353_1", + "354_0", + "354_1", + "355_0", + "355_1", + "357_0", + "358_0", + "358_1", + "359_0", + "359_1", + "360_0", + "361_0", + "361_1", + "362_0", + "362_1", + "364_0", + "364_1", + "365_0", + "365_1", + "366_0", + "366_1", + "367_0", + "367_1", + "368_0", + "368_1", + "369_0", + "369_1", + "36_0", + "36_1", + "36_2", + "370_0", + "370_1", + "371_0", + "371_1", + "372_0", + "372_1", + "373_0", + "374_0", + "375_0", + "375_1", + "376_0", + "376_1", + "377_0", + "377_1", + "378_0", + "378_1", + "379_0", + "379_1", + "37_0", + "37_1", + 
"381_0", + "381_1", + "382_0", + "383_0", + "383_1", + "384_0", + "384_1", + "385_0", + "385_1", + "386_0", + "387_0", + "387_1", + "389_0", + "389_1", + "38_0", + "38_1", + "38_2", + "390_0", + "390_1", + "391_0", + "391_1", + "392_0", + "392_1", + "393_0", + "393_1", + "394_0", + "394_1", + "395_0", + "395_1", + "396_0", + "396_1", + "397_0", + "397_1", + "398_0", + "398_1", + "399_0", + "399_1", + "39_0", + "3_0", + "3_1", + "400_0", + "400_1", + "402_0", + "402_1", + "403_0", + "403_1", + "404_0", + "404_1", + "405_0", + "405_1", + "406_0", + "406_1", + "407_0", + "407_1", + "408_0", + "408_1", + "409_0", + "409_1", + "40_0", + "40_1", + "40_2", + "410_0", + "410_1", + "411_0", + "411_1", + "412_0", + "412_1", + "413_0", + "413_1", + "414_0", + "414_1", + "415_0", + "415_1", + "416_0", + "417_0", + "417_1", + "418_0", + "418_1", + "419_0", + "419_1", + "420_0", + "420_1", + "421_0", + "421_1", + "423_0", + "423_1", + "424_0", + "425_0", + "426_0", + "426_1", + "428_0", + "428_1", + "429_0", + "429_1", + "42_0", + "430_0", + "431_0", + "431_1", + "432_0", + "432_1", + "433_0", + "433_1", + "434_0", + "434_1", + "435_0", + "435_1", + "436_0", + "436_1", + "437_0", + "437_1", + "438_0", + "438_1", + "439_0", + "439_1", + "43_0", + "43_1", + "43_2", + "440_0", + "440_1", + "441_0", + "442_0", + "443_0", + "443_1", + "444_0", + "444_1", + "446_0", + "447_0", + "448_0", + "448_1", + "449_0", + "44_0", + "44_1", + "44_2", + "450_0", + "450_1", + "451_0", + "451_1", + "452_0", + "452_1", + "453_0", + "453_1", + "454_0", + "456_0", + "456_1", + "458_0", + "458_1", + "459_0", + "459_1", + "45_0", + "45_1", + "45_2", + "460_0", + "460_1", + "461_0", + "462_0", + "464_0", + "464_1", + "465_0", + "465_1", + "466_0", + "467_0", + "468_0", + "468_1", + "46_0", + "46_1", + "46_2", + "470_0", + "470_1", + "471_0", + "471_1", + "472_0", + "472_1", + "473_0", + "473_1", + "474_0", + "476_0", + "476_1", + "477_0", + "477_1", + "478_0", + "478_1", + "479_0", + "479_1", + "480_0", + 
"480_1", + "481_0", + "482_0", + "482_1", + "483_0", + "483_1", + "484_0", + "484_1", + "485_0", + "485_1", + "486_0", + "486_1", + "487_0", + "487_1", + "488_0", + "488_1", + "489_0", + "48_0", + "48_1", + "48_2", + "490_0", + "490_1", + "492_0", + "492_1", + "494_0", + "494_1", + "495_0", + "495_1", + "496_0", + "496_1", + "497_0", + "497_1", + "498_0", + "499_0", + "499_1", + "49_0", + "49_1", + "49_2", + "4_0", + "4_1", + "4_2", + "500_0", + "500_1", + "501_0", + "501_1", + "502_0", + "502_1", + "503_0", + "503_1", + "504_0", + "504_1", + "506_0", + "506_1", + "507_0", + "507_1", + "508_0", + "508_1", + "509_0", + "50_0", + "50_1", + "50_2", + "510_0", + "510_1", + "512_0", + "512_1", + "513_0", + "513_1", + "514_0", + "514_1", + "515_0", + "516_0", + "516_1", + "517_0", + "517_1", + "519_0", + "519_1", + "51_0", + "51_1", + "51_2", + "520_0", + "521_0", + "521_1", + "522_0", + "522_1", + "523_0", + "523_1", + "524_0", + "524_1", + "525_0", + "525_1", + "526_0", + "526_1", + "527_0", + "527_1", + "528_0", + "528_1", + "529_0", + "529_1", + "52_0", + "52_1", + "52_2", + "530_0", + "531_0", + "531_1", + "532_0", + "532_1", + "533_0", + "534_0", + "535_0", + "536_0", + "536_1", + "538_0", + "538_1", + "539_0", + "53_0", + "53_1", + "53_2", + "540_0", + "540_1", + "541_0", + "541_1", + "542_0", + "542_1", + "543_0", + "543_1", + "544_0", + "544_1", + "545_0", + "545_1", + "546_0", + "546_1", + "547_0", + "548_0", + "548_1", + "549_0", + "549_1", + "54_0", + "54_1", + "54_2", + "550_0", + "550_1", + "552_0", + "553_0", + "553_1", + "554_0", + "554_1", + "555_0", + "555_1", + "556_0", + "556_1", + "557_0", + "557_1", + "558_0", + "558_1", + "559_0", + "559_1", + "55_0", + "55_1", + "55_2", + "560_0", + "560_1", + "561_0", + "561_1", + "562_0", + "563_0", + "563_1", + "564_0", + "564_1", + "565_0", + "565_1", + "566_0", + "566_1", + "568_0", + "568_1", + "569_0", + "56_0", + "56_1", + "56_2", + "570_0", + "570_1", + "571_0", + "572_0", + "572_1", + "573_0", + "573_1", 
+ "574_0", + "575_0", + "575_1", + "576_0", + "576_1", + "577_0", + "577_1", + "578_0", + "578_1", + "579_0", + "579_1", + "57_0", + "57_1", + "57_2", + "580_0", + "580_1", + "581_0", + "581_1", + "582_0", + "582_1", + "583_0", + "584_0", + "584_1", + "586_0", + "586_1", + "587_0", + "588_0", + "588_1", + "589_0", + "589_1", + "58_0", + "58_1", + "58_2", + "590_0", + "590_1", + "592_0", + "592_1", + "593_0", + "594_0", + "594_1", + "595_0", + "595_1", + "596_0", + "596_1", + "597_0", + "598_0", + "598_1", + "599_0", + "599_1", + "59_0", + "59_1", + "59_2", + "5_0", + "5_1", + "5_2", + "600_0", + "600_1", + "601_0", + "601_1", + "602_0", + "602_1", + "603_0", + "603_1", + "604_0", + "604_1", + "605_0", + "606_0", + "606_1", + "607_0", + "607_1", + "608_0", + "608_1", + "609_0", + "609_1", + "60_0", + "60_1", + "60_2", + "610_0", + "610_1", + "613_0", + "614_0", + "615_0", + "616_0", + "616_1", + "617_0", + "617_1", + "619_0", + "619_1", + "61_0", + "61_1", + "61_2", + "621_0", + "621_1", + "623_0", + "624_0", + "625_0", + "625_1", + "626_0", + "628_0", + "628_1", + "629_0", + "629_1", + "62_0", + "62_1", + "632_0", + "632_1", + "633_0", + "633_1", + "634_0", + "634_1", + "635_0", + "636_0", + "636_1", + "637_0", + "637_1", + "638_0", + "639_0", + "639_1", + "63_0", + "63_1", + "63_2", + "640_0", + "640_1", + "641_0", + "641_1", + "642_0", + "642_1", + "643_0", + "643_1", + "644_0", + "646_0", + "646_1", + "647_0", + "648_0", + "648_1", + "649_0", + "64_0", + "64_1", + "64_2", + "650_0", + "650_1", + "651_0", + "651_1", + "652_0", + "652_1", + "653_0", + "653_1", + "654_0", + "655_0", + "655_1", + "65_0", + "65_1", + "65_2", + "67_0", + "67_1", + "67_2", + "68_0", + "68_1", + "69_0", + "69_1", + "69_2", + "6_0", + "6_1", + "6_2", + "70_0", + "70_1", + "70_2", + "71_0", + "71_1", + "71_2", + "72_0", + "72_1", + "72_2", + "73_0", + "73_1", + "73_2", + "74_0", + "74_1", + "74_2", + "75_0", + "75_1", + "75_2", + "76_0", + "76_1", + "76_2", + "77_0", + "77_1", + "78_0", + 
"78_1", + "78_2", + "7_0", + "7_1", + "80_0", + "80_1", + "80_2", + "81_0", + "81_1", + "82_0", + "82_1", + "83_0", + "83_1", + "83_2", + "84_0", + "84_1", + "84_2", + "85_0", + "86_0", + "86_1", + "86_2", + "87_0", + "87_1", + "87_2", + "88_0", + "88_1", + "88_2", + "8_0", + "90_0", + "90_1", + "91_0", + "91_1", + "91_2", + "92_0", + "92_1", + "92_2", + "93_0", + "93_1", + "94_0", + "94_1", + "94_2", + "95_0", + "95_1", + "95_2", + "96_0", + "96_1", + "96_2", + "97_0", + "97_1", + "97_2", + "98_0", + "98_1", + "98_2", + "9_0", + "9_1", + "9_2" + ] +} \ No newline at end of file diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/splits/summary.json b/cosmos-retriever/src/cosmos_retriever/datagen/splits/summary.json new file mode 100644 index 0000000..1eebdca --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/datagen/splits/summary.json @@ -0,0 +1,34 @@ +[ + { + "dataset": "browsecompplus", + "total": 830, + "train": 664, + "test": 166, + "sft": 199, + "rl": 465 + }, + { + "dataset": "sec", + "total": 4084, + "train": 3453, + "test": 1216, + "sft": 1035, + "rl": 2418 + }, + { + "dataset": "patents", + "total": 3107, + "train": 2518, + "test": 718, + "sft": 755, + "rl": 1763 + }, + { + "dataset": "web", + "total": 2351, + "train": 2224, + "test": 554, + "sft": 667, + "rl": 1557 + } +] \ No newline at end of file diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/splits/web_splits.json b/cosmos-retriever/src/cosmos_retriever/datagen/splits/web_splits.json new file mode 100644 index 0000000..0915b3c --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/datagen/splits/web_splits.json @@ -0,0 +1,2794 @@ +{ + "dataset": "web", + "total_queries": 2351, + "train_queries": 2224, + "test_queries": 554, + "sft_queries": 667, + "rl_queries": 1557, + "sft_ratio": 0.29991007194244607, + "rl_ratio": 0.700089928057554, + "sft_query_ids": [ + "0_2", + "100_3", + "101_1", + "102_1", + "102_3", + "103_0", + "103_1", + "105_0", + "106_2", + "106_3", + 
"107_0", + "107_2", + "108_1", + "109_0", + "10_1", + "10_2", + "10_3", + "111_1", + "115_0", + "115_3", + "117_2", + "118_2", + "11_0", + "120_1", + "121_1", + "123_0", + "123_1", + "123_3", + "125_0", + "125_3", + "126_0", + "126_2", + "127_1", + "129_1", + "12_0", + "12_1", + "131_2", + "131_3", + "135_0", + "135_1", + "139_0", + "139_2", + "13_1", + "140_3", + "143_0", + "143_1", + "143_2", + "144_2", + "145_0", + "145_1", + "146_0", + "146_1", + "146_2", + "148_0", + "149_0", + "149_1", + "149_2", + "150_3", + "151_0", + "153_1", + "155_0", + "156_3", + "158_0", + "159_0", + "160_2", + "161_1", + "161_3", + "162_1", + "163_1", + "163_3", + "164_1", + "164_2", + "165_0", + "166_1", + "169_0", + "16_3", + "171_0", + "171_1", + "175_2", + "17_2", + "180_0", + "180_2", + "182_0", + "184_1", + "184_2", + "184_3", + "187_1", + "188_0", + "189_0", + "189_1", + "190_0", + "190_1", + "190_2", + "190_3", + "191_1", + "192_2", + "193_1", + "196_0", + "196_2", + "196_3", + "197_2", + "199_1", + "1_3", + "201_0", + "203_1", + "203_2", + "204_0", + "204_1", + "204_2", + "204_3", + "206_1", + "207_2", + "209_2", + "212_1", + "212_2", + "215_0", + "215_2", + "215_3", + "219_0", + "219_1", + "21_1", + "21_2", + "220_0", + "220_1", + "222_1", + "223_1", + "223_3", + "224_0", + "226_0", + "227_1", + "228_0", + "228_1", + "229_3", + "22_2", + "22_3", + "230_0", + "231_2", + "231_3", + "232_0", + "233_0", + "234_0", + "236_0", + "237_0", + "238_1", + "239_0", + "239_3", + "23_1", + "240_0", + "242_1", + "244_0", + "244_1", + "245_1", + "245_3", + "247_0", + "247_1", + "247_2", + "249_1", + "24_0", + "24_1", + "24_2", + "250_0", + "251_0", + "251_3", + "252_0", + "253_0", + "253_1", + "253_3", + "254_0", + "258_3", + "259_0", + "260_2", + "261_1", + "262_0", + "263_2", + "263_3", + "264_1", + "264_2", + "265_0", + "265_1", + "266_0", + "266_2", + "267_0", + "267_3", + "268_0", + "268_1", + "268_3", + "269_1", + "26_0", + "270_2", + "271_0", + "272_0", + "272_1", + "272_2", + 
"273_2", + "274_0", + "275_1", + "275_2", + "275_3", + "276_0", + "278_1", + "279_0", + "279_3", + "280_1", + "280_3", + "286_0", + "286_1", + "288_1", + "28_3", + "293_1", + "293_3", + "295_0", + "295_3", + "299_0", + "299_2", + "2_0", + "2_1", + "300_2", + "301_0", + "303_1", + "304_3", + "307_1", + "307_2", + "309_0", + "309_2", + "309_3", + "30_1", + "315_0", + "315_1", + "316_0", + "316_3", + "317_2", + "31_3", + "321_0", + "322_2", + "322_3", + "324_0", + "324_1", + "324_3", + "325_0", + "326_0", + "329_0", + "32_0", + "32_2", + "32_3", + "331_0", + "332_3", + "334_0", + "334_1", + "336_1", + "337_2", + "338_0", + "339_0", + "339_3", + "33_2", + "33_3", + "340_1", + "341_0", + "341_2", + "343_0", + "343_2", + "348_0", + "348_1", + "349_0", + "349_1", + "34_0", + "350_2", + "350_3", + "351_0", + "351_2", + "351_3", + "352_0", + "354_2", + "357_2", + "35_1", + "35_2", + "360_2", + "360_3", + "361_1", + "362_0", + "364_3", + "366_3", + "367_3", + "368_0", + "368_1", + "370_2", + "371_2", + "372_0", + "374_0", + "375_1", + "376_1", + "377_3", + "378_1", + "379_0", + "381_3", + "382_0", + "382_1", + "382_2", + "384_0", + "384_2", + "385_0", + "386_1", + "38_0", + "38_1", + "390_1", + "390_2", + "391_1", + "395_1", + "395_3", + "396_2", + "397_3", + "398_0", + "398_2", + "39_2", + "39_3", + "3_1", + "3_2", + "401_2", + "402_0", + "402_2", + "405_0", + "405_2", + "407_0", + "407_1", + "407_2", + "408_2", + "40_0", + "40_1", + "412_1", + "412_2", + "412_3", + "414_1", + "414_3", + "415_1", + "415_2", + "416_1", + "417_1", + "421_1", + "422_0", + "422_2", + "426_1", + "428_1", + "42_2", + "435_1", + "436_1", + "436_3", + "437_0", + "437_1", + "438_2", + "439_0", + "439_1", + "439_2", + "43_1", + "43_2", + "440_0", + "444_0", + "444_2", + "445_1", + "447_0", + "447_1", + "448_1", + "448_2", + "448_3", + "449_1", + "449_2", + "449_3", + "451_0", + "453_2", + "454_0", + "454_2", + "455_0", + "455_1", + "456_0", + "456_1", + "456_2", + "456_3", + "457_1", + "457_2", + 
"458_1", + "45_1", + "45_2", + "460_0", + "461_1", + "467_2", + "468_0", + "469_0", + "46_1", + "46_2", + "471_0", + "472_2", + "473_1", + "474_0", + "474_1", + "475_0", + "475_1", + "475_3", + "476_3", + "477_0", + "478_0", + "478_2", + "480_3", + "482_1", + "485_0", + "485_3", + "487_0", + "487_1", + "488_1", + "489_1", + "489_2", + "48_1", + "491_1", + "492_1", + "492_2", + "492_3", + "495_1", + "497_2", + "498_1", + "499_0", + "499_1", + "4_3", + "503_1", + "503_2", + "508_2", + "509_2", + "510_0", + "510_1", + "511_0", + "511_1", + "512_0", + "515_2", + "516_3", + "517_0", + "519_3", + "520_0", + "520_2", + "521_1", + "521_2", + "522_2", + "522_3", + "523_0", + "524_0", + "524_2", + "525_0", + "526_0", + "527_1", + "527_2", + "527_3", + "52_0", + "530_0", + "530_1", + "534_2", + "535_0", + "535_1", + "537_1", + "537_3", + "539_3", + "541_0", + "543_0", + "544_3", + "545_1", + "545_3", + "548_0", + "548_2", + "548_3", + "54_0", + "551_1", + "554_0", + "554_3", + "555_0", + "555_1", + "555_2", + "558_2", + "559_2", + "55_0", + "560_0", + "560_2", + "561_1", + "562_0", + "566_3", + "569_1", + "573_0", + "574_0", + "576_0", + "576_3", + "577_2", + "578_2", + "582_1", + "584_0", + "584_3", + "586_1", + "586_3", + "587_3", + "588_2", + "589_0", + "590_0", + "591_3", + "592_0", + "593_0", + "593_1", + "595_2", + "595_3", + "597_0", + "597_3", + "59_0", + "59_3", + "5_1", + "5_2", + "601_0", + "602_2", + "602_3", + "603_0", + "603_1", + "604_1", + "604_2", + "609_2", + "609_3", + "60_0", + "611_1", + "612_0", + "612_1", + "614_2", + "615_0", + "616_0", + "616_1", + "617_0", + "618_1", + "618_2", + "61_1", + "620_0", + "621_1", + "622_1", + "624_1", + "625_0", + "627_0", + "628_0", + "62_2", + "62_3", + "631_3", + "633_0", + "633_1", + "634_1", + "635_1", + "635_2", + "635_3", + "637_1", + "637_2", + "637_3", + "641_2", + "642_0", + "644_2", + "646_1", + "647_2", + "647_3", + "649_3", + "64_0", + "650_1", + "651_0", + "651_2", + "652_0", + "652_1", + "653_0", + 
"654_1", + "655_0", + "655_2", + "655_3", + "658_0", + "658_1", + "659_2", + "659_3", + "65_3", + "663_0", + "663_1", + "663_2", + "666_1", + "66_0", + "66_3", + "670_1", + "670_2", + "671_1", + "671_2", + "672_0", + "672_3", + "673_1", + "673_2", + "676_3", + "677_1", + "677_3", + "679_2", + "679_3", + "67_1", + "67_3", + "680_3", + "683_1", + "683_3", + "686_2", + "686_3", + "687_2", + "688_1", + "689_0", + "692_0", + "692_1", + "693_0", + "695_0", + "695_2", + "696_2", + "697_2", + "698_2", + "698_3", + "6_2", + "700_0", + "700_2", + "703_1", + "706_0", + "708_0", + "708_2", + "70_0", + "710_0", + "710_2", + "710_3", + "712_0", + "713_3", + "715_1", + "716_1", + "719_0", + "719_1", + "722_3", + "725_0", + "726_1", + "726_2", + "726_3", + "727_1", + "728_0", + "730_0", + "731_2", + "731_3", + "732_0", + "732_1", + "732_2", + "734_0", + "734_2", + "735_1", + "735_3", + "73_1", + "74_1", + "76_2", + "77_3", + "78_2", + "79_0", + "7_0", + "80_1", + "82_0", + "84_1", + "88_0", + "88_2", + "88_3", + "89_1", + "8_0", + "8_1", + "8_2", + "8_3", + "91_2", + "92_2", + "93_2", + "94_0", + "94_1", + "98_2", + "99_2", + "9_1" + ], + "rl_query_ids": [ + "0_0", + "0_1", + "0_3", + "100_0", + "100_1", + "100_2", + "101_0", + "101_2", + "101_3", + "102_0", + "102_2", + "103_2", + "103_3", + "104_0", + "104_1", + "104_2", + "104_3", + "105_1", + "105_2", + "105_3", + "106_0", + "106_1", + "107_1", + "108_0", + "108_2", + "108_3", + "109_1", + "109_2", + "10_0", + "111_0", + "111_2", + "111_3", + "112_0", + "112_1", + "112_2", + "112_3", + "113_0", + "114_0", + "114_1", + "114_2", + "114_3", + "115_1", + "115_2", + "116_0", + "116_1", + "116_2", + "116_3", + "117_0", + "117_1", + "117_3", + "118_0", + "118_1", + "119_0", + "11_1", + "11_2", + "120_0", + "120_2", + "120_3", + "121_0", + "121_2", + "121_3", + "122_0", + "122_1", + "122_2", + "122_3", + "123_2", + "124_0", + "124_1", + "124_2", + "124_3", + "125_1", + "125_2", + "126_1", + "127_0", + "127_2", + "127_3", + "129_0", + 
"129_2", + "129_3", + "12_2", + "12_3", + "130_0", + "131_0", + "131_1", + "132_0", + "134_0", + "134_1", + "135_2", + "135_3", + "136_0", + "136_1", + "136_2", + "137_0", + "138_0", + "138_1", + "138_2", + "138_3", + "139_1", + "139_3", + "13_0", + "13_2", + "13_3", + "140_0", + "140_1", + "140_2", + "141_0", + "141_1", + "141_2", + "141_3", + "144_0", + "144_1", + "144_3", + "145_2", + "145_3", + "146_3", + "147_0", + "147_1", + "148_1", + "149_3", + "14_0", + "14_1", + "150_0", + "150_1", + "150_2", + "151_1", + "151_2", + "151_3", + "152_0", + "152_1", + "152_2", + "153_0", + "153_2", + "153_3", + "154_0", + "154_1", + "156_0", + "156_1", + "156_2", + "157_0", + "157_1", + "158_1", + "159_1", + "159_2", + "15_0", + "160_0", + "160_1", + "160_3", + "161_0", + "161_2", + "162_0", + "163_0", + "163_2", + "164_0", + "164_3", + "165_1", + "166_0", + "166_2", + "166_3", + "167_0", + "168_0", + "168_1", + "169_1", + "169_2", + "169_3", + "16_0", + "16_1", + "16_2", + "170_0", + "170_1", + "171_2", + "173_0", + "173_1", + "173_2", + "174_0", + "174_1", + "174_2", + "174_3", + "175_0", + "175_1", + "175_3", + "176_0", + "176_1", + "177_0", + "177_1", + "177_2", + "177_3", + "17_0", + "17_1", + "17_3", + "180_1", + "180_3", + "181_1", + "183_0", + "183_1", + "183_2", + "183_3", + "184_0", + "185_0", + "185_1", + "185_2", + "185_3", + "186_0", + "186_1", + "187_0", + "188_1", + "188_2", + "188_3", + "189_2", + "18_0", + "18_1", + "18_2", + "18_3", + "191_0", + "191_2", + "191_3", + "192_0", + "192_1", + "193_0", + "193_2", + "195_0", + "195_1", + "195_2", + "195_3", + "196_1", + "197_0", + "197_1", + "197_3", + "198_0", + "198_1", + "199_0", + "199_2", + "19_0", + "19_1", + "19_2", + "19_3", + "1_0", + "1_1", + "1_2", + "201_1", + "201_2", + "201_3", + "202_0", + "202_1", + "202_2", + "202_3", + "203_0", + "203_3", + "205_0", + "205_1", + "206_0", + "206_2", + "207_0", + "207_1", + "207_3", + "208_0", + "208_1", + "208_2", + "208_3", + "209_0", + "209_1", + "209_3", + 
"20_0", + "20_1", + "20_2", + "20_3", + "211_0", + "211_1", + "211_2", + "211_3", + "212_0", + "212_3", + "213_0", + "213_1", + "213_2", + "213_3", + "214_0", + "214_1", + "214_2", + "215_1", + "216_0", + "216_1", + "216_2", + "216_3", + "217_0", + "217_1", + "217_2", + "217_3", + "218_0", + "218_1", + "219_2", + "219_3", + "21_0", + "21_3", + "220_2", + "220_3", + "221_0", + "221_1", + "221_2", + "221_3", + "222_0", + "222_2", + "222_3", + "223_0", + "223_2", + "225_0", + "225_1", + "225_2", + "227_0", + "227_2", + "228_2", + "228_3", + "229_0", + "229_1", + "229_2", + "22_0", + "22_1", + "230_1", + "230_2", + "231_0", + "231_1", + "232_1", + "232_2", + "232_3", + "233_1", + "234_1", + "235_0", + "236_1", + "236_2", + "236_3", + "237_1", + "237_2", + "237_3", + "238_0", + "238_2", + "238_3", + "239_1", + "239_2", + "23_0", + "23_2", + "23_3", + "240_1", + "240_2", + "240_3", + "241_0", + "241_1", + "241_2", + "242_0", + "242_2", + "243_0", + "243_1", + "243_2", + "243_3", + "244_2", + "245_0", + "245_2", + "246_0", + "246_1", + "246_2", + "246_3", + "248_0", + "248_1", + "248_2", + "248_3", + "249_0", + "249_2", + "249_3", + "24_3", + "250_1", + "250_2", + "250_3", + "251_1", + "251_2", + "252_1", + "253_2", + "254_1", + "254_2", + "254_3", + "256_0", + "256_1", + "257_0", + "257_1", + "257_2", + "257_3", + "258_0", + "258_1", + "258_2", + "259_1", + "260_0", + "260_1", + "261_0", + "261_2", + "261_3", + "263_0", + "263_1", + "264_0", + "264_3", + "265_2", + "265_3", + "266_1", + "266_3", + "267_1", + "267_2", + "268_2", + "269_0", + "269_2", + "269_3", + "26_1", + "26_2", + "26_3", + "270_0", + "270_1", + "270_3", + "271_1", + "272_3", + "273_0", + "273_1", + "274_1", + "274_2", + "275_0", + "276_1", + "277_0", + "278_0", + "278_2", + "278_3", + "279_1", + "279_2", + "27_0", + "280_0", + "280_2", + "281_0", + "281_1", + "281_2", + "283_0", + "283_1", + "284_0", + "285_0", + "285_1", + "285_2", + "285_3", + "286_2", + "286_3", + "287_1", + "287_2", + "287_3", + 
"288_0", + "288_2", + "289_0", + "289_1", + "289_2", + "289_3", + "28_0", + "28_1", + "28_2", + "290_0", + "290_1", + "290_2", + "290_3", + "291_0", + "291_1", + "291_2", + "291_3", + "292_0", + "292_1", + "293_0", + "293_2", + "294_0", + "294_1", + "294_2", + "295_1", + "295_2", + "296_0", + "296_1", + "296_2", + "296_3", + "297_0", + "299_1", + "29_0", + "29_1", + "29_2", + "29_3", + "2_2", + "2_3", + "300_0", + "300_1", + "301_1", + "301_2", + "302_0", + "303_0", + "303_2", + "303_3", + "304_0", + "304_1", + "304_2", + "305_0", + "306_0", + "306_1", + "306_2", + "306_3", + "307_0", + "309_1", + "30_0", + "30_2", + "30_3", + "310_0", + "310_1", + "311_0", + "311_1", + "311_2", + "311_3", + "312_0", + "313_0", + "313_1", + "313_2", + "313_3", + "314_0", + "314_1", + "314_2", + "314_3", + "316_1", + "316_2", + "317_0", + "317_1", + "318_0", + "318_1", + "318_2", + "318_3", + "31_0", + "31_1", + "31_2", + "320_0", + "320_1", + "320_2", + "320_3", + "321_1", + "321_2", + "321_3", + "322_0", + "322_1", + "323_0", + "323_1", + "323_2", + "323_3", + "324_2", + "326_1", + "326_2", + "326_3", + "327_0", + "327_1", + "327_2", + "327_3", + "329_1", + "329_2", + "329_3", + "32_1", + "330_0", + "330_1", + "330_2", + "331_1", + "331_2", + "331_3", + "332_0", + "332_1", + "332_2", + "333_0", + "333_1", + "333_2", + "333_3", + "336_0", + "336_2", + "336_3", + "337_0", + "337_1", + "337_3", + "339_1", + "339_2", + "33_0", + "33_1", + "340_0", + "341_1", + "341_3", + "342_0", + "342_1", + "343_1", + "344_0", + "344_1", + "344_2", + "344_3", + "345_0", + "345_1", + "345_2", + "346_0", + "347_0", + "347_1", + "347_2", + "347_3", + "348_2", + "348_3", + "349_2", + "349_3", + "34_1", + "350_0", + "350_1", + "351_1", + "352_1", + "352_2", + "353_0", + "353_1", + "354_0", + "354_1", + "354_3", + "355_0", + "355_1", + "355_2", + "356_0", + "356_1", + "357_0", + "357_1", + "357_3", + "358_0", + "358_1", + "358_2", + "358_3", + "35_0", + "35_3", + "360_0", + "360_1", + "361_0", + "363_0", 
+ "363_1", + "363_2", + "363_3", + "364_0", + "364_1", + "364_2", + "365_0", + "365_1", + "365_2", + "366_0", + "366_1", + "366_2", + "367_0", + "367_1", + "367_2", + "368_2", + "368_3", + "369_0", + "369_1", + "369_2", + "369_3", + "36_0", + "370_0", + "370_1", + "371_0", + "371_1", + "374_1", + "374_2", + "374_3", + "375_0", + "375_2", + "375_3", + "376_0", + "376_2", + "376_3", + "377_0", + "377_1", + "377_2", + "378_0", + "378_2", + "378_3", + "37_0", + "37_1", + "37_2", + "381_0", + "381_1", + "381_2", + "383_0", + "383_1", + "383_2", + "383_3", + "384_1", + "385_1", + "385_2", + "386_0", + "386_2", + "387_0", + "389_0", + "389_1", + "38_2", + "38_3", + "390_0", + "390_3", + "391_0", + "392_0", + "392_1", + "392_2", + "392_3", + "393_0", + "393_1", + "394_0", + "394_1", + "394_2", + "394_3", + "395_0", + "395_2", + "396_0", + "396_1", + "396_3", + "397_0", + "397_1", + "397_2", + "398_1", + "399_0", + "399_1", + "39_0", + "39_1", + "3_0", + "400_0", + "400_1", + "401_0", + "401_1", + "401_3", + "402_1", + "402_3", + "403_0", + "403_1", + "403_2", + "403_3", + "405_1", + "405_3", + "406_0", + "406_1", + "407_3", + "408_0", + "408_1", + "408_3", + "409_0", + "40_2", + "411_0", + "411_1", + "411_2", + "411_3", + "412_0", + "413_0", + "413_1", + "413_2", + "413_3", + "414_0", + "414_2", + "415_0", + "415_3", + "416_0", + "416_2", + "416_3", + "417_0", + "417_2", + "417_3", + "418_0", + "41_0", + "41_1", + "420_0", + "420_1", + "420_2", + "420_3", + "421_0", + "421_2", + "421_3", + "422_1", + "422_3", + "423_0", + "423_1", + "424_0", + "426_0", + "426_2", + "426_3", + "427_0", + "427_1", + "427_2", + "428_0", + "428_2", + "428_3", + "429_0", + "429_1", + "429_2", + "429_3", + "42_0", + "42_1", + "42_3", + "430_0", + "430_1", + "430_2", + "430_3", + "432_0", + "432_1", + "432_2", + "432_3", + "433_0", + "433_1", + "433_2", + "433_3", + "435_0", + "435_2", + "435_3", + "436_0", + "436_2", + "438_0", + "438_1", + "438_3", + "439_3", + "43_0", + "43_3", + "440_1", + 
"440_2", + "440_3", + "441_0", + "442_0", + "442_1", + "443_0", + "444_1", + "444_3", + "445_0", + "445_2", + "445_3", + "446_0", + "446_1", + "447_2", + "448_0", + "449_0", + "450_0", + "450_1", + "450_2", + "450_3", + "451_1", + "451_2", + "451_3", + "453_0", + "453_1", + "453_3", + "454_1", + "454_3", + "457_0", + "457_3", + "458_0", + "458_2", + "459_0", + "459_1", + "459_2", + "459_3", + "45_0", + "45_3", + "460_1", + "460_2", + "461_0", + "461_2", + "461_3", + "462_0", + "462_1", + "462_2", + "462_3", + "464_0", + "464_1", + "464_2", + "464_3", + "465_0", + "465_1", + "465_2", + "465_3", + "466_0", + "467_0", + "467_1", + "467_3", + "468_1", + "468_2", + "468_3", + "469_1", + "469_2", + "469_3", + "46_0", + "46_3", + "470_0", + "470_1", + "470_2", + "470_3", + "472_0", + "472_1", + "472_3", + "473_0", + "475_2", + "476_0", + "476_1", + "476_2", + "478_1", + "479_0", + "480_0", + "480_1", + "480_2", + "481_0", + "482_0", + "482_2", + "482_3", + "483_0", + "483_1", + "483_2", + "483_3", + "484_0", + "484_1", + "484_2", + "484_3", + "485_1", + "485_2", + "486_0", + "487_2", + "488_0", + "488_2", + "488_3", + "489_0", + "489_3", + "48_0", + "491_0", + "491_2", + "491_3", + "492_0", + "493_0", + "495_0", + "495_2", + "495_3", + "496_0", + "496_1", + "496_2", + "496_3", + "497_0", + "497_1", + "497_3", + "498_0", + "498_2", + "499_2", + "499_3", + "49_0", + "49_1", + "49_2", + "49_3", + "4_0", + "4_1", + "4_2", + "500_0", + "500_1", + "500_2", + "500_3", + "501_0", + "501_1", + "501_2", + "502_0", + "502_1", + "503_0", + "503_3", + "504_0", + "504_1", + "504_2", + "504_3", + "505_0", + "505_1", + "506_0", + "506_1", + "506_2", + "507_0", + "507_1", + "507_2", + "507_3", + "508_0", + "508_1", + "508_3", + "509_0", + "509_1", + "509_3", + "50_0", + "50_1", + "50_2", + "50_3", + "510_2", + "510_3", + "511_2", + "511_3", + "512_1", + "512_2", + "512_3", + "514_0", + "514_1", + "514_2", + "515_0", + "515_1", + "516_0", + "516_1", + "516_2", + "517_1", + "519_0", + 
"519_1", + "519_2", + "51_0", + "51_1", + "51_2", + "51_3", + "520_1", + "520_3", + "521_0", + "521_3", + "522_0", + "522_1", + "523_1", + "523_2", + "523_3", + "524_1", + "524_3", + "525_1", + "525_2", + "527_0", + "528_0", + "528_1", + "528_2", + "528_3", + "529_0", + "52_1", + "52_2", + "52_3", + "530_2", + "530_3", + "531_0", + "531_1", + "533_0", + "533_1", + "533_2", + "533_3", + "534_0", + "534_1", + "534_3", + "536_0", + "537_0", + "537_2", + "538_0", + "538_1", + "539_0", + "539_1", + "539_2", + "540_0", + "540_1", + "540_2", + "540_3", + "542_0", + "542_1", + "542_2", + "542_3", + "543_1", + "543_2", + "543_3", + "544_0", + "544_1", + "544_2", + "545_0", + "545_2", + "548_1", + "549_0", + "549_1", + "549_2", + "549_3", + "54_1", + "550_0", + "550_1", + "551_0", + "553_0", + "553_1", + "553_2", + "553_3", + "554_1", + "554_2", + "556_0", + "556_1", + "556_2", + "557_0", + "557_1", + "557_2", + "557_3", + "558_0", + "558_1", + "558_3", + "559_0", + "559_1", + "559_3", + "55_1", + "55_2", + "55_3", + "560_1", + "560_3", + "561_0", + "561_2", + "561_3", + "562_1", + "562_2", + "562_3", + "563_0", + "564_0", + "565_0", + "565_1", + "566_0", + "566_1", + "566_2", + "567_0", + "567_1", + "567_2", + "567_3", + "568_0", + "568_1", + "568_2", + "568_3", + "569_0", + "569_2", + "569_3", + "571_0", + "571_1", + "571_2", + "571_3", + "572_0", + "572_1", + "572_2", + "572_3", + "573_1", + "574_1", + "574_2", + "574_3", + "575_0", + "575_1", + "575_2", + "576_1", + "576_2", + "577_0", + "577_1", + "578_0", + "578_1", + "578_3", + "579_0", + "579_1", + "579_2", + "579_3", + "57_0", + "57_1", + "581_0", + "581_1", + "581_2", + "581_3", + "582_0", + "583_0", + "583_1", + "583_2", + "583_3", + "584_1", + "584_2", + "585_0", + "586_0", + "586_2", + "587_0", + "587_1", + "587_2", + "588_0", + "588_1", + "588_3", + "589_1", + "589_2", + "58_0", + "58_1", + "590_1", + "591_0", + "591_1", + "591_2", + "592_1", + "592_2", + "592_3", + "593_2", + "593_3", + "595_0", + "595_1", + 
"596_0", + "596_1", + "596_2", + "596_3", + "597_1", + "597_2", + "598_0", + "598_1", + "599_0", + "599_1", + "59_1", + "59_2", + "5_0", + "5_3", + "600_0", + "600_1", + "600_2", + "601_1", + "601_2", + "601_3", + "602_0", + "602_1", + "603_2", + "603_3", + "604_0", + "607_0", + "607_1", + "607_2", + "607_3", + "608_0", + "608_1", + "608_2", + "608_3", + "609_0", + "609_1", + "60_1", + "60_2", + "60_3", + "611_0", + "613_0", + "614_0", + "614_1", + "614_3", + "616_2", + "616_3", + "617_1", + "617_2", + "617_3", + "618_0", + "618_3", + "619_0", + "61_0", + "621_0", + "621_2", + "622_0", + "622_2", + "623_0", + "623_1", + "623_2", + "623_3", + "624_0", + "624_2", + "624_3", + "625_1", + "625_2", + "625_3", + "626_0", + "626_1", + "627_1", + "627_2", + "629_0", + "629_1", + "629_2", + "629_3", + "62_0", + "62_1", + "631_0", + "631_1", + "631_2", + "632_0", + "632_1", + "632_2", + "632_3", + "633_2", + "633_3", + "634_0", + "634_2", + "634_3", + "635_0", + "637_0", + "638_0", + "639_0", + "640_0", + "640_1", + "641_0", + "641_1", + "641_3", + "642_1", + "642_2", + "642_3", + "643_0", + "643_1", + "644_0", + "644_1", + "644_3", + "646_0", + "646_2", + "646_3", + "647_0", + "647_1", + "648_0", + "649_0", + "649_1", + "649_2", + "64_1", + "64_2", + "650_0", + "650_2", + "650_3", + "651_1", + "651_3", + "652_2", + "652_3", + "653_1", + "653_2", + "653_3", + "654_0", + "655_1", + "656_0", + "657_0", + "657_1", + "657_2", + "657_3", + "658_2", + "658_3", + "659_0", + "659_1", + "65_0", + "65_1", + "65_2", + "660_0", + "660_1", + "660_2", + "660_3", + "661_0", + "661_1", + "662_0", + "662_1", + "662_2", + "662_3", + "663_3", + "664_0", + "665_0", + "665_1", + "665_2", + "665_3", + "666_0", + "667_0", + "667_1", + "667_2", + "668_0", + "669_0", + "669_1", + "669_2", + "66_1", + "66_2", + "670_0", + "670_3", + "671_0", + "672_1", + "672_2", + "673_0", + "675_0", + "675_1", + "675_2", + "675_3", + "676_0", + "676_1", + "676_2", + "677_0", + "677_2", + "678_0", + "679_0", + 
"679_1", + "67_0", + "67_2", + "680_0", + "680_1", + "680_2", + "683_0", + "683_2", + "684_0", + "684_1", + "684_2", + "685_0", + "686_0", + "686_1", + "687_0", + "687_1", + "687_3", + "688_0", + "688_2", + "688_3", + "689_1", + "689_2", + "689_3", + "68_0", + "690_0", + "690_1", + "692_2", + "692_3", + "693_1", + "695_1", + "695_3", + "696_0", + "696_1", + "696_3", + "697_0", + "697_1", + "697_3", + "698_0", + "698_1", + "699_0", + "699_1", + "6_0", + "6_1", + "6_3", + "700_1", + "700_3", + "701_0", + "701_1", + "701_2", + "703_0", + "703_2", + "703_3", + "704_0", + "704_1", + "705_0", + "705_1", + "705_2", + "706_1", + "706_2", + "706_3", + "707_0", + "707_1", + "708_1", + "709_0", + "709_1", + "709_2", + "709_3", + "710_1", + "711_0", + "711_1", + "711_2", + "712_1", + "712_2", + "712_3", + "713_0", + "713_1", + "713_2", + "714_0", + "714_1", + "715_0", + "715_2", + "715_3", + "716_0", + "716_2", + "716_3", + "717_0", + "717_1", + "717_2", + "717_3", + "718_0", + "719_2", + "71_0", + "71_1", + "71_2", + "71_3", + "720_0", + "720_1", + "722_0", + "722_1", + "722_2", + "723_0", + "723_1", + "725_1", + "725_2", + "725_3", + "726_0", + "727_0", + "727_2", + "727_3", + "728_1", + "728_2", + "728_3", + "729_0", + "729_1", + "729_2", + "729_3", + "730_1", + "730_2", + "730_3", + "731_0", + "731_1", + "732_3", + "734_1", + "735_0", + "735_2", + "736_0", + "73_0", + "74_0", + "74_2", + "75_0", + "75_1", + "75_2", + "75_3", + "76_0", + "76_1", + "76_3", + "77_0", + "77_1", + "77_2", + "78_0", + "78_1", + "78_3", + "79_1", + "7_1", + "7_2", + "7_3", + "80_0", + "80_2", + "80_3", + "81_0", + "82_1", + "82_2", + "82_3", + "83_0", + "83_1", + "84_0", + "84_2", + "84_3", + "85_0", + "85_1", + "85_3", + "87_0", + "88_1", + "89_0", + "90_0", + "90_1", + "90_2", + "90_3", + "91_0", + "91_1", + "91_3", + "92_0", + "92_1", + "92_3", + "93_0", + "93_1", + "93_3", + "94_2", + "94_3", + "95_0", + "95_1", + "96_0", + "96_1", + "96_2", + "96_3", + "97_0", + "97_1", + "97_2", + "97_3", + 
"98_0", + "98_1", + "98_3", + "99_0", + "99_1", + "99_3", + "9_0", + "9_2" + ], + "test_query_ids": [ + "0_0", + "0_1", + "0_2", + "0_3", + "100_0", + "100_1", + "100_2", + "101_0", + "101_1", + "101_2", + "102_0", + "102_1", + "102_2", + "102_3", + "103_0", + "103_1", + "103_2", + "103_3", + "104_0", + "104_1", + "104_2", + "105_0", + "105_1", + "106_0", + "106_1", + "107_0", + "107_1", + "107_2", + "107_3", + "108_0", + "108_1", + "108_2", + "108_3", + "109_0", + "109_1", + "109_2", + "109_3", + "10_0", + "10_1", + "10_2", + "10_3", + "110_0", + "110_1", + "110_2", + "110_3", + "112_0", + "112_1", + "112_2", + "112_3", + "113_0", + "113_1", + "113_2", + "113_3", + "114_0", + "114_1", + "114_2", + "115_0", + "115_1", + "115_2", + "115_3", + "117_0", + "117_1", + "118_0", + "118_1", + "118_2", + "118_3", + "119_1", + "119_2", + "11_0", + "11_1", + "11_2", + "11_3", + "120_0", + "120_1", + "120_2", + "121_0", + "121_1", + "121_2", + "121_3", + "122_0", + "122_1", + "122_2", + "122_3", + "123_0", + "123_1", + "123_2", + "123_3", + "125_0", + "125_1", + "125_2", + "126_0", + "126_1", + "126_2", + "126_3", + "127_0", + "127_1", + "128_0", + "128_1", + "128_2", + "128_3", + "129_0", + "129_1", + "12_0", + "12_1", + "12_2", + "12_3", + "130_0", + "130_1", + "130_2", + "130_3", + "131_0", + "131_1", + "132_0", + "132_1", + "132_2", + "132_3", + "133_0", + "133_1", + "133_2", + "133_3", + "134_0", + "134_1", + "134_2", + "134_3", + "136_0", + "136_1", + "137_0", + "137_1", + "137_2", + "138_0", + "138_1", + "138_2", + "138_3", + "139_0", + "139_1", + "13_0", + "13_1", + "13_2", + "13_3", + "140_0", + "140_1", + "140_2", + "140_3", + "141_0", + "141_1", + "141_2", + "141_3", + "142_0", + "142_1", + "142_2", + "142_3", + "143_0", + "143_1", + "143_2", + "144_0", + "144_1", + "144_2", + "144_3", + "145_0", + "145_1", + "145_2", + "145_3", + "146_0", + "146_1", + "146_2", + "146_3", + "147_0", + "147_1", + "147_2", + "147_3", + "148_0", + "148_1", + "148_2", + "148_3", + 
"149_0", + "149_1", + "149_2", + "14_0", + "14_1", + "150_0", + "151_0", + "151_1", + "151_2", + "151_3", + "152_0", + "152_1", + "152_2", + "152_3", + "153_0", + "153_1", + "153_2", + "153_3", + "154_0", + "154_1", + "154_2", + "154_3", + "155_0", + "155_1", + "155_2", + "155_3", + "156_0", + "156_1", + "156_2", + "156_3", + "157_0", + "157_1", + "158_0", + "159_0", + "159_1", + "159_2", + "159_3", + "15_0", + "15_1", + "15_2", + "15_3", + "161_0", + "161_1", + "162_0", + "162_1", + "162_2", + "162_3", + "163_0", + "163_1", + "163_2", + "164_0", + "165_0", + "165_1", + "165_2", + "165_3", + "166_0", + "166_1", + "166_2", + "166_3", + "167_0", + "167_1", + "167_2", + "167_3", + "168_0", + "168_1", + "168_2", + "168_3", + "169_0", + "169_1", + "169_2", + "169_3", + "16_0", + "16_1", + "16_2", + "16_3", + "170_0", + "170_1", + "170_2", + "170_3", + "172_0", + "172_1", + "172_2", + "172_3", + "173_0", + "173_1", + "173_2", + "173_3", + "175_0", + "175_1", + "176_0", + "176_1", + "176_2", + "177_0", + "177_1", + "177_2", + "177_3", + "178_0", + "179_0", + "179_1", + "179_2", + "17_0", + "17_1", + "17_2", + "180_0", + "180_1", + "180_2", + "180_3", + "181_0", + "181_1", + "182_0", + "182_1", + "182_2", + "182_3", + "185_0", + "185_1", + "186_0", + "186_1", + "186_2", + "186_3", + "187_0", + "187_1", + "187_2", + "187_3", + "18_0", + "18_1", + "18_2", + "19_0", + "19_1", + "19_2", + "19_3", + "1_0", + "1_1", + "1_2", + "1_3", + "21_0", + "21_1", + "22_0", + "22_1", + "22_2", + "22_3", + "23_0", + "23_1", + "23_2", + "23_3", + "24_0", + "24_1", + "24_2", + "24_3", + "25_0", + "25_1", + "26_0", + "27_0", + "27_1", + "28_0", + "28_1", + "28_2", + "29_0", + "29_1", + "29_2", + "29_3", + "2_0", + "2_1", + "2_2", + "2_3", + "30_0", + "30_1", + "30_2", + "31_0", + "31_1", + "31_2", + "32_0", + "32_1", + "32_2", + "32_3", + "33_0", + "33_1", + "33_2", + "33_3", + "34_0", + "34_1", + "34_2", + "35_0", + "36_0", + "36_1", + "36_2", + "36_3", + "37_0", + "37_1", + "37_2", + "37_3", 
+ "38_0", + "38_1", + "39_0", + "39_1", + "3_0", + "3_1", + "3_2", + "3_3", + "40_0", + "40_1", + "40_2", + "40_3", + "41_0", + "41_1", + "41_2", + "41_3", + "42_0", + "45_0", + "45_1", + "45_2", + "45_3", + "46_0", + "46_1", + "46_2", + "46_3", + "47_0", + "47_1", + "48_0", + "48_1", + "48_2", + "48_3", + "49_0", + "49_1", + "49_2", + "49_3", + "4_0", + "4_1", + "4_2", + "4_3", + "50_0", + "50_1", + "50_2", + "50_3", + "52_0", + "53_0", + "53_1", + "53_2", + "53_3", + "54_0", + "54_1", + "55_0", + "55_1", + "55_2", + "55_3", + "56_0", + "56_1", + "56_2", + "57_0", + "57_1", + "57_2", + "57_3", + "58_0", + "58_1", + "59_0", + "59_1", + "5_0", + "5_1", + "5_2", + "5_3", + "60_0", + "60_1", + "60_2", + "60_3", + "61_0", + "61_1", + "62_0", + "62_1", + "62_2", + "62_3", + "63_0", + "63_1", + "63_2", + "63_3", + "64_0", + "64_1", + "64_2", + "64_3", + "66_0", + "66_1", + "68_0", + "6_0", + "6_1", + "6_2", + "6_3", + "70_0", + "70_1", + "70_2", + "70_3", + "71_0", + "71_1", + "72_0", + "73_0", + "73_1", + "74_0", + "74_1", + "74_2", + "74_3", + "75_0", + "75_1", + "75_2", + "75_3", + "76_0", + "77_0", + "77_1", + "77_2", + "77_3", + "78_0", + "78_1", + "78_2", + "79_0", + "79_1", + "79_2", + "79_3", + "7_0", + "7_1", + "81_0", + "81_1", + "81_2", + "81_3", + "82_0", + "82_1", + "82_2", + "82_3", + "83_0", + "83_1", + "84_0", + "84_1", + "84_2", + "85_0", + "85_1", + "85_2", + "85_3", + "86_0", + "86_1", + "86_2", + "86_3", + "87_0", + "88_0", + "89_0", + "8_0", + "8_1", + "90_0", + "90_1", + "90_2", + "90_3", + "91_0", + "91_1", + "91_2", + "91_3", + "92_0", + "92_1", + "92_2", + "92_3", + "93_0", + "94_0", + "94_1", + "94_2", + "94_3", + "95_0", + "95_1", + "95_2", + "95_3", + "96_0", + "96_1", + "96_2", + "96_3", + "97_0", + "97_1", + "97_2", + "97_3", + "98_0", + "98_1", + "98_2", + "99_0", + "99_1", + "99_2", + "9_0", + "9_1", + "9_2", + "9_3" + ] +} \ No newline at end of file diff --git a/cosmos-retriever/src/cosmos_retriever/env_rl.py 
b/cosmos-retriever/src/cosmos_retriever/env_rl.py new file mode 100644 index 0000000..6e4b6a7 --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/env_rl.py @@ -0,0 +1,812 @@ + +# Allow direct execution from subdirectories while keeping imports package-relative. +import sys +from pathlib import Path + +_REPO_ROOT = Path(__file__).resolve().parents[1] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + +"""Inference-only search environment. + +Drives the trained Harness-1 policy over a corpus for a single query: it owns +the ``WorkingMemory`` / ``curate`` / ``fan_out_search`` machinery and renders +budget-bounded context each turn via ``ultra_core``. There is no gold data, +reward computation, or RL training here — recall is scored externally by the +caller (e.g. ``scripts/bench_erag.py``) against ``env.wm.curated_ids``. + +Consumed by ``retriever.py`` and ``inference/evaluate_harness1_vllm.py``. +""" + +import asyncio +import copy +import json +import os +import re +import time +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass, field +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Set, + Tuple, +) + +import structlog +import tinker +from openai_harmony import ( + Conversation, + HarmonyEncoding, + HarmonyEncodingName, + Message, + Role, + load_harmony_encoding, +) +from tinker_cookbook.rl.types import ( + Env, + Observation as TinkerObservation, + StopCondition, + Action as TinkerAction, + StepResult, +) + +from cosmos_retriever.agent import TinkerAgentInferenceModel +from cosmos_retriever.trajectory import ( + Action, + Observation, + ActionBuilder, + ObservationBuilder, +) +from cosmos_retriever.tools import ( + Tool, + ToolSet, + ToolSchema, + ToolCallMetadata, + SearchCorpusTool, + SearchCorpusToolCallMetadata, + GrepCorpusTool, + GrepCorpusToolCallMetadata, + ReadDocumentTool, + PruneChunksTool, + UserTextTool, + SEARCH_CORPUS_SCHEMA, + GREP_CORPUS_SCHEMA, + 
READ_DOCUMENT_SCHEMA, + MULTI_TOOL_USE_SCHEMA, +) + +from cosmos_retriever.ultra_core import ( + WorkingMemory, + WorkingMemorySnapshot, + build_result_summary, + get_system_prompt, + render_context_within_budget, + parse_doc_ids_from_observation, + parse_doc_texts_from_observation, + # Schemas + FAN_OUT_SEARCH_SCHEMA, + CURATE_SCHEMA, + END_SEARCH_SCHEMA, + REVIEW_DOCS_SCHEMA, + VERIFY_SCHEMA, + # v8d helpers + append_token_marker, + compress_search_observation, + auto_populate_from_first_search, + build_rerank_instruction, + exec_verify_claim, + AUTO_POPULATE_TOP_K, + V8D_AUTO_POPULATE_FIRST_SEARCH, + V8D_IMPORTANCE_TAGGING, + V8D_SENTENCE_COMPRESS, + V8D_TOKEN_BUDGET_MARKER, + V8D_VERIFY_TOOL, + V8D_ADAPTIVE_RERANK_INSTRUCTION, + # Constants + FAN_OUT_MAX_QUERIES, + MAX_REVIEW_DOCS, + MAX_FORMAT_RETRIES, + CURATE_NUDGE_INTERVAL, + CURATE_NUDGE_PROMPT, + FORMAT_RETRY_PROMPT, + FORMAT_ERROR_PENALTY, + RECENT_K, + PROMPT_TOKEN_BUDGET, + SEARCH_DISPLAY_LIMIT, + MAX_TURNS, +) + +logger = structlog.get_logger("ultra_rl_v3") + +# Save trajectory details for debugging +SAVE_TRAJECTORIES = os.environ.get("SAVE_TRAJECTORIES", "1") == "1" +TRAJECTORY_SAVE_PATH = os.environ.get("TRAJECTORY_SAVE_PATH", None) +ABLATE_VERIFY_UNAVAILABLE = os.environ.get("ABLATE_VERIFY_UNAVAILABLE", "0") == "1" +ABLATE_REVIEW_DOCS_UNAVAILABLE = os.environ.get("ABLATE_REVIEW_DOCS_UNAVAILABLE", "0") == "1" + + +# ═══════════════════════════════════════════════════════════════════════════════ +# Tool Stubs (for toolset registration — dispatch handled by env) +# ═══════════════════════════════════════════════════════════════════════════════ + +class FanOutSearchToolCallMetadata(ToolCallMetadata): + returned_chunk_ids: List[str] + queries_executed: int + + +class FanOutSearchTool(Tool): + tool_schema: ToolSchema + def __init__(self): + super().__init__(tool_schema=FAN_OUT_SEARCH_SCHEMA) + def __call__(self, params, overrides=None): + raise NotImplementedError("Handled by env") + + +class 
CurateTool(Tool): + tool_schema: ToolSchema + def __init__(self): + super().__init__(tool_schema=CURATE_SCHEMA) + def __call__(self, params, overrides=None): + raise NotImplementedError("Handled by env") + + +class EndSearchTool(Tool): + tool_schema: ToolSchema + def __init__(self): + super().__init__(tool_schema=END_SEARCH_SCHEMA) + def __call__(self, params, overrides=None): + return "Search concluded.", None + + +class ReviewDocsTool(Tool): + tool_schema: ToolSchema + def __init__(self): + super().__init__(tool_schema=REVIEW_DOCS_SCHEMA) + def __call__(self, params, overrides=None): + raise NotImplementedError("Handled by env") + + +class VerifyTool(Tool): + """v8d: stub for the verify(doc_ids, claim) tool. Dispatched by env.""" + tool_schema: ToolSchema + def __init__(self): + super().__init__(tool_schema=VERIFY_SCHEMA) + def __call__(self, params, overrides=None): + raise NotImplementedError("Handled by env") + +class SlidingWindowSearchEnv(Env): + """Two-tier-memory search environment with budget-enforced context rendering. + + Inference-only: drives the trained policy and exposes the curated documents + via :pyattr:`wm`. There is no gold data or reward computation here; recall is + scored by the caller against :pyattr:`wm.curated_ids`. 
async def initial_observation(self) -> Tuple[TinkerObservation, StopCondition]:
    """Reset working memory and render the opening prompt of the episode.

    Returns:
        A ``(model_input, stop_condition)`` pair. The model input is the
        system prompt rendered with no working-memory text and an empty turn
        history; the stop condition is the one chosen in ``__init__``.
    """
    # ``__init__`` attaches the per-episode rerank instruction to the working
    # memory object. Replacing that object below would drop the instruction,
    # and the search helpers (which read ``self.wm.rerank_instruction``) would
    # then never forward it to the search tool. Carry it across the reset.
    rerank_instruction = getattr(self.wm, "rerank_instruction", None)
    self.wm = WorkingMemory(self.query_text, normalize_ids=self._normalize_ids)
    if rerank_instruction:
        self.wm.rerank_instruction = rerank_instruction
    self._wm_snapshots.append(self.wm.snapshot())

    tokens = render_context_within_budget(
        system_prompt=self.system_prompt,
        wm_text=None,
        recent_actions=[],
        recent_observations=[],
        result_summaries=None,
        enc=self.enc,
    )
    return tinker.ModelInput.from_ints(tokens), self.stop_condition
def _select_context_window(
    self,
) -> Tuple[Optional[str], List[Action], List[Observation], List[str]]:
    """Split the episode history into working-memory text plus recent turns.

    Returns:
        ``(wm_text, actions, observations, summaries)``. While the episode
        has at most ``RECENT_K`` turns, ``wm_text`` is ``None`` and the full
        history is returned. After that, ``wm_text`` is the text of the
        working-memory snapshot at index ``n_turns - RECENT_K`` and only the
        last ``RECENT_K`` entries of each history list are returned.
    """
    n_turns = len(self._all_actions)
    if n_turns <= RECENT_K:
        return (
            None,
            self._all_actions,
            self._all_observations,
            self._result_summaries,
        )

    wm_boundary = n_turns - RECENT_K
    return (
        self._wm_snapshots[wm_boundary].text,
        self._all_actions[-RECENT_K:],
        self._all_observations[-RECENT_K:],
        self._result_summaries[-RECENT_K:],
    )


def _render_next_context(self) -> List[int]:
    """Render context for the next turn using render_context_within_budget.

    Adds the curate nudge prompt when at least ``CURATE_NUDGE_INTERVAL``
    turns have passed without a curate call and the pool is non-empty.

    Returns:
        The rendered prompt as a list of token ids.
    """
    wm_text, recent_actions, recent_obs, recent_summaries = (
        self._select_context_window()
    )

    nudge = None
    if (
        self._turns_since_curate >= CURATE_NUDGE_INTERVAL
        and self.wm.get_pool_size() > 0
    ):
        nudge = CURATE_NUDGE_PROMPT

    tokens = render_context_within_budget(
        system_prompt=self.system_prompt,
        wm_text=wm_text,
        recent_actions=recent_actions,
        recent_observations=recent_obs,
        result_summaries=recent_summaries,
        enc=self.enc,
        nudge_prompt=nudge,
    )
    # v8d: stash size so the next tool output can append an accurate marker.
    self._approx_prompt_tokens = len(tokens)
    return tokens


def _render_retry_context(self) -> List[int]:
    """Re-render the current context with the format-retry prompt appended.

    Uses the same history window as ``_render_next_context`` but passes
    ``FORMAT_RETRY_PROMPT`` instead of a nudge, and does not update the
    stashed prompt size.
    """
    wm_text, recent_actions, recent_obs, recent_summaries = (
        self._select_context_window()
    )

    return render_context_within_budget(
        system_prompt=self.system_prompt,
        wm_text=wm_text,
        recent_actions=recent_actions,
        recent_observations=recent_obs,
        result_summaries=recent_summaries,
        enc=self.enc,
        retry_prompt=FORMAT_RETRY_PROMPT,
    )
def _build_full_toolset(self) -> ToolSet:
    """Return a new toolset holding the caller's tools plus the env stubs.

    The caller-supplied tools are copied first; the environment-dispatched
    stubs are registered afterwards, so a stub replaces any caller tool that
    shares its name. The ``verify`` stub is added only when
    ``V8D_VERIFY_TOOL`` is set.
    """
    combined = ToolSet(name="ultra_v3_toolset")

    for tool_name, tool_obj in self.toolset.tools.items():
        combined.tools[tool_name] = tool_obj

    stub_classes = [
        ("fan_out_search", FanOutSearchTool),
        ("curate", CurateTool),
        ("end_search", EndSearchTool),
        ("review_docs", ReviewDocsTool),
    ]
    if V8D_VERIFY_TOOL:
        stub_classes.append(("verify", VerifyTool))

    for tool_name, stub_cls in stub_classes:
        combined.tools[tool_name] = stub_cls()

    return combined
def _maybe_wrap_search_output(
    self,
    output: str,
    query_for_compress: str,
    first_search_ranked_ids: Optional[List[str]] = None,
) -> str:
    """v8d wrapper: sentence compression + auto-populate + token marker.

    Each of the three stages is gated by its own ``V8D_*`` flag.

    Args:
        output: Raw tool observation text.
        query_for_compress: Query handed to the compressor; compression is
            skipped when it is empty.
        first_search_ranked_ids: Ranked ids from the search that produced
            ``output``. Auto-population runs at most once per episode and
            only when this list is non-empty.

    Returns:
        The (possibly rewritten and annotated) observation text.
    """
    # 1. Sentence-level compression (no-op unless flag on)
    if V8D_SENTENCE_COMPRESS and query_for_compress:
        output = compress_search_observation(query_for_compress, output)

    # 2. Auto-populate the curated set from the first search's top hits
    if (
        V8D_AUTO_POPULATE_FIRST_SEARCH
        and not self._first_search_done
        and first_search_ranked_ids
    ):
        added = auto_populate_from_first_search(
            self.wm, first_search_ranked_ids, top_k=AUTO_POPULATE_TOP_K,
        )
        self._first_search_done = True
        if added > 0:
            output = (
                output
                + f"\n\n[AUTO-POPULATED] Top {added} docs from this search have been "
                "added to your curated set at 'fair' importance. Use `curate` with "
                "`importance` to promote/demote and `remove_ids` to drop irrelevant ones."
            )

    # 3. Token budget marker (no-op unless flag on)
    if V8D_TOKEN_BUDGET_MARKER and self.text_token_counter is not None:
        try:
            used = self._approx_prompt_tokens + self.text_token_counter(output)
            output = append_token_marker(output, used)
        except Exception as e:
            # The marker is best-effort: keep the unmarked output, but leave
            # a trace instead of hiding a broken token counter.
            logger.warning(
                "token_marker_error", error=str(e)[:200], qid=self.query_id,
            )

    return output
def _exec_fan_out_search(self, params: Dict) -> Tuple[str, Optional[FanOutSearchToolCallMetadata]]:
    """Run several search queries in one tool call and merge their output.

    At most ``FAN_OUT_MAX_QUERIES`` entries of ``params["queries"]`` are
    considered; blank or non-string entries are skipped. Each remaining
    query goes to the search tool with the already-seen ids excluded. A
    query that raises contributes ``"No results."`` and does not abort the
    others.

    Returns:
        ``(combined_output, metadata)``. ``metadata.queries_executed`` is the
        number of queries actually sent to the search tool.
    """
    queries = params.get("queries", [])
    if not isinstance(queries, list) or not queries:
        return "No queries provided.", FanOutSearchToolCallMetadata(
            returned_chunk_ids=[], queries_executed=0,
        )

    queries = queries[:FAN_OUT_MAX_QUERIES]
    all_results: List[str] = []
    all_chunk_ids: List[str] = []
    attempted = 0  # skipped entries must not be reported as executed
    pool_before = self.wm.get_pool_size()

    for q in queries:
        if not isinstance(q, str) or not q.strip():
            continue
        attempted += 1
        try:
            # Rebuilt per query: ids returned by an earlier query in this
            # same call are excluded from later ones.
            overrides: Dict[str, Any] = {"ignore_ids": list(self._ids_seen)}
            if V8D_ADAPTIVE_RERANK_INSTRUCTION and self.wm.rerank_instruction:
                overrides["rerank_instruction"] = self.wm.rerank_instruction
            output, meta = self.search_tool({"query": q}, overrides)
            all_results.append(output)
            if meta and isinstance(meta, SearchCorpusToolCallMetadata):
                self._ids_seen.update(meta.returned_chunk_ids)
                doc_texts = parse_doc_texts_from_observation(output)
                self.wm.add_to_pool(meta.returned_chunk_ids, doc_texts)
                all_chunk_ids.extend(meta.returned_chunk_ids)
                for cid in meta.returned_chunk_ids:
                    # Key by the part before the first "_"; the first query
                    # that surfaced a document wins.
                    self._doc_id_to_query.setdefault(cid.split("_")[0], str(q))
        except Exception as e:
            logger.warning("fan_out_error", query=str(q)[:100], error=str(e)[:200])
            all_results.append("No results.")

    q_summary = "; ".join(str(q)[:30] for q in queries[:3])
    num_new = self.wm.get_pool_size() - pool_before
    self.wm.add_search_record(
        "fan_out", q_summary, len(all_chunk_ids), num_new=num_new,
    )
    combined = "\n".join(all_results) if all_results else "No results found."
    # v8d: compress (using concatenated query string), auto-populate, token marker
    concat_query = " ".join(str(q) for q in queries if isinstance(q, str))
    combined = self._maybe_wrap_search_output(
        combined,
        query_for_compress=concat_query,
        first_search_ranked_ids=all_chunk_ids,
    )
    return combined, FanOutSearchToolCallMetadata(
        returned_chunk_ids=all_chunk_ids, queries_executed=attempted,
    )
def _exec_curate(self, params: Dict) -> str:
    """Apply a ``curate`` call to working memory and return its report.

    ``add_ids`` / ``remove_ids`` may arrive as a list or as a single scalar;
    a truthy scalar becomes a one-element list of its string form and a
    falsy one becomes an empty list. The ``importance`` mapping is forwarded
    (keys and values stringified) only when ``V8D_IMPORTANCE_TAGGING`` is on
    and the parameter is a dict.
    """

    def as_id_list(value):
        if isinstance(value, list):
            return value
        return [str(value)] if value else []

    to_add = as_id_list(params.get("add_ids", []))
    to_remove = as_id_list(params.get("remove_ids", []))

    tags: Optional[Dict[str, str]] = None
    raw_tags = params.get("importance") if V8D_IMPORTANCE_TAGGING else None
    if isinstance(raw_tags, dict):
        tags = {str(doc_id): str(level) for doc_id, level in raw_tags.items()}

    return self.wm.curate(to_add, to_remove, importance=tags)
def _exec_review_docs(self, params: Dict) -> str:
    """Return working memory's review of the requested documents.

    When ``ABLATE_REVIEW_DOCS_UNAVAILABLE`` is set, the call is recorded and
    a fixed "unavailable" message is returned. Otherwise the ids are
    stringified and stripped, falsy entries are dropped, and the list is
    capped at ``MAX_REVIEW_DOCS``; the review is recorded after it succeeds.
    """
    if ABLATE_REVIEW_DOCS_UNAVAILABLE:
        self.wm.add_search_record("review", "unavailable", 0)
        return "review_docs: unavailable in this ablation."

    requested = params.get("doc_ids", [])
    if isinstance(requested, list):
        candidates = requested
    elif requested:
        candidates = [str(requested)]
    else:
        candidates = []

    cleaned = []
    for item in candidates:
        if item:
            cleaned.append(str(item).strip())
    cleaned = cleaned[:MAX_REVIEW_DOCS]

    if not cleaned:
        return "No doc_ids provided."

    review_text = self.wm.review_docs(cleaned)
    label = ", ".join(cleaned[:3])
    self.wm.add_search_record("review", label, len(cleaned))
    return review_text
class AgentInferenceModel(ABC):
    """Translate an :class:`InferenceContext` into the next :class:`Action`.

    Abstract interface: ``__call__`` is declared abstract, so the class
    cannot be instantiated until a subclass implements it.
    """

    @abstractmethod
    def __call__(self, context: InferenceContext) -> Action | None:
        """Sample the next action from the model. Return ``None`` to stop.

        Args:
            context: Inputs for this model call: the trajectory, the toolset,
                and an optional ``max_tokens`` value.

        Returns:
            The next :class:`Action`, or ``None`` to stop.
        """
import sys

# This module lives at <src>/cosmos_retriever/inference/, so the directory
# that must be on sys.path for ``import cosmos_retriever`` to resolve is
# <src>, three levels above this file. One level lower is the package
# directory itself, which does not make the package importable and would
# expose its modules (tools, config, ...) as top-level names.
# Chained ``.parent`` is used rather than ``parents[2]`` so a shallow path
# cannot raise IndexError.
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))
def save_full_trajectory(env: SlidingWindowSearchEnv) -> None:
    """Write one episode's turn-by-turn record to ``<root>/full/<query_id>.json``.

    ``<root>`` is ``TRAJECTORY_SAVE_PATH`` when set, otherwise ``LOG_PATH``,
    otherwise ``./tmp/rl_ultra_v3``. Slashes in the query id are replaced
    with underscores to form the file name.
    """
    traj_root = os.environ.get("TRAJECTORY_SAVE_PATH") or os.environ.get(
        "LOG_PATH", "./tmp/rl_ultra_v3"
    )
    full_dir = os.path.join(traj_root, "full")
    os.makedirs(full_dir, exist_ok=True)

    turns = []
    for i, (action, obs) in enumerate(zip(env._all_actions, env._all_observations)):
        turn_record = {"turn": i}
        if action.reasoning:
            turn_record["reasoning"] = action.reasoning

        tool_calls = []
        for tool, params in zip(action.tools, action.params):
            name = "user_text" if isinstance(tool, UserTextTool) else tool.tool_schema.name
            tool_calls.append({"tool": name, "params": params})
        turn_record["tool_calls"] = tool_calls

        tool_returns = []
        for j, obs_text in enumerate(obs.observations):
            tr = {"text": obs_text}
            if j < len(obs.tool_metadata) and obs.tool_metadata[j] is not None:
                try:
                    tr["metadata"] = obs.tool_metadata[j].model_dump()
                except Exception:
                    # Metadata without ``model_dump`` is stored as its string form.
                    tr["metadata"] = str(obs.tool_metadata[j])
            tool_returns.append(tr)
        turn_record["tool_returns"] = tool_returns
        turns.append(turn_record)

    # The environment is inference-only: it stores ``query_text`` and
    # ``_dataset_name`` but has no dataset object and computes no reward.
    # The reward/metrics keys are kept so the record layout stays stable;
    # they are filled only if the env happens to carry those attributes.
    terminal_metrics = getattr(env, "_terminal_metrics", None) or {}
    record = {
        "query_id": env.query_id,
        "query_text": env.query_text,
        "dataset": env._dataset_name,
        "system_prompt": env.system_prompt,
        "turns": turns,
        "curated_ids": env.wm.curated_ids,
        "curated_importance": dict(env.wm.curated_importance),
        "reward": getattr(env, "_terminal_reward", None),
        "metrics": {
            k: v
            for k, v in terminal_metrics.items()
            if isinstance(v, (int, float, str, bool))
        },
    }
    qid_safe = str(env.query_id).replace("/", "_")
    with open(os.path.join(full_dir, f"{qid_safe}.json"), "w", encoding="utf-8") as f:
        json.dump(record, f, indent=2, default=str)


async def run_single_episode(
    env: SlidingWindowSearchEnv,
    policy: VllmTokenCompleter,
) -> Dict:
    """Drive one episode to completion and return its summary statistics.

    Alternates ``policy`` (sample tokens) and ``env.step`` until the step
    result reports ``episode_done``.

    Returns:
        A dict with ``turns``, ``n_curated``, ``n_pool``, ``elapsed_s``,
        ``tool_types_used``, ``total_curate_calls`` and ``pool_ids``.
    """
    ob, stop_condition = await env.initial_observation()
    turns = 0
    start = time.time()

    while True:
        ac_with_logprobs = await policy(ob, stop_condition)
        step_result = await env.step(ac_with_logprobs.tokens)
        turns += 1
        if step_result.episode_done:
            break
        ob = step_result.next_observation
        stop_condition = step_result.next_stop_condition

    elapsed = time.time() - start
    result = {
        "turns": turns,
        "n_curated": len(env.wm.curated_ids),
        "n_pool": len(env.wm.pool_ids),
        "elapsed_s": round(elapsed, 1),
        "tool_types_used": list(env._tool_types_used),
        "total_curate_calls": env._total_curate_calls,
        "pool_ids": list(env.wm.pool_ids),
    }
    return result


async def eval_single_query(
    qid: str,
    dataset: SearchDataset,
    toolset: ToolSet,
    search_tool: SearchCorpusTool,
    text_token_counter,
    policy: VllmTokenCompleter,
    max_turns: int,
) -> Dict:
    """Run one query through a fresh environment and return its result row.

    A failure while running the episode is logged and turned into a row
    with ``error=True`` and zeroed metrics, so one bad query does not abort
    the whole evaluation. A failure while saving the optional full
    trajectory is logged and does not discard the episode result.
    """
    _, query_text = dataset.get_query_by_id(qid)
    # The env constructor takes the dataset *name* (a string), not the
    # dataset object.
    # NOTE(review): assumes SearchDataset exposes ``.name``; falls back to
    # the class name if it does not — confirm against search_dataset.py.
    dataset_name = getattr(dataset, "name", None) or type(dataset).__name__
    env = SlidingWindowSearchEnv(
        toolset=toolset,
        search_tool=search_tool,
        query_id=qid,
        query_text=query_text,
        dataset_name=dataset_name,
        text_token_counter=text_token_counter,
        max_turns=max_turns,
    )
    try:
        result = await run_single_episode(env=env, policy=policy)
        result["query_id"] = qid
        result["query"] = query_text[:80]
        # run_single_episode does not set "error"; make successful rows
        # explicit so logging and summaries can read the key.
        result.setdefault("error", False)
        if SAVE_FULL_TRAJECTORIES:
            try:
                save_full_trajectory(env)
            except Exception as exc:
                logger.warning(
                    "full_trajectory_save_failed", qid=qid, error=str(exc)[:300],
                )
        logger.info(
            "episode_result",
            qid=qid,
            recall=round(result.get("recall", 0), 3),
            trajectory_recall=round(result.get("trajectory_recall", 0), 3),
            final_answer_recall=round(result.get("final_answer_recall", 0), 3),
            reward=round(result.get("reward", 0), 3),
            curated=result["n_curated"],
            pool=result["n_pool"],
            turns=result["turns"],
            error=result["error"],
            time=result["elapsed_s"],
        )
        return result
    except Exception as exc:
        logger.error("episode_failed", qid=qid, error=str(exc)[:500])
        return {
            "query_id": qid,
            "query": query_text[:80],
            "error": True,
            "reward": 0,
            "recall": 0,
            "trajectory_recall": 0,
            "final_answer_recall": 0,
            "precision": 0,
            "n_curated": 0,
            "n_pool": 0,
            "turns": 0,
        }
async def main() -> None:
    """Parse CLI options, build the retrieval toolset, and evaluate a vLLM policy.

    Flow: parse arguments, build the token counter / dataset / Cosmos database
    / embedding client, construct the optional reranker and the four tools,
    choose the query IDs, run ``eval_queries``, print the summary table and,
    when ``--output`` is given, write a JSON report.

    Environment:
        EMBED_BASE_URL: When set, embeddings are requested from this
            OpenAI-compatible endpoint (using ``EMBED_API_KEY`` and
            ``EMBED_MODEL``) instead of ``config.get_openai_client()``.

    Raises:
        ValueError: If the dataset returns no containers for the requested
            collection split, or if ``--query-ids`` was given but none of the
            IDs belong to the selected split.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--dataset", default="browsecompplus")
    parser.add_argument("--split", default="test", choices=["all", "test", "train", "rl"])
    parser.add_argument("--collection-split", default="test", choices=["test", "train", "rl"])
    parser.add_argument("--n-queries", type=int, default=100)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--query-ids", nargs="*", default=None)
    parser.add_argument("--max-turns", type=int, default=MAX_TURNS)
    parser.add_argument("--max-tokens", type=int, default=2048)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top-p", type=float, default=0.9)
    parser.add_argument("--parallel", type=int, default=1)
    parser.add_argument("--base-url", default="http://127.0.0.1:8000/v1")
    parser.add_argument("--model", default="harness-1")
    parser.add_argument("--timeout", type=int, default=900)
    parser.add_argument("--output", default=None)
    parser.add_argument(
        "--partial-output",
        default=None,
        help="Append one JSON line per completed query so interrupted runs keep progress.",
    )
    parser.add_argument(
        "--reranker",
        type=str,
        default="baseten",
        choices=["baseten", "vllm", "none"],
        help="Reranker backend: baseten (original), vllm (local Qwen3-Reranker-8B drop-in), or none.",
    )
    args = parser.parse_args()

    config = get_config()
    tiktoken_enc = tiktoken.get_encoding("o200k_harmony")

    def text_token_counter(text: str) -> int:
        """Count tokens in ``text`` with the ``o200k_harmony`` encoding."""
        return len(tiktoken_enc.encode(text))

    dataset = get_dataset(args.dataset)
    collection_names = dataset.get_cosmos_containers(split=args.collection_split)
    if not collection_names:
        # Every tool below is bound to collection_names[0]; fail with a clear
        # message instead of an IndexError deep inside tool construction.
        raise ValueError(
            f"Dataset {args.dataset!r} has no Cosmos containers for "
            f"collection split {args.collection_split!r}"
        )
    container_name = collection_names[0]
    cosmos_database = config.get_cosmos_database()

    # Embedding backend: an explicit OpenAI-compatible endpoint wins over the
    # client configured through ``config``.
    embed_base_url = os.environ.get("EMBED_BASE_URL")
    if embed_base_url:
        from openai import OpenAI

        openai_client = OpenAI(
            base_url=embed_base_url,
            api_key=os.environ.get("EMBED_API_KEY", "EMPTY"),
        )
        embed_model = os.environ.get("EMBED_MODEL", "qwen3-embed")
    else:
        openai_client = config.get_openai_client()
        embed_model = "text-embedding-3-small"

    # Reranker construction stays best-effort (the run continues without one),
    # but the failure is logged so a degraded evaluation is visible instead of
    # silently producing different numbers.
    # NOTE(review): the imports target ``harness.rerank`` — confirm that
    # package is importable where this script runs; an ImportError here used
    # to be swallowed without any trace.
    reranker = None
    if args.reranker != "none":
        try:
            if args.reranker == "vllm":
                from harness.rerank import VLLMQwen3Reranker

                reranker = VLLMQwen3Reranker(token_counter=text_token_counter, max_tokens=4096)
            else:
                from harness.rerank import BasetenReranker

                reranker = BasetenReranker(token_counter=text_token_counter, max_tokens=4096)
        except Exception as exc:  # noqa: BLE001 — deliberate best-effort fallback
            logger.warning(
                "reranker_init_failed",
                backend=args.reranker,
                error=str(exc)[:500],
            )
            reranker = None

    search_tool = SearchCorpusTool(
        cosmos_database=cosmos_database,
        openai_client=openai_client,
        cosmos_container_name=container_name,
        openai_ef_name=embed_model,
        reranker=reranker,
        snippet_max_chars=2048,
        display_limit=SEARCH_DISPLAY_LIMIT,
    )
    toolset = ToolSet(name=f"{args.dataset}_toolset")
    toolset.add_tool(search_tool)
    toolset.add_tool(
        GrepCorpusTool(
            cosmos_database=cosmos_database,
            cosmos_container_name=container_name,
            token_counter=text_token_counter,
        )
    )
    toolset.add_tool(
        ReadDocumentTool(
            cosmos_database=cosmos_database,
            cosmos_container_name=container_name,
            reranker=reranker,
            token_counter=text_token_counter,
            max_tokens=4096,
        )
    )
    toolset.add_tool(PruneChunksTool())

    if args.split == "all":
        all_qids = dataset.get_all_query_ids()
    elif args.split == "test":
        all_qids = dataset.get_test_query_ids()
    elif args.split == "rl":
        all_qids = dataset.get_rl_query_ids()
    else:
        all_qids = dataset.get_all_query_ids(split="train")

    if args.query_ids:
        known_qids = set(all_qids)
        query_ids = [qid for qid in args.query_ids if qid in known_qids]
        skipped = [qid for qid in args.query_ids if qid not in known_qids]
        if skipped:
            # Make typos / wrong-split IDs visible rather than quietly
            # evaluating fewer queries than requested.
            logger.warning(
                "unknown_query_ids_skipped",
                split=args.split,
                n_skipped=len(skipped),
                skipped=skipped[:20],
            )
        if not query_ids:
            raise ValueError("No valid query IDs remained after filtering")
    else:
        # Seeded sampling keeps the evaluated subset reproducible across runs.
        rng = random.Random(args.seed)
        query_ids = rng.sample(all_qids, min(args.n_queries, len(all_qids)))

    policy = VllmTokenCompleter(
        base_url=args.base_url,
        model=args.model,
        max_tokens=args.max_tokens,
        temperature=args.temperature,
        top_p=args.top_p,
        timeout=args.timeout,
    )

    logger.info(
        "evaluating_vllm",
        model=args.model,
        base_url=args.base_url,
        n=len(query_ids),
        parallel=args.parallel,
    )
    results = await eval_queries(
        query_ids=query_ids,
        dataset=dataset,
        toolset=toolset,
        search_tool=search_tool,
        text_token_counter=text_token_counter,
        policy=policy,
        max_turns=args.max_turns,
        parallel=args.parallel,
        partial_output=Path(args.partial_output) if args.partial_output else None,
    )
    print_results_table(args.model, results)

    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            # Only plainly JSON-serialisable fields are kept per result.
            args.model: [
                {
                    k: v
                    for k, v in r.items()
                    if isinstance(v, (int, float, str, bool, list))
                }
                for r in results
            ],
            "_summary": summarize_results(results),
        }
        output_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        logger.info("results_saved", path=str(output_path))


if __name__ == "__main__":
    os.environ.setdefault("PYTHONDONTWRITEBYTECODE", "1")
    asyncio.run(main())
a/cosmos-retriever/src/cosmos_retriever/inference/openai_chat.py b/cosmos-retriever/src/cosmos_retriever/inference/openai_chat.py new file mode 100644 index 0000000..011037f --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/inference/openai_chat.py @@ -0,0 +1,367 @@ +"""Generic OpenAI-compatible **chat-completions** retrieval agent. + +This is the inference backend for *any* chat model (an Azure AI Foundry +deployment, OpenAI, a local OpenAI-compatible server, ...) — as opposed to +:mod:`cosmos_retriever.inference.vllm`, which only works with the fine-tuned +``pat-jj/harness-1`` checkpoint driven by raw Harmony token-IDs. + +Instead of the Harmony channel/token protocol, this drives the same Cosmos +:class:`~cosmos_retriever.tools.ToolSet` through **standard function/tool +calling**: + +1. Render the retrieval system prompt (the same one the trained model used). +2. Advertise the four real tools (``search_corpus``, ``grep_corpus``, + ``read_document``, ``prune_chunks``) as OpenAI ``tools`` function schemas. +3. Loop: call ``/v1/chat/completions``; if the model returns ``tool_calls``, + execute each against the toolset and feed the results back as ``role:tool`` + messages; otherwise treat the assistant text as the final answer. +4. Parse the final ```` blocks the prompt asks for and hydrate + each with the chunk text we saw during searches. + +The loop is fully synchronous (the Cosmos SDK + OpenAI SDK calls are sync), so +the FastAPI server runs it on a worker thread just like the Harmony path. +""" + +from __future__ import annotations + +import json +import re +from dataclasses import dataclass, field + +import json_repair +import openai +import structlog + +from cosmos_retriever.prompts import get_retrieval_subagent_prompt +from cosmos_retriever.tools import ToolSet +from cosmos_retriever.utils import ProviderFormat + +logger = structlog.get_logger("cosmos_retriever.inference.openai_chat") + +# Only these tools are exposed to a generic chat model. 
# Only these tools are exposed to a generic chat model. The ``ultra`` stub
# tools (fan_out_search / curate / review_docs / end_search) are dispatched by
# the Harmony env and would raise if a chat model tried to call them.
_CHAT_TOOL_NAMES = ("search_corpus", "grep_corpus", "read_document", "prune_chunks")

# Matches the per-result header the search/grep tools emit:
#   "\n# DOCUMENT ID: <id> (<n> tokens) \n"
# The named group ``id`` is required by ``_collect_doc_text``, which reads
# ``match.group("id")``; without the ``<id>`` name the pattern does not compile.
_DOC_RESULT_RE = re.compile(r"#\s*DOCUMENT ID:\s*(?P<id>\S+)(?:\s*\(\d+\s*tokens\))?")

# Matches the final answer blocks the system prompt asks the model to produce:
# an opening tag carrying an ``id`` attribute (quotes optional), an optional
# justification element, and the closing tag. The groups ``id`` and
# ``justification`` are the ones ``_extract_documents`` reads.
# NOTE(review): the literal tag names ``document`` / ``justification`` were
# missing from the text under review and are restored here by inference from
# the group names — confirm them against the prompt's output-format section.
_FINAL_DOC_RE = re.compile(
    r"<document\s+id\s*=\s*[\"']?(?P<id>[^\"'\s>]+)[\"']?\s*>\s*"
    r"(?:<justification>\s*(?P<justification>.*?)\s*</justification>\s*)?"
    r"</document>",
    re.IGNORECASE | re.DOTALL,
)
def run_chat_search(
    *,
    toolset: ToolSet,
    client: openai.OpenAI,
    model: str,
    query: str,
    max_documents: int = 20,
    max_turns: int = 20,
    temperature: float = 0.7,
    max_tokens: int = 4096,
) -> ChatSearchResult:
    """Run the multi-turn retrieval agent against a generic chat model.

    Each turn calls ``client.chat.completions.create`` with the running
    transcript. If the reply carries tool calls, every call is executed against
    ``toolset`` and its output is appended as a ``role: tool`` message; a reply
    without tool calls ends the loop and its text becomes the final answer.
    Tool failures are caught and reported back to the model as error text.

    Args:
        toolset: The Cosmos-backed :class:`ToolSet` (built **without** the
            ultra stub tools).
        client: An OpenAI-compatible client (``openai.OpenAI`` /
            ``openai.AzureOpenAI``).
        model: Model or Foundry deployment name passed as ``model=``.
        query: Natural-language information need.
        max_documents: Cap on curated documents to return / ask for.
        max_turns: Hard cap on chat round-trips.
        temperature: Sampling temperature for every call.
        max_tokens: Completion-token cap for every call.

    Returns:
        A :class:`ChatSearchResult` with ranked documents and run metadata.
    """

    tool_specs = [
        tool.get_format(ProviderFormat.OPENAI_HARMONY)  # Chat-Completions function shape
        for name, tool in toolset.tools.items()
        if name in _CHAT_TOOL_NAMES
    ]

    # NOTE(review): the tag names in this instruction were missing from the
    # text under review ("ranked blocks (with a )"). They are restored to the
    # names the final-answer parser is expected to look for — confirm against
    # the system prompt's output format.
    messages: list[dict] = [
        {"role": "system", "content": get_retrieval_subagent_prompt(query, num_output_docs=max_documents)},
        {
            "role": "user",
            "content": (
                "Use the available tools to search the corpus, then return ONLY the "
                "ranked <document> blocks (with a <justification>) for the most "
                "relevant documents. Do not answer the question yourself."
            ),
        },
    ]

    doc_text: dict[str, str] = {}  # chunk id -> first chunk text seen in tool output
    tool_types_used: set[str] = set()
    tool_call_count = 0
    final_text = ""
    num_turns = 0

    for _ in range(max_turns):
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            tools=tool_specs,
            tool_choice="auto",
            temperature=temperature,
            max_tokens=max_tokens,
        )
        num_turns += 1
        message = response.choices[0].message
        tool_calls = message.tool_calls or []

        # Echo the assistant turn back into the transcript.
        assistant_entry: dict = {"role": "assistant", "content": message.content or ""}
        if tool_calls:
            assistant_entry["tool_calls"] = [
                {
                    "id": tc.id,
                    "type": "function",
                    "function": {"name": tc.function.name, "arguments": tc.function.arguments},
                }
                for tc in tool_calls
            ]
        messages.append(assistant_entry)

        if not tool_calls:
            # A turn without tool calls is the model's final answer.
            final_text = message.content or ""
            break

        for tc in tool_calls:
            name = tc.function.name
            tool_types_used.add(name)
            tool_call_count += 1
            args = _parse_tool_arguments(tc.function.arguments)
            tool = toolset.get_tool(name)
            if tool is None:
                output = f"Error: unknown tool '{name}'."
            else:
                try:
                    output, _metadata = tool(args)
                    # Remember chunk text so final documents can be hydrated.
                    _collect_doc_text(output, doc_text)
                except Exception as exc:  # noqa: BLE001 — surface tool errors to the model
                    logger.warning("chat_tool_error", tool=name, error=str(exc))
                    output = f"Error executing '{name}': {exc}"
            # Every tool call gets exactly one matching ``role: tool`` reply.
            messages.append({"role": "tool", "tool_call_id": tc.id, "content": output})
    else:
        # Loop exhausted without a final (no-tool-call) turn: fall back to the
        # last assistant text we saw, if any.
        for entry in reversed(messages):
            if entry.get("role") == "assistant" and entry.get("content"):
                final_text = entry["content"]
                break

    documents = _extract_documents(final_text, doc_text, max_documents)

    logger.info(
        "chat_search_complete",
        model=model,
        num_turns=num_turns,
        num_documents=len(documents),
        tool_calls=tool_call_count,
    )

    return ChatSearchResult(
        documents=documents,
        num_turns=num_turns,
        final_text=final_text,
        metadata={
            "backend": "openai_chat",
            "model": model,
            "tool_calls": tool_call_count,
            "tool_types_used": ",".join(sorted(tool_types_used)),
        },
    )
+ """ + + tool_specs = [ + tool.get_format(ProviderFormat.OPENAI) # flat Responses function shape + for name, tool in toolset.tools.items() + if name in _CHAT_TOOL_NAMES + ] + + prompt = ( + get_retrieval_subagent_prompt(query, num_output_docs=max_documents) + + "\n\nUse the available tools to search the corpus, then return ONLY the " + "ranked blocks (each with a ) for the most " + "relevant documents. Do not answer the question yourself." + ) + + common: dict = {"model": model, "tools": tool_specs, "max_output_tokens": max_tokens} + if reasoning_effort: + common["reasoning"] = {"effort": reasoning_effort} + + doc_text: dict[str, str] = {} + tool_types_used: set[str] = set() + tool_call_count = 0 + final_text = "" + + response = client.responses.create(input=prompt, **common) + num_turns = 1 + + while True: + function_calls = [o for o in response.output if getattr(o, "type", None) == "function_call"] + if not function_calls: + final_text = getattr(response, "output_text", "") or "" + break + if num_turns >= max_turns: + final_text = getattr(response, "output_text", "") or "" + break + + outputs: list[dict] = [] + for fc in function_calls: + name = fc.name + tool_types_used.add(name) + tool_call_count += 1 + args = _parse_tool_arguments(fc.arguments) + tool = toolset.get_tool(name) + if tool is None: + output = f"Error: unknown tool '{name}'." 
+ else: + try: + output, _metadata = tool(args) + _collect_doc_text(output, doc_text) + except Exception as exc: # noqa: BLE001 — surface tool errors to the model + logger.warning("responses_tool_error", tool=name, error=str(exc)) + output = f"Error executing '{name}': {exc}" + outputs.append( + {"type": "function_call_output", "call_id": fc.call_id, "output": output} + ) + + response = client.responses.create( + previous_response_id=response.id, input=outputs, **common + ) + num_turns += 1 + + documents = _extract_documents(final_text, doc_text, max_documents) + + logger.info( + "responses_search_complete", + model=model, + num_turns=num_turns, + num_documents=len(documents), + tool_calls=tool_call_count, + ) + + return ChatSearchResult( + documents=documents, + num_turns=num_turns, + final_text=final_text, + metadata={ + "backend": "openai_responses", + "model": model, + "tool_calls": tool_call_count, + "tool_types_used": ",".join(sorted(tool_types_used)), + }, + ) + + +__all__ = ["ChatDocument", "ChatSearchResult", "run_chat_search", "run_responses_search"] diff --git a/cosmos-retriever/src/cosmos_retriever/inference/vllm.py b/cosmos-retriever/src/cosmos_retriever/inference/vllm.py new file mode 100644 index 0000000..b3a7537 --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/inference/vllm.py @@ -0,0 +1,309 @@ +"""vLLM inference adapter for the Harness-1 model. + +Talks to an OpenAI-compatible vLLM ``/v1/completions`` endpoint, exchanging +**raw token-IDs** in/out so the model's Harmony format is preserved end-to-end. +That is the only inference path the trained Harness-1 checkpoint ships with; +JSON Chat-Completions would lose the channel structure. + +The token-stream lifecycle: + +1. :py:meth:`~cosmos_retriever.trajectory.Trajectory.to_openai_harmony_format` + produces an ``openai_harmony.Conversation``. +2. :py:meth:`HarmonyEncoding.render_conversation` turns it into ``list[int]``. +3. 
class VLLMHarmonyInferenceModel(AgentInferenceModel):
    """Inference against vLLM serving the Harness-1 model with Harmony tokens.

    Args:
        base_url: Base URL of the vLLM server (e.g. ``http://127.0.0.1:8000``).
        model_name: ``--served-model-name`` advertised by vLLM
            (defaults to ``"harness-1"``).
        max_completion_tokens: Default sampling budget per call.
        temperature: Sampling temperature.
        top_p: Top-p / nucleus sampling.
        timeout_s: HTTP timeout in seconds.
        strict_mode: When True, JSON tool arguments must parse cleanly with
            :py:func:`json.loads` (with light recovery); when False fall back
            to :pypi:`json-repair`. Train-time uses ``True``; production
            usually wants ``False`` since the model occasionally emits
            slightly-malformed JSON.
        context_window: Hard token cap of the served checkpoint
            (gpt-oss-20b is 32768 without YARN scaling).
    """

    def __init__(
        self,
        base_url: str,
        *,
        model_name: str = "harness-1",
        max_completion_tokens: int = 4096,
        temperature: float = 1.0,
        top_p: float = 0.9,
        timeout_s: float = 900.0,
        strict_mode: bool = False,
        context_window: int = 32768,
    ) -> None:
        """Store sampling settings, load the Harmony encoding, open the HTTP client."""
        # Trailing slashes are dropped so the endpoint path can be appended safely.
        self.base_url = base_url.rstrip("/")
        self.model_name = model_name
        self.max_completion_tokens = max_completion_tokens
        self.temperature = temperature
        self.top_p = top_p
        self.timeout_s = timeout_s
        self.strict_mode = strict_mode
        self.context_window = context_window

        self.enc: HarmonyEncoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
        # Token IDs that end an assistant action; sent to vLLM as stop tokens.
        self.stop_token_ids = list(self.enc.stop_tokens_for_assistant_actions())
        # One client is reused for every request made by this instance.
        # NOTE(review): nothing in this class closes the client — confirm the
        # owner's lifetime makes that acceptable.
        self._client = httpx.Client(
            timeout=timeout_s,
            headers={"Content-Type": "application/json"},
        )

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------
    def __call__(self, context: InferenceContext) -> Action | None:
        """Render the trajectory to Harmony tokens, sample once, decode an Action.

        The completion budget is ``context.max_tokens`` (or the instance
        default). When the prompt leaves less room than that inside
        ``context_window`` (minus 100 tokens of headroom), the budget is
        capped to the remaining room, but never below 256 tokens.
        """
        trajectory = context.trajectory
        toolset = context.toolset

        request_messages = trajectory.to_provider_format(ProviderFormat.OPENAI_HARMONY)
        input_tokens = self.enc.render_conversation(
            request_messages,
            config=RenderConversationConfig(auto_drop_analysis=False),
        )
        prompt_length = len(input_tokens)

        requested_max = context.max_tokens or self.max_completion_tokens
        # 100 tokens are held back as a safety margin.
        available = self.context_window - prompt_length - 100
        if available < requested_max:
            logger.warning(
                "capping_max_tokens",
                prompt_length=prompt_length,
                requested=requested_max,
                available=available,
                context_window=self.context_window,
            )
            # NOTE(review): when ``available`` is below 256 (or negative) this
            # still requests 256 tokens, which can exceed the context window.
            effective_max = max(256, available)
        else:
            effective_max = requested_max

        resp_tokens = self._sample(list(input_tokens), effective_max)
        return self._decode_harmony_action(resp_tokens, toolset)

    # ------------------------------------------------------------------
    # Sampling
    # ------------------------------------------------------------------
    @tenacity.retry(
        stop=tenacity.stop_after_attempt(5),
        wait=tenacity.wait_exponential(multiplier=1, min=4, max=15),
        before_sleep=lambda _: logger.warning("retry_vllm_sample"),
    )
    def _sample(self, input_tokens: list[int], max_tokens: int) -> list[int]:
        """POST the prompt token IDs to ``/v1/completions`` and return completion token IDs.

        The call is wrapped in a tenacity retry (up to 5 attempts, exponential
        wait between 4 and 15 seconds). No ``retry=`` predicate is given, so
        the ``RuntimeError`` raised for HTTP status >= 400 is retried as well.

        Returns:
            The ``token_ids`` of the first choice, or an empty list when the
            response does not contain that field.
        """
        payload = {
            "model": self.model_name,
            "prompt": input_tokens,
            "max_tokens": max_tokens,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "stream": False,
            "stop_token_ids": self.stop_token_ids,
            "return_token_ids": True,
        }
        resp = self._client.post(f"{self.base_url}/v1/completions", json=payload)
        if resp.status_code >= 400:
            raise RuntimeError(f"vLLM error {resp.status_code}: {resp.text}")
        data = resp.json()
        return data["choices"][0].get("token_ids", [])

    # ------------------------------------------------------------------
    # Harmony token decoding → Action
    # ------------------------------------------------------------------
    def _decode_harmony_action(self, tokens: list[int], toolset: ToolSet) -> Action:
        """Parse completion tokens into Harmony messages and build one Action.

        Channel handling:
            * ``analysis`` — reasoning text, unless the message has a
              recipient, in which case it is dispatched as a tool call.
            * ``commentary`` — tool call(s).
            * ``final`` — wrapped as a ``UserTextTool`` call with source ``"agent"``.
            * no channel — kept as reasoning when it has text, else skipped.

        Raises:
            ValueError: On any other channel name, or when a tool message is
                malformed (see ``_handle_tool_message``).
        """
        action_builder = ActionBuilder()
        messages = self.enc.parse_messages_from_completion_tokens(tokens)
        for message in messages:
            channel = message.channel
            if channel == "analysis":
                # Some checkpoints occasionally emit a tool call on the
                # analysis channel; treat it as a commentary call so the
                # downstream tool dispatch still runs.
                if message.recipient:
                    logger.warning("tool_call_on_analysis_channel_redirected")
                    self._handle_tool_message(message, toolset, action_builder)
                else:
                    action_builder.add_reasoning(message.content[0].text)
            elif channel == "commentary":
                self._handle_tool_message(message, toolset, action_builder)
            elif channel == "final":
                action_builder.add_tool_call(
                    tool=UserTextTool(),
                    params={"text": str(message.content[0].text)},
                    source="agent",
                )
            elif channel is None:
                if message.content and getattr(message.content[0], "text", None):
                    logger.debug("none_channel_treated_as_reasoning")
                    action_builder.add_reasoning(message.content[0].text)
                else:
                    logger.debug("none_channel_skipped")
            else:
                raise ValueError(f"Unknown channel: {channel}")
        return action_builder.build()

    def _handle_tool_message(
        self,
        message: Message,
        toolset: ToolSet,
        action_builder: ActionBuilder,
    ) -> None:
        """Turn one Harmony tool message into tool call(s) on ``action_builder``.

        A recipient of ``functions.multi_tool_use`` carries a batch: either a
        JSON list of calls or an object with a ``tool_calls`` list, each call
        holding ``tool_name`` and optional ``parameters``. Any other recipient
        is a single call whose message text is the JSON parameter object.
        Every added call gets a unique ``source`` (name plus a random hex id).

        Raises:
            ValueError: If the recipient is missing, a tool name is missing or
                unknown to ``toolset``, the batch payload is neither a list
                nor a dict, or single-call parameters are not a JSON object.
        """
        if message.recipient == "functions.multi_tool_use":
            args = self._parse_json(message.content[0].text)
            if isinstance(args, list):
                tool_calls = args
            elif isinstance(args, dict):
                tool_calls = args.get("tool_calls", [])
            else:
                raise ValueError(f"Invalid multi_tool_use payload: {args!r}")
            for tool_call in tool_calls:
                raw_name = tool_call.get("tool_name")
                if not raw_name:
                    raise ValueError("Tool call missing 'tool_name'")
                parsed_name = self._strip_function_prefix(raw_name)
                tool = toolset.get_tool(parsed_name)
                if tool is None:
                    raise ValueError(f"Tool not found: {parsed_name}")
                source = f"{tool_call['tool_name']}_{uuid.uuid4().hex}"
                action_builder.add_tool_call(
                    tool=tool, params=tool_call.get("parameters", {}), source=source
                )
        else:
            recipient = message.recipient
            if recipient is None:
                raise ValueError("Tool message has no recipient (malformed output)")
            parsed_name = self._strip_function_prefix(recipient)
            tool = toolset.get_tool(parsed_name)
            if tool is None:
                raise ValueError(f"Tool not found: {parsed_name}")
            params = self._parse_json(message.content[0].text)
            if not isinstance(params, dict):
                raise ValueError(f"Tool call params must be a JSON object, got {type(params)}")
            source = f"{recipient}_{uuid.uuid4().hex}"
            action_builder.add_tool_call(tool=tool, params=params, source=source)

    @staticmethod
    def _strip_function_prefix(raw: str) -> str:
        """Return the bare tool name from a Harmony recipient string.

        Removes every ``functions.`` and ``<|constrain|>`` occurrence and
        surrounding whitespace.

        Raises:
            ValueError: If nothing is left after cleaning.
        """
        cleaned = (raw or "").replace("functions.", "").replace("<|constrain|>", "").strip()
        if not cleaned:
            raise ValueError("Tool name empty after parsing")
        return cleaned

    # ------------------------------------------------------------------
    # JSON parsing with progressive recovery
    # ------------------------------------------------------------------
    def _parse_json(self, json_string: str) -> Any:
        """Parse tool-call JSON, leniently or strictly depending on ``strict_mode``.

        Non-strict mode hands the text straight to ``json_repair.loads``.
        Strict mode tries, in order: the raw text, the first balanced JSON
        object/array in it, and that same substring with invalid escapes
        repaired. If all fail, the raw text is parsed once more so the
        original ``json.JSONDecodeError`` propagates.
        """
        if not self.strict_mode:
            return json_repair.loads(json_string)
        try:
            return json.loads(json_string)
        except json.JSONDecodeError:
            pass
        first_obj = self._extract_first_json_object(json_string)
        if first_obj is not None:
            try:
                return json.loads(first_obj)
            except json.JSONDecodeError:
                pass
            try:
                return json.loads(self._repair_json_escapes(first_obj))
            except json.JSONDecodeError:
                pass
        # Re-raise the original error so callers see the underlying problem.
        return json.loads(json_string)

    @staticmethod
    def _extract_first_json_object(s: str) -> str | None:
        """Return the substring for the first balanced top-level JSON object/array.

        Walks the string tracking brace/bracket depth and string quoting so
        that trailing garbage (extra text, duplicate objects, ``[END]``
        markers, etc.) is silently discarded. Returns ``None`` when no
        balanced object is found.
        """

        # Locate the first opening brace/bracket; it decides which pair is counted.
        start = -1
        open_ch = ""
        for i, ch in enumerate(s):
            if ch in ("{", "["):
                start = i
                open_ch = ch
                break
        if start < 0:
            return None

        close_ch = "}" if open_ch == "{" else "]"
        depth = 0
        in_str = False  # inside a double-quoted JSON string
        esc = False  # previous character was a backslash inside a string
        for i in range(start, len(s)):
            ch = s[i]
            if esc:
                # The escaped character is consumed without interpretation.
                esc = False
                continue
            if ch == "\\" and in_str:
                esc = True
                continue
            if ch == '"' and not esc:
                in_str = not in_str
                continue
            if not in_str:
                # Only the delimiter type that opened the value affects depth.
                if ch == open_ch:
                    depth += 1
                elif ch == close_ch:
                    depth -= 1
                    if depth == 0:
                        return s[start : i + 1]
        return None

    @staticmethod
    def _repair_json_escapes(s: str) -> str:
        """Fix invalid backslash escapes / control chars that are illegal in JSON."""

        # Double any backslash that does not start a valid JSON escape sequence.
        s = re.sub(r'\\(?!["\\/bfnrt]|u[0-9a-fA-F]{4})', r"\\\\", s)
        # Drop control characters, keeping tab, line feed and carriage return.
        s = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", "", s)
        return s
+ + Here is the query you need to find documents for: + + + {query} + + + **Available Tools:** + - SearchTool: Hybrid semantic and keyword search + - GrepTool: Text pattern matching + - ReadDocument: Read specific document snippets that look promising but incomplete + - PruneChunksTool: Remove irrelevant chunks to free up context space + + **Your Process:** + - Break down the query into its key concepts and information needs (list each one explicitly) + - For each key concept, develop a specific search strategy that targets that concept + - Consider what types of documents and evidence would be most helpful for answering this query + - Plan several distinct, non-overlapping search strategies that approach the question from different angles + - Then execute your searches using multiple parallel tool calls. + + **Your Thinking:** + After each round of searches, in your thinking: + - Consider the following: + - **What do I know?**: List the key topics, themes, or aspects of the question that your currently retrieved documents address. What specific information do you have? + - **What should I search for next?**: Systematically consider what search approaches, keywords, or document types you haven't yet tried that might yield valuable information. + - **What should I prune?**: If you were to prune chunks, what would you remove and what new searches would you prioritize? Would this likely yield significantly better or more complete information than what you currently have? + - **Do I have enough information?**: Given the question's complexity and requirements, do you have sufficient information to help answer it, or are there critical gaps? 
def get_retrieval_subagent_budget_exhausted_message(
    current_token_usage: int, threshold_budget: int
) -> str:
    """Build the user message shown once the retriever is over its soft token budget.

    Args:
        current_token_usage: Tokens consumed so far; shown in the header.
        threshold_budget: The soft budget the usage is compared against.

    Returns:
        The notice text: a usage header followed by the prune-or-conclude
        instructions.
    """

    header = f"[Token usage: {current_token_usage}/{threshold_budget}] **OVER BUDGET.** \n"
    # The instruction text is fixed; only the header varies per call.
    instructions = [
        "**CRITICAL CONSTRAINT:** You are currently at or near your token budget limit. ",
        "You CANNOT search, grep, or read any additional documents unless you prune chunks and reduce your token usage.\n",
        "You must now make a strategic decision between two options:\n",
        "**Option 1: Prune chunks** By using the PruneChunksTool and continue searching after.\n",
        "Account for the tokens used by each chunk and the relevancy of the chunks to determine which chunks to prune.**\n",
        "\n**Option 2: Conclude your search**\n",
        "Before making your decision, work through your strategic analysis and if concluding your search ensure you have the final correct exhaustive set of documents to answer the question and all its subquestions.",
    ]
    return header + "".join(instructions)
+ + Raises: + ValueError: If max_tokens is specified without a token_counter. + """ + if max_tokens is not None and token_counter is None: + raise ValueError("token_counter is required when max_tokens is specified") + self.token_counter = token_counter + self.max_tokens = max_tokens + + def _truncate_results( + self, results: List[RerankResult], max_tokens: Optional[int] = None + ) -> List[RerankResult]: + """Truncate results to fit within max_tokens total. + + Also populates the tokens field for each result if token_counter is available. + + Args: + results: List of RerankResult objects to truncate. + max_tokens: Optional override for max_tokens. If not provided, + uses the instance's max_tokens setting. + """ + # If we have a token_counter, populate tokens for all results + if self.token_counter is not None: + for result in results: + result.tokens = self.token_counter(result.document) + + effective_max_tokens = max_tokens if max_tokens is not None else self.max_tokens + if self.token_counter is None or effective_max_tokens is None: + return results + + truncated: List[RerankResult] = [] + total_tokens = 0 + for result in results: + doc_tokens = result.tokens # Already calculated above + assert doc_tokens is not None + if total_tokens + doc_tokens > effective_max_tokens: + logger.info( + "truncating_results", + kept=len(truncated), + dropped=len(results) - len(truncated), + total_tokens=total_tokens, + max_tokens=effective_max_tokens, + ) + break + truncated.append(result) + total_tokens += doc_tokens + + return truncated + + @abstractmethod + def _rerank( + self, + query: str, + documents: List[str], + instruction: Optional[str] = None, + ) -> List[RerankResult]: + """ + Rerank documents based on relevance to the query. + + Subclasses must implement this method to perform the actual reranking. + + Args: + query: The search query to rank documents against. + documents: List of document strings to rerank. + instruction: Optional instruction for the reranker. 
class BasetenReranker(Reranker):
    """Reranker implementation using Baseten's classification API on top of Qwen 3 8B.

    Each document is rendered into the Qwen3-Reranker chat template together
    with the query and instruction, then classified. The score of the "yes"
    label is used as the relevance score.
    """

    # Qwen3-Reranker chat template. The system turn asks for a yes/no verdict;
    # the assistant turn is pre-filled with an empty <think> block so the next
    # token is the verdict itself.
    PREFIX = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
    SUFFIX = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
    DEFAULT_INSTRUCTION = (
        "Given a web search query, retrieve relevant passages that answer the query"
    )

    def __init__(
        self,
        client: Optional[PerformanceClient] = None,
        token_counter: Optional[Callable[[str], int]] = None,
        max_tokens: Optional[int] = None,
        batch_size: int = 16,
        max_concurrent_requests: int = 256,
        timeout_s: int = 360,
    ):
        """
        Initialize the Baseten reranker.

        Args:
            client: Optional PerformanceClient. If not provided, uses config.
            token_counter: Optional callable that counts tokens in a string.
            max_tokens: Maximum total tokens for the output.
            batch_size: Batch size for classification requests.
            max_concurrent_requests: Maximum concurrent requests.
            timeout_s: Timeout in seconds.

        Raises:
            ValueError: If max_tokens is specified without a token_counter
                (raised by the base class).
        """
        super().__init__(token_counter=token_counter, max_tokens=max_tokens)
        if client is None:
            config = get_config()
            client = config.get_baseten_client()
        self.client = client
        self.batch_size = batch_size
        self.max_concurrent_requests = max_concurrent_requests
        self.timeout_s = timeout_s

    def _format_input(
        self, instruction: Optional[str], query: str, document: str
    ) -> str:
        """Render one (instruction, query, document) triple into the model prompt.

        Falls back to ``DEFAULT_INSTRUCTION`` when ``instruction`` is ``None``.
        """
        if instruction is None:
            instruction = self.DEFAULT_INSTRUCTION
        # The <Instruct>/<Query>/<Document> field tags are part of the
        # Qwen3-Reranker input format; without them the model is scored on a
        # prompt shape it was not trained on.
        return (
            f"{self.PREFIX}<Instruct>: {instruction}\n"
            f"<Query>: {query}\n"
            f"<Document>: {document}{self.SUFFIX}"
        )

    def _rerank(
        self,
        query: str,
        documents: list[str],
        instruction: Optional[str] = None,
    ) -> list[RerankResult]:
        """Score every document with the classifier and sort by "yes" score.

        Args:
            query: The search query to rank documents against.
            documents: Document strings to rerank.
            instruction: Optional task instruction for the reranker.

        Returns:
            One RerankResult per document, highest score first. A document
            whose classification has no "yes" label gets a score of 0.0.

        Raises:
            ValueError: If the response does not contain exactly one
                classification group per document.
        """
        if not documents:
            return []

        # Format all documents for classification
        inputs = [self._format_input(instruction, query, doc) for doc in documents]

        # Classify all inputs
        response: ClassificationResponse = self.client.classify(
            inputs=inputs,
            truncate=True,
            batch_size=self.batch_size,
            max_concurrent_requests=self.max_concurrent_requests,
            timeout_s=self.timeout_s,
        )

        # strict=True: a short or long response would otherwise silently drop
        # documents or pair scores with the wrong document.
        results = [
            RerankResult(
                document=doc,
                score=next((r.score for r in group if r.label == "yes"), 0.0),
                original_index=idx,
            )
            for idx, (doc, group) in enumerate(
                zip(documents, response.data, strict=True)
            )
        ]

        # Sort by score descending
        results.sort(key=lambda x: x.score, reverse=True)
        return results
+ + Serves Qwen3-Reranker-8B locally via vLLM's /score endpoint (original + Qwen3-reranker sequence-classification conversion). The prompt template and + yes/no scoring match BasetenReranker exactly, so scores match the Baseten + deployment up to numerics — use it when the Baseten reranker deployment is + unavailable. + + Server launch: + vllm serve Qwen/Qwen3-Reranker-8B --port 8011 \ + --hf-overrides '{"architectures": ["Qwen3ForSequenceClassification"], + "classifier_from_token": ["no", "yes"], + "is_original_qwen3_reranker": true}' + Point at it with VLLM_RERANKER_URL (default http://127.0.0.1:8011). + """ + + PREFIX = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n' + SUFFIX = "<|im_end|>\n<|im_start|>assistant\n\n\n\n\n" + DEFAULT_INSTRUCTION = ( + "Given a web search query, retrieve relevant passages that answer the query" + ) + + def __init__( + self, + base_url: Optional[str] = None, + model: str = "Qwen/Qwen3-Reranker-8B", + token_counter: Optional[Callable[[str], int]] = None, + max_tokens: Optional[int] = None, + batch_size: int = 32, + timeout_s: int = 360, + ): + super().__init__(token_counter=token_counter, max_tokens=max_tokens) + import os + + self.base_url = ( + base_url or os.getenv("VLLM_RERANKER_URL", "http://127.0.0.1:8011") + ).rstrip("/") + self.model = model + self.batch_size = batch_size + self.timeout_s = timeout_s + + def _rerank( + self, + query: str, + documents: List[str], + instruction: Optional[str] = None, + ) -> List[RerankResult]: + if not documents: + return [] + if instruction is None: + instruction = self.DEFAULT_INSTRUCTION + + text_1 = f"{self.PREFIX}: {instruction}\n: {query}\n" + scores: List[float] = [] + for start in range(0, len(documents), self.batch_size): + batch = documents[start : start + self.batch_size] + payload = { + "model": self.model, + "text_1": text_1, + 
"text_2": [f": {doc}{self.SUFFIX}" for doc in batch], + "truncate_prompt_tokens": -1, + } + last_error: Optional[Exception] = None + for attempt in range(3): + try: + response = requests.post( + f"{self.base_url}/score", + json=payload, + timeout=self.timeout_s, + ) + response.raise_for_status() + data = response.json()["data"] + scores.extend(float(item["score"]) for item in data) + last_error = None + break + except requests.exceptions.RequestException as exc: + last_error = exc + logger.warning( + "vllm_rerank_retry", attempt=attempt + 1, error=str(exc) + ) + time.sleep(2**attempt) + if last_error is not None: + logger.error("vllm_rerank_failed", error=str(last_error)) + raise last_error + + results = [ + RerankResult(document=doc, score=score, original_index=idx) + for idx, (doc, score) in enumerate(zip(documents, scores)) + ] + results.sort(key=lambda x: x.score, reverse=True) + return results + + +class ContextualReranker(Reranker): + """Reranker implementation using Contextual AI's rerank API.""" + + API_URL = "https://api.contextual.ai/v1/rerank" + DEFAULT_MODEL = "ctxl-rerank-v2-instruct-multilingual" + DEFAULT_INSTRUCTION = "Prioritize results that most closely align with the criteria outlined in the query" + + def __init__( + self, + api_key: Optional[str] = None, + model: Optional[str] = None, + token_counter: Optional[Callable[[str], int]] = None, + max_tokens: Optional[int] = None, + top_n: Optional[int] = None, + timeout_s: int = 60, + ): + """ + Initialize the Contextual AI reranker. + + Args: + api_key: Optional API key. If not provided, uses config. + model: Model to use for reranking. Defaults to ctxl-rerank-en-v1-instruct. + token_counter: Optional callable that counts tokens in a string. + max_tokens: Maximum total tokens for the output. + top_n: Optional number of top results to return from the API. + timeout_s: Timeout in seconds for API requests. 
+ """ + super().__init__(token_counter=token_counter, max_tokens=max_tokens) + if api_key is None: + config = get_config() + api_key = config.contextual_api_key.get_secret_value() + self.api_key = api_key + self.model = model or self.DEFAULT_MODEL + self.top_n = top_n + self.timeout_s = timeout_s + + def _rerank( + self, + query: str, + documents: list[str], + instruction: Optional[str] = None, + ) -> list[RerankResult]: + if not documents: + return [] + + payload: dict[str, str | list[str] | int] = { + "query": query, + "documents": documents, + "model": self.model, + } + + if self.top_n is not None: + payload["top_n"] = self.top_n + + if instruction is not None: + payload["instruction"] = instruction + else: + payload["instruction"] = self.DEFAULT_INSTRUCTION + + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + + try: + response = requests.post( + self.API_URL, + json=payload, + headers=headers, + timeout=self.timeout_s, + ) + response.raise_for_status() + data = response.json() + except requests.exceptions.RequestException as e: + logger.error("contextual_rerank_failed", error=str(e)) + raise + + # Parse response and build results + results = [] + for item in data.get("results", []): + idx = item["index"] + score = item["relevance_score"] + results.append( + RerankResult( + document=documents[idx], + score=score, + original_index=idx, + ) + ) + + # Results should already be sorted by relevance, but ensure descending order + results.sort(key=lambda x: x.score, reverse=True) + return results + + +if __name__ == "__main__": + import argparse + import tiktoken + + parser = argparse.ArgumentParser(description="Run reranker example") + parser.add_argument( + "--reranker", + choices=["baseten", "contextual"], + default="baseten", + help="Reranker to use (default: baseten)", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=30, + help="Maximum tokens for output (default: 30)", + ) + args = 
parser.parse_args() + + logger.info( + "Running reranker example", reranker=args.reranker, max_tokens=args.max_tokens + ) + + # Simple token counter just to demonstrate the concept, not accurate token for all models of course + enc = tiktoken.get_encoding("o200k_harmony") + token_counter = lambda text: len(enc.encode(text)) + + # Create reranker based on argument + reranker: Reranker + if args.reranker == "contextual": + reranker = ContextualReranker( + token_counter=token_counter, + max_tokens=args.max_tokens, + ) + elif args.reranker == "baseten": + reranker = BasetenReranker( + token_counter=token_counter, + max_tokens=args.max_tokens, + ) + else: + raise ValueError(f"Invalid reranker: {args.reranker}") + + query = "What is the capital of China?" + documents = [ + "The capital of France is Paris.", + "The capital of China is Beijing.", + "The capital of Poland is Warsaw.", + "The capital of Germany is Berlin.", + "Chocolate is a delicious treat.", + "Pizza is a food", + "China has a population of 1.4 billion.", + "Germany has a population of 83 million.", + "Poland has a population of 38 million.", + "Warsaw is the capital of Poland.", + "Berlin is the capital of Germany.", + "Paris is the capital of France.", + "Beijing is the capital of China.", + "Warsaw is the capital of Poland.", + "Berlin is the capital of Germany.", + "Shanghai is not the capital of China.", + "Japan is closer to China than to the United States.", + "The capital of China has been Beijing for a long time.", + ] + results = reranker(query, documents) + logger.info("rerank_complete", num_results=len(results), max_tokens=args.max_tokens) + for result in results: + logger.info("result", score=result.score, document=result.document) + +# Back-compat alias kept for callers of the original cosmos-retriever API. 
+VLLMReranker = VLLMQwen3Reranker diff --git a/cosmos-retriever/src/cosmos_retriever/retriever.py b/cosmos-retriever/src/cosmos_retriever/retriever.py new file mode 100644 index 0000000..0308df2 --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/retriever.py @@ -0,0 +1,497 @@ +"""High-level facade: ``CosmosRetriever().search(query)`` returns docs. + +Wraps the agent state machine, tool-set wiring, and Harmony token counter into +a single synchronous object that the CLI (and any in-process caller) drives +directly. The agent loop is synchronous end-to-end (Cosmos SDK + httpx + +tiktoken are all sync); there is no async surface here on purpose so that +subprocess-based callers (the MCP Toolkit ``agentic_search`` tool) get clean, +predictable behaviour. +""" + +from __future__ import annotations + +import asyncio +import re +import time +from dataclasses import dataclass, field + +import structlog +import tiktoken +from openai_harmony import ( + HarmonyEncoding, + HarmonyEncodingName, + RenderConversationConfig, + load_harmony_encoding, +) + +from cosmos_retriever.config import CorpusConfig, RetrieverSettings, get_settings +from cosmos_retriever.rerank import BasetenReranker, Reranker, VLLMReranker +from cosmos_retriever.tools import ( + SearchCorpusTool, + SearchCorpusToolCallMetadata, + ToolSet, +) +from cosmos_retriever.trajectory import ( + Action, + Observation, + Trajectory, +) + +logger = structlog.get_logger("cosmos_retriever.retriever") + + +_DOCUMENT_BLOCK_PATTERN = re.compile( + r"[^\"'\s>]+)[\"']?\s*>\s*" + r"(?:\s*(?P.*?)\s*\s*)?" 
+ r"", + re.IGNORECASE | re.DOTALL, +) + + +@dataclass +class RetrievedDocument: + """A single curated document returned by the agent.""" + + id: str + text: str = "" # populated by `_hydrate_document_text` when available + justification: str | None = None + rank: int | None = None + + +@dataclass +class RetrievalResult: + """Output of :py:meth:`CosmosRetriever.search`.""" + + query: str + documents: list[RetrievedDocument] + num_turns: int + final_text: str = "" + pool_doc_ids: list[str] = field(default_factory=list) + elapsed_s: float = 0.0 + metadata: dict[str, str | int | float] = field(default_factory=dict) + + +class CosmosRetriever: + """Drive the trained Harness-1 agent against a Cosmos DB corpus. + + Construct once per process; reuse for many ``search`` calls. The internal + Cosmos and OpenAI clients are kept open for the lifetime of the instance. + + Args: + settings: Loaded :class:`RetrieverSettings`. Falls back to + :func:`get_settings` (i.e. env vars + ``.env``). + corpus_name: Optional container name to look up in the + :py:attr:`RetrieverSettings.corpus_registry`. When omitted, + the default-corpus env vars (``ACCOUNT_URI`` / ``COSMOS_DATABASE`` / + ``COSMOS_CORPUS_CONTAINER`` / ``AZURE_OPENAI_*``) are used. + reranker: Optional pre-built reranker. When omitted, one is built + from settings (Baseten if configured, then local vLLM, else None). 
+ """ + + def __init__( + self, + settings: RetrieverSettings | None = None, + *, + corpus_name: str | None = None, + reranker: Reranker | None = None, + ) -> None: + self.settings = settings or get_settings() + self.corpus: CorpusConfig = self.settings.resolve_corpus(corpus_name) + + self._enc: HarmonyEncoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) + self._tiktoken = tiktoken.get_encoding("o200k_harmony") + self._reranker = reranker or self._build_default_reranker() + + cosmos_db = self.settings.build_cosmos_database(self.corpus) + openai_client = self.settings.build_openai_client(self.corpus) + self._use_chat = self.settings.use_chat_backend + self._use_responses = self.settings.use_responses_backend + self._use_generic = self.settings.use_generic_llm_backend + + self.toolset: ToolSet = ToolSet.build( + cosmos_database=cosmos_db, + cosmos_container_name=self.corpus.container, + openai_client=openai_client, + openai_embedding_model=self.corpus.embed_model, + embed_query_instruction=self.corpus.embed_query_instruction, + reranker=self._reranker, + token_counter=self._text_token_counter, + search_display_limit=self.settings.cosmos_retriever_search_display_limit, + # The ultra stub tools are dispatched by the Harmony env; a generic + # chat/responses model must not see (and try to call) them. + include_ultra_tools=not self._use_generic, + ) + + # Inference backend: either the fine-tuned Harness-1 over Harmony tokens, + # or any OpenAI-compatible chat/responses model via function-calling. 
+ self.inference_model = None + self._chat_client = None + self._chat_model: str | None = None + if self._use_generic: + self._chat_client = self.settings.build_chat_client() + self._chat_model = self.settings.chat_model + else: + from cosmos_retriever.inference.vllm import ( # noqa: PLC0415 — heavy, harmony-only + VLLMHarmonyInferenceModel, + ) + + self.inference_model = VLLMHarmonyInferenceModel( + base_url=self.settings.vllm_base_url, + model_name=self.settings.vllm_model_name, + timeout_s=self.settings.vllm_timeout_s, + ) + + logger.info( + "cosmos_retriever_initialized", + inference_backend=self.settings.inference_backend, + vllm_base_url=None if self._use_generic else self.settings.vllm_base_url, + vllm_model_name=None if self._use_generic else self.settings.vllm_model_name, + chat_base_url=self.settings.chat_base_url if self._use_generic else None, + chat_model=self._chat_model if self._use_generic else None, + cosmos_account=self.corpus.account_uri, + cosmos_db=self.corpus.database, + cosmos_container=self.corpus.container, + embed_base_url=self.corpus.embed_base_url, + embed_model=self.corpus.embed_model, + embed_query_instruction=self.corpus.embed_query_instruction, + reranker=type(self._reranker).__name__ if self._reranker is not None else None, + ) + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + def search( + self, + query: str, + *, + max_documents: int = 20, + max_turns: int | None = None, + threshold_budget: int | None = None, + token_budget: int | None = None, + ) -> RetrievalResult: + """Run the multi-turn search agent and return its curated documents. + + Args: + query: Natural-language question. + max_documents: Cap on the number of documents to ask the model + to surface (rendered into the system prompt). + max_turns: Override the default ``COSMOS_RETRIEVER_MAX_TURNS``. 
+ threshold_budget / token_budget: Override the default token + budgets for this single call. + + Returns: + A :class:`RetrievalResult` containing the ranked documents, the + number of turns the agent took, and the model's final-channel + text (used to extract document IDs/justifications). + """ + + if not query or not query.strip(): + raise ValueError("query must be a non-empty string") + + return self._search_sync( + query, + max_documents, + max_turns or self.settings.cosmos_retriever_max_turns, + threshold_budget or self.settings.cosmos_retriever_threshold_budget, + token_budget or self.settings.cosmos_retriever_token_budget, + ) + + def _search_sync( + self, + query: str, + max_documents: int, + max_turns: int, + threshold_budget: int, + token_budget: int, + ) -> RetrievalResult: + """Drive the upstream ``SlidingWindowSearchEnv`` for one query. + + Mirrors ``inference/evaluate_harness1_vllm.py:run_single_episode`` from + the upstream harness-1 repo so that recall on BrowseComp+ matches the + published Harness-1 numbers (the env owns the ``WorkingMemory`` / + ``curate`` / ``fan_out_search`` machinery the trained model relies on). 
+ """ + + if self._use_chat: + return self._search_chat(query, max_documents) + if self._use_responses: + return self._search_responses(query, max_documents) + + from cosmos_retriever.env_rl import ( # noqa: PLC0415 — heavy, harmony-only + SlidingWindowSearchEnv, + ) + from cosmos_retriever.inference.evaluate_harness1_vllm import ( # noqa: PLC0415 + VllmTokenCompleter, + run_single_episode, + ) + + search_tool = self.toolset.get_tool("search_corpus") + if not isinstance(search_tool, SearchCorpusTool): + raise RuntimeError("toolset is missing a search_corpus tool") + + env = SlidingWindowSearchEnv( + toolset=self.toolset, + search_tool=search_tool, + query_id="adhoc", + query_text=query, + dataset_name="web", # inference mode: only used to key the rerank instruction + text_token_counter=self._text_token_counter, + max_turns=max_turns, + ) + + policy = VllmTokenCompleter( + base_url=self.settings.vllm_base_url, + model=self.settings.vllm_model_name, + max_tokens=2048, + temperature=1.0, + top_p=0.9, + timeout=int(self.settings.vllm_timeout_s), + ) + + start = time.perf_counter() + episode = asyncio.run(run_single_episode(env=env, policy=policy)) + elapsed = time.perf_counter() - start + + documents: list[RetrievedDocument] = [] + for rank, chunk_id in enumerate(env.wm.curated_ids[:max_documents]): + entry = env.wm.doc_store.get(chunk_id) or env.wm.doc_store.get( + chunk_id.split("_")[0] + ) + text = (entry or {}).get("full_text") or (entry or {}).get("snippet") or "" + documents.append( + RetrievedDocument( + id=chunk_id, + text=text, + justification=env.wm.curated_notes.get(chunk_id) if hasattr(env.wm, "curated_notes") else None, + rank=rank, + ) + ) + + result = RetrievalResult( + query=query, + documents=documents, + num_turns=int(episode.get("turns", 0)), + final_text="", + elapsed_s=round(elapsed, 3), + pool_doc_ids=sorted({cid.split("__")[0] for cid in env.wm.pool_ids}), + metadata={ + "n_pool": len(env.wm.pool_ids), + "n_curated": len(env.wm.curated_ids), + 
"total_curate_calls": episode.get("total_curate_calls", 0), + "tool_types_used": ",".join(sorted(set(episode.get("tool_types_used", [])))), + }, + ) + logger.info( + "search_complete", + query=query[:200], + num_documents=len(result.documents), + num_turns=result.num_turns, + n_pool=len(env.wm.pool_ids), + n_curated=len(env.wm.curated_ids), + elapsed_s=result.elapsed_s, + ) + return result + + def _search_chat(self, query: str, max_documents: int) -> RetrievalResult: + """Drive a generic OpenAI-compatible chat model via function-calling. + + Used when ``INFERENCE_BACKEND=openai_chat``. The chat agent talks to + the same Cosmos :class:`ToolSet`, so retrieval quality depends on the + chosen model's tool-use ability rather than the fine-tuned Harness-1 + checkpoint. + """ + + from cosmos_retriever.inference.openai_chat import ( # noqa: PLC0415 + run_chat_search, + ) + + if self._chat_client is None or self._chat_model is None: + raise RuntimeError("chat backend selected but chat client/model not initialised") + + start = time.perf_counter() + chat_result = run_chat_search( + toolset=self.toolset, + client=self._chat_client, + model=self._chat_model, + query=query, + max_documents=max_documents, + max_turns=self.settings.chat_max_turns, + temperature=self.settings.chat_temperature, + max_tokens=self.settings.chat_max_tokens, + ) + elapsed = time.perf_counter() - start + + documents = [ + RetrievedDocument(id=d.id, text=d.text, justification=d.justification, rank=d.rank) + for d in chat_result.documents + ] + result = RetrievalResult( + query=query, + documents=documents, + num_turns=chat_result.num_turns, + final_text=chat_result.final_text, + elapsed_s=round(elapsed, 3), + metadata=chat_result.metadata, + ) + logger.info( + "search_complete", + query=query[:200], + backend="openai_chat", + num_documents=len(result.documents), + num_turns=result.num_turns, + elapsed_s=result.elapsed_s, + ) + return result + + def _search_responses(self, query: str, max_documents: int) 
-> RetrievalResult: + """Drive a generic OpenAI **/responses** model (e.g. gpt-5.x reasoning). + + Used when ``INFERENCE_BACKEND=openai_responses``. Same Cosmos tools as + the chat backend, but routed through the responses API which reasoning + models require. + """ + + from cosmos_retriever.inference.openai_chat import ( # noqa: PLC0415 + run_responses_search, + ) + + if self._chat_client is None or self._chat_model is None: + raise RuntimeError("responses backend selected but chat client/model not initialised") + + start = time.perf_counter() + chat_result = run_responses_search( + toolset=self.toolset, + client=self._chat_client, + model=self._chat_model, + query=query, + max_documents=max_documents, + max_turns=self.settings.chat_max_turns, + max_tokens=self.settings.chat_max_tokens, + reasoning_effort=self.settings.chat_reasoning_effort, + ) + elapsed = time.perf_counter() - start + + documents = [ + RetrievedDocument(id=d.id, text=d.text, justification=d.justification, rank=d.rank) + for d in chat_result.documents + ] + result = RetrievalResult( + query=query, + documents=documents, + num_turns=chat_result.num_turns, + final_text=chat_result.final_text, + elapsed_s=round(elapsed, 3), + metadata=chat_result.metadata, + ) + logger.info( + "search_complete", + query=query[:200], + backend="openai_responses", + num_documents=len(result.documents), + num_turns=result.num_turns, + elapsed_s=result.elapsed_s, + ) + return result + + def _build_default_reranker(self) -> Reranker | None: + if self.settings.baseten_api_key and self.settings.baseten_model_url: + return BasetenReranker( + client=self.settings.get_baseten_client(), + token_counter=self._text_token_counter, + ) + if self.settings.vllm_reranker_url: + return VLLMReranker( + base_url=self.settings.vllm_reranker_url, + token_counter=self._text_token_counter, + ) + return None + + def _text_token_counter(self, text: str) -> int: + return len(self._tiktoken.encode(text)) + + def _trajectory_token_counter(self, 
trajectory: Trajectory) -> int: + return len( + self._enc.render_conversation( + trajectory.to_openai_harmony_format(), + config=RenderConversationConfig(auto_drop_analysis=False), + ) + ) + + @staticmethod + def _extract_documents( + trajectory: Trajectory, + ) -> tuple[list[RetrievedDocument], str]: + """Pull ranked document IDs + justifications out of the model's final turn.""" + + # Find the last Action with a UserText final turn. + final_text = "" + for entry in reversed(trajectory.actions_and_observations): + if not isinstance(entry, Action): + continue + for tool, params, _source in zip( + entry.tools, entry.params, entry.sources, strict=True + ): + if tool.tool_schema.name == "user_text": + final_text = params.get("text", "") or "" + break + if final_text: + break + + documents: list[RetrievedDocument] = [] + seen: set[str] = set() + for rank, match in enumerate(_DOCUMENT_BLOCK_PATTERN.finditer(final_text)): + doc_id = match.group("id") + if doc_id in seen: + continue + seen.add(doc_id) + justification = match.group("justification") + documents.append( + RetrievedDocument( + id=doc_id, + justification=justification.strip() if justification else None, + rank=rank, + ) + ) + return documents, final_text + + @staticmethod + def _hydrate_document_text( + trajectory: Trajectory, + documents: list[RetrievedDocument], + ) -> None: + """Best-effort: copy the first chunk-text we saw for each document into the result. + + We walk every Search/Grep observation's metadata to find the + ``returned_chunk_ids`` and pull the first matching ``# DOCUMENT ID: + ...`` block out of the observation text. This avoids a second Cosmos + round-trip and keeps the response self-contained. 
+ """ + + if not documents: + return + + wanted: dict[str, RetrievedDocument] = {d.id: d for d in documents} + from cosmos_retriever.tasks import DOC_ID_PATTERN # noqa: PLC0415 — internal helper + + for entry in trajectory.actions_and_observations: + if not isinstance(entry, Observation): + continue + for obs_text, metadata in zip(entry.observations, entry.tool_metadata, strict=True): + if not isinstance(metadata, SearchCorpusToolCallMetadata): + continue + if not any(cid in wanted for cid in metadata.returned_chunk_ids): + continue + # Walk the formatted text to extract per-doc text. + matches = list(DOC_ID_PATTERN.finditer(obs_text)) + for idx, match in enumerate(matches): + chunk_id = match.group("chunk_id") + target_id = chunk_id.split("_")[0] if "_" in chunk_id else chunk_id + target = wanted.get(target_id) or wanted.get(chunk_id) + if target is None or target.text: + continue + start = match.end() + end = matches[idx + 1].start() if idx + 1 < len(matches) else len(obs_text) + target.text = obs_text[start:end].strip() + + +__all__ = ["CosmosRetriever", "RetrievalResult", "RetrievedDocument"] diff --git a/cosmos-retriever/src/cosmos_retriever/server.py b/cosmos-retriever/src/cosmos_retriever/server.py new file mode 100644 index 0000000..84ab91d --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/server.py @@ -0,0 +1,178 @@ +"""FastAPI HTTP service wrapping :class:`CosmosRetriever`. + +This is the network entry point the Azure Cosmos DB MCP Toolkit's +``agentic_search`` tool calls into. Instead of spawning ``python -m +cosmos_retriever search`` as a subprocess per request, the .NET server makes +an HTTP ``POST /search`` to a long-lived instance of this app. That keeps the +heavy clients (Cosmos SDK, OpenAI embeddings, Harmony encoder, tiktoken) +warm across calls and removes interpreter-cold-start from every request. + +Routes +------ +``GET /health`` → ``{"status": "ok"}`` (liveness; never touches Cosmos/vLLM). 
+``POST /search`` → run the multi-turn agent and return the JSON result. + +The ``/search`` response schema is: + + { + "query": str, + "num_turns": int, + "elapsed_s": float, + "documents": [ + {"id": str, "text": str, "justification": str | null, "rank": int} + ], + ... + } + +Concurrency +----------- +`CosmosRetriever.search` is fully synchronous and internally calls +``asyncio.run(...)``, which cannot run inside this app's event loop. Each +request therefore runs the search on a worker thread via +:func:`anyio.to_thread.run_sync`. Because a single retriever instance holds +sync Cosmos/httpx clients and per-call agent state that are **not** safe to +drive from two threads at once, calls that target the same corpus are +serialised with a per-corpus lock; different corpora run concurrently. +""" + +from __future__ import annotations + +import asyncio +from collections import defaultdict +from contextlib import asynccontextmanager +from dataclasses import asdict +from typing import TYPE_CHECKING + +import anyio +import structlog +from fastapi import FastAPI +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field + +from cosmos_retriever.config import RetrieverSettings, get_settings +from cosmos_retriever.retriever import CosmosRetriever + +if TYPE_CHECKING: + from collections.abc import AsyncIterator + +logger = structlog.get_logger("cosmos_retriever.server") + + +class SearchRequest(BaseModel): + """Request body for ``POST /search``""" + + query: str = Field(..., min_length=1, description="Natural-language information need.") + max_documents: int = Field( + default=20, + ge=1, + le=30, + alias="maxDocuments", + description="Cap on the number of curated documents to return.", + ) + database: str | None = Field( + default=None, + description="Override Cosmos database name (else COSMOS_DATABASE env var).", + ) + container: str | None = Field( + default=None, + description="Override Cosmos corpus container name (else 
class _RetrieverPool:
    """Cache of :class:`CosmosRetriever` instances, one per corpus target.

    Entries are keyed by ``(database_override, container)`` so one server
    process can serve several corpora. Every key is paired with an
    :class:`asyncio.Lock`; callers hold it while searching so that requests
    against the same corpus run one at a time (the retriever's sync clients
    and per-call agent state are not safe for parallel use).
    """

    def __init__(self, settings: RetrieverSettings) -> None:
        """Start with an empty cache; retrievers are built on first request."""
        self._settings = settings
        self._retrievers: dict[tuple[str | None, str | None], CosmosRetriever] = {}
        # defaultdict creates the per-corpus lock the first time a key is read.
        self._locks: dict[tuple[str | None, str | None], asyncio.Lock] = defaultdict(asyncio.Lock)
        # Serialises construction so the same key is never built twice.
        self._build_lock = asyncio.Lock()

    async def get(
        self, database: str | None, container: str | None
    ) -> tuple[CosmosRetriever, asyncio.Lock]:
        """Return the cached retriever for the target plus its per-corpus lock.

        A missing retriever is constructed on a worker thread while holding the
        build lock; the cache is re-checked after the lock is acquired because
        another request may have built the same key while this one waited.
        """
        key = (database, container)
        cached = self._retrievers.get(key)
        if cached is not None:
            return cached, self._locks[key]

        async with self._build_lock:
            cached = self._retrievers.get(key)
            if cached is None:

                def _construct() -> CosmosRetriever:
                    return self._build(database, container)

                cached = await anyio.to_thread.run_sync(_construct)
                self._retrievers[key] = cached
        return cached, self._locks[key]

    def _build(self, database: str | None, container: str | None) -> CosmosRetriever:
        """Construct a retriever from a private copy of the shared settings."""
        # Deep-copy first so a per-request database override never leaks into
        # the settings object shared by every other retriever.
        own_settings = self._settings.model_copy(deep=True)
        if database:
            own_settings.cosmos_database = database
        return CosmosRetriever(settings=own_settings, corpus_name=container)


def create_app(settings: RetrieverSettings | None = None) -> FastAPI:
    """Build the FastAPI application.

    Args:
        settings: Optional pre-loaded settings. Defaults to :func:`get_settings`.

    Returns:
        An app exposing ``GET /health`` and ``POST /search``. Failures inside
        ``/search`` are logged and returned as a JSON error envelope with
        status 500 instead of propagating.
    """

    resolved = settings if settings else get_settings()

    @asynccontextmanager
    async def lifespan(app: FastAPI) -> AsyncIterator[None]:
        # The pool lives on app.state so request handlers can reach it.
        app.state.pool = _RetrieverPool(resolved)
        logger.info(
            "cosmos_retriever_server_started",
            host=resolved.host,
            port=resolved.port,
            default_container=resolved.cosmos_corpus_container,
        )
        yield

    app = FastAPI(
        title="Cosmos Retriever",
        version="0.1.0",
        description="HTTP service running the Harness-1 multi-turn search agent.",
        lifespan=lifespan,
    )

    @app.get("/health")
    async def health() -> dict[str, str]:
        return {"status": "ok"}

    @app.post("/search")
    async def search(request: SearchRequest) -> JSONResponse:
        pool: _RetrieverPool = app.state.pool
        try:
            retriever, lock = await pool.get(request.database, request.container)

            def _run_search():
                return retriever.search(request.query, max_documents=request.max_documents)

            # Hold the per-corpus lock for the whole (threaded) search.
            async with lock:
                result = await anyio.to_thread.run_sync(_run_search)
        except Exception as exc:  # noqa: BLE001 — return JSON error envelope to caller
            error_type = type(exc).__name__
            logger.error(
                "search_failed",
                query=request.query[:200],
                error=str(exc),
                error_type=error_type,
            )
            return JSONResponse(
                status_code=500,
                content={"error": str(exc), "type": error_type},
            )
        return JSONResponse(content=asdict(result))

    return app
def get_message_for_question(question: str) -> Dict[str, Any]:
    """Build the initial conversation state for a question."""

    return {"role": "user", "content": question}


# HACK: chunk ids are recovered from tool output by matching the
# "# DOCUMENT ID: <chunk id>" header the corpus tools put before each hit.
DOC_ID_PATTERN = re.compile(r"#\s*DOCUMENT ID:\s*(?P<chunk_id>[^\s]+)", re.IGNORECASE)
# HACK: chunk ids are recovered from the final output by matching an opening
# tag whose id attribute is unquoted, single-quoted or double-quoted.
# NOTE(review): the literal tag prefix was lost from this file (everything from
# the first "<" up to the named group was stripped). "<doc"/"<document" + " id="
# is a reconstruction; confirm it against the format the agent prompt requests.
FINAL_OUTPUT_DOCUMENT_PATTERN = re.compile(
    r"<doc(?:ument)?\s+id\s*=\s*[\"']?(?P<chunk_id>[^\"'\s>]+)[\"']?>"
)
# "<base>_<digits>" (one or more underscores before the numeric chunk suffix).
CHUNK_ID_SUFFIX_PATTERN = re.compile(r"^(?P<base>.+?)_+(?P<suffix>\d+)$")


def extract_chunk_ids_from_tool_output(text: str) -> Set[str]:
    """Return every chunk id that appears in a ``# DOCUMENT ID:`` header of *text*."""
    return {match.group("chunk_id") for match in DOC_ID_PATTERN.finditer(text)}


def extract_chunk_ids_from_final_output(text: str) -> Set[str]:
    """Return every chunk id referenced by a document tag in the final answer *text*."""
    return {
        match.group("chunk_id")
        for match in FINAL_OUTPUT_DOCUMENT_PATTERN.finditer(text)
    }


def chunk_ids_to_doc_ids(chunks_ids: Set[str]) -> Set[str]:
    """Convert a set of chunk ids into a set of unique document ids.

    Rules, applied per id in this order:

    * ids containing ``://`` are treated as URLs and are already document ids;
      only the fragment is dropped (path and query are kept);
    * ids of the form ``<base>_<digits>`` without a ``/`` lose the numeric
      chunk suffix, e.g. ``"12345_7"`` and ``"12345__7"`` both map to ``"12345"``;
    * anything else is returned unchanged.
    """
    document_ids: Set[str] = set()

    for raw_chunk_id in chunks_ids:
        chunk_id = str(raw_chunk_id)

        # URLs are document IDs already. Keep path/query and strip fragments.
        if "://" in chunk_id:
            parsed = urlsplit(chunk_id)
            document_ids.add(
                urlunsplit((parsed.scheme, parsed.netloc, parsed.path, parsed.query, ""))
            )
            continue

        # Strip the numeric chunk suffix, but never from path-like ids that
        # legitimately contain "/".
        suffix_match = CHUNK_ID_SUFFIX_PATTERN.match(chunk_id)
        if suffix_match and "/" not in chunk_id:
            document_ids.add(suffix_match.group("base"))
            continue

        document_ids.add(chunk_id)

    return document_ids
+ # This avoids issues with tool_metadata not being properly deserialized into + # the correct subclass types (GrepCorpusToolCallMetadata, SearchCorpusToolCallMetadata). + if not self.nondeduplicated_traversed_chunk_ids: + all_chunk_ids: List[str] = [] + for action in self.trajectory.actions_and_observations: + if isinstance(action, Observation): + for tool_metadata in action.tool_metadata: + if tool_metadata is not None and ( + isinstance(tool_metadata, GrepCorpusToolCallMetadata) + or isinstance(tool_metadata, SearchCorpusToolCallMetadata) + ): + all_chunk_ids.extend(tool_metadata.returned_chunk_ids) + self.nondeduplicated_traversed_chunk_ids = all_chunk_ids + + # Skip output_document_ids computation if already populated from JSON + if not self.output_chunk_ids: + if not self.trajectory.actions_and_observations: + raise RuntimeError("Trajectory has no actions or observations") + + retrieval_subagent_output = self.trajectory.actions_and_observations[-1] + if ( + len(retrieval_subagent_output.sources) > 0 + and retrieval_subagent_output.sources[0] != "agent" + ): + raise RuntimeError("Early termination") + + if isinstance(retrieval_subagent_output, Action): + if len(retrieval_subagent_output.sources) == 0: + if retrieval_subagent_output.reasoning is None: + raise RuntimeError("Early termination") + else: + logger.warning( + "Early termination, trying to extract doc ids from reasoning as sometimes the model terminates on reasoning without final text" + ) + # Try to extract doc ids from the reasoning + self.output_chunk_ids = list( + extract_chunk_ids_from_final_output( + retrieval_subagent_output.reasoning + ) + ) + self.extracted_from_reasoning_fallback = True + else: + text_fragments = [ + params["text"] + for tool, params in zip( + retrieval_subagent_output.tools, + retrieval_subagent_output.params, + ) + if isinstance(tool, UserTextTool) and isinstance(params.get("text"), str) + ] + if text_fragments: + self.output_chunk_ids = list( + 
def get_all_pruned_chunk_ids(self) -> List[str]:
    """Extract all chunk IDs that were pruned during the trajectory.

    Walks every :class:`Action` in trajectory order and collects the
    ``chunk_ids`` parameter of each prune call. A call counts as a prune when
    its tool is a :class:`PruneChunksTool`, or a :class:`SerializedTool` whose
    schema name is ``"prune_chunks"``. Duplicates are kept.

    The parameters are model-produced, so a missing or null ``chunk_ids`` is
    skipped, and a bare string is treated as a single id instead of being
    spread into its characters by ``list.extend``.
    """
    pruned_chunk_ids: List[str] = []
    for entry in self.trajectory.actions_and_observations:
        if not isinstance(entry, Action):
            continue
        # ``sources`` stays in the zip so iteration length matches the original.
        for tool, params, _source in zip(entry.tools, entry.params, entry.sources):
            is_prune_tool = isinstance(tool, PruneChunksTool) or (
                isinstance(tool, SerializedTool)
                and tool.tool_schema.name == "prune_chunks"
            )
            if not is_prune_tool:
                continue
            chunk_ids = params.get("chunk_ids")
            if chunk_ids is None:
                continue
            if isinstance(chunk_ids, str):
                chunk_ids = [chunk_ids]
            pruned_chunk_ids.extend(chunk_ids)
    return pruned_chunk_ids
def _chunk_matches_expected(
    chunk_id: str,
    expected_chunk_ids: Set[str],
    expected_doc_ids: Set[str],
) -> bool:
    """Return True when *chunk_id* belongs to the expected (ground-truth) set.

    A chunk matches when the id itself is expected, or when its document id is.
    The document id is derived two ways: the legacy "prefix before the first
    underscore" rule, and :func:`chunk_ids_to_doc_ids`, which is how
    *expected_doc_ids* is built. The second rule is what lets document ids that
    contain underscores, and URL ids, match at all.
    """
    if chunk_id in expected_chunk_ids:
        return True
    legacy_doc_id = chunk_id.split("_")[0]
    if legacy_doc_id in expected_doc_ids:
        return True
    return not chunk_ids_to_doc_ids({chunk_id}).isdisjoint(expected_doc_ids)


class SearchTaskEvaluationOutput(BaseModel):
    """Per-query evaluation metrics produced when running the retrieval agent."""

    query_id: str
    recall: Optional[float] = None
    precision: Optional[float] = None
    f1: Optional[float] = None
    trajectory_recall: Optional[float] = None
    final_answer_recall: Optional[float] = None
    retrieved_document_ids: List[str] = Field(default_factory=list)
    num_turns: Optional[int] = None
    prune_accuracy: Optional[float] = None
    # Reranker metrics - only populated when a reranker was used
    rerank_recall: Optional[float] = (
        None  # Fraction of relevant pre-rerank chunks kept after reranking
    )
    rerank_dropped_relevant_count: Optional[int] = (
        None  # Number of relevant chunks dropped by reranker
    )
    error: Optional[str] = None

    def succeeded(self) -> bool:
        """Return True when no error was recorded for this query."""
        return self.error is None

    @classmethod
    def from_search_task_output(
        cls,
        output: SearchTaskOutput,
        dataset: "SearchDataset",
    ) -> "SearchTaskEvaluationOutput":
        """Create an evaluation output from a SearchTaskOutput and dataset.

        Calculates all evaluation metrics (recall, precision, f1, trajectory_recall,
        final_answer_recall, prune_accuracy, reranker metrics) based on the
        trajectory and ground truth from the dataset. ``error`` is left unset.
        """
        query_id = output.query_id
        # Sorted so the dataset evaluators always see a deterministic order.
        retrieved_chunk_ids = sorted(output.get_unique_output_chunk_ids())
        trajectory_chunk_ids = sorted(output.get_unique_traversed_chunk_ids())

        recall = dataset.evaluate_results_recall(query_id, retrieved_chunk_ids)
        precision = dataset.evaluate_results_precision(query_id, retrieved_chunk_ids)
        f1 = dataset.evaluate_results_f1_score(query_id, retrieved_chunk_ids)
        trajectory_recall = dataset.evaluate_results_recall(
            query_id, trajectory_chunk_ids
        )
        final_answer_recall = dataset.evaluate_results_final_answer_recall(
            query_id, retrieved_chunk_ids
        )
        num_turns = output.trajectory.num_turns

        prune_accuracy = cls._calculate_prune_accuracy(output, dataset)
        rerank_recall, rerank_dropped_relevant_count = cls._calculate_rerank_metrics(
            output, dataset
        )

        return cls(
            query_id=query_id,
            recall=recall,
            precision=precision,
            f1=f1,
            trajectory_recall=trajectory_recall,
            final_answer_recall=final_answer_recall,
            retrieved_document_ids=sorted(output.get_unique_output_document_ids()),
            num_turns=num_turns,
            prune_accuracy=prune_accuracy,
            rerank_recall=rerank_recall,
            rerank_dropped_relevant_count=rerank_dropped_relevant_count,
        )

    @staticmethod
    def _calculate_prune_accuracy(
        output: SearchTaskOutput,
        dataset: "SearchDataset",
    ) -> Optional[float]:
        """Calculate prune accuracy for a search task output.

        Prune accuracy is the fraction of pruned chunk ids that were correct to
        prune, where a bad prune is one that removes an expected (ground truth)
        chunk or a chunk of an expected document. Every pruned id counts,
        including repeats.

        Returns None if no prune calls were made.
        """
        pruned_chunk_ids = output.get_all_pruned_chunk_ids()
        if not pruned_chunk_ids:
            return None

        # The dataset may list chunk ids or doc ids; keep both views.
        expected_chunk_ids = set(dataset.get_expected_document_ids(output.query_id))
        expected_doc_ids = chunk_ids_to_doc_ids(expected_chunk_ids)

        bad_prunes = sum(
            1
            for chunk_id in pruned_chunk_ids
            if _chunk_matches_expected(chunk_id, expected_chunk_ids, expected_doc_ids)
        )

        total_prunes = len(pruned_chunk_ids)
        return (total_prunes - bad_prunes) / total_prunes

    @staticmethod
    def _calculate_rerank_metrics(
        output: SearchTaskOutput,
        dataset: "SearchDataset",
    ) -> tuple[Optional[float], Optional[int]]:
        """Calculate reranker metrics for a search task output.

        Computes how well the reranker preserved relevant documents:
        - rerank_recall: Fraction of relevant pre-rerank chunks that were kept after reranking
        - rerank_dropped_relevant_count: Number of relevant chunks that were dropped

        Returns (None, None) if no reranking was performed (no pre_rerank_chunk_ids),
        and (None, 0) if none of the pre-rerank chunks were relevant.
        """
        pre_rerank_ids = output.get_unique_pre_rerank_chunk_ids()
        if not pre_rerank_ids:
            return None, None

        # Chunks that were actually returned to the agent after reranking.
        returned_ids = output.get_unique_traversed_chunk_ids()

        expected_chunk_ids = set(dataset.get_expected_document_ids(output.query_id))
        expected_doc_ids = chunk_ids_to_doc_ids(expected_chunk_ids)

        relevant_pre_rerank = {
            cid
            for cid in pre_rerank_ids
            if _chunk_matches_expected(cid, expected_chunk_ids, expected_doc_ids)
        }
        if not relevant_pre_rerank:
            # No relevant chunks were in pre-rerank results, can't compute recall
            return None, 0

        relevant_kept = relevant_pre_rerank & returned_ids
        relevant_dropped = relevant_pre_rerank - returned_ids

        rerank_recall = len(relevant_kept) / len(relevant_pre_rerank)
        return rerank_recall, len(relevant_dropped)
def _read_positive_int_env(name: str, default: int) -> int:
    """Read environment variable *name* as an integer >= 1.

    Returns *default* when the variable is unset. When it is set but is not an
    integer, or is smaller than 1, a warning is logged and *default* is
    returned instead.
    """
    text = os.environ.get(name)
    if text is None:
        return default

    try:
        parsed = int(text)
    except ValueError:
        parsed = None

    if parsed is None:
        logger.warning("invalid_int_env", name=name, value=text, default=default)
        return default
    if parsed >= 1:
        return parsed

    logger.warning("invalid_positive_int_env", name=name, value=text, default=default)
    return default
def _warn_before_cosmos_retry(retry_state: tenacity.RetryCallState) -> None:
    """Log a warning before tenacity sleeps ahead of the next attempt."""
    outcome = retry_state.outcome
    logger.warning(
        "retry_cosmos_query",
        attempt=retry_state.attempt_number,
        error=str(outcome.exception()) if outcome else None,
    )


@tenacity.retry(
    stop=tenacity.stop_after_attempt(5),
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=15),
    retry=tenacity.retry_if_exception(_is_retryable_cosmos_error),
    before_sleep=_warn_before_cosmos_retry,
)
def _query_with_retry(
    container: ContainerProxy,
    query: str,
    parameters: list[dict[str, Any]],
    *,
    partition_key: str | None = None,
) -> list[dict[str, Any]]:
    """Execute a Cosmos NoSQL query with retry on transient errors.

    The query runs while holding the module-wide semaphore, which caps the
    number of Cosmos queries in flight at once. Without *partition_key* the
    query is issued as a cross-partition query. The result iterator is fully
    drained into a list before the semaphore is released.

    A warning is logged when the call takes longer than 4.5 s; the measured
    time includes any wait for the semaphore.
    """

    started = time.perf_counter()
    with _COSMOS_QUERY_SEMAPHORE:
        query_kwargs: dict[str, Any] = {"query": query, "parameters": parameters}
        if partition_key is None:
            query_kwargs["enable_cross_partition_query"] = True
        else:
            query_kwargs["partition_key"] = partition_key
        rows = list(container.query_items(**query_kwargs))

    duration_ms = (time.perf_counter() - started) * 1000
    if duration_ms > 4500:
        logger.warning(
            "slow_cosmos_query",
            elapsed_ms=round(duration_ms, 1),
            cosmos_max_concurrency=COSMOS_QUERY_MAX_CONCURRENCY,
        )
    return rows
_STOPWORDS = frozenset(
    [
        "a", "about", "above", "after", "again", "against", "all", "am", "an",
        "and", "any", "are", "as", "at", "be", "because", "been", "before",
        "being", "below", "between", "both", "but", "by", "can", "did", "do",
        "does", "doing", "don", "down", "during", "each", "few", "for", "from",
        "further", "had", "has", "have", "having", "he", "her", "here", "hers",
        "herself", "him", "himself", "his", "how", "i", "if", "in", "into",
        "is", "it", "its", "itself", "just", "like", "me", "more", "most", "my",
        "myself", "no", "nor", "not", "now", "of", "off", "on", "once", "only",
        "or", "other", "our", "ours", "ourselves", "out", "over", "own",
        "please", "same", "she", "should", "so", "some", "such", "tell", "than",
        "that", "the", "their", "theirs", "them", "themselves", "then", "there",
        "these", "they", "this", "those", "through", "to", "too", "under",
        "until", "up", "very", "was", "we", "were", "what", "when", "where",
        "which", "while", "who", "whom", "why", "will", "with", "would", "you",
        "your", "yours", "yourself", "yourselves",
    ]
)

_FTS_MAX_TERMS = 30  # Cosmos hard limit on FullTextScore arity.


def _tokenize_for_fts(query: str) -> list[str]:
    """Turn *query* into FullTextScore terms.

    Word tokens are lowercased; stopwords and repeats are skipped; first-seen
    order is preserved; collection stops once ``_FTS_MAX_TERMS`` terms are kept.
    """

    # A dict doubles as an insertion-ordered set of the terms kept so far.
    kept: dict[str, None] = {}
    for match in _TOKEN_RE.finditer(query):
        token = match.group().lower()
        if token in _STOPWORDS or token in kept:
            continue
        kept[token] = None
        if len(kept) >= _FTS_MAX_TERMS:
            break
    return list(kept)


def _fts_literal_args(terms: list[str]) -> str:
    """Render terms as comma-separated string literals for FullTextScore.

    The 2nd+ arguments of FullTextScore must be literals, not bound parameters,
    so each term is double-quoted with backslashes and double quotes escaped.
    """

    quoted: list[str] = []
    for term in terms:
        # Escape backslashes first so the quote escapes are not doubled.
        escaped = term.replace("\\", "\\\\").replace('"', '\\"')
        quoted.append(f'"{escaped}"')
    return ", ".join(quoted)
+ ), + parameters={ + "query": { + "type": "string", + "description": "The search query to find relevant documents in the corpus.", + } + }, + required=["query"], +) + +READ_DOCUMENT_SCHEMA = ToolSchema( + name="read_document", + description="Reads the content of a document based on its ID.", + parameters={ + "doc_id": { + "type": "string", + "description": "The unique identifier of the document to read.", + } + }, + required=["doc_id"], +) + +GREP_CORPUS_SCHEMA = ToolSchema( + name="grep_corpus", + description="Performs a regex search on the corpus to find documents matching the query.", + parameters={ + "pattern": { + "type": "string", + "description": "The regex query to search for in the corpus.", + } + }, + required=["pattern"], +) + +MULTI_TOOL_USE_SCHEMA = ToolSchema( + name="multi_tool_use", + description="Allows the agent to use multiple tools in parallel to gather information.", + parameters={ + "tool_calls": { + "type": "array", + "description": "List of tool calls to execute in parallel.", + "items": { + "type": "object", + "properties": { + "tool_name": {"type": "string"}, + "parameters": {"type": "object"}, + }, + "required": ["tool_name", "parameters"], + }, + } + }, + required=["tool_calls"], +) + +PRUNE_CHUNKS_SCHEMA = ToolSchema( + name="prune_chunks", + description=( + "Prunes the chunks by id that are not relevant to the main question from the " + "history of the conversation." 
class SearchCorpusTool(Tool):
    """Hybrid (vector + full-text RRF) corpus search backed by Cosmos DB.

    Each call embeds the query, asks Cosmos for the top ``search_limit`` chunks
    ordered by ``RANK RRF(VectorDistance, FullTextScore)``, optionally reranks
    them, and returns at most ``display_limit`` chunks formatted under
    ``# DOCUMENT ID:`` headers.
    """

    tool_schema: ToolSchema
    _cosmos_database: DatabaseProxy
    _container: ContainerProxy
    _openai_client: openai.OpenAI
    _openai_ef_name: str
    _embed_query_instruction: str | None
    _reranker: Reranker | None
    _search_limit: int
    _display_limit: int

    def __init__(
        self,
        cosmos_database: DatabaseProxy,
        openai_client: openai.OpenAI,
        cosmos_container_name: str,
        openai_ef_name: str = "text-embedding-3-small",
        embed_query_instruction: str | None = None,
        reranker: Reranker | None = None,
        search_limit: int = 50,
        display_limit: int = 10,
    ) -> None:
        """Bind the tool to a Cosmos container and an embeddings client.

        Args:
            cosmos_database: Database that owns the corpus container.
            openai_client: Client used to embed queries.
            cosmos_container_name: Name of the corpus container.
            openai_ef_name: Embedding model name.
            embed_query_instruction: Optional instruction prepended to the
                query text before embedding.
            reranker: Optional reranker applied to the Cosmos hits.
            search_limit: ``TOP`` value for the Cosmos query.
            display_limit: Maximum number of hits included in the output.
        """
        super().__init__(tool_schema=SEARCH_CORPUS_SCHEMA)
        self._cosmos_database = cosmos_database
        self._container = cosmos_database.get_container_client(cosmos_container_name)
        self._openai_client = openai_client
        self._openai_ef_name = openai_ef_name
        self._embed_query_instruction = embed_query_instruction
        self._reranker = reranker
        self._search_limit = search_limit
        self._display_limit = display_limit

    def __call__(
        self,
        params: dict[Any, Any],
        overrides: dict[Any, Any] | None = None,
    ) -> tuple[str, SearchCorpusToolCallMetadata | None]:
        """Run one hybrid search.

        Args:
            params: Must be a dict containing ``"query"``.
            overrides: Optional caller-side overrides. ``"ignore_ids"`` lists
                chunk ids to exclude in the query; ``"max_tokens"`` is passed
                to the reranker.

        Returns:
            The formatted hits (or ``"No results found"``) and metadata with
            the ids of the chunks shown. When a reranker ran, the metadata also
            carries the ids Cosmos returned before reranking, which is what the
            rerank evaluation metrics read.

        Raises:
            ValueError: If ``params`` is not a dict or lacks ``"query"``.
        """
        log = logger.bind(tool=self.tool_schema.name)
        if not isinstance(params, dict) or "query" not in params:
            log.error("invalid_params", params_type=type(params).__name__)
            raise ValueError(f"Invalid params type: {type(params)}")

        query = params["query"]
        ignore_ids: list[str] = []
        if overrides is not None and "ignore_ids" in overrides:
            ignore_ids = overrides["ignore_ids"]
        log.info("search_corpus", query=query, ignore_ids=len(ignore_ids))

        terms = _tokenize_for_fts(query)
        if not terms:
            # FullTextScore needs at least one term; fall back to the raw query.
            terms = [query.strip() or "_"]
        dense_vec = self._embed_query(query)

        sql_parts = ["SELECT TOP @k c.id, c.text, c.docid, c.chunk_idx FROM c"]
        parameters: list[dict[str, Any]] = [
            {"name": "@k", "value": self._search_limit},
            {"name": "@qVec", "value": dense_vec},
        ]
        if ignore_ids:
            sql_parts.append("WHERE NOT ARRAY_CONTAINS(@ignore, c.id)")
            parameters.append({"name": "@ignore", "value": ignore_ids})
        # The FTS terms are inlined as escaped literals because FullTextScore
        # does not accept bound parameters for them.
        sql_parts.append(
            "ORDER BY RANK RRF("
            "VectorDistance(c.embedding, @qVec), "
            f"FullTextScore(c.text, {_fts_literal_args(terms)})"
            ")"
        )
        sql = "\n".join(sql_parts)

        rows = _query_with_retry(self._container, sql, parameters)
        ids = [r["id"] for r in rows]
        # ``or ""`` also covers a row whose text is present but null.
        documents: list[str] = [r.get("text") or "" for r in rows]

        max_tokens_override = overrides.get("max_tokens") if overrides else None

        pre_rerank_ids: list[str] | None = None
        token_counts: list[int | None] = [None] * len(ids)
        if self._reranker is not None and ids:
            # Remember what Cosmos returned so evaluation can see what the
            # reranker dropped.
            pre_rerank_ids = list(ids)
            rerank_results = self._reranker(query, documents, max_tokens=max_tokens_override)
            ids = [ids[r.original_index] for r in rerank_results]
            documents = [r.document for r in rerank_results]
            token_counts = [r.tokens for r in rerank_results]
            log.info("reranked_results", num_results=len(ids))

        formatted = [
            "\n# DOCUMENT ID: {}{} \n{}".format(
                id_,
                f" ({tokens} tokens)" if tokens is not None else "",
                doc[:DOC_TRUNCATION],
            )
            for id_, doc, tokens in zip(ids, documents, token_counts, strict=True)
        ][: self._display_limit]

        text = "\n".join(formatted) if ids else "No results found"
        return text, SearchCorpusToolCallMetadata(
            # Only report the ids that were actually shown to the agent.
            returned_chunk_ids=ids[: len(formatted)],
            pre_rerank_chunk_ids=pre_rerank_ids,
        )

    def _embed_query(self, text: str) -> list[float]:
        """Embed *text*, prefixed with the configured instruction if there is one."""
        if self._embed_query_instruction:
            text = f"Instruct: {self._embed_query_instruction}\nQuery: {text}"
        resp = self._openai_client.embeddings.create(
            model=self._openai_ef_name, input=[text], encoding_format="float"
        )
        return resp.data[0].embedding
+ """ + + tool_schema: ToolSchema + _cosmos_database: DatabaseProxy + _container: ContainerProxy + _token_counter: Callable[[str], int] | None + + def __init__( + self, + cosmos_database: DatabaseProxy, + cosmos_container_name: str, + token_counter: Callable[[str], int] | None = None, + ) -> None: + super().__init__(tool_schema=GREP_CORPUS_SCHEMA) + self._cosmos_database = cosmos_database + self._container = cosmos_database.get_container_client(cosmos_container_name) + self._token_counter = token_counter + + def __call__( + self, + params: dict[Any, Any], + overrides: dict[Any, Any] | None = None, + ) -> tuple[str, ToolCallMetadata | None]: + log = logger.bind(tool=self.tool_schema.name) + if not isinstance(params, dict) or "pattern" not in params: + log.error("invalid_params", params_type=type(params).__name__) + raise ValueError(f"Invalid params type: {type(params)}") + + pattern = params["pattern"] + log.info("grep_corpus", pattern=pattern) + + terms = _tokenize_for_fts(pattern) + if not terms: + return "No results found", GrepCorpusToolCallMetadata(returned_chunk_ids=[]) + + sql = ( + "SELECT TOP 50 c.id, c.text, c.docid FROM c " + "ORDER BY RANK FullTextScore(c.text, " + _fts_literal_args(terms) + ")" + ) + candidate_rows = _query_with_retry(self._container, sql, []) + + try: + regex = re.compile(pattern, re.IGNORECASE) + rows = [r for r in candidate_rows if regex.search(r.get("text", ""))][:5] + except re.error: + rows = candidate_rows[:5] + + ids = [r["id"] for r in rows] + documents = [r.get("text", "") for r in rows] + token_counts: list[int | None] = ( + [self._token_counter(doc) for doc in documents] + if self._token_counter is not None + else [None] * len(documents) + ) + + formatted = [ + "\n# DOCUMENT ID: {}{} \n{}".format( + id_, + f" ({tokens} tokens)" if tokens is not None else "", + doc[:DOC_TRUNCATION], + ) + for id_, doc, tokens in zip(ids, documents, token_counts, strict=True) + ] + text = "\n".join(formatted) if ids else "No results found" + 
return text, GrepCorpusToolCallMetadata(returned_chunk_ids=ids) + + +class ReadDocumentTool(Tool): + """Reads all chunks for a document (partitioned by docid).""" + + tool_schema: ToolSchema + _cosmos_database: DatabaseProxy + _container: ContainerProxy + _reranker: Reranker | None + _token_counter: Callable[[str], int] | None + _max_tokens: int | None + + def __init__( + self, + cosmos_database: DatabaseProxy, + cosmos_container_name: str, + reranker: Reranker | None = None, + token_counter: Callable[[str], int] | None = None, + max_tokens: int | None = None, + ) -> None: + if max_tokens is not None and token_counter is None: + raise ValueError("token_counter is required when max_tokens is specified") + super().__init__(tool_schema=READ_DOCUMENT_SCHEMA) + self._cosmos_database = cosmos_database + self._container = cosmos_database.get_container_client(cosmos_container_name) + self._reranker = reranker + self._token_counter = token_counter + self._max_tokens = max_tokens + + def __call__( + self, + params: dict[Any, Any], + overrides: dict[Any, Any] | None = None, + ) -> tuple[str, ToolCallMetadata | None]: + log = logger.bind(tool=self.tool_schema.name) + if not isinstance(params, dict) or ("doc_id" not in params and "id" not in params): + log.error("invalid_params", params_type=type(params).__name__) + raise ValueError(f"Invalid params type: {type(params)}") + + doc_id = params.get("doc_id") or params.get("id") + log.info("read_document", doc_id=doc_id) + # Ingest format is "__"; tolerate either form. 
+ if isinstance(doc_id, str) and "__" in doc_id: + doc_id = doc_id.split("__", 1)[0] + + sql = ( + "SELECT TOP 300 c.id, c.text, c.chunk_idx, c.docid FROM c " + "WHERE c.docid = @doc_id" + ) + parameters = [{"name": "@doc_id", "value": doc_id}] + rows = _query_with_retry(self._container, sql, parameters, partition_key=doc_id) + rows.sort(key=lambda r: r.get("chunk_idx", 0)) + documents = [r.get("text", "") for r in rows] + assembled = "".join(cast(list[str], documents)) + + query = overrides.get("query") if overrides else None + max_tokens = ( + overrides.get("max_tokens") if overrides and "max_tokens" in overrides else None + ) or self._max_tokens + + if self._reranker is not None and query is not None and max_tokens is not None: + rerank_results = self._reranker(query, cast(list[str], documents), max_tokens=max_tokens) + kept_indices = {r.original_index for r in rerank_results} + kept_docs = [documents[i] for i in range(len(documents)) if i in kept_indices] + assembled = "".join(kept_docs) + log.info("reranked_and_filtered", original=len(documents), kept=len(kept_docs)) + elif self._token_counter is not None and max_tokens is not None: + total_tokens = self._token_counter(assembled) + if total_tokens > max_tokens: + truncated: list[str] = [] + running = 0 + for doc in documents: + n = self._token_counter(doc) + if running + n > max_tokens: + break + truncated.append(doc) + running += n + assembled = "".join(truncated) + log.info("truncated_by_tokens", original=len(documents), kept=len(truncated)) + + if self._token_counter is not None: + token_count = self._token_counter(assembled) + return f"# Document ({token_count} tokens)\n{assembled}", None + return assembled, None + + +class PruneChunksTool(Tool): + """No-op tool used to record which chunks should be elided in subsequent turns.""" + + tool_schema: ToolSchema + + def __init__(self) -> None: + super().__init__(tool_schema=PRUNE_CHUNKS_SCHEMA) + + def __call__( + self, + params: dict[Any, Any], + overrides: 
dict[Any, Any] | None = None, + ) -> tuple[str, ToolCallMetadata | None]: + log = logger.bind(tool=self.tool_schema.name) + if not isinstance(params, dict) or "chunk_ids" not in params: + log.error("invalid_params", params_type=type(params).__name__) + raise ValueError(f"Invalid params type: {type(params)}") + log.info("prune_chunks", chunk_ids=len(params["chunk_ids"])) + return "Pruned", None + + +_ToolSetT: TypeAlias = "ToolSet" + + +class MultiToolUseTool(Tool): + """Wraps a parallel tool-call bundle for models without native parallel calls. + + The trained Harness-1 model emits a single ``functions.multi_tool_use`` + call with a list of inner ``{tool_name, parameters}`` entries; the inner + tools are dispatched serially against the bound :class:`ToolSet`. + """ + + tool_schema: ToolSchema + toolset: _ToolSetT + + def __init__(self, toolset: ToolSet) -> None: + super().__init__(tool_schema=MULTI_TOOL_USE_SCHEMA, toolset=toolset) + + def __call__( + self, + params: dict[Any, Any], + overrides: dict[Any, Any] | None = None, + ) -> tuple[str, ToolCallMetadata | None]: + results: list[str] = [] + for tool_call in params["tool_calls"]: + tool = self.toolset.get_tool(tool_call["tool_name"]) + if tool is None: + raise ValueError(f"Tool {tool_call['tool_name']} not found in toolset") + output, _ = tool(tool_call["parameters"]) + results.append(output) + return json.dumps(results), None + + +class UserTextTool(Tool): + """Sentinel tool representing assistant text in a trajectory.""" + + tool_schema: ToolSchema + + def __init__(self) -> None: + super().__init__( + tool_schema=ToolSchema( + name="user_text", + description="Produces text for the user.", + parameters={}, + required=[], + ) + ) + + def __call__( + self, + params: dict[Any, Any], + overrides: dict[Any, Any] | None = None, + ) -> tuple[str, ToolCallMetadata | None]: + raise ValueError("UserTextTool should not be called directly") + + +# 
# ============================================================================
# Stub tools used by the ultra_core working-memory env
# ----------------------------------------------------------------------------
# The classes below exist so that their schemas are advertised to the model.
# Their behaviour is not implemented here: per the notes in this file it is
# dispatched by :class:`cosmos_retriever.env.UltraSearchEnv`, which has access
# to the cross-turn :class:`WorkingMemory`.
# ============================================================================


def _stub_tool(name_for_error: str):
    """Build a ``__call__`` implementation that always raises ``NotImplementedError``.

    ``name_for_error`` is only used to label the error message.
    """

    def _raise_not_dispatched(self, params, overrides=None):
        message = f"{name_for_error} is dispatched by UltraSearchEnv, not the tool itself"
        raise NotImplementedError(message)

    return _raise_not_dispatched


class FanOutSearchTool(Tool):
    """Stub: dispatched by env. Runs N parallel ``search_corpus`` calls."""

    tool_schema: ToolSchema

    def __init__(self) -> None:
        # Imported here rather than at module top; ultra_core itself imports
        # from this module.
        from cosmos_retriever.ultra_core import FAN_OUT_SEARCH_SCHEMA as schema

        super().__init__(tool_schema=schema)

    __call__ = _stub_tool("fan_out_search")


class CurateTool(Tool):
    """Stub: dispatched by env. Updates :class:`WorkingMemory.curated_ids`."""

    tool_schema: ToolSchema

    def __init__(self) -> None:
        from cosmos_retriever.ultra_core import CURATE_SCHEMA as schema

        super().__init__(tool_schema=schema)

    __call__ = _stub_tool("curate")


class EndSearchTool(Tool):
    """Sentinel tool — when called, the env terminates the episode."""

    tool_schema: ToolSchema

    def __init__(self) -> None:
        from cosmos_retriever.ultra_core import END_SEARCH_SCHEMA as schema

        super().__init__(tool_schema=schema)

    def __call__(self, params, overrides=None):
        """Return a fixed acknowledgement; the arguments are ignored."""
        acknowledgement = "Search concluded."
        return acknowledgement, None


class ReviewDocsTool(Tool):
    """Stub: dispatched by env. Returns full text of previously-found docs."""

    tool_schema: ToolSchema

    def __init__(self) -> None:
        from cosmos_retriever.ultra_core import REVIEW_DOCS_SCHEMA as schema

        super().__init__(tool_schema=schema)

    __call__ = _stub_tool("review_docs")


class VerifyTool(Tool):
    """Stub: dispatched by env (v8d). Verifies a claim against doc IDs."""

    tool_schema: ToolSchema

    def __init__(self) -> None:
        from cosmos_retriever.ultra_core import VERIFY_SCHEMA as schema

        super().__init__(tool_schema=schema)

    __call__ = _stub_tool("verify")


# ============================================================================
# ToolSet
# ============================================================================


class ToolSet(BaseModel):
    """A named registry of :class:`Tool` instances, keyed by schema name."""

    tools: dict[str, Tool] = Field(default_factory=dict)
    name: str | None = None

    def add_tool(self, tool: Tool) -> None:
        """Register ``tool``; raise ``ValueError`` if its name is already taken."""
        key = tool.tool_schema.name
        if key in self.tools:
            raise ValueError(f"Tool with name {key} already exists")
        self.tools[key] = tool

    def remove_tool(self, name: str) -> None:
        """Drop the tool called ``name``; unknown names are ignored."""
        self.tools.pop(name, None)

    def get_tool(self, name: str) -> Tool | None:
        """Return the tool called ``name``, or ``None`` when it is not registered."""
        return self.tools.get(name)

    def get_formats(self, provider: ProviderFormat) -> list[dict[str, Any]]:
        """Return every tool's ``get_format(provider)`` result, in insertion order."""
        formats: list[dict[str, Any]] = []
        for registered in self.tools.values():
            formats.append(registered.get_format(provider))
        return formats

    def __repr__(self) -> str:
        label = f" ({self.name})" if self.name else ""
        listing = ", ".join(sorted(self.tools))
        return f"ToolSet{label}[{len(self.tools)} tools: {listing}]"

    @classmethod
    def build(
        cls,
        *,
        cosmos_database: DatabaseProxy,
        cosmos_container_name: str,
        openai_client: openai.OpenAI,
        openai_embedding_model: str = "text-embedding-3-small",
        embed_query_instruction: str | None = None,
        reranker: Reranker | None = None,
        token_counter: Callable[[str], int] | None = None,
        max_tokens: int | None = None,
        search_limit: int = 50,
        search_display_limit: int = 10,
        include_ultra_tools: bool = False,
        name: str | None = None,
    ) -> ToolSet:
        """Assemble the standard retrieval :class:`ToolSet`.

        The result always holds :class:`SearchCorpusTool`,
        :class:`GrepCorpusTool`, :class:`ReadDocumentTool` and
        :class:`PruneChunksTool`, registered in that order.

        With ``include_ultra_tools`` set, the stub ``fan_out_search``,
        ``curate``, ``review_docs`` and ``end_search`` tools are registered
        after them.
        """

        shared = {
            "cosmos_database": cosmos_database,
            "cosmos_container_name": cosmos_container_name,
        }
        toolset = cls(name=name)

        search_tool = SearchCorpusTool(
            openai_client=openai_client,
            openai_ef_name=openai_embedding_model,
            embed_query_instruction=embed_query_instruction,
            reranker=reranker,
            search_limit=search_limit,
            display_limit=search_display_limit,
            **shared,
        )
        toolset.add_tool(search_tool)

        grep_tool = GrepCorpusTool(token_counter=token_counter, **shared)
        toolset.add_tool(grep_tool)

        read_tool = ReadDocumentTool(
            reranker=reranker,
            token_counter=token_counter,
            max_tokens=max_tokens,
            **shared,
        )
        toolset.add_tool(read_tool)

        toolset.add_tool(PruneChunksTool())

        if not include_ultra_tools:
            return toolset
        for stub_cls in (FanOutSearchTool, CurateTool, ReviewDocsTool, EndSearchTool):
            toolset.add_tool(stub_cls())
        return toolset
"SearchCorpusToolCallMetadata", + "SerializedTool", + "Tool", + "ToolCallMetadata", + "ToolSchema", + "ToolSet", + "UserTextTool", +] diff --git a/cosmos-retriever/src/cosmos_retriever/trajectory.py b/cosmos-retriever/src/cosmos_retriever/trajectory.py new file mode 100644 index 0000000..c7eecf7 --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/trajectory.py @@ -0,0 +1,491 @@ +"""Trajectory data structures + provider-format rendering. + +Carried over from upstream Harness-1 with everything that isn't required at +inference time stripped out: + +* Anthropic / Moonshot / OpenAI-Responses provider formats — gone. +* :pymeth:`Trajectory.deserialize` (loads saved-to-disk trajectories) — gone; + the retriever never re-hydrates a trajectory from JSON. + +What remains is enough to feed the trained Harness-1 model on vLLM +(:py:meth:`Trajectory.to_openai_harmony_format`) and to dump trajectories +for debugging (:py:meth:`Trajectory.to_openai_format`). +""" + +from __future__ import annotations + +import copy +import json +import uuid +from collections.abc import Iterator +from datetime import UTC, datetime +from typing import Any, Literal + +import structlog +from openai_harmony import ( + Author, + Conversation, + DeveloperContent, + Message, + ReasoningEffort, + Role, + SystemContent, + ToolDescription, +) +from pydantic import BaseModel, SerializeAsAny, model_validator + +from cosmos_retriever.tools import ( + GREP_CORPUS_SCHEMA, + MULTI_TOOL_USE_SCHEMA, + PRUNE_CHUNKS_SCHEMA, + READ_DOCUMENT_SCHEMA, + SEARCH_CORPUS_SCHEMA, + MultiToolUseTool, + SerializedTool, + Tool, + ToolCallMetadata, + UserTextTool, +) +from cosmos_retriever.utils import ProviderFormat + +logger = structlog.get_logger("cosmos_retriever.trajectory") + +Source = str | Literal["user"] | Literal["agent"] +"""Identifier for who produced an entry: a tool-call id, ``"user"``, or ``"agent"``.""" + + +# ============================================================================ +# Action +# 
class Action(BaseModel):
    """A single agent step: parallel lists of tools, params and sources, plus reasoning."""

    tools: list[Tool]
    params: list[dict]
    sources: list[Source]
    reasoning: str | None = None

    def as_iter(self) -> Iterator[tuple[Tool, dict, Source]]:
        """Iterate ``(tool, params, source)`` triples; the three lists must be equal length."""
        triples = zip(self.tools, self.params, self.sources, strict=True)
        return iter(triples)

    @model_validator(mode="before")
    @classmethod
    def _deserialize_tools(cls, data: Any) -> Any:
        """Turn serialised tool dicts into :class:`Tool` instances before validation."""

        if not isinstance(data, dict):
            return data
        raw_tools = data.get("tools")
        if not raw_tools:
            return data

        def _resolve(entry: Any) -> Any:
            # Tool instances, and anything that is not a dict, pass through as-is.
            if isinstance(entry, Tool) or not isinstance(entry, dict):
                return entry
            schema = entry.get("tool_schema")
            if schema is None:
                raise ValueError("Serialized tool entry missing 'tool_schema'")
            if schema.get("name") == "user_text":
                return UserTextTool()
            return SerializedTool(tool_schema=schema)

        resolved: list[Tool] = [_resolve(entry) for entry in raw_tools]
        # Work on a shallow copy so the caller's mapping is not modified.
        updated = data.copy()
        updated["tools"] = resolved
        return updated


class ActionBuilder:
    """Incrementally assembles an :class:`Action`."""

    def __init__(self) -> None:
        self.action = Action(tools=[], params=[], sources=[], reasoning=None)

    def add_tool_call(self, tool: Tool, params: dict, source: Source) -> ActionBuilder:
        """Append one tool call; ``MultiToolUseTool`` is rejected with ``ValueError``."""
        if isinstance(tool, MultiToolUseTool):
            raise ValueError("MultiToolUseTool should not be added to an action builder")
        pending = self.action
        pending.tools.append(tool)
        pending.params.append(params)
        pending.sources.append(source)
        return self

    def add_reasoning(self, reasoning: str) -> ActionBuilder:
        """Attach reasoning text; a second call raises ``ValueError``."""
        if self.action.reasoning is not None:
            raise ValueError("Reasoning already added for this action")
        self.action.reasoning = reasoning
        return self

    def is_complete(self) -> bool:
        """True when reasoning is set, or at least one tool call is recorded consistently."""
        if self.action.reasoning is not None:
            return True
        tool_count = len(self.action.tools)
        if tool_count == 0:
            return False
        return tool_count == len(self.action.params) == len(self.action.sources)

    def build(self) -> Action:
        """Return the assembled action, or raise ``ValueError`` if it is incomplete."""
        if self.is_complete():
            return self.action
        raise ValueError(
            "ActionBuilder is not complete: missing tools/params/sources or reasoning"
        )


# ============================================================================
# Observation
# ============================================================================


class Observation(BaseModel):
    """Parallel lists of output texts, their sources, and optional per-call metadata."""

    observations: list[str]
    sources: list[Source]
    tool_metadata: list[SerializeAsAny[ToolCallMetadata] | None]


class ObservationBuilder:
    """Incrementally assembles an :class:`Observation`."""

    def __init__(self) -> None:
        self.observations: list[str] = []
        self.sources: list[Source] = []
        self.tool_metadata: list[ToolCallMetadata | None] = []

    def add_observation(
        self,
        observation: str,
        source: Source,
        tool_metadata: ToolCallMetadata | None = None,
    ) -> ObservationBuilder:
        """Append one output with its source and optional metadata; returns ``self``."""
        self.observations.append(observation)
        self.sources.append(source)
        self.tool_metadata.append(tool_metadata)
        return self

    def is_complete(self) -> bool:
        """True when at least one output is recorded and all three lists line up."""
        count = len(self.observations)
        if count == 0:
            return False
        return len(self.tool_metadata) == count and len(self.sources) == count

    def build(self) -> Observation:
        """Return the assembled observation, or raise ``ValueError`` if incomplete."""
        if self.is_complete():
            return Observation(
                observations=self.observations,
                sources=self.sources,
                tool_metadata=self.tool_metadata,
            )
        raise ValueError("ObservationBuilder is not complete")
class Trajectory(BaseModel):
    """A sequence of alternating :class:`Action` and :class:`Observation` entries."""

    actions_and_observations: list[Action | Observation]
    id: uuid.UUID

    @property
    def num_turns(self) -> int:
        """Number of :class:`Action` entries in the trajectory."""
        return sum(1 for entry in self.actions_and_observations if isinstance(entry, Action))

    def clone(self) -> Trajectory:
        """Deep-copy the trajectory while keeping Tool references shared.

        Tool instances may hold unpicklable HTTP clients, so we cannot use
        :py:meth:`pydantic.BaseModel.model_copy(deep=True)`. We use
        :py:meth:`pydantic.BaseModel.model_construct` to skip validation since
        the data is already validated.
        """

        cloned: list[Action | Observation] = []
        for entry in self.actions_and_observations:
            if isinstance(entry, Action):
                cloned.append(
                    Action.model_construct(
                        tools=list(entry.tools),
                        params=copy.deepcopy(entry.params),
                        sources=list(entry.sources),
                        reasoning=entry.reasoning,
                    )
                )
            else:
                cloned.append(
                    Observation.model_construct(
                        observations=list(entry.observations),
                        sources=list(entry.sources),
                        tool_metadata=list(entry.tool_metadata),
                    )
                )
        return Trajectory.model_construct(actions_and_observations=cloned, id=self.id)

    def __repr__(self) -> str:
        lines = ["Trajectory:\n"]
        for i, item in enumerate(self.actions_and_observations):
            if isinstance(item, Action):
                lines.append(f"[Step {i}] [Action] {item.tools!r} with params {item.params}\n")
            else:
                # Only the first 100 characters of each output are shown.
                snippet = [obs[:100] for obs in item.observations]
                lines.append(f"[Step {i}] [Observation] {snippet}...\n")
            lines.append("\n")
        return "".join(lines)

    def to_provider_format(self, provider: ProviderFormat) -> Any:
        """Render for ``provider``; raises ``ValueError`` for unsupported formats."""
        if provider is ProviderFormat.OPENAI_HARMONY:
            return self.to_openai_harmony_format()
        if provider is ProviderFormat.OPENAI:
            return self.to_openai_format()
        raise ValueError(f"Unsupported provider format: {provider}")

    # ------------------------------------------------------------------
    # OpenAI Chat Completions (debug / serialisation only)
    # ------------------------------------------------------------------
    def to_openai_format(self) -> list[dict[str, Any]]:
        """Convert the trajectory into OpenAI Chat Completions message format."""

        def _make_text_content(text: str) -> dict[str, str]:
            # Blank text is replaced with a placeholder and logged.
            if text.strip() == "":
                logger.warning("empty_text_content_maybe_pruned")
                return {"type": "text", "text": "Maybe pruned?"}
            return {"type": "text", "text": text}

        messages: list[dict[str, Any]] = []
        for entry in self.actions_and_observations:
            if isinstance(entry, Action):
                content_items: list[dict[str, Any]] = []
                tool_calls: list[dict[str, Any]] = []
                for tool, params, source in entry.as_iter():
                    if isinstance(tool, UserTextTool):
                        content_items.append(_make_text_content(params.get("text", "")))
                    else:
                        tool_calls.append(
                            {
                                "id": str(source),
                                "type": "function",
                                "function": {
                                    "name": tool.tool_schema.name,
                                    "arguments": json.dumps(params),
                                },
                            }
                        )
                msg: dict[str, Any] = {
                    "role": "assistant",
                    "content": content_items if content_items else "",
                }
                if tool_calls:
                    msg["tool_calls"] = tool_calls
                messages.append(msg)
            else:
                for text, source in zip(entry.observations, entry.sources, strict=True):
                    if source == "user":
                        messages.append(
                            {"role": "user", "content": [_make_text_content(text)]}
                        )
                    else:
                        messages.append(
                            {
                                "role": "tool",
                                "tool_call_id": str(source),
                                "content": [_make_text_content(text)],
                            }
                        )
        return messages

    # ------------------------------------------------------------------
    # OpenAI Harmony — the format the trained Harness-1 model expects
    # ------------------------------------------------------------------
    def to_openai_harmony_format(self) -> Conversation:
        """Render the trajectory as a Harmony :class:`Conversation`.

        The conversation starts with a system message (high reasoning effort,
        today's UTC date) and a developer message advertising the five base
        tool schemas, followed by the rendered actions and observations.
        """
        system_message = (
            SystemContent.new()
            .with_reasoning_effort(ReasoningEffort.HIGH)
            .with_conversation_start_date(datetime.now(UTC).strftime("%Y-%m-%d"))
        )
        messages: list[Message] = [Message.from_role_and_content(Role.SYSTEM, system_message)]

        def fmt_params(parameters: dict[str, Any], required: list[str]) -> dict[str, Any]:
            return {"type": "object", "properties": parameters, "required": required}

        # The advertised order is kept exactly as before.
        advertised_schemas = (
            SEARCH_CORPUS_SCHEMA,
            GREP_CORPUS_SCHEMA,
            READ_DOCUMENT_SCHEMA,
            MULTI_TOOL_USE_SCHEMA,
            PRUNE_CHUNKS_SCHEMA,
        )
        developer_message = DeveloperContent.new().with_function_tools(
            [
                ToolDescription.new(
                    schema.name,
                    schema.description,
                    fmt_params(schema.parameters, schema.required),
                )
                for schema in advertised_schemas
            ]
        )
        messages.append(Message.from_role_and_content(Role.DEVELOPER, developer_message))

        tool_use_source_to_tool_name: dict[str, str] = {}
        for entry in self.actions_and_observations:
            if isinstance(entry, Action):
                self._render_action_to_harmony(entry, messages, tool_use_source_to_tool_name)
            else:
                self._render_observation_to_harmony(entry, messages, tool_use_source_to_tool_name)
        return Conversation(messages=messages)

    @staticmethod
    def _render_action_to_harmony(
        action: Action,
        messages: list[Message],
        tool_use_source_to_tool_name: dict[str, str],
    ) -> None:
        """Append the Harmony messages for one action.

        Also records, per call source, the tool name that
        :meth:`_render_observation_to_harmony` later uses to label results.
        """
        if action.reasoning:
            messages.append(
                Message.from_role_and_content(Role.ASSISTANT, action.reasoning).with_channel(
                    "analysis"
                )
            )
        if len(action.tools) > 1:
            # GPT-OSS 20B was not trained with native parallel tool calls; pack
            # the bundle into a single multi_tool_use call on the commentary
            # channel.
            tool_calls: list[dict[str, Any]] = []
            for tool, params, source in action.as_iter():
                if isinstance(tool, UserTextTool):
                    messages.append(
                        Message.from_role_and_content(
                            Role.ASSISTANT, params["text"]
                        ).with_channel("final")
                    )
                else:
                    tool_calls.append(
                        {"tool_name": tool.tool_schema.name, "parameters": params}
                    )
                    tool_use_source_to_tool_name[str(source)] = tool.tool_schema.name
            if tool_calls:
                # A bundle holding only user-text entries has nothing to
                # dispatch; do not emit an empty multi_tool_use call for it.
                messages.append(
                    Message.from_role_and_content(Role.ASSISTANT, json.dumps(tool_calls))
                    .with_channel("commentary")
                    .with_recipient("functions.multi_tool_use")
                    .with_content_type("<|constrain|>json")
                )
        elif len(action.tools) == 1:
            tool = action.tools[0]
            params = action.params[0]
            source = action.sources[0]
            if isinstance(tool, UserTextTool):
                messages.append(
                    Message.from_role_and_content(
                        Role.ASSISTANT, params["text"]
                    ).with_channel("final")
                )
            else:
                messages.append(
                    Message.from_role_and_content(Role.ASSISTANT, json.dumps(params))
                    .with_channel("commentary")
                    .with_recipient("functions." + tool.tool_schema.name)
                    .with_content_type("<|constrain|>json")
                )
                # NOTE(review): the name is stored WITH the "functions." prefix
                # here, and the observation renderer prepends "functions."
                # again, so single-tool results are authored as
                # "functions.functions.<name>". Left unchanged because the
                # trained model may rely on this rendering - confirm.
                tool_use_source_to_tool_name[str(source)] = "functions." + tool.tool_schema.name

    @staticmethod
    def _render_observation_to_harmony(
        observation: Observation,
        messages: list[Message],
        tool_use_source_to_tool_name: dict[str, str],
    ) -> None:
        """Append the Harmony message for one observation.

        Raises:
            ValueError: If the observation is empty, mixes user text into a
                multi-tool result, or has a source that cannot be mapped to a
                tool name.
            KeyError: If a multi-tool result source was never recorded by the
                action renderer.
        """
        if not observation.observations:
            # Reachable through model_construct or direct construction; the
            # builder itself refuses to build an empty observation.
            raise ValueError("Observation has no entries to render")
        if len(observation.observations) > 1:
            tool_results: list[dict[str, Any]] = []
            for text, source in zip(observation.observations, observation.sources, strict=True):
                if source == "user":
                    raise ValueError("User text inside a multi-tool result observation")
                tool_name = tool_use_source_to_tool_name[str(source)]
                tool_results.append(
                    {"type": "tool_result", "name": tool_name, "content": [text]}
                )
            messages.append(
                Message.from_author_and_content(
                    Author(role=Role.TOOL, name="functions.multi_tool_use"),
                    json.dumps(tool_results),
                )
                .with_channel("commentary")
                .with_recipient("assistant")
            )
        else:
            text = observation.observations[0]
            source = observation.sources[0]
            if source == "user":
                messages.append(Message.from_role_and_content(Role.USER, text))
            else:
                source_str = str(source)
                if source_str in tool_use_source_to_tool_name:
                    tool_name = tool_use_source_to_tool_name[source_str]
                else:
                    # Fallback: take the tool name from the second "_"-separated
                    # field of a source that starts with "toolu".
                    parts = source_str.split("_")
                    if len(parts) >= 2 and parts[0] == "toolu":
                        tool_name = parts[1]
                    else:
                        raise ValueError(f"Unknown observation source: {source_str}")
                messages.append(
                    Message.from_author_and_content(
                        Author(role=Role.TOOL, name="functions." + tool_name),
                        json.dumps(text),
                    )
                    .with_channel("commentary")
                    .with_recipient("assistant")
                )


class TrajectoryBuilder:
    """Mutable builder for a :class:`Trajectory`."""

    def __init__(self) -> None:
        self.trajectory = Trajectory(actions_and_observations=[], id=uuid.uuid4())

    def add_action(self, action: Action) -> TrajectoryBuilder:
        """Append ``action`` and return ``self`` for chaining."""
        self.trajectory.actions_and_observations.append(action)
        return self

    def add_observation(self, observation: Observation) -> TrajectoryBuilder:
        """Append ``observation`` and return ``self`` for chaining."""
        self.trajectory.actions_and_observations.append(observation)
        return self

    def __len__(self) -> int:
        return len(self.trajectory.actions_and_observations)

    def build(self) -> Trajectory:
        """Return the trajectory being built (the same object, not a copy)."""
        return self.trajectory
# Each optional package is probed once at import time; the _HAS_* flag records
# whether it is available.
try:
    from rank_bm25 import BM25Okapi  # type: ignore
    _HAS_BM25 = True
except ImportError:
    _HAS_BM25 = False

try:
    from datasketch import MinHash, MinHashLSH  # type: ignore
    _HAS_MINHASH = True
except ImportError:
    _HAS_MINHASH = False
from openai_harmony import (
    Author,
    Conversation,
    DeveloperContent,
    HarmonyEncoding,
    HarmonyEncodingName,
    Message,
    ReasoningEffort,
    Role,
    SystemContent,
    ToolDescription,
    load_harmony_encoding,
)
from cosmos_retriever.tools import (
    ToolSchema,
    UserTextTool,
    SEARCH_CORPUS_SCHEMA,
    GREP_CORPUS_SCHEMA,
    READ_DOCUMENT_SCHEMA,
    MULTI_TOOL_USE_SCHEMA,
)
from cosmos_retriever.trajectory import Action, Observation

logger = structlog.get_logger()

# ═══════════════════════════════════════════════════════════════════════════════
# Constants
# ═══════════════════════════════════════════════════════════════════════════════
# Every os.environ lookup below runs once, when this module is imported; the
# second argument to .get() is the default used when the variable is unset.

RECENT_K = int(os.environ.get("RECENT_K", "5"))
FAN_OUT_MAX_QUERIES = 5
MAX_CURATED_DOCS = 30
DOC_SNIPPET_CHARS = int(os.environ.get("DOC_SNIPPET_CHARS", "120"))
CURATED_DOC_CHARS = int(os.environ.get("CURATED_DOC_CHARS", "0"))
MAX_REVIEW_DOCS = 5
SEARCH_DISPLAY_LIMIT = int(os.environ.get("SEARCH_DISPLAY_LIMIT", "10"))
MAX_TURNS = int(os.environ.get("MAX_TURNS", "35"))

MAX_OBS_CHARS = int(os.environ.get("MAX_OBS_CHARS", "15000"))
SEARCH_TOKEN_BUDGET = int(os.environ.get("SEARCH_TOKEN_BUDGET", "4096"))
MAX_ANALYSIS_CHARS_OLDER = int(os.environ.get("MAX_ANALYSIS_CHARS_OLDER", "300"))

# Token budget
MODEL_CTX_LIMIT = 32768
GENERATION_BUDGET = 2048
PROMPT_TOKEN_BUDGET = MODEL_CTX_LIMIT - GENERATION_BUDGET  # 30720

# Format retry
MAX_FORMAT_RETRIES = int(os.environ.get("MAX_FORMAT_RETRIES", "3"))
CURATE_NUDGE_INTERVAL = int(os.environ.get("CURATE_NUDGE_INTERVAL", "1"))

# Reward
REWARD_VERSION = os.environ.get("REWARD_VERSION", "v3")
RECALL_BETA = 2.0
OUTCOME_WEIGHT = float(os.environ.get("OUTCOME_WEIGHT", "0.7"))
TRAJECTORY_RECALL_WEIGHT = float(os.environ.get("TRAJECTORY_RECALL_WEIGHT", "0.3"))
FINAL_ANSWER_BONUS = float(os.environ.get("FINAL_ANSWER_BONUS", "1.0"))
# Boolean flags in this module are true only when the variable is exactly "1".
FINAL_ANSWER_BINARY = os.environ.get("FINAL_ANSWER_BINARY", "1") == "1"
# Dense final-answer shaping:
# - FINAL_ANSWER_RECALL_WEIGHT rewards putting answer docs into curated set.
# - TRAJECTORY_FA_RECALL_WEIGHT rewards finding answer docs in pool.
# - FA_MISS_PENALTY_WEIGHT penalizes cases where answer docs are in pool
#   but are not curated (selection failure).
FINAL_ANSWER_RECALL_WEIGHT = float(
    os.environ.get("FINAL_ANSWER_RECALL_WEIGHT", "0.8")
)
TRAJECTORY_FA_RECALL_WEIGHT = float(
    os.environ.get("TRAJECTORY_FA_RECALL_WEIGHT", "0.4")
)
FA_MISS_PENALTY_WEIGHT = float(
    os.environ.get("FA_MISS_PENALTY_WEIGHT", "0.35")
)
MIN_FORMAT_REWARD = 0.001
FORMAT_ERROR_PENALTY = float(os.environ.get("FORMAT_ERROR_PENALTY", "-0.5"))
NO_CURATE_PENALTY = float(os.environ.get("NO_CURATE_PENALTY", "-0.2"))
GAP_PENALTY_WEIGHT = float(os.environ.get("GAP_PENALTY_WEIGHT", "0.0"))

# Turn penalty (linear ramp from 0 at TURN_PENALTY_MIN_TURNS to TURN_PENALTY_MAX at MAX_TURNS)
TURN_PENALTY_MAX = float(os.environ.get("TURN_PENALTY_MAX", "0.15"))
TURN_PENALTY_MIN_TURNS = int(os.environ.get("TURN_PENALTY_MIN_TURNS", "24"))

# Reward shaping (legacy, kept for compat but defaults zeroed)
TARGET_CURATE_RATE = float(os.environ.get("TARGET_CURATE_RATE", "0.40"))
CURATE_RATE_BONUS_WEIGHT = float(os.environ.get("CURATE_RATE_BONUS_WEIGHT", "0.0"))
# NOTE: the env var is TOOL_DIVERSITY_BONUS (no _WEIGHT suffix), unlike the constant name.
TOOL_DIVERSITY_BONUS_WEIGHT = float(os.environ.get("TOOL_DIVERSITY_BONUS", "0.0"))
TOOL_DIVERSITY_TARGET = int(os.environ.get("TOOL_DIVERSITY_TARGET", "3"))
TOOL_DIVERSITY_SHORTFALL_PENALTY = float(
    os.environ.get("TOOL_DIVERSITY_SHORTFALL_PENALTY", "0.0")
)
CONSEC_SEARCH_PENALTY = float(os.environ.get("CONSEC_SEARCH_PENALTY", "0.08"))
MAX_CONSEC_BEFORE_PENALTY = int(os.environ.get("MAX_CONSEC_BEFORE_PENALTY", "1"))
CONSEC_SEARCH_PENALTY_CAP = 0.4

# Windowing
WINDOW_SIZE = int(os.environ.get("WINDOW_SIZE", "5"))
WINDOW_STRIDE = int(os.environ.get("WINDOW_STRIDE", "5"))  # legacy, kept for compat
MAX_WINDOWS = int(os.environ.get("MAX_WINDOWS", "4"))

# ───────────────────────────────────────────────────────────────────────────────
# v8d feature flags (all default OFF — enabled explicitly via launch_v8d_rl.sh)
# ───────────────────────────────────────────────────────────────────────────────
V8D_SUBTRACTIVE_CURATION = os.environ.get("V8D_SUBTRACTIVE_CURATION", "0") == "1"
V8D_IMPORTANCE_TAGGING = os.environ.get("V8D_IMPORTANCE_TAGGING", "0") == "1"
V8D_AUTO_POPULATE_FIRST_SEARCH = os.environ.get("V8D_AUTO_POPULATE_FIRST_SEARCH", "0") == "1"
V8D_EVIDENCE_GRAPH = os.environ.get("V8D_EVIDENCE_GRAPH", "0") == "1"
V8D_SENTENCE_COMPRESS = os.environ.get("V8D_SENTENCE_COMPRESS", "0") == "1"
V8D_CHUNK_NEIGHBORS = os.environ.get("V8D_CHUNK_NEIGHBORS", "0") == "1"
V8D_CONTENT_DEDUP = os.environ.get("V8D_CONTENT_DEDUP", "0") == "1"
V8D_VERIFY_TOOL = os.environ.get("V8D_VERIFY_TOOL", "0") == "1"
V8D_TOKEN_BUDGET_MARKER = os.environ.get("V8D_TOKEN_BUDGET_MARKER", "0") == "1"
V8D_ADAPTIVE_RERANK_INSTRUCTION = os.environ.get("V8D_ADAPTIVE_RERANK_INSTRUCTION", "0") == "1"

# v8d tuning knobs
VALID_IMPORTANCE = ("very_high", "high", "fair", "low")
# Lower rank value = higher importance.
_IMPORTANCE_RANK = {"very_high": 0, "high": 1, "fair": 2, "low": 3}
SENTENCE_COMPRESS_K = int(os.environ.get("SENTENCE_COMPRESS_K", "4"))
MINHASH_DEDUP_THRESHOLD = float(os.environ.get("MINHASH_DEDUP_THRESHOLD", "0.85"))
MINHASH_NUM_PERM = int(os.environ.get("MINHASH_NUM_PERM", "64"))
EVIDENCE_GRAPH_MAX_ENTITIES = int(os.environ.get("EVIDENCE_GRAPH_MAX_ENTITIES", "8"))
AUTO_POPULATE_TOP_K = int(os.environ.get("AUTO_POPULATE_TOP_K", "8"))
Do not search again until you've curated." +) +FORMAT_RETRY_PROMPT = ( + "Your previous response could not be parsed as a valid tool call. " + "Please output a valid tool call using the commentary channel. " + "Example format: start with analysis channel for reasoning, then " + "use commentary channel with a function call like functions.fan_out_search({...})." +) + + +# ═══════════════════════════════════════════════════════════════════════════════ +# Tool Schemas (Ultra-specific; base tools imported from tools.py) +# ═══════════════════════════════════════════════════════════════════════════════ + +FAN_OUT_SEARCH_SCHEMA = ToolSchema( + name="fan_out_search", + description=( + f"Run up to {FAN_OUT_MAX_QUERIES} diverse search queries in parallel. " + "Returns combined results from all queries. Best for broad exploration." + ), + parameters={ + "queries": { + "type": "array", + "items": {"type": "string"}, + "description": ( + f"List of search queries (max {FAN_OUT_MAX_QUERIES}). " + "Each should target a different aspect." + ), + } + }, + required=["queries"], +) + +_CURATE_PARAMS_CORE: Dict[str, Any] = { + "add_ids": { + "type": "array", + "items": {"type": "string"}, + "description": "Document IDs to add to your curated set.", + }, + "remove_ids": { + "type": "array", + "items": {"type": "string"}, + "description": "Document IDs to remove from your curated set.", + }, +} + +_CURATE_PARAMS_WITH_IMPORTANCE: Dict[str, Any] = { + **_CURATE_PARAMS_CORE, + "importance": { + "type": "object", + "description": ( + "Optional per-doc importance tag: {doc_id: one of 'very_high'|'high'|'fair'|'low'}. " + "'very_high' = confirmed to directly satisfy all query constraints; 'high' = " + "strongly relevant; 'fair' = default if omitted; 'low' = marginal (eviction-first). " + "When the set is full, lowest-importance docs are evicted first." 
+ ), + "additionalProperties": {"type": "string"}, + }, +} + +_curate_desc_base = ( + f"Update your curated set of relevant documents (max {MAX_CURATED_DOCS}). " + "The curated set is your final output." +) +_curate_desc_v8d = ( + f"Update your curated set of relevant documents (max {MAX_CURATED_DOCS}). The curated " + "set is your final output. Under v8d subtractive curation, you SHOULD tag each added " + "doc with an importance level; when the set is full, the lowest-importance docs are " + "evicted first. Default tag is 'fair'. Use 'very_high' only for docs you have " + "verified directly answer the query." +) + +CURATE_SCHEMA = ToolSchema( + name="curate", + description=_curate_desc_v8d if V8D_IMPORTANCE_TAGGING else _curate_desc_base, + parameters=( + _CURATE_PARAMS_WITH_IMPORTANCE if V8D_IMPORTANCE_TAGGING else _CURATE_PARAMS_CORE + ), + required=["add_ids"], +) + +VERIFY_SCHEMA = ToolSchema( + name="verify", + description=( + "Check whether specific documents support a claim. Returns a yes/no judgment per doc " + "and a short rationale. Use BEFORE tagging docs as 'very_high' importance on " + "multi-constraint queries to confirm they actually satisfy all criteria. " + "Does NOT cost corpus tokens — compute only." + ), + parameters={ + "doc_ids": { + "type": "array", + "items": {"type": "string"}, + "description": "Document IDs to check (max 5 per call).", + }, + "claim": { + "type": "string", + "description": ( + "A concrete, checkable claim derived from the query. " + "E.g. 'This doc was published after 2019 and mentions a GDPR fine above $10M.'" + ), + }, + }, + required=["doc_ids", "claim"], +) + +END_SEARCH_SCHEMA = ToolSchema( + name="end_search", + description=( + "End your search and submit your curated set as your final answer. " + "Call this when you've found enough relevant documents." 
def _v8d_prompt_addendum() -> str:
    """Build the extra system-prompt guidance for the enabled v8d features.

    One paragraph is emitted per enabled feature flag, in a fixed order:
    importance tagging, auto-populate, evidence graph, verify tool, context
    budget. Paragraphs are separated by a blank line and the result ends with
    a single newline.

    The importance-tagging paragraph only refers to the `verify` tool when
    V8D_VERIFY_TOOL is on; otherwise the prompt would steer the model toward a
    tool that is not registered in ALL_TOOL_SCHEMAS.

    Returns:
        The addendum text, or "" when no v8d prompt-affecting flag is set.
    """
    blocks: List[str] = []

    if V8D_IMPORTANCE_TAGGING:
        # Only advertise `verify` if the tool actually exists for this run.
        verify_hint = " (ideally after a `verify` call)" if V8D_VERIFY_TOOL else ""
        blocks.append(
            "**Importance Tagging (v8d):** When you call `curate`, tag each added doc "
            "with an `importance` level in {very_high, high, fair, low}. Rules:\n"
            " - very_high: you have VERIFIED the doc directly answers the query"
            f"{verify_hint}.\n"
            " - high: strongly relevant, hits most query constraints.\n"
            " - fair: plausible but not confirmed (default tag if omitted).\n"
            " - low: marginal; will be evicted first when the set is full.\n"
            "The curated set is capped at "
            f"{MAX_CURATED_DOCS} — when full, the lowest-importance docs are "
            "evicted first to make room for higher-tagged ones."
        )

    if V8D_AUTO_POPULATE_FIRST_SEARCH:
        blocks.append(
            "**Auto-populate (v8d):** After your first successful search, the top-ranked "
            f"{AUTO_POPULATE_TOP_K} docs are AUTOMATICALLY added to your curated set at "
            "`fair` importance. Your job is NOT to re-add them — instead, promote the good "
            "ones to `high`/`very_high` and REMOVE the bad ones. This is subtractive curation."
        )

    if V8D_EVIDENCE_GRAPH:
        blocks.append(
            "**Evidence Graph (v8d):** The Working Memory shows `[Evidence Graph]` — entities "
            "(names, dates, years) and which docs they appear in. Bridge entities (in multiple "
            "docs) are high-value signals. Singleton entities (in only 1 doc) often indicate "
            "gaps — consider follow-up searches for related entities."
        )

    if V8D_VERIFY_TOOL:
        blocks.append(
            "**Verify tool (v8d):** Use `verify(doc_ids, claim)` BEFORE tagging docs as "
            "`very_high` on multi-constraint queries. It returns yes/no for each doc, "
            "letting you confirm a doc actually satisfies ALL criteria rather than just one."
        )

    if V8D_TOKEN_BUDGET_MARKER:
        blocks.append(
            "**Context budget (v8d):** Each observation ends with `[Context: X/Y]`. When X/Y "
            "is above 75%, wrap up your search within 2-3 more turns. When above 90%, call "
            "`end_search` NOW."
        )

    if not blocks:
        return ""
    return "\n\n".join(blocks) + "\n"
+ +**Step 1 — Decompose the Query:** +Before your first search, identify the key constraints in the query (entities, dates, relationships, distinctive facts). Use the most specific/unique constraint for your first search. + +**Step 2 — Core Loop (ALWAYS follow this rhythm):** +1. **Search** — use fan_out_search, search_corpus, or grep_corpus. +2. **Curate immediately** — after EVERY search, call curate to add ALL plausibly relevant docs from the results. Do NOT do two searches in a row without curating in between. +3. **Repeat** — search a different angle, then curate again. +4. **Refine** — use review_docs or read_document to revisit docs, then curate to adjust. +5. **End** — call end_search when you've thoroughly covered the query. + +You have up to **{MAX_TURNS} turns**. Use them — thorough coverage matters more than speed. Don't end early if there are unexplored angles. + +The search → curate rhythm is critical. Results are freshest right after a search. If you delay curation, those results scroll out of your recent context and you lose the detail needed to decide relevance. + +**Search Strategy:** +- Keep queries SHORT (5-12 words). Vary angles — don't repeat similar queries. +- **NEVER repeat queries** from your search history. Use completely different wording. +- **Decompose complex queries** into distinct searchable facets. Search each facet separately. +- **Use grep_corpus** for specific names, dates, numbers, codes, or exact phrases from the query. grep often finds what semantic search misses. +- **Use read_document liberally** — full text reveals connections that snippets hide. If a doc partially matches, read it fully. High-recall agents read more documents. + +**Curation Strategy:** +- **Curate aggressively** — add ALL plausibly relevant docs. Include borderline docs. It's ALWAYS better to over-curate than under-curate. Aim for 3-8 adds per curate call. 
+- **Never remove** docs unless you are certain they are completely irrelevant after reading their full text. +- Keep analysis concise (2-3 paragraphs max). Focus on what to do next. + +**Backtracking — Critical Reasoning Skill:** +When you notice signs of being stuck, you MUST backtrack in your reasoning: +- **Stale pool**: If your last 2-3 searches added few or no new docs, STOP. In your reasoning, explicitly state: "My current search angle is exhausted. Let me rethink." Then try a completely different query decomposition. +- **Re-reading loop**: If you find yourself reading/reviewing the same docs repeatedly, STOP. Reason about what specific information you're missing and search for it directly. +- **Wrong entity**: If results consistently don't match, question your assumptions. The query may require interpreting an entity differently (e.g., a person's maiden name, an alternate spelling, a related entity). +- **Missed facet**: Re-read the query carefully. Identify any constraint you haven't explicitly searched for yet. + +Backtracking is a REASONING step: in your analysis, explain (1) what isn't working, (2) why, and (3) your new strategy. Then act on it. + +{v8d_addendum}""" + + +# ═══════════════════════════════════════════════════════════════════════════════ +# v8d Contribution 1: BM25 Sentence-Level Compression (local, free) +# ═══════════════════════════════════════════════════════════════════════════════ + +_SENTENCE_SPLITTER = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9])") + + +def compress_chunk(query: str, chunk_text: str, k: int = SENTENCE_COMPRESS_K) -> str: + """Return the top-k query-relevant sentences from chunk_text, preserving order. + + Uses BM25 scores of query tokens against sentence tokens. If BM25 is + unavailable or the chunk has <= k sentences, returns the chunk unchanged. + Cost is purely local (microseconds per chunk). 
class ContentDedupTracker:
    """Near-duplicate detection over chunk text.

    Used to keep the pool from filling with near-identical chunks (common in
    SEC filings where the same boilerplate appears across 10-Ks).

    Two detection modes exist:

    * MinHash LSH (fuzzy) — used when the `datasketch` package imported
      successfully AND V8D_CONTENT_DEDUP is set.
    * Exact normalized fingerprint (fallback) — used when LSH is unavailable,
      or when an LSH operation raises for a particular text. This only
      catches texts whose normalized first 4000 characters are identical.

    So the tracker is never a complete no-op once instantiated; without
    `datasketch` it simply degrades to exact matching.
    """

    _TOKEN_RE = re.compile(r"[a-z0-9]+")

    def __init__(
        self,
        threshold: float = MINHASH_DEDUP_THRESHOLD,
        num_perm: int = MINHASH_NUM_PERM,
    ):
        """Create a tracker.

        Args:
            threshold: Jaccard similarity threshold handed to MinHashLSH.
            num_perm: Number of MinHash permutations (must match between the
                LSH index and every MinHash inserted into it).
        """
        self.enabled = _HAS_MINHASH and V8D_CONTENT_DEDUP
        self.num_perm = num_perm
        # Exact fingerprints seen so far (fallback path only).
        self._fingerprints: Set[str] = set()
        # IDs whose text has already been registered (either path).
        self._inserted: Set[str] = set()
        self.lsh = None
        if self.enabled:
            try:
                self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
            except Exception as exc:
                # Best effort: degrade to the exact-fingerprint path, but say so.
                logger.warning(
                    "content_dedup_lsh_init_failed",
                    error=str(exc),
                    threshold=threshold,
                    num_perm=num_perm,
                )
                self.enabled = False
                self.lsh = None

    def _make_minhash(self, text: str):
        """Return a MinHash of `text` built from 5-token shingles.

        Texts with fewer than 5 tokens yield no shingles, so their individual
        tokens are hashed instead.
        """
        mh = MinHash(num_perm=self.num_perm)
        tokens = self._TOKEN_RE.findall(text.lower())
        if len(tokens) < 5:
            # too short for shingles: hash the raw tokens
            units = tokens
        else:
            # 5-grams of tokens for robust fuzzy match
            units = [" ".join(tokens[i:i + 5]) for i in range(len(tokens) - 4)]
        for unit in units:
            mh.update(unit.encode("utf-8"))
        return mh

    def _fallback_fingerprint(self, text: str) -> str:
        """Return an exact-match fingerprint of the normalized text.

        The text is lowercased, reduced to alphanumeric tokens joined by single
        spaces, truncated to 4000 characters, and hashed with SHA-1; the first
        16 hex characters (64 bits) are kept. This detects identical (after
        normalization) prefixes only — it is not a fuzzy matcher.
        """
        norm = " ".join(self._TOKEN_RE.findall(text.lower()))[:4000]
        return hashlib.sha1(norm.encode("utf-8")).hexdigest()[:16]

    def is_duplicate(self, chunk_id: str, text: str) -> bool:
        """Return True if `text` duplicates something already tracked.

        The first time a given text is seen it is registered under `chunk_id`
        and False is returned. Later (near-)duplicates return True and are not
        registered. Texts shorter than 40 stripped characters, and IDs that
        were already registered, are never reported as duplicates.
        """
        if not text or len(text.strip()) < 40:
            return False
        if chunk_id in self._inserted:
            return False

        if self.enabled and self.lsh is not None:
            try:
                mh = self._make_minhash(text)
                if self.lsh.query(mh):
                    return True
                self.lsh.insert(chunk_id, mh)
                self._inserted.add(chunk_id)
                return False
            except Exception as exc:
                # Fall through to the exact-fingerprint path for this text
                # rather than failing the caller, but leave a trace.
                logger.warning(
                    "content_dedup_lsh_failed",
                    chunk_id=chunk_id,
                    error=str(exc),
                )

        # Fallback path: exact normalized fingerprint only
        fp = self._fallback_fingerprint(text)
        if fp in self._fingerprints:
            return True
        self._fingerprints.add(fp)
        self._inserted.add(chunk_id)
        return False
+ """ + + _ENTITY_RE = re.compile( + r"\b(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3}|\d{4}(?:s)?|\d{1,2}/\d{1,2}/\d{2,4})\b" + ) + _STOPWORDS = frozenset({ + "The", "This", "That", "A", "An", "It", "He", "She", "In", "On", "At", + "For", "By", "With", "To", "From", "I", "We", "You", "They", "But", + "However", "Moreover", "Therefore", "Furthermore", "Additionally", + "Page", "Section", "Chapter", "Figure", "Table", "Document", + }) + + def __init__(self): + self.entity_to_docs: Dict[str, Set[str]] = {} + self.doc_to_entities: Dict[str, Set[str]] = {} + + def _extract_entities(self, text: str) -> Set[str]: + ents: Set[str] = set() + for m in self._ENTITY_RE.finditer(text[:8000]): # cap for speed + ent = m.group(0).strip() + if len(ent) < 2: + continue + if ent in self._STOPWORDS: + continue + # drop single-word stopwords and pure 1-char tokens + ents.add(ent) + return ents + + def update_from_doc(self, doc_id: str, text: str) -> None: + if not text or doc_id in self.doc_to_entities: + return + ents = self._extract_entities(text) + if not ents: + return + self.doc_to_entities[doc_id] = ents + for e in ents: + self.entity_to_docs.setdefault(e, set()).add(doc_id) + + def render_summary(self, max_entities: int = EVIDENCE_GRAPH_MAX_ENTITIES) -> str: + """Render a compact human-readable summary for injection into observations.""" + if not self.entity_to_docs: + return "" + # Rank entities by number of docs they appear in (bridging = higher value) + ranked = sorted( + self.entity_to_docs.items(), + key=lambda kv: (-len(kv[1]), kv[0]), + ) + bridge = [(e, docs) for e, docs in ranked if len(docs) >= 2][:max_entities] + singleton_count = sum(1 for _, docs in ranked if len(docs) == 1) + if not bridge: + if singleton_count == 0: + return "" + return f"[Evidence Graph] 0 bridge entities, {singleton_count} singleton entities." 
class WorkingMemory:
    """Two-tier memory: inner (compact WM in context) + outer (doc_store with full text).

    The WM text shows doc IDs + snippets so the model can decide what to
    review/curate without re-searching. Full text is kept in `doc_store` and is
    returned by review_docs() without touching the corpus.
    """

    # How many of the most recent uncurated pool docs are shown with a snippet,
    # and how many older ones are listed by ID only.
    _POOL_DISPLAY_FULL = 50
    _POOL_DISPLAY_COMPACT = 30

    def __init__(self, query: str, normalize_ids: bool = True):
        self.query = query
        self.turn_number = 0
        self.curated_ids: List[str] = []
        self.curated_notes: Dict[str, str] = {}
        self.pool_ids: List[str] = []
        self.pool_id_set: Set[str] = set()
        self.search_history: List[str] = []
        self.doc_store: Dict[str, Dict[str, str]] = {}
        self.normalize_ids = normalize_ids

        # ── v8d additions (all no-op unless respective flags are set) ───────
        self.curated_importance: Dict[str, str] = {}  # doc_id -> importance level
        self.rerank_instruction: Optional[str] = None  # per-episode, set by env
        self.auto_populated: bool = False  # tracks first-search auto-populate
        self.content_dedup = ContentDedupTracker() if V8D_CONTENT_DEDUP else None
        self.evidence_graph = EvidenceGraph() if V8D_EVIDENCE_GRAPH else None
        # Counter for dup hits (diagnostic / metric)
        self.dup_skipped: int = 0

    def _normalize_id(self, chunk_id: str) -> str:
        """Normalize chunk ID to base doc ID (strip trailing _N suffix).

        Handles both single-underscore (web_doc_123_5 -> web_doc_123) and
        double-underscore (12486__0 -> 12486) chunk-id formats.

        If the ID is already in the pool (i.e. already normalized), returns as-is
        to avoid double-stripping during curate calls.
        """
        if not self.normalize_ids or "_" not in chunk_id:
            return chunk_id
        if chunk_id in self.pool_id_set:
            return chunk_id
        from cosmos_retriever.tasks import CHUNK_ID_SUFFIX_PATTERN
        m = CHUNK_ID_SUFFIX_PATTERN.match(chunk_id)
        if m and "/" not in chunk_id:
            return m.group("base")
        return chunk_id.rsplit("_", 1)[0]

    def get_pool_size(self) -> int:
        """Return the number of distinct docs in the pool."""
        return len(self.pool_ids)

    def add_to_pool(self, chunk_ids: List[str],
                    doc_texts: Optional[Dict[str, str]] = None) -> int:
        """Add docs to pool and doc_store. Returns count of *newly added* docs.

        `doc_texts` may be keyed by the raw chunk ID or by the normalized doc
        ID; the chunk ID is tried first.

        v8d: consults ContentDedupTracker before adding (content-level near-dup
        suppression) and updates the EvidenceGraph with entity info. Both are
        no-ops unless the respective v8d feature flags are set.
        """
        added = 0
        for cid in chunk_ids:
            doc_id = self._normalize_id(cid)

            # Resolve text for this doc (if any)
            text = ""
            if doc_texts:
                text = doc_texts.get(cid, doc_texts.get(doc_id, "")) or ""

            # v8d: dedup on content, *before* adding to pool. We check against the
            # normalized doc_id so that multiple chunks of the same SEC filing
            # with slight boilerplate variation don't all make it in.
            if (
                self.content_dedup is not None
                and text
                and doc_id not in self.pool_id_set  # never dedup an already-known doc
                and self.content_dedup.is_duplicate(doc_id, text)
            ):
                self.dup_skipped += 1
                continue

            if doc_id not in self.pool_id_set:
                self.pool_ids.append(doc_id)
                self.pool_id_set.add(doc_id)
                added += 1
            if text and doc_id not in self.doc_store:
                self.doc_store[doc_id] = {
                    "full_text": text,
                    "snippet": text[:DOC_SNIPPET_CHARS].replace("\n", " ").strip(),
                }
                # v8d: update evidence graph from the newly-seen doc text
                if self.evidence_graph is not None:
                    self.evidence_graph.update_from_doc(doc_id, text)
        return added

    def review_docs(self, doc_ids: List[str]) -> str:
        """Retrieve full text from outer memory. Free — no corpus call.

        At most MAX_REVIEW_DOCS IDs are honored. An ID that is not a key of
        `doc_store` is retried after normalization, so chunk-style IDs resolve
        to their parent doc the same way they do in curate().
        """
        parts = []
        for did in (doc_ids or [])[:MAX_REVIEW_DOCS]:
            key = did if did in self.doc_store else self._normalize_id(str(did))
            entry = self.doc_store.get(key)
            if entry is not None:
                parts.append(f"# DOCUMENT ID: {key}\n{entry.get('full_text', '')}")
            else:
                parts.append(f"# DOCUMENT ID: {did}\n(not found in memory)")
        return "\n\n".join(parts) if parts else "No matching docs in memory."

    def _admit_curated(self, doc_id: str, tag: str,
                       notes: Optional[Dict[str, str]], store_tag: bool) -> None:
        """Append `doc_id` to the curated set with its optional tag and note."""
        self.curated_ids.append(doc_id)
        if store_tag:
            self.curated_importance[doc_id] = tag
        if notes and doc_id in notes:
            self.curated_notes[doc_id] = notes[doc_id]

    def _lowest_importance_doc(self) -> Tuple[Optional[str], int]:
        """Return (doc_id, rank) of the earliest-added lowest-importance curated doc."""
        worst_id: Optional[str] = None
        worst_rank = -1
        for cid in self.curated_ids:
            rank = _IMPORTANCE_RANK.get(self.curated_importance.get(cid, "fair"), 2)
            if rank > worst_rank:
                worst_id, worst_rank = cid, rank
        return worst_id, worst_rank

    def curate(
        self,
        add_ids: List[str],
        remove_ids: List[str],
        notes: Optional[Dict[str, str]] = None,
        importance: Optional[Dict[str, str]] = None,
    ) -> str:
        """Update the curated set. Returns a status string with capacity feedback.

        Removals are applied before additions. `add_ids` / `remove_ids` may be
        None (treated as empty); None entries inside `add_ids` are skipped.
        IDs are normalized, so either chunk IDs or doc IDs may be passed.

        v8d subtractive behavior (enabled via V8D_SUBTRACTIVE_CURATION):
        - Each added doc gets an importance tag ('very_high'|'high'|'fair'|'low');
          missing or invalid tags default to 'fair'.
        - When the set is full and the incoming doc outranks the
          lowest-importance curated doc, that doc is evicted to make room.
        - When removing a doc, its importance entry is also cleared.

        A note supplied for an added doc is stored regardless of whether the
        doc was admitted directly or by evicting another doc.
        """
        add_ids = add_ids or []
        remove_ids = remove_ids or []

        # ── Remove phase ───────────────────────────────────────────────────
        remove_set = {str(x) for x in remove_ids if x}
        # Normalize remove_ids too so the model can pass either chunk or doc ids
        remove_set_all = remove_set | {self._normalize_id(x) for x in remove_set}
        self.curated_ids = [x for x in self.curated_ids if x not in remove_set_all]
        for rid in remove_set_all:
            self.curated_notes.pop(rid, None)
            self.curated_importance.pop(rid, None)

        # Normalize importance dict keys so model can pass chunk_ids too
        imp_norm: Dict[str, str] = {}
        if importance and V8D_IMPORTANCE_TAGGING:
            for k, v in importance.items():
                if not isinstance(k, str) or not isinstance(v, str):
                    continue
                v = v.strip().lower()
                if v not in VALID_IMPORTANCE:
                    v = "fair"
                imp_norm[self._normalize_id(k.strip())] = v

        # ── Add phase ──────────────────────────────────────────────────────
        existing = set(self.curated_ids)
        dropped: List[str] = []
        evicted: List[str] = []

        for raw_id in add_ids:
            if raw_id is None:
                continue
            doc_id = self._normalize_id(str(raw_id).strip())
            if not doc_id:
                continue
            if doc_id in existing:
                # Allow importance re-tagging of an already-curated doc
                if doc_id in imp_norm:
                    self.curated_importance[doc_id] = imp_norm[doc_id]
                continue

            incoming_tag = imp_norm.get(doc_id, "fair")

            if len(self.curated_ids) < MAX_CURATED_DOCS:
                self._admit_curated(
                    doc_id, incoming_tag, notes, store_tag=V8D_IMPORTANCE_TAGGING
                )
                existing.add(doc_id)
                continue

            # At capacity: try to evict a lower-importance doc if enabled
            if V8D_SUBTRACTIVE_CURATION:
                incoming_rank = _IMPORTANCE_RANK.get(incoming_tag, 2)
                worst_id, worst_rank = self._lowest_importance_doc()
                if worst_id is not None and worst_rank > incoming_rank:
                    self.curated_ids = [c for c in self.curated_ids if c != worst_id]
                    self.curated_importance.pop(worst_id, None)
                    self.curated_notes.pop(worst_id, None)
                    existing.discard(worst_id)
                    evicted.append(worst_id)
                    # The eviction path always records the tag: it is what
                    # later eviction decisions are ranked on.
                    self._admit_curated(doc_id, incoming_tag, notes, store_tag=True)
                    existing.add(doc_id)
                    continue

            # Report each rejected doc once even if the caller repeated it.
            if doc_id not in dropped:
                dropped.append(doc_id)

        n = len(self.curated_ids)
        if V8D_IMPORTANCE_TAGGING and self.curated_importance:
            # Render curated list sorted by importance for visibility
            def _srt(i):
                return (_IMPORTANCE_RANK.get(self.curated_importance.get(i, "fair"), 2), i)
            rendered = [
                f"{i}[{self.curated_importance.get(i, 'fair')}]"
                for i in sorted(self.curated_ids, key=_srt)
            ]
        else:
            rendered = self.curated_ids
        ids_str = ", ".join(rendered) if rendered else "(empty)"
        result = f"Curated set updated ({n}/{MAX_CURATED_DOCS}): {ids_str}"
        if evicted:
            result += (
                f"\n[EVICTED low-importance] {len(evicted)} doc(s): "
                f"{', '.join(evicted[:5])}"
            )
        if dropped:
            result += (
                f"\n[CAPACITY] Set is FULL and no evictable lower-importance docs — "
                f"{len(dropped)} doc(s) NOT added: {', '.join(dropped[:5])}"
            )
        return result

    def add_search_record(self, tool_name: str, params_summary: str,
                          num_results: int, num_new: int = -1,
                          num_new_curated: int = 0) -> None:
        """Record a search action with yield info.

        num_new: number of *novel* docs added to pool (-1 = unknown/not tracked).
        num_new_curated: shown only when positive.
        """
        entry = f"T{self.turn_number}: {tool_name}({params_summary}) → {num_results} docs"
        if num_new >= 0:
            entry += f", {num_new} new"
        if num_new_curated > 0:
            entry += f", +{num_new_curated} curated"
        self.search_history.append(entry)

    def advance_turn(self) -> None:
        """Increment the turn counter used in the WM header and search records."""
        self.turn_number += 1

    def snapshot(self) -> "WorkingMemorySnapshot":
        """Return a WorkingMemorySnapshot holding copies of the current state."""
        return WorkingMemorySnapshot(
            turn_number=self.turn_number,
            curated_ids=list(self.curated_ids),
            curated_notes=dict(self.curated_notes),
            pool_ids=list(self.pool_ids),
            search_history=list(self.search_history),
            text=self.to_text(),
        )

    def _curated_entry_lines(self, doc_id: str) -> List[str]:
        """Render one curated doc: full content if CURATED_DOC_CHARS > 0, else snippet."""
        store = self.doc_store.get(doc_id, {})
        note = self.curated_notes.get(doc_id, "")
        note_str = f" -- {note}" if note else ""
        if CURATED_DOC_CHARS > 0:
            full = store.get("full_text", store.get("snippet", ""))
            content = full[:CURATED_DOC_CHARS].strip()
            return [f" [*] {doc_id}{note_str}:", f" {content}"]
        snippet = store.get("snippet", "")
        return [f" [*] {doc_id}: {snippet}{note_str}"]

    def to_text(self) -> str:
        """Render compact WM text for inclusion in model context.

        Sections, in order: header + query, curated set, document pool
        (uncurated docs, newest first), search history (last 12 entries),
        the review_docs hint, then the optional v8d evidence-graph summary and
        dedup counter.
        """
        lines = [
            f"== Working Memory (summarizing turns 0-{self.turn_number}) ==",
            f'Query: "{self.query}"',
            "",
        ]

        # Curated set — show full content when CURATED_DOC_CHARS > 0
        n_curated = len(self.curated_ids)
        lines.append(f"Curated Set ({n_curated}/{MAX_CURATED_DOCS}):")
        if not self.curated_ids:
            lines.append(" (empty -- use curate tool to add relevant docs)")
        elif V8D_IMPORTANCE_TAGGING and self.curated_importance:
            # v8d: render grouped by importance (very_high → high → fair → low).
            # sorted() is stable, so docs keep their curated order within a group.
            def _rank(i: str) -> int:
                return _IMPORTANCE_RANK.get(self.curated_importance.get(i, "fair"), 2)

            last_tag: Optional[str] = None
            for doc_id in sorted(self.curated_ids, key=_rank):
                tag = self.curated_importance.get(doc_id, "fair")
                if tag != last_tag:
                    lines.append(f" -- {tag} --")
                    last_tag = tag
                lines.extend(self._curated_entry_lines(doc_id))
        else:
            for doc_id in self.curated_ids:
                lines.extend(self._curated_entry_lines(doc_id))
        lines.append("")

        # Pool: most-recent uncurated docs first (recent finds are most actionable)
        curated_set = set(self.curated_ids)
        uncurated = [pid for pid in self.pool_ids if pid not in curated_set]
        lines.append(
            f"Document Pool: {len(self.pool_ids)} docs total, {len(uncurated)} uncurated"
        )
        if uncurated:
            recent = list(reversed(uncurated[-self._POOL_DISPLAY_FULL:]))
            for did in recent:
                snippet = self.doc_store.get(did, {}).get("snippet", "")
                lines.append(f" [ ] {did}: {snippet}")
            hidden = len(uncurated) - len(recent)
            if hidden > 0:
                older = uncurated[:hidden]
                id_str = ", ".join(older[:self._POOL_DISPLAY_COMPACT])
                if hidden > self._POOL_DISPLAY_COMPACT:
                    id_str += f" (+{hidden - self._POOL_DISPLAY_COMPACT} more)"
                lines.append(f" Earlier uncurated ({hidden}): {id_str}")
        lines.append("")

        # Search history (last 12 entries)
        if self.search_history:
            lines.append("Search History:")
            history = self.search_history[-12:]
            if len(self.search_history) > 12:
                lines.append(
                    f" ... ({len(self.search_history) - 12} earlier searches)"
                )
            for entry in history:
                lines.append(f" {entry}")
        else:
            lines.append("Search History: (no searches yet)")

        lines.append("")
        lines.append("Use review_docs(doc_ids) to re-read any document from your pool.")

        # v8d: evidence graph summary
        if self.evidence_graph is not None:
            eg_text = self.evidence_graph.render_summary()
            if eg_text:
                lines.append("")
                lines.append(eg_text)

        # v8d: dedup signal (helps the model realize SEC corpora have dups)
        if self.content_dedup is not None and self.dup_skipped > 0:
            lines.append(
                f"[Dedup] {self.dup_skipped} near-duplicate chunk(s) auto-suppressed."
            )

        return "\n".join(lines)
[assistant tool_call] [tool result] + """ + if compress and action.reasoning and len(action.reasoning) > max_analysis_chars: + action = copy.copy(action) + action.reasoning = ( + action.reasoning[:max_analysis_chars] + "...(truncated)" + ) + + messages: List[Message] = [] + tool_use_source_to_name: Dict[str, str] = {} + + # --- Action: reasoning (analysis channel) --- + if action.reasoning: + messages.append( + Message.from_role_and_content(Role.ASSISTANT, action.reasoning) + .with_channel("analysis") + ) + + # --- Action: tool call(s) (commentary channel) --- + if len(action.tools) > 1: + tool_calls = [] + for tool, params, source in action.as_iter(): + if isinstance(tool, UserTextTool): + messages.append( + Message.from_role_and_content(Role.ASSISTANT, params["text"]) + .with_channel("final") + ) + else: + tool_calls.append({ + "tool_name": tool.tool_schema.name, + "parameters": params, + }) + tool_use_source_to_name[source] = tool.tool_schema.name + if tool_calls: + messages.append( + Message.from_role_and_content( + Role.ASSISTANT, json.dumps(tool_calls) + ) + .with_channel("commentary") + .with_recipient("functions.multi_tool_use") + .with_content_type("<|constrain|>json") + ) + elif len(action.tools) == 1: + tool = action.tools[0] + params = action.params[0] + source = action.sources[0] + if isinstance(tool, UserTextTool): + messages.append( + Message.from_role_and_content(Role.ASSISTANT, params["text"]) + .with_channel("final") + ) + else: + messages.append( + Message.from_role_and_content(Role.ASSISTANT, json.dumps(params)) + .with_channel("commentary") + .with_recipient("functions." + tool.tool_schema.name) + .with_content_type("<|constrain|>json") + ) + tool_use_source_to_name[source] = "functions." 
def build_context(
    system_prompt: str,
    wm_text: Optional[str],
    recent_actions: List[Action],
    recent_observations: List[Observation],
    result_summaries: Optional[List[str]] = None,
) -> Conversation:
    """Assemble the hybrid model context as a Harmony ``Conversation``.

    Message order: system content, developer content (tool descriptions),
    the system prompt as a user message, the optional working-memory text
    as a user message, then one group of messages per recent turn. Every
    turn except the last is rendered with ``compress=True`` and may be
    followed by its result summary as a user message.

    Args:
        system_prompt: Prompt text including the query; sent with the user role.
        wm_text: Working-memory text; skipped when falsy.
        recent_actions: The most recent ``Action`` objects, oldest first.
        recent_observations: ``Observation`` objects paired with ``recent_actions``.
        result_summaries: Optional per-turn summaries, indexed like the turns.
            Empty entries are skipped and the last turn never gets one.

    Returns:
        The assembled ``Conversation``.

    Raises:
        AssertionError: If the action and observation lists differ in length.
    """
    def _user(text: str) -> Message:
        return Message.from_role_and_content(Role.USER, text)

    system_content = (
        SystemContent.new()
        .with_reasoning_effort(ReasoningEffort.HIGH)
        .with_conversation_start_date("2026-04-01")
    )
    conversation = [Message.from_role_and_content(Role.SYSTEM, system_content)]

    developer_content = DeveloperContent.new().with_function_tools(
        get_tool_descriptions()
    )
    conversation.append(
        Message.from_role_and_content(Role.DEVELOPER, developer_content)
    )
    conversation.append(_user(system_prompt))

    if wm_text:
        conversation.append(_user(wm_text))

    assert len(recent_actions) == len(recent_observations), (
        f"Mismatch: {len(recent_actions)} actions vs {len(recent_observations)} obs"
    )

    summaries = result_summaries or []
    final_turn = len(recent_actions) - 1
    for turn_idx, (action, observation) in enumerate(
        zip(recent_actions, recent_observations)
    ):
        older_turn = turn_idx != final_turn
        conversation.extend(
            action_observation_to_messages(
                action, observation, compress=older_turn,
            )
        )
        # The latest turn gets no summary: the model has not acted on its
        # results yet.
        if older_turn and turn_idx < len(summaries) and summaries[turn_idx]:
            conversation.append(_user(summaries[turn_idx]))

    return Conversation(messages=conversation)
Minimal context (system + query only) + """ + def _append_tail(conv: Conversation) -> Conversation: + msgs = list(conv.messages) + if nudge_prompt: + msgs.append(Message.from_role_and_content(Role.USER, nudge_prompt)) + if retry_prompt: + msgs.append(Message.from_role_and_content(Role.USER, retry_prompt)) + return Conversation(messages=msgs) + + # --- Pass 1: normal render --- + conv = build_context( + system_prompt, wm_text, recent_actions, recent_observations, + result_summaries, + ) + conv = _append_tail(conv) + tokens = enc.render_conversation(conv) + if len(tokens) <= budget: + return tokens + + # --- Pass 2: truncate WM pool section --- + truncated_wm = wm_text + if truncated_wm and len(truncated_wm) > 1500: + pool_start = truncated_wm.find("Document Pool:") + hist_start = truncated_wm.find("Search History:") + if pool_start > 0 and hist_start > pool_start: + pool_section = truncated_wm[pool_start:hist_start] + overshoot = len(tokens) - budget + chars_to_cut = min(len(pool_section) - 100, overshoot * 3) + if chars_to_cut > 0: + new_pool = ( + pool_section[:len(pool_section) - chars_to_cut] + + "\n ... 
(truncated for context)\n\n" + ) + truncated_wm = ( + truncated_wm[:pool_start] + new_pool + + truncated_wm[hist_start:] + ) + + conv = build_context( + system_prompt, truncated_wm, recent_actions, recent_observations, + result_summaries, + ) + conv = _append_tail(conv) + tokens = enc.render_conversation(conv) + if len(tokens) <= budget: + return tokens + + # --- Pass 3: aggressive — 100-char analysis, 2000-char WM --- + aggressive_wm = truncated_wm + if aggressive_wm and len(aggressive_wm) > 2000: + aggressive_wm = aggressive_wm[:2000] + "\n...(WM truncated)" + + compressed_actions = [] + for action in recent_actions: + a = copy.copy(action) + if a.reasoning and len(a.reasoning) > 100: + a.reasoning = a.reasoning[:100] + "...(truncated)" + compressed_actions.append(a) + + conv = build_context( + system_prompt, aggressive_wm, compressed_actions, recent_observations, + result_summaries, + ) + conv = _append_tail(conv) + tokens = enc.render_conversation(conv) + if len(tokens) <= budget: + return tokens + + # --- Pass 4: drop oldest recent turns one at a time --- + drop_actions = list(compressed_actions) + drop_obs = list(recent_observations) + drop_summaries = list(result_summaries) if result_summaries else [] + + while len(drop_actions) > 1: + drop_actions = drop_actions[1:] + drop_obs = drop_obs[1:] + if drop_summaries: + drop_summaries = drop_summaries[1:] + + conv = build_context( + system_prompt, aggressive_wm, drop_actions, drop_obs, + drop_summaries or None, + ) + conv = _append_tail(conv) + tokens = enc.render_conversation(conv) + if len(tokens) <= budget: + return tokens + + # --- Pass 5: minimal context (system + query only) --- + conv = build_context(system_prompt, None, [], [], None) + if retry_prompt: + msgs = list(conv.messages) + msgs.append(Message.from_role_and_content(Role.USER, retry_prompt)) + conv = Conversation(messages=msgs) + tokens = enc.render_conversation(conv) + assert len(tokens) <= budget, ( + f"Even minimal context exceeds budget: 
def build_result_summary(
    obs_text: str,
    tool_names: List[str],
    wm: WorkingMemory,
    turns_since_curate: int,
    tool_types_used: Set[str],
    current_turn: int,
    pool_size_before: int,
) -> str:
    """Build a concise, factual result summary for the turn just executed.

    The summary is assembled programmatically, with no LLM call. It is
    injected as a user message between turns.

    ``HARNESS_PRESCRIPTIVE`` gates the ``[ACTION REQUIRED]`` line, the
    consecutive-search ``[WARN]`` lines and the ``[NEXT]`` line. The
    ``[STATUS]`` line, the empty-curated-set ``[WARN]`` and all ``[TIP]``
    lines are emitted regardless of it.

    Args:
        obs_text: Raw observation text returned by the tool(s).
        tool_names: Names of the tools called this turn.
        wm: Working memory; ``pool_ids`` and ``curated_ids`` are read.
        turns_since_curate: Number of turns since the last curate call.
        tool_types_used: Distinct tool names used so far in the episode.
        current_turn: Current turn number.
        pool_size_before: Pool size BEFORE this turn's documents were added,
            so novel docs can be counted as ``len(wm.pool_ids) - pool_size_before``.

    Returns:
        The summary lines joined with newlines.
    """
    lines: List[str] = []
    tool_str = ", ".join(tool_names) if tool_names else "unknown"
    is_search_turn = any(t in _SEARCH_TOOLS for t in tool_names)

    # ── 1. Tool result + doc count ──────────────────────────────────────────
    returned_ids = re.findall(r'# DOCUMENT ID:\s*(\S+)', obs_text)
    pool_size_after = len(wm.pool_ids)
    novel_count = pool_size_after - pool_size_before

    if returned_ids:
        lines.append(
            f"[STATUS] {tool_str}: {len(returned_ids)} docs returned, "
            f"{novel_count} new. Pool: {pool_size_after} total."
        )
    elif is_search_turn:
        lines.append(f"[STATUS] {tool_str}: no new documents found.")
    elif "curate" in tool_names:
        lines.append("[STATUS] curate: curated set updated.")
    elif "review_docs" in tool_names:
        lines.append("[STATUS] review_docs: documents re-read from memory.")
    else:
        lines.append(f"[STATUS] {tool_str} completed.")

    # ── 2. Curated set status ───────────────────────────────────────────────
    n_curated = len(wm.curated_ids)
    n_pool = len(wm.pool_ids)
    uncurated = n_pool - n_curated
    if n_curated == 0:
        lines.append(
            f"[WARN] Curated set is EMPTY (0/{MAX_CURATED_DOCS}). "
            f"You have {n_pool} docs in your pool — curate ALL promising ones now."
        )
    else:
        curated_preview = ", ".join(wm.curated_ids[:5])
        if n_curated > 5:
            curated_preview += f" (+{n_curated - 5} more)"
        lines.append(f"Curated: {n_curated}/{MAX_CURATED_DOCS} [{curated_preview}].")
        if uncurated > 10 and n_curated < 8:
            lines.append(
                f"[TIP] {uncurated} uncurated docs in pool. "
                "Add ALL relevant ones — don't under-curate."
            )

    # ── 3. Curate-after-search reminder ─────────────────────────────────────
    if is_search_turn and HARNESS_PRESCRIPTIVE:
        if turns_since_curate >= 1:
            lines.append(
                "[ACTION REQUIRED] You just searched — now curate. Review these results "
                "and call curate to add ALL plausibly relevant docs before your next search."
            )
        if turns_since_curate >= 2:
            lines.append(
                "[WARN] Multiple consecutive searches without curating. "
                "Curate NOW before searching again."
            )

    # ── 4. Truncation detection → suggest read_document ─────────────────────
    if "[... truncated]" in obs_text or "truncated," in obs_text:
        # Check each document block separately. A single lazy regex over the
        # whole text starts at the FIRST header and so credits the marker to
        # an earlier, untruncated document.
        truncated_docs: List[str] = []
        for block in re.split(r'# DOCUMENT ID:\s*', obs_text)[1:]:
            if "[... truncated]" in block or "truncated," in block:
                id_match = re.match(r'\S+', block)
                if id_match:
                    truncated_docs.append(id_match.group(0))
        if truncated_docs:
            lines.append(
                f"[TIP] Docs [{', '.join(truncated_docs[:3])}] were truncated. "
                "Use read_document(doc_id) to see full content."
            )
        else:
            lines.append(
                "[TIP] Some results were truncated. "
                "Use read_document(doc_id) to see full content."
            )

    # ── 5. Tool diversity / strategy suggestions ────────────────────────────
    if len(tool_types_used) == 1 and current_turn >= 3:
        only_tool = next(iter(tool_types_used))
        alternatives = [
            t for t in ["grep_corpus", "read_document", "review_docs", "curate"]
            if t != only_tool
        ]
        lines.append(
            f"[TIP] Only used {only_tool} so far. "
            f"Consider: {', '.join(alternatives[:2])}."
        )

    if ("grep_corpus" not in tool_types_used
            and current_turn >= 4
            and not is_search_turn):
        lines.append(
            "[TIP] You haven't used grep_corpus yet. "
            "Use it for specific names, dates, numbers, or exact phrases from the query."
        )

    if ("read_document" not in tool_types_used
            and current_turn >= 6
            and n_pool >= 5
            and not is_search_turn):
        lines.append(
            "[TIP] You haven't used read_document yet. "
            "Reading full text of partially-matching docs often reveals connections that snippets miss."
        )

    # ── 6. Consecutive search penalty warning ───────────────────────────────
    if turns_since_curate >= 2 and n_pool > 0 and HARNESS_PRESCRIPTIVE:
        lines.append(
            f"[WARN] {turns_since_curate} consecutive non-curate turns. "
            "You MUST curate before your next search."
        )

    # ── 7. Next action suggestions ──────────────────────────────────────────
    if HARNESS_PRESCRIPTIVE:
        suggestions = []
        if is_search_turn and turns_since_curate >= 1:
            suggestions.append("curate ALL relevant docs from these results NOW")
        elif n_curated == 0 and n_pool >= 3:
            suggestions.append("curate promising docs from your pool")

        if suggestions:
            lines.append("[NEXT] " + "; ".join(suggestions) + ".")

    return "\n".join(lines)
+ + Reward hierarchy: + -0.5 = format error (caller handles this case separately) + -0.2 = never curated (terminal only) + 0.001 = curated but found nothing + 0.1+ = productive episode + ~2.0 = theoretical max (recall=1, precision=1, fa_found=1) + + Key components (v3 defaults): + 0.7 × F_beta(recall, precision, β=2) outcome quality + 0.3 × trajectory_recall pool discovery signal + 1.0 × binary final-answer bonus sparse success signal + 0.8 × final_answer_recall dense answer-curation signal + 0.4 × trajectory_fa_recall dense answer-discovery signal + -0.35 × max(trajectory_fa_recall - final_answer_recall, 0) + penalize "answer in pool but not curated" + -turn_penalty efficiency pressure + """ + # ── v4: pure recall reward (4-term) ──────────────────────────────────── + # trajectory_recall : relevant docs found in pool (broad discovery) + # recall : relevant docs in curated list (selection skill) + # trajectory_fa_recall : answer docs found in pool (targeted discovery) + # final_answer_recall : answer docs in curated list (highest value) + if REWARD_VERSION == "v4": + reward = 0.5 * ( + trajectory_recall + + recall + + trajectory_fa_recall + + final_answer_recall + ) + final_answer_found = final_answer_recall > 0 + pool_curated_gap = max(0.0, trajectory_recall - recall) + curate_rate = total_curate_calls / max(turn, 1) + metrics = { + "recall": recall, "precision": precision, + "f_beta": 0.0, "trajectory_recall": trajectory_recall, + "trajectory_fa_recall": trajectory_fa_recall, + "final_answer_recall": final_answer_recall, + "final_answer_found": 1.0 if final_answer_found else 0.0, + "pool_curated_gap": pool_curated_gap, + "num_curated_docs": float(n_curated), + "used_curate": 1.0 if n_curated > 0 else 0.0, + "total_curate_calls": float(total_curate_calls), + "curate_rate": curate_rate, + "tool_diversity": float(n_unique_tools), + "num_turns": turn, + "final_reward": reward, + "no_error": 1.0, + "max_turns_reached": 0.0, + "no_curate_penalty": 1.0 if (n_curated == 
0 and is_terminal) else 0.0, + } + return reward, metrics + + # ── v3: original multi-component reward ────────────────────────────── + if n_curated == 0: + if is_terminal: + return NO_CURATE_PENALTY, { + "no_error": 1.0, "recall": 0.0, "precision": 0.0, + "f_beta": 0.0, "trajectory_recall": trajectory_recall, + "final_answer_found": 0.0, "num_curated_docs": 0, + "used_curate": 0.0, "no_curate_penalty": 1.0, + "final_reward": NO_CURATE_PENALTY, + "max_turns_reached": 0.0, + } + fallback = max(TRAJECTORY_RECALL_WEIGHT * trajectory_recall, MIN_FORMAT_REWARD) + return fallback, { + "no_error": 1.0, "recall": 0.0, "precision": 0.0, + "f_beta": 0.0, "trajectory_recall": trajectory_recall, + "final_answer_found": 0.0, "num_curated_docs": 0, + "used_curate": 0.0, "final_reward": fallback, + "max_turns_reached": 0.0, + } + + # F-beta (beta=2: recall weighted 4x over precision) + beta_sq = RECALL_BETA * RECALL_BETA + if precision + recall > 0: + f_beta = ( + (1 + beta_sq) * precision * recall + ) / (beta_sq * precision + recall) + else: + f_beta = 0.0 + + final_answer_found = final_answer_recall > 0 + if FINAL_ANSWER_BINARY: + final_answer_bonus = FINAL_ANSWER_BONUS if final_answer_found else 0.0 + else: + final_answer_bonus = FINAL_ANSWER_BONUS * final_answer_recall + + fa_dense_reward = ( + FINAL_ANSWER_RECALL_WEIGHT * final_answer_recall + + TRAJECTORY_FA_RECALL_WEIGHT * trajectory_fa_recall + ) + fa_miss_gap = max(0.0, trajectory_fa_recall - final_answer_recall) + fa_miss_penalty = FA_MISS_PENALTY_WEIGHT * fa_miss_gap + + combined = ( + OUTCOME_WEIGHT * f_beta + + TRAJECTORY_RECALL_WEIGHT * trajectory_recall + + final_answer_bonus + + fa_dense_reward + ) + combined -= fa_miss_penalty + + # Gap penalty (default 0 — kept for backward compat) + pool_curated_gap = max(0.0, trajectory_recall - recall) + gap_penalty = GAP_PENALTY_WEIGHT * pool_curated_gap + combined -= gap_penalty + + # Turn penalty: linear ramp from 0 at TURN_PENALTY_MIN_TURNS to TURN_PENALTY_MAX at 
MAX_TURNS + if turn > TURN_PENALTY_MIN_TURNS and TURN_PENALTY_MAX > 0: + turn_range = max(MAX_TURNS - TURN_PENALTY_MIN_TURNS, 1) + turn_frac = min((turn - TURN_PENALTY_MIN_TURNS) / turn_range, 1.0) + turn_penalty = TURN_PENALTY_MAX * turn_frac + else: + turn_penalty = 0.0 + combined -= turn_penalty + + # Legacy shaping bonuses (defaults zeroed in v2) + curate_rate = total_curate_calls / max(turn, 1) + curate_rate_bonus = CURATE_RATE_BONUS_WEIGHT * min( + curate_rate / TARGET_CURATE_RATE, 1.0 + ) + combined += curate_rate_bonus + + tool_diversity_bonus = TOOL_DIVERSITY_BONUS_WEIGHT * min( + n_unique_tools / TOOL_DIVERSITY_TARGET, 1.0 + ) + combined += tool_diversity_bonus + tool_diversity_shortfall = max(0, TOOL_DIVERSITY_TARGET - n_unique_tools) + tool_diversity_penalty = ( + TOOL_DIVERSITY_SHORTFALL_PENALTY * tool_diversity_shortfall + ) + combined -= tool_diversity_penalty + + final_reward = max(MIN_FORMAT_REWARD, combined) + + metrics = { + "recall": recall, + "precision": precision, + "f_beta": f_beta, + "final_answer_recall": final_answer_recall, + "final_answer_found": 1.0 if final_answer_found else 0.0, + "trajectory_recall": trajectory_recall, + "trajectory_fa_recall": trajectory_fa_recall, + "final_answer_bonus": final_answer_bonus, + "fa_dense_reward": fa_dense_reward, + "fa_miss_gap": fa_miss_gap, + "fa_miss_penalty": fa_miss_penalty, + "pool_curated_gap": pool_curated_gap, + "gap_penalty": gap_penalty, + "turn_penalty": turn_penalty, + "pre_penalty_reward": combined + gap_penalty + turn_penalty, + "num_curated_docs": float(n_curated), + "used_curate": 1.0, + "total_curate_calls": float(total_curate_calls), + "curate_rate": curate_rate, + "curate_rate_bonus": curate_rate_bonus, + "tool_diversity": float(n_unique_tools), + "tool_diversity_bonus": tool_diversity_bonus, + "tool_diversity_shortfall": float(tool_diversity_shortfall), + "tool_diversity_penalty": tool_diversity_penalty, + "num_turns": turn, + "final_reward": final_reward, + "no_error": 1.0, + 
"max_turns_reached": 0.0, + } + return final_reward, metrics + + +# ═══════════════════════════════════════════════════════════════════════════════ +# Utility: parse doc IDs/texts from search observation +# ═══════════════════════════════════════════════════════════════════════════════ + +def parse_doc_ids_from_observation(obs_text: str) -> List[str]: + """Extract document IDs from a search/grep observation string.""" + return re.findall(r'# DOCUMENT ID:\s*(\S+)', obs_text) + + +def parse_doc_texts_from_observation(obs_text: str) -> Dict[str, str]: + """Extract ``{doc_id: full_text}`` from observation text. + + The search/grep tool emits headers like ``# DOCUMENT ID: 16871__0 (502 tokens)`` + where ``16871__0`` is the chunk id, ``16871`` the document id, and the + ``(N tokens)`` suffix is informational. Callers (``WorkingMemory.add_to_pool``) + look up by the *normalized* doc id (``16871``), so we key the result both + ways: by the raw header token AND by the normalized doc id. + """ + docs: Dict[str, str] = {} + parts = re.split(r'# DOCUMENT ID:\s*', obs_text) + for part in parts[1:]: + lines = part.split("\n", 1) + if not lines: + continue + header = lines[0].strip() + text = (lines[1] if len(lines) > 1 else "").strip() + # Drop the trailing "(N tokens)" annotation if present. + chunk_id = header.split()[0] if header else "" + if not chunk_id: + continue + docs[chunk_id] = text + # Also key by the document id (chunk_id without the trailing __N suffix). 
def compress_search_observation(query: str, obs_text: str) -> str:
    """Compress each document body in a search observation.

    The text is split on ``# DOCUMENT ID: ...`` header lines. Headers and any
    text before the first header are kept verbatim. Each body is replaced by
    ``compress_chunk(query, body, k=SENTENCE_COMPRESS_K)``.

    The header pattern accepts trailing annotations on the header line, such
    as ``# DOCUMENT ID: 16871__0 (502 tokens)``. A pattern that requires the
    newline directly after the id never matches that form, so the text would
    pass through uncompressed.

    Args:
        query: The search query passed to ``compress_chunk``.
        obs_text: Raw observation text.

    Returns:
        The observation with compressed bodies. ``obs_text`` is returned
        unchanged when ``V8D_SENTENCE_COMPRESS`` is off, BM25 is unavailable,
        or the text contains no document header.
    """
    if not V8D_SENTENCE_COMPRESS or not _HAS_BM25:
        return obs_text
    if "# DOCUMENT ID:" not in obs_text:
        return obs_text

    # The capturing group keeps the header lines in the split output, giving
    # [prefix, header1, body1, header2, body2, ...]: headers sit at odd
    # indices and each body directly follows its header.
    parts = re.split(r"(# DOCUMENT ID:\s*\S+[^\n]*\n)", obs_text)
    for header_idx in range(1, len(parts) - 1, 2):
        body_idx = header_idx + 1
        parts[body_idx] = compress_chunk(
            query, parts[body_idx], k=SENTENCE_COMPRESS_K
        )
    return "".join(parts)
_VERIFY_SYSTEM = (
    "You are a strict document verifier. Given a CLAIM and one DOCUMENT's full text, "
    "answer only 'yes' or 'no' followed by a very short (≤20 word) rationale. "
    "Answer 'yes' ONLY if the document directly supports ALL parts of the claim. "
    "Answer 'no' if any constraint is missing or contradicted. Be conservative."
)


def exec_verify_claim(
    openai_client: Any,
    doc_texts: Dict[str, str],
    claim: str,
    model: Optional[str] = None,
) -> str:
    """Run the verify tool: ask, per document, whether it supports the claim.

    At most the first five documents are checked, and only the first 6000
    characters of each are sent. Checking is best-effort: any exception
    raised while querying or parsing one document is reported as
    ``verdict: unknown`` for that document and the loop continues.

    Args:
        openai_client: Client exposing ``chat.completions.create(...)``.
        doc_texts: Mapping of document id to document text.
        claim: The claim to check; must be at least 6 non-blank characters.
        model: Model name. Defaults to the ``VERIFY_MODEL`` environment
            variable, then ``"gpt-5.4-mini"``.

    Returns:
        One block per checked document, separated by blank lines, of the
        form ``# DOCUMENT ID: {doc_id}`` / ``verdict: yes|no|unknown`` /
        ``rationale: {text}``. A one-line ``verify: ...`` message is
        returned instead when there are no documents or the claim is too
        short.
    """
    if not doc_texts:
        return "verify: no matching docs found in memory."
    if not claim or len(claim.strip()) < 6:
        return "verify: claim is too short or empty."

    model = model or os.environ.get("VERIFY_MODEL", "gpt-5.4-mini")
    out_parts: List[str] = []
    for doc_id, text in list(doc_texts.items())[:5]:  # cap per call
        snippet = text[:6000] if text else ""
        if not snippet:
            out_parts.append(
                f"# DOCUMENT ID: {doc_id}\nverdict: no\nrationale: document text unavailable."
            )
            continue
        try:
            resp = openai_client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": _VERIFY_SYSTEM},
                    {
                        "role": "user",
                        "content": (
                            f"CLAIM: {claim}\n\nDOCUMENT:\n{snippet}\n\n"
                            # Spell out the format the parser below expects.
                            "Answer strictly as 'yes' or 'no', then a period, "
                            "then the rationale."
                        ),
                    },
                ],
                temperature=0.0,
                max_tokens=80,
                timeout=20,
            )
            reply = resp.choices[0].message.content.strip()
            # Whole-word match, so "yesterday ..." is not read as "yes".
            verdict = "yes" if re.match(r"yes\b", reply, re.IGNORECASE) else "no"
            # Drop the leading verdict word and its punctuation. Splitting on
            # the first period loses the rationale of "No, it lacks X.".
            rationale = re.sub(
                r"^(?:yes|no)\b[\s.,:;-]*", "", reply,
                count=1, flags=re.IGNORECASE,
            ).strip()
            out_parts.append(
                f"# DOCUMENT ID: {doc_id}\nverdict: {verdict}\nrationale: {rationale[:200]}"
            )
        except Exception as e:  # deliberate best-effort: one bad doc must not abort the rest
            out_parts.append(
                f"# DOCUMENT ID: {doc_id}\nverdict: unknown\nrationale: verify failed ({str(e)[:80]})."
            )
    return "\n\n".join(out_parts)
+ +Other formats (Anthropic, Moonshot, OpenAI Responses) lived in upstream +Harness-1 to support teacher generation and evaluation; they are intentionally +excluded here. +""" + +from __future__ import annotations + +from enum import StrEnum + + +class ProviderFormat(StrEnum): + """Supported provider formats.""" + + OPENAI = "openai" + OPENAI_HARMONY = "openai_harmony" + + +__all__ = ["ProviderFormat"] diff --git a/cosmos-retriever/tests/__init__.py b/cosmos-retriever/tests/__init__.py new file mode 100644 index 0000000..249da09 --- /dev/null +++ b/cosmos-retriever/tests/__init__.py @@ -0,0 +1 @@ +"""Test suite for cosmos-retriever.""" diff --git a/cosmos-retriever/tests/conftest.py b/cosmos-retriever/tests/conftest.py new file mode 100644 index 0000000..c8008be --- /dev/null +++ b/cosmos-retriever/tests/conftest.py @@ -0,0 +1,26 @@ +"""Shared pytest fixtures. + +We deliberately avoid the project's :class:`RetrieverSettings` here — none of +the unit tests should touch real Cosmos or OpenAI. The :func:`stub_settings_env` +fixture (auto-applied) populates the required env vars with placeholder values +so that any import-time validation succeeds without secrets. 
+""" + +from __future__ import annotations + +from collections.abc import Iterator + +import pytest + + +@pytest.fixture(autouse=True) +def stub_settings_env(monkeypatch: pytest.MonkeyPatch) -> Iterator[None]: + """Provide harmless defaults for required env vars across the test session.""" + + monkeypatch.setenv("ACCOUNT_URI", "https://stub.documents.azure.com:443/") + monkeypatch.setenv("COSMOS_DATABASE", "test-db") + monkeypatch.setenv("COSMOS_CORPUS_CONTAINER", "test-container") + monkeypatch.setenv("OPENAI_API_KEY", "sk-test-stub") + monkeypatch.setenv("VLLM_BASE_URL", "http://test-vllm:8000") + monkeypatch.setenv("VLLM_MODEL_NAME", "harness-1") + yield diff --git a/cosmos-retriever/tests/test_chat_agent.py b/cosmos-retriever/tests/test_chat_agent.py new file mode 100644 index 0000000..d5637b6 --- /dev/null +++ b/cosmos-retriever/tests/test_chat_agent.py @@ -0,0 +1,271 @@ +"""Tests for the generic OpenAI-compatible chat backend. + +These never touch a real model or Cosmos: a fake chat client returns scripted +responses and a stub tool returns canned search results, so we only exercise +the agent loop, tool dispatch, doc-text hydration, and final-answer parsing. 
+""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any + +import pytest + +from cosmos_retriever.config import RetrieverSettings +from cosmos_retriever.inference.openai_chat import run_chat_search, run_responses_search +from cosmos_retriever.tools import SEARCH_CORPUS_SCHEMA, Tool, ToolCallMetadata, ToolSet + + +# -------------------------------------------------------------------------- +# Fakes +# -------------------------------------------------------------------------- +class _StubSearchTool(Tool): + """Returns one canned search hit formatted like the real SearchCorpusTool.""" + + def __call__( + self, params: dict[Any, Any], overrides: dict[Any, Any] | None = None + ) -> tuple[str, ToolCallMetadata | None]: + return ( + "\n# DOCUMENT ID: doc_1 (12 tokens) \nMarie Curie discovered radium in 1898.", + None, + ) + + +def _toolset() -> ToolSet: + ts = ToolSet() + ts.add_tool(_StubSearchTool(tool_schema=SEARCH_CORPUS_SCHEMA)) + return ts + + +class _FakeFunction: + def __init__(self, name: str, arguments: str) -> None: + self.name = name + self.arguments = arguments + + +class _FakeToolCall: + def __init__(self, call_id: str, name: str, arguments: str) -> None: + self.id = call_id + self.type = "function" + self.function = _FakeFunction(name, arguments) + + +class _FakeMessage: + def __init__(self, content: str | None = None, tool_calls: list | None = None) -> None: + self.content = content + self.tool_calls = tool_calls + + +class FakeChatClient: + """Mimics the subset of openai.OpenAI used by run_chat_search.""" + + def __init__(self, scripted_messages: list[_FakeMessage]) -> None: + self._scripted = list(scripted_messages) + self.calls: list[dict] = [] + + # client.chat.completions.create(...) 
+ @property + def chat(self) -> FakeChatClient: + return self + + @property + def completions(self) -> FakeChatClient: + return self + + def create(self, **kwargs: Any) -> Any: + self.calls.append(kwargs) + message = self._scripted.pop(0) + return SimpleNamespace(choices=[SimpleNamespace(message=message)]) + + +# -------------------------------------------------------------------------- +# Chat agent tests +# -------------------------------------------------------------------------- +def test_chat_search_executes_tool_then_parses_final_docs() -> None: + client = FakeChatClient( + [ + _FakeMessage( + content="", + tool_calls=[_FakeToolCall("call_1", "search_corpus", '{"query": "radium"}')], + ), + _FakeMessage( + content=( + "\n" + "States Curie discovered radium.\n" + "" + ), + tool_calls=None, + ), + ] + ) + + result = run_chat_search( + toolset=_toolset(), + client=client, + model="gpt-4o-foundry", + query="Who discovered radium?", + max_documents=5, + ) + + assert result.num_turns == 2 + assert len(result.documents) == 1 + doc = result.documents[0] + assert doc.id == "doc_1" + assert "Marie Curie discovered radium" in doc.text # hydrated from the search result + assert doc.justification == "States Curie discovered radium." + assert doc.rank == 0 + assert result.metadata["backend"] == "openai_chat" + assert result.metadata["tool_calls"] == 1 + # The model + tools were actually forwarded. 
def test_chat_search_respects_max_turns_without_final() -> None:
    """The chat loop stops at ``max_turns`` when the model only ever calls tools."""
    turn_cap = 3

    # Ten scripted turns, every one a tool call, so a final answer never arrives.
    scripted = []
    for idx in range(10):
        call = _FakeToolCall(f"c{idx}", "search_corpus", "{}")
        scripted.append(_FakeMessage(content="thinking", tool_calls=[call]))

    outcome = run_chat_search(
        toolset=_toolset(),
        client=FakeChatClient(scripted),
        model="m",
        query="q",
        max_turns=turn_cap,
    )

    assert outcome.num_turns == turn_cap
    # Without a final answer there is nothing to parse documents from.
    assert outcome.documents == []
class FakeResponsesClient:
    """Scripted stand-in for the part of ``openai.OpenAI`` that run_responses_search uses.

    ``client.responses.create(...)`` resolves back to this object. Each call
    records its keyword arguments in ``calls`` and returns the next scripted
    response unchanged.
    """

    def __init__(self, scripted: list[_FakeResponse]) -> None:
        # Work on a copy; the caller's list stays intact.
        self._scripted = list(scripted)
        self.calls: list[dict] = []

    @property
    def responses(self) -> FakeResponsesClient:
        return self

    def create(self, **kwargs: Any) -> _FakeResponse:
        # Record before popping so over-consumption still leaves a trace.
        self.calls.append(kwargs)
        upcoming = self._scripted.pop(0)
        return upcoming
def test_responses_search_respects_max_turns() -> None:
    """The Responses-API loop stops at ``max_turns`` when every turn is a tool call."""
    turn_cap = 3

    # Ten scripted responses, each holding only a function call.
    scripted = []
    for idx in range(10):
        call = _FakeFunctionCall(f"c{idx}", "search_corpus", "{}")
        scripted.append(_FakeResponse(f"resp_{idx}", output=[call]))

    outcome = run_responses_search(
        toolset=_toolset(),
        client=FakeResponsesClient(scripted),
        model="gpt-5.4",
        query="q",
        max_turns=turn_cap,
    )

    assert outcome.num_turns == turn_cap
    assert outcome.documents == []
+""" + +from __future__ import annotations + +from fastapi.testclient import TestClient + +import cosmos_retriever.server as server_module +from cosmos_retriever.config import get_settings +from cosmos_retriever.retriever import RetrievalResult, RetrievedDocument + + +class _StubRetriever: + """Stand-in for CosmosRetriever that records its construction args.""" + + instances: list[_StubRetriever] = [] + + def __init__(self, settings=None, *, corpus_name=None, reranker=None) -> None: + self.settings = settings + self.corpus_name = corpus_name + self.calls: list[tuple[str, int]] = [] + _StubRetriever.instances.append(self) + + def search(self, query: str, *, max_documents: int = 20) -> RetrievalResult: + self.calls.append((query, max_documents)) + return RetrievalResult( + query=query, + documents=[RetrievedDocument(id="doc-1", text="hello", rank=0)], + num_turns=3, + elapsed_s=1.5, + ) + + +class _BoomRetriever(_StubRetriever): + def search(self, query: str, *, max_documents: int = 20) -> RetrievalResult: + raise RuntimeError("vllm unreachable") + + +def _client(monkeypatch, retriever_cls=_StubRetriever) -> TestClient: + _StubRetriever.instances = [] + monkeypatch.setattr(server_module, "CosmosRetriever", retriever_cls) + get_settings.cache_clear() + app = server_module.create_app(get_settings()) + return TestClient(app) + + +def test_health_ok(monkeypatch) -> None: + with _client(monkeypatch) as client: + resp = client.get("/health") + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} + + +def test_search_returns_result_json(monkeypatch) -> None: + with _client(monkeypatch) as client: + resp = client.post("/search", json={"query": "who discovered radium?", "maxDocuments": 5}) + assert resp.status_code == 200 + body = resp.json() + assert body["query"] == "who discovered radium?" + assert body["num_turns"] == 3 + assert body["documents"][0]["id"] == "doc-1" + # max_documents forwarded through the alias. 
def test_search_pool_keys_by_corpus(monkeypatch) -> None:
    """Requests for the same container share a retriever; a new container gets its own."""
    requests = (
        {"query": "a", "container": "corpus-x"},
        {"query": "b", "container": "corpus-x"},
        {"query": "c", "container": "corpus-y"},
    )
    with _client(monkeypatch) as client:
        for payload in requests:
            client.post("/search", json=payload)

    # Three requests over two containers must have built exactly two retrievers.
    built_for = sorted(stub.corpus_name for stub in _StubRetriever.instances)
    assert built_for == ["corpus-x", "corpus-y"]
class TestFtsLiteralArgs:
    """Checks the string that ``_fts_literal_args`` builds from a list of terms."""

    def test_emits_quoted_csv(self) -> None:
        # Each term is double-quoted and the terms are joined with ", ".
        rendered = _fts_literal_args(["alpha", "beta"])
        assert rendered == '"alpha", "beta"'

    def test_escapes_quotes_and_backslashes(self) -> None:
        # Embedded double quotes and backslashes come out backslash-escaped.
        terms = ['he said "hi"', "back\\slash"]
        expected = '"he said \\"hi\\"", "back\\\\slash"'
        assert _fts_literal_args(terms) == expected
class TestPruneTool:
    """Behaviour of ``PruneChunksTool`` when called directly."""

    def test_returns_pruned_string(self) -> None:
        # A valid call returns the fixed text "Pruned" and no metadata.
        prune = PruneChunksTool()
        text, meta = prune({"chunk_ids": ["a", "b"]})
        assert text == "Pruned"
        assert meta is None

    def test_rejects_missing_arg(self) -> None:
        # Omitting the argument must raise ValueError mentioning "Invalid params".
        prune = PruneChunksTool()
        raised = None
        try:
            prune({})
        except ValueError as exc:
            raised = exc
        if raised is None:  # pragma: no cover
            raise AssertionError("expected ValueError")
        assert "Invalid params" in str(raised)
class TestTrajectoryBuilders:
    """Ordering and copy semantics of trajectories built by the helper above."""

    def test_builds_in_order(self) -> None:
        trajectory = _build_tiny_trajectory()
        assert trajectory.num_turns == 2
        # Entries must alternate observation / action in insertion order.
        kinds = []
        for entry in trajectory.actions_and_observations:
            kinds.append(type(entry).__name__)
        assert kinds == ["Observation", "Action", "Observation", "Action"]

    def test_clone_is_deep_for_params(self) -> None:
        original = _build_tiny_trajectory()
        copy_ = original.clone()

        def is_action(entry) -> bool:
            return isinstance(entry, Action)

        # Change the original's first action in place; the clone taken earlier
        # must still carry the value it was built with.
        source_action = next(filter(is_action, original.actions_and_observations))
        source_action.params[0]["query"] = "MUTATED"

        copied_action = next(filter(is_action, copy_.actions_and_observations))
        assert copied_action.params[0]["query"] == "hi"
"tool" + assert "DOCUMENT ID: doc_a" in msgs[2]["content"][0]["text"] + # Final assistant text + assert msgs[3]["role"] == "assistant" + assert msgs[3]["content"][0]["text"].startswith("") + + +class TestOpenAIHarmonyRendering: + def test_renders_to_a_conversation(self) -> None: + traj = _build_tiny_trajectory() + conv = traj.to_openai_harmony_format() + # Sanity: we should have system + developer + the four trajectory entries. + assert len(conv.messages) >= 4 + + def test_user_text_tool_does_not_appear_in_action_dispatch(self) -> None: + # PruneChunksTool exists in tools.py; ensure ActionBuilder accepts it + # but rejects MultiToolUseTool (the latter is reserved for rendering). + ab = ActionBuilder() + ab.add_tool_call(PruneChunksTool(), {"chunk_ids": ["x"]}, source="agent") + action = ab.build() + assert action.tools[0].tool_schema.name == PRUNE_CHUNKS_SCHEMA.name + + +class TestObservationBuilderValidation: + def test_incomplete_builder_raises(self) -> None: + ob = ObservationBuilder() + try: + ob.build() + except ValueError as exc: + assert "is not complete" in str(exc) + else: # pragma: no cover + raise AssertionError("expected ValueError") From 53db02227ddd0aac6dbf910167f0beec664c8150 Mon Sep 17 00:00:00 2001 From: cosmos-dev Date: Fri, 26 Jun 2026 21:57:56 +0000 Subject: [PATCH 2/8] feat(agentic_search): add agentic_search MCP tool Add a 9th MCP tool, agentic_search, that runs the Cosmos Retriever agent over a Cosmos DB corpus and returns ranked, curated documents. - AgenticSearchExecutor: calls the cosmos-retriever service over HTTP (COSMOS_RETRIEVER_URL, COSMOS_RETRIEVER_TIMEOUT_S); always returns parseable JSON (error envelope on failure). - Wire into Program.cs, MCPProtocolController (tools/list + tools/call), MCPTestController, and McpToolRequestValidator. - CosmosClientFactory: exclude ManagedIdentityCredential (fall through to az login); accept the standard MCP _meta params field. - Docs: docs/AGENTIC_SEARCH.md, README + CHANGELOG + .env.example. 
--- .env.example | 16 ++ CHANGELOG.md | 26 ++ README.md | 1 + docs/AGENTIC_SEARCH.md | 249 ++++++++++++++++++ .../AzureCosmosDB.MCP.Toolkit.csproj | 2 +- .../Controllers/MCPProtocolController.cs | 28 ++ .../Controllers/MCPTestController.cs | 15 +- src/AzureCosmosDB.MCP.Toolkit/Program.cs | 24 ++ .../Services/AgenticSearchExecutor.cs | 192 ++++++++++++++ .../Services/CosmosClientFactory.cs | 9 +- .../Services/CosmosDbToolsService.cs | 18 ++ .../Services/McpToolRequestValidator.cs | 11 +- .../AgenticSearchExecutorTests.cs | 227 ++++++++++++++++ 13 files changed, 814 insertions(+), 4 deletions(-) create mode 100644 docs/AGENTIC_SEARCH.md create mode 100644 src/AzureCosmosDB.MCP.Toolkit/Services/AgenticSearchExecutor.cs create mode 100644 tests/AzureCosmosDB.MCP.Toolkit.Tests/AgenticSearchExecutorTests.cs diff --git a/.env.example b/.env.example index 2b9b14b..8a81e8d 100644 --- a/.env.example +++ b/.env.example @@ -54,6 +54,22 @@ ASPNETCORE_LOGGING__LOGLEVEL__DEFAULT=Information # Optional: Server URLs (default: http://+:8080) ASPNETCORE_URLS=http://+:8080 +# ============================================================================ +# OPTIONAL: agentic_search TOOL (Cosmos retriever HTTP service) +# ============================================================================ +# The `agentic_search` MCP tool calls the trained Harness-1 multi-turn +# retrieval agent, which runs as a long-lived FastAPI service started with +# `python -m cosmos_retriever serve`. See docs/AGENTIC_SEARCH.md. +# Both vars below are optional with sensible defaults; if the service is not +# reachable, agentic_search simply returns a clean JSON error envelope to the +# caller. + +# Base URL of the cosmos-retriever FastAPI service (default http://127.0.0.1:9000). +# COSMOS_RETRIEVER_URL=http://127.0.0.1:9000 + +# Per-request wall-clock cap in seconds (default 600). 
+# COSMOS_RETRIEVER_TIMEOUT_S=600 + # ============================================================================ # DOCKER COMPOSE NOTES # ============================================================================ diff --git a/CHANGELOG.md b/CHANGELOG.md index 44bde2a..8660c30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,32 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.2.0] - 2026-06-18 + +### Added +- **`agentic_search` tool**: Runs the trained Harness-1 multi-turn retrieval + agent (`pat-jj/harness-1` served by vLLM) against a Cosmos DB corpus and + returns ranked, curated documents that best answer the query. The agent + issues hybrid (vector + full-text) RRF searches, optionally reranks with + Qwen3-Reranker-8B, reads full documents, and prunes its context across + multiple turns. Implemented as an HTTP call to the bundled + [`cosmos-retriever`](cosmos-retriever/) FastAPI service + (`python -m cosmos_retriever serve`); see [`docs/AGENTIC_SEARCH.md`](docs/AGENTIC_SEARCH.md) for + the deployment story. +- Optional `database` and `container` arguments on `agentic_search` so a + single MCP server can target multiple Cosmos corpora at request time. When + the corpus registry (`CORPUS_REGISTRY` / `CORPUS_REGISTRY_FILE`) is set + in the retriever service's environment, the matching account, database, and embedding + model are picked automatically per call. +- New service: `AgenticSearchExecutor` (HTTP request to the retriever, timeout, error + envelope generation). +- New env vars: `COSMOS_RETRIEVER_URL` and + `COSMOS_RETRIEVER_TIMEOUT_S` — see [`.env.example`](.env.example). + +### Changed +- `AppState` now also exposes `ILoggerFactory` so static `[McpServerTool]` + methods can obtain a properly-named logger.
+ ## [1.1.2] - 2026-05-29 ### Added diff --git a/README.md b/README.md index 2fd336c..4a5615a 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ This toolkit provides: | `text_search` | Search for documents where a property contains a search phrase | | `vector_search` | Perform vector search using Azure OpenAI embeddings | | `hybrid_search` | Perform hybrid search combining vector similarity and full-text keyword search using Reciprocal Rank Fusion (RRF) | +| `agentic_search` | Run the trained Harness-1 multi-turn retrieval agent against a Cosmos DB corpus. Backed by the bundled [`cosmos-retriever/`](cosmos-retriever/) FastAPI service; see [docs/AGENTIC_SEARCH.md](docs/AGENTIC_SEARCH.md) for setup and per-corpus configuration. | ## Project Structure diff --git a/docs/AGENTIC_SEARCH.md b/docs/AGENTIC_SEARCH.md new file mode 100644 index 0000000..958b4e2 --- /dev/null +++ b/docs/AGENTIC_SEARCH.md @@ -0,0 +1,249 @@ +# `agentic_search` — Harness-1 multi-turn retrieval as an MCP tool + +`agentic_search` runs the trained **Harness-1** multi-turn search agent +(`pat-jj/harness-1`, a fine-tuned `openai/gpt-oss-20b` served by vLLM) against +an Azure Cosmos DB corpus and returns the ranked, curated set of documents +that best answer a natural-language query. The agent issues hybrid (vector + +full-text) RRF searches, optionally reranks with Qwen3-Reranker-8B, fetches +full documents, and prunes its working context across multiple turns. From +the MCP client's perspective it's a single tool call; under the hood the +agent can take 20–40 turns and 30–60 s of wall-clock time. 
+ +## Architecture + +```text + MCP client MCPToolKit (.NET) cosmos-retriever (Python, FastAPI) + ────────── ───────────────── ─────────────────────────────────── + Claude Desktop ┌─ TokenBudgetRetrievalSubagent + AI Foundry ─── MCP HTTP ───► [McpServerTool] AgenticSearch │ ├─ SearchCorpus / Grep / Read / Prune + VS Code Copilot │ │ └─ VLLMHarmonyInferenceModel + ▼ │ + AgenticSearchExecutor ── HTTP POST ───► POST /search (uvicorn, kept warm) + │ │ + │ ◄────── JSON body ──────────────┤ + │ └─► vLLM (Harness-1) + Cosmos DB + embeddings + ▼ + MCP tool response +``` + +The .NET server and the Python retriever are now **two long-lived +processes**. The retriever is started once (`python -m cosmos_retriever +serve`) and keeps its Cosmos/embedding/Harmony clients warm; the .NET server +calls its `POST /search` endpoint per MCP tool call and passes the JSON +response through verbatim. + +## Prerequisites + +You need three things running on the same host (or reachable from it): + +| Component | What it is | +|---|---| +| **An LLM endpoint** | Either vLLM serving `pat-jj/harness-1` (default, Harmony token-IDs) **or** any OpenAI-compatible chat model — e.g. an Azure AI Foundry deployment — via `INFERENCE_BACKEND=openai_chat` (see below). | +| **Azure Cosmos DB for NoSQL** | Container populated with the Harness-1 schema (`id`, `docid`, `chunk_idx`, `text`, `embedding`), vector + FTS indexes enabled. | +| **Embeddings backend** | Whatever model your corpus was ingested with — Azure OpenAI `text-embedding-3-small`, OpenAI native, or a local vLLM embedding server. | + +### Inference backend (local model vs. any Foundry endpoint) + +The bundled retriever supports two backends, selected by `INFERENCE_BACKEND`: + +- `harmony_vllm` *(default)* — the fine-tuned `pat-jj/harness-1` checkpoint + served by vLLM, driven with raw Harmony token-IDs (`VLLM_BASE_URL`, `VLLM_MODEL_NAME`). 
+- `openai_chat` — **any** OpenAI-compatible chat model (an Azure AI Foundry + deployment, OpenAI, a local server, ...), driven with standard + function/tool calling. Set `CHAT_BASE_URL`, `CHAT_API_KEY`, `CHAT_MODEL` + (and `CHAT_API_VERSION` for Azure OpenAI-style endpoints). The agent uses + the same Cosmos tools, so retrieval quality tracks the chosen model's + tool-use ability rather than the Harness-1 checkpoint. + +The Python helper is **bundled in this repository** at +[`cosmos-retriever/`](../cosmos-retriever/) — no separate clone needed. +Install it into a virtualenv: + +```bash +cd cosmos-retriever +uv venv --python 3.11 .venv +uv pip install --python .venv/bin/python -e . +``` + +Confirm it works: + +```bash +.venv/bin/python -m cosmos_retriever serve --help +``` + +Then start the service (it reads its own `.env` / `.env.local` for +`VLLM_BASE_URL`, `ACCOUNT_URI`, `COSMOS_*`, `AZURE_OPENAI_*`, `HOST`, `PORT`): + +```bash +.venv/bin/python -m cosmos_retriever serve # binds HOST:PORT (default 0.0.0.0:9000) +curl -s http://127.0.0.1:9000/health # -> {"status":"ok"} +``` + +## Server configuration + +Two env vars are read by the `AgenticSearchExecutor` service; both optional. +If `COSMOS_RETRIEVER_URL` doesn't point at a running retriever service, the +tool returns a clean JSON `{"error":"...","hint":"..."}` envelope rather than +crashing the server. + +| Variable | Default | Purpose | +|---|---|---| +| `COSMOS_RETRIEVER_URL` | `http://127.0.0.1:9000` | Base URL of the cosmos-retriever FastAPI service. | +| `COSMOS_RETRIEVER_TIMEOUT_S` | `600` | Per-request wall-clock cap; the request is abandoned if it exceeds this. | + +Unlike the previous subprocess design, the retriever service has its **own** +environment. Everything it needs (`VLLM_BASE_URL`, `ACCOUNT_URI`, +`COSMOS_DATABASE`, `COSMOS_CORPUS_CONTAINER`, `AZURE_OPENAI_*`, +`CORPUS_REGISTRY_FILE`, …) is read from the retriever process's environment / +`.env` file, **not** inherited from the .NET server. 
+ +## Tool schema + +```jsonc +{ + "name": "agentic_search", + "description": "Runs the Harness-1 multi-turn retrieval agent (pat-jj/harness-1) against a Cosmos DB corpus and returns ranked, curated documents.", + "inputSchema": { + "type": "object", + "properties": { + "query": { "type": "string", "maxLength": 4096 }, + "maxDocuments": { "type": "integer", "minimum": 1, "maximum": 30, "default": 20 }, + "database": { "type": "string", "maxLength": 256 }, + "container": { "type": "string", "maxLength": 256 } + }, + "required": ["query"], + "additionalProperties": false + } +} +``` + +Tool result (the retriever service's `POST /search` JSON body, passed through verbatim): + +```jsonc +{ + "query": "Who discovered radium and when did she win her second Nobel?", + "num_turns": 5, + "elapsed_s": 32.3, + "documents": [ + { + "id": "96308__3", + "rank": 0, + "justification": "This biography directly states that Marie Curie ...", + "text": "..." + } + ] +} +``` + +On failure the retriever service (or the C# executor) returns a JSON error envelope: + +```jsonc +{ "error": "agentic_search timed out after 600s.", "hint": "..." } +``` + +## Multi-corpus targeting + +`agentic_search` accepts optional `database` and `container` arguments so a +single MCP server can be aimed at multiple Cosmos corpora at request time. +For per-corpus *embedding-model* selection (e.g.
one corpus ingested with +`text-embedding-3-small`, another with `qwen3-embed`), point +`CORPUS_REGISTRY_FILE` at a JSON file in the cosmos-retriever package: + +```jsonc +{ + "browsecomp_corpus_container": { + "account_uri": "https://acct-a.documents.azure.com:443/", + "database": "search_retrieval_database", + "embed_base_url": "https://embedding.services.ai.azure.com/openai/v1", + "embed_api_key_env": "AZURE_OPENAI_API_KEY", + "embed_model": "text-embedding-3-small" + }, + "enterprise_ragbench_corpus": { + "account_uri": "https://acct-b.documents.azure.com:443/", + "database": "search_retrieval_database", + "embed_base_url": "http://localhost:8002/v1", + "embed_api_key_env": null, + "embed_model": "qwen3-embed", + "embed_query_instruction": "Given a question, retrieve documents that answer it" + } +} +``` + +Then call: + +```jsonc +{ "name": "agentic_search", + "arguments": { + "query": "What was the temporary mitigation applied to the internal load balancer ...", + "container": "enterprise_ragbench_corpus" + } } +``` + +The matching account, database, embedding URL, model, and optional +`Instruct:` prefix all get picked automatically per call. Adding a third +corpus is a one-line registry edit — no rebuild, no restart. + +## Local demo + +End-to-end with the upstream `harness-1` repository checkout providing the +Cosmos / Azure OpenAI / vLLM endpoints. + +**1. Start the retriever service** (the bundled `cosmos-retriever/` folder; it +reads `../harness-1/.env.local` plus its own `.env`): + +```bash +cd cosmos-retriever +set -a; source /path/to/harness-1/.env.local; set +a +VLLM_BASE_URL=http://localhost:8000 \ +VLLM_MODEL_NAME=harness-1 \ +VLLM_RERANKER_URL=http://localhost:8011 \ +CORPUS_REGISTRY_FILE=$PWD/corpus_registry.json \ +PORT=9000 \ +.venv/bin/python -m cosmos_retriever serve +``` + +**2. 
Start the .NET MCP server** (from the repo root), pointing it at the retriever URL: + +```bash +DEV_BYPASS_AUTH=true \ +COSMOS_RETRIEVER_URL=http://127.0.0.1:9000 \ +OPENAI_ENDPOINT="$AZURE_OPENAI_ENDPOINT" \ +OPENAI_EMBEDDING_DEPLOYMENT="$AZURE_OPENAI_EMBED_DEPLOYMENT" \ +dotnet run --project src/AzureCosmosDB.MCP.Toolkit +``` + +Then point any MCP client at `http://127.0.0.1:8080/mcp/`. + +## Operational notes + +- **The retriever service has its own environment.** Configure + `VLLM_BASE_URL`, `ACCOUNT_URI`, `COSMOS_*`, `AZURE_OPENAI_*`, + `CORPUS_REGISTRY_FILE`, etc. where you launch `cosmos_retriever serve` + (env or its `.env` file) — the .NET server no longer forwards them. +- **`COSMOS_USE_DEFAULT_CREDENTIAL`** controls the retriever's Cosmos auth. + By default it uses `AzureCliCredential`; set it to `1` to opt into the + broader `DefaultAzureCredential` chain (managed identity, etc.). +- **Warm process, no cold start.** Because the service stays up, the heavy + client init happens once. Per-call latency is dominated by Cosmos + round-trips + vLLM generation; don't expect sub-second latency. +- **Retrieval quality is corpus-dependent.** Cosmos's hybrid RRF puts gold + docs in the top 5 reliably; the Qwen3-Reranker step on top can over- or + under-shoot depending on how close the corpus distribution is to the + reranker's training data. If you see recall regressions, try disabling the + reranker for that corpus (omit `VLLM_RERANKER_URL` / `BASETEN_API_KEY`). +- **The tool always returns parseable JSON.** Unreachable service, request + timeouts, and non-2xx responses all yield + `{"error": "...", "hint"?: "...", "body"?: "..."}` envelopes rather than + HTTP 500s to the MCP client. + +## Implementation pointers + +| File | Role | +|---|---| +| [`Services/AgenticSearchExecutor.cs`](../src/AzureCosmosDB.MCP.Toolkit/Services/AgenticSearchExecutor.cs) | HTTP call to the retriever service, timeout, error-envelope generation. 
| +| [`Services/CosmosDbToolsService.cs`](../src/AzureCosmosDB.MCP.Toolkit/Services/CosmosDbToolsService.cs) | `AgenticSearch` instance method called by both controllers. | +| [`Program.cs`](../src/AzureCosmosDB.MCP.Toolkit/Program.cs) | `[McpServerTool] AgenticSearch` static method discovered by the MCP SDK. | +| [`Controllers/MCPProtocolController.cs`](../src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPProtocolController.cs) | JSON-RPC `tools/list` + `tools/call` dispatch for the custom `/mcp/http` transport. | +| [`Controllers/MCPTestController.cs`](../src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPTestController.cs) | REST sibling at `POST /api/mcp/tools/agentic_search`. | +| [`Services/McpToolRequestValidator.cs`](../src/AzureCosmosDB.MCP.Toolkit/Services/McpToolRequestValidator.cs) | Strict input validation schema. | +| [`cosmos-retriever/`](../cosmos-retriever/) | The bundled Python FastAPI service (`POST /search`) the executor calls; run with `python -m cosmos_retriever serve`. | diff --git a/src/AzureCosmosDB.MCP.Toolkit/AzureCosmosDB.MCP.Toolkit.csproj b/src/AzureCosmosDB.MCP.Toolkit/AzureCosmosDB.MCP.Toolkit.csproj index f7a1e6a..a09fdb3 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/AzureCosmosDB.MCP.Toolkit.csproj +++ b/src/AzureCosmosDB.MCP.Toolkit/AzureCosmosDB.MCP.Toolkit.csproj @@ -7,7 +7,7 @@ false true AzureCosmosDB.MCP.Toolkit - 1.1.2 + 1.2.0 Azure Cosmos DB Team Microsoft Azure Cosmos DB MCP Toolkit diff --git a/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPProtocolController.cs b/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPProtocolController.cs index fd51717..cf2d274 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPProtocolController.cs +++ b/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPProtocolController.cs @@ -259,6 +259,21 @@ public async Task HandleMCPRequest([FromBody] JsonElement request required = new string[] { "databaseId", "containerId", "searchText", "textProperty", "vectorProperty", "selectProperties" }, additionalProperties = 
false } + }, + new { + name = "agentic_search", + description = "Runs the Harness-1 multi-turn retrieval agent (pat-jj/harness-1) against a Cosmos DB corpus and returns ranked, curated documents. Pass `container=` to target a registered corpus (see CORPUS_REGISTRY env var on the host): the matching Cosmos account + database + embedding model is picked automatically per call. With no `container` the default-corpus env vars are used.", + inputSchema = new { + type = "object", + properties = new { + query = new { type = "string", description = "Natural-language information need to retrieve documents for", maxLength = 4096 }, + maxDocuments = new { type = "integer", description = "Maximum number of curated documents to return (1-30, default 20)", minimum = 1, maximum = 30, @default = 20 }, + database = new { type = "string", description = "Optional Cosmos database override (else COSMOS_DATABASE env var)", maxLength = 256 }, + container = new { type = "string", description = "Optional Cosmos corpus container override (else COSMOS_CORPUS_CONTAINER env var)", maxLength = 256 } + }, + required = new string[] { "query" }, + additionalProperties = false + } } } } @@ -452,6 +467,12 @@ private async Task ExecuteTool(string toolName, Dictionary await _cosmosDbTools.AgenticSearch( + GetStringArg(args, "query"), + GetOptionalIntArg(args, "maxDocuments", 20), + GetOptionalStringArg(args, "database"), + GetOptionalStringArg(args, "container"), + cancellationToken), _ => throw new ArgumentException($"Unknown tool: {toolName}") }; } @@ -461,6 +482,13 @@ private static string GetStringArg(Dictionary args, string key) return args.TryGetValue(key, out var value) ? value?.ToString() ?? "" : ""; } + private static string? GetOptionalStringArg(Dictionary args, string key) + { + if (!args.TryGetValue(key, out var value)) return null; + var s = value?.ToString(); + return string.IsNullOrWhiteSpace(s) ? 
null : s; + } + private static int GetRequiredIntArg(Dictionary args, string key) { if (!args.TryGetValue(key, out var value)) diff --git a/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPTestController.cs b/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPTestController.cs index 7696874..310d31e 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPTestController.cs +++ b/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPTestController.cs @@ -34,6 +34,7 @@ public async Task CallTool(string toolName, [FromBody] MCPToolReq "text_search" => await CallTextSearch(request.Parameters), "vector_search" => await CallVectorSearch(request.Parameters), "get_approximate_schema" => await CallGetApproximateSchema(request.Parameters), + "agentic_search" => await CallAgenticSearch(request.Parameters), _ => throw new ArgumentException($"Unknown tool: {toolName}") }; @@ -71,7 +72,8 @@ public IActionResult ListTools() new { name = "find_document_by_id", description = "Finds a document by its ID in the specified database/container" }, new { name = "text_search", description = "Select TOP N documents where a given property contains the provided search string. N must be between 1-20" }, new { name = "vector_search", description = "Performs vector search on Cosmos DB using Azure OpenAI embeddings" }, - new { name = "get_approximate_schema", description = "Approximates a container schema by sampling up to 10 documents" } + new { name = "get_approximate_schema", description = "Approximates a container schema by sampling up to 10 documents" }, + new { name = "agentic_search", description = "Runs the Harness-1 multi-turn retrieval agent (pat-jj/harness-1) via the cosmos-retriever HTTP service and returns ranked, curated documents." 
} }; return Ok(new { tools, count = tools.Length, timestamp = DateTime.UtcNow }); @@ -127,6 +129,17 @@ private async Task CallGetApproximateSchema(Dictionary p return await _cosmosDbTools.GetApproximateSchema(databaseId, containerId); } + private async Task CallAgenticSearch(Dictionary parameters) + { + var query = GetRequiredParameter(parameters, "query"); + var maxDocuments = parameters.ContainsKey("maxDocuments") + ? GetRequiredParameter(parameters, "maxDocuments") + : 20; + string? database = parameters.ContainsKey("database") ? GetRequiredParameter(parameters, "database") : null; + string? container = parameters.ContainsKey("container") ? GetRequiredParameter(parameters, "container") : null; + return await _cosmosDbTools.AgenticSearch(query, maxDocuments, database, container); + } + private T GetRequiredParameter(Dictionary parameters, string paramName) { if (!parameters.TryGetValue(paramName, out var value)) diff --git a/src/AzureCosmosDB.MCP.Toolkit/Program.cs b/src/AzureCosmosDB.MCP.Toolkit/Program.cs index ffa178c..0c522a1 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/Program.cs +++ b/src/AzureCosmosDB.MCP.Toolkit/Program.cs @@ -243,6 +243,7 @@ // Store configuration in static state for access by static tool methods AppState.Configuration = builder.Configuration; +AppState.LoggerFactory = app.Services.GetRequiredService(); // Add security headers middleware to allow MSAL authentication app.Use(async (context, next) => @@ -345,6 +346,7 @@ internal static class AppState { public static IConfiguration? Configuration { get; set; } + public static ILoggerFactory? LoggerFactory { get; set; } } public partial class Program @@ -1093,4 +1095,26 @@ FROM c return JsonSerializer.Serialize(new { error = ex.Message }); } } + + [McpServerTool, Description("Runs the Harness-1 multi-turn retrieval agent (pat-jj/harness-1 served by vLLM) against a Cosmos DB corpus and returns ranked, curated documents that best answer the query. 
The agent internally issues hybrid (vector + full-text) RRF searches, optionally reranks with Qwen3-Reranker-8B, reads documents, and prunes its context across multiple turns. Pass `container=` to target a registered corpus (see CORPUS_REGISTRY env var on the host): the right Cosmos account + database + embedding model is picked automatically per call. With no `container` arg the default-corpus env vars are used.")] + public static async Task AgenticSearch( + [Description("Natural-language information need to retrieve documents for.")] string query, + [Description("Maximum number of curated documents to return (1-30, default 20).")] int maxDocuments = 20, + [Description("Optional Cosmos database name override (else COSMOS_DATABASE env var).")] string? database = null, + [Description("Optional Cosmos corpus container name override (else COSMOS_CORPUS_CONTAINER env var).")] string? container = null) + { + var logger = (AppState.LoggerFactory ?? Microsoft.Extensions.Logging.Abstractions.NullLoggerFactory.Instance) + .CreateLogger("AzureCosmosDB.MCP.Toolkit.CosmosDbTools.AgenticSearch"); + + if (string.IsNullOrWhiteSpace(query)) + { + return JsonSerializer.Serialize(new { error = "Parameter 'query' is required and must be non-empty." }); + } + if (maxDocuments < 1 || maxDocuments > 30) + { + return JsonSerializer.Serialize(new { error = "Parameter 'maxDocuments' must be between 1 and 30." 
}); + } + + return await AgenticSearchExecutor.RunAsync(query, maxDocuments, logger, database, container); + } } diff --git a/src/AzureCosmosDB.MCP.Toolkit/Services/AgenticSearchExecutor.cs b/src/AzureCosmosDB.MCP.Toolkit/Services/AgenticSearchExecutor.cs new file mode 100644 index 0000000..3481373 --- /dev/null +++ b/src/AzureCosmosDB.MCP.Toolkit/Services/AgenticSearchExecutor.cs @@ -0,0 +1,192 @@ +using System.Globalization; +using System.Net.Http.Json; +using System.Text.Json; + +namespace AzureCosmosDB.MCP.Toolkit.Services; + +/// +/// Calls the long-lived cosmos-retriever FastAPI service over HTTP and +/// returns its response body (a single JSON document) verbatim. +/// +/// +/// +/// The Python helper runs the trained Harness-1 multi-turn retrieval agent +/// (pat-jj/harness-1 served by vLLM) against an Azure Cosmos DB corpus +/// and returns a JSON document of curated, ranked results. It is started once +/// (python -m cosmos_retriever serve) and kept warm so the heavy +/// clients (Cosmos SDK, embeddings, Harmony encoder) are not re-initialised +/// on every call. +/// +/// +/// Host environment variables (read on every call): +/// +/// VariableDefault / purpose +/// +/// (COSMOS_RETRIEVER_URL) +/// Base URL of the cosmos-retriever FastAPI service. +/// Defaults to . +/// +/// +/// (COSMOS_RETRIEVER_TIMEOUT_S) +/// Per-request wall-clock cap in seconds; the request is +/// abandoned if it exceeds the timeout. Defaults to +/// . +/// +/// +/// +/// +/// The retriever service owns its own configuration (VLLM_BASE_URL, +/// ACCOUNT_URI, COSMOS_DATABASE, COSMOS_CORPUS_CONTAINER, +/// CORPUS_REGISTRY_FILE, AZURE_OPENAI_*, etc.) read from its own +/// environment / .env file; none of it flows through this process. 
+/// +/// +public static class AgenticSearchExecutor +{ + public const string BaseUrlEnvVar = "COSMOS_RETRIEVER_URL"; + + public const string TimeoutEnvVar = "COSMOS_RETRIEVER_TIMEOUT_S"; + + public const string DefaultBaseUrl = "http://127.0.0.1:9000"; + + public const int DefaultTimeoutSeconds = 600; + + private const int BodyTruncateBytes = 4096; + + // A single shared HttpClient with no built-in timeout — each call drives + // its own deadline via a linked CancellationTokenSource. + private static readonly HttpClient HttpClient = new() + { + Timeout = Timeout.InfiniteTimeSpan, + }; + + /// + /// Run a single cosmos-retriever search by calling the FastAPI + /// POST /search endpoint. + /// + /// Natural-language information need. + /// Cap on the number of curated docs returned (1–30). + /// Logger for request lifecycle events. + /// Optional Cosmos database override. + /// Optional Cosmos container override. + /// Cooperative cancellation. + /// + /// The service's response body, expected to be a single JSON document. On + /// any failure (service unreachable, timed out, non-success status, empty + /// body) returns a serialised { "error": "...", ... } envelope so + /// the MCP tool always returns parseable JSON to the caller. + /// + public static async Task RunAsync( + string query, + int maxDocuments, + ILogger logger, + string? database = null, + string? 
container = null, + CancellationToken cancellationToken = default) + { + var baseUrl = ResolveString(BaseUrlEnvVar, defaultValue: DefaultBaseUrl).TrimEnd('/'); + var timeoutSeconds = ResolveInt(TimeoutEnvVar, DefaultTimeoutSeconds); + var requestUri = $"{baseUrl}/search"; + + var payload = new Dictionary + { + ["query"] = query, + ["maxDocuments"] = maxDocuments, + }; + if (!string.IsNullOrWhiteSpace(database)) payload["database"] = database; + if (!string.IsNullOrWhiteSpace(container)) payload["container"] = container; + + logger.LogInformation( + "agentic_search: POST {RequestUri} (database={Database} container={Container} timeout={Timeout}s)", + requestUri, database ?? "", container ?? "", timeoutSeconds); + + using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + timeoutCts.CancelAfter(TimeSpan.FromSeconds(timeoutSeconds)); + + HttpResponseMessage response; + try + { + using var content = JsonContent.Create(payload); + response = await HttpClient + .PostAsync(requestUri, content, timeoutCts.Token) + .ConfigureAwait(false); + } + catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested && !cancellationToken.IsCancellationRequested) + { + logger.LogWarning("agentic_search: request exceeded {Timeout}s.", timeoutSeconds); + return ErrorEnvelope( + $"agentic_search timed out after {timeoutSeconds}s.", + hint: $"Increase {TimeoutEnvVar} or check that the cosmos-retriever service at {baseUrl} is responsive."); + } + catch (HttpRequestException ex) + { + logger.LogError(ex, + "agentic_search: failed to reach the cosmos-retriever service at {BaseUrl}.", baseUrl); + return ErrorEnvelope( + $"Failed to reach the cosmos-retriever service: {ex.Message}", + hint: $"Start it with 'python -m cosmos_retriever serve' and set {BaseUrlEnvVar} to its base URL (default {DefaultBaseUrl})."); + } + + using (response) + { + var body = (await response.Content.ReadAsStringAsync(cancellationToken).ConfigureAwait(false)).Trim(); + + 
if (!response.IsSuccessStatusCode) + { + logger.LogWarning( + "agentic_search: service returned {StatusCode}. body tail: {Body}", + (int)response.StatusCode, TruncateTail(body, 512)); + + // The FastAPI service emits its own JSON error envelope on most + // failures; pass it through verbatim if so, otherwise wrap it. + if (LooksLikeJson(body)) + { + return body; + } + return ErrorEnvelope( + $"agentic_search service returned HTTP {(int)response.StatusCode}.", + bodyTail: TruncateTail(body, BodyTruncateBytes)); + } + + if (string.IsNullOrWhiteSpace(body)) + { + return ErrorEnvelope("agentic_search service produced no output."); + } + + return body; + } + } + + private static string ResolveString(string envVar, string defaultValue) + { + var value = Environment.GetEnvironmentVariable(envVar); + return string.IsNullOrWhiteSpace(value) ? defaultValue : value; + } + + private static int ResolveInt(string envVar, int defaultValue) + { + var raw = Environment.GetEnvironmentVariable(envVar); + if (!string.IsNullOrWhiteSpace(raw) && int.TryParse(raw, NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsed) && parsed > 0) + { + return parsed; + } + return defaultValue; + } + + private static bool LooksLikeJson(string s) => + s.Length > 0 && (s[0] == '{' || s[0] == '['); + + private static string ErrorEnvelope(string error, string? hint = null, string? bodyTail = null) + { + var payload = new Dictionary { ["error"] = error }; + if (hint is not null) payload["hint"] = hint; + if (bodyTail is not null) payload["body"] = bodyTail; + return JsonSerializer.Serialize(payload); + } + + private static string TruncateTail(string s, int maxChars) + { + if (string.IsNullOrEmpty(s) || s.Length <= maxChars) return s ?? string.Empty; + return "..." 
+ s[^maxChars..]; + } +} diff --git a/src/AzureCosmosDB.MCP.Toolkit/Services/CosmosClientFactory.cs b/src/AzureCosmosDB.MCP.Toolkit/Services/CosmosClientFactory.cs index 82958b3..ad2cc33 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/Services/CosmosClientFactory.cs +++ b/src/AzureCosmosDB.MCP.Toolkit/Services/CosmosClientFactory.cs @@ -74,7 +74,14 @@ public static CosmosClient CreateCosmosClient(IConfiguration configuration, ILog } logger.LogInformation("Creating CosmosClient using Azure credentials (cloud mode)"); - var credential = new DefaultAzureCredential(); + // Exclude ManagedIdentityCredential: on Azure VMs MSI_ENDPOINT/IMDS is present + // but the managed identity often lacks Cosmos RBAC (SSO failure). Skipping it + // lets the chain fall through to the Azure CLI login (az login), which the + // Python retriever uses successfully. + var credential = new DefaultAzureCredential(new DefaultAzureCredentialOptions + { + ExcludeManagedIdentityCredential = true, + }); return new CosmosClient(endpoint, credential, BuildClientOptions(configuration, logger, useGatewayMode: false)); } diff --git a/src/AzureCosmosDB.MCP.Toolkit/Services/CosmosDbToolsService.cs b/src/AzureCosmosDB.MCP.Toolkit/Services/CosmosDbToolsService.cs index 46bb07f..da77bc2 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/Services/CosmosDbToolsService.cs +++ b/src/AzureCosmosDB.MCP.Toolkit/Services/CosmosDbToolsService.cs @@ -700,4 +700,22 @@ public async Task GetApproximateSchema(string databaseId, string contain return new { error = ex.Message }; } } + + /// + /// Calls the cosmos-retriever FastAPI service and returns its raw response + /// body (a single JSON document). See + /// for the environment-variable contract and timeout knobs. + /// + public async Task AgenticSearch( + string query, + int maxDocuments = 20, + string? database = null, + string? 
container = null, + CancellationToken cancellationToken = default) + { + var raw = await AgenticSearchExecutor.RunAsync(query, maxDocuments, _logger, database, container, cancellationToken); + // Pass the JSON string through verbatim so the MCP envelope serialises it + // as a single string (matching the other tools, which also return JSON strings). + return raw; + } } diff --git a/src/AzureCosmosDB.MCP.Toolkit/Services/McpToolRequestValidator.cs b/src/AzureCosmosDB.MCP.Toolkit/Services/McpToolRequestValidator.cs index 859c566..cce8307 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/Services/McpToolRequestValidator.cs +++ b/src/AzureCosmosDB.MCP.Toolkit/Services/McpToolRequestValidator.cs @@ -58,6 +58,13 @@ public sealed class McpToolRequestValidator ["vectorProperty"] = ToolArgumentSchema.String(required: true, maxLength: 256), ["selectProperties"] = ToolArgumentSchema.String(required: true, maxLength: 512), ["topN"] = ToolArgumentSchema.Integer(required: false, minValue: 1, maxValue: 50) + }), + ["agentic_search"] = new(new Dictionary(StringComparer.Ordinal) + { + ["query"] = ToolArgumentSchema.String(required: true, maxLength: 4096), + ["maxDocuments"] = ToolArgumentSchema.Integer(required: false, minValue: 1, maxValue: 30), + ["database"] = ToolArgumentSchema.String(required: false, maxLength: 256), + ["container"] = ToolArgumentSchema.String(required: false, maxLength: 256) }) }; @@ -68,7 +75,9 @@ public ToolValidationResult ValidateToolCall(JsonElement paramsElement) throw new ToolInputValidationException("'params' must be a JSON object."); } - RejectUnknownProperties(paramsElement, ["name", "arguments"], "params"); + // `_meta` is a standard MCP field clients may attach to params (e.g. progress + // tokens); accept and ignore it rather than rejecting the request. 
+ RejectUnknownProperties(paramsElement, ["name", "arguments", "_meta"], "params"); if (!paramsElement.TryGetProperty("name", out var toolNameElement) || toolNameElement.ValueKind != JsonValueKind.String) { diff --git a/tests/AzureCosmosDB.MCP.Toolkit.Tests/AgenticSearchExecutorTests.cs b/tests/AzureCosmosDB.MCP.Toolkit.Tests/AgenticSearchExecutorTests.cs new file mode 100644 index 0000000..daa7b0b --- /dev/null +++ b/tests/AzureCosmosDB.MCP.Toolkit.Tests/AgenticSearchExecutorTests.cs @@ -0,0 +1,227 @@ +using System.Net; +using System.Net.Sockets; +using System.Text; +using System.Text.Json; +using AzureCosmosDB.MCP.Toolkit.Services; +using FluentAssertions; +using Microsoft.Extensions.Logging.Abstractions; +using Xunit; + +namespace AzureCosmosDB.MCP.Toolkit.Tests; + +/// +/// Tests for . Stands in for the +/// cosmos-retriever FastAPI service with a tiny in-process +/// so we can verify the executor's response +/// pass-through, timeout behaviour, and error-envelope generation without +/// needing the real retriever service running. +/// +public sealed class AgenticSearchExecutorTests : IDisposable +{ + private readonly Dictionary _savedEnv = new(); + private static readonly NullLogger _logger = NullLogger.Instance; + + private void SetEnv(string name, string? 
value) + { + if (!_savedEnv.ContainsKey(name)) + { + _savedEnv[name] = Environment.GetEnvironmentVariable(name); + } + Environment.SetEnvironmentVariable(name, value); + } + + public void Dispose() + { + foreach (var (k, v) in _savedEnv) + { + Environment.SetEnvironmentVariable(k, v); + } + } + + [Fact] + public async Task RunAsync_passes_through_service_response_body() + { + const string body = + "{\"query\":\"hi\",\"documents\":[{\"id\":\"doc_a\",\"rank\":0}],\"num_turns\":1,\"elapsed_s\":0.01}"; + + using var server = StubServer.Start((ctx, _) => + { + ctx.Response.StatusCode = 200; + ctx.Response.ContentType = "application/json"; + return body; + }); + + SetEnv(AgenticSearchExecutor.BaseUrlEnvVar, server.BaseUrl); + SetEnv(AgenticSearchExecutor.TimeoutEnvVar, "30"); + + var raw = await AgenticSearchExecutor.RunAsync("hi", maxDocuments: 5, logger: _logger); + + using var doc = JsonDocument.Parse(raw); + doc.RootElement.GetProperty("query").GetString().Should().Be("hi"); + doc.RootElement.GetProperty("num_turns").GetInt32().Should().Be(1); + doc.RootElement.GetProperty("documents")[0].GetProperty("id").GetString().Should().Be("doc_a"); + } + + [Fact] + public async Task RunAsync_forwards_request_payload_to_service() + { + string? 
capturedBody = null; + using var server = StubServer.Start((ctx, reqBody) => + { + capturedBody = reqBody; + ctx.Response.StatusCode = 200; + return "{\"query\":\"q\",\"documents\":[],\"num_turns\":0,\"elapsed_s\":0.0}"; + }); + + SetEnv(AgenticSearchExecutor.BaseUrlEnvVar, server.BaseUrl); + SetEnv(AgenticSearchExecutor.TimeoutEnvVar, "30"); + + await AgenticSearchExecutor.RunAsync( + "find me docs", maxDocuments: 7, logger: _logger, database: "db1", container: "corpus-x"); + + capturedBody.Should().NotBeNull(); + using var doc = JsonDocument.Parse(capturedBody!); + doc.RootElement.GetProperty("query").GetString().Should().Be("find me docs"); + doc.RootElement.GetProperty("maxDocuments").GetInt32().Should().Be(7); + doc.RootElement.GetProperty("database").GetString().Should().Be("db1"); + doc.RootElement.GetProperty("container").GetString().Should().Be("corpus-x"); + } + + [Fact] + public async Task RunAsync_passes_through_service_error_envelope_on_non_success() + { + using var server = StubServer.Start((ctx, _) => + { + ctx.Response.StatusCode = 500; + ctx.Response.ContentType = "application/json"; + return "{\"error\":\"vllm unreachable\",\"type\":\"RuntimeError\"}"; + }); + + SetEnv(AgenticSearchExecutor.BaseUrlEnvVar, server.BaseUrl); + SetEnv(AgenticSearchExecutor.TimeoutEnvVar, "30"); + + var raw = await AgenticSearchExecutor.RunAsync("hi", maxDocuments: 5, logger: _logger); + + using var doc = JsonDocument.Parse(raw); + doc.RootElement.GetProperty("error").GetString().Should().Be("vllm unreachable"); + } + + [Fact] + public async Task RunAsync_returns_error_envelope_when_service_unreachable() + { + // Reserve+release a port so nothing is listening on it. 
+ var port = GetFreePort(); + SetEnv(AgenticSearchExecutor.BaseUrlEnvVar, $"http://127.0.0.1:{port}"); + SetEnv(AgenticSearchExecutor.TimeoutEnvVar, "5"); + + var raw = await AgenticSearchExecutor.RunAsync("hi", maxDocuments: 5, logger: _logger); + + using var doc = JsonDocument.Parse(raw); + doc.RootElement.GetProperty("error").GetString().Should().Contain("Failed to reach"); + doc.RootElement.TryGetProperty("hint", out var hint).Should().BeTrue(); + hint.GetString().Should().Contain(AgenticSearchExecutor.BaseUrlEnvVar); + } + + [Fact] + public async Task RunAsync_returns_error_envelope_when_service_times_out() + { + using var server = StubServer.Start((ctx, _) => + { + // Sleep for longer than the 1s timeout we're about to set. + Thread.Sleep(5000); + ctx.Response.StatusCode = 200; + return "{}"; + }); + + SetEnv(AgenticSearchExecutor.BaseUrlEnvVar, server.BaseUrl); + SetEnv(AgenticSearchExecutor.TimeoutEnvVar, "1"); + + var raw = await AgenticSearchExecutor.RunAsync("hi", maxDocuments: 5, logger: _logger); + + using var doc = JsonDocument.Parse(raw); + doc.RootElement.GetProperty("error").GetString().Should().Contain("timed out after 1s"); + } + + private static int GetFreePort() + { + var listener = new TcpListener(IPAddress.Loopback, 0); + listener.Start(); + var port = ((IPEndPoint)listener.LocalEndpoint).Port; + listener.Stop(); + return port; + } + + /// + /// Minimal in-process HTTP server backed by . + /// The handler receives the request context plus the request body and + /// returns the response body string. 
+ /// + private sealed class StubServer : IDisposable + { + private readonly HttpListener _listener; + private readonly CancellationTokenSource _cts = new(); + + public string BaseUrl { get; } + + private StubServer(HttpListener listener, string baseUrl) + { + _listener = listener; + BaseUrl = baseUrl; + } + + public static StubServer Start(Func handler) + { + var port = GetFreePort(); + var baseUrl = $"http://127.0.0.1:{port}"; + var listener = new HttpListener(); + listener.Prefixes.Add($"{baseUrl}/"); + listener.Start(); + var server = new StubServer(listener, baseUrl); + _ = Task.Run(() => server.LoopAsync(handler)); + return server; + } + + private async Task LoopAsync(Func handler) + { + while (!_cts.IsCancellationRequested) + { + HttpListenerContext ctx; + try + { + ctx = await _listener.GetContextAsync().ConfigureAwait(false); + } + catch + { + return; // listener stopped + } + + try + { + string reqBody; + using (var reader = new StreamReader(ctx.Request.InputStream, Encoding.UTF8)) + { + reqBody = await reader.ReadToEndAsync().ConfigureAwait(false); + } + + var responseBody = handler(ctx, reqBody); + var buffer = Encoding.UTF8.GetBytes(responseBody); + ctx.Response.ContentLength64 = buffer.Length; + await ctx.Response.OutputStream.WriteAsync(buffer).ConfigureAwait(false); + ctx.Response.OutputStream.Close(); + } + catch + { + try { ctx.Response.Abort(); } catch { /* best effort */ } + } + } + } + + public void Dispose() + { + _cts.Cancel(); + try { _listener.Stop(); } catch { /* best effort */ } + try { _listener.Close(); } catch { /* best effort */ } + _cts.Dispose(); + } + } +} From 9db564033e0eb960f59ee8d4e1eda794eaea60cc Mon Sep 17 00:00:00 2001 From: Aryan Saboo Date: Tue, 30 Jun 2026 05:01:58 +0000 Subject: [PATCH 3/8] chore: remove benchmark scripts and dev artifacts from toolkit --- cosmos-retriever/scripts/bench_browsecomp.py | 199 ------------------ cosmos-retriever/scripts/bench_erag.py | 169 --------------- 
cosmos-retriever/scripts/diagnose_qst_0099.py | 155 -------------- cosmos-retriever/scripts/erag_repeat.py | 52 ----- .../scripts/run_with_upstream_env.sh | 46 ---- 5 files changed, 621 deletions(-) delete mode 100644 cosmos-retriever/scripts/bench_browsecomp.py delete mode 100644 cosmos-retriever/scripts/bench_erag.py delete mode 100644 cosmos-retriever/scripts/diagnose_qst_0099.py delete mode 100644 cosmos-retriever/scripts/erag_repeat.py delete mode 100755 cosmos-retriever/scripts/run_with_upstream_env.sh diff --git a/cosmos-retriever/scripts/bench_browsecomp.py b/cosmos-retriever/scripts/bench_browsecomp.py deleted file mode 100644 index 7b450f0..0000000 --- a/cosmos-retriever/scripts/bench_browsecomp.py +++ /dev/null @@ -1,199 +0,0 @@ -"""Run an N-question slice of BrowseComp+ through the standalone retriever, -score recall@curated against gold docs, and save per-query records as JSONL. - -Usage:: - - # with reranker (default — VLLM_RERANKER_URL must be set in the env) - python scripts/bench_browsecomp.py \\ - --n 83 --seed 42 --parallel 4 \\ - --container browsecomp_corpus_container \\ - --output runs/bench_bc83_rerank.jsonl - - # without reranker - VLLM_RERANKER_URL= python scripts/bench_browsecomp.py \\ - --n 83 --seed 42 --parallel 4 \\ - --container browsecomp_corpus_container \\ - --output runs/bench_bc83_norerank.jsonl - -Records contain: query_id, query, gold_docids, retrieved_chunk_ids, -retrieved_docids (chunk_id.split('__')[0]), recall, precision, num_turns, -elapsed_s, error. 
-""" - -from __future__ import annotations - -import argparse -import json -import os -import random -import sys -import time -import traceback -from collections import defaultdict -from concurrent.futures import ThreadPoolExecutor, as_completed -from pathlib import Path - -from cosmos_retriever.config import RetrieverSettings, init_logging -from cosmos_retriever.retriever import CosmosRetriever - -DATASET = Path("/nvme/harness-1/external/BrowseComp-Plus/data/browsecomp_plus_decrypted.jsonl") -QREL_GOLD = Path("/nvme/harness-1/external/BrowseComp-Plus/topics-qrels/qrel_golds.txt") -QREL_EVIDENCE = Path("/nvme/harness-1/external/BrowseComp-Plus/topics-qrels/qrel_evidence.txt") - - -def load_qrels(path: Path) -> dict[str, set[str]]: - """TREC qrels: ``query_id Q0 doc_id relevance`` -> {qid: {docid, ...}}.""" - d: dict[str, set[str]] = defaultdict(set) - if not path.exists(): - return d - for line in path.open(): - parts = line.split() - if len(parts) == 4: - d[parts[0]].add(parts[2]) - return d - - -_GOLD = load_qrels(QREL_GOLD) -_EVIDENCE = load_qrels(QREL_EVIDENCE) -# Reference positives for "Recall" = gold ∪ evidence (search_dataset.py BrowseCompPlusDataset). 
-_UNION: dict[str, set[str]] = defaultdict(set) -for _q in set(_GOLD) | set(_EVIDENCE): - _UNION[_q] = _GOLD.get(_q, set()) | _EVIDENCE.get(_q, set()) - - -def load_dataset(n: int, seed: int) -> list[dict]: - rows = [json.loads(l) for l in DATASET.open()] - rng = random.Random(seed) - rng.shuffle(rows) - return rows[:n] - - -def score(retrieved_chunk_ids: list[str], gold_docids: set[str]) -> tuple[float, float]: - if not gold_docids: - return 0.0, 0.0 - retrieved_docids = {cid.split("__")[0] for cid in retrieved_chunk_ids} - hit = retrieved_docids & gold_docids - recall = len(hit) / len(gold_docids) - precision = len(hit) / len(retrieved_docids) if retrieved_docids else 0.0 - return recall, precision - - -def _recall(found: set[str], positives: set[str]) -> float: - return len(found & positives) / len(positives) if positives else 0.0 - - -def run_one(retriever: CosmosRetriever, row: dict, max_docs: int) -> dict: - qid = row["query_id"] - query = row["query"] - gold_pos = _GOLD.get(str(qid), set()) # final-answer positives - union_pos = _UNION.get(str(qid), set()) # reference "Recall" positives = gold ∪ evidence - started = time.perf_counter() - try: - result = retriever.search(query=query, max_documents=max_docs) - elapsed = time.perf_counter() - started - curated_docids = {d.id.split("__")[0] for d in result.documents} - pool_docids = set(result.pool_doc_ids) - recall = _recall(curated_docids, union_pos) # Recall (curated set) - trajectory_recall = _recall(pool_docids, union_pos) # Trajectory Recall (pool) - final_answer_recall = _recall(curated_docids, gold_pos) # Final-Answer Recall (curated vs gold) - precision = ( - len(curated_docids & union_pos) / len(curated_docids) if curated_docids else 0.0 - ) - return { - "query_id": qid, - "query": query, - "union_pos": sorted(union_pos), - "gold_pos": sorted(gold_pos), - "curated_docids": sorted(curated_docids), - "pool_docids": sorted(pool_docids), - "num_curated": len(curated_docids), - "n_pool": len(pool_docids), - 
"recall": recall, - "trajectory_recall": trajectory_recall, - "final_answer_recall": final_answer_recall, - "precision": precision, - "num_turns": result.num_turns, - "elapsed_s": round(elapsed, 2), - "error": None, - } - except Exception as exc: # noqa: BLE001 — record all failures so the bench keeps going - return { - "query_id": qid, - "query": query, - "union_pos": sorted(union_pos), - "gold_pos": sorted(gold_pos), - "curated_docids": [], - "pool_docids": [], - "num_curated": 0, - "n_pool": 0, - "recall": 0.0, - "trajectory_recall": 0.0, - "final_answer_recall": 0.0, - "precision": 0.0, - "num_turns": None, - "elapsed_s": round(time.perf_counter() - started, 2), - "error": f"{type(exc).__name__}: {exc}", - "traceback": traceback.format_exc(), - } - - -def main() -> int: - ap = argparse.ArgumentParser() - ap.add_argument("--n", type=int, default=83) - ap.add_argument("--seed", type=int, default=42) - ap.add_argument("--parallel", type=int, default=4) - ap.add_argument("--container", default="browsecomp_corpus_container") - ap.add_argument("--max-documents", type=int, default=30) - ap.add_argument("--output", required=True) - args = ap.parse_args() - - init_logging() - settings = RetrieverSettings() - print( - f"[bench] reranker={'ON' if settings.vllm_reranker_url else 'OFF'} " - f"vllm={settings.vllm_base_url} container={args.container} n={args.n} parallel={args.parallel}", - file=sys.stderr, - ) - - rows = load_dataset(args.n, args.seed) - retriever = CosmosRetriever(settings=settings, corpus_name=args.container) - - out = Path(args.output) - out.parent.mkdir(parents=True, exist_ok=True) - done = 0 - recall_sum = 0.0 - traj_sum = 0.0 - fa_sum = 0.0 - err_count = 0 - with out.open("w") as f, ThreadPoolExecutor(max_workers=args.parallel) as ex: - futures = {ex.submit(run_one, retriever, row, args.max_documents): row for row in rows} - for fut in as_completed(futures): - rec = fut.result() - f.write(json.dumps(rec) + "\n") - f.flush() - done += 1 - recall_sum += 
rec["recall"] - traj_sum += rec["trajectory_recall"] - fa_sum += rec["final_answer_recall"] - if rec["error"]: - err_count += 1 - print( - f"[bench] {done}/{len(rows)} qid={rec['query_id']} " - f"recall={rec['recall']:.2f} traj={rec['trajectory_recall']:.2f} fa={rec['final_answer_recall']:.2f} " - f"n_cur={rec['num_curated']} n_pool={rec['n_pool']} " - f"turns={rec['num_turns']} elapsed={rec['elapsed_s']}s " - f"err={'Y' if rec['error'] else 'N'}", - file=sys.stderr, - ) - - n = max(done, 1) - print( - f"[bench] DONE n={done} Recall={recall_sum/n:.3f} " - f"Trajectory={traj_sum/n:.3f} Final-Answer={fa_sum/n:.3f} errors={err_count}", - file=sys.stderr, - ) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/cosmos-retriever/scripts/bench_erag.py b/cosmos-retriever/scripts/bench_erag.py deleted file mode 100644 index 6b0a8b3..0000000 --- a/cosmos-retriever/scripts/bench_erag.py +++ /dev/null @@ -1,169 +0,0 @@ -"""Run an N-question slice of EnterpriseRAG-Bench (ERAG) through the standalone -retriever, score recall@curated against gold docs, and save per-query records -as JSONL. - -Mirrors ``bench_browsecomp.py`` but loads the ERAG questions parquet -(``question_id``, ``question``, ``expected_doc_ids``) instead of the BrowseComp -JSONL, and defaults to the ``enterprise_ragbench_corpus`` container. - -Usage:: - - python scripts/bench_erag.py \\ - --n 500 --seed 42 --parallel 4 \\ - --container enterprise_ragbench_corpus \\ - --output runs/bench_erag500.jsonl - -Budget / turn knobs are read from the environment (COSMOS_RETRIEVER_MAX_TURNS, -COSMOS_RETRIEVER_THRESHOLD_BUDGET, COSMOS_RETRIEVER_TOKEN_BUDGET) via RetrieverSettings. - -Records contain: query_id, query, gold_docids, retrieved_chunk_ids, -retrieved_docids (chunk_id.split('__')[0]), recall, precision, num_turns, -elapsed_s, error. 
-""" - -from __future__ import annotations - -import argparse -import json -import random -import sys -import time -import traceback -from concurrent.futures import ThreadPoolExecutor, as_completed -from pathlib import Path - -import pyarrow.parquet as pq - -from cosmos_retriever.config import RetrieverSettings, init_logging -from cosmos_retriever.retriever import CosmosRetriever - -DATASET = Path( - "/nvme/hf-cache/hub/datasets--onyx-dot-app--EnterpriseRAG-Bench/" - "snapshots/69916e31c68aa5963c00248fd7f0bc12d04fd235/data/questions/test.parquet" -) - - -def load_dataset(n: int, seed: int) -> list[dict]: - table = pq.read_table(DATASET, columns=["question_id", "question", "expected_doc_ids"]) - cols = table.to_pydict() - rows = [ - { - "query_id": qid, - "query": q, - "gold_docids": list(gold) if gold else [], - } - for qid, q, gold in zip( - cols["question_id"], cols["question"], cols["expected_doc_ids"], strict=True - ) - ] - rng = random.Random(seed) - rng.shuffle(rows) - return rows[:n] - - -def score(retrieved_chunk_ids: list[str], gold_docids: set[str]) -> tuple[float, float]: - if not gold_docids: - return 0.0, 0.0 - retrieved_docids = {cid.split("__")[0] for cid in retrieved_chunk_ids} - hit = retrieved_docids & gold_docids - recall = len(hit) / len(gold_docids) - precision = len(hit) / len(retrieved_docids) if retrieved_docids else 0.0 - return recall, precision - - -def run_one(retriever: CosmosRetriever, row: dict, max_docs: int) -> dict: - qid = row["query_id"] - query = row["query"] - gold_docids = set(row["gold_docids"]) - started = time.perf_counter() - try: - result = retriever.search(query=query, max_documents=max_docs) - elapsed = time.perf_counter() - started - retrieved = [d.id for d in result.documents] - recall, precision = score(retrieved, gold_docids) - return { - "query_id": qid, - "query": query, - "gold_docids": sorted(gold_docids), - "retrieved_chunk_ids": retrieved, - "retrieved_docids": sorted({c.split("__")[0] for c in retrieved}), - 
"num_curated": len(retrieved), - "recall": recall, - "precision": precision, - "num_turns": result.num_turns, - "elapsed_s": round(elapsed, 2), - "error": None, - } - except Exception as exc: # noqa: BLE001 — record all failures so the bench keeps going - return { - "query_id": qid, - "query": query, - "gold_docids": sorted(gold_docids), - "retrieved_chunk_ids": [], - "retrieved_docids": [], - "num_curated": 0, - "recall": 0.0, - "precision": 0.0, - "num_turns": None, - "elapsed_s": round(time.perf_counter() - started, 2), - "error": f"{type(exc).__name__}: {exc}", - "traceback": traceback.format_exc(), - } - - -def main() -> int: - ap = argparse.ArgumentParser() - ap.add_argument("--n", type=int, default=500) - ap.add_argument("--seed", type=int, default=42) - ap.add_argument("--parallel", type=int, default=4) - ap.add_argument("--container", default="enterprise_ragbench_corpus") - ap.add_argument("--max-documents", type=int, default=20) - ap.add_argument("--output", required=True) - args = ap.parse_args() - - init_logging() - settings = RetrieverSettings() - print( - f"[bench] reranker={'ON' if settings.vllm_reranker_url else 'OFF'} " - f"vllm={settings.vllm_base_url} container={args.container} n={args.n} " - f"parallel={args.parallel} max_turns={settings.cosmos_retriever_max_turns} " - f"threshold={settings.cosmos_retriever_threshold_budget} token={settings.cosmos_retriever_token_budget}", - file=sys.stderr, - ) - - rows = load_dataset(args.n, args.seed) - retriever = CosmosRetriever(settings=settings, corpus_name=args.container) - - out = Path(args.output) - out.parent.mkdir(parents=True, exist_ok=True) - done = 0 - recall_sum = 0.0 - err_count = 0 - with out.open("w") as f, ThreadPoolExecutor(max_workers=args.parallel) as ex: - futures = {ex.submit(run_one, retriever, row, args.max_documents): row for row in rows} - for fut in as_completed(futures): - rec = fut.result() - f.write(json.dumps(rec) + "\n") - f.flush() - done += 1 - recall_sum += rec["recall"] - 
if rec["error"]: - err_count += 1 - print( - f"[bench] {done}/{len(rows)} qid={rec['query_id']} " - f"recall={rec['recall']:.2f} n={rec['num_curated']} " - f"turns={rec['num_turns']} elapsed={rec['elapsed_s']}s " - f"err={'Y' if rec['error'] else 'N'}", - file=sys.stderr, - ) - - avg_recall = recall_sum / max(done, 1) - print( - f"[bench] DONE n={done} mean_recall={avg_recall:.3f} errors={err_count}", - file=sys.stderr, - ) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/cosmos-retriever/scripts/diagnose_qst_0099.py b/cosmos-retriever/scripts/diagnose_qst_0099.py deleted file mode 100644 index 800f000..0000000 --- a/cosmos-retriever/scripts/diagnose_qst_0099.py +++ /dev/null @@ -1,155 +0,0 @@ -"""Diagnose where the gold doc loses rank for qst_0099 in the ERAG corpus.""" - -from __future__ import annotations - -import os -import sys -import time - -os.environ.setdefault("CORPUS_REGISTRY_FILE", "/nvme/cosmos-retriever/corpus_registry.json") - -from cosmos_retriever.config import get_settings # noqa: E402 - -QUERY = ( - "What was the temporary mitigation applied to the internal load balancer " - "serving the gen-infer VIPs around 03:40 UTC that immediately reduced TCP " - "retransmits?" 
-) -GOLD = "dsid_fa2d9f0bda0e4d6b9174ae6b15f7b37e" - - -def with_retry(label, fn, attempts=4): - last = None - for i in range(attempts): - try: - return fn() - except Exception as e: # noqa: BLE001 - last = e - wait = 2**i - print( - f" [{label}] attempt {i + 1} failed: {type(e).__name__}: {str(e)[:120]} — retry in {wait}s" - ) - time.sleep(wait) - raise last # type: ignore[misc] - - -def main() -> int: - settings = get_settings() - corpus = settings.resolve_corpus("enterprise_ragbench_corpus") - print("=== corpus ===") - print(f" account_uri = {corpus.account_uri}") - print(f" database = {corpus.database}") - print(f" container = {corpus.container}") - print(f" embed_model = {corpus.embed_model} url={corpus.embed_base_url}") - print() - - db = settings.build_cosmos_database(corpus) - container = db.get_container_client(corpus.container) - oc = settings.build_openai_client(corpus) - - print("=== 1. gold-doc presence (partition-key lookup) ===") - rows = with_retry( - "presence", - lambda: list( - container.query_items( - query="SELECT TOP 5 c.id, c.docid, c.chunk_idx FROM c WHERE c.docid = @d", - parameters=[{"name": "@d", "value": GOLD}], - partition_key=GOLD, - ) - ), - ) - print(f" {len(rows)} chunks for {GOLD}:") - for r in rows: - print(f" id={r['id']} chunk_idx={r['chunk_idx']}") - if not rows: - print(" FATAL: gold doc not in container.") - return 1 - print() - - from cosmos_retriever.tools import _fts_literal_args, _query_with_retry, _tokenize_for_fts - - emb_text = QUERY - if corpus.embed_query_instruction: - emb_text = f"Instruct: {corpus.embed_query_instruction}\nQuery: {QUERY}" - emb = with_retry( - "embed", - lambda: oc.embeddings.create(model=corpus.embed_model, input=[emb_text]).data[0].embedding, - ) - print(f"=== 2. 
RRF top-50 (no rerank) — embed_dim={len(emb)} ===") - terms = _tokenize_for_fts(QUERY) or [QUERY] - sql = ( - "SELECT TOP @k c.id, c.docid, c.chunk_idx FROM c\n" - "ORDER BY RANK RRF(" - "VectorDistance(c.embedding, @qVec), " - f"FullTextScore(c.text, {_fts_literal_args(terms)})" - ")" - ) - rrf_rows = with_retry( - "rrf", - lambda: _query_with_retry( - container, - sql, - [{"name": "@k", "value": 50}, {"name": "@qVec", "value": emb}], - ), - ) - gold_rank = None - for rank, r in enumerate(rrf_rows, 1): - if r["docid"] == GOLD: - gold_rank = rank - break - print(f" pool size = {len(rrf_rows)} gold_rank = {gold_rank}") - print(" top-10 ids:") - for rank, r in enumerate(rrf_rows[:10], 1): - marker = " GOLD ✓" if r["docid"] == GOLD else "" - print(f" rank={rank:>2} {r['id']}{marker}") - print() - - if gold_rank is None: - print("Gold doc not in top-50. Retrieval itself is missing it.") - return 0 - - if not settings.vllm_reranker_url: - print("=== 3. (no VLLM_RERANKER_URL set, skipping rerank check) ===") - return 0 - - from cosmos_retriever.rerank import VLLMReranker - - reranker = VLLMReranker(base_url=settings.vllm_reranker_url) - print("=== 3. 
Qwen3-Reranker reordering of those 50 ===") - - docs: list[str] = [] - for r in rrf_rows: - text_rows = with_retry( - f"fetch_{r['id']}", - lambda r=r: list( - container.query_items( - query="SELECT TOP 1 c.text FROM c WHERE c.id = @i", - parameters=[{"name": "@i", "value": r["id"]}], - partition_key=r["docid"], - ) - ), - attempts=3, - ) - docs.append(text_rows[0]["text"] if text_rows else "") - - reranked = reranker(QUERY, docs) - new_gold_rank = None - for new_rank, rr in enumerate(reranked, 1): - if rrf_rows[rr.original_index]["docid"] == GOLD: - new_gold_rank = new_rank - print( - f" rerank position = {new_rank} (was {gold_rank}) score={rr.score:.4f} GOLD ✓" - ) - break - print(" top-5 after rerank:") - for new_rank, rr in enumerate(reranked[:5], 1): - rid = rrf_rows[rr.original_index]["id"] - marker = " GOLD ✓" if rrf_rows[rr.original_index]["docid"] == GOLD else "" - print(f" rank={new_rank:>2} score={rr.score:.4f} {rid}{marker}") - if new_gold_rank is None: - print(" GOLD ABSENT in reranked list — reranker scored other docs higher.") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/cosmos-retriever/scripts/erag_repeat.py b/cosmos-retriever/scripts/erag_repeat.py deleted file mode 100644 index 8d91ed1..0000000 --- a/cosmos-retriever/scripts/erag_repeat.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Run agentic search 3x against ERAG and check whether gold doc surfaces.""" - -from __future__ import annotations - -import json -import os -import subprocess -import sys - -REPO = "/nvme/cosmos-retriever" -QUERY = ( - "What was the temporary mitigation applied to the internal load balancer " - "serving the gen-infer VIPs around 03:40 UTC that immediately reduced TCP " - "retransmits?" 
-) -GOLD = "dsid_fa2d9f0bda0e4d6b9174ae6b15f7b37e" - -env = os.environ.copy() - - -def run_once(idx: int) -> None: - cmd = [ - f"{REPO}/.venv/bin/python", - "-m", - "cosmos_retriever", - "search", - "--container", - "enterprise_ragbench_corpus", - "--query", - QUERY, - "--max-documents", - "5", - ] - proc = subprocess.run(cmd, capture_output=True, text=True, env=env, check=False) - if proc.returncode != 0: - print(f"run {idx}: subprocess exit={proc.returncode}", file=sys.stderr) - print(proc.stderr[-2000:], file=sys.stderr) - return - data = json.loads(proc.stdout) - ids = [d["id"].split("__")[0] for d in data["documents"]] - print( - f"run {idx}: turns={data['num_turns']:>2} " - f"elapsed={data['elapsed_s']:>5.1f}s " - f"gold_hit={GOLD in ids:>5} " - f"ranked_ids={ids}" - ) - - -if __name__ == "__main__": - n = int(sys.argv[1]) if len(sys.argv) > 1 else 3 - for i in range(1, n + 1): - run_once(i) diff --git a/cosmos-retriever/scripts/run_with_upstream_env.sh b/cosmos-retriever/scripts/run_with_upstream_env.sh deleted file mode 100755 index 89cc953..0000000 --- a/cosmos-retriever/scripts/run_with_upstream_env.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# Bridge: source the upstream harness-1 .env.local and re-export the values our -# RetrieverSettings expects under its own variable names, then exec whatever -# command was passed on the command line. -# -# Maps: -# AZURE_OPENAI_EMBED_API_KEY -> AZURE_OPENAI_API_KEY (the embed-only key) -# AZURE_OPENAI_EMBED_DEPLOYMENT -> OPENAI_EMBEDDING_MODEL -# ACCOUNT_URI / COSMOS_DATABASE / COSMOS_CORPUS_CONTAINER -> passed through -# AZURE_OPENAI_ENDPOINT -> passed through -# -# Targets the live vLLM in the running pytorch container at 172.17.0.2:8000 -# (harness-1 model) and the matching reranker on :8011. -# -# Usage: scripts/run_with_upstream_env.sh python -m cosmos_retriever smoke --query "..." - -set -euo pipefail - -UPSTREAM_ENV="${UPSTREAM_ENV:-/nvme/harness-1/.env.local}" - -if [[ ! 
-r "${UPSTREAM_ENV}" ]]; then - echo "error: cannot read ${UPSTREAM_ENV}" >&2 - exit 1 -fi - -# shellcheck disable=SC1090 -set -a -source "${UPSTREAM_ENV}" -set +a - -# --- Map upstream var names to ours ----------------------------------------- -export OPENAI_EMBEDDING_MODEL="${AZURE_OPENAI_EMBED_DEPLOYMENT:-text-embedding-3-small}" -if [[ -n "${AZURE_OPENAI_EMBED_API_KEY:-}" ]]; then - # Our config reads AZURE_OPENAI_API_KEY for the embedding endpoint. - export AZURE_OPENAI_API_KEY="${AZURE_OPENAI_EMBED_API_KEY}" -fi - -# --- Point at the running vLLM in the pytorch container -------------------- -export VLLM_BASE_URL="${VLLM_BASE_URL:-http://172.17.0.2:8000}" -export VLLM_MODEL_NAME="${VLLM_MODEL_NAME:-harness-1}" -export VLLM_RERANKER_URL="${VLLM_RERANKER_URL:-http://172.17.0.2:8011}" - -# --- Sensible default timeouts / budgets so we don't wait forever ---------- -export VLLM_TIMEOUT_S="${VLLM_TIMEOUT_S:-600}" - -exec "$@" From 905acc027dfeadd3ed0d83ba87004f945301a73b Mon Sep 17 00:00:00 2001 From: Aryan Saboo Date: Tue, 30 Jun 2026 18:43:05 +0000 Subject: [PATCH 4/8] chore: remove generated datagen/splits folder --- .../datagen/splits/browsecompplus_splits.json | 846 --- .../datagen/splits/patents_splits.json | 3252 ------------ .../datagen/splits/sec_splits.json | 4685 ----------------- .../datagen/splits/summary.json | 34 - .../datagen/splits/web_splits.json | 2794 ---------- 5 files changed, 11611 deletions(-) delete mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/splits/browsecompplus_splits.json delete mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/splits/patents_splits.json delete mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/splits/sec_splits.json delete mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/splits/summary.json delete mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/splits/web_splits.json diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/splits/browsecompplus_splits.json 
b/cosmos-retriever/src/cosmos_retriever/datagen/splits/browsecompplus_splits.json deleted file mode 100644 index 98e17a6..0000000 --- a/cosmos-retriever/src/cosmos_retriever/datagen/splits/browsecompplus_splits.json +++ /dev/null @@ -1,846 +0,0 @@ -{ - "dataset": "browsecompplus", - "total_queries": 830, - "train_queries": 664, - "test_queries": 166, - "sft_queries": 199, - "rl_queries": 465, - "sft_ratio": 0.2996987951807229, - "rl_ratio": 0.7003012048192772, - "sft_query_ids": [ - "100", - "1010", - "1016", - "1022", - "1023", - "1026", - "1033", - "1046", - "1049", - "1055", - "1068", - "107", - "1071", - "1089", - "1090", - "1098", - "1108", - "1117", - "1118", - "1122", - "113", - "1134", - "1138", - "1139", - "1148", - "1158", - "116", - "1172", - "1176", - "1184", - "1188", - "120", - "1210", - "1218", - "122", - "1228", - "1233", - "1235", - "1237", - "124", - "1240", - "1243", - "1247", - "1249", - "125", - "1258", - "1260", - "1266", - "138", - "15", - "155", - "170", - "177", - "178", - "183", - "194", - "20", - "201", - "203", - "206", - "209", - "216", - "226", - "233", - "242", - "256", - "265", - "268", - "27", - "276", - "293", - "299", - "304", - "315", - "327", - "328", - "337", - "349", - "372", - "376", - "380", - "383", - "402", - "406", - "408", - "409", - "41", - "410", - "411", - "416", - "421", - "424", - "429", - "433", - "435", - "436", - "442", - "449", - "46", - "467", - "471", - "473", - "483", - "485", - "486", - "487", - "493", - "497", - "498", - "5", - "503", - "506", - "507", - "512", - "513", - "517", - "520", - "53", - "540", - "560", - "563", - "568", - "570", - "571", - "575", - "577", - "584", - "588", - "59", - "590", - "591", - "592", - "593", - "594", - "611", - "618", - "62", - "631", - "637", - "651", - "655", - "661", - "665", - "669", - "673", - "679", - "682", - "703", - "707", - "709", - "714", - "719", - "723", - "724", - "737", - "738", - "744", - "768", - "769", - "771", - "786", - "788", - "791", - "792", - 
"793", - "801", - "806", - "814", - "815", - "82", - "827", - "828", - "833", - "851", - "870", - "872", - "882", - "886", - "89", - "895", - "896", - "90", - "915", - "916", - "919", - "926", - "927", - "930", - "932", - "944", - "948", - "951", - "96", - "960", - "961", - "966", - "968", - "981", - "996" - ], - "rl_query_ids": [ - "1", - "10", - "1000", - "1002", - "1003", - "1005", - "1007", - "1008", - "1015", - "1018", - "1019", - "102", - "1020", - "1021", - "1025", - "1027", - "1028", - "103", - "1032", - "1035", - "1037", - "1038", - "1039", - "1040", - "1041", - "1042", - "1043", - "1044", - "1045", - "1047", - "1052", - "1057", - "1058", - "106", - "1060", - "1061", - "1062", - "1063", - "1066", - "1072", - "1073", - "1076", - "1078", - "1081", - "1082", - "1083", - "1091", - "1092", - "1093", - "1094", - "1095", - "1099", - "11", - "110", - "1101", - "1103", - "1105", - "1107", - "111", - "1119", - "1124", - "1126", - "1131", - "1133", - "1135", - "1142", - "1147", - "1150", - "1152", - "1153", - "1155", - "1161", - "1162", - "1163", - "1167", - "1169", - "1174", - "1177", - "1179", - "1182", - "1185", - "1187", - "119", - "1192", - "1193", - "1194", - "1196", - "1198", - "12", - "1200", - "1201", - "1203", - "1204", - "1206", - "1208", - "1209", - "1212", - "1214", - "1215", - "1219", - "1220", - "1221", - "1222", - "1223", - "1225", - "1226", - "1227", - "1230", - "1231", - "1232", - "1234", - "1236", - "1238", - "1239", - "1242", - "1246", - "1250", - "1252", - "1253", - "1254", - "1259", - "126", - "1262", - "1263", - "1264", - "1265", - "127", - "130", - "134", - "149", - "156", - "160", - "161", - "165", - "166", - "171", - "174", - "176", - "179", - "18", - "180", - "181", - "184", - "186", - "190", - "192", - "196", - "199", - "202", - "205", - "210", - "211", - "215", - "219", - "23", - "234", - "235", - "236", - "238", - "239", - "241", - "244", - "245", - "246", - "248", - "249", - "25", - "250", - "251", - "253", - "255", - "257", - "261", - 
"262", - "263", - "264", - "266", - "267", - "270", - "275", - "278", - "279", - "280", - "282", - "283", - "284", - "285", - "286", - "287", - "291", - "294", - "295", - "297", - "298", - "3", - "301", - "303", - "305", - "308", - "309", - "310", - "311", - "314", - "317", - "320", - "322", - "323", - "33", - "331", - "333", - "335", - "342", - "347", - "350", - "351", - "353", - "356", - "357", - "36", - "364", - "366", - "37", - "370", - "377", - "387", - "389", - "39", - "390", - "391", - "392", - "393", - "401", - "403", - "413", - "414", - "417", - "420", - "426", - "427", - "428", - "432", - "434", - "438", - "443", - "445", - "446", - "450", - "454", - "468", - "470", - "472", - "478", - "481", - "484", - "490", - "491", - "494", - "495", - "496", - "499", - "50", - "500", - "501", - "505", - "51", - "511", - "516", - "52", - "521", - "523", - "524", - "527", - "528", - "529", - "530", - "532", - "533", - "534", - "535", - "537", - "538", - "539", - "54", - "542", - "543", - "544", - "546", - "548", - "549", - "55", - "550", - "551", - "552", - "553", - "556", - "558", - "56", - "561", - "569", - "576", - "58", - "580", - "581", - "582", - "583", - "587", - "595", - "596", - "598", - "599", - "6", - "600", - "601", - "602", - "603", - "607", - "61", - "619", - "620", - "621", - "624", - "627", - "628", - "629", - "63", - "630", - "635", - "636", - "639", - "64", - "642", - "644", - "645", - "650", - "652", - "662", - "664", - "666", - "67", - "670", - "671", - "674", - "675", - "678", - "684", - "685", - "686", - "69", - "692", - "694", - "695", - "696", - "70", - "700", - "701", - "702", - "71", - "710", - "711", - "713", - "715", - "716", - "717", - "718", - "72", - "720", - "725", - "726", - "728", - "729", - "730", - "731", - "735", - "739", - "74", - "741", - "745", - "746", - "753", - "756", - "757", - "758", - "759", - "761", - "764", - "770", - "773", - "774", - "775", - "776", - "778", - "781", - "783", - "784", - "785", - "79", - "794", - "796", - 
"798", - "8", - "800", - "802", - "804", - "81", - "810", - "816", - "819", - "820", - "821", - "822", - "823", - "83", - "832", - "835", - "836", - "838", - "843", - "844", - "847", - "85", - "850", - "852", - "854", - "856", - "86", - "861", - "863", - "864", - "865", - "867", - "87", - "871", - "875", - "883", - "884", - "887", - "897", - "898", - "904", - "906", - "907", - "909", - "910", - "92", - "921", - "922", - "923", - "928", - "942", - "946", - "95", - "950", - "952", - "963", - "97", - "970", - "971", - "978", - "979", - "980", - "984", - "986", - "991", - "992", - "999" - ], - "test_query_ids": [ - "1004", - "1012", - "1029", - "1030", - "1034", - "1036", - "1048", - "105", - "1053", - "1065", - "1077", - "1079", - "1085", - "109", - "1096", - "1097", - "1106", - "1110", - "1111", - "1115", - "1121", - "1127", - "1128", - "1141", - "1144", - "1149", - "1164", - "1190", - "1191", - "1195", - "1207", - "121", - "1211", - "1213", - "1216", - "1217", - "1224", - "1248", - "1257", - "128", - "131", - "132", - "140", - "152", - "153", - "154", - "159", - "168", - "169", - "175", - "191", - "193", - "200", - "22", - "221", - "223", - "228", - "229", - "231", - "237", - "240", - "254", - "26", - "288", - "289", - "30", - "319", - "324", - "330", - "354", - "362", - "367", - "371", - "384", - "394", - "395", - "396", - "397", - "405", - "418", - "422", - "425", - "464", - "469", - "480", - "49", - "502", - "509", - "514", - "515", - "519", - "525", - "555", - "562", - "572", - "579", - "60", - "605", - "610", - "614", - "625", - "632", - "633", - "638", - "643", - "653", - "672", - "68", - "681", - "688", - "689", - "7", - "706", - "708", - "712", - "722", - "732", - "734", - "742", - "747", - "749", - "751", - "754", - "760", - "763", - "772", - "78", - "787", - "790", - "797", - "805", - "809", - "811", - "826", - "830", - "834", - "840", - "853", - "869", - "873", - "876", - "88", - "885", - "893", - "894", - "899", - "905", - "912", - "920", - "925", - 
"93", - "934", - "936", - "941", - "943", - "947", - "959", - "962", - "969", - "972", - "976", - "98", - "983", - "985", - "987", - "998" - ] -} \ No newline at end of file diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/splits/patents_splits.json b/cosmos-retriever/src/cosmos_retriever/datagen/splits/patents_splits.json deleted file mode 100644 index 119af23..0000000 --- a/cosmos-retriever/src/cosmos_retriever/datagen/splits/patents_splits.json +++ /dev/null @@ -1,3252 +0,0 @@ -{ - "dataset": "patents", - "total_queries": 3107, - "train_queries": 2518, - "test_queries": 718, - "sft_queries": 755, - "rl_queries": 1763, - "sft_ratio": 0.2998411437648928, - "rl_ratio": 0.7001588562351072, - "sft_query_ids": [ - "0_2", - "0_5", - "0_9", - "100_2", - "100_4", - "100_5", - "100_6", - "100_7", - "101_2", - "101_5", - "102_1", - "102_2", - "102_3", - "103_1", - "103_2", - "103_4", - "103_9", - "104_3", - "104_8", - "106_9", - "107_1", - "107_3", - "112_0", - "112_1", - "116_10", - "116_12", - "11_0", - "11_13", - "11_8", - "122_10", - "122_8", - "123_0", - "123_1", - "123_5", - "123_6", - "125_0", - "129_0", - "130_1", - "130_12", - "130_4", - "130_6", - "130_7", - "130_9", - "131_2", - "131_3", - "143_0", - "143_1", - "143_10", - "144_1", - "144_10", - "144_13", - "144_2", - "144_5", - "144_6", - "144_7", - "144_8", - "146_11", - "146_6", - "146_9", - "151_11", - "151_14", - "151_16", - "152_0", - "152_10", - "152_11", - "152_13", - "152_14", - "152_17", - "152_2", - "152_20", - "152_26", - "152_27", - "152_9", - "154_11", - "156_2", - "156_3", - "157_3", - "158_4", - "158_6", - "159_0", - "159_1", - "159_10", - "159_22", - "159_23", - "159_7", - "159_9", - "163_12", - "163_3", - "163_4", - "164_11", - "164_12", - "164_15", - "164_18", - "164_7", - "164_8", - "166_7", - "166_8", - "170_8", - "181_15", - "181_3", - "181_4", - "181_8", - "183_2", - "183_5", - "183_6", - "184_2", - "184_3", - "184_4", - "184_5", - "184_7", - "184_8", - "186_1", - "186_12", - 
"186_17", - "186_5", - "186_7", - "187_10", - "187_19", - "187_3", - "187_4", - "187_5", - "187_9", - "189_2", - "189_5", - "189_8", - "191_13", - "191_16", - "191_2", - "191_26", - "193_0", - "193_12", - "193_16", - "193_20", - "193_4", - "193_5", - "193_9", - "198_9", - "199_1", - "199_14", - "199_17", - "199_6", - "199_7", - "19_14", - "19_20", - "19_21", - "19_23", - "19_28", - "1_1", - "1_3", - "203_12", - "203_5", - "203_8", - "204_0", - "204_24", - "204_25", - "204_28", - "204_3", - "204_7", - "205_11", - "205_13", - "205_15", - "205_19", - "206_0", - "206_1", - "206_12", - "206_2", - "206_4", - "206_9", - "207_2", - "207_5", - "209_1", - "211_0", - "211_1", - "211_2", - "211_3", - "213_10", - "213_11", - "213_2", - "213_3", - "213_4", - "213_9", - "215_1", - "215_19", - "215_24", - "215_25", - "215_26", - "215_3", - "215_4", - "215_8", - "217_1", - "21_27", - "21_7", - "220_1", - "220_11", - "220_15", - "220_17", - "220_18", - "220_26", - "222_10", - "223_3", - "224_13", - "224_14", - "224_23", - "225_0", - "22_11", - "22_12", - "22_2", - "22_3", - "22_5", - "22_6", - "22_8", - "230_0", - "230_2", - "231_10", - "231_14", - "231_17", - "231_21", - "231_4", - "231_7", - "231_8", - "232_3", - "233_5", - "233_8", - "234_4", - "235_4", - "235_6", - "236_0", - "236_2", - "236_4", - "236_6", - "237_10", - "237_15", - "237_8", - "239_2", - "239_5", - "240_0", - "240_1", - "240_12", - "240_3", - "240_4", - "240_5", - "240_6", - "240_7", - "241_0", - "241_1", - "241_4", - "241_6", - "243_0", - "243_11", - "243_16", - "243_18", - "243_9", - "246_0", - "246_1", - "246_5", - "248_6", - "249_11", - "249_13", - "249_2", - "249_4", - "249_6", - "249_7", - "249_9", - "24_11", - "24_3", - "250_13", - "250_18", - "250_22", - "250_24", - "250_25", - "250_28", - "250_4", - "252_0", - "252_1", - "252_12", - "252_19", - "252_3", - "252_4", - "253_0", - "253_1", - "254_1", - "254_13", - "254_14", - "254_3", - "254_4", - "257_0", - "257_12", - "257_2", - "259_0", - "262_11", - 
"262_7", - "265_0", - "266_1", - "267_0", - "267_1", - "267_4", - "267_9", - "269_0", - "269_4", - "269_5", - "273_0", - "273_1", - "276_0", - "276_1", - "278_6", - "278_7", - "278_8", - "279_12", - "279_13", - "27_10", - "281_2", - "281_3", - "281_4", - "286_1", - "286_11", - "286_2", - "288_16", - "288_5", - "28_18", - "28_19", - "291_0", - "291_1", - "291_2", - "291_3", - "291_6", - "291_7", - "298_12", - "298_3", - "298_9", - "299_5", - "299_6", - "299_7", - "299_8", - "299_9", - "2_0", - "2_1", - "300_1", - "300_10", - "300_17", - "300_7", - "301_0", - "301_10", - "301_11", - "301_13", - "301_15", - "301_17", - "301_4", - "301_8", - "302_12", - "302_18", - "302_19", - "302_2", - "302_7", - "302_8", - "303_1", - "303_13", - "303_14", - "303_2", - "303_6", - "303_9", - "305_11", - "305_12", - "305_13", - "305_15", - "305_17", - "305_3", - "305_4", - "305_8", - "308_0", - "310_15", - "310_17", - "310_2", - "311_3", - "311_7", - "317_0", - "318_1", - "318_12", - "318_17", - "318_8", - "319_2", - "31_12", - "31_15", - "31_2", - "31_5", - "31_8", - "323_0", - "323_12", - "323_2", - "323_6", - "323_8", - "323_9", - "324_17", - "326_5", - "327_1", - "327_3", - "331_12", - "331_24", - "332_0", - "332_1", - "332_5", - "335_1", - "335_6", - "335_9", - "336_12", - "336_7", - "337_1", - "337_11", - "339_17", - "339_4", - "341_16", - "341_9", - "344_0", - "344_1", - "345_1", - "348_2", - "355_2", - "356_14", - "356_16", - "356_20", - "356_21", - "356_23", - "356_24", - "356_26", - "356_8", - "361_14", - "361_3", - "361_4", - "363_12", - "363_15", - "363_2", - "363_3", - "363_6", - "363_7", - "364_11", - "364_13", - "364_5", - "366_0", - "366_3", - "372_11", - "372_14", - "372_8", - "373_11", - "373_14", - "373_15", - "373_3", - "373_4", - "373_6", - "373_7", - "373_8", - "374_0", - "374_2", - "374_4", - "374_6", - "381_2", - "388_0", - "388_11", - "388_13", - "388_14", - "388_5", - "389_6", - "395_18", - "397_0", - "398_0", - "398_5", - "398_7", - "399_14", - "399_15", - 
"399_16", - "39_15", - "39_18", - "39_19", - "39_2", - "39_7", - "402_6", - "406_0", - "406_11", - "406_4", - "406_8", - "407_16", - "407_6", - "407_8", - "407_9", - "411_0", - "411_6", - "412_1", - "412_12", - "412_15", - "412_19", - "412_2", - "412_3", - "412_8", - "413_3", - "413_6", - "413_7", - "416_3", - "416_4", - "420_0", - "420_11", - "420_2", - "420_6", - "420_7", - "420_8", - "421_16", - "421_2", - "421_21", - "421_5", - "422_1", - "422_11", - "422_12", - "422_15", - "422_16", - "422_17", - "422_23", - "422_24", - "422_4", - "422_6", - "425_13", - "425_17", - "425_21", - "425_22", - "425_26", - "425_27", - "425_28", - "426_10", - "426_11", - "426_21", - "426_25", - "438_0", - "438_2", - "43_1", - "43_2", - "43_3", - "43_4", - "440_9", - "445_3", - "445_4", - "448_15", - "448_18", - "448_2", - "448_4", - "448_8", - "449_1", - "452_10", - "452_12", - "452_14", - "452_3", - "452_4", - "452_5", - "452_8", - "453_14", - "453_9", - "454_2", - "454_3", - "454_5", - "456_13", - "459_33", - "459_36", - "460_0", - "460_12", - "460_14", - "460_17", - "460_23", - "460_29", - "460_3", - "460_4", - "460_7", - "460_8", - "460_9", - "465_9", - "467_4", - "469_9", - "470_0", - "470_3", - "471_15", - "471_16", - "471_4", - "471_7", - "471_9", - "472_0", - "472_11", - "473_0", - "474_1", - "482_12", - "482_15", - "482_17", - "482_5", - "482_7", - "485_0", - "485_5", - "485_6", - "486_15", - "486_16", - "486_3", - "486_7", - "488_1", - "488_4", - "493_0", - "493_11", - "493_4", - "493_5", - "493_7", - "494_0", - "494_2", - "494_4", - "494_9", - "499_1", - "499_6", - "499_8", - "499_9", - "49_15", - "49_19", - "49_2", - "49_6", - "49_7", - "49_9", - "508_2", - "514_0", - "514_1", - "514_14", - "514_4", - "514_5", - "515_11", - "515_12", - "515_13", - "515_5", - "515_9", - "517_0", - "517_1", - "518_12", - "518_17", - "518_35", - "518_37", - "518_39", - "518_42", - "518_43", - "518_7", - "518_9", - "521_0", - "521_4", - "521_5", - "521_9", - "52_16", - "530_1", - "530_2", - 
"530_5", - "530_8", - "530_9", - "54_0", - "55_1", - "55_15", - "55_20", - "55_21", - "55_4", - "58_3", - "5_0", - "5_1", - "5_10", - "5_14", - "5_3", - "5_5", - "5_7", - "5_8", - "60_16", - "60_24", - "60_25", - "60_27", - "60_6", - "60_9", - "62_1", - "62_13", - "62_14", - "65_0", - "65_12", - "65_13", - "65_3", - "65_4", - "65_8", - "66_0", - "66_14", - "66_17", - "66_18", - "68_10", - "68_13", - "68_14", - "68_7", - "68_8", - "69_1", - "69_10", - "69_2", - "69_3", - "69_4", - "69_6", - "70_1", - "70_6", - "70_8", - "70_9", - "76_1", - "76_13", - "76_20", - "76_6", - "78_1", - "78_16", - "78_18", - "78_19", - "78_22", - "78_24", - "78_26", - "78_6", - "78_7", - "78_8", - "78_9", - "81_17", - "82_2", - "82_4", - "83_1", - "83_10", - "83_11", - "83_2", - "83_3", - "83_4", - "83_5", - "83_9", - "84_1", - "85_1", - "85_13", - "85_14", - "85_15", - "85_16", - "85_17", - "85_19", - "85_20", - "85_3", - "85_7", - "86_0", - "86_13", - "86_17", - "86_20", - "86_22", - "86_23", - "86_3", - "87_0", - "87_13", - "87_16", - "87_5", - "87_9", - "90_5", - "92_10", - "92_3", - "98_0", - "98_1", - "98_13", - "98_15", - "98_16", - "98_5", - "98_6" - ], - "rl_query_ids": [ - "0_0", - "0_1", - "0_3", - "0_4", - "0_6", - "0_7", - "0_8", - "100_1", - "100_13", - "100_3", - "100_8", - "100_9", - "101_0", - "101_1", - "101_10", - "101_3", - "101_4", - "101_6", - "101_9", - "102_0", - "103_0", - "103_10", - "103_3", - "103_5", - "103_6", - "103_7", - "103_8", - "104_2", - "104_4", - "104_5", - "104_6", - "104_7", - "105_1", - "105_4", - "106_10", - "106_11", - "106_6", - "107_0", - "107_4", - "112_2", - "113_17", - "113_18", - "115_0", - "115_1", - "116_14", - "116_8", - "117_3", - "117_4", - "118_1", - "11_1", - "11_10", - "11_14", - "11_15", - "11_16", - "11_2", - "11_3", - "11_4", - "11_5", - "11_6", - "11_7", - "11_9", - "122_9", - "123_2", - "123_3", - "123_4", - "123_7", - "124_10", - "125_1", - "129_1", - "129_3", - "129_4", - "129_7", - "129_8", - "130_0", - "130_10", - 
"130_11", - "130_13", - "130_2", - "130_3", - "130_5", - "130_8", - "131_0", - "131_1", - "134_0", - "134_1", - "134_2", - "136_4", - "136_5", - "136_6", - "137_8", - "139_0", - "139_1", - "143_11", - "143_12", - "143_13", - "143_14", - "143_15", - "143_2", - "143_3", - "143_4", - "143_5", - "143_6", - "143_7", - "143_8", - "143_9", - "144_0", - "144_11", - "144_12", - "144_14", - "144_3", - "144_4", - "144_9", - "146_1", - "146_10", - "146_12", - "146_13", - "146_14", - "146_15", - "146_16", - "146_17", - "146_2", - "146_3", - "146_4", - "146_5", - "146_7", - "146_8", - "14_0", - "150_1", - "150_13", - "150_14", - "150_15", - "150_16", - "150_2", - "150_3", - "150_4", - "151_0", - "151_1", - "151_10", - "151_12", - "151_13", - "151_15", - "151_17", - "151_18", - "151_2", - "151_3", - "151_4", - "151_5", - "151_6", - "151_7", - "151_8", - "151_9", - "152_1", - "152_12", - "152_15", - "152_16", - "152_18", - "152_19", - "152_21", - "152_22", - "152_23", - "152_24", - "152_25", - "152_28", - "152_29", - "152_3", - "152_4", - "152_5", - "152_6", - "152_7", - "152_8", - "153_0", - "154_1", - "154_12", - "154_2", - "156_0", - "156_1", - "156_4", - "157_12", - "157_13", - "157_15", - "157_17", - "157_4", - "157_6", - "157_7", - "157_8", - "158_0", - "158_1", - "158_10", - "158_2", - "158_3", - "158_5", - "158_7", - "158_8", - "158_9", - "159_11", - "159_12", - "159_13", - "159_14", - "159_17", - "159_18", - "159_2", - "159_21", - "159_24", - "159_3", - "159_4", - "159_5", - "159_6", - "159_8", - "163_0", - "163_1", - "163_10", - "163_11", - "163_13", - "163_14", - "163_15", - "163_16", - "163_2", - "163_5", - "163_6", - "163_7", - "163_8", - "163_9", - "164_0", - "164_1", - "164_13", - "164_14", - "164_16", - "164_17", - "164_19", - "164_2", - "164_20", - "164_21", - "164_22", - "164_23", - "164_3", - "164_4", - "164_5", - "164_6", - "164_9", - "166_10", - "166_12", - "166_13", - "166_4", - "166_5", - "166_6", - "166_9", - "170_16", - "170_17", - "170_5", - "170_7", - 
"176_1", - "181_0", - "181_1", - "181_10", - "181_11", - "181_12", - "181_13", - "181_14", - "181_16", - "181_2", - "181_5", - "181_6", - "181_7", - "181_9", - "183_0", - "183_1", - "183_3", - "183_4", - "184_0", - "184_1", - "184_10", - "184_11", - "184_12", - "184_13", - "184_14", - "184_15", - "184_6", - "184_9", - "186_0", - "186_10", - "186_11", - "186_13", - "186_15", - "186_2", - "186_4", - "186_8", - "186_9", - "187_0", - "187_1", - "187_11", - "187_12", - "187_2", - "187_6", - "187_7", - "187_8", - "188_0", - "188_5", - "188_6", - "188_8", - "188_9", - "189_0", - "189_1", - "189_3", - "189_4", - "189_6", - "189_7", - "191_0", - "191_1", - "191_10", - "191_11", - "191_12", - "191_14", - "191_15", - "191_17", - "191_18", - "191_19", - "191_20", - "191_21", - "191_22", - "191_23", - "191_24", - "191_25", - "191_3", - "191_4", - "191_5", - "191_6", - "191_7", - "191_8", - "191_9", - "193_1", - "193_10", - "193_11", - "193_13", - "193_14", - "193_15", - "193_17", - "193_18", - "193_19", - "193_2", - "193_21", - "193_22", - "193_23", - "193_24", - "193_25", - "193_26", - "193_3", - "193_6", - "193_7", - "193_8", - "195_12", - "195_13", - "195_14", - "198_10", - "198_3", - "198_4", - "198_5", - "198_6", - "198_7", - "199_0", - "199_15", - "199_16", - "199_18", - "199_19", - "199_2", - "199_3", - "199_4", - "199_5", - "199_8", - "199_9", - "19_11", - "19_12", - "19_13", - "19_15", - "19_17", - "19_18", - "19_19", - "19_22", - "19_24", - "19_25", - "1_0", - "1_4", - "203_0", - "203_1", - "203_10", - "203_11", - "203_14", - "203_3", - "203_4", - "203_6", - "203_7", - "203_9", - "204_1", - "204_10", - "204_11", - "204_12", - "204_13", - "204_14", - "204_15", - "204_16", - "204_2", - "204_22", - "204_23", - "204_26", - "204_27", - "204_4", - "204_5", - "204_6", - "204_8", - "204_9", - "205_12", - "205_14", - "205_18", - "205_20", - "205_21", - "205_23", - "206_10", - "206_11", - "206_13", - "206_14", - "206_3", - "206_5", - "206_6", - "206_7", - "206_8", - "207_3", - 
"208_1", - "209_0", - "209_2", - "209_3", - "209_4", - "211_4", - "211_5", - "212_9", - "213_0", - "213_1", - "213_12", - "213_13", - "213_14", - "213_5", - "213_6", - "213_7", - "213_8", - "215_0", - "215_10", - "215_11", - "215_12", - "215_13", - "215_14", - "215_15", - "215_16", - "215_17", - "215_18", - "215_2", - "215_20", - "215_21", - "215_22", - "215_23", - "215_5", - "215_6", - "215_7", - "215_9", - "217_0", - "21_22", - "21_25", - "21_26", - "21_29", - "21_8", - "220_0", - "220_10", - "220_12", - "220_13", - "220_14", - "220_16", - "220_19", - "220_2", - "220_20", - "220_21", - "220_22", - "220_23", - "220_24", - "220_25", - "220_27", - "220_28", - "220_29", - "220_3", - "220_4", - "220_5", - "220_6", - "220_7", - "220_8", - "220_9", - "222_1", - "222_2", - "222_6", - "222_8", - "223_0", - "223_1", - "223_2", - "223_4", - "223_5", - "223_6", - "224_15", - "224_21", - "224_22", - "227_0", - "227_1", - "227_2", - "227_3", - "227_4", - "227_5", - "227_6", - "227_7", - "227_8", - "227_9", - "229_0", - "22_14", - "22_16", - "22_7", - "230_1", - "230_3", - "230_4", - "230_6", - "231_0", - "231_1", - "231_11", - "231_12", - "231_13", - "231_15", - "231_16", - "231_18", - "231_19", - "231_2", - "231_20", - "231_22", - "231_23", - "231_24", - "231_25", - "231_26", - "231_3", - "231_5", - "231_6", - "231_9", - "232_0", - "232_1", - "232_2", - "232_4", - "233_4", - "233_6", - "233_7", - "235_0", - "235_1", - "235_2", - "235_3", - "235_5", - "236_1", - "236_3", - "236_5", - "236_7", - "236_8", - "236_9", - "237_0", - "237_1", - "237_11", - "237_12", - "237_13", - "237_14", - "237_16", - "237_17", - "237_18", - "237_19", - "237_2", - "237_3", - "237_4", - "237_5", - "237_6", - "237_7", - "237_9", - "239_0", - "239_1", - "239_4", - "240_10", - "240_11", - "240_13", - "240_14", - "240_2", - "240_8", - "240_9", - "241_2", - "241_3", - "241_5", - "243_1", - "243_10", - "243_12", - "243_13", - "243_15", - "243_17", - "243_19", - "243_2", - "243_3", - "243_4", - "243_5", - 
"243_6", - "243_7", - "243_8", - "246_2", - "246_3", - "248_4", - "248_5", - "248_7", - "248_8", - "248_9", - "249_0", - "249_1", - "249_10", - "249_12", - "249_14", - "249_15", - "249_17", - "249_18", - "249_19", - "249_3", - "249_5", - "249_8", - "24_0", - "24_1", - "24_10", - "24_12", - "24_2", - "24_4", - "24_5", - "24_6", - "24_7", - "24_8", - "24_9", - "250_10", - "250_11", - "250_12", - "250_14", - "250_15", - "250_16", - "250_17", - "250_19", - "250_20", - "250_21", - "250_23", - "250_26", - "250_27", - "250_29", - "250_3", - "250_7", - "250_8", - "250_9", - "252_10", - "252_11", - "252_13", - "252_14", - "252_2", - "252_5", - "252_6", - "252_7", - "252_8", - "252_9", - "253_2", - "254_10", - "254_12", - "254_15", - "254_16", - "254_18", - "254_19", - "254_20", - "254_21", - "254_22", - "254_24", - "254_25", - "254_26", - "254_5", - "254_6", - "254_7", - "254_8", - "254_9", - "257_1", - "257_11", - "257_17", - "257_4", - "257_8", - "259_1", - "262_0", - "262_1", - "262_10", - "262_16", - "262_17", - "262_2", - "262_3", - "262_4", - "262_5", - "262_6", - "262_8", - "262_9", - "267_10", - "267_11", - "267_2", - "267_3", - "267_5", - "267_6", - "267_7", - "267_8", - "269_1", - "269_2", - "269_3", - "269_6", - "273_2", - "275_22", - "276_2", - "278_0", - "278_1", - "278_10", - "278_11", - "278_2", - "278_3", - "278_4", - "278_5", - "278_9", - "279_10", - "279_3", - "279_4", - "27_0", - "27_1", - "27_11", - "27_12", - "27_2", - "27_3", - "27_4", - "27_8", - "27_9", - "280_0", - "281_0", - "281_1", - "286_0", - "286_10", - "286_7", - "286_8", - "286_9", - "288_1", - "288_12", - "288_13", - "288_15", - "288_2", - "288_3", - "288_4", - "288_6", - "291_11", - "291_12", - "291_25", - "291_26", - "291_4", - "291_5", - "291_8", - "291_9", - "294_5", - "294_6", - "298_0", - "298_1", - "298_10", - "298_11", - "298_14", - "298_2", - "298_4", - "298_5", - "298_6", - "298_7", - "298_8", - "299_0", - "299_1", - "299_2", - "299_3", - "299_4", - "2_2", - "2_3", - "2_4", - 
"2_5", - "300_0", - "300_11", - "300_12", - "300_14", - "300_15", - "300_16", - "300_2", - "300_3", - "300_4", - "300_5", - "300_6", - "300_8", - "300_9", - "301_1", - "301_12", - "301_14", - "301_16", - "301_2", - "301_3", - "301_5", - "301_6", - "301_7", - "301_9", - "302_0", - "302_1", - "302_10", - "302_11", - "302_13", - "302_14", - "302_15", - "302_16", - "302_17", - "302_3", - "302_4", - "302_5", - "302_6", - "302_9", - "303_0", - "303_10", - "303_11", - "303_12", - "303_15", - "303_16", - "303_17", - "303_18", - "303_19", - "303_3", - "303_4", - "303_5", - "303_7", - "303_8", - "304_10", - "304_11", - "305_0", - "305_1", - "305_10", - "305_14", - "305_16", - "305_18", - "305_19", - "305_2", - "305_5", - "305_6", - "305_7", - "305_9", - "307_8", - "308_1", - "308_2", - "310_16", - "311_0", - "311_1", - "311_16", - "311_2", - "311_4", - "311_5", - "311_6", - "311_8", - "311_9", - "314_4", - "316_0", - "317_1", - "317_2", - "317_3", - "318_0", - "318_10", - "318_11", - "318_13", - "318_14", - "318_15", - "318_16", - "318_18", - "318_19", - "318_2", - "318_3", - "318_4", - "318_5", - "318_6", - "318_7", - "318_9", - "319_3", - "319_4", - "319_5", - "31_0", - "31_1", - "31_10", - "31_11", - "31_13", - "31_14", - "31_16", - "31_17", - "31_18", - "31_19", - "31_4", - "31_6", - "31_7", - "31_9", - "322_10", - "322_6", - "322_7", - "322_8", - "323_1", - "323_10", - "323_11", - "323_3", - "323_4", - "323_5", - "323_7", - "324_16", - "324_18", - "327_0", - "327_2", - "331_0", - "331_1", - "331_10", - "331_11", - "331_13", - "331_14", - "331_15", - "331_16", - "331_17", - "331_18", - "331_19", - "331_2", - "331_20", - "331_21", - "331_22", - "331_23", - "331_3", - "331_4", - "331_5", - "331_6", - "331_7", - "331_8", - "331_9", - "332_2", - "332_3", - "332_4", - "335_10", - "335_11", - "335_12", - "335_16", - "335_17", - "335_19", - "335_2", - "335_3", - "335_4", - "335_5", - "335_7", - "335_8", - "336_0", - "336_1", - "336_10", - "336_11", - "336_13", - "336_14", - 
"336_15", - "336_16", - "336_17", - "336_18", - "336_19", - "336_20", - "336_21", - "336_22", - "336_4", - "336_5", - "336_6", - "336_8", - "336_9", - "337_0", - "337_12", - "337_2", - "337_3", - "337_4", - "337_5", - "337_6", - "337_7", - "337_8", - "339_13", - "339_14", - "339_15", - "339_5", - "339_6", - "339_8", - "341_0", - "341_1", - "341_10", - "341_11", - "341_12", - "341_13", - "341_14", - "341_15", - "341_17", - "341_18", - "341_2", - "341_3", - "341_4", - "341_5", - "341_6", - "341_7", - "341_8", - "343_1", - "344_2", - "345_0", - "345_2", - "345_3", - "345_4", - "352_13", - "355_0", - "355_1", - "355_3", - "355_4", - "355_5", - "355_6", - "355_7", - "355_8", - "355_9", - "356_0", - "356_1", - "356_10", - "356_11", - "356_12", - "356_13", - "356_15", - "356_19", - "356_2", - "356_22", - "356_25", - "356_27", - "356_3", - "356_33", - "356_4", - "356_5", - "356_6", - "356_7", - "356_9", - "360_2", - "361_0", - "361_1", - "361_10", - "361_12", - "361_13", - "361_2", - "361_5", - "361_6", - "361_7", - "361_8", - "361_9", - "363_0", - "363_1", - "363_10", - "363_11", - "363_13", - "363_14", - "363_16", - "363_17", - "363_18", - "363_19", - "363_4", - "363_5", - "363_8", - "363_9", - "364_12", - "364_2", - "364_4", - "364_6", - "364_9", - "366_1", - "366_2", - "370_7", - "370_8", - "372_12", - "372_15", - "373_0", - "373_1", - "373_10", - "373_12", - "373_13", - "373_2", - "373_5", - "373_9", - "374_1", - "374_3", - "374_5", - "374_7", - "374_8", - "377_0", - "377_1", - "381_0", - "381_1", - "381_4", - "384_0", - "384_1", - "384_2", - "384_3", - "384_4", - "384_5", - "384_6", - "384_7", - "385_0", - "385_1", - "385_2", - "385_3", - "388_1", - "388_10", - "388_12", - "388_15", - "388_16", - "388_2", - "388_3", - "388_4", - "388_6", - "388_7", - "388_8", - "388_9", - "38_0", - "38_1", - "392_0", - "392_1", - "392_2", - "392_3", - "392_4", - "392_5", - "392_6", - "392_7", - "394_0", - "394_1", - "394_2", - "395_15", - "397_2", - "398_1", - "398_2", - "398_3", - 
"398_4", - "398_6", - "399_10", - "399_11", - "399_12", - "399_13", - "399_17", - "399_18", - "399_19", - "39_0", - "39_1", - "39_13", - "39_14", - "39_3", - "39_5", - "39_8", - "400_12", - "402_5", - "406_1", - "406_10", - "406_12", - "406_13", - "406_14", - "406_2", - "406_3", - "406_5", - "406_6", - "406_7", - "406_9", - "407_10", - "407_11", - "407_12", - "407_15", - "407_17", - "407_5", - "407_7", - "408_0", - "408_1", - "411_1", - "411_2", - "411_3", - "411_4", - "411_5", - "412_0", - "412_10", - "412_11", - "412_13", - "412_14", - "412_16", - "412_17", - "412_18", - "412_4", - "412_5", - "412_6", - "412_7", - "412_9", - "413_0", - "413_1", - "413_2", - "413_4", - "413_5", - "413_8", - "416_0", - "416_1", - "416_2", - "41_11", - "420_1", - "420_10", - "420_12", - "420_13", - "420_14", - "420_15", - "420_16", - "420_17", - "420_18", - "420_19", - "420_3", - "420_4", - "420_5", - "420_9", - "421_0", - "421_1", - "421_10", - "421_11", - "421_12", - "421_13", - "421_14", - "421_15", - "421_17", - "421_18", - "421_19", - "421_20", - "421_22", - "421_23", - "421_3", - "421_4", - "421_6", - "421_7", - "421_8", - "421_9", - "422_0", - "422_10", - "422_13", - "422_14", - "422_18", - "422_19", - "422_2", - "422_20", - "422_21", - "422_22", - "422_25", - "422_3", - "422_5", - "422_7", - "422_8", - "422_9", - "424_1", - "425_0", - "425_1", - "425_14", - "425_15", - "425_16", - "425_18", - "425_19", - "425_2", - "425_20", - "425_23", - "425_24", - "425_25", - "425_3", - "425_4", - "425_7", - "426_12", - "426_13", - "426_15", - "426_17", - "426_18", - "426_22", - "426_24", - "426_29", - "426_4", - "426_6", - "438_1", - "43_5", - "43_6", - "43_7", - "43_8", - "43_9", - "440_10", - "445_1", - "445_2", - "445_6", - "445_7", - "445_8", - "448_12", - "448_13", - "448_14", - "448_3", - "448_5", - "449_0", - "449_2", - "449_3", - "449_4", - "449_5", - "449_6", - "449_7", - "449_8", - "449_9", - "452_0", - "452_1", - "452_11", - "452_13", - "452_15", - "452_16", - "452_17", - 
"452_18", - "452_19", - "452_2", - "452_6", - "452_7", - "452_9", - "453_0", - "453_1", - "453_10", - "453_11", - "453_13", - "453_15", - "453_16", - "453_17", - "453_2", - "453_3", - "453_4", - "453_5", - "453_6", - "453_7", - "453_8", - "454_0", - "454_1", - "455_16", - "455_18", - "459_32", - "459_34", - "459_35", - "459_37", - "459_38", - "459_39", - "459_40", - "460_1", - "460_10", - "460_11", - "460_13", - "460_15", - "460_16", - "460_18", - "460_19", - "460_2", - "460_20", - "460_21", - "460_22", - "460_24", - "460_25", - "460_26", - "460_27", - "460_28", - "460_5", - "460_6", - "465_12", - "465_5", - "467_13", - "467_5", - "467_6", - "467_7", - "467_8", - "469_1", - "469_10", - "469_11", - "469_12", - "469_13", - "469_14", - "469_15", - "469_16", - "469_17", - "469_18", - "469_19", - "469_2", - "469_4", - "469_5", - "469_7", - "469_8", - "470_1", - "470_5", - "471_0", - "471_1", - "471_11", - "471_13", - "471_14", - "471_17", - "471_19", - "471_2", - "471_6", - "471_8", - "472_1", - "472_10", - "472_12", - "472_14", - "472_15", - "472_2", - "472_4", - "472_5", - "472_7", - "472_8", - "472_9", - "473_1", - "473_2", - "473_3", - "474_0", - "476_4", - "47_0", - "482_0", - "482_1", - "482_10", - "482_11", - "482_13", - "482_14", - "482_19", - "482_2", - "482_20", - "482_22", - "482_3", - "482_4", - "482_6", - "482_8", - "482_9", - "483_11", - "483_12", - "483_13", - "483_15", - "483_16", - "483_17", - "485_1", - "485_2", - "485_3", - "485_4", - "486_0", - "486_10", - "486_2", - "486_4", - "486_8", - "486_9", - "487_0", - "487_1", - "487_2", - "487_3", - "487_4", - "488_0", - "488_2", - "488_3", - "488_5", - "488_6", - "488_7", - "490_18", - "490_22", - "490_25", - "490_27", - "490_3", - "490_6", - "490_8", - "490_9", - "492_19", - "492_20", - "493_1", - "493_10", - "493_12", - "493_2", - "493_3", - "493_6", - "493_9", - "494_1", - "494_3", - "494_7", - "499_0", - "499_2", - "499_3", - "499_4", - "499_5", - "499_7", - "49_0", - "49_1", - "49_10", - "49_11", - 
"49_12", - "49_13", - "49_14", - "49_16", - "49_17", - "49_18", - "49_3", - "49_4", - "49_5", - "49_8", - "508_10", - "508_13", - "508_19", - "508_9", - "514_10", - "514_11", - "514_13", - "514_2", - "514_3", - "514_6", - "515_0", - "515_1", - "515_10", - "515_14", - "515_15", - "515_2", - "515_3", - "515_4", - "515_6", - "515_7", - "515_8", - "517_2", - "517_3", - "517_4", - "518_10", - "518_11", - "518_13", - "518_14", - "518_15", - "518_16", - "518_2", - "518_3", - "518_34", - "518_36", - "518_38", - "518_4", - "518_40", - "518_41", - "518_5", - "518_6", - "518_8", - "521_1", - "521_10", - "521_11", - "521_12", - "521_2", - "521_3", - "521_6", - "521_7", - "521_8", - "524_0", - "528_0", - "528_1", - "530_11", - "530_12", - "530_13", - "530_14", - "530_3", - "530_4", - "530_6", - "530_7", - "53_0", - "53_1", - "53_2", - "54_1", - "54_2", - "55_11", - "55_12", - "55_14", - "55_16", - "55_2", - "55_22", - "55_5", - "55_6", - "58_10", - "58_4", - "59_0", - "5_11", - "5_12", - "5_13", - "5_2", - "5_4", - "5_6", - "5_9", - "60_0", - "60_1", - "60_10", - "60_11", - "60_12", - "60_13", - "60_14", - "60_15", - "60_17", - "60_18", - "60_19", - "60_20", - "60_21", - "60_22", - "60_26", - "60_3", - "60_4", - "60_7", - "62_0", - "62_10", - "62_11", - "62_12", - "62_2", - "62_3", - "62_4", - "62_5", - "62_6", - "62_7", - "62_8", - "62_9", - "64_5", - "65_1", - "65_10", - "65_11", - "65_14", - "65_15", - "65_16", - "65_2", - "65_5", - "65_6", - "65_7", - "65_9", - "66_1", - "66_10", - "66_11", - "66_12", - "66_13", - "66_15", - "66_16", - "66_2", - "66_3", - "66_4", - "66_5", - "66_6", - "66_7", - "66_8", - "66_9", - "68_11", - "68_12", - "68_15", - "68_5", - "68_6", - "68_9", - "69_0", - "69_11", - "69_12", - "69_5", - "69_7", - "69_8", - "69_9", - "70_0", - "70_10", - "70_11", - "70_12", - "70_13", - "70_14", - "70_2", - "70_3", - "70_4", - "70_5", - "70_7", - "73_0", - "76_15", - "78_0", - "78_10", - "78_11", - "78_12", - "78_13", - "78_14", - "78_15", - "78_17", - "78_2", 
- "78_20", - "78_21", - "78_23", - "78_25", - "78_3", - "78_4", - "78_5", - "81_16", - "81_24", - "81_3", - "81_7", - "82_1", - "82_10", - "82_3", - "82_5", - "82_6", - "82_7", - "82_8", - "82_9", - "83_0", - "83_12", - "83_6", - "83_7", - "83_8", - "84_2", - "85_0", - "85_10", - "85_11", - "85_12", - "85_18", - "85_2", - "85_21", - "85_22", - "85_23", - "85_4", - "85_5", - "85_6", - "85_8", - "85_9", - "86_1", - "86_10", - "86_12", - "86_14", - "86_15", - "86_16", - "86_18", - "86_2", - "86_21", - "86_25", - "86_26", - "86_4", - "86_5", - "86_6", - "86_8", - "86_9", - "87_1", - "87_10", - "87_11", - "87_12", - "87_14", - "87_15", - "87_17", - "87_18", - "87_2", - "87_3", - "87_4", - "87_6", - "87_7", - "87_8", - "8_1", - "90_4", - "92_0", - "92_1", - "92_11", - "92_12", - "92_2", - "92_4", - "92_5", - "92_6", - "92_7", - "92_8", - "92_9", - "97_2", - "98_10", - "98_11", - "98_12", - "98_14", - "98_17", - "98_18", - "98_2", - "98_3", - "98_4", - "98_7", - "98_8", - "98_9" - ], - "test_query_ids": [ - "0_0", - "0_1", - "0_10", - "0_11", - "0_12", - "0_13", - "0_14", - "0_15", - "0_16", - "0_17", - "0_18", - "0_19", - "0_2", - "0_20", - "0_21", - "0_22", - "0_23", - "0_24", - "0_25", - "0_26", - "0_27", - "0_28", - "0_29", - "0_3", - "0_30", - "0_31", - "0_4", - "0_5", - "0_6", - "0_7", - "0_8", - "0_9", - "100_5", - "100_6", - "100_7", - "104_0", - "104_1", - "104_5", - "104_6", - "106_0", - "106_1", - "106_10", - "106_11", - "106_12", - "106_2", - "106_3", - "106_4", - "106_5", - "106_6", - "106_7", - "106_8", - "106_9", - "113_0", - "113_1", - "113_2", - "113_4", - "120_0", - "120_1", - "120_10", - "120_11", - "120_12", - "120_13", - "120_14", - "120_15", - "120_16", - "120_17", - "120_2", - "120_3", - "120_4", - "120_5", - "120_6", - "120_7", - "120_8", - "120_9", - "121_0", - "121_1", - "121_2", - "121_3", - "121_4", - "121_5", - "121_6", - "121_7", - "121_8", - "123_0", - "123_1", - "123_10", - "123_11", - "123_14", - "123_15", - "123_16", - "123_17", - 
"123_18", - "123_2", - "123_3", - "123_4", - "123_5", - "123_6", - "123_7", - "123_8", - "123_9", - "125_12", - "125_13", - "125_14", - "125_15", - "125_16", - "125_20", - "125_21", - "125_22", - "125_23", - "125_24", - "125_3", - "125_4", - "125_5", - "125_6", - "125_7", - "127_1", - "127_11", - "127_12", - "127_2", - "127_4", - "128_0", - "128_1", - "128_10", - "128_11", - "128_12", - "128_13", - "128_14", - "128_15", - "128_16", - "128_17", - "128_18", - "128_19", - "128_2", - "128_20", - "128_21", - "128_22", - "128_23", - "128_24", - "128_25", - "128_26", - "128_27", - "128_3", - "128_4", - "128_5", - "128_6", - "128_7", - "128_8", - "128_9", - "129_1", - "129_2", - "129_3", - "130_13", - "130_15", - "130_16", - "130_17", - "131_0", - "131_1", - "131_2", - "131_3", - "131_4", - "131_5", - "131_6", - "14_0", - "14_1", - "14_10", - "14_11", - "14_12", - "14_13", - "14_14", - "14_15", - "14_16", - "14_17", - "14_2", - "14_3", - "14_4", - "14_5", - "14_6", - "14_7", - "14_8", - "14_9", - "17_0", - "17_1", - "17_10", - "17_11", - "17_12", - "17_13", - "17_14", - "17_15", - "17_16", - "17_17", - "17_18", - "17_2", - "17_3", - "17_4", - "17_5", - "17_6", - "17_7", - "17_8", - "17_9", - "18_11", - "18_12", - "19_0", - "19_1", - "19_2", - "19_3", - "19_4", - "19_5", - "19_6", - "19_7", - "19_8", - "19_9", - "23_10", - "23_15", - "23_16", - "23_19", - "23_2", - "23_21", - "23_22", - "23_23", - "23_24", - "23_25", - "23_27", - "23_3", - "23_4", - "23_5", - "23_6", - "23_7", - "23_8", - "26_0", - "26_1", - "26_10", - "26_11", - "26_12", - "26_2", - "26_3", - "26_4", - "26_5", - "26_6", - "26_7", - "26_8", - "26_9", - "27_0", - "28_0", - "28_1", - "28_2", - "28_4", - "28_5", - "2_0", - "2_1", - "2_10", - "2_11", - "2_12", - "2_13", - "2_14", - "2_17", - "2_18", - "2_19", - "2_2", - "2_3", - "2_4", - "2_5", - "2_6", - "2_7", - "2_8", - "2_9", - "30_0", - "30_1", - "30_10", - "30_11", - "30_12", - "30_13", - "30_14", - "30_2", - "30_3", - "30_4", - "30_5", - "30_6", - 
"30_7", - "30_8", - "30_9", - "31_0", - "31_1", - "31_10", - "31_11", - "31_2", - "31_3", - "31_4", - "31_5", - "31_6", - "31_7", - "31_8", - "31_9", - "36_0", - "36_1", - "36_10", - "36_11", - "36_12", - "36_13", - "36_14", - "36_15", - "36_16", - "36_17", - "36_18", - "36_19", - "36_2", - "36_20", - "36_21", - "36_22", - "36_23", - "36_24", - "36_25", - "36_26", - "36_28", - "36_29", - "36_3", - "36_4", - "36_5", - "36_6", - "36_7", - "36_8", - "36_9", - "37_10", - "37_8", - "37_9", - "3_0", - "3_1", - "3_2", - "3_3", - "3_4", - "3_5", - "3_6", - "3_8", - "3_9", - "41_0", - "41_1", - "41_2", - "41_3", - "41_4", - "42_10", - "42_11", - "42_12", - "42_18", - "42_20", - "42_3", - "42_8", - "42_9", - "44_0", - "44_1", - "44_10", - "44_11", - "44_12", - "44_13", - "44_14", - "44_15", - "44_16", - "44_18", - "44_19", - "44_2", - "44_20", - "44_22", - "44_23", - "44_24", - "44_25", - "44_3", - "44_4", - "44_5", - "44_6", - "44_7", - "44_8", - "44_9", - "47_0", - "47_1", - "47_10", - "47_11", - "47_12", - "47_13", - "47_14", - "47_15", - "47_16", - "47_17", - "47_18", - "47_19", - "47_2", - "47_20", - "47_3", - "47_4", - "47_5", - "47_6", - "47_7", - "47_8", - "47_9", - "49_0", - "49_1", - "49_10", - "49_11", - "49_13", - "49_2", - "49_3", - "49_4", - "49_5", - "49_6", - "49_7", - "49_8", - "49_9", - "4_0", - "4_1", - "4_10", - "4_11", - "4_12", - "4_13", - "4_14", - "4_15", - "4_16", - "4_17", - "4_18", - "4_19", - "4_2", - "4_3", - "4_4", - "4_5", - "4_6", - "4_7", - "4_8", - "4_9", - "50_0", - "50_1", - "50_2", - "50_3", - "50_4", - "50_5", - "50_6", - "50_7", - "50_8", - "50_9", - "52_2", - "55_0", - "55_1", - "55_2", - "55_3", - "57_4", - "5_0", - "5_1", - "5_2", - "61_0", - "61_1", - "61_10", - "61_11", - "61_12", - "61_13", - "61_14", - "61_15", - "61_16", - "61_17", - "61_18", - "61_19", - "61_2", - "61_20", - "61_21", - "61_22", - "61_23", - "61_24", - "61_3", - "61_4", - "61_5", - "61_6", - "61_7", - "61_8", - "61_9", - "62_0", - "62_1", - "62_2", - "62_3", - 
"62_4", - "62_5", - "62_6", - "62_7", - "62_8", - "62_9", - "65_0", - "66_6", - "67_10", - "67_11", - "67_12", - "67_13", - "67_4", - "67_5", - "67_6", - "67_9", - "68_3", - "68_4", - "68_5", - "68_6", - "68_7", - "69_1", - "69_10", - "69_14", - "69_4", - "69_6", - "6_0", - "6_1", - "6_2", - "6_3", - "6_4", - "6_5", - "6_6", - "6_7", - "6_8", - "6_9", - "70_1", - "70_10", - "70_11", - "70_13", - "70_14", - "70_15", - "70_16", - "70_17", - "70_18", - "70_19", - "70_2", - "70_21", - "70_3", - "70_4", - "70_5", - "70_6", - "70_7", - "70_8", - "70_9", - "71_0", - "71_1", - "71_10", - "71_11", - "71_12", - "71_13", - "71_15", - "71_16", - "71_17", - "71_18", - "71_19", - "71_2", - "71_3", - "71_4", - "71_5", - "71_6", - "71_7", - "71_8", - "71_9", - "72_10", - "72_11", - "72_12", - "72_13", - "72_14", - "72_15", - "73_0", - "73_1", - "73_2", - "73_3", - "73_4", - "73_5", - "73_6", - "73_7", - "73_8", - "73_9", - "74_0", - "74_1", - "74_10", - "74_2", - "74_3", - "74_4", - "74_5", - "74_6", - "74_7", - "74_8", - "74_9", - "75_1", - "75_3", - "75_4", - "75_5", - "75_6", - "75_7", - "75_8", - "77_0", - "77_1", - "77_10", - "77_2", - "77_3", - "77_4", - "77_5", - "77_6", - "77_7", - "77_8", - "79_0", - "79_1", - "79_10", - "79_11", - "79_2", - "79_3", - "79_4", - "79_5", - "79_6", - "79_7", - "79_9", - "80_0", - "80_1", - "80_10", - "80_11", - "80_12", - "80_13", - "80_14", - "80_18", - "80_2", - "80_24", - "80_3", - "80_4", - "80_5", - "80_6", - "80_7", - "80_8", - "80_9", - "81_0", - "81_1", - "82_0", - "86_1", - "88_0", - "88_1", - "88_10", - "88_2", - "88_4", - "88_5", - "88_6", - "88_7", - "88_8", - "88_9", - "89_0", - "89_1", - "89_10", - "89_11", - "89_12", - "89_13", - "89_14", - "89_15", - "89_16", - "89_17", - "89_18", - "89_2", - "89_3", - "89_4", - "89_5", - "89_6", - "89_7", - "89_8", - "89_9", - "8_10", - "8_14", - "90_17", - "90_18", - "91_0", - "91_1", - "91_2", - "91_3", - "91_4", - "91_5", - "91_6", - "91_7", - "92_0", - "92_1", - "92_10", - "92_11", - 
"92_12", - "92_13", - "92_14", - "92_2", - "92_3", - "92_4", - "92_5", - "92_6", - "92_7", - "92_8", - "92_9", - "95_0", - "95_1", - "95_10", - "95_11", - "95_12", - "95_13", - "95_14", - "95_15", - "95_16", - "95_17", - "95_18", - "95_19", - "95_2", - "95_3", - "95_4", - "95_5", - "95_6", - "95_7", - "95_8", - "95_9", - "96_0", - "96_1", - "96_2", - "96_3", - "96_4", - "96_5", - "96_6", - "97_0", - "97_1", - "97_2", - "97_3", - "98_0", - "98_1", - "98_2", - "98_3", - "98_4", - "98_5", - "98_6", - "98_7", - "98_8" - ] -} \ No newline at end of file diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/splits/sec_splits.json b/cosmos-retriever/src/cosmos_retriever/datagen/splits/sec_splits.json deleted file mode 100644 index 9f0937a..0000000 --- a/cosmos-retriever/src/cosmos_retriever/datagen/splits/sec_splits.json +++ /dev/null @@ -1,4685 +0,0 @@ -{ - "dataset": "sec", - "total_queries": 4084, - "train_queries": 3453, - "test_queries": 1216, - "sft_queries": 1035, - "rl_queries": 2418, - "sft_ratio": 0.2997393570807993, - "rl_ratio": 0.7002606429192006, - "sft_query_ids": [ - "1000_1", - "1001_0", - "1002_0", - "1004_1", - "1005_0", - "1006_1", - "1007_1", - "1007_2", - "1010_2", - "1011_0", - "1012_1", - "1013_0", - "1013_1", - "1014_0", - "1016_0", - "1018_2", - "101_0", - "1020_1", - "1020_2", - "1023_0", - "1023_1", - "1024_0", - "1025_2", - "1027_0", - "1027_2", - "1028_0", - "102_0", - "1033_0", - "1034_1", - "1036_0", - "1036_2", - "1042_0", - "1043_1", - "1045_0", - "1045_2", - "1047_0", - "1051_0", - "1052_0", - "1053_0", - "1053_1", - "1056_1", - "1056_2", - "1057_0", - "1063_0", - "1068_0", - "106_0", - "1071_0", - "1073_1", - "1074_1", - "1075_0", - "1077_0", - "1082_0", - "1082_1", - "1090_0", - "1090_1", - "1091_1", - "1092_0", - "1092_1", - "1097_1", - "1098_0", - "1099_0", - "10_0", - "1104_1", - "1105_0", - "1106_1", - "1107_1", - "1111_0", - "1112_0", - "1113_0", - "1115_0", - "1115_1", - "1117_1", - "1120_0", - "1122_0", - "1124_1", - 
"1125_0", - "1127_1", - "1129_0", - "112_0", - "1132_0", - "1133_0", - "1135_1", - "1137_0", - "1143_0", - "1144_1", - "1145_1", - "1148_0", - "1149_0", - "1154_1", - "1157_0", - "1157_1", - "1161_0", - "1164_0", - "1164_1", - "1166_1", - "116_0", - "1170_0", - "1170_1", - "1173_0", - "1173_1", - "1176_0", - "1176_1", - "1178_0", - "117_0", - "1183_0", - "1183_1", - "1185_1", - "1186_0", - "1188_0", - "1190_0", - "1193_1", - "1196_1", - "119_0", - "11_0", - "1200_1", - "1205_0", - "1207_0", - "1208_0", - "1208_1", - "1209_1", - "1210_0", - "1213_1", - "1215_0", - "1216_1", - "1219_1", - "1221_1", - "1222_0", - "1225_0", - "1226_0", - "1226_1", - "1227_0", - "1229_1", - "1231_1", - "1233_0", - "1238_1", - "1243_0", - "1244_0", - "1245_0", - "1246_1", - "1247_0", - "1247_1", - "1248_1", - "1249_1", - "1250_0", - "1252_1", - "1253_0", - "1257_0", - "1258_0", - "1259_0", - "125_0", - "1260_0", - "1261_0", - "1261_1", - "1263_1", - "1266_1", - "1269_0", - "1269_1", - "1271_1", - "1272_1", - "1273_0", - "1275_0", - "1275_1", - "1280_1", - "1281_1", - "1283_0", - "1286_1", - "1287_0", - "1289_0", - "1290_1", - "1291_1", - "1293_1", - "1296_0", - "1296_1", - "1298_1", - "1300_1", - "1301_1", - "1306_1", - "1307_0", - "1308_1", - "1309_0", - "1313_1", - "1314_0", - "1317_0", - "1319_0", - "1320_1", - "1322_0", - "1322_1", - "1323_0", - "1325_0", - "1329_1", - "1337_1", - "1338_1", - "1343_1", - "1344_1", - "1346_0", - "1348_1", - "134_0", - "1350_1", - "1351_1", - "1353_0", - "1353_1", - "1356_1", - "1358_0", - "1360_0", - "1360_1", - "1365_1", - "1367_1", - "1369_0", - "136_0", - "1371_0", - "1380_0", - "1380_1", - "1382_0", - "1386_0", - "1391_0", - "1392_0", - "1393_0", - "1394_1", - "1396_0", - "1396_1", - "1399_1", - "1405_0", - "1405_1", - "1406_1", - "1409_0", - "140_0", - "1410_1", - "1413_0", - "1415_0", - "1417_0", - "1418_0", - "1418_1", - "1421_0", - "1423_0", - "1429_1", - "1430_0", - "143_0", - "1440_0", - "1447_0", - "1448_0", - "1451_1", - "1456_1", - 
"1457_1", - "1458_0", - "1460_1", - "1461_0", - "1464_0", - "1465_1", - "1467_0", - "1470_1", - "1471_1", - "1473_0", - "1475_0", - "1477_0", - "1478_1", - "1479_1", - "1481_0", - "1482_0", - "1484_0", - "1486_1", - "1490_0", - "1493_0", - "1493_1", - "1497_0", - "1497_1", - "1498_1", - "14_0", - "1500_0", - "1503_1", - "1509_0", - "1510_0", - "1511_1", - "1514_0", - "1514_1", - "1515_0", - "1516_0", - "1517_1", - "1519_1", - "1522_0", - "1523_1", - "1528_0", - "1529_1", - "152_0", - "1530_0", - "1531_1", - "1533_1", - "1535_0", - "1537_1", - "1541_0", - "1542_0", - "1543_0", - "1548_1", - "1553_0", - "1553_1", - "1558_1", - "1560_1", - "1568_0", - "1574_0", - "1577_1", - "1581_0", - "1583_0", - "1584_0", - "1585_0", - "1590_0", - "1591_0", - "1597_0", - "1597_1", - "159_0", - "1602_0", - "1603_1", - "1605_0", - "1607_0", - "1609_1", - "160_0", - "1610_0", - "1611_0", - "1612_1", - "1615_0", - "1615_1", - "1616_1", - "1619_1", - "1620_1", - "1624_0", - "1628_0", - "1628_1", - "1629_1", - "1633_0", - "1633_1", - "1635_1", - "1638_0", - "1645_0", - "1652_0", - "1655_0", - "1655_1", - "1657_0", - "1659_0", - "1661_0", - "1666_0", - "1667_1", - "1668_1", - "1670_0", - "1671_0", - "1672_0", - "1672_1", - "1675_0", - "1676_0", - "1677_1", - "1678_0", - "167_0", - "1683_0", - "1686_0", - "1688_0", - "1691_0", - "1691_1", - "1700_0", - "1702_0", - "1704_0", - "1705_0", - "170_0", - "1713_0", - "1714_0", - "1717_0", - "171_0", - "1720_0", - "1721_0", - "1727_0", - "1728_1", - "172_0", - "1730_0", - "1732_0", - "1734_1", - "1735_0", - "1737_1", - "1738_0", - "1739_1", - "1748_0", - "1750_0", - "1753_0", - "1754_0", - "1755_0", - "1760_1", - "1761_0", - "1765_0", - "1767_1", - "176_0", - "1770_0", - "1771_0", - "1772_0", - "1774_0", - "1775_0", - "1778_0", - "1784_1", - "1787_0", - "1787_1", - "1788_0", - "1789_0", - "1789_1", - "1790_0", - "1793_1", - "1794_0", - "1796_0", - "179_0", - "1802_0", - "1809_0", - "1813_0", - "1815_0", - "1815_1", - "1816_0", - "1819_0", - 
"1821_0", - "1823_0", - "1824_1", - "1829_0", - "1830_1", - "1839_0", - "1848_1", - "1850_1", - "1851_1", - "1853_0", - "1855_0", - "1856_0", - "1857_0", - "1864_0", - "1869_1", - "1871_0", - "1872_0", - "1873_1", - "1874_0", - "1881_0", - "1885_1", - "1889_0", - "188_0", - "1890_0", - "1892_0", - "1893_0", - "1895_0", - "1896_0", - "1897_0", - "1897_1", - "1904_1", - "1905_0", - "190_0", - "1910_1", - "1914_1", - "1915_1", - "1918_0", - "1923_1", - "1924_1", - "1926_0", - "1929_0", - "1930_0", - "1939_0", - "1942_0", - "1944_0", - "1947_0", - "1956_0", - "1957_1", - "1962_1", - "1963_0", - "1964_1", - "1965_0", - "1967_1", - "1968_1", - "1970_1", - "1971_0", - "1972_1", - "1973_0", - "1973_1", - "1975_0", - "1982_1", - "1994_0", - "1994_1", - "1996_1", - "1_0", - "2002_0", - "2003_1", - "2004_0", - "2004_1", - "2005_1", - "2006_0", - "2007_0", - "2010_0", - "2010_1", - "2013_0", - "2016_0", - "2016_1", - "2019_0", - "201_0", - "2020_0", - "2023_1", - "2025_1", - "2026_0", - "2026_1", - "2028_1", - "2029_1", - "2031_0", - "2036_0", - "2037_0", - "2037_1", - "2038_0", - "2038_1", - "2039_0", - "203_0", - "2041_1", - "2043_1", - "2046_1", - "2047_1", - "2048_0", - "2048_1", - "2053_0", - "2055_0", - "2057_1", - "2060_1", - "2063_1", - "2064_0", - "2069_0", - "2069_1", - "2073_0", - "2073_1", - "2075_0", - "2075_1", - "2076_0", - "2076_1", - "2080_0", - "2081_0", - "2082_1", - "2084_0", - "2085_0", - "2089_0", - "2092_1", - "2097_0", - "2098_0", - "2099_0", - "2105_1", - "2108_0", - "2112_1", - "2113_1", - "2114_0", - "2114_1", - "2115_1", - "2116_1", - "2118_1", - "211_0", - "2123_1", - "2126_1", - "2127_0", - "2129_0", - "2134_0", - "2138_0", - "213_0", - "2140_0", - "2142_1", - "2143_0", - "2144_0", - "2145_0", - "2145_1", - "2147_0", - "2149_0", - "214_0", - "2152_0", - "2152_1", - "2154_1", - "2161_1", - "2167_0", - "2170_1", - "2171_0", - "2174_0", - "2176_0", - "2177_0", - "2181_0", - "2181_1", - "2183_0", - "2187_0", - "21_0", - "2200_0", - "2202_1", - 
"2203_0", - "2215_0", - "2216_1", - "2217_0", - "221_0", - "2221_0", - "2223_0", - "2225_0", - "2230_0", - "2233_0", - "2234_0", - "2236_0", - "2238_0", - "2240_0", - "2242_0", - "2243_0", - "2250_0", - "2254_0", - "2255_0", - "2259_0", - "2260_0", - "2264_0", - "2269_0", - "2272_0", - "2274_0", - "2275_0", - "2278_0", - "2281_0", - "2282_0", - "2289_0", - "228_0", - "2291_0", - "2297_0", - "2300_0", - "2315_0", - "2318_0", - "2324_0", - "2327_0", - "2328_0", - "2331_0", - "2344_0", - "2347_0", - "234_1", - "2354_0", - "2357_0", - "2358_0", - "2360_0", - "2367_0", - "2370_0", - "2379_0", - "2380_0", - "2383_0", - "2385_0", - "2386_0", - "239_0", - "239_2", - "23_0", - "2400_0", - "2403_0", - "2405_0", - "2408_0", - "2411_0", - "2413_0", - "2418_0", - "241_0", - "2424_0", - "2431_0", - "2433_0", - "2437_0", - "2438_0", - "2443_0", - "2445_0", - "2447_0", - "2449_0", - "2463_0", - "2464_0", - "2470_0", - "2471_0", - "2472_0", - "2477_0", - "2478_0", - "2479_0", - "2483_0", - "2486_0", - "2487_0", - "24_0", - "2510_0", - "2511_0", - "2513_0", - "2521_0", - "2531_0", - "253_0", - "253_2", - "2544_0", - "254_0", - "2552_0", - "2557_0", - "2559_0", - "2560_0", - "2563_0", - "2567_0", - "2572_0", - "2575_0", - "2588_0", - "2590_0", - "2592_0", - "2593_0", - "259_1", - "25_0", - "265_0", - "265_2", - "266_0", - "268_0", - "269_0", - "271_0", - "272_0", - "279_0", - "27_0", - "280_0", - "286_0", - "28_0", - "290_1", - "292_0", - "297_0", - "303_0", - "305_0", - "30_0", - "311_0", - "313_0", - "314_1", - "316_0", - "327_1", - "329_0", - "331_0", - "333_0", - "333_1", - "336_0", - "337_0", - "338_0", - "339_0", - "339_1", - "340_0", - "343_0", - "348_1", - "348_2", - "349_0", - "34_0", - "350_0", - "351_0", - "355_0", - "356_2", - "365_1", - "367_0", - "36_0", - "379_0", - "380_1", - "382_0", - "382_1", - "382_2", - "389_1", - "391_0", - "392_0", - "395_1", - "39_0", - "404_0", - "407_0", - "40_0", - "412_0", - "413_0", - "426_0", - "427_0", - "427_1", - "428_0", - "430_0", - 
"430_1", - "431_1", - "431_2", - "432_0", - "432_1", - "432_2", - "436_0", - "437_1", - "438_0", - "439_0", - "43_0", - "440_0", - "440_1", - "442_0", - "442_2", - "443_0", - "449_0", - "449_1", - "452_2", - "453_2", - "454_0", - "459_0", - "45_0", - "461_0", - "467_0", - "473_1", - "476_0", - "479_0", - "482_1", - "482_2", - "495_0", - "495_2", - "499_0", - "4_0", - "500_0", - "501_1", - "505_0", - "509_1", - "510_0", - "510_1", - "511_0", - "514_0", - "516_0", - "517_0", - "518_0", - "51_0", - "525_0", - "527_0", - "529_0", - "52_0", - "533_0", - "536_0", - "539_1", - "539_2", - "540_0", - "542_0", - "553_0", - "557_0", - "55_0", - "563_1", - "570_0", - "573_0", - "576_1", - "576_2", - "57_0", - "580_0", - "580_1", - "586_1", - "589_0", - "58_0", - "590_0", - "591_0", - "592_0", - "596_0", - "598_1", - "599_0", - "59_0", - "603_0", - "612_0", - "613_0", - "615_0", - "618_2", - "619_0", - "619_2", - "620_0", - "622_0", - "627_0", - "629_0", - "62_0", - "635_1", - "637_1", - "639_0", - "639_1", - "644_0", - "649_0", - "651_0", - "652_0", - "653_0", - "653_2", - "654_1", - "654_2", - "658_0", - "660_0", - "664_0", - "665_0", - "666_2", - "667_0", - "667_2", - "668_1", - "669_0", - "675_0", - "678_1", - "680_0", - "681_2", - "684_0", - "686_0", - "688_0", - "690_0", - "696_0", - "697_2", - "698_0", - "6_0", - "700_0", - "704_0", - "707_0", - "709_1", - "714_0", - "715_0", - "717_0", - "71_0", - "721_1", - "725_0", - "729_2", - "738_0", - "73_0", - "742_2", - "743_0", - "743_1", - "743_2", - "744_2", - "745_0", - "749_0", - "74_0", - "751_1", - "753_2", - "754_1", - "755_0", - "756_0", - "756_1", - "762_2", - "763_0", - "763_1", - "764_0", - "764_1", - "765_1", - "768_0", - "76_0", - "770_1", - "771_0", - "772_0", - "774_0", - "774_1", - "774_2", - "775_2", - "776_0", - "777_0", - "778_0", - "779_0", - "779_1", - "77_2", - "780_0", - "781_2", - "782_0", - "783_0", - "783_2", - "786_0", - "796_2", - "800_0", - "802_0", - "806_0", - "806_1", - "809_0", - "80_0", - 
"80_1", - "814_0", - "816_0", - "816_1", - "818_0", - "820_0", - "820_1", - "820_2", - "825_0", - "826_0", - "828_0", - "831_1", - "836_0", - "845_0", - "847_0", - "84_0", - "857_0", - "857_1", - "858_0", - "863_2", - "866_0", - "867_0", - "868_0", - "869_0", - "86_0", - "874_0", - "876_0", - "876_1", - "876_2", - "878_0", - "879_0", - "879_1", - "882_0", - "884_0", - "887_0", - "887_1", - "890_0", - "891_1", - "892_0", - "893_2", - "894_1", - "899_0", - "904_0", - "909_0", - "913_0", - "916_1", - "917_1", - "919_2", - "921_0", - "924_2", - "925_0", - "927_0", - "929_0", - "930_2", - "931_0", - "932_0", - "934_0", - "936_0", - "936_2", - "937_1", - "939_0", - "940_1", - "942_1", - "942_2", - "943_0", - "944_1", - "945_0", - "946_1", - "94_0", - "950_0", - "952_1", - "952_2", - "954_0", - "955_1", - "957_2", - "959_1", - "959_2", - "95_0", - "962_1", - "963_1", - "967_0", - "968_2", - "969_1", - "972_1", - "973_1", - "973_2", - "977_0", - "977_2", - "982_1", - "983_0", - "984_2", - "985_2", - "987_1", - "988_2", - "990_1", - "995_1", - "995_2", - "996_1", - "997_0", - "998_0", - "998_1", - "99_0" - ], - "rl_query_ids": [ - "0_0", - "1000_0", - "1000_2", - "1001_1", - "1001_2", - "1002_1", - "1002_2", - "1003_0", - "1004_0", - "1004_2", - "1006_0", - "1006_2", - "1007_0", - "1009_0", - "100_0", - "1010_0", - "1010_1", - "1011_1", - "1011_2", - "1012_0", - "1012_2", - "1014_1", - "1014_2", - "1015_0", - "1015_1", - "1015_2", - "1018_0", - "1018_1", - "1019_0", - "1019_1", - "1020_0", - "1021_0", - "1022_0", - "1022_1", - "1022_2", - "1024_1", - "1024_2", - "1025_0", - "1025_1", - "1026_0", - "1026_1", - "1026_2", - "1027_1", - "1030_0", - "1030_1", - "1030_2", - "1031_0", - "1031_1", - "1032_0", - "1032_1", - "1032_2", - "1033_1", - "1033_2", - "1034_0", - "1035_0", - "1035_1", - "1036_1", - "1038_0", - "1038_1", - "1038_2", - "103_0", - "1040_0", - "1041_0", - "1041_1", - "1041_2", - "1043_0", - "1043_2", - "1045_1", - "1046_0", - "1047_1", - "1047_2", - "1048_0", - 
"1049_0", - "104_0", - "1050_0", - "1050_1", - "1050_2", - "1051_1", - "1051_2", - "1054_0", - "1054_1", - "1054_2", - "1056_0", - "1057_1", - "1057_2", - "1059_0", - "1059_1", - "1059_2", - "105_0", - "1060_0", - "1060_1", - "1061_0", - "1061_1", - "1061_2", - "1062_0", - "1062_1", - "1064_0", - "1064_1", - "1064_2", - "1066_0", - "1066_1", - "1067_0", - "1068_1", - "1068_2", - "1070_0", - "1070_1", - "1071_1", - "1072_0", - "1072_1", - "1073_0", - "1074_0", - "1076_0", - "1076_1", - "1077_1", - "1078_0", - "1078_1", - "107_0", - "1080_0", - "1080_1", - "1081_0", - "1081_1", - "1084_0", - "1085_0", - "1086_0", - "1087_0", - "1087_1", - "1089_0", - "1089_1", - "108_0", - "1091_0", - "1093_0", - "1093_1", - "1094_0", - "1094_1", - "1095_0", - "1095_1", - "1097_0", - "1098_1", - "109_0", - "1103_0", - "1103_1", - "1104_0", - "1105_1", - "1106_0", - "1107_0", - "1108_0", - "110_0", - "1110_0", - "1110_1", - "1111_1", - "1113_1", - "1114_0", - "1114_1", - "1117_0", - "1118_0", - "1118_1", - "1119_0", - "1119_1", - "111_0", - "1120_1", - "1121_0", - "1121_1", - "1123_0", - "1124_0", - "1126_0", - "1127_0", - "1128_0", - "1128_1", - "1129_1", - "1131_0", - "1131_1", - "1132_1", - "1133_1", - "1134_0", - "1134_1", - "1135_0", - "1136_0", - "1137_1", - "1138_0", - "1138_1", - "1139_0", - "1139_1", - "113_0", - "1140_0", - "1140_1", - "1141_0", - "1141_1", - "1142_0", - "1142_1", - "1144_0", - "1145_0", - "1147_0", - "1147_1", - "1148_1", - "114_0", - "1150_0", - "1151_0", - "1151_1", - "1153_0", - "1154_0", - "1155_0", - "1155_1", - "1156_0", - "1156_1", - "1158_0", - "1158_1", - "1159_0", - "115_0", - "1161_1", - "1163_0", - "1163_1", - "1166_0", - "1168_0", - "1172_0", - "1172_1", - "1175_0", - "1175_1", - "1177_0", - "1178_1", - "1179_0", - "1180_0", - "1180_1", - "1181_0", - "1181_1", - "1182_0", - "1182_1", - "1185_0", - "1186_1", - "1188_1", - "1189_0", - "1189_1", - "118_0", - "1190_1", - "1191_0", - "1192_0", - "1192_1", - "1193_0", - "1194_0", - "1194_1", - 
"1195_0", - "1196_0", - "1197_0", - "1197_1", - "1198_0", - "1198_1", - "1199_0", - "1199_1", - "1200_0", - "1201_0", - "1203_0", - "1203_1", - "1204_0", - "1204_1", - "1206_0", - "1206_1", - "1207_1", - "1209_0", - "120_0", - "1210_1", - "1213_0", - "1215_1", - "1216_0", - "1217_0", - "1217_1", - "1218_0", - "1218_1", - "1219_0", - "121_0", - "1221_0", - "1222_1", - "1224_0", - "1228_0", - "1228_1", - "1229_0", - "1230_0", - "1231_0", - "1232_0", - "1233_1", - "1234_0", - "1234_1", - "1235_0", - "1235_1", - "1236_0", - "1237_0", - "1237_1", - "1238_0", - "1239_0", - "123_0", - "1240_0", - "1243_1", - "1245_1", - "1246_0", - "1248_0", - "1249_0", - "124_0", - "1251_0", - "1251_1", - "1252_0", - "1255_0", - "1256_0", - "1256_1", - "1258_1", - "1259_1", - "1260_1", - "1263_0", - "1264_0", - "1264_1", - "1265_0", - "1265_1", - "1266_0", - "126_0", - "1270_0", - "1270_1", - "1271_0", - "1272_0", - "1273_1", - "1274_0", - "1276_0", - "1276_1", - "1277_0", - "1277_1", - "1279_0", - "1280_0", - "1281_0", - "1282_0", - "1282_1", - "1283_1", - "1284_0", - "1284_1", - "1285_0", - "1285_1", - "1286_0", - "1287_1", - "1289_1", - "128_0", - "1290_0", - "1291_0", - "1292_0", - "1292_1", - "1293_0", - "1294_0", - "1294_1", - "1295_0", - "1297_0", - "1297_1", - "1298_0", - "129_0", - "1300_0", - "1301_0", - "1302_0", - "1302_1", - "1303_0", - "1303_1", - "1304_0", - "1304_1", - "1305_0", - "1305_1", - "1306_0", - "1307_1", - "1308_0", - "130_0", - "1310_0", - "1310_1", - "1311_0", - "1311_1", - "1312_0", - "1312_1", - "1313_0", - "1314_1", - "1315_0", - "1315_1", - "1316_0", - "1316_1", - "1317_1", - "1318_0", - "1318_1", - "1319_1", - "131_0", - "1320_0", - "1321_0", - "1321_1", - "1323_1", - "1324_0", - "1325_1", - "1326_0", - "1326_1", - "1328_0", - "1329_0", - "1330_0", - "1330_1", - "1331_0", - "1331_1", - "1332_0", - "1332_1", - "1333_0", - "1334_0", - "1334_1", - "1335_0", - "1335_1", - "1336_0", - "1336_1", - "1337_0", - "1338_0", - "133_0", - "1340_0", - "1340_1", - 
"1341_0", - "1342_0", - "1343_0", - "1344_0", - "1346_1", - "1347_0", - "1347_1", - "1348_0", - "1350_0", - "1351_0", - "1352_0", - "1352_1", - "1354_0", - "1354_1", - "1356_0", - "1357_0", - "1358_1", - "135_0", - "1361_0", - "1361_1", - "1362_0", - "1362_1", - "1364_0", - "1364_1", - "1365_0", - "1366_0", - "1366_1", - "1367_0", - "1369_1", - "1370_0", - "1371_1", - "1372_0", - "1372_1", - "1374_0", - "1375_0", - "1375_1", - "1376_0", - "1376_1", - "1377_0", - "1377_1", - "137_0", - "1381_0", - "1381_1", - "1382_1", - "1384_0", - "1384_1", - "1385_0", - "1385_1", - "1386_1", - "1387_0", - "1389_0", - "138_0", - "1390_0", - "1391_1", - "1394_0", - "1395_0", - "1395_1", - "1397_0", - "1397_1", - "1399_0", - "13_0", - "1400_0", - "1401_0", - "1401_1", - "1402_0", - "1402_1", - "1403_0", - "1403_1", - "1404_0", - "1404_1", - "1406_0", - "1407_0", - "1407_1", - "1408_0", - "1408_1", - "1409_1", - "1410_0", - "1411_0", - "1411_1", - "1412_0", - "1415_1", - "1416_0", - "1417_1", - "141_0", - "1420_0", - "1422_0", - "1422_1", - "1424_0", - "1425_0", - "1425_1", - "1427_0", - "1427_1", - "1428_0", - "1428_1", - "1429_0", - "142_0", - "1430_1", - "1431_0", - "1431_1", - "1433_0", - "1433_1", - "1434_0", - "1434_1", - "1435_0", - "1435_1", - "1436_0", - "1436_1", - "1437_0", - "1438_0", - "1438_1", - "1439_0", - "1439_1", - "1441_0", - "1441_1", - "1442_0", - "1442_1", - "1443_0", - "1443_1", - "1444_0", - "1446_0", - "1446_1", - "1447_1", - "1448_1", - "1449_0", - "1449_1", - "144_0", - "1450_0", - "1450_1", - "1451_0", - "1452_0", - "1452_1", - "1453_0", - "1453_1", - "1454_0", - "1454_1", - "1455_0", - "1456_0", - "1457_0", - "1458_1", - "1459_0", - "145_0", - "1460_0", - "1461_1", - "1462_0", - "1462_1", - "1463_0", - "1464_1", - "1465_0", - "1466_0", - "1466_1", - "1467_1", - "1468_0", - "1468_1", - "1469_0", - "1470_0", - "1471_0", - "1472_0", - "1472_1", - "1475_1", - "1477_1", - "1478_0", - "1479_0", - "147_0", - "1480_0", - "1480_1", - "1481_1", - "1482_1", - 
"1484_1", - "1485_0", - "1486_0", - "1487_0", - "1488_0", - "1488_1", - "1489_0", - "1489_1", - "148_0", - "1490_1", - "1491_0", - "1491_1", - "1492_0", - "1492_1", - "1494_0", - "1494_1", - "1495_0", - "1495_1", - "1498_0", - "149_0", - "1500_1", - "1501_0", - "1501_1", - "1503_0", - "1504_0", - "1504_1", - "1507_0", - "1507_1", - "150_0", - "1510_1", - "1511_0", - "1512_0", - "1512_1", - "1513_0", - "1515_1", - "1516_1", - "1517_0", - "1519_0", - "1520_0", - "1520_1", - "1521_0", - "1521_1", - "1522_1", - "1523_0", - "1527_0", - "1528_1", - "1529_0", - "1530_1", - "1531_0", - "1533_0", - "1534_0", - "1534_1", - "1535_1", - "1536_0", - "1536_1", - "1537_0", - "1538_0", - "1538_1", - "1539_0", - "1539_1", - "153_0", - "1540_0", - "1540_1", - "1541_1", - "1542_1", - "1543_1", - "1545_0", - "1545_1", - "1546_0", - "1547_0", - "1547_1", - "1548_0", - "1549_0", - "1549_1", - "154_0", - "1550_0", - "1550_1", - "1554_0", - "1554_1", - "1555_0", - "1555_1", - "1556_0", - "1556_1", - "1557_0", - "1557_1", - "1558_0", - "1559_0", - "1559_1", - "155_0", - "1560_0", - "1561_0", - "1561_1", - "1562_0", - "1562_1", - "1563_0", - "1563_1", - "1564_0", - "1564_1", - "1566_0", - "1566_1", - "1567_0", - "1567_1", - "1568_1", - "1569_0", - "1569_1", - "156_0", - "1572_0", - "1572_1", - "1573_0", - "1573_1", - "1574_1", - "1575_0", - "1575_1", - "1577_0", - "1578_0", - "1578_1", - "1579_0", - "157_0", - "1580_0", - "1580_1", - "1581_1", - "1582_0", - "1583_1", - "1584_1", - "1585_1", - "1587_0", - "1587_1", - "1589_0", - "158_0", - "1592_0", - "1592_1", - "1593_0", - "1594_0", - "1594_1", - "1596_0", - "1596_1", - "1599_0", - "1599_1", - "1600_0", - "1600_1", - "1602_1", - "1603_0", - "1605_1", - "1606_0", - "1606_1", - "1607_1", - "1608_0", - "1608_1", - "1609_0", - "1610_1", - "1611_1", - "1612_0", - "1614_0", - "1614_1", - "1616_0", - "1617_0", - "1617_1", - "1619_0", - "161_0", - "1620_0", - "1621_0", - "1621_1", - "1622_0", - "1622_1", - "1623_0", - "1625_0", - "1625_1", - 
"1626_0", - "1626_1", - "1627_0", - "1627_1", - "1629_0", - "162_0", - "1630_0", - "1631_0", - "1632_0", - "1632_1", - "1635_0", - "1636_0", - "1636_1", - "1638_1", - "1639_0", - "163_0", - "1641_0", - "1642_0", - "1642_1", - "1643_0", - "1643_1", - "1644_0", - "1644_1", - "1645_1", - "1646_0", - "1646_1", - "1647_0", - "1647_1", - "1649_0", - "164_0", - "1650_0", - "1650_1", - "1651_0", - "1651_1", - "1652_1", - "1653_0", - "1654_0", - "1654_1", - "1656_0", - "1656_1", - "1657_1", - "1658_0", - "1658_1", - "1659_1", - "165_0", - "1660_0", - "1660_1", - "1661_1", - "1662_0", - "1662_1", - "1664_0", - "1664_1", - "1665_0", - "1665_1", - "1666_1", - "1667_0", - "1668_0", - "1669_0", - "1669_1", - "1673_0", - "1673_1", - "1674_0", - "1675_1", - "1677_0", - "1678_1", - "1679_0", - "1679_1", - "1681_0", - "1681_1", - "1682_0", - "1683_1", - "1684_0", - "1684_1", - "1685_0", - "1685_1", - "1686_1", - "1687_0", - "168_0", - "1690_0", - "1692_0", - "1692_1", - "1693_0", - "1693_1", - "1694_0", - "1695_0", - "1695_1", - "1696_0", - "1697_0", - "1697_1", - "1698_0", - "1699_0", - "1699_1", - "169_0", - "1704_1", - "1706_0", - "1706_1", - "1707_0", - "1708_0", - "1708_1", - "1709_0", - "1710_0", - "1710_1", - "1712_0", - "1716_0", - "1718_0", - "1719_0", - "1722_0", - "1725_0", - "1726_0", - "1727_1", - "1728_0", - "1729_0", - "1729_1", - "1731_0", - "1731_1", - "1733_0", - "1734_0", - "1735_1", - "1737_0", - "1738_1", - "1739_0", - "1740_0", - "1740_1", - "1741_0", - "1742_0", - "1742_1", - "1743_0", - "1743_1", - "1746_0", - "1746_1", - "1747_0", - "1747_1", - "1748_1", - "1749_0", - "1749_1", - "174_0", - "1750_1", - "1751_0", - "1751_1", - "1752_0", - "1753_1", - "1756_0", - "1757_0", - "1757_1", - "1759_0", - "175_0", - "1760_0", - "1765_1", - "1766_0", - "1766_1", - "1767_0", - "1768_0", - "1768_1", - "1769_0", - "1769_1", - "1772_1", - "1776_0", - "1779_0", - "1779_1", - "177_0", - "1780_0", - "1780_1", - "1782_0", - "1783_0", - "1784_0", - "1785_0", - "1785_1", - 
"1786_0", - "1786_1", - "1788_1", - "178_0", - "1790_1", - "1791_0", - "1792_0", - "1792_1", - "1793_0", - "1794_1", - "1796_1", - "1797_0", - "1797_1", - "1798_0", - "1798_1", - "1799_0", - "1800_0", - "1803_0", - "1804_0", - "1805_0", - "1807_0", - "1808_0", - "1809_1", - "180_0", - "1810_0", - "1811_0", - "1811_1", - "1812_0", - "1813_1", - "1816_1", - "1817_0", - "1818_0", - "1818_1", - "1819_1", - "181_0", - "1820_0", - "1820_1", - "1821_1", - "1822_0", - "1823_1", - "1824_0", - "1825_0", - "1825_1", - "1829_1", - "182_0", - "1830_0", - "1831_0", - "1831_1", - "1832_0", - "1832_1", - "1833_0", - "1833_1", - "1835_0", - "1835_1", - "1837_0", - "1838_0", - "183_0", - "1840_0", - "1842_0", - "1844_0", - "1845_0", - "1845_1", - "1846_0", - "1848_0", - "1849_0", - "1849_1", - "184_0", - "1850_0", - "1851_0", - "1852_0", - "1852_1", - "1853_1", - "1854_0", - "1854_1", - "1855_1", - "1856_1", - "1858_0", - "1859_0", - "185_0", - "1860_0", - "1861_0", - "1862_0", - "1863_0", - "1863_1", - "1865_0", - "1866_0", - "1866_1", - "1869_0", - "186_0", - "1870_0", - "1870_1", - "1871_1", - "1872_1", - "1873_0", - "1874_1", - "1875_0", - "1875_1", - "1877_0", - "1877_1", - "1878_0", - "1878_1", - "1879_0", - "187_0", - "1880_0", - "1880_1", - "1881_1", - "1882_0", - "1882_1", - "1883_0", - "1883_1", - "1885_0", - "1886_0", - "1887_0", - "1887_1", - "1891_0", - "1894_0", - "1894_1", - "1895_1", - "1899_0", - "189_0", - "18_0", - "1900_0", - "1901_0", - "1902_0", - "1902_1", - "1903_0", - "1903_1", - "1904_0", - "1906_0", - "1906_1", - "1907_0", - "1907_1", - "1908_0", - "1910_0", - "1911_0", - "1911_1", - "1914_0", - "1915_0", - "1916_0", - "191_0", - "1920_0", - "1921_0", - "1923_0", - "1924_0", - "1926_1", - "1927_0", - "1927_1", - "1928_0", - "1929_1", - "192_0", - "1931_0", - "1934_0", - "1935_0", - "1936_0", - "1938_0", - "193_0", - "1940_0", - "1941_0", - "1943_0", - "1946_0", - "1949_0", - "194_0", - "1950_0", - "1950_1", - "1951_0", - "1951_1", - "1952_0", - "1954_0", - 
"1954_1", - "1955_0", - "1955_1", - "1956_1", - "1957_0", - "1958_0", - "1959_0", - "195_0", - "1960_0", - "1960_1", - "1962_0", - "1963_1", - "1964_0", - "1965_1", - "1967_0", - "1968_0", - "1970_0", - "1971_1", - "1972_0", - "1974_0", - "1974_1", - "1975_1", - "1976_0", - "1976_1", - "1977_0", - "1977_1", - "1978_0", - "1979_0", - "197_0", - "1980_0", - "1980_1", - "1981_0", - "1981_1", - "1982_0", - "1983_0", - "1983_1", - "1984_0", - "1986_0", - "1986_1", - "1987_0", - "1987_1", - "1989_0", - "198_0", - "1991_0", - "1993_0", - "1993_1", - "1995_0", - "1995_1", - "1996_0", - "1997_0", - "1997_1", - "1998_0", - "1998_1", - "1999_0", - "1999_1", - "199_0", - "19_0", - "2000_0", - "2000_1", - "2001_0", - "2001_1", - "2002_1", - "2003_0", - "2005_0", - "2006_1", - "2007_1", - "2008_0", - "2008_1", - "200_0", - "2012_0", - "2013_1", - "2014_0", - "2014_1", - "2017_0", - "2017_1", - "2019_1", - "2020_1", - "2021_0", - "2021_1", - "2022_0", - "2022_1", - "2023_0", - "2025_0", - "2027_0", - "2027_1", - "2028_0", - "2029_0", - "202_0", - "2030_0", - "2030_1", - "2031_1", - "2032_0", - "2032_1", - "2033_0", - "2034_0", - "2034_1", - "2035_0", - "2035_1", - "2036_1", - "2039_1", - "2040_0", - "2040_1", - "2041_0", - "2042_0", - "2042_1", - "2043_0", - "2044_0", - "2044_1", - "2045_0", - "2046_0", - "2047_0", - "2049_0", - "2049_1", - "204_0", - "2050_0", - "2050_1", - "2051_0", - "2051_1", - "2053_1", - "2054_0", - "2054_1", - "2056_0", - "2056_1", - "2057_0", - "2058_0", - "205_0", - "2060_0", - "2061_0", - "2062_0", - "2062_1", - "2063_0", - "2064_1", - "2065_0", - "2066_0", - "2066_1", - "2067_0", - "2068_0", - "206_0", - "2070_0", - "2070_1", - "2074_0", - "2074_1", - "2077_0", - "2080_1", - "2081_1", - "2082_0", - "2084_1", - "2086_0", - "2086_1", - "2087_0", - "2087_1", - "2088_0", - "2088_1", - "2089_1", - "208_0", - "2090_0", - "2090_1", - "2091_0", - "2091_1", - "2092_0", - "2093_0", - "2093_1", - "2094_0", - "2094_1", - "2095_0", - "2095_1", - "2097_1", - 
"2098_1", - "2099_1", - "209_0", - "20_0", - "2100_0", - "2100_1", - "2101_0", - "2102_0", - "2102_1", - "2103_0", - "2104_0", - "2104_1", - "2105_0", - "2106_0", - "2106_1", - "2107_0", - "2107_1", - "2108_1", - "210_0", - "2110_0", - "2110_1", - "2111_0", - "2111_1", - "2112_0", - "2113_0", - "2115_0", - "2116_0", - "2117_0", - "2118_0", - "2119_0", - "2119_1", - "2120_0", - "2120_1", - "2121_0", - "2121_1", - "2122_0", - "2122_1", - "2123_0", - "2124_0", - "2126_0", - "2127_1", - "2128_0", - "2128_1", - "2129_1", - "212_0", - "2130_0", - "2130_1", - "2132_0", - "2132_1", - "2133_0", - "2133_1", - "2134_1", - "2135_0", - "2135_1", - "2136_0", - "2136_1", - "2137_0", - "2137_1", - "2138_1", - "2142_0", - "2143_1", - "2146_0", - "2146_1", - "2147_1", - "2148_0", - "2149_1", - "2153_0", - "2154_0", - "2155_0", - "2155_1", - "2156_0", - "2156_1", - "2157_0", - "2157_1", - "2159_0", - "215_0", - "2160_0", - "2161_0", - "2162_0", - "2162_1", - "2163_0", - "2164_0", - "2165_0", - "2168_0", - "2169_0", - "216_0", - "2170_0", - "2171_1", - "2172_0", - "2172_1", - "2173_0", - "2173_1", - "2174_1", - "2177_1", - "2178_0", - "2178_1", - "2179_0", - "217_0", - "2180_0", - "2180_1", - "2183_1", - "2184_0", - "2184_1", - "2185_0", - "2186_0", - "2187_1", - "2188_0", - "2189_0", - "2189_1", - "218_0", - "2190_0", - "2190_1", - "2191_0", - "2191_1", - "2192_0", - "2194_0", - "2195_0", - "2196_0", - "2197_0", - "2198_0", - "2199_0", - "2201_0", - "2202_0", - "2205_0", - "2205_1", - "2206_0", - "2207_0", - "2208_0", - "2208_1", - "220_0", - "2210_0", - "2211_0", - "2211_1", - "2212_0", - "2212_1", - "2213_0", - "2213_1", - "2216_0", - "2218_0", - "2219_0", - "2220_0", - "2222_0", - "2224_0", - "2226_0", - "2227_0", - "2228_0", - "2229_0", - "222_0", - "2232_0", - "2235_0", - "2239_0", - "223_0", - "2241_0", - "2244_0", - "2245_0", - "2246_0", - "2247_0", - "2249_0", - "224_0", - "2251_0", - "2252_0", - "2253_0", - "2257_0", - "2258_0", - "225_0", - "2262_0", - "2265_0", - "2266_0", 
- "2267_0", - "2270_0", - "2271_0", - "2273_0", - "2276_0", - "2277_0", - "2279_0", - "2280_0", - "2283_0", - "2284_0", - "2285_0", - "2286_0", - "2287_0", - "2288_0", - "2292_0", - "2294_0", - "2295_0", - "2296_0", - "2298_0", - "229_0", - "22_0", - "2301_0", - "2302_0", - "2304_0", - "2305_0", - "2306_0", - "2307_0", - "2308_0", - "2309_0", - "230_0", - "2310_0", - "2312_0", - "2313_0", - "2314_0", - "2317_0", - "2319_0", - "231_0", - "2320_0", - "2321_0", - "2322_0", - "2325_0", - "2326_0", - "2329_0", - "232_0", - "2330_0", - "2332_0", - "2333_0", - "2334_0", - "2335_0", - "2336_0", - "233_0", - "2340_0", - "2341_0", - "2342_0", - "2343_0", - "2345_0", - "2346_0", - "2348_0", - "2349_0", - "234_0", - "234_2", - "2350_0", - "2351_0", - "2352_0", - "2353_0", - "2355_0", - "2356_0", - "235_0", - "235_1", - "235_2", - "2361_0", - "2364_0", - "2365_0", - "2368_0", - "2369_0", - "236_0", - "2371_0", - "2372_0", - "2373_0", - "2374_0", - "2375_0", - "2376_0", - "2377_0", - "2378_0", - "237_0", - "2381_0", - "2384_0", - "2387_0", - "2388_0", - "2390_0", - "2392_0", - "2395_0", - "2397_0", - "2398_0", - "2399_0", - "239_1", - "2402_0", - "2404_0", - "2406_0", - "2407_0", - "2409_0", - "240_0", - "2410_0", - "2412_0", - "2416_0", - "2420_0", - "2421_0", - "2422_0", - "2423_0", - "2425_0", - "2426_0", - "2428_0", - "2430_0", - "2435_0", - "2436_0", - "2439_0", - "243_0", - "2440_0", - "2441_0", - "2442_0", - "2444_0", - "2446_0", - "2448_0", - "244_0", - "244_1", - "244_2", - "2450_0", - "2451_0", - "2452_0", - "2453_0", - "2454_0", - "2455_0", - "2457_0", - "2458_0", - "2459_0", - "245_0", - "2461_0", - "2462_0", - "2465_0", - "2466_0", - "2467_0", - "2468_0", - "2469_0", - "246_0", - "2474_0", - "2475_0", - "2476_0", - "2480_0", - "2481_0", - "2482_0", - "2484_0", - "2488_0", - "2489_0", - "248_0", - "2490_0", - "2491_0", - "2492_0", - "2493_0", - "2494_0", - "2495_0", - "2496_0", - "2497_0", - "2498_0", - "2499_0", - "249_0", - "2500_0", - "2501_0", - "2502_0", - 
"2503_0", - "2504_0", - "2505_0", - "2506_0", - "2507_0", - "2509_0", - "250_0", - "2512_0", - "2514_0", - "2515_0", - "2516_0", - "2517_0", - "2518_0", - "2519_0", - "251_0", - "2522_0", - "2523_0", - "2525_0", - "2526_0", - "2528_0", - "2529_0", - "252_0", - "2530_0", - "2532_0", - "2533_0", - "2536_0", - "2537_0", - "2538_0", - "253_1", - "2540_0", - "2541_0", - "2542_0", - "2543_0", - "2545_0", - "2547_0", - "2548_0", - "2549_0", - "2550_0", - "2551_0", - "2553_0", - "2554_0", - "2555_0", - "2556_0", - "255_0", - "2562_0", - "2564_0", - "2566_0", - "2568_0", - "2569_0", - "256_0", - "2571_0", - "2574_0", - "2577_0", - "2579_0", - "257_0", - "2580_0", - "2581_0", - "2582_0", - "2583_0", - "2584_0", - "2585_0", - "2586_0", - "2587_0", - "2589_0", - "258_0", - "2591_0", - "2596_0", - "259_0", - "259_2", - "260_0", - "262_0", - "263_0", - "264_0", - "265_1", - "267_0", - "270_0", - "273_0", - "274_0", - "275_0", - "276_0", - "277_0", - "278_0", - "281_0", - "282_0", - "283_0", - "284_0", - "285_0", - "287_0", - "288_0", - "289_0", - "290_0", - "290_2", - "291_0", - "293_0", - "295_0", - "298_0", - "29_0", - "300_0", - "301_0", - "302_0", - "304_0", - "308_0", - "309_0", - "310_0", - "312_0", - "314_0", - "314_2", - "317_0", - "318_0", - "318_1", - "318_2", - "319_0", - "31_0", - "320_0", - "321_0", - "322_0", - "323_0", - "324_0", - "325_0", - "326_0", - "327_0", - "329_1", - "329_2", - "32_0", - "330_0", - "332_0", - "334_0", - "33_0", - "341_0", - "342_0", - "344_0", - "345_0", - "346_0", - "348_0", - "352_0", - "353_0", - "354_0", - "355_1", - "355_2", - "356_0", - "356_1", - "357_0", - "358_0", - "360_0", - "361_0", - "364_0", - "365_0", - "368_0", - "369_0", - "370_0", - "371_0", - "372_0", - "373_0", - "374_0", - "375_0", - "378_0", - "37_0", - "380_0", - "380_2", - "383_0", - "384_0", - "385_0", - "386_0", - "387_0", - "389_0", - "38_0", - "390_0", - "391_1", - "391_2", - "393_0", - "394_0", - "395_0", - "395_2", - "396_0", - "397_0", - "398_0", - "399_0", - 
"399_1", - "399_2", - "400_0", - "401_0", - "402_0", - "406_0", - "409_0", - "410_0", - "411_0", - "414_0", - "415_0", - "416_0", - "417_0", - "418_0", - "418_1", - "418_2", - "419_0", - "41_0", - "420_0", - "421_0", - "422_0", - "423_0", - "425_0", - "426_1", - "427_2", - "429_0", - "429_1", - "429_2", - "42_0", - "430_2", - "431_0", - "433_0", - "433_1", - "433_2", - "435_0", - "437_0", - "437_2", - "442_1", - "444_0", - "446_0", - "448_0", - "449_2", - "44_0", - "450_0", - "451_0", - "451_1", - "451_2", - "452_0", - "452_1", - "453_0", - "453_1", - "456_0", - "456_1", - "456_2", - "457_0", - "458_0", - "460_0", - "462_0", - "463_0", - "464_0", - "465_0", - "466_0", - "468_0", - "469_0", - "46_0", - "470_0", - "471_0", - "472_0", - "473_0", - "473_2", - "474_0", - "475_0", - "477_0", - "478_0", - "47_0", - "480_0", - "481_0", - "482_0", - "483_0", - "484_0", - "485_0", - "485_1", - "485_2", - "486_0", - "487_0", - "488_0", - "489_0", - "48_0", - "490_0", - "491_0", - "493_0", - "494_0", - "495_1", - "496_0", - "497_0", - "497_1", - "497_2", - "498_0", - "499_1", - "499_2", - "49_0", - "500_1", - "500_2", - "501_0", - "501_2", - "502_0", - "502_1", - "502_2", - "503_0", - "503_1", - "503_2", - "506_0", - "507_0", - "508_0", - "509_0", - "50_0", - "510_2", - "512_0", - "513_0", - "513_1", - "513_2", - "515_0", - "517_1", - "517_2", - "519_0", - "520_0", - "520_1", - "520_2", - "522_0", - "524_0", - "528_0", - "530_0", - "531_0", - "534_0", - "534_1", - "534_2", - "537_0", - "538_0", - "539_0", - "53_0", - "541_0", - "542_1", - "542_2", - "543_0", - "544_0", - "545_0", - "546_0", - "547_0", - "548_0", - "548_1", - "548_2", - "54_0", - "550_0", - "551_0", - "552_0", - "554_0", - "555_0", - "556_0", - "557_1", - "557_2", - "558_0", - "558_1", - "558_2", - "559_0", - "560_0", - "560_1", - "560_2", - "562_0", - "563_0", - "564_0", - "565_0", - "566_0", - "567_0", - "568_0", - "569_0", - "56_0", - "574_0", - "576_0", - "577_0", - "579_0", - "579_1", - "579_2", - "580_2", 
- "581_0", - "583_0", - "583_1", - "584_0", - "584_1", - "584_2", - "585_0", - "586_0", - "586_2", - "587_0", - "588_0", - "588_1", - "588_2", - "589_1", - "591_1", - "591_2", - "594_0", - "595_0", - "597_0", - "597_1", - "597_2", - "598_0", - "5_0", - "600_0", - "601_0", - "607_0", - "608_0", - "609_0", - "609_1", - "609_2", - "60_0", - "610_0", - "611_0", - "614_0", - "614_1", - "617_0", - "618_0", - "618_1", - "619_1", - "61_0", - "621_0", - "623_0", - "623_1", - "624_0", - "625_0", - "625_1", - "626_0", - "628_0", - "630_0", - "630_1", - "630_2", - "631_0", - "632_0", - "633_0", - "634_0", - "635_0", - "636_0", - "637_0", - "637_2", - "638_0", - "639_2", - "63_0", - "642_0", - "643_0", - "647_0", - "648_0", - "649_1", - "649_2", - "64_0", - "650_0", - "653_1", - "654_0", - "655_0", - "656_0", - "657_0", - "658_1", - "659_0", - "65_0", - "660_1", - "661_0", - "662_0", - "663_0", - "664_1", - "664_2", - "666_0", - "666_1", - "667_1", - "668_0", - "668_2", - "66_0", - "670_0", - "671_0", - "673_0", - "674_0", - "677_0", - "678_0", - "678_2", - "679_0", - "67_0", - "681_0", - "681_1", - "682_0", - "683_0", - "683_1", - "683_2", - "685_0", - "687_0", - "689_0", - "68_0", - "691_0", - "692_0", - "692_1", - "692_2", - "693_0", - "694_0", - "695_0", - "697_0", - "697_1", - "699_0", - "69_0", - "700_1", - "700_2", - "701_0", - "702_0", - "705_0", - "708_0", - "709_0", - "70_0", - "711_0", - "713_0", - "716_0", - "718_0", - "719_0", - "719_1", - "719_2", - "720_0", - "721_0", - "722_0", - "722_1", - "723_0", - "724_0", - "724_1", - "724_2", - "727_0", - "728_0", - "729_0", - "729_1", - "72_0", - "731_0", - "732_0", - "733_0", - "734_0", - "735_0", - "736_0", - "736_1", - "736_2", - "739_0", - "740_0", - "741_0", - "742_0", - "742_1", - "744_0", - "744_1", - "746_0", - "747_0", - "748_0", - "750_0", - "751_0", - "751_2", - "752_0", - "753_0", - "753_1", - "754_0", - "754_2", - "756_2", - "757_0", - "758_0", - "759_0", - "759_1", - "759_2", - "75_0", - "760_0", - "760_1", 
- "760_2", - "761_0", - "761_1", - "762_0", - "762_1", - "763_2", - "765_0", - "765_2", - "766_0", - "769_0", - "769_1", - "769_2", - "770_0", - "775_0", - "775_1", - "779_2", - "77_0", - "77_1", - "781_0", - "781_1", - "783_1", - "784_0", - "787_0", - "788_0", - "789_0", - "78_0", - "790_0", - "791_0", - "792_0", - "793_0", - "793_1", - "793_2", - "794_0", - "795_0", - "796_0", - "796_1", - "797_0", - "797_1", - "797_2", - "798_0", - "798_1", - "798_2", - "799_0", - "79_0", - "7_0", - "801_0", - "803_0", - "803_1", - "803_2", - "804_0", - "805_0", - "806_2", - "807_0", - "808_0", - "80_2", - "810_0", - "811_0", - "811_1", - "812_0", - "813_0", - "815_0", - "817_0", - "819_0", - "81_0", - "821_0", - "822_0", - "823_0", - "824_0", - "824_1", - "827_0", - "828_1", - "828_2", - "829_0", - "82_0", - "830_0", - "831_0", - "831_2", - "832_0", - "833_0", - "835_0", - "837_0", - "838_0", - "839_0", - "83_0", - "840_0", - "841_0", - "841_1", - "841_2", - "842_0", - "843_0", - "844_0", - "845_1", - "845_2", - "846_0", - "848_0", - "849_0", - "850_0", - "851_0", - "853_0", - "853_1", - "853_2", - "854_0", - "856_0", - "858_1", - "858_2", - "859_0", - "85_0", - "860_0", - "860_1", - "861_0", - "861_1", - "861_2", - "862_0", - "863_0", - "863_1", - "864_0", - "865_0", - "870_0", - "871_0", - "875_0", - "875_1", - "875_2", - "877_0", - "878_1", - "878_2", - "879_2", - "87_0", - "880_0", - "881_0", - "883_0", - "883_1", - "883_2", - "886_0", - "886_1", - "886_2", - "887_2", - "888_0", - "88_0", - "891_0", - "891_2", - "893_0", - "893_1", - "894_0", - "894_2", - "895_0", - "896_0", - "897_0", - "899_1", - "899_2", - "89_0", - "8_0", - "900_0", - "901_0", - "903_0", - "905_0", - "907_0", - "908_0", - "908_1", - "908_2", - "910_0", - "910_1", - "910_2", - "911_0", - "911_1", - "911_2", - "913_1", - "914_0", - "915_0", - "915_1", - "915_2", - "916_0", - "916_2", - "917_0", - "918_0", - "918_1", - "918_2", - "919_0", - "919_1", - "920_0", - "921_1", - "921_2", - "924_0", - "924_1", - 
"925_1", - "925_2", - "927_1", - "927_2", - "929_1", - "929_2", - "92_0", - "930_0", - "930_1", - "932_1", - "932_2", - "934_1", - "934_2", - "935_0", - "936_1", - "937_0", - "937_2", - "939_1", - "939_2", - "93_0", - "940_0", - "940_2", - "941_0", - "941_1", - "941_2", - "942_0", - "943_1", - "943_2", - "944_0", - "944_2", - "946_0", - "946_2", - "947_0", - "947_1", - "948_0", - "948_1", - "949_0", - "949_1", - "951_0", - "952_0", - "953_0", - "953_1", - "954_1", - "954_2", - "955_0", - "955_2", - "956_0", - "956_1", - "956_2", - "957_0", - "957_1", - "959_0", - "960_0", - "960_1", - "960_2", - "962_0", - "963_0", - "963_2", - "964_0", - "965_0", - "965_1", - "965_2", - "966_0", - "966_1", - "968_0", - "968_1", - "969_0", - "971_0", - "972_0", - "973_0", - "975_0", - "975_1", - "975_2", - "977_1", - "978_0", - "97_0", - "980_0", - "980_1", - "980_2", - "981_0", - "981_1", - "981_2", - "982_0", - "982_2", - "983_1", - "984_0", - "984_1", - "985_0", - "985_1", - "986_0", - "986_1", - "986_2", - "987_0", - "987_2", - "988_0", - "988_1", - "989_0", - "989_1", - "989_2", - "98_0", - "990_0", - "990_2", - "991_0", - "991_1", - "993_0", - "994_0", - "994_1", - "994_2", - "995_0", - "996_0", - "996_2", - "999_0", - "999_1", - "9_0" - ], - "test_query_ids": [ - "0_0", - "0_1", - "0_2", - "100_0", - "100_1", - "101_0", - "102_0", - "102_1", - "102_2", - "103_0", - "103_1", - "103_2", - "104_0", - "104_1", - "104_2", - "105_0", - "105_1", - "105_2", - "106_0", - "106_1", - "108_0", - "109_0", - "109_1", - "109_2", - "110_0", - "110_1", - "110_2", - "111_0", - "111_1", - "111_2", - "112_0", - "112_1", - "112_2", - "113_0", - "113_1", - "113_2", - "114_0", - "114_1", - "114_2", - "115_0", - "116_0", - "116_1", - "116_2", - "117_0", - "117_1", - "117_2", - "118_0", - "118_1", - "118_2", - "119_0", - "119_1", - "119_2", - "11_0", - "11_1", - "11_2", - "120_0", - "121_0", - "121_1", - "122_0", - "122_1", - "122_2", - "123_0", - "123_1", - "123_2", - "124_0", - "124_1", - "124_2", 
- "125_0", - "126_0", - "126_1", - "126_2", - "127_0", - "127_1", - "127_2", - "128_0", - "128_1", - "128_2", - "129_0", - "129_1", - "129_2", - "12_0", - "130_0", - "130_1", - "130_2", - "131_0", - "131_1", - "131_2", - "132_0", - "132_1", - "132_2", - "133_0", - "133_1", - "133_2", - "134_0", - "134_1", - "134_2", - "136_0", - "136_1", - "136_2", - "137_0", - "137_1", - "138_0", - "138_1", - "138_2", - "139_0", - "139_1", - "13_0", - "140_0", - "140_1", - "140_2", - "141_0", - "141_1", - "141_2", - "142_0", - "142_1", - "142_2", - "144_0", - "145_0", - "146_0", - "146_1", - "146_2", - "148_0", - "148_1", - "148_2", - "149_0", - "149_1", - "149_2", - "14_0", - "14_1", - "14_2", - "150_0", - "150_1", - "150_2", - "151_0", - "151_1", - "151_2", - "152_0", - "153_0", - "153_1", - "153_2", - "154_0", - "154_1", - "154_2", - "155_0", - "155_1", - "155_2", - "156_0", - "156_1", - "156_2", - "157_0", - "157_1", - "157_2", - "158_0", - "158_1", - "158_2", - "159_0", - "159_1", - "159_2", - "15_0", - "15_1", - "161_0", - "161_1", - "161_2", - "162_0", - "162_1", - "162_2", - "163_0", - "164_0", - "165_0", - "165_1", - "165_2", - "166_0", - "166_1", - "166_2", - "167_0", - "168_0", - "168_1", - "169_0", - "169_1", - "169_2", - "16_0", - "16_1", - "16_2", - "170_0", - "170_1", - "170_2", - "171_0", - "171_1", - "171_2", - "172_0", - "173_0", - "173_1", - "173_2", - "174_0", - "174_1", - "175_0", - "175_1", - "175_2", - "176_0", - "176_1", - "176_2", - "177_0", - "177_1", - "177_2", - "178_0", - "179_0", - "179_1", - "179_2", - "17_0", - "17_1", - "17_2", - "180_0", - "180_1", - "180_2", - "181_0", - "181_1", - "181_2", - "182_0", - "183_0", - "183_1", - "184_0", - "184_1", - "184_2", - "185_0", - "185_1", - "185_2", - "186_0", - "186_1", - "186_2", - "187_0", - "187_1", - "187_2", - "188_0", - "189_0", - "189_1", - "189_2", - "18_0", - "18_1", - "18_2", - "192_0", - "192_1", - "192_2", - "193_0", - "193_1", - "193_2", - "194_0", - "194_1", - "196_0", - "196_1", - "197_0", - 
"197_1", - "197_2", - "198_0", - "198_1", - "19_0", - "19_1", - "19_2", - "1_0", - "1_1", - "200_0", - "200_1", - "200_2", - "201_0", - "201_1", - "202_0", - "202_1", - "203_0", - "204_0", - "205_0", - "205_1", - "206_0", - "206_1", - "207_0", - "207_1", - "208_0", - "208_1", - "209_0", - "20_0", - "20_1", - "20_2", - "210_0", - "210_1", - "211_0", - "211_1", - "212_0", - "213_0", - "213_1", - "215_0", - "215_1", - "216_0", - "216_1", - "217_0", - "217_1", - "218_0", - "218_1", - "219_0", - "219_1", - "21_0", - "220_0", - "220_1", - "221_0", - "221_1", - "222_0", - "222_1", - "223_0", - "223_1", - "225_0", - "226_0", - "226_1", - "227_0", - "227_1", - "228_0", - "228_1", - "229_0", - "22_0", - "22_1", - "22_2", - "230_0", - "230_1", - "231_0", - "231_1", - "232_0", - "232_1", - "233_0", - "233_1", - "234_0", - "235_0", - "235_1", - "236_0", - "236_1", - "237_0", - "239_0", - "239_1", - "23_0", - "23_1", - "23_2", - "241_0", - "241_1", - "243_0", - "243_1", - "244_0", - "244_1", - "247_0", - "247_1", - "248_0", - "248_1", - "249_0", - "24_0", - "24_1", - "24_2", - "250_0", - "250_1", - "251_0", - "251_1", - "252_0", - "252_1", - "253_0", - "253_1", - "254_0", - "254_1", - "255_0", - "255_1", - "256_0", - "256_1", - "257_0", - "257_1", - "258_0", - "258_1", - "259_0", - "259_1", - "25_0", - "25_1", - "25_2", - "260_0", - "260_1", - "261_0", - "261_1", - "263_0", - "264_0", - "264_1", - "265_0", - "266_0", - "266_1", - "267_0", - "267_1", - "268_0", - "268_1", - "269_0", - "269_1", - "26_0", - "26_1", - "270_0", - "270_1", - "271_0", - "272_0", - "272_1", - "273_0", - "273_1", - "275_0", - "275_1", - "276_0", - "276_1", - "279_0", - "279_1", - "27_0", - "27_1", - "27_2", - "280_0", - "280_1", - "281_0", - "281_1", - "282_0", - "282_1", - "283_0", - "283_1", - "284_0", - "284_1", - "285_0", - "285_1", - "286_0", - "286_1", - "287_0", - "287_1", - "288_0", - "288_1", - "289_0", - "289_1", - "28_0", - "28_1", - "28_2", - "290_0", - "291_0", - "291_1", - "292_0", - 
"292_1", - "293_0", - "293_1", - "294_0", - "294_1", - "295_0", - "295_1", - "296_0", - "296_1", - "297_0", - "297_1", - "298_0", - "298_1", - "299_0", - "299_1", - "2_0", - "2_1", - "300_0", - "300_1", - "301_0", - "302_0", - "302_1", - "303_0", - "303_1", - "305_0", - "305_1", - "306_0", - "306_1", - "307_0", - "307_1", - "308_0", - "308_1", - "309_0", - "309_1", - "30_0", - "30_1", - "30_2", - "310_0", - "311_0", - "311_1", - "312_0", - "312_1", - "313_0", - "313_1", - "314_0", - "315_0", - "315_1", - "316_0", - "317_0", - "317_1", - "318_0", - "318_1", - "319_0", - "319_1", - "31_0", - "31_1", - "31_2", - "320_0", - "320_1", - "321_0", - "321_1", - "322_0", - "324_0", - "324_1", - "325_0", - "325_1", - "326_0", - "326_1", - "327_0", - "327_1", - "328_0", - "328_1", - "329_0", - "329_1", - "32_0", - "32_1", - "32_2", - "330_0", - "330_1", - "331_0", - "331_1", - "332_0", - "332_1", - "333_0", - "333_1", - "334_0", - "334_1", - "335_0", - "335_1", - "336_0", - "336_1", - "337_0", - "337_1", - "338_0", - "338_1", - "339_0", - "339_1", - "33_0", - "33_1", - "340_0", - "340_1", - "341_0", - "341_1", - "342_0", - "342_1", - "344_0", - "344_1", - "346_0", - "346_1", - "348_0", - "348_1", - "349_0", - "349_1", - "34_0", - "34_1", - "34_2", - "351_0", - "352_0", - "353_0", - "353_1", - "354_0", - "354_1", - "355_0", - "355_1", - "357_0", - "358_0", - "358_1", - "359_0", - "359_1", - "360_0", - "361_0", - "361_1", - "362_0", - "362_1", - "364_0", - "364_1", - "365_0", - "365_1", - "366_0", - "366_1", - "367_0", - "367_1", - "368_0", - "368_1", - "369_0", - "369_1", - "36_0", - "36_1", - "36_2", - "370_0", - "370_1", - "371_0", - "371_1", - "372_0", - "372_1", - "373_0", - "374_0", - "375_0", - "375_1", - "376_0", - "376_1", - "377_0", - "377_1", - "378_0", - "378_1", - "379_0", - "379_1", - "37_0", - "37_1", - "381_0", - "381_1", - "382_0", - "383_0", - "383_1", - "384_0", - "384_1", - "385_0", - "385_1", - "386_0", - "387_0", - "387_1", - "389_0", - "389_1", - "38_0", - 
"38_1", - "38_2", - "390_0", - "390_1", - "391_0", - "391_1", - "392_0", - "392_1", - "393_0", - "393_1", - "394_0", - "394_1", - "395_0", - "395_1", - "396_0", - "396_1", - "397_0", - "397_1", - "398_0", - "398_1", - "399_0", - "399_1", - "39_0", - "3_0", - "3_1", - "400_0", - "400_1", - "402_0", - "402_1", - "403_0", - "403_1", - "404_0", - "404_1", - "405_0", - "405_1", - "406_0", - "406_1", - "407_0", - "407_1", - "408_0", - "408_1", - "409_0", - "409_1", - "40_0", - "40_1", - "40_2", - "410_0", - "410_1", - "411_0", - "411_1", - "412_0", - "412_1", - "413_0", - "413_1", - "414_0", - "414_1", - "415_0", - "415_1", - "416_0", - "417_0", - "417_1", - "418_0", - "418_1", - "419_0", - "419_1", - "420_0", - "420_1", - "421_0", - "421_1", - "423_0", - "423_1", - "424_0", - "425_0", - "426_0", - "426_1", - "428_0", - "428_1", - "429_0", - "429_1", - "42_0", - "430_0", - "431_0", - "431_1", - "432_0", - "432_1", - "433_0", - "433_1", - "434_0", - "434_1", - "435_0", - "435_1", - "436_0", - "436_1", - "437_0", - "437_1", - "438_0", - "438_1", - "439_0", - "439_1", - "43_0", - "43_1", - "43_2", - "440_0", - "440_1", - "441_0", - "442_0", - "443_0", - "443_1", - "444_0", - "444_1", - "446_0", - "447_0", - "448_0", - "448_1", - "449_0", - "44_0", - "44_1", - "44_2", - "450_0", - "450_1", - "451_0", - "451_1", - "452_0", - "452_1", - "453_0", - "453_1", - "454_0", - "456_0", - "456_1", - "458_0", - "458_1", - "459_0", - "459_1", - "45_0", - "45_1", - "45_2", - "460_0", - "460_1", - "461_0", - "462_0", - "464_0", - "464_1", - "465_0", - "465_1", - "466_0", - "467_0", - "468_0", - "468_1", - "46_0", - "46_1", - "46_2", - "470_0", - "470_1", - "471_0", - "471_1", - "472_0", - "472_1", - "473_0", - "473_1", - "474_0", - "476_0", - "476_1", - "477_0", - "477_1", - "478_0", - "478_1", - "479_0", - "479_1", - "480_0", - "480_1", - "481_0", - "482_0", - "482_1", - "483_0", - "483_1", - "484_0", - "484_1", - "485_0", - "485_1", - "486_0", - "486_1", - "487_0", - "487_1", - "488_0", 
- "488_1", - "489_0", - "48_0", - "48_1", - "48_2", - "490_0", - "490_1", - "492_0", - "492_1", - "494_0", - "494_1", - "495_0", - "495_1", - "496_0", - "496_1", - "497_0", - "497_1", - "498_0", - "499_0", - "499_1", - "49_0", - "49_1", - "49_2", - "4_0", - "4_1", - "4_2", - "500_0", - "500_1", - "501_0", - "501_1", - "502_0", - "502_1", - "503_0", - "503_1", - "504_0", - "504_1", - "506_0", - "506_1", - "507_0", - "507_1", - "508_0", - "508_1", - "509_0", - "50_0", - "50_1", - "50_2", - "510_0", - "510_1", - "512_0", - "512_1", - "513_0", - "513_1", - "514_0", - "514_1", - "515_0", - "516_0", - "516_1", - "517_0", - "517_1", - "519_0", - "519_1", - "51_0", - "51_1", - "51_2", - "520_0", - "521_0", - "521_1", - "522_0", - "522_1", - "523_0", - "523_1", - "524_0", - "524_1", - "525_0", - "525_1", - "526_0", - "526_1", - "527_0", - "527_1", - "528_0", - "528_1", - "529_0", - "529_1", - "52_0", - "52_1", - "52_2", - "530_0", - "531_0", - "531_1", - "532_0", - "532_1", - "533_0", - "534_0", - "535_0", - "536_0", - "536_1", - "538_0", - "538_1", - "539_0", - "53_0", - "53_1", - "53_2", - "540_0", - "540_1", - "541_0", - "541_1", - "542_0", - "542_1", - "543_0", - "543_1", - "544_0", - "544_1", - "545_0", - "545_1", - "546_0", - "546_1", - "547_0", - "548_0", - "548_1", - "549_0", - "549_1", - "54_0", - "54_1", - "54_2", - "550_0", - "550_1", - "552_0", - "553_0", - "553_1", - "554_0", - "554_1", - "555_0", - "555_1", - "556_0", - "556_1", - "557_0", - "557_1", - "558_0", - "558_1", - "559_0", - "559_1", - "55_0", - "55_1", - "55_2", - "560_0", - "560_1", - "561_0", - "561_1", - "562_0", - "563_0", - "563_1", - "564_0", - "564_1", - "565_0", - "565_1", - "566_0", - "566_1", - "568_0", - "568_1", - "569_0", - "56_0", - "56_1", - "56_2", - "570_0", - "570_1", - "571_0", - "572_0", - "572_1", - "573_0", - "573_1", - "574_0", - "575_0", - "575_1", - "576_0", - "576_1", - "577_0", - "577_1", - "578_0", - "578_1", - "579_0", - "579_1", - "57_0", - "57_1", - "57_2", - "580_0", 
- "580_1", - "581_0", - "581_1", - "582_0", - "582_1", - "583_0", - "584_0", - "584_1", - "586_0", - "586_1", - "587_0", - "588_0", - "588_1", - "589_0", - "589_1", - "58_0", - "58_1", - "58_2", - "590_0", - "590_1", - "592_0", - "592_1", - "593_0", - "594_0", - "594_1", - "595_0", - "595_1", - "596_0", - "596_1", - "597_0", - "598_0", - "598_1", - "599_0", - "599_1", - "59_0", - "59_1", - "59_2", - "5_0", - "5_1", - "5_2", - "600_0", - "600_1", - "601_0", - "601_1", - "602_0", - "602_1", - "603_0", - "603_1", - "604_0", - "604_1", - "605_0", - "606_0", - "606_1", - "607_0", - "607_1", - "608_0", - "608_1", - "609_0", - "609_1", - "60_0", - "60_1", - "60_2", - "610_0", - "610_1", - "613_0", - "614_0", - "615_0", - "616_0", - "616_1", - "617_0", - "617_1", - "619_0", - "619_1", - "61_0", - "61_1", - "61_2", - "621_0", - "621_1", - "623_0", - "624_0", - "625_0", - "625_1", - "626_0", - "628_0", - "628_1", - "629_0", - "629_1", - "62_0", - "62_1", - "632_0", - "632_1", - "633_0", - "633_1", - "634_0", - "634_1", - "635_0", - "636_0", - "636_1", - "637_0", - "637_1", - "638_0", - "639_0", - "639_1", - "63_0", - "63_1", - "63_2", - "640_0", - "640_1", - "641_0", - "641_1", - "642_0", - "642_1", - "643_0", - "643_1", - "644_0", - "646_0", - "646_1", - "647_0", - "648_0", - "648_1", - "649_0", - "64_0", - "64_1", - "64_2", - "650_0", - "650_1", - "651_0", - "651_1", - "652_0", - "652_1", - "653_0", - "653_1", - "654_0", - "655_0", - "655_1", - "65_0", - "65_1", - "65_2", - "67_0", - "67_1", - "67_2", - "68_0", - "68_1", - "69_0", - "69_1", - "69_2", - "6_0", - "6_1", - "6_2", - "70_0", - "70_1", - "70_2", - "71_0", - "71_1", - "71_2", - "72_0", - "72_1", - "72_2", - "73_0", - "73_1", - "73_2", - "74_0", - "74_1", - "74_2", - "75_0", - "75_1", - "75_2", - "76_0", - "76_1", - "76_2", - "77_0", - "77_1", - "78_0", - "78_1", - "78_2", - "7_0", - "7_1", - "80_0", - "80_1", - "80_2", - "81_0", - "81_1", - "82_0", - "82_1", - "83_0", - "83_1", - "83_2", - "84_0", - "84_1", - 
"84_2", - "85_0", - "86_0", - "86_1", - "86_2", - "87_0", - "87_1", - "87_2", - "88_0", - "88_1", - "88_2", - "8_0", - "90_0", - "90_1", - "91_0", - "91_1", - "91_2", - "92_0", - "92_1", - "92_2", - "93_0", - "93_1", - "94_0", - "94_1", - "94_2", - "95_0", - "95_1", - "95_2", - "96_0", - "96_1", - "96_2", - "97_0", - "97_1", - "97_2", - "98_0", - "98_1", - "98_2", - "9_0", - "9_1", - "9_2" - ] -} \ No newline at end of file diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/splits/summary.json b/cosmos-retriever/src/cosmos_retriever/datagen/splits/summary.json deleted file mode 100644 index 1eebdca..0000000 --- a/cosmos-retriever/src/cosmos_retriever/datagen/splits/summary.json +++ /dev/null @@ -1,34 +0,0 @@ -[ - { - "dataset": "browsecompplus", - "total": 830, - "train": 664, - "test": 166, - "sft": 199, - "rl": 465 - }, - { - "dataset": "sec", - "total": 4084, - "train": 3453, - "test": 1216, - "sft": 1035, - "rl": 2418 - }, - { - "dataset": "patents", - "total": 3107, - "train": 2518, - "test": 718, - "sft": 755, - "rl": 1763 - }, - { - "dataset": "web", - "total": 2351, - "train": 2224, - "test": 554, - "sft": 667, - "rl": 1557 - } -] \ No newline at end of file diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/splits/web_splits.json b/cosmos-retriever/src/cosmos_retriever/datagen/splits/web_splits.json deleted file mode 100644 index 0915b3c..0000000 --- a/cosmos-retriever/src/cosmos_retriever/datagen/splits/web_splits.json +++ /dev/null @@ -1,2794 +0,0 @@ -{ - "dataset": "web", - "total_queries": 2351, - "train_queries": 2224, - "test_queries": 554, - "sft_queries": 667, - "rl_queries": 1557, - "sft_ratio": 0.29991007194244607, - "rl_ratio": 0.700089928057554, - "sft_query_ids": [ - "0_2", - "100_3", - "101_1", - "102_1", - "102_3", - "103_0", - "103_1", - "105_0", - "106_2", - "106_3", - "107_0", - "107_2", - "108_1", - "109_0", - "10_1", - "10_2", - "10_3", - "111_1", - "115_0", - "115_3", - "117_2", - "118_2", - "11_0", - "120_1", - 
"121_1", - "123_0", - "123_1", - "123_3", - "125_0", - "125_3", - "126_0", - "126_2", - "127_1", - "129_1", - "12_0", - "12_1", - "131_2", - "131_3", - "135_0", - "135_1", - "139_0", - "139_2", - "13_1", - "140_3", - "143_0", - "143_1", - "143_2", - "144_2", - "145_0", - "145_1", - "146_0", - "146_1", - "146_2", - "148_0", - "149_0", - "149_1", - "149_2", - "150_3", - "151_0", - "153_1", - "155_0", - "156_3", - "158_0", - "159_0", - "160_2", - "161_1", - "161_3", - "162_1", - "163_1", - "163_3", - "164_1", - "164_2", - "165_0", - "166_1", - "169_0", - "16_3", - "171_0", - "171_1", - "175_2", - "17_2", - "180_0", - "180_2", - "182_0", - "184_1", - "184_2", - "184_3", - "187_1", - "188_0", - "189_0", - "189_1", - "190_0", - "190_1", - "190_2", - "190_3", - "191_1", - "192_2", - "193_1", - "196_0", - "196_2", - "196_3", - "197_2", - "199_1", - "1_3", - "201_0", - "203_1", - "203_2", - "204_0", - "204_1", - "204_2", - "204_3", - "206_1", - "207_2", - "209_2", - "212_1", - "212_2", - "215_0", - "215_2", - "215_3", - "219_0", - "219_1", - "21_1", - "21_2", - "220_0", - "220_1", - "222_1", - "223_1", - "223_3", - "224_0", - "226_0", - "227_1", - "228_0", - "228_1", - "229_3", - "22_2", - "22_3", - "230_0", - "231_2", - "231_3", - "232_0", - "233_0", - "234_0", - "236_0", - "237_0", - "238_1", - "239_0", - "239_3", - "23_1", - "240_0", - "242_1", - "244_0", - "244_1", - "245_1", - "245_3", - "247_0", - "247_1", - "247_2", - "249_1", - "24_0", - "24_1", - "24_2", - "250_0", - "251_0", - "251_3", - "252_0", - "253_0", - "253_1", - "253_3", - "254_0", - "258_3", - "259_0", - "260_2", - "261_1", - "262_0", - "263_2", - "263_3", - "264_1", - "264_2", - "265_0", - "265_1", - "266_0", - "266_2", - "267_0", - "267_3", - "268_0", - "268_1", - "268_3", - "269_1", - "26_0", - "270_2", - "271_0", - "272_0", - "272_1", - "272_2", - "273_2", - "274_0", - "275_1", - "275_2", - "275_3", - "276_0", - "278_1", - "279_0", - "279_3", - "280_1", - "280_3", - "286_0", - "286_1", - "288_1", - 
"28_3", - "293_1", - "293_3", - "295_0", - "295_3", - "299_0", - "299_2", - "2_0", - "2_1", - "300_2", - "301_0", - "303_1", - "304_3", - "307_1", - "307_2", - "309_0", - "309_2", - "309_3", - "30_1", - "315_0", - "315_1", - "316_0", - "316_3", - "317_2", - "31_3", - "321_0", - "322_2", - "322_3", - "324_0", - "324_1", - "324_3", - "325_0", - "326_0", - "329_0", - "32_0", - "32_2", - "32_3", - "331_0", - "332_3", - "334_0", - "334_1", - "336_1", - "337_2", - "338_0", - "339_0", - "339_3", - "33_2", - "33_3", - "340_1", - "341_0", - "341_2", - "343_0", - "343_2", - "348_0", - "348_1", - "349_0", - "349_1", - "34_0", - "350_2", - "350_3", - "351_0", - "351_2", - "351_3", - "352_0", - "354_2", - "357_2", - "35_1", - "35_2", - "360_2", - "360_3", - "361_1", - "362_0", - "364_3", - "366_3", - "367_3", - "368_0", - "368_1", - "370_2", - "371_2", - "372_0", - "374_0", - "375_1", - "376_1", - "377_3", - "378_1", - "379_0", - "381_3", - "382_0", - "382_1", - "382_2", - "384_0", - "384_2", - "385_0", - "386_1", - "38_0", - "38_1", - "390_1", - "390_2", - "391_1", - "395_1", - "395_3", - "396_2", - "397_3", - "398_0", - "398_2", - "39_2", - "39_3", - "3_1", - "3_2", - "401_2", - "402_0", - "402_2", - "405_0", - "405_2", - "407_0", - "407_1", - "407_2", - "408_2", - "40_0", - "40_1", - "412_1", - "412_2", - "412_3", - "414_1", - "414_3", - "415_1", - "415_2", - "416_1", - "417_1", - "421_1", - "422_0", - "422_2", - "426_1", - "428_1", - "42_2", - "435_1", - "436_1", - "436_3", - "437_0", - "437_1", - "438_2", - "439_0", - "439_1", - "439_2", - "43_1", - "43_2", - "440_0", - "444_0", - "444_2", - "445_1", - "447_0", - "447_1", - "448_1", - "448_2", - "448_3", - "449_1", - "449_2", - "449_3", - "451_0", - "453_2", - "454_0", - "454_2", - "455_0", - "455_1", - "456_0", - "456_1", - "456_2", - "456_3", - "457_1", - "457_2", - "458_1", - "45_1", - "45_2", - "460_0", - "461_1", - "467_2", - "468_0", - "469_0", - "46_1", - "46_2", - "471_0", - "472_2", - "473_1", - "474_0", - 
"474_1", - "475_0", - "475_1", - "475_3", - "476_3", - "477_0", - "478_0", - "478_2", - "480_3", - "482_1", - "485_0", - "485_3", - "487_0", - "487_1", - "488_1", - "489_1", - "489_2", - "48_1", - "491_1", - "492_1", - "492_2", - "492_3", - "495_1", - "497_2", - "498_1", - "499_0", - "499_1", - "4_3", - "503_1", - "503_2", - "508_2", - "509_2", - "510_0", - "510_1", - "511_0", - "511_1", - "512_0", - "515_2", - "516_3", - "517_0", - "519_3", - "520_0", - "520_2", - "521_1", - "521_2", - "522_2", - "522_3", - "523_0", - "524_0", - "524_2", - "525_0", - "526_0", - "527_1", - "527_2", - "527_3", - "52_0", - "530_0", - "530_1", - "534_2", - "535_0", - "535_1", - "537_1", - "537_3", - "539_3", - "541_0", - "543_0", - "544_3", - "545_1", - "545_3", - "548_0", - "548_2", - "548_3", - "54_0", - "551_1", - "554_0", - "554_3", - "555_0", - "555_1", - "555_2", - "558_2", - "559_2", - "55_0", - "560_0", - "560_2", - "561_1", - "562_0", - "566_3", - "569_1", - "573_0", - "574_0", - "576_0", - "576_3", - "577_2", - "578_2", - "582_1", - "584_0", - "584_3", - "586_1", - "586_3", - "587_3", - "588_2", - "589_0", - "590_0", - "591_3", - "592_0", - "593_0", - "593_1", - "595_2", - "595_3", - "597_0", - "597_3", - "59_0", - "59_3", - "5_1", - "5_2", - "601_0", - "602_2", - "602_3", - "603_0", - "603_1", - "604_1", - "604_2", - "609_2", - "609_3", - "60_0", - "611_1", - "612_0", - "612_1", - "614_2", - "615_0", - "616_0", - "616_1", - "617_0", - "618_1", - "618_2", - "61_1", - "620_0", - "621_1", - "622_1", - "624_1", - "625_0", - "627_0", - "628_0", - "62_2", - "62_3", - "631_3", - "633_0", - "633_1", - "634_1", - "635_1", - "635_2", - "635_3", - "637_1", - "637_2", - "637_3", - "641_2", - "642_0", - "644_2", - "646_1", - "647_2", - "647_3", - "649_3", - "64_0", - "650_1", - "651_0", - "651_2", - "652_0", - "652_1", - "653_0", - "654_1", - "655_0", - "655_2", - "655_3", - "658_0", - "658_1", - "659_2", - "659_3", - "65_3", - "663_0", - "663_1", - "663_2", - "666_1", - "66_0", - 
"66_3", - "670_1", - "670_2", - "671_1", - "671_2", - "672_0", - "672_3", - "673_1", - "673_2", - "676_3", - "677_1", - "677_3", - "679_2", - "679_3", - "67_1", - "67_3", - "680_3", - "683_1", - "683_3", - "686_2", - "686_3", - "687_2", - "688_1", - "689_0", - "692_0", - "692_1", - "693_0", - "695_0", - "695_2", - "696_2", - "697_2", - "698_2", - "698_3", - "6_2", - "700_0", - "700_2", - "703_1", - "706_0", - "708_0", - "708_2", - "70_0", - "710_0", - "710_2", - "710_3", - "712_0", - "713_3", - "715_1", - "716_1", - "719_0", - "719_1", - "722_3", - "725_0", - "726_1", - "726_2", - "726_3", - "727_1", - "728_0", - "730_0", - "731_2", - "731_3", - "732_0", - "732_1", - "732_2", - "734_0", - "734_2", - "735_1", - "735_3", - "73_1", - "74_1", - "76_2", - "77_3", - "78_2", - "79_0", - "7_0", - "80_1", - "82_0", - "84_1", - "88_0", - "88_2", - "88_3", - "89_1", - "8_0", - "8_1", - "8_2", - "8_3", - "91_2", - "92_2", - "93_2", - "94_0", - "94_1", - "98_2", - "99_2", - "9_1" - ], - "rl_query_ids": [ - "0_0", - "0_1", - "0_3", - "100_0", - "100_1", - "100_2", - "101_0", - "101_2", - "101_3", - "102_0", - "102_2", - "103_2", - "103_3", - "104_0", - "104_1", - "104_2", - "104_3", - "105_1", - "105_2", - "105_3", - "106_0", - "106_1", - "107_1", - "108_0", - "108_2", - "108_3", - "109_1", - "109_2", - "10_0", - "111_0", - "111_2", - "111_3", - "112_0", - "112_1", - "112_2", - "112_3", - "113_0", - "114_0", - "114_1", - "114_2", - "114_3", - "115_1", - "115_2", - "116_0", - "116_1", - "116_2", - "116_3", - "117_0", - "117_1", - "117_3", - "118_0", - "118_1", - "119_0", - "11_1", - "11_2", - "120_0", - "120_2", - "120_3", - "121_0", - "121_2", - "121_3", - "122_0", - "122_1", - "122_2", - "122_3", - "123_2", - "124_0", - "124_1", - "124_2", - "124_3", - "125_1", - "125_2", - "126_1", - "127_0", - "127_2", - "127_3", - "129_0", - "129_2", - "129_3", - "12_2", - "12_3", - "130_0", - "131_0", - "131_1", - "132_0", - "134_0", - "134_1", - "135_2", - "135_3", - "136_0", - "136_1", - 
"136_2", - "137_0", - "138_0", - "138_1", - "138_2", - "138_3", - "139_1", - "139_3", - "13_0", - "13_2", - "13_3", - "140_0", - "140_1", - "140_2", - "141_0", - "141_1", - "141_2", - "141_3", - "144_0", - "144_1", - "144_3", - "145_2", - "145_3", - "146_3", - "147_0", - "147_1", - "148_1", - "149_3", - "14_0", - "14_1", - "150_0", - "150_1", - "150_2", - "151_1", - "151_2", - "151_3", - "152_0", - "152_1", - "152_2", - "153_0", - "153_2", - "153_3", - "154_0", - "154_1", - "156_0", - "156_1", - "156_2", - "157_0", - "157_1", - "158_1", - "159_1", - "159_2", - "15_0", - "160_0", - "160_1", - "160_3", - "161_0", - "161_2", - "162_0", - "163_0", - "163_2", - "164_0", - "164_3", - "165_1", - "166_0", - "166_2", - "166_3", - "167_0", - "168_0", - "168_1", - "169_1", - "169_2", - "169_3", - "16_0", - "16_1", - "16_2", - "170_0", - "170_1", - "171_2", - "173_0", - "173_1", - "173_2", - "174_0", - "174_1", - "174_2", - "174_3", - "175_0", - "175_1", - "175_3", - "176_0", - "176_1", - "177_0", - "177_1", - "177_2", - "177_3", - "17_0", - "17_1", - "17_3", - "180_1", - "180_3", - "181_1", - "183_0", - "183_1", - "183_2", - "183_3", - "184_0", - "185_0", - "185_1", - "185_2", - "185_3", - "186_0", - "186_1", - "187_0", - "188_1", - "188_2", - "188_3", - "189_2", - "18_0", - "18_1", - "18_2", - "18_3", - "191_0", - "191_2", - "191_3", - "192_0", - "192_1", - "193_0", - "193_2", - "195_0", - "195_1", - "195_2", - "195_3", - "196_1", - "197_0", - "197_1", - "197_3", - "198_0", - "198_1", - "199_0", - "199_2", - "19_0", - "19_1", - "19_2", - "19_3", - "1_0", - "1_1", - "1_2", - "201_1", - "201_2", - "201_3", - "202_0", - "202_1", - "202_2", - "202_3", - "203_0", - "203_3", - "205_0", - "205_1", - "206_0", - "206_2", - "207_0", - "207_1", - "207_3", - "208_0", - "208_1", - "208_2", - "208_3", - "209_0", - "209_1", - "209_3", - "20_0", - "20_1", - "20_2", - "20_3", - "211_0", - "211_1", - "211_2", - "211_3", - "212_0", - "212_3", - "213_0", - "213_1", - "213_2", - "213_3", - 
"214_0", - "214_1", - "214_2", - "215_1", - "216_0", - "216_1", - "216_2", - "216_3", - "217_0", - "217_1", - "217_2", - "217_3", - "218_0", - "218_1", - "219_2", - "219_3", - "21_0", - "21_3", - "220_2", - "220_3", - "221_0", - "221_1", - "221_2", - "221_3", - "222_0", - "222_2", - "222_3", - "223_0", - "223_2", - "225_0", - "225_1", - "225_2", - "227_0", - "227_2", - "228_2", - "228_3", - "229_0", - "229_1", - "229_2", - "22_0", - "22_1", - "230_1", - "230_2", - "231_0", - "231_1", - "232_1", - "232_2", - "232_3", - "233_1", - "234_1", - "235_0", - "236_1", - "236_2", - "236_3", - "237_1", - "237_2", - "237_3", - "238_0", - "238_2", - "238_3", - "239_1", - "239_2", - "23_0", - "23_2", - "23_3", - "240_1", - "240_2", - "240_3", - "241_0", - "241_1", - "241_2", - "242_0", - "242_2", - "243_0", - "243_1", - "243_2", - "243_3", - "244_2", - "245_0", - "245_2", - "246_0", - "246_1", - "246_2", - "246_3", - "248_0", - "248_1", - "248_2", - "248_3", - "249_0", - "249_2", - "249_3", - "24_3", - "250_1", - "250_2", - "250_3", - "251_1", - "251_2", - "252_1", - "253_2", - "254_1", - "254_2", - "254_3", - "256_0", - "256_1", - "257_0", - "257_1", - "257_2", - "257_3", - "258_0", - "258_1", - "258_2", - "259_1", - "260_0", - "260_1", - "261_0", - "261_2", - "261_3", - "263_0", - "263_1", - "264_0", - "264_3", - "265_2", - "265_3", - "266_1", - "266_3", - "267_1", - "267_2", - "268_2", - "269_0", - "269_2", - "269_3", - "26_1", - "26_2", - "26_3", - "270_0", - "270_1", - "270_3", - "271_1", - "272_3", - "273_0", - "273_1", - "274_1", - "274_2", - "275_0", - "276_1", - "277_0", - "278_0", - "278_2", - "278_3", - "279_1", - "279_2", - "27_0", - "280_0", - "280_2", - "281_0", - "281_1", - "281_2", - "283_0", - "283_1", - "284_0", - "285_0", - "285_1", - "285_2", - "285_3", - "286_2", - "286_3", - "287_1", - "287_2", - "287_3", - "288_0", - "288_2", - "289_0", - "289_1", - "289_2", - "289_3", - "28_0", - "28_1", - "28_2", - "290_0", - "290_1", - "290_2", - "290_3", - "291_0", - 
"291_1", - "291_2", - "291_3", - "292_0", - "292_1", - "293_0", - "293_2", - "294_0", - "294_1", - "294_2", - "295_1", - "295_2", - "296_0", - "296_1", - "296_2", - "296_3", - "297_0", - "299_1", - "29_0", - "29_1", - "29_2", - "29_3", - "2_2", - "2_3", - "300_0", - "300_1", - "301_1", - "301_2", - "302_0", - "303_0", - "303_2", - "303_3", - "304_0", - "304_1", - "304_2", - "305_0", - "306_0", - "306_1", - "306_2", - "306_3", - "307_0", - "309_1", - "30_0", - "30_2", - "30_3", - "310_0", - "310_1", - "311_0", - "311_1", - "311_2", - "311_3", - "312_0", - "313_0", - "313_1", - "313_2", - "313_3", - "314_0", - "314_1", - "314_2", - "314_3", - "316_1", - "316_2", - "317_0", - "317_1", - "318_0", - "318_1", - "318_2", - "318_3", - "31_0", - "31_1", - "31_2", - "320_0", - "320_1", - "320_2", - "320_3", - "321_1", - "321_2", - "321_3", - "322_0", - "322_1", - "323_0", - "323_1", - "323_2", - "323_3", - "324_2", - "326_1", - "326_2", - "326_3", - "327_0", - "327_1", - "327_2", - "327_3", - "329_1", - "329_2", - "329_3", - "32_1", - "330_0", - "330_1", - "330_2", - "331_1", - "331_2", - "331_3", - "332_0", - "332_1", - "332_2", - "333_0", - "333_1", - "333_2", - "333_3", - "336_0", - "336_2", - "336_3", - "337_0", - "337_1", - "337_3", - "339_1", - "339_2", - "33_0", - "33_1", - "340_0", - "341_1", - "341_3", - "342_0", - "342_1", - "343_1", - "344_0", - "344_1", - "344_2", - "344_3", - "345_0", - "345_1", - "345_2", - "346_0", - "347_0", - "347_1", - "347_2", - "347_3", - "348_2", - "348_3", - "349_2", - "349_3", - "34_1", - "350_0", - "350_1", - "351_1", - "352_1", - "352_2", - "353_0", - "353_1", - "354_0", - "354_1", - "354_3", - "355_0", - "355_1", - "355_2", - "356_0", - "356_1", - "357_0", - "357_1", - "357_3", - "358_0", - "358_1", - "358_2", - "358_3", - "35_0", - "35_3", - "360_0", - "360_1", - "361_0", - "363_0", - "363_1", - "363_2", - "363_3", - "364_0", - "364_1", - "364_2", - "365_0", - "365_1", - "365_2", - "366_0", - "366_1", - "366_2", - "367_0", - 
"367_1", - "367_2", - "368_2", - "368_3", - "369_0", - "369_1", - "369_2", - "369_3", - "36_0", - "370_0", - "370_1", - "371_0", - "371_1", - "374_1", - "374_2", - "374_3", - "375_0", - "375_2", - "375_3", - "376_0", - "376_2", - "376_3", - "377_0", - "377_1", - "377_2", - "378_0", - "378_2", - "378_3", - "37_0", - "37_1", - "37_2", - "381_0", - "381_1", - "381_2", - "383_0", - "383_1", - "383_2", - "383_3", - "384_1", - "385_1", - "385_2", - "386_0", - "386_2", - "387_0", - "389_0", - "389_1", - "38_2", - "38_3", - "390_0", - "390_3", - "391_0", - "392_0", - "392_1", - "392_2", - "392_3", - "393_0", - "393_1", - "394_0", - "394_1", - "394_2", - "394_3", - "395_0", - "395_2", - "396_0", - "396_1", - "396_3", - "397_0", - "397_1", - "397_2", - "398_1", - "399_0", - "399_1", - "39_0", - "39_1", - "3_0", - "400_0", - "400_1", - "401_0", - "401_1", - "401_3", - "402_1", - "402_3", - "403_0", - "403_1", - "403_2", - "403_3", - "405_1", - "405_3", - "406_0", - "406_1", - "407_3", - "408_0", - "408_1", - "408_3", - "409_0", - "40_2", - "411_0", - "411_1", - "411_2", - "411_3", - "412_0", - "413_0", - "413_1", - "413_2", - "413_3", - "414_0", - "414_2", - "415_0", - "415_3", - "416_0", - "416_2", - "416_3", - "417_0", - "417_2", - "417_3", - "418_0", - "41_0", - "41_1", - "420_0", - "420_1", - "420_2", - "420_3", - "421_0", - "421_2", - "421_3", - "422_1", - "422_3", - "423_0", - "423_1", - "424_0", - "426_0", - "426_2", - "426_3", - "427_0", - "427_1", - "427_2", - "428_0", - "428_2", - "428_3", - "429_0", - "429_1", - "429_2", - "429_3", - "42_0", - "42_1", - "42_3", - "430_0", - "430_1", - "430_2", - "430_3", - "432_0", - "432_1", - "432_2", - "432_3", - "433_0", - "433_1", - "433_2", - "433_3", - "435_0", - "435_2", - "435_3", - "436_0", - "436_2", - "438_0", - "438_1", - "438_3", - "439_3", - "43_0", - "43_3", - "440_1", - "440_2", - "440_3", - "441_0", - "442_0", - "442_1", - "443_0", - "444_1", - "444_3", - "445_0", - "445_2", - "445_3", - "446_0", - "446_1", - 
"447_2", - "448_0", - "449_0", - "450_0", - "450_1", - "450_2", - "450_3", - "451_1", - "451_2", - "451_3", - "453_0", - "453_1", - "453_3", - "454_1", - "454_3", - "457_0", - "457_3", - "458_0", - "458_2", - "459_0", - "459_1", - "459_2", - "459_3", - "45_0", - "45_3", - "460_1", - "460_2", - "461_0", - "461_2", - "461_3", - "462_0", - "462_1", - "462_2", - "462_3", - "464_0", - "464_1", - "464_2", - "464_3", - "465_0", - "465_1", - "465_2", - "465_3", - "466_0", - "467_0", - "467_1", - "467_3", - "468_1", - "468_2", - "468_3", - "469_1", - "469_2", - "469_3", - "46_0", - "46_3", - "470_0", - "470_1", - "470_2", - "470_3", - "472_0", - "472_1", - "472_3", - "473_0", - "475_2", - "476_0", - "476_1", - "476_2", - "478_1", - "479_0", - "480_0", - "480_1", - "480_2", - "481_0", - "482_0", - "482_2", - "482_3", - "483_0", - "483_1", - "483_2", - "483_3", - "484_0", - "484_1", - "484_2", - "484_3", - "485_1", - "485_2", - "486_0", - "487_2", - "488_0", - "488_2", - "488_3", - "489_0", - "489_3", - "48_0", - "491_0", - "491_2", - "491_3", - "492_0", - "493_0", - "495_0", - "495_2", - "495_3", - "496_0", - "496_1", - "496_2", - "496_3", - "497_0", - "497_1", - "497_3", - "498_0", - "498_2", - "499_2", - "499_3", - "49_0", - "49_1", - "49_2", - "49_3", - "4_0", - "4_1", - "4_2", - "500_0", - "500_1", - "500_2", - "500_3", - "501_0", - "501_1", - "501_2", - "502_0", - "502_1", - "503_0", - "503_3", - "504_0", - "504_1", - "504_2", - "504_3", - "505_0", - "505_1", - "506_0", - "506_1", - "506_2", - "507_0", - "507_1", - "507_2", - "507_3", - "508_0", - "508_1", - "508_3", - "509_0", - "509_1", - "509_3", - "50_0", - "50_1", - "50_2", - "50_3", - "510_2", - "510_3", - "511_2", - "511_3", - "512_1", - "512_2", - "512_3", - "514_0", - "514_1", - "514_2", - "515_0", - "515_1", - "516_0", - "516_1", - "516_2", - "517_1", - "519_0", - "519_1", - "519_2", - "51_0", - "51_1", - "51_2", - "51_3", - "520_1", - "520_3", - "521_0", - "521_3", - "522_0", - "522_1", - "523_1", - "523_2", 
- "523_3", - "524_1", - "524_3", - "525_1", - "525_2", - "527_0", - "528_0", - "528_1", - "528_2", - "528_3", - "529_0", - "52_1", - "52_2", - "52_3", - "530_2", - "530_3", - "531_0", - "531_1", - "533_0", - "533_1", - "533_2", - "533_3", - "534_0", - "534_1", - "534_3", - "536_0", - "537_0", - "537_2", - "538_0", - "538_1", - "539_0", - "539_1", - "539_2", - "540_0", - "540_1", - "540_2", - "540_3", - "542_0", - "542_1", - "542_2", - "542_3", - "543_1", - "543_2", - "543_3", - "544_0", - "544_1", - "544_2", - "545_0", - "545_2", - "548_1", - "549_0", - "549_1", - "549_2", - "549_3", - "54_1", - "550_0", - "550_1", - "551_0", - "553_0", - "553_1", - "553_2", - "553_3", - "554_1", - "554_2", - "556_0", - "556_1", - "556_2", - "557_0", - "557_1", - "557_2", - "557_3", - "558_0", - "558_1", - "558_3", - "559_0", - "559_1", - "559_3", - "55_1", - "55_2", - "55_3", - "560_1", - "560_3", - "561_0", - "561_2", - "561_3", - "562_1", - "562_2", - "562_3", - "563_0", - "564_0", - "565_0", - "565_1", - "566_0", - "566_1", - "566_2", - "567_0", - "567_1", - "567_2", - "567_3", - "568_0", - "568_1", - "568_2", - "568_3", - "569_0", - "569_2", - "569_3", - "571_0", - "571_1", - "571_2", - "571_3", - "572_0", - "572_1", - "572_2", - "572_3", - "573_1", - "574_1", - "574_2", - "574_3", - "575_0", - "575_1", - "575_2", - "576_1", - "576_2", - "577_0", - "577_1", - "578_0", - "578_1", - "578_3", - "579_0", - "579_1", - "579_2", - "579_3", - "57_0", - "57_1", - "581_0", - "581_1", - "581_2", - "581_3", - "582_0", - "583_0", - "583_1", - "583_2", - "583_3", - "584_1", - "584_2", - "585_0", - "586_0", - "586_2", - "587_0", - "587_1", - "587_2", - "588_0", - "588_1", - "588_3", - "589_1", - "589_2", - "58_0", - "58_1", - "590_1", - "591_0", - "591_1", - "591_2", - "592_1", - "592_2", - "592_3", - "593_2", - "593_3", - "595_0", - "595_1", - "596_0", - "596_1", - "596_2", - "596_3", - "597_1", - "597_2", - "598_0", - "598_1", - "599_0", - "599_1", - "59_1", - "59_2", - "5_0", - "5_3", - 
"600_0", - "600_1", - "600_2", - "601_1", - "601_2", - "601_3", - "602_0", - "602_1", - "603_2", - "603_3", - "604_0", - "607_0", - "607_1", - "607_2", - "607_3", - "608_0", - "608_1", - "608_2", - "608_3", - "609_0", - "609_1", - "60_1", - "60_2", - "60_3", - "611_0", - "613_0", - "614_0", - "614_1", - "614_3", - "616_2", - "616_3", - "617_1", - "617_2", - "617_3", - "618_0", - "618_3", - "619_0", - "61_0", - "621_0", - "621_2", - "622_0", - "622_2", - "623_0", - "623_1", - "623_2", - "623_3", - "624_0", - "624_2", - "624_3", - "625_1", - "625_2", - "625_3", - "626_0", - "626_1", - "627_1", - "627_2", - "629_0", - "629_1", - "629_2", - "629_3", - "62_0", - "62_1", - "631_0", - "631_1", - "631_2", - "632_0", - "632_1", - "632_2", - "632_3", - "633_2", - "633_3", - "634_0", - "634_2", - "634_3", - "635_0", - "637_0", - "638_0", - "639_0", - "640_0", - "640_1", - "641_0", - "641_1", - "641_3", - "642_1", - "642_2", - "642_3", - "643_0", - "643_1", - "644_0", - "644_1", - "644_3", - "646_0", - "646_2", - "646_3", - "647_0", - "647_1", - "648_0", - "649_0", - "649_1", - "649_2", - "64_1", - "64_2", - "650_0", - "650_2", - "650_3", - "651_1", - "651_3", - "652_2", - "652_3", - "653_1", - "653_2", - "653_3", - "654_0", - "655_1", - "656_0", - "657_0", - "657_1", - "657_2", - "657_3", - "658_2", - "658_3", - "659_0", - "659_1", - "65_0", - "65_1", - "65_2", - "660_0", - "660_1", - "660_2", - "660_3", - "661_0", - "661_1", - "662_0", - "662_1", - "662_2", - "662_3", - "663_3", - "664_0", - "665_0", - "665_1", - "665_2", - "665_3", - "666_0", - "667_0", - "667_1", - "667_2", - "668_0", - "669_0", - "669_1", - "669_2", - "66_1", - "66_2", - "670_0", - "670_3", - "671_0", - "672_1", - "672_2", - "673_0", - "675_0", - "675_1", - "675_2", - "675_3", - "676_0", - "676_1", - "676_2", - "677_0", - "677_2", - "678_0", - "679_0", - "679_1", - "67_0", - "67_2", - "680_0", - "680_1", - "680_2", - "683_0", - "683_2", - "684_0", - "684_1", - "684_2", - "685_0", - "686_0", - "686_1", - 
"687_0", - "687_1", - "687_3", - "688_0", - "688_2", - "688_3", - "689_1", - "689_2", - "689_3", - "68_0", - "690_0", - "690_1", - "692_2", - "692_3", - "693_1", - "695_1", - "695_3", - "696_0", - "696_1", - "696_3", - "697_0", - "697_1", - "697_3", - "698_0", - "698_1", - "699_0", - "699_1", - "6_0", - "6_1", - "6_3", - "700_1", - "700_3", - "701_0", - "701_1", - "701_2", - "703_0", - "703_2", - "703_3", - "704_0", - "704_1", - "705_0", - "705_1", - "705_2", - "706_1", - "706_2", - "706_3", - "707_0", - "707_1", - "708_1", - "709_0", - "709_1", - "709_2", - "709_3", - "710_1", - "711_0", - "711_1", - "711_2", - "712_1", - "712_2", - "712_3", - "713_0", - "713_1", - "713_2", - "714_0", - "714_1", - "715_0", - "715_2", - "715_3", - "716_0", - "716_2", - "716_3", - "717_0", - "717_1", - "717_2", - "717_3", - "718_0", - "719_2", - "71_0", - "71_1", - "71_2", - "71_3", - "720_0", - "720_1", - "722_0", - "722_1", - "722_2", - "723_0", - "723_1", - "725_1", - "725_2", - "725_3", - "726_0", - "727_0", - "727_2", - "727_3", - "728_1", - "728_2", - "728_3", - "729_0", - "729_1", - "729_2", - "729_3", - "730_1", - "730_2", - "730_3", - "731_0", - "731_1", - "732_3", - "734_1", - "735_0", - "735_2", - "736_0", - "73_0", - "74_0", - "74_2", - "75_0", - "75_1", - "75_2", - "75_3", - "76_0", - "76_1", - "76_3", - "77_0", - "77_1", - "77_2", - "78_0", - "78_1", - "78_3", - "79_1", - "7_1", - "7_2", - "7_3", - "80_0", - "80_2", - "80_3", - "81_0", - "82_1", - "82_2", - "82_3", - "83_0", - "83_1", - "84_0", - "84_2", - "84_3", - "85_0", - "85_1", - "85_3", - "87_0", - "88_1", - "89_0", - "90_0", - "90_1", - "90_2", - "90_3", - "91_0", - "91_1", - "91_3", - "92_0", - "92_1", - "92_3", - "93_0", - "93_1", - "93_3", - "94_2", - "94_3", - "95_0", - "95_1", - "96_0", - "96_1", - "96_2", - "96_3", - "97_0", - "97_1", - "97_2", - "97_3", - "98_0", - "98_1", - "98_3", - "99_0", - "99_1", - "99_3", - "9_0", - "9_2" - ], - "test_query_ids": [ - "0_0", - "0_1", - "0_2", - "0_3", - "100_0", - 
"100_1", - "100_2", - "101_0", - "101_1", - "101_2", - "102_0", - "102_1", - "102_2", - "102_3", - "103_0", - "103_1", - "103_2", - "103_3", - "104_0", - "104_1", - "104_2", - "105_0", - "105_1", - "106_0", - "106_1", - "107_0", - "107_1", - "107_2", - "107_3", - "108_0", - "108_1", - "108_2", - "108_3", - "109_0", - "109_1", - "109_2", - "109_3", - "10_0", - "10_1", - "10_2", - "10_3", - "110_0", - "110_1", - "110_2", - "110_3", - "112_0", - "112_1", - "112_2", - "112_3", - "113_0", - "113_1", - "113_2", - "113_3", - "114_0", - "114_1", - "114_2", - "115_0", - "115_1", - "115_2", - "115_3", - "117_0", - "117_1", - "118_0", - "118_1", - "118_2", - "118_3", - "119_1", - "119_2", - "11_0", - "11_1", - "11_2", - "11_3", - "120_0", - "120_1", - "120_2", - "121_0", - "121_1", - "121_2", - "121_3", - "122_0", - "122_1", - "122_2", - "122_3", - "123_0", - "123_1", - "123_2", - "123_3", - "125_0", - "125_1", - "125_2", - "126_0", - "126_1", - "126_2", - "126_3", - "127_0", - "127_1", - "128_0", - "128_1", - "128_2", - "128_3", - "129_0", - "129_1", - "12_0", - "12_1", - "12_2", - "12_3", - "130_0", - "130_1", - "130_2", - "130_3", - "131_0", - "131_1", - "132_0", - "132_1", - "132_2", - "132_3", - "133_0", - "133_1", - "133_2", - "133_3", - "134_0", - "134_1", - "134_2", - "134_3", - "136_0", - "136_1", - "137_0", - "137_1", - "137_2", - "138_0", - "138_1", - "138_2", - "138_3", - "139_0", - "139_1", - "13_0", - "13_1", - "13_2", - "13_3", - "140_0", - "140_1", - "140_2", - "140_3", - "141_0", - "141_1", - "141_2", - "141_3", - "142_0", - "142_1", - "142_2", - "142_3", - "143_0", - "143_1", - "143_2", - "144_0", - "144_1", - "144_2", - "144_3", - "145_0", - "145_1", - "145_2", - "145_3", - "146_0", - "146_1", - "146_2", - "146_3", - "147_0", - "147_1", - "147_2", - "147_3", - "148_0", - "148_1", - "148_2", - "148_3", - "149_0", - "149_1", - "149_2", - "14_0", - "14_1", - "150_0", - "151_0", - "151_1", - "151_2", - "151_3", - "152_0", - "152_1", - "152_2", - "152_3", - 
"153_0", - "153_1", - "153_2", - "153_3", - "154_0", - "154_1", - "154_2", - "154_3", - "155_0", - "155_1", - "155_2", - "155_3", - "156_0", - "156_1", - "156_2", - "156_3", - "157_0", - "157_1", - "158_0", - "159_0", - "159_1", - "159_2", - "159_3", - "15_0", - "15_1", - "15_2", - "15_3", - "161_0", - "161_1", - "162_0", - "162_1", - "162_2", - "162_3", - "163_0", - "163_1", - "163_2", - "164_0", - "165_0", - "165_1", - "165_2", - "165_3", - "166_0", - "166_1", - "166_2", - "166_3", - "167_0", - "167_1", - "167_2", - "167_3", - "168_0", - "168_1", - "168_2", - "168_3", - "169_0", - "169_1", - "169_2", - "169_3", - "16_0", - "16_1", - "16_2", - "16_3", - "170_0", - "170_1", - "170_2", - "170_3", - "172_0", - "172_1", - "172_2", - "172_3", - "173_0", - "173_1", - "173_2", - "173_3", - "175_0", - "175_1", - "176_0", - "176_1", - "176_2", - "177_0", - "177_1", - "177_2", - "177_3", - "178_0", - "179_0", - "179_1", - "179_2", - "17_0", - "17_1", - "17_2", - "180_0", - "180_1", - "180_2", - "180_3", - "181_0", - "181_1", - "182_0", - "182_1", - "182_2", - "182_3", - "185_0", - "185_1", - "186_0", - "186_1", - "186_2", - "186_3", - "187_0", - "187_1", - "187_2", - "187_3", - "18_0", - "18_1", - "18_2", - "19_0", - "19_1", - "19_2", - "19_3", - "1_0", - "1_1", - "1_2", - "1_3", - "21_0", - "21_1", - "22_0", - "22_1", - "22_2", - "22_3", - "23_0", - "23_1", - "23_2", - "23_3", - "24_0", - "24_1", - "24_2", - "24_3", - "25_0", - "25_1", - "26_0", - "27_0", - "27_1", - "28_0", - "28_1", - "28_2", - "29_0", - "29_1", - "29_2", - "29_3", - "2_0", - "2_1", - "2_2", - "2_3", - "30_0", - "30_1", - "30_2", - "31_0", - "31_1", - "31_2", - "32_0", - "32_1", - "32_2", - "32_3", - "33_0", - "33_1", - "33_2", - "33_3", - "34_0", - "34_1", - "34_2", - "35_0", - "36_0", - "36_1", - "36_2", - "36_3", - "37_0", - "37_1", - "37_2", - "37_3", - "38_0", - "38_1", - "39_0", - "39_1", - "3_0", - "3_1", - "3_2", - "3_3", - "40_0", - "40_1", - "40_2", - "40_3", - "41_0", - "41_1", - "41_2", - 
"41_3", - "42_0", - "45_0", - "45_1", - "45_2", - "45_3", - "46_0", - "46_1", - "46_2", - "46_3", - "47_0", - "47_1", - "48_0", - "48_1", - "48_2", - "48_3", - "49_0", - "49_1", - "49_2", - "49_3", - "4_0", - "4_1", - "4_2", - "4_3", - "50_0", - "50_1", - "50_2", - "50_3", - "52_0", - "53_0", - "53_1", - "53_2", - "53_3", - "54_0", - "54_1", - "55_0", - "55_1", - "55_2", - "55_3", - "56_0", - "56_1", - "56_2", - "57_0", - "57_1", - "57_2", - "57_3", - "58_0", - "58_1", - "59_0", - "59_1", - "5_0", - "5_1", - "5_2", - "5_3", - "60_0", - "60_1", - "60_2", - "60_3", - "61_0", - "61_1", - "62_0", - "62_1", - "62_2", - "62_3", - "63_0", - "63_1", - "63_2", - "63_3", - "64_0", - "64_1", - "64_2", - "64_3", - "66_0", - "66_1", - "68_0", - "6_0", - "6_1", - "6_2", - "6_3", - "70_0", - "70_1", - "70_2", - "70_3", - "71_0", - "71_1", - "72_0", - "73_0", - "73_1", - "74_0", - "74_1", - "74_2", - "74_3", - "75_0", - "75_1", - "75_2", - "75_3", - "76_0", - "77_0", - "77_1", - "77_2", - "77_3", - "78_0", - "78_1", - "78_2", - "79_0", - "79_1", - "79_2", - "79_3", - "7_0", - "7_1", - "81_0", - "81_1", - "81_2", - "81_3", - "82_0", - "82_1", - "82_2", - "82_3", - "83_0", - "83_1", - "84_0", - "84_1", - "84_2", - "85_0", - "85_1", - "85_2", - "85_3", - "86_0", - "86_1", - "86_2", - "86_3", - "87_0", - "88_0", - "89_0", - "8_0", - "8_1", - "90_0", - "90_1", - "90_2", - "90_3", - "91_0", - "91_1", - "91_2", - "91_3", - "92_0", - "92_1", - "92_2", - "92_3", - "93_0", - "94_0", - "94_1", - "94_2", - "94_3", - "95_0", - "95_1", - "95_2", - "95_3", - "96_0", - "96_1", - "96_2", - "96_3", - "97_0", - "97_1", - "97_2", - "97_3", - "98_0", - "98_1", - "98_2", - "99_0", - "99_1", - "99_2", - "9_0", - "9_1", - "9_2", - "9_3" - ] -} \ No newline at end of file From 43eb203fe485f1e47f9e09677c87d4e7ff916fda Mon Sep 17 00:00:00 2001 From: Aryan Saboo Date: Tue, 30 Jun 2026 21:30:51 +0000 Subject: [PATCH 5/8] refactor(cosmos-retriever): extract vLLM runtime policy, drop eval harness - Move 
VllmTokenCompleter + run_single_episode into inference/vllm_policy.py - Delete inference/evaluate_harness1_vllm.py (eval/benchmark code) - Repoint retriever.py to vllm_policy; update env_rl docstring - Include pre-existing: pool_doc_ids trajectory pooling (openai_chat), optional baseten import (rerank) --- .../src/cosmos_retriever/env_rl.py | 4 +- .../inference/evaluate_harness1_vllm.py | 498 ------------------ .../cosmos_retriever/inference/openai_chat.py | 7 + .../cosmos_retriever/inference/vllm_policy.py | 121 +++++ .../src/cosmos_retriever/rerank.py | 19 +- .../src/cosmos_retriever/retriever.py | 7 +- 6 files changed, 147 insertions(+), 509 deletions(-) delete mode 100644 cosmos-retriever/src/cosmos_retriever/inference/evaluate_harness1_vllm.py create mode 100644 cosmos-retriever/src/cosmos_retriever/inference/vllm_policy.py diff --git a/cosmos-retriever/src/cosmos_retriever/env_rl.py b/cosmos-retriever/src/cosmos_retriever/env_rl.py index 6e4b6a7..05d511e 100644 --- a/cosmos-retriever/src/cosmos_retriever/env_rl.py +++ b/cosmos-retriever/src/cosmos_retriever/env_rl.py @@ -13,9 +13,9 @@ the ``WorkingMemory`` / ``curate`` / ``fan_out_search`` machinery and renders budget-bounded context each turn via ``ultra_core``. There is no gold data, reward computation, or RL training here — recall is scored externally by the -caller (e.g. ``scripts/bench_erag.py``) against ``env.wm.curated_ids``. +caller against ``env.wm.curated_ids``. -Consumed by ``retriever.py`` and ``inference/evaluate_harness1_vllm.py``. +Consumed by ``retriever.py`` and ``inference/vllm_policy.py``. 
""" import asyncio diff --git a/cosmos-retriever/src/cosmos_retriever/inference/evaluate_harness1_vllm.py b/cosmos-retriever/src/cosmos_retriever/inference/evaluate_harness1_vllm.py deleted file mode 100644 index cb814fa..0000000 --- a/cosmos-retriever/src/cosmos_retriever/inference/evaluate_harness1_vllm.py +++ /dev/null @@ -1,498 +0,0 @@ -"""Evaluate Harness-1 against a local vLLM OpenAI-compatible endpoint. - -This mirrors inference/evaluate_harness1.py, but replaces the Tinker sampling -client with raw token-id calls to vLLM /v1/completions. It is intended for -parity checks of the released Hugging Face checkpoint served by vLLM. -""" - -from __future__ import annotations - -import argparse -import asyncio -import json -import os -import random -import time -import urllib.error -import urllib.request -from pathlib import Path -from typing import Dict, List - -import structlog -import tiktoken - -# Allow direct execution while keeping imports package-relative. -import sys - -_REPO_ROOT = Path(__file__).resolve().parents[1] -if str(_REPO_ROOT) not in sys.path: - sys.path.insert(0, str(_REPO_ROOT)) - -from cosmos_retriever.datagen.search_dataset import SearchDataset, get_dataset -from cosmos_retriever.config import get_config -from cosmos_retriever.tools import ( - GrepCorpusTool, - PruneChunksTool, - ReadDocumentTool, - SearchCorpusTool, - ToolSet, - UserTextTool, -) -from tinker_cookbook.completers import StopCondition, TokensWithLogprobs -from cosmos_retriever.env_rl import MAX_TURNS, SEARCH_DISPLAY_LIMIT, SlidingWindowSearchEnv - -logger = structlog.get_logger("evaluate_harness1_vllm") - -SAVE_FULL_TRAJECTORIES = os.environ.get("SAVE_FULL_TRAJECTORIES", "0") == "1" - - -class VllmTokenCompleter: - """Token-level policy backed by vLLM raw completions.""" - - def __init__( - self, - *, - base_url: str, - model: str, - max_tokens: int, - temperature: float, - top_p: float, - timeout: int, - ) -> None: - self.base_url = base_url.rstrip("/") - self.model = model - 
self.max_tokens = max_tokens - self.temperature = temperature - self.top_p = top_p - self.timeout = timeout - - @property - def completions_url(self) -> str: - if self.base_url.endswith("/v1"): - return f"{self.base_url}/completions" - return f"{self.base_url}/v1/completions" - - async def __call__(self, model_input, stop: StopCondition) -> TokensWithLogprobs: - prompt_tokens = model_input.to_ints() - payload = { - "model": self.model, - "prompt": prompt_tokens, - "max_tokens": self.max_tokens, - "temperature": self.temperature, - "top_p": self.top_p, - "stream": False, - "return_token_ids": True, - } - if stop and all(isinstance(s, int) for s in stop): - payload["stop_token_ids"] = list(stop) - elif stop: - payload["stop"] = list(stop) - - data = await asyncio.to_thread(self._post_json, payload) - choice = data["choices"][0] - tokens = ( - choice.get("token_ids") - or choice.get("tokens") - or choice.get("text_token_ids") - or [] - ) - if not tokens: - raise RuntimeError(f"vLLM response did not include token IDs: {str(data)[:500]}") - return TokensWithLogprobs(tokens=[int(t) for t in tokens], maybe_logprobs=None) - - def _post_json(self, payload: Dict) -> Dict: - body = json.dumps(payload).encode("utf-8") - req = urllib.request.Request( - self.completions_url, - data=body, - headers={"Content-Type": "application/json"}, - method="POST", - ) - try: - with urllib.request.urlopen(req, timeout=self.timeout) as resp: - return json.loads(resp.read().decode("utf-8")) - except urllib.error.HTTPError as exc: - detail = exc.read().decode("utf-8", errors="replace") - raise RuntimeError(f"vLLM HTTP {exc.code}: {detail[:1000]}") from exc - - -def save_full_trajectory(env: SlidingWindowSearchEnv) -> None: - traj_root = os.environ.get("TRAJECTORY_SAVE_PATH") or os.environ.get( - "LOG_PATH", "./tmp/rl_ultra_v3" - ) - full_dir = os.path.join(traj_root, "full") - os.makedirs(full_dir, exist_ok=True) - - turns = [] - for i, (action, obs) in enumerate(zip(env._all_actions, 
env._all_observations)): - turn_record = {"turn": i} - if action.reasoning: - turn_record["reasoning"] = action.reasoning - - tool_calls = [] - for tool, params in zip(action.tools, action.params): - name = "user_text" if isinstance(tool, UserTextTool) else tool.tool_schema.name - tool_calls.append({"tool": name, "params": params}) - turn_record["tool_calls"] = tool_calls - - tool_returns = [] - for j, obs_text in enumerate(obs.observations): - tr = {"text": obs_text} - if j < len(obs.tool_metadata) and obs.tool_metadata[j] is not None: - try: - tr["metadata"] = obs.tool_metadata[j].model_dump() - except Exception: - tr["metadata"] = str(obs.tool_metadata[j]) - tool_returns.append(tr) - turn_record["tool_returns"] = tool_returns - turns.append(turn_record) - - record = { - "query_id": env.query_id, - "query_text": env.wm.query, - "dataset": env.dataset.name, - "system_prompt": env.system_prompt, - "turns": turns, - "curated_ids": env.wm.curated_ids, - "curated_importance": dict(env.wm.curated_importance), - "reward": env._terminal_reward, - "metrics": { - k: v - for k, v in env._terminal_metrics.items() - if isinstance(v, (int, float, str, bool)) - }, - } - qid_safe = str(env.query_id).replace("/", "_") - with open(os.path.join(full_dir, f"{qid_safe}.json"), "w", encoding="utf-8") as f: - json.dump(record, f, indent=2, default=str) - - -async def run_single_episode( - env: SlidingWindowSearchEnv, - policy: VllmTokenCompleter, -) -> Dict: - ob, stop_condition = await env.initial_observation() - turns = 0 - start = time.time() - - while True: - ac_with_logprobs = await policy(ob, stop_condition) - step_result = await env.step(ac_with_logprobs.tokens) - turns += 1 - if step_result.episode_done: - break - ob = step_result.next_observation - stop_condition = step_result.next_stop_condition - - elapsed = time.time() - start - result = { - "turns": turns, - "n_curated": len(env.wm.curated_ids), - "n_pool": len(env.wm.pool_ids), - "elapsed_s": round(elapsed, 1), - 
"tool_types_used": list(env._tool_types_used), - "total_curate_calls": env._total_curate_calls, - "pool_ids": list(env.wm.pool_ids), - } - return result - - -async def eval_single_query( - qid: str, - dataset: SearchDataset, - toolset: ToolSet, - search_tool: SearchCorpusTool, - text_token_counter, - policy: VllmTokenCompleter, - max_turns: int, -) -> Dict: - _, query_text = dataset.get_query_by_id(qid) - env = SlidingWindowSearchEnv( - toolset=toolset, - search_tool=search_tool, - query_id=qid, - query_text=query_text, - dataset=dataset, - text_token_counter=text_token_counter, - max_turns=max_turns, - ) - try: - result = await run_single_episode(env=env, policy=policy) - result["query_id"] = qid - result["query"] = query_text[:80] - if SAVE_FULL_TRAJECTORIES: - save_full_trajectory(env) - logger.info( - "episode_result", - qid=qid, - recall=round(result.get("recall", 0), 3), - trajectory_recall=round(result.get("trajectory_recall", 0), 3), - final_answer_recall=round(result.get("final_answer_recall", 0), 3), - reward=round(result.get("reward", 0), 3), - curated=result["n_curated"], - pool=result["n_pool"], - turns=result["turns"], - error=result["error"], - time=result["elapsed_s"], - ) - return result - except Exception as exc: - logger.error("episode_failed", qid=qid, error=str(exc)[:500]) - return { - "query_id": qid, - "query": query_text[:80], - "error": True, - "reward": 0, - "recall": 0, - "trajectory_recall": 0, - "final_answer_recall": 0, - "precision": 0, - "n_curated": 0, - "n_pool": 0, - "turns": 0, - } - - -async def eval_queries( - query_ids: List[str], - dataset: SearchDataset, - toolset: ToolSet, - search_tool: SearchCorpusTool, - text_token_counter, - policy: VllmTokenCompleter, - max_turns: int, - parallel: int, - partial_output: Path | None = None, -) -> List[Dict]: - sem = asyncio.Semaphore(parallel) - write_lock = asyncio.Lock() - completed = 0 - - async def bounded(qid: str) -> Dict: - nonlocal completed - async with sem: - result = await 
eval_single_query( - qid, - dataset, - toolset, - search_tool, - text_token_counter, - policy, - max_turns, - ) - if partial_output is not None: - async with write_lock: - completed += 1 - partial_output.parent.mkdir(parents=True, exist_ok=True) - with partial_output.open("a", encoding="utf-8") as f: - f.write(json.dumps(result, default=str) + "\n") - logger.info( - "partial_result_saved", - path=str(partial_output), - completed=completed, - total=len(query_ids), - qid=qid, - ) - return result - - return list(await asyncio.gather(*(bounded(qid) for qid in query_ids))) - - -def summarize_results(results: List[Dict]) -> Dict: - n = len(results) - - def mean(key: str) -> float: - return sum(float(r.get(key, 0.0)) for r in results) / max(n, 1) - - return { - "n": n, - "errors": sum(1 for r in results if r.get("error")), - "recall": mean("recall"), - "trajectory_recall": mean("trajectory_recall"), - "final_answer_recall": mean("final_answer_recall"), - "precision": mean("precision"), - "reward": mean("reward"), - "turns": mean("turns"), - "n_curated": mean("n_curated"), - "n_pool": mean("n_pool"), - } - - -def print_results_table(name: str, results: List[Dict]) -> None: - summary = summarize_results(results) - print(f"\n{'=' * 80}") - print(f" {name}") - print(f"{'=' * 80}") - print(f" n: {summary['n']} errors: {summary['errors']}") - print(f" Recall: {summary['recall']:.4f}") - print(f" Trajectory Recall: {summary['trajectory_recall']:.4f}") - print(f" Final-Answer Recall: {summary['final_answer_recall']:.4f}") - print(f" Precision: {summary['precision']:.4f}") - print(f" Reward: {summary['reward']:.4f}") - print(f" Turns: {summary['turns']:.2f}") - print(f"{'=' * 80}\n") - - -async def main() -> None: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--dataset", default="browsecompplus") - parser.add_argument("--split", default="test", choices=["all", "test", "train", "rl"]) - parser.add_argument("--collection-split", default="test", 
choices=["test", "train", "rl"]) - parser.add_argument("--n-queries", type=int, default=100) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--query-ids", nargs="*", default=None) - parser.add_argument("--max-turns", type=int, default=MAX_TURNS) - parser.add_argument("--max-tokens", type=int, default=2048) - parser.add_argument("--temperature", type=float, default=1.0) - parser.add_argument("--top-p", type=float, default=0.9) - parser.add_argument("--parallel", type=int, default=1) - parser.add_argument("--base-url", default="http://127.0.0.1:8000/v1") - parser.add_argument("--model", default="harness-1") - parser.add_argument("--timeout", type=int, default=900) - parser.add_argument("--output", default=None) - parser.add_argument( - "--partial-output", - default=None, - help="Append one JSON line per completed query so interrupted runs keep progress.", - ) - parser.add_argument( - "--reranker", - type=str, - default="baseten", - choices=["baseten", "vllm", "none"], - help="Reranker backend: baseten (original), vllm (local Qwen3-Reranker-8B drop-in), or none.", - ) - args = parser.parse_args() - - config = get_config() - tiktoken_enc = tiktoken.get_encoding("o200k_harmony") - text_token_counter = lambda text: len(tiktoken_enc.encode(text)) - - dataset = get_dataset(args.dataset) - collection_names = dataset.get_cosmos_containers(split=args.collection_split) - cosmos_database = config.get_cosmos_database() - import os as _os - _EMBED_BASE_URL = _os.environ.get("EMBED_BASE_URL") - if _EMBED_BASE_URL: - from openai import OpenAI as _OpenAI - openai_client = _OpenAI( - base_url=_EMBED_BASE_URL, - api_key=_os.environ.get("EMBED_API_KEY", "EMPTY"), - ) - _embed_model = _os.environ.get("EMBED_MODEL", "qwen3-embed") - else: - openai_client = config.get_openai_client() - _embed_model = "text-embedding-3-small" - - try: - _reranker_backend = getattr(args, "reranker", "baseten") - if _reranker_backend == "none": - reranker = None - elif 
_reranker_backend == "vllm": - from harness.rerank import VLLMQwen3Reranker - - reranker = VLLMQwen3Reranker(token_counter=text_token_counter, max_tokens=4096) - else: - from harness.rerank import BasetenReranker - - reranker = BasetenReranker(token_counter=text_token_counter, max_tokens=4096) - except Exception: - reranker = None - - search_tool = SearchCorpusTool( - cosmos_database=cosmos_database, - openai_client=openai_client, - cosmos_container_name=collection_names[0], - openai_ef_name=_embed_model, - reranker=reranker, - snippet_max_chars=2048, - display_limit=SEARCH_DISPLAY_LIMIT, - ) - toolset = ToolSet(name=f"{args.dataset}_toolset") - toolset.add_tool(search_tool) - toolset.add_tool( - GrepCorpusTool( - cosmos_database=cosmos_database, - cosmos_container_name=collection_names[0], - token_counter=text_token_counter, - ) - ) - toolset.add_tool( - ReadDocumentTool( - cosmos_database=cosmos_database, - cosmos_container_name=collection_names[0], - reranker=reranker, - token_counter=text_token_counter, - max_tokens=4096, - ) - ) - toolset.add_tool(PruneChunksTool()) - - if args.split == "all": - all_qids = dataset.get_all_query_ids() - elif args.split == "test": - all_qids = dataset.get_test_query_ids() - elif args.split == "rl": - all_qids = dataset.get_rl_query_ids() - else: - all_qids = dataset.get_all_query_ids(split="train") - - if args.query_ids: - known_qids = set(all_qids) - query_ids = [qid for qid in args.query_ids if qid in known_qids] - if not query_ids: - raise ValueError("No valid query IDs remained after filtering") - else: - rng = random.Random(args.seed) - query_ids = rng.sample(all_qids, min(args.n_queries, len(all_qids))) - - policy = VllmTokenCompleter( - base_url=args.base_url, - model=args.model, - max_tokens=args.max_tokens, - temperature=args.temperature, - top_p=args.top_p, - timeout=args.timeout, - ) - - logger.info( - "evaluating_vllm", - model=args.model, - base_url=args.base_url, - n=len(query_ids), - parallel=args.parallel, - ) - 
results = await eval_queries( - query_ids=query_ids, - dataset=dataset, - toolset=toolset, - search_tool=search_tool, - text_token_counter=text_token_counter, - policy=policy, - max_turns=args.max_turns, - parallel=args.parallel, - partial_output=Path(args.partial_output) if args.partial_output else None, - ) - print_results_table(args.model, results) - - if args.output: - output_path = Path(args.output) - output_path.parent.mkdir(parents=True, exist_ok=True) - payload = { - args.model: [ - { - k: v - for k, v in r.items() - if isinstance(v, (int, float, str, bool, list)) - } - for r in results - ], - "_summary": summarize_results(results), - } - output_path.write_text(json.dumps(payload, indent=2), encoding="utf-8") - logger.info("results_saved", path=str(output_path)) - - -if __name__ == "__main__": - os.environ.setdefault("PYTHONDONTWRITEBYTECODE", "1") - asyncio.run(main()) diff --git a/cosmos-retriever/src/cosmos_retriever/inference/openai_chat.py b/cosmos-retriever/src/cosmos_retriever/inference/openai_chat.py index 011037f..943a00d 100644 --- a/cosmos-retriever/src/cosmos_retriever/inference/openai_chat.py +++ b/cosmos-retriever/src/cosmos_retriever/inference/openai_chat.py @@ -73,6 +73,7 @@ class ChatSearchResult: documents: list[ChatDocument] num_turns: int final_text: str = "" + pool_doc_ids: list[str] = field(default_factory=list) metadata: dict[str, str | int | float] = field(default_factory=dict) @@ -343,18 +344,24 @@ def run_responses_search( documents = _extract_documents(final_text, doc_text, max_documents) + # Every chunk surfaced by any tool call across the whole trajectory lands in + # ``doc_text``; its doc-level projection is the "pool" used for trajectory_recall. 
+ pool_doc_ids = sorted({cid.split("__")[0] for cid in doc_text}) + logger.info( "responses_search_complete", model=model, num_turns=num_turns, num_documents=len(documents), tool_calls=tool_call_count, + pool_size=len(pool_doc_ids), ) return ChatSearchResult( documents=documents, num_turns=num_turns, final_text=final_text, + pool_doc_ids=pool_doc_ids, metadata={ "backend": "openai_responses", "model": model, diff --git a/cosmos-retriever/src/cosmos_retriever/inference/vllm_policy.py b/cosmos-retriever/src/cosmos_retriever/inference/vllm_policy.py new file mode 100644 index 0000000..e786a54 --- /dev/null +++ b/cosmos-retriever/src/cosmos_retriever/inference/vllm_policy.py @@ -0,0 +1,121 @@ +"""Runtime policy for the ``harmony_vllm`` backend. + +Provides the token-level vLLM policy (:class:`VllmTokenCompleter`) and the +single-episode driver (:func:`run_single_episode`) used by ``retriever.py`` to +serve searches against a local vLLM OpenAI-compatible endpoint. These were +extracted from the former evaluation harness so the live serving path no longer +depends on any benchmarking/eval code. 
+""" + +from __future__ import annotations + +import asyncio +import json +import time +import urllib.error +import urllib.request +from typing import Dict + +from tinker_cookbook.completers import StopCondition, TokensWithLogprobs + +from cosmos_retriever.env_rl import SlidingWindowSearchEnv + + +class VllmTokenCompleter: + """Token-level policy backed by vLLM raw completions.""" + + def __init__( + self, + *, + base_url: str, + model: str, + max_tokens: int, + temperature: float, + top_p: float, + timeout: int, + ) -> None: + self.base_url = base_url.rstrip("/") + self.model = model + self.max_tokens = max_tokens + self.temperature = temperature + self.top_p = top_p + self.timeout = timeout + + @property + def completions_url(self) -> str: + if self.base_url.endswith("/v1"): + return f"{self.base_url}/completions" + return f"{self.base_url}/v1/completions" + + async def __call__(self, model_input, stop: StopCondition) -> TokensWithLogprobs: + prompt_tokens = model_input.to_ints() + payload = { + "model": self.model, + "prompt": prompt_tokens, + "max_tokens": self.max_tokens, + "temperature": self.temperature, + "top_p": self.top_p, + "stream": False, + "return_token_ids": True, + } + if stop and all(isinstance(s, int) for s in stop): + payload["stop_token_ids"] = list(stop) + elif stop: + payload["stop"] = list(stop) + + data = await asyncio.to_thread(self._post_json, payload) + choice = data["choices"][0] + tokens = ( + choice.get("token_ids") + or choice.get("tokens") + or choice.get("text_token_ids") + or [] + ) + if not tokens: + raise RuntimeError(f"vLLM response did not include token IDs: {str(data)[:500]}") + return TokensWithLogprobs(tokens=[int(t) for t in tokens], maybe_logprobs=None) + + def _post_json(self, payload: Dict) -> Dict: + body = json.dumps(payload).encode("utf-8") + req = urllib.request.Request( + self.completions_url, + data=body, + headers={"Content-Type": "application/json"}, + method="POST", + ) + try: + with urllib.request.urlopen(req, 
timeout=self.timeout) as resp: + return json.loads(resp.read().decode("utf-8")) + except urllib.error.HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + raise RuntimeError(f"vLLM HTTP {exc.code}: {detail[:1000]}") from exc + + +async def run_single_episode( + env: SlidingWindowSearchEnv, + policy: VllmTokenCompleter, +) -> Dict: + ob, stop_condition = await env.initial_observation() + turns = 0 + start = time.time() + + while True: + ac_with_logprobs = await policy(ob, stop_condition) + step_result = await env.step(ac_with_logprobs.tokens) + turns += 1 + if step_result.episode_done: + break + ob = step_result.next_observation + stop_condition = step_result.next_stop_condition + + elapsed = time.time() - start + result = { + "turns": turns, + "n_curated": len(env.wm.curated_ids), + "n_pool": len(env.wm.pool_ids), + "elapsed_s": round(elapsed, 1), + "tool_types_used": list(env._tool_types_used), + "total_curate_calls": env._total_curate_calls, + "pool_ids": list(env.wm.pool_ids), + } + return result diff --git a/cosmos-retriever/src/cosmos_retriever/rerank.py b/cosmos-retriever/src/cosmos_retriever/rerank.py index 7559779..d482eb2 100644 --- a/cosmos-retriever/src/cosmos_retriever/rerank.py +++ b/cosmos-retriever/src/cosmos_retriever/rerank.py @@ -1,14 +1,18 @@ +from __future__ import annotations + from abc import ABC, abstractmethod from dataclasses import dataclass import time -from typing import Callable, List, Optional +from typing import TYPE_CHECKING, Callable, List, Optional import requests import structlog -from baseten_performance_client import ClassificationResponse, PerformanceClient from cosmos_retriever.config import get_config +if TYPE_CHECKING: + from baseten_performance_client import ClassificationResponse, PerformanceClient + logger = structlog.get_logger("search_agent.rerank") @@ -172,7 +176,10 @@ def __init__( super().__init__(token_counter=token_counter, max_tokens=max_tokens) if client is None: config = get_config() - 
client = config.get_baseten_client() + client = config.get_baseten_client() # type: ignore[assignment] + + + self.client = client self.batch_size = batch_size self.max_concurrent_requests = max_concurrent_requests @@ -478,6 +485,6 @@ def _rerank( logger.info("rerank_complete", num_results=len(results), max_tokens=args.max_tokens) for result in results: logger.info("result", score=result.score, document=result.document) - -# Back-compat alias kept for callers of the original cosmos-retriever API. -VLLMReranker = VLLMQwen3Reranker + +# Back-compat alias kept for callers of the original cosmos-retriever API. +VLLMReranker = VLLMQwen3Reranker diff --git a/cosmos-retriever/src/cosmos_retriever/retriever.py b/cosmos-retriever/src/cosmos_retriever/retriever.py index 0308df2..c61e06e 100644 --- a/cosmos-retriever/src/cosmos_retriever/retriever.py +++ b/cosmos-retriever/src/cosmos_retriever/retriever.py @@ -206,8 +206,8 @@ def _search_sync( ) -> RetrievalResult: """Drive the upstream ``SlidingWindowSearchEnv`` for one query. - Mirrors ``inference/evaluate_harness1_vllm.py:run_single_episode`` from - the upstream harness-1 repo so that recall on BrowseComp+ matches the + Mirrors ``inference/vllm_policy.py:run_single_episode`` from the + upstream harness-1 repo so that recall on BrowseComp+ matches the published Harness-1 numbers (the env owns the ``WorkingMemory`` / ``curate`` / ``fan_out_search`` machinery the trained model relies on). 
""" @@ -220,7 +220,7 @@ def _search_sync( from cosmos_retriever.env_rl import ( # noqa: PLC0415 — heavy, harmony-only SlidingWindowSearchEnv, ) - from cosmos_retriever.inference.evaluate_harness1_vllm import ( # noqa: PLC0415 + from cosmos_retriever.inference.vllm_policy import ( # noqa: PLC0415 VllmTokenCompleter, run_single_episode, ) @@ -381,6 +381,7 @@ def _search_responses(self, query: str, max_documents: int) -> RetrievalResult: num_turns=chat_result.num_turns, final_text=chat_result.final_text, elapsed_s=round(elapsed, 3), + pool_doc_ids=chat_result.pool_doc_ids, metadata=chat_result.metadata, ) logger.info( From 03aa2e03db6c2d7225d2cecfb68a045b7bbedf33 Mon Sep 17 00:00:00 2001 From: Aryan Saboo Date: Tue, 30 Jun 2026 21:30:51 +0000 Subject: [PATCH 6/8] chore(toolkit): polish agentic_search descriptions, align maxDocuments to 1-50, de-brand harness references --- .../Controllers/MCPProtocolController.cs | 4 ++-- .../Controllers/MCPTestController.cs | 2 +- src/AzureCosmosDB.MCP.Toolkit/Program.cs | 8 ++++---- .../Services/AgenticSearchExecutor.cs | 13 ++++++------- .../Services/McpToolRequestValidator.cs | 2 +- 5 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPProtocolController.cs b/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPProtocolController.cs index cf2d274..4bca534 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPProtocolController.cs +++ b/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPProtocolController.cs @@ -262,12 +262,12 @@ public async Task HandleMCPRequest([FromBody] JsonElement request }, new { name = "agentic_search", - description = "Runs the Harness-1 multi-turn retrieval agent (pat-jj/harness-1) against a Cosmos DB corpus and returns ranked, curated documents. Pass `container=` to target a registered corpus (see CORPUS_REGISTRY env var on the host): the matching Cosmos account + database + embedding model is picked automatically per call. 
With no `container` the default-corpus env vars are used.", + description = "PREFERRED tool for answering knowledge questions from a Cosmos DB corpus. Runs an autonomous multi-turn retrieval agent that plans sub-queries, issues several vector/keyword searches, follows leads across documents, reranks candidates, and returns a curated, ranked set of the most relevant documents with their content. Use this for anything beyond a trivial lookup: complex, ambiguous, multi-part, or multi-hop questions; or whenever one-shot vector_search/text_search might miss relevant context. It is more thorough (but slower) than the single-shot search tools, so prefer it when answer quality matters more than latency. Just pass a natural-language `query`; the agent handles query planning and ranking for you. Optionally pass `container=` to target a registered corpus (see the CORPUS_REGISTRY env var on the host): the matching Cosmos account + database + embedding model is selected automatically per call. With no `container`, the default-corpus env vars are used. 
Use `maxDocuments` to cap how many curated documents are returned.", inputSchema = new { type = "object", properties = new { query = new { type = "string", description = "Natural-language information need to retrieve documents for", maxLength = 4096 }, - maxDocuments = new { type = "integer", description = "Maximum number of curated documents to return (1-30, default 20)", minimum = 1, maximum = 30, @default = 20 }, + maxDocuments = new { type = "integer", description = "Maximum number of curated documents to return (1-50, default 20)", minimum = 1, maximum = 50, @default = 20 }, database = new { type = "string", description = "Optional Cosmos database override (else COSMOS_DATABASE env var)", maxLength = 256 }, container = new { type = "string", description = "Optional Cosmos corpus container override (else COSMOS_CORPUS_CONTAINER env var)", maxLength = 256 } }, diff --git a/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPTestController.cs b/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPTestController.cs index 310d31e..9eb5e37 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPTestController.cs +++ b/src/AzureCosmosDB.MCP.Toolkit/Controllers/MCPTestController.cs @@ -73,7 +73,7 @@ public IActionResult ListTools() new { name = "text_search", description = "Select TOP N documents where a given property contains the provided search string. N must be between 1-20" }, new { name = "vector_search", description = "Performs vector search on Cosmos DB using Azure OpenAI embeddings" }, new { name = "get_approximate_schema", description = "Approximates a container schema by sampling up to 10 documents" }, - new { name = "agentic_search", description = "Runs the Harness-1 multi-turn retrieval agent (pat-jj/harness-1) via the cosmos-retriever HTTP service and returns ranked, curated documents." 
} + new { name = "agentic_search", description = "Runs an autonomous multi-turn retrieval agent against a Cosmos DB corpus and returns ranked, curated documents that best answer the query." } }; return Ok(new { tools, count = tools.Length, timestamp = DateTime.UtcNow }); diff --git a/src/AzureCosmosDB.MCP.Toolkit/Program.cs b/src/AzureCosmosDB.MCP.Toolkit/Program.cs index 0c522a1..7c9a4ed 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/Program.cs +++ b/src/AzureCosmosDB.MCP.Toolkit/Program.cs @@ -1096,10 +1096,10 @@ FROM c } } - [McpServerTool, Description("Runs the Harness-1 multi-turn retrieval agent (pat-jj/harness-1 served by vLLM) against a Cosmos DB corpus and returns ranked, curated documents that best answer the query. The agent internally issues hybrid (vector + full-text) RRF searches, optionally reranks with Qwen3-Reranker-8B, reads documents, and prunes its context across multiple turns. Pass `container=` to target a registered corpus (see CORPUS_REGISTRY env var on the host): the right Cosmos account + database + embedding model is picked automatically per call. With no `container` arg the default-corpus env vars are used.")] + [McpServerTool, Description("PREFERRED tool for answering knowledge questions from a Cosmos DB corpus. Runs an autonomous multi-turn retrieval agent that plans sub-queries, issues several vector/keyword searches, follows leads across documents, reranks candidates, and returns a curated, ranked set of the most relevant documents with their content. Use this for anything beyond a trivial lookup: complex, ambiguous, multi-part, or multi-hop questions; or whenever one-shot vector_search/text_search might miss relevant context. It is more thorough (but slower) than the single-shot search tools, so prefer it when answer quality matters more than latency. Just pass a natural-language `query`; the agent handles query planning and ranking for you. 
Optionally pass `container=` to target a registered corpus (see the CORPUS_REGISTRY env var on the host): the matching Cosmos account + database + embedding model is selected automatically per call. With no `container` the default-corpus env vars are used. Use `maxDocuments` to cap how many curated documents are returned.")] public static async Task AgenticSearch( [Description("Natural-language information need to retrieve documents for.")] string query, - [Description("Maximum number of curated documents to return (1-30, default 20).")] int maxDocuments = 20, + [Description("Maximum number of curated documents to return (1-50, default 20).")] int maxDocuments = 20, [Description("Optional Cosmos database name override (else COSMOS_DATABASE env var).")] string? database = null, [Description("Optional Cosmos corpus container name override (else COSMOS_CORPUS_CONTAINER env var).")] string? container = null) { @@ -1110,9 +1110,9 @@ public static async Task AgenticSearch( { return JsonSerializer.Serialize(new { error = "Parameter 'query' is required and must be non-empty." }); } - if (maxDocuments < 1 || maxDocuments > 30) + if (maxDocuments < 1 || maxDocuments > 50) { - return JsonSerializer.Serialize(new { error = "Parameter 'maxDocuments' must be between 1 and 30." }); + return JsonSerializer.Serialize(new { error = "Parameter 'maxDocuments' must be between 1 and 50." 
}); } return await AgenticSearchExecutor.RunAsync(query, maxDocuments, logger, database, container); diff --git a/src/AzureCosmosDB.MCP.Toolkit/Services/AgenticSearchExecutor.cs b/src/AzureCosmosDB.MCP.Toolkit/Services/AgenticSearchExecutor.cs index 3481373..25dc83b 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/Services/AgenticSearchExecutor.cs +++ b/src/AzureCosmosDB.MCP.Toolkit/Services/AgenticSearchExecutor.cs @@ -10,11 +10,10 @@ namespace AzureCosmosDB.MCP.Toolkit.Services; /// /// /// -/// The Python helper runs the trained Harness-1 multi-turn retrieval agent -/// (pat-jj/harness-1 served by vLLM) against an Azure Cosmos DB corpus -/// and returns a JSON document of curated, ranked results. It is started once -/// (python -m cosmos_retriever serve) and kept warm so the heavy -/// clients (Cosmos SDK, embeddings, Harmony encoder) are not re-initialised +/// The Python helper runs a multi-turn retrieval agent against an Azure Cosmos +/// DB corpus and returns a JSON document of curated, ranked results. It is +/// started once (python -m cosmos_retriever serve) and kept warm so the +/// heavy clients (Cosmos SDK, embeddings, model encoder) are not re-initialised /// on every call. /// /// @@ -35,7 +34,7 @@ namespace AzureCosmosDB.MCP.Toolkit.Services; /// /// /// -/// The retriever service owns its own configuration (VLLM_BASE_URL, +/// The retriever service owns its own configuration (model endpoint, /// ACCOUNT_URI, COSMOS_DATABASE, COSMOS_CORPUS_CONTAINER, /// CORPUS_REGISTRY_FILE, AZURE_OPENAI_*, etc.) read from its own /// environment / .env file; none of it flows through this process. @@ -65,7 +64,7 @@ public static class AgenticSearchExecutor /// POST /search endpoint. /// /// Natural-language information need. - /// Cap on the number of curated docs returned (1–30). + /// Cap on the number of curated docs returned (1–50). /// Logger for request lifecycle events. /// Optional Cosmos database override. /// Optional Cosmos container override. 
diff --git a/src/AzureCosmosDB.MCP.Toolkit/Services/McpToolRequestValidator.cs b/src/AzureCosmosDB.MCP.Toolkit/Services/McpToolRequestValidator.cs index cce8307..3875642 100644 --- a/src/AzureCosmosDB.MCP.Toolkit/Services/McpToolRequestValidator.cs +++ b/src/AzureCosmosDB.MCP.Toolkit/Services/McpToolRequestValidator.cs @@ -62,7 +62,7 @@ public sealed class McpToolRequestValidator ["agentic_search"] = new(new Dictionary(StringComparer.Ordinal) { ["query"] = ToolArgumentSchema.String(required: true, maxLength: 4096), - ["maxDocuments"] = ToolArgumentSchema.Integer(required: false, minValue: 1, maxValue: 30), + ["maxDocuments"] = ToolArgumentSchema.Integer(required: false, minValue: 1, maxValue: 50), ["database"] = ToolArgumentSchema.String(required: false, maxLength: 256), ["container"] = ToolArgumentSchema.String(required: false, maxLength: 256) }) From dc0d8fc115edbc220c1eebbe4fb33a02766f1834 Mon Sep 17 00:00:00 2001 From: Aryan Saboo Date: Tue, 30 Jun 2026 23:06:24 +0000 Subject: [PATCH 7/8] chore(cosmos-retriever): remove datagen package and unit tests folder The datagen/ package deletion was previously only staged, never committed, so it still appeared in the PR. Actually remove it (search_dataset.py, generate_sft_rl_splits.py, BrowseComp-Plus, README, __init__) along with the unit tests folder, the datagen TYPE_CHECKING import in tasks.py, the stale datagen comment in config.py, and the now-dangling pytest/respx dev deps and pytest/ruff test config in pyproject.toml. 
--- cosmos-retriever/pyproject.toml | 11 - .../src/cosmos_retriever/config.py | 3 - .../cosmos_retriever/datagen/BrowseComp-Plus | 1 - .../src/cosmos_retriever/datagen/README.md | 96 -- .../src/cosmos_retriever/datagen/__init__.py | 0 .../datagen/generate_sft_rl_splits.py | 140 -- .../datagen/search_dataset.py | 1333 ----------------- .../src/cosmos_retriever/tasks.py | 7 +- cosmos-retriever/tests/__init__.py | 1 - cosmos-retriever/tests/conftest.py | 26 - cosmos-retriever/tests/test_chat_agent.py | 271 ---- cosmos-retriever/tests/test_server.py | 99 -- cosmos-retriever/tests/test_tools.py | 104 -- cosmos-retriever/tests/test_trajectory.py | 134 -- 14 files changed, 3 insertions(+), 2223 deletions(-) delete mode 120000 cosmos-retriever/src/cosmos_retriever/datagen/BrowseComp-Plus delete mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/README.md delete mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/__init__.py delete mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/generate_sft_rl_splits.py delete mode 100644 cosmos-retriever/src/cosmos_retriever/datagen/search_dataset.py delete mode 100644 cosmos-retriever/tests/__init__.py delete mode 100644 cosmos-retriever/tests/conftest.py delete mode 100644 cosmos-retriever/tests/test_chat_agent.py delete mode 100644 cosmos-retriever/tests/test_server.py delete mode 100644 cosmos-retriever/tests/test_tools.py delete mode 100644 cosmos-retriever/tests/test_trajectory.py diff --git a/cosmos-retriever/pyproject.toml b/cosmos-retriever/pyproject.toml index 172321c..d643c51 100644 --- a/cosmos-retriever/pyproject.toml +++ b/cosmos-retriever/pyproject.toml @@ -36,9 +36,6 @@ dependencies = [ baseten = ["baseten-performance-client>=0.4,<1"] dev = [ "mypy>=1.10,<2", - "pytest>=8,<9", - "pytest-asyncio>=0.23,<1", - "respx>=0.21,<1", "ruff>=0.6,<1", ] @@ -63,9 +60,6 @@ target-version = "py311" select = ["E", "F", "I", "B", "UP", "SIM", "N"] ignore = ["E501"] # line length handled by formatter 
-[tool.ruff.lint.per-file-ignores] -"tests/*" = ["B", "N"] - [tool.mypy] python_version = "3.11" strict = false @@ -73,8 +67,3 @@ warn_unused_ignores = true warn_redundant_casts = true ignore_missing_imports = true files = ["src/cosmos_retriever"] - -[tool.pytest.ini_options] -asyncio_mode = "auto" -testpaths = ["tests"] -addopts = "-q" diff --git a/cosmos-retriever/src/cosmos_retriever/config.py b/cosmos-retriever/src/cosmos_retriever/config.py index ccc8086..d7d0569 100644 --- a/cosmos-retriever/src/cosmos_retriever/config.py +++ b/cosmos-retriever/src/cosmos_retriever/config.py @@ -439,9 +439,6 @@ def get_settings() -> RetrieverSettings: return settings -# Upstream Harness-1 modules (rerank.py, agent.py, datagen/) call ``get_config()`` -# and treat its return value as a ``Config`` with ``get_baseten_client`` etc. -# We forward to the same singleton so those modules import cleanly. def get_config() -> "RetrieverSettings": return get_settings() diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/BrowseComp-Plus b/cosmos-retriever/src/cosmos_retriever/datagen/BrowseComp-Plus deleted file mode 120000 index c047219..0000000 --- a/cosmos-retriever/src/cosmos_retriever/datagen/BrowseComp-Plus +++ /dev/null @@ -1 +0,0 @@ -/nvme/harness-1/external/BrowseComp-Plus \ No newline at end of file diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/README.md b/cosmos-retriever/src/cosmos_retriever/datagen/README.md deleted file mode 100644 index 389b746..0000000 --- a/cosmos-retriever/src/cosmos_retriever/datagen/README.md +++ /dev/null @@ -1,96 +0,0 @@ -# Datasets - -This repository includes the evaluation code used by Harness-1, but it does not -bundle large retrieval corpora or private Chroma indexes. - -## Public Ready-To-Run Path: BrowseComp+ - -BrowseComp+ is the recommended public smoke/evaluation dataset for this release. 
-The evaluator expects the public BrowseComp+ query/answer files and qrels on -disk, plus a Chroma collection containing the corresponding BrowseComp+ corpus -chunks. - -### 1. Download BrowseComp+ - -Clone the public BrowseComp+ release and follow its instructions to obtain the -decrypted query/answer file: - -```bash -git clone https://github.com/texttron/BrowseComp-Plus external/BrowseComp-Plus -``` - -After setup, you should have files equivalent to: - -```text -external/BrowseComp-Plus/topics-qrels/queries.tsv -external/BrowseComp-Plus/topics-qrels/qrel_golds.txt -external/BrowseComp-Plus/topics-qrels/qrel_evidence.txt -external/BrowseComp-Plus/data/browsecomp_plus_decrypted.jsonl -``` - -### 2. Configure local paths - -Copy `.env.example` to `.env.local` and point these variables at the downloaded -files: - -```bash -BROWSECOMPPLUS_QUERIES_PATH=external/BrowseComp-Plus/topics-qrels/queries.tsv -BROWSECOMPPLUS_QRELS_GOLD_PATH=external/BrowseComp-Plus/topics-qrels/qrel_golds.txt -BROWSECOMPPLUS_QRELS_EVIDENCE_PATH=external/BrowseComp-Plus/topics-qrels/qrel_evidence.txt -BROWSECOMPPLUS_ANSWERS_PATH=external/BrowseComp-Plus/data/browsecomp_plus_decrypted.jsonl -``` - -### 3. Build or provide the BrowseComp+ retrieval collection - -The search harness retrieves from Chroma. For BrowseComp+, create a Chroma -collection named `browsecomp_plus_test` containing the BrowseComp+ corpus chunks, -with document IDs matching the qrel document IDs. Configure your Chroma access in -`.env.local`: - -```bash -CHROMA_API_KEY=... -CHROMA_DATABASE=... -``` - -At minimum, each indexed chunk should preserve: - -- the document/chunk ID used in the qrels, -- text content, -- any metadata your Chroma deployment requires for retrieval. - -The evaluator looks up the collection name from the dataset class, so keeping -the collection name `browsecomp_plus_test` is the least surprising path. - -### 4. 
Run a BrowseComp+ Harness-1 eval - -Set your checkpoint path privately in the environment, then run: - -```bash -set -a && source .env.local && set +a - -PYTHONPATH=. uv run python inference/evaluate_harness1.py \ - --dataset browsecompplus \ - --split test \ - --collection-split test \ - --max-turns 40 \ - --temperature 1.0 \ - --checkpoints harness1="$HARNESS1_TINKER_CHECKPOINT" \ - --output tmp/eval_harness1_browsecompplus.json -``` - -The released Hugging Face checkpoint can be used for model loading and serving, -but the full search evaluation still requires a configured retrieval backend and -the Harness-1 tool environment. - -## Other In-Domain Corpora - -The `web`, `sec`, and `patents` in-domain corpora used in the paper are not -distributed here as public ready-made datasets/indexes. To reproduce those -settings, construct the corresponding data and Chroma collections yourself. We -recommend using the Context-1 data-generation repository as the reference -pipeline: - -https://github.com/chroma-core/context-1-data-gen - -Once your corpora are indexed in Chroma with compatible collection names and -document IDs, the same Harness-1 evaluation scripts can target those datasets. diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/__init__.py b/cosmos-retriever/src/cosmos_retriever/datagen/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/cosmos-retriever/src/cosmos_retriever/datagen/generate_sft_rl_splits.py b/cosmos-retriever/src/cosmos_retriever/datagen/generate_sft_rl_splits.py deleted file mode 100644 index 9edd94d..0000000 --- a/cosmos-retriever/src/cosmos_retriever/datagen/generate_sft_rl_splits.py +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env python3 -"""Generate SFT/RL query splits for all datasets. - -Splits each dataset's train queries into: - - SFT: 30% of train queries - - RL: 70% of train queries - -Output: JSON files per dataset with query_id lists for each split. 
- -Usage: - python -m datagen.generate_sft_rl_splits --output_dir datagen/splits - python -m datagen.generate_sft_rl_splits --output_dir datagen/splits --datasets browsecompplus sec patents web -""" - -import argparse -import json -import os -import sys - -from cosmos_retriever.datagen.search_dataset import get_dataset - - -DATASETS = ["browsecompplus", "sec", "patents", "web"] - - -def generate_splits(dataset_name: str) -> dict: - """Generate SFT/RL splits for a single dataset. - - Returns a dict with split info and query_id lists. - """ - print(f"\n{'='*60}") - print(f"Dataset: {dataset_name}") - print(f"{'='*60}") - - ds = get_dataset(dataset_name) - - all_ids = ds.get_all_query_ids() - train_ids = ds.get_train_query_ids() - test_ids = ds.get_test_query_ids() - sft_ids = ds.get_sft_query_ids() - rl_ids = ds.get_rl_query_ids() - - # Verify no overlap - sft_set = set(sft_ids) - rl_set = set(rl_ids) - train_set = set(train_ids) - test_set = set(test_ids) - - assert sft_set & rl_set == set(), "SFT and RL sets overlap!" - assert sft_set | rl_set == train_set, f"SFT + RL != train set! 
diff={train_set - (sft_set | rl_set)}" - - overlap = train_set & test_set - if overlap: - print(f" WARNING: {len(overlap)} query IDs overlap between train and test (pre-split dataset artifact)") - # For pre-split datasets, train/test may share IDs — this is expected - - print(f" Total queries: {len(all_ids)}") - print(f" Train queries: {len(train_ids)} ({len(train_ids)/len(all_ids)*100:.1f}%)") - print(f" Test queries: {len(test_ids)} ({len(test_ids)/len(all_ids)*100:.1f}%)") - print(f" ── SFT queries: {len(sft_ids)} ({len(sft_ids)/len(train_ids)*100:.1f}% of train, {len(sft_ids)/len(all_ids)*100:.1f}% of total)") - print(f" ── RL queries: {len(rl_ids)} ({len(rl_ids)/len(train_ids)*100:.1f}% of train, {len(rl_ids)/len(all_ids)*100:.1f}% of total)") - - return { - "dataset": dataset_name, - "total_queries": len(all_ids), - "train_queries": len(train_ids), - "test_queries": len(test_ids), - "sft_queries": len(sft_ids), - "rl_queries": len(rl_ids), - "sft_ratio": len(sft_ids) / len(train_ids) if train_ids else 0, - "rl_ratio": len(rl_ids) / len(train_ids) if train_ids else 0, - "sft_query_ids": sorted(sft_ids), - "rl_query_ids": sorted(rl_ids), - "test_query_ids": sorted(test_ids), - } - - -def main(): - parser = argparse.ArgumentParser(description="Generate SFT/RL query splits") - parser.add_argument( - "--output_dir", - type=str, - default="datagen/splits", - help="Output directory for split files (default: datagen/splits)", - ) - parser.add_argument( - "--datasets", - nargs="+", - default=DATASETS, - help=f"Datasets to process (default: {' '.join(DATASETS)})", - ) - args = parser.parse_args() - - os.makedirs(args.output_dir, exist_ok=True) - - summary = [] - - for ds_name in args.datasets: - try: - split_info = generate_splits(ds_name) - - # Save per-dataset split file - output_path = os.path.join(args.output_dir, f"{ds_name}_splits.json") - with open(output_path, "w") as f: - json.dump(split_info, f, indent=2) - print(f" Saved to: {output_path}") - - 
def normalize_document_id(document_id: str) -> str:
    """Normalize a document ID for evaluation.

    For URL-like IDs (those containing ``://``), strip the fragment to avoid
    mismatches between equivalent links such as ``/wiki/Foo`` and
    ``/wiki/Foo#section``. The URL is re-serialised with ``urlunsplit`` after
    the fragment is dropped.

    Args:
        document_id: Raw document identifier.

    Returns:
        The identifier without its URL fragment. IDs that do not contain
        ``://`` and IDs that ``urlsplit`` cannot parse are returned unchanged.
    """
    if "://" not in document_id:
        return document_id

    try:
        parsed = urlsplit(document_id)
    except ValueError:
        # urlsplit rejects e.g. unbalanced "[" / "]" in the host part. One
        # malformed ID should not abort building the whole query index, so
        # keep it verbatim instead of propagating the error.
        return document_id
    return urlunsplit((parsed.scheme, parsed.netloc, parsed.path, parsed.query, ""))
Prefer using get_dataset(name_str) directly.""" - BROWSECOMPPLUS = "browsecompplus" - BC_PLUS = "bc_plus" - EPSTEIN = "epstein" - LONGSEALQA = "longsealqa" - SEAL0QA = "seal0qa" - FRAMES = "frames" - HOTPOTQA_SUBSET = "hotpotqa_subset" - PODCASTS_TEST = "podcasts_test" - WEB = "web" - PATENTS = "patents" - SEC = "sec" - WEB_SIMPLE = "web_simple" - SEC_SIMPLE = "sec_simple" - DEEPSEARCH = "deepsearch" - GAIA = "gaia" - OTHER = "other" - - -# ============================================================================ -# Search Dataset Base Class -# ============================================================================ - - -class SearchDataset(ABC): - """ - Abstract base class for search datasets. - - A search dataset is a dataset of search queries and the documents that are required - to answer the query or that are relevant to the query. - - Subclasses must implement `_load_dataset()` to populate `_search_queries_dataset` - with a HuggingFace Dataset containing the following columns: - - query_id: The query id - - query: The search query - - document_ids: The documents that are required to answer the query or that are relevant to the query. - For document-level evaluation: List[str] of document/chunk IDs. - For fact-level evaluation: List[FactItem] where each FactItem has - {"fact": str, "chunk_ids": List[str], "is_final_answer": bool}. - - answer: The answer to the query - - Subclasses can override `evaluation_mode` property to change evaluation behavior: - - "document": Standard document/chunk-level evaluation (default) - - "fact": Fact-level evaluation where a fact is found if ANY of its chunk_ids are retrieved - - For final_answer_recall evaluation: - - Document-level datasets can override `_get_final_answer_document_ids()` to specify - which document IDs are "final answer" documents (e.g., gold vs evidence in BrowseCompPlus). - - Fact-level datasets automatically use facts where is_final_answer=True. 
def get_cosmos_containers(
    self, split: Optional[Literal["train", "test"]] = None
) -> List[str]:
    """Get the Cosmos container names that back this dataset.

    Args:
        split: If ``"train"`` or ``"test"`` and a split-specific list
            (``COSMOS_CONTAINERS_TRAIN`` / ``COSMOS_CONTAINERS_TEST``) is
            set, that list is used. Otherwise ``COSMOS_CONTAINERS`` is used.

    Returns:
        A new list of Cosmos container names. A copy is returned so that
        callers may reorder or extend it without altering the class-level
        configuration shared by every instance.

    Raises:
        ValueError: If the selected container list is empty.
    """
    if split == "train" and self.COSMOS_CONTAINERS_TRAIN is not None:
        containers = self.COSMOS_CONTAINERS_TRAIN
    elif split == "test" and self.COSMOS_CONTAINERS_TEST is not None:
        containers = self.COSMOS_CONTAINERS_TEST
    else:
        containers = self.COSMOS_CONTAINERS

    if not containers:
        raise ValueError(
            f"No Cosmos containers configured for dataset '{self.name}'"
            + (f" (split={split})" if split else "")
        )
    return list(containers)
def get_sft_query_ids(self) -> List[str]:
    """Return a copy of the SFT query ids.

    The SFT/RL sub-split is built on first use by calling
    ``_create_sft_rl_split`` when ``_sft_query_ids`` has not been set yet.
    """
    try:
        cached_ids = self._sft_query_ids
    except AttributeError:
        # First access: build the sub-split, then read what it stored.
        self._create_sft_rl_split()
        cached_ids = self._sft_query_ids
    return cached_ids.copy()
def get_expected_facts(self, query_id: str) -> List[FactItem]:
    """Get the expected facts for a given query id.

    Only available for fact-level datasets (``evaluation_mode == "fact"``).

    Args:
        query_id: The query whose facts should be returned.

    Returns:
        A list of fact objects, each with keys:
        - "fact": str - description of the fact
        - "chunk_ids": List[str] - chunk IDs containing this fact
        - "is_final_answer": bool - whether this fact is the final answer
        The list is a shallow copy, so adding or removing entries does not
        alter the internal query index (the fact dicts themselves are shared).

    Raises:
        ValueError: If this dataset is not a fact-level dataset.
        KeyError: If ``query_id`` is not in the query index.
    """
    if self.evaluation_mode != "fact":
        raise ValueError(f"Dataset {self.name} is not a fact-level dataset")
    return list(self._query_index[query_id]["document_ids"])
def evaluate_results_recall(
    self, query_id: str, retrieved_chunk_ids: List[str]
) -> float:
    """Compute recall of the retrieved chunk ids for one query.

    Document mode: the retrieved chunk ids are mapped to document ids with
    ``chunk_ids_to_doc_ids`` and recall is
    ``|retrieved ∩ relevant| / |relevant|``.

    Fact mode: recall is the fraction of expected facts for which at least
    one of the fact's ``chunk_ids`` was retrieved.

    Returns 0.0 when the query has no expected documents / facts.
    """
    retrieved = set(retrieved_chunk_ids)

    if self.evaluation_mode != "fact":
        # NOTE(review): expected ids were passed through normalize_document_id
        # when the query index was built; confirm chunk_ids_to_doc_ids yields
        # ids in that same normalized form.
        retrieved_docs: Set[str] = chunk_ids_to_doc_ids(retrieved)
        relevant_docs: Set[str] = set(self._query_index[query_id]["document_ids"])
        hits = len(retrieved_docs.intersection(relevant_docs))
        misses = len(relevant_docs - retrieved_docs)
        total_relevant = hits + misses
        return hits / total_relevant if total_relevant != 0 else 0.0

    expected_facts = self._query_index[query_id]["document_ids"]
    if len(expected_facts) == 0:
        return 0.0

    found = 0
    for fact in expected_facts:
        # A fact counts once, no matter how many of its chunks were retrieved.
        if set(fact["chunk_ids"]).intersection(retrieved):
            found += 1
    return found / len(expected_facts)
def evaluate_results_precision(
    self, query_id: str, retrieved_chunk_ids: List[str]
) -> float:
    """Compute precision of the retrieved chunk ids for one query.

    Document mode: the retrieved chunk ids are mapped to document ids with
    ``chunk_ids_to_doc_ids`` and precision is
    ``|retrieved ∩ relevant| / |retrieved|``.

    Fact mode: precision is the share of distinct retrieved chunk ids that
    appear in the set returned by ``_get_all_relevant_chunk_ids``.

    Returns 0.0 when nothing was retrieved.
    """
    retrieved = set(retrieved_chunk_ids)

    if self.evaluation_mode != "fact":
        retrieved_docs: Set[str] = chunk_ids_to_doc_ids(retrieved)
        relevant_docs: Set[str] = set(self._query_index[query_id]["document_ids"])
        correct = len(retrieved_docs.intersection(relevant_docs))
        spurious = len(retrieved_docs - relevant_docs)
        total_retrieved = correct + spurious
        return correct / total_retrieved if total_retrieved != 0 else 0.0

    if len(retrieved) == 0:
        return 0.0
    relevant_chunks = self._get_all_relevant_chunk_ids(query_id)
    return len(retrieved.intersection(relevant_chunks)) / len(retrieved)
Prefer get_dataset(name_str) instead.""" - return get_dataset(name.value) - - -# ============================================================================ -# Pre-Split Dataset Base Class -# ============================================================================ - - -class PreSplitSearchDataset(SearchDataset): - """ - Base class for search datasets with separate train/test HuggingFace paths. - - Instead of loading a single dataset and applying an 80/20 split, this class - loads from separate train and test HF paths and uses those as the canonical splits. - - Subclasses must define: - - HF_PATH_TRAIN: HuggingFace path for train split - - HF_PATH_TEST: HuggingFace path for test split - - name property - - Optionally: - - HF_SPLIT_TRAIN: The split name in the train dataset (default: "train") - - HF_SPLIT_TEST: The split name in the test dataset (default: "test") - - Override `_post_load_setup()` for additional processing (e.g., gold_document_ids) - """ - - HF_PATH_TRAIN: str - HF_PATH_TEST: str - HF_SPLIT_TRAIN: str = "train" - HF_SPLIT_TEST: str = "test" - - def _load_dataset(self) -> None: - """Load train and test datasets from separate HF paths.""" - cfg = config.get_config() - token = cfg.huggingface_token - - train_ds = datasets.load_dataset(self.HF_PATH_TRAIN, token=token)[ - self.HF_SPLIT_TRAIN - ] - test_ds = datasets.load_dataset(self.HF_PATH_TEST, token=token)[ - self.HF_SPLIT_TEST - ] - - # Store query IDs from each split before combining - self._presplit_train_ids = [str(qid) for qid in train_ds["query_id"]] - self._presplit_test_ids = [str(qid) for qid in test_ds["query_id"]] - - # Combine into single dataset for unified access - self._search_queries_dataset = datasets.concatenate_datasets( - [train_ds, test_ds] - ) - - # Hook for subclass-specific post-load processing - self._post_load_setup() - - def _post_load_setup(self) -> None: - """Override in subclasses for additional setup after loading.""" - pass - - def _create_train_test_split(self) -> 
None: - """Use the pre-defined splits instead of random 80/20.""" - self._train_query_ids = self._presplit_train_ids - self._test_query_ids = self._presplit_test_ids - - -class SingleSplitSearchDataset(SearchDataset): - """Dataset helper for eval-only corpora that expose a single HF split. - - For these datasets we typically want deterministic sampling from the full set, - so we expose all query IDs through both train and test partitions. - """ - - HF_PATH: str - HF_SPLIT_PREFERENCES: Tuple[str, ...] = ("test", "train", "validation") - - def _load_dataset(self) -> None: - self._search_queries_dataset = load_hf_dataset_first_available( - self.HF_PATH, split_preferences=self.HF_SPLIT_PREFERENCES - ) - - def _create_train_test_split(self) -> None: - all_query_ids = sorted(self._query_index.keys()) - self._train_query_ids = all_query_ids - self._test_query_ids = all_query_ids - - -# ============================================================================ -# BrowseComp+ Dataset -# ============================================================================ - - -class BrowseCompPlusDataset(SearchDataset): - """BrowseComp+ search dataset.""" - - _gold_document_ids: dict[str, Set[str]] # Maps query_id -> gold document IDs - - # Container name comes from config.cosmos_corpus_container (see get_cosmos_containers). 
- COSMOS_CONTAINERS = ["browsecomp_corpus_container"] - - def get_cosmos_containers(self, split=None): - return [config.get_config().cosmos_corpus_container] - - @property - def name(self) -> str: - return "browsecompplus" - - def _load_dataset(self) -> None: - cfg = config.get_config() - - qrels_gold = self._load_qrels(cfg.browsecompplus_qrels_gold_path) - qrels_evidence = self._load_qrels(cfg.browsecompplus_qrels_evidence_path) - - # Store gold document IDs separately for final_answer_recall - self._gold_document_ids = { - query_id: set(doc_ids) for query_id, doc_ids in qrels_gold.items() - } - - # Combine qrels_gold and qrels_evidence for overall recall - qrels: dict[str, list] = defaultdict(list) - for query_id, doc_ids in qrels_gold.items(): - qrels[query_id].extend(doc_ids) - for query_id, doc_ids in qrels_evidence.items(): - qrels[query_id].extend(doc_ids) - - queries = self._load_queries(cfg.browsecompplus_queries_path) - answers = self._load_decrypted_answers(cfg.browsecompplus_answers_path) - - query_ids = list(queries.keys()) - self._search_queries_dataset = datasets.Dataset.from_dict( - { - "query_id": query_ids, - "query": [queries[query_id] for query_id in query_ids], - "document_ids": [qrels[query_id] for query_id in query_ids], - "answer": [answers[query_id] for query_id in query_ids], - } - ) - - def _get_final_answer_document_ids(self, query_id: str) -> Set[str]: - """Return only gold document IDs (excluding evidence documents).""" - return self._gold_document_ids.get(query_id, set()) - - @staticmethod - def _load_qrels(path: str) -> dict: - """Load qrels from a TREC-format file.""" - qrels: dict[str, dict[str, int]] = {} - with open(path, "r") as f: - for line in f: - parts = line.strip().split() - query_id = parts[0] - doc_id = parts[2] - relevance = int(parts[3]) - if query_id not in qrels: - qrels[query_id] = {} - qrels[query_id][doc_id] = relevance - return qrels - - @staticmethod - def _load_queries(path: str) -> dict: - """Load queries from 
class WebDataset(SearchDataset):
    """Web search dataset.

    Loads the test queries from ``HF_PATH_TEST`` and the train queries from
    ``HF_PATH_TRAIN``. When both load, they are used as the canonical
    train/test split. When only one of them loads, that one is used alone and
    the base class's random 80/20 split is applied to it.
    """

    HF_PATH_TRAIN = "kellyhongg/1_17_web_train"
    HF_PATH_TEST = "kellyhongg/1_17_web_test"
    COSMOS_CONTAINERS_TRAIN = [f"web_train_1_17_replica_{i}" for i in range(1, 45)]
    COSMOS_CONTAINERS_TEST = [f"web_test_1_17_replica_{i}" for i in range(1, 45)]

    _gold_document_ids: dict[str, Set[str]]
    _has_presplit: bool = False  # Whether we have separate train/test data

    @property
    def name(self) -> str:
        return "web"

    @property
    def evaluation_mode(self) -> Literal["document", "fact"]:
        return "document"

    def _load_dataset(self) -> None:
        """Load whichever of the train/test repos is available.

        Raises:
            ValueError: If neither repo could be loaded. The message lists
                the underlying error for each attempted repo.
        """
        # Local import: `logging` is not among this module's top-level imports.
        import logging

        logger = logging.getLogger(__name__)
        token = config.get_config().huggingface_token

        test_ds = None
        train_ds = None
        load_errors: List[str] = []

        # Loading stays best-effort (one repo may legitimately be missing),
        # but failures are logged and remembered instead of being discarded.
        try:
            test_ds = datasets.load_dataset(self.HF_PATH_TEST, token=token)["test"]
        except Exception as exc:
            load_errors.append(f"{self.HF_PATH_TEST}: {exc!r}")
            logger.warning(
                "Could not load web test data from %s", self.HF_PATH_TEST, exc_info=True
            )

        try:
            raw_train = datasets.load_dataset(self.HF_PATH_TRAIN, token=token)
            # Pick the first available non-empty split.
            for split_name in ("train", "test"):
                if split_name in raw_train and len(raw_train[split_name]) > 0:
                    train_ds = raw_train[split_name]
                    break
        except Exception as exc:
            load_errors.append(f"{self.HF_PATH_TRAIN}: {exc!r}")
            logger.warning(
                "Could not load web train data from %s", self.HF_PATH_TRAIN, exc_info=True
            )

        if train_ds is not None and test_ds is not None:
            # Both available: use pre-split
            self._has_presplit = True
            self._presplit_train_ids = [str(qid) for qid in train_ds["query_id"]]
            self._presplit_test_ids = [str(qid) for qid in test_ds["query_id"]]
            self._search_queries_dataset = datasets.concatenate_datasets(
                [train_ds, test_ds]
            )
        elif test_ds is not None:
            # Only test available: use it with random 80/20 split
            self._has_presplit = False
            self._search_queries_dataset = test_ds
        elif train_ds is not None:
            # Only train available
            self._has_presplit = False
            self._search_queries_dataset = train_ds
        else:
            details = f" ({'; '.join(load_errors)})" if load_errors else ""
            raise ValueError(
                "Neither train nor test data could be loaded for WebDataset" + details
            )

        # Extract gold_document_ids; the column may hold lists or their repr.
        gold_document_ids = [
            ast.literal_eval(docids) if isinstance(docids, str) else docids
            for docids in self._search_queries_dataset["gold_document_ids"]
        ]
        # IDs are coerced to str, as WebSimpleDataset does for the same
        # corpus, so comparisons against string IDs are type-consistent.
        self._gold_document_ids = {
            str(query_id): {str(doc_id) for doc_id in doc_ids}
            for query_id, doc_ids in zip(
                self._search_queries_dataset["query_id"], gold_document_ids
            )
        }

    def _create_train_test_split(self) -> None:
        """Use pre-split if available, otherwise random 80/20."""
        if self._has_presplit:
            self._train_query_ids = self._presplit_train_ids
            self._test_query_ids = self._presplit_test_ids
        else:
            # Fall back to random split
            super()._create_train_test_split()

    def _get_final_answer_document_ids(self, query_id: str) -> Set[str]:
        """Return only gold document IDs (excluding evidence documents)."""
        return self._gold_document_ids.get(query_id, set())
class PatentsDataset(PreSplitSearchDataset):
    """Patents search dataset with pre-split train/test HF paths.

    Uses document-level evaluation. The ``gold_document_ids`` column of the
    combined dataset supplies the "final answer" documents per query.
    """

    HF_PATH_TRAIN = "kellyhongg/1_18_patents_train"
    HF_PATH_TEST = "kellyhongg/1_18_patents_test"
    HF_SPLIT_TRAIN = "train"
    HF_SPLIT_TEST = "test"
    COSMOS_CONTAINERS_TRAIN = [f"patents_train_1_18_replica_{i}" for i in range(1, 45)]
    COSMOS_CONTAINERS_TEST = [f"patents_test_1_18_replica_{i}" for i in range(1, 45)]

    _gold_document_ids: dict[str, Set[str]]  # Maps query_id -> gold document IDs

    @property
    def name(self) -> str:
        return "patents"

    @property
    def evaluation_mode(self) -> Literal["document", "fact"]:
        return "document"

    def _post_load_setup(self) -> None:
        """Extract gold_document_ids from the combined dataset."""
        # The column may hold real lists or their string repr.
        gold_document_ids = [
            ast.literal_eval(docids) if isinstance(docids, str) else docids
            for docids in self._search_queries_dataset["gold_document_ids"]
        ]
        # IDs are coerced to str, matching what the query index does for
        # document_ids and what WebSimpleDataset / PodcastsTestSet do for
        # gold IDs, so comparisons against string IDs are type-consistent.
        self._gold_document_ids = {
            str(query_id): {str(doc_id) for doc_id in doc_ids}
            for query_id, doc_ids in zip(
                self._search_queries_dataset["query_id"], gold_document_ids
            )
        }

    def _get_final_answer_document_ids(self, query_id: str) -> Set[str]:
        """Return only gold document IDs (excluding evidence documents)."""
        return self._gold_document_ids.get(query_id, set())
class PodcastsTestSet(SearchDataset):
    """Podcasts query set, read from the ``test`` split of ``HF_PATH``.

    Uses document-level evaluation; the ``gold_document_ids`` column supplies
    the "final answer" documents for each query.
    """

    HF_PATH = "kellyhongg/1_25_podcasts_test"

    @property
    def name(self) -> str:
        return "podcasts_test"

    @property
    def evaluation_mode(self) -> Literal["document", "fact"]:
        """Podcasts is scored per document rather than per fact."""
        return "document"

    def _load_dataset(self) -> None:
        """Load the test split and index its gold document ids by query id."""
        repo = datasets.load_dataset(
            self.HF_PATH, token=config.get_config().huggingface_token
        )
        self._search_queries_dataset = repo["test"]

        # The gold column may hold lists or the string repr of a list.
        parsed_gold = []
        for entry in self._search_queries_dataset["gold_document_ids"]:
            if isinstance(entry, str):
                entry = ast.literal_eval(entry)
            parsed_gold.append(entry)

        # Keys and ids are stringified (model outputs are strings).
        gold_by_query = {}
        query_ids = self._search_queries_dataset["query_id"]
        for raw_query_id, doc_ids in zip(query_ids, parsed_gold):
            gold_by_query[str(raw_query_id)] = set(str(doc_id) for doc_id in doc_ids)
        self._gold_document_ids = gold_by_query

    def _get_final_answer_document_ids(self, query_id: str) -> Set[str]:
        """Gold document ids for the query; an empty set if none are known."""
        try:
            return self._gold_document_ids[query_id]
        except KeyError:
            return set()
COSMOS_CONTAINERS_TRAIN = [f"web_train_1_17_replica_{i}" for i in range(1, 45)] - COSMOS_CONTAINERS_TEST = [f"web_test_1_17_replica_{i}" for i in range(1, 45)] - - _gold_document_ids: dict[str, Set[str]] - - @property - def name(self) -> str: - return "web_simple" - - @property - def evaluation_mode(self) -> Literal["document", "fact"]: - return "document" - - def _post_load_setup(self) -> None: - """Extract gold_document_ids from the combined dataset.""" - gold_document_ids = [ - ast.literal_eval(docids) if isinstance(docids, str) else docids - for docids in self._search_queries_dataset["gold_document_ids"] - ] - self._gold_document_ids = { - str(query_id): set(str(doc_id) for doc_id in doc_ids) - for query_id, doc_ids in zip( - self._search_queries_dataset["query_id"], gold_document_ids - ) - } - - def _get_final_answer_document_ids(self, query_id: str) -> Set[str]: - """Return only gold document IDs (excluding evidence documents).""" - return self._gold_document_ids.get(query_id, set()) - - -class SECSimpleDataset(PreSplitSearchDataset): - """SEC Simple search dataset with pre-split train/test HF paths.""" - - HF_PATH_TRAIN = "kellyhongg/1_25_sec_simple_train" - HF_PATH_TEST = "kellyhongg/1_25_sec_simple_test" - HF_SPLIT_TRAIN = "train" - HF_SPLIT_TEST = "test" - COSMOS_CONTAINERS_TRAIN = ["sec_1_4"] - COSMOS_CONTAINERS_TEST = ["sec_1_4"] - - @property - def name(self) -> str: - return "sec_simple" - - @property - def evaluation_mode(self) -> Literal["document", "fact"]: - """SEC Simple uses fact-level evaluation.""" - return "fact" - - -# ============================================================================ -# Additional Benchmark Datasets (Kelly April 2026 refresh) -# ============================================================================ - - -class BCPlusDataset(SingleSplitSearchDataset): - """BrowseComp+ benchmark loaded directly from HuggingFace.""" - - HF_PATH = "kellyhongg/bc_plus" - HF_SPLIT_PREFERENCES = ("test", "train") - COSMOS_CONTAINERS 
= ["browsecomp_corpus_container"] - - def get_cosmos_containers(self, split=None): - return [config.get_config().cosmos_corpus_container] - - _gold_document_ids: dict[str, Set[str]] - - @property - def name(self) -> str: - return "bc_plus" - - def _load_dataset(self) -> None: - self._search_queries_dataset = load_hf_dataset_first_available( - self.HF_PATH, split_preferences=self.HF_SPLIT_PREFERENCES - ) - - gold_document_ids = [ - ast.literal_eval(docids) if isinstance(docids, str) else docids - for docids in self._search_queries_dataset["gold_document_ids"] - ] - self._gold_document_ids = { - str(query_id): { - normalize_document_id(str(doc_id)) for doc_id in doc_ids - } - for query_id, doc_ids in zip( - self._search_queries_dataset["query_id"], gold_document_ids - ) - } - - def _get_final_answer_document_ids(self, query_id: str) -> Set[str]: - return self._gold_document_ids.get(query_id, set()) - - -class LongSealQADataset(SingleSplitSearchDataset): - """LongSealQA retrieval dataset backed by a Chroma collection.""" - - HF_PATH = "kellyhongg/longsealqa" - HF_SPLIT_PREFERENCES = ("test", "train") - COSMOS_CONTAINERS = ["longsealqa"] - - @property - def name(self) -> str: - return "longsealqa" - - -class Seal0QADataset(SingleSplitSearchDataset): - """Seal0QA open-web retrieval dataset (web search based).""" - - HF_PATH = "kellyhongg/seal0qa" - HF_SPLIT_PREFERENCES = ("test", "train") - - @property - def name(self) -> str: - return "seal0qa" - - -class FramesDataset(SingleSplitSearchDataset): - """FRAMES benchmark for Wikipedia-focused retrieval.""" - - HF_PATH = "kellyhongg/frames" - HF_SPLIT_PREFERENCES = ("test", "train") - - @property - def name(self) -> str: - return "frames" - - -class HotpotQASubsetDataset(SingleSplitSearchDataset): - """HotpotQA subset benchmark for Wikipedia-focused retrieval.""" - - HF_PATH = "kellyhongg/hotpotqa_subset" - HF_SPLIT_PREFERENCES = ("test", "train") - - @property - def name(self) -> str: - return "hotpotqa_subset" - - -# 
============================================================================ -# SEC Filings Dataset (legacy - uses HuggingFace kellyhongg/sec_filings) -# ============================================================================ - - -class SECFilingsDataset(SearchDataset): - """SEC Filings search dataset from HuggingFace. - - This dataset uses fact-level evaluation where document_ids contains a list of - fact objects, each with chunk_ids. A fact is considered found if ANY of its - chunk_ids are retrieved. - """ - - HF_PATH = "kellyhongg/sec_filings" - COSMOS_CONTAINERS = [f"latest_sec_filings_replica_{i}" for i in range(46)] - - @property - def name(self) -> str: - return "sec_filings" - - @property - def evaluation_mode(self) -> Literal["document", "fact"]: - """SEC Filings uses fact-level evaluation.""" - return "fact" - - def _load_dataset(self) -> None: - self._search_queries_dataset = datasets.load_dataset(self.HF_PATH)["test"] - - def get_cosmos_containers( - self, split: Optional[Literal["train", "test"]] = None - ) -> List[str]: - """Return the Cosmos container names for SEC Filings.""" - return self.COSMOS_CONTAINERS - - -# ============================================================================ -# QA-Only Benchmark Datasets (no document_ids, answer-evaluation only) -# ============================================================================ - - -class DeepSearchDataset(SearchDataset): - """xbench/DeepSearch benchmark dataset. - - This is an answer-evaluation benchmark (no document_ids / recall evaluation). - The dataset is encrypted; the decrypt code is available at the xbench_evals - GitHub repo. We load the raw HF dataset and map it to the SearchDataset - interface with empty document_ids so that the query/answer pipeline works. 
- - HF columns: id, prompt, answer, reference_steps, canary - """ - - HF_PATH = "xbench/DeepSearch" - - @property - def name(self) -> str: - return "deepsearch" - - def _load_dataset(self) -> None: - raw_ds = datasets.load_dataset(self.HF_PATH, split="train") - - self._search_queries_dataset = datasets.Dataset.from_dict( - { - "query_id": [str(row["id"]) for row in raw_ds], - "query": [row["prompt"] for row in raw_ds], - "document_ids": [[] for _ in range(len(raw_ds))], - "answer": [row["answer"] for row in raw_ds], - } - ) - - -class GAIADataset(SearchDataset): - """GAIA benchmark dataset (gaia-benchmark/GAIA). - - This is an answer-evaluation benchmark for general AI assistants. - It is a gated dataset — you must accept the terms on the HF page before - loading: https://huggingface.co/datasets/gaia-benchmark/GAIA - - We load the '2023_all' config and combine validation + test splits. - HF columns: task_id, Question, Level, Final answer, file_name, file_path, - Annotator Metadata - - document_ids is set to empty because GAIA does not provide retrieval labels. - """ - - HF_PATH = "gaia-benchmark/GAIA" - HF_CONFIG = "2023_all" - - @property - def name(self) -> str: - return "gaia" - - def _load_dataset(self) -> None: - cfg = config.get_config() - token = cfg.huggingface_token - - raw_ds = datasets.load_dataset( - self.HF_PATH, self.HF_CONFIG, token=token - ) - - # GAIA typically has 'validation' and 'test' splits. - # 'test' answers are hidden, so we use 'validation' as our primary data. - # If both exist, concatenate them; otherwise use whichever is available. - splits_to_use = [] - for split_name in ["validation", "test"]: - if split_name in raw_ds: - splits_to_use.append(raw_ds[split_name]) - - if not splits_to_use: - raise ValueError( - f"GAIA dataset has no usable splits. 
Available: {list(raw_ds.keys())}" - ) - - combined = datasets.concatenate_datasets(splits_to_use) - - self._search_queries_dataset = datasets.Dataset.from_dict( - { - "query_id": [str(row["task_id"]) for row in combined], - "query": [row["Question"] for row in combined], - "document_ids": [[] for _ in range(len(combined))], - "answer": [row["Final answer"] for row in combined], - } - ) - - -# ============================================================================ -# Dataset Registry & Factory -# ============================================================================ - - - - -# ============================================================================ -# Enterprise RAG Bench Dataset (Onyx EnterpriseRAG-Bench) -# ============================================================================ - - -class EnterpriseRagBenchDataset(SearchDataset): - """Onyx EnterpriseRAG-Bench, 500-question leaderboard split. - - Document-level retrieval eval. 30 of the 500 questions - (high_level + info_not_found) have empty `expected_doc_ids` — they are - included by default but contribute recall=0. 
- """ - - _gold_document_ids: dict - - def get_cosmos_containers(self, split=None): - import os - return [os.environ.get("ENT_RAG_CONTAINER", "enterprise_ragbench_corpus")] - - @property - def name(self) -> str: - return "enterprise_rag" - - def _load_dataset(self) -> None: - ds = datasets.load_dataset( - "onyx-dot-app/EnterpriseRAG-Bench", "questions", split="test" - ) - rows = [] - for r in ds: - qid = r.get("question_id") or r.get("qid") - if qid is None: - continue - rows.append({ - "qid": str(qid), - "question": r.get("question", ""), - "expected_doc_ids": list(r.get("expected_doc_ids") or []), - "gold_answer": r.get("gold_answer", ""), - }) - self._gold_document_ids = { - r["qid"]: set(r["expected_doc_ids"]) for r in rows - } - self._search_queries_dataset = datasets.Dataset.from_dict({ - "query_id": [r["qid"] for r in rows], - "query": [r["question"] for r in rows], - "document_ids": [list(r["expected_doc_ids"]) for r in rows], - "answer": [r["gold_answer"] for r in rows], - }) - - def _get_final_answer_document_ids(self, query_id): - return self._gold_document_ids.get(query_id, set()) - - def _create_train_test_split(self) -> None: - all_query_ids = sorted(self._query_index.keys()) - self._train_query_ids = all_query_ids - self._test_query_ids = all_query_ids - - -DATASET_REGISTRY: dict[str, type[SearchDataset]] = { - "browsecompplus": BrowseCompPlusDataset, - "enterprise_rag": EnterpriseRagBenchDataset, - "bc_plus": BCPlusDataset, - "epstein": EpsteinDataset, - "longsealqa": LongSealQADataset, - "seal0qa": Seal0QADataset, - "frames": FramesDataset, - "hotpotqa_subset": HotpotQASubsetDataset, - "podcasts_test": PodcastsTestSet, - "web": WebDataset, - "patents": PatentsDataset, - "sec": SECDataset, - "web_simple": WebSimpleDataset, - "sec_simple": SECSimpleDataset, - "sec_filings": SECFilingsDataset, # Legacy dataset - works with existing collections - "deepsearch": DeepSearchDataset, - "gaia": GAIADataset, -} - - -def get_dataset(name: str) -> 
SearchDataset: - """Create a search dataset by name. - - Args: - name: The dataset name. Available datasets: - - "browsecompplus": BrowseComp+ dataset - - "bc_plus": BrowseComp+ HF dataset variant (single collection) - - "epstein": Epstein dataset - - "longsealqa": LongSeal QA dataset - - "seal0qa": Seal0 QA dataset (open-web) - - "frames": FRAMES dataset - - "hotpotqa_subset": HotpotQA subset dataset - - "podcasts_test": Podcasts test dataset - - "web": Web dataset (pre-split train/test) - - "patents": Patents dataset (pre-split train/test) - - "sec": SEC Filings dataset (pre-split train/test) - - "web_simple": Web Simple dataset (pre-split train/test) - - "sec_simple": SEC Simple dataset (pre-split train/test) - - "deepsearch": xbench/DeepSearch benchmark (answer-eval only) - - "gaia": GAIA benchmark (answer-eval only, gated) - - Returns: - An instance of the corresponding dataset class. - - Raises: - ValueError: If the dataset name is not recognized. - """ - if name not in DATASET_REGISTRY: - available = ", ".join(DATASET_REGISTRY.keys()) - raise ValueError(f"Unknown dataset: {name}. Available datasets: {available}") - return DATASET_REGISTRY[name]() diff --git a/cosmos-retriever/src/cosmos_retriever/tasks.py b/cosmos-retriever/src/cosmos_retriever/tasks.py index 3a0fb27..33b3433 100644 --- a/cosmos-retriever/src/cosmos_retriever/tasks.py +++ b/cosmos-retriever/src/cosmos_retriever/tasks.py @@ -17,7 +17,6 @@ if TYPE_CHECKING: from harness.config import Config - from datagen.search_dataset import SearchDataset logger = structlog.get_logger("search_agent.agent") @@ -297,7 +296,7 @@ def succeeded(self) -> bool: def from_search_task_output( cls, output: SearchTaskOutput, - dataset: "SearchDataset", + dataset: "Any", ) -> "SearchTaskEvaluationOutput": """Create an evaluation output from a SearchTaskOutput and dataset. 
@@ -345,7 +344,7 @@ def from_search_task_output( @staticmethod def _calculate_prune_accuracy( output: SearchTaskOutput, - dataset: "SearchDataset", + dataset: "Any", ) -> Optional[float]: """Calculate prune accuracy for a search task output. @@ -379,7 +378,7 @@ def _calculate_prune_accuracy( @staticmethod def _calculate_rerank_metrics( output: SearchTaskOutput, - dataset: "SearchDataset", + dataset: "Any", ) -> tuple[Optional[float], Optional[int]]: """Calculate reranker metrics for a search task output. diff --git a/cosmos-retriever/tests/__init__.py b/cosmos-retriever/tests/__init__.py deleted file mode 100644 index 249da09..0000000 --- a/cosmos-retriever/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Test suite for cosmos-retriever.""" diff --git a/cosmos-retriever/tests/conftest.py b/cosmos-retriever/tests/conftest.py deleted file mode 100644 index c8008be..0000000 --- a/cosmos-retriever/tests/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Shared pytest fixtures. - -We deliberately avoid the project's :class:`RetrieverSettings` here — none of -the unit tests should touch real Cosmos or OpenAI. The :func:`stub_settings_env` -fixture (auto-applied) populates the required env vars with placeholder values -so that any import-time validation succeeds without secrets. 
-""" - -from __future__ import annotations - -from collections.abc import Iterator - -import pytest - - -@pytest.fixture(autouse=True) -def stub_settings_env(monkeypatch: pytest.MonkeyPatch) -> Iterator[None]: - """Provide harmless defaults for required env vars across the test session.""" - - monkeypatch.setenv("ACCOUNT_URI", "https://stub.documents.azure.com:443/") - monkeypatch.setenv("COSMOS_DATABASE", "test-db") - monkeypatch.setenv("COSMOS_CORPUS_CONTAINER", "test-container") - monkeypatch.setenv("OPENAI_API_KEY", "sk-test-stub") - monkeypatch.setenv("VLLM_BASE_URL", "http://test-vllm:8000") - monkeypatch.setenv("VLLM_MODEL_NAME", "harness-1") - yield diff --git a/cosmos-retriever/tests/test_chat_agent.py b/cosmos-retriever/tests/test_chat_agent.py deleted file mode 100644 index d5637b6..0000000 --- a/cosmos-retriever/tests/test_chat_agent.py +++ /dev/null @@ -1,271 +0,0 @@ -"""Tests for the generic OpenAI-compatible chat backend. - -These never touch a real model or Cosmos: a fake chat client returns scripted -responses and a stub tool returns canned search results, so we only exercise -the agent loop, tool dispatch, doc-text hydration, and final-answer parsing. 
-""" - -from __future__ import annotations - -from types import SimpleNamespace -from typing import Any - -import pytest - -from cosmos_retriever.config import RetrieverSettings -from cosmos_retriever.inference.openai_chat import run_chat_search, run_responses_search -from cosmos_retriever.tools import SEARCH_CORPUS_SCHEMA, Tool, ToolCallMetadata, ToolSet - - -# -------------------------------------------------------------------------- -# Fakes -# -------------------------------------------------------------------------- -class _StubSearchTool(Tool): - """Returns one canned search hit formatted like the real SearchCorpusTool.""" - - def __call__( - self, params: dict[Any, Any], overrides: dict[Any, Any] | None = None - ) -> tuple[str, ToolCallMetadata | None]: - return ( - "\n# DOCUMENT ID: doc_1 (12 tokens) \nMarie Curie discovered radium in 1898.", - None, - ) - - -def _toolset() -> ToolSet: - ts = ToolSet() - ts.add_tool(_StubSearchTool(tool_schema=SEARCH_CORPUS_SCHEMA)) - return ts - - -class _FakeFunction: - def __init__(self, name: str, arguments: str) -> None: - self.name = name - self.arguments = arguments - - -class _FakeToolCall: - def __init__(self, call_id: str, name: str, arguments: str) -> None: - self.id = call_id - self.type = "function" - self.function = _FakeFunction(name, arguments) - - -class _FakeMessage: - def __init__(self, content: str | None = None, tool_calls: list | None = None) -> None: - self.content = content - self.tool_calls = tool_calls - - -class FakeChatClient: - """Mimics the subset of openai.OpenAI used by run_chat_search.""" - - def __init__(self, scripted_messages: list[_FakeMessage]) -> None: - self._scripted = list(scripted_messages) - self.calls: list[dict] = [] - - # client.chat.completions.create(...) 
- @property - def chat(self) -> FakeChatClient: - return self - - @property - def completions(self) -> FakeChatClient: - return self - - def create(self, **kwargs: Any) -> Any: - self.calls.append(kwargs) - message = self._scripted.pop(0) - return SimpleNamespace(choices=[SimpleNamespace(message=message)]) - - -# -------------------------------------------------------------------------- -# Chat agent tests -# -------------------------------------------------------------------------- -def test_chat_search_executes_tool_then_parses_final_docs() -> None: - client = FakeChatClient( - [ - _FakeMessage( - content="", - tool_calls=[_FakeToolCall("call_1", "search_corpus", '{"query": "radium"}')], - ), - _FakeMessage( - content=( - "\n" - "States Curie discovered radium.\n" - "" - ), - tool_calls=None, - ), - ] - ) - - result = run_chat_search( - toolset=_toolset(), - client=client, - model="gpt-4o-foundry", - query="Who discovered radium?", - max_documents=5, - ) - - assert result.num_turns == 2 - assert len(result.documents) == 1 - doc = result.documents[0] - assert doc.id == "doc_1" - assert "Marie Curie discovered radium" in doc.text # hydrated from the search result - assert doc.justification == "States Curie discovered radium." - assert doc.rank == 0 - assert result.metadata["backend"] == "openai_chat" - assert result.metadata["tool_calls"] == 1 - # The model + tools were actually forwarded. 
- assert client.calls[0]["model"] == "gpt-4o-foundry" - assert any(t["function"]["name"] == "search_corpus" for t in client.calls[0]["tools"]) - - -def test_chat_search_handles_immediate_final_answer() -> None: - client = FakeChatClient( - [_FakeMessage(content="", tool_calls=None)] - ) - result = run_chat_search( - toolset=_toolset(), client=client, model="m", query="q", max_documents=3 - ) - assert result.num_turns == 1 - assert [d.id for d in result.documents] == ["doc_9"] - - -def test_chat_search_respects_max_turns_without_final() -> None: - # Always returns a tool call → never a final answer; loop must stop at max_turns. - looping = [ - _FakeMessage( - content="thinking", - tool_calls=[_FakeToolCall(f"c{i}", "search_corpus", "{}")], - ) - for i in range(10) - ] - client = FakeChatClient(looping) - result = run_chat_search( - toolset=_toolset(), client=client, model="m", query="q", max_turns=3 - ) - assert result.num_turns == 3 - assert result.documents == [] # no blocks ever emitted - - -# -------------------------------------------------------------------------- -# Config tests -# -------------------------------------------------------------------------- -def test_use_chat_backend_flag() -> None: - harmony = RetrieverSettings(inference_backend="harmony_vllm") # type: ignore[call-arg] - chat = RetrieverSettings(inference_backend="openai_chat") # type: ignore[call-arg] - assert harmony.use_chat_backend is False - assert chat.use_chat_backend is True - - -def test_build_chat_client_requires_base_url_and_model() -> None: - s = RetrieverSettings(inference_backend="openai_chat") # type: ignore[call-arg] - with pytest.raises(ValueError, match="CHAT_BASE_URL"): - s.build_chat_client() - - s2 = RetrieverSettings(inference_backend="openai_chat", chat_base_url="http://x/v1") # type: ignore[call-arg] - with pytest.raises(ValueError, match="CHAT_MODEL"): - s2.build_chat_client() - - -def test_build_chat_client_returns_openai_client() -> None: - s = RetrieverSettings( # 
type: ignore[call-arg] - inference_backend="openai_chat", - chat_base_url="http://foundry.example/v1", - chat_model="gpt-4o", - chat_api_key="secret-key", - ) - client = s.build_chat_client() - assert str(client.base_url).rstrip("/") == "http://foundry.example/v1" - - -# -------------------------------------------------------------------------- -# Responses-API backend -# -------------------------------------------------------------------------- -class _FakeFunctionCall: - """A /responses ``function_call`` output item.""" - - type = "function_call" - - def __init__(self, call_id: str, name: str, arguments: str) -> None: - self.call_id = call_id - self.name = name - self.arguments = arguments - - -class _FakeResponse: - def __init__(self, response_id: str, output: list, output_text: str = "") -> None: - self.id = response_id - self.output = output - self.output_text = output_text - - -class FakeResponsesClient: - """Mimics the subset of openai.OpenAI used by run_responses_search.""" - - def __init__(self, scripted: list[_FakeResponse]) -> None: - self._scripted = list(scripted) - self.calls: list[dict] = [] - - @property - def responses(self) -> FakeResponsesClient: - return self - - def create(self, **kwargs: Any) -> _FakeResponse: - self.calls.append(kwargs) - return self._scripted.pop(0) - - -def test_responses_search_executes_tool_then_parses_final_docs() -> None: - client = FakeResponsesClient( - [ - _FakeResponse( - "resp_1", - output=[_FakeFunctionCall("call_1", "search_corpus", '{"query": "radium"}')], - ), - _FakeResponse( - "resp_2", - output=[], - output_text=( - "Curie discovered radium." 
- ), - ), - ] - ) - - result = run_responses_search( - toolset=_toolset(), - client=client, - model="gpt-5.4", - query="Who discovered radium?", - max_documents=5, - reasoning_effort="low", - ) - - assert result.num_turns == 2 - assert [d.id for d in result.documents] == ["doc_1"] - assert "Marie Curie discovered radium" in result.documents[0].text - assert result.documents[0].justification == "Curie discovered radium." - assert result.metadata["backend"] == "openai_responses" - # First turn uses a plain-string input + flat function tool schema + reasoning. - assert isinstance(client.calls[0]["input"], str) - assert client.calls[0]["tools"][0]["name"] == "search_corpus" - assert client.calls[0]["reasoning"] == {"effort": "low"} - # Second turn continues via previous_response_id + function_call_output. - assert client.calls[1]["previous_response_id"] == "resp_1" - assert client.calls[1]["input"][0]["type"] == "function_call_output" - - -def test_responses_search_respects_max_turns() -> None: - looping = [ - _FakeResponse(f"resp_{i}", output=[_FakeFunctionCall(f"c{i}", "search_corpus", "{}")]) - for i in range(10) - ] - client = FakeResponsesClient(looping) - result = run_responses_search( - toolset=_toolset(), client=client, model="gpt-5.4", query="q", max_turns=3 - ) - assert result.num_turns == 3 - assert result.documents == [] diff --git a/cosmos-retriever/tests/test_server.py b/cosmos-retriever/tests/test_server.py deleted file mode 100644 index de424e8..0000000 --- a/cosmos-retriever/tests/test_server.py +++ /dev/null @@ -1,99 +0,0 @@ -"""Tests for the FastAPI HTTP service in :mod:`cosmos_retriever.server`. - -These never touch real Cosmos / vLLM: the heavy :class:`CosmosRetriever` is -replaced with a stub so we only exercise the request/response plumbing, -concurrency-pool keying, and error-envelope behaviour. 
-""" - -from __future__ import annotations - -from fastapi.testclient import TestClient - -import cosmos_retriever.server as server_module -from cosmos_retriever.config import get_settings -from cosmos_retriever.retriever import RetrievalResult, RetrievedDocument - - -class _StubRetriever: - """Stand-in for CosmosRetriever that records its construction args.""" - - instances: list[_StubRetriever] = [] - - def __init__(self, settings=None, *, corpus_name=None, reranker=None) -> None: - self.settings = settings - self.corpus_name = corpus_name - self.calls: list[tuple[str, int]] = [] - _StubRetriever.instances.append(self) - - def search(self, query: str, *, max_documents: int = 20) -> RetrievalResult: - self.calls.append((query, max_documents)) - return RetrievalResult( - query=query, - documents=[RetrievedDocument(id="doc-1", text="hello", rank=0)], - num_turns=3, - elapsed_s=1.5, - ) - - -class _BoomRetriever(_StubRetriever): - def search(self, query: str, *, max_documents: int = 20) -> RetrievalResult: - raise RuntimeError("vllm unreachable") - - -def _client(monkeypatch, retriever_cls=_StubRetriever) -> TestClient: - _StubRetriever.instances = [] - monkeypatch.setattr(server_module, "CosmosRetriever", retriever_cls) - get_settings.cache_clear() - app = server_module.create_app(get_settings()) - return TestClient(app) - - -def test_health_ok(monkeypatch) -> None: - with _client(monkeypatch) as client: - resp = client.get("/health") - assert resp.status_code == 200 - assert resp.json() == {"status": "ok"} - - -def test_search_returns_result_json(monkeypatch) -> None: - with _client(monkeypatch) as client: - resp = client.post("/search", json={"query": "who discovered radium?", "maxDocuments": 5}) - assert resp.status_code == 200 - body = resp.json() - assert body["query"] == "who discovered radium?" - assert body["num_turns"] == 3 - assert body["documents"][0]["id"] == "doc-1" - # max_documents forwarded through the alias. 
- assert _StubRetriever.instances[0].calls == [("who discovered radium?", 5)] - - -def test_search_defaults_max_documents(monkeypatch) -> None: - with _client(monkeypatch) as client: - resp = client.post("/search", json={"query": "q"}) - assert resp.status_code == 200 - assert _StubRetriever.instances[0].calls == [("q", 20)] - - -def test_search_rejects_empty_query(monkeypatch) -> None: - with _client(monkeypatch) as client: - resp = client.post("/search", json={"query": ""}) - assert resp.status_code == 422 # pydantic min_length - - -def test_search_pool_keys_by_corpus(monkeypatch) -> None: - with _client(monkeypatch) as client: - client.post("/search", json={"query": "a", "container": "corpus-x"}) - client.post("/search", json={"query": "b", "container": "corpus-x"}) - client.post("/search", json={"query": "c", "container": "corpus-y"}) - # Same container reuses one retriever; a different one builds a second. - corpora = sorted(r.corpus_name for r in _StubRetriever.instances) - assert corpora == ["corpus-x", "corpus-y"] - - -def test_search_error_returns_json_envelope(monkeypatch) -> None: - with _client(monkeypatch, retriever_cls=_BoomRetriever) as client: - resp = client.post("/search", json={"query": "boom"}) - assert resp.status_code == 500 - body = resp.json() - assert body["error"] == "vllm unreachable" - assert body["type"] == "RuntimeError" diff --git a/cosmos-retriever/tests/test_tools.py b/cosmos-retriever/tests/test_tools.py deleted file mode 100644 index e6f7e1f..0000000 --- a/cosmos-retriever/tests/test_tools.py +++ /dev/null @@ -1,104 +0,0 @@ -"""Unit tests for :mod:`cosmos_retriever.tools`.""" - -from __future__ import annotations - -from cosmos_retriever.tools import ( - GREP_CORPUS_SCHEMA, - PRUNE_CHUNKS_SCHEMA, - SEARCH_CORPUS_SCHEMA, - PruneChunksTool, - ToolSet, - UserTextTool, - _fts_literal_args, - _tokenize_for_fts, -) -from cosmos_retriever.utils import ProviderFormat - - -class TestStopwordTokenisation: - def 
test_drops_stopwords_and_lowercases(self) -> None: - assert _tokenize_for_fts("The quick brown FOX") == ["quick", "brown", "fox"] - - def test_dedupes_preserving_order(self) -> None: - assert _tokenize_for_fts("alpha BETA alpha gamma beta") == ["alpha", "beta", "gamma"] - - def test_caps_at_30_terms(self) -> None: - words = " ".join(f"word{i}" for i in range(50)) - terms = _tokenize_for_fts(words) - assert len(terms) == 30 - assert terms[0] == "word0" and terms[-1] == "word29" - - def test_empty_after_stopwords_returns_empty(self) -> None: - # Every token in this string is a stopword. - assert _tokenize_for_fts("the and or but please") == [] - - -class TestFtsLiteralArgs: - def test_emits_quoted_csv(self) -> None: - assert _fts_literal_args(["alpha", "beta"]) == '"alpha", "beta"' - - def test_escapes_quotes_and_backslashes(self) -> None: - out = _fts_literal_args(['he said "hi"', "back\\slash"]) - assert out == '"he said \\"hi\\"", "back\\\\slash"' - - -class TestSchemaProviderFormat: - def test_openai_format_contains_function_metadata(self) -> None: - f = SEARCH_CORPUS_SCHEMA.to_provider_format(ProviderFormat.OPENAI) - assert f["type"] == "function" - assert f["name"] == "search_corpus" - assert "query" in f["parameters"]["properties"] - assert f["parameters"]["required"] == ["query"] - - def test_harmony_format_nests_function_object(self) -> None: - f = GREP_CORPUS_SCHEMA.to_provider_format(ProviderFormat.OPENAI_HARMONY) - assert f["type"] == "function" - assert f["function"]["name"] == "grep_corpus" - assert f["function"]["parameters"]["required"] == ["pattern"] - - -class TestToolSetBasics: - def test_add_get_remove(self) -> None: - ts = ToolSet() - prune = PruneChunksTool() - user = UserTextTool() - ts.add_tool(prune) - ts.add_tool(user) - assert ts.get_tool("prune_chunks") is prune - assert ts.get_tool("user_text") is user - assert ts.get_tool("missing") is None - ts.remove_tool("prune_chunks") - assert ts.get_tool("prune_chunks") is None - - def 
test_duplicate_name_raises(self) -> None: - ts = ToolSet() - ts.add_tool(PruneChunksTool()) - try: - ts.add_tool(PruneChunksTool()) - except ValueError as exc: - assert "already exists" in str(exc) - else: # pragma: no cover - raise AssertionError("expected ValueError") - - -class TestPruneTool: - def test_returns_pruned_string(self) -> None: - tool = PruneChunksTool() - out, metadata = tool({"chunk_ids": ["a", "b"]}) - assert out == "Pruned" - assert metadata is None - - def test_rejects_missing_arg(self) -> None: - tool = PruneChunksTool() - try: - tool({}) - except ValueError as exc: - assert "Invalid params" in str(exc) - else: # pragma: no cover - raise AssertionError("expected ValueError") - - -def test_prune_chunks_schema_round_trip() -> None: - f = PRUNE_CHUNKS_SCHEMA.to_provider_format(ProviderFormat.OPENAI) - assert f["parameters"]["properties"]["chunk_ids"]["type"] == "array" - assert f["parameters"]["required"] == ["chunk_ids"] diff --git a/cosmos-retriever/tests/test_trajectory.py b/cosmos-retriever/tests/test_trajectory.py deleted file mode 100644 index 5ad72ff..0000000 --- a/cosmos-retriever/tests/test_trajectory.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Unit tests for :mod:`cosmos_retriever.trajectory`.""" - -from __future__ import annotations - -import json - -from cosmos_retriever.tools import ( - PRUNE_CHUNKS_SCHEMA, - PruneChunksTool, - SearchCorpusToolCallMetadata, - Tool, - ToolSchema, - UserTextTool, -) -from cosmos_retriever.trajectory import ( - Action, - ActionBuilder, - ObservationBuilder, - Trajectory, - TrajectoryBuilder, -) - - -class _FakeSearchTool(Tool): - """A stand-in :class:`Tool` for trajectory rendering tests.""" - - tool_schema: ToolSchema - - def __init__(self) -> None: - super().__init__( - tool_schema=ToolSchema( - name="search_corpus", - description="x", - parameters={"query": {"type": "string"}}, - required=["query"], - ) - ) - - def __call__(self, params, overrides=None): # type: ignore[override] - return "result", None - - 
-def _build_tiny_trajectory() -> Trajectory: - builder = TrajectoryBuilder() - builder.add_observation( - ObservationBuilder() - .add_observation("hello?", source="user") - .build() - ) - builder.add_action( - ActionBuilder() - .add_reasoning("I should search.") - .add_tool_call(_FakeSearchTool(), {"query": "hi"}, source="toolu_search_1") - .build() - ) - builder.add_observation( - ObservationBuilder() - .add_observation( - "\n# DOCUMENT ID: doc_a \nhello world", - source="toolu_search_1", - tool_metadata=SearchCorpusToolCallMetadata(returned_chunk_ids=["doc_a"]), - ) - .build() - ) - builder.add_action( - ActionBuilder() - .add_tool_call(UserTextTool(), {"text": ""}, source="agent") - .build() - ) - return builder.build() - - -class TestTrajectoryBuilders: - def test_builds_in_order(self) -> None: - traj = _build_tiny_trajectory() - assert traj.num_turns == 2 - # Alternating obs / action / obs / action - types = [type(e).__name__ for e in traj.actions_and_observations] - assert types == ["Observation", "Action", "Observation", "Action"] - - def test_clone_is_deep_for_params(self) -> None: - traj = _build_tiny_trajectory() - cloned = traj.clone() - # mutate the original's params, the clone should be unaffected - first_action = next( - e for e in traj.actions_and_observations if isinstance(e, Action) - ) - first_action.params[0]["query"] = "MUTATED" - cloned_first = next( - e for e in cloned.actions_and_observations if isinstance(e, Action) - ) - assert cloned_first.params[0]["query"] == "hi" - - -class TestOpenAIChatRendering: - def test_user_assistant_tool_round_trip(self) -> None: - traj = _build_tiny_trajectory() - msgs = traj.to_openai_format() - assert msgs[0] == {"role": "user", "content": [{"type": "text", "text": "hello?"}]} - assert msgs[1]["role"] == "assistant" - assert msgs[1]["tool_calls"][0]["function"]["name"] == "search_corpus" - assert json.loads(msgs[1]["tool_calls"][0]["function"]["arguments"]) == {"query": "hi"} - assert msgs[2]["role"] == 
"tool" - assert "DOCUMENT ID: doc_a" in msgs[2]["content"][0]["text"] - # Final assistant text - assert msgs[3]["role"] == "assistant" - assert msgs[3]["content"][0]["text"].startswith("") - - -class TestOpenAIHarmonyRendering: - def test_renders_to_a_conversation(self) -> None: - traj = _build_tiny_trajectory() - conv = traj.to_openai_harmony_format() - # Sanity: we should have system + developer + the four trajectory entries. - assert len(conv.messages) >= 4 - - def test_user_text_tool_does_not_appear_in_action_dispatch(self) -> None: - # PruneChunksTool exists in tools.py; ensure ActionBuilder accepts it - # but rejects MultiToolUseTool (the latter is reserved for rendering). - ab = ActionBuilder() - ab.add_tool_call(PruneChunksTool(), {"chunk_ids": ["x"]}, source="agent") - action = ab.build() - assert action.tools[0].tool_schema.name == PRUNE_CHUNKS_SCHEMA.name - - -class TestObservationBuilderValidation: - def test_incomplete_builder_raises(self) -> None: - ob = ObservationBuilder() - try: - ob.build() - except ValueError as exc: - assert "is not complete" in str(exc) - else: # pragma: no cover - raise AssertionError("expected ValueError") From a7d8b2f94b100db26eae79b4f231a3de631cbd36 Mon Sep 17 00:00:00 2001 From: Aryan Saboo Date: Tue, 30 Jun 2026 23:09:15 +0000 Subject: [PATCH 8/8] feat(cosmos-retriever): expose per-query agent trajectory on RetrievalResult Add a trajectory field to RetrievalResult populated by the harmony_vllm backend: the search queries issued (search_history), per-turn tool calls (turn_tools), programmatic per-turn status summaries (turn_summaries), and the final per-doc importance tags from curation (curated_importance). 
--- .../src/cosmos_retriever/retriever.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cosmos-retriever/src/cosmos_retriever/retriever.py b/cosmos-retriever/src/cosmos_retriever/retriever.py index c61e06e..2ec3d67 100644 --- a/cosmos-retriever/src/cosmos_retriever/retriever.py +++ b/cosmos-retriever/src/cosmos_retriever/retriever.py @@ -69,6 +69,11 @@ class RetrievalResult: pool_doc_ids: list[str] = field(default_factory=list) elapsed_s: float = 0.0 metadata: dict[str, str | int | float] = field(default_factory=dict) + # Per-query trajectory: the agent's step-by-step actions (search queries + # issued, per-turn tool calls, programmatic turn summaries, and the final + # importance tags assigned during curation). Populated by the harmony_vllm + # backend; empty for backends that don't expose turn-level state. + trajectory: dict[str, object] = field(default_factory=dict) class CosmosRetriever: @@ -267,6 +272,19 @@ def _search_sync( ) ) + trajectory = { + "search_history": list(getattr(env.wm, "search_history", []) or []), + "turn_summaries": list(getattr(env, "_result_summaries", []) or []), + "curated_importance": dict(getattr(env.wm, "curated_importance", {}) or {}), + "turn_tools": [ + [ + getattr(getattr(t, "tool_schema", None), "name", type(t).__name__) + for t in getattr(action, "tools", []) + ] + for action in getattr(env, "_all_actions", []) or [] + ], + } + result = RetrievalResult( query=query, documents=documents, @@ -274,6 +292,7 @@ def _search_sync( final_text="", elapsed_s=round(elapsed, 3), pool_doc_ids=sorted({cid.split("__")[0] for cid in env.wm.pool_ids}), + trajectory=trajectory, metadata={ "n_pool": len(env.wm.pool_ids), "n_curated": len(env.wm.curated_ids),