From e0fb8f828bde0f3dfb90bc20f99e16e7038c4964 Mon Sep 17 00:00:00 2001
From: drr00t <adribeiro@gmail.com>
Date: Sun, 10 May 2026 17:25:41 -0300
Subject: [PATCH 1/8] feat: implement built-in token usage tracking and cost
 observability across the RAG pipeline

---
 CHANGELOG.md                                  |  51 ++
 docs/api-reference.md                         |  63 +-
 docs/getting-started.md                       |  22 +-
 docs/index.md                                 |   2 +
 docs/ingestion.md                             |   1 +
 docs/providers.md                             |  14 +-
 docs/retrieval.md                             |  21 +
 docs/token-usage.md                           | 300 ++++++++
 graphrag_sdk/src/graphrag_sdk/__init__.py     |   2 +
 graphrag_sdk/src/graphrag_sdk/api/main.py     |   7 +-
 graphrag_sdk/src/graphrag_sdk/core/context.py |  24 +
 graphrag_sdk/src/graphrag_sdk/core/models.py  |  52 ++
 .../src/graphrag_sdk/core/providers/base.py   |  34 +-
 .../graphrag_sdk/core/providers/litellm.py    |  55 +-
 .../graphrag_sdk/core/providers/openrouter.py |  50 +-
 .../extraction_strategies/graph_extraction.py |   6 +-
 .../src/graphrag_sdk/ingestion/pipeline.py    |   3 +-
 .../retrieval/strategies/multi_path.py        |  11 +-
 .../src/graphrag_sdk/storage/vector_store.py  |   7 +-
 graphrag_sdk/tests/test_token_usage.py        | 652 ++++++++++++++++++
 20 files changed, 1333 insertions(+), 44 deletions(-)
 create mode 100644 docs/token-usage.md
 create mode 100644 graphrag_sdk/tests/test_token_usage.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 43006ddb..5d71c579 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,57 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+<<<<<<< HEAD
+## [1.0.2] - 2026-05-04
+
+Patch release. One retrieval correctness fix and one default-value
+change carried over from the post-1.0.1 README onboarding work.
+
+### Fixed
+
+- **Chunk citations preserve the full `Document.path`.** The chunk
+  retrieval strategy was reducing the path returned from the graph
+  to a basename via `path.rsplit("/", 1)[-1]` before handing it off
+  to the citation pipeline. That dropped real information: files
+  sharing a basename across directories — e.g. `operations/index.md`
+  vs `commands/index.md` — collapsed to the same identifier
+  downstream, and consumers building source links from the citation
+  could no longer reconstruct the original location. `Document.path`
+  already stored the full path passed to `rag.ingest()`, so this is
+  a read-side fix only; existing graphs start emitting full paths in
+  the next query with no migration required.
+=======
+### Added
+
+- **Token usage tracking on all public response objects (#227).** `IngestionResult`,
+  `RagResult`, and `RetrieverResult` now expose a `usage: TokenUsage` field
+  that reports the total `prompt_tokens`, `completion_tokens`, and
+  `embedding_tokens` consumed by the operation. `TokenUsage` is exported from
+  the top-level package and supports `+` / `+=` for easy aggregation over
+  batch results.
+
+  ```python
+  result = await rag.completion("Who is Alice?")
+  print(result.usage.prompt_tokens)      # LLM input tokens
+  print(result.usage.completion_tokens)  # LLM output tokens
+  print(result.usage.embedding_tokens)   # embedding tokens
+  ```
+
+  **Implementation notes:**
+  - Async provider methods (`ainvoke`, `ainvoke_messages`, `aembed_query`,
+    `aembed_documents`, `abatch_invoke`) now accept an optional keyword-only
+    `ctx: Context | None = None` parameter. Usage is recorded into the
+    accumulator at `ctx.usage` via `ctx.record_usage()`.
+  - `VectorStore.index_chunks()` accepts the same optional `ctx` and forwards
+    it to the embedder.
+  - Custom providers that do not override these methods, and all callers that
+    omit `ctx`, continue to work exactly as before — the change is fully
+    backward-compatible.
+  - 51 new unit tests in `tests/test_token_usage.py` covering model arithmetic,
+    context accumulation, provider instrumentation, and backward compatibility.
+  - See [docs/token-usage.md](docs/token-usage.md) for the full guide.
+>>>>>>> 4455125 (feat: implement built-in token usage tracking and cost observability across the RAG pipeline)
+
 ## [1.0.2] - 2026-05-04
 
 Patch release. One retrieval correctness fix and one default-value
diff --git a/docs/api-reference.md b/docs/api-reference.md
index 68233935..db20266d 100644
--- a/docs/api-reference.md
+++ b/docs/api-reference.md
@@ -8,6 +8,7 @@ Complete reference for all public classes and methods exported by `graphrag_sdk`
 - [Connection](#connection)
 - [Providers](#providers)
 - [Data Models](#data-models)
+- [TokenUsage](#tokenusage)
 - [Schema](#schema)
 - [Ingestion Strategies](#ingestion-strategies)
 - [Ingestion Pipeline](#ingestion-pipeline)
@@ -269,11 +270,11 @@ LLMInterface(model_name: str, model_params: dict | None = None, max_concurrency:
 | Method | Signature | Description |
 |--------|-----------|-------------|
 | `invoke` | `(prompt: str, **kwargs) -> LLMResponse` | Sync text generation (abstract) |
-| `ainvoke` | `(prompt: str, *, max_retries=3, **kwargs) -> LLMResponse` | Async with retry + backoff |
-| `ainvoke_messages` | `(messages: list[ChatMessage], *, max_retries=3, **kwargs) -> LLMResponse` | Multi-turn native messages (see below) |
+| `ainvoke` | `(prompt: str, *, ctx=None, max_retries=3, **kwargs) -> LLMResponse` | Async with retry + backoff; records usage into `ctx` |
+| `ainvoke_messages` | `(messages: list[ChatMessage], *, ctx=None, max_retries=3, **kwargs) -> LLMResponse` | Multi-turn native messages; records usage into `ctx` |
 | `invoke_with_model` | `(prompt: str, response_model: Type[BaseModel], **kwargs) -> BaseModel` | Structured output |
 | `ainvoke_with_model` | `(prompt: str, response_model: Type[BaseModel], *, max_retries=3) -> BaseModel` | Async structured output |
-| `abatch_invoke` | `(prompts: list[str], *, max_concurrency=None, max_retries=3) -> list[LLMBatchItem]` | Concurrent batch |
+| `abatch_invoke` | `(prompts: list[str], *, ctx=None, max_concurrency=None, max_retries=3) -> list[LLMBatchItem]` | Concurrent batch; threads `ctx` to each `ainvoke` |
 
 `ainvoke_messages()` is used by `completion()` when conversation history is provided. The default implementation concatenates messages into a single prompt string and calls `ainvoke()`, so custom providers work without changes. `LiteLLM` and `OpenRouterLLM` override this with native multi-turn implementations.
 
@@ -283,9 +284,9 @@ LLMInterface(model_name: str, model_params: dict | None = None, max_concurrency:
 |--------|-----------|-------------|
 | `model_name` | `@property -> str` | Embedding model identifier (abstract) |
 | `embed_query` | `(text: str, **kwargs) -> list[float]` | Single text embedding (abstract) |
-| `aembed_query` | `(text: str, **kwargs) -> list[float]` | Async single (default: thread pool) |
+| `aembed_query` | `(text: str, *, ctx=None, **kwargs) -> list[float]` | Async single; records `embedding_tokens` into `ctx` |
 | `embed_documents` | `(texts: list[str], **kwargs) -> list[list[float]]` | Batch (default: sequential) |
-| `aembed_documents` | `(texts: list[str], **kwargs) -> list[list[float]]` | Async batch (default: thread pool) |
+| `aembed_documents` | `(texts: list[str], *, ctx=None, **kwargs) -> list[list[float]]` | Async batch; records `embedding_tokens` into `ctx` |
 
 ### LLMBatchItem
 
@@ -411,8 +412,11 @@ class IngestionResult(DataModel):
     relationships_created: int = 0
     chunks_indexed: int = 0
     metadata: dict[str, Any] = {}
+    usage: TokenUsage = TokenUsage()     # Accumulated token counts for this ingest
 ```
 
+See [Token Usage](token-usage.md) for what each counter covers.
+
 ### RagResult
 
 ```python
@@ -420,16 +424,22 @@ class RagResult(DataModel):
     answer: str
     retriever_result: RetrieverResult | None = None      # Populated when return_context=True
     metadata: dict[str, Any] = {}                        # Contains model, num_context_items, strategy
+    usage: TokenUsage = TokenUsage()                     # Accumulated token counts for this completion
 ```
 
+See [Token Usage](token-usage.md) for what each counter covers.
+
 ### RetrieverResult
 
 ```python
 class RetrieverResult(DataModel):
     items: list[RetrieverResultItem] = []
     metadata: dict[str, Any] = {}
+    usage: TokenUsage = TokenUsage()     # Accumulated token counts for this retrieve
 ```
 
+See [Token Usage](token-usage.md) for what each counter covers.
+
 ### RetrieverResultItem
 
 ```python
@@ -519,6 +529,32 @@ Deterministic entity ID from normalized name and optional type. When `entity_typ
 
 ---
 
+## TokenUsage
+
+```python
+from graphrag_sdk import TokenUsage
+
+class TokenUsage(DataModel):
+    prompt_tokens: int = 0       # Total tokens sent to LLM (input)
+    completion_tokens: int = 0   # Total tokens generated by LLM (output)
+    embedding_tokens: int = 0    # Total tokens sent to the embedder
+```
+
+All fields default to `0`. Supports `+` (returns new instance) and `+=` (in-place accumulation) for aggregation across batch results.
+
+```python
+# Aggregate batch ingest usage
+results = await rag.ingest(["a.pdf", "b.pdf"])
+total = sum(
+    (r.usage for r in results if isinstance(r, IngestionResult)),
+    start=TokenUsage(),
+)
+```
+
+See the full guide in [Token Usage](token-usage.md).
+
+---
+
 ## Schema
 
 ```python
@@ -745,16 +781,25 @@ VectorStore(connection, embedder=None, index_name="chunk_embeddings", embedding_
 from graphrag_sdk import Context
 ```
 
-Execution context for logging and budget tracking.
+Execution context threaded through every strategy call for logging, budget tracking, and token usage accumulation.
 
 ```python
-Context(tenant_id: str = "default", latency_budget_ms: float = 60000.0)
+Context(
+    tenant_id: str = "default",
+    latency_budget_ms: float | None = None,
+    metadata: dict[str, Any] = {},
+)
 ```
 
 | Method/Property | Description |
 |----------------|-------------|
-| `ctx.log(message, log_level=logging.INFO)` | Log a message |
-| `ctx.budget_exceeded` | True if elapsed time > latency_budget_ms |
+| `ctx.log(message, log_level=logging.INFO)` | Log a message with tenant/trace prefix |
+| `ctx.budget_exceeded` | True if elapsed time > `latency_budget_ms` |
+| `ctx.remaining_budget_ms` | Remaining budget in ms, or `None` |
+| `ctx.elapsed_ms` | Milliseconds since context creation |
+| `ctx.usage` | `TokenUsage` accumulator for this operation |
+| `ctx.record_usage(*, prompt_tokens=0, completion_tokens=0, embedding_tokens=0)` | Add token counts to the accumulator |
+| `ctx.child(**overrides)` | Create a child context with inherited tenant/trace |
 
 ---
 
diff --git a/docs/getting-started.md b/docs/getting-started.md
index 7eea71e0..2a51e996 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -118,12 +118,17 @@ The default is `256` (matched-Matryoshka dimensions of `text-embedding-3-large`)
 result = await rag.ingest("path/to/document.txt")
 print(f"Created {result.nodes_created} nodes, {result.relationships_created} relationships")
 print(f"Indexed {result.chunks_indexed} chunks")
+# Token costs for this ingest (extraction LLM + chunk embeddings)
+print(f"Prompt tokens: {result.usage.prompt_tokens}")
+print(f"Completion tokens: {result.usage.completion_tokens}")
+print(f"Embedding tokens: {result.usage.embedding_tokens}")
 ```
 
 ### From raw text
 
 ```python
-result = await rag.ingest("acme_doc", text="Acme Corp was founded in 1985 by Jane Doe in Austin, Texas.")
+result = await rag.ingest(text="Acme Corp was founded in 1985 by Jane Doe in Austin, Texas.",
+                          document_id="acme_doc")
 print(f"Created {result.nodes_created} nodes, {result.relationships_created} relationships")
 print(f"Indexed {result.chunks_indexed} chunks")
 ```
@@ -151,6 +156,10 @@ Use `completion()` for the full RAG pipeline — retrieval + answer generation:
 ```python
 result = await rag.completion("Who works at Acme Corp?")
 print(result.answer)
+# See what it cost
+print(f"Tokens used — prompt: {result.usage.prompt_tokens}, "
+      f"completion: {result.usage.completion_tokens}, "
+      f"embedding: {result.usage.embedding_tokens}")
 ```
 
 ### With context inspection
@@ -201,14 +210,16 @@ Supported roles: `"system"`, `"user"`, `"assistant"`. Invalid roles raise `Value
 Use `get_statistics()` to see a summary of what the graph contains:
 
 ```python
-stats = await rag.graph_store.get_statistics()
+stats = await rag.get_statistics()
 print(f"Nodes: {stats['node_count']}, Edges: {stats['edge_count']}")
 ```
 
 You can also run raw Cypher queries against the graph:
 
 ```python
-results = await rag.graph_store.query_raw("MATCH (p:Person)-[:WORKS_AT]->(o:Organization) RETURN p.name, o.name LIMIT 10")
+results = await rag.graph_store.query_raw(
+    "MATCH (p:Person)-[:WORKS_AT]->(o:Organization) RETURN p.name, o.name LIMIT 10"
+)
 for row in results.result_set:
     print(row)
 ```
@@ -221,8 +232,8 @@ After all documents have been ingested, run `finalize()` to deduplicate entities
 
 ```python
 results = await rag.finalize()
-print(f"Deduplicated: {results['entities_deduplicated']}")
-print(f"Embedded: {results['entities_embedded']} entities, {results['relationships_embedded']} relationships")
+print(f"Deduplicated: {results.entities_deduplicated}")
+print(f"Embedded: {results.entities_embedded} entities, {results.relationships_embedded} relationships")
 ```
 
 This step is important for query accuracy. It merges duplicate entities (e.g., "J. Doe" and "Jane Doe") and ensures all entities have vector embeddings for semantic search.
@@ -233,6 +244,7 @@ This step is important for query accuracy. It merges duplicate entities (e.g., "
 
 - [docs/configuration.md](configuration.md) -- Tuning connection settings, chunking parameters, and retrieval options.
 - [docs/strategies.md](strategies.md) -- Custom extraction and resolution strategies.
+- [docs/token-usage.md](token-usage.md) -- Cost tracking, billing dashboards, and observability patterns.
 - [docs/benchmark.md](benchmark.md) -- Reproducing benchmark results on the GraphRAG-Bench Novel corpus (20 novels, 2,010 questions).
 
 ---
diff --git a/docs/index.md b/docs/index.md
index d56f7c61..310a9799 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -12,6 +12,7 @@ GraphRAG SDK builds knowledge graphs from documents and answers questions over t
 - **Fully modular** -- swap chunking, extraction, resolution, retrieval, and reranking strategies
 - **Production-ready** -- async-first, connection pooling, circuit breaker, batched writes
 - **Full provenance** -- every answer traces back to its source document and chunk
+- **Built-in cost tracking** -- `result.usage.prompt_tokens / completion_tokens / embedding_tokens` on every response
 
 ## Quick Start
 
@@ -43,5 +44,6 @@ asyncio.run(main())
 - [Getting Started](getting-started.md) -- Full tutorial from install to first query
 - [Architecture](architecture.md) -- How the 9-step pipeline works
 - [Strategies](strategies.md) -- All swappable strategy ABCs and built-in options
+- [Token Usage](token-usage.md) -- Cost tracking and observability
 - [Benchmark](benchmark.md) -- Methodology and reproduction instructions
 - [API Reference](api-reference.md) -- Full API documentation
diff --git a/docs/ingestion.md b/docs/ingestion.md
index 1bb0b6c9..cfb6cfb1 100644
--- a/docs/ingestion.md
+++ b/docs/ingestion.md
@@ -277,6 +277,7 @@ stats = await rag.finalize()
 - **Fastest step:** Quality filter, prune, and resolve — all in-memory, sub-second.
 - **Parallelism:** Steps 8-9 run in parallel. Step 1 NER uses a semaphore (default 12 concurrent calls).
 - **Batch size:** The benchmark uses 1500-character chunks. 20 documents (~4.7 MB total) take ~47 minutes to ingest.
+- **Cost tracking:** Check `result.usage.prompt_tokens`, `result.usage.completion_tokens`, and `result.usage.embedding_tokens` after each `ingest()` call. See [Token Usage](token-usage.md) for aggregation patterns across batch ingestion.
 
 ---
 
diff --git a/docs/providers.md b/docs/providers.md
index 3c3bd141..78244576 100644
--- a/docs/providers.md
+++ b/docs/providers.md
@@ -137,11 +137,13 @@ class MyLLM(LLMInterface):
 
 | Method | Default Behavior | Override When |
 |--------|-----------------|--------------|
-| `ainvoke(prompt, max_retries=3)` | Runs `invoke()` in a thread pool with retry | You have a native async client |
-| `ainvoke_messages(messages, max_retries=3)` | Concatenates messages into a single prompt and calls `ainvoke()` | You have a native multi-turn chat API |
+| `ainvoke(prompt, *, ctx=None, max_retries=3)` | Runs `invoke()` in a thread pool with retry; records usage if `ctx` provided | You have a native async client |
+| `ainvoke_messages(messages, *, ctx=None, max_retries=3)` | Concatenates messages into a single prompt and calls `ainvoke()` | You have a native multi-turn chat API |
 | `invoke_with_model(prompt, response_model)` | Calls `invoke()` and parses JSON into Pydantic model | Your provider has native structured output |
 | `ainvoke_with_model(prompt, response_model)` | Calls `ainvoke()` and parses JSON | Same, async version |
-| `abatch_invoke(prompts, max_concurrency)` | Concurrent `ainvoke()` with semaphore | You have a native batch API |
+| `abatch_invoke(prompts, *, ctx=None, max_concurrency)` | Concurrent `ainvoke()` with semaphore; threads `ctx` to each call | You have a native batch API |
+
+> **Token usage:** pass the current `ctx` to record prompt/completion tokens automatically. See [Token Usage](token-usage.md).
 
 `ainvoke_messages()` is called by `completion()` when conversation history is provided. Override it to pass messages natively to your LLM's chat API for proper multi-turn handling:
 
@@ -195,9 +197,11 @@ The `model_name` property is used by the graph config node to validate that the
 
 | Method | Default Behavior | Override When |
 |--------|-----------------|--------------|
-| `aembed_query(text)` | Runs `embed_query()` in thread pool | You have async embedding |
+| `aembed_query(text, *, ctx=None)` | Runs `embed_query()` in thread pool; records embedding tokens if `ctx` provided | You have async embedding |
 | `embed_documents(texts)` | Sequential `embed_query()` per text | You can batch embeddings |
-| `aembed_documents(texts)` | Runs `embed_documents()` in thread pool | You have async batch |
+| `aembed_documents(texts, *, ctx=None)` | Runs `embed_documents()` in thread pool; records embedding tokens if `ctx` provided | You have async batch |
+
+> **Token usage:** pass the current `ctx` to record embedding tokens automatically. See [Token Usage](token-usage.md).
 
 ### Batch Embedding
 
diff --git a/docs/retrieval.md b/docs/retrieval.md
index 0d9a65ee..88acc3b7 100644
--- a/docs/retrieval.md
+++ b/docs/retrieval.md
@@ -312,11 +312,32 @@ reranker = CosineReranker(embedder=embedder, top_k=10)
 result = await rag.completion("Your question", reranker=reranker)
 ```
 
+### Token Usage
+
+Both `retrieve()` and `completion()` attach token counters to the result:
+
+```python
+# Retrieval only
+result = await rag.retrieve("What did Professor Harmon discover?")
+print(result.usage.embedding_tokens)   # query embedding tokens
+print(result.usage.prompt_tokens)      # keyword-extraction LLM tokens
+
+# Full completion
+result = await rag.completion("What did Professor Harmon discover?")
+print(result.usage.prompt_tokens)      # retrieval + answer generation LLM input
+print(result.usage.completion_tokens)  # answer tokens
+print(result.usage.embedding_tokens)   # query embedding tokens
+```
+
+See [Token Usage](token-usage.md) for cost estimation helpers and observability patterns.
+
 ---
 
+
 ## File Reference
 
 | File | What it contains |
+
 |------|-----------------|
 | [`multi_path.py`](https://github.com/FalkorDB/GraphRAG-SDK/blob/main/graphrag_sdk/src/graphrag_sdk/retrieval/strategies/multi_path.py) | Main orchestrator — coordinates all 9 steps |
 | [`entity_discovery.py`](https://github.com/FalkorDB/GraphRAG-SDK/blob/main/graphrag_sdk/src/graphrag_sdk/retrieval/strategies/entity_discovery.py) | RELATES vector search + 2-path entity discovery |
diff --git a/docs/token-usage.md b/docs/token-usage.md
new file mode 100644
index 00000000..71e2144f
--- /dev/null
+++ b/docs/token-usage.md
@@ -0,0 +1,300 @@
+# Token Usage Tracking
+
+GraphRAG SDK exposes token consumption on every public response object so you can implement cost monitoring, billing dashboards, and observability without subclassing private methods.
+
+## Quick Start
+
+```python
+from graphrag_sdk import GraphRAG, TokenUsage
+
+rag = GraphRAG(connection=..., llm=..., embedder=...)
+
+# Ingestion
+result = await rag.ingest("doc.pdf")
+print(result.usage.prompt_tokens)      # LLM input tokens (extraction + chunking)
+print(result.usage.completion_tokens)  # LLM output tokens
+print(result.usage.embedding_tokens)   # Tokens sent to the embedding model
+
+# Retrieval only
+result = await rag.retrieve("What is GraphRAG?")
+print(result.usage.embedding_tokens)   # Query embedding tokens
+print(result.usage.prompt_tokens)      # Keyword-extraction LLM tokens
+
+# Full RAG completion
+result = await rag.completion("What is GraphRAG?")
+print(result.usage.prompt_tokens)      # Total LLM input (retrieval + answer)
+print(result.usage.completion_tokens)  # Total LLM output
+print(result.usage.embedding_tokens)   # Query embedding tokens
+```
+
+---
+
+## TokenUsage Model
+
+```python
+from graphrag_sdk import TokenUsage
+
+class TokenUsage(DataModel):
+    prompt_tokens: int = 0       # Total tokens sent to LLM (input)
+    completion_tokens: int = 0   # Total tokens generated by LLM (output)
+    embedding_tokens: int = 0    # Total tokens sent to the embedder
+```
+
+All fields default to `0`, so the field is always safe to read even when no LLM or embedding calls were made (e.g., a cached result, a custom provider that does not surface usage).
+
+### Arithmetic
+
+`TokenUsage` supports `+` and `+=` for aggregation:
+
+```python
+# Sum across multiple ingestion results
+results: list[IngestionResult] = await rag.ingest(["a.pdf", "b.pdf", "c.pdf"])
+total = sum(
+    (r.usage for r in results if isinstance(r, IngestionResult)),
+    start=TokenUsage(),
+)
+print(total.embedding_tokens)
+```
+
+```python
+# Accumulate inside a loop
+total = TokenUsage()
+for doc_path in large_corpus:
+    r = await rag.ingest(doc_path)
+    total += r.usage
+
+cost = (total.prompt_tokens * 0.003 + total.completion_tokens * 0.004) / 1000
+print(f"Estimated cost: ${cost:.4f}")
+```
+
+---
+
+## What Each Counter Covers
+
+### `ingest()`
+
+| Counter | What it counts |
+|---|---|
+| `prompt_tokens` | ContextualChunking LLM prompts + NER step-1 prompts + extraction verify step-2 prompts |
+| `completion_tokens` | All LLM responses during extraction and chunking |
+| `embedding_tokens` | Chunk texts sent to the embedder during indexing |
+
+### `retrieve()`
+
+| Counter | What it counts |
+|---|---|
+| `prompt_tokens` | Keyword-extraction LLM call |
+| `completion_tokens` | Keyword-extraction LLM response |
+| `embedding_tokens` | Query text sent to the embedder |
+
+### `completion()`
+
+Includes everything from `retrieve()` **plus**:
+
+| Counter | What it counts |
+|---|---|
+| `prompt_tokens` | Question-rewrite LLM call + final answer generation prompt |
+| `completion_tokens` | Final answer generated |
+
+---
+
+## Batch Ingest Aggregation
+
+When ingesting a list of sources, each `IngestionResult` in the returned list carries its own `.usage`. The facade does **not** aggregate them automatically, which is consistent with the `list[IngestionResult | Exception]` contract.
+
+```python
+results = await rag.ingest(["a.pdf", "b.pdf", "c.pdf"])
+
+total = TokenUsage()
+for r in results:
+    if isinstance(r, IngestionResult):
+        total += r.usage
+    else:
+        print(f"Ingestion failed: {r}")
+
+print(f"Total embedding tokens: {total.embedding_tokens}")
+```
+
+---
+
+## Cost Estimation Pattern
+
+Token pricing varies by model. Here is a reusable helper:
+
+```python
+from graphrag_sdk import TokenUsage
+
+def estimate_cost(
+    usage: TokenUsage,
+    *,
+    prompt_price_per_1k: float,
+    completion_price_per_1k: float,
+    embedding_price_per_1k: float = 0.0,
+) -> float:
+    """Estimate USD cost from token usage and per-1K prices."""
+    return (
+        usage.prompt_tokens * prompt_price_per_1k / 1000
+        + usage.completion_tokens * completion_price_per_1k / 1000
+        + usage.embedding_tokens * embedding_price_per_1k / 1000
+    )
+
+
+# GPT-4o pricing (illustrative — check OpenAI's current rates)
+result = await rag.completion("Summarize the knowledge graph.")
+cost = estimate_cost(
+    result.usage,
+    prompt_price_per_1k=0.0025,
+    completion_price_per_1k=0.010,
+    embedding_price_per_1k=0.00002,
+)
+print(f"This query cost approx ${cost:.6f}")
+```
+
+---
+
+## Design Decisions
+
+### Accumulator on Context (not return values)
+
+Usage is accumulated on `Context.usage` as a side-effect of provider calls, then **snapshotted** into the result object at the end of the operation. This means:
+
+- **No signature changes to strategy ABCs.** The 15+ strategy classes don't need to be touched — they already thread `ctx` through every step.
+- **Scoped per operation.** Each `ingest()` / `retrieve()` / `completion()` call creates or receives its own fresh `Context`, so counters never bleed between calls.
+- **Thread-safe by construction.** Async Python has cooperative concurrency. Within a single `await` chain, there is no race on the accumulator.
+
+### `ctx` is keyword-only and defaults to `None`
+
+Provider async methods (`ainvoke`, `ainvoke_messages`, `aembed_query`, `aembed_documents`, `abatch_invoke`) now accept `ctx: Context | None = None`. When `ctx` is `None`, the call behaves exactly as before — no usage is recorded and no exception is raised. This makes the change **fully backward-compatible** for custom providers that don't override these methods.
+
+### Custom providers: zero changes required
+
+If you implement a custom `LLMInterface` or `Embedder`, your code keeps working without modification. To opt in to usage tracking, add `ctx` handling to your async methods:
+
+```python
+from graphrag_sdk import LLMInterface
+from graphrag_sdk.core.models import LLMResponse
+from graphrag_sdk.core.context import Context
+from typing import Any
+
+class MyLLM(LLMInterface):
+    def __init__(self):
+        super().__init__(model_name="my-llm")
+
+    def invoke(self, prompt: str, **kwargs: Any) -> LLMResponse:
+        resp = my_client.generate(prompt)
+        return LLMResponse(content=resp.text)
+
+    async def ainvoke(
+        self,
+        prompt: str,
+        *,
+        ctx: Context | None = None,
+        max_retries: int = 3,
+        **kwargs: Any,
+    ) -> LLMResponse:
+        resp = await my_async_client.generate(prompt)
+        if ctx is not None:
+            ctx.record_usage(
+                prompt_tokens=resp.usage.input_tokens,
+                completion_tokens=resp.usage.output_tokens,
+            )
+        return LLMResponse(content=resp.text)
+```
+
+Similarly for `Embedder.aembed_query`:
+
+```python
+async def aembed_query(
+    self,
+    text: str,
+    *,
+    ctx: Context | None = None,
+    **kwargs: Any,
+) -> list[float]:
+    resp = await my_embed_client.embed(text)
+    if ctx is not None:
+        ctx.record_usage(embedding_tokens=resp.usage.total_tokens)
+    return resp.embedding
+```
+
+### `ctx.record_usage()` API
+
+```python
+ctx.record_usage(
+    *,
+    prompt_tokens: int = 0,
+    completion_tokens: int = 0,
+    embedding_tokens: int = 0,
+) -> None
+```
+
+All arguments are keyword-only and default to `0`. Safe to call with zeros (no-op).
+
+---
+
+## Observability Integration
+
+### OpenTelemetry / Prometheus
+
+```python
+from opentelemetry import metrics
+from graphrag_sdk import TokenUsage
+
+meter = metrics.get_meter("graphrag")
+prompt_counter = meter.create_counter("graphrag.prompt_tokens")
+completion_counter = meter.create_counter("graphrag.completion_tokens")
+embedding_counter = meter.create_counter("graphrag.embedding_tokens")
+
+def record_otel(usage: TokenUsage, operation: str) -> None:
+    labels = {"operation": operation}
+    prompt_counter.add(usage.prompt_tokens, labels)
+    completion_counter.add(usage.completion_tokens, labels)
+    embedding_counter.add(usage.embedding_tokens, labels)
+
+result = await rag.completion("What is the main theme?")
+record_otel(result.usage, "completion")
+```
+
+### Structured Logging
+
+```python
+import logging, json
+
+logger = logging.getLogger("cost")
+
+result = await rag.ingest("doc.pdf")
+logger.info(json.dumps({
+    "event": "ingest",
+    "source": "doc.pdf",
+    "prompt_tokens": result.usage.prompt_tokens,
+    "completion_tokens": result.usage.completion_tokens,
+    "embedding_tokens": result.usage.embedding_tokens,
+}))
+```
+
+---
+
+## Migration from the Fragile Pattern
+
+Before this feature, users had to subclass internal private methods to intercept token counts:
+
+```python
+# ❌ Old fragile pattern — breaks on internal refactors
+class TrackingLLM(LiteLLM):
+    def _completion_kwargs(self, prompt, **kwargs):
+        kw = super()._completion_kwargs(prompt, **kwargs)
+        # ... intercept here
+        return kw
+```
+
+Replace with:
+
+```python
+# ✅ New first-class API
+result = await rag.completion("question")
+print(result.usage.prompt_tokens)
+print(result.usage.completion_tokens)
+print(result.usage.embedding_tokens)
+```
+
+No subclassing required. No private method access. Stable across SDK versions.
diff --git a/graphrag_sdk/src/graphrag_sdk/__init__.py b/graphrag_sdk/src/graphrag_sdk/__init__.py
index a7c8ff36..54e9cd11 100644
--- a/graphrag_sdk/src/graphrag_sdk/__init__.py
+++ b/graphrag_sdk/src/graphrag_sdk/__init__.py
@@ -40,6 +40,7 @@
     SearchType,
     TextChunk,
     TextChunks,
+    TokenUsage,
 )
 from graphrag_sdk.core.providers import (
     Embedder,
@@ -138,6 +139,7 @@
     "SearchType",
     "TextChunk",
     "TextChunks",
+    "TokenUsage",
     # Ingestion
     "ChunkingStrategy",
     "CallableChunking",
diff --git a/graphrag_sdk/src/graphrag_sdk/api/main.py b/graphrag_sdk/src/graphrag_sdk/api/main.py
index 13c46352..9f599037 100644
--- a/graphrag_sdk/src/graphrag_sdk/api/main.py
+++ b/graphrag_sdk/src/graphrag_sdk/api/main.py
@@ -549,6 +549,8 @@ async def retrieve(
             retriever_result = await reranker.rerank(question, retriever_result, ctx)
 
         ctx.log(f"Retrieved {len(retriever_result.items)} context items")
+        # Attach accumulated token usage to the result
+        retriever_result.usage = ctx.usage
         return retriever_result
 
     # ── Completion ──────────────────────────────────────────────
@@ -608,7 +610,7 @@ async def _rewrite_question_with_history(
             question=question,
         )
         try:
-            resp = await self.llm.ainvoke(prompt)
+            resp = await self.llm.ainvoke(prompt, ctx=ctx)
             rewritten = (resp.content or "").strip().splitlines()[0].strip() if resp.content else ""
         except Exception as e:
             # Broad catch is intentional (see docstring) — but log at WARNING
@@ -728,7 +730,7 @@ async def completion(
             ChatMessage(role="user", content=final_user_content),
         ]
 
-        llm_response = await self.llm.ainvoke_messages(messages)
+        llm_response = await self.llm.ainvoke_messages(messages, ctx=ctx)
 
         result = RagResult(
             answer=self._clean_answer(llm_response.content),
@@ -740,6 +742,7 @@ async def completion(
                 "has_history": bool(history),
                 "retrieval_query": retrieval_query,
             },
+            usage=ctx.usage,
         )
 
         ctx.log(f"Generated answer ({len(result.answer)} chars)")
diff --git a/graphrag_sdk/src/graphrag_sdk/core/context.py b/graphrag_sdk/src/graphrag_sdk/core/context.py
index 7f93ab0e..8170b339 100644
--- a/graphrag_sdk/src/graphrag_sdk/core/context.py
+++ b/graphrag_sdk/src/graphrag_sdk/core/context.py
@@ -10,6 +10,8 @@
 from typing import Any
 from uuid import uuid4
 
+from graphrag_sdk.core.models import TokenUsage
+
 logger = logging.getLogger(__name__)
 
 
@@ -32,6 +34,7 @@ class Context:
     trace_id: str = field(default_factory=lambda: str(uuid4()))
     latency_budget_ms: float | None = None
     metadata: dict[str, Any] = field(default_factory=dict)
+    usage: TokenUsage = field(default_factory=TokenUsage)
     _start_time: float = field(default_factory=time.monotonic, repr=False)
 
     @property
@@ -56,6 +59,10 @@ def child(self, **overrides: Any) -> Context:
         """Create a child context inheriting tenant/trace but with optional overrides.
 
         Useful for per-step contexts within a pipeline.
+
+        Note: ``usage`` is **not** inherited — the child starts with zero counters.
+        Token usage recorded in a child context is NOT propagated back to the parent.
+        For full usage tracking, pass the parent context directly to all callees.
         """
         return Context(
             tenant_id=overrides.get("tenant_id", self.tenant_id),
@@ -64,6 +71,23 @@ def child(self, **overrides: Any) -> Context:
             metadata={**self.metadata, **overrides.get("metadata", {})},
         )
 
+    def record_usage(
+        self,
+        *,
+        prompt_tokens: int = 0,
+        completion_tokens: int = 0,
+        embedding_tokens: int = 0,
+    ) -> None:
+        """Accumulate token counts from a single LLM or embedding call.
+
+        Called by provider implementations after every successful API
+        response.  Totals are available on :attr:`usage` at the end of
+        the operation.  Safe to call with all-zero values (no-op).
+        """
+        self.usage.prompt_tokens += prompt_tokens
+        self.usage.completion_tokens += completion_tokens
+        self.usage.embedding_tokens += embedding_tokens
+
     def log(self, message: str, level: int = logging.INFO) -> None:
         """Log with context prefix for traceability."""
         prefix = f"[tenant={self.tenant_id} trace={self.trace_id[:8]}]"
diff --git a/graphrag_sdk/src/graphrag_sdk/core/models.py b/graphrag_sdk/src/graphrag_sdk/core/models.py
index af63a9d7..a69aa426 100644
--- a/graphrag_sdk/src/graphrag_sdk/core/models.py
+++ b/graphrag_sdk/src/graphrag_sdk/core/models.py
@@ -278,6 +278,55 @@ class ResolutionResult(DataModel):
     merged_count: int = 0
 
 
+# ── Usage Tracking Types ─────────────────────────────────────────
+
+
+class TokenUsage(DataModel):
+    """Accumulated token usage for a single SDK operation.
+
+    Reported on :class:`IngestionResult`, :class:`RagResult`, and
+    :class:`RetrieverResult`.  All counts default to zero so the field
+    is always safe to read even when no LLM/embedding calls were made
+    (e.g., cached results, custom providers that don't surface usage).
+
+    Example::
+
+        result = await rag.completion("What is GraphRAG?")
+        print(result.usage.prompt_tokens)      # total LLM input tokens
+        print(result.usage.completion_tokens)  # total LLM output tokens
+        print(result.usage.embedding_tokens)   # total embedding tokens
+    """
+
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    embedding_tokens: int = 0
+
+    def __add__(self, other: TokenUsage) -> TokenUsage:
+        """Return a new TokenUsage that sums both operands."""
+        return TokenUsage(
+            prompt_tokens=self.prompt_tokens + other.prompt_tokens,
+            completion_tokens=self.completion_tokens + other.completion_tokens,
+            embedding_tokens=self.embedding_tokens + other.embedding_tokens,
+        )
+
+    def __iadd__(self, other: TokenUsage) -> TokenUsage:
+        """Accumulate *other* into self in-place.
+
+        Used for aggregating results across batch operations::
+
+            total = TokenUsage()
+            for r in results:
+                total += r.usage
+
+        For per-call accumulation inside a pipeline, prefer
+        ``ctx.record_usage()`` instead.
+        """
+        self.prompt_tokens += other.prompt_tokens
+        self.completion_tokens += other.completion_tokens
+        self.embedding_tokens += other.embedding_tokens
+        return self
+
+
 # ── Retrieval Types ──────────────────────────────────────────────
 
 
@@ -294,6 +343,7 @@ class RetrieverResult(DataModel):
 
     items: list[RetrieverResultItem] = Field(default_factory=list)
     metadata: dict[str, Any] = Field(default_factory=dict)
+    usage: TokenUsage = Field(default_factory=TokenUsage)
 
 
 class RawSearchResult(DataModel):
@@ -346,6 +396,7 @@ class RagResult(DataModel):
     answer: str
     retriever_result: RetrieverResult | None = None
     metadata: dict[str, Any] = Field(default_factory=dict)
+    usage: TokenUsage = Field(default_factory=TokenUsage)
 
 
 # ── Ingestion Types ──────────────────────────────────────────────
@@ -359,6 +410,7 @@ class IngestionResult(DataModel):
     relationships_created: int = 0
     chunks_indexed: int = 0
     metadata: dict[str, Any] = Field(default_factory=dict)
+    usage: TokenUsage = Field(default_factory=TokenUsage)
 
 
 class FinalizeResult(DataModel):
diff --git a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
index cd7a31da..98d01b47 100644
--- a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
+++ b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
@@ -60,15 +60,25 @@ def embed_query(self, text: str, **kwargs: Any) -> list[float]:
         """Embed a single text string into a float vector."""
         ...
 
-    async def aembed_query(self, text: str, **kwargs: Any) -> list[float]:
-        """Async variant — defaults to sync-in-thread."""
+    async def aembed_query(
+        self, text: str, *, ctx: Any | None = None, **kwargs: Any
+    ) -> list[float]:
+        """Async variant — defaults to sync-in-thread.
+
+        Args:
+            ctx: Execution context for usage tracking. Ignored by default
+                implementation — override in concrete classes to record
+                ``embedding_tokens`` via ``ctx.record_usage()``.
+        """
         return await asyncio.to_thread(self.embed_query, text, **kwargs)
 
     def embed_documents(self, texts: list[str], **kwargs: Any) -> list[list[float]]:
         """Batch embed multiple texts. Default: sequential fallback."""
         return [self.embed_query(t, **kwargs) for t in texts]
 
-    async def aembed_documents(self, texts: list[str], **kwargs: Any) -> list[list[float]]:
+    async def aembed_documents(
+        self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
+    ) -> list[list[float]]:
         """Async batch embed. Default: sync-in-thread."""
         return await asyncio.to_thread(self.embed_documents, texts, **kwargs)
 
@@ -105,11 +115,19 @@ async def ainvoke(
         self,
         prompt: str,
         *,
+        ctx: Any | None = None,
         max_retries: int = 3,
         **kwargs: Any,
     ) -> LLMResponse:
         """Async variant with retry + jittered exponential backoff.
 
+        Args:
+            ctx: Execution context for usage tracking. Ignored by the
+                default implementation — override in concrete classes to
+                record ``prompt_tokens`` / ``completion_tokens`` via
+                ``ctx.record_usage()``.
+            max_retries: Retry count.
+
         Retries on any exception up to ``max_retries`` times with
         jittered delays between attempts.
         """
@@ -136,6 +154,7 @@ async def ainvoke_messages(
         self,
         messages: list[ChatMessage],
         *,
+        ctx: Any | None = None,
         max_retries: int = 3,
         **kwargs: Any,
     ) -> LLMResponse:
@@ -149,6 +168,7 @@ async def ainvoke_messages(
 
         Args:
             messages: Ordered list of ``ChatMessage`` objects.
+            ctx: Execution context for usage tracking.
             max_retries: Retry count forwarded to ``ainvoke``.
             **kwargs: Extra arguments forwarded to the underlying call.
 
@@ -160,7 +180,7 @@ async def ainvoke_messages(
         for msg in messages:
             parts.append(f"{msg.role.capitalize()}: {msg.content}")
         prompt = "\n\n".join(parts)
-        return await self.ainvoke(prompt, max_retries=max_retries, **kwargs)
+        return await self.ainvoke(prompt, ctx=ctx, max_retries=max_retries, **kwargs)
 
     async def astream(self, prompt: str, **kwargs: Any) -> AsyncIterator[str]:
         """Async streaming — default yields the full response as one chunk."""
@@ -197,6 +217,7 @@ async def abatch_invoke(
         self,
         prompts: list[str],
         *,
+        ctx: Any | None = None,
         max_concurrency: int | None = None,
         max_retries: int = 3,
         **kwargs: Any,
@@ -205,6 +226,9 @@ async def abatch_invoke(
 
         Args:
             prompts: List of prompt strings to process.
+            ctx: Execution context for usage tracking. Each successful
+                call accumulates into ``ctx.usage`` via the provider's
+                ``ainvoke()`` implementation.
             max_concurrency: Override the instance default concurrency limit.
             max_retries: Retry count passed to each ``ainvoke`` call.
             **kwargs: Extra arguments forwarded to ``ainvoke``.
@@ -220,7 +244,7 @@ async def abatch_invoke(
         async def _call(i: int, prompt: str) -> LLMBatchItem:
             async with sem:
                 try:
-                    resp = await self.ainvoke(prompt, max_retries=max_retries, **kwargs)
+                    resp = await self.ainvoke(prompt, ctx=ctx, max_retries=max_retries, **kwargs)
                     return LLMBatchItem(index=i, response=resp)
                 except Exception as exc:
                     return LLMBatchItem(index=i, error=exc)
diff --git a/graphrag_sdk/src/graphrag_sdk/core/providers/litellm.py b/graphrag_sdk/src/graphrag_sdk/core/providers/litellm.py
index 8d5e8331..c90729ef 100644
--- a/graphrag_sdk/src/graphrag_sdk/core/providers/litellm.py
+++ b/graphrag_sdk/src/graphrag_sdk/core/providers/litellm.py
@@ -19,6 +19,33 @@
 logger = logging.getLogger(__name__)
 
 
+def _extract_llm_usage(response: Any) -> tuple[int, int]:
+    """Extract (prompt_tokens, completion_tokens) from a LiteLLM response.
+
+    Returns (0, 0) when usage is absent or incomplete so callers never
+    need to guard against None.
+    """
+    usage = getattr(response, "usage", None)
+    if usage is None:
+        return 0, 0
+    pt = getattr(usage, "prompt_tokens", None) or 0
+    ct = getattr(usage, "completion_tokens", None) or 0
+    return int(pt), int(ct)
+
+
+def _extract_embedding_usage(response: Any) -> int:
+    """Extract embedding token count from a LiteLLM embedding response.
+
+    LiteLLM returns ``usage.prompt_tokens`` for embedding calls.
+    Returns 0 when absent.
+    """
+    usage = getattr(response, "usage", None)
+    if usage is None:
+        return 0
+    pt = getattr(usage, "prompt_tokens", None) or 0
+    return int(pt)
+
+
 class LiteLLM(LLMInterface):
     """LLM provider backed by LiteLLM.
 
@@ -115,6 +142,7 @@ async def ainvoke(
         self,
         prompt: str,
         *,
+        ctx: Any | None = None,
         max_retries: int = 3,
         **kwargs: Any,
     ) -> LLMResponse:
@@ -130,6 +158,9 @@ async def ainvoke(
             try:
                 response = await litellm.acompletion(**self._completion_kwargs(prompt, **kwargs))
                 content = response.choices[0].message.content or ""
+                if ctx is not None:
+                    pt, ct = _extract_llm_usage(response)
+                    ctx.record_usage(prompt_tokens=pt, completion_tokens=ct)
                 return LLMResponse(content=content)
             except Exception as exc:
                 last_exc = exc
@@ -182,6 +213,7 @@ async def ainvoke_messages(
         self,
         messages: list[ChatMessage],
         *,
+        ctx: Any | None = None,
         max_retries: int = 3,
         **kwargs: Any,
     ) -> LLMResponse:
@@ -202,6 +234,9 @@ async def ainvoke_messages(
                     **self._messages_completion_kwargs(messages, **kwargs)
                 )
                 content = response.choices[0].message.content or ""
+                if ctx is not None:
+                    pt, ct = _extract_llm_usage(response)
+                    ctx.record_usage(prompt_tokens=pt, completion_tokens=ct)
                 return LLMResponse(content=content)
             except Exception as exc:
                 last_exc = exc
@@ -291,10 +326,14 @@ def _raw_embed_sync(self, texts: list[str], **kwargs: Any) -> list[list[float]]:
         sorted_data = sorted(response.data, key=lambda x: x["index"])
         return [d["embedding"] for d in sorted_data]
 
-    async def _raw_embed_async(self, texts: list[str], **kwargs: Any) -> list[list[float]]:
+    async def _raw_embed_async(
+        self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
+    ) -> list[list[float]]:
         """Raw async embed without retry — called by binary_split_retry_async."""
         litellm = self._import_litellm()
         response = await litellm.aembedding(**self._embedding_kwargs(texts, **kwargs))
+        if ctx is not None:
+            ctx.record_usage(embedding_tokens=_extract_embedding_usage(response))
         sorted_data = sorted(response.data, key=lambda x: x["index"])
         return [d["embedding"] for d in sorted_data]
 
@@ -307,16 +346,24 @@ def embed_documents(self, texts: list[str], **kwargs: Any) -> list[list[float]]:
             results.extend(binary_split_retry_sync(self._raw_embed_sync, batch, **kwargs))
         return results
 
-    async def aembed_query(self, text: str, **kwargs: Any) -> list[float]:
+    async def aembed_query(
+        self, text: str, *, ctx: Any | None = None, **kwargs: Any
+    ) -> list[float]:
         litellm = self._import_litellm()
         response = await litellm.aembedding(**self._embedding_kwargs(text, **kwargs))
+        if ctx is not None:
+            ctx.record_usage(embedding_tokens=_extract_embedding_usage(response))
         return response.data[0]["embedding"]
 
-    async def aembed_documents(self, texts: list[str], **kwargs: Any) -> list[list[float]]:
+    async def aembed_documents(
+        self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
+    ) -> list[list[float]]:
         if not texts:
             return []
         results: list[list[float]] = []
         for start in range(0, len(texts), self.batch_size):
             batch = texts[start : start + self.batch_size]
-            results.extend(await binary_split_retry_async(self._raw_embed_async, batch, **kwargs))
+            results.extend(
+                await binary_split_retry_async(self._raw_embed_async, batch, ctx=ctx, **kwargs)
+            )
         return results
diff --git a/graphrag_sdk/src/graphrag_sdk/core/providers/openrouter.py b/graphrag_sdk/src/graphrag_sdk/core/providers/openrouter.py
index bb075524..0e01255d 100644
--- a/graphrag_sdk/src/graphrag_sdk/core/providers/openrouter.py
+++ b/graphrag_sdk/src/graphrag_sdk/core/providers/openrouter.py
@@ -20,6 +20,28 @@
 logger = logging.getLogger(__name__)
 
 
+def _extract_openai_llm_usage(response: Any) -> tuple[int, int]:
+    """Extract (prompt_tokens, completion_tokens) from an OpenAI-style response.
+
+    Returns (0, 0) when usage is absent or incomplete.
+    """
+    usage = getattr(response, "usage", None)
+    if usage is None:
+        return 0, 0
+    pt = getattr(usage, "prompt_tokens", None) or 0
+    ct = getattr(usage, "completion_tokens", None) or 0
+    return int(pt), int(ct)
+
+
+def _extract_openai_embedding_usage(response: Any) -> int:
+    """Extract embedding token count from an OpenAI-style embedding response."""
+    usage = getattr(response, "usage", None)
+    if usage is None:
+        return 0
+    pt = getattr(usage, "prompt_tokens", None) or 0
+    return int(pt)
+
+
 class OpenRouterLLM(LLMInterface):
     """LLM provider backed by OpenRouter (uses the OpenAI SDK).
 
@@ -140,6 +162,7 @@ async def ainvoke(
         self,
         prompt: str,
         *,
+        ctx: Any | None = None,
         max_retries: int = 3,
         **kwargs: Any,
     ) -> LLMResponse:
@@ -153,6 +176,9 @@ async def ainvoke(
             try:
                 response = await client.chat.completions.create(**create_kwargs)
                 content = response.choices[0].message.content or ""
+                if ctx is not None:
+                    pt, ct = _extract_openai_llm_usage(response)
+                    ctx.record_usage(prompt_tokens=pt, completion_tokens=ct)
                 return LLMResponse(content=content)
             except Exception as exc:
                 last_exc = exc
@@ -173,6 +199,7 @@ async def ainvoke_messages(
         self,
         messages: list[ChatMessage],
         *,
+        ctx: Any | None = None,
         max_retries: int = 3,
         **kwargs: Any,
     ) -> LLMResponse:
@@ -189,6 +216,9 @@ async def ainvoke_messages(
             try:
                 response = await client.chat.completions.create(**create_kwargs)
                 content = response.choices[0].message.content or ""
+                if ctx is not None:
+                    pt, ct = _extract_openai_llm_usage(response)
+                    ctx.record_usage(prompt_tokens=pt, completion_tokens=ct)
                 return LLMResponse(content=content)
             except Exception as exc:
                 last_exc = exc
@@ -280,10 +310,14 @@ def _raw_embed_sync(self, texts: list[str], **kwargs: Any) -> list[list[float]]:
         sorted_data = sorted(response.data, key=lambda x: x.index)
         return [d.embedding for d in sorted_data]
 
-    async def _raw_embed_async(self, texts: list[str], **kwargs: Any) -> list[list[float]]:
+    async def _raw_embed_async(
+        self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
+    ) -> list[list[float]]:
         """Raw async embed without retry — called by binary_split_retry_async."""
         client = self._get_async_client()
         response = await client.embeddings.create(model=self.model, input=texts, **kwargs)
+        if ctx is not None:
+            ctx.record_usage(embedding_tokens=_extract_openai_embedding_usage(response))
         sorted_data = sorted(response.data, key=lambda x: x.index)
         return [d.embedding for d in sorted_data]
 
@@ -296,16 +330,24 @@ def embed_documents(self, texts: list[str], **kwargs: Any) -> list[list[float]]:
             results.extend(binary_split_retry_sync(self._raw_embed_sync, batch, **kwargs))
         return results
 
-    async def aembed_query(self, text: str, **kwargs: Any) -> list[float]:
+    async def aembed_query(
+        self, text: str, *, ctx: Any | None = None, **kwargs: Any
+    ) -> list[float]:
         client = self._get_async_client()
         response = await client.embeddings.create(model=self.model, input=text, **kwargs)
+        if ctx is not None:
+            ctx.record_usage(embedding_tokens=_extract_openai_embedding_usage(response))
         return response.data[0].embedding
 
-    async def aembed_documents(self, texts: list[str], **kwargs: Any) -> list[list[float]]:
+    async def aembed_documents(
+        self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
+    ) -> list[list[float]]:
         if not texts:
             return []
         results: list[list[float]] = []
         for start in range(0, len(texts), self.batch_size):
             batch = texts[start : start + self.batch_size]
-            results.extend(await binary_split_retry_async(self._raw_embed_async, batch, **kwargs))
+            results.extend(
+                await binary_split_retry_async(self._raw_embed_async, batch, ctx=ctx, **kwargs)
+            )
         return results
diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/extraction_strategies/graph_extraction.py b/graphrag_sdk/src/graphrag_sdk/ingestion/extraction_strategies/graph_extraction.py
index 09a56121..8bc15c57 100644
--- a/graphrag_sdk/src/graphrag_sdk/ingestion/extraction_strategies/graph_extraction.py
+++ b/graphrag_sdk/src/graphrag_sdk/ingestion/extraction_strategies/graph_extraction.py
@@ -253,7 +253,9 @@ async def extract(
             if self._max_concurrency is not None:
                 batch_kw["max_concurrency"] = self._max_concurrency
 
-            step1_results = await self.entity_extractor._llm.abatch_invoke(ner_prompts, **batch_kw)
+            step1_results = await self.entity_extractor._llm.abatch_invoke(
+                ner_prompts, ctx=ctx, **batch_kw
+            )
             for item in step1_results:
                 chunk = active_chunks[item.index]
                 if not item.ok:
@@ -324,7 +326,7 @@ async def _step1(text: str, chunk_uid: str) -> list[ExtractedEntity]:
         all_relations: list[ExtractedRelation] = []
 
         if step2_prompts:
-            step2_results = await self.llm.abatch_invoke(step2_prompts, **batch_kw2)
+            step2_results = await self.llm.abatch_invoke(step2_prompts, ctx=ctx, **batch_kw2)
 
             for item in step2_results:
                 chunk_idx = step2_indices[item.index]
diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/pipeline.py b/graphrag_sdk/src/graphrag_sdk/ingestion/pipeline.py
index 4d04fc15..45be4b66 100644
--- a/graphrag_sdk/src/graphrag_sdk/ingestion/pipeline.py
+++ b/graphrag_sdk/src/graphrag_sdk/ingestion/pipeline.py
@@ -175,7 +175,7 @@ async def _step_mentions() -> int:
 
             async def _step_index_chunks() -> None:
                 ctx.log("Step 9/9: Embedding & indexing chunks")
-                await self.vector_store.index_chunks(chunks)
+                await self.vector_store.index_chunks(chunks, ctx=ctx)
 
             mentions_written, _ = await asyncio.gather(
                 _step_mentions(),
@@ -194,6 +194,7 @@ async def _step_index_chunks() -> None:
                     "raw_relationships": len(graph_data.relationships),
                     "mention_edges_created": mentions_written,
                 },
+                usage=ctx.usage.model_copy(),  # snapshot at pipeline completion
             )
             ctx.log(
                 f"Pipeline complete: {result.nodes_created} nodes, "
diff --git a/graphrag_sdk/src/graphrag_sdk/retrieval/strategies/multi_path.py b/graphrag_sdk/src/graphrag_sdk/retrieval/strategies/multi_path.py
index d7bcbd9b..15cd1179 100644
--- a/graphrag_sdk/src/graphrag_sdk/retrieval/strategies/multi_path.py
+++ b/graphrag_sdk/src/graphrag_sdk/retrieval/strategies/multi_path.py
@@ -186,12 +186,12 @@ async def _execute(
         **kwargs: Any,
     ) -> RawSearchResult:
         # 1. Extract keywords
-        simple_kw, llm_kw = await self._extract_keywords(query)
+        simple_kw, llm_kw = await self._extract_keywords(query, ctx)
         all_keywords = llm_kw[:8] + simple_kw
         ctx.log(f"MultiPath [1/9]: {len(all_keywords)} keywords extracted")
 
         # 2. Embed question only
-        query_vector = await self._embedder.aembed_query(query)
+        query_vector = await self._embedder.aembed_query(query, ctx=ctx)
 
         # 3. RELATES vector search + Text-to-Cypher (parallel when enabled)
         if self._enable_cypher:
@@ -322,7 +322,9 @@ def _format(self, raw: RawSearchResult) -> RetrieverResult:
 
     # -- Internal: keyword extraction (stays in orchestrator) --
 
-    async def _extract_keywords(self, query: str) -> tuple[list[str], list[str]]:
+    async def _extract_keywords(
+        self, query: str, ctx: Context | None = None
+    ) -> tuple[list[str], list[str]]:
         """Extract simple + LLM-based keywords from the query."""
         words = re.sub(r"[?.!,;:'\"\-()\[\]]", " ", query.lower()).split()
         simple = [w for w in words if w not in self._STOP_WORDS and len(w) > 2][:12]
@@ -333,7 +335,8 @@ async def _extract_keywords(self, query: str) -> tuple[list[str], list[str]]:
                 "Extract ALL proper nouns, character names, person names, place names, "
                 "book titles, and specific terms from this question. "
                 "Return them comma-separated, nothing else.\n\n"
-                f"Question: {query}\n\nNames: "
+                f"Question: {query}\n\nNames: ",
+                ctx=ctx,
             )
             llm_kw = [
                 k.strip().strip("'\"").rstrip("()").strip()
diff --git a/graphrag_sdk/src/graphrag_sdk/storage/vector_store.py b/graphrag_sdk/src/graphrag_sdk/storage/vector_store.py
index df0cdb39..b04dbf79 100644
--- a/graphrag_sdk/src/graphrag_sdk/storage/vector_store.py
+++ b/graphrag_sdk/src/graphrag_sdk/storage/vector_store.py
@@ -181,7 +181,7 @@ async def drop_relates_vector_index(self) -> None:
 
     # ── Indexing ─────────────────────────────────────────────────
 
-    async def index_chunks(self, chunks: TextChunks) -> int:
+    async def index_chunks(self, chunks: TextChunks, *, ctx: Any | None = None) -> int:
         """Embed and store vectors for all chunks.
 
         Uses batch embedding (``aembed_documents``) for efficiency,
@@ -189,6 +189,7 @@ async def index_chunks(self, chunks: TextChunks) -> int:
 
         Args:
             chunks: TextChunks collection to embed and index.
+            ctx: Execution context for token usage tracking.
 
         Returns:
             Number of chunks indexed.
@@ -203,13 +204,13 @@ async def index_chunks(self, chunks: TextChunks) -> int:
         # Batch embed all chunk texts in one API call
         texts = [chunk.text for chunk in chunks.chunks]
         try:
-            vectors = await self._embedder.aembed_documents(texts)
+            vectors = await self._embedder.aembed_documents(texts, ctx=ctx)
         except Exception as exc:
             logger.warning(f"Batch embedding failed, falling back to sequential: {exc}")
             vectors = []
             for chunk in chunks.chunks:
                 try:
-                    vec = await self._embedder.aembed_query(chunk.text)
+                    vec = await self._embedder.aembed_query(chunk.text, ctx=ctx)
                     vectors.append(vec)
                 except Exception:
                     logger.debug("Single chunk embedding failed", exc_info=True)
diff --git a/graphrag_sdk/tests/test_token_usage.py b/graphrag_sdk/tests/test_token_usage.py
new file mode 100644
index 00000000..150b9200
--- /dev/null
+++ b/graphrag_sdk/tests/test_token_usage.py
@@ -0,0 +1,652 @@
+"""Tests for token usage tracking (#227).
+
+Covers:
+- TokenUsage model arithmetic and defaults
+- Context.record_usage() accumulator
+- LiteLLM / OpenRouter provider instrumentation
+- Result types carry usage (IngestionResult, RagResult, RetrieverResult)
+- Backward compatibility: ctx=None never raises
+- abatch_invoke threads ctx through to each ainvoke
+"""
+from __future__ import annotations
+
+import asyncio
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from graphrag_sdk.core.context import Context
+from graphrag_sdk.core.models import (
+    ChatMessage,
+    IngestionResult,
+    LLMResponse,
+    RagResult,
+    RetrieverResult,
+    RetrieverResultItem,
+    TokenUsage,
+)
+from graphrag_sdk.core.providers import LLMInterface, LiteLLM, LiteLLMEmbedder, OpenRouterLLM
+from graphrag_sdk.core.providers.openrouter import OpenRouterEmbedder
+
+
+# ── Helpers ──────────────────────────────────────────────────────
+
+
+def _litellm_resp(content: str = "ok", prompt_tokens: int = 10, completion_tokens: int = 5):
+    usage = MagicMock()
+    usage.prompt_tokens = prompt_tokens
+    usage.completion_tokens = completion_tokens
+    msg = MagicMock()
+    msg.content = content
+    choice = MagicMock()
+    choice.message = msg
+    resp = MagicMock()
+    resp.choices = [choice]
+    resp.usage = usage
+    return resp
+
+
+def _litellm_embed_resp(vectors: list[list[float]], prompt_tokens: int = 20):
+    usage = MagicMock()
+    usage.prompt_tokens = prompt_tokens
+    data = [{"index": i, "embedding": v} for i, v in enumerate(vectors)]
+    resp = MagicMock()
+    resp.data = data
+    resp.usage = usage
+    return resp
+
+
+def _openai_resp(content: str = "ok", prompt_tokens: int = 8, completion_tokens: int = 4):
+    usage = MagicMock()
+    usage.prompt_tokens = prompt_tokens
+    usage.completion_tokens = completion_tokens
+    msg = MagicMock()
+    msg.content = content
+    choice = MagicMock()
+    choice.message = msg
+    resp = MagicMock()
+    resp.choices = [choice]
+    resp.usage = usage
+    return resp
+
+
+def _openai_embed_resp(vectors: list[list[float]], prompt_tokens: int = 15):
+    usage = MagicMock()
+    usage.prompt_tokens = prompt_tokens
+    items = []
+    for i, v in enumerate(vectors):
+        item = MagicMock()
+        item.index = i
+        item.embedding = v
+        items.append(item)
+    resp = MagicMock()
+    resp.data = items
+    resp.usage = usage
+    return resp
+
+
+# ── TokenUsage model ─────────────────────────────────────────────
+
+
+class TestTokenUsageModel:
+    def test_defaults_are_zero(self):
+        u = TokenUsage()
+        assert u.prompt_tokens == 0
+        assert u.completion_tokens == 0
+        assert u.embedding_tokens == 0
+
+    def test_explicit_values(self):
+        u = TokenUsage(prompt_tokens=100, completion_tokens=50, embedding_tokens=200)
+        assert u.prompt_tokens == 100
+        assert u.completion_tokens == 50
+        assert u.embedding_tokens == 200
+
+    def test_add_returns_new_instance(self):
+        a = TokenUsage(prompt_tokens=10, completion_tokens=5, embedding_tokens=20)
+        b = TokenUsage(prompt_tokens=3, completion_tokens=2, embedding_tokens=8)
+        c = a + b
+        assert c.prompt_tokens == 13
+        assert c.completion_tokens == 7
+        assert c.embedding_tokens == 28
+        # originals unchanged
+        assert a.prompt_tokens == 10
+        assert b.prompt_tokens == 3
+
+    def test_add_identity(self):
+        a = TokenUsage(prompt_tokens=5)
+        assert (a + TokenUsage()).prompt_tokens == 5
+
+    def test_iadd_accumulates_in_place(self):
+        u = TokenUsage()
+        u += TokenUsage(prompt_tokens=10, completion_tokens=3, embedding_tokens=15)
+        u += TokenUsage(prompt_tokens=5, completion_tokens=2, embedding_tokens=5)
+        assert u.prompt_tokens == 15
+        assert u.completion_tokens == 5
+        assert u.embedding_tokens == 20
+
+    def test_serializes_to_dict(self):
+        u = TokenUsage(prompt_tokens=1, completion_tokens=2, embedding_tokens=3)
+        d = u.model_dump()
+        assert d == {"prompt_tokens": 1, "completion_tokens": 2, "embedding_tokens": 3}
+
+
+# ── Context accumulator ──────────────────────────────────────────
+
+
+class TestContextUsageAccumulator:
+    def test_fresh_context_has_zero_usage(self):
+        ctx = Context()
+        assert ctx.usage.prompt_tokens == 0
+        assert ctx.usage.completion_tokens == 0
+        assert ctx.usage.embedding_tokens == 0
+
+    def test_record_usage_llm(self):
+        ctx = Context()
+        ctx.record_usage(prompt_tokens=100, completion_tokens=50)
+        assert ctx.usage.prompt_tokens == 100
+        assert ctx.usage.completion_tokens == 50
+        assert ctx.usage.embedding_tokens == 0
+
+    def test_record_usage_embedding(self):
+        ctx = Context()
+        ctx.record_usage(embedding_tokens=300)
+        assert ctx.usage.embedding_tokens == 300
+        assert ctx.usage.prompt_tokens == 0
+
+    def test_record_usage_accumulates(self):
+        ctx = Context()
+        ctx.record_usage(prompt_tokens=50, completion_tokens=10)
+        ctx.record_usage(prompt_tokens=30, completion_tokens=5)
+        ctx.record_usage(embedding_tokens=100)
+        assert ctx.usage.prompt_tokens == 80
+        assert ctx.usage.completion_tokens == 15
+        assert ctx.usage.embedding_tokens == 100
+
+    def test_record_usage_noop_zeros(self):
+        ctx = Context()
+        ctx.record_usage()
+        assert ctx.usage.prompt_tokens == 0
+
+    def test_usage_field_is_token_usage_instance(self):
+        ctx = Context()
+        assert isinstance(ctx.usage, TokenUsage)
+
+    def test_two_contexts_have_independent_usage(self):
+        ctx1 = Context()
+        ctx2 = Context()
+        ctx1.record_usage(prompt_tokens=999)
+        assert ctx2.usage.prompt_tokens == 0
+
+    def test_child_context_has_independent_usage(self):
+        parent = Context()
+        parent.record_usage(prompt_tokens=50)
+        child = parent.child()
+        child.record_usage(prompt_tokens=10)
+        # parent still has original value
+        assert parent.usage.prompt_tokens == 50
+        assert child.usage.prompt_tokens == 10
+
+
+# ── Result types carry usage ─────────────────────────────────────
+
+
+class TestResultUsageField:
+    def test_ingestion_result_default_usage(self):
+        r = IngestionResult()
+        assert isinstance(r.usage, TokenUsage)
+        assert r.usage.prompt_tokens == 0
+
+    def test_rag_result_default_usage(self):
+        r = RagResult(answer="hello")
+        assert isinstance(r.usage, TokenUsage)
+        assert r.usage.embedding_tokens == 0
+
+    def test_retriever_result_default_usage(self):
+        r = RetrieverResult()
+        assert isinstance(r.usage, TokenUsage)
+        assert r.usage.completion_tokens == 0
+
+    def test_ingestion_result_with_usage(self):
+        u = TokenUsage(prompt_tokens=100, completion_tokens=40, embedding_tokens=200)
+        r = IngestionResult(usage=u)
+        assert r.usage.prompt_tokens == 100
+        assert r.usage.embedding_tokens == 200
+
+    def test_rag_result_with_usage(self):
+        u = TokenUsage(prompt_tokens=50, completion_tokens=20)
+        r = RagResult(answer="42", usage=u)
+        assert r.usage.prompt_tokens == 50
+
+    def test_retriever_result_with_usage(self):
+        u = TokenUsage(embedding_tokens=75)
+        r = RetrieverResult(items=[], usage=u)
+        assert r.usage.embedding_tokens == 75
+
+    def test_usage_mutability_on_retriever_result(self):
+        """Facade sets usage after retrieval strategy returns."""
+        r = RetrieverResult()
+        ctx = Context()
+        ctx.record_usage(embedding_tokens=42)
+        r.usage = ctx.usage
+        assert r.usage.embedding_tokens == 42
+
+    def test_existing_fields_unchanged(self):
+        """Backward compat: existing fields work exactly as before."""
+        r = IngestionResult(nodes_created=5, relationships_created=3, chunks_indexed=2)
+        assert r.nodes_created == 5
+        assert r.relationships_created == 3
+        assert r.chunks_indexed == 2
+
+    def test_retriever_result_items_still_work(self):
+        items = [RetrieverResultItem(content="a"), RetrieverResultItem(content="b")]
+        r = RetrieverResult(items=items)
+        assert len(r.items) == 2
+
+
+# ── LiteLLM provider instrumentation ────────────────────────────
+
+
+class TestLiteLLMUsageInstrumentation:
+    @pytest.mark.asyncio
+    async def test_ainvoke_records_usage_when_ctx_provided(self):
+        mock_litellm = MagicMock()
+        mock_litellm.acompletion = AsyncMock(
+            return_value=_litellm_resp(prompt_tokens=30, completion_tokens=12)
+        )
+        with patch.dict("sys.modules", {"litellm": mock_litellm}):
+            llm = LiteLLM(model="gpt-4")
+            ctx = Context()
+            await llm.ainvoke("hello", ctx=ctx)
+
+        assert ctx.usage.prompt_tokens == 30
+        assert ctx.usage.completion_tokens == 12
+        assert ctx.usage.embedding_tokens == 0
+
+    @pytest.mark.asyncio
+    async def test_ainvoke_no_ctx_does_not_raise(self):
+        mock_litellm = MagicMock()
+        mock_litellm.acompletion = AsyncMock(
+            return_value=_litellm_resp(prompt_tokens=10, completion_tokens=5)
+        )
+        with patch.dict("sys.modules", {"litellm": mock_litellm}):
+            llm = LiteLLM(model="gpt-4")
+            result = await llm.ainvoke("hello")  # no ctx → backward compat
+
+        assert result.content == "ok"
+
+    @pytest.mark.asyncio
+    async def test_ainvoke_messages_records_usage(self):
+        mock_litellm = MagicMock()
+        mock_litellm.acompletion = AsyncMock(
+            return_value=_litellm_resp(prompt_tokens=50, completion_tokens=20)
+        )
+        with patch.dict("sys.modules", {"litellm": mock_litellm}):
+            llm = LiteLLM(model="gpt-4")
+            ctx = Context()
+            msgs = [ChatMessage(role="user", content="hi")]
+            await llm.ainvoke_messages(msgs, ctx=ctx)
+
+        assert ctx.usage.prompt_tokens == 50
+        assert ctx.usage.completion_tokens == 20
+
+    @pytest.mark.asyncio
+    async def test_ainvoke_messages_no_ctx_does_not_raise(self):
+        mock_litellm = MagicMock()
+        mock_litellm.acompletion = AsyncMock(return_value=_litellm_resp())
+        with patch.dict("sys.modules", {"litellm": mock_litellm}):
+            llm = LiteLLM(model="gpt-4")
+            msgs = [ChatMessage(role="user", content="hi")]
+            result = await llm.ainvoke_messages(msgs)
+
+        assert result.content == "ok"
+
+    @pytest.mark.asyncio
+    async def test_aembed_query_records_embedding_tokens(self):
+        mock_litellm = MagicMock()
+        mock_litellm.aembedding = AsyncMock(
+            return_value=_litellm_embed_resp([[0.1, 0.2]], prompt_tokens=25)
+        )
+        with patch.dict("sys.modules", {"litellm": mock_litellm}):
+            embedder = LiteLLMEmbedder(model="text-embedding-ada-002")
+            ctx = Context()
+            result = await embedder.aembed_query("hello", ctx=ctx)
+
+        assert result == [0.1, 0.2]
+        assert ctx.usage.embedding_tokens == 25
+        assert ctx.usage.prompt_tokens == 0
+
+    @pytest.mark.asyncio
+    async def test_aembed_query_no_ctx_does_not_raise(self):
+        mock_litellm = MagicMock()
+        mock_litellm.aembedding = AsyncMock(
+            return_value=_litellm_embed_resp([[0.1, 0.2]], prompt_tokens=25)
+        )
+        with patch.dict("sys.modules", {"litellm": mock_litellm}):
+            embedder = LiteLLMEmbedder(model="text-embedding-ada-002")
+            result = await embedder.aembed_query("hello")  # no ctx
+
+        assert result == [0.1, 0.2]
+
+    @pytest.mark.asyncio
+    async def test_aembed_documents_records_embedding_tokens(self):
+        """Regression: batch path (main ingest path) must accumulate embedding_tokens."""
+        mock_litellm = MagicMock()
+        mock_litellm.aembedding = AsyncMock(
+            return_value=_litellm_embed_resp([[0.1, 0.2], [0.3, 0.4]], prompt_tokens=40)
+        )
+        with patch.dict("sys.modules", {"litellm": mock_litellm}):
+            embedder = LiteLLMEmbedder(model="text-embedding-ada-002")
+            ctx = Context()
+            results = await embedder.aembed_documents(["hello", "world"], ctx=ctx)
+
+        assert len(results) == 2
+        assert ctx.usage.embedding_tokens == 40
+
+    @pytest.mark.asyncio
+    async def test_aembed_documents_no_ctx_does_not_raise(self):
+        mock_litellm = MagicMock()
+        mock_litellm.aembedding = AsyncMock(
+            return_value=_litellm_embed_resp([[0.1, 0.2], [0.3, 0.4]], prompt_tokens=40)
+        )
+        with patch.dict("sys.modules", {"litellm": mock_litellm}):
+            embedder = LiteLLMEmbedder(model="text-embedding-ada-002")
+            results = await embedder.aembed_documents(["hello", "world"])  # no ctx
+
+        assert len(results) == 2
+
+    @pytest.mark.asyncio
+    async def test_multiple_ainvoke_calls_accumulate(self):
+        mock_litellm = MagicMock()
+        mock_litellm.acompletion = AsyncMock(
+            side_effect=[
+                _litellm_resp(prompt_tokens=10, completion_tokens=5),
+                _litellm_resp(prompt_tokens=20, completion_tokens=8),
+            ]
+        )
+        with patch.dict("sys.modules", {"litellm": mock_litellm}):
+            llm = LiteLLM(model="gpt-4")
+            ctx = Context()
+            await llm.ainvoke("first", ctx=ctx)
+            await llm.ainvoke("second", ctx=ctx)
+
+        assert ctx.usage.prompt_tokens == 30
+        assert ctx.usage.completion_tokens == 13
+
+    @pytest.mark.asyncio
+    async def test_usage_none_in_response_safe(self):
+        """If litellm returns usage=None, record_usage gets zeros."""
+        mock_litellm = MagicMock()
+        resp = _litellm_resp()
+        resp.usage = None
+        mock_litellm.acompletion = AsyncMock(return_value=resp)
+        with patch.dict("sys.modules", {"litellm": mock_litellm}):
+            llm = LiteLLM(model="gpt-4")
+            ctx = Context()
+            await llm.ainvoke("hello", ctx=ctx)
+
+        assert ctx.usage.prompt_tokens == 0
+        assert ctx.usage.completion_tokens == 0
+
+    @pytest.mark.asyncio
+    async def test_abatch_invoke_accumulates_all_items(self):
+        call_count = {"n": 0}
+
+        class InstrumentedLLM(LLMInterface):
+            def __init__(self):
+                super().__init__(model_name="test")
+
+            def invoke(self, prompt: str, **kwargs: Any) -> LLMResponse:
+                return LLMResponse(content="ok")
+
+            async def ainvoke(self, prompt: str, *, ctx=None, max_retries=3, **kwargs):
+                call_count["n"] += 1
+                if ctx is not None:
+                    ctx.record_usage(prompt_tokens=10, completion_tokens=5)
+                return LLMResponse(content="ok")
+
+        llm = InstrumentedLLM()
+        ctx = Context()
+        results = await llm.abatch_invoke(["a", "b", "c"], ctx=ctx)
+
+        assert len(results) == 3
+        assert all(r.ok for r in results)
+        assert ctx.usage.prompt_tokens == 30   # 3 × 10
+        assert ctx.usage.completion_tokens == 15  # 3 × 5
+
+    @pytest.mark.asyncio
+    async def test_abatch_invoke_no_ctx_backward_compat(self):
+        """abatch_invoke without ctx still works exactly as before."""
+        class SimpleLLM(LLMInterface):
+            def __init__(self):
+                super().__init__(model_name="test")
+
+            def invoke(self, prompt: str, **kwargs: Any) -> LLMResponse:
+                return LLMResponse(content=f"resp:{prompt}")
+
+        llm = SimpleLLM()
+        results = await llm.abatch_invoke(["x", "y"])
+
+        assert len(results) == 2
+        assert results[0].ok
+
+
+# ── OpenRouter provider instrumentation ──────────────────────────
+
+
+class TestOpenRouterUsageInstrumentation:
+    @pytest.mark.asyncio
+    async def test_ainvoke_records_usage(self):
+        mock_openai = MagicMock()
+        mock_async = MagicMock()
+        mock_async.chat.completions.create = AsyncMock(
+            return_value=_openai_resp(prompt_tokens=8, completion_tokens=4)
+        )
+        mock_openai.AsyncOpenAI.return_value = mock_async
+        with patch.dict("sys.modules", {"openai": mock_openai}):
+            llm = OpenRouterLLM(model="openai/gpt-4o", api_key="k")
+            ctx = Context()
+            result = await llm.ainvoke("hello", ctx=ctx)
+
+        assert result.content == "ok"
+        assert ctx.usage.prompt_tokens == 8
+        assert ctx.usage.completion_tokens == 4
+
+    @pytest.mark.asyncio
+    async def test_ainvoke_no_ctx_backward_compat(self):
+        mock_openai = MagicMock()
+        mock_async = MagicMock()
+        mock_async.chat.completions.create = AsyncMock(return_value=_openai_resp())
+        mock_openai.AsyncOpenAI.return_value = mock_async
+        with patch.dict("sys.modules", {"openai": mock_openai}):
+            llm = OpenRouterLLM(model="openai/gpt-4o", api_key="k")
+            result = await llm.ainvoke("hello")
+
+        assert result.content == "ok"
+
+    @pytest.mark.asyncio
+    async def test_ainvoke_messages_records_usage(self):
+        mock_openai = MagicMock()
+        mock_async = MagicMock()
+        mock_async.chat.completions.create = AsyncMock(
+            return_value=_openai_resp(prompt_tokens=12, completion_tokens=6)
+        )
+        mock_openai.AsyncOpenAI.return_value = mock_async
+        with patch.dict("sys.modules", {"openai": mock_openai}):
+            llm = OpenRouterLLM(model="openai/gpt-4o", api_key="k")
+            ctx = Context()
+            msgs = [ChatMessage(role="user", content="hi")]
+            await llm.ainvoke_messages(msgs, ctx=ctx)
+
+        assert ctx.usage.prompt_tokens == 12
+        assert ctx.usage.completion_tokens == 6
+
+    @pytest.mark.asyncio
+    async def test_aembed_query_records_embedding_tokens(self):
+        mock_openai = MagicMock()
+        mock_async = MagicMock()
+        mock_async.embeddings.create = AsyncMock(
+            return_value=_openai_embed_resp([[0.3, 0.4]], prompt_tokens=18)
+        )
+        mock_openai.AsyncOpenAI.return_value = mock_async
+        with patch.dict("sys.modules", {"openai": mock_openai}):
+            embedder = OpenRouterEmbedder(model="text-embedding-ada-002", api_key="k")
+            ctx = Context()
+            result = await embedder.aembed_query("hello", ctx=ctx)
+
+        assert result == [0.3, 0.4]
+        assert ctx.usage.embedding_tokens == 18
+
+    @pytest.mark.asyncio
+    async def test_aembed_query_no_ctx_backward_compat(self):
+        mock_openai = MagicMock()
+        mock_async = MagicMock()
+        mock_async.embeddings.create = AsyncMock(
+            return_value=_openai_embed_resp([[0.3, 0.4]])
+        )
+        mock_openai.AsyncOpenAI.return_value = mock_async
+        with patch.dict("sys.modules", {"openai": mock_openai}):
+            embedder = OpenRouterEmbedder(model="text-embedding-ada-002", api_key="k")
+            result = await embedder.aembed_query("hello")  # no ctx
+
+        assert result == [0.3, 0.4]
+
+    @pytest.mark.asyncio
+    async def test_usage_none_in_response_safe(self):
+        mock_openai = MagicMock()
+        mock_async = MagicMock()
+        resp = _openai_resp()
+        resp.usage = None
+        mock_async.chat.completions.create = AsyncMock(return_value=resp)
+        mock_openai.AsyncOpenAI.return_value = mock_async
+        with patch.dict("sys.modules", {"openai": mock_openai}):
+            llm = OpenRouterLLM(model="openai/gpt-4o", api_key="k")
+            ctx = Context()
+            await llm.ainvoke("hello", ctx=ctx)
+
+        assert ctx.usage.prompt_tokens == 0
+        assert ctx.usage.completion_tokens == 0
+
+
+# ── Usage extraction helpers ──────────────────────────────────────
+
+
+class TestUsageExtractionHelpers:
+    def test_litellm_extract_llm_usage_normal(self):
+        from graphrag_sdk.core.providers.litellm import _extract_llm_usage
+        resp = _litellm_resp(prompt_tokens=42, completion_tokens=17)
+        pt, ct = _extract_llm_usage(resp)
+        assert pt == 42
+        assert ct == 17
+
+    def test_litellm_extract_llm_usage_none(self):
+        from graphrag_sdk.core.providers.litellm import _extract_llm_usage
+        resp = MagicMock()
+        resp.usage = None
+        pt, ct = _extract_llm_usage(resp)
+        assert pt == 0
+        assert ct == 0
+
+    def test_litellm_extract_embedding_usage_normal(self):
+        from graphrag_sdk.core.providers.litellm import _extract_embedding_usage
+        resp = _litellm_embed_resp([[0.1]], prompt_tokens=33)
+        assert _extract_embedding_usage(resp) == 33
+
+    def test_litellm_extract_embedding_usage_none(self):
+        from graphrag_sdk.core.providers.litellm import _extract_embedding_usage
+        resp = MagicMock()
+        resp.usage = None
+        assert _extract_embedding_usage(resp) == 0
+
+    def test_openrouter_extract_llm_usage_normal(self):
+        from graphrag_sdk.core.providers.openrouter import _extract_openai_llm_usage
+        resp = _openai_resp(prompt_tokens=99, completion_tokens=11)
+        pt, ct = _extract_openai_llm_usage(resp)
+        assert pt == 99
+        assert ct == 11
+
+    def test_openrouter_extract_embedding_usage_normal(self):
+        from graphrag_sdk.core.providers.openrouter import _extract_openai_embedding_usage
+        resp = _openai_embed_resp([[0.1]], prompt_tokens=77)
+        assert _extract_openai_embedding_usage(resp) == 77
+
+    def test_openrouter_extract_with_missing_attribute(self):
+        from graphrag_sdk.core.providers.openrouter import _extract_openai_llm_usage
+        resp = MagicMock(spec=[])  # no attributes
+        pt, ct = _extract_openai_llm_usage(resp)
+        assert pt == 0
+        assert ct == 0
+
+
+# ── Public export ─────────────────────────────────────────────────
+
+
+class TestPublicExport:
+    def test_token_usage_importable_from_top_level(self):
+        from graphrag_sdk import TokenUsage  # noqa: F401
+        assert TokenUsage is not None
+
+    def test_token_usage_in_all(self):
+        import graphrag_sdk
+        assert "TokenUsage" in graphrag_sdk.__all__
+
+    def test_ingestion_result_in_all(self):
+        import graphrag_sdk
+        assert "IngestionResult" in graphrag_sdk.__all__
+        assert "RagResult" in graphrag_sdk.__all__
+        assert "RetrieverResult" in graphrag_sdk.__all__
+
+
+# ── End-to-end accumulation scenario (no real LLM) ───────────────
+
+
+class TestEndToEndAccumulation:
+    @pytest.mark.asyncio
+    async def test_mixed_llm_and_embed_accumulate_in_single_ctx(self):
+        """Simulate a retrieve+complete flow accumulating into one ctx."""
+        ctx = Context()
+
+        # Step 1: embed query (simulate retrieval)
+        ctx.record_usage(embedding_tokens=50)
+
+        # Step 2: keyword extraction LLM call
+        ctx.record_usage(prompt_tokens=200, completion_tokens=10)
+
+        # Step 3: final answer LLM call
+        ctx.record_usage(prompt_tokens=1500, completion_tokens=300)
+
+        assert ctx.usage.prompt_tokens == 1700
+        assert ctx.usage.completion_tokens == 310
+        assert ctx.usage.embedding_tokens == 50
+
+        # Snapshot into RagResult
+        result = RagResult(answer="The answer", usage=ctx.usage)
+        assert result.usage.prompt_tokens == 1700
+        assert result.usage.completion_tokens == 310
+        assert result.usage.embedding_tokens == 50
+
+    @pytest.mark.asyncio
+    async def test_ingest_flow_accumulation(self):
+        """Simulate ingest: chunking LLM + extraction LLM + embedding."""
+        ctx = Context()
+
+        # ContextualChunking: 3 chunks × ~100 tokens each
+        ctx.record_usage(prompt_tokens=300, completion_tokens=90)
+
+        # GraphExtraction step1 NER + step2 verify
+        ctx.record_usage(prompt_tokens=2000, completion_tokens=800)
+
+        # Embedding 3 chunks
+        ctx.record_usage(embedding_tokens=150)
+
+        result = IngestionResult(
+            nodes_created=5,
+            chunks_indexed=3,
+            usage=ctx.usage,
+        )
+        assert result.usage.prompt_tokens == 2300
+        assert result.usage.completion_tokens == 890
+        assert result.usage.embedding_tokens == 150
+        # original fields unchanged
+        assert result.nodes_created == 5
+        assert result.chunks_indexed == 3

From 9abd03b15c92c9c5b34e376235b411142e2a3d0c Mon Sep 17 00:00:00 2001
From: drr00t <adribeiro@gmail.com>
Date: Sun, 10 May 2026 20:55:43 -0300
Subject: [PATCH 2/8] Refactor code structure for improved readability and
 maintainability

---
 .../src/graphrag_sdk/core/providers/base.py   | 20 ++++++++++++-------
 graphrag_sdk/tests/test_token_usage.py        | 15 ++++++++++++++
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
index 98d01b47..b55e8dc2 100644
--- a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
+++ b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
@@ -56,7 +56,7 @@ def model_name(self) -> str:
         ...
 
     @abstractmethod
-    def embed_query(self, text: str, **kwargs: Any) -> list[float]:
+    def embed_query(self, text: str, *, ctx: Any | None = None, **kwargs: Any) -> list[float]:
         """Embed a single text string into a float vector."""
         ...
 
@@ -72,9 +72,15 @@ async def aembed_query(
         """
         return await asyncio.to_thread(self.embed_query, text, **kwargs)
 
-    def embed_documents(self, texts: list[str], **kwargs: Any) -> list[list[float]]:
-        """Batch embed multiple texts. Default: sequential fallback."""
-        return [self.embed_query(t, **kwargs) for t in texts]
+    def embed_documents(self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any) -> list[list[float]]:
+        """Batch embed multiple texts. Default: sequential fallback.
+        
+        Args:
+            ctx: Execution context for usage tracking.
+        """
+        if ctx is None:
+            return [self.embed_query(t, **kwargs) for t in texts]
+        return [self.embed_query(t, ctx=ctx, **kwargs) for t in texts]
 
     async def aembed_documents(
         self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
@@ -107,7 +113,7 @@ def __init__(
         self.max_concurrency = max_concurrency
 
     @abstractmethod
-    def invoke(self, prompt: str, **kwargs: Any) -> LLMResponse:
+    def invoke(self, prompt: str, *, ctx: Any | None = None, **kwargs: Any) -> LLMResponse:
         """Synchronous text-in / text-out invocation."""
         ...
 
@@ -182,9 +188,9 @@ async def ainvoke_messages(
         prompt = "\n\n".join(parts)
         return await self.ainvoke(prompt, ctx=ctx, max_retries=max_retries, **kwargs)
 
-    async def astream(self, prompt: str, **kwargs: Any) -> AsyncIterator[str]:
+    async def astream(self, prompt: str, *, ctx: Any | None = None, **kwargs: Any) -> AsyncIterator[str]:
         """Async streaming — default yields the full response as one chunk."""
-        resp = await self.ainvoke(prompt, **kwargs)
+        resp = await self.ainvoke(prompt, ctx=ctx, **kwargs)
         yield resp.content
 
     def invoke_with_model(
diff --git a/graphrag_sdk/tests/test_token_usage.py b/graphrag_sdk/tests/test_token_usage.py
index 150b9200..aabe3ae2 100644
--- a/graphrag_sdk/tests/test_token_usage.py
+++ b/graphrag_sdk/tests/test_token_usage.py
@@ -388,6 +388,21 @@ async def test_usage_none_in_response_safe(self):
         assert ctx.usage.prompt_tokens == 0
         assert ctx.usage.completion_tokens == 0
 
+    @pytest.mark.asyncio
+    async def test_astream_records_usage(self):
+        mock_litellm = MagicMock()
+        mock_litellm.acompletion = AsyncMock(
+            return_value=_litellm_resp(prompt_tokens=40, completion_tokens=20)
+        )
+        with patch.dict("sys.modules", {"litellm": mock_litellm}):
+            llm = LiteLLM(model="gpt-4")
+            ctx = Context()
+            async for chunk in llm.astream("hello", ctx=ctx):
+                pass
+        
+        assert ctx.usage.prompt_tokens == 40
+        assert ctx.usage.completion_tokens == 20
+
     @pytest.mark.asyncio
     async def test_abatch_invoke_accumulates_all_items(self):
         call_count = {"n": 0}

From fadba63431b96afb151c56def341db98e6a469fa Mon Sep 17 00:00:00 2001
From: drr00t <adribeiro@gmail.com>
Date: Sun, 10 May 2026 21:19:25 -0300
Subject: [PATCH 3/8] refactor: improve formatting of method signatures for
 better readability

---
 graphrag_sdk/src/graphrag_sdk/core/providers/base.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
index b55e8dc2..a3a86847 100644
--- a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
+++ b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
@@ -72,9 +72,11 @@ async def aembed_query(
         """
         return await asyncio.to_thread(self.embed_query, text, **kwargs)
 
-    def embed_documents(self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any) -> list[list[float]]:
+    def embed_documents(
+        self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
+    ) -> list[list[float]]:
         """Batch embed multiple texts. Default: sequential fallback.
-        
+
         Args:
             ctx: Execution context for usage tracking.
         """
@@ -188,7 +190,9 @@ async def ainvoke_messages(
         prompt = "\n\n".join(parts)
         return await self.ainvoke(prompt, ctx=ctx, max_retries=max_retries, **kwargs)
 
-    async def astream(self, prompt: str, *, ctx: Any | None = None, **kwargs: Any) -> AsyncIterator[str]:
+    async def astream(
+        self, prompt: str, *, ctx: Any | None = None, **kwargs: Any
+    ) -> AsyncIterator[str]:
         """Async streaming — default yields the full response as one chunk."""
         resp = await self.ainvoke(prompt, ctx=ctx, **kwargs)
         yield resp.content

From dfe4eaa620789e9f906b7fab20282bce00c7705a Mon Sep 17 00:00:00 2001
From: drr00t <adribeiro@gmail.com>
Date: Sun, 10 May 2026 22:06:48 -0300
Subject: [PATCH 4/8] fix: enhance token usage tracking to isolate retrieval
 and generation tokens in completion results

---
 graphrag_sdk/src/graphrag_sdk/api/main.py |  2 +-
 graphrag_sdk/tests/test_facade.py         | 54 ++++++++++++++++-------
 2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/graphrag_sdk/src/graphrag_sdk/api/main.py b/graphrag_sdk/src/graphrag_sdk/api/main.py
index 9f599037..ee9d3cb9 100644
--- a/graphrag_sdk/src/graphrag_sdk/api/main.py
+++ b/graphrag_sdk/src/graphrag_sdk/api/main.py
@@ -550,7 +550,7 @@ async def retrieve(
 
         ctx.log(f"Retrieved {len(retriever_result.items)} context items")
         # Attach accumulated token usage to the result
-        retriever_result.usage = ctx.usage
+        retriever_result.usage = ctx.usage.model_copy()
         return retriever_result
 
     # ── Completion ──────────────────────────────────────────────
diff --git a/graphrag_sdk/tests/test_facade.py b/graphrag_sdk/tests/test_facade.py
index 2e4a22ae..29adcb4c 100644
--- a/graphrag_sdk/tests/test_facade.py
+++ b/graphrag_sdk/tests/test_facade.py
@@ -12,10 +12,12 @@
 from graphrag_sdk.core.models import (
     ChatMessage,
     IngestionResult,
+    LLMResponse,
     RagResult,
     RawSearchResult,
     RetrieverResult,
     RetrieverResultItem,
+    TokenUsage,
 )
 from graphrag_sdk.retrieval.strategies.base import RetrievalStrategy
 
@@ -676,25 +678,47 @@ async def test_completion_rewrite_question_enabled(self, mock_conn, embedder):
         assert result.metadata["retrieval_query"] == "Where did Jane Doe go to college?"
         assert result.answer == "She attended Stanford University."
 
-    async def test_completion_rewrite_fallback_on_empty(self, mock_conn, embedder):
-        """If the rewrite LLM returns empty, fall back to the original question."""
-        llm = MockLLM(responses=["", "Some answer."])
+    async def test_completion_usage_isolation(self, mock_conn, embedder):
+        """
+        Verify that retriever_result.usage is a snapshot and does not 
+        incorrectly include tokens from the final generation step.
+        """
+        llm = MockLLM(responses=["The answer is 42."])
         g = GraphRAG(connection=mock_conn, llm=llm, embedder=embedder, embedding_dimension=8)
+        
+        # Mock strategy to record some retrieval usage
         mock_strategy = MagicMock(spec=RetrievalStrategy)
-        mock_strategy.search = AsyncMock(
-            return_value=RetrieverResult(items=[RetrieverResultItem(content="c")])
-        )
+        
+        # The facade calls strategy.search(question, ctx).
+        # We must monkeypatch .search to record usage into the provided ctx,
+        # because that's how the real SDK works.
+        async def mock_search(question, ctx, **kwargs):
+            ctx.record_usage(prompt_tokens=100, embedding_tokens=50)
+            return RetrieverResult(
+                items=[RetrieverResultItem(content="context")]
+            )
+        
+        mock_strategy.search = AsyncMock(side_effect=mock_search)
         g._retrieval_strategy = mock_strategy
 
-        result = await g.completion(
-            "where did she go?",
-            history=[{"role": "user", "content": "Who?"}, {"role": "assistant", "content": "Jane."}],
-            rewrite_question_with_history=True,
-        )
-        # Empty rewrite → original question used for retrieval
-        call_args = mock_strategy.search.call_args
-        assert call_args[0][0] == "where did she go?"
-        assert result.metadata["retrieval_query"] == "where did she go?"
+        async def mock_ainvoke_messages(messages, ctx=None, **kwargs):
+            if ctx:
+                ctx.record_usage(prompt_tokens=200, completion_tokens=50)
+            return LLMResponse(content="The answer is 42.")
+        
+        llm.ainvoke_messages = AsyncMock(side_effect=mock_ainvoke_messages)
+
+        result = await g.completion("What is the answer?", return_context=True)
+        
+        # 1. Retriever result usage should ONLY have retrieval tokens (prompt=100, embed=50)
+        assert result.retriever_result.usage.prompt_tokens == 100
+        assert result.retriever_result.usage.completion_tokens == 0
+        
+        # 2. Final RagResult usage should have BOTH retrieval and generation tokens
+        # (100 + 200 prompt, 0 + 50 completion)
+        assert result.usage.prompt_tokens == 300
+        assert result.usage.completion_tokens == 50
+
 
     async def test_completion_custom_prompt_template_with_history(self, mock_conn, embedder):
         """UI agent's use case: citation-style template works in multi-turn mode."""

From 5dc520cc749a4264aadf6137aa9d1c09c32d8bed Mon Sep 17 00:00:00 2001
From: drr00t <adribeiro@gmail.com>
Date: Sun, 10 May 2026 22:21:37 -0300
Subject: [PATCH 5/8] fix: update changelog to include token usage tracking and
 remove outdated entries

---
 CHANGELOG.md | 53 +---------------------------------------------------
 1 file changed, 1 insertion(+), 52 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d71c579..40014fc8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,57 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-<<<<<<< HEAD
-## [1.0.2] - 2026-05-04
-
-Patch release. One retrieval correctness fix and one default-value
-change carried over from the post-1.0.1 README onboarding work.
-
-### Fixed
-
-- **Chunk citations preserve the full `Document.path`.** The chunk
-  retrieval strategy was reducing the path returned from the graph
-  to a basename via `path.rsplit("/", 1)[-1]` before handing it off
-  to the citation pipeline. That dropped real information: files
-  sharing a basename across directories — e.g. `operations/index.md`
-  vs `commands/index.md` — collapsed to the same identifier
-  downstream, and consumers building source links from the citation
-  could no longer reconstruct the original location. `Document.path`
-  already stored the full path passed to `rag.ingest()`, so this is
-  a read-side fix only; existing graphs start emitting full paths in
-  the next query with no migration required.
-=======
-### Added
-
-- **Token usage tracking on all public response objects (#227).** `IngestionResult`,
-  `RagResult`, and `RetrieverResult` now expose a `usage: TokenUsage` field
-  that reports the total `prompt_tokens`, `completion_tokens`, and
-  `embedding_tokens` consumed by the operation. `TokenUsage` is exported from
-  the top-level package and supports `+` / `+=` for easy aggregation over
-  batch results.
-
-  ```python
-  result = await rag.completion("Who is Alice?")
-  print(result.usage.prompt_tokens)      # LLM input tokens
-  print(result.usage.completion_tokens)  # LLM output tokens
-  print(result.usage.embedding_tokens)   # embedding tokens
-  ```
-
-  **Implementation notes:**
-  - Async provider methods (`ainvoke`, `ainvoke_messages`, `aembed_query`,
-    `aembed_documents`, `abatch_invoke`) now accept an optional keyword-only
-    `ctx: Context | None = None` parameter. Usage is recorded into the
-    accumulator at `ctx.usage` via `ctx.record_usage()`.
-  - `VectorStore.index_chunks()` accepts the same optional `ctx` and forwards
-    it to the embedder.
-  - Custom providers that do not override these methods, and all callers that
-    omit `ctx`, continue to work exactly as before — the change is fully
-    backward-compatible.
-  - 51 new unit tests in `tests/test_token_usage.py` covering model arithmetic,
-    context accumulation, provider instrumentation, and backward compatibility.
-  - See [docs/token-usage.md](docs/token-usage.md) for the full guide.
->>>>>>> 4455125 (feat: implement built-in token usage tracking and cost observability across the RAG pipeline)
-
 ## [1.0.2] - 2026-05-04
 
 Patch release. One retrieval correctness fix and one default-value
@@ -310,4 +259,4 @@ to this version by default. Legacy v0.x users can pin `graphrag-sdk==0.8.2`.
 ### Fixed
 
 - `hnswlib` import guard in SemanticResolution and LLMVerifiedResolution — raises clear `ImportError` instead of `AttributeError` when hnswlib is not installed.
-- 14 ruff lint errors (import sorting, line length) resolved; CI no longer ignores lint rules.
+- 14 ruff lint errors (import sorting, line length) resolved; CI no longer ignores lint rules.
\ No newline at end of file

From 869db43f13c750c6e8e959f5c8003deb4b42f619 Mon Sep 17 00:00:00 2001
From: drr00t <adribeiro@gmail.com>
Date: Sun, 10 May 2026 22:28:53 -0300
Subject: [PATCH 6/8] refactor: simplify method signatures in Embedder and
 LLMInterface classes

---
 graphrag_sdk/src/graphrag_sdk/core/providers/base.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
index a3a86847..ac87ec96 100644
--- a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
+++ b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
@@ -56,7 +56,7 @@ def model_name(self) -> str:
         ...
 
     @abstractmethod
-    def embed_query(self, text: str, *, ctx: Any | None = None, **kwargs: Any) -> list[float]:
+    def embed_query(self, text: str, **kwargs: Any) -> list[float]:
         """Embed a single text string into a float vector."""
         ...
 
@@ -76,13 +76,14 @@ def embed_documents(
         self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
     ) -> list[list[float]]:
         """Batch embed multiple texts. Default: sequential fallback.
-
+S
         Args:
             ctx: Execution context for usage tracking.
         """
         if ctx is None:
             return [self.embed_query(t, **kwargs) for t in texts]
-        return [self.embed_query(t, ctx=ctx, **kwargs) for t in texts]
+        return [self.embed_query(t, **kwargs) for t in texts]
+
 
     async def aembed_documents(
         self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
@@ -115,7 +116,7 @@ def __init__(
         self.max_concurrency = max_concurrency
 
     @abstractmethod
-    def invoke(self, prompt: str, *, ctx: Any | None = None, **kwargs: Any) -> LLMResponse:
+    def invoke(self, prompt: str, **kwargs: Any) -> LLMResponse:
         """Synchronous text-in / text-out invocation."""
         ...
 

From 83c9eca823c96bc0d3287dc66c3d0694f3d0f5dc Mon Sep 17 00:00:00 2001
From: drr00t <adribeiro@gmail.com>
Date: Sun, 10 May 2026 22:30:52 -0300
Subject: [PATCH 7/8] fix: correct formatting in embed_documents method
 docstring

---
 graphrag_sdk/src/graphrag_sdk/core/providers/base.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
index ac87ec96..7edd945a 100644
--- a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
+++ b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
@@ -76,15 +76,14 @@ def embed_documents(
         self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
     ) -> list[list[float]]:
         """Batch embed multiple texts. Default: sequential fallback.
-S
-        Args:
-            ctx: Execution context for usage tracking.
+        S
+                Args:
+                    ctx: Execution context for usage tracking.
         """
         if ctx is None:
             return [self.embed_query(t, **kwargs) for t in texts]
         return [self.embed_query(t, **kwargs) for t in texts]
 
-
     async def aembed_documents(
         self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
     ) -> list[list[float]]:

From c2566660ff63eac5ffa4cd8f2b4f85e1769eedcf Mon Sep 17 00:00:00 2001
From: drr00t <adribeiro@gmail.com>
Date: Sun, 10 May 2026 22:43:55 -0300
Subject: [PATCH 8/8] fix: pass context parameter to embed_documents in
 aembed_documents method

---
 graphrag_sdk/src/graphrag_sdk/core/providers/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
index 7edd945a..ed2cfb7b 100644
--- a/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
+++ b/graphrag_sdk/src/graphrag_sdk/core/providers/base.py
@@ -88,7 +88,7 @@ async def aembed_documents(
         self, texts: list[str], *, ctx: Any | None = None, **kwargs: Any
     ) -> list[list[float]]:
         """Async batch embed. Default: sync-in-thread."""
-        return await asyncio.to_thread(self.embed_documents, texts, **kwargs)
+        return await asyncio.to_thread(self.embed_documents, texts, ctx=ctx, **kwargs)
 
 
 class LLMInterface(ABC):