
Commit 0e17331

Merge pull request #978 from krissetto/semantic-embeddings

RAG: Semantic embeddings strategy

2 parents: 1cc965e + 0b5f357

8 files changed: 1018 additions & 6 deletions


README.md
Lines changed: 2 additions & 2 deletions

@@ -146,7 +146,7 @@ See [MCP Mode documentation](./docs/MCP-MODE.md) for detailed instructions on ex
 - **💭 Advanced reasoning** - Built-in "think", "todo" and "memory" tools for
   complex problem-solving.
 - **🔍 RAG (Retrieval-Augmented Generation)** - Pluggable retrieval strategies
-  (chunked_embeddings, BM25, more to come..) with hybrid retrieval, fusion, and result reranking support.
+  (BM25, chunked-embeddings, semantic-embeddings) with hybrid retrieval, result fusion and reranking support.
 - **🌐 Multiple AI providers** - Support for OpenAI, Anthropic, Gemini, xAI,
   Mistral, Nebius and [Docker Model
   Runner](https://docs.docker.com/ai/model-runner/).
@@ -369,7 +369,7 @@ agents:
 ```
 
 **Features:**
-- **Multiple strategies**: Vector (semantic), BM25 (keyword), or both
+- **Multiple strategies**: Vector embeddings, semantic embeddings, BM25 (keyword), or combinations
 - **Parallel execution**: Strategies run concurrently for fast results
 - **Pluggable fusion**: RRF, weighted, or max score combining
 - **Result reranking**: Re-score results with specialized models for improved relevance
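The hybrid retrieval and reranking called out in these README features come together in a single `rag` block. The following is only a rough sketch assembled from the configuration keys documented in docs/USAGE.md further down; the tool name, paths, and models are illustrative placeholders, and fusion settings are left at their defaults rather than guessed:

```yaml
# Hybrid retrieval sketch: two strategies run in parallel, their results
# are fused, then reranked. Keys follow docs/USAGE.md; the names and paths
# below are placeholders, not part of this commit.
rag:
  docs_search:
    docs: [./docs]
    strategies:
      - type: chunked-embeddings          # vector similarity over raw chunks
        embedding_model: openai/text-embedding-3-small
        vector_dimensions: 1536
      - type: bm25                        # keyword matching
        k1: 1.5
    results:
      reranking:
        model: openai/gpt-4o-mini         # re-scores the fused candidates
      limit: 5
```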

cagent-schema.json
Lines changed: 38 additions & 2 deletions

@@ -725,12 +725,13 @@
         "description": "Retrieval strategy type",
         "enum": [
           "bm25",
-          "chunked-embeddings"
+          "chunked-embeddings",
+          "semantic-embeddings"
         ]
       },
       "embedding_model": {
         "type": "string",
-        "description": "Embedding model reference for chunked-embeddings strategies (looked up in models map, or 'auto' for automatic selection)",
+        "description": "Embedding model reference for chunked-embeddings and semantic-embeddings strategies (looked up in models map, or 'auto' for automatic selection)",
         "examples": [
           "openai/text-embedding-3-small",
           "dmr/embeddinggemma",
@@ -811,6 +812,41 @@
         }
       },
       "additionalProperties": false
+      },
+      "embedding_batch_size": {
+        "type": "integer",
+        "description": "Number of text chunks to send to the embedding API in a single request (chunked-embeddings/semantic-embeddings only)",
+        "minimum": 1,
+        "default": 50
+      },
+      "max_embedding_concurrency": {
+        "type": "integer",
+        "description": "Maximum concurrent embedding batch API requests. For semantic-embeddings, also controls parallel LLM calls for generating chunk summaries.",
+        "minimum": 1,
+        "default": 3
+      },
+      "max_indexing_concurrency": {
+        "type": "integer",
+        "description": "Maximum number of files to index in parallel during initialization",
+        "minimum": 1,
+        "default": 3
+      },
+      "chat_model": {
+        "type": "string",
+        "description": "Chat model used to generate semantic representations for each chunk (semantic-embeddings only, required)",
+        "examples": [
+          "anthropic/claude-sonnet-4-5",
+          "openai/gpt-4o-mini"
+        ]
+      },
+      "semantic_prompt": {
+        "type": "string",
+        "description": "Custom prompt template for the semantic LLM. Uses JavaScript template literal syntax with the following placeholders: ${path} (full source file path), ${basename} (base name of file), ${chunk_index} (numeric chunk index), ${content} (raw chunk content), ${ast_context} (AST metadata when ast_context is enabled). Only applicable to the semantic-embeddings strategy."
+      },
+      "ast_context": {
+        "type": "boolean",
+        "description": "Include TreeSitter-derived AST metadata in the semantic prompt (semantic-embeddings only, requires chunking.code_aware for best results)",
+        "default": false
       }
     },
     "additionalProperties": true

docs/USAGE.md
Lines changed: 52 additions & 1 deletion

@@ -680,6 +680,39 @@ rag:
 
 **Best for:** Understanding intent, synonyms, paraphrasing, multilingual queries
 
+#### Semantic-Embeddings Strategy (LLM-Enhanced Semantic Search)
+
+Uses an LLM to generate semantic summaries of each chunk before embedding, capturing meaning and intent rather than raw text:
+
+```yaml
+rag:
+  code_search:
+    docs: [./src, ./pkg]
+    strategies:
+      - type: semantic-embeddings
+        embedding_model: openai/text-embedding-3-small
+        vector_dimensions: 1536
+        chat_model: openai/gpt-4o-mini  # LLM for generating summaries
+        database: ./semantic.db
+        threshold: 0.3
+        limit: 10
+        ast_context: true  # include AST metadata in prompts
+        chunking:
+          size: 1000
+          code_aware: true  # AST-aware chunking for best results
+```
+
+**Best for:** Code search, understanding intent, finding implementations by what they do rather than by their exact names
+
+**Trade-offs:** Higher-quality retrieval, but slower indexing (one LLM call per chunk) and additional API costs
+
+**Parameters:**
+- `embedding_model` (required): Embedding model for vector similarity
+- `chat_model` (required): Chat model to generate semantic summaries
+- `vector_dimensions` (required): Embedding vector dimensions
+- `semantic_prompt`: Custom prompt template (uses `${path}`, `${content}`, `${ast_context}` placeholders)
+- `ast_context`: Include TreeSitter AST metadata in prompts (default: `false`)
+
 #### BM25 Strategy (Keyword Search)
 
 Uses traditional keyword matching:
@@ -945,7 +978,7 @@ models:
 | `results` | object | Post-processing configuration |
 
 **Strategy Configuration:**
-- `type`: Strategy type (`chunked-embeddings`, `bm25`)
+- `type`: Strategy type (`chunked-embeddings`, `semantic-embeddings`, `bm25`)
 - `docs`: Strategy-specific documents (optional, augments shared docs)
 - `database`: Database configuration (path to local sqlite db)
 - `chunking`: Chunking configuration
@@ -963,6 +996,23 @@ models:
 - `chunking.size`: Chunk size in characters (default: `1000`)
 - `chunking.overlap`: Overlap between chunks (default: `75`)
 
+**Semantic-Embeddings Strategy:**
+- `embedding_model` (required): Embedding model reference (e.g., `openai/text-embedding-3-small`)
+- `chat_model` (required): Chat model for generating semantic summaries (e.g., `openai/gpt-4o-mini`)
+- `vector_dimensions` (required): Embedding vector dimensions (e.g., `1536` for text-embedding-3-small)
+- `database`: Database configuration (same formats as chunked-embeddings)
+- `semantic_prompt`: Custom prompt template using JS template literals (`${path}`, `${basename}`, `${chunk_index}`, `${content}`, `${ast_context}`)
+- `ast_context`: Include TreeSitter AST metadata in semantic prompts; useful for code (default: `false`, works best with `code_aware` chunking)
+- `similarity_metric`: Similarity metric (default: `cosine_similarity`)
+- `threshold`: Minimum similarity (0–1, default: `0.5`)
+- `limit`: Max candidates this strategy contributes as fusion input (default: `5`)
+- `embedding_batch_size`: Chunks per embedding request (default: `50`)
+- `max_embedding_concurrency`: Concurrent embedding/LLM requests (default: `3`)
+- `max_indexing_concurrency`: Concurrent file indexing (default: `3`)
+- `chunking.size`: Chunk size in characters (default: `1000`)
+- `chunking.overlap`: Overlap between chunks (default: `75`)
+- `chunking.code_aware`: Use AST-based chunking (default: `false`; if `true`, `chunking.overlap` is ignored)
+
 **BM25 Strategy:**
 - `database`: Database configuration (same formats as chunked-embeddings)
 - `k1`: Term frequency saturation (recommended range: `1.2–2.0`, default: `1.5`)
@@ -1029,6 +1079,7 @@ Look for logs tagged with:
 See `examples/rag/` directory:
 - `examples/rag/bm25.yaml` - BM25 strategy only
 - `examples/rag/hybrid.yaml` - Hybrid retrieval (chunked-embeddings + BM25)
+- `examples/rag/semantic_embeddings.yaml` - Semantic-embeddings strategy with LLM summaries
 - `examples/rag/reranking.yaml` - Reranking with various providers
 - `examples/rag/reranking_full_example.yaml` - Complete reranking configuration reference
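Beyond the code-summary prompt shipped in the example file below, the documented `semantic_prompt` placeholders also suit prose-heavy corpora. A hedged sketch of a documentation-oriented prompt, assuming the same placeholder expansion described above, with the model names as placeholders:

```yaml
strategies:
  - type: semantic-embeddings
    embedding_model: openai/text-embedding-3-small
    vector_dimensions: 1536
    chat_model: openai/gpt-4o-mini
    # Prompt tuned for prose documentation rather than source code;
    # placeholders are expanded per chunk at indexing time.
    semantic_prompt: |
      Summarize the following excerpt for retrieval.

      Source: ${path} (chunk ${chunk_index})

      ${content}

      In 2-3 sentences, state the topic, the key claims or instructions,
      and any terms a reader might search for.
```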

examples/rag/semantic_embeddings.yaml
Lines changed: 97 additions & 0 deletions

@@ -0,0 +1,97 @@
+# This example demonstrates the semantic-embeddings RAG strategy.
+#
+# Unlike chunked-embeddings, which embeds raw text chunks directly,
+# semantic-embeddings uses an LLM to generate semantic summaries of each
+# chunk before embedding. This captures the meaning/purpose of code,
+# making retrieval more semantic than direct chunk embedding.
+#
+# Trade-offs:
+# - Higher quality retrieval for code and structured content
+# - Slower indexing (requires an LLM call per chunk)
+# - Additional cost from semantic model API calls
+
+agents:
+  root:
+    model: openai/gpt-4o
+    description: assistant with semantic code search
+    instruction: |
+      You are a helpful coding assistant with access to semantic code search.
+      Use the search tool to find relevant code based on meaning, not just keywords.
+    rag:
+      - codebase
+
+rag:
+  codebase:
+    tool:
+      description: Search the codebase for relevant code snippets by semantic meaning
+    docs:
+      - ../../pkg/**/*.go
+      - ../../cmd/**/*.go
+    strategies:
+      - type: semantic-embeddings
+        # Required: embedding model for vector similarity
+        embedding_model: openai/text-embedding-3-small
+        vector_dimensions: 1536
+
+        # Required: chat model to generate semantic summaries of each chunk
+        chat_model: openai/gpt-4o-mini
+
+        # Custom prompt template for generating semantic summaries during indexing.
+        # Uses JS template literal syntax with these placeholders:
+        #   ${path}        - full source file path
+        #   ${basename}    - base name of the source file
+        #   ${chunk_index} - numeric index of the chunk
+        #   ${content}     - raw chunk content
+        #   ${ast_context} - formatted AST metadata (when ast_context: true)
+        semantic_prompt: |
+          You are summarizing source code for semantic search.
+
+          File: ${basename}
+          ${ast_context}
+
+          ```
+          ${content}
+          ```
+
+          In 2-4 sentences, explain what this code does. Be specific:
+          - Name exact functions, types, and methods
+          - Mention key dependencies or libraries used
+          - Describe inputs, outputs, and notable behavior
+
+        # Optional: database path (defaults to an auto-generated name)
+        database: ./semantic_embeddings.db
+
+        # Optional: similarity settings
+        similarity_metric: cosine_similarity
+        threshold: 0.3
+        limit: 10
+
+        # Optional: performance tuning
+        embedding_batch_size: 50        # chunks per embedding API call
+        max_embedding_concurrency: 3    # parallel embedding/LLM requests
+        max_indexing_concurrency: 3     # parallel file indexing
+
+        # Optional: include AST metadata in the semantic prompt (best with code_aware chunking)
+        ast_context: true
+
+        # Optional: chunking configuration
+        chunking:
+          size: 1000
+          respect_word_boundaries: true
+          code_aware: true   # use tree-sitter for AST-aware chunking
+
+    results:
+      # Optional: rerank results using an LLM for better relevance
+      reranking:
+        model: openai/gpt-4o-mini
+        threshold: 0.3
+        # Custom criteria to guide the reranking model's relevance scoring
+        criteria: |
+          When scoring relevance, prioritize:
+          - Code that directly implements the queried functionality
+          - Functions and methods over comments or documentation
+          - Complete implementations over partial snippets
+      deduplicate: true
+      return_full_content: false   # when true, return full document content instead of only the matched chunks
+      limit: 5