From b6f96c2c891042fddd78a51dc1a7024f700857ca Mon Sep 17 00:00:00 2001
From: Anthony Bible <anthony@anthony.bible>
Date: Sat, 29 Nov 2025 08:23:59 -0700
Subject: [PATCH] chore(config): document and expand base, dev, and prod
 configs

- add explanatory comments and default values for api, worker, database, nats, log, batch_processing, security, git, and token_counting
- tune environment-specific settings (dev: lower concurrency, shorter backoffs, dev-friendly batch/poller/rate limits; prod: higher concurrency, larger batch sizes, stricter security/rate limits)
- add gemini batch config and clarifying env var notes
- update wiki submodule to f321068
---
 configs/config.dev.yaml  |  80 ++++++++++++++++++------------
 configs/config.prod.yaml | 102 +++++++++++++++++++++++++++++++--------
 configs/config.yaml      |  85 ++++++++++++++++++++++++--------
 wiki                     |   2 +-
 4 files changed, 198 insertions(+), 71 deletions(-)

diff --git a/configs/config.dev.yaml b/configs/config.dev.yaml
index f31be8e..2aec0b9 100644
--- a/configs/config.dev.yaml
+++ b/configs/config.dev.yaml
@@ -1,30 +1,20 @@
 # Development configuration overrides
+# See wiki/configuration-reference.md for complete documentation
+
 database:
-  user: dev
-  password: dev
+  user: dev                    # Development database username
+  password: dev                # Development database password
 
 log:
-  level: debug
-  format: text
+  level: debug                 # Verbose logging for development
+  format: text                 # Human-readable text format
 
-# Gemini API key should be set via environment variable:
-# export CODECHUNK_GEMINI_API_KEY=your-api-key
+# Development worker settings
+worker:
+  concurrency: 3               # Lower concurrency for debugging
 
-# Note: Timeout configuration is inherited from config.yaml
-# - gemini.timeout: 120s (per-embedding API call timeout)
-# - worker.job_timeout: 30m (overall job processing timeout)
-# These values allow sufficient time for embedding generation even with network slowdowns
-gemini:
-  batch:
-    enabled: true
-    input_dir: /tmp/batch_embeddings/input
-    output_dir: /tmp/batch_embeddings/output
-    poll_interval: 5s
-    max_wait_time: 30m
-
-# Enhanced batch processing for development
 batch_processing:
-  # Enable batch processing in development environment
+  # Enable batch processing but with developer-friendly settings
   enabled: true
   # Lower threshold for development to test batching with smaller repos
   threshold_chunks: 10
@@ -46,25 +36,55 @@ batch_processing:
       min: 25
       max: 200
       timeout: 20m
-  fallback_to_sequential: false
+  fallback_to_sequential: false    # Test batch processing without fallback
   queue_limits:
-    max_queue_size: 100000
+    max_queue_size: 10000
     max_wait_time: 15m
   default_priority: "background"
-  # Use test embeddings in development for faster testing without API calls
+  # Note: Set use_test_embeddings: true to avoid API costs during development
   use_test_embeddings: false
-  # Batch chunking strategy parameters (development values)
+  # Development-optimized batch settings
   max_batch_size: 300           # Smaller batch size for faster testing
   initial_backoff: 5s           # Shorter initial backoff for development
   max_backoff: 60s              # Shorter max backoff for development
   max_retries: 2                # Fewer retries in development
-  enable_batch_chunking: true   # Enable batch chunking in development
-  # Async batch job poller configuration
+  enable_batch_chunking: true
+  # Development poller configuration (more frequent for testing)
   poller_interval: 30s          # Poll Gemini batch jobs every 30 seconds
   max_concurrent_polls: 5       # Max concurrent batch job status checks
   # Token counting configuration
   token_counting:
-    enabled: true                    # Enable token counting
-    mode: "all"                      # Mode: "all", "sample", or "on_demand"
-    sample_percent: 10               # Percentage of chunks to sample (for "sample" mode)
-    max_tokens_per_chunk: 8192       # Maximum tokens per chunk (Gemini embedding model limit)
+    enabled: true                    # Enable token counting in development
+    mode: "all"                      # Count all chunks for accuracy
+    sample_percent: 10               # Not used in "all" mode
+    max_tokens_per_chunk: 8192       # Maximum tokens per chunk
+
+# Development security settings (more permissive)
+security:
+  rate_limit:
+    requests_per_minute: 300    # Higher rate limit for development
+    burst_size: 50              # Larger burst size
+    window_size: 1m
+    enabled: true               # Still enable rate limiting
+
+# Development git operations
+git:
+  default_timeout: 10m         # Shorter timeout for faster development cycles
+  max_concurrent_clones: 2     # Fewer concurrent clones for resource conservation
+
+# Gemini API key should be set via environment variable:
+# export CODECHUNK_GEMINI_API_KEY=your-api-key
+
+# Note: Timeout configuration is inherited from config.yaml
+# - gemini.timeout: 120s (per-embedding API call timeout)
+# - worker.job_timeout: 30m (overall job processing timeout)
+# These values allow sufficient time for embedding generation even with network slowdowns
+
+# Gemini batch configuration (development)
+gemini:
+  batch:
+    enabled: true
+    input_dir: /tmp/batch_embeddings/input
+    output_dir: /tmp/batch_embeddings/output
+    poll_interval: 5s
+    max_wait_time: 30m
diff --git a/configs/config.prod.yaml b/configs/config.prod.yaml
index da4fb48..65affc5 100644
--- a/configs/config.prod.yaml
+++ b/configs/config.prod.yaml
@@ -1,32 +1,94 @@
 # Production configuration overrides
+# See wiki/configuration-reference.md for complete documentation
+
 api:
-  read_timeout: 30s
-  write_timeout: 30s
+  read_timeout: 30s            # Longer timeout for production network conditions
+  write_timeout: 30s           # Longer timeout for production network conditions
 
 worker:
-  concurrency: 20
-  job_timeout: 60m
+  concurrency: 20              # Higher concurrency for production throughput
+  job_timeout: 60m             # Extended timeout for large repositories
 
 database:
-  sslmode: require
-  max_connections: 100
-  max_idle_connections: 20
+  sslmode: require             # Enforce SSL in production
+  max_connections: 100         # Higher connection limit for production
+  max_idle_connections: 20     # More idle connections for better performance
 
 nats:
-  max_reconnects: 10
-  reconnect_wait: 5s
+  max_reconnects: 10           # More reconnection attempts for production resilience
+  reconnect_wait: 5s           # Longer wait between reconnections
 
 gemini:
-  max_retries: 5
-  timeout: 60s
+  max_retries: 5               # More retries for production reliability
+  timeout: 60s                 # Shorter per-request timeout for better responsiveness
 
 log:
-  level: error
-  format: json
-
-# Production secrets should be set via environment variables:
-# CODECHUNK_DATABASE_USER
-# CODECHUNK_DATABASE_PASSWORD
-# CODECHUNK_DATABASE_HOST
-# CODECHUNK_GEMINI_API_KEY
-# CODECHUNK_NATS_URL
\ No newline at end of file
+  level: error                 # Only log errors in production
+  format: json                 # Structured JSON logging for production monitoring
+
+# Production batch processing settings
+batch_processing:
+  # Higher threshold for production (more efficient batching)
+  threshold_chunks: 100
+  # Production-optimized batch sizes (larger for efficiency)
+  batch_sizes:
+    realtime:
+      min: 5
+      max: 25
+      timeout: 5m
+    interactive:
+      min: 10
+      max: 50
+      timeout: 10m
+    background:
+      min: 50
+      max: 200
+      timeout: 30m
+    batch:
+      min: 100
+      max: 500
+      timeout: 60m
+  fallback_to_sequential: true     # Ensure fallback to sequential processing
+  queue_limits:
+    max_queue_size: 50000          # Larger queue for production
+    max_wait_time: 60m             # Longer wait time for high load
+  default_priority: "background"
+  use_test_embeddings: false        # Production must use real embeddings
+  # Production-optimized batch settings
+  max_batch_size: 1000             # Larger batch size for production efficiency
+  initial_backoff: 1m              # Longer initial backoff for production
+  max_backoff: 10m                 # Longer max backoff for production
+  max_retries: 5                   # More retries for production reliability
+  enable_batch_chunking: true
+  # Production poller configuration (more efficient)
+  poller_interval: 60s             # Less frequent polling for efficiency
+  max_concurrent_polls: 10         # More concurrent polls for scalability
+  # Production token counting
+  token_counting:
+    enabled: true
+    mode: "all"                    # Count all chunks for accurate billing
+    sample_percent: 10
+    max_tokens_per_chunk: 8192
+
+# Production security settings (stricter URL/body size limits than the base config)
+security:
+  max_url_length: 1024             # Shorter URLs for security
+  max_body_size: 32768             # Smaller limit for security (32KB)
+  rate_limit:
+    requests_per_minute: 120       # Above the base default (60), below dev (300)
+    burst_size: 20                 # Above the base default (10), below dev (50)
+    window_size: 1m
+    enabled: true                  # Keep rate limiting enabled in production
+
+# Production git operations
+git:
+  default_timeout: 60m             # Longer timeout for large repos
+  max_concurrent_clones: 5         # Higher concurrency for production
+  retry_attempts: 3                # More retries for production
+
+# Production secrets must be set via environment variables:
+# CODECHUNK_DATABASE_USER         - Required database username
+# CODECHUNK_DATABASE_PASSWORD     - Required database password
+# CODECHUNK_DATABASE_HOST         - Database hostname if not localhost
+# CODECHUNK_GEMINI_API_KEY        - Required Gemini API key for embeddings
+# CODECHUNK_NATS_URL              - NATS server URL if using external NATS
\ No newline at end of file
diff --git a/configs/config.yaml b/configs/config.yaml
index 0545275..48778d6 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -1,29 +1,40 @@
 # Base configuration for CodeChunking
+# See wiki/configuration-reference.md for complete documentation
+
 api:
-  host: 0.0.0.0
-  port: 8080
-  read_timeout: 10s
-  write_timeout: 10s
+  host: 0.0.0.0               # API server bind address
+  port: 8080                  # API server port
+  read_timeout: 10s           # Maximum duration for reading HTTP requests
+  write_timeout: 10s          # Maximum duration for writing HTTP responses
+  # Middleware flags (optional, default to true)
+  # enable_default_middleware: true
+  # enable_cors: true
+  # enable_security_headers: true
+  # enable_logging: true
+  # enable_error_handling: true
 
 worker:
-  concurrency: 5
-  queue_group: workers
-  job_timeout: 30m  # Overall timeout for complete job processing
-                    # Must be larger than (gemini.timeout × max_chunks_per_repo)
-                    # to allow all embeddings to complete
+  concurrency: 5              # Number of concurrent worker processes (min: 1)
+  queue_group: workers        # NATS queue group name for load balancing
+  job_timeout: 30m            # Overall timeout for complete job processing
+                             # Must be larger than (gemini.timeout × max_chunks_per_repo)
+                             # to allow all embeddings to complete
 
 database:
-  host: localhost
-  port: 5432
-  name: codechunking
-  sslmode: disable
-  max_connections: 25
-  max_idle_connections: 5
+  host: localhost              # Database server hostname
+  port: 5432                   # Database port (1-65535)
+  name: codechunking           # Database name (required)
+  # user:                      # Database username (required; set via CODECHUNK_DATABASE_USER)
+  # password:                  # Database password (required; set via CODECHUNK_DATABASE_PASSWORD)
+  sslmode: disable             # SSL mode: "disable", "require", "prefer"
+  max_connections: 25          # Maximum open connections
+  max_idle_connections: 5      # Maximum idle connections
 
 nats:
-  url: nats://localhost:4222
-  max_reconnects: 5
-  reconnect_wait: 2s
+  url: nats://localhost:4222   # NATS server URL
+  max_reconnects: 5            # Maximum reconnection attempts
+  reconnect_wait: 2s           # Time between reconnection attempts
+  # test_mode: false          # Enable test mode for debugging
 
 search:
   iterative_scan_mode: relaxed_order  # pgvector 0.8.0+ iterative scanning mode
@@ -102,5 +113,39 @@ batch_processing:
     max_tokens_per_chunk: 8192       # Maximum tokens per chunk (Gemini embedding model limit)
 
 log:
-  level: info
-  format: json
+  level: info                   # Log level: "debug", "info", "warn", "error"
+  format: json                  # Log format: "json", "text"
+
+# Git operations configuration
+git:
+  default_depth: 1              # Git clone depth (0 = full, 1 = shallow)
+  shallow_clone_threshold_mb: 100  # Repository size threshold (MB) for shallow clone
+  default_timeout: 30m          # Default git operation timeout
+  max_concurrent_clones: 3      # Maximum concurrent git clone operations
+  retry_attempts: 2             # Retry attempts for git operations
+  retry_backoff_duration: 5s    # Backoff between git retries
+  enable_progress_tracking: true    # Enable git operation progress tracking
+  enable_performance_monitoring: true  # Enable git performance monitoring
+  auto_select_strategy: true    # Automatically select clone strategy
+  workspace_cleanup_enabled: true      # Enable workspace cleanup
+  workspace_cleanup_interval: 24h      # Cleanup interval
+
+# Security configuration
+security:
+  max_url_length: 2048          # Maximum URL length (characters)
+  max_body_size: 65536          # Maximum request body size (64KB)
+  enable_xss_protection: true   # Enable XSS protection
+  enable_sql_injection: true    # Enable SQL injection protection
+  enable_control_char_check: true   # Enable control character validation
+  enable_unicode_check: true    # Enable Unicode validation
+  enable_path_traversal: true   # Enable path traversal protection
+  log_security_violations: true # Log security violations
+  log_level: "INFO"             # Security log level
+  enable_validation_cache: true # Enable validation caching
+  cache_size: 1000              # Validation cache size
+  cache_ttl: 5m                 # Validation cache TTL
+  rate_limit:
+    requests_per_minute: 60     # Rate limit per minute
+    burst_size: 10              # Burst capacity
+    window_size: 1m             # Rate limit window
+    enabled: true               # Enable rate limiting
diff --git a/wiki b/wiki
index 0cafe7b..f321068 160000
--- a/wiki
+++ b/wiki
@@ -1 +1 @@
-Subproject commit 0cafe7b011b12759b8aa7ea7f5f79214a7f673ce
+Subproject commit f321068ddd7e6e3c72267b51c5d9de93f7aeffdb