From b6f96c2c891042fddd78a51dc1a7024f700857ca Mon Sep 17 00:00:00 2001 From: Anthony Bible Date: Sat, 29 Nov 2025 08:23:59 -0700 Subject: [PATCH] chore(config): document and expand base, dev, and prod configs - add explanatory comments and default values for api, worker, database, nats, log, batch_processing, security, git, and token_counting - tune environment-specific settings (dev: lower concurrency, shorter backoffs, dev-friendly batch/poller/rate limits; prod: higher concurrency, larger batch sizes, stricter security/rate limits) - add gemini batch config and clarifying env var notes - update wiki submodule to f321068 --- configs/config.dev.yaml | 80 ++++++++++++++++++------------ configs/config.prod.yaml | 102 +++++++++++++++++++++++++++++++-------- configs/config.yaml | 85 ++++++++++++++++++++++++-------- wiki | 2 +- 4 files changed, 198 insertions(+), 71 deletions(-) diff --git a/configs/config.dev.yaml b/configs/config.dev.yaml index f31be8e..2aec0b9 100644 --- a/configs/config.dev.yaml +++ b/configs/config.dev.yaml @@ -1,30 +1,20 @@ # Development configuration overrides +# See wiki/configuration-reference.md for complete documentation + database: - user: dev - password: dev + user: dev # Development database username + password: dev # Development database password log: - level: debug - format: text + level: debug # Verbose logging for development + format: text # Human-readable text format -# Gemini API key should be set via environment variable: -# export CODECHUNK_GEMINI_API_KEY=your-api-key +# Enhanced development configuration +worker: + concurrency: 3 # Lower concurrency for debugging -# Note: Timeout configuration is inherited from config.yaml -# - gemini.timeout: 120s (per-embedding API call timeout) -# - worker.job_timeout: 30m (overall job processing timeout) -# These values allow sufficient time for embedding generation even with network slowdowns -gemini: - batch: - enabled: true - input_dir: /tmp/batch_embeddings/input - output_dir: 
/tmp/batch_embeddings/output - poll_interval: 5s - max_wait_time: 30m - -# Enhanced batch processing for development batch_processing: - # Enable batch processing in development environment + # Enable batch processing but with developer-friendly settings enabled: true # Lower threshold for development to test batching with smaller repos threshold_chunks: 10 @@ -46,25 +36,55 @@ batch_processing: min: 25 max: 200 timeout: 20m - fallback_to_sequential: false + fallback_to_sequential: false # Test batch processing without fallback queue_limits: - max_queue_size: 100000 + max_queue_size: 10000 max_wait_time: 15m default_priority: "background" - # Use test embeddings in development for faster testing without API calls + # Note: Set use_test_embeddings: true to avoid API costs during development use_test_embeddings: false - # Batch chunking strategy parameters (development values) + # Development-optimized batch settings max_batch_size: 300 # Smaller batch size for faster testing initial_backoff: 5s # Shorter initial backoff for development max_backoff: 60s # Shorter max backoff for development max_retries: 2 # Fewer retries in development - enable_batch_chunking: true # Enable batch chunking in development - # Async batch job poller configuration + enable_batch_chunking: true + # Development poller configuration (more frequent for testing) poller_interval: 30s # Poll Gemini batch jobs every 30 seconds max_concurrent_polls: 5 # Max concurrent batch job status checks # Token counting configuration token_counting: - enabled: true # Enable token counting - mode: "all" # Mode: "all", "sample", or "on_demand" - sample_percent: 10 # Percentage of chunks to sample (for "sample" mode) - max_tokens_per_chunk: 8192 # Maximum tokens per chunk (Gemini embedding model limit) + enabled: true # Enable token counting in development + mode: "all" # Count all chunks for accuracy + sample_percent: 10 # Not used in "all" mode + max_tokens_per_chunk: 8192 # Maximum tokens per chunk + +# 
Development security settings (more permissive) +security: + rate_limit: + requests_per_minute: 300 # Higher rate limit for development + burst_size: 50 # Larger burst size + window_size: 1m + enabled: true # Still enable rate limiting + +# Development git operations +git: + default_timeout: 10m # Shorter timeout for faster development cycles + max_concurrent_clones: 2 # Fewer concurrent clones for resource conservation + +# Gemini API key should be set via environment variable: +# export CODECHUNK_GEMINI_API_KEY=your-api-key + +# Note: Timeout configuration is inherited from config.yaml +# - gemini.timeout: 120s (per-embedding API call timeout) +# - worker.job_timeout: 30m (overall job processing timeout) +# These values allow sufficient time for embedding generation even with network slowdowns + +# Gemini batch configuration (development) +gemini: + batch: + enabled: true + input_dir: /tmp/batch_embeddings/input + output_dir: /tmp/batch_embeddings/output + poll_interval: 5s + max_wait_time: 30m diff --git a/configs/config.prod.yaml b/configs/config.prod.yaml index da4fb48..65affc5 100644 --- a/configs/config.prod.yaml +++ b/configs/config.prod.yaml @@ -1,32 +1,94 @@ # Production configuration overrides +# See wiki/configuration-reference.md for complete documentation + api: - read_timeout: 30s - write_timeout: 30s + read_timeout: 30s # Longer timeout for production network conditions + write_timeout: 30s # Longer timeout for production network conditions worker: - concurrency: 20 - job_timeout: 60m + concurrency: 20 # Higher concurrency for production throughput + job_timeout: 60m # Extended timeout for large repositories database: - sslmode: require - max_connections: 100 - max_idle_connections: 20 + sslmode: require # Enforce SSL in production + max_connections: 100 # Higher connection limit for production + max_idle_connections: 20 # More idle connections for better performance nats: - max_reconnects: 10 - reconnect_wait: 5s + max_reconnects: 10 # More 
reconnection attempts for production resilience + reconnect_wait: 5s # Longer wait between reconnections gemini: - max_retries: 5 - timeout: 60s + max_retries: 5 # More retries for production reliability + timeout: 60s # Shorter per-request timeout for better responsiveness log: - level: error - format: json - -# Production secrets should be set via environment variables: -# CODECHUNK_DATABASE_USER -# CODECHUNK_DATABASE_PASSWORD -# CODECHUNK_DATABASE_HOST -# CODECHUNK_GEMINI_API_KEY -# CODECHUNK_NATS_URL \ No newline at end of file + level: error # Only log errors in production + format: json # Structured JSON logging for production monitoring + +# Production batch processing settings +batch_processing: + # Higher threshold for production (more efficient batching) + threshold_chunks: 100 + # Production-optimized batch sizes (larger for efficiency) + batch_sizes: + realtime: + min: 5 + max: 25 + timeout: 5m + interactive: + min: 10 + max: 50 + timeout: 10m + background: + min: 50 + max: 200 + timeout: 30m + batch: + min: 100 + max: 500 + timeout: 60m + fallback_to_sequential: true # Ensure fallback to sequential processing + queue_limits: + max_queue_size: 50000 # Larger queue for production + max_wait_time: 60m # Longer wait time for high load + default_priority: "background" + use_test_embeddings: false # Production must use real embeddings + # Production-optimized batch settings + max_batch_size: 1000 # Larger batch size for production efficiency + initial_backoff: 1m # Longer initial backoff for production + max_backoff: 10m # Longer max backoff for production + max_retries: 5 # More retries for production reliability + enable_batch_chunking: true + # Production poller configuration (more efficient) + poller_interval: 60s # Less frequent polling for efficiency + max_concurrent_polls: 10 # More concurrent polls for scalability + # Production token counting + token_counting: + enabled: true + mode: "all" # Count all chunks for accurate billing + sample_percent: 10 
+ max_tokens_per_chunk: 8192 +# Production security settings (tighter URL and body size limits than the base config) +security: + max_url_length: 1024 # Shorter URLs for security + max_body_size: 32768 # Smaller limit for security (32KB) + rate_limit: + requests_per_minute: 120 # Moderate limit; higher than the base default to absorb production traffic + burst_size: 20 # Controlled burst size + window_size: 1m + enabled: true # Rate limiting must stay enabled in production + +# Production git operations +git: + default_timeout: 60m # Longer timeout for large repos + max_concurrent_clones: 5 # Higher concurrency for production + retry_attempts: 3 # More retries for production + +# Production secrets must be set via environment variables: +# CODECHUNK_DATABASE_USER - Required database username +# CODECHUNK_DATABASE_PASSWORD - Required database password +# CODECHUNK_DATABASE_HOST - Database hostname if not localhost +# CODECHUNK_GEMINI_API_KEY - Required Gemini API key for embeddings +# CODECHUNK_NATS_URL - NATS server URL if using external NATS \ No newline at end of file diff --git a/configs/config.yaml b/configs/config.yaml index 0545275..48778d6 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -1,29 +1,40 @@ # Base configuration for CodeChunking +# See wiki/configuration-reference.md for complete documentation + api: - host: 0.0.0.0 - port: 8080 - read_timeout: 10s - write_timeout: 10s + host: 0.0.0.0 # API server bind address + port: 8080 # API server port + read_timeout: 10s # Maximum duration for reading HTTP requests + write_timeout: 10s # Maximum duration for writing HTTP responses + # Middleware flags (optional, default to true) + # enable_default_middleware: true + # enable_cors: true + # enable_security_headers: true + # enable_logging: true + # enable_error_handling: true worker: - concurrency: 5 - queue_group: workers - job_timeout: 30m # Overall timeout for complete job processing - # Must be larger than (gemini.timeout × max_chunks_per_repo) - # to allow all embeddings to complete + concurrency: 5 # Number of concurrent worker 
processes (min: 1) + queue_group: workers # NATS queue group name for load balancing + job_timeout: 30m # Overall timeout for complete job processing + # Must be larger than (gemini.timeout × max_chunks_per_repo) + # to allow all embeddings to complete database: - host: localhost - port: 5432 - name: codechunking - sslmode: disable - max_connections: 25 - max_idle_connections: 5 + host: localhost # Database server hostname + port: 5432 # Database port (1-65535) + name: codechunking # Database name (required) + # user: # Database username (required, set via env var) + # password: # Database password (required, set via env var) + sslmode: disable # SSL mode: "disable", "require", "prefer" + max_connections: 25 # Maximum open connections + max_idle_connections: 5 # Maximum idle connections nats: - url: nats://localhost:4222 - max_reconnects: 5 - reconnect_wait: 2s + url: nats://localhost:4222 # NATS server URL + max_reconnects: 5 # Maximum reconnection attempts + reconnect_wait: 2s # Time between reconnection attempts + # test_mode: false # Enable test mode for debugging search: iterative_scan_mode: relaxed_order # pgvector 0.8.0+ iterative scanning mode @@ -102,5 +113,39 @@ batch_processing: max_tokens_per_chunk: 8192 # Maximum tokens per chunk (Gemini embedding model limit) log: - level: info - format: json + level: info # Log level: "debug", "info", "warn", "error" + format: json # Log format: "json", "text" + +# Git operations configuration +git: + default_depth: 1 # Git clone depth (0 = full, 1 = shallow) + shallow_clone_threshold_mb: 100 # Repository size threshold for shallow clone + default_timeout: 30m # Default git operation timeout + max_concurrent_clones: 3 # Maximum concurrent git clone operations + retry_attempts: 2 # Retry attempts for git operations + retry_backoff_duration: 5s # Backoff between git retries + enable_progress_tracking: true # Enable git operation progress tracking + enable_performance_monitoring: true # Enable git performance monitoring 
+ auto_select_strategy: true # Automatically select clone strategy + workspace_cleanup_enabled: true # Enable workspace cleanup + workspace_cleanup_interval: 24h # Cleanup interval + +# Security configuration +security: + max_url_length: 2048 # Maximum URL length (characters) + max_body_size: 65536 # Maximum request body size (64KB) + enable_xss_protection: true # Enable XSS protection + enable_sql_injection: true # Enable SQL injection protection + enable_control_char_check: true # Enable control character validation + enable_unicode_check: true # Enable Unicode validation + enable_path_traversal: true # Enable path traversal protection + log_security_violations: true # Log security violations + log_level: "INFO" # Security log level + enable_validation_cache: true # Enable validation caching + cache_size: 1000 # Validation cache size + cache_ttl: 5m # Validation cache TTL + rate_limit: + requests_per_minute: 60 # Rate limit per minute + burst_size: 10 # Burst capacity + window_size: 1m # Rate limit window + enabled: true # Enable rate limiting diff --git a/wiki b/wiki index 0cafe7b..f321068 160000 --- a/wiki +++ b/wiki @@ -1 +1 @@ -Subproject commit 0cafe7b011b12759b8aa7ea7f5f79214a7f673ce +Subproject commit f321068ddd7e6e3c72267b51c5d9de93f7aeffdb