# Kaptn example configuration (config.example.yaml)

# HTTP server settings.
server:
  # host:port value; 0.0.0.0 is the IPv4 wildcard address.
  addr: "0.0.0.0:8080"
  base_path: "/"
  cors:
    # "*" is the CORS wildcard origin.
    # NOTE(review): presumably this should be narrowed to explicit origins
    # outside of development — confirm.
    allow_origins:
      - "*"
    allow_methods:
      - "GET"
      - "POST"
      - "PUT"
      - "DELETE"
      - "OPTIONS"

security:
  # Authentication mode; enable one of: "none", "header", "oidc".
  auth_mode: "none"
  session_ttl: "12h"     # How long access tokens last
  # How long refresh tokens last (7 days). Written in hours because "d" is not
  # a unit accepted by Go-style duration parsing (valid: ns, us, ms, s, m, h).
  # NOTE(review): confirm which duration parser the config loader uses.
  refresh_token_ttl: "168h"
  # NOTE(review): presumably only consulted when auth_mode is "oidc" — confirm.
  oidc:
    issuer: ""
    client_id: ""
    audience: ""
    jwks_url: ""

  # Auth keys configuration (file paths for local development).
  # This mapping is nested under `security`, as a sibling of `oidc`.
  auth_keys:
    oidc_state_hash_key_path: "keys/oidc_state_hash.key"
    oidc_state_block_key_path: "keys/oidc_state_block.key"
    jwt_private_key_path: "keys/kaptn_jwt_private.pem"
    jwt_public_key_path: "keys/kaptn_jwt_public.pem"

# Authorization settings.
authz:
  mode: "idp_groups"
  # Allowlist of group-name prefixes.
  groups_prefix_allowlist:
    - "kaptn-"
  # NOTE(review): this is a one-element list holding an empty string, not an
  # empty list — confirm that is what the loader expects.
  default_groups:
    - ""

# Kubernetes client settings.
kubernetes:
  # Connection mode: "kubeconfig" or "incluster".
  mode: "kubeconfig"
  # Used if mode=kubeconfig; defaults to $KUBECONFIG.
  kubeconfig_path: ""
  namespace_default: "default"
  # Skip TLS verification for development environments.
  insecure_tls: false
  # Queries per second allowed to the Kubernetes API server.
  qps: 100
  # Maximum burst for throttle.
  burst: 200

# Feature toggles. Every flag in this example is switched on.
features:
  enable_apply: true
  enable_nodes_actions: true
  enable_overview: true
  # NOTE(review): presumably depends on integrations.prometheus — confirm.
  enable_prometheus_analytics: true

# Rate limits; both values are expressed per minute (see key names).
rate_limits:
  apply_per_minute: 30
  actions_per_minute: 60

# Logging settings.
logging:
  # NOTE(review): "debug" is presumably the most verbose level; consider a
  # quieter one for production — confirm the accepted values.
  level: "debug"

# External service integrations.
integrations:
  prometheus:
    # URL uses the <service>.<namespace>.svc host form — assumes a
    # `prometheus-server` service in the `monitoring` namespace; adjust to
    # match your deployment.
    url: "http://prometheus-server.monitoring.svc:80"
    timeout: "10s"
    enabled: true

# Cache lifetimes (TTLs, as duration strings) and size limits.
caching:
  overview_ttl: "5s"
  analytics_ttl: "120s"
  summary_ttl: "60s"
  search_cache_ttl: "300s"
  search_cache_max_size: 100000

  # Log cache settings (nested under `caching`).
  logs_cache:
    ttl: "10m"
    max_global: 250000
    max_per_scope: 20000
    max_subscribers: 200
    buffer_size: 100
    eviction_interval: "30s"
    cleanup_interval: "5m"

    # Background log collection - automatically collects logs from all pods (V2 - Event-driven)
    background_collection_enabled: true
    background_collection_retention: "1h"
    # Collection mode and interval
    #   stream: follow live logs (default)
    #   poll:   fetch logs every poll_interval (lighter on API server)
    background_collection_mode: "stream"     # or "poll"
    background_collection_poll_interval: "10s"
    background_collection_tail_lines: 100

    # Max per-line size (bytes) read from logs to protect memory
    max_log_line_bytes: 262144   # 256KB (256 * 1024)
    # Optional: informer resync period (advanced). 0s disables periodic resync.
    informer_resync: "0s"

    # Operational limits (Phase 10)
    max_streams_per_user: 50
    max_query_limit: 10000
    max_export_size: 104857600  # 100MB (100 * 1024 * 1024 bytes)
    max_concurrent_queries: 20
    rate_limit_per_second: 1000
    backpressure_threshold: 80  # Percentage
    degraded_mode_timeout: "5m"

# Job persistence and retention settings.
jobs:
  persistence_enabled: true
  store_path: "/data/jobs"
  cleanup_interval: "30m"
  # Keep jobs longer in production (7 days). Written in hours because "d" is
  # not a unit accepted by Go-style duration parsing (valid: ns, us, ms, s, m, h).
  # NOTE(review): confirm which duration parser the config loader uses.
  max_age: "168h"


# Time-series collection settings: intervals first, then capacity limits,
# then the two resolution tiers.
timeseries:
  enabled: true
  window: "4h"
  tick_interval: "500ms"
  capacity_refresh_interval: "15s"
  resource_poll_interval: "10s"
  summary_poll_interval: "15s"
  state_reconcile_interval: "30s"
  prune_interval: "60s"
  disable_network_if_unavailable: true

  # 1M series (20 sections × 200 metrics × 50 users × 5x safety).
  max_series: 1000000
  # 10M points per series (scales with available memory).
  max_points_per_series: 10000000
  # Support many concurrent dashboard users.
  max_ws_clients: 2000
  # 128KB - handles massive subscription messages.
  ws_read_limit: 131072
  # 8K message buffer per client (high-frequency updates).
  ws_write_buffer_size: 8192

  hi_res:
    # Keep 1s for real-time dashboards.
    step: "1s"
  lo_res:
    # Increase to 10s for efficiency over longer windows.
    step: "10s"