From b7f807fa855138353b61116d9bf4ee39ba9d2a57 Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Tue, 12 May 2026 11:27:00 +1000 Subject: [PATCH 1/6] vector: ship logs to clickhouse logs-ingest instead of loki Replaces the loki sink in the vector template config with a remap transform + http sink that posts to platform's logs-external clickhouse pipeline at logs-ingest.analytics.production.platform.ethpandaops.io. Reuses secret_loki credentials since the same VMAuth backend fronts both ingresses. New devnets generated from this template will ship logs straight to clickhouse. This mirrors the rollout already done in glamsterdam-devnet-3 (all hosts) and bal-devnet-6 (bootnode-1) where it has been verified end-to-end. --- .../devnet-0/group_vars/all/all.yaml | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/ansible/inventories/devnet-0/group_vars/all/all.yaml b/ansible/inventories/devnet-0/group_vars/all/all.yaml index 0a83f3d..0d429cc 100644 --- a/ansible/inventories/devnet-0/group_vars/all/all.yaml +++ b/ansible/inventories/devnet-0/group_vars/all/all.yaml @@ -274,6 +274,9 @@ docker_nginx_proxy_wildcard_cert_url: "http://cert.{{ network_server_subdomain } docker_nginx_proxy_wildcard_cert_psk: "{{ secret_cert_encryption_psk }}" # role: ethpandaops.general.vector +# Ship docker logs to platform's ClickHouse logs-external pipeline. +# Reuses secret_loki credentials — the same VMAuth backend fronts both ingresses. 
+clickhouse_logs_endpoint: "https://logs-ingest.analytics.production.platform.ethpandaops.io" vector_config: | [sources.in] type = "docker_logs" @@ -286,24 +289,31 @@ vector_config: | "snooper-", ] - [sinks.out] - type = "loki" + [transforms.clickhouse_shape] + type = "remap" inputs = ["in"] - out_of_order_action = "accept" - labels.forwarder = "vector" - labels.instance = "{{ inventory_hostname }}" - labels.network = "{{ ethereum_network_name }}" - labels.testnet = "{{ ethereum_network_name }}" - labels.ingress_user = "{{ secret_loki.username }}" - labels.container_name = "{{ '{{ container_name }}' }}" - {%- if ethereum_node_el is defined +%} - labels.ethereum_el = "{{ ethereum_node_el }}" - {%- endif +%} - {%- if ethereum_node_cl is defined +%} - labels.ethereum_cl = "{{ ethereum_node_cl }}" - {%- endif +%} + source = ''' + .IngressUser = "{{ secret_loki.username }}" + .Namespace = "" + .Pod = "" + .Container = string(.container_name) ?? "" + .Node = "{{ inventory_hostname }}" + .Stream = string(.stream) ?? "" + .Message = string(.message) ?? 
"" + .Timestamp = .timestamp + del(.container_name); del(.container_id); del(.container_created_at) + del(.image); del(.host); del(.label); del(.source_type) + del(.stream); del(.message); del(.timestamp) + ''' + + [sinks.clickhouse_logs] + type = "http" + inputs = ["clickhouse_shape"] + uri = "{{ clickhouse_logs_endpoint }}" + method = "post" encoding.codec = "json" - endpoint = "{{ secret_loki.endpoint }}" auth.strategy = "basic" auth.user = "{{ secret_loki.username }}" auth.password = "{{ secret_loki.password }}" + batch.max_events = 5000 + batch.timeout_secs = 3 From 46a8c4aad11dd0cbf97aa553e16b6560a949f6ed Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Tue, 12 May 2026 11:31:36 +1000 Subject: [PATCH 2/6] remove comments --- ansible/inventories/devnet-0/group_vars/all/all.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/ansible/inventories/devnet-0/group_vars/all/all.yaml b/ansible/inventories/devnet-0/group_vars/all/all.yaml index 0d429cc..0b68823 100644 --- a/ansible/inventories/devnet-0/group_vars/all/all.yaml +++ b/ansible/inventories/devnet-0/group_vars/all/all.yaml @@ -274,8 +274,6 @@ docker_nginx_proxy_wildcard_cert_url: "http://cert.{{ network_server_subdomain } docker_nginx_proxy_wildcard_cert_psk: "{{ secret_cert_encryption_psk }}" # role: ethpandaops.general.vector -# Ship docker logs to platform's ClickHouse logs-external pipeline. -# Reuses secret_loki credentials — the same VMAuth backend fronts both ingresses. clickhouse_logs_endpoint: "https://logs-ingest.analytics.production.platform.ethpandaops.io" vector_config: | [sources.in] From cc2399be8b72f9e21d90781c20b2d63c02f2e4e9 Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Fri, 15 May 2026 20:41:14 +1000 Subject: [PATCH 3/6] Revert "remove comments" This reverts commit 46a8c4aad11dd0cbf97aa553e16b6560a949f6ed. 
--- ansible/inventories/devnet-0/group_vars/all/all.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ansible/inventories/devnet-0/group_vars/all/all.yaml b/ansible/inventories/devnet-0/group_vars/all/all.yaml index 0b68823..0d429cc 100644 --- a/ansible/inventories/devnet-0/group_vars/all/all.yaml +++ b/ansible/inventories/devnet-0/group_vars/all/all.yaml @@ -274,6 +274,8 @@ docker_nginx_proxy_wildcard_cert_url: "http://cert.{{ network_server_subdomain } docker_nginx_proxy_wildcard_cert_psk: "{{ secret_cert_encryption_psk }}" # role: ethpandaops.general.vector +# Ship docker logs to platform's ClickHouse logs-external pipeline. +# Reuses secret_loki credentials — the same VMAuth backend fronts both ingresses. clickhouse_logs_endpoint: "https://logs-ingest.analytics.production.platform.ethpandaops.io" vector_config: | [sources.in] From 788d508b6a0e9c4f1deeca87f4abeedb8948354f Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Fri, 15 May 2026 20:41:14 +1000 Subject: [PATCH 4/6] Revert "vector: ship logs to clickhouse logs-ingest instead of loki" This reverts commit b7f807fa855138353b61116d9bf4ee39ba9d2a57. --- .../devnet-0/group_vars/all/all.yaml | 42 +++++++------------ 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/ansible/inventories/devnet-0/group_vars/all/all.yaml b/ansible/inventories/devnet-0/group_vars/all/all.yaml index 0d429cc..0a83f3d 100644 --- a/ansible/inventories/devnet-0/group_vars/all/all.yaml +++ b/ansible/inventories/devnet-0/group_vars/all/all.yaml @@ -274,9 +274,6 @@ docker_nginx_proxy_wildcard_cert_url: "http://cert.{{ network_server_subdomain } docker_nginx_proxy_wildcard_cert_psk: "{{ secret_cert_encryption_psk }}" # role: ethpandaops.general.vector -# Ship docker logs to platform's ClickHouse logs-external pipeline. -# Reuses secret_loki credentials — the same VMAuth backend fronts both ingresses. 
-clickhouse_logs_endpoint: "https://logs-ingest.analytics.production.platform.ethpandaops.io" vector_config: | [sources.in] type = "docker_logs" @@ -289,31 +286,24 @@ vector_config: | "snooper-", ] - [transforms.clickhouse_shape] - type = "remap" + [sinks.out] + type = "loki" inputs = ["in"] - source = ''' - .IngressUser = "{{ secret_loki.username }}" - .Namespace = "" - .Pod = "" - .Container = string(.container_name) ?? "" - .Node = "{{ inventory_hostname }}" - .Stream = string(.stream) ?? "" - .Message = string(.message) ?? "" - .Timestamp = .timestamp - del(.container_name); del(.container_id); del(.container_created_at) - del(.image); del(.host); del(.label); del(.source_type) - del(.stream); del(.message); del(.timestamp) - ''' - - [sinks.clickhouse_logs] - type = "http" - inputs = ["clickhouse_shape"] - uri = "{{ clickhouse_logs_endpoint }}" - method = "post" + out_of_order_action = "accept" + labels.forwarder = "vector" + labels.instance = "{{ inventory_hostname }}" + labels.network = "{{ ethereum_network_name }}" + labels.testnet = "{{ ethereum_network_name }}" + labels.ingress_user = "{{ secret_loki.username }}" + labels.container_name = "{{ '{{ container_name }}' }}" + {%- if ethereum_node_el is defined +%} + labels.ethereum_el = "{{ ethereum_node_el }}" + {%- endif +%} + {%- if ethereum_node_cl is defined +%} + labels.ethereum_cl = "{{ ethereum_node_cl }}" + {%- endif +%} encoding.codec = "json" + endpoint = "{{ secret_loki.endpoint }}" auth.strategy = "basic" auth.user = "{{ secret_loki.username }}" auth.password = "{{ secret_loki.password }}" - batch.max_events = 5000 - batch.timeout_secs = 3 From 4837f489298170fd10b75ffce96aae23eed8b855 Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Fri, 15 May 2026 20:44:11 +1000 Subject: [PATCH 5/6] otelcol: add upstream role, ship logs+traces to prod OTLP New template devnets now run ethpandaops.general.otelcol_contrib alongside vector. 
Logs + traces flow to the production OTLP endpoint (vmauth tags as external tier). Vector stays as the Loki shipper for now; sink renamed [sinks.out] -> [sinks.loki] for clarity and `otelcol` added to vector's exclude list so it doesn't loop on its own logs. --- .../devnet-0/group_vars/all/all.yaml | 84 ++++++++++++++++++- ansible/playbook.yaml | 2 + 2 files changed, 84 insertions(+), 2 deletions(-) diff --git a/ansible/inventories/devnet-0/group_vars/all/all.yaml b/ansible/inventories/devnet-0/group_vars/all/all.yaml index 0a83f3d..f720294 100644 --- a/ansible/inventories/devnet-0/group_vars/all/all.yaml +++ b/ansible/inventories/devnet-0/group_vars/all/all.yaml @@ -273,12 +273,22 @@ docker_nginx_proxy_wildcard_cert: "{{ network_server_subdomain }}" docker_nginx_proxy_wildcard_cert_url: "http://cert.{{ network_server_subdomain }}/{{ network_server_subdomain }}-latest.tar.enc" docker_nginx_proxy_wildcard_cert_psk: "{{ secret_cert_encryption_psk }}" -# role: ethpandaops.general.vector +# role: ethpandaops.general.otelcol_contrib +# Reuses secret_loki credentials (same vmauth backend serves both ingresses). +otlp_endpoint: "https://otlp.analytics.production.platform.ethpandaops.io" +otlp_deployment_env: production + +otelcol_contrib_container_networks: "{{ docker_networks_shared }}" + +# Vector kept alongside otelcol just to ship logs to Loki. Will be removed +# when Loki path is replaced (e.g. central aggregator or Loki OTLP support). 
+vector_container_networks: "{{ docker_networks_shared }}" vector_config: | [sources.in] type = "docker_logs" exclude_containers = [ "{{ vector_container_name }}", + "otelcol", "ethereum-metrics-exporter", "nginx-proxy", "node_exporter", @@ -286,7 +296,7 @@ vector_config: | "snooper-", ] - [sinks.out] + [sinks.loki] type = "loki" inputs = ["in"] out_of_order_action = "accept" @@ -307,3 +317,73 @@ vector_config: | auth.strategy = "basic" auth.user = "{{ secret_loki.username }}" auth.password = "{{ secret_loki.password }}" +otelcol_contrib_config: | + extensions: + basicauth/client: + client_auth: + username: {{ secret_loki.username | to_json }} # to_json renders a quoted, escaped scalar so special characters in the secret cannot break the config + password: {{ secret_loki.password | to_json }} + + receivers: + filelog: + include: [/var/lib/docker/containers/*/*-json.log] + include_file_path: true + start_at: end + operators: + - type: container + format: docker + add_metadata_from_filepath: true + - type: filter + expr: 'attributes["container.name"] != nil and attributes["container.name"] matches "^(otelcol|ethereum-metrics-exporter|nginx-proxy|node_exporter|prometheus|snooper-.*)$"' + - type: json_parser + if: 'body matches "^\\s*\\{"' + on_error: send + severity: + parse_from: attributes.level + overwrite_text: true + mapping: + fatal4: [emergency, emerg] + fatal3: [alert] + fatal2: [critical, crit] + fatal: [panic] + + otlp: + protocols: + grpc: {endpoint: "[::]:4317"} + http: {endpoint: "[::]:4318"} + + processors: + resource: + attributes: + - {key: deployment.environment, value: "{{ otlp_deployment_env }}", action: upsert} + - {key: network, value: "{{ ethereum_network_name }}", action: upsert} + - {key: ingress_user, value: "{{ secret_loki.username }}", action: upsert} + - {key: host.name, value: "{{ inventory_hostname }}", action: upsert} + + transform/service_name: + log_statements: + - context: resource + statements: + - set(attributes["service.name"], attributes["container.name"]) where attributes["container.name"] != nil + + batch: + send_batch_size: 500 + timeout: 5s + + 
exporters: + otlphttp/platform: + endpoint: "{{ otlp_endpoint }}" + auth: + authenticator: basicauth/client + + service: + extensions: [basicauth/client] + pipelines: + logs: + receivers: [filelog, otlp] + processors: [resource, transform/service_name, batch] + exporters: [otlphttp/platform] + traces: + receivers: [otlp] + processors: [resource, batch] + exporters: [otlphttp/platform] diff --git a/ansible/playbook.yaml b/ansible/playbook.yaml index 305a39b..1a76594 100644 --- a/ansible/playbook.yaml +++ b/ansible/playbook.yaml @@ -43,6 +43,8 @@ tags: [init-server, node_exporter] - role: ethpandaops.general.prometheus tags: [init-server, prometheus] + - role: ethpandaops.general.otelcol_contrib + tags: [init-server, otelcol] - role: ethpandaops.general.vector tags: [init-server, vector] From 18d43cf39a5cf0a8f744faa2bd51e9b7a7b8207a Mon Sep 17 00:00:00 2001 From: Sam Calder-Mason Date: Tue, 19 May 2026 17:27:28 +1000 Subject: [PATCH 6/6] otelcol: filter own logs by body content (catches nil container.name case) The docker container parser fails to extract container.name from otelcol's own log file, leaving the attribute nil. The original container.name regex filter therefore lets every otelcol-contrib log through, both the first-line structured records and the stack-trace continuation rows that share no container metadata. Adding a body-match clause (github.com/open-telemetry/opentelemetry-collector-contrib or otelcol-contrib substring) catches both cases and drops the self-emitted spam at the source. Verified on blob-devnets prysm-besu-full-1: cluster-wide otelcol error volume fell from ~1M / 5min to 0 with this clause in place. 
--- ansible/inventories/devnet-0/group_vars/all/all.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/inventories/devnet-0/group_vars/all/all.yaml b/ansible/inventories/devnet-0/group_vars/all/all.yaml index f720294..e0d7457 100644 --- a/ansible/inventories/devnet-0/group_vars/all/all.yaml +++ b/ansible/inventories/devnet-0/group_vars/all/all.yaml @@ -334,7 +334,7 @@ otelcol_contrib_config: | format: docker add_metadata_from_filepath: true - type: filter - expr: 'attributes["container.name"] != nil and attributes["container.name"] matches "^(otelcol|ethereum-metrics-exporter|nginx-proxy|node_exporter|prometheus|snooper-.*)$"' + expr: '(attributes["container.name"] != nil and attributes["container.name"] matches "^(otelcol|ethereum-metrics-exporter|nginx-proxy|node_exporter|prometheus|snooper-.*)$") or body matches "github\\.com/open-telemetry/opentelemetry-collector-contrib|otelcol-contrib"' - type: json_parser if: 'body matches "^\\s*\\{"' on_error: send