From 01f7c39fc88f8d87b67718f3fc1d27bb7acbfae5 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 2 Jun 2026 10:30:47 +0100 Subject: [PATCH 1/4] fix(webapp): restore Postgres fallback for non-ClickHouse OTLP spans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Environments with taskEventStore = 'taskEvent' / 'taskEventPartitioned' (Postgres-backed runs not yet migrated to ClickHouse) were hitting a throw in buildEventRepository when the org-scoped ClickHouse factory received those store values. The throw happened inside the grouping loop of #exportEvents, causing the entire OTLP request to return HTTP 500 — which the OpenTelemetry collector treats as non-retryable, silently dropping the full batch. Fix: guard the getEventRepositoryForOrganizationSync call in #exportEvents to only invoke the ClickHouse factory for clickhouse/clickhouse_v2 stores. All other store values (taskEvent, taskEventPartitioned, postgres) route directly to the existing Postgres eventRepository, mirroring the pattern already used by resolveEventRepositoryForStore and getEventRepositoryForStore in eventRepository/index.server.ts. Also wrap the factory call in a try/catch that falls back to Postgres so any future unexpected store values in an OTLP batch degrade gracefully instead of failing the whole request. --- apps/webapp/app/v3/otlpExporter.server.ts | 24 +++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/apps/webapp/app/v3/otlpExporter.server.ts b/apps/webapp/app/v3/otlpExporter.server.ts index 788e7339834..8c52bcbc999 100644 --- a/apps/webapp/app/v3/otlpExporter.server.ts +++ b/apps/webapp/app/v3/otlpExporter.server.ts @@ -24,6 +24,7 @@ import type { ClickhouseFactory } from "~/services/clickhouse/clickhouseFactory. import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; import { generateSpanId } from "./eventRepository/common.server"; +import { eventRepository } from "./eventRepository/eventRepository.server"; import type { CreatableEventKind, CreatableEventStatus, @@ -120,10 +121,25 @@ class OTLPExporter { const routeKey = `${event.organizationId}\0${taskEventStore}`; let resolved = routeCache.get(routeKey); if (!resolved) { - resolved = this._clickhouseFactory.getEventRepositoryForOrganizationSync( - taskEventStore, - event.organizationId - ); + // Non-ClickHouse stores (taskEvent / taskEventPartitioned) are Postgres-backed. + // The ClickHouse factory only handles clickhouse/clickhouse_v2 and throws otherwise. + if (taskEventStore !== "clickhouse" && taskEventStore !== "clickhouse_v2") { + resolved = { key: "postgres:default", repository: eventRepository }; + } else { + try { + resolved = this._clickhouseFactory.getEventRepositoryForOrganizationSync( + taskEventStore, + event.organizationId + ); + } catch (error) { + logger.error("[OTLPExporter] Failed to resolve ClickHouse event repository", { + taskEventStore, + organizationId: event.organizationId, + error: error instanceof Error ? error.message : error, + }); + resolved = { key: "postgres:default", repository: eventRepository }; + } + } routeCache.set(routeKey, resolved); } From 88a5efd4223fe9432e7106facc469aca7b1b1cf3 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 2 Jun 2026 10:31:15 +0100 Subject: [PATCH 2/4] chore: add server-changes entry for OTLP Postgres fallback fix --- .server-changes/otlp-postgres-store-fallback.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .server-changes/otlp-postgres-store-fallback.md diff --git a/.server-changes/otlp-postgres-store-fallback.md b/.server-changes/otlp-postgres-store-fallback.md new file mode 100644 index 00000000000..cfb1f475097 --- /dev/null +++ b/.server-changes/otlp-postgres-store-fallback.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Restore Postgres fallback for non-ClickHouse OTLP spans. Environments where runs carry a Postgres-backed taskEventStore (taskEvent / taskEventPartitioned) were receiving HTTP 500 from the OTLP ingest endpoints because the ClickHouse factory threw an error when passed those store values. The throw aborted the entire OTLP batch in #exportEvents. Non-ClickHouse stores are now routed directly to the Postgres eventRepository (matching the existing pattern in eventRepository/index.server.ts), and the ClickHouse factory call is wrapped in a try/catch that falls back to Postgres so any future unexpected store values degrade gracefully rather than failing the whole request. From 6b8058b814b1a8201bf6f0b23be50ef407f2bb84 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 2 Jun 2026 10:46:40 +0100 Subject: [PATCH 3/4] chore: tighten server-changes description --- .server-changes/otlp-postgres-store-fallback.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.server-changes/otlp-postgres-store-fallback.md b/.server-changes/otlp-postgres-store-fallback.md index cfb1f475097..ca0e5c9f74f 100644 --- a/.server-changes/otlp-postgres-store-fallback.md +++ b/.server-changes/otlp-postgres-store-fallback.md @@ -3,4 +3,4 @@ area: webapp type: fix --- -Restore Postgres fallback for non-ClickHouse OTLP spans. Environments where runs carry a Postgres-backed taskEventStore (taskEvent / taskEventPartitioned) were receiving HTTP 500 from the OTLP ingest endpoints because the ClickHouse factory threw an error when passed those store values. The throw aborted the entire OTLP batch in #exportEvents. Non-ClickHouse stores are now routed directly to the Postgres eventRepository (matching the existing pattern in eventRepository/index.server.ts), and the ClickHouse factory call is wrapped in a try/catch that falls back to Postgres so any future unexpected store values degrade gracefully rather than failing the whole request. +Fixes OTLP ingest endpoints returning HTTP 500 for runs on environments that use a Postgres-backed task event store. This caused the OpenTelemetry collector to drop entire span batches as non-retryable, resulting in real span loss. From 3f96fc819f2514f01826a87bda7599aa55cb4b7d Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 2 Jun 2026 12:13:09 +0100 Subject: [PATCH 4/4] fix(webapp): remove overly-broad catch on ClickHouse factory resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As pointed out in code review: the guard on line 126 already screens out all non-ClickHouse store values, so the only values reaching getEventRepositoryForOrganizationSync are 'clickhouse' and 'clickhouse_v2' — both valid. The broad catch was unnecessary and harmful: a real ClickHouse resolution failure would have been silently routed to Postgres, writing spans where they'd be invisible on the ClickHouse read path. --- apps/webapp/app/v3/otlpExporter.server.ts | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/apps/webapp/app/v3/otlpExporter.server.ts b/apps/webapp/app/v3/otlpExporter.server.ts index 8c52bcbc999..975e4aed4a7 100644 --- a/apps/webapp/app/v3/otlpExporter.server.ts +++ b/apps/webapp/app/v3/otlpExporter.server.ts @@ -124,21 +124,14 @@ class OTLPExporter { // Non-ClickHouse stores (taskEvent / taskEventPartitioned) are Postgres-backed. // The ClickHouse factory only handles clickhouse/clickhouse_v2 and throws otherwise. if (taskEventStore !== "clickhouse" && taskEventStore !== "clickhouse_v2") { + // Non-ClickHouse stores (taskEvent / taskEventPartitioned) are Postgres-backed. + // The ClickHouse factory only handles clickhouse/clickhouse_v2 and throws otherwise. resolved = { key: "postgres:default", repository: eventRepository }; } else { - try { - resolved = this._clickhouseFactory.getEventRepositoryForOrganizationSync( - taskEventStore, - event.organizationId - ); - } catch (error) { - logger.error("[OTLPExporter] Failed to resolve ClickHouse event repository", { - taskEventStore, - organizationId: event.organizationId, - error: error instanceof Error ? error.message : error, - }); - resolved = { key: "postgres:default", repository: eventRepository }; - } + resolved = this._clickhouseFactory.getEventRepositoryForOrganizationSync( + taskEventStore, + event.organizationId + ); } routeCache.set(routeKey, resolved); }