diff --git a/src/lib/availability/availability-service.ts b/src/lib/availability/availability-service.ts index 8e0560b0f..ce48cca28 100644 --- a/src/lib/availability/availability-service.ts +++ b/src/lib/availability/availability-service.ts @@ -4,7 +4,7 @@ * Simple two-tier status: success (green) or failure (red) */ -import { and, eq, inArray, isNull, type SQLWrapper, sql } from "drizzle-orm"; +import { and, eq, inArray, isNotNull, isNull, type SQLWrapper, sql } from "drizzle-orm"; import { db } from "@/drizzle/db"; import { messageRequest, providers } from "@/drizzle/schema"; import type { @@ -46,28 +46,6 @@ const FINALIZED_REQUEST_OUTCOME_ALIAS = "successRateOutcome" as const; const FINALIZED_REQUEST_OUTCOME_SQL = sql.raw(`"${FINALIZED_REQUEST_OUTCOME_ALIAS}"`); const COUNTABLE_REQUEST_OUTCOME_SQL = sql`${FINALIZED_REQUEST_OUTCOME_SQL} IN ('success', 'failure')`; -/** - * Provider-chain `reason` values that, when present on the last chain entry, - * indicate the message-request has reached a terminal state. - * - * Mirrors the list inside `fn_is_message_request_finalized` (drizzle/0095_*.sql) - * and `fn_compute_message_request_success_rate_outcome` — keep in sync. - */ -const FINALIZED_PROVIDER_CHAIN_REASONS = [ - "request_success", - "retry_success", - "retry_failed", - "system_error", - "resource_not_found", - "client_error_non_retryable", - "concurrent_limit_failed", - "hedge_winner", - "hedge_loser_cancelled", - "client_abort", -] as const; -const FINALIZED_PROVIDER_CHAIN_REASONS_SQL = sql.raw( - FINALIZED_PROVIDER_CHAIN_REASONS.map((reason) => `'${reason}'`).join(", ") -); // Keep the hard cap independent from the UI/API default so future default tuning does not silently relax/tighten the guardrail. // It intentionally equals the default today; the separation preserves distinct semantic roles for future tuning. 
export const MAX_BUCKETS_HARD_LIMIT = 100; @@ -85,62 +63,24 @@ export class AvailabilityQueryValidationError extends Error { } /** - * "Finalized request" predicate used in the availability CTE WHERE clause. + * 可用性监控的"已终态"边界收敛为 `status_code IS NOT NULL`。 + * + * 这与部分索引 `idx_message_request_provider_created_at_finalized_active` + * 的谓词 `deleted_at IS NULL AND status_code IS NOT NULL` 对齐,让 + * provider + 时间范围聚合可以直接命中索引,而不是退化为大范围扫描。 * - * SEMANTICALLY EQUIVALENT to `fn_is_message_request_finalized(blocked_by, - * status_code, provider_chain, error_message)` defined in drizzle/0095_*.sql - * (re-affirmed in 0097_*.sql / 0098_*.sql). It is intentionally inlined here - * because PostgreSQL does NOT inline PL/pgSQL functions, which means calling - * the function in the WHERE clause makes the predicate opaque to the - * planner. That hides the dominant `status_code IS NOT NULL` branch and - * prevents the planner from using the partial index - * `idx_message_request_provider_created_at_finalized_active` - * (predicate: `status_code IS NOT NULL AND deleted_at IS NULL`), which - * collapses the dashboard query into a sequential scan. + * 不复刻 `fn_is_message_request_finalized` 的语义(即使内联)也是有意为之: + * 该函数会把仅有 providerChain / errorMessage 片段但 statusCode 仍为 NULL + * 的"请求中"记录判为终态;放到可用性统计里会被分类函数误算成 failure。 + * 终态记录的成功/失败/排除分类继续由 + * `fn_compute_message_request_success_rate_outcome(...)` 处理。 * - * KEEP IN SYNC with `fn_is_message_request_finalized` in - * drizzle/0095_young_lily_hollister.sql; the trigger and the row-level - * outcome function still call the SQL function (per-row write path, not - * latency critical), so the canonical definition stays in PL/pgSQL. + * 已知限制:若未来出现 status_code 长时间未落库但请求已稳定结束的写路径, + * 这些记录会被排除;届时应引入独立的、SARGable 的 finalized 谓词, + * 而不是放回 PL/pgSQL 函数调用。 */ function buildAvailabilityFinalizedCondition() { - // The `IS NOT NULL` checks below are individually SARGable. Listing - // status_code first encourages the planner to scan the partial index. 
- // - // The provider_chain branch wraps each jsonb-array operation in a CASE - // because PostgreSQL does NOT guarantee left-to-right short-circuit of - // AND / OR (see PG docs on Logical Operators). Without CASE, an - // observed-rare-but-legal historical row where `provider_chain` is a - // non-array jsonb value (object, scalar, or json null) would make - // `jsonb_array_length(...)` raise `cannot get array length of a non-array` - // and crash the dashboard query. - // - // The `?` JSONB key-existence operator on the last line is correct under - // the `pg` driver Drizzle uses today (parameterized via `$N`). If we ever - // swap drivers (e.g. `postgres.js`) bare `?` may be reinterpreted as a - // positional placeholder; either change the driver or use - // `jsonb_exists(..., 'statusCode')` at that point. - return sql`( - ${messageRequest.statusCode} IS NOT NULL - OR ${messageRequest.blockedBy} IS NOT NULL - OR COALESCE(${messageRequest.errorMessage}, '') <> '' - OR ( - CASE - WHEN ${messageRequest.providerChain} IS NULL THEN FALSE - WHEN jsonb_typeof(${messageRequest.providerChain}) <> 'array' THEN FALSE - WHEN jsonb_array_length(${messageRequest.providerChain}) = 0 THEN FALSE - WHEN jsonb_typeof(${messageRequest.providerChain} -> -1) <> 'object' THEN FALSE - ELSE ( - (${messageRequest.providerChain} -> -1 ->> 'reason') IN (${FINALIZED_PROVIDER_CHAIN_REASONS_SQL}) - OR ( - (${messageRequest.providerChain} -> -1 ? 
'statusCode') - AND jsonb_typeof(${messageRequest.providerChain} -> -1 -> 'statusCode') = 'number' - ) - OR COALESCE(${messageRequest.providerChain} -> -1 ->> 'errorMessage', '') <> '' - ) - END - ) - )`; + return isNotNull(messageRequest.statusCode); } function assertValidDate(date: Date, fieldName: string): Date { diff --git a/tests/unit/lib/availability-service.test.ts b/tests/unit/lib/availability-service.test.ts index 8a8d38a8e..e36c9a532 100644 --- a/tests/unit/lib/availability-service.test.ts +++ b/tests/unit/lib/availability-service.test.ts @@ -50,6 +50,17 @@ function extractFinalizedRequestsSql(queryText: string): string { return queryText.slice(start, end); } +// 终态边界必须仅由 status_code 收敛:不能回退到包含 blocked_by / +// error_message / provider_chain 任一非空的旧语义,否则会重新把"请求中" +// 记录纳入可用性统计。每一处断言都重复这套规则,防止个别用例漏检导致回归。 +function expectStatusCodeOnlyFinalizedBoundary(sqlText: string) { + expect(sqlText).not.toContain("fn_is_message_request_finalized"); + expect(sqlText).toContain(`"status_code" is not null`); + expect(sqlText).not.toContain(`"blocked_by" is not null`); + expect(sqlText).not.toContain(`"error_message" is not null`); + expect(sqlText).not.toContain(`"provider_chain" -> -1 ->> 'reason'`); +} + describe("availability-service", () => { beforeEach(() => { vi.resetModules(); @@ -309,24 +320,17 @@ describe("availability-service", () => { const queryText = normalizeSql(executeMock.mock.calls[0]?.[0]); const finalizedRequestsSql = extractFinalizedRequestsSql(queryText); - // The "finalized" predicate is inlined as a SARGable expression (not a - // function call) so the planner can use the partial index on - // status_code IS NOT NULL. 
- expect(finalizedRequestsSql).not.toContain("fn_is_message_request_finalized"); - expect(finalizedRequestsSql).toContain(`"status_code" is not null`); - expect(finalizedRequestsSql).toContain(`"blocked_by" is not null`); - expect(finalizedRequestsSql).toContain(`"provider_chain" -> -1 ->> 'reason'`); - // The provider_chain branch must wrap jsonb operations in a CASE so the - // dashboard query does not crash on a non-array historical row - // (PostgreSQL does not guarantee AND short-circuit). - expect(finalizedRequestsSql).toContain("case"); - expect(finalizedRequestsSql).toContain( - `jsonb_typeof("message_request"."provider_chain") <> 'array'` - ); + // 可用性监控的终态边界收敛为 status_code IS NOT NULL, + // 这样才能命中部分索引 idx_message_request_provider_created_at_finalized_active; + // 同时不会把 providerChain / errorMessage 已写入但 statusCode 仍为空的"请求中" + // 记录纳入聚合 —— 否则它们会在分类阶段被误判成 failure。 + expectStatusCodeOnlyFinalizedBoundary(finalizedRequestsSql); expect(queryText).toContain("group by"); expect(queryText).toContain("percentile_cont(0.95)"); expect(queryText).toContain("row_number() over"); expect(queryText).toContain(`"successrateoutcome" in ('success', 'failure')`); + // 终态记录的 success/failure/excluded 分类仍由 outcome 函数完成。 + expect(queryText).toContain("fn_compute_message_request_success_rate_outcome"); expect(queryText).toContain('avg("durationms") filter'); }); @@ -495,13 +499,9 @@ describe("availability-service", () => { const finalizedRequestsSql = extractFinalizedRequestsSql( normalizeSql(executeMock.mock.calls[0]?.[0]) ); - // The "finalized" predicate is inlined as a SARGable expression (not a - // function call) so the planner can use the partial index on - // status_code IS NOT NULL. 
- expect(finalizedRequestsSql).not.toContain("fn_is_message_request_finalized"); - expect(finalizedRequestsSql).toContain(`"status_code" is not null`); - expect(finalizedRequestsSql).toContain(`"blocked_by" is not null`); - expect(finalizedRequestsSql).toContain(`"provider_chain" -> -1 ->> 'reason'`); + // 终态判定只看 status_code IS NOT NULL:既能命中部分索引,又能直接排除"请求中" + // 的记录,不再依据 providerChain / errorMessage 片段把它们判为终态。 + expectStatusCodeOnlyFinalizedBoundary(finalizedRequestsSql); }); it("queryProviderAvailability 会保留 Gemini passthrough 终态(statusCode!=null 且 durationMs=null)", async () => { @@ -535,6 +535,10 @@ describe("availability-service", () => { normalizeSql(executeMock.mock.calls[0]?.[0]) ); expect(finalizedRequestsSql).not.toMatch(/where .*duration_?ms.*is not null/); + // Gemini passthrough 写入了 statusCode(即使 durationMs 仍为 null), + // 因此会被 status_code IS NOT NULL 的终态过滤保留下来;同时保持终态边界 + // 不被其他字段放宽。 + expectStatusCodeOnlyFinalizedBoundary(finalizedRequestsSql); }); it("queryProviderAvailability 当前不会把中间持久化状态(statusCode=null 且 durationMs!=null)误算为 red", async () => { @@ -567,13 +571,9 @@ describe("availability-service", () => { const queryText = normalizeSql(executeMock.mock.calls[0]?.[0]); const finalizedRequestsSql = extractFinalizedRequestsSql(queryText); - // The "finalized" predicate is inlined as a SARGable expression (not a - // function call) so the planner can use the partial index on - // status_code IS NOT NULL. 
- expect(finalizedRequestsSql).not.toContain("fn_is_message_request_finalized"); - expect(finalizedRequestsSql).toContain(`"status_code" is not null`); - expect(finalizedRequestsSql).toContain(`"blocked_by" is not null`); - expect(finalizedRequestsSql).toContain(`"provider_chain" -> -1 ->> 'reason'`); + // status_code IS NOT NULL 把 statusCode=null 的中间持久化记录直接排除在聚合外, + // 它们根本不会进入 outcome 分类阶段,所以不会被算成 failure。 + expectStatusCodeOnlyFinalizedBoundary(finalizedRequestsSql); expect(queryText).toContain("fn_compute_message_request_success_rate_outcome"); expect(queryText).toContain(`"successrateoutcome" = 'failure'`); }); @@ -742,11 +742,10 @@ describe("availability-service", () => { ]); const queryText = normalizeSql(executeMock.mock.calls[0]?.[0]); - // Inlined finalized predicate (planner-transparent; see - // buildAvailabilityFinalizedCondition in availability-service.ts). - expect(queryText).not.toContain("fn_is_message_request_finalized"); - expect(queryText).toContain(`"status_code" is not null`); - expect(queryText).toContain(`"blocked_by" is not null`); + // getCurrentProviderStatus 同样使用 status_code IS NOT NULL 终态边界, + // 让短窗口查询也能直接命中部分索引并避免误判"请求中"。 + expectStatusCodeOnlyFinalizedBoundary(queryText); + expect(queryText).toContain("fn_compute_message_request_success_rate_outcome"); expect(queryText).toContain(">= now() - (15 * interval '1 minute')"); expect(queryText).toContain("<= now()"); expect(queryText).toContain("count(*) filter");