Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 15 additions & 75 deletions src/lib/availability/availability-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* Simple two-tier status: success (green) or failure (red)
*/

import { and, eq, inArray, isNull, type SQLWrapper, sql } from "drizzle-orm";
import { and, eq, inArray, isNotNull, isNull, type SQLWrapper, sql } from "drizzle-orm";
import { db } from "@/drizzle/db";
import { messageRequest, providers } from "@/drizzle/schema";
import type {
Expand Down Expand Up @@ -46,28 +46,6 @@ const FINALIZED_REQUEST_OUTCOME_ALIAS = "successRateOutcome" as const;
const FINALIZED_REQUEST_OUTCOME_SQL = sql.raw(`"${FINALIZED_REQUEST_OUTCOME_ALIAS}"`);
const COUNTABLE_REQUEST_OUTCOME_SQL = sql`${FINALIZED_REQUEST_OUTCOME_SQL} IN ('success', 'failure')`;

/**
 * Provider-chain `reason` values that, when present on the last chain entry,
 * indicate the message-request has reached a terminal state.
 *
 * Mirrors the list inside `fn_is_message_request_finalized` (drizzle/0095_*.sql)
 * and `fn_compute_message_request_success_rate_outcome` — keep in sync.
 */
const FINALIZED_PROVIDER_CHAIN_REASONS = [
"request_success",
"retry_success",
"retry_failed",
"system_error",
"resource_not_found",
"client_error_non_retryable",
"concurrent_limit_failed",
"hedge_winner",
"hedge_loser_cancelled",
"client_abort",
] as const;
// The reasons pre-rendered as a single-quoted, comma-separated list suitable
// for an SQL `IN (...)` clause. `sql.raw` embeds the text as-is instead of
// binding parameters; every entry comes from the literal array above, so no
// caller-supplied input reaches this string.
const FINALIZED_PROVIDER_CHAIN_REASONS_SQL = sql.raw(
FINALIZED_PROVIDER_CHAIN_REASONS.map((reason) => `'${reason}'`).join(", ")
);
// Keep the hard cap independent from the UI/API default so future default tuning does not silently relax/tighten the guardrail.
// It intentionally equals the default today; the separation preserves distinct semantic roles for future tuning.
export const MAX_BUCKETS_HARD_LIMIT = 100;
Expand All @@ -85,62 +63,24 @@ export class AvailabilityQueryValidationError extends Error {
}

/**
 * Builds the "finalized request" predicate used by the availability queries.
 *
 * The finalized boundary is intentionally just `status_code IS NOT NULL`:
 *
 * - It lines up with the predicate of the partial index
 *   `idx_message_request_provider_created_at_finalized_active`
 *   (`deleted_at IS NULL AND status_code IS NOT NULL`), so provider +
 *   time-range aggregation can hit that index instead of degrading into a
 *   wide scan.
 * - It deliberately does NOT replicate the semantics of
 *   `fn_is_message_request_finalized`, not even as an inlined expression.
 *   That function treats rows that only carry providerChain / errorMessage
 *   fragments while statusCode is still NULL as finalized; in availability
 *   statistics those in-flight rows would then be misclassified as failures
 *   by the outcome function.
 *
 * Success / failure / excluded classification of finalized rows is still
 * handled by `fn_compute_message_request_success_rate_outcome(...)`.
 *
 * Known limitation: if a write path ever appears where a request has settled
 * but status_code stays unpersisted for a long time, those rows are excluded.
 * Introduce a separate, SARGable finalized predicate at that point rather
 * than going back to a PL/pgSQL function call.
 *
 * @returns A Drizzle condition equivalent to `status_code IS NOT NULL`.
 */
function buildAvailabilityFinalizedCondition() {
  return isNotNull(messageRequest.statusCode);
}

function assertValidDate(date: Date, fieldName: string): Date {
Expand Down
65 changes: 32 additions & 33 deletions tests/unit/lib/availability-service.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,17 @@ function extractFinalizedRequestsSql(queryText: string): string {
return queryText.slice(start, end);
}

// The finalized boundary must be decided by status_code alone. It must not
// fall back to the legacy semantics where a non-empty blocked_by /
// error_message / provider_chain also counted as finalized, because that
// would pull in-flight rows back into the availability statistics. Every
// call site repeats this full rule set so that a single test case cannot
// miss a regression.
function expectStatusCodeOnlyFinalizedBoundary(sqlText: string): void {
  expect(sqlText).not.toContain("fn_is_message_request_finalized");
  expect(sqlText).toContain(`"status_code" is not null`);
  expect(sqlText).not.toContain(`"blocked_by" is not null`);
  expect(sqlText).not.toContain(`"error_message" is not null`);
  // The legacy predicate tested error_message as `COALESCE(col, '') <> ''`,
  // never as `IS NOT NULL`, so the check above alone cannot detect that
  // branch coming back. Match the COALESCE form explicitly.
  expect(sqlText).not.toMatch(/coalesce\([^()]*"error_message"[^()]*\)\s*<>\s*''/i);
  expect(sqlText).not.toContain(`"provider_chain" -> -1 ->> 'reason'`);
}

describe("availability-service", () => {
beforeEach(() => {
vi.resetModules();
Expand Down Expand Up @@ -309,24 +320,17 @@ describe("availability-service", () => {

const queryText = normalizeSql(executeMock.mock.calls[0]?.[0]);
const finalizedRequestsSql = extractFinalizedRequestsSql(queryText);
// The "finalized" predicate is inlined as a SARGable expression (not a
// function call) so the planner can use the partial index on
// status_code IS NOT NULL.
expect(finalizedRequestsSql).not.toContain("fn_is_message_request_finalized");
expect(finalizedRequestsSql).toContain(`"status_code" is not null`);
expect(finalizedRequestsSql).toContain(`"blocked_by" is not null`);
expect(finalizedRequestsSql).toContain(`"provider_chain" -> -1 ->> 'reason'`);
// The provider_chain branch must wrap jsonb operations in a CASE so the
// dashboard query does not crash on a non-array historical row
// (PostgreSQL does not guarantee AND short-circuit).
expect(finalizedRequestsSql).toContain("case");
expect(finalizedRequestsSql).toContain(
`jsonb_typeof("message_request"."provider_chain") <> 'array'`
);
// 可用性监控的终态边界收敛为 status_code IS NOT NULL,
// 这样才能命中部分索引 idx_message_request_provider_created_at_finalized_active;
// 同时不会把 providerChain / errorMessage 已写入但 statusCode 仍为空的"请求中"
// 记录纳入聚合 —— 它们会在分类阶段被误判成 failure。
expectStatusCodeOnlyFinalizedBoundary(finalizedRequestsSql);
expect(queryText).toContain("group by");
expect(queryText).toContain("percentile_cont(0.95)");
expect(queryText).toContain("row_number() over");
expect(queryText).toContain(`"successrateoutcome" in ('success', 'failure')`);
// 终态记录的 success/failure/excluded 分类仍由 outcome 函数完成。
expect(queryText).toContain("fn_compute_message_request_success_rate_outcome");
expect(queryText).toContain('avg("durationms") filter');
});

Expand Down Expand Up @@ -495,13 +499,9 @@ describe("availability-service", () => {
const finalizedRequestsSql = extractFinalizedRequestsSql(
normalizeSql(executeMock.mock.calls[0]?.[0])
);
// The "finalized" predicate is inlined as a SARGable expression (not a
// function call) so the planner can use the partial index on
// status_code IS NOT NULL.
expect(finalizedRequestsSql).not.toContain("fn_is_message_request_finalized");
expect(finalizedRequestsSql).toContain(`"status_code" is not null`);
expect(finalizedRequestsSql).toContain(`"blocked_by" is not null`);
expect(finalizedRequestsSql).toContain(`"provider_chain" -> -1 ->> 'reason'`);
// 终态判定只看 status_code IS NOT NULL:要么命中部分索引,要么直接排除"请求中"
// 的记录,不再依据 providerChain / errorMessage 片段把它们判为终态。
expectStatusCodeOnlyFinalizedBoundary(finalizedRequestsSql);
});

it("queryProviderAvailability 会保留 Gemini passthrough 终态(statusCode!=null 且 durationMs=null)", async () => {
Expand Down Expand Up @@ -535,6 +535,10 @@ describe("availability-service", () => {
normalizeSql(executeMock.mock.calls[0]?.[0])
);
expect(finalizedRequestsSql).not.toMatch(/where .*duration_?ms.*is not null/);
// Gemini passthrough 写入了 statusCode(即使 durationMs 仍为 null),
// 因此会被 status_code IS NOT NULL 的终态过滤保留下来;同时保持终态边界
// 不被其他字段放宽。
expectStatusCodeOnlyFinalizedBoundary(finalizedRequestsSql);
});

it("queryProviderAvailability 当前不会把中间持久化状态(statusCode=null 且 durationMs!=null)误算为 red", async () => {
Expand Down Expand Up @@ -567,13 +571,9 @@ describe("availability-service", () => {
const queryText = normalizeSql(executeMock.mock.calls[0]?.[0]);
const finalizedRequestsSql = extractFinalizedRequestsSql(queryText);

// The "finalized" predicate is inlined as a SARGable expression (not a
// function call) so the planner can use the partial index on
// status_code IS NOT NULL.
expect(finalizedRequestsSql).not.toContain("fn_is_message_request_finalized");
expect(finalizedRequestsSql).toContain(`"status_code" is not null`);
expect(finalizedRequestsSql).toContain(`"blocked_by" is not null`);
expect(finalizedRequestsSql).toContain(`"provider_chain" -> -1 ->> 'reason'`);
// status_code IS NOT NULL 把 statusCode=null 的中间持久化记录直接排除在聚合外,
// 它们根本不会进入 outcome 分类阶段,所以不会被算成 failure。
expectStatusCodeOnlyFinalizedBoundary(finalizedRequestsSql);
expect(queryText).toContain("fn_compute_message_request_success_rate_outcome");
expect(queryText).toContain(`"successrateoutcome" = 'failure'`);
});
Expand Down Expand Up @@ -742,11 +742,10 @@ describe("availability-service", () => {
]);

const queryText = normalizeSql(executeMock.mock.calls[0]?.[0]);
// Inlined finalized predicate (planner-transparent; see
// buildAvailabilityFinalizedCondition in availability-service.ts).
expect(queryText).not.toContain("fn_is_message_request_finalized");
expect(queryText).toContain(`"status_code" is not null`);
expect(queryText).toContain(`"blocked_by" is not null`);
// getCurrentProviderStatus 同样使用 status_code IS NOT NULL 终态边界,
// 让短窗口查询也能直接命中部分索引并避免误判"请求中"。
expectStatusCodeOnlyFinalizedBoundary(queryText);
expect(queryText).toContain("fn_compute_message_request_success_rate_outcome");
expect(queryText).toContain(">= now() - (15 * interval '1 minute')");
expect(queryText).toContain("<= now()");
expect(queryText).toContain("count(*) filter");
Expand Down
Loading