diff --git a/docs/docs/api/appkit/Class.ExecutionError.md b/docs/docs/api/appkit/Class.ExecutionError.md index 75886c4dc..eacfdc23d 100644 --- a/docs/docs/api/appkit/Class.ExecutionError.md +++ b/docs/docs/api/appkit/Class.ExecutionError.md @@ -22,6 +22,7 @@ throw new ExecutionError("Statement was canceled"); new ExecutionError(message: string, options?: { cause?: Error; context?: Record; + errorCode?: string; }): ExecutionError; ``` @@ -30,15 +31,16 @@ new ExecutionError(message: string, options?: { | Parameter | Type | | ------ | ------ | | `message` | `string` | -| `options?` | \{ `cause?`: `Error`; `context?`: `Record`\<`string`, `unknown`\>; \} | +| `options?` | \{ `cause?`: `Error`; `context?`: `Record`\<`string`, `unknown`\>; `errorCode?`: `string`; \} | | `options.cause?` | `Error` | | `options.context?` | `Record`\<`string`, `unknown`\> | +| `options.errorCode?` | `string` | #### Returns `ExecutionError` -#### Inherited from +#### Overrides [`AppKitError`](Class.AppKitError.md).[`constructor`](Class.AppKitError.md#constructor) @@ -86,6 +88,19 @@ Additional context for the error *** +### errorCode? + +```ts +readonly optional errorCode: string; +``` + +Structured error code from the upstream source (typically the warehouse's +`error_code` for statement-level failures, or the SDK's `ApiError.errorCode` +for HTTP failures). Preserved through wrapping so callers can branch on a +stable identifier without substring-matching the message. + +*** + ### isRetryable ```ts @@ -202,16 +217,17 @@ Create an execution error for closed/expired results ### statementFailed() ```ts -static statementFailed(errorMessage?: string): ExecutionError; +static statementFailed(errorMessage?: string, errorCode?: string): ExecutionError; ``` -Create an execution error for statement failure +Create an execution error for statement failure. #### Parameters -| Parameter | Type | -| ------ | ------ | -| `errorMessage?` | `string` | +| Parameter | Type | Description | +| ------ | ------ | ------ | +| `errorMessage?` | `string` | Human-readable error from the warehouse / SDK. | +| `errorCode?` | `string` | Structured code (e.g. "INVALID_PARAMETER_VALUE") to preserve through wrapping. Optional. | #### Returns diff --git a/packages/appkit-ui/src/js/sse/connect-sse.ts b/packages/appkit-ui/src/js/sse/connect-sse.ts index c4fd4500d..13d9053de 100644 --- a/packages/appkit-ui/src/js/sse/connect-sse.ts +++ b/packages/appkit-ui/src/js/sse/connect-sse.ts @@ -18,7 +18,11 @@ export async function connectSSE( lastEventId: initialLastEventId = null, retryDelay = 2000, maxRetries = 3, - maxBufferSize = 1024 * 1024, // 1MB + // 1 MiB — matches the server's `streamDefaults.maxEventSize`. SSE + // carries only short JSON control messages; bulk Arrow payloads flow + // over plain HTTP via `/api/analytics/arrow-result/:jobId`, so this + // buffer never needs to hold multi-MiB attachments. + maxBufferSize = 1 * 1024 * 1024, timeout = 300000, // 5 minutes onError, } = options; diff --git a/packages/appkit-ui/src/react/charts/__tests__/types.test.ts b/packages/appkit-ui/src/react/charts/__tests__/types.test.ts index 13394dcf6..d6685ce01 100644 --- a/packages/appkit-ui/src/react/charts/__tests__/types.test.ts +++ b/packages/appkit-ui/src/react/charts/__tests__/types.test.ts @@ -93,7 +93,7 @@ describe("isQueryProps", () => { const props = { queryKey: "test_query", parameters: { limit: 100 }, - format: "json" as const, + format: "json_array" as const, }; expect(isQueryProps(props as any)).toBe(true); diff --git a/packages/appkit-ui/src/react/charts/types.ts b/packages/appkit-ui/src/react/charts/types.ts index 65804a741..ef738aad2 100644 --- a/packages/appkit-ui/src/react/charts/types.ts +++ b/packages/appkit-ui/src/react/charts/types.ts @@ -4,8 +4,22 @@ import type { Table } from "apache-arrow"; // Data Format Types // ============================================================================ -/** Supported data formats for analytics queries */ -export type DataFormat = "json" | "arrow" | "auto"; +/** + * Supported data formats for analytics queries. + * + * "json" and "arrow" are legacy aliases kept for backwards compatibility + * with appkit-ui < 0.33.0 — safe to remove once no consumer is on a + * pre-0.33.0 version. resolveFormat() normalizes them to their canonical + * equivalents before any downstream code reads the value. + */ +export type DataFormat = + | "json_array" + | "arrow_stream" + | "auto" + /** @deprecated Use "json_array". Safe to remove once no consumer is on appkit-ui < 0.33.0. */ + | "json" + /** @deprecated Use "arrow_stream". Safe to remove once no consumer is on appkit-ui < 0.33.0. */ + | "arrow"; /** Chart orientation */ export type Orientation = "vertical" | "horizontal"; @@ -77,8 +91,8 @@ export interface QueryProps extends ChartBaseProps { parameters?: Record; /** * Data format to use - * - "json": Use JSON format (smaller payloads, simpler) - * - "arrow": Use Arrow format (faster for large datasets) + * - "json_array": Use JSON format (smaller payloads, simpler) + * - "arrow_stream": Use Arrow format (faster for large datasets) * - "auto": Automatically select based on expected data size * @default "auto" */ diff --git a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts index cfe5d6ce9..ba4c962b0 100644 --- a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts +++ b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts @@ -1,34 +1,220 @@ -import { renderHook } from "@testing-library/react"; -import { afterEach, describe, expect, test, vi } from "vitest"; +import { renderHook, waitFor } from "@testing-library/react"; +import { beforeEach, describe, expect, test, vi } from "vitest"; -// Mock connectSSE so the hook does not attempt a real network request. -const mockConnectSSE = vi.fn().mockImplementation((_opts: unknown) => { - // Return a never-resolving promise; tests don't need the result. - return new Promise(() => {}); +let lastConnectArgs: any = null; +const mockProcessArrowBuffer = vi.fn(); +const mockFetchArrow = vi.fn(); +const mockConnectSSE = vi.fn((args: any) => { + lastConnectArgs = args; + return () => {}; }); vi.mock("@/js", () => ({ + connectSSE: (...args: unknown[]) => mockConnectSSE(...(args as [any])), ArrowClient: { - fetchArrow: vi.fn(), - processArrowBuffer: vi.fn(), + fetchArrow: (...args: unknown[]) => mockFetchArrow(...args), + processArrowBuffer: (...args: unknown[]) => mockProcessArrowBuffer(...args), }, - connectSSE: (...args: unknown[]) => mockConnectSSE(...args), })); -// Stub useQueryHMR so we don't pull in import.meta.hot wiring. vi.mock("../use-query-hmr", () => ({ - useQueryHMR: () => {}, + useQueryHMR: vi.fn(), })); import { useAnalyticsQuery } from "../use-analytics-query"; describe("useAnalyticsQuery", () => { - afterEach(() => { + beforeEach(() => { vi.clearAllMocks(); + lastConnectArgs = null; + }); + + test("fetches an arrow message (warehouse statement id) via /arrow-result", async () => { + const fakeTable = { numRows: 1, schema: { fields: [] } }; + const fakeBytes = new Uint8Array([1, 2, 3]); + mockFetchArrow.mockResolvedValueOnce(fakeBytes); + mockProcessArrowBuffer.mockResolvedValueOnce(fakeTable); + + const { result } = renderHook(() => + useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }), + ); + + await lastConnectArgs.onMessage({ + data: JSON.stringify({ type: "arrow", statement_id: "stmt-warehouse-1" }), + }); + + await waitFor(() => { + expect(result.current.data).toBe(fakeTable); + }); + + expect(mockFetchArrow).toHaveBeenCalledTimes(1); + expect(mockFetchArrow).toHaveBeenCalledWith( + "/api/analytics/arrow-result/stmt-warehouse-1", + ); + expect(mockProcessArrowBuffer).toHaveBeenCalledWith(fakeBytes); + }); + + test("fetches an arrow message with synthetic inline- id through the same /arrow-result path", async () => { + // The client must treat inline and external-links responses uniformly — + // it never decodes base64 locally. The /arrow-result route on the + // server is the only place that knows which path the bytes came from. + const fakeTable = { numRows: 1, schema: { fields: [] } }; + const fakeBytes = new Uint8Array([1, 2, 3, 4, 5]); + mockFetchArrow.mockResolvedValueOnce(fakeBytes); + mockProcessArrowBuffer.mockResolvedValueOnce(fakeTable); + + const { result } = renderHook(() => + useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }), + ); + + await lastConnectArgs.onMessage({ + data: JSON.stringify({ + type: "arrow", + statement_id: "inline-abc-xyz", + }), + }); + + await waitFor(() => { + expect(result.current.data).toBe(fakeTable); + }); + + expect(mockFetchArrow).toHaveBeenCalledTimes(1); + expect(mockFetchArrow).toHaveBeenCalledWith( + "/api/analytics/arrow-result/inline-abc-xyz", + ); + }); + + test("surfaces an error when the arrow fetch fails", async () => { + mockFetchArrow.mockRejectedValueOnce(new Error("network")); + + const { result } = renderHook(() => + useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }), + ); + + await lastConnectArgs.onMessage({ + data: JSON.stringify({ type: "arrow", statement_id: "stmt-1" }), + }); + + await waitFor(() => { + expect(result.current.error).toBe( + "Unable to load data, please try again", + ); + }); + expect(result.current.loading).toBe(false); + }); + + test("rejects the retired arrow_inline message type as schema-invalid", async () => { + // arrow_inline was the prior wire shape. The discriminated union no + // longer accepts it, so it falls through to the generic error/code + // branch — but critically, it must NEVER trigger ArrowClient calls. + const { result } = renderHook(() => + useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }), + ); + + await lastConnectArgs.onMessage({ + data: JSON.stringify({ type: "arrow_inline", attachment: "AQID" }), + }); + + await waitFor(() => { + expect( + result.current.loading || + result.current.error || + result.current.data === null, + ).toBeTruthy(); + }); + expect(mockProcessArrowBuffer).not.toHaveBeenCalled(); + expect(mockFetchArrow).not.toHaveBeenCalled(); + }); + + test("normalizes an empty result message (no data field) to []", async () => { + // The wire schema makes `data` optional — empty result sets may omit + // it. The hook must surface that as an explicit empty array rather + // than `undefined`, so callers can rely on `data` being either null + // (no message yet) or a value of the inferred result type. + const { result } = renderHook(() => + useAnalyticsQuery("q", null, { format: "JSON_ARRAY" }), + ); + + await lastConnectArgs.onMessage({ + data: JSON.stringify({ type: "result" }), + }); + + await waitFor(() => { + expect(result.current.data).toEqual([]); + }); + expect(result.current.loading).toBe(false); + expect(result.current.error).toBeNull(); + }); + + test("still handles type:result rows for JSON_ARRAY", async () => { + const { result } = renderHook(() => + useAnalyticsQuery("q", null, { format: "JSON_ARRAY" }), + ); + + await lastConnectArgs.onMessage({ + data: JSON.stringify({ + type: "result", + data: [{ id: 1 }, { id: 2 }], + }), + }); + + await waitFor(() => { + expect(result.current.data).toEqual([{ id: 1 }, { id: 2 }]); + }); + expect(mockProcessArrowBuffer).not.toHaveBeenCalled(); + expect(mockFetchArrow).not.toHaveBeenCalled(); + }); + + test("a malformed (non-JSON) SSE payload clears loading and surfaces an error — does not strand the hook in loading=true", async () => { + // A `JSON.parse` failure inside the SSE handler used to be swallowed + // by the outer catch with only a console.warn, leaving the hook + // permanently in `loading=true` with no error surfaced. The UI would + // spin forever. The handler now reports a user-facing error so the + // consumer can render a retry affordance. + const { result } = renderHook(() => + useAnalyticsQuery("q", null, { format: "JSON_ARRAY" }), + ); + + await lastConnectArgs.onMessage({ data: "not-json{" }); + + await waitFor(() => { + expect(result.current.loading).toBe(false); + }); + expect(result.current.error).toBe("Unable to load data, please try again"); + expect(result.current.data).toBeNull(); + }); + + test("a server error event carrying a structured errorCode exposes it on the hook return value", async () => { + // The SSE error broadcaster forwards an `errorCode` field for + // UI branching (e.g. INLINE_ARROW_STASH_EXHAUSTED). The hook + // surfaces both the human `error` text AND the structured + // `errorCode` so consumers can branch on the stable identifier + // instead of substring-matching the sanitized human message. + const errorSpy = vi.spyOn(console, "error").mockImplementation(() => {}); + + const { result } = renderHook(() => + useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }), + ); + + await lastConnectArgs.onMessage({ + data: JSON.stringify({ + type: "error", + error: "Server is at capacity, please retry", + code: "UPSTREAM_ERROR", + errorCode: "INLINE_ARROW_STASH_EXHAUSTED", + }), + }); + + await waitFor(() => { + expect(result.current.error).toBe("Server is at capacity, please retry"); + }); + expect(result.current.loading).toBe(false); + expect(result.current.errorCode).toBe("INLINE_ARROW_STASH_EXHAUSTED"); + + errorSpy.mockRestore(); }); test("does not refetch when params object is structurally equal across renders", () => { - // Each render passes a fresh object literal — the common footgun. const { rerender } = renderHook( ({ limit }: { limit: number }) => // biome-ignore lint/suspicious/noExplicitAny: typed registry not available in tests @@ -36,15 +222,12 @@ describe("useAnalyticsQuery", () => { { initialProps: { limit: 10 } }, ); - // Initial render triggers exactly one connection. expect(mockConnectSSE).toHaveBeenCalledTimes(1); - // Re-render with structurally-equal-but-new-reference params. rerender({ limit: 10 }); rerender({ limit: 10 }); rerender({ limit: 10 }); - // Should NOT have refetched — the hook stabilized the params reference. expect(mockConnectSSE).toHaveBeenCalledTimes(1); }); diff --git a/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts b/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts index a4d99a916..686aff317 100644 --- a/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts +++ b/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts @@ -72,7 +72,7 @@ describe("useChartData", () => { }); describe("format selection", () => { - test("uses JSON format when explicitly specified", () => { + test("uses JSON_ARRAY format when explicitly specified", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -82,7 +82,7 @@ describe("useChartData", () => { renderHook(() => useChartData({ queryKey: "test", - format: "json", + format: "json_array", }), ); @@ -93,7 +93,7 @@ describe("useChartData", () => { ); }); - test("uses ARROW format when explicitly specified", () => { + test("uses ARROW_STREAM format when explicitly specified", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -103,7 +103,7 @@ describe("useChartData", () => { renderHook(() => useChartData({ queryKey: "test", - format: "arrow", + format: "arrow_stream", }), ); @@ -114,7 +114,7 @@ describe("useChartData", () => { ); }); - test("auto-selects ARROW for large limit", () => { + test("auto-selects ARROW_STREAM for large limit", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -136,7 +136,7 @@ describe("useChartData", () => { ); }); - test("auto-selects ARROW for date range queries", () => { + test("auto-selects ARROW_STREAM for date range queries", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -205,7 +205,7 @@ describe("useChartData", () => { ); }); - test("auto-selects JSON by default when no heuristics match", () => { + test("auto-selects JSON_ARRAY by default when no heuristics match", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -227,7 +227,7 @@ describe("useChartData", () => { ); }); - test("defaults to auto format (JSON) when format is not specified", () => { + test("defaults to auto format (JSON_ARRAY) when format is not specified", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -353,7 +353,7 @@ describe("useChartData", () => { expect(result.current.isArrow).toBe(false); }); - test("isArrow reflects requested ARROW format when data is null", () => { + test("isArrow reflects requested ARROW_STREAM format when data is null", () => { mockUseAnalyticsQuery.mockReturnValue({ data: null, loading: true, @@ -361,13 +361,13 @@ describe("useChartData", () => { }); const { result } = renderHook(() => - useChartData({ queryKey: "test", format: "arrow" }), + useChartData({ queryKey: "test", format: "arrow_stream" }), ); expect(result.current.isArrow).toBe(true); }); - test("isArrow reflects requested JSON format when data is null", () => { + test("isArrow reflects requested JSON_ARRAY format when data is null", () => { mockUseAnalyticsQuery.mockReturnValue({ data: null, loading: true, @@ -375,7 +375,7 @@ describe("useChartData", () => { }); const { result } = renderHook(() => - useChartData({ queryKey: "test", format: "json" }), + useChartData({ queryKey: "test", format: "json_array" }), ); expect(result.current.isArrow).toBe(false); diff --git a/packages/appkit-ui/src/react/hooks/types.ts b/packages/appkit-ui/src/react/hooks/types.ts index 60fc5f63b..a603e10a4 100644 --- a/packages/appkit-ui/src/react/hooks/types.ts +++ b/packages/appkit-ui/src/react/hooks/types.ts @@ -47,7 +47,7 @@ export interface TypedArrowTable< export interface UseAnalyticsQueryOptions< F extends AnalyticsFormat = "JSON_ARRAY", > { - /** Response format - "JSON_ARRAY" returns typed arrays, "ARROW_STREAM" returns TypedArrowTable */ + /** Response format - "JSON_ARRAY" (default) returns typed arrays, "ARROW_STREAM" uses Arrow (inline or external links) */ format?: F; /** Maximum size of serialized parameters in bytes */ @@ -63,8 +63,16 @@ export interface UseAnalyticsQueryResult { data: T | null; /** Loading state of the query */ loading: boolean; - /** Error state of the query */ + /** Error state of the query — sanitized human-readable message */ error: string | null; + /** + * Structured upstream error code when the server attaches one to the + * SSE error payload (e.g. `INLINE_ARROW_STASH_EXHAUSTED`, + * `RESULT_TOO_LARGE_FOR_JSON_FALLBACK`, `NOT_IMPLEMENTED`). Prefer + * branching UI on this stable identifier rather than substring- + * matching `error`, which is a free-form sanitized message. + */ + errorCode: string | null; } /** diff --git a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts index 05a32ee02..a3d2cf306 100644 --- a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts +++ b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts @@ -1,4 +1,5 @@ import { useCallback, useEffect, useMemo, useRef, useState } from "react"; +import { AnalyticsSseMessage } from "shared"; import { ArrowClient, connectSSE } from "@/js"; import type { AnalyticsFormat, @@ -88,13 +89,13 @@ function getArrowStreamUrl(id: string) { * @param options - Analytics query settings including format * @returns Query result state with format-appropriate data type * - * @example JSON format (default) + * @example JSON_ARRAY format (default) * ```typescript * const { data } = useAnalyticsQuery("spend_data", params); * // data: Array<{ group_key: string; cost_usd: number; ... }> | null * ``` * - * @example Arrow format + * @example ARROW_STREAM format * ```typescript * const { data } = useAnalyticsQuery("spend_data", params, { format: "ARROW_STREAM" }); * // data: TypedArrowTable<{ group_key: string; cost_usd: number; ... }> | null @@ -120,6 +121,7 @@ export function useAnalyticsQuery< const [data, setData] = useState(null); const [loading, setLoading] = useState(false); const [error, setError] = useState(null); + const [errorCode, setErrorCode] = useState(null); const abortControllerRef = useRef(null); if (!queryKey || queryKey.trim().length === 0) { @@ -167,6 +169,7 @@ export function useAnalyticsQuery< setLoading(true); setError(null); + setErrorCode(null); setData(null); const abortController = new AbortController(); @@ -178,20 +181,35 @@ export function useAnalyticsQuery< signal: abortController.signal, onMessage: async (message) => { try { - const parsed = JSON.parse(message.data); + const rawParsed = JSON.parse(message.data); - // success - JSON format - if (parsed.type === "result") { + // The error/code branch below predates the SSE wire schema and + // can fire for messages that don't match any AnalyticsSseMessage + // variant (e.g. server-side error events from executeStream). + // Try schema validation first; if it fails, fall through to the + // generic error/code handling below. + const validated = AnalyticsSseMessage.safeParse(rawParsed); + const msg = validated.success ? validated.data : null; + + // success - JSON format. The wire schema makes `data` optional + // (e.g. an empty result set may omit it), so normalize the + // missing case to an explicit empty array rather than letting + // `undefined` bleed into the hook's `T | null` state. + if (msg?.type === "result") { setLoading(false); - setData(parsed.data as ResultType); + setData((msg.data ?? []) as ResultType); return; } - // success - Arrow format - if (parsed.type === "arrow") { + // success - Arrow format. Both INLINE (server-stashed, + // statement_id prefixed with "inline-") and EXTERNAL_LINKS + // (warehouse statement_id) flow through this single branch — the + // /arrow-result route dispatches based on the id prefix so the + // client doesn't need to know which path the bytes came from. + if (msg?.type === "arrow") { try { const arrowData = await ArrowClient.fetchArrow( - getArrowStreamUrl(parsed.statement_id), + getArrowStreamUrl(msg.statement_id), ); const table = await ArrowClient.processArrowBuffer(arrowData); setLoading(false); @@ -209,6 +227,11 @@ export function useAnalyticsQuery< } } + // The schema didn't match — fall through to error/code handling + // below for legacy error events or surface a malformed-payload + // error if no error fields are present. + const parsed = rawParsed; + // error if (parsed.type === "error" || parsed.error || parsed.code) { const errorMsg = @@ -216,6 +239,14 @@ export function useAnalyticsQuery< setLoading(false); setError(errorMsg); + // Propagate the upstream structured code so UI consumers + // can branch on a stable identifier (e.g. retry on + // INLINE_ARROW_STASH_EXHAUSTED, format-switch on + // RESULT_TOO_LARGE_FOR_JSON_FALLBACK) instead of parsing + // the human-readable message. + if (typeof parsed.errorCode === "string") { + setErrorCode(parsed.errorCode); + } if (parsed.code) { console.error( @@ -224,8 +255,33 @@ export function useAnalyticsQuery< } return; } + + // The payload matched neither AnalyticsSseMessage nor an error + // event — surface a generic error rather than silently dropping it. + if (!validated.success) { + console.error( + "[useAnalyticsQuery] Malformed SSE payload", + validated.error.flatten(), + ); + setLoading(false); + setError("Unable to load data, please try again"); + return; + } } catch (error) { + // A `JSON.parse` failure (or any other thrown error inside the + // SSE message handler) used to leave the hook permanently in + // `loading=true` with no error surfaced — the UI would just + // spin forever. Clear loading and report a user-facing error + // so the consumer can render a retry affordance. + // + // We also abort the SSE connection: if the upstream is + // emitting un-parseable frames, leaving the stream open just + // re-fires the same failure on the next message. Closing + // forces the consumer into a clean retry path. console.warn("[useAnalyticsQuery] Malformed message received", error); + setLoading(false); + setError("Unable to load data, please try again"); + abortController.abort(); } }, onError: (error) => { @@ -265,5 +321,5 @@ export function useAnalyticsQuery< // Enable HMR for query updates in dev mode useQueryHMR(queryKey, start); - return { data, loading, error }; + return { data, loading, error, errorCode }; } diff --git a/packages/appkit-ui/src/react/hooks/use-chart-data.ts b/packages/appkit-ui/src/react/hooks/use-chart-data.ts index a90481a2e..64b6e167f 100644 --- a/packages/appkit-ui/src/react/hooks/use-chart-data.ts +++ b/packages/appkit-ui/src/react/hooks/use-chart-data.ts @@ -17,8 +17,8 @@ export interface UseChartDataOptions { parameters?: Record; /** * Data format preference - * - "json": Force JSON format - * - "arrow": Force Arrow format + * - "json_array": Force JSON format + * - "arrow_stream": Force Arrow format * - "auto": Auto-select based on heuristics * @default "auto" */ @@ -51,9 +51,10 @@ function resolveFormat( format: DataFormat, parameters?: Record, ): "JSON_ARRAY" | "ARROW_STREAM" { - // Explicit format selection - if (format === "json") return "JSON_ARRAY"; - if (format === "arrow") return "ARROW_STREAM"; + // Explicit format selection (legacy "json"/"arrow" accepted for back-compat + // with appkit-ui < 0.33.0 — see DataFormat in ../charts/types.ts). + if (format === "json_array" || format === "json") return "JSON_ARRAY"; + if (format === "arrow_stream" || format === "arrow") return "ARROW_STREAM"; // Auto-selection heuristics if (format === "auto") { @@ -97,7 +98,7 @@ function resolveFormat( * // Force Arrow format * const { data } = useChartData({ * queryKey: "big_query", - * format: "arrow" + * format: "arrow_stream" * }); * ``` */ diff --git a/packages/appkit/package.json b/packages/appkit/package.json index a5cb234dc..9f1d180bf 100644 --- a/packages/appkit/package.json +++ b/packages/appkit/package.json @@ -73,6 +73,7 @@ "@opentelemetry/sdk-trace-base": "2.6.0", "@opentelemetry/semantic-conventions": "1.38.0", "@types/semver": "7.7.1", + "apache-arrow": "21.1.0", "dotenv": "16.6.1", "express": "4.22.0", "get-port": "7.2.0", diff --git a/packages/appkit/src/connectors/sql-warehouse/arrow-schema.ts b/packages/appkit/src/connectors/sql-warehouse/arrow-schema.ts new file mode 100644 index 000000000..17d099e37 --- /dev/null +++ b/packages/appkit/src/connectors/sql-warehouse/arrow-schema.ts @@ -0,0 +1,441 @@ +import { + Binary, + Bool, + type DataType, + DateDay, + Decimal, + DurationMicrosecond, + Field, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + IntervalYearMonth, + List, + Map_, + Null, + Schema, + Struct, + Table, + TimestampMicrosecond, + tableToIPC, + Utf8, +} from "apache-arrow"; + +/** + * Parse a Databricks SQL type text (the value returned by the Statement + * Execution API in `ColumnInfo.type_text`) into an Apache Arrow DataType. + * + * Supports: + * - All scalar types (STRING, INT, BIGINT, DECIMAL, TIMESTAMP, etc.) + * - Parameterized scalars: DECIMAL(p,s), VARCHAR(n), CHAR(n) + * - Nested types: ARRAY, MAP, STRUCT + * - INTERVAL year-month and day-time variants + * - Backtick-quoted struct field names with embedded `` `` `` escapes + * + * Unknown or unparseable types fall back to Utf8 — empty-Table consumers + * still see a column with the right name; only the inner type is degraded. + */ +export function parseDatabricksType(typeText: string): DataType { + const parser = new TypeParser(typeText); + const result = parser.parseType(); + parser.expectEnd(); + return result; +} + +/** + * Build an empty Arrow IPC stream (base64-encoded) matching the column schema + * returned by the warehouse. Used so ARROW_STREAM responses with no rows still + * deliver a real Arrow Table to the client, preserving the hook's typed + * contract. + */ +export function buildEmptyArrowIPCBase64( + columns: Array<{ + name?: string; + type_text?: string; + type_name?: string; + }>, +): string { + const fields = columns.map((col, index) => { + const typeText = col.type_text ?? col.type_name ?? "STRING"; + let dataType: DataType; + try { + dataType = parseDatabricksType(typeText); + } catch { + dataType = new Utf8(); + } + const name = col.name && col.name.length > 0 ? col.name : `column_${index}`; + return new Field(name, dataType, true); + }); + const schema = new Schema(fields); + const table = new Table(schema); + const ipc = tableToIPC(table, "stream"); + return Buffer.from(ipc).toString("base64"); +} + +// ============================================================================ +// Recursive-descent parser +// ============================================================================ + +class TypeParser { + private readonly input: string; + private pos = 0; + + constructor(input: string) { + this.input = input; + } + + parseType(): DataType { + this.skipWs(); + + let name: string; + if (this.peek() === "`") { + name = this.consumeBacktickIdent(); + } else { + name = this.consumeIdent(); + } + const upper = name.toUpperCase(); + + this.skipWs(); + + if (upper === "INTERVAL") { + return this.parseInterval(); + } + + if (this.peek() === "(") { + this.consume("("); + const args = this.parseNumberArgs(); + this.consume(")"); + this.skipWs(); + return this.makeParameterized(upper, args); + } + + if (this.peek() === "<") { + this.consume("<"); + const result = this.makeGeneric(upper); + this.skipWs(); + this.consume(">"); + return result; + } + + return this.makeScalar(upper); + } + + expectEnd(): void { + this.skipWs(); + if (this.pos < this.input.length) { + throw new Error( + `Unexpected trailing input at position ${this.pos}: "${this.input.slice(this.pos)}"`, + ); + } + } + + // ─── Type constructors ─────────────────────────────────── + + private makeScalar(upper: string): DataType { + switch (upper) { + case "STRING": + case "VARIANT": + return new Utf8(); + case "VARCHAR": + case "CHAR": + return new Utf8(); + case "BINARY": + case "GEOGRAPHY": + case "GEOMETRY": + return new Binary(); + case "BOOLEAN": + case "BOOL": + return new Bool(); + case "TINYINT": + case "BYTE": + return new Int8(); + case "SMALLINT": + case "SHORT": + return new Int16(); + case "INT": + case "INTEGER": + return new Int32(); + case "BIGINT": + case "LONG": + return new Int64(); + case "FLOAT": + case "REAL": + return new Float32(); + case "DOUBLE": + return new Float64(); + case "DECIMAL": + case "NUMERIC": + case "DEC": + return new Decimal(0, 10, 128); + case "DATE": + return new DateDay(); + case "TIMESTAMP": + case "TIMESTAMP_LTZ": + return new TimestampMicrosecond("UTC"); + case "TIMESTAMP_NTZ": + return new TimestampMicrosecond(); + case "VOID": + case "NULL": + return new Null(); + default: + return new Utf8(); + } + } + + private makeParameterized(upper: string, args: number[]): DataType { + switch (upper) { + case "DECIMAL": + case "NUMERIC": + case "DEC": { + const precision = args[0] ?? 10; + const scale = args[1] ?? 0; + // Arrow JS Decimal constructor signature is (scale, precision, bitWidth). + return new Decimal(scale, precision, 128); + } + case "VARCHAR": + case "CHAR": + return new Utf8(); + default: + return new Utf8(); + } + } + + private makeGeneric(upper: string): DataType { + switch (upper) { + case "ARRAY": { + const inner = this.parseType(); + return new List(new Field("item", inner, true)); + } + case "MAP": { + const keyType = this.parseType(); + this.skipWs(); + this.consume(","); + this.skipWs(); + const valueType = this.parseType(); + const entriesStruct = new Struct([ + new Field("key", keyType, false), + new Field("value", valueType, true), + ]); + return new Map_(new Field("entries", entriesStruct, false), false); + } + case "STRUCT": + return this.parseStructFields(); + default: + // Unknown generic — skip to matching '>' and fall back. + this.skipBalancedAngles(); + return new Utf8(); + } + } + + private parseStructFields(): DataType { + const fields: Field[] = []; + while (true) { + this.skipWs(); + if (this.peek() === ">") break; + + let name: string; + if (this.peek() === "`") { + name = this.consumeBacktickIdent(); + } else { + name = this.consumeIdent(); + } + + this.skipWs(); + this.consume(":"); + this.skipWs(); + + const type = this.parseType(); + + // Optional `NOT NULL` and `COMMENT '...'`. Both are accepted by + // Databricks DDL and may appear in `type_text`. + this.skipWs(); + while (this.peekKeyword("NOT")) { + this.consumeIdent(); + this.skipWs(); + if (this.peekKeyword("NULL")) { + this.consumeIdent(); + } + this.skipWs(); + } + if (this.peekKeyword("COMMENT")) { + this.consumeIdent(); + this.skipWs(); + this.consumeStringLiteral(); + this.skipWs(); + } + + fields.push(new Field(name, type, true)); + + this.skipWs(); + if (this.peek() === ",") { + this.consume(","); + } else { + break; + } + } + return new Struct(fields); + } + + private parseInterval(): DataType { + // Grammar: INTERVAL [TO ] + // YEAR / MONTH variants -> IntervalYearMonth + // DAY / HOUR / MINUTE / SECOND variants -> Duration(microsecond) + const seen: string[] = []; + while (this.pos < this.input.length) { + this.skipWs(); + const c = this.peek(); + if (c === "" || c === "," || c === ">" || c === ")") break; + const word = this.consumeIdent().toUpperCase(); + seen.push(word); + } + const isYearMonth = seen.some((w) => w === "YEAR" || w === "MONTH"); + return isYearMonth ? new IntervalYearMonth() : new DurationMicrosecond(); + } + + private parseNumberArgs(): number[] { + const args: number[] = []; + while (true) { + this.skipWs(); + if (this.peek() === ")") break; + args.push(this.consumeNumber()); + this.skipWs(); + if (this.peek() === ",") { + this.consume(","); + } else { + break; + } + } + return args; + } + + // ─── Token utilities ───────────────────────────────────── + + private peek(): string { + return this.input[this.pos] ?? ""; + } + + private peekKeyword(word: string): boolean { + const slice = this.input.slice(this.pos, this.pos + word.length); + if (slice.toUpperCase() !== word.toUpperCase()) return false; + // Must be followed by a non-identifier character (boundary check). + const next = this.input[this.pos + word.length] ?? ""; + return !/[A-Za-z0-9_]/.test(next); + } + + private consume(expected: string): void { + if (this.peek() !== expected) { + throw new Error( + `Expected "${expected}" at position ${this.pos}, got "${this.peek()}" in "${this.input}"`, + ); + } + this.pos++; + } + + private skipWs(): void { + while ( + this.pos < this.input.length && + /\s/.test(this.input[this.pos] ?? "") + ) { + this.pos++; + } + } + + private consumeIdent(): string { + const start = this.pos; + while ( + this.pos < this.input.length && + /[A-Za-z0-9_]/.test(this.input[this.pos] ?? "") + ) { + this.pos++; + } + if (this.pos === start) { + throw new Error( + `Expected identifier at position ${this.pos}, got "${this.peek()}" in "${this.input}"`, + ); + } + return this.input.slice(start, this.pos); + } + + private consumeBacktickIdent(): string { + this.consume("`"); + let value = ""; + while (this.pos < this.input.length) { + if (this.input[this.pos] === "`") { + if (this.input[this.pos + 1] === "`") { + value += "`"; + this.pos += 2; + continue; + } + break; + } + value += this.input[this.pos]; + this.pos++; + } + this.consume("`"); + return value; + } + + private consumeNumber(): number { + const start = this.pos; + while ( + this.pos < this.input.length && + /[0-9]/.test(this.input[this.pos] ?? "") + ) { + this.pos++; + } + if (this.pos === start) { + throw new Error( + `Expected number at position ${this.pos}, got "${this.peek()}" in "${this.input}"`, + ); + } + return Number.parseInt(this.input.slice(start, this.pos), 10); + } + + private consumeStringLiteral(): string { + const quote = this.peek(); + if (quote !== "'" && quote !== '"') { + throw new Error( + `Expected string literal at position ${this.pos}, got "${quote}" in "${this.input}"`, + ); + } + this.pos++; + let value = ""; + while (this.pos < this.input.length) { + const c = this.input[this.pos]; + if (c === "\\") { + // Escape sequence: keep the next char verbatim. + const next = this.input[this.pos + 1]; + if (next !== undefined) { + value += next; + this.pos += 2; + continue; + } + this.pos++; + continue; + } + if (c === quote) { + this.pos++; + return value; + } + value += c; + this.pos++; + } + throw new Error(`Unterminated string literal in "${this.input}"`); + } + + private skipBalancedAngles(): void { + let depth = 1; + while (this.pos < this.input.length && depth > 0) { + const c = this.peek(); + if (c === "<") depth++; + else if (c === ">") { + depth--; + if (depth === 0) return; + } + this.pos++; + } + } +} diff --git a/packages/appkit/src/connectors/sql-warehouse/client.ts b/packages/appkit/src/connectors/sql-warehouse/client.ts index d0a1c1816..a0016d7bd 100644 --- a/packages/appkit/src/connectors/sql-warehouse/client.ts +++ b/packages/appkit/src/connectors/sql-warehouse/client.ts @@ -21,10 +21,23 @@ import { SpanStatusCode, TelemetryManager, } from "../../telemetry"; +import { buildEmptyArrowIPCBase64 } from "./arrow-schema"; import { executeStatementDefaults } from "./defaults"; const logger = createLogger("connectors:sql-warehouse"); +/** + * Maximum size for inline Arrow IPC attachments (25 MiB decoded — the + * Databricks Statement Execution API hard cap on INLINE responses). + * + * Bulk Arrow payloads no longer traverse SSE — the analytics route stashes + * them via `InlineArrowStash` and the client fetches over HTTP — so this + * cap is bounded by the upstream API rather than our event-size budget. + * Larger results still fall through to `disposition: "EXTERNAL_LINKS"`, + * handled by the analytics format-fallback. + */ +const MAX_INLINE_ATTACHMENT_BYTES = 25 * 1024 * 1024; + interface SQLWarehouseConfig { timeout?: number; telemetry?: TelemetryOptions; @@ -196,7 +209,10 @@ export class SQLWarehouseConnector { result = this._transformDataArray(response); break; case "FAILED": - throw ExecutionError.statementFailed(status.error?.message); + throw ExecutionError.statementFailed( + status.error?.message, + status.error?.error_code, + ); case "CANCELED": throw ExecutionError.canceled(); case "CLOSED": @@ -246,8 +262,17 @@ export class SQLWarehouseConnector { if (error instanceof AppKitError) { throw error; } + // Preserve the SDK's structured ApiError.errorCode (e.g. + // "INVALID_PARAMETER_VALUE", "BAD_REQUEST") through the wrap so + // callers can branch on a stable identifier rather than + // substring-matching the message. + const sdkErrorCode = + error && typeof error === "object" && "errorCode" in error + ? (error as { errorCode?: unknown }).errorCode + : undefined; throw ExecutionError.statementFailed( error instanceof Error ? error.message : String(error), + typeof sdkErrorCode === "string" ? sdkErrorCode : undefined, ); } finally { // remove abort handler @@ -360,7 +385,10 @@ export class SQLWarehouseConnector { span.setStatus({ code: SpanStatusCode.OK }); return this._transformDataArray(response); case "FAILED": - throw ExecutionError.statementFailed(status.error?.message); + throw ExecutionError.statementFailed( + status.error?.message, + status.error?.error_code, + ); case "CANCELED": throw ExecutionError.canceled(); case "CLOSED": @@ -382,12 +410,16 @@ export class SQLWarehouseConnector { message: error instanceof Error ? error.message : String(error), }); - // error logging is handled by executeStatement's catch block (gated on isAborted) if (error instanceof AppKitError) { throw error; } + const sdkErrorCode = + error && typeof error === "object" && "errorCode" in error + ? (error as { errorCode?: unknown }).errorCode + : undefined; throw ExecutionError.statementFailed( error instanceof Error ? error.message : String(error), + typeof sdkErrorCode === "string" ? sdkErrorCode : undefined, ); } finally { span.end(); @@ -399,7 +431,47 @@ export class SQLWarehouseConnector { private _transformDataArray(response: sql.StatementResponse) { if (response.manifest?.format === "ARROW_STREAM") { - return this.updateWithArrowStatus(response); + const result = response.result as + | (sql.ResultData & { attachment?: string }) + | undefined; + + // Inline Arrow: pass the base64 IPC attachment through unmodified so + // the analytics route can stream it to the client, where the existing + // ArrowClient infrastructure decodes it into a Table. Validate size + // here to fail fast on runaway payloads. + if (result?.attachment) { + return this._validateArrowAttachment(response, result.attachment); + } + + // External links: data fetched separately via statement_id. + if (result?.external_links) { + return this.updateWithArrowStatus(response); + } + + // Empty result with a known schema: synthesize a zero-row Arrow IPC + // attachment so the client always receives an Arrow Table for + // ARROW_STREAM, regardless of whether the warehouse returned data. + // Note: an empty array (`data_array: []`) is truthy, so length-check + // explicitly — otherwise zero-row responses fall through to the JSON + // row transform below and return `[]` JSON rows instead of an Arrow + // table. + const hasNoRows = + !result?.data_array || + (Array.isArray(result.data_array) && result.data_array.length === 0); + if (hasNoRows && response.manifest?.schema?.columns) { + const synthesized = buildEmptyArrowIPCBase64( + response.manifest.schema.columns, + ); + return { + ...response, + result: { ...(result ?? {}), attachment: synthesized }, + }; + } + + // Inline data_array under ARROW_STREAM (rare): fall through to the + // row transform below. The hook will receive `type: "result"` rows; + // callers asking for ARROW_STREAM should not hit this path with + // current Databricks warehouses. } if (!response.result?.data_array || !response.manifest?.schema?.columns) { @@ -445,6 +517,41 @@ export class SQLWarehouseConnector { }; } + /** + * Validate (but do not decode) a base64 Arrow IPC attachment. + * Some serverless warehouses return inline results as Arrow IPC in + * `result.attachment`. We pass the base64 string through to the client, + * which decodes it into an Arrow Table via the existing ArrowClient + * infrastructure. This keeps the wire contract for ARROW_STREAM + * consistent (client always receives an Arrow Table) and avoids + * decode/re-encode work on the server. + */ + private _validateArrowAttachment( + response: sql.StatementResponse, + attachment: string, + ) { + // Cap the size to protect against unbounded inline payloads from + // misbehaving warehouses. 64 MiB is well above the typical inline limit + // (~25 MiB hard cap on the API) but bounds memory if a server returns + // a runaway response. + // + // Strip whitespace (rare but legal in base64) and account for trailing + // `=` padding so the byte count is exact rather than an upper bound. + const stripped = attachment.replace(/\s+/g, ""); + const padding = stripped.endsWith("==") + ? 2 + : stripped.endsWith("=") + ? 1 + : 0; + const decodedSize = Math.floor((stripped.length * 3) / 4) - padding; + if (decodedSize > MAX_INLINE_ATTACHMENT_BYTES) { + throw ExecutionError.statementFailed( + `Inline Arrow attachment exceeds maximum size (${decodedSize} > ${MAX_INLINE_ATTACHMENT_BYTES} bytes)`, + ); + } + return response; + } + private updateWithArrowStatus(response: sql.StatementResponse): { result: { statement_id: string; status: sql.StatementStatus }; } { diff --git a/packages/appkit/src/connectors/sql-warehouse/tests/arrow-schema.test.ts b/packages/appkit/src/connectors/sql-warehouse/tests/arrow-schema.test.ts new file mode 100644 index 000000000..e30b7315a --- /dev/null +++ b/packages/appkit/src/connectors/sql-warehouse/tests/arrow-schema.test.ts @@ -0,0 +1,514 @@ +import { + Binary, + Bool, + type DataType, + DateDay, + Decimal, + DurationMicrosecond, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + IntervalYearMonth, + List, + Map_, + Null, + Struct, + TimestampMicrosecond, + Type, + tableFromIPC, + Utf8, +} from "apache-arrow"; +import { describe, expect, test } from "vitest"; +import { buildEmptyArrowIPCBase64, parseDatabricksType } from "../arrow-schema"; + +// ============================================================================ +// Helpers +// ============================================================================ + +/** Walk the type tree and produce a stable string representation for assertions. */ +function typeSummary(t: DataType): string { + if (t instanceof Decimal) return `Decimal(${t.precision},${t.scale})`; + if (t instanceof TimestampMicrosecond) { + const tz = (t as TimestampMicrosecond & { timezone?: string }).timezone; + return tz ? `Timestamp[us,${tz}]` : "Timestamp[us]"; + } + if (t instanceof List) { + const inner = (t.children?.[0]?.type as DataType | undefined) ?? new Utf8(); + return `List<${typeSummary(inner)}>`; + } + if (t instanceof Struct) { + const inner = (t.children ?? []) + .map((f) => `${f.name}:${typeSummary(f.type as DataType)}`) + .join(","); + return `Struct<${inner}>`; + } + if (t instanceof Map_) { + const entries = + (t.children?.[0]?.type as Struct | undefined)?.children ?? []; + const k = entries[0]?.type as DataType | undefined; + const v = entries[1]?.type as DataType | undefined; + return `Map<${typeSummary(k ?? new Utf8())},${typeSummary(v ?? new Utf8())}>`; + } + // Fall back to typeId for primitives. + return Type[t.typeId] ?? t.constructor.name; +} + +// ============================================================================ +// Scalar types +// ============================================================================ + +describe("parseDatabricksType — scalars", () => { + test.each([ + ["STRING", Utf8], + ["VARIANT", Utf8], + ["BINARY", Binary], + ["GEOGRAPHY", Binary], + ["GEOMETRY", Binary], + ["BOOLEAN", Bool], + ["BOOL", Bool], + ["TINYINT", Int8], + ["BYTE", Int8], + ["SMALLINT", Int16], + ["SHORT", Int16], + ["INT", Int32], + ["INTEGER", Int32], + ["BIGINT", Int64], + ["LONG", Int64], + ["FLOAT", Float32], + ["REAL", Float32], + ["DOUBLE", Float64], + ["DATE", DateDay], + ["VOID", Null], + ["NULL", Null], + ] as const)("%s parses to expected type", (input, ctor) => { + const t = parseDatabricksType(input); + expect(t).toBeInstanceOf(ctor); + }); + + test("case-insensitive — lowercase is accepted", () => { + expect(parseDatabricksType("string")).toBeInstanceOf(Utf8); + expect(parseDatabricksType("bigint")).toBeInstanceOf(Int64); + }); + + test("TIMESTAMP defaults to UTC tz", () => { + const t = parseDatabricksType("TIMESTAMP") as TimestampMicrosecond; + expect(t).toBeInstanceOf(TimestampMicrosecond); + expect(t.timezone).toBe("UTC"); + }); + + test("TIMESTAMP_LTZ behaves like TIMESTAMP", () => { + const t = parseDatabricksType("TIMESTAMP_LTZ") as TimestampMicrosecond; + expect(t.timezone).toBe("UTC"); + }); + + test("TIMESTAMP_NTZ has no timezone", () => { + const t = parseDatabricksType("TIMESTAMP_NTZ") as TimestampMicrosecond; + expect(t).toBeInstanceOf(TimestampMicrosecond); + expect(t.timezone == null || t.timezone === "").toBe(true); + }); + + test("Unknown scalar falls back to Utf8 (degraded but doesn't throw)", () => { + expect(parseDatabricksType("SOMETHING_NEW")).toBeInstanceOf(Utf8); + }); +}); + +// ============================================================================ +// Parameterized scalars +// ============================================================================ + +describe("parseDatabricksType — parameterized scalars", () => { + test("VARCHAR(255) → Utf8 (Arrow doesn't track string length)", () => { + expect(parseDatabricksType("VARCHAR(255)")).toBeInstanceOf(Utf8); + }); + + test("CHAR(10) → Utf8", () => { + expect(parseDatabricksType("CHAR(10)")).toBeInstanceOf(Utf8); + }); + + test("DECIMAL(10,2) → Decimal(precision=10, scale=2)", () => { + const t = parseDatabricksType("DECIMAL(10,2)") as Decimal; + expect(t).toBeInstanceOf(Decimal); + expect(t.precision).toBe(10); + expect(t.scale).toBe(2); + }); + + test("DECIMAL(38,0) — max precision, no scale", () => { + const t = parseDatabricksType("DECIMAL(38,0)") as Decimal; + expect(t.precision).toBe(38); + expect(t.scale).toBe(0); + }); + + test("NUMERIC(p,s) is an alias for DECIMAL(p,s)", () => { + const t = parseDatabricksType("NUMERIC(15,4)") as Decimal; + expect(t).toBeInstanceOf(Decimal); + expect(t.precision).toBe(15); + expect(t.scale).toBe(4); + }); + + test("DEC(p,s) is an alias for DECIMAL(p,s)", () => { + const t = parseDatabricksType("DEC(7,3)") as Decimal; + expect(t.precision).toBe(7); + expect(t.scale).toBe(3); + }); + + test("DECIMAL with whitespace inside parens", () => { + const t = parseDatabricksType("DECIMAL( 10 , 2 )") as Decimal; + expect(t.precision).toBe(10); + expect(t.scale).toBe(2); + }); + + test("DECIMAL with single arg (precision only) defaults scale=0", () => { + const t = parseDatabricksType("DECIMAL(20)") as Decimal; + expect(t.precision).toBe(20); + expect(t.scale).toBe(0); + }); + + test("Bare DECIMAL falls back to default precision/scale", () => { + const t = parseDatabricksType("DECIMAL") as Decimal; + expect(t).toBeInstanceOf(Decimal); + expect(typeof t.precision).toBe("number"); + expect(typeof t.scale).toBe("number"); + }); +}); + +// ============================================================================ +// INTERVAL types +// ============================================================================ + +describe("parseDatabricksType — INTERVAL", () => { + test("INTERVAL YEAR → IntervalYearMonth", () => { + expect(parseDatabricksType("INTERVAL YEAR")).toBeInstanceOf( + IntervalYearMonth, + ); + }); + + test("INTERVAL MONTH → IntervalYearMonth", () => { + expect(parseDatabricksType("INTERVAL MONTH")).toBeInstanceOf( + IntervalYearMonth, + ); + }); + + test("INTERVAL YEAR TO MONTH → IntervalYearMonth", () => { + expect(parseDatabricksType("INTERVAL YEAR TO MONTH")).toBeInstanceOf( + IntervalYearMonth, + ); + }); + + test("INTERVAL DAY → DurationMicrosecond", () => { + expect(parseDatabricksType("INTERVAL DAY")).toBeInstanceOf( + DurationMicrosecond, + ); + }); + + test("INTERVAL DAY TO SECOND → DurationMicrosecond", () => { + expect(parseDatabricksType("INTERVAL DAY TO SECOND")).toBeInstanceOf( + DurationMicrosecond, + ); + }); + + test("INTERVAL HOUR TO MINUTE → DurationMicrosecond", () => { + expect(parseDatabricksType("INTERVAL HOUR TO MINUTE")).toBeInstanceOf( + DurationMicrosecond, + ); + }); +}); + +// ============================================================================ +// ARRAY +// ============================================================================ + +describe("parseDatabricksType — ARRAY", () => { + test("ARRAY → List", () => { + const t = parseDatabricksType("ARRAY") as List; + expect(t).toBeInstanceOf(List); + expect(t.children?.[0]?.type).toBeInstanceOf(Utf8); + }); + + test("ARRAY → List", () => { + const t = parseDatabricksType("ARRAY") as List; + expect(t.children?.[0]?.type).toBeInstanceOf(Int32); + }); + + test("ARRAY preserves precision/scale", () => { + const t = parseDatabricksType("ARRAY") as List; + const inner = t.children?.[0]?.type as Decimal; + expect(inner).toBeInstanceOf(Decimal); + expect(inner.precision).toBe(10); + expect(inner.scale).toBe(2); + }); + + test("ARRAY> — nested twice", () => { + const t = parseDatabricksType("ARRAY>") as List; + const inner1 = t.children?.[0]?.type as List; + expect(inner1).toBeInstanceOf(List); + expect(inner1.children?.[0]?.type).toBeInstanceOf(Int32); + }); + + test("ARRAY>> — three levels deep", () => { + expect( + typeSummary(parseDatabricksType("ARRAY>>")), + ).toBe("List>>"); + }); + + test("ARRAY with whitespace", () => { + const t = parseDatabricksType("ARRAY < STRING >") as List; + expect(t.children?.[0]?.type).toBeInstanceOf(Utf8); + }); +}); + +// ============================================================================ +// MAP +// ============================================================================ + +describe("parseDatabricksType — MAP", () => { + test("MAP", () => { + expect(typeSummary(parseDatabricksType("MAP"))).toBe( + "Map", + ); + }); + + test("MAP — with whitespace", () => { + expect(typeSummary(parseDatabricksType("MAP"))).toBe( + "Map", + ); + }); + + test("MAP> — value is nested", () => { + expect(typeSummary(parseDatabricksType("MAP>"))).toBe( + "Map>", + ); + }); + + test("MAP> — fully nested", () => { + expect( + typeSummary(parseDatabricksType("MAP>")), + ).toBe("Map>"); + }); +}); + +// ============================================================================ +// STRUCT +// ============================================================================ + +describe("parseDatabricksType — STRUCT", () => { + test("STRUCT", () => { + const t = parseDatabricksType("STRUCT") as Struct; + expect(t).toBeInstanceOf(Struct); + expect(t.children?.length).toBe(2); + expect(t.children?.[0]?.name).toBe("a"); + expect(t.children?.[0]?.type).toBeInstanceOf(Int32); + expect(t.children?.[1]?.name).toBe("b"); + expect(t.children?.[1]?.type).toBeInstanceOf(Utf8); + }); + + test("STRUCT with whitespace and many fields", () => { + const t = parseDatabricksType( + "STRUCT", + ) as Struct; + expect(t.children?.map((f) => f.name)).toEqual(["id", "name", "ts"]); + expect(t.children?.[0]?.type).toBeInstanceOf(Int64); + expect(t.children?.[2]?.type).toBeInstanceOf(TimestampMicrosecond); + }); + + test("STRUCT with COMMENT on a field", () => { + const t = parseDatabricksType( + "STRUCT", + ) as Struct; + expect(t.children?.length).toBe(2); + expect(t.children?.[0]?.name).toBe("id"); + expect(t.children?.[0]?.type).toBeInstanceOf(Int32); + expect(t.children?.[1]?.name).toBe("name"); + }); + + test("STRUCT with COMMENT containing escaped quote", () => { + const t = parseDatabricksType( + "STRUCT", + ) as Struct; + expect(t.children?.length).toBe(2); + expect(t.children?.[0]?.name).toBe("id"); + }); + + test("STRUCT with NOT NULL annotation on a field", () => { + const t = parseDatabricksType( + "STRUCT", + ) as Struct; + expect(t.children?.length).toBe(2); + expect(t.children?.[0]?.name).toBe("id"); + }); + + test("STRUCT with backticked field name", () => { + const t = parseDatabricksType( + "STRUCT<`weird name`:INT, normal:STRING>", + ) as Struct; + expect(t.children?.[0]?.name).toBe("weird name"); + expect(t.children?.[0]?.type).toBeInstanceOf(Int32); + }); + + test("STRUCT with backticked field name containing escaped backtick", () => { + const t = parseDatabricksType( + "STRUCT<`with``tick`:INT, other:STRING>", + ) as Struct; + expect(t.children?.[0]?.name).toBe("with`tick"); + }); + + test("STRUCT with nested STRUCT", () => { + const t = parseDatabricksType( + "STRUCT, name:STRING>", + ) as Struct; + expect(t.children?.length).toBe(2); + const nested = t.children?.[0]?.type as Struct; + expect(nested).toBeInstanceOf(Struct); + expect(nested.children?.[0]?.name).toBe("inner"); + expect(nested.children?.[0]?.type).toBeInstanceOf(Int32); + }); + + test("Empty STRUCT<>", () => { + const t = parseDatabricksType("STRUCT<>") as Struct; + expect(t).toBeInstanceOf(Struct); + expect(t.children?.length).toBe(0); + }); +}); + +// ============================================================================ +// Deep nesting / mixed types +// ============================================================================ + +describe("parseDatabricksType — deeply nested", () => { + test("MAP>>", () => { + expect( + typeSummary( + parseDatabricksType( + "MAP>>", + ), + ), + ).toBe("Map>>"); + }); + + test("ARRAY>>> — 4 levels mixed", () => { + expect( + typeSummary( + parseDatabricksType( + "ARRAY>>>", + ), + ), + ).toBe("List>>>"); + }); +}); + +// ============================================================================ +// Error / robustness behavior +// ============================================================================ + +describe("parseDatabricksType — error / robustness", () => { + test("trailing garbage throws", () => { + expect(() => parseDatabricksType("INT junk")).toThrow(); + }); + + test("unmatched < throws", () => { + expect(() => parseDatabricksType("ARRAY { + expect(() => parseDatabricksType("DECIMAL(10,2")).toThrow(); + }); + + test("empty string throws", () => { + expect(() => parseDatabricksType("")).toThrow(); + }); +}); + +// ============================================================================ +// buildEmptyArrowIPCBase64 — round-trip +// ============================================================================ + +describe("buildEmptyArrowIPCBase64", () => { + test("produces a decodable empty Arrow Table with the right schema", () => { + const columns = [ + { name: "user_id", type_text: "BIGINT" }, + { name: "name", type_text: "STRING" }, + { name: "created_at", type_text: "TIMESTAMP" }, + { name: "balance", type_text: "DECIMAL(10,2)" }, + { name: "active", type_text: "BOOLEAN" }, + ]; + const b64 = buildEmptyArrowIPCBase64(columns); + const buf = Buffer.from(b64, "base64"); + const table = tableFromIPC(buf); + expect(table.numRows).toBe(0); + expect(table.numCols).toBe(5); + expect(table.schema.fields.map((f) => f.name)).toEqual([ + "user_id", + "name", + "created_at", + "balance", + "active", + ]); + expect( + (table.schema.fields[0]?.type as { bitWidth?: number }).bitWidth, + ).toBe(64); + expect(table.schema.fields[1]?.type).toBeInstanceOf(Utf8); + // After IPC round-trip Arrow JS resolves Timestamp* subclasses to a + // generic Timestamp with `unit` and `timezone`; assert structurally. + expect(table.schema.fields[2]?.type.typeId).toBe(Type.Timestamp); + expect((table.schema.fields[2]?.type as { unit?: number }).unit).toBe(2); // TimeUnit.MICROSECOND + const decimal = table.schema.fields[3]?.type as Decimal; + expect(decimal).toBeInstanceOf(Decimal); + expect(decimal.precision).toBe(10); + expect(decimal.scale).toBe(2); + expect(table.schema.fields[4]?.type).toBeInstanceOf(Bool); + }); + + test("round-trips nested types end-to-end", () => { + const columns = [ + { name: "tags", type_text: "ARRAY" }, + { name: "meta", type_text: "STRUCT" }, + { name: "counts", type_text: "MAP" }, + ]; + const buf = Buffer.from(buildEmptyArrowIPCBase64(columns), "base64"); + const table = tableFromIPC(buf); + expect(table.numRows).toBe(0); + expect(table.numCols).toBe(3); + expect(table.schema.fields[0]?.type).toBeInstanceOf(List); + expect(table.schema.fields[1]?.type).toBeInstanceOf(Struct); + expect(table.schema.fields[2]?.type).toBeInstanceOf(Map_); + }); + + test("falls back from type_text to type_name when type_text missing", () => { + const columns = [{ name: "id", type_name: "BIGINT" }]; + const buf = Buffer.from(buildEmptyArrowIPCBase64(columns), "base64"); + const table = tableFromIPC(buf); + expect( + (table.schema.fields[0]?.type as { bitWidth?: number }).bitWidth, + ).toBe(64); + }); + + test("unknown type degrades to Utf8 without throwing", () => { + const columns = [ + { name: "id", type_text: "BIGINT" }, + { name: "weird", type_text: "FUTURE_TYPE_NOT_YET_SUPPORTED" }, + ]; + const buf = Buffer.from(buildEmptyArrowIPCBase64(columns), "base64"); + const table = tableFromIPC(buf); + expect( + (table.schema.fields[0]?.type as { bitWidth?: number }).bitWidth, + ).toBe(64); + expect(table.schema.fields[1]?.type).toBeInstanceOf(Utf8); + }); + + test("missing column name gets a synthesized placeholder", () => { + const columns = [{ type_text: "STRING" }, { name: "", type_text: "INT" }]; + const buf = Buffer.from(buildEmptyArrowIPCBase64(columns), "base64"); + const table = tableFromIPC(buf); + expect(table.schema.fields[0]?.name).toBe("column_0"); + expect(table.schema.fields[1]?.name).toBe("column_1"); + }); + + test("empty schema produces a valid 0-column 0-row Table", () => { + const buf = Buffer.from(buildEmptyArrowIPCBase64([]), "base64"); + const table = tableFromIPC(buf); + expect(table.numRows).toBe(0); + expect(table.numCols).toBe(0); + }); +}); diff --git a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts new file mode 100644 index 000000000..c6780f3f1 --- /dev/null +++ b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts @@ -0,0 +1,383 @@ +import type { sql } from "@databricks/sdk-experimental"; +import { tableFromIPC } from "apache-arrow"; +import { describe, expect, test, vi } from "vitest"; + +vi.mock("../../../telemetry", () => { + const mockMeter = { + createCounter: () => ({ add: vi.fn() }), + createHistogram: () => ({ record: vi.fn() }), + }; + return { + TelemetryManager: { + getProvider: () => ({ + startActiveSpan: vi.fn(), + getMeter: () => mockMeter, + }), + }, + SpanKind: { CLIENT: 1 }, + SpanStatusCode: { ERROR: 2 }, + }; +}); +vi.mock("../../../logging/logger", () => ({ + createLogger: () => ({ + info: vi.fn(), + debug: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + event: () => null, + }), +})); +vi.mock("../../../stream/arrow-stream-processor", () => ({ + ArrowStreamProcessor: vi.fn(), +})); + +import { SQLWarehouseConnector } from "../client"; + +function createConnector() { + return new SQLWarehouseConnector({ timeout: 30000 }); +} + +// Real base64 Arrow IPC from a serverless warehouse returning +// `SELECT 1 AS test_col, 2 AS test_col2` with INLINE + ARROW_STREAM. +// Contains schema (two INT columns) + one record batch with values [1, 2]. +const REAL_ARROW_ATTACHMENT = + "/////7gAAAAQAAAAAAAKAAwACgAJAAQACgAAABAAAAAAAQQACAAIAAAABAAIAAAABAAAAAIAAABMAAAABAAAAMz///8QAAAAGAAAAAAAAQIUAAAAvP///yAAAAAAAAABAAAAAAkAAAB0ZXN0X2NvbDIAAAAQABQAEAAOAA8ABAAAAAgAEAAAABgAAAAgAAAAAAABAhwAAAAIAAwABAALAAgAAAAgAAAAAAAAAQAAAAAIAAAAdGVzdF9jb2wAAAAA/////7gAAAAQAAAADAAaABgAFwAEAAgADAAAACAAAAAAAQAAAAAAAAAAAAAAAAADBAAKABgADAAIAAQACgAAADwAAAAQAAAAAQAAAAAAAAAAAAAAAgAAAAEAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAEAAAAAAAAAQAAAAAAAAAAEAAAAAAAAAIAAAAAAAAAAAQAAAAAAAADAAAAAAAAAAAQAAAAAAAAA/wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP////8AAAAA"; + +describe("SQLWarehouseConnector._transformDataArray", () => { + describe("classic warehouse (JSON_ARRAY + INLINE)", () => { + test("transforms data_array rows into named objects", () => { + const connector = createConnector(); + // Real response shape from classic warehouse: INLINE + JSON_ARRAY + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "JSON_ARRAY", + schema: { + column_count: 2, + columns: [ + { + name: "test_col", + type_text: "INT", + type_name: "INT", + position: 0, + }, + { + name: "test_col2", + type_text: "INT", + type_name: "INT", + position: 1, + }, + ], + }, + total_row_count: 1, + truncated: false, + }, + result: { + data_array: [["1", "2"]], + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.data).toEqual([{ test_col: "1", test_col2: "2" }]); + expect(result.result.data_array).toBeUndefined(); + }); + + test("parses JSON strings in STRING columns", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "JSON_ARRAY", + schema: { + columns: [ + { name: "id", type_name: "INT" }, + { name: "metadata", type_name: "STRING" }, + ], + }, + }, + result: { + data_array: [["1", '{"key":"value"}']], + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.data[0].metadata).toEqual({ key: "value" }); + }); + }); + + describe("classic warehouse (EXTERNAL_LINKS + ARROW_STREAM)", () => { + test("returns statement_id for external links fetch", () => { + const connector = createConnector(); + // Real response shape from classic warehouse: EXTERNAL_LINKS + ARROW_STREAM + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "test_col", type_name: "INT" }, + { name: "test_col2", type_name: "INT" }, + ], + }, + }, + result: { + external_links: [ + { + external_link: "https://storage.example.com/chunk0", + expiration: "2026-04-15T00:00:00Z", + }, + ], + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.statement_id).toBe("stmt-1"); + expect(result.result.data).toBeUndefined(); + }); + }); + + describe("serverless warehouse (INLINE + ARROW_STREAM with attachment)", () => { + test("passes attachment through unchanged for client-side decoding", () => { + const connector = createConnector(); + // Real response shape from serverless warehouse: INLINE + ARROW_STREAM + // Data arrives in result.attachment as base64-encoded Arrow IPC, not data_array. + const response = { + statement_id: "00000001-test-stmt", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + column_count: 2, + columns: [ + { + name: "test_col", + type_text: "INT", + type_name: "INT", + position: 0, + }, + { + name: "test_col2", + type_text: "INT", + type_name: "INT", + position: 1, + }, + ], + total_chunk_count: 1, + chunks: [{ chunk_index: 0, row_offset: 0, row_count: 1 }], + total_row_count: 1, + }, + truncated: false, + }, + result: { + chunk_index: 0, + row_offset: 0, + row_count: 1, + attachment: REAL_ARROW_ATTACHMENT, + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.attachment).toBe(REAL_ARROW_ATTACHMENT); + expect(result.result.data).toBeUndefined(); + // Preserves other result fields + expect(result.result.row_count).toBe(1); + }); + + test("preserves manifest and status alongside attachment", () => { + const connector = createConnector(); + const response = { + statement_id: "00000001-test-stmt", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "test_col", type_name: "INT" }, + { name: "test_col2", type_name: "INT" }, + ], + }, + }, + result: { + chunk_index: 0, + row_count: 1, + attachment: REAL_ARROW_ATTACHMENT, + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + // Manifest, statement_id, and attachment are all preserved + expect(result.manifest.format).toBe("ARROW_STREAM"); + expect(result.statement_id).toBe("00000001-test-stmt"); + expect(result.result.attachment).toBe(REAL_ARROW_ATTACHMENT); + }); + + test("synthesizes an empty Arrow IPC attachment for empty results so the client always gets a Table", () => { + const connector = createConnector(); + // Empty result: no attachment, no data_array, no external_links — but + // the manifest still describes the schema. The connector should fill in + // `attachment` with a zero-row Arrow IPC matching the schema. + const response = { + statement_id: "stmt-empty", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "user_id", type_text: "BIGINT", type_name: "BIGINT" }, + { name: "name", type_text: "STRING", type_name: "STRING" }, + { + name: "balance", + type_text: "DECIMAL(10,2)", + type_name: "DECIMAL", + }, + ], + }, + total_row_count: 0, + }, + result: {}, + } as unknown as sql.StatementResponse; + + const transformed = (connector as any)._transformDataArray(response); + const attachment: string = transformed.result.attachment; + expect(typeof attachment).toBe("string"); + expect(attachment.length).toBeGreaterThan(0); + + // Verify the synthesized attachment decodes into the right empty schema. + const table = tableFromIPC(Buffer.from(attachment, "base64")); + expect(table.numRows).toBe(0); + expect(table.schema.fields.map((f) => f.name)).toEqual([ + "user_id", + "name", + "balance", + ]); + }); + + test("does NOT synthesize an attachment when external_links are present", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-ext", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { columns: [{ name: "x", type_text: "INT" }] }, + }, + result: { + external_links: [ + { external_link: "https://example.com/x", expiration: "9999" }, + ], + }, + } as unknown as sql.StatementResponse; + + const transformed = (connector as any)._transformDataArray(response); + // External-links path returns the statement_id projection — no attachment. + expect(transformed.result.attachment).toBeUndefined(); + expect(transformed.result.statement_id).toBe("stmt-ext"); + }); + + test("does NOT synthesize an attachment when schema is missing", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-no-schema", + status: { state: "SUCCEEDED" }, + manifest: { format: "ARROW_STREAM" }, + result: {}, + } as unknown as sql.StatementResponse; + + const transformed = (connector as any)._transformDataArray(response); + // Without a schema we cannot build a Table — pass through unchanged. + expect(transformed.result?.attachment).toBeUndefined(); + }); + + test("rejects oversized attachments to bound memory", () => { + const connector = createConnector(); + // 25 MiB decoded cap (Databricks API hard cap on INLINE) → 36 MiB of + // base64 chars decodes to ~27 MiB, comfortably above the limit. + const oversized = "A".repeat(36 * 1024 * 1024); + const response = { + statement_id: "stmt-oversized", + status: { state: "SUCCEEDED" }, + manifest: { format: "ARROW_STREAM" }, + result: { attachment: oversized }, + } as unknown as sql.StatementResponse; + + expect(() => (connector as any)._transformDataArray(response)).toThrow( + /exceeds maximum size/, + ); + }); + }); + + describe("ARROW_STREAM with data_array (hypothetical inline variant)", () => { + test("transforms data_array like JSON_ARRAY path", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "id", type_name: "INT" }, + { name: "value", type_name: "STRING" }, + ], + }, + }, + result: { + data_array: [ + ["1", "hello"], + ["2", "world"], + ], + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.data).toEqual([ + { id: "1", value: "hello" }, + { id: "2", value: "world" }, + ]); + }); + }); + + describe("edge cases", () => { + test("returns response unchanged when no data_array, attachment, or schema", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { format: "JSON_ARRAY" }, + result: {}, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result).toBe(response); + }); + + test("attachment takes priority over data_array when both present", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "test_col", type_name: "INT" }, + { name: "test_col2", type_name: "INT" }, + ], + }, + }, + result: { + attachment: REAL_ARROW_ATTACHMENT, + data_array: [["999", "999"]], + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + // Should pass attachment through (client decodes), not transform data_array + expect(result.result.attachment).toBe(REAL_ARROW_ATTACHMENT); + expect(result.result.data).toBeUndefined(); + }); + }); +}); diff --git a/packages/appkit/src/errors/base.ts b/packages/appkit/src/errors/base.ts index 502338232..7ed56d213 100644 --- a/packages/appkit/src/errors/base.ts +++ b/packages/appkit/src/errors/base.ts @@ -46,14 +46,32 @@ export abstract class AppKitError extends Error { /** Additional context for the error */ readonly context?: Record; + /** + * Client-safe error message. When set, callers serializing the error to + * a client (SSE, HTTP body) MUST prefer `clientMessage` over `message` + * — `message` may contain raw upstream / SDK text including statement + * fragments, internal object names, and correlation IDs. + * + * Subclasses can set this in their constructor for a fixed sanitized + * string. When unset, `clientMessage` defaults to a generic per-code + * string (see the getter), and the raw `message` is kept server-side + * only. + */ + protected readonly _clientMessage?: string; + constructor( message: string, - options?: { cause?: Error; context?: Record }, + options?: { + cause?: Error; + context?: Record; + clientMessage?: string; + }, ) { super(message); this.name = this.constructor.name; this.cause = options?.cause; this.context = options?.context; + this._clientMessage = options?.clientMessage; // Maintains proper stack trace for where the error was thrown if (Error.captureStackTrace) { @@ -61,6 +79,14 @@ export abstract class AppKitError extends Error { } } + /** + * Sanitized message safe to forward to clients. Override in subclasses + * if a more specific default is appropriate. + */ + get clientMessage(): string { + return this._clientMessage ?? "An internal error occurred"; + } + /** * Convert error to JSON for logging/serialization. * Sensitive values in context are automatically redacted. diff --git a/packages/appkit/src/errors/execution.ts b/packages/appkit/src/errors/execution.ts index 42de77043..3c9df7fe0 100644 --- a/packages/appkit/src/errors/execution.ts +++ b/packages/appkit/src/errors/execution.ts @@ -16,20 +16,69 @@ export class ExecutionError extends AppKitError { readonly isRetryable = false; /** - * Create an execution error for statement failure + * Structured error code from the upstream source (typically the warehouse's + * `error_code` for statement-level failures, or the SDK's `ApiError.errorCode` + * for HTTP failures). Preserved through wrapping so callers can branch on a + * stable identifier without substring-matching the message. */ - static statementFailed(errorMessage?: string): ExecutionError { + readonly errorCode?: string; + + constructor( + message: string, + options?: { + cause?: Error; + context?: Record; + errorCode?: string; + clientMessage?: string; + }, + ) { + super(message, options); + this.errorCode = options?.errorCode; + } + + /** + * Execution errors default to a generic message — the raw warehouse / + * SDK text in `.message` often includes statement fragments, internal + * paths, and correlation IDs. UI code should branch on `errorCode` + * (`INLINE_ARROW_STASH_EXHAUSTED`, `NOT_IMPLEMENTED`, etc.) and not on + * the human string. + */ + override get clientMessage(): string { + return this._clientMessage ?? "Query execution failed"; + } + + /** + * Create an execution error for statement failure. + * @param errorMessage Human-readable error from the warehouse / SDK. + * Goes into `.message` for server logs only — *never* echoed to the + * client. Pass `clientMessage` explicitly if a sanitized text should + * reach the UI. + * @param errorCode Structured code (e.g. "INVALID_PARAMETER_VALUE") to + * preserve through wrapping. Optional. Forwarded on SSE error + * payloads so UI can branch on it instead of substring-matching + * `error`. + * @param clientMessage Optional client-safe replacement for `.message`. + * Defaults to "Query execution failed" via the `clientMessage` + * getter. Set this only when the upstream text is known-safe. + */ + static statementFailed( + errorMessage?: string, + errorCode?: string, + clientMessage?: string, + ): ExecutionError { const message = errorMessage ? `Statement failed: ${errorMessage}` : "Statement failed: Unknown error"; - return new ExecutionError(message); + return new ExecutionError(message, { errorCode, clientMessage }); } /** * Create an execution error for canceled operation */ static canceled(): ExecutionError { - return new ExecutionError("Statement was canceled"); + return new ExecutionError("Statement was canceled", { + clientMessage: "Query was canceled", + }); } /** @@ -38,6 +87,7 @@ export class ExecutionError extends AppKitError { static resultsClosed(): ExecutionError { return new ExecutionError( "Statement execution completed but results are no longer available (CLOSED state)", + { clientMessage: "Query results expired" }, ); } @@ -58,4 +108,23 @@ export class ExecutionError extends AppKitError { context: { dataType }, }); } + + /** + * Create an execution error for the inline Arrow stash being unable to + * accept a payload (both regular and overflow pools at cap). + * + * The route deliberately surfaces this rather than silently re-running + * the statement on EXTERNAL_LINKS — a second execution can be billed + * and return divergent results for non-deterministic SQL. Operators + * should tune stash capacity or back off load when this fires. + */ + static stashExhausted(): ExecutionError { + return new ExecutionError( + "Inline Arrow stash exhausted; retry shortly or increase stash capacity", + { + errorCode: "INLINE_ARROW_STASH_EXHAUSTED", + clientMessage: "Server is at capacity, please retry", + }, + ); + } } diff --git a/packages/appkit/src/plugins/agents/agents.ts b/packages/appkit/src/plugins/agents/agents.ts index 40217e54e..ca744eafa 100644 --- a/packages/appkit/src/plugins/agents/agents.ts +++ b/packages/appkit/src/plugins/agents/agents.ts @@ -4,7 +4,6 @@ import type express from "express"; import pc from "picocolors"; import type { AgentAdapter, - AgentEvent, AgentRunContext, AgentToolDefinition, IAppRouter, @@ -16,7 +15,6 @@ import type { ToolProvider, } from "shared"; import { AppKitMcpClient, buildMcpHostPolicy } from "../../connectors/mcp"; -import { getWorkspaceClient } from "../../context"; import { consumeAdapterStream } from "../../core/agent/consume-adapter-stream"; import { loadAgentsFromDir } from "../../core/agent/load-agents"; import { normalizeToolResult } from "../../core/agent/normalize-result"; diff --git a/packages/appkit/src/plugins/agents/tests/dos-limits.test.ts b/packages/appkit/src/plugins/agents/tests/dos-limits.test.ts index e2bbcbe99..a0c64e578 100644 --- a/packages/appkit/src/plugins/agents/tests/dos-limits.test.ts +++ b/packages/appkit/src/plugins/agents/tests/dos-limits.test.ts @@ -307,7 +307,7 @@ describe("runSubAgent — depth guard", () => { * so we can drive `runSubAgent` directly against the depth guard. */ function makeRunState( - plugin: AgentsPlugin, + _plugin: AgentsPlugin, overrides: Partial<{ maxToolCalls: number; maxSubAgentDepth: number; diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts index 05709aea6..5ba121d1b 100644 --- a/packages/appkit/src/plugins/analytics/analytics.ts +++ b/packages/appkit/src/plugins/analytics/analytics.ts @@ -1,12 +1,16 @@ import type { WorkspaceClient } from "@databricks/sdk-experimental"; +import { tableFromIPC } from "apache-arrow"; import type express from "express"; -import type { - AgentToolDefinition, - IAppRouter, - PluginExecuteConfig, - SQLTypeMarker, - StreamExecutionSettings, - ToolProvider, +import { + type AgentToolDefinition, + type AnalyticsSseMessage, + type IAppRouter, + makeArrowMessage, + makeResultMessage, + type PluginExecuteConfig, + type SQLTypeMarker, + type StreamExecutionSettings, + type ToolProvider, } from "shared"; import { z } from "zod"; import { SQLWarehouseConnector } from "../../connectors"; @@ -18,18 +22,22 @@ import { toolsFromRegistry, } from "../../core/agent/tools/define-tool"; import { assertReadOnlySql } from "../../core/agent/tools/sql-policy"; +import { ExecutionError } from "../../errors"; import { createLogger } from "../../logging/logger"; import { Plugin, toPlugin } from "../../plugin"; import type { PluginManifest } from "../../registry"; +import type { Counter, Histogram } from "../../telemetry"; import { queryDefaults } from "./defaults"; +import { InlineArrowStash } from "./inline-arrow-stash"; import manifest from "./manifest.json"; import { QueryProcessor } from "./query"; -import type { - AnalyticsQueryResponse, - IAnalyticsConfig, - IAnalyticsQueryRequest, +import { + type AnalyticsFormat, + type AnalyticsQueryResponse, + type IAnalyticsConfig, + type IAnalyticsQueryRequest, + normalizeAnalyticsFormat, } from "./types"; -import { normalizeAnalyticsFormat } from "./types"; const logger = createLogger("analytics"); @@ -44,6 +52,33 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider { private SQLClient: SQLWarehouseConnector; private queryProcessor: QueryProcessor; + /** + * Server-side stash for inline Arrow IPC payloads. + * + * INLINE ARROW_STREAM responses do not ride the SSE control channel — + * the route puts the decoded bytes here and emits an `arrow` SSE + * message with a synthetic `inline-` id, and the client fetches + * the bytes through the existing `/arrow-result/:jobId` endpoint with + * a real binary content-type. + */ + protected inlineArrowStash: InlineArrowStash = new InlineArrowStash(); + + /** + * Stash telemetry — created lazily on first stash operation because + * `this.telemetry` may not be bound at constructor time (see + * `Plugin.attachContext`). Cached via `_stashMetrics` after the first + * call; subsequent puts hit the cached counter/histogram instances. + * + * Counter labels: `result` ∈ {"regular", "overflow", "exhausted"} — + * one counter with a label is friendlier to dashboards than three + * separate metrics, and aggregates trivially in Prometheus / OTel. + */ + private _stashMetrics?: { + putCount: Counter; + putBytes: Histogram; + }; + private _stashMetricsAttempted = false; + constructor(config: IAnalyticsConfig) { super(config); this.config = config; @@ -81,24 +116,60 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider { /** * Handle Arrow data download requests. - * When called via asUser(req), uses the user's Databricks credentials. + * + * Two id shapes are supported: + * - `inline-`: bytes were stashed server-side by the query route. + * Drain the stash, serve directly with the canonical Arrow content + * type. No warehouse round-trip. + * - any other id: a warehouse-issued statement id. Fetch the Arrow + * stream from the warehouse via the SDK; serve the bytes. + * + * When called via asUser(req), uses the user's Databricks credentials + * for the warehouse path. The inline path is user-scoped at the stash + * layer instead. */ async _handleArrowRoute( req: express.Request, res: express.Response, ): Promise { + const { jobId } = req.params; + const event = logger.event(req); + event?.setComponent("analytics", "getArrowData").setContext("analytics", { + job_id: jobId, + plugin: this.name, + }); + + if (jobId.startsWith("inline-")) { + const userKey = this._stashUserKey(req); + const bytes = this.inlineArrowStash.take(jobId, userKey); + if (!bytes) { + // Already drained, expired, or never belonged to this user. 410 + // distinguishes this from "warehouse statement id not found" (404) + // so the client can surface a useful error. + logger.debug("Inline Arrow stash miss for jobId=%s", jobId); + res.status(410).json({ + error: "Inline Arrow result expired or unknown", + plugin: this.name, + }); + return; + } + logger.debug( + "Serving inline Arrow buffer: %d bytes for jobId=%s", + bytes.length, + jobId, + ); + res.setHeader("Content-Type", "application/vnd.apache.arrow.stream"); + res.setHeader("Content-Length", bytes.length.toString()); + // Inline payloads are single-use and short-lived; no public caching. + res.setHeader("Cache-Control", "no-store"); + res.send(Buffer.from(bytes.buffer, bytes.byteOffset, bytes.byteLength)); + return; + } + try { - const { jobId } = req.params; const workspaceClient = getWorkspaceClient(); - logger.debug("Processing Arrow job request for jobId=%s", jobId); - const event = logger.event(req); - event?.setComponent("analytics", "getArrowData").setContext("analytics", { - job_id: jobId, - plugin: this.name, - }); - const result = await this.getArrowData(workspaceClient, jobId); res.setHeader("Content-Type", "application/octet-stream"); @@ -113,13 +184,86 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider { res.send(Buffer.from(result.data)); } catch (error) { logger.error("Arrow job error: %O", error); + // Do not echo upstream / SDK error text to the client — it can + // include statement fragments, internal object names, and + // correlation IDs. The full detail stays in the server log above. + const errorCode = + error instanceof ExecutionError ? error.errorCode : undefined; res.status(404).json({ - error: error instanceof Error ? error.message : "Arrow job not found", + error: "Arrow result unavailable", + errorCode, plugin: this.name, }); } } + /** + * Lazy accessor for stash telemetry instruments. Returns `undefined` + * when telemetry isn't bound (factory-constructed plugins before + * `attachContext`, no-op test paths) — callers should branch on the + * return value rather than relying on it being present. + * + * Records once per call site: + * - `analytics.inline_arrow_stash.put.count{result}` — counter with + * label "regular" | "overflow" | "exhausted" + * - `analytics.inline_arrow_stash.put.bytes` — histogram of accepted + * payload sizes (label same as counter; exhausted puts record 0) + */ + private _getStashMetrics() { + if (this._stashMetricsAttempted) return this._stashMetrics; + this._stashMetricsAttempted = true; + if (!this.telemetry) return undefined; + const meter = this.telemetry.getMeter(); + this._stashMetrics = { + putCount: meter.createCounter("analytics.inline_arrow_stash.put.count", { + description: + "Count of inline-Arrow stash put attempts, labeled by pool outcome (regular | overflow | exhausted).", + unit: "1", + }), + putBytes: meter.createHistogram( + "analytics.inline_arrow_stash.put.bytes", + { + description: + "Sizes of accepted inline-Arrow stash payloads. Aggregate by `result` label for utilization analysis.", + unit: "By", + }, + ), + }; + return this._stashMetrics; + } + + private _recordStashOutcome( + result: "regular" | "overflow" | "exhausted", + bytes: number, + ): void { + const m = this._getStashMetrics(); + if (!m) return; + m.putCount.add(1, { result }); + m.putBytes.record(result === "exhausted" ? 0 : bytes, { result }); + } + + /** + * Stash key used at put-time (in `_handleQueryRoute`) and take-time + * (in `_handleArrowRoute`). Centralized so the two sides cannot drift. + * + * Returns the user id when an `x-forwarded-user` header is present, + * otherwise `"global"` for service-principal contexts (no user header). + * Both queries from the same request resolve to the same key, so the + * subsequent /arrow-result fetch reliably hits the entry stashed + * during the SSE query. + * + * `resolveUserId` throws when no header is present — catch and degrade + * to "global" rather than letting that failure mode bubble through the + * route handler. + */ + protected _stashUserKey(req: express.Request): string { + try { + return this.resolveUserId(req) || "global"; + } catch { + return "global"; + } + } + /** * Handle SQL query execution requests. * When called via asUser(req), uses the user's Databricks credentials. @@ -131,6 +275,19 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider { const { query_key } = req.params; const { parameters, format: rawFormat = "JSON_ARRAY" } = req.body as IAnalyticsQueryRequest; + + if ( + rawFormat !== "JSON_ARRAY" && + rawFormat !== "ARROW_STREAM" && + rawFormat !== "JSON" && + rawFormat !== "ARROW" + ) { + res.status(400).json({ + error: `Invalid format: ${String(rawFormat)}. Expected "JSON_ARRAY" or "ARROW_STREAM".`, + }); + return; + } + const format = normalizeAnalyticsFormat(rawFormat); // Request-scoped logging with WideEvent tracking @@ -165,35 +322,41 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider { // get execution context - user-scoped if .obo.sql, otherwise service principal const executor = isAsUser ? this.asUser(req) : this; const executorKey = isAsUser ? this.resolveUserId(req) : "global"; - - const queryParameters = - format === "ARROW_STREAM" - ? { - formatParameters: { - disposition: "EXTERNAL_LINKS", - format: "ARROW_STREAM", - }, - type: "arrow", - } - : { - type: "result", - }; + // Stash key is always per-request user (never "global"), independent + // of the executor's cache scope. Inline Arrow payloads are single-use + // and short-lived — there is no benefit to sharing them across users, + // and per-user scoping is defense in depth on top of unguessable ids. + const stashUserKey = this._stashUserKey(req); const hashedQuery = this.queryProcessor.hashQuery(query); + // ARROW_STREAM responses reference ephemeral resources that cannot be + // safely replayed from cache: + // - EXTERNAL_LINKS pre-signed URLs expire ~15 min after issue, and + // the warehouse rotates them per execution. + // - INLINE responses point at a synthetic `inline-` job id + // backed by `InlineArrowStash`, which drains on the first + // /arrow-result fetch. A cache hit would replay an id whose bytes + // are already gone and reliably 410 the client. + // So we bypass cache for ARROW_STREAM and let every request execute + // a fresh statement. JSON_ARRAY responses still cache normally. + const cacheTtl = format === "ARROW_STREAM" ? 0 : queryDefaults.cache?.ttl; + const cacheConfig = { + ...queryDefaults.cache, + ttl: cacheTtl, + cacheKey: [ + "analytics:query", + query_key, + JSON.stringify(parameters), + format, + hashedQuery, + executorKey, + ], + }; + const defaultConfig: PluginExecuteConfig = { ...queryDefaults, - cache: { - ...queryDefaults.cache, - cacheKey: [ - "analytics:query", - query_key, - JSON.stringify(parameters), - JSON.stringify(format), - hashedQuery, - executorKey, - ], - }, + cache: cacheConfig, }; const streamExecutionSettings: StreamExecutionSettings = { @@ -208,20 +371,230 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider { parameters, ); - const result = await executor.query( + return this._executeWithFormatFallback( + executor, query, processedParams, - queryParameters.formatParameters, + format, + stashUserKey, signal, ); - - return { type: queryParameters.type, ...result }; }, streamExecutionSettings, executorKey, ); } + /** + * Execute a query with automatic disposition/format fallback. + * + * - **JSON_ARRAY** first tries `INLINE + JSON_ARRAY`. If the warehouse + * only supports `ARROW_STREAM` for `INLINE` (some serverless variants), + * retries as `INLINE + ARROW_STREAM`, decodes the Arrow IPC attachment + * server-side, and returns plain row objects — the caller's + * `JSON_ARRAY` contract is preserved. + * - **ARROW_STREAM** first tries `INLINE + ARROW_STREAM`. If the + * warehouse refuses (most classic + some serverless variants), falls + * back to `EXTERNAL_LINKS + ARROW_STREAM`. When the regular stash is + * full, the already-decoded bytes spill into the stash's overflow + * pool — never thrown away to trigger a second warehouse round-trip. + * + * INLINE Arrow attachments under the ARROW_STREAM path are decoded once + * and put on the plugin's `inlineArrowStash`; the SSE message carries the + * synthetic stash id so the client fetches the bytes out-of-band via + * `/arrow-result/`. + */ + private async _executeWithFormatFallback( + executor: AnalyticsPlugin, + query: string, + processedParams: + | Record + | undefined, + requestedFormat: AnalyticsFormat, + stashUserKey: string, + signal?: AbortSignal, + ): Promise { + return requestedFormat === "JSON_ARRAY" + ? this._executeJsonArrayPath(executor, query, processedParams, signal) + : this._executeArrowStreamPath( + executor, + query, + processedParams, + stashUserKey, + signal, + ); + } + + /** + * JSON_ARRAY path. Tries `INLINE + JSON_ARRAY` first; on a structured + * `needs-arrow` rejection (warehouse only accepts ARROW_STREAM under + * INLINE), retries as `INLINE + ARROW_STREAM` and decodes the Arrow + * IPC attachment server-side so the caller's JSON_ARRAY contract is + * preserved. + * + * Failure modes surfaced to the caller: + * - Unrelated warehouse errors (anything not classified as + * `needs-arrow`) propagate untouched. + * - The retry can throw `ExecutionError.missingData("ARROW_STREAM + * attachment")` if the warehouse returned no attachment. + * - The decode itself can throw `RESULT_TOO_LARGE_FOR_JSON_FALLBACK` + * when the attachment exceeds the row or byte cap. + */ + private async _executeJsonArrayPath( + executor: AnalyticsPlugin, + query: string, + processedParams: + | Record + | undefined, + signal?: AbortSignal, + ): Promise { + try { + const result = await executor.query( + query, + processedParams, + { disposition: "INLINE", format: "JSON_ARRAY" }, + signal, + ); + return makeResultMessage(result?.data, { + status: result?.status, + statement_id: result?.statement_id, + }); + } catch (err: unknown) { + if (signal?.aborted) throw err; + if (_classifyInlineRejection(err) !== "needs-arrow") throw err; + + const msg = err instanceof Error ? err.message : String(err); + logger.warn( + "JSON_ARRAY INLINE rejected by warehouse, retrying as ARROW_STREAM INLINE and decoding server-side: %s", + msg, + ); + } + + // Retry as ARROW_STREAM + INLINE so the warehouse will accept the + // request, then decode the Arrow IPC attachment to plain row + // objects so the caller still gets JSON_ARRAY-shaped data. + const arrowResult = await executor.query( + query, + processedParams, + { disposition: "INLINE", format: "ARROW_STREAM" }, + signal, + ); + if (!arrowResult?.attachment) { + throw ExecutionError.missingData("ARROW_STREAM attachment"); + } + const rows = decodeArrowAttachmentToRows(arrowResult.attachment); + return makeResultMessage(rows, { + status: arrowResult.status, + statement_id: arrowResult.statement_id, + }); + } + + /** + * ARROW_STREAM path. Tries `INLINE + ARROW_STREAM` first; on a + * structured `needs-json` rejection (warehouse refuses ARROW_STREAM + * under INLINE), falls back to `EXTERNAL_LINKS + ARROW_STREAM`. When + * INLINE succeeds with an Arrow attachment, the decoded bytes go + * through `inlineArrowStash` and the client fetches them via + * `/arrow-result/inline-` — never the SSE channel. + * + * Failure modes surfaced to the caller: + * - Unrelated warehouse errors (not classified as `needs-json`) + * propagate untouched. + * - `ExecutionError.canceled()` if the client disconnects between + * the INLINE response and the stash put. + * - `ExecutionError.stashExhausted()` if both stash pools are full — + * we never silently re-run on EXTERNAL_LINKS because that would + * double-bill the warehouse and risk a divergent result for + * non-deterministic SQL. + */ + private async _executeArrowStreamPath( + executor: AnalyticsPlugin, + query: string, + processedParams: + | Record + | undefined, + stashUserKey: string, + signal?: AbortSignal, + ): Promise { + try { + const result = await executor.query( + query, + processedParams, + { disposition: "INLINE", format: "ARROW_STREAM" }, + signal, + ); + if (result?.attachment) { + return this._stashAndEmitInline(result, stashUserKey, signal); + } + // INLINE succeeded but the warehouse didn't return an Arrow + // attachment — degrade to a plain result message with whatever + // data the warehouse did return. + return makeResultMessage(result?.data, { + status: result?.status, + statement_id: result?.statement_id, + }); + } catch (err: unknown) { + // If the request was aborted, do not retry — the signal is dead and + // a second statement would be billed but never read. + if (signal?.aborted) throw err; + if (_classifyInlineRejection(err) !== "needs-json") throw err; + + const msg = err instanceof Error ? err.message : String(err); + logger.warn( + "ARROW_STREAM INLINE rejected by warehouse, falling back to EXTERNAL_LINKS: %s", + msg, + ); + } + + const result = await executor.query( + query, + processedParams, + { disposition: "EXTERNAL_LINKS", format: "ARROW_STREAM" }, + signal, + ); + return makeArrowMessage(result.statement_id, { status: result.status }); + } + + /** + * Decode an INLINE Arrow attachment, push it onto the stash, and + * return the `arrow` SSE message that references the synthetic id. + * Pulled out of `_executeArrowStreamPath` so the cap / overflow / + * exhaustion logic lives in one place. + */ + private _stashAndEmitInline( + result: { attachment: string; status?: unknown }, + stashUserKey: string, + signal?: AbortSignal, + ): AnalyticsSseMessage { + // If the client has already disconnected, the SSE write would be + // dropped anyway — skip the decode + stash so the bytes do not + // linger in memory until TTL eviction. + if (signal?.aborted) { + throw ExecutionError.canceled(); + } + const decoded = Buffer.from(result.attachment, "base64"); + const stashBytes = new Uint8Array( + decoded.buffer, + decoded.byteOffset, + decoded.byteLength, + ); + const putResult = this.inlineArrowStash.put(stashUserKey, stashBytes); + if (putResult === null) { + // Both regular and overflow pools are at cap. Throw with a clear + // signal rather than re-running the same statement on + // EXTERNAL_LINKS: the bytes we already decoded are gone, the + // warehouse has been billed, and a second execution could return + // a divergent result for non-deterministic SQL. + this._recordStashOutcome("exhausted", stashBytes.byteLength); + logger.warn( + "Inline Arrow stash exhausted (regular + overflow); rejecting", + ); + throw ExecutionError.stashExhausted(); + } + this._recordStashOutcome(putResult.pool, stashBytes.byteLength); + return makeArrowMessage(putResult.id, { status: result.status }); + } + /** * Execute a SQL query using the current execution context. * @@ -338,6 +711,177 @@ export class AnalyticsPlugin extends Plugin implements ToolProvider { } } +/** + * Hard caps on the server-side JSON_ARRAY fallback. The materializer + * builds every row as a plain JS object on the Node main thread + * (O(rows × cols) allocations), so a runaway result blocks the event + * loop and pressures GC. We cap two ways — rows AND decoded bytes — + * because either dimension can blow up independently (100k×1B rows is + * fine, 10 rows × 10MB cells is not). + */ +const JSON_ARRAY_FALLBACK_MAX_ROWS = 100_000; +const JSON_ARRAY_FALLBACK_MAX_BYTES = 64 * 1024 * 1024; + +/** + * Replacer used by the nested-value `JSON.stringify` path. Arrow IPC + * carries column values whose JS representations include `bigint` + * (which the spec'd `JSON.stringify` cannot serialize and throws on), + * `Uint8Array` (binary buffers — would serialize as `{"0":...,"1":...}` + * which is wrong), and `Date` (would lose timezone fidelity inside a + * nested struct). Coerce each to a string form that matches what the + * warehouse itself emits under native JSON_ARRAY. + */ +function nestedJsonReplacer(_key: string, value: unknown): unknown { + if (typeof value === "bigint") return value.toString(); + if (value instanceof Uint8Array) { + return Buffer.from(value).toString("base64"); + } + if (value instanceof Date) return value.toISOString(); + return value; +} + +/** + * Decode a base64 Arrow IPC attachment to plain row objects. + * + * Used by the JSON_ARRAY fallback path when a warehouse refuses + * `JSON_ARRAY + INLINE` and we have to satisfy the request via + * `ARROW_STREAM + INLINE` — the bytes come back as Arrow IPC but the + * caller's contract is JSON-shaped rows, so we convert server-side. + * + * Shape rules (chosen to match what the warehouse emits natively under + * JSON_ARRAY, so callers cannot tell which path served their query): + * - **Scalars (number / bigint / boolean)**: stringified. JSON_ARRAY + * wire format emits everything in `result.data_array` as strings; + * bigint stringification is also necessary for JSON-serializability. + * - **Strings / Date / Uint8Array**: stringified consistently — + * `Date` to ISO string, `Uint8Array` to base64. + * - **Nested (List / Struct / Map)**: `JSON.stringify` with the + * `nestedJsonReplacer` so nested bigints / Dates / binary buffers + * serialize sanely instead of throwing or producing object-as-map + * output. Apache-arrow's typed values expose `toJSON()` so the + * resulting string matches what the warehouse would have emitted. + */ +function decodeArrowAttachmentToRows( + attachment: string, +): Record[] { + const decoded = Buffer.from(attachment, "base64"); + if (decoded.byteLength > JSON_ARRAY_FALLBACK_MAX_BYTES) { + throw ExecutionError.statementFailed( + `Result attachment is ${decoded.byteLength} bytes; JSON_ARRAY fallback materializer caps at ${JSON_ARRAY_FALLBACK_MAX_BYTES} bytes. Re-issue the query with format="ARROW_STREAM" to stream the full result.`, + "RESULT_TOO_LARGE_FOR_JSON_FALLBACK", + "Result too large for JSON format. Re-run with ARROW_STREAM format.", + ); + } + const table = tableFromIPC( + new Uint8Array(decoded.buffer, decoded.byteOffset, decoded.byteLength), + ); + if (table.numRows > JSON_ARRAY_FALLBACK_MAX_ROWS) { + throw ExecutionError.statementFailed( + `Result has ${table.numRows} rows; JSON_ARRAY fallback materializer caps at ${JSON_ARRAY_FALLBACK_MAX_ROWS}. Re-issue the query with format="ARROW_STREAM" to stream the full result.`, + "RESULT_TOO_LARGE_FOR_JSON_FALLBACK", + `Result too large for JSON format (over ${JSON_ARRAY_FALLBACK_MAX_ROWS} rows). Re-run with ARROW_STREAM format.`, + ); + } + // Resolve the child vectors once. `table.getChild(name)` walks the + // schema fields on every call; without this hoist we'd pay that cost + // O(rows × cols) times. At 100k × 50 columns that's millions of + // redundant lookups on the event loop. + const columns = table.schema.fields.map( + (f) => [f.name, table.getChild(f.name)] as const, + ); + const rows: Record[] = []; + for (let i = 0; i < table.numRows; i++) { + const row: Record = {}; + for (const [name, col] of columns) { + const v = col?.get(i); + if (v == null) { + row[name] = null; + } else if ( + typeof v === "number" || + typeof v === "bigint" || + typeof v === "boolean" + ) { + row[name] = String(v); + } else if (typeof v === "string") { + row[name] = v; + } else if (v instanceof Date) { + row[name] = v.toISOString(); + } else if (v instanceof Uint8Array) { + row[name] = Buffer.from(v).toString("base64"); + } else { + // Nested types (List, Struct, Map). Use the replacer so nested + // bigint / Date / Uint8Array values serialize predictably — + // bare `JSON.stringify` throws on bigint and emits broken + // shapes for binary buffers. + row[name] = JSON.stringify(v, nestedJsonReplacer); + } + } + rows.push(row); + } + return rows; +} + +/** + * Classify a warehouse rejection of an INLINE statement. + * + * Two distinct rejection modes are observed in the wild: + * + * - **needs-arrow**: warehouse refuses `JSON_ARRAY + INLINE`, only accepts + * `ARROW_STREAM + INLINE`. Example message: + * `"Inline disposition only supports ARROW_STREAM format."` + * Action: retry as `ARROW_STREAM + INLINE` and decode server-side. + * + * - **needs-json**: warehouse refuses `ARROW_STREAM + INLINE`, only accepts + * `JSON_ARRAY + INLINE`. Examples: + * `"The format field must be JSON_ARRAY when the disposition field is INLINE."` + * `"ARROW_STREAM not supported with INLINE disposition"` + * `"ExternalLinks disposition is not yet implemented."` (same family — + * the warehouse rejected the disposition/format combo we sent). + * Action: retry as `ARROW_STREAM + EXTERNAL_LINKS`. + * + * The structured `errorCode` (INVALID_PARAMETER_VALUE / NOT_IMPLEMENTED) + * gates the classification so unrelated SQL errors don't trigger a retry. + * Message matching is case-insensitive — warehouses are inconsistent about + * casing of "Inline"/"INLINE". + */ +type InlineRejection = "needs-arrow" | "needs-json" | null; + +function _classifyInlineRejection(err: unknown): InlineRejection { + const msg = err instanceof Error ? err.message : String(err); + const lower = msg.toLowerCase(); + + const structuredCode = + err instanceof ExecutionError ? err.errorCode : undefined; + const hasCode = + structuredCode === "INVALID_PARAMETER_VALUE" || + structuredCode === "NOT_IMPLEMENTED" || + lower.includes("invalid_parameter_value") || + lower.includes("not_implemented"); + if (!hasCode) return null; + + // Must mention the inline disposition to count as a disposition-rejection. + if (!lower.includes("inline")) return null; + + // "needs-arrow": warehouse only supports ARROW_STREAM for INLINE. + if ( + /only supports\s+arrow_stream/i.test(msg) || + /must be\s+arrow_stream/i.test(msg) + ) { + return "needs-arrow"; + } + + // "needs-json": warehouse only supports JSON_ARRAY for INLINE. + if ( + /only supports\s+json_array/i.test(msg) || + /must be\s+json_array/i.test(msg) || + /arrow_stream\s+(is\s+|was\s+)?not\s+supported/i.test(msg) + ) { + return "needs-json"; + } + + return null; +} + /** * @internal */ diff --git a/packages/appkit/src/plugins/analytics/inline-arrow-stash.ts b/packages/appkit/src/plugins/analytics/inline-arrow-stash.ts new file mode 100644 index 000000000..cb52ff711 --- /dev/null +++ b/packages/appkit/src/plugins/analytics/inline-arrow-stash.ts @@ -0,0 +1,240 @@ +import { randomUUID } from "node:crypto"; + +/** + * Server-side stash for inline Arrow IPC payloads. + * + * When a warehouse returns ARROW_STREAM + INLINE results, the bytes are + * stashed here and a synthetic "inline-" job id is emitted on the + * SSE control channel. The client then fetches the bytes out-of-band via + * `/arrow-result/`, which drains the stash and serves the payload as + * `application/vnd.apache.arrow.stream`. + * + * Keeps multi-MiB Arrow blobs off SSE, lets the existing /arrow-result + * pipeline handle both inline and EXTERNAL_LINKS results uniformly, and + * delivers the bytes with a real binary content-type instead of base64 + * inside JSON inside SSE framing. + * + * Properties: + * - **Drain-on-read**: a successful `take()` removes the entry. There is + * no replay path — a lost client connection means the bytes are gone. + * - **TTL bounded**: entries past their expiry are evicted on every + * `put()` and `take()`. No background timer. + * - **Per-user keyed**: `take()` only returns bytes if the requesting + * user matches the user that originally put them. Defense in depth on + * top of unguessable ids. + * - **Memory bounded with overflow slot**: total stashed bytes are capped + * at `maxBytes`. When `put()` cannot fit a payload, it spills into a + * separate overflow pool capped at `maxOverflowBytes` (default + * `maxBytes / 4` — kept small because overflow only needs to bridge + * the immediate `/arrow-result` fetch). Overflow entries behave like + * regular ones except (a) they do not count against the regular cap + * and (b) they expire on a much shorter TTL (`overflowTtlMs`, + * default 30s) — they exist to absorb already-decoded bytes for a + * single request, not to hold them long-term. Only when both pools + * are full does `put()` return `null` and the caller has to fall + * back to a different delivery path. Memory is bounded above by + * `maxBytes + maxOverflowBytes`. + * + * Caveat (multi-replica deployments): this stash is process-local. A + * subsequent `GET /arrow-result/inline-*` that lands on a different + * replica than the one that stashed the bytes will 410. Deployments + * that run more than one replica need sticky sessions (route both + * requests in the same logical session to the same replica) or a + * shared external store, neither of which is in scope here. + */ +interface InlineArrowStashOptions { + /** Regular-pool entries older than this are dropped on the next gc tick. */ + ttlMs?: number; + /** + * Overflow-pool entries older than this are dropped on the next gc + * tick. Defaults to 30s — overflow exists solely to bridge the + * immediate `/arrow-result` fetch that follows the SSE `arrow` + * message, so it should drain on the order of seconds, not minutes. + * A short TTL bounds the cross-user memory pressure that overflow + * can sustain in a multi-tenant deployment. + */ + overflowTtlMs?: number; + /** + * Hard cap on total bytes held in the regular pool. `put()` spills to + * the overflow pool when this cap would be exceeded; entries already + * in the stash are not evicted to fit new ones. + */ + maxBytes?: number; + /** + * Hard cap on total bytes held in the overflow pool. Overflow holds + * bytes that have already been decoded for an in-flight request — its + * purpose is to avoid throwing those bytes away and double-billing + * the warehouse. Defaults to `maxBytes / 4` (kept small because the + * pool only needs to absorb transient spillover, not hold long-term + * state). `put()` returns `null` only when both regular and overflow + * pools are at cap. + */ + maxOverflowBytes?: number; + /** + * Minimum interval between gc passes. `gc()` is O(N) in entry count, + * so on hot paths we skip when the previous pass was recent enough. + * Defaults to 5s. Set to 0 to disable throttling (test seam). + */ + gcMinIntervalMs?: number; + /** Test seam: override the synthetic-id generator. */ + idGenerator?: () => string; + /** Test seam: override the clock. */ + now?: () => number; +} + +interface StashEntry { + userId: string; + bytes: Uint8Array; + expiresAt: number; + insertedAt: number; + /** True for entries in the overflow pool (do not count against maxBytes). */ + overflow: boolean; +} + +export class InlineArrowStash { + private entries = new Map(); + private totalBytes = 0; + private overflowBytes = 0; + private lastGcAt = 0; + private readonly ttlMs: number; + private readonly overflowTtlMs: number; + private readonly maxBytes: number; + private readonly maxOverflowBytes: number; + private readonly gcMinIntervalMs: number; + private readonly idGenerator: () => string; + private readonly now: () => number; + + constructor(opts: InlineArrowStashOptions = {}) { + this.ttlMs = opts.ttlMs ?? 10 * 60 * 1000; + this.overflowTtlMs = opts.overflowTtlMs ?? 30 * 1000; + this.maxBytes = opts.maxBytes ?? 256 * 1024 * 1024; + this.maxOverflowBytes = + opts.maxOverflowBytes ?? Math.floor(this.maxBytes / 4); + this.gcMinIntervalMs = opts.gcMinIntervalMs ?? 5000; + this.idGenerator = opts.idGenerator ?? randomUUID; + this.now = opts.now ?? Date.now; + } + + /** + * Stash a payload and return its synthetic job id. + * + * Tries the regular pool first; if it would overflow, spills into the + * overflow pool (sized at `maxOverflowBytes`). Returns `null` only when + * both pools are at cap — the caller then has no choice but to fall + * back to a different delivery path (e.g. EXTERNAL_LINKS). + * + * The overflow pool exists because the caller has *already decoded* + * these bytes for an in-flight request: throwing them away would force + * a second warehouse round-trip (extra latency, double billing, and + * potentially divergent results for non-deterministic SQL). Holding + * them transiently in a bounded overflow region — they are drained + * single-use on the next `/arrow-result` fetch — is strictly safer + * than re-execution. + * + * A single payload can only land in one pool — the pools are not + * split across — so the largest payload we can accept is + * `Math.max(maxBytes, maxOverflowBytes)`. Exceeding that throws + * synchronously so the caller sees the misconfiguration loudly + * rather than burning a warehouse round-trip and then getting a + * null id. + * + * Returns `{ id, pool }` on success (`pool` ∈ {"regular", "overflow"}) + * or `null` when both pools are at cap. The pool tag lets callers + * emit accurate telemetry labels without re-introspecting the stash. + */ + put( + userId: string, + bytes: Uint8Array, + ): { id: string; pool: "regular" | "overflow" } | null { + const largestSlot = Math.max(this.maxBytes, this.maxOverflowBytes); + if (bytes.length > largestSlot) { + throw new Error( + `Inline Arrow payload (${bytes.length} bytes) exceeds largest stash slot (${largestSlot}); cannot fit in either pool`, + ); + } + this.gc(); + const fitsRegular = this.totalBytes + bytes.length <= this.maxBytes; + const fitsOverflow = + !fitsRegular && + this.overflowBytes + bytes.length <= this.maxOverflowBytes; + if (!fitsRegular && !fitsOverflow) { + // Both pools are full — refuse rather than evicting any issued id. + return null; + } + const id = `inline-${this.idGenerator()}`; + const now = this.now(); + const overflow = !fitsRegular; + this.entries.set(id, { + userId, + bytes, + expiresAt: now + (overflow ? this.overflowTtlMs : this.ttlMs), + insertedAt: now, + overflow, + }); + if (overflow) { + this.overflowBytes += bytes.length; + } else { + this.totalBytes += bytes.length; + } + return { id, pool: overflow ? "overflow" : "regular" }; + } + + /** + * Drain a payload from the stash. Returns `undefined` if the id is + * unknown, expired, or belongs to a different user. + */ + take(id: string, userId: string): Uint8Array | undefined { + this.gc(); + const entry = this.entries.get(id); + if (!entry) return undefined; + if (entry.userId !== userId) return undefined; + this.entries.delete(id); + if (entry.overflow) { + this.overflowBytes -= entry.bytes.length; + } else { + this.totalBytes -= entry.bytes.length; + } + return entry.bytes; + } + + /** Inspection helpers (primarily for tests). */ + size(): number { + return this.totalBytes; + } + /** Bytes currently held in the overflow pool. */ + overflowSize(): number { + return this.overflowBytes; + } + count(): number { + return this.entries.size; + } + + /** Drop all entries (used in plugin shutdown). */ + clear(): void { + this.entries.clear(); + this.totalBytes = 0; + this.overflowBytes = 0; + } + + private gc(): void { + const now = this.now(); + if (now - this.lastGcAt < this.gcMinIntervalMs) { + // Skip the O(N) sweep — recent enough to assume nothing + // significant has expired since the last pass. Worst case an + // entry lingers an extra `gcMinIntervalMs` past its TTL, which + // is negligible relative to either pool's intended lifetime. + return; + } + this.lastGcAt = now; + for (const [id, entry] of this.entries) { + if (entry.expiresAt <= now) { + this.entries.delete(id); + if (entry.overflow) { + this.overflowBytes -= entry.bytes.length; + } else { + this.totalBytes -= entry.bytes.length; + } + } + } + } +} diff --git a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts index eb06ea952..9c7d96987 100644 --- a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts +++ b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts @@ -9,6 +9,7 @@ import { sql } from "shared"; import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; import { ServiceContext } from "../../../context/service-context"; import { AnalyticsPlugin, analytics } from "../analytics"; +import { InlineArrowStash } from "../inline-arrow-stash"; import type { IAnalyticsConfig } from "../types"; // Mock CacheManager singleton with actual caching behavior @@ -106,6 +107,87 @@ describe("Analytics Plugin", () => { ); }); + test("/arrow-result/inline-* drains the stash and serves bytes as application/vnd.apache.arrow.stream", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + plugin.injectRoutes(router); + + const arrowBytes = new Uint8Array([0xff, 0xfe, 0xfd, 0xfc]); + const { id } = (plugin as any).inlineArrowStash.put("global", arrowBytes); + expect(id.startsWith("inline-")).toBe(true); + + const handler = getHandler("GET", "/arrow-result/:jobId"); + const mockReq = createMockRequest({ params: { jobId: id } }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + expect(mockRes.setHeader).toHaveBeenCalledWith( + "Content-Type", + "application/vnd.apache.arrow.stream", + ); + expect(mockRes.setHeader).toHaveBeenCalledWith( + "Content-Length", + String(arrowBytes.length), + ); + expect(mockRes.setHeader).toHaveBeenCalledWith( + "Cache-Control", + "no-store", + ); + expect(mockRes.send).toHaveBeenCalledTimes(1); + const sentBuf = (mockRes.send as any).mock.calls[0][0] as Buffer; + expect(Buffer.isBuffer(sentBuf)).toBe(true); + expect(Array.from(sentBuf)).toEqual(Array.from(arrowBytes)); + + // Drain-on-read: a second fetch must return 410, not the bytes again. + const secondRes = createMockResponse(); + await handler(mockReq, secondRes); + expect(secondRes.status).toHaveBeenCalledWith(410); + }); + + test("/arrow-result/inline-* returns 410 when the stash entry never existed", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + plugin.injectRoutes(router); + + const handler = getHandler("GET", "/arrow-result/:jobId"); + const mockReq = createMockRequest({ + params: { jobId: "inline-does-not-exist" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + expect(mockRes.status).toHaveBeenCalledWith(410); + expect(mockRes.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: expect.stringMatching(/expired or unknown/), + }), + ); + }); + + test("/arrow-result/inline-* returns 410 when the stash entry belongs to a different user", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + plugin.injectRoutes(router); + + // Stash entry keyed to user-a, but the request resolves to "global" + // (no x-forwarded-user header) — keys differ, take must return + // nothing, and the entry stays put (single-user view). + const bytes = new Uint8Array([1, 2, 3]); + const { id } = (plugin as any).inlineArrowStash.put("user-a", bytes); + + const handler = getHandler("GET", "/arrow-result/:jobId"); + const mockReq = createMockRequest({ params: { jobId: id } }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + expect(mockRes.status).toHaveBeenCalledWith(410); + // The entry must still be there for the real owner. + expect((plugin as any).inlineArrowStash.take(id, "user-a")).toBeDefined(); + }); + test("/query/:query_key should return 400 when query_key is missing", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); @@ -581,6 +663,744 @@ describe("Analytics Plugin", () => { ); }); + test("/query/:query_key should pass INLINE + ARROW_STREAM format parameters when format is ARROW_STREAM", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi.fn().mockResolvedValue({ + result: { data: [{ id: 1 }] }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + expect(executeMock).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ + statement: "SELECT * FROM test", + warehouse_id: "test-warehouse-id", + disposition: "INLINE", + format: "ARROW_STREAM", + }), + expect.any(AbortSignal), + ); + }); + + test("/query/:query_key should use INLINE + JSON_ARRAY by default when no format specified", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi.fn().mockResolvedValue({ + result: { data: [{ id: 1 }] }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {} }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + expect(executeMock).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ + disposition: "INLINE", + format: "JSON_ARRAY", + }), + expect.any(AbortSignal), + ); + }); + + test("/query/:query_key should pass INLINE + JSON_ARRAY when format is explicitly JSON_ARRAY", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi.fn().mockResolvedValue({ + result: { data: [{ id: 1 }] }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "JSON_ARRAY" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + expect(executeMock.mock.calls[0][1]).toMatchObject({ + disposition: "INLINE", + format: "JSON_ARRAY", + }); + }); + + test("/query/:query_key should fall back ARROW_STREAM from INLINE to EXTERNAL_LINKS when warehouse rejects INLINE", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi + .fn() + .mockRejectedValueOnce( + new Error( + "INVALID_PARAMETER_VALUE: ARROW_STREAM not supported with INLINE disposition", + ), + ) + .mockResolvedValueOnce({ + result: { statement_id: "stmt-1", status: { state: "SUCCEEDED" } }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // First call: INLINE (rejected) + expect(executeMock.mock.calls[0][1]).toMatchObject({ + disposition: "INLINE", + format: "ARROW_STREAM", + }); + // Second call: EXTERNAL_LINKS (fallback) + expect(executeMock.mock.calls[1][1]).toMatchObject({ + disposition: "EXTERNAL_LINKS", + format: "ARROW_STREAM", + }); + }); + + test("/query/:query_key falls back on a structured ExecutionError.errorCode without scanning the message", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + // Properly-structured ExecutionError, as the connector now produces + // when the SDK's ApiError surfaces with errorCode set. + const { ExecutionError } = await import("../../../errors/execution"); + const structuredError = ExecutionError.statementFailed( + "ARROW_STREAM is not supported with INLINE disposition", + "INVALID_PARAMETER_VALUE", + ); + + const executeMock = vi + .fn() + .mockRejectedValueOnce(structuredError) + .mockResolvedValueOnce({ + result: { statement_id: "stmt-1", status: { state: "SUCCEEDED" } }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // Both attempts: INLINE (rejected via structured code) → EXTERNAL_LINKS. + expect(executeMock).toHaveBeenCalledTimes(2); + expect(executeMock.mock.calls[1][1]).toMatchObject({ + disposition: "EXTERNAL_LINKS", + format: "ARROW_STREAM", + }); + }); + + test("/query/:query_key falls back when error message carries a structured INVALID_PARAMETER_VALUE error_code", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + // Wrapped JSON error like the SDK surfaces from a `Bad Request` HTTP + // response. Both INLINE and ARROW_STREAM appear, plus the structured code. + const wrappedJsonError = new Error( + 'Response from server (Bad Request) {"error_code":"INVALID_PARAMETER_VALUE","message":"ARROW_STREAM is not supported with INLINE disposition on this warehouse"}', + ); + const executeMock = vi + .fn() + .mockRejectedValueOnce(wrappedJsonError) + .mockResolvedValueOnce({ + result: { statement_id: "stmt-1", status: { state: "SUCCEEDED" } }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // Both attempts ran: INLINE (rejected) then EXTERNAL_LINKS (succeeded). + expect(executeMock).toHaveBeenCalledTimes(2); + expect(executeMock.mock.calls[1][1]).toMatchObject({ + disposition: "EXTERNAL_LINKS", + format: "ARROW_STREAM", + }); + }); + + test("/query/:query_key does NOT fall back when only one of INLINE/ARROW_STREAM appears in the error", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + // Realistic non-format error that mentions just one of the keywords — + // e.g. an unrelated INVALID_PARAMETER_VALUE about a different param. + const executeMock = vi + .fn() + .mockRejectedValue( + new Error( + 'Response from server (Bad Request) {"error_code":"INVALID_PARAMETER_VALUE","message":"INLINE is not a valid value for parameter `mode`"}', + ), + ); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // The retry interceptor may attempt the query multiple times, but the + // analytics plugin must never escalate to EXTERNAL_LINKS for an error + // that doesn't actually indicate a format/disposition rejection. + for (const call of executeMock.mock.calls) { + expect(call[1]).toMatchObject({ + disposition: "INLINE", + format: "ARROW_STREAM", + }); + } + }); + + test("/query/:query_key should not fall back for non-format errors", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi + .fn() + .mockRejectedValue(new Error("PERMISSION_DENIED: no access")); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // Only one call — non-format error is not retried with different disposition. + for (const call of executeMock.mock.calls) { + expect(call[1]).toMatchObject({ + disposition: "INLINE", + format: "ARROW_STREAM", + }); + } + }); + + test("/query/:query_key stashes ARROW_STREAM INLINE bytes and emits an arrow message with a synthetic inline- id", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + // Real base64 so the route can decode it via Buffer.from(..., "base64"). + const arrowBytes = new Uint8Array([1, 2, 3, 4, 5]); + const fakeAttachment = Buffer.from(arrowBytes).toString("base64"); + const executeMock = vi.fn().mockResolvedValue({ + result: { attachment: fakeAttachment, row_count: 1 }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // The route should not fall back to EXTERNAL_LINKS — INLINE succeeded. + expect(executeMock).toHaveBeenCalledTimes(1); + expect(executeMock.mock.calls[0][1]).toMatchObject({ + disposition: "INLINE", + format: "ARROW_STREAM", + }); + // SSE payload: unified `arrow` message with an inline- prefixed id. + // The base64 attachment must NOT appear on the SSE channel. + const writeCalls = (mockRes.write as any).mock.calls.map( + (c: any[]) => c[0] as string, + ); + const payload = writeCalls.find((s: string) => s.startsWith("data: ")); + expect(payload).toBeDefined(); + expect(payload).toContain('"type":"arrow"'); + expect(payload).toMatch(/"statement_id":"inline-[^"]+"/); + expect(payload).not.toContain("arrow_inline"); + expect(payload).not.toContain(fakeAttachment); + + // The decoded bytes should be in the stash, keyed by the same + // synthetic id; a subsequent /arrow-result fetch will drain them. + const idMatch = payload?.match(/"statement_id":"(inline-[^"]+)"/); + expect(idMatch).not.toBeNull(); + const inlineId = idMatch?.[1]; + const stashed = (plugin as any).inlineArrowStash.take(inlineId, "global"); + expect(stashed).toBeDefined(); + expect(Array.from(stashed)).toEqual(Array.from(arrowBytes)); + }); + + test("/query/:query_key spills the already-decoded bytes into the stash overflow pool when the regular pool is full — single execution, no double-billing", async () => { + // The regular stash put may refuse new entries when at cap. We must + // NOT respond by re-executing the same statement with EXTERNAL_LINKS: + // the warehouse has already been billed, the bytes are already + // decoded server-side, and a second execution can return a divergent + // result for non-deterministic SQL (CURRENT_TIMESTAMP, RAND, late + // rows). The stash's overflow pool absorbs the bytes so the client + // still gets an inline- id pointing at the original result. + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const fakeAttachment = Buffer.from(new Uint8Array([1, 2, 3])).toString( + "base64", + ); + const executeMock = vi.fn().mockResolvedValueOnce({ + result: { attachment: fakeAttachment, row_count: 1 }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + // Force the regular pool to be at cap so the put spills into overflow. + // We construct a tiny stash and pre-fill the regular pool. + const tinyStash = new InlineArrowStash({ + maxBytes: 4, + maxOverflowBytes: 64, + }); + tinyStash.put("filler", new Uint8Array([9, 9, 9, 9])); + (plugin as any).inlineArrowStash = tinyStash; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // Single execution: no EXTERNAL_LINKS retry. + expect(executeMock).toHaveBeenCalledTimes(1); + expect(executeMock.mock.calls[0][1]).toMatchObject({ + disposition: "INLINE", + format: "ARROW_STREAM", + }); + + // SSE payload carries an inline- id pointing at the overflow entry. + const writeCalls = (mockRes.write as any).mock.calls.map( + (c: any[]) => c[0] as string, + ); + const payload = writeCalls.find((s: string) => s.startsWith("data: ")); + expect(payload).toBeDefined(); + expect(payload).toContain('"type":"arrow"'); + expect(payload).toMatch(/"statement_id":"inline-[^"]+"/); + + // Bytes landed in the overflow pool, regular pool size unchanged. + expect(tinyStash.overflowSize()).toBe(3); + expect(tinyStash.size()).toBe(4); + }); + + test("/query/:query_key surfaces a stable error when both regular and overflow pools are exhausted — never silently double-bills", async () => { + // When even the overflow pool cannot fit the payload, the route + // surfaces INLINE_ARROW_STASH_EXHAUSTED instead of re-running the + // statement on EXTERNAL_LINKS. The previous behavior (silent retry) + // double-billed the warehouse and could return divergent results. + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const fakeAttachment = Buffer.from(new Uint8Array([1, 2, 3])).toString( + "base64", + ); + const executeMock = vi.fn().mockResolvedValueOnce({ + result: { attachment: fakeAttachment, row_count: 1 }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + // Force both put paths to refuse: spy returns null unconditionally. + vi.spyOn((plugin as any).inlineArrowStash, "put").mockReturnValue(null); + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // Single execution: never re-runs on EXTERNAL_LINKS. + expect(executeMock).toHaveBeenCalledTimes(1); + + // SSE error payload carries the stable errorCode for UI branching. + // The writer emits separate lines (`id:`, `event: error`, `data: ...`) + // — join them so we can match across the framing. + const writeCalls = (mockRes.write as any).mock.calls.map( + (c: any[]) => c[0] as string, + ); + const errorFrame = writeCalls + .filter((s: string) => s.startsWith("data: ")) + .join("\n"); + expect(errorFrame).toContain("INLINE_ARROW_STASH_EXHAUSTED"); + // Client-safe message reaches the wire; raw upstream text does not. + expect(errorFrame).toContain("Server is at capacity"); + expect(errorFrame).not.toContain("Inline Arrow stash exhausted"); + }); + + test("/query/:query_key falls back JSON_ARRAY to ARROW_STREAM INLINE when warehouse refuses JSON_ARRAY for INLINE", async () => { + // Some serverless warehouses (the ones this PR is centrally aimed at) + // only accept ARROW_STREAM for INLINE results — JSON_ARRAY + INLINE is + // rejected outright. The caller still asked for JSON_ARRAY, so the + // server retries as ARROW_STREAM + INLINE and decodes the attachment + // back into plain row objects: the caller's contract is preserved and + // the SSE channel still carries a `result` message, not an `arrow` + // message. + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + // Real base64 Arrow IPC captured from a serverless warehouse running + // `SELECT 1 AS test_col, 2 AS test_col2` (one row, two INT columns). + const REAL_ARROW_ATTACHMENT = + "/////7gAAAAQAAAAAAAKAAwACgAJAAQACgAAABAAAAAAAQQACAAIAAAABAAIAAAABAAAAAIAAABMAAAABAAAAMz///8QAAAAGAAAAAAAAQIUAAAAvP///yAAAAAAAAABAAAAAAkAAAB0ZXN0X2NvbDIAAAAQABQAEAAOAA8ABAAAAAgAEAAAABgAAAAgAAAAAAABAhwAAAAIAAwABAALAAgAAAAgAAAAAAAAAQAAAAAIAAAAdGVzdF9jb2wAAAAA/////7gAAAAQAAAADAAaABgAFwAEAAgADAAAACAAAAAAAQAAAAAAAAAAAAAAAAADBAAKABgADAAIAAQACgAAADwAAAAQAAAAAQAAAAAAAAAAAAAAAgAAAAEAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAEAAAAAAAAAQAAAAAAAAAAEAAAAAAAAAIAAAAAAAAAAAQAAAAAAAADAAAAAAAAAAAQAAAAAAAAA/wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP////8AAAAA"; + + const executeMock = vi + .fn() + // First call: JSON_ARRAY + INLINE — warehouse rejects. + .mockRejectedValueOnce( + new Error( + 'Response from server (Bad Request) {"error_code":"INVALID_PARAMETER_VALUE","message":"Inline disposition only supports ARROW_STREAM format."}', + ), + ) + // Second call: ARROW_STREAM + INLINE — warehouse returns the bytes. + .mockResolvedValueOnce({ + result: { + attachment: REAL_ARROW_ATTACHMENT, + status: { state: "SUCCEEDED" }, + }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "JSON_ARRAY" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // Two calls: first JSON_ARRAY + INLINE (rejected), then the fallback + // ARROW_STREAM + INLINE (the warehouse's preferred shape for INLINE). + expect(executeMock).toHaveBeenCalledTimes(2); + expect(executeMock.mock.calls[0][1]).toMatchObject({ + disposition: "INLINE", + format: "JSON_ARRAY", + }); + expect(executeMock.mock.calls[1][1]).toMatchObject({ + disposition: "INLINE", + format: "ARROW_STREAM", + }); + + // The SSE wire payload must look like a JSON_ARRAY result, not an + // arrow message — the caller asked for JSON_ARRAY and the server has + // already decoded Arrow → rows. + const writeCalls = (mockRes.write as any).mock.calls.map( + (c: any[]) => c[0] as string, + ); + const payload = writeCalls.find((s: string) => s.startsWith("data: ")); + expect(payload).toBeDefined(); + expect(payload).toContain('"type":"result"'); + expect(payload).not.toContain('"type":"arrow"'); + // Real row values from the captured attachment: test_col=1, test_col2=2. + // Integer columns are coerced to strings to match what JSON_ARRAY would + // have produced for the same warehouse + same INT columns. + expect(payload).toContain('"test_col":"1"'); + expect(payload).toContain('"test_col2":"2"'); + }); + + test("/query/:query_key surfaces an error when both JSON_ARRAY + INLINE and the ARROW_STREAM retry fail", async () => { + // If the JSON_ARRAY retry path (ARROW_STREAM + INLINE) also fails — e.g. + // a downstream warehouse outage that affects both shapes — the route + // must surface the failure rather than silently dropping it. + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + // First mocked call (JSON_ARRAY + INLINE) rejects with a needs-arrow + // signal; every subsequent call rejects with an unrelated failure. The + // retry interceptor may retry the second call multiple times — we only + // care that the retry path was taken and that the request ultimately + // surfaces an error rather than a successful result. + const executeMock = vi.fn().mockImplementation((_wc, opts) => { + if (opts?.disposition === "INLINE" && opts?.format === "JSON_ARRAY") { + return Promise.reject( + new Error( + 'Response from server (Bad Request) {"error_code":"INVALID_PARAMETER_VALUE","message":"Inline disposition only supports ARROW_STREAM format."}', + ), + ); + } + return Promise.reject(new Error("warehouse is down")); + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "JSON_ARRAY" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // The retry happened: at least one ARROW_STREAM + INLINE call followed + // the initial JSON_ARRAY + INLINE rejection. + const formats = executeMock.mock.calls.map((c: any[]) => c[1]); + expect( + formats.some( + (f: any) => f?.disposition === "INLINE" && f?.format === "JSON_ARRAY", + ), + ).toBe(true); + expect( + formats.some( + (f: any) => + f?.disposition === "INLINE" && f?.format === "ARROW_STREAM", + ), + ).toBe(true); + // No call should escalate to EXTERNAL_LINKS — that fallback only + // exists on the ARROW_STREAM caller path. + expect( + formats.some((f: any) => f?.disposition === "EXTERNAL_LINKS"), + ).toBe(false); + + // The SSE payload, if any was written, must NOT carry a successful + // result frame. + const writeCalls = (mockRes.write as any).mock.calls.map( + (c: any[]) => c[0] as string, + ); + const payload = writeCalls.find((s: string) => s.startsWith("data: ")); + if (payload) { + expect(payload).not.toContain('"type":"result"'); + } + }); + + test("/query/:query_key rejects unknown format values with 400", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + const executeMock = vi.fn(); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + // "CSV" is genuinely unsupported. The legacy spellings "JSON" / "ARROW" + // are *accepted* by the route (normalized to JSON_ARRAY / ARROW_STREAM + // for back-compat with appkit < 0.33.0), so they must not be used here. + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "CSV" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + expect(mockRes.status).toHaveBeenCalledWith(400); + expect(executeMock).not.toHaveBeenCalled(); + }); + + test("/query/:query_key does not retry the fallback when the request was aborted", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi.fn().mockImplementation((_wc, _opts, signal) => { + // Simulate a signal that becomes aborted before the failure surfaces — + // e.g. the client cancelled the SSE stream mid-query. Use vitest's + // getter spy rather than Object.defineProperty so we don't try to + // override the native non-configurable AbortSignal.aborted getter. + if (signal) { + vi.spyOn(signal, "aborted", "get").mockReturnValue(true); + } + return Promise.reject( + new Error( + "INVALID_PARAMETER_VALUE: ARROW_STREAM not supported with INLINE disposition", + ), + ); + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // Even though the error message would normally trigger fallback, the + // aborted signal should short-circuit and prevent a second statement. + expect(executeMock).toHaveBeenCalledTimes(1); + }); + + test("/query/:query_key does NOT fall back JSON_ARRAY when the rejection lacks a needs-arrow signal", async () => { + // A generic INVALID_PARAMETER_VALUE that doesn't mention the INLINE + // disposition could be any unrelated SQL/permission error. The classifier + // must NOT interpret it as "warehouse wants ARROW_STREAM" — falling back + // would mask the real failure. + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi + .fn() + .mockRejectedValue( + new Error("INVALID_PARAMETER_VALUE: only supports ARROW_STREAM"), + ); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "JSON_ARRAY" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // All calls stay on JSON_ARRAY + INLINE — no retry path with a different + // disposition or format was taken. + for (const call of executeMock.mock.calls) { + expect(call[1]).toMatchObject({ + disposition: "INLINE", + format: "JSON_ARRAY", + }); + } + }); + test("should return 404 when query file is not found", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); diff --git a/packages/appkit/src/plugins/analytics/tests/inline-arrow-stash.test.ts b/packages/appkit/src/plugins/analytics/tests/inline-arrow-stash.test.ts new file mode 100644 index 000000000..05ae821bd --- /dev/null +++ b/packages/appkit/src/plugins/analytics/tests/inline-arrow-stash.test.ts @@ -0,0 +1,184 @@ +import { describe, expect, test } from "vitest"; +import { InlineArrowStash } from "../inline-arrow-stash"; + +function bytes(n: number): Uint8Array { + return new Uint8Array(n); +} + +// `put()` returns `{ id, pool } | null` — it rejects with null when the stash +// is full. Most tests only care about the id; this helper narrows via the +// non-null contract and returns just the id. Tests that need the pool tag +// call `stash.put(...)` directly. +function mustPut( + stash: InlineArrowStash, + userId: string, + b: Uint8Array, +): string { + const result = stash.put(userId, b); + if (result === null) { + throw new Error("test setup: stash unexpectedly rejected put"); + } + return result.id; +} + +describe("InlineArrowStash", () => { + test("put returns an inline-prefixed synthetic id", () => { + const stash = new InlineArrowStash({ idGenerator: () => "abc" }); + const id = mustPut(stash, "user-1", bytes(100)); + expect(id).toBe("inline-abc"); + }); + + test("take drains the entry", () => { + const stash = new InlineArrowStash(); + const id = mustPut(stash, "user-1", bytes(100)); + expect(stash.count()).toBe(1); + expect(stash.size()).toBe(100); + + const got = stash.take(id, "user-1"); + expect(got).toBeDefined(); + expect(got?.length).toBe(100); + expect(stash.count()).toBe(0); + expect(stash.size()).toBe(0); + // Drain-on-read: second take returns undefined. + expect(stash.take(id, "user-1")).toBeUndefined(); + }); + + test("take returns undefined for unknown id", () => { + const stash = new InlineArrowStash(); + expect(stash.take("inline-nope", "user-1")).toBeUndefined(); + }); + + test("take returns undefined when userId does not match", () => { + const stash = new InlineArrowStash(); + const id = mustPut(stash, "user-1", bytes(100)); + expect(stash.take(id, "user-2")).toBeUndefined(); + // Entry is still there for the right user. + expect(stash.take(id, "user-1")).toBeDefined(); + }); + + test("entries past TTL are evicted on next gc tick", () => { + let clock = 0; + // gcMinIntervalMs: 0 disables gc throttling so the test can drive + // the clock past TTL on a sub-throttle scale without the gc pass + // being skipped. + const stash = new InlineArrowStash({ + ttlMs: 1000, + gcMinIntervalMs: 0, + now: () => clock, + }); + const id = mustPut(stash, "user-1", bytes(50)); + clock = 999; + expect(stash.take(id, "user-1")).toBeDefined(); + + const id2 = mustPut(stash, "user-1", bytes(50)); + clock = 2000; + // Bump the clock past TTL and trigger gc via another put. + mustPut(stash, "user-2", bytes(10)); + expect(stash.take(id2, "user-1")).toBeUndefined(); + }); + + test("put spills into the overflow pool when the regular pool is at cap — every issued id remains valid", () => { + let seq = 0; + const stash = new InlineArrowStash({ + maxBytes: 200, + maxOverflowBytes: 200, + idGenerator: () => String(seq++), + }); + const a = mustPut(stash, "user-1", bytes(80)); + const b = mustPut(stash, "user-1", bytes(80)); + expect(stash.size()).toBe(160); + expect(stash.overflowSize()).toBe(0); + + // The third 80-byte entry would push the regular pool to 240 (>200), + // so it spills into overflow. Both prior entries must survive. + const c = mustPut(stash, "user-1", bytes(80)); + expect(stash.size()).toBe(160); + expect(stash.overflowSize()).toBe(80); + expect(stash.take(a, "user-1")).toBeDefined(); + expect(stash.take(b, "user-1")).toBeDefined(); + expect(stash.take(c, "user-1")).toBeDefined(); + // After draining the overflow entry, the counter reflects it. + expect(stash.overflowSize()).toBe(0); + }); + + test("put returns null only when both regular and overflow pools are full", () => { + let seq = 0; + const stash = new InlineArrowStash({ + maxBytes: 100, + maxOverflowBytes: 100, + idGenerator: () => String(seq++), + }); + const a = mustPut(stash, "user-1", bytes(100)); // fills regular + const b = mustPut(stash, "user-1", bytes(100)); // fills overflow + expect(stash.size()).toBe(100); + expect(stash.overflowSize()).toBe(100); + + // Both pools at cap — refuse rather than evict. + const c = stash.put("user-1", bytes(50)); + expect(c).toBeNull(); + expect(stash.take(a, "user-1")).toBeDefined(); + expect(stash.take(b, "user-1")).toBeDefined(); + }); + + test("put tags the result with its pool so callers can label telemetry without re-introspecting", () => { + const stash = new InlineArrowStash({ + maxBytes: 100, + maxOverflowBytes: 100, + }); + expect(stash.put("u", bytes(80))).toMatchObject({ pool: "regular" }); + // Regular has 80/100 used; another 80 would push it to 160 > 100 so it + // spills. + expect(stash.put("u", bytes(80))).toMatchObject({ pool: "overflow" }); + }); + + test("put throws when a single payload would not fit in the largest pool (caller misconfiguration)", () => { + const stash = new InlineArrowStash({ + maxBytes: 100, + maxOverflowBytes: 100, + }); + // 300 > max(maxBytes, maxOverflowBytes) (100); pools don't split, + // so no individual put can ever succeed. + expect(() => stash.put("user-1", bytes(300))).toThrow( + /exceeds largest stash slot/, + ); + }); + + test("overflow entries expire on a shorter TTL than regular entries", () => { + let clock = 0; + const stash = new InlineArrowStash({ + maxBytes: 80, + maxOverflowBytes: 80, + ttlMs: 600_000, + overflowTtlMs: 30_000, + gcMinIntervalMs: 0, + now: () => clock, + }); + const reg = mustPut(stash, "user-1", bytes(80)); // fills regular + const ovf = mustPut(stash, "user-1", bytes(80)); // spills to overflow + expect(stash.size()).toBe(80); + expect(stash.overflowSize()).toBe(80); + + // 45 s in: overflow expired, regular still alive. + clock = 45_000; + expect(stash.take(ovf, "user-1")).toBeUndefined(); + expect(stash.take(reg, "user-1")).toBeDefined(); + }); + + test("synthetic ids are unique across puts", () => { + const stash = new InlineArrowStash(); + const a = mustPut(stash, "user-1", bytes(10)); + const b = mustPut(stash, "user-1", bytes(10)); + expect(a).not.toBe(b); + expect(a.startsWith("inline-")).toBe(true); + expect(b.startsWith("inline-")).toBe(true); + }); + + test("clear drops every entry", () => { + const stash = new InlineArrowStash(); + mustPut(stash, "user-1", bytes(10)); + mustPut(stash, "user-2", bytes(20)); + stash.clear(); + expect(stash.count()).toBe(0); + expect(stash.size()).toBe(0); + }); +}); diff --git a/packages/appkit/src/stream/defaults.ts b/packages/appkit/src/stream/defaults.ts index c8fc91591..5cb822efd 100644 --- a/packages/appkit/src/stream/defaults.ts +++ b/packages/appkit/src/stream/defaults.ts @@ -1,6 +1,13 @@ export const streamDefaults = { bufferSize: 100, - maxEventSize: 1024 * 1024, // 1MB + // 1 MiB. SSE is used only for short JSON control messages — JSON_ARRAY + // result rows (already row-size-bounded by the warehouse) and the small + // `arrow` envelope (statement id + status) for ARROW_STREAM. Bulk Arrow + // payloads do not traverse SSE; they are fetched over HTTP via + // `/api/analytics/arrow-result/:jobId`, which dispatches to the warehouse + // (EXTERNAL_LINKS) or the server-side `InlineArrowStash` (INLINE) based + // on the id prefix. + maxEventSize: 1 * 1024 * 1024, bufferTTL: 10 * 60 * 1000, // 10 minutes cleanupInterval: 5 * 60 * 1000, // 5 minutes maxPersistentBuffers: 10000, // 10000 buffers diff --git a/packages/appkit/src/stream/sse-writer.ts b/packages/appkit/src/stream/sse-writer.ts index b73cdb587..2b49d1dee 100644 --- a/packages/appkit/src/stream/sse-writer.ts +++ b/packages/appkit/src/stream/sse-writer.ts @@ -39,12 +39,14 @@ export class SSEWriter { eventId: string, error: string, code: SSEErrorCode = SSEErrorCode.INTERNAL_ERROR, + errorCode?: string, ): void { if (res.writableEnded) return; const errorData: SSEError = { error, code, + ...(errorCode ? { errorCode } : {}), }; res.write(`id: ${eventId}\n`); diff --git a/packages/appkit/src/stream/stream-manager.ts b/packages/appkit/src/stream/stream-manager.ts index 901e0b46c..2cacbcb9c 100644 --- a/packages/appkit/src/stream/stream-manager.ts +++ b/packages/appkit/src/stream/stream-manager.ts @@ -1,6 +1,8 @@ import { randomUUID } from "node:crypto"; import { context } from "@opentelemetry/api"; import type { IAppResponse, StreamConfig } from "shared"; +import { AppKitError } from "../errors/base"; +import { ExecutionError } from "../errors/execution"; import { createLogger } from "../logging/logger"; import { EventRingBuffer } from "./buffers"; import { streamDefaults } from "./defaults"; @@ -285,8 +287,21 @@ export class StreamManager { // cleanup if no clients are connected this._cleanupStream(streamEntry); } catch (error) { - const errorMsg = + // Two distinct messages: a *raw* one for server-side logs (full + // detail, statement fragments, correlation IDs) and a *client* + // one for the SSE payload (sanitized, stable, safe to render in + // a UI). Mixing them leaks upstream wording to anyone connected + // to the stream — see CWE-209. + const rawMsg = error instanceof Error ? error.message : "Internal server error"; + const clientMsg = + error instanceof AppKitError + ? error.clientMessage + : "Internal server error"; + // Upstream structured code (e.g. INLINE_ARROW_STASH_EXHAUSTED, + // NOT_IMPLEMENTED). UI should branch on this, not on `error`. + const upstreamCode = + error instanceof ExecutionError ? error.errorCode : undefined; const errorEventId = randomUUID(); const errorCode = this._categorizeError(error); @@ -295,17 +310,24 @@ export class StreamManager { logger.info("Stream aborted by client (code=%s)", errorCode); } else { logger.error( - "Stream execution failed: %s (code=%s)", - errorMsg, + "Stream execution failed: %s (code=%s upstreamCode=%s)", + rawMsg, errorCode, + upstreamCode ?? "n/a", ); } + const payload: Record = { + error: clientMsg, + code: errorCode, + }; + if (upstreamCode) payload.errorCode = upstreamCode; + // buffer error event streamEntry.eventBuffer.add({ id: errorEventId, type: "error", - data: JSON.stringify({ error: errorMsg, code: errorCode }), + data: JSON.stringify(payload), timestamp: Date.now(), }); @@ -313,9 +335,10 @@ export class StreamManager { this._broadcastErrorToClients( streamEntry, errorEventId, - errorMsg, + clientMsg, errorCode, true, + upstreamCode, ); streamEntry.isCompleted = true; } @@ -370,10 +393,17 @@ export class StreamManager { errorMessage: string, errorCode: SSEErrorCode, closeClients: boolean = false, + upstreamCode?: string, ): void { for (const client of streamEntry.clients) { if (!client.writableEnded) { - this.sseWriter.writeError(client, eventId, errorMessage, errorCode); + this.sseWriter.writeError( + client, + eventId, + errorMessage, + errorCode, + upstreamCode, + ); if (closeClients) { client.end(); } diff --git a/packages/appkit/src/stream/tests/stream-registry.test.ts b/packages/appkit/src/stream/tests/stream-registry.test.ts index d3f70e95a..efb88c888 100644 --- a/packages/appkit/src/stream/tests/stream-registry.test.ts +++ b/packages/appkit/src/stream/tests/stream-registry.test.ts @@ -374,7 +374,7 @@ describe("StreamRegistry", () => { expect(dataCall).toBeDefined(); const payload = JSON.parse( - (dataCall![0] as string).replace("data: ", "").trim(), + (dataCall?.[0] as string).replace("data: ", "").trim(), ); expect(payload).toEqual({ error: "Stream evicted", diff --git a/packages/appkit/src/stream/tests/stream.test.ts b/packages/appkit/src/stream/tests/stream.test.ts index c18d0f67b..a99242038 100644 --- a/packages/appkit/src/stream/tests/stream.test.ts +++ b/packages/appkit/src/stream/tests/stream.test.ts @@ -233,21 +233,30 @@ describe("StreamManager", () => { }); describe("error handling", () => { - test("should send error event when handler throws", async () => { + test("should send a sanitized error event when handler throws — raw error text is kept server-side only", async () => { + // The SSE error broadcaster never echoes a bare Error's `.message` + // to the wire: that path can carry statement fragments, internal + // object names, and correlation IDs (CWE-209). Non-AppKitError + // throws collapse to "Internal server error"; AppKitErrors carry + // their `clientMessage` (sanitized) and `errorCode` (structured). const { mockRes, events } = createMockResponse(); async function* generator() { yield { type: "start" }; - throw new Error("Test error"); + throw new Error("Test error with /internal/path and corrId=abc-123"); } await streamManager.stream(mockRes as any, generator); expect(events).toContain('data: {"type":"start"}\n\n'); expect(events).toContain("event: error\n"); - expect(events).toContain( - 'data: {"error":"Test error","code":"INTERNAL_ERROR"}\n\n', - ); + const dataLines = events.filter((e) => e.startsWith("data: ")); + const errorLine = dataLines.find((e) => e.includes('"error"')); + expect(errorLine).toContain('"error":"Internal server error"'); + expect(errorLine).toContain('"code":"INTERNAL_ERROR"'); + // The raw Error.message must not appear anywhere on the wire. + expect(events.join("")).not.toContain("/internal/path"); + expect(events.join("")).not.toContain("corrId=abc-123"); expect(mockRes.end).toHaveBeenCalled(); }); @@ -513,10 +522,15 @@ describe("StreamManager", () => { await streamManager.stream(mockRes2 as any, generator2, { streamId }); - const replayedError = events2.some((e) => - e.includes("Something went wrong"), + // The buffered error event must be replayed — sanitized, not raw. + // We don't pin the exact wording (that's a separate test) but we + // do require the error frame to be present. + const replayedError = events2.some( + (e) => e.startsWith("data:") && e.includes('"error"'), ); expect(replayedError).toBe(true); + // And the raw thrown message must not have leaked. + expect(events2.join("")).not.toContain("Something went wrong"); }); test("should detect buffer overflow on completed stream and close connection", async () => { diff --git a/packages/appkit/src/stream/types.ts b/packages/appkit/src/stream/types.ts index bb6f65f6e..ba0d4915e 100644 --- a/packages/appkit/src/stream/types.ts +++ b/packages/appkit/src/stream/types.ts @@ -25,6 +25,12 @@ export type SSEErrorCode = (typeof SSEErrorCode)[keyof typeof SSEErrorCode]; export interface SSEError { error: string; code: SSEErrorCode; + /** + * Upstream-domain structured code (e.g. `INLINE_ARROW_STASH_EXHAUSTED`, + * `NOT_IMPLEMENTED`). UI code should branch on this instead of parsing + * the human-readable `error` string. + */ + errorCode?: string; } export interface BufferedEvent { diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts index 06ee64bac..ced586f40 100644 --- a/packages/appkit/src/type-generator/query-registry.ts +++ b/packages/appkit/src/type-generator/query-registry.ts @@ -1,6 +1,7 @@ import fs from "node:fs/promises"; import path from "node:path"; import { WorkspaceClient } from "@databricks/sdk-experimental"; +import { tableFromIPC } from "apache-arrow"; import pc from "picocolors"; import { createLogger } from "../logging/logger"; import { CACHE_VERSION, hashSQL, loadCache, saveCache } from "./cache"; @@ -129,18 +130,69 @@ function formatParametersType(sql: string): string { : "Record"; } +/** + * Decode a base64 Arrow IPC attachment from a DESCRIBE QUERY response and + * extract column metadata. Returns the same shape as rows parsed from the + * legacy data_array path. + * + * IMPORTANT: a DESCRIBE QUERY response is itself a result *table* with rows + * shaped like `(col_name, data_type, comment)` describing the user query's + * output schema. We must read those rows — NOT `table.schema.fields`, which + * would describe DESCRIBE QUERY's own output (`col_name`, `data_type`, + * `comment`) and yield bogus types for every query. + */ +function columnsFromArrowAttachment( + attachment: string, +): Array<{ name: string; type_name: string; comment: string | undefined }> { + const buf = Buffer.from(attachment, "base64"); + const table = tableFromIPC(buf); + return table.toArray().map((row) => { + const obj = row.toJSON() as { + col_name?: unknown; + data_type?: unknown; + comment?: unknown; + }; + return { + name: typeof obj.col_name === "string" ? obj.col_name : "", + type_name: + typeof obj.data_type === "string" + ? obj.data_type.toUpperCase() + : "STRING", + comment: + typeof obj.comment === "string" && obj.comment !== "" + ? obj.comment + : undefined, + }; + }); +} + export function convertToQueryType( result: DatabricksStatementExecutionResponse, sql: string, queryName: string, ): { type: string; hasResults: boolean } { const dataRows = result.result?.data_array || []; - const columns = dataRows.map((row) => ({ + let columns = dataRows.map((row) => ({ name: row[0] || "", type_name: row[1]?.toUpperCase() || "STRING", comment: row[2] || undefined, })); + // Fallback: serverless warehouses return ARROW_STREAM format with an inline + // base64 attachment instead of data_array. Decode the Arrow IPC rows (the + // DESCRIBE QUERY result table) to extract column names and types. + if (columns.length === 0 && result.result?.attachment) { + logger.debug("data_array empty, decoding Arrow IPC attachment for schema"); + try { + columns = columnsFromArrowAttachment(result.result.attachment); + } catch (err) { + logger.warn( + "Failed to decode Arrow IPC attachment: %s", + err instanceof Error ? err.message : String(err), + ); + } + } + const paramsType = formatParametersType(sql); // generate result fields with JSDoc @@ -391,6 +443,16 @@ export async function generateQueriesFromDescribe( `Describing ${total} ${total === 1 ? "query" : "queries"} (0/${total})`, ); + // Some serverless warehouses reject JSON_ARRAY+INLINE for DESCRIBE — and + // they signal the rejection two different ways: either as a thrown error, + // or as a `status.state === "FAILED"` response. Both paths funnel through + // this matcher so we can retry with ARROW_STREAM+INLINE consistently. + const looksLikeFormatRejection = (msg: string): boolean => + msg.includes("JSON_ARRAY") && + (msg.includes("not supported") || + msg.includes("INVALID_PARAMETER_VALUE") || + msg.includes("NOT_IMPLEMENTED")); + const describeOne = async ({ index, queryName, @@ -398,10 +460,56 @@ export async function generateQueriesFromDescribe( sqlHash, cleanedSql, }: (typeof uncachedQueries)[number]): Promise => { - const result = (await client.statementExecution.executeStatement({ - statement: `DESCRIBE QUERY ${cleanedSql}`, - warehouse_id: warehouseId, - })) as DatabricksStatementExecutionResponse; + // Prefer JSON_ARRAY + INLINE so `data_array` parsing works directly. + // Some serverless warehouses reject this combination — fall back to + // ARROW_STREAM + INLINE (still inline, just a different format) and + // let `convertToQueryType` decode the inline attachment. Forcing + // INLINE on the retry avoids EXTERNAL_LINKS, which would silently + // produce empty `data_array` and degrade types to `unknown`. + let result: DatabricksStatementExecutionResponse; + try { + result = (await client.statementExecution.executeStatement({ + statement: `DESCRIBE QUERY ${cleanedSql}`, + warehouse_id: warehouseId, + format: "JSON_ARRAY", + disposition: "INLINE", + })) as DatabricksStatementExecutionResponse; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + if (looksLikeFormatRejection(msg)) { + logger.debug( + "Warehouse rejected JSON_ARRAY+INLINE for %s (thrown), retrying with ARROW_STREAM+INLINE", + queryName, + ); + result = (await client.statementExecution.executeStatement({ + statement: `DESCRIBE QUERY ${cleanedSql}`, + warehouse_id: warehouseId, + format: "ARROW_STREAM", + disposition: "INLINE", + })) as DatabricksStatementExecutionResponse; + } else { + throw err; + } + } + + // Some warehouses surface the format rejection as `status.state === + // "FAILED"` instead of throwing. Detect that shape and retry with + // ARROW_STREAM before we degrade the type to `unknown`. + if ( + result.status.state === "FAILED" && + looksLikeFormatRejection(result.status.error?.message ?? "") + ) { + logger.debug( + "Warehouse rejected JSON_ARRAY+INLINE for %s (state=FAILED), retrying with ARROW_STREAM+INLINE", + queryName, + ); + result = (await client.statementExecution.executeStatement({ + statement: `DESCRIBE QUERY ${cleanedSql}`, + warehouse_id: warehouseId, + format: "ARROW_STREAM", + disposition: "INLINE", + })) as DatabricksStatementExecutionResponse; + } completed++; spinner.update( @@ -409,10 +517,11 @@ export async function generateQueriesFromDescribe( ); logger.debug( - "DESCRIBE result for %s: state=%s, rows=%d", + "DESCRIBE result for %s: state=%s, rows=%d, hasAttachment=%s", queryName, result.status.state, result.result?.data_array?.length ?? 0, + !!result.result?.attachment, ); if (result.status.state === "FAILED") { diff --git a/packages/appkit/src/type-generator/tests/query-registry.test.ts b/packages/appkit/src/type-generator/tests/query-registry.test.ts index b149d5bbe..5d77f2aee 100644 --- a/packages/appkit/src/type-generator/tests/query-registry.test.ts +++ b/packages/appkit/src/type-generator/tests/query-registry.test.ts @@ -1,4 +1,24 @@ -import { describe, expect, test } from "vitest"; +import { Table, tableToIPC, vectorFromArray } from "apache-arrow"; +import { describe, expect, test, vi } from "vitest"; + +const { mockLoggerWarn, mockLoggerDebug } = vi.hoisted(() => ({ + mockLoggerWarn: vi.fn(), + mockLoggerDebug: vi.fn(), +})); +vi.mock("../../logging/logger", () => ({ + createLogger: vi.fn(() => ({ + debug: mockLoggerDebug, + info: vi.fn(), + warn: mockLoggerWarn, + error: vi.fn(), + event: vi.fn(() => ({ + set: vi.fn().mockReturnThis(), + setComponent: vi.fn().mockReturnThis(), + setContext: vi.fn().mockReturnThis(), + })), + })), +})); + import { convertToQueryType, defaultForType, @@ -11,6 +31,20 @@ import { } from "../query-registry"; import type { DatabricksStatementExecutionResponse } from "../types"; +// Build a base64 Arrow IPC payload that mimics a DESCRIBE QUERY response — +// a result *table* with columns (col_name, data_type, comment) describing +// the user query's output schema. +function describeQueryAttachment( + rows: Array<{ col_name: string; data_type: string; comment: string | null }>, +): string { + const table = new Table({ + col_name: vectorFromArray(rows.map((r) => r.col_name)), + data_type: vectorFromArray(rows.map((r) => r.data_type)), + comment: vectorFromArray(rows.map((r) => r.comment ?? "")), + }); + return Buffer.from(tableToIPC(table, "stream")).toString("base64"); +} + describe("normalizeTypeName", () => { test("returns simple types unchanged", () => { expect(normalizeTypeName("STRING")).toBe("STRING"); @@ -389,6 +423,107 @@ SELECT * FROM users WHERE date = :startDate AND count = :count AND name = :name` ); expect(hasResults).toBe(false); }); + + describe("ARROW_STREAM attachment fallback (serverless warehouses)", () => { + test("decodes column metadata from Arrow IPC data rows, not schema fields", () => { + // Critical regression test: it would be a bug to read + // `table.schema.fields` here, which would generate types like + // { col_name: string; data_type: string; comment: string } for every + // query (those are DESCRIBE QUERY's own output columns). We must read + // the data rows. + const attachment = describeQueryAttachment([ + { col_name: "user_id", data_type: "BIGINT", comment: null }, + { col_name: "name", data_type: "STRING", comment: "display name" }, + { col_name: "active", data_type: "BOOLEAN", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-arrow", + status: { state: "SUCCEEDED" }, + result: { attachment }, + }; + + const { type, hasResults } = convertToQueryType( + response, + "SELECT user_id, name, active FROM users", + "users", + ); + + expect(hasResults).toBe(true); + // Real query columns appear in the generated type: + expect(type).toContain("user_id: number"); + expect(type).toContain("name: string"); + expect(type).toContain("active: boolean"); + // Column comments survive: + expect(type).toContain("/** display name"); + // The DESCRIBE QUERY metadata column names must NOT leak as user types: + expect(type).not.toContain("col_name: string"); + expect(type).not.toContain("data_type: string"); + }); + + test("normalizes lowercase data_type values to uppercase", () => { + const attachment = describeQueryAttachment([ + { col_name: "id", data_type: "int", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-arrow", + status: { state: "SUCCEEDED" }, + result: { attachment }, + }; + + const { type } = convertToQueryType(response, "SELECT 1", "test"); + expect(type).toContain("@sqlType INT"); + expect(type).toContain("id: number"); + }); + + test("prefers data_array over attachment when both are present", () => { + const attachment = describeQueryAttachment([ + { col_name: "from_arrow", data_type: "STRING", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-both", + status: { state: "SUCCEEDED" }, + result: { + data_array: [["from_data_array", "INT", null]], + attachment, + }, + }; + + const { type } = convertToQueryType(response, "SELECT 1", "test"); + expect(type).toContain("from_data_array: number"); + expect(type).not.toContain("from_arrow"); + }); + + test("logs a warning and yields the unknown-result fallback on malformed attachment", () => { + mockLoggerWarn.mockClear(); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-bad", + status: { state: "SUCCEEDED" }, + result: { attachment: "not-valid-arrow-ipc" }, + }; + + const { hasResults, type } = convertToQueryType( + response, + "SELECT 1", + "test", + ); + + // No columns extracted → unknown-result type, hasResults false. + expect(hasResults).toBe(false); + expect(type).toContain("unknown"); + // None of DESCRIBE QUERY's metadata column names should leak in as + // user-facing type fields — that would mean the parser swallowed + // the failure and produced bogus columns instead. + expect(type).not.toContain("col_name"); + expect(type).not.toContain("data_type"); + + // The warning must fire so a regression that silently produces empty + // types (no telemetry signal) fails this test. + expect(mockLoggerWarn).toHaveBeenCalledWith( + expect.stringContaining("Failed to decode Arrow IPC attachment"), + expect.any(String), + ); + }); + }); }); describe("inferParameterTypes", () => { diff --git a/packages/appkit/src/type-generator/types.ts b/packages/appkit/src/type-generator/types.ts index f54176a8c..fdc420483 100644 --- a/packages/appkit/src/type-generator/types.ts +++ b/packages/appkit/src/type-generator/types.ts @@ -12,6 +12,8 @@ export interface DatabricksStatementExecutionResponse { }; result?: { data_array?: (string | null)[][]; + /** Base64-encoded Arrow IPC bytes (returned by serverless warehouses using ARROW_STREAM format) */ + attachment?: string; }; } diff --git a/packages/shared/package.json b/packages/shared/package.json index 27d268ca3..542f7a965 100644 --- a/packages/shared/package.json +++ b/packages/shared/package.json @@ -40,6 +40,7 @@ "ajv": "8.17.1", "ajv-formats": "3.0.1", "@clack/prompts": "1.0.1", - "commander": "12.1.0" + "commander": "12.1.0", + "zod": "4.3.6" } } diff --git a/packages/shared/src/index.ts b/packages/shared/src/index.ts index 9829729a7..d036e0dbd 100644 --- a/packages/shared/src/index.ts +++ b/packages/shared/src/index.ts @@ -4,4 +4,5 @@ export * from "./execute"; export * from "./genie"; export * from "./plugin"; export * from "./sql"; +export * from "./sse/analytics"; export * from "./tunnel"; diff --git a/packages/shared/src/sse/analytics.test.ts b/packages/shared/src/sse/analytics.test.ts new file mode 100644 index 000000000..f66437c36 --- /dev/null +++ b/packages/shared/src/sse/analytics.test.ts @@ -0,0 +1,87 @@ +import { describe, expect, test } from "vitest"; +import { + AnalyticsSseMessage, + makeArrowMessage, + makeResultMessage, +} from "./analytics"; + +describe("AnalyticsSseMessage schema", () => { + test("accepts a result message with rows", () => { + const parsed = AnalyticsSseMessage.parse({ + type: "result", + data: [{ id: 1, name: "alice" }], + }); + expect(parsed.type).toBe("result"); + }); + + test("accepts a result message with no data (empty result)", () => { + expect(() => AnalyticsSseMessage.parse({ type: "result" })).not.toThrow(); + }); + + test("accepts an arrow message with warehouse statement_id", () => { + const parsed = AnalyticsSseMessage.parse({ + type: "arrow", + statement_id: "stmt-1", + }); + expect(parsed.type).toBe("arrow"); + }); + + test("accepts an arrow message with synthetic inline- id", () => { + // Inline Arrow payloads are stashed server-side and surfaced through the + // same `arrow` message variant — the `inline-` prefix tells the + // /arrow-result handler to drain the stash instead of hitting the + // warehouse. The schema must accept both id shapes transparently. + const parsed = AnalyticsSseMessage.parse({ + type: "arrow", + statement_id: "inline-abc-123", + }); + expect(parsed.statement_id).toBe("inline-abc-123"); + }); + + test("rejects an arrow message with empty statement_id", () => { + expect(() => + AnalyticsSseMessage.parse({ type: "arrow", statement_id: "" }), + ).toThrow(); + }); + + test("rejects an arrow message with no statement_id", () => { + expect(() => AnalyticsSseMessage.parse({ type: "arrow" })).toThrow(); + }); + + test("rejects the retired arrow_inline message type", () => { + // arrow_inline was the prior wire shape (base64 payload on the SSE + // channel). The current protocol routes all Arrow payloads through + // /arrow-result; the type must no longer parse. + expect(() => + AnalyticsSseMessage.parse({ type: "arrow_inline", attachment: "AQID" }), + ).toThrow(); + }); + + test("rejects an unknown type", () => { + expect(() => + AnalyticsSseMessage.parse({ type: "unknown_kind", foo: "bar" }), + ).toThrow(); + }); + + test("safeParse returns success: false for malformed payloads", () => { + const r = AnalyticsSseMessage.safeParse({ type: "arrow" }); + expect(r.success).toBe(false); + }); +}); + +describe("typed builders", () => { + test("makeResultMessage roundtrips through the schema", () => { + const msg = makeResultMessage([{ id: 1 }], { statement_id: "s-1" }); + expect(() => AnalyticsSseMessage.parse(msg)).not.toThrow(); + }); + + test("makeArrowMessage roundtrips through the schema", () => { + const msg = makeArrowMessage("stmt-2"); + expect(() => AnalyticsSseMessage.parse(msg)).not.toThrow(); + }); + + test("makeArrowMessage accepts synthetic inline- ids", () => { + const msg = makeArrowMessage("inline-some-uuid"); + expect(() => AnalyticsSseMessage.parse(msg)).not.toThrow(); + }); +}); diff --git a/packages/shared/src/sse/analytics.ts b/packages/shared/src/sse/analytics.ts new file mode 100644 index 000000000..eba7597e6 --- /dev/null +++ b/packages/shared/src/sse/analytics.ts @@ -0,0 +1,103 @@ +import { z } from "zod"; + +/** + * Wire protocol for analytics SSE messages emitted by `/api/analytics/query`. + * + * These schemas are the single source of truth for the contract between the + * server (`AnalyticsPlugin._handleQueryRoute`) and the client + * (`useAnalyticsQuery`). Both sides validate with the same schema: + * + * - Server uses the typed builders (`makeResultMessage`, `makeArrowMessage`) + * to construct messages with compile-time guarantees that all required + * fields are present. + * - Client calls `AnalyticsSseMessage.parse(JSON.parse(event.data))` to fail + * loudly on a malformed payload instead of silently treating an undefined + * field as data. + * + * Arrow payloads — inline or external-links — never traverse the SSE control + * channel; both flow through `/api/analytics/arrow-result/:jobId` and are + * differentiated by an `inline-` prefix on the job id (see + * `InlineArrowStash`). The wire shape from the client's perspective is + * therefore uniform: an `arrow` message carries an id, the client fetches. + * + * Adding a new message variant requires a schema update here, which keeps + * server and client in lockstep. + */ + +/** Successful row-shaped result (JSON_ARRAY format, or empty results). */ +export const AnalyticsResultMessage = z.object({ + type: z.literal("result"), + // `data` is intentionally `z.array(z.unknown())` rather than a deep + // row schema. Validating every row's keys for shape costs O(rows × cols) + // CPU and main-thread blocking time on the *client* (the schema is + // also reused for `safeParse` in `useAnalyticsQuery`); for large JSON + // results that pushes hundreds of ms to seconds of TBT into the + // render pipeline. The server constructs `data` via the typed + // `makeResultMessage` builder, so the per-row shape is enforced at + // the source, not at validation time. The TS-level interface below + // narrows `data` to `Record[]` for callers. + data: z.array(z.unknown()).optional(), + // Status is opaque metadata forwarded from the warehouse — keep it as + // `unknown` so we don't bake the SDK's detailed shape into the contract. + status: z.unknown().optional(), + statement_id: z.string().optional(), +}); + +/** + * TS-level shape of a successful row-shaped result message. + * + * **Kept in sync by hand** with `AnalyticsResultMessage` above. The Zod + * schema is intentionally loose (`z.array(z.unknown())`) to keep client + * validation cheap; this interface narrows `data` to + * `Record[]` so consumers don't have to cast at every + * call site. If you add a field to the Zod schema, add it here too. + */ +export interface AnalyticsResultMessage { + type: "result"; + data?: Record[]; + status?: unknown; + statement_id?: string; +} + +/** + * ARROW_STREAM result delivered via /arrow-result/:jobId. The id is either: + * - the warehouse-issued `statement_id` for EXTERNAL_LINKS responses, or + * - a synthetic `inline-` id pointing at the server-side + * `InlineArrowStash` for INLINE responses. + * + * Both shapes are fetched the same way; the prefix tells the route handler + * which path to take. + */ +export const AnalyticsArrowMessage = z.object({ + type: z.literal("arrow"), + statement_id: z.string().min(1), + status: z.unknown().optional(), +}); +export type AnalyticsArrowMessage = z.infer; + +/** Discriminated union of every message the analytics SSE stream may emit. */ +export const AnalyticsSseMessage = z.discriminatedUnion("type", [ + AnalyticsResultMessage, + AnalyticsArrowMessage, +]); +export type AnalyticsSseMessage = z.infer; + +// ──────────────────────────────────────────────────────────────────────────── +// Typed builders — call from the server route handler. The compiler enforces +// that every required field is supplied, and the return type narrows so +// downstream code (executeStream / SSE writer) keeps full type information. +// ──────────────────────────────────────────────────────────────────────────── + +export function makeResultMessage( + data: Record[] | undefined, + extras: { status?: unknown; statement_id?: string } = {}, +): AnalyticsResultMessage { + return { type: "result", data, ...extras }; +} + +export function makeArrowMessage( + statement_id: string, + extras: { status?: unknown } = {}, +): AnalyticsArrowMessage { + return { type: "arrow", statement_id, ...extras }; +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c576bd74a..6353a26ba 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -299,6 +299,9 @@ importers: '@types/semver': specifier: 7.7.1 version: 7.7.1 + apache-arrow: + specifier: 21.1.0 + version: 21.1.0 dotenv: specifier: 16.6.1 version: 16.6.1 @@ -554,6 +557,9 @@ importers: commander: specifier: 12.1.0 version: 12.1.0 + zod: + specifier: 4.3.6 + version: 4.3.6 devDependencies: '@types/express': specifier: 4.17.23 @@ -5562,7 +5568,7 @@ packages: basic-ftp@5.0.5: resolution: {integrity: sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==} engines: {node: '>=10.0.0'} - deprecated: Security vulnerability fixed in 5.2.0, please upgrade + deprecated: Security vulnerability fixed in 5.2.1, please upgrade batch@0.6.1: resolution: {integrity: sha512-x+VAiMRL6UPkx+kudNvxTl6hB2XNNCG2r+7wixVfIYwu/2HKRXimwQyaumLjMveWvT2Hkd/cAJw+QBMfJ/EKVw==}