Skip to content

Commit 8240130

Browse files
committed
Add sse tracking
1 parent c72fa4e commit 8240130

9 files changed

Lines changed: 247 additions & 34 deletions

File tree

apps/sim/app/workspace/[workspaceId]/home/hooks/use-chat.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3533,9 +3533,7 @@ export function useChat(
35333533
method: 'POST',
35343534
headers: {
35353535
'Content-Type': 'application/json',
3536-
...(stopTraceparentSnapshot
3537-
? { traceparent: stopTraceparentSnapshot }
3538-
: {}),
3536+
...(stopTraceparentSnapshot ? { traceparent: stopTraceparentSnapshot } : {}),
35393537
},
35403538
body: JSON.stringify({
35413539
streamId: sid,

apps/sim/lib/copilot/generated/trace-attribute-values-v1.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,19 @@ export const CopilotResumeOutcome = {
191191
export type CopilotResumeOutcomeKey = keyof typeof CopilotResumeOutcome
192192
export type CopilotResumeOutcomeValue = (typeof CopilotResumeOutcome)[CopilotResumeOutcomeKey]
193193

194+
export const CopilotSseCloseReason = {
195+
Aborted: 'aborted',
196+
BackendError: 'backend_error',
197+
BillingLimit: 'billing_limit',
198+
ClosedNoTerminal: 'closed_no_terminal',
199+
Error: 'error',
200+
Terminal: 'terminal',
201+
Timeout: 'timeout',
202+
} as const
203+
204+
export type CopilotSseCloseReasonKey = keyof typeof CopilotSseCloseReason
205+
export type CopilotSseCloseReasonValue = (typeof CopilotSseCloseReason)[CopilotSseCloseReasonKey]
206+
194207
export const CopilotStopOutcome = {
195208
ChatNotFound: 'chat_not_found',
196209
InternalError: 'internal_error',

apps/sim/lib/copilot/generated/trace-attributes-v1.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,21 @@ export const TraceAttr = {
208208
CopilotRunParentId: 'copilot.run.parent_id',
209209
CopilotRunProvider: 'copilot.run.provider',
210210
CopilotRunStatus: 'copilot.run.status',
211+
CopilotSseBytesReceived: 'copilot.sse.bytes_received',
212+
CopilotSseChunksReceived: 'copilot.sse.chunks_received',
213+
CopilotSseCloseReason: 'copilot.sse.close_reason',
214+
CopilotSseEventsComplete: 'copilot.sse.events.complete',
215+
CopilotSseEventsError: 'copilot.sse.events.error',
216+
CopilotSseEventsReceived: 'copilot.sse.events.received',
217+
CopilotSseEventsResource: 'copilot.sse.events.resource',
218+
CopilotSseEventsRun: 'copilot.sse.events.run',
219+
CopilotSseEventsSession: 'copilot.sse.events.session',
220+
CopilotSseEventsSpan: 'copilot.sse.events.span',
221+
CopilotSseEventsText: 'copilot.sse.events.text',
222+
CopilotSseEventsTool: 'copilot.sse.events.tool',
223+
CopilotSseFirstEventMs: 'copilot.sse.first_event_ms',
224+
CopilotSseLongestIdleGapMs: 'copilot.sse.longest_idle_gap_ms',
225+
CopilotSseTerminalEventSeen: 'copilot.sse.terminal_event_seen',
211226
CopilotStopAppendedAssistant: 'copilot.stop.appended_assistant',
212227
CopilotStopBlocksCount: 'copilot.stop.blocks_count',
213228
CopilotStopContentLength: 'copilot.stop.content_length',
@@ -696,6 +711,21 @@ export const TraceAttrValues: readonly TraceAttrValue[] = [
696711
'copilot.run.parent_id',
697712
'copilot.run.provider',
698713
'copilot.run.status',
714+
'copilot.sse.bytes_received',
715+
'copilot.sse.chunks_received',
716+
'copilot.sse.close_reason',
717+
'copilot.sse.events.complete',
718+
'copilot.sse.events.error',
719+
'copilot.sse.events.received',
720+
'copilot.sse.events.resource',
721+
'copilot.sse.events.run',
722+
'copilot.sse.events.session',
723+
'copilot.sse.events.span',
724+
'copilot.sse.events.text',
725+
'copilot.sse.events.tool',
726+
'copilot.sse.first_event_ms',
727+
'copilot.sse.longest_idle_gap_ms',
728+
'copilot.sse.terminal_event_seen',
699729
'copilot.stop.appended_assistant',
700730
'copilot.stop.blocks_count',
701731
'copilot.stop.content_length',

apps/sim/lib/copilot/generated/trace-events-v1.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
export const TraceEvent = {
1313
BedrockInvokeRetryWithoutImages: 'bedrock.invoke.retry_without_images',
1414
CopilotOutputFileError: 'copilot.output_file.error',
15+
CopilotSseFirstEvent: 'copilot.sse.first_event',
16+
CopilotSseIdleGapExceeded: 'copilot.sse.idle_gap_exceeded',
17+
CopilotSseTerminalEventReceived: 'copilot.sse.terminal_event_received',
1518
CopilotTableError: 'copilot.table.error',
1619
CopilotVfsParseFailed: 'copilot.vfs.parse_failed',
1720
CopilotVfsResizeAttempt: 'copilot.vfs.resize_attempt',
@@ -31,6 +34,9 @@ export type TraceEventValue = (typeof TraceEvent)[TraceEventKey]
3134
export const TraceEventValues: readonly TraceEventValue[] = [
3235
'bedrock.invoke.retry_without_images',
3336
'copilot.output_file.error',
37+
'copilot.sse.first_event',
38+
'copilot.sse.idle_gap_exceeded',
39+
'copilot.sse.terminal_event_received',
3440
'copilot.table.error',
3541
'copilot.vfs.parse_failed',
3642
'copilot.vfs.resize_attempt',

apps/sim/lib/copilot/generated/trace-spans-v1.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ export const TraceSpan = {
5959
CopilotFinalizeStream: 'copilot.finalize_stream',
6060
CopilotRecoveryCheckReplayGap: 'copilot.recovery.check_replay_gap',
6161
CopilotResumeRequest: 'copilot.resume.request',
62+
CopilotSseReadLoop: 'copilot.sse.read_loop',
6263
CopilotSubagentExecute: 'copilot.subagent.execute',
6364
CopilotToolWaitForClientResult: 'copilot.tool.wait_for_client_result',
6465
CopilotToolsHandleResourceSideEffects: 'copilot.tools.handle_resource_side_effects',
@@ -132,6 +133,7 @@ export const TraceSpanValues: readonly TraceSpanValue[] = [
132133
'copilot.finalize_stream',
133134
'copilot.recovery.check_replay_gap',
134135
'copilot.resume.request',
136+
'copilot.sse.read_loop',
135137
'copilot.subagent.execute',
136138
'copilot.tool.wait_for_client_result',
137139
'copilot.tools.handle_resource_side_effects',

apps/sim/lib/copilot/request/go/stream.ts

Lines changed: 183 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
1-
import type { Context } from '@opentelemetry/api'
1+
import { type Context, SpanStatusCode } from '@opentelemetry/api'
22
import { createLogger } from '@sim/logger'
33
import { ORCHESTRATION_TIMEOUT_MS } from '@/lib/copilot/constants'
4-
import { MothershipStreamV1SpanLifecycleEvent } from '@/lib/copilot/generated/mothership-stream-v1'
4+
import {
5+
type MothershipStreamV1EventType,
6+
MothershipStreamV1SpanLifecycleEvent,
7+
} from '@/lib/copilot/generated/mothership-stream-v1'
8+
import { CopilotSseCloseReason } from '@/lib/copilot/generated/trace-attribute-values-v1'
59
import { TraceAttr } from '@/lib/copilot/generated/trace-attributes-v1'
10+
import { TraceEvent } from '@/lib/copilot/generated/trace-events-v1'
11+
import { TraceSpan } from '@/lib/copilot/generated/trace-spans-v1'
612
import { fetchGo } from '@/lib/copilot/request/go/fetch'
713
import {
814
buildPreviewContentUpdate,
@@ -17,6 +23,7 @@ import {
1723
sseHandlers,
1824
subAgentHandlers,
1925
} from '@/lib/copilot/request/handlers'
26+
import { getCopilotTracer } from '@/lib/copilot/request/otel'
2027
import {
2128
eventToStreamEvent,
2229
isSubagentSpanStreamEvent,
@@ -165,27 +172,74 @@ export async function runStreamLoop(
165172
url: fetchUrl,
166173
method: fetchOptions.method ?? 'GET',
167174
})
168-
const bodyStart = performance.now()
169-
let firstEventMs: number | undefined
170-
let eventsReceived = 0
171-
let endedOn = 'terminal'
172175

173-
const reader = response.body.getReader()
176+
// Aggregate counters populated inline by the reader wrapper + onEvent
177+
// dispatcher below and flushed to both the legacy TraceCollector span
178+
// and the OTel read-loop span when the loop terminates. Kept as plain
179+
// JS variables (not span attrs) so incrementing them is free — we
180+
// only pay OTel cost once at span End().
181+
const counters = {
182+
bytes: 0,
183+
chunks: 0,
184+
events: 0,
185+
eventsByType: {
186+
session: 0,
187+
text: 0,
188+
tool: 0,
189+
span: 0,
190+
resource: 0,
191+
run: 0,
192+
error: 0,
193+
complete: 0,
194+
} as Record<MothershipStreamV1EventType, number>,
195+
firstEventMs: undefined as number | undefined,
196+
lastChunkMs: performance.now(),
197+
longestIdleGapMs: 0,
198+
}
199+
const bodyStart = performance.now()
200+
let endedOn: string = CopilotSseCloseReason.Terminal
201+
202+
// Wrap the body's reader so we can track per-chunk bytes and the gap
203+
// between chunks. `processSSEStream` consumes this reader exactly as
204+
// it would the raw one — no API changes there. The longest idle gap
205+
// is the diagnostic signal for "silent TTFT" perceived hangs: a gap
206+
// larger than the threshold means the user saw nothing for that long.
207+
const IDLE_GAP_EVENT_THRESHOLD_MS = 10000
208+
const rawReader = response.body.getReader()
209+
const reader: ReadableStreamDefaultReader<Uint8Array> = {
210+
async read() {
211+
const result = await rawReader.read()
212+
if (!result.done && result.value) {
213+
const now = performance.now()
214+
const gap = now - counters.lastChunkMs
215+
if (gap > counters.longestIdleGapMs) counters.longestIdleGapMs = gap
216+
counters.lastChunkMs = now
217+
counters.chunks += 1
218+
counters.bytes += result.value.byteLength
219+
}
220+
return result
221+
},
222+
cancel: (reason) => rawReader.cancel(reason),
223+
releaseLock: () => rawReader.releaseLock(),
224+
get closed() {
225+
return rawReader.closed
226+
},
227+
}
174228
const decoder = new TextDecoder()
175229

176230
const timeoutId = setTimeout(() => {
177231
context.errors.push('Request timed out')
178232
context.streamComplete = true
179-
endedOn = 'timeout'
233+
endedOn = CopilotSseCloseReason.Timeout
180234
reader.cancel().catch(() => {})
181235
}, timeout)
182236

183237
try {
184238
await processSSEStream(reader, decoder, abortSignal, async (raw) => {
185-
if (eventsReceived === 0) {
186-
firstEventMs = Math.round(performance.now() - bodyStart)
239+
if (counters.events === 0) {
240+
counters.firstEventMs = Math.round(performance.now() - bodyStart)
187241
}
188-
eventsReceived += 1
242+
counters.events += 1
189243
if (abortSignal?.aborted) {
190244
context.wasAborted = true
191245
return true
@@ -214,6 +268,12 @@ export async function runStreamLoop(
214268
options.onGoTraceId?.(goTraceId)
215269
}
216270

271+
// Per-type counters for the copilot.sse.read_loop span. Bound set
272+
// (8 types) so this can never blow up into high cardinality.
273+
if (streamEvent.type in counters.eventsByType) {
274+
counters.eventsByType[streamEvent.type as MothershipStreamV1EventType] += 1
275+
}
276+
217277
if (shouldSkipToolCallEvent(streamEvent) || shouldSkipToolResultEvent(streamEvent)) {
218278
return
219279
}
@@ -319,44 +379,58 @@ export async function runStreamLoop(
319379
requestId: context.requestId,
320380
messageId: context.messageId,
321381
})
322-
endedOn = 'closed_no_terminal'
382+
endedOn = CopilotSseCloseReason.ClosedNoTerminal
323383
throw new CopilotBackendError(message, { status: 503 })
324384
}
325385
} catch (error) {
326386
if (error instanceof FatalSseEventError && !context.errors.includes(error.message)) {
327387
context.errors.push(error.message)
328388
}
329-
if (endedOn === 'terminal') {
389+
if (endedOn === CopilotSseCloseReason.Terminal) {
330390
endedOn =
331391
error instanceof CopilotBackendError
332-
? 'backend_error'
392+
? CopilotSseCloseReason.BackendError
333393
: error instanceof BillingLimitError
334-
? 'billing_limit'
335-
: 'error'
394+
? CopilotSseCloseReason.BillingLimit
395+
: CopilotSseCloseReason.Error
336396
}
337397
throw error
338398
} finally {
339399
if (abortSignal?.aborted) {
340400
context.wasAborted = true
341401
await reader.cancel().catch(() => {})
342-
if (endedOn === 'terminal') {
343-
endedOn = 'aborted'
402+
if (endedOn === CopilotSseCloseReason.Terminal) {
403+
endedOn = CopilotSseCloseReason.Aborted
344404
}
345405
}
346406
clearTimeout(timeoutId)
347407

408+
// Legacy TraceCollector span (consumed by the in-memory trace
409+
// collector, kept for backwards compatibility with existing
410+
// tooling). The real OTel span is stamped below.
348411
const bodyDurationMs = Math.round(performance.now() - bodyStart)
349412
bodySpan.attributes = {
350413
...(bodySpan.attributes ?? {}),
351-
eventsReceived,
352-
firstEventMs,
414+
eventsReceived: counters.events,
415+
firstEventMs: counters.firstEventMs,
353416
endedOn,
354417
durationMs: bodyDurationMs,
355418
}
356419
context.trace.endSpan(
357420
bodySpan,
358-
endedOn === 'terminal' ? 'ok' : endedOn === 'aborted' ? 'cancelled' : 'error'
421+
endedOn === CopilotSseCloseReason.Terminal
422+
? 'ok'
423+
: endedOn === CopilotSseCloseReason.Aborted
424+
? 'cancelled'
425+
: 'error'
359426
)
427+
428+
// Real OTel span for Tempo/Grafana. Stamped aggregate-only so
429+
// there is no per-chunk OTel cost — one span per read loop with
430+
// integer counters, plus a bounded set of events.
431+
stampSseReadLoopSpan(bodyStart, counters, endedOn, fetchUrl, pathname, {
432+
idleGapEventThresholdMs: IDLE_GAP_EVENT_THRESHOLD_MS,
433+
})
360434
}
361435
}
362436

@@ -375,3 +449,91 @@ function estimateBodyBytes(body: BodyInit | null | undefined): number {
375449
}
376450
return 0
377451
}
452+
453+
type SseReadLoopCounters = {
454+
bytes: number
455+
chunks: number
456+
events: number
457+
eventsByType: Record<MothershipStreamV1EventType, number>
458+
firstEventMs: number | undefined
459+
longestIdleGapMs: number
460+
}
461+
462+
/**
463+
* Ship a one-shot `copilot.sse.read_loop` OTel span with the aggregate
464+
* counters collected during the read loop. Uses `startTime` so the
465+
* span's duration reflects the actual loop wall clock even though we
466+
* only talk to OTel once at the end.
467+
*
468+
* Deliberately synchronous, no per-chunk span calls: total OTel cost
469+
* per read loop is fixed (~10 attrs + up to 3 events), independent of
470+
* chunk count.
471+
*/
472+
function stampSseReadLoopSpan(
473+
startPerfMs: number,
474+
counters: SseReadLoopCounters,
475+
closeReason: string,
476+
fetchUrl: string,
477+
pathname: string,
478+
opts: { idleGapEventThresholdMs: number }
479+
): void {
480+
// Translate performance.now() values into wall-clock Date values so
481+
// the span's timestamps land in real time (OTel accepts both, but we
482+
// need to pair startTime with a matching "now" for .end()).
483+
const nowPerf = performance.now()
484+
const nowWall = Date.now()
485+
const startWall = nowWall - (nowPerf - startPerfMs)
486+
487+
const tracer = getCopilotTracer()
488+
const span = tracer.startSpan(TraceSpan.CopilotSseReadLoop, {
489+
startTime: startWall,
490+
attributes: {
491+
[TraceAttr.HttpUrl]: fetchUrl,
492+
[TraceAttr.HttpPath]: pathname,
493+
[TraceAttr.CopilotSseBytesReceived]: counters.bytes,
494+
[TraceAttr.CopilotSseChunksReceived]: counters.chunks,
495+
[TraceAttr.CopilotSseEventsReceived]: counters.events,
496+
[TraceAttr.CopilotSseEventsSession]: counters.eventsByType.session,
497+
[TraceAttr.CopilotSseEventsText]: counters.eventsByType.text,
498+
[TraceAttr.CopilotSseEventsTool]: counters.eventsByType.tool,
499+
[TraceAttr.CopilotSseEventsSpan]: counters.eventsByType.span,
500+
[TraceAttr.CopilotSseEventsResource]: counters.eventsByType.resource,
501+
[TraceAttr.CopilotSseEventsRun]: counters.eventsByType.run,
502+
[TraceAttr.CopilotSseEventsError]: counters.eventsByType.error,
503+
[TraceAttr.CopilotSseEventsComplete]: counters.eventsByType.complete,
504+
[TraceAttr.CopilotSseLongestIdleGapMs]: Math.round(counters.longestIdleGapMs),
505+
[TraceAttr.CopilotSseCloseReason]: closeReason,
506+
[TraceAttr.CopilotSseTerminalEventSeen]: counters.eventsByType.complete > 0,
507+
},
508+
})
509+
510+
if (counters.firstEventMs !== undefined) {
511+
span.setAttribute(TraceAttr.CopilotSseFirstEventMs, counters.firstEventMs)
512+
span.addEvent(TraceEvent.CopilotSseFirstEvent, {
513+
[TraceAttr.CopilotSseFirstEventMs]: counters.firstEventMs,
514+
})
515+
}
516+
if (counters.longestIdleGapMs >= opts.idleGapEventThresholdMs) {
517+
span.addEvent(TraceEvent.CopilotSseIdleGapExceeded, {
518+
[TraceAttr.CopilotSseLongestIdleGapMs]: Math.round(counters.longestIdleGapMs),
519+
})
520+
}
521+
if (counters.eventsByType.complete > 0) {
522+
span.addEvent(TraceEvent.CopilotSseTerminalEventReceived)
523+
}
524+
525+
// Span status: only mark ERROR for real failures. User aborts and
526+
// clean terminals stay UNSET so dashboards filtering `status=error`
527+
// don't light up for normal cancellations.
528+
if (
529+
closeReason !== CopilotSseCloseReason.Terminal &&
530+
closeReason !== CopilotSseCloseReason.Aborted
531+
) {
532+
span.setStatus({
533+
code: SpanStatusCode.ERROR,
534+
message: `SSE read loop ended with reason: ${closeReason}`,
535+
})
536+
}
537+
538+
span.end(nowWall)
539+
}

0 commit comments

Comments
 (0)