diff --git a/package.json b/package.json
index 7289b9ab..b58e1bea 100644
--- a/package.json
+++ b/package.json
@@ -40,6 +40,8 @@
     "docs:api:generate": "node scripts/generate-api-docs.mjs"
   },
   "dependencies": {
+    "@sentry/browser": "8.54.0",
+    "@sentry/react": "8.54.0",
     "@tensorflow/tfjs-node": "^5.0.0",
     "express": "^4.18.2",
     "@axe-core/react": "^4.11.3",
diff --git a/src/lib/alertDispatch.ts b/src/lib/alertDispatch.ts
new file mode 100644
index 00000000..1510ab46
--- /dev/null
+++ b/src/lib/alertDispatch.ts
@@ -0,0 +1,285 @@
+/**
+ * Alert Dispatch – Slack & PagerDuty Webhook Integration
+ *
+ * Sits on top of the existing `alertChannels.ts` delivery layer and adds
+ * production-grade routing to external incident management services.
+ *
+ * Routing rules:
+ *   - severity "info"     → in-app + browser notification only
+ *   - severity "warning"  → in-app + browser + Slack (if configured)
+ *   - severity "critical" → in-app + browser + Slack + PagerDuty
+ *
+ * Environment / runtime configuration:
+ *   VITE_SLACK_WEBHOOK_URL      – Incoming Webhook URL from your Slack App
+ *   VITE_PAGERDUTY_ROUTING_KEY  – PagerDuty Events API v2 Integration Key
+ *
+ * You can also call `configureAlertDispatch()` at runtime to override or
+ * extend the defaults without redeploying.
+ *
+ * NOTE(review): Vite inlines every `VITE_`-prefixed variable into the client
+ * bundle, so both values above are readable by anyone who can load the app.
+ * Confirm that is acceptable, or proxy these calls through a backend.
+ */
+
+import * as Sentry from '@sentry/react';
+import { dispatchToChannels, type AlertPayload } from './alertChannels';
+import { createLogger } from '../utils/logger';
+
+const logger = createLogger('AlertDispatch');
+
+// ─── Configuration ────────────────────────────────────────────────────────────
+
+export interface AlertDispatchConfig {
+  /** Slack Incoming Webhook URL – leave undefined to disable Slack delivery. */
+  slackWebhookUrl?: string;
+  /**
+   * PagerDuty Events API v2 routing / integration key.
+   * Leave undefined to disable PagerDuty delivery.
+   */
+  pagerDutyRoutingKey?: string;
+  /**
+   * Human-readable service name included in Slack/PD payloads.
+   * Defaults to "stellar-dev-dashboard".
+   */
+  serviceName: string;
+  /**
+   * If `true`, Slack/PagerDuty calls are skipped and payloads are logged
+   * to the console instead. Automatically `true` in non-production builds.
+   */
+  dryRun: boolean;
+}
+
+/** Module-level configuration; replaced wholesale by `configureAlertDispatch()`. */
+let _cfg: AlertDispatchConfig = {
+  slackWebhookUrl: import.meta.env.VITE_SLACK_WEBHOOK_URL as string | undefined,
+  pagerDutyRoutingKey: import.meta.env.VITE_PAGERDUTY_ROUTING_KEY as string | undefined,
+  serviceName: 'stellar-dev-dashboard',
+  // Only fire live webhooks in production builds to avoid alert noise during dev
+  dryRun: import.meta.env.MODE !== 'production',
+};
+
+/**
+ * Merge `overrides` into the active configuration and log which external
+ * channels are enabled afterwards.
+ *
+ * @param overrides Subset of `AlertDispatchConfig` fields to replace.
+ */
+export function configureAlertDispatch(overrides: Partial<AlertDispatchConfig>): void {
+  _cfg = { ..._cfg, ...overrides };
+  logger.info('Alert dispatch reconfigured', {
+    slackEnabled: !!_cfg.slackWebhookUrl,
+    pagerDutyEnabled: !!_cfg.pagerDutyRoutingKey,
+    dryRun: _cfg.dryRun,
+  });
+}
+
+// ─── Slack payload builder ────────────────────────────────────────────────────
+
+/** Slack emoji shortcode per severity; unknown severities fall back to `:bell:`. */
+const SEVERITY_EMOJI: Record<string, string> = {
+  info: ':information_source:',
+  warning: ':warning:',
+  critical: ':red_circle:',
+};
+
+/** Attachment side-bar colour per severity; unknown severities fall back to grey. */
+const SEVERITY_COLOR: Record<string, string> = {
+  info: '#2196F3',
+  warning: '#FF9800',
+  critical: '#F44336',
+};
+
+/**
+ * Build the JSON body for a Slack Incoming Webhook message.
+ *
+ * @param alert       Alert to render.
+ * @param serviceName Shown in the "Service" field and the attachment footer.
+ * @returns A plain object ready for `JSON.stringify`.
+ */
+function buildSlackPayload(alert: AlertPayload, serviceName: string): object {
+  const emoji = SEVERITY_EMOJI[alert.severity] ?? ':bell:';
+  const color = SEVERITY_COLOR[alert.severity] ?? '#9E9E9E';
+
+  // `Date.parse` yields NaN for an unparseable timestamp, and NaN serialises
+  // to `null`; fall back to "now" so the attachment always has a numeric `ts`.
+  const parsedMs = Date.parse(alert.timestamp);
+  const epochSeconds = Math.floor((Number.isNaN(parsedMs) ? Date.now() : parsedMs) / 1000);
+
+  return {
+    text: `${emoji} *[${alert.severity.toUpperCase()}]* ${alert.title}`,
+    attachments: [
+      {
+        color,
+        fields: [
+          { title: 'Service', value: serviceName, short: true },
+          { title: 'Severity', value: alert.severity, short: true },
+          { title: 'Description', value: alert.description, short: false },
+          { title: 'Alert ID', value: alert.id, short: true },
+          { title: 'Timestamp', value: alert.timestamp, short: true },
+          ...(alert.tags?.length
+            ? [{ title: 'Tags', value: alert.tags.join(', '), short: false }]
+            : []),
+        ],
+        footer: serviceName,
+        ts: epochSeconds,
+      },
+    ],
+  };
+}
+
+/**
+ * POST `payload` to a Slack Incoming Webhook, aborting after 8 seconds.
+ *
+ * @throws Error when Slack answers with a non-2xx status, or when the request
+ *         fails or times out (the rejection comes from `fetch`).
+ */
+async function sendSlack(payload: object, webhookUrl: string): Promise<void> {
+  const response = await fetch(webhookUrl, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+    signal: AbortSignal.timeout(8000),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Slack webhook returned HTTP ${response.status}`);
+  }
+}
+
+// ─── PagerDuty payload builder ────────────────────────────────────────────────
+
+type PagerDutyEventAction = 'trigger' | 'acknowledge' | 'resolve';
+
+/**
+ * Build a PagerDuty Events API v2 event for `alert`.
+ *
+ * The routing key is read from the module configuration at call time, and the
+ * alert id doubles as `dedup_key` so later events address the same incident.
+ *
+ * @param alert       Alert to report.
+ * @param serviceName Used for `source`, `component`, `group` and the summary prefix.
+ * @param action      Event action; defaults to `'trigger'`.
+ */
+function buildPagerDutyPayload(
+  alert: AlertPayload,
+  serviceName: string,
+  action: PagerDutyEventAction = 'trigger',
+): object {
+  return {
+    routing_key: _cfg.pagerDutyRoutingKey,
+    event_action: action,
+    dedup_key: alert.id,
+    payload: {
+      summary: `[${serviceName}] ${alert.title}`,
+      source: serviceName,
+      // Anything below "critical" is reported as a PagerDuty "warning".
+      severity: alert.severity === 'critical' ? 'critical' : 'warning',
+      timestamp: alert.timestamp,
+      class: 'application_alert',
+      component: serviceName,
+      group: serviceName,
+      custom_details: {
+        description: alert.description,
+        alert_id: alert.id,
+        tags: alert.tags ?? [],
+      },
+    },
+    // Only attach a dashboard link when a real URL is available; the service
+    // name is not a valid `href`, so outside a browser the link is omitted.
+    ...(typeof window !== 'undefined'
+      ? { links: [{ href: window.location.href, text: 'Open Dashboard' }] }
+      : {}),
+  };
+}
+
+/**
+ * POST an event to the PagerDuty Events API v2, aborting after 8 seconds.
+ *
+ * @throws Error carrying the HTTP status and response body on a non-2xx reply,
+ *         or the `fetch` rejection when the request fails or times out.
+ */
+async function sendPagerDuty(payload: object): Promise<void> {
+  const response = await fetch('https://events.pagerduty.com/v2/enqueue', {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+    signal: AbortSignal.timeout(8000),
+  });
+
+  if (!response.ok) {
+    // The body is diagnostic only; never let reading it mask the status error.
+    const body = await response.text().catch(() => '');
+    throw new Error(`PagerDuty API returned HTTP ${response.status}: ${body}`);
+  }
+}
+
+// ─── Core dispatch ────────────────────────────────────────────────────────────
+
+export interface DispatchOptions {
+  /** Override auto-resolved PagerDuty action. */
+  pagerDutyAction?: PagerDutyEventAction;
+}
+
+/**
+ * Dispatch an alert through all appropriate channels based on severity.
+ *
+ * - "info"     → in-app + browser
+ * - "warning"  → in-app + browser + Slack
+ * - "critical" → in-app + browser + Slack + PagerDuty
+ *
+ * External webhook calls are fire-and-forget (errors are captured in Sentry
+ * and logged, but they never throw to the caller).
+ */
+export async function dispatchAlert(
+  alert: AlertPayload,
+  opts: DispatchOptions = {},
+): Promise<void> {
+  // Snapshot the configuration before awaiting so a concurrent
+  // `configureAlertDispatch()` call cannot change routing mid-dispatch.
+  const cfg = _cfg;
+
+  try {
+    // Always deliver in-app + browser regardless of severity
+    await dispatchToChannels(alert, [{ type: 'in_app' }, { type: 'browser' }]);
+  } finally {
+    // Runs even when local delivery rejects, so a broken in-app channel can
+    // never suppress a Slack message or a page. The rejection itself still
+    // propagates to the caller once routing has been kicked off.
+    routeToExternalServices(alert, opts, cfg);
+  }
+}
+
+/**
+ * Start fire-and-forget delivery to Slack (warning and critical) and
+ * PagerDuty (critical only). Delivery failures are logged and sent to Sentry;
+ * they never reach the caller.
+ *
+ * @param alert Alert being dispatched.
+ * @param opts  Per-call options (PagerDuty action override).
+ * @param cfg   Configuration snapshot taken when the dispatch started.
+ */
+function routeToExternalServices(
+  alert: AlertPayload,
+  opts: DispatchOptions,
+  cfg: AlertDispatchConfig,
+): void {
+  const { serviceName, slackWebhookUrl, pagerDutyRoutingKey, dryRun } = cfg;
+
+  if (alert.severity === 'info') return;
+
+  // ── Slack ─────────────────────────────────────────────────────────────────
+  if (slackWebhookUrl) {
+    const slackPayload = buildSlackPayload(alert, serviceName);
+
+    if (dryRun) {
+      logger.info('[DRY-RUN] Slack alert suppressed', { alert: alert.id, title: alert.title });
+    } else {
+      // `void` marks the promise as intentionally not awaited.
+      void sendSlack(slackPayload, slackWebhookUrl).catch(err => {
+        logger.warn('Slack delivery failed', { alertId: alert.id }, err);
+        Sentry.captureException(err, { tags: { subsystem: 'alertDispatch', channel: 'slack' } });
+      });
+    }
+  }
+
+  if (alert.severity !== 'critical') return;
+
+  // ── PagerDuty ─────────────────────────────────────────────────────────────
+  if (pagerDutyRoutingKey) {
+    const action = opts.pagerDutyAction ?? 'trigger';
+    const pdPayload = buildPagerDutyPayload(alert, serviceName, action);
+
+    if (dryRun) {
+      logger.info('[DRY-RUN] PagerDuty alert suppressed', { alert: alert.id, title: alert.title });
+    } else {
+      void sendPagerDuty(pdPayload).catch(err => {
+        logger.warn('PagerDuty delivery failed', { alertId: alert.id }, err);
+        Sentry.captureException(err, { tags: { subsystem: 'alertDispatch', channel: 'pagerduty' } });
+      });
+    }
+  }
+}
+
+/**
+ * Convenience wrapper: resolve an active PagerDuty incident by dedup key.
+ * Pass the original alert ID that was used to trigger the incident.
+ *
+ * Does nothing when no PagerDuty routing key is configured, and only logs in
+ * dry-run mode. Delivery is fire-and-forget: failures are logged and sent to
+ * Sentry, never thrown.
+ *
+ * @param alertId Dedup key of the incident to resolve.
+ * @param title   Human-readable title used in the resolve summary.
+ */
+export function resolvePagerDutyIncident(alertId: string, title: string): void {
+  const { pagerDutyRoutingKey, dryRun } = _cfg;
+  if (!pagerDutyRoutingKey) return;
+
+  const resolvePayload = {
+    routing_key: pagerDutyRoutingKey,
+    event_action: 'resolve' as const,
+    dedup_key: alertId,
+    payload: {
+      summary: `RESOLVED: ${title}`,
+      source: _cfg.serviceName,
+      severity: 'info',
+      timestamp: new Date().toISOString(),
+    },
+  };
+
+  if (dryRun) {
+    logger.info('[DRY-RUN] PagerDuty resolve suppressed', { alertId });
+    return;
+  }
+
+  // `void` marks the promise as intentionally not awaited.
+  void sendPagerDuty(resolvePayload).catch(err => {
+    logger.warn('PagerDuty resolve failed', { alertId }, err);
+    Sentry.captureException(err, { tags: { subsystem: 'alertDispatch', channel: 'pagerduty' } });
+  });
+}
+
+/**
+ * Render a thrown value that is not an `Error` as readable text.
+ * Plain objects are JSON-encoded so they do not collapse to "[object Object]".
+ */
+function describeNonError(value: unknown): string {
+  if (value == null) return 'Unknown error';
+  if (typeof value === 'object') {
+    try {
+      const json = JSON.stringify(value);
+      if (json !== undefined && json !== '{}') return json;
+    } catch {
+      /* circular or non-serialisable – fall through to String() */
+    }
+  }
+  return String(value);
+}
+
+/**
+ * Build a standard `AlertPayload` from a raw error.
+ * Use with `dispatchAlert()` for one-line critical error dispatch.
+ *
+ * The title is `"<name>: <message>"` with the message capped at 80 characters
+ * (just `<name>` when the message is empty). The description is the first five
+ * stack lines when a stack is available, otherwise the message.
+ *
+ * @param err      Thrown value; may be anything, not only an `Error`.
+ * @param severity Alert severity, `'critical'` by default.
+ * @param tags     Free-form tags copied onto the payload.
+ *
+ * @example
+ *   dispatchAlert(buildAlertFromError(err, 'critical', ['stellar', 'horizon']));
+ */
+export function buildAlertFromError(
+  err: unknown,
+  severity: AlertPayload['severity'] = 'critical',
+  tags: string[] = [],
+): AlertPayload {
+  const isError = err instanceof Error;
+  const message = isError ? err.message : describeNonError(err);
+  const name = isError ? err.name : 'Error';
+  // Timestamp plus a short random suffix keeps ids unique within a millisecond.
+  const id = `alert-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`;
+
+  return {
+    id,
+    // Avoid a dangling "Name: " when the error carries no message.
+    title: message ? `${name}: ${message.slice(0, 80)}` : name,
+    description: isError && err.stack
+      ? err.stack.split('\n').slice(0, 5).join('\n')
+      : message,
+    severity,
+    timestamp: new Date().toISOString(),
+    tags,
+  };
+}
diff --git a/src/lib/dashboardConfig.ts b/src/lib/dashboardConfig.ts
new file mode 100644
index 00000000..4aeaa1d8
--- /dev/null
+++ b/src/lib/dashboardConfig.ts
@@ -0,0 +1,349 @@
+/**
+ * Observability Dashboard Configuration
+ *
+ * Exports:
+ *   1. `GRAFANA_DASHBOARD` – Grafana dashboard JSON (import via the UI or
+ *      provisioning API at /api/dashboards/import)
+ *   2. 
`SENTRY_SAVED_QUERIES` – Pre-built Sentry Discover query definitions
+ *      (paste into Sentry → Discover → Saved Queries)
+ *   3. `exportGrafanaDashboard()` – Downloads the JSON file in the browser
+ *   4. `getDatadogMonitorTemplate()` – Datadog monitor JSON for p95 latency
+ *
+ * These configs visualise:
+ *   - Error rates and unhandled exception counts
+ *   - API latency percentiles (p50 / p95 / p99)
+ *   - Core Web Vitals (LCP, CLS, FID)
+ *   - Client-side health score over time
+ *   - Cache hit ratios
+ *   - Memory pressure
+ */
+
+// ─── Grafana dashboard JSON ───────────────────────────────────────────────────
+
+/**
+ * A ready-to-import Grafana dashboard for the Stellar Dev Dashboard.
+ *
+ * Assumes:
+ *   - Prometheus data source named "Prometheus" (change `datasource` if yours differs)
+ *   - Metrics emitted by your app or a synthetic exporter into Prometheus
+ *     following the naming convention below.
+ *
+ * Metric naming convention (use a Prometheus Pushgateway or custom exporter
+ * to forward the RUM events emitted by `src/lib/performance.ts`):
+ *
+ *   stellar_dashboard_lcp_milliseconds
+ *   stellar_dashboard_fid_milliseconds
+ *   stellar_dashboard_cls_ratio
+ *   stellar_dashboard_health_score          (gauge 0–100)
+ *   stellar_dashboard_api_request_duration_seconds{endpoint, method, status}
+ *   stellar_dashboard_error_total{category, severity}
+ *   stellar_dashboard_cache_hits_total{namespace}
+ *   stellar_dashboard_cache_misses_total{namespace}
+ *
+ * Every `histogram_quantile` below aggregates buckets with `sum by (le)` so a
+ * panel shows one series even when several label sets report the histogram.
+ *
+ * NOTE(review): the stat panels keep `unit` and `thresholds` under `options`;
+ * recent Grafana versions appear to read them from `fieldConfig.defaults` —
+ * confirm against the target Grafana version before relying on the colours.
+ */
+export const GRAFANA_DASHBOARD = {
+  title: 'Stellar Dev Dashboard — Production Health',
+  uid: 'stellar-prod-health-v1',
+  schemaVersion: 38,
+  version: 1,
+  refresh: '30s',
+  time: { from: 'now-3h', to: 'now' },
+  templating: {
+    list: [
+      {
+        name: 'datasource',
+        type: 'datasource',
+        pluginId: 'prometheus',
+        label: 'Prometheus',
+        current: { text: 'Prometheus', value: 'Prometheus' },
+      },
+    ],
+  },
+  panels: [
+    // ── Row 1: Health overview ────────────────────────────────────────────
+    {
+      id: 1,
+      type: 'stat',
+      title: 'Health Score (avg)',
+      gridPos: { x: 0, y: 0, w: 4, h: 4 },
+      datasource: '$datasource',
+      targets: [
+        {
+          expr: 'avg(stellar_dashboard_health_score)',
+          legendFormat: 'Health Score',
+        },
+      ],
+      options: {
+        colorMode: 'background',
+        thresholds: {
+          steps: [
+            { value: 0, color: 'red' },
+            { value: 50, color: 'orange' },
+            { value: 80, color: 'green' },
+          ],
+        },
+      },
+    },
+    {
+      id: 2,
+      type: 'stat',
+      title: 'Error Rate (5m)',
+      gridPos: { x: 4, y: 0, w: 4, h: 4 },
+      datasource: '$datasource',
+      targets: [
+        {
+          expr: 'sum(rate(stellar_dashboard_error_total[5m]))',
+          legendFormat: 'Errors/s',
+        },
+      ],
+      options: {
+        colorMode: 'background',
+        thresholds: {
+          steps: [
+            { value: 0, color: 'green' },
+            { value: 0.01, color: 'orange' },
+            { value: 0.1, color: 'red' },
+          ],
+        },
+      },
+    },
+    {
+      id: 3,
+      type: 'stat',
+      title: 'LCP (p75)',
+      gridPos: { x: 8, y: 0, w: 4, h: 4 },
+      datasource: '$datasource',
+      targets: [
+        {
+          expr: 'histogram_quantile(0.75, sum by (le) (rate(stellar_dashboard_lcp_milliseconds_bucket[5m])))',
+          legendFormat: 'LCP p75 ms',
+        },
+      ],
+      options: {
+        unit: 'ms',
+        colorMode: 'background',
+        thresholds: {
+          steps: [
+            { value: 0, color: 'green' },
+            { value: 2500, color: 'orange' },
+            { value: 4000, color: 'red' },
+          ],
+        },
+      },
+    },
+    {
+      id: 4,
+      type: 'stat',
+      title: 'Cache Hit Ratio',
+      gridPos: { x: 12, y: 0, w: 4, h: 4 },
+      datasource: '$datasource',
+      targets: [
+        {
+          expr: `
+            sum(rate(stellar_dashboard_cache_hits_total[5m]))
+            / (
+              sum(rate(stellar_dashboard_cache_hits_total[5m]))
+              + sum(rate(stellar_dashboard_cache_misses_total[5m]))
+            )
+          `,
+          legendFormat: 'Hit Ratio',
+        },
+      ],
+      options: {
+        unit: 'percentunit',
+        colorMode: 'background',
+        thresholds: {
+          steps: [
+            { value: 0, color: 'red' },
+            { value: 0.7, color: 'orange' },
+            { value: 0.9, color: 'green' },
+          ],
+        },
+      },
+    },
+
+    // ── Row 2: API latency percentiles ────────────────────────────────────
+    {
+      id: 10,
+      type: 'timeseries',
+      title: 'API Request Duration — p50 / p95 / p99',
+      gridPos: { x: 0, y: 4, w: 16, h: 8 },
+      datasource: '$datasource',
+      targets: [
+        {
+          expr: 'histogram_quantile(0.50, sum by (le) (rate(stellar_dashboard_api_request_duration_seconds_bucket[5m])))',
+          legendFormat: 'p50',
+        },
+        {
+          expr: 'histogram_quantile(0.95, sum by (le) (rate(stellar_dashboard_api_request_duration_seconds_bucket[5m])))',
+          legendFormat: 'p95',
+        },
+        {
+          expr: 'histogram_quantile(0.99, sum by (le) (rate(stellar_dashboard_api_request_duration_seconds_bucket[5m])))',
+          legendFormat: 'p99',
+        },
+      ],
+      fieldConfig: {
+        defaults: {
+          unit: 's',
+          custom: { lineWidth: 2 },
+        },
+      },
+    },
+
+    // ── Row 3: Error breakdown ────────────────────────────────────────────
+    {
+      id: 20,
+      type: 'timeseries',
+      title: 'Errors by Severity',
+      gridPos: { x: 0, y: 12, w: 12, h: 6 },
+      datasource: '$datasource',
+      targets: [
+        {
+          expr: 'sum by (severity) (rate(stellar_dashboard_error_total[5m]))',
+          legendFormat: '{{severity}}',
+        },
+      ],
+      fieldConfig: { defaults: { unit: 'short' } },
+    },
+    {
+      id: 21,
+      type: 'timeseries',
+      title: 'Errors by Category',
+      gridPos: { x: 12, y: 12, w: 12, h: 6 },
+      datasource: '$datasource',
+      targets: [
+        {
+          expr: 'sum by (category) (rate(stellar_dashboard_error_total[5m]))',
+          legendFormat: '{{category}}',
+        },
+      ],
+      fieldConfig: { defaults: { unit: 'short' } },
+    },
+
+    // ── Row 4: Core Web Vitals ────────────────────────────────────────────
+    {
+      id: 30,
+      type: 'timeseries',
+      title: 'Core Web Vitals over time',
+      gridPos: { x: 0, y: 18, w: 24, h: 7 },
+      datasource: '$datasource',
+      targets: [
+        {
+          expr: 'histogram_quantile(0.75, sum by (le) (rate(stellar_dashboard_lcp_milliseconds_bucket[5m])))',
+          legendFormat: 'LCP p75 (ms)',
+        },
+        {
+          expr: 'histogram_quantile(0.75, sum by (le) (rate(stellar_dashboard_fid_milliseconds_bucket[5m])))',
+          legendFormat: 'FID p75 (ms)',
+        },
+        {
+          // CLS is unitless and tiny next to millisecond values, so it is
+          // scaled ×1000 in the query itself: a query target has no
+          // `transformations` field, so scaling there would be ignored.
+          expr: 'avg(stellar_dashboard_cls_ratio) * 1000',
+          legendFormat: 'CLS (avg ×1000)',
+        },
+      ],
+      fieldConfig: { defaults: { unit: 'ms' } },
+    },
+  ],
+} as const;
+
+// ─── Sentry Discover saved queries ───────────────────────────────────────────
+
+/**
+ * Paste each entry's `query` into Sentry → Discover → "Saved Queries".
+ * Column definitions map directly to Sentry Discover field names.
+ */
+export const SENTRY_SAVED_QUERIES = [
+  {
+    name: 'Unhandled Exceptions — last 24 h',
+    // `error.unhandled` is Sentry's searchable flag for unhandled errors.
+    query: 'event.type:error error.unhandled:true',
+    fields: ['count()', 'issue', 'title', 'project', 'last_seen()'],
+    orderby: '-count()',
+    range: '24h',
+  },
+  {
+    name: 'P95 Transaction Duration',
+    query: 'event.type:transaction',
+    fields: ['transaction', 'count()', 'p50(transaction.duration)', 'p95(transaction.duration)', 'p99(transaction.duration)'],
+    orderby: '-p95(transaction.duration)',
+    range: '1h',
+  },
+  {
+    name: 'Critical Errors by Category',
+    query: 'event.type:error level:fatal',
+    fields: ['count()', 'issue', 'tags[category]', 'tags[severity]', 'last_seen()'],
+    orderby: '-count()',
+    range: '7d',
+  },
+  {
+    name: 'Frontend Web Vitals — LCP outliers',
+    query: 'event.type:transaction measurements.lcp:>4000',
+    fields: ['transaction', 'measurements.lcp', 'measurements.fid', 'measurements.cls', 'count()'],
+    orderby: '-measurements.lcp',
+    range: '6h',
+  },
+  {
+    name: 'Stellar Horizon API Errors',
+    query: 'event.type:error tags[category]:network tags[context]:*horizon*',
+    fields: ['count()', 'issue', 'title', 'tags[url]', 'last_seen()'],
+    orderby: '-count()',
+    range: '24h',
+  },
+] as const;
+
+// ─── Datadog monitor template ─────────────────────────────────────────────────
+
+/**
+ * Datadog monitor JSON for p95 API latency alerting.
+ * Use with the Datadog API: POST /api/v1/monitor
+ * or import via the Datadog Terraform provider.
+ *
+ * The critical threshold appears in the query, the message and
+ * `options.thresholds`; all three are derived from `criticalSeconds` so they
+ * cannot drift apart.
+ *
+ * @param service         Service tag used in the name, query and notification handles.
+ * @param criticalSeconds p95 latency (seconds) that triggers the alert; default 2.
+ * @param warningSeconds  p95 latency (seconds) for the warning state; default 1.
+ *                        Keep it below `criticalSeconds`.
+ * @returns A plain object ready to be serialised as the monitor definition.
+ */
+export function getDatadogMonitorTemplate(
+  service = 'stellar-dev-dashboard',
+  criticalSeconds = 2,
+  warningSeconds = 1,
+): object {
+  return {
+    name: `[${service}] API p95 Latency SLO breach`,
+    type: 'metric alert',
+    query: `percentile(last_5m):p95:stellar.dashboard.api.request.duration.seconds{service:${service}} > ${criticalSeconds}`,
+    message: `
+p95 API latency for **${service}** has exceeded ${criticalSeconds} seconds over the past 5 minutes.
+
+Runbook: https://your-wiki/runbooks/high-api-latency
+
+@pagerduty-${service} @slack-alerts-${service}
+    `.trim(),
+    tags: [`service:${service}`, 'team:platform', 'severity:high'],
+    options: {
+      thresholds: {
+        critical: criticalSeconds, // seconds
+        warning: warningSeconds,
+      },
+      notify_no_data: true,
+      no_data_timeframe: 10,
+      renotify_interval: 30,
+      include_tags: true,
+      evaluation_delay: 60,
+    },
+    priority: 2,
+  };
+}
+
+// ─── Browser download helper ──────────────────────────────────────────────────
+
+/**
+ * Downloads the Grafana dashboard JSON as a file in the browser.
+ * Wire this to a "Export Dashboard" button in the admin panel.
+ *
+ * Browser-only: relies on `document`, `Blob` and `URL.createObjectURL`.
+ */
+export function exportGrafanaDashboard(): void {
+  const blob = new Blob(
+    [JSON.stringify(GRAFANA_DASHBOARD, null, 2)],
+    { type: 'application/json' },
+  );
+  const url = URL.createObjectURL(blob);
+  const a = Object.assign(document.createElement('a'), {
+    href: url,
+    download: 'stellar-dashboard-grafana.json',
+  });
+  document.body.appendChild(a);
+  try {
+    a.click();
+  } finally {
+    // Always detach the temporary anchor and release the object URL, even if
+    // the synthetic click throws.
+    document.body.removeChild(a);
+    URL.revokeObjectURL(url);
+  }
+}
diff --git a/src/lib/healthCheck.ts b/src/lib/healthCheck.ts
new file mode 100644
index 00000000..a74fdc1f
--- /dev/null
+++ b/src/lib/healthCheck.ts
@@ -0,0 +1,291 @@
+/**
+ * Client-side Health Check
+ *
+ * Provides `getHealthStatus()` — a lightweight, dependency-aware health
+ * snapshot analogous to a `/health` HTTP endpoint. It is surfaced:
+ *
+ *   1. In the existing System Health dashboard tab (pass it to any component
+ *      that renders operational metrics).
+ *   2. 
Via the nginx `/health` location (already returns 200 'ok' at the
+ *      network layer; this JS layer adds runtime depth on top).
+ *   3. By the `alertDispatch` pipeline to decide whether to fire a critical
+ *      alert when a dependency probe fails.
+ *      NOTE(review): no such call is visible in this module — confirm the
+ *      wiring exists before relying on it.
+ *
+ * No server-side code is needed — everything runs in the browser.
+ */
+
+import { createLogger } from '../utils/logger';
+// collectHealthSnapshot / computeHealthScore live in the pre-existing JS module.
+// We import them with the .js extension so bundler mode resolves without .ts
+// NOTE(review): this change also adds `src/utils/monitoring.ts`; confirm which
+// file the `.js` specifier resolves to and that it exports both helpers.
+import {
+  collectHealthSnapshot,
+  computeHealthScore,
+} from '../utils/monitoring.js';
+import { addBreadcrumb } from './errorReporting';
+
+const logger = createLogger('HealthCheck');
+
+// ─── Types ────────────────────────────────────────────────────────────────────
+
+/** Overall or per-dependency verdict, from best to worst. */
+export type HealthStatus = 'healthy' | 'degraded' | 'unhealthy';
+
+/** Result of probing a single external dependency. */
+export interface DependencyProbe {
+  /** Display name of the dependency. */
+  name: string;
+  status: HealthStatus;
+  /** Round-trip time in milliseconds, or `null` when the probe did not complete. */
+  latencyMs: number | null;
+  /** Optional detail such as an HTTP status or error message. */
+  message?: string;
+}
+
+/** Structured snapshot returned by `getHealthStatus()`. */
+export interface HealthReport {
+  status: HealthStatus;
+  score: number; // 0–100
+  /** Milliseconds elapsed since this module was first evaluated. */
+  uptimeMs: number;
+  /** JS heap figures; all `null` / `'unknown'` when the browser does not expose them. */
+  memory: {
+    usedMB: number | null;
+    totalMB: number | null;
+    heapLimitMB: number | null;
+    pressureLevel: 'low' | 'medium' | 'high' | 'critical' | 'unknown';
+  };
+  /** Connectivity hints; the nullable fields are `null` when unavailable. */
+  network: {
+    online: boolean;
+    effectiveType: string | null;
+    rttMs: number | null;
+    downlinkMbps: number | null;
+  };
+  /** One entry per registered probe; empty when probes were skipped. */
+  dependencies: DependencyProbe[];
+  /** Latest observed web vitals; `null` until the corresponding entry arrives. */
+  webVitals: {
+    lcp: number | null;
+    fid: number | null;
+    cls: number | null;
+  };
+  /** ISO-8601 time at which the report was assembled. */
+  timestamp: string;
+  /** Release identifier, or `'unknown'` when none is configured. */
+  version: string;
+}
+
+// ─── Internal state ───────────────────────────────────────────────────────────
+
+// Captured at module evaluation; used to derive `uptimeMs`.
+const _appStart = Date.now();
+// Mutable cache of the most recent web-vital readings, filled by the observers.
+const _vitals: { lcp: number | null; fid: number | null; cls: number | null } = {
+  lcp: null,
+  fid: null,
+  cls: null,
+};
+
+// Passively capture web vitals as they arrive so `getHealthStatus()` can
+// report them without blocking.
+if (typeof PerformanceObserver !== 'undefined') {
+  // Each observer is registered in its own try/catch because `observe()`
+  // throws on browsers that do not support the requested entry type.
+
+  // LCP: the most recent entry is the current largest contentful paint.
+  try {
+    new PerformanceObserver(list => {
+      const last = list.getEntries().at(-1) as PerformanceEntry | undefined;
+      if (last) _vitals.lcp = Math.round(last.startTime);
+    }).observe({ type: 'largest-contentful-paint', buffered: true });
+  } catch { /* unsupported */ }
+
+  // CLS: running sum of layout-shift scores.
+  try {
+    let clsAcc = 0;
+    new PerformanceObserver(list => {
+      for (const e of list.getEntries()) {
+        const shift = e as unknown as { value?: number; hadRecentInput?: boolean };
+        // Shifts that directly follow user input are expected and are
+        // excluded from CLS by definition.
+        if (shift.hadRecentInput) continue;
+        clsAcc += shift.value ?? 0;
+      }
+      _vitals.cls = Number(clsAcc.toFixed(3));
+    }).observe({ type: 'layout-shift', buffered: true });
+  } catch { /* unsupported */ }
+
+  // FID: delay between the first input and the start of its handler.
+  try {
+    new PerformanceObserver(list => {
+      for (const e of list.getEntries()) {
+        const entry = e as unknown as { processingStart: number; startTime: number };
+        _vitals.fid = Math.round(entry.processingStart - entry.startTime);
+      }
+    }).observe({ type: 'first-input', buffered: true });
+  } catch { /* unsupported */ }
+}
+
+// ─── Dependency probes ────────────────────────────────────────────────────────
+
+/**
+ * Probe an external HTTP dependency.
+ * Returns within `timeoutMs` even if the request hangs.
+ *
+ * Never rejects: network errors and timeouts are reported as an `unhealthy`
+ * probe with `latencyMs: null`; a non-2xx response is reported as `degraded`.
+ *
+ * @param name      Display name copied onto the result.
+ * @param url       Endpoint to send a `HEAD` request to.
+ * @param timeoutMs Abort deadline in milliseconds (default 4000).
+ */
+async function probeHttpEndpoint(
+  name: string,
+  url: string,
+  timeoutMs = 4000,
+): Promise<DependencyProbe> {
+  const start = performance.now();
+  try {
+    const res = await fetch(url, {
+      method: 'HEAD',
+      signal: AbortSignal.timeout(timeoutMs),
+      // Bypass the HTTP cache so the probe measures the live endpoint.
+      cache: 'no-store',
+    });
+    const latencyMs = Math.round(performance.now() - start);
+    return {
+      name,
+      status: res.ok ? 'healthy' : 'degraded',
+      latencyMs,
+      message: res.ok ? undefined : `HTTP ${res.status}`,
+    };
+  } catch (err) {
+    return {
+      name,
+      status: 'unhealthy',
+      latencyMs: null,
+      message: err instanceof Error ? err.message : 'probe failed',
+    };
+  }
+}
+
+/**
+ * Register custom dependency probes. Out of the box we probe:
+ *   - Stellar Horizon testnet (always available; a sensible live canary)
+ *   - Stellar Horizon mainnet
+ *
+ * Add your own via `registerDependencyProbe()`.
+ */
+type ProbeFactory = () => Promise<DependencyProbe>;
+
+/** Probe registry keyed by a stable identifier; insertion order is probe order. */
+const _probes: Map<string, ProbeFactory> = new Map([
+  [
+    'horizon.testnet',
+    () =>
+      probeHttpEndpoint(
+        'Stellar Horizon (testnet)',
+        'https://horizon-testnet.stellar.org',
+      ),
+  ],
+  [
+    'horizon.mainnet',
+    () =>
+      probeHttpEndpoint(
+        'Stellar Horizon (mainnet)',
+        'https://horizon.stellar.org',
+      ),
+  ],
+]);
+
+/** Add a probe, or replace the one already registered under `key`. */
+export function registerDependencyProbe(key: string, factory: ProbeFactory): void {
+  _probes.set(key, factory);
+}
+
+/** Remove the probe registered under `key`; a no-op when the key is unknown. */
+export function unregisterDependencyProbe(key: string): void {
+  _probes.delete(key);
+}
+
+// ─── Memory helpers ───────────────────────────────────────────────────────────
+
+/** Shape of the non-standard `performance.memory` object, where available. */
+type MemoryInfo = {
+  usedJSHeapSize: number;
+  totalJSHeapSize: number;
+  jsHeapSizeLimit: number;
+};
+
+/**
+ * Read JS heap usage from `performance.memory`, converted to whole megabytes.
+ * Returns all-`null` figures with `pressureLevel: 'unknown'` when the browser
+ * does not expose the API.
+ */
+function readMemory() {
+  const mem = (performance as unknown as { memory?: MemoryInfo }).memory;
+  if (!mem) {
+    return { usedMB: null, totalMB: null, heapLimitMB: null, pressureLevel: 'unknown' as const };
+  }
+  const usedMB = Math.round(mem.usedJSHeapSize / 1_048_576);
+  const totalMB = Math.round(mem.totalJSHeapSize / 1_048_576);
+  const heapLimitMB = Math.round(mem.jsHeapSizeLimit / 1_048_576);
+  // Guard the division: a zero limit would give NaN/Infinity and a bogus level.
+  const ratio = mem.jsHeapSizeLimit > 0 ? mem.usedJSHeapSize / mem.jsHeapSizeLimit : 0;
+
+  const pressureLevel =
+    ratio > 0.9 ? 'critical' :
+    ratio > 0.8 ? 'high' :
+    ratio > 0.7 ? 'medium' : 'low';
+
+  return { usedMB, totalMB, heapLimitMB, pressureLevel } as const;
+}
+
+/**
+ * Read connectivity hints from `navigator.onLine` and, where present, the
+ * Network Information API (`navigator.connection`).
+ */
+function readNetwork() {
+  type NetInfo = { effectiveType?: string; rtt?: number; downlink?: number };
+  const conn = (navigator as unknown as { connection?: NetInfo }).connection;
+  return {
+    online: navigator.onLine,
+    effectiveType: conn?.effectiveType ?? null,
+    rttMs: conn?.rtt ?? null,
+    downlinkMbps: conn?.downlink ?? null,
+  };
+}
+
+// ─── Aggregate status ─────────────────────────────────────────────────────────
+
+/**
+ * Combine the numeric score, dependency results and online state into one
+ * verdict. Offline, a score below 30 or any unhealthy dependency gives
+ * `unhealthy`; a score below 70 or any degraded dependency gives `degraded`.
+ */
+function aggregateStatus(score: number, deps: DependencyProbe[]): HealthStatus {
+  const anyUnhealthy = deps.some(d => d.status === 'unhealthy');
+  const anyDegraded = deps.some(d => d.status === 'degraded');
+
+  if (!navigator.onLine || score < 30 || anyUnhealthy) return 'unhealthy';
+  if (score < 70 || anyDegraded) return 'degraded';
+  return 'healthy';
+}
+
+// ─── Public API ───────────────────────────────────────────────────────────────
+
+/**
+ * Perform a full health check and return a structured report.
+ *
+ * `runProbes` defaults to `true` so callers get real dependency latency data;
+ * set it to `false` for a cheaper snapshot that makes no network requests.
+ *
+ * A probe factory that throws or rejects does not fail the check: it is
+ * reported as an `unhealthy` dependency named after its registry key.
+ *
+ * @param runProbes Whether to run the registered dependency probes.
+ */
+export async function getHealthStatus(runProbes = true): Promise<HealthReport> {
+  const snapshot = collectHealthSnapshot();
+  const score = computeHealthScore(snapshot);
+  const memory = readMemory();
+  const network = readNetwork();
+
+  let dependencies: DependencyProbe[] = [];
+  if (runProbes) {
+    // Snapshot the registry so results can be matched back to their keys.
+    const entries = Array.from(_probes.entries());
+    const probeResults = await Promise.allSettled(
+      // The async wrapper turns a synchronous throw inside a factory into a
+      // rejection, so `allSettled` sees it instead of the whole check failing.
+      entries.map(async ([, factory]) => factory()),
+    );
+    dependencies = probeResults.map((r, i): DependencyProbe =>
+      r.status === 'fulfilled'
+        ? r.value
+        : {
+            name: entries[i]?.[0] ?? 'unknown',
+            status: 'unhealthy',
+            latencyMs: null,
+            message: r.reason instanceof Error ? `probe threw: ${r.reason.message}` : 'probe threw',
+          },
+    );
+  }
+
+  const status = aggregateStatus(score, dependencies);
+
+  const report: HealthReport = {
+    status,
+    score,
+    uptimeMs: Date.now() - _appStart,
+    memory,
+    network,
+    dependencies,
+    // Copy so later observer updates do not mutate a report already handed out.
+    webVitals: { ..._vitals },
+    timestamp: new Date().toISOString(),
+    version: (import.meta.env.VITE_SENTRY_RELEASE as string | undefined) ?? 'unknown',
+  };
+
+  addBreadcrumb(`Health check: ${status} (score ${score})`, 'health', { score, status });
+
+  if (status !== 'healthy') {
+    logger.warn('Health check degraded', { status, score, dependencies });
+  } else {
+    logger.debug('Health check passed', { score });
+  }
+
+  return report;
+}
+
+/**
+ * Simplified probe suitable for a status badge or polling hook.
+ * Returns within `timeoutMs` even if dependency probes are slow.
+ *
+ * Never rejects: a timeout or any failure of the full check is logged and
+ * reported as `'degraded'`.
+ *
+ * @param timeoutMs Upper bound, in milliseconds, on how long to wait.
+ */
+export async function quickHealthCheck(timeoutMs = 5000): Promise<HealthStatus> {
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  try {
+    const result = await Promise.race([
+      getHealthStatus(true),
+      new Promise<never>((_, reject) => {
+        timer = setTimeout(() => reject(new Error('health check timed out')), timeoutMs);
+      }),
+    ]);
+    return result.status;
+  } catch (err) {
+    logger.warn('Quick health check timed out or failed', {}, err instanceof Error ? err : undefined);
+    return 'degraded';
+  } finally {
+    // Cancel the pending timeout so frequent polling does not pile up timers.
+    if (timer !== undefined) clearTimeout(timer);
+  }
+}
diff --git a/src/main.jsx b/src/main.jsx
index abdb82fe..e4162f21 100644
--- a/src/main.jsx
+++ b/src/main.jsx
@@ -3,12 +3,13 @@ import ReactDOM from "react-dom/client";
 import { BrowserRouter } from "react-router-dom";
 import App from "./App";
 import "./styles/globals.css";
-import { initPerformanceMonitoring } from "./lib/performance";
+import { initMonitoring } from "./utils/monitoring";
 import { selfHealingManager } from "./lib/errorHandling/SelfHealingManager";
 import { registerBuiltInStrategies, registerNetworkProbes } from "./lib/errorHandling/RecoveryStrategyRegistry";
 
-// Initialize performance monitoring (no RUM endpoint by default)
-initPerformanceMonitoring();
+// ── Monitoring must be the very first thing that runs so Sentry and the
+//    global error handlers are in place before any React code executes.
// ── Diff context carried over verbatim from the original line (tail of the previous file's hunk) ──
// +initMonitoring(); // D-057 — Bootstrap error recovery & self-healing registerBuiltInStrategies();
// diff --git a/src/utils/monitoring.ts b/src/utils/monitoring.ts new file mode 100644 index 00000000..ac13b49f

/**
 * Production Monitoring & Observability
 *
 * Initialises Sentry for error tracking + performance tracing,
 * bridges into the existing errorReporting / performance pipelines,
 * and captures global uncaught exceptions / unhandled rejections.
 *
 * Call `initMonitoring()` once at the very top of `src/main.jsx`,
 * before `ReactDOM.createRoot(…).render(…)`.
 *
 * Environment variables (set via .env or your CI secrets manager):
 *   VITE_SENTRY_DSN          – Sentry project DSN (required in production)
 *   VITE_SENTRY_ENV          – "production" | "staging" | "development"
 *   VITE_SENTRY_RELEASE      – build SHA / semver tag injected at build time
 *   VITE_SENTRY_TRACES_RATE  – 0–1 float for performance sampling (default 0.1)
 *   VITE_SENTRY_REPLAY_RATE  – 0–1 float for Session Replay (default 0.05)
 *   VITE_RUM_ENDPOINT        – optional RUM endpoint for the performance pipeline
 */

import * as Sentry from '@sentry/react';
import {
  reportError,
  addBreadcrumb,
  initializeErrorReporting,
} from '../lib/errorReporting';
import { initPerformanceMonitoring } from '../lib/performance';
import { createLogger } from './logger';

const logger = createLogger('Monitoring');

// ─── Config ───────────────────────────────────────────────────────────────────

export interface MonitoringConfig {
  /** Sentry DSN – omit or leave empty to disable Sentry. */
  sentryDsn?: string;
  /** "production" | "staging" | "development" */
  environment: string;
  /** Release identifier (git SHA, semver tag). */
  release?: string;
  /** Fraction of transactions to sample for performance tracing (0–1). */
  tracesSampleRate: number;
  /** Fraction of sessions to record via Session Replay (0–1). */
  replaySampleRate: number;
  /** Optional RUM endpoint for the existing performance pipeline. */
  rumEndpoint?: string;
}

const DEFAULT_TRACES_SAMPLE_RATE = 0.1;
const DEFAULT_REPLAY_SAMPLE_RATE = 0.05;

/**
 * Convert a raw sample-rate setting into a number in [0, 1].
 *
 * Env vars arrive as strings and may be empty or malformed; a bare
 * `Number(raw)` would turn `''` into 0 and `'abc'` into NaN. Anything that is
 * missing, empty, or not a finite number yields `fallback`; finite values are
 * clamped to the 0–1 range.
 */
function parseSampleRate(raw: unknown, fallback: number): number {
  if (raw === undefined || raw === null || raw === '') return fallback;
  const rate = Number(raw);
  if (!Number.isFinite(rate)) return fallback;
  return Math.min(1, Math.max(0, rate));
}

const defaultConfig: MonitoringConfig = {
  sentryDsn: import.meta.env.VITE_SENTRY_DSN as string | undefined,
  // `||` rather than `??`: an env var defined but left empty must fall through.
  environment:
    (import.meta.env.VITE_SENTRY_ENV as string | undefined) ||
    import.meta.env.MODE ||
    'development',
  release: import.meta.env.VITE_SENTRY_RELEASE as string | undefined,
  tracesSampleRate: parseSampleRate(
    import.meta.env.VITE_SENTRY_TRACES_RATE,
    DEFAULT_TRACES_SAMPLE_RATE,
  ),
  replaySampleRate: parseSampleRate(
    import.meta.env.VITE_SENTRY_REPLAY_RATE,
    DEFAULT_REPLAY_SAMPLE_RATE,
  ),
  rumEndpoint: import.meta.env.VITE_RUM_ENDPOINT as string | undefined,
};

let _initialised = false;

// ─── Helpers ──────────────────────────────────────────────────────────────────

/**
 * Coerce an arbitrary thrown / rejected value into an `Error`.
 *
 * `Error` instances pass through untouched. Nullish or empty values produce
 * an error carrying `fallbackMessage`. Plain objects are JSON-serialised so
 * the message is not the useless "[object Object]".
 */
function toError(value: unknown, fallbackMessage: string): Error {
  if (value instanceof Error) return value;
  if (value === undefined || value === null || value === '') {
    return new Error(fallbackMessage);
  }
  if (typeof value === 'object') {
    try {
      return new Error(JSON.stringify(value));
    } catch {
      /* circular structure – fall through to String() */
    }
  }
  return new Error(String(value));
}

/** Header names (lower-cased) that must never leave the browser. */
const SENSITIVE_HEADERS = new Set(['authorization', 'cookie', 'x-api-key']);

/** Query-parameter names (lower-cased) that may carry secrets. */
const SENSITIVE_QUERY_PARAMS = new Set(['token', 'key', 'secret', 'api_key', 'access_token']);

/** Minimal structural view of the parts of a Sentry event that get scrubbed. */
interface ScrubbableEvent {
  request?: {
    url?: string;
    headers?: Record<string, string>;
  };
}

/**
 * Strip credentials from an outgoing event, in place, and return it.
 *
 * Matching is case-insensitive: header names are frequently recorded
 * lower-cased (`authorization`, `cookie`), which an exact-case `delete`
 * would miss.
 */
function scrubSensitiveData<E extends ScrubbableEvent>(event: E): E {
  const request = event.request;
  if (!request) return event;

  if (request.headers) {
    for (const name of Object.keys(request.headers)) {
      if (SENSITIVE_HEADERS.has(name.toLowerCase())) {
        delete request.headers[name];
      }
    }
  }

  if (request.url) {
    try {
      const url = new URL(request.url);
      // Snapshot the keys first – deleting while iterating the live view skips entries.
      for (const param of Array.from(url.searchParams.keys())) {
        if (SENSITIVE_QUERY_PARAMS.has(param.toLowerCase())) {
          url.searchParams.delete(param);
        }
      }
      request.url = url.toString();
    } catch {
      /* ignore malformed URLs */
    }
  }

  return event;
}

// ─── Sentry init ──────────────────────────────────────────────────────────────

/**
 * Configure and start the Sentry SDK.
 * Logs a warning and returns without initialising when no DSN is configured.
 */
function initialiseSentry(cfg: MonitoringConfig): void {
  if (!cfg.sentryDsn) {
    logger.warn('Sentry DSN not set – error tracking disabled.', { env: cfg.environment });
    return;
  }

  Sentry.init({
    dsn: cfg.sentryDsn,
    environment: cfg.environment,
    release: cfg.release,

    // ── Performance (APM) ────────────────────────────────────────────────────
    tracesSampleRate: cfg.tracesSampleRate,

    // ── Session Replay ───────────────────────────────────────────────────────
    replaysSessionSampleRate: 0, // don't record healthy sessions
    replaysOnErrorSampleRate: cfg.replaySampleRate,

    // ── Integrations ─────────────────────────────────────────────────────────
    integrations: [
      // Browser performance tracing
      Sentry.browserTracingIntegration(),
      // Session Replay on error, with text masked and media blocked
      Sentry.replayIntegration({
        maskAllText: true,
        blockAllMedia: true,
      }),
      // Record console calls as breadcrumbs
      Sentry.breadcrumbsIntegration({ console: true }),
    ],

    // ── Scrubbing ────────────────────────────────────────────────────────────
    // Strip PII / secrets from outgoing event payloads.
    beforeSend: event => scrubSensitiveData(event),

    // Drop errors whose originating script URL is local or a browser extension.
    denyUrls: [/localhost/, /127\.0\.0\.1/, /extensions\//i],
  });

  logger.info('Sentry initialised', {
    env: cfg.environment,
    release: cfg.release ?? 'unknown',
    tracesSampleRate: cfg.tracesSampleRate,
  });
}

// ─── Global error listeners ───────────────────────────────────────────────────

/**
 * Forwards uncaught exceptions and unhandled promise rejections to the
 * existing errorReporting pipeline.
 *
 * Sentry is deliberately NOT called here: the SDK's default global-handlers
 * integration already captures both event types. A second `captureException`
 * is discarded for `Error` objects the SDK has already seen, and produces a
 * duplicate event for freshly wrapped non-`Error` reasons.
 */
function attachGlobalErrorHandlers(): void {
  // Nothing to attach to outside a browser (SSR, unit tests under node).
  if (typeof window === 'undefined') return;

  window.addEventListener('error', (event: ErrorEvent) => {
    const err = event.error instanceof Error ? event.error : new Error(event.message);

    reportError(err, {
      context: 'Global Error Handler',
      filename: event.filename,
      lineno: event.lineno,
      colno: event.colno,
      category: 'javascript',
      severity: 'high',
    });
  });

  window.addEventListener('unhandledrejection', (event: PromiseRejectionEvent) => {
    const err = toError(event.reason, 'Unhandled rejection');

    reportError(err, {
      context: 'Unhandled Promise Rejection',
      category: 'promise',
      severity: 'high',
    });
  });
}

// ─── Web Vitals → Sentry custom measurements ─────────────────────────────────

/**
 * Subscribe to one performance entry type (including buffered entries).
 * Unsupported entry types are ignored so one missing metric does not
 * prevent the others from being observed.
 */
function observeEntries(
  type: string,
  onEntries: (entries: PerformanceEntry[]) => void,
): void {
  try {
    const observer = new PerformanceObserver(list => onEntries(list.getEntries()));
    observer.observe({ type, buffered: true });
  } catch {
    /* observer / entry type not supported */
  }
}

/**
 * Observes LCP, CLS and first-input delay, recording each as a Sentry
 * measurement and as a breadcrumb in the errorReporting pipeline.
 */
function attachWebVitalsBridge(): void {
  if (typeof PerformanceObserver === 'undefined') return;

  // LCP – the most recent candidate entry is the current value.
  observeEntries('largest-contentful-paint', entries => {
    const last = entries[entries.length - 1];
    if (!last) return;
    const value = Math.round(last.startTime);
    Sentry.setMeasurement('lcp', value, 'millisecond');
    addBreadcrumb(`LCP: ${value}ms`, 'performance', { value });
  });

  // CLS – running sum of layout-shift scores.
  let clsValue = 0;
  observeEntries('layout-shift', entries => {
    for (const entry of entries) {
      const shift = entry as unknown as { value?: number; hadRecentInput?: boolean };
      // Shifts that follow user input are excluded from CLS by definition.
      if (shift.hadRecentInput) continue;
      clsValue += shift.value ?? 0;
    }
    const cls = Number(clsValue.toFixed(3));
    Sentry.setMeasurement('cls', cls, 'none');
    addBreadcrumb(`CLS: ${cls}`, 'performance', { value: cls });
  });

  // FID – delay between the first interaction and the start of its handler.
  observeEntries('first-input', entries => {
    for (const entry of entries) {
      const input = entry as unknown as { processingStart: number; startTime: number };
      const fid = Math.round(input.processingStart - input.startTime);
      Sentry.setMeasurement('fid', fid, 'millisecond');
      addBreadcrumb(`FID: ${fid}ms`, 'performance', { value: fid });
    }
  });
}

// ─── Public API ───────────────────────────────────────────────────────────────

/**
 * Run one initialisation step, logging instead of propagating a failure.
 * Monitoring runs before the app renders; a broken monitoring dependency
 * must not prevent the application itself from starting.
 */
function runStep(name: string, step: () => void): void {
  try {
    step();
  } catch (err) {
    logger.warn('Monitoring initialisation step failed', {
      step: name,
      error: err instanceof Error ? err.message : String(err),
    });
  }
}

/**
 * Initialise the full monitoring stack.
 *
 * Call once before `ReactDOM.createRoot(…).render(…)`. Subsequent calls are
 * no-ops. A failure in one step is logged and does not stop the others.
 *
 * @param userConfig Overrides merged on top of the env-derived defaults.
 */
export function initMonitoring(userConfig: Partial<MonitoringConfig> = {}): void {
  if (_initialised) return;
  _initialised = true;

  const merged: MonitoringConfig = { ...defaultConfig, ...userConfig };
  // Caller-supplied rates get the same validation as env-derived ones.
  const cfg: MonitoringConfig = {
    ...merged,
    tracesSampleRate: parseSampleRate(merged.tracesSampleRate, DEFAULT_TRACES_SAMPLE_RATE),
    replaySampleRate: parseSampleRate(merged.replaySampleRate, DEFAULT_REPLAY_SAMPLE_RATE),
  };

  // 1. Sentry SDK
  runStep('sentry', () => initialiseSentry(cfg));

  // 2. Global error capture (bridges into errorReporting)
  runStep('global-error-handlers', attachGlobalErrorHandlers);

  // 3. Web Vitals → Sentry measurements + breadcrumbs
  runStep('web-vitals-bridge', attachWebVitalsBridge);

  // 4. Existing performance monitoring
  runStep('performance', () => initPerformanceMonitoring({ rumEndpoint: cfg.rumEndpoint }));

  // 5. Existing error reporting
  runStep('error-reporting', () => initializeErrorReporting({ enabled: true }));

  logger.info('Monitoring stack initialised', { env: cfg.environment });
}

// ─── Sentry user context helpers ─────────────────────────────────────────────

/**
 * Attach a Stellar account address as the Sentry "user" for session correlation.
 * Call after wallet connect; pass `null` to clear on disconnect.
 *
 * @param stellarAddress Account address used as the user id, or `null` to clear.
 * @param extra Additional user fields; an `id` key here is ignored in favour
 *              of `stellarAddress`.
 */
export function setMonitoringUser(
  stellarAddress: string | null,
  extra?: Record<string, unknown>,
): void {
  if (stellarAddress) {
    // `id` goes last so `extra` can never overwrite the address.
    Sentry.setUser({ ...extra, id: stellarAddress });
  } else {
    Sentry.setUser(null);
  }
}

/**
 * Wrap a synchronous or async operation in a named Sentry performance span.
 *
 * @param name Span name.
 * @param fn Operation to run inside the span.
 * @param attributes Optional attributes attached to the span.
 * @returns Whatever `fn` returns (awaited).
 *
 * @example
 * const result = await withSpan('stellar.horizon.fetchAccount', async () =>
 *   horizon.loadAccount(address)
 * );
 */
export async function withSpan<T>(
  name: string,
  fn: () => T | Promise<T>,
  attributes?: Record<string, string | number | boolean>,
): Promise<T> {
  return Sentry.startSpan({ name, attributes }, () => fn());
}

/**
 * Manually capture an exception in Sentry with additional context, and
 * forward the same error to `reportError`.
 *
 * @param err The thrown value; non-`Error` values are wrapped for Sentry.
 * @param context Key/value pairs attached as Sentry extras and passed to `reportError`.
 */
export function captureError(
  err: unknown,
  context?: Record<string, unknown>,
): void {
  Sentry.withScope(scope => {
    if (context) {
      for (const [key, value] of Object.entries(context)) {
        scope.setExtra(key, value);
      }
    }
    Sentry.captureException(toError(err, 'Unknown error'));
  });
  reportError(err, context ?? {});
}

/**
 * Expose the Sentry error boundary component for wrapping route-level trees.
 *
 * @example
 * <SentryErrorBoundary fallback={<p>Something went wrong</p>}>
 *   <YourRoute />
 * </SentryErrorBoundary>
 */
export const SentryErrorBoundary = Sentry.ErrorBoundary;

export default {
  initMonitoring,
  setMonitoringUser,
  withSpan,
  captureError,
  SentryErrorBoundary,
};