diff --git a/package.json b/package.json
index 7289b9ab..b58e1bea 100644
--- a/package.json
+++ b/package.json
@@ -40,6 +40,8 @@
"docs:api:generate": "node scripts/generate-api-docs.mjs"
},
"dependencies": {
+ "@sentry/browser": "8.54.0",
+ "@sentry/react": "8.54.0",
"@tensorflow/tfjs-node": "^5.0.0",
"express": "^4.18.2",
"@axe-core/react": "^4.11.3",
diff --git a/src/lib/alertDispatch.ts b/src/lib/alertDispatch.ts
new file mode 100644
index 00000000..1510ab46
--- /dev/null
+++ b/src/lib/alertDispatch.ts
@@ -0,0 +1,285 @@
+/**
+ * Alert Dispatch – Slack & PagerDuty Webhook Integration
+ *
+ * Sits on top of the existing `alertChannels.ts` delivery layer and adds
+ * production-grade routing to external incident management services.
+ *
+ * Routing rules:
+ * - severity "info" → in-app + browser notification only
+ * - severity "warning" → in-app + browser + Slack (if configured)
+ * - severity "critical" → in-app + browser + Slack + PagerDuty
+ *
+ * Environment / runtime configuration:
+ * VITE_SLACK_WEBHOOK_URL – Incoming Webhook URL from your Slack App
+ * VITE_PAGERDUTY_ROUTING_KEY – PagerDuty Events API v2 Integration Key
+ *
+ * You can also call `configureAlertDispatch()` at runtime to override or
+ * extend the defaults without redeploying.
+ */
+
+import * as Sentry from '@sentry/react';
+import { dispatchToChannels, type AlertPayload } from './alertChannels';
+import { createLogger } from '../utils/logger';
+
+const logger = createLogger('AlertDispatch');
+
+// ─── Configuration ────────────────────────────────────────────────────────────
+
+/**
+ * Runtime settings for external alert delivery. The active value lives in the
+ * module-level `_cfg` and is changed through `configureAlertDispatch()`.
+ */
+export interface AlertDispatchConfig {
+ /** Slack Incoming Webhook URL – leave undefined to disable Slack delivery. */
+ slackWebhookUrl?: string;
+ /**
+ * PagerDuty Events API v2 routing / integration key.
+ * Leave undefined to disable PagerDuty delivery.
+ */
+ pagerDutyRoutingKey?: string;
+ /**
+ * Human-readable service name included in Slack/PD payloads.
+ * Defaults to "stellar-dev-dashboard".
+ */
+ serviceName: string;
+ /**
+ * If `true`, Slack/PagerDuty calls are skipped and a "[DRY-RUN] … suppressed"
+ * line carrying the alert id (and title) is logged instead of sending.
+ * Automatically `true` in non-production builds.
+ */
+ dryRun: boolean;
+}
+
+// Active dispatch configuration. Seeded once at module load from the build
+// environment and replaced wholesale by `configureAlertDispatch()`.
+// NOTE(review): VITE_-prefixed variables are exposed to client code by Vite, so
+// the webhook URL and routing key presumably ship in the browser bundle —
+// confirm that is acceptable for these credentials.
+let _cfg: AlertDispatchConfig = {
+ slackWebhookUrl: import.meta.env.VITE_SLACK_WEBHOOK_URL as string | undefined,
+ pagerDutyRoutingKey: import.meta.env.VITE_PAGERDUTY_ROUTING_KEY as string | undefined,
+ serviceName: 'stellar-dev-dashboard',
+ // Only fire live webhooks in production builds to avoid alert noise during dev
+ dryRun: import.meta.env.MODE !== 'production',
+};
+
+/**
+ * Merge runtime overrides into the active dispatch configuration.
+ *
+ * Keys present in `overrides` replace the current values — including keys set
+ * explicitly to `undefined`, which is how Slack or PagerDuty delivery can be
+ * switched off at runtime. Keys that are omitted keep their current value.
+ *
+ * @param overrides - Subset of {@link AlertDispatchConfig} to apply.
+ */
+export function configureAlertDispatch(overrides: Partial<AlertDispatchConfig>): void {
+  _cfg = { ..._cfg, ...overrides };
+  // Log only whether each integration is enabled, never the credentials.
+  logger.info('Alert dispatch reconfigured', {
+    slackEnabled: !!_cfg.slackWebhookUrl,
+    pagerDutyEnabled: !!_cfg.pagerDutyRoutingKey,
+    dryRun: _cfg.dryRun,
+  });
+}
+
+// ─── Slack payload builder ────────────────────────────────────────────────────
+
+/**
+ * Slack emoji shortcode per alert severity. Keyed by plain string so a
+ * severity missing from the table yields `undefined` and the caller can apply
+ * its own fallback.
+ */
+const SEVERITY_EMOJI: Record<string, string> = {
+  info: ':information_source:',
+  warning: ':warning:',
+  critical: ':red_circle:',
+};
+
+/** Slack attachment sidebar colour (hex) per alert severity. */
+const SEVERITY_COLOR: Record<string, string> = {
+  info: '#2196F3',
+  warning: '#FF9800',
+  critical: '#F44336',
+};
+
+/**
+ * Build the Slack Incoming Webhook message for an alert.
+ *
+ * The message has a one-line headline (`text`) plus a single coloured
+ * attachment listing service, severity, description, id, timestamp and, when
+ * present, the alert's tags.
+ *
+ * @param alert - Alert to render.
+ * @param serviceName - Service label shown in the fields and the footer.
+ * @returns A JSON-serialisable Slack payload.
+ */
+function buildSlackPayload(alert: AlertPayload, serviceName: string): object {
+  const emoji = SEVERITY_EMOJI[alert.severity] ?? ':bell:';
+  const color = SEVERITY_COLOR[alert.severity] ?? '#9E9E9E';
+  const epochSeconds = Math.floor(Date.parse(alert.timestamp) / 1000);
+
+  return {
+    text: `${emoji} *[${alert.severity.toUpperCase()}]* ${alert.title}`,
+    attachments: [
+      {
+        color,
+        fields: [
+          { title: 'Service', value: serviceName, short: true },
+          { title: 'Severity', value: alert.severity, short: true },
+          { title: 'Description', value: alert.description, short: false },
+          { title: 'Alert ID', value: alert.id, short: true },
+          { title: 'Timestamp', value: alert.timestamp, short: true },
+          ...(alert.tags?.length
+            ? [{ title: 'Tags', value: alert.tags.join(', '), short: false }]
+            : []),
+        ],
+        footer: serviceName,
+        // `Date.parse` returns NaN for an unparsable timestamp, and
+        // JSON.stringify would turn that into `ts: null`. Leave the field out
+        // instead; the raw timestamp is still shown in the fields above.
+        ...(Number.isFinite(epochSeconds) ? { ts: epochSeconds } : {}),
+      },
+    ],
+  };
+}
+
+/**
+ * POST a payload to a Slack Incoming Webhook.
+ *
+ * @param payload - Message object as produced by `buildSlackPayload`.
+ * @param webhookUrl - Full Incoming Webhook URL.
+ * @throws Error when Slack answers with a non-2xx status; fetch/abort errors
+ *   (including the 8 s timeout) propagate unchanged.
+ */
+async function sendSlack(payload: object, webhookUrl: string): Promise<void> {
+  const response = await fetch(webhookUrl, {
+    method: 'POST',
+    // Send the message as a form-encoded `payload` parameter. A
+    // URLSearchParams body makes fetch use
+    // `application/x-www-form-urlencoded`, a CORS-safelisted content type, so
+    // the browser issues no preflight. An explicit `application/json` header
+    // forces an OPTIONS preflight, which Slack webhook endpoints are reported
+    // not to answer — the request would fail before being sent.
+    body: new URLSearchParams({ payload: JSON.stringify(payload) }),
+    signal: AbortSignal.timeout(8000),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Slack webhook returned HTTP ${response.status}`);
+  }
+}
+
+// ─── PagerDuty payload builder ────────────────────────────────────────────────
+
+/** Values this module may place in the `event_action` field of a PagerDuty event. */
+type PagerDutyEventAction = 'trigger' | 'acknowledge' | 'resolve';
+
+/**
+ * Build a PagerDuty Events API v2 event for an alert.
+ *
+ * The alert id doubles as `dedup_key`, so later events with the same id
+ * address the same incident. The routing key is read from the active module
+ * configuration at call time.
+ *
+ * @param alert - Alert to report.
+ * @param serviceName - Used as source, component, group and summary prefix.
+ * @param action - Event action; defaults to `'trigger'`.
+ * @returns A JSON-serialisable event body.
+ */
+function buildPagerDutyPayload(
+  alert: AlertPayload,
+  serviceName: string,
+  action: PagerDutyEventAction = 'trigger',
+): object {
+  // Only attach a dashboard link when there is a real page URL. Outside a
+  // browser the previous fallback put the service name in `href`, which is
+  // not a URL.
+  const dashboardUrl = typeof window !== 'undefined' ? window.location.href : undefined;
+
+  return {
+    routing_key: _cfg.pagerDutyRoutingKey,
+    event_action: action,
+    dedup_key: alert.id,
+    payload: {
+      summary: `[${serviceName}] ${alert.title}`,
+      source: serviceName,
+      // Anything that is not "critical" is reported as a warning.
+      severity: alert.severity === 'critical' ? 'critical' : 'warning',
+      timestamp: alert.timestamp,
+      class: 'application_alert',
+      component: serviceName,
+      group: serviceName,
+      custom_details: {
+        description: alert.description,
+        alert_id: alert.id,
+        tags: alert.tags ?? [],
+      },
+    },
+    ...(dashboardUrl
+      ? { links: [{ href: dashboardUrl, text: 'Open Dashboard' }] }
+      : {}),
+  };
+}
+
+/**
+ * POST an event to the PagerDuty Events API v2 enqueue endpoint.
+ *
+ * @param payload - Complete event body, including `routing_key`.
+ * @throws Error carrying the HTTP status and response text when PagerDuty
+ *   answers with a non-2xx status; fetch/abort errors (including the 8 s
+ *   timeout) propagate unchanged.
+ */
+async function sendPagerDuty(payload: object): Promise<void> {
+  const response = await fetch('https://events.pagerduty.com/v2/enqueue', {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+    signal: AbortSignal.timeout(8000),
+  });
+
+  if (!response.ok) {
+    // The body is best-effort diagnostic text; failing to read it must not
+    // mask the HTTP error itself.
+    const body = await response.text().catch(() => '');
+    throw new Error(`PagerDuty API returned HTTP ${response.status}: ${body}`);
+  }
+}
+
+// ─── Core dispatch ────────────────────────────────────────────────────────────
+
+/** Per-call options for `dispatchAlert()`. */
+export interface DispatchOptions {
+ /**
+ * PagerDuty event action to send for a critical alert.
+ * `dispatchAlert()` uses `'trigger'` when this is omitted.
+ */
+ pagerDutyAction?: PagerDutyEventAction;
+}
+
+/**
+ * Dispatch an alert through all appropriate channels based on severity.
+ *
+ * - "info" → in-app + browser
+ * - any other severity → in-app + browser + Slack (if configured)
+ * - "critical" → in-app + browser + Slack + PagerDuty (if configured)
+ *
+ * External webhook calls are fire-and-forget: failures are logged and
+ * captured in Sentry but never reach the caller. In dry-run mode they are
+ * replaced by a log line.
+ *
+ * A failure of the local (in-app/browser) delivery no longer prevents the
+ * external calls: they are started first and the local error is rethrown
+ * afterwards, so a critical alert still pages.
+ *
+ * @param alert - Alert to deliver.
+ * @param opts - Optional overrides, e.g. the PagerDuty event action.
+ * @throws Whatever `dispatchToChannels` throws, after external dispatch has
+ *   been started.
+ */
+export async function dispatchAlert(
+  alert: AlertPayload,
+  opts: DispatchOptions = {},
+): Promise<void> {
+  const { serviceName, slackWebhookUrl, pagerDutyRoutingKey, dryRun } = _cfg;
+
+  // Always deliver in-app + browser regardless of severity. The error is
+  // wrapped in an object so that a thrown `undefined` is still detected.
+  let localFailure: { error: unknown } | undefined;
+  try {
+    await dispatchToChannels(alert, [{ type: 'in_app' }, { type: 'browser' }]);
+  } catch (error) {
+    localFailure = { error };
+  }
+
+  // ── Slack: everything above "info" ────────────────────────────────────────
+  if (alert.severity !== 'info' && slackWebhookUrl) {
+    if (dryRun) {
+      logger.info('[DRY-RUN] Slack alert suppressed', { alert: alert.id, title: alert.title });
+    } else {
+      sendSlack(buildSlackPayload(alert, serviceName), slackWebhookUrl).catch(err => {
+        logger.warn('Slack delivery failed', { alertId: alert.id }, err);
+        Sentry.captureException(err, { tags: { subsystem: 'alertDispatch', channel: 'slack' } });
+      });
+    }
+  }
+
+  // ── PagerDuty: "critical" only ────────────────────────────────────────────
+  if (alert.severity === 'critical' && pagerDutyRoutingKey) {
+    if (dryRun) {
+      logger.info('[DRY-RUN] PagerDuty alert suppressed', { alert: alert.id, title: alert.title });
+    } else {
+      const action = opts.pagerDutyAction ?? 'trigger';
+      sendPagerDuty(buildPagerDutyPayload(alert, serviceName, action)).catch(err => {
+        logger.warn('PagerDuty delivery failed', { alertId: alert.id }, err);
+        Sentry.captureException(err, { tags: { subsystem: 'alertDispatch', channel: 'pagerduty' } });
+      });
+    }
+  }
+
+  // Surface the local delivery failure to the caller, as before.
+  if (localFailure) throw localFailure.error;
+}
+
+/**
+ * Send a PagerDuty "resolve" event for a previously triggered incident.
+ *
+ * Does nothing when no routing key is configured. In dry-run mode the event
+ * is only logged. Delivery is fire-and-forget; failures are logged and
+ * captured in Sentry.
+ *
+ * @param alertId - Id of the original alert; it is the incident's dedup key.
+ * @param title - Alert title, used in the "RESOLVED: …" summary.
+ */
+export function resolvePagerDutyIncident(alertId: string, title: string): void {
+  const routingKey = _cfg.pagerDutyRoutingKey;
+  if (!routingKey) return;
+
+  if (_cfg.dryRun) {
+    logger.info('[DRY-RUN] PagerDuty resolve suppressed', { alertId });
+    return;
+  }
+
+  const event = {
+    routing_key: routingKey,
+    event_action: 'resolve' as const,
+    dedup_key: alertId,
+    payload: {
+      summary: `RESOLVED: ${title}`,
+      source: _cfg.serviceName,
+      severity: 'info',
+      timestamp: new Date().toISOString(),
+    },
+  };
+
+  sendPagerDuty(event).catch(err => {
+    logger.warn('PagerDuty resolve failed', { alertId }, err);
+    Sentry.captureException(err, { tags: { subsystem: 'alertDispatch', channel: 'pagerduty' } });
+  });
+}
+
+/**
+ * Render a thrown value that is not an `Error` as readable text.
+ * `null`/`undefined` become "Unknown error"; objects use their string
+ * `message` if they have one, else a JSON rendering, instead of the useless
+ * "[object Object]".
+ */
+function describeThrownValue(value: unknown): string {
+  if (value == null) return 'Unknown error';
+  if (typeof value !== 'object') return String(value);
+
+  const candidate = (value as { message?: unknown }).message;
+  if (typeof candidate === 'string' && candidate !== '') return candidate;
+
+  try {
+    // JSON.stringify returns undefined for some inputs and throws on cycles.
+    return JSON.stringify(value) ?? String(value);
+  } catch {
+    return String(value);
+  }
+}
+
+/**
+ * Build a standard `AlertPayload` from a raw error.
+ * Use with `dispatchAlert()` for one-line critical error dispatch.
+ *
+ * The title is `<name>: <first 80 chars of message>`. The description is the
+ * first five lines of the stack when available, otherwise the message.
+ *
+ * @param err - Any thrown value; need not be an `Error`.
+ * @param severity - Alert severity; defaults to `'critical'`.
+ * @param tags - Tags to attach to the alert.
+ * @returns A new alert with a generated id and the current time as timestamp.
+ *
+ * @example
+ * dispatchAlert(buildAlertFromError(err, 'critical', ['stellar', 'horizon']));
+ */
+export function buildAlertFromError(
+  err: unknown,
+  severity: AlertPayload['severity'] = 'critical',
+  tags: string[] = [],
+): AlertPayload {
+  const message = err instanceof Error ? err.message : describeThrownValue(err);
+  const name = err instanceof Error ? err.name : 'Error';
+  // Millisecond timestamp plus a short random suffix keeps ids distinct for
+  // alerts raised in the same millisecond.
+  const id = `alert-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`;
+
+  return {
+    id,
+    title: `${name}: ${message.slice(0, 80)}`,
+    description: err instanceof Error && err.stack
+      ? err.stack.split('\n').slice(0, 5).join('\n')
+      : message,
+    severity,
+    timestamp: new Date().toISOString(),
+    tags,
+  };
+}
diff --git a/src/lib/dashboardConfig.ts b/src/lib/dashboardConfig.ts
new file mode 100644
index 00000000..4aeaa1d8
--- /dev/null
+++ b/src/lib/dashboardConfig.ts
@@ -0,0 +1,349 @@
+/**
+ * Observability Dashboard Configuration
+ *
+ * Exports:
+ * 1. `GRAFANA_DASHBOARD` – Grafana dashboard JSON (import via the UI or
+ * provisioning API at /api/dashboards/import)
+ * 2. `SENTRY_SAVED_QUERIES` – Pre-built Sentry Discover query definitions
+ * (paste into Sentry → Discover → Saved Queries)
+ * 3. `exportGrafanaDashboard()` – Downloads the JSON file in the browser
+ * 4. `getDatadogMonitorTemplate()` – Datadog monitor JSON for p95 latency
+ *
+ * These configs visualise:
+ * - Error rates and unhandled exception counts
+ * - API latency percentiles (p50 / p95 / p99)
+ * - Core Web Vitals (LCP, CLS, FID)
+ * - Client-side health score over time
+ * - Cache hit ratios
+ * - Memory pressure
+ */
+
+// ─── Grafana dashboard JSON ───────────────────────────────────────────────────
+
+/**
+ * A ready-to-import Grafana dashboard for the Stellar Dev Dashboard.
+ *
+ * Assumes:
+ * - Prometheus data source named "Prometheus" (change `datasource` if yours differs)
+ * - Metrics emitted by your app or a synthetic exporter into Prometheus
+ * following the naming convention below.
+ *
+ * Metric naming convention (use a Prometheus Pushgateway or custom exporter
+ * to forward the RUM events emitted by `src/lib/performance.ts`):
+ *
+ * stellar_dashboard_lcp_milliseconds
+ * stellar_dashboard_fid_milliseconds
+ * stellar_dashboard_cls_ratio
+ * stellar_dashboard_health_score (gauge 0–100)
+ * stellar_dashboard_api_request_duration_seconds{endpoint, method, status}
+ * stellar_dashboard_error_total{category, severity}
+ * stellar_dashboard_cache_hits_total{namespace}
+ * stellar_dashboard_cache_misses_total{namespace}
+ */
+export const GRAFANA_DASHBOARD = {
+ title: 'Stellar Dev Dashboard — Production Health',
+ uid: 'stellar-prod-health-v1',
+ schemaVersion: 38,
+ version: 1,
+ refresh: '30s',
+ time: { from: 'now-3h', to: 'now' },
+ templating: {
+ list: [
+ {
+ name: 'datasource',
+ type: 'datasource',
+ pluginId: 'prometheus',
+ label: 'Prometheus',
+ current: { text: 'Prometheus', value: 'Prometheus' },
+ },
+ ],
+ },
+ panels: [
+ // ── Row 1: Health overview ────────────────────────────────────────────
+ {
+ id: 1,
+ type: 'stat',
+ title: 'Health Score (avg)',
+ gridPos: { x: 0, y: 0, w: 4, h: 4 },
+ datasource: '$datasource',
+ targets: [
+ {
+ expr: 'avg(stellar_dashboard_health_score)',
+ legendFormat: 'Health Score',
+ },
+ ],
+ options: {
+ colorMode: 'background',
+ thresholds: {
+ steps: [
+ { value: 0, color: 'red' },
+ { value: 50, color: 'orange' },
+ { value: 80, color: 'green' },
+ ],
+ },
+ },
+ },
+ {
+ id: 2,
+ type: 'stat',
+ title: 'Error Rate (5m)',
+ gridPos: { x: 4, y: 0, w: 4, h: 4 },
+ datasource: '$datasource',
+ targets: [
+ {
+ expr: 'sum(rate(stellar_dashboard_error_total[5m]))',
+ legendFormat: 'Errors/s',
+ },
+ ],
+ options: {
+ colorMode: 'background',
+ thresholds: {
+ steps: [
+ { value: 0, color: 'green' },
+ { value: 0.01, color: 'orange' },
+ { value: 0.1, color: 'red' },
+ ],
+ },
+ },
+ },
+ {
+ id: 3,
+ type: 'stat',
+ title: 'LCP (p75)',
+ gridPos: { x: 8, y: 0, w: 4, h: 4 },
+ datasource: '$datasource',
+ targets: [
+ {
+ expr: 'histogram_quantile(0.75, rate(stellar_dashboard_lcp_milliseconds_bucket[5m]))',
+ legendFormat: 'LCP p75 ms',
+ },
+ ],
+ options: {
+ unit: 'ms',
+ colorMode: 'background',
+ thresholds: {
+ steps: [
+ { value: 0, color: 'green' },
+ { value: 2500, color: 'orange' },
+ { value: 4000, color: 'red' },
+ ],
+ },
+ },
+ },
+ {
+ id: 4,
+ type: 'stat',
+ title: 'Cache Hit Ratio',
+ gridPos: { x: 12, y: 0, w: 4, h: 4 },
+ datasource: '$datasource',
+ targets: [
+ {
+ expr: `
+ sum(rate(stellar_dashboard_cache_hits_total[5m]))
+ / (
+ sum(rate(stellar_dashboard_cache_hits_total[5m]))
+ + sum(rate(stellar_dashboard_cache_misses_total[5m]))
+ )
+ `,
+ legendFormat: 'Hit Ratio',
+ },
+ ],
+ options: {
+ unit: 'percentunit',
+ colorMode: 'background',
+ thresholds: {
+ steps: [
+ { value: 0, color: 'red' },
+ { value: 0.7, color: 'orange' },
+ { value: 0.9, color: 'green' },
+ ],
+ },
+ },
+ },
+
+ // ── Row 2: API latency percentiles ────────────────────────────────────
+ {
+ id: 10,
+ type: 'timeseries',
+ title: 'API Request Duration — p50 / p95 / p99',
+ gridPos: { x: 0, y: 4, w: 16, h: 8 },
+ datasource: '$datasource',
+ targets: [
+ {
+ expr: 'histogram_quantile(0.50, sum by (le) (rate(stellar_dashboard_api_request_duration_seconds_bucket[5m])))',
+ legendFormat: 'p50',
+ },
+ {
+ expr: 'histogram_quantile(0.95, sum by (le) (rate(stellar_dashboard_api_request_duration_seconds_bucket[5m])))',
+ legendFormat: 'p95',
+ },
+ {
+ expr: 'histogram_quantile(0.99, sum by (le) (rate(stellar_dashboard_api_request_duration_seconds_bucket[5m])))',
+ legendFormat: 'p99',
+ },
+ ],
+ fieldConfig: {
+ defaults: {
+ unit: 's',
+ custom: { lineWidth: 2 },
+ },
+ },
+ },
+
+ // ── Row 3: Error breakdown ────────────────────────────────────────────
+ {
+ id: 20,
+ type: 'timeseries',
+ title: 'Errors by Severity',
+ gridPos: { x: 0, y: 12, w: 12, h: 6 },
+ datasource: '$datasource',
+ targets: [
+ {
+ expr: 'sum by (severity) (rate(stellar_dashboard_error_total[5m]))',
+ legendFormat: '{{severity}}',
+ },
+ ],
+ fieldConfig: { defaults: { unit: 'short' } },
+ },
+ {
+ id: 21,
+ type: 'timeseries',
+ title: 'Errors by Category',
+ gridPos: { x: 12, y: 12, w: 12, h: 6 },
+ datasource: '$datasource',
+ targets: [
+ {
+ expr: 'sum by (category) (rate(stellar_dashboard_error_total[5m]))',
+ legendFormat: '{{category}}',
+ },
+ ],
+ fieldConfig: { defaults: { unit: 'short' } },
+ },
+
+ // ── Row 4: Core Web Vitals ────────────────────────────────────────────
+    {
+      id: 30,
+      type: 'timeseries',
+      title: 'Core Web Vitals over time',
+      gridPos: { x: 0, y: 18, w: 24, h: 7 },
+      datasource: '$datasource',
+      targets: [
+        {
+          // Aggregate buckets across series first so the panel shows a single
+          // p75 line, matching the API latency panel's queries.
+          expr: 'histogram_quantile(0.75, sum by (le) (rate(stellar_dashboard_lcp_milliseconds_bucket[5m])))',
+          legendFormat: 'LCP p75 (ms)',
+        },
+        {
+          expr: 'histogram_quantile(0.75, sum by (le) (rate(stellar_dashboard_fid_milliseconds_bucket[5m])))',
+          legendFormat: 'FID p75 (ms)',
+        },
+        {
+          // CLS is a small unitless ratio; scale it in PromQL so it is
+          // visible next to millisecond values. Grafana transformations are
+          // configured per panel, not per target, so the previous
+          // `transformations` entry here never applied the ×1000 the legend
+          // promises.
+          expr: 'avg(stellar_dashboard_cls_ratio) * 1000',
+          legendFormat: 'CLS (avg ×1000)',
+        },
+      ],
+      fieldConfig: { defaults: { unit: 'ms' } },
+    },
+ ],
+} as const;
+
+// ─── Sentry Discover saved queries ───────────────────────────────────────────
+
+/**
+ * Pre-built Sentry Discover queries.
+ *
+ * Paste each entry's `query` into Sentry → Discover → "Saved Queries", add
+ * the listed `fields` as columns, sort by `orderby` and set the time range to
+ * `range`. Column definitions map directly to Sentry Discover field names.
+ */
+export const SENTRY_SAVED_QUERIES = [
+  {
+    name: 'Unhandled Exceptions — last 24 h',
+    // `error.unhandled` is Sentry's boolean for unhandled errors; the former
+    // `!has:handled.exception` did not reference an existing field.
+    query: 'event.type:error error.unhandled:true',
+    fields: ['count()', 'issue', 'title', 'project', 'last_seen()'],
+    orderby: '-count()',
+    range: '24h',
+  },
+  {
+    name: 'P95 Transaction Duration',
+    query: 'event.type:transaction',
+    fields: ['transaction', 'count()', 'p50(transaction.duration)', 'p95(transaction.duration)', 'p99(transaction.duration)'],
+    orderby: '-p95(transaction.duration)',
+    range: '1h',
+  },
+  {
+    name: 'Critical Errors by Category',
+    query: 'event.type:error level:fatal',
+    fields: ['count()', 'issue', 'tags[category]', 'tags[severity]', 'last_seen()'],
+    orderby: '-count()',
+    range: '7d',
+  },
+  {
+    name: 'Frontend Web Vitals — LCP outliers',
+    query: 'event.type:transaction measurements.lcp:>4000',
+    fields: ['transaction', 'measurements.lcp', 'measurements.fid', 'measurements.cls', 'count()'],
+    orderby: '-measurements.lcp',
+    range: '6h',
+  },
+  {
+    name: 'Stellar Horizon API Errors',
+    query: 'event.type:error tags[category]:network tags[context]:*horizon*',
+    fields: ['count()', 'issue', 'title', 'tags[url]', 'last_seen()'],
+    orderby: '-count()',
+    range: '24h',
+  },
+] as const;
+
+// ─── Datadog monitor template ─────────────────────────────────────────────────
+
+/**
+ * Produce the Datadog monitor definition that alerts on p95 API latency.
+ * Submit it with the Datadog API (`POST /api/v1/monitor`) or import it via
+ * the Datadog Terraform provider.
+ *
+ * @param service - Service name interpolated into the monitor name, metric
+ *   query, notification handles and tags.
+ * @returns A plain, JSON-serialisable monitor object.
+ */
+export function getDatadogMonitorTemplate(service = 'stellar-dev-dashboard'): object {
+  const latencyQuery = `percentile(last_5m):p95:stellar.dashboard.api.request.duration.seconds{service:${service}} > 2`;
+
+  // Notification body; the surrounding blank space is trimmed off below.
+  const notification = `
+p95 API latency for **${service}** has exceeded 2 seconds over the past 5 minutes.
+
+Runbook: https://your-wiki/runbooks/high-api-latency
+
+@pagerduty-${service} @slack-alerts-${service}
+  `.trim();
+
+  const monitorOptions = {
+    // Thresholds are in seconds.
+    thresholds: { critical: 2, warning: 1 },
+    notify_no_data: true,
+    no_data_timeframe: 10,
+    renotify_interval: 30,
+    include_tags: true,
+    evaluation_delay: 60,
+  };
+
+  return {
+    name: `[${service}] API p95 Latency SLO breach`,
+    type: 'metric alert',
+    query: latencyQuery,
+    message: notification,
+    tags: [`service:${service}`, 'team:platform', 'severity:high'],
+    options: monitorOptions,
+    priority: 2,
+  };
+}
+
+// ─── Browser download helper ──────────────────────────────────────────────────
+
+/**
+ * Downloads the Grafana dashboard JSON as a file in the browser.
+ * Wire this to a "Export Dashboard" button in the admin panel.
+ *
+ * Works by clicking a temporary `<a download>` element that points at an
+ * object URL for the serialised dashboard. Browser-only: requires `document`.
+ */
+export function exportGrafanaDashboard(): void {
+  const json = JSON.stringify(GRAFANA_DASHBOARD, null, 2);
+  const url = URL.createObjectURL(new Blob([json], { type: 'application/json' }));
+
+  const anchor = document.createElement('a');
+  anchor.href = url;
+  anchor.download = 'stellar-dashboard-grafana.json';
+  document.body.appendChild(anchor);
+
+  try {
+    anchor.click();
+  } finally {
+    // Clean up even if a click listener throws.
+    anchor.remove();
+    // Revoke on a later task rather than synchronously: some browsers have
+    // been reported to abort the download when the URL is revoked right
+    // after click().
+    setTimeout(() => URL.revokeObjectURL(url), 0);
+  }
+}
diff --git a/src/lib/healthCheck.ts b/src/lib/healthCheck.ts
new file mode 100644
index 00000000..a74fdc1f
--- /dev/null
+++ b/src/lib/healthCheck.ts
@@ -0,0 +1,291 @@
+/**
+ * Client-side Health Check
+ *
+ * Provides `getHealthStatus()` — a lightweight, dependency-aware health
+ * snapshot analogous to a `/health` HTTP endpoint. It is surfaced:
+ *
+ * 1. In the existing System Health dashboard tab (pass it to any component
+ * that renders operational metrics).
+ * 2. Via the nginx `/health` location (already returns 200 'ok' at the
+ * network layer; this JS layer adds runtime depth on top).
+ * 3. By the `alertDispatch` pipeline to decide whether to fire a critical
+ * alert when a dependency probe fails.
+ *
+ * No server-side code is needed — everything runs in the browser.
+ */
+
+import { createLogger } from '../utils/logger';
+// collectHealthSnapshot / computeHealthScore live in the pre-existing JS module.
+// We import them with the .js extension so bundler mode resolves without .ts
+import {
+ collectHealthSnapshot,
+ computeHealthScore,
+} from '../utils/monitoring.js';
+import { addBreadcrumb } from './errorReporting';
+
+const logger = createLogger('HealthCheck');
+
+// ─── Types ────────────────────────────────────────────────────────────────────
+
+/** Health classification used for the overall report and for each dependency probe. */
+export type HealthStatus = 'healthy' | 'degraded' | 'unhealthy';
+
+/** Result of probing one external dependency. */
+export interface DependencyProbe {
+ /** Display name of the dependency. */
+ name: string;
+ /** Outcome of the probe. */
+ status: HealthStatus;
+ /** Round-trip time in milliseconds, or `null` when the probe failed. */
+ latencyMs: number | null;
+ /** Optional detail for non-healthy results, e.g. `HTTP 503` or an error message. */
+ message?: string;
+}
+
+/** Structured snapshot returned by `getHealthStatus()`. */
+export interface HealthReport {
+ /** Overall status derived from the score, connectivity and dependency results. */
+ status: HealthStatus;
+ score: number; // 0–100
+ /** Milliseconds elapsed since this module was loaded. */
+ uptimeMs: number;
+ /** JS heap figures from `performance.memory`; all `null` / 'unknown' where that API is absent. */
+ memory: {
+ usedMB: number | null;
+ totalMB: number | null;
+ heapLimitMB: number | null;
+ /** Bucketed ratio of used heap to the heap size limit. */
+ pressureLevel: 'low' | 'medium' | 'high' | 'critical' | 'unknown';
+ };
+ /** Connectivity as reported by `navigator.onLine` and `navigator.connection` (fields `null` when unavailable). */
+ network: {
+ online: boolean;
+ effectiveType: string | null;
+ rttMs: number | null;
+ downlinkMbps: number | null;
+ };
+ /** One entry per registered probe; empty when probes were not run. */
+ dependencies: DependencyProbe[];
+ /** Latest observed web vitals; each is `null` until the browser has reported it. */
+ webVitals: {
+ lcp: number | null;
+ fid: number | null;
+ cls: number | null;
+ };
+ /** ISO 8601 time at which the report was assembled. */
+ timestamp: string;
+ /** Value of `VITE_SENTRY_RELEASE`, or 'unknown' when it is not set. */
+ version: string;
+}
+
+// ─── Internal state ───────────────────────────────────────────────────────────
+
+// Module load time; `getHealthStatus()` reports uptime relative to it.
+const _appStart = Date.now();
+// Most recent web-vital readings, filled in asynchronously by the
+// PerformanceObserver callbacks below; `null` means "not observed yet".
+const _vitals: { lcp: number | null; fid: number | null; cls: number | null } = {
+ lcp: null,
+ fid: null,
+ cls: null,
+};
+
+// Passively capture web vitals as they arrive so `getHealthStatus()` can
+// report them without blocking. Each observer is set up in its own try/catch
+// because `observe()` throws for entry types the browser does not support.
+if (typeof PerformanceObserver !== 'undefined') {
+  // LCP: the browser may report several candidates; the latest one wins.
+  try {
+    new PerformanceObserver(list => {
+      const latest = list.getEntries().at(-1);
+      if (latest) _vitals.lcp = Math.round(latest.startTime);
+    }).observe({ type: 'largest-contentful-paint', buffered: true });
+  } catch { /* unsupported */ }
+
+  // CLS: running sum of layout-shift scores.
+  try {
+    let clsAcc = 0;
+    new PerformanceObserver(list => {
+      for (const e of list.getEntries()) {
+        const shift = e as PerformanceEntry & { value?: number; hadRecentInput?: boolean };
+        // Shifts that directly follow user input are expected and are
+        // excluded from CLS by definition.
+        if (shift.hadRecentInput) continue;
+        clsAcc += shift.value ?? 0;
+      }
+      _vitals.cls = Number(clsAcc.toFixed(3));
+    }).observe({ type: 'layout-shift', buffered: true });
+  } catch { /* unsupported */ }
+
+  // FID: delay between the first input and the start of its handler.
+  try {
+    new PerformanceObserver(list => {
+      for (const e of list.getEntries()) {
+        const input = e as PerformanceEntry & { processingStart: number };
+        _vitals.fid = Math.round(input.processingStart - input.startTime);
+      }
+    }).observe({ type: 'first-input', buffered: true });
+  } catch { /* unsupported */ }
+}
+
+// ─── Dependency probes ────────────────────────────────────────────────────────
+
+/**
+ * Probe an external HTTP dependency with a `HEAD` request.
+ * Returns within `timeoutMs` even if the request hangs, and never rejects.
+ *
+ * @param name - Display name to put on the result.
+ * @param url - Endpoint to request.
+ * @param timeoutMs - Abort the request after this many milliseconds.
+ * @returns 'healthy' for a 2xx response, 'degraded' for any other HTTP
+ *   status (with `HTTP <status>` as message), and 'unhealthy' with a `null`
+ *   latency when the request fails or times out.
+ */
+async function probeHttpEndpoint(
+  name: string,
+  url: string,
+  timeoutMs = 4000,
+): Promise<DependencyProbe> {
+  const start = performance.now();
+  try {
+    const res = await fetch(url, {
+      method: 'HEAD',
+      signal: AbortSignal.timeout(timeoutMs),
+      // Bypass the HTTP cache so the latency reflects a real round trip.
+      cache: 'no-store',
+    });
+    const latencyMs = Math.round(performance.now() - start);
+    return {
+      name,
+      status: res.ok ? 'healthy' : 'degraded',
+      latencyMs,
+      message: res.ok ? undefined : `HTTP ${res.status}`,
+    };
+  } catch (err) {
+    // Network failure, CORS rejection or timeout: no latency to report.
+    return {
+      name,
+      status: 'unhealthy',
+      latencyMs: null,
+      message: err instanceof Error ? err.message : 'probe failed',
+    };
+  }
+}
+
+/**
+ * A registered dependency probe: a zero-argument function that resolves to
+ * the probe's result.
+ */
+type ProbeFactory = () => Promise<DependencyProbe>;
+
+/**
+ * Registry of dependency probes, keyed by a stable identifier. Out of the box
+ * we probe:
+ * - Stellar Horizon testnet (always available; a sensible live canary)
+ * - Stellar Horizon mainnet
+ *
+ * Add your own via `registerDependencyProbe()`.
+ */
+const _probes: Map<string, ProbeFactory> = new Map<string, ProbeFactory>([
+  [
+    'horizon.testnet',
+    () =>
+      probeHttpEndpoint(
+        'Stellar Horizon (testnet)',
+        'https://horizon-testnet.stellar.org',
+      ),
+  ],
+  [
+    'horizon.mainnet',
+    () =>
+      probeHttpEndpoint(
+        'Stellar Horizon (mainnet)',
+        'https://horizon.stellar.org',
+      ),
+  ],
+]);
+
+/**
+ * Register a dependency probe under `key`, replacing any probe already
+ * stored under that key (including the built-in ones).
+ */
+export function registerDependencyProbe(key: string, factory: ProbeFactory): void {
+ _probes.set(key, factory);
+}
+
+/** Remove the probe stored under `key`; a key that is not registered is ignored. */
+export function unregisterDependencyProbe(key: string): void {
+ _probes.delete(key);
+}
+
+// ─── Memory helpers ───────────────────────────────────────────────────────────
+
+type MemoryInfo = {
+ usedJSHeapSize: number;
+ totalJSHeapSize: number;
+ jsHeapSizeLimit: number;
+};
+
+/**
+ * Read JS heap usage from the non-standard `performance.memory` API.
+ * Sizes are rounded to whole MiB. Where the API is missing, every size is
+ * `null` and the pressure level is 'unknown'.
+ */
+function readMemory() {
+  const heap = (performance as unknown as { memory?: MemoryInfo }).memory;
+  if (!heap) {
+    return { usedMB: null, totalMB: null, heapLimitMB: null, pressureLevel: 'unknown' as const };
+  }
+
+  const toMB = (bytes: number) => Math.round(bytes / 1_048_576);
+  const usedFraction = heap.usedJSHeapSize / heap.jsHeapSizeLimit;
+
+  // Bucket the used/limit fraction; anything at or below 70 % counts as low.
+  let pressureLevel: 'low' | 'medium' | 'high' | 'critical' = 'low';
+  if (usedFraction > 0.9) {
+    pressureLevel = 'critical';
+  } else if (usedFraction > 0.8) {
+    pressureLevel = 'high';
+  } else if (usedFraction > 0.7) {
+    pressureLevel = 'medium';
+  }
+
+  return {
+    usedMB: toMB(heap.usedJSHeapSize),
+    totalMB: toMB(heap.totalJSHeapSize),
+    heapLimitMB: toMB(heap.jsHeapSizeLimit),
+    pressureLevel,
+  };
+}
+
+/**
+ * Snapshot connectivity from `navigator.onLine` plus the optional
+ * `navigator.connection` object. Connection fields are `null` when the
+ * browser does not provide them.
+ */
+function readNetwork() {
+  type NetInfo = { effectiveType?: string; rtt?: number; downlink?: number };
+  const link = (navigator as unknown as { connection?: NetInfo }).connection;
+
+  const effectiveType: string | null = link?.effectiveType ?? null;
+  const rttMs: number | null = link?.rtt ?? null;
+  const downlinkMbps: number | null = link?.downlink ?? null;
+
+  return { online: navigator.onLine, effectiveType, rttMs, downlinkMbps };
+}
+
+// ─── Aggregate status ─────────────────────────────────────────────────────────
+
+/**
+ * Combine the health score, connectivity and probe results into one status.
+ *
+ * - 'unhealthy' when offline, score below 30, or any dependency is unhealthy
+ * - 'degraded' when score below 70 or any dependency is degraded
+ * - 'healthy' otherwise
+ */
+function aggregateStatus(score: number, deps: DependencyProbe[]): HealthStatus {
+  const seen = new Set<HealthStatus>(deps.map(dep => dep.status));
+
+  const offline = !navigator.onLine;
+  if (offline || score < 30 || seen.has('unhealthy')) {
+    return 'unhealthy';
+  }
+  return score < 70 || seen.has('degraded') ? 'degraded' : 'healthy';
+}
+
+// ─── Public API ───────────────────────────────────────────────────────────────
+
+/**
+ * Perform a full health check and return a structured report.
+ *
+ * A probe that throws or rejects is reported as an 'unhealthy' dependency
+ * named after its registry key; it never fails the whole check.
+ *
+ * Side effects: records a breadcrumb and logs the outcome (warning unless
+ * the status is 'healthy').
+ *
+ * @param runProbes - Run all registered dependency probes (default `true`)
+ *   so the report carries real dependency latency. Pass `false` for a
+ *   cheaper snapshot with an empty `dependencies` list.
+ * @returns The assembled {@link HealthReport}.
+ */
+export async function getHealthStatus(runProbes = true): Promise<HealthReport> {
+  const snapshot = collectHealthSnapshot();
+  const score = computeHealthScore(snapshot);
+  const memory = readMemory();
+  const network = readNetwork();
+
+  let dependencies: DependencyProbe[] = [];
+  if (runProbes) {
+    // Each probe runs inside its own async wrapper, so both rejections and
+    // synchronous throws from a custom factory become a result entry that is
+    // attributed to the probe's key instead of an anonymous "unknown".
+    dependencies = await Promise.all(
+      Array.from(_probes, async ([key, factory]): Promise<DependencyProbe> => {
+        try {
+          return await factory();
+        } catch (err) {
+          return {
+            name: key,
+            status: 'unhealthy',
+            latencyMs: null,
+            message: err instanceof Error ? `probe threw: ${err.message}` : 'probe threw',
+          };
+        }
+      }),
+    );
+  }
+
+  const status = aggregateStatus(score, dependencies);
+
+  const report: HealthReport = {
+    status,
+    score,
+    uptimeMs: Date.now() - _appStart,
+    memory,
+    network,
+    dependencies,
+    // Copy so later observer updates do not mutate a report already returned.
+    webVitals: { ..._vitals },
+    timestamp: new Date().toISOString(),
+    version: (import.meta.env.VITE_SENTRY_RELEASE as string | undefined) ?? 'unknown',
+  };
+
+  addBreadcrumb(`Health check: ${status} (score ${score})`, 'health', { score, status });
+
+  if (status !== 'healthy') {
+    logger.warn('Health check degraded', { status, score, dependencies });
+  } else {
+    logger.debug('Health check passed', { score });
+  }
+
+  return report;
+}
+
+/**
+ * Simplified probe suitable for a status badge or polling hook.
+ * Returns within `timeoutMs` even if dependency probes are slow.
+ *
+ * @param timeoutMs - Maximum time to wait for the full health check.
+ * @returns The status of the full check, or 'degraded' when it times out or
+ *   fails. Never rejects.
+ */
+export async function quickHealthCheck(timeoutMs = 5000): Promise<HealthStatus> {
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  try {
+    const timeout = new Promise<never>((_, reject) => {
+      timer = setTimeout(() => reject(new Error('health check timed out')), timeoutMs);
+    });
+    const result = await Promise.race([getHealthStatus(true), timeout]);
+    return result.status;
+  } catch (err) {
+    logger.warn('Quick health check timed out or failed', {}, err instanceof Error ? err : undefined);
+    return 'degraded';
+  } finally {
+    // Without this, every call that finishes early leaves a live timer
+    // behind until `timeoutMs` elapses — wasteful when polling.
+    clearTimeout(timer);
+  }
+}
diff --git a/src/main.jsx b/src/main.jsx
index abdb82fe..e4162f21 100644
--- a/src/main.jsx
+++ b/src/main.jsx
@@ -3,12 +3,13 @@ import ReactDOM from "react-dom/client";
import { BrowserRouter } from "react-router-dom";
import App from "./App";
import "./styles/globals.css";
-import { initPerformanceMonitoring } from "./lib/performance";
+import { initMonitoring } from "./utils/monitoring";
import { selfHealingManager } from "./lib/errorHandling/SelfHealingManager";
import { registerBuiltInStrategies, registerNetworkProbes } from "./lib/errorHandling/RecoveryStrategyRegistry";
-// Initialize performance monitoring (no RUM endpoint by default)
-initPerformanceMonitoring();
+// ── Monitoring must be the very first thing that runs so Sentry and the
+// global error handlers are in place before any React code executes.
+initMonitoring();
// D-057 — Bootstrap error recovery & self-healing
registerBuiltInStrategies();
diff --git a/src/utils/monitoring.ts b/src/utils/monitoring.ts
new file mode 100644
index 00000000..ac13b49f
--- /dev/null
+++ b/src/utils/monitoring.ts
@@ -0,0 +1,324 @@
+/**
+ * Production Monitoring & Observability
+ *
+ * Initialises Sentry for error tracking + performance tracing,
+ * bridges into the existing errorReporting / performance pipelines,
+ * and captures global uncaught exceptions / unhandled rejections.
+ *
+ * Call `initMonitoring()` once at the very top of `src/main.jsx`,
+ * before `ReactDOM.createRoot(…).render(…)`.
+ *
+ * Environment variables (set via .env or your CI secrets manager):
+ * VITE_SENTRY_DSN – Sentry project DSN (required in production)
+ * VITE_SENTRY_ENV – "production" | "staging" | "development"
+ * VITE_SENTRY_RELEASE – build SHA / semver tag injected at build time
+ * VITE_SENTRY_TRACES_RATE – 0–1 float for performance sampling (default 0.1)
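+ * VITE_SENTRY_REPLAY_RATE – 0–1 fraction of error sessions recorded by Session Replay (default 0.05)
+ * VITE_RUM_ENDPOINT – optional RUM endpoint forwarded to the existing performance pipeline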
+ */
+
+import * as Sentry from '@sentry/react';
+import {
+ reportError,
+ addBreadcrumb,
+ initializeErrorReporting,
+} from '../lib/errorReporting';
+import { initPerformanceMonitoring } from '../lib/performance';
+import { createLogger } from './logger';
+
+const logger = createLogger('Monitoring');
+
+// ─── Config ───────────────────────────────────────────────────────────────────
+
+/**
+ * Settings consumed by `initMonitoring()`.
+ * Any field left out by the caller falls back to `defaultConfig`, which is
+ * populated from `import.meta.env`.
+ */
+export interface MonitoringConfig {
+ /** Sentry DSN – omit or leave empty to disable Sentry. */
+ sentryDsn?: string;
+ /** Sentry environment tag, e.g. "production" | "staging" | "development". */
+ environment: string;
+ /** Release identifier (git SHA, semver tag). */
+ release?: string;
+ /** Fraction of transactions to sample for performance tracing (0–1). */
+ tracesSampleRate: number;
+ /**
+ * Fraction of error sessions to record via Session Replay (0–1).
+ * Passed to Sentry as `replaysOnErrorSampleRate`; healthy sessions are
+ * never recorded.
+ */
+ replaySampleRate: number;
+ /** Optional RUM endpoint, forwarded to `initPerformanceMonitoring`. */
+ rumEndpoint?: string;
+}
+
+/**
+ * Converts a raw environment value into a sample rate within [0, 1].
+ *
+ * @param raw - Value read from `import.meta.env` (a string when set).
+ * @param fallback - Rate used when `raw` is missing, blank, or not a finite number.
+ * @returns A finite number clamped to the inclusive range 0–1.
+ */
+function parseSampleRate(raw: unknown, fallback: number): number {
+  if (raw === undefined || raw === null || String(raw).trim() === '') return fallback;
+  const rate = Number(raw);
+  // `Number('abc')` is NaN; never hand that to the SDK as a sample rate.
+  if (!Number.isFinite(rate)) return fallback;
+  return Math.min(1, Math.max(0, rate));
+}
+
+/** Configuration derived from build-time environment variables. */
+const defaultConfig: MonitoringConfig = {
+  sentryDsn: import.meta.env.VITE_SENTRY_DSN as string | undefined,
+  environment: (import.meta.env.VITE_SENTRY_ENV as string | undefined) ?? import.meta.env.MODE ?? 'development',
+  release: import.meta.env.VITE_SENTRY_RELEASE as string | undefined,
+  tracesSampleRate: parseSampleRate(import.meta.env.VITE_SENTRY_TRACES_RATE, 0.1),
+  replaySampleRate: parseSampleRate(import.meta.env.VITE_SENTRY_REPLAY_RATE, 0.05),
+  rumEndpoint: import.meta.env.VITE_RUM_ENDPOINT as string | undefined,
+};
+
+/** Set by the first `initMonitoring()` call so that later calls are no-ops. */
+let _initialised = false;
+
+// ─── Sentry init ──────────────────────────────────────────────────────────────
+
+/**
+ * Configures the Sentry SDK with tracing, error-only Session Replay and
+ * console breadcrumbs, and installs a `beforeSend` hook that scrubs
+ * credentials from recorded request data.
+ *
+ * Logs a warning and returns without calling `Sentry.init` when
+ * `cfg.sentryDsn` is empty.
+ *
+ * @param cfg - Resolved monitoring configuration.
+ */
+function initialiseSentry(cfg: MonitoringConfig): void {
+  if (!cfg.sentryDsn) {
+    logger.warn('Sentry DSN not set – error tracking disabled.', { env: cfg.environment });
+    return;
+  }
+
+  // HTTP header names are case-insensitive, so they are matched lower-cased.
+  const sensitiveHeaders = new Set(['authorization', 'cookie', 'x-api-key']);
+  const sensitiveQueryParams = ['token', 'key', 'secret', 'api_key', 'access_token'];
+
+  Sentry.init({
+    dsn: cfg.sentryDsn,
+    environment: cfg.environment,
+    release: cfg.release,
+
+    // ── Performance (APM) ────────────────────────────────────────────────────
+    tracesSampleRate: cfg.tracesSampleRate,
+
+    // ── Session Replay ───────────────────────────────────────────────────────
+    // Replays are recorded only for sessions in which an error occurs.
+    replaysSessionSampleRate: 0, // don't record healthy sessions
+    replaysOnErrorSampleRate: cfg.replaySampleRate,
+
+    // ── Integrations ─────────────────────────────────────────────────────────
+    integrations: [
+      // Browser performance tracing
+      Sentry.browserTracingIntegration(),
+      // Session Replay with all text masked and all media blocked
+      Sentry.replayIntegration({
+        maskAllText: true,
+        blockAllMedia: true,
+      }),
+      // Record console calls as breadcrumbs
+      Sentry.breadcrumbsIntegration({ console: true }),
+    ],
+
+    // ── Scrubbing ────────────────────────────────────────────────────────────
+    // Strip PII / secrets from outgoing event payloads.
+    beforeSend(event) {
+      // Remove auth material from recorded request headers, whatever casing
+      // the header name was captured with.
+      if (event.request?.headers) {
+        const headers = event.request.headers as Record<string, string>;
+        for (const name of Object.keys(headers)) {
+          if (sensitiveHeaders.has(name.toLowerCase())) {
+            delete headers[name];
+          }
+        }
+      }
+      // Strip query params that may carry secrets
+      if (event.request?.url) {
+        try {
+          const u = new URL(event.request.url);
+          for (const param of sensitiveQueryParams) {
+            u.searchParams.delete(param);
+          }
+          event.request.url = u.toString();
+        } catch {
+          /* URLs that `new URL` cannot parse are left untouched */
+        }
+      }
+      return event;
+    },
+
+    // Skip events whose URL matches localhost, 127.0.0.1 or an "extensions/" path
+    denyUrls: [/localhost/, /127\.0\.0\.1/, /extensions\//i],
+  });
+
+  logger.info('Sentry initialised', {
+    env: cfg.environment,
+    release: cfg.release ?? 'unknown',
+    tracesSampleRate: cfg.tracesSampleRate,
+  });
+}
+
+// ─── Global error listeners ───────────────────────────────────────────────────
+
+/**
+ * Registers `error` and `unhandledrejection` listeners on `window`.
+ *
+ * Each captured failure is normalised to an `Error`, sent to Sentry inside a
+ * scope tagged with `capture_mechanism`, and then forwarded to `reportError`
+ * for the in-app error store.
+ */
+function attachGlobalErrorHandlers(): void {
+  const onUncaughtError = (event: ErrorEvent): void => {
+    const { filename, lineno, colno } = event;
+    let failure: Error;
+    if (event.error instanceof Error) {
+      failure = event.error;
+    } else {
+      failure = new Error(event.message);
+    }
+
+    // The Sentry SDK hooks window.onerror itself; this scope only adds the
+    // source-location extras alongside the capture.
+    Sentry.withScope(scope => {
+      scope.setTag('capture_mechanism', 'window.onerror');
+      scope.setExtra('filename', filename);
+      scope.setExtra('lineno', lineno);
+      scope.setExtra('colno', colno);
+      Sentry.captureException(failure);
+    });
+
+    reportError(failure, {
+      context: 'Global Error Handler',
+      filename,
+      lineno,
+      colno,
+      category: 'javascript',
+      severity: 'high',
+    });
+  };
+
+  const onUnhandledRejection = (event: PromiseRejectionEvent): void => {
+    const { reason } = event;
+    // Non-Error rejection values are stringified into the Error message.
+    const failure =
+      reason instanceof Error ? reason : new Error(String(reason ?? 'Unhandled rejection'));
+
+    Sentry.withScope(scope => {
+      scope.setTag('capture_mechanism', 'unhandledrejection');
+      Sentry.captureException(failure);
+    });
+
+    reportError(failure, {
+      context: 'Unhandled Promise Rejection',
+      category: 'promise',
+      severity: 'high',
+    });
+  };
+
+  window.addEventListener('error', onUncaughtError);
+  window.addEventListener('unhandledrejection', onUnhandledRejection);
+}
+
+// ─── Web Vitals → Sentry custom measurements ─────────────────────────────────
+
+/**
+ * Observes LCP, CLS and FID with `PerformanceObserver` and records each value
+ * as a Sentry measurement plus a `performance` breadcrumb for quick triage.
+ *
+ * Does nothing when `PerformanceObserver` is unavailable; an entry type the
+ * browser rejects is skipped without affecting the other metrics.
+ */
+function attachWebVitalsBridge(): void {
+  if (typeof PerformanceObserver === 'undefined') return;
+
+  // Registers one buffered observer. Only observer construction/registration
+  // is guarded; errors thrown inside `onEntries` are not swallowed here.
+  const observe = (type: string, onEntries: (entries: PerformanceEntry[]) => void): void => {
+    try {
+      const observer = new PerformanceObserver(list => onEntries(list.getEntries()));
+      observer.observe({ type, buffered: true });
+    } catch {
+      /* observer not supported */
+    }
+  };
+
+  // LCP – the most recent candidate entry is the current value.
+  observe('largest-contentful-paint', entries => {
+    const last = entries[entries.length - 1];
+    if (!last) return;
+    const value = Math.round(last.startTime);
+    Sentry.setMeasurement('lcp', value, 'millisecond');
+    addBreadcrumb(`LCP: ${value}ms`, 'performance', { value });
+  });
+
+  // CLS – running sum of layout-shift scores.
+  let clsValue = 0;
+  observe('layout-shift', entries => {
+    for (const entry of entries) {
+      const shift = entry as PerformanceEntry & { value?: number; hadRecentInput?: boolean };
+      // Shifts that directly follow user input are expected and are excluded
+      // from CLS by definition.
+      if (shift.hadRecentInput) continue;
+      clsValue += shift.value ?? 0;
+    }
+    const cls = Number(clsValue.toFixed(3));
+    Sentry.setMeasurement('cls', cls, 'none');
+    addBreadcrumb(`CLS: ${cls}`, 'performance', { value: cls });
+  });
+
+  // FID – delay between the first input and the start of its processing.
+  observe('first-input', entries => {
+    for (const entry of entries) {
+      const input = entry as PerformanceEntry & { processingStart: number };
+      const fid = Math.round(input.processingStart - input.startTime);
+      Sentry.setMeasurement('fid', fid, 'millisecond');
+      addBreadcrumb(`FID: ${fid}ms`, 'performance', { value: fid });
+    }
+  });
+}
+
+// ─── Public API ───────────────────────────────────────────────────────────────
+
+/**
+ * Initialise the full monitoring stack.
+ *
+ * Call once before `ReactDOM.createRoot(…).render(…)`. Second and later
+ * calls return immediately.
+ *
+ * Each start-up step runs in isolation: if one throws, the failure is logged
+ * and the remaining steps still run, so a monitoring fault cannot stop the
+ * application from booting.
+ *
+ * @param userConfig - Overrides merged on top of the environment defaults.
+ */
+export function initMonitoring(userConfig: Partial<MonitoringConfig> = {}): void {
+  if (_initialised) return;
+  _initialised = true;
+
+  const cfg: MonitoringConfig = { ...defaultConfig, ...userConfig };
+
+  const steps: Array<{ label: string; run: () => void }> = [
+    // 1. Sentry SDK
+    { label: 'Sentry SDK', run: () => initialiseSentry(cfg) },
+    // 2. Global error capture (bridges into errorReporting + Sentry)
+    { label: 'global error handlers', run: () => attachGlobalErrorHandlers() },
+    // 3. Web Vitals → Sentry measurements + breadcrumbs
+    { label: 'web vitals bridge', run: () => attachWebVitalsBridge() },
+    // 4. Existing performance monitoring, given the configured RUM endpoint
+    {
+      label: 'performance monitoring',
+      run: () => {
+        initPerformanceMonitoring({ rumEndpoint: cfg.rumEndpoint });
+      },
+    },
+    // 5. Existing error reporting pipeline
+    {
+      label: 'error reporting',
+      run: () => {
+        initializeErrorReporting({ enabled: true });
+      },
+    },
+  ];
+
+  for (const { label, run } of steps) {
+    try {
+      run();
+    } catch (err) {
+      logger.warn(`Monitoring step failed: ${label}`, {}, err instanceof Error ? err : undefined);
+    }
+  }
+
+  logger.info('Monitoring stack initialised', { env: cfg.environment });
+}
+
+// ─── Sentry user context helpers ─────────────────────────────────────────────
+
+/**
+ * Attach a Stellar account address as the Sentry "user" for session correlation.
+ * Call after wallet connect; pass `null` to clear on disconnect.
+ *
+ * @param stellarAddress - Account address used as the Sentry user `id`;
+ *   `null` (or an empty string) clears the current user.
+ * @param extra - Additional user fields merged into the Sentry user. An `id`
+ *   key in `extra` is ignored in favour of `stellarAddress`.
+ */
+export function setMonitoringUser(
+  stellarAddress: string | null,
+  extra?: Record<string, unknown>,
+): void {
+  if (!stellarAddress) {
+    Sentry.setUser(null);
+    return;
+  }
+  // Spread `extra` first so it can never overwrite the account id.
+  Sentry.setUser({ ...extra, id: stellarAddress });
+}
+
+/**
+ * Wrap a synchronous or async operation in a named Sentry performance span.
+ *
+ * @typeParam T - Value produced by `fn`.
+ * @param name - Span name.
+ * @param fn - Operation to run inside the span; may return a value or a promise.
+ * @param attributes - Optional span attributes, passed to `Sentry.startSpan` unchanged.
+ * @returns A promise for the value `fn` produced.
+ *
+ * @example
+ * const result = await withSpan('stellar.horizon.fetchAccount', async () =>
+ *   horizon.loadAccount(address)
+ * );
+ */
+export async function withSpan<T>(
+  name: string,
+  fn: () => T | Promise<T>,
+  // Derived from the SDK signature so anything `startSpan` accepts is allowed.
+  attributes?: Parameters<typeof Sentry.startSpan>[0]['attributes'],
+): Promise<T> {
+  return Sentry.startSpan({ name, attributes }, () => fn());
+}
+
+/**
+ * Manually capture an exception in Sentry with additional context.
+ * Mirrors the existing `reportError` API but also sends to Sentry.
+ *
+ * @param err - The thrown value. Non-`Error` values are wrapped in an `Error`
+ *   (via `String(err)`) for Sentry; `reportError` receives the original value.
+ * @param context - Optional key/value pairs, attached as Sentry extras and
+ *   passed on to `reportError`.
+ */
+export function captureError(
+  err: unknown,
+  context?: Record<string, unknown>,
+): void {
+  Sentry.withScope(scope => {
+    if (context) {
+      for (const [key, value] of Object.entries(context)) {
+        scope.setExtra(key, value);
+      }
+    }
+    Sentry.captureException(err instanceof Error ? err : new Error(String(err)));
+  });
+  reportError(err, context ?? {});
+}
+
+/**
+ * Expose the Sentry error boundary component for wrapping route-level trees.
+ * This is a direct alias of `Sentry.ErrorBoundary`.
+ *
+ * @example
+ * <SentryErrorBoundary fallback={<p>Something went wrong</p>}>
+ *   <MyRoutes />
+ * </SentryErrorBoundary>
+ */
+export const SentryErrorBoundary = Sentry.ErrorBoundary;
+
+/** Bundle of this module's public API for consumers that use the default import. */
+const monitoringApi = {
+  initMonitoring,
+  setMonitoringUser,
+  withSpan,
+  captureError,
+  SentryErrorBoundary,
+};
+
+export default monitoringApi;