diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5df72ed..8e138c4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,9 +57,9 @@ jobs: r = client.get('/health') assert r.status_code == 200 data = r.json() - assert data['status'] == 'ok' - assert 'engines_loaded' in data - assert 'vad_loaded' in data + assert data['status'] in ('ok', 'degraded') + assert 'engines' in data + assert 'modalities' in data # Voices endpoint r = client.get('/v1/voices') diff --git a/.gitignore b/.gitignore index fda447b..b53a61e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,9 @@ __pycache__/ *.pyc .DS_Store .claude/ + +# ONNX Runtime WASM binaries (large, download via setup.sh) +dashboard/vad/*.wasm +dashboard/vad/*.mjs +dashboard/vad/ort.js +dashboard/vad/ort.min.mjs diff --git a/agent_loop.py b/agent_loop.py new file mode 100644 index 0000000..449fcba --- /dev/null +++ b/agent_loop.py @@ -0,0 +1,262 @@ +"""Agent loop — receives percepts, calls LLM with tools, dispatches actions. + +The agent loop is the bridge between the ModalityBus (perception/action) +and the InferenceProvider (thinking). It maintains conversation history +and routes tool calls through the bus. +""" + +from __future__ import annotations + +import json as _json +import logging +import os +import time +from typing import TYPE_CHECKING + +import httpx + +from bus import ModalityBus +from modality import CognitiveEvent, CognitiveIntent, ModalityType +from pipeline_state import PipelineState +from providers import AGENT_TOOLS, InferenceProvider + +if TYPE_CHECKING: + from channels import BrowserChannel + +logger = logging.getLogger("mod3.agent_loop") + +# Base system prompt — kernel context is appended dynamically +_BASE_SYSTEM_PROMPT = ( + "You are Cog, a voice assistant running on Mod³ (Apple Silicon, fully local). " + "You respond using tool calls. Use speak() for conversational voice responses — " + "keep them concise, 1-3 sentences. 
Use send_text() only when the content is " + "better read than heard (code, lists, links, structured data). " + "No markdown in speak() text. Speak naturally. " + "If the user asks something you can't do, say so briefly." +) + +# CogOS kernel endpoint for context enrichment +_COGOS_ENDPOINT = os.environ.get("COGOS_ENDPOINT", "http://localhost:6931") + +# Bus endpoint for logging exchanges (observation channel) +_COGOS_BUS_ENDPOINT = f"{_COGOS_ENDPOINT}/v1/bus" + + +def _fetch_kernel_context() -> str: + """Pull active context from CogOS kernel to enrich the system prompt. + + Returns a context block string, or empty string if kernel unavailable. + This is the afferent path: kernel → local model. + """ + try: + resp = httpx.get(f"{_COGOS_ENDPOINT}/health", timeout=2.0) + if resp.status_code != 200: + return "" + health = resp.json() + + parts = [] + identity = health.get("identity", "cog") + state = health.get("state", "unknown") + parts.append(f"Kernel identity: {identity}, state: {state}") + + # Try to get active session context + try: + ctx_resp = httpx.get(f"{_COGOS_ENDPOINT}/v1/context", timeout=2.0) + if ctx_resp.status_code == 200: + ctx = ctx_resp.json() + nucleus = ctx.get("nucleus", "") + if nucleus: + parts.append(f"Active nucleus: {nucleus}") + process_state = ctx.get("state", "") + if process_state: + parts.append(f"Process state: {process_state}") + except Exception: + pass + + # Check for barge-in context (what was Claude saying when interrupted?) + signal_file = os.environ.get("BARGEIN_SIGNAL", "/tmp/mod3-barge-in.json") + try: + if os.path.exists(signal_file): + with open(signal_file) as f: + signal = _json.load(f) + interrupted = signal.get("interrupted") + if interrupted: + delivered = interrupted.get("delivered_text", "") + pct = interrupted.get("spoken_pct", 0) + parts.append( + f"[barge-in] Claude's speech was interrupted at {pct * 100:.0f}%. " + f'Delivered: "{delivered}". 
' + f"The user interrupted to say something — acknowledge and respond to them." + ) + except Exception: + pass + + if parts: + return "\n\nKernel context:\n" + "\n".join(f"- {p}" for p in parts) + return "" + except Exception: + return "" + + +def _log_exchange_to_bus(user_text: str, assistant_text: str, provider_name: str): + """Log the local model exchange to the CogOS bus (observation channel). + + This is the efferent path: local model → kernel → Claude can observe. + """ + try: + payload = { + "type": "modality.voice.exchange", + "from": f"mod3-reflex:{provider_name}", + "payload": { + "user": user_text, + "assistant": assistant_text, + "provider": provider_name, + "timestamp": time.time(), + }, + } + httpx.post( + _COGOS_BUS_ENDPOINT, + json=payload, + timeout=2.0, + ) + except Exception as e: + logger.debug("Failed to log exchange to bus: %s", e) + + +MAX_HISTORY = 50 + + +class AgentLoop: + """Conversational agent that receives percepts and acts through the bus.""" + + def __init__( + self, + bus: ModalityBus, + provider: InferenceProvider, + pipeline_state: PipelineState, + channel_id: str = "", + ): + self.bus = bus + self.provider = provider + self.pipeline_state = pipeline_state + self.channel_id = channel_id + self.conversation: list[dict[str, str]] = [] + self._channel_ref: BrowserChannel | None = None + self._processing = False + + async def handle_event(self, event: CognitiveEvent) -> None: + """Called when a CognitiveEvent arrives from the channel.""" + if not event.content.strip(): + return + + if self._processing: + logger.warning("agent busy, dropping: %s", event.content[:50]) + return + + self._processing = True + try: + await self._process(event) + except Exception as e: + logger.error("agent_loop error: %s", e, exc_info=True) + try: + if self._channel_ref: + await self._channel_ref.send_response_text(f"[error: {e}]") + await self._channel_ref.send_response_complete() + except Exception: + pass # channel may be dead, don't block finally + 
finally: + self._processing = False + + async def _process(self, event: CognitiveEvent) -> None: + """Core: event → provider → tool dispatch.""" + self.conversation.append({"role": "user", "content": event.content}) + self._trim_history() + + t_start = time.perf_counter() + + # Assemble system prompt with kernel context (afferent path) + kernel_ctx = _fetch_kernel_context() + system_prompt = _BASE_SYSTEM_PROMPT + kernel_ctx + + response = await self.provider.chat( + messages=self.conversation, + tools=AGENT_TOOLS, + system=system_prompt, + ) + + t_llm = (time.perf_counter() - t_start) * 1000 + + # Dispatch tool calls + assistant_parts: list[str] = [] + + for tc in response.tool_calls: + if tc.name == "speak": + text = tc.arguments.get("text", "") + if text: + assistant_parts.append(text) + # Show text in chat panel + if self._channel_ref: + await self._channel_ref.send_response_text(text) + # Route through bus → VoiceEncoder → TTS → channel.deliver + intent = CognitiveIntent( + modality=ModalityType.VOICE, + content=text, + target_channel=self.channel_id, + metadata={ + "voice": self._channel_ref.config.get("voice", "bm_lewis") + if self._channel_ref + else "bm_lewis", + "speed": self._channel_ref.config.get("speed", 1.25) if self._channel_ref else 1.25, + }, + ) + # Fire-and-forget: bus.act(blocking=False) returns QueuedJob immediately, + # OutputQueue drain thread handles TTS encoding + delivery. 
+ self.bus.act(intent, channel=self.channel_id) + + elif tc.name == "send_text": + text = tc.arguments.get("text", "") + if text: + assistant_parts.append(text) + if self._channel_ref: + await self._channel_ref.send_response_text(text) + + # Fallback: if provider returned text but no tool calls, auto-speak + if not response.tool_calls and response.text: + text = response.text + assistant_parts.append(text) + if self._channel_ref: + await self._channel_ref.send_response_text(text) + intent = CognitiveIntent( + modality=ModalityType.VOICE, + content=text, + target_channel=self.channel_id, + metadata={ + "voice": self._channel_ref.config.get("voice", "bm_lewis") if self._channel_ref else "bm_lewis", + "speed": self._channel_ref.config.get("speed", 1.25) if self._channel_ref else 1.25, + }, + ) + self.bus.act(intent, channel=self.channel_id) + + # Update conversation history + if assistant_parts: + assistant_text = " ".join(assistant_parts) + self.conversation.append( + { + "role": "assistant", + "content": assistant_text, + } + ) + + # Log exchange to CogOS bus (observation channel — Claude can see this) + _log_exchange_to_bus(event.content, assistant_text, self.provider.name) + + # Signal completion + if self._channel_ref: + await self._channel_ref.send_response_complete( + metrics={"llm_ms": round(t_llm, 1), "provider": self.provider.name} + ) + + def _trim_history(self) -> None: + """Keep conversation within MAX_HISTORY messages.""" + if len(self.conversation) > MAX_HISTORY: + self.conversation = self.conversation[-MAX_HISTORY:] diff --git a/channels.py b/channels.py new file mode 100644 index 0000000..d0ab8c8 --- /dev/null +++ b/channels.py @@ -0,0 +1,315 @@ +"""Browser channel — WebSocket adapter for the Mod³ dashboard. + +Wraps a FastAPI WebSocket connection as a ChannelDescriptor on the bus. +Knows the WebSocket protocol (binary PCM / JSON control frames), +knows nothing about LLMs or agent logic. 
+""" + +from __future__ import annotations + +import asyncio +import json +import logging +import time +import uuid +from typing import Any, Awaitable, Callable + +from fastapi import WebSocket, WebSocketDisconnect + +from bus import ModalityBus +from modality import CognitiveEvent, EncodedOutput, ModalityType +from pipeline_state import PipelineState + +logger = logging.getLogger("mod3.channels") + + +class BrowserChannel: + """WebSocket-backed channel for the browser dashboard.""" + + def __init__( + self, + ws: WebSocket, + bus: ModalityBus, + pipeline_state: PipelineState, + loop: asyncio.AbstractEventLoop, + on_event: Callable[[CognitiveEvent], Awaitable[None]] | None = None, + ): + self.ws = ws + self.bus = bus + self.pipeline_state = pipeline_state + self._loop = loop + self._on_event = on_event + self.channel_id = f"browser:{uuid.uuid4().hex[:8]}" + self.config: dict[str, Any] = { + "voice": "bm_lewis", + "speed": 1.25, + "model": "kokoro", + } + self._audio_buffer = bytearray() + self._active = True + + # Register on the bus with a delivery callback + bus.register_channel( + self.channel_id, + modalities=[ModalityType.VOICE, ModalityType.TEXT], + deliver=self._deliver_sync, + ) + logger.info("BrowserChannel registered: %s", self.channel_id) + + # ------------------------------------------------------------------ + # Delivery (bus → browser) + # ------------------------------------------------------------------ + + def _deliver_sync(self, output: EncodedOutput) -> None: + """Called from the sync OutputQueue drain thread. 
Bridges to async.""" + if not self._active: + return + try: + future = asyncio.run_coroutine_threadsafe(self._deliver_async(output), self._loop) + future.result(timeout=10.0) + except (WebSocketDisconnect, RuntimeError, TimeoutError): + logger.debug("deliver failed (client disconnected?), deactivating channel") + self._active = False + + async def _deliver_async(self, output: EncodedOutput) -> None: + """Send encoded output over the WebSocket.""" + import base64 + + logger.info( + "deliver: modality=%s format=%s bytes=%d duration=%.1fs", + output.modality.value if output.modality else "none", + output.format, + len(output.data) if output.data else 0, + output.duration_sec, + ) + + if output.modality == ModalityType.VOICE and output.data: + # Send audio as base64 JSON (avoids binary frame issues) + audio_b64 = base64.b64encode(output.data).decode("ascii") + logger.info("deliver: sending base64 audio JSON (%d chars)", len(audio_b64)) + await self.ws.send_json( + { + "type": "audio", + "data": audio_b64, + "format": output.format or "wav", + "duration_sec": round(output.duration_sec, 2), + "sample_rate": output.metadata.get("sample_rate", 24000), + } + ) + logger.info("deliver: audio sent OK") + elif output.modality == ModalityType.TEXT: + text = output.data.decode("utf-8") if isinstance(output.data, bytes) else str(output.data) + logger.info("deliver: sending text response (%d chars)", len(text)) + await self.ws.send_json({"type": "response_text", "text": text}) + else: + logger.warning("deliver: unhandled modality %s, dropping", output.modality) + + # ------------------------------------------------------------------ + # Receive loop (browser → server) + # ------------------------------------------------------------------ + + async def run(self) -> None: + """Main receive loop — runs until WebSocket disconnects.""" + try: + while True: + message = await self.ws.receive() + msg_type = message.get("type", "") + if msg_type == "websocket.disconnect": + break + if 
"bytes" in message and message["bytes"]: + self._handle_audio(message["bytes"]) + elif "text" in message and message["text"]: + await self._handle_json(json.loads(message["text"])) + except WebSocketDisconnect: + pass + except Exception as e: + logger.error("BrowserChannel error: %s", e) + finally: + self._cleanup() + + def _handle_audio(self, pcm_bytes: bytes) -> None: + """Binary frame: raw Int16 PCM at 16kHz from browser Silero VAD.""" + self._audio_buffer.extend(pcm_bytes) + + async def _handle_json(self, msg: dict) -> None: + """JSON frame: control message dispatch.""" + msg_type = msg.get("type", "") + logger.info("Received JSON: type=%s", msg_type) + + if msg_type == "end_of_speech": + await self._process_utterance() + elif msg_type == "text_message": + text = msg.get("text", "").strip() + if text: + await self._process_text(text) + elif msg_type == "interrupt": + await self._handle_interrupt() + elif msg_type == "config": + for key in ("model", "voice", "speed"): + if key in msg: + self.config[key] = msg[key] + + # ------------------------------------------------------------------ + # Processing + # ------------------------------------------------------------------ + + async def _process_utterance(self) -> None: + """PCM audio buffer → WhisperDecoder STT → CognitiveEvent → agent loop. + + Skips the server-side VoiceGate (Silero VAD) because the browser + already ran Silero VAD client-side — no need to validate again, + and it avoids the torchaudio dependency for resampling. 
+ """ + pcm_data = bytes(self._audio_buffer) + self._audio_buffer.clear() + + if len(pcm_data) < 6400: # <200ms at 16kHz Int16 + return + + t0 = time.perf_counter() + + # Transcribe via mlx_whisper — needs a temp WAV file + def _transcribe(): + import io + import os + import struct + import tempfile + + import mlx_whisper + import numpy as np + + from vad import is_hallucination + + audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0 + + # Skip silence + if len(audio) < 16000 * 0.3: + return None + rms = float(np.sqrt(np.mean(audio**2))) + if rms < 0.005: + return None + + # mlx_whisper needs a file path — write temp WAV + buf = io.BytesIO() + buf.write(b"RIFF") + buf.write(struct.pack(" None: + """Text message → CognitiveEvent → agent loop.""" + event = CognitiveEvent( + modality=ModalityType.TEXT, + content=text, + source_channel=self.channel_id, + confidence=1.0, + ) + await self.ws.send_json( + { + "type": "transcript", + "text": text, + "source": "text", + } + ) + if self._on_event: + await self._on_event(event) + + async def _handle_interrupt(self) -> None: + """Interrupt in-flight speech.""" + if self.pipeline_state.is_speaking: + self.pipeline_state.interrupt(reason="browser_interrupt") + await self.ws.send_json({"type": "interrupted"}) + + # ------------------------------------------------------------------ + # Helper methods (called by agent loop) + # ------------------------------------------------------------------ + + async def send_response_text(self, text: str) -> None: + """Send response text for display in chat panel.""" + if self._active: + try: + logger.info("send_response_text: %s", text[:100]) + await self.ws.send_json({"type": "response_text", "text": text}) + except Exception: + self._active = False + + async def send_response_complete(self, metrics: dict | None = None) -> None: + """Signal response is complete.""" + if self._active: + try: + await self.ws.send_json( + { + "type": "response_complete", + "metrics": 
metrics or {}, + } + ) + except Exception: + self._active = False + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def _cleanup(self) -> None: + """Deactivate channel and cancel pending TTS jobs on disconnect.""" + self._active = False + ch = self.bus._channels.get(self.channel_id) + if ch: + ch.active = False + cancelled = self.bus._queue_manager.cancel_channel(self.channel_id) + logger.info( + "BrowserChannel disconnected: %s (cancelled %d pending jobs)", + self.channel_id, + cancelled, + ) diff --git a/dashboard/index.html b/dashboard/index.html new file mode 100644 index 0000000..6b1f9f8 --- /dev/null +++ b/dashboard/index.html @@ -0,0 +1,696 @@ + + + + + +Mod³ Dashboard + + + + + +
+

Mod³

+
+ + Server +
+
+ + Mic +
+
+ + +
+ + +
+ Connecting... +
+ + + + +
+ + + + + +
+ + +
+ + +
+ + +
+ +
For voice, use headphones. Speak naturally — the system detects when you start and stop. Speak during playback to interrupt.
+
+ + + + + + + + + + + + + + + diff --git a/dashboard/playback.js b/dashboard/playback.js new file mode 100644 index 0000000..82be279 --- /dev/null +++ b/dashboard/playback.js @@ -0,0 +1,112 @@ +/** + * Streaming audio playback engine. + * Receives Int16 PCM chunks and plays them seamlessly via Web Audio API. + */ +class AudioPlayback { + constructor(sampleRate = 24000) { + this.sampleRate = sampleRate; + this.queue = []; + this.isPlaying = false; + this.audioContext = null; + this.currentSource = null; + this.nextStartTime = 0; + this.onPlaybackStart = null; + this.onPlaybackEnd = null; + this.sinkId = undefined; // output device ID + } + + _ensureContext() { + if (!this.audioContext || this.audioContext.state === "closed") { + this.audioContext = new AudioContext({ sampleRate: this.sampleRate }); + // Apply selected output device if set + if (this.sinkId !== undefined && this.audioContext.setSinkId) { + this.audioContext.setSinkId(this.sinkId).catch(() => {}); + } + } + if (this.audioContext.state === "suspended") { + this.audioContext.resume(); + } + } + + async setOutputDevice(deviceId) { + this.sinkId = deviceId; + if (this.audioContext && this.audioContext.setSinkId) { + await this.audioContext.setSinkId(deviceId); + } + } + + /** Enqueue raw Int16 PCM for playback. */ + enqueue(pcmArrayBuffer) { + this._ensureContext(); + + const int16 = new Int16Array(pcmArrayBuffer); + const float32 = new Float32Array(int16.length); + for (let i = 0; i < int16.length; i++) { + float32[i] = int16[i] / 32768; + } + + const buffer = this.audioContext.createBuffer(1, float32.length, this.sampleRate); + buffer.getChannelData(0).set(float32); + this.queue.push(buffer); + + if (!this.isPlaying) this._playNext(); + } + + /** Enqueue a full WAV file (with header) for playback. 
*/ + async enqueueWav(wavArrayBuffer) { + this._ensureContext(); + try { + const audioBuffer = await this.audioContext.decodeAudioData(wavArrayBuffer.slice(0)); + this.queue.push(audioBuffer); + if (!this.isPlaying) this._playNext(); + } catch (err) { + console.error("[AudioPlayback] Failed to decode WAV:", err); + } + } + + _playNext() { + if (this.queue.length === 0) { + this.isPlaying = false; + if (this.onPlaybackEnd) this.onPlaybackEnd(); + return; + } + + if (!this.isPlaying) { + this.isPlaying = true; + this.nextStartTime = this.audioContext.currentTime; + if (this.onPlaybackStart) this.onPlaybackStart(); + } + + const buffer = this.queue.shift(); + const source = this.audioContext.createBufferSource(); + source.buffer = buffer; + source.connect(this.audioContext.destination); + source.onended = () => this._playNext(); + + // Schedule this chunk right after the previous one for gapless playback + const startTime = Math.max(this.nextStartTime, this.audioContext.currentTime); + source.start(startTime); + this.nextStartTime = startTime + buffer.duration; + this.currentSource = source; + } + + flush() { + this.queue = []; + if (this.currentSource) { + try { this.currentSource.stop(); } catch {} + } + this.isPlaying = false; + this.nextStartTime = 0; + } + + setSampleRate(rate) { + if (rate !== this.sampleRate) { + this.sampleRate = rate; + // Close old context so next enqueue creates one with the right rate + if (this.audioContext) { + this.audioContext.close(); + this.audioContext = null; + } + } + } +} diff --git a/dashboard/transport.js b/dashboard/transport.js new file mode 100644 index 0000000..d0d02ad --- /dev/null +++ b/dashboard/transport.js @@ -0,0 +1,112 @@ +/** + * WebSocket transport for voice chat. + * Binary frames = audio, text frames = JSON control messages. 
+ */ +class VoiceTransport { + constructor(url, handlers) { + this.url = url; + this.handlers = handlers; // { onAudio, onTranscript, onResponseText, onResponseComplete, onInterrupted, onMetrics, onError, onOpen, onClose } + this.ws = null; + this.reconnectAttempts = 0; + this.maxReconnects = 3; + } + + connect() { + this.ws = new WebSocket(this.url); + this.ws.binaryType = "arraybuffer"; + + this.ws.onopen = () => { + this.reconnectAttempts = 0; + if (this.handlers.onOpen) this.handlers.onOpen(); + }; + + this.ws.onmessage = (event) => { + if (event.data instanceof ArrayBuffer) { + console.log(`[WS] Binary frame: ${event.data.byteLength} bytes`); + if (this.handlers.onAudio) this.handlers.onAudio(event.data); + } else if (event.data instanceof Blob) { + console.log(`[WS] Blob frame: ${event.data.size} bytes (converting)`); + event.data.arrayBuffer().then(buf => { + if (this.handlers.onAudio) this.handlers.onAudio(buf); + }); + } else { + try { + const msg = JSON.parse(event.data); + console.log(`[WS] JSON: ${msg.type}`, msg.type === 'response_text' ? 
msg.text?.substring(0, 80) : ''); + this._dispatch(msg); + } catch (e) { + console.error("Failed to parse WS message:", e, "data:", typeof event.data, event.data?.substring?.(0, 100)); + } + } + }; + + this.ws.onclose = (event) => { + if (this.handlers.onClose) this.handlers.onClose(event); + }; + + this.ws.onerror = (error) => { + console.error("WebSocket error:", error); + if (this.handlers.onError) this.handlers.onError(error); + }; + } + + _dispatch(msg) { + // Handle base64 audio message — decode and route to onAudio + if (msg.type === "audio" && msg.data) { + const binary = atob(msg.data); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i); + if (this.handlers.onAudio) this.handlers.onAudio(bytes.buffer); + if (this.handlers.onMetrics) this.handlers.onMetrics(msg); + return; + } + + const handlerMap = { + transcript: "onTranscript", + response_text: "onResponseText", + response_complete: "onResponseComplete", + interrupted: "onInterrupted", + metrics: "onMetrics", + error: "onError", + }; + const handler = handlerMap[msg.type]; + if (handler && this.handlers[handler]) { + this.handlers[handler](msg); + } + } + + sendAudio(pcmBuffer) { + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + this.ws.send(pcmBuffer); + } + } + + sendControl(msg) { + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify(msg)); + } + } + + endSpeech() { + this.sendControl({ type: "end_of_speech" }); + } + + interrupt() { + this.sendControl({ type: "interrupt" }); + } + + setConfig(config) { + this.sendControl({ type: "config", ...config }); + } + + disconnect() { + if (this.ws) { + this.ws.close(); + this.ws = null; + } + } + + get connected() { + return this.ws && this.ws.readyState === WebSocket.OPEN; + } +} diff --git a/dashboard/vad/bundle.min.js b/dashboard/vad/bundle.min.js new file mode 100644 index 0000000..77366c0 --- /dev/null +++ b/dashboard/vad/bundle.min.js @@ 
-0,0 +1,2 @@ +/*! For license information please see bundle.min.js.LICENSE.txt */ +!function(e,t){"object"==typeof exports&&"object"==typeof module?module.exports=t(require("onnxruntime-web")):"function"==typeof define&&define.amd?define(["onnxruntime-web"],t):"object"==typeof exports?exports.vad=t(require("onnxruntime-web")):e.vad=t(e.ort)}(self,(e=>(()=>{var t={647:(e,t,r)=>{"use strict";var o=(()=>{var e,t,o,n,s,a,i,l,u,d,c,p,f,h,m,w,g,y,b,v,S,A,E,T,_,O,P,M,C,R,F,x=Object.defineProperty,D=Object.getOwnPropertyDescriptor,U=Object.getOwnPropertyNames,L=Object.prototype.hasOwnProperty,I=r(936),B=(e,t)=>()=>(e&&(t=e(e=0)),t),N=(e,t)=>{for(var r in t)x(e,r,{get:t[r],enumerable:!0})},k=e=>((e,t,r,o)=>{if(t&&"object"==typeof t||"function"==typeof t)for(let r of U(t))!L.call(e,r)&&undefined!==r&&x(e,r,{get:()=>t[r],enumerable:!(o=D(t,r))||o.enumerable});return e})(x({},"__esModule",{value:!0}),e),$=B((()=>{e=new Map,t=[],o=(r,o,n)=>{if(!o||"function"!=typeof o.init||"function"!=typeof o.createInferenceSessionHandler)throw new TypeError("not a valid backend");{let s=e.get(r);if(void 0===s)e.set(r,{backend:o,priority:n});else{if(s.priority>n)return;if(s.priority===n&&s.backend!==o)throw new Error(`cannot register backend "${r}" using priority ${n}`)}if(n>=0){let o=t.indexOf(r);-1!==o&&t.splice(o,1);for(let o=0;o{let r=e.get(t);if(!r)return"backend not found.";if(r.initialized)return r.backend;if(r.aborted)return r.error;{let e=!!r.initPromise;try{return e||(r.initPromise=r.backend.init(t)),await r.initPromise,r.initialized=!0,r.backend}catch(t){return e||(r.error=`${t}`,r.aborted=!0),r.error}finally{delete r.initPromise}}},s=async e=>{let r,o=e.executionProviders||[],s=o.map((e=>"string"==typeof e?e:e.name)),a=0===s.length?t:s,i=[],l=new Set;for(let e of a){let t=await n(e);"string"==typeof t?i.push({name:e,err:t}):(r||(r=t),r===t&&l.add(e))}if(!r)throw new Error(`no available backend found. 
ERR: ${i.map((e=>`[${e.name}] ${e.err}`)).join(", ")}`);for(let{name:e,err:t}of i)s.includes(e)&&console.warn(`removing requested execution provider "${e}" from session options because it is not available: ${t}`);let u=o.filter((e=>l.has("string"==typeof e?e:e.name)));return[r,new Proxy(e,{get:(e,t)=>"executionProviders"===t?u:Reflect.get(e,t)})]}})),j=B((()=>{$()})),V=B((()=>{a="1.22.0"})),G=B((()=>{V(),i="warning",l={wasm:{},webgl:{},webgpu:{},versions:{common:a},set logLevel(e){if(void 0!==e){if("string"!=typeof e||-1===["verbose","info","warning","error","fatal"].indexOf(e))throw new Error(`Unsupported logging level: ${e}`);i=e}},get logLevel(){return i}},Object.defineProperty(l,"logLevel",{enumerable:!0})})),W=B((()=>{G(),u=l})),z=B((()=>{d=(e,t)=>{let r=typeof document<"u"?document.createElement("canvas"):new OffscreenCanvas(1,1);r.width=e.dims[3],r.height=e.dims[2];let o=r.getContext("2d");if(null!=o){let n,s;void 0!==t?.tensorLayout&&"NHWC"===t.tensorLayout?(n=e.dims[2],s=e.dims[3]):(n=e.dims[3],s=e.dims[2]);let a,i,l=void 0!==t?.format?t.format:"RGB",u=t?.norm;void 0===u||void 0===u.mean?a=[255,255,255,255]:"number"==typeof u.mean?a=[u.mean,u.mean,u.mean,u.mean]:(a=[u.mean[0],u.mean[1],u.mean[2],0],void 0!==u.mean[3]&&(a[3]=u.mean[3])),void 0===u||void 0===u.bias?i=[0,0,0,0]:"number"==typeof u.bias?i=[u.bias,u.bias,u.bias,u.bias]:(i=[u.bias[0],u.bias[1],u.bias[2],0],void 0!==u.bias[3]&&(i[3]=u.bias[3]));let d=s*n,c=0,p=d,f=2*d,h=-1;"RGBA"===l?(c=0,p=d,f=2*d,h=3*d):"RGB"===l?(c=0,p=d,f=2*d):"RBG"===l&&(c=0,f=d,p=2*d);for(let t=0;t{let r,o=typeof document<"u"?document.createElement("canvas").getContext("2d"):new OffscreenCanvas(1,1).getContext("2d");if(null==o)throw new Error("Can not access image data");{let n,s,a;void 0!==t?.tensorLayout&&"NHWC"===t.tensorLayout?(n=e.dims[2],s=e.dims[1],a=e.dims[3]):(n=e.dims[3],s=e.dims[2],a=e.dims[1]);let i,l,u=void 0!==t&&void 0!==t.format?t.format:"RGB",d=t?.norm;void 0===d||void 
0===d.mean?i=[255,255,255,255]:"number"==typeof d.mean?i=[d.mean,d.mean,d.mean,d.mean]:(i=[d.mean[0],d.mean[1],d.mean[2],255],void 0!==d.mean[3]&&(i[3]=d.mean[3])),void 0===d||void 0===d.bias?l=[0,0,0,0]:"number"==typeof d.bias?l=[d.bias,d.bias,d.bias,d.bias]:(l=[d.bias[0],d.bias[1],d.bias[2],0],void 0!==d.bias[3]&&(l[3]=d.bias[3]));let c=s*n;if(void 0!==t&&(void 0!==t.format&&4===a&&"RGBA"!==t.format||3===a&&"RGB"!==t.format&&"BGR"!==t.format))throw new Error("Tensor format doesn't match input tensor dims");let p=4,f=0,h=1,m=2,w=3,g=0,y=c,b=2*c,v=-1;"RGBA"===u?(g=0,y=c,b=2*c,v=3*c):"RGB"===u?(g=0,y=c,b=2*c):"RBG"===u&&(g=0,b=c,y=2*c),r=o.createImageData(n,s);for(let t=0;t{J(),p=(e,t)=>{if(void 0===e)throw new Error("Image buffer must be defined");if(void 0===t.height||void 0===t.width)throw new Error("Image height and width must be defined");if("NHWC"===t.tensorLayout)throw new Error("NHWC Tensor layout is not supported yet");let r,o,{height:n,width:s}=t,a=t.norm??{mean:255,bias:0};r="number"==typeof a.mean?[a.mean,a.mean,a.mean,a.mean]:[a.mean[0],a.mean[1],a.mean[2],a.mean[3]??255],o="number"==typeof a.bias?[a.bias,a.bias,a.bias,a.bias]:[a.bias[0],a.bias[1],a.bias[2],a.bias[3]??0];let i=void 0!==t.format?t.format:"RGBA",l=void 0!==t.tensorFormat&&void 0!==t.tensorFormat?t.tensorFormat:"RGB",u=n*s,d="RGBA"===l?new Float32Array(4*u):new Float32Array(3*u),c=4,p=0,f=1,h=2,m=3,w=0,g=u,y=2*u,b=-1;"RGB"===i&&(c=3,p=0,f=1,h=2,m=-1),"RGBA"===l?b=3*u:"RBG"===l?(w=0,y=u,g=2*u):"BGR"===l&&(y=0,g=u,w=2*u);for(let t=0;t{let r,o=typeof HTMLImageElement<"u"&&e instanceof HTMLImageElement,n=typeof ImageData<"u"&&e instanceof ImageData,s=typeof ImageBitmap<"u"&&e instanceof ImageBitmap,a="string"==typeof e,i=t??{},l=()=>{if(typeof document<"u")return document.createElement("canvas");if(typeof OffscreenCanvas<"u")return new OffscreenCanvas(1,1);throw new Error("Canvas is not supported")},u=e=>typeof HTMLCanvasElement<"u"&&e instanceof HTMLCanvasElement||e instanceof 
OffscreenCanvas?e.getContext("2d"):null;if(o){let o=l();o.width=e.width,o.height=e.height;let n=u(o);if(null==n)throw new Error("Can not access image data");{let o=e.height,s=e.width;if(void 0!==t&&void 0!==t.resizedHeight&&void 0!==t.resizedWidth&&(o=t.resizedHeight,s=t.resizedWidth),void 0!==t){if(i=t,void 0!==t.tensorFormat)throw new Error("Image input config format must be RGBA for HTMLImageElement");i.tensorFormat="RGBA",i.height=o,i.width=s}else i.tensorFormat="RGBA",i.height=o,i.width=s;n.drawImage(e,0,0),r=n.getImageData(0,0,s,o).data}}else{if(!n){if(s){if(void 0===t)throw new Error("Please provide image config with format for Imagebitmap");let o=l();o.width=e.width,o.height=e.height;let n=u(o);if(null!=n){let t=e.height,o=e.width;return n.drawImage(e,0,0,o,t),r=n.getImageData(0,0,o,t).data,i.height=t,i.width=o,p(r,i)}throw new Error("Can not access image data")}if(a)return new Promise(((t,r)=>{let o=l(),n=u(o);if(!e||!n)return r();let s=new Image;s.crossOrigin="Anonymous",s.src=e,s.onload=()=>{o.width=s.width,o.height=s.height,n.drawImage(s,0,0,o.width,o.height);let e=n.getImageData(0,0,o.width,o.height);i.height=o.height,i.width=o.width,t(p(e.data,i))}}));throw new Error("Input data provided is not supported - aborted tensor creation")}{let o,n;if(void 0!==t&&void 0!==t.resizedWidth&&void 0!==t.resizedHeight?(o=t.resizedHeight,n=t.resizedWidth):(o=e.height,n=e.width),void 0!==t&&(i=t),i.format="RGBA",i.height=o,i.width=n,void 0!==t){let t=l();t.width=n,t.height=o;let s=u(t);if(null==s)throw new Error("Can not access image data");s.putImageData(e,0,0),r=s.getImageData(0,0,n,o).data}else r=e.data}}if(void 0!==r)return p(r,i);throw new Error("Input data provided is not supported - aborted tensor creation")},h=(e,t)=>{let{width:r,height:o,download:n,dispose:s}=t;return new T({location:"texture",type:"float32",texture:e,dims:[1,o,r,4],download:n,dispose:s})},m=(e,t)=>{let{dataType:r,dims:o,download:n,dispose:s}=t;return new 
T({location:"gpu-buffer",type:r??"float32",gpuBuffer:e,dims:o,download:n,dispose:s})},w=(e,t)=>{let{dataType:r,dims:o,download:n,dispose:s}=t;return new T({location:"ml-tensor",type:r??"float32",mlTensor:e,dims:o,download:n,dispose:s})},g=(e,t,r)=>new T({location:"cpu-pinned",type:e,data:t,dims:r??[t.length]})})),Z=B((()=>{y=new Map([["float32",Float32Array],["uint8",Uint8Array],["int8",Int8Array],["uint16",Uint16Array],["int16",Int16Array],["int32",Int32Array],["bool",Uint8Array],["float64",Float64Array],["uint32",Uint32Array],["int4",Uint8Array],["uint4",Uint8Array]]),b=new Map([[Float32Array,"float32"],[Uint8Array,"uint8"],[Int8Array,"int8"],[Uint16Array,"uint16"],[Int16Array,"int16"],[Int32Array,"int32"],[Float64Array,"float64"],[Uint32Array,"uint32"]]),v=!1,S=()=>{if(!v){v=!0;let e=typeof BigInt64Array<"u"&&BigInt64Array.from,t=typeof BigUint64Array<"u"&&BigUint64Array.from,r=globalThis.Float16Array,o=typeof r<"u"&&r.from;e&&(y.set("int64",BigInt64Array),b.set(BigInt64Array,"int64")),t&&(y.set("uint64",BigUint64Array),b.set(BigUint64Array,"uint64")),o?(y.set("float16",r),b.set(r,"float16")):y.set("float16",Uint16Array)}}})),q=B((()=>{J(),A=e=>{let t=1;for(let r=0;r{switch(e.location){case"cpu":return new T(e.type,e.data,t);case"cpu-pinned":return new T({location:"cpu-pinned",data:e.data,type:e.type,dims:t});case"texture":return new T({location:"texture",texture:e.texture,type:e.type,dims:t});case"gpu-buffer":return new T({location:"gpu-buffer",gpuBuffer:e.gpuBuffer,type:e.type,dims:t});case"ml-tensor":return new T({location:"ml-tensor",mlTensor:e.mlTensor,type:e.type,dims:t});default:throw new Error(`tensorReshape: tensor location ${e.location} is not supported`)}}})),J=B((()=>{z(),H(),Z(),q(),T=class{constructor(e,t,r){let o,n;if(S(),"object"==typeof e&&"location"in e)switch(this.dataLocation=e.location,o=e.type,n=e.dims,e.location){case"cpu-pinned":{let t=y.get(o);if(!t)throw new TypeError(`unsupported type "${o}" to create tensor from pinned 
buffer`);if(!(e.data instanceof t))throw new TypeError(`buffer should be of type ${t.name}`);this.cpuData=e.data;break}case"texture":if("float32"!==o)throw new TypeError(`unsupported type "${o}" to create tensor from texture`);this.gpuTextureData=e.texture,this.downloader=e.download,this.disposer=e.dispose;break;case"gpu-buffer":if("float32"!==o&&"float16"!==o&&"int32"!==o&&"int64"!==o&&"uint32"!==o&&"uint8"!==o&&"bool"!==o&&"uint4"!==o&&"int4"!==o)throw new TypeError(`unsupported type "${o}" to create tensor from gpu buffer`);this.gpuBufferData=e.gpuBuffer,this.downloader=e.download,this.disposer=e.dispose;break;case"ml-tensor":if("float32"!==o&&"float16"!==o&&"int32"!==o&&"int64"!==o&&"uint32"!==o&&"uint64"!==o&&"int8"!==o&&"uint8"!==o&&"bool"!==o&&"uint4"!==o&&"int4"!==o)throw new TypeError(`unsupported type "${o}" to create tensor from MLTensor`);this.mlTensorData=e.mlTensor,this.downloader=e.download,this.disposer=e.dispose;break;default:throw new Error(`Tensor constructor: unsupported location '${this.dataLocation}'`)}else{let s,a;if("string"==typeof e)if(o=e,a=r,"string"===e){if(!Array.isArray(t))throw new TypeError("A string tensor's data must be a string array.");s=t}else{let r=y.get(e);if(void 0===r)throw new TypeError(`Unsupported tensor type: ${e}.`);if(Array.isArray(t)){if("float16"===e&&r===Uint16Array||"uint4"===e||"int4"===e)throw new TypeError(`Creating a ${e} tensor from number array is not supported. 
Please use ${r.name} as data.`);s="uint64"===e||"int64"===e?r.from(t,BigInt):r.from(t)}else if(t instanceof r)s=t;else if(t instanceof Uint8ClampedArray){if("uint8"!==e)throw new TypeError("A Uint8ClampedArray tensor's data must be type of uint8");s=Uint8Array.from(t)}else{if(!("float16"===e&&t instanceof Uint16Array&&r!==Uint16Array))throw new TypeError(`A ${o} tensor's data must be type of ${r}`);s=new globalThis.Float16Array(t.buffer,t.byteOffset,t.length)}}else if(a=t,Array.isArray(e)){if(0===e.length)throw new TypeError("Tensor type cannot be inferred from an empty array.");let t=typeof e[0];if("string"===t)o="string",s=e;else{if("boolean"!==t)throw new TypeError(`Invalid element type of data array: ${t}.`);o="bool",s=Uint8Array.from(e)}}else if(e instanceof Uint8ClampedArray)o="uint8",s=Uint8Array.from(e);else{let t=b.get(e.constructor);if(void 0===t)throw new TypeError(`Unsupported type for tensor data: ${e.constructor}.`);o=t,s=e}if(void 0===a)a=[s.length];else if(!Array.isArray(a))throw new TypeError("A tensor's dims must be a number array");n=a,this.cpuData=s,this.dataLocation="cpu"}let s=A(n);if(this.cpuData&&s!==this.cpuData.length&&("uint4"!==o&&"int4"!==o||Math.ceil(s/2)!==this.cpuData.length))throw new Error(`Tensor's size(${s}) does not match data length(${this.cpuData.length}).`);this.type=o,this.dims=n,this.size=s}static async fromImage(e,t){return f(e,t)}static fromTexture(e,t){return h(e,t)}static fromGpuBuffer(e,t){return m(e,t)}static fromMLTensor(e,t){return w(e,t)}static fromPinnedBuffer(e,t,r){return g(e,t,r)}toDataURL(e){return d(this,e)}toImageData(e){return c(this,e)}get data(){if(this.ensureValid(),!this.cpuData)throw new Error("The data is not on CPU. 
Use `getData()` to download GPU data to CPU, or use `texture` or `gpuBuffer` property to access the GPU data directly.");return this.cpuData}get location(){return this.dataLocation}get texture(){if(this.ensureValid(),!this.gpuTextureData)throw new Error("The data is not stored as a WebGL texture.");return this.gpuTextureData}get gpuBuffer(){if(this.ensureValid(),!this.gpuBufferData)throw new Error("The data is not stored as a WebGPU buffer.");return this.gpuBufferData}get mlTensor(){if(this.ensureValid(),!this.mlTensorData)throw new Error("The data is not stored as a WebNN MLTensor.");return this.mlTensorData}async getData(e){switch(this.ensureValid(),this.dataLocation){case"cpu":case"cpu-pinned":return this.data;case"texture":case"gpu-buffer":case"ml-tensor":if(!this.downloader)throw new Error("The current tensor is not created with a specified data downloader.");if(this.isDownloading)throw new Error("The current tensor is being downloaded.");try{this.isDownloading=!0;let t=await this.downloader();return this.downloader=void 0,this.dataLocation="cpu",this.cpuData=t,e&&this.disposer&&(this.disposer(),this.disposer=void 0),t}finally{this.isDownloading=!1}default:throw new Error(`cannot get data from location: ${this.dataLocation}`)}}dispose(){if(this.isDownloading)throw new Error("The current tensor is being downloaded.");this.disposer&&(this.disposer(),this.disposer=void 0),this.cpuData=void 0,this.gpuTextureData=void 0,this.gpuBufferData=void 0,this.mlTensorData=void 0,this.downloader=void 0,this.isDownloading=void 0,this.dataLocation="none"}ensureValid(){if("none"===this.dataLocation)throw new Error("The tensor is disposed.")}reshape(e){if(this.ensureValid(),this.downloader||this.disposer)throw new Error("Cannot reshape a tensor that owns GPU resource.");return E(this,e)}}})),K=B((()=>{J(),_=T})),Q=B((()=>{G(),O=(e,t)=>{(typeof l.trace>"u"?!l.wasm.trace:!l.trace)||console.timeStamp(`${e}::ORT::${t}`)},P=(e,t)=>{let r=(new 
Error).stack?.split(/\r\n|\r|\n/g)||[],o=!1;for(let n=0;n{(typeof l.trace>"u"?!l.wasm.trace:!l.trace)||P("BEGIN",e)},C=e=>{(typeof l.trace>"u"?!l.wasm.trace:!l.trace)||P("END",e)}})),X=B((()=>{$(),K(),Q(),R=class e{constructor(e){this.handler=e}async run(e,t,r){M();let o={},n={};if("object"!=typeof e||null===e||e instanceof _||Array.isArray(e))throw new TypeError("'feeds' must be an object that use input names as keys and OnnxValue as corresponding values.");let s=!0;if("object"==typeof t){if(null===t)throw new TypeError("Unexpected argument[1]: cannot be null.");if(t instanceof _)throw new TypeError("'fetches' cannot be a Tensor");if(Array.isArray(t)){if(0===t.length)throw new TypeError("'fetches' cannot be an empty array.");s=!1;for(let e of t){if("string"!=typeof e)throw new TypeError("'fetches' must be a string array or an object.");if(-1===this.outputNames.indexOf(e))throw new RangeError(`'fetches' contains invalid output name: ${e}.`);o[e]=null}if("object"==typeof r&&null!==r)n=r;else if(typeof r<"u")throw new TypeError("'options' must be an object.")}else{let e=!1,a=Object.getOwnPropertyNames(t);for(let r of this.outputNames)if(-1!==a.indexOf(r)){let n=t[r];(null===n||n instanceof _)&&(e=!0,s=!1,o[r]=n)}if(e){if("object"==typeof r&&null!==r)n=r;else if(typeof r<"u")throw new TypeError("'options' must be an object.")}else n=t}}else if(typeof t<"u")throw new TypeError("Unexpected argument[1]: must be 'fetches' or 'options'.");for(let t of this.inputNames)if(typeof e[t]>"u")throw new Error(`input '${t}' is missing in 'feeds'.`);if(s)for(let e of this.outputNames)o[e]=null;let a=await this.handler.run(e,o,n),i={};for(let e in a)if(Object.hasOwnProperty.call(a,e)){let t=a[e];i[e]=t instanceof _?t:new _(t.type,t.data,t.dims)}return C(),i}async release(){return this.handler.dispose()}static async create(t,r,o,n){M();let a,i={};if("string"==typeof t){if(a=t,"object"==typeof r&&null!==r)i=r;else if(typeof r<"u")throw new TypeError("'options' must be an object.")}else 
if(t instanceof Uint8Array){if(a=t,"object"==typeof r&&null!==r)i=r;else if(typeof r<"u")throw new TypeError("'options' must be an object.")}else{if(!(t instanceof ArrayBuffer||typeof SharedArrayBuffer<"u"&&t instanceof SharedArrayBuffer))throw new TypeError("Unexpected argument[0]: must be 'path' or 'buffer'.");{let e=t,s=0,l=t.byteLength;if("object"==typeof r&&null!==r)i=r;else if("number"==typeof r){if(s=r,!Number.isSafeInteger(s))throw new RangeError("'byteOffset' must be an integer.");if(s<0||s>=e.byteLength)throw new RangeError(`'byteOffset' is out of range [0, ${e.byteLength}).`);if(l=t.byteLength-s,"number"==typeof o){if(l=o,!Number.isSafeInteger(l))throw new RangeError("'byteLength' must be an integer.");if(l<=0||s+l>e.byteLength)throw new RangeError(`'byteLength' is out of range (0, ${e.byteLength-s}].`);if("object"==typeof n&&null!==n)i=n;else if(typeof n<"u")throw new TypeError("'options' must be an object.")}else if(typeof o<"u")throw new TypeError("'byteLength' must be a number.")}else if(typeof r<"u")throw new TypeError("'options' must be an object.");a=new Uint8Array(e,s,l)}}let[l,u]=await s(i),d=await l.createInferenceSessionHandler(a,u);return C(),new e(d)}startProfiling(){this.handler.startProfiling()}endProfiling(){this.handler.endProfiling()}get inputNames(){return this.handler.inputNames}get outputNames(){return this.handler.outputNames}get inputMetadata(){return this.handler.inputMetadata}get outputMetadata(){return this.handler.outputMetadata}}})),Y=B((()=>{X(),F=R})),ee=B((()=>{})),te=B((()=>{})),re=B((()=>{})),oe=B((()=>{})),ne={};N(ne,{InferenceSession:()=>F,TRACE:()=>O,TRACE_FUNC_BEGIN:()=>M,TRACE_FUNC_END:()=>C,Tensor:()=>_,env:()=>u,registerBackend:()=>o});var se=B((()=>{j(),W(),Y(),K(),ee(),te(),Q(),re(),oe()})),ae=B((()=>{})),ie={};N(ie,{default:()=>de});var 
le,ue,de,ce,pe,fe,he,me,we,ge,ye,be,ve,Se,Ae,Ee,Te,_e,Oe,Pe,Me,Ce,Re,Fe,xe,De,Ue,Le,Ie,Be,Ne,ke,$e,je,Ve,Ge,We,ze,He,Ze,qe,Je,Ke,Qe,Xe,Ye,et,tt,rt,ot,nt,st,at,it,lt,ut,dt,ct,pt,ft,ht,mt,wt,gt,yt,bt,vt,St,At,Et,Tt,_t,Ot,Pt,Mt,Ct,Rt=B((()=>{Nt(),xt(),Ft(),le="ort-wasm-proxy-worker",(ue=globalThis.self?.name===le)&&(self.onmessage=e=>{let{type:t,in:r}=e.data;try{switch(t){case"init-wasm":Ce(r.wasm).then((()=>{Qe(r).then((()=>{postMessage({type:t})}),(e=>{postMessage({type:t,err:e})}))}),(e=>{postMessage({type:t,err:e})}));break;case"init-ep":{let{epName:e,env:o}=r;Xe(o,e).then((()=>{postMessage({type:t})}),(e=>{postMessage({type:t,err:e})}));break}case"copy-from":{let{buffer:e}=r,o=rt(e);postMessage({type:t,out:o});break}case"create":{let{model:e,options:o}=r;ot(e,o).then((e=>{postMessage({type:t,out:e})}),(e=>{postMessage({type:t,err:e})}));break}case"release":nt(r),postMessage({type:t});break;case"run":{let{sessionId:e,inputIndices:o,inputs:n,outputIndices:s,options:a}=r;at(e,o,n,s,new Array(s.length).fill(null),a).then((e=>{e.some((e=>"cpu"!==e[3]))?postMessage({type:t,err:"Proxy does not support non-cpu tensor location."}):postMessage({type:t,out:e},lt([...n,...e]))}),(e=>{postMessage({type:t,err:e})}));break}case"end-profiling":it(r),postMessage({type:t})}}catch(e){postMessage({type:t,err:e})}}),de=ue?null:e=>new Worker(e??pe,{type:"classic",name:le})})),Ft=B((()=>{ae(),ce=typeof location>"u"?void 0:location.origin,pe=typeof document<"u"?document.currentScript?.src:typeof self<"u"?self.location?.href:void 0,fe=()=>{if(pe&&!pe.startsWith("blob:"))return pe.substring(0,pe.lastIndexOf("/")+1)},he=(e,t)=>{try{let r=t??pe;return(r?new URL(e,r):new URL(e)).origin===ce}catch{return!1}},me=(e,t)=>{let r=t??pe;try{return(r?new URL(e,r):new URL(e)).href}catch{return}},we=(e,t)=>`${t??"./"}${e}`,ge=async e=>{let t=await(await fetch(e,{credentials:"same-origin"})).blob();return URL.createObjectURL(t)},ye=async e=>(await 
import(e)).default,be=(Rt(),k(ie)).default,ve=async()=>{if(!pe)throw new Error("Failed to load proxy worker: cannot determine the script source URL.");if(he(pe))return[void 0,be()];let e=await ge(pe);return[e,be(e)]},Se=async(e,t,r)=>{{let o="ort-wasm-simd-threaded.mjs",n=e??me(o,t),s=r&&n&&!he(n,t),a=s?await ge(n):n??we(o,t);return[s?a:void 0,await ye(a)]}}})),xt=B((()=>{Ft(),Ee=!1,Te=!1,_e=!1,Oe=()=>{if(typeof SharedArrayBuffer>"u")return!1;try{return typeof MessageChannel<"u"&&(new MessageChannel).port1.postMessage(new SharedArrayBuffer(1)),WebAssembly.validate(new Uint8Array([0,97,115,109,1,0,0,0,1,4,1,96,0,0,3,2,1,0,5,4,1,3,1,1,10,11,1,9,0,65,0,254,16,2,0,26,11]))}catch{return!1}},Pe=()=>{try{return WebAssembly.validate(new Uint8Array([0,97,115,109,1,0,0,0,1,4,1,96,0,0,3,2,1,0,10,30,1,28,0,65,0,253,15,253,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,253,186,1,26,11]))}catch{return!1}},Me=()=>{try{return WebAssembly.validate(new Uint8Array([0,97,115,109,1,0,0,0,1,5,1,96,0,1,123,3,2,1,0,10,19,1,17,0,65,1,253,15,65,2,253,15,65,3,253,15,253,147,2,11]))}catch{return!1}},Ce=async e=>{if(Ee)return Promise.resolve();if(Te)throw new Error("multiple calls to 'initializeWebAssembly()' detected.");if(_e)throw new Error("previous call to 'initializeWebAssembly()' failed.");Te=!0;let t=e.initTimeout,r=e.numThreads;if(!1!==e.simd)if("relaxed"===e.simd){if(!Me())throw new Error("Relaxed WebAssembly SIMD is not supported in the current environment.")}else if(!Pe())throw new Error("WebAssembly SIMD is not supported in the current environment.");let o=Oe();r>1&&!o&&(typeof self<"u"&&!self.crossOriginIsolated&&console.warn("env.wasm.numThreads is set to "+r+", but this will not work unless you enable crossOriginIsolated mode. See https://web.dev/cross-origin-isolation-guide/ for more info."),console.warn("WebAssembly multi-threading is not supported in the current environment. 
Falling back to single-threading."),e.numThreads=r=1);let n=e.wasmPaths,s="string"==typeof n?n:void 0,a=n?.mjs,i=a?.href??a,l=n?.wasm,u=l?.href??l,d=e.wasmBinary,[c,p]=await Se(i,s,r>1),f=!1,h=[];if(t>0&&h.push(new Promise((e=>{setTimeout((()=>{f=!0,e()}),t)}))),h.push(new Promise(((e,t)=>{let o={numThreads:r};if(d)o.wasmBinary=d;else if(u||s)o.locateFile=e=>u??s+e;else if(i&&0!==i.indexOf("blob:"))o.locateFile=e=>new URL(e,i).href;else if(c){let e=fe();e&&(o.locateFile=t=>e+t)}p(o).then((t=>{Te=!1,Ee=!0,Ae=t,e(),c&&URL.revokeObjectURL(c)}),(e=>{Te=!1,_e=!0,t(e)}))}))),await Promise.race(h),f)throw new Error(`WebAssembly backend initializing failed due to timeout: ${t}ms`)},Re=()=>{if(Ee&&Ae)return Ae;throw new Error("WebAssembly is not initialized yet.")}})),Dt=B((()=>{xt(),Fe=(e,t)=>{let r=Re(),o=r.lengthBytesUTF8(e)+1,n=r._malloc(o);return r.stringToUTF8(e,n,o),t.push(n),n},xe=(e,t,r,o)=>{if("object"==typeof e&&null!==e){if(r.has(e))throw new Error("Circular reference in options");r.add(e)}Object.entries(e).forEach((([e,n])=>{let s=t?t+e:e;if("object"==typeof n)xe(n,s+".",r,o);else if("string"==typeof n||"number"==typeof n)o(s,n.toString());else{if("boolean"!=typeof n)throw new Error("Can't handle extra config type: "+typeof n);o(s,n?"1":"0")}}))},De=e=>{let t=Re(),r=t.stackSave();try{let r=t.PTR_SIZE,o=t.stackAlloc(2*r);t._OrtGetLastError(o,o+r);let n=Number(t.getValue(o,4===r?"i32":"i64")),s=t.getValue(o+r,"*"),a=s?t.UTF8ToString(s):"";throw new Error(`${e} ERROR_CODE: ${n}, ERROR_MESSAGE: ${a}`)}finally{t.stackRestore(r)}}})),Ut=B((()=>{xt(),Dt(),Ue=e=>{let t=Re(),r=0,o=[],n=e||{};try{if(void 0===e?.logSeverityLevel)n.logSeverityLevel=2;else if("number"!=typeof e.logSeverityLevel||!Number.isInteger(e.logSeverityLevel)||e.logSeverityLevel<0||e.logSeverityLevel>4)throw new Error(`log serverity level is not valid: ${e.logSeverityLevel}`);if(void 0===e?.logVerbosityLevel)n.logVerbosityLevel=0;else if("number"!=typeof 
e.logVerbosityLevel||!Number.isInteger(e.logVerbosityLevel))throw new Error(`log verbosity level is not valid: ${e.logVerbosityLevel}`);void 0===e?.terminate&&(n.terminate=!1);let s=0;return void 0!==e?.tag&&(s=Fe(e.tag,o)),r=t._OrtCreateRunOptions(n.logSeverityLevel,n.logVerbosityLevel,!!n.terminate,s),0===r&&De("Can't create run options."),void 0!==e?.extra&&xe(e.extra,"",new WeakSet,((e,n)=>{let s=Fe(e,o),a=Fe(n,o);0!==t._OrtAddRunConfigEntry(r,s,a)&&De(`Can't set a run config entry: ${e} - ${n}.`)})),[r,o]}catch(e){throw 0!==r&&t._OrtReleaseRunOptions(r),o.forEach((e=>t._free(e))),e}}})),Lt=B((()=>{xt(),Dt(),Le=e=>{switch(e){case"disabled":return 0;case"basic":return 1;case"extended":return 2;case"all":return 99;default:throw new Error(`unsupported graph optimization level: ${e}`)}},Ie=e=>{switch(e){case"sequential":return 0;case"parallel":return 1;default:throw new Error(`unsupported execution mode: ${e}`)}},Be=e=>{e.extra||(e.extra={}),e.extra.session||(e.extra.session={});let t=e.extra.session;t.use_ort_model_bytes_directly||(t.use_ort_model_bytes_directly="1"),e.executionProviders&&e.executionProviders.some((e=>"webgpu"===("string"==typeof e?e:e.name)))&&(e.enableMemPattern=!1)},Ne=(e,t,r,o)=>{let n=Fe(t,o),s=Fe(r,o);0!==Re()._OrtAddSessionConfigEntry(e,n,s)&&De(`Can't set a session config entry: ${t} - ${r}.`)},ke=async(e,t,r)=>{for(let o of t){let t="string"==typeof o?o:o.name,n=[];switch(t){case"webnn":if(t="WEBNN","string"!=typeof o){let t=o?.deviceType;t&&Ne(e,"deviceType",t,r)}break;case"webgpu":if(t="JS","string"!=typeof o){let t=o;if(t?.preferredLayout){if("NCHW"!==t.preferredLayout&&"NHWC"!==t.preferredLayout)throw new Error(`preferredLayout must be either 'NCHW' or 'NHWC': ${t.preferredLayout}`);Ne(e,"preferredLayout",t.preferredLayout,r)}}break;case"wasm":case"cpu":continue;default:throw new Error(`not supported execution provider: ${t}`)}let 
s=Fe(t,r),a=n.length,i=0,l=0;if(a>0){i=Re()._malloc(a*Re().PTR_SIZE),r.push(i),l=Re()._malloc(a*Re().PTR_SIZE),r.push(l);for(let e=0;e{let t=Re(),r=0,o=[],n=e||{};Be(n);try{let e=Le(n.graphOptimizationLevel??"all"),s=Ie(n.executionMode??"sequential"),a="string"==typeof n.logId?Fe(n.logId,o):0,i=n.logSeverityLevel??2;if(!Number.isInteger(i)||i<0||i>4)throw new Error(`log serverity level is not valid: ${i}`);let l=n.logVerbosityLevel??0;if(!Number.isInteger(l)||l<0||l>4)throw new Error(`log verbosity level is not valid: ${l}`);let u="string"==typeof n.optimizedModelFilePath?Fe(n.optimizedModelFilePath,o):0;if(r=t._OrtCreateSessionOptions(e,!!n.enableCpuMemArena,!!n.enableMemPattern,s,!!n.enableProfiling,0,a,i,l,u),0===r&&De("Can't create session options."),n.executionProviders&&await ke(r,n.executionProviders,o),void 0!==n.enableGraphCapture){if("boolean"!=typeof n.enableGraphCapture)throw new Error(`enableGraphCapture must be a boolean value: ${n.enableGraphCapture}`);Ne(r,"enableGraphCapture",n.enableGraphCapture.toString(),o)}if(n.freeDimensionOverrides)for(let[e,s]of Object.entries(n.freeDimensionOverrides)){if("string"!=typeof e)throw new Error(`free dimension override name must be a string: ${e}`);if("number"!=typeof s||!Number.isInteger(s)||s<0)throw new Error(`free dimension override value must be a non-negative integer: ${s}`);let n=Fe(e,o);0!==t._OrtAddFreeDimensionOverride(r,n,s)&&De(`Can't set a free dimension override: ${e} - ${s}.`)}return void 0!==n.extra&&xe(n.extra,"",new WeakSet,((e,t)=>{Ne(r,e,t,o)})),[r,o]}catch(e){throw 0!==r&&0!==t._OrtReleaseSessionOptions(r)&&De("Can't release session options."),o.forEach((e=>t._free(e))),e}}})),It=B((()=>{je=e=>{switch(e){case"int8":return 3;case"uint8":return 2;case"bool":return 9;case"int16":return 5;case"uint16":return 4;case"int32":return 6;case"uint32":return 12;case"float16":return 10;case"float32":return 1;case"float64":return 11;case"string":return 8;case"int64":return 7;case"uint64":return 
13;case"int4":return 22;case"uint4":return 21;default:throw new Error(`unsupported data type: ${e}`)}},Ve=e=>{switch(e){case 3:return"int8";case 2:return"uint8";case 9:return"bool";case 5:return"int16";case 4:return"uint16";case 6:return"int32";case 12:return"uint32";case 10:return"float16";case 1:return"float32";case 11:return"float64";case 8:return"string";case 7:return"int64";case 13:return"uint64";case 22:return"int4";case 21:return"uint4";default:throw new Error(`unsupported data type: ${e}`)}},Ge=(e,t)=>{let r=[-1,4,1,1,2,2,4,8,-1,1,2,8,4,8,-1,-1,-1,-1,-1,-1,-1,.5,.5][e],o="number"==typeof t?t:t.reduce(((e,t)=>e*t),1);return r>0?Math.ceil(o*r):void 0},We=e=>{switch(e){case"float16":return typeof Float16Array<"u"&&Float16Array.from?Float16Array:Uint16Array;case"float32":return Float32Array;case"uint8":case"bool":return Uint8Array;case"int8":return Int8Array;case"uint16":return Uint16Array;case"int16":return Int16Array;case"int32":return Int32Array;case"float64":return Float64Array;case"uint32":return Uint32Array;case"int64":return BigInt64Array;case"uint64":return BigUint64Array;default:throw new Error(`unsupported type: ${e}`)}},ze=e=>{switch(e){case"verbose":return 0;case"info":return 1;case"warning":return 2;case"error":return 3;case"fatal":return 4;default:throw new Error(`unsupported logging level: ${e}`)}},He=e=>"float32"===e||"float16"===e||"int32"===e||"int64"===e||"uint32"===e||"uint8"===e||"bool"===e||"uint4"===e||"int4"===e,Ze=e=>"float32"===e||"float16"===e||"int32"===e||"int64"===e||"uint32"===e||"uint64"===e||"int8"===e||"uint8"===e||"bool"===e||"uint4"===e||"int4"===e,qe=e=>{switch(e){case"none":return 0;case"cpu":return 1;case"cpu-pinned":return 2;case"texture":return 3;case"gpu-buffer":return 4;case"ml-tensor":return 5;default:throw new Error(`unsupported data location: ${e}`)}}})),Bt=B((()=>{ae(),Je=async e=>{if("string"==typeof e){let t=await fetch(e);if(!t.ok)throw new Error(`failed to load external data file: ${e}`);let 
r=t.headers.get("Content-Length"),o=r?parseInt(r,10):0;if(o<1073741824)return new Uint8Array(await t.arrayBuffer());{if(!t.body)throw new Error(`failed to load external data file: ${e}, no response body.`);let r,n=t.body.getReader();try{r=new ArrayBuffer(o)}catch(e){if(!(e instanceof RangeError))throw e;{let e=Math.ceil(o/65536);r=new WebAssembly.Memory({initial:e,maximum:e}).buffer}}let s=0;for(;;){let{done:e,value:t}=await n.read();if(e)break;let o=t.byteLength;new Uint8Array(r,s,o).set(t),s+=o}return new Uint8Array(r,0,o)}}return e instanceof Blob?new Uint8Array(await e.arrayBuffer()):e instanceof Uint8Array?e:new Uint8Array(e)}})),Nt=B((()=>{Ut(),Lt(),It(),xt(),Dt(),Bt(),Ke=(e,t)=>{0!==Re()._OrtInit(e,t)&&De("Can't initialize onnxruntime.")},Qe=async e=>{Ke(e.wasm.numThreads,ze(e.logLevel))},Xe=async(e,t)=>{Re().asyncInit?.()},Ye=new Map,et=e=>{let t=Re(),r=t.stackSave();try{let r=t.PTR_SIZE,o=t.stackAlloc(2*r);0!==t._OrtGetInputOutputCount(e,o,o+r)&&De("Can't get session input/output count.");let n=4===r?"i32":"i64";return[Number(t.getValue(o,n)),Number(t.getValue(o+r,n))]}finally{t.stackRestore(r)}},tt=(e,t)=>{let r=Re(),o=r.stackSave(),n=0;try{let o=r.PTR_SIZE,s=r.stackAlloc(2*o);0!==r._OrtGetInputOutputMetadata(e,t,s,s+o)&&De("Can't get session input/output metadata.");let a=Number(r.getValue(s,"*"));n=Number(r.getValue(s+o,"*"));let i=r.HEAP32[n/4];if(0===i)return[a,0];let l=r.HEAPU32[n/4+1],u=[];for(let e=0;e{let t=Re(),r=t._malloc(e.byteLength);if(0===r)throw new Error(`Can't create a session. 
failed to allocate a buffer of size ${e.byteLength}.`);return t.HEAPU8.set(e,r),[r,e.byteLength]},ot=async(e,t)=>{let r,o,n=Re();Array.isArray(e)?[r,o]=e:e.buffer===n.HEAPU8.buffer?[r,o]=[e.byteOffset,e.byteLength]:[r,o]=rt(e);let s=0,a=0,i=[],l=[],u=[];try{if([a,i]=await $e(t),t?.externalData&&n.mountExternalData){let e=[];for(let r of t.externalData){let t="string"==typeof r?r:r.path;e.push(Je("string"==typeof r?r:r.data).then((e=>{n.mountExternalData(t,e)})))}await Promise.all(e)}for(let e of t?.executionProviders??[])if("webnn"===("string"==typeof e?e:e.name)){if(n.shouldTransferToMLTensor=!1,"string"!=typeof e){let t=e,r=t?.context,o=t?.gpuDevice,s=t?.deviceType,a=t?.powerPreference;n.currentContext=r||(o?await n.webnnCreateMLContext(o):await n.webnnCreateMLContext({deviceType:s,powerPreference:a}))}else n.currentContext=await n.webnnCreateMLContext();break}s=await n._OrtCreateSession(r,o,a),n.webgpuOnCreateSession?.(s),0===s&&De("Can't create a session."),n.jsepOnCreateSession?.(),n.currentContext&&(n.webnnRegisterMLContext(s,n.currentContext),n.currentContext=void 0,n.shouldTransferToMLTensor=!0);let[e,d]=et(s),c=!!t?.enableGraphCapture,p=[],f=[],h=[],m=[];for(let t=0;tn._OrtFree(e))),u.forEach((e=>n._OrtFree(e))),0!==s&&0!==n._OrtReleaseSession(s)&&De("Can't release session."),e}finally{n._free(r),0!==a&&0!==n._OrtReleaseSessionOptions(a)&&De("Can't release session options."),i.forEach((e=>n._free(e))),n.unmountExternalData?.()}},nt=e=>{let t=Re(),r=Ye.get(e);if(!r)throw new Error(`cannot release session. 
invalid session id: ${e}`);let[o,n,s,a,i]=r;a&&(i&&0!==t._OrtClearBoundOutputs(a.handle)&&De("Can't clear bound outputs."),0!==t._OrtReleaseBinding(a.handle)&&De("Can't release IO binding.")),t.jsepOnReleaseSession?.(e),t.webnnOnReleaseSession?.(e),t.webgpuOnReleaseSession?.(e),n.forEach((e=>t._OrtFree(e))),s.forEach((e=>t._OrtFree(e))),0!==t._OrtReleaseSession(o)&&De("Can't release session."),Ye.delete(e)},st=async(e,t,r,o,n,s,a=!1)=>{if(!e)return void t.push(0);let i,l,u=Re(),d=u.PTR_SIZE,c=e[0],p=e[1],f=e[3],h=f;if("string"===c&&("gpu-buffer"===f||"ml-tensor"===f))throw new Error("String tensor is not supported on GPU.");if(a&&"gpu-buffer"!==f)throw new Error(`External buffer must be provided for input/output index ${s} when enableGraphCapture is true.`);if("gpu-buffer"===f){let t=e[2].gpuBuffer;l=Ge(je(c),p);{let e=u.jsepRegisterBuffer;if(!e)throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.');i=e(o,s,t,l)}}else if("ml-tensor"===f){let t=e[2].mlTensor;l=Ge(je(c),p);let r=u.webnnRegisterMLTensor;if(!r)throw new Error('Tensor location "ml-tensor" is not supported without using WebNN.');i=r(o,t,je(c),p)}else{let t=e[2];if(Array.isArray(t)){l=d*t.length,i=u._malloc(l),r.push(i);for(let e=0;eu.setValue(w+t*d,e,4===d?"i32":"i64")));let e=u._OrtCreateTensor(je(c),i,l,w,p.length,qe(h));0===e&&De(`Can't create tensor for input/output. session=${o}, index=${s}.`),t.push(e)}finally{u.stackRestore(m)}},at=async(e,t,r,o,n,s)=>{let a=Re(),i=a.PTR_SIZE,l=Ye.get(e);if(!l)throw new Error(`cannot run inference. 
invalid session id: ${e}`);let u=l[0],d=l[1],c=l[2],p=l[3],f=l[4],h=(l[5],t.length),m=o.length,w=0,g=[],y=[],b=[],v=[],S=a.stackSave(),A=a.stackAlloc(h*i),E=a.stackAlloc(h*i),T=a.stackAlloc(m*i),_=a.stackAlloc(m*i);try{[w,g]=Ue(s);for(let o=0;oe*t),1);s=Ve(l);let g=p?.outputPreferredLocations[o[t]];if("string"===s){if("gpu-buffer"===g||"ml-tensor"===g)throw new Error("String tensor is not supported on GPU.");let e=[];for(let t=0;t0){let e=a.jsepGetBuffer;if(!e)throw new Error('preferredLocation "gpu-buffer" is not supported without using WebGPU.');let t=e(c),o=Ge(l,w);if(void 0===o||!He(s))throw new Error(`Unsupported data type: ${s}`);d=!0,S.push([s,m,{gpuBuffer:t,download:a.jsepCreateDownloader(t,o,s),dispose:()=>{0!==a._OrtReleaseTensor(r)&&De("Can't release tensor.")}},"gpu-buffer"])}else if("ml-tensor"===g&&w>0){let t=a.webnnEnsureTensor,o=a.webnnIsGraphInputOutputTypeSupported;if(!t||!o)throw new Error('preferredLocation "ml-tensor" is not supported without using WebNN.');if(void 0===Ge(l,w)||!Ze(s))throw new Error(`Unsupported data type: ${s}`);if(!o(e,s,!1))throw new Error(`preferredLocation "ml-tensor" for ${s} output is not supported by current WebNN Context.`);let n=await t(e,c,l,m,!1);d=!0,S.push([s,m,{mlTensor:n,download:a.webnnCreateMLTensorDownloader(c,s),dispose:()=>{a.webnnReleaseTensorId(c),a._OrtReleaseTensor(r)}},"ml-tensor"])}else if("ml-tensor-cpu-output"===g&&w>0){let e=a.webnnCreateMLTensorDownloader(c,s)(),t=S.length;d=!0,O.push((async()=>{let o=[t,await e];return a.webnnReleaseTensorId(c),a._OrtReleaseTensor(r),o})()),S.push([s,m,[],"cpu"])}else{let e=new(We(s))(w);new Uint8Array(e.buffer,e.byteOffset,e.byteLength).set(a.HEAPU8.subarray(c,c+e.byteLength)),S.push([s,m,e,"cpu"])}}finally{a.stackRestore(l),"string"===s&&c&&a._free(c),d||a._OrtReleaseTensor(r)}}p&&!f&&(0!==a._OrtClearBoundOutputs(p.handle)&&De("Can't clear bound outputs."),Ye.set(e,[u,d,c,p,f,!1]));for(let[e,t]of await Promise.all(O))S[e][2]=t;return 
S}finally{a.webnnOnRunEnd?.(u),a.stackRestore(S),y.forEach((e=>a._OrtReleaseTensor(e))),b.forEach((e=>a._OrtReleaseTensor(e))),v.forEach((e=>a._free(e))),0!==w&&a._OrtReleaseRunOptions(w),g.forEach((e=>a._free(e)))}},it=e=>{let t=Re(),r=Ye.get(e);if(!r)throw new Error("invalid session id");let o=r[0],n=t._OrtEndProfiling(o);0===n&&De("Can't get an profile file name."),t._OrtFree(n)},lt=e=>{let t=[];for(let r of e){let e=r[2];!Array.isArray(e)&&"buffer"in e&&t.push(e.buffer)}return t}})),kt=B((()=>{se(),Nt(),xt(),Ft(),ut=()=>!!u.wasm.proxy&&typeof document<"u",ct=!1,pt=!1,ft=!1,wt=new Map,gt=(e,t)=>{let r=wt.get(e);r?r.push(t):wt.set(e,[t])},yt=()=>{if(ct||!pt||ft||!dt)throw new Error("worker not ready")},bt=e=>{switch(e.data.type){case"init-wasm":ct=!1,e.data.err?(ft=!0,mt[1](e.data.err)):(pt=!0,mt[0]()),ht&&(URL.revokeObjectURL(ht),ht=void 0);break;case"init-ep":case"copy-from":case"create":case"release":case"run":case"end-profiling":{let t=wt.get(e.data.type);e.data.err?t.shift()[1](e.data.err):t.shift()[0](e.data.out);break}}},vt=async()=>{if(!pt){if(ct)throw new Error("multiple calls to 'initWasm()' detected.");if(ft)throw new Error("previous call to 'initWasm()' failed.");if(ct=!0,ut())return new Promise(((e,t)=>{dt?.terminate(),ve().then((([r,o])=>{try{(dt=o).onerror=e=>t(e),dt.onmessage=bt,mt=[e,t];let n={type:"init-wasm",in:u};if(!n.in.wasm.wasmPaths&&r){let e=fe();e&&(n.in.wasm.wasmPaths=e)}dt.postMessage(n),ht=r}catch(e){t(e)}}),t)}));try{await Ce(u.wasm),await Qe(u),pt=!0}catch(e){throw ft=!0,e}finally{ct=!1}}},St=async e=>{if(ut())return yt(),new Promise(((t,r)=>{gt("init-ep",[t,r]);let o={type:"init-ep",in:{epName:e,env:u}};dt.postMessage(o)}));await Xe(u,e)},At=async e=>ut()?(yt(),new Promise(((t,r)=>{gt("copy-from",[t,r]);let o={type:"copy-from",in:{buffer:e}};dt.postMessage(o,[e.buffer])}))):rt(e),Et=async(e,t)=>{if(ut()){if(t?.preferredOutputLocation)throw new Error('session option "preferredOutputLocation" is not supported for proxy.');return 
yt(),new Promise(((r,o)=>{gt("create",[r,o]);let n={type:"create",in:{model:e,options:{...t}}},s=[];e instanceof Uint8Array&&s.push(e.buffer),dt.postMessage(n,s)}))}return ot(e,t)},Tt=async e=>{if(ut())return yt(),new Promise(((t,r)=>{gt("release",[t,r]);let o={type:"release",in:e};dt.postMessage(o)}));nt(e)},_t=async(e,t,r,o,n,s)=>{if(ut()){if(r.some((e=>"cpu"!==e[3])))throw new Error("input tensor on GPU is not supported for proxy.");if(n.some((e=>e)))throw new Error("pre-allocated output tensor is not supported for proxy.");return yt(),new Promise(((n,a)=>{gt("run",[n,a]);let i=r,l={type:"run",in:{sessionId:e,inputIndices:t,inputs:i,outputIndices:o,options:s}};dt.postMessage(l,lt(i))}))}return at(e,t,r,o,n,s)},Ot=async e=>{if(ut())return yt(),new Promise(((t,r)=>{gt("end-profiling",[t,r]);let o={type:"end-profiling",in:e};dt.postMessage(o)}));it(e)}})),$t=B((()=>{se(),kt(),It(),ae(),Bt(),Pt=(e,t)=>{switch(e.location){case"cpu":return[e.type,e.dims,e.data,"cpu"];case"gpu-buffer":return[e.type,e.dims,{gpuBuffer:e.gpuBuffer},"gpu-buffer"];case"ml-tensor":return[e.type,e.dims,{mlTensor:e.mlTensor},"ml-tensor"];default:throw new Error(`invalid data location: ${e.location} for ${t()}`)}},Mt=e=>{switch(e[3]){case"cpu":return new _(e[0],e[2],e[1]);case"gpu-buffer":{let t=e[0];if(!He(t))throw new Error(`not supported data type: ${t} for deserializing GPU tensor`);let{gpuBuffer:r,download:o,dispose:n}=e[2];return _.fromGpuBuffer(r,{dataType:t,dims:e[1],download:o,dispose:n})}case"ml-tensor":{let t=e[0];if(!Ze(t))throw new Error(`not supported data type: ${t} for deserializing MLTensor tensor`);let{mlTensor:r,download:o,dispose:n}=e[2];return _.fromMLTensor(r,{dataType:t,dims:e[1],download:o,dispose:n})}default:throw new Error(`invalid data location: ${e[3]}`)}},Ct=class{async fetchModelAndCopyToWasmMemory(e){return At(await Je(e))}async loadModel(e,t){let r;M(),r="string"==typeof e?await 
this.fetchModelAndCopyToWasmMemory(e):e,[this.sessionId,this.inputNames,this.outputNames,this.inputMetadata,this.outputMetadata]=await Et(r,t),C()}async dispose(){return Tt(this.sessionId)}async run(e,t,r){M();let o=[],n=[];Object.entries(e).forEach((e=>{let t=e[0],r=e[1],s=this.inputNames.indexOf(t);if(-1===s)throw new Error(`invalid input '${t}'`);o.push(r),n.push(s)}));let s=[],a=[];Object.entries(t).forEach((e=>{let t=e[0],r=e[1],o=this.outputNames.indexOf(t);if(-1===o)throw new Error(`invalid output '${t}'`);s.push(r),a.push(o)}));let i=o.map(((e,t)=>Pt(e,(()=>`input "${this.inputNames[n[t]]}"`)))),l=s.map(((e,t)=>e?Pt(e,(()=>`output "${this.outputNames[a[t]]}"`)):null)),u=await _t(this.sessionId,n,i,a,l,r),d={};for(let e=0;eGt,initializeFlags:()=>Vt,wasmBackend:()=>Wt});var Vt,Gt,Wt,zt=B((()=>{se(),kt(),$t(),Vt=()=>{("number"!=typeof u.wasm.initTimeout||u.wasm.initTimeout<0)&&(u.wasm.initTimeout=0);let e=u.wasm.simd;if("boolean"!=typeof e&&void 0!==e&&"fixed"!==e&&"relaxed"!==e&&(console.warn(`Property "env.wasm.simd" is set to unknown value "${e}". 
Reset it to \`false\` and ignore SIMD feature checking.`),u.wasm.simd=!1),"boolean"!=typeof u.wasm.proxy&&(u.wasm.proxy=!1),"boolean"!=typeof u.wasm.trace&&(u.wasm.trace=!1),"number"!=typeof u.wasm.numThreads||!Number.isInteger(u.wasm.numThreads)||u.wasm.numThreads<=0)if(typeof self<"u"&&!self.crossOriginIsolated)u.wasm.numThreads=1;else{let e=typeof navigator>"u"?I("node:os").cpus().length:navigator.hardwareConcurrency;u.wasm.numThreads=Math.min(4,Math.ceil((e||1)/2))}},Wt=new(Gt=class{async init(e){Vt(),await vt(),await St(e)}async createInferenceSessionHandler(e,t){let r=new Ct;return await r.loadModel(e,t),r}})})),Ht={};N(Ht,{InferenceSession:()=>F,TRACE:()=>O,TRACE_FUNC_BEGIN:()=>M,TRACE_FUNC_END:()=>C,Tensor:()=>_,default:()=>Zt,env:()=>u,registerBackend:()=>o}),se(),se(),se();var Zt=ne;{let e=(zt(),k(jt)).wasmBackend;o("cpu",e,10),o("wasm",e,10)}return Object.defineProperty(u.versions,"web",{value:"1.22.0",enumerable:!0}),k(Ht)})();e.exports=o},936:e=>{function t(e){var t=new Error("Cannot find module '"+e+"'");throw t.code="MODULE_NOT_FOUND",t}t.keys=()=>[],t.resolve=t,t.id=936,e.exports=t},485:(e,t)=>{"use strict";Object.defineProperty(t,"__esModule",{value:!0}),t.baseAssetPath=void 0;const r="undefined"!=typeof window&&void 0!==window.document?window.document.currentScript:null;let o="/";r&&(o=r.src.replace(/#.*$/,"").replace(/\?.*$/,"").replace(/\/[^/]+$/,"/")),t.baseAssetPath=o},973:(e,t)=>{"use strict";Object.defineProperty(t,"__esModule",{value:!0}),t.defaultModelFetcher=void 0,t.defaultModelFetcher=e=>fetch(e).then((e=>e.arrayBuffer()))},362:(e,t,r)=>{"use strict";Object.defineProperty(t,"__esModule",{value:!0}),t.FrameProcessor=t.validateOptions=t.defaultFrameProcessorOptions=void 0;const 
o=r(710),n=r(954);t.defaultFrameProcessorOptions={positiveSpeechThreshold:.3,negativeSpeechThreshold:.25,preSpeechPadMs:800,redemptionMs:1400,minSpeechMs:400,submitUserSpeechOnPause:!1},t.validateOptions=function(e){(e.positiveSpeechThreshold<0||e.positiveSpeechThreshold>1)&&o.log.error("positiveSpeechThreshold should be a number between 0 and 1"),(e.negativeSpeechThreshold<0||e.negativeSpeechThreshold>e.positiveSpeechThreshold)&&o.log.error("negativeSpeechThreshold should be between 0 and positiveSpeechThreshold"),e.preSpeechPadMs<0&&o.log.error("preSpeechPadMs should be positive"),e.redemptionMs<0&&o.log.error("redemptionMs should be positive"),e.minSpeechMs<0&&o.log.error("minSpeechMs should be positive")};const s=e=>{const t=e.reduce(((e,t)=>(e.push(e.at(-1)+t.length),e)),[0]),r=new Float32Array(t.at(-1));return e.forEach(((e,o)=>{const n=t[o];r.set(e,n)})),r};function a(e,t){return{redemptionFrames:Math.floor(e.redemptionMs/t),preSpeechPadFrames:Math.floor(e.preSpeechPadMs/t),minSpeechFrames:Math.floor(e.minSpeechMs/t)}}t.FrameProcessor=class{constructor(e,t,r,o){this.modelProcessFunc=e,this.modelResetFunc=t,this.options=r,this.msPerFrame=o,this.speaking=!1,this.redemptionCounter=0,this.speechFrameCount=0,this.active=!1,this.speechRealStartFired=!1,this.setOptions=e=>{this.options={...this.options,...e};const{redemptionFrames:t,preSpeechPadFrames:r,minSpeechFrames:o}=a(this.options,this.msPerFrame);this.redemptionFrames=t,this.preSpeechPadFrames=r,this.minSpeechFrames=o},this.reset=()=>{this.speaking=!1,this.speechRealStartFired=!1,this.audioBuffer=[],this.modelResetFunc(),this.redemptionCounter=0,this.speechFrameCount=0},this.pause=e=>{this.active=!1,this.options.submitUserSpeechOnPause?this.endSegment(e):this.reset()},this.resume=()=>{this.active=!0},this.endSegment=e=>{const t=this.audioBuffer;this.audioBuffer=[];const r=this.speaking;if(this.reset(),r)if(t.reduce(((e,t)=>t.isSpeech?e+1:e),0)>=this.minSpeechFrames){const 
r=s(t.map((e=>e.frame)));e({msg:n.Message.SpeechEnd,audio:r})}else e({msg:n.Message.VADMisfire});return{}},this.process=async(e,t)=>{if(!this.active)return;const r=await this.modelProcessFunc(e),o=r.isSpeech>=this.options.positiveSpeechThreshold;if(t({probs:r,msg:n.Message.FrameProcessed,frame:e}),this.audioBuffer.push({frame:e,isSpeech:o}),o&&(this.speechFrameCount++,this.redemptionCounter=0),o&&!this.speaking&&(this.speaking=!0,t({msg:n.Message.SpeechStart})),this.speaking&&this.speechFrameCount===this.minSpeechFrames&&!this.speechRealStartFired&&(this.speechRealStartFired=!0,t({msg:n.Message.SpeechRealStart})),r.isSpeech=this.redemptionFrames){this.redemptionCounter=0,this.speechFrameCount=0,this.speaking=!1,this.speechRealStartFired=!1;const e=this.audioBuffer;if(this.audioBuffer=[],e.reduce(((e,t)=>t.isSpeech?e+1:e),0)>=this.minSpeechFrames){const r=s(e.map((e=>e.frame)));t({msg:n.Message.SpeechEnd,audio:r})}else t({msg:n.Message.VADMisfire})}if(!this.speaking){for(;this.audioBuffer.length>this.preSpeechPadFrames;)this.audioBuffer.shift();this.speechFrameCount=0}},this.audioBuffer=[];const{redemptionFrames:i,preSpeechPadFrames:l,minSpeechFrames:u}=a(this.options,this.msPerFrame);this.redemptionFrames=i,this.preSpeechPadFrames=l,this.minSpeechFrames=u,this.reset()}}},710:(e,t)=>{"use strict";Object.defineProperty(t,"__esModule",{value:!0}),t.log=void 0;const r=e=>t=>{console.log(`VAD | ${e} >`,t)};t.log={error:r("error"),debug:r("debug"),warn:r("warn")}},954:(e,t)=>{"use strict";var r;Object.defineProperty(t,"__esModule",{value:!0}),t.Message=void 0,function(e){e.AudioFrame="AUDIO_FRAME",e.SpeechStart="SPEECH_START",e.VADMisfire="VAD_MISFIRE",e.SpeechEnd="SPEECH_END",e.SpeechStop="SPEECH_STOP",e.SpeechRealStart="SPEECH_REAL_START",e.FrameProcessed="FRAME_PROCESSED"}(r||(t.Message=r={}))},650:(e,t)=>{"use strict";Object.defineProperty(t,"__esModule",{value:!0})},559:function(e,t,r){"use strict";var 
o=this&&this.__createBinding||(Object.create?function(e,t,r,o){void 0===o&&(o=r);var n=Object.getOwnPropertyDescriptor(t,r);n&&!("get"in n?!t.__esModule:n.writable||n.configurable)||(n={enumerable:!0,get:function(){return t[r]}}),Object.defineProperty(e,o,n)}:function(e,t,r,o){void 0===o&&(o=r),e[o]=t[r]}),n=this&&this.__exportStar||function(e,t){for(var r in e)"default"===r||Object.prototype.hasOwnProperty.call(t,r)||o(t,e,r)};Object.defineProperty(t,"__esModule",{value:!0}),t.SileroV5=t.SileroLegacy=void 0,n(r(650),t);var s=r(143);Object.defineProperty(t,"SileroLegacy",{enumerable:!0,get:function(){return s.SileroLegacy}});var a=r(508);Object.defineProperty(t,"SileroV5",{enumerable:!0,get:function(){return a.SileroV5}})},143:(e,t,r)=>{"use strict";var o;Object.defineProperty(t,"__esModule",{value:!0}),t.SileroLegacy=void 0;const n=r(710);class s{constructor(e,t,r,o,n){this.ortInstance=e,this._session=t,this._h=r,this._c=o,this._sr=n,this.reset_state=()=>{const e=Array(128).fill(0);this._h=new this.ortInstance.Tensor("float32",e,[2,1,64]),this._c=new this.ortInstance.Tensor("float32",e,[2,1,64])},this.process=async e=>{const t={input:new this.ortInstance.Tensor("float32",e,[1,e.length]),h:this._h,c:this._c,sr:this._sr},r=await this._session.run(t);this._h=r.hn,this._c=r.cn;const[o]=r.output?.data;return{notSpeech:1-o,isSpeech:o}},this.release=async()=>{await this._session.release(),this._h.dispose(),this._c.dispose(),this._sr.dispose()}}}t.SileroLegacy=s,o=s,s.new=async(e,t)=>{n.log.debug("initializing vad");const r=await t(),s=await e.InferenceSession.create(r),a=new e.Tensor("int64",[16000n]),i=Array(128).fill(0),l=new e.Tensor("float32",i,[2,1,64]),u=new e.Tensor("float32",i,[2,1,64]);return n.log.debug("vad is initialized"),new o(e,s,l,u,a)}},508:(e,t,r)=>{"use strict";var o;Object.defineProperty(t,"__esModule",{value:!0}),t.SileroV5=void 0;const n=r(710);function s(e){const t=Array(256).fill(0);return new e.Tensor("float32",t,[2,1,128])}class 
a{constructor(e,t,r,o){this._session=e,this._state=t,this._sr=r,this.ortInstance=o,this.reset_state=()=>{this._state=s(this.ortInstance)},this.process=async e=>{const t={input:new this.ortInstance.Tensor("float32",e,[1,e.length]),state:this._state,sr:this._sr},r=await this._session.run(t);if(!r.stateN)throw new Error("No state from model");if(this._state=r.stateN,!r.output?.data)throw new Error("No output from model");const o=r.output.data[0];if("number"!=typeof o)throw new Error("Weird output data");return{notSpeech:1-o,isSpeech:o}},this.release=async()=>{await this._session.release(),this._state.dispose(),this._sr.dispose()}}}t.SileroV5=a,o=a,a.new=async(e,t)=>{n.log.debug("Loading VAD...");const r=await t(),a=await e.InferenceSession.create(r),i=new e.Tensor("int64",[16000n]),l=s(e);return n.log.debug("...finished loading VAD"),new o(a,l,i,e)}},202:function(e,t,r){"use strict";var o=this&&this.__createBinding||(Object.create?function(e,t,r,o){void 0===o&&(o=r);var n=Object.getOwnPropertyDescriptor(t,r);n&&!("get"in n?!t.__esModule:n.writable||n.configurable)||(n={enumerable:!0,get:function(){return t[r]}}),Object.defineProperty(e,o,n)}:function(e,t,r,o){void 0===o&&(o=r),e[o]=t[r]}),n=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),s=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var r in e)"default"!==r&&Object.prototype.hasOwnProperty.call(e,r)&&o(t,e,r);return n(t,e),t};Object.defineProperty(t,"__esModule",{value:!0}),t.NonRealTimeVAD=t.defaultNonRealTimeVADOptions=void 0;const a=s(r(656)),i=r(485),l=r(973),u=r(362),d=r(954),c=r(559),p=r(825);t.defaultNonRealTimeVADOptions={...u.defaultFrameProcessorOptions,modelURL:i.baseAssetPath+"silero_vad_legacy.onnx",modelFetcher:l.defaultModelFetcher},t.NonRealTimeVAD=class{static async new(e={}){const r={...t.defaultNonRealTimeVADOptions,...e};(0,u.validateOptions)(r),void 
0!==r.ortConfig&&r.ortConfig(a);const o=()=>r.modelFetcher(r.modelURL),n=await c.SileroLegacy.new(a,o),s=new u.FrameProcessor(n.process,n.reset_state,{positiveSpeechThreshold:r.positiveSpeechThreshold,negativeSpeechThreshold:r.negativeSpeechThreshold,redemptionMs:r.redemptionMs,preSpeechPadMs:r.preSpeechPadMs,minSpeechMs:r.minSpeechMs,submitUserSpeechOnPause:r.submitUserSpeechOnPause},96);return s.resume(),new this(o,a,r,s)}constructor(e,t,r,o){this.modelFetcher=e,this.ort=t,this.options=r,this.frameProcessor=o,this.frameSamples=1536}async*run(e,t){const r={nativeSampleRate:t,targetSampleRate:16e3,targetFrameSize:this.frameSamples},o=new p.Resampler(r);let n=0,s=0,a=0;for await(const t of o.stream(e)){const e=[];await this.frameProcessor.process(t,(t=>{e.push(t)}));for(const t of e)switch(t.msg){case d.Message.SpeechStart:n=a*this.frameSamples/16;break;case d.Message.SpeechEnd:s=(a+1)*this.frameSamples/16,yield{audio:t.audio,start:n,end:s}}a++}const i=[];this.frameProcessor.endSegment((e=>{i.push(e)}));for(const e of i)e.msg===d.Message.SpeechEnd&&(yield{audio:e.audio,start:n,end:a*this.frameSamples/16})}}},746:function(e,t,r){"use strict";var o=this&&this.__createBinding||(Object.create?function(e,t,r,o){void 0===o&&(o=r);var n=Object.getOwnPropertyDescriptor(t,r);n&&!("get"in n?!t.__esModule:n.writable||n.configurable)||(n={enumerable:!0,get:function(){return t[r]}}),Object.defineProperty(e,o,n)}:function(e,t,r,o){void 0===o&&(o=r),e[o]=t[r]}),n=this&&this.__setModuleDefault||(Object.create?function(e,t){Object.defineProperty(e,"default",{enumerable:!0,value:t})}:function(e,t){e.default=t}),s=this&&this.__importStar||function(e){if(e&&e.__esModule)return e;var t={};if(null!=e)for(var r in e)"default"!==r&&Object.prototype.hasOwnProperty.call(e,r)&&o(t,e,r);return n(t,e),t};Object.defineProperty(t,"__esModule",{value:!0}),t.MicVAD=t.getDefaultRealTimeVADOptions=t.ort=t.DEFAULT_MODEL=void 0;const 
a=s(r(647)),i=r(973),l=r(362),u=r(710),d=r(954),c=r(559),p=r(825);t.DEFAULT_MODEL="legacy",t.ort=a,t.getDefaultRealTimeVADOptions=e=>({...l.defaultFrameProcessorOptions,onFrameProcessed:()=>{},onVADMisfire:()=>{u.log.debug("VAD misfire")},onSpeechStart:()=>{u.log.debug("Detected speech start")},onSpeechEnd:()=>{u.log.debug("Detected speech end")},onSpeechRealStart:()=>{u.log.debug("Detected real speech start")},baseAssetPath:"./",onnxWASMBasePath:"./",model:e,workletOptions:{},getStream:async()=>await navigator.mediaDevices.getUserMedia({audio:{channelCount:1,echoCancellation:!0,autoGainControl:!0,noiseSuppression:!0}}),pauseStream:async e=>{e.getTracks().forEach((e=>{e.stop()}))},resumeStream:async()=>await navigator.mediaDevices.getUserMedia({audio:{channelCount:1,echoCancellation:!0,autoGainControl:!0,noiseSuppression:!0}}),ortConfig:e=>{e.env.logLevel="error"},startOnLoad:!0,processorType:"auto"});class f{constructor(e,t,r,o,n=!1,s=null,a=null,i=null,l=null,c=null,f=null,h="uninitialized",m=!1){this.options=e,this.frameProcessor=t,this.model=r,this.frameSamples=o,this.listening=n,this.errored=s,this._stream=a,this._audioContext=i,this._vadNode=l,this._mediaStreamAudioSourceNode=c,this._audioProcessorAdapterType=f,this.initializationState=h,this.ownsAudioContext=m,this.getAudioInstances=()=>{if(null===this._stream||null===this._audioContext||null==this._vadNode||null==this._mediaStreamAudioSourceNode)throw new Error("MicVAD has null stream, audio context, or processor adapter");return{stream:this._stream,audioContext:this._audioContext,vadNode:this._vadNode,mediaStreamAudioSourceNode:this._mediaStreamAudioSourceNode}},this.setErrored=e=>{this.initializationState="errored",this.errored=e},this.start=async()=>{switch(this.initializationState){case"uninitialized":u.log.debug("initializing micVAD"),this.initializationState="initializing",this.frameProcessor.resume();try{this._stream=await this.options.getStream()}catch(e){throw e instanceof 
Error?this.setErrored(e.message):this.setErrored(String(e)),e}if(this.options.audioContext?(console.log("using custom audio context"),this._audioContext=this.options.audioContext):(console.log("using default audio context"),this._audioContext=new AudioContext,this.ownsAudioContext=!0),!this._audioContext)throw this.setErrored("Audio context is null"),Error("Audio context is null");switch(this._audioProcessorAdapterType="auto"==this.options.processorType?"audioWorklet"in this._audioContext&&"function"==typeof AudioWorkletNode?"AudioWorklet":"ScriptProcessor":this.options.processorType,this._audioProcessorAdapterType){case"AudioWorklet":this._vadNode=await async function(e,t,r,o,n){await r.audioWorklet.addModule(e),t.processorOptions={...t.processorOptions??{},frameSamples:o};const s=new AudioWorkletNode(r,"vad-helper-worklet",t);return s.port.onmessage=async e=>{const t=e.data;if("object"==typeof t&&t&&"message"in t)switch(t.message){case d.Message.AudioFrame:{if(!("data"in t&&t.data instanceof ArrayBuffer))return void console.log("Audio frame message has no data");const e=new Float32Array(t.data);await n(e);break}}else console.error("Invalid message event",t)},s}(this.options.baseAssetPath+"vad.worklet.bundle.min.js",this.options.workletOptions,this._audioContext,this.frameSamples,this.processFrame);break;case"ScriptProcessor":this._vadNode=await async function(e,t,r){const o=new p.Resampler({nativeSampleRate:e.sampleRate,targetSampleRate:16e3,targetFrameSize:t});u.log.debug("using script processor");const n=e.createScriptProcessor(4096,1,1);let s=!1;return n.onaudioprocess=async e=>{if(!s){s=!0;try{const t=e.inputBuffer.getChannelData(0);e.outputBuffer.getChannelData(0).fill(0);const n=o.process(t);for(const e of n)await r(e)}catch(e){console.error("Error processing audio:",e)}finally{s=!1}}},n.connect(e.destination),n}(this._audioContext,this.frameSamples,this.processFrame);break;default:throw new Error(`Unsupported audio processor adapter type: 
${this._audioProcessorAdapterType}`)}this._mediaStreamAudioSourceNode=new MediaStreamAudioSourceNode(this._audioContext,{mediaStream:this._stream}),this._mediaStreamAudioSourceNode.connect(this._vadNode),u.log.debug("started micVAD"),this.listening=!0,this.initializationState="initialized";break;case"initializing":u.log.warn("start called while initializing");break;case"initialized":{if(this.listening)return;this.listening=!0,this.frameProcessor.resume();const{stream:e,audioContext:t,vadNode:r}=this.getAudioInstances();this._stream=await this.options.resumeStream(e);const o=new MediaStreamAudioSourceNode(t,{mediaStream:this._stream});this._mediaStreamAudioSourceNode=o,o.connect(r);break}case"destroyed":u.log.warn("start called after destroyed");break;case"errored":u.log.error("start called after errored");break;default:u.log.warn("weird initialization state")}},this.pause=async()=>{if(!this.listening)return;this.listening=!1;const{stream:e,mediaStreamAudioSourceNode:t}=this.getAudioInstances();await this.options.pauseStream(e),t.disconnect(),this.frameProcessor.pause(this.handleFrameProcessorEvent)},this.destroy=async()=>{u.log.debug("destroy called"),this.initializationState="destroyed";const{vadNode:e}=this.getAudioInstances();e instanceof AudioWorkletNode&&e.port.postMessage(d.Message.SpeechStop),this.listening&&await this.pause(),await this.model.release(),this.ownsAudioContext&&await(this._audioContext?.close())},this.setOptions=e=>{this.frameProcessor.setOptions(e)},this.processFrame=async e=>{await this.frameProcessor.process(e,this.handleFrameProcessorEvent)},this.handleFrameProcessorEvent=e=>{switch(e.msg){case d.Message.FrameProcessed:this.options.onFrameProcessed(e.probs,e.frame);break;case d.Message.SpeechStart:this.options.onSpeechStart();break;case d.Message.SpeechRealStart:this.options.onSpeechRealStart();break;case d.Message.VADMisfire:this.options.onVADMisfire();break;case d.Message.SpeechEnd:this.options.onSpeechEnd(e.audio)}}}static async 
new(e={}){const r={...(0,t.getDefaultRealTimeVADOptions)(e.model??t.DEFAULT_MODEL),...e};(0,l.validateOptions)(r),t.ort.env.wasm.wasmPaths=r.onnxWASMBasePath,void 0!==r.ortConfig&&r.ortConfig(t.ort);const o="v5"===r.model?"silero_vad_v5.onnx":"silero_vad_legacy.onnx",n=r.baseAssetPath+o,s="v5"===r.model?c.SileroV5.new:c.SileroLegacy.new;let a;try{a=await s(t.ort,(()=>(0,i.defaultModelFetcher)(n)))}catch(e){throw console.error(`Encountered an error while loading model file ${n}`),e}const u="v5"===r.model?512:1536,d=u/16,p=new l.FrameProcessor(a.process,a.reset_state,{positiveSpeechThreshold:r.positiveSpeechThreshold,negativeSpeechThreshold:r.negativeSpeechThreshold,redemptionMs:r.redemptionMs,preSpeechPadMs:r.preSpeechPadMs,minSpeechMs:r.minSpeechMs,submitUserSpeechOnPause:r.submitUserSpeechOnPause},d),h=new f(r,p,a,u);if(r.startOnLoad)try{await h.start()}catch(e){throw console.error("Error starting micVad",e),e}return h}}t.MicVAD=f},825:(e,t,r)=>{"use strict";Object.defineProperty(t,"__esModule",{value:!0}),t.Resampler=void 0;const o=r(710);t.Resampler=class{constructor(e){this.options=e,this.process=e=>{const t=[];for(const r of e)for(this.inputBuffer.push(r);this.hasEnoughDataForFrame();){const e=this.generateOutputFrame();t.push(e)}return t},e.nativeSampleRate<16e3&&o.log.error("nativeSampleRate is too low. 
Should have 16000 = targetSampleRate <= nativeSampleRate"),this.inputBuffer=[]}async*stream(e){for(const t of e)for(this.inputBuffer.push(t);this.hasEnoughDataForFrame();){const e=this.generateOutputFrame();yield e}}hasEnoughDataForFrame(){return this.inputBuffer.length*this.options.targetSampleRate/this.options.nativeSampleRate>=this.options.targetFrameSize}generateOutputFrame(){const e=new Float32Array(this.options.targetFrameSize);let t=0,r=0;for(;t{"use strict";function r(e,t,r){for(let o=0;o{r.addEventListener("loadend",(()=>{const e=r.result;t.decodeAudioData(e,(e=>{o=e,t.startRendering().then((()=>{console.log("Rendering completed successfully"),n()})).catch((e=>{console.error("Rendering failed: ",e)}))}),(e=>{console.log("Error with decoding audio data: ",e)}))})),r.readAsArrayBuffer(e)})),null===o)throw Error("some shit");const n=o,s=new Float32Array(n.length);for(let e=0;e{"use strict";t.exports=e}},r={};function o(e){var n=r[e];if(void 0!==n)return n.exports;var s=r[e]={exports:{}};return t[e].call(s.exports,s,s.exports,o),s.exports}o.o=(e,t)=>Object.prototype.hasOwnProperty.call(e,t);var n={};return(()=>{"use strict";var e=n;Object.defineProperty(e,"__esModule",{value:!0}),e.getDefaultRealTimeVADOptions=e.MicVAD=e.DEFAULT_MODEL=e.utils=e.NonRealTimeVAD=e.Message=e.FrameProcessor=e.defaultModelFetcher=e.baseAssetPath=void 0;var t=o(485);Object.defineProperty(e,"baseAssetPath",{enumerable:!0,get:function(){return t.baseAssetPath}});var r=o(973);Object.defineProperty(e,"defaultModelFetcher",{enumerable:!0,get:function(){return r.defaultModelFetcher}});var s=o(362);Object.defineProperty(e,"FrameProcessor",{enumerable:!0,get:function(){return s.FrameProcessor}});var a=o(954);Object.defineProperty(e,"Message",{enumerable:!0,get:function(){return a.Message}});var i=o(202);Object.defineProperty(e,"NonRealTimeVAD",{enumerable:!0,get:function(){return i.NonRealTimeVAD}});const 
l=o(787);e.utils={audioFileToArray:l.audioFileToArray,minFramesForTargetMS:l.minFramesForTargetMS,arrayBufferToBase64:l.arrayBufferToBase64,encodeWAV:l.encodeWAV};var u=o(746);Object.defineProperty(e,"DEFAULT_MODEL",{enumerable:!0,get:function(){return u.DEFAULT_MODEL}}),Object.defineProperty(e,"MicVAD",{enumerable:!0,get:function(){return u.MicVAD}}),Object.defineProperty(e,"getDefaultRealTimeVADOptions",{enumerable:!0,get:function(){return u.getDefaultRealTimeVADOptions}})})(),n})())); \ No newline at end of file diff --git a/dashboard/vad/silero_vad_legacy.onnx b/dashboard/vad/silero_vad_legacy.onnx new file mode 100644 index 0000000..e6db48d Binary files /dev/null and b/dashboard/vad/silero_vad_legacy.onnx differ diff --git a/dashboard/vad/silero_vad_v5.onnx b/dashboard/vad/silero_vad_v5.onnx new file mode 100644 index 0000000..b3e3a90 Binary files /dev/null and b/dashboard/vad/silero_vad_v5.onnx differ diff --git a/dashboard/vad/vad.worklet.bundle.min.js b/dashboard/vad/vad.worklet.bundle.min.js new file mode 100644 index 0000000..dfa2c2b --- /dev/null +++ b/dashboard/vad/vad.worklet.bundle.min.js @@ -0,0 +1 @@ +(()=>{"use strict";var e={710:(e,t)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.log=void 0;const s=e=>t=>{console.log(`VAD | ${e} >`,t)};t.log={error:s("error"),debug:s("debug"),warn:s("warn")}},954:(e,t)=>{var s;Object.defineProperty(t,"__esModule",{value:!0}),t.Message=void 0,function(e){e.AudioFrame="AUDIO_FRAME",e.SpeechStart="SPEECH_START",e.VADMisfire="VAD_MISFIRE",e.SpeechEnd="SPEECH_END",e.SpeechStop="SPEECH_STOP",e.SpeechRealStart="SPEECH_REAL_START",e.FrameProcessed="FRAME_PROCESSED"}(s||(t.Message=s={}))},825:(e,t,s)=>{Object.defineProperty(t,"__esModule",{value:!0}),t.Resampler=void 0;const r=s(710);t.Resampler=class{constructor(e){this.options=e,this.process=e=>{const t=[];for(const s of e)for(this.inputBuffer.push(s);this.hasEnoughDataForFrame();){const e=this.generateOutputFrame();t.push(e)}return 
t},e.nativeSampleRate<16e3&&r.log.error("nativeSampleRate is too low. Should have 16000 = targetSampleRate <= nativeSampleRate"),this.inputBuffer=[]}async*stream(e){for(const t of e)for(this.inputBuffer.push(t);this.hasEnoughDataForFrame();){const e=this.generateOutputFrame();yield e}}hasEnoughDataForFrame(){return this.inputBuffer.length*this.options.targetSampleRate/this.options.nativeSampleRate>=this.options.targetFrameSize}generateOutputFrame(){const e=new Float32Array(this.options.targetFrameSize);let t=0,s=0;for(;t{const e=s(710),t=s(954),r=s(825);class o extends AudioWorkletProcessor{constructor(s){super(),this._stopProcessing=!1,this.options=s.processorOptions,this.port.onmessage=s=>{s.data===t.Message.SpeechStop&&(e.log.debug("Worklet received speech stop message"),this._stopProcessing=!0)},this.resampler=new r.Resampler({nativeSampleRate:sampleRate,targetSampleRate:16e3,targetFrameSize:this.options.frameSamples})}process(e){if(this._stopProcessing)return!1;const s=e[0];if(void 0===s)return!0;const r=s[0];if(void 0===r)return!0;const o=this.resampler.process(r);for(const e of o)this.port.postMessage({message:t.Message.AudioFrame,data:e.buffer},[e.buffer]);return!0}}registerProcessor("vad-helper-worklet",o)})()})(); \ No newline at end of file diff --git a/http_api.py b/http_api.py index 420568a..45a7fb8 100644 --- a/http_api.py +++ b/http_api.py @@ -1,4 +1,4 @@ -"""Mod³ HTTP API — REST interface for TTS synthesis and VAD. +"""Mod³ HTTP API — REST interface for TTS synthesis, VAD, and dashboard. 
Endpoints: POST /v1/synthesize — text → audio bytes (WAV/PCM) + structured metrics @@ -9,18 +9,29 @@ GET /v1/jobs — list recent generation jobs with full metrics GET /v1/jobs/{id} — get a specific job's metrics GET /health — server health check + POST /shutdown — graceful server shutdown (kernel lifecycle) + GET /capabilities — machine-readable capability manifest + WS /ws/chat — dashboard voice/text chat + GET /dashboard — dashboard UI """ +import asyncio import io +import logging +import os +import signal import struct import time import uuid import wave from collections import OrderedDict +from pathlib import Path from threading import Lock +from typing import Optional -from fastapi import FastAPI, Response, UploadFile -from fastapi.responses import JSONResponse +from fastapi import FastAPI, Request, Response, UploadFile, WebSocket +from fastapi.responses import FileResponse, JSONResponse +from fastapi.staticfiles import StaticFiles from pydantic import BaseModel, Field from bus import ModalityBus @@ -33,6 +44,29 @@ app = FastAPI(title="Mod³", description="Local multi-model TTS on Apple Silicon") +logger = logging.getLogger("mod3.http") + +_server_start_time = time.time() +_shutting_down = False + + +@app.on_event("startup") +async def _warmup_kokoro(): + """Pre-load Kokoro TTS engine in background to avoid ~60s cold start on first request.""" + import threading + + def _do_warmup(): + try: + from engine import get_model + + get_model("kokoro") + logger.info("Kokoro TTS engine pre-warmed successfully") + except Exception as e: + logger.warning("Kokoro pre-warm failed (will lazy-load on first request): %s", e) + + threading.Thread(target=_do_warmup, daemon=True, name="kokoro-warmup").start() + + try: from server import _bus as _shared_bus except Exception: @@ -170,6 +204,29 @@ class SpeechRequest(BaseModel): speed: float = Field(default=1.0) +class ShutdownRequest(BaseModel): + """Graceful shutdown request from the kernel.""" + + timeout_sec: float = 
Field(default=5.0, ge=0, le=60) + reason: str = Field(default="shutdown-requested") + + +# --------------------------------------------------------------------------- +# Shutdown middleware — reject new requests once shutdown is initiated +# --------------------------------------------------------------------------- + + +@app.middleware("http") +async def _reject_during_shutdown(request: Request, call_next): + """Return 503 for new requests once graceful shutdown has been initiated.""" + if _shutting_down and request.url.path != "/health": + return JSONResponse( + status_code=503, + content={"error": "server is shutting down"}, + ) + return await call_next(request) + + # --------------------------------------------------------------------------- # Endpoints # --------------------------------------------------------------------------- @@ -505,24 +562,190 @@ def voices(): return {"engines": engines} +@app.post("/v1/stop") +def stop_speech(job_id: str = ""): + """Stop current speech and/or cancel queued items. + + If job_id is provided, cancels that specific job. + If empty, interrupts current playback and clears the queue. + Returns interruption context for barge-in support. 
+ """ + try: + from server import _speech_queue, pipeline_state + + if job_id: + cancelled = _speech_queue.cancel(job_id) + return {"status": "ok", "message": f"Cancelled {job_id}" if cancelled else f"Job {job_id} not found"} + else: + # Get interrupt info before stopping + interrupt_info = None + if pipeline_state.is_speaking: + info = pipeline_state.interrupt(reason="http_barge_in") + if info: + interrupt_info = { + "spoken_pct": info.spoken_pct, + "delivered_text": info.delivered_text, + "full_text": info.full_text, + "reason": info.reason, + } + cancelled_count = _speech_queue.cancel_all_queued() + return { + "status": "ok", + "message": f"Interrupted playback; cancelled {cancelled_count} queued items", + "interrupted": interrupt_info, + } + except ImportError: + return JSONResponse(status_code=503, content={"error": "Speech queue not available in HTTP-only mode"}) + + @app.get("/health") def health(): - """Health check with summary stats.""" - with _jobs_lock: - total = len(_jobs) - active = sum(1 for j in _jobs.values() if j.get("status") in ("generating", "processing")) - by_type = {} - for j in _jobs.values(): - t = j.get("type", "unknown") - by_type[t] = by_type.get(t, 0) + 1 + """Health check — standardized CogOS service format.""" + try: + loaded = get_loaded_engines() + + # Engine status: loaded/unloaded for each registered engine + engines = {} + for engine_name in MODELS: + engines[engine_name] = "loaded" if engine_name in loaded else "unloaded" + + # Modality availability + modalities = { + "tts": len(loaded) > 0, + "stt": False, # STT not yet implemented as a server modality + "vad": vad_loaded(), + } + + # Queue state from job ledger + with _jobs_lock: + total = len(_jobs) + active = sum(1 for j in _jobs.values() if j.get("status") in ("generating", "processing")) + + # Overall status: ok if at least one TTS engine loaded, degraded if none + status = "ok" if loaded else "degraded" + + return { + "status": status, + "service": "mod3", + "version": 
"0.3.0", + "uptime_sec": round(time.time() - _server_start_time, 1), + "engines": engines, + "modalities": modalities, + "queue": { + "depth": total, + "active_jobs": active, + }, + } + except Exception as e: + return JSONResponse( + status_code=500, + content={ + "status": "error", + "service": "mod3", + "version": "0.3.0", + "error": str(e), + }, + ) + + +@app.post("/shutdown") +async def shutdown(req: Optional[ShutdownRequest] = None): + """Initiate graceful server shutdown. + + Called by the CogOS kernel for lifecycle management. Returns immediately + with confirmation, then drains active jobs and exits. + + Body (optional): {"timeout_sec": 5, "reason": "kernel-restart"} + """ + global _shutting_down + + if _shutting_down: + return JSONResponse( + status_code=409, + content={"status": "already_shutting_down"}, + ) + + if req is None: + req = ShutdownRequest() + + timeout_sec = req.timeout_sec + reason = req.reason + + _shutting_down = True + logger.info("Shutdown requested: reason=%s timeout=%.1fs", reason, timeout_sec) + + async def _graceful_exit(): + """Wait for active jobs to drain, then signal the process to stop.""" + deadline = time.time() + timeout_sec + while time.time() < deadline: + with _jobs_lock: + active = sum(1 for j in _jobs.values() if j.get("status") in ("generating", "processing")) + if active == 0: + break + await asyncio.sleep(0.25) + + with _jobs_lock: + remaining = sum(1 for j in _jobs.values() if j.get("status") in ("generating", "processing")) + + if remaining: + logger.warning("Shutdown timeout reached with %d active jobs — forcing exit", remaining) + else: + logger.info("All jobs drained — exiting cleanly") + + # Send SIGINT to our own process, which uvicorn handles gracefully + os.kill(os.getpid(), signal.SIGINT) + + # Fire-and-forget: schedule the shutdown coroutine on the running loop + asyncio.ensure_future(_graceful_exit()) + return { - "status": "ok", - "engines_loaded": get_loaded_engines(), - "vad_loaded": vad_loaded(), - 
"jobs": { - "total": total, - "active": active, - "by_type": by_type, + "status": "shutting_down", + "reason": reason, + "timeout_sec": timeout_sec, + } + + +@app.get("/capabilities") +def capabilities(): + """Machine-readable capability manifest for service discovery.""" + voices = {name: cfg["voices"] for name, cfg in MODELS.items()} + return { + "service": "mod3", + "version": "0.3.0", + "description": "Model Modality Modulator — local TTS, STT, and VAD on Apple Silicon", + "modalities": ["voice"], + "capabilities": { + "tts": { + "engines": list(MODELS.keys()), + "default_voice": "bm_lewis", + "default_speed": 1.25, + "endpoint": "/v1/synthesize", + }, + "stt": { + "engine": "mlx_whisper", + "model": "mlx-community/whisper-large-v3-turbo", + "languages": ["en"], + "endpoint": None, + }, + "vad": { + "engine": "silero_v5", + "endpoint": "/v1/vad", + }, + }, + "voices": voices, + "endpoints": { + "synthesize": "POST /v1/synthesize", + "speech": "POST /v1/audio/speech", + "vad": "POST /v1/vad", + "voices": "GET /v1/voices", + "health": "GET /health", + "shutdown": "POST /shutdown", + "capabilities": "GET /capabilities", + }, + "protocols": { + "mcp": True, + "http": True, + "websocket": True, }, } @@ -626,6 +849,91 @@ def get_bus() -> ModalityBus: return _bus +# --------------------------------------------------------------------------- +# Dashboard — voice/text chat via WebSocket +# --------------------------------------------------------------------------- + +_logger = logging.getLogger("mod3.dashboard") + +_dashboard_dir = Path(__file__).parent / "dashboard" + + +@app.get("/dashboard") +async def dashboard_page(): + """Serve the dashboard UI.""" + index = _dashboard_dir / "index.html" + if index.exists(): + return FileResponse(str(index)) + return JSONResponse({"error": "dashboard not found"}, status_code=404) + + +@app.websocket("/ws/chat") +async def ws_chat(websocket: WebSocket): + """Dashboard voice/text chat — one session per connection.""" + await 
websocket.accept() + + loop = asyncio.get_running_loop() + + from agent_loop import AgentLoop + from channels import BrowserChannel + from pipeline_state import PipelineState + from providers import auto_detect_provider + + provider = auto_detect_provider() + ps = PipelineState() + + agent = AgentLoop( + bus=_bus, + provider=provider, + pipeline_state=ps, + ) + + channel = BrowserChannel( + ws=websocket, + bus=_bus, + pipeline_state=ps, + loop=loop, + on_event=agent.handle_event, + ) + + agent.channel_id = channel.channel_id + agent._channel_ref = channel + + _logger.info("Dashboard session started: %s (provider: %s)", channel.channel_id, provider.name) + + try: + await channel.run() + finally: + _logger.info("Dashboard session ended: %s", channel.channel_id) + + +# Mount dashboard static files (after explicit routes so they don't shadow /v1/*) +if _dashboard_dir.exists(): + # VAD assets need their own mount (ONNX workers request from this path) + _vad_dir = _dashboard_dir / "vad" + if _vad_dir.exists(): + app.mount("/dashboard/vad", StaticFiles(directory=str(_vad_dir)), name="dashboard_vad") + app.mount("/dashboard", StaticFiles(directory=str(_dashboard_dir)), name="dashboard_static") + + +# ONNX Runtime WASM workers request .wasm and .onnx files at the root path. +# These catch-all routes serve them from dashboard/vad/. 
+@app.get("/{filename:path}.wasm")
+async def serve_wasm(filename: str):
+    wasm_path = _dashboard_dir / "vad" / f"{filename}.wasm"
+    if wasm_path.exists():
+        return FileResponse(str(wasm_path), media_type="application/wasm")
+    return JSONResponse({"detail": "Not Found"}, status_code=404)
+
+
+@app.get("/{filename:path}.onnx")
+async def serve_onnx(filename: str):
+    onnx_path = _dashboard_dir / "vad" / f"{filename}.onnx"
+    if onnx_path.exists():
+        return FileResponse(str(onnx_path), media_type="application/octet-stream")
+    return JSONResponse({"detail": "Not Found"}, status_code=404)
+
+
 # ---------------------------------------------------------------------------
 # Internal helpers
 # ---------------------------------------------------------------------------
diff --git a/integrations/bargein-producer.py b/integrations/bargein-producer.py
new file mode 100755
index 0000000..2b9fa51
--- /dev/null
+++ b/integrations/bargein-producer.py
@@ -0,0 +1,279 @@
+#!/usr/bin/env python3
+"""
+Barge-in signal producer — detects SuperWhisper recording and writes
+the signal file that Mod3's barge-in consumer watches.
+
+Detection method:
+    SuperWhisper creates a timestamped directory in its recordings folder
+    the instant recording begins (the dir is empty). When recording finishes,
+    it writes output.wav and meta.json into that directory. We poll for new
+    empty directories to detect start, and for the appearance of output.wav
+    to detect end. 
+ +Signal file: /tmp/mod3-barge-in.json + Start: {"event": "user_speaking_start", "timestamp": "...", "source": "superwhisper"} + End: {"event": "user_speaking_end", "timestamp": "...", "source": "superwhisper"} + +Usage: + python3 bargein-producer.py # foreground (logs to stderr) + python3 bargein-producer.py & # background + launchctl load com.cogos.bargein-producer.plist # launchd + +Environment variables: + BARGEIN_SIGNAL — override signal file path (default: /tmp/mod3-barge-in.json) + BARGEIN_POLL_MS — poll interval in ms (default: 150) + SW_RECORDINGS_DIR — override recordings path +""" + +import json +import logging +import os +import sys +import time +from datetime import datetime, timezone +from pathlib import Path + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +SIGNAL_FILE = os.environ.get("BARGEIN_SIGNAL", "/tmp/mod3-barge-in.json") +POLL_INTERVAL = int(os.environ.get("BARGEIN_POLL_MS", "150")) / 1000.0 + +# SuperWhisper recordings directory +_default_rec_dir = os.path.expanduser("~/Documents/superwhisper/recordings") +RECORDINGS_DIR = os.environ.get("SW_RECORDINGS_DIR", _default_rec_dir) + +# --------------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------------- + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="%(asctime)s [bargein] %(levelname)s %(message)s", + datefmt="%H:%M:%S", +) +log = logging.getLogger("bargein-producer") + +# --------------------------------------------------------------------------- +# State +# --------------------------------------------------------------------------- + + +class State: + """Tracks whether a recording is currently active.""" + + def __init__(self): + self.recording = False + # The folder name (timestamp) of the active recording + self.active_folder: str | None = 
None + # Set of known-completed folder names (avoid re-triggering) + self.known_folders: set[str] = set() + + def start(self, folder: str): + if self.recording and self.active_folder == folder: + return # already tracking + self.recording = True + self.active_folder = folder + _write_signal("user_speaking_start") + log.info("Recording started (folder=%s)", folder) + + def end(self): + if not self.recording: + return + folder = self.active_folder + self.recording = False + if self.active_folder: + self.known_folders.add(self.active_folder) + self.active_folder = None + _write_signal("user_speaking_end") + log.info("Recording finished (folder=%s)", folder) + + +# --------------------------------------------------------------------------- +# Signal file +# --------------------------------------------------------------------------- + + +def _write_signal(event: str): + """Atomically write the barge-in signal file.""" + payload = { + "event": event, + "timestamp": datetime.now(timezone.utc).isoformat(), + "source": "superwhisper", + } + tmp = SIGNAL_FILE + ".tmp" + try: + with open(tmp, "w") as f: + json.dump(payload, f) + os.replace(tmp, SIGNAL_FILE) + except OSError as e: + log.error("Failed to write signal file: %s", e) + + +# --------------------------------------------------------------------------- +# Detection logic +# --------------------------------------------------------------------------- + + +def _is_empty_dir(path: Path) -> bool: + """True if path is a directory with no children (except . and ..).""" + try: + return path.is_dir() and not any(path.iterdir()) + except OSError: + return False + + +def _has_output(path: Path) -> bool: + """True if the recording directory has output.wav (recording complete).""" + return (path / "output.wav").exists() or (path / "meta.json").exists() + + +_last_dir_mtime: float = 0.0 + + +def _scan(state: State, rec_dir: Path): + """One poll cycle: scan the recordings directory for state changes. 
+ + Optimisation: stat() the recordings dir first (~1us). Only do a full + iterdir() when the directory mtime has changed (new folder created) + OR when we're actively tracking a recording (need to check for output.wav). + """ + global _last_dir_mtime + + # Fast path: if we're tracking an active recording, just check that folder + if state.recording and state.active_folder: + active_path = rec_dir / state.active_folder + if _has_output(active_path): + state.end() + return + + # Check if directory changed since last scan + try: + dir_mtime = os.stat(rec_dir).st_mtime + except OSError: + return + if dir_mtime == _last_dir_mtime: + return # nothing changed — skip expensive iterdir() + _last_dir_mtime = dir_mtime + + # Directory changed — scan for new entries + try: + candidates: list[Path] = [] + for entry in rec_dir.iterdir(): + if entry.is_dir() and entry.name.isdigit() and entry.name not in state.known_folders: + candidates.append(entry) + except OSError: + return + + candidates.sort(key=lambda p: p.name, reverse=True) + for entry in candidates[:5]: + name = entry.name + + if _is_empty_dir(entry): + # New empty directory = recording just started + state.start(name) + return + + # Non-empty, not tracked — it's a completed recording we haven't seen + state.known_folders.add(name) + + +# --------------------------------------------------------------------------- +# Staleness guard +# --------------------------------------------------------------------------- + +_STALE_TIMEOUT = 120 # 2 minutes — if a recording folder stays empty this long, +# assume SuperWhisper cancelled/crashed and clear state + + +def _check_stale(state: State, rec_dir: Path): + """Clear recording state if the active folder has been empty too long.""" + if not state.recording or not state.active_folder: + return + folder = rec_dir / state.active_folder + try: + # Use folder creation time as the start timestamp + ctime = folder.stat().st_birthtime + except (OSError, AttributeError): + return + if 
time.time() - ctime > _STALE_TIMEOUT: + log.warning("Stale recording detected (>%ds), clearing state", _STALE_TIMEOUT) + state.end() + + +# --------------------------------------------------------------------------- +# Main loop +# --------------------------------------------------------------------------- + + +def main(): + rec_dir = Path(RECORDINGS_DIR) + if not rec_dir.is_dir(): + log.error("Recordings directory not found: %s", rec_dir) + log.error("Is SuperWhisper installed and configured?") + sys.exit(1) + + state = State() + + # Pre-populate known folders so we don't false-trigger on startup. + # Only treat an empty folder as "currently recording" if it was created + # within the last 30 seconds — older empties are stale/cancelled. + _STARTUP_FRESH_SECS = 30 + now = time.time() + newest_empty: tuple[str, float] | None = None + try: + for entry in rec_dir.iterdir(): + if entry.is_dir() and entry.name.isdigit(): + if _has_output(entry): + state.known_folders.add(entry.name) + elif _is_empty_dir(entry): + try: + age = now - entry.stat().st_birthtime + except (OSError, AttributeError): + age = float("inf") + if age < _STARTUP_FRESH_SECS: + # Possibly active — track the newest one + if newest_empty is None or entry.name > newest_empty[0]: + newest_empty = (entry.name, age) + else: + # Stale empty folder — ignore it + state.known_folders.add(entry.name) + log.debug("Ignoring stale empty folder: %s (age=%.0fs)", entry.name, age) + except OSError as e: + log.warning("Error during startup scan: %s", e) + + if newest_empty: + state.start(newest_empty[0]) + log.info("Detected in-progress recording on startup (age=%.1fs)", newest_empty[1]) + + log.info( + "Barge-in producer started (poll=%dms, signal=%s, recordings=%s, known=%d)", + POLL_INTERVAL * 1000, + SIGNAL_FILE, + RECORDINGS_DIR, + len(state.known_folders), + ) + + stale_counter = 0 + try: + while True: + _scan(state, rec_dir) + + # Check for stale recordings every ~5 seconds + stale_counter += 1 + if 
stale_counter >= int(5.0 / POLL_INTERVAL): + _check_stale(state, rec_dir) + stale_counter = 0 + + time.sleep(POLL_INTERVAL) + except KeyboardInterrupt: + log.info("Shutting down") + # Clean up signal file on exit + if state.recording: + state.end() + + +if __name__ == "__main__": + main() diff --git a/output_queue.py b/output_queue.py index 02d84f9..31d733f 100644 --- a/output_queue.py +++ b/output_queue.py @@ -102,6 +102,16 @@ def submit(self, channel_id: str, fn: Callable, **metadata) -> QueuedJob: """Submit a job to a channel's queue. Non-blocking.""" return self.get_queue(channel_id).submit(fn, **metadata) + def cancel_channel(self, channel_id: str) -> int: + """Cancel all pending jobs for a channel. Returns number of jobs cancelled.""" + queue = self._queues.get(channel_id) + if not queue: + return 0 + with queue._lock: + cancelled = len(queue._queue) + queue._queue.clear() + return cancelled + def status(self) -> dict[str, Any]: """Snapshot of all channel queues.""" return { diff --git a/providers.py b/providers.py new file mode 100644 index 0000000..cc551c4 --- /dev/null +++ b/providers.py @@ -0,0 +1,439 @@ +"""Inference providers for the Mod³ agent loop. + +Abstracts LLM backends behind an InferenceProvider protocol. +Each provider returns structured ToolCall responses, not streaming text. 
+""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import re +from dataclasses import dataclass, field +from typing import Any, Protocol, runtime_checkable + +import httpx + +logger = logging.getLogger("mod3.providers") + +# --------------------------------------------------------------------------- +# Types +# --------------------------------------------------------------------------- + + +@dataclass +class ToolCall: + """A single tool invocation from the LLM.""" + + name: str + arguments: dict[str, Any] + + +@dataclass +class ProviderResponse: + """Structured response from an inference provider.""" + + tool_calls: list[ToolCall] = field(default_factory=list) + text: str = "" # fallback plain text (no tool call) + raw: dict | None = None + + +# --------------------------------------------------------------------------- +# Protocol +# --------------------------------------------------------------------------- + + +@runtime_checkable +class InferenceProvider(Protocol): + @property + def name(self) -> str: ... + + async def chat( + self, + messages: list[dict], + tools: list[dict] | None = None, + system: str = "", + ) -> ProviderResponse: ... + + +# --------------------------------------------------------------------------- +# Tool definitions +# --------------------------------------------------------------------------- + +AGENT_TOOLS: list[dict] = [ + { + "type": "function", + "function": { + "name": "speak", + "description": "Speak text aloud to the user via TTS. Use for conversational responses.", + "parameters": { + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The text to speak aloud", + } + }, + "required": ["text"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "send_text", + "description": "Send text to the chat panel (no speech). 
Use for code, lists, links, or anything better read than heard.", + "parameters": { + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The text to display in chat", + } + }, + "required": ["text"], + }, + }, + }, +] + + +# --------------------------------------------------------------------------- +# MLX provider (in-process, Apple Silicon) +# --------------------------------------------------------------------------- + + +def _format_tools_for_prompt(tools: list[dict]) -> str: + """Format tool definitions into a text block for Gemma's system prompt.""" + lines = ["You have access to the following tools:\n"] + for tool in tools: + fn = tool.get("function", tool) + name = fn.get("name", "") + desc = fn.get("description", "") + params = fn.get("parameters", {}) + lines.append(f"- **{name}**: {desc}") + props = params.get("properties", {}) + required = set(params.get("required", [])) + if props: + lines.append(" Parameters:") + for pname, pinfo in props.items(): + req_marker = " (required)" if pname in required else "" + lines.append( + f" - {pname} ({pinfo.get('type', 'string')}): {pinfo.get('description', '')}{req_marker}" + ) + lines.append( + "\nTo call a tool, output exactly:\n" + "\n" + '{"name": "", "arguments": {}}\n' + "\n" + "\nYou may make multiple tool calls. Every tool call must be wrapped " + "in its own block." 
+ ) + return "\n".join(lines) + + +_TOOL_CALL_RE = re.compile(r"\s*(\{.*?\})\s*", re.DOTALL) + + +def _parse_tool_calls(text: str) -> list[ToolCall]: + """Extract JSON blocks from model output.""" + calls: list[ToolCall] = [] + for match in _TOOL_CALL_RE.finditer(text): + try: + obj = json.loads(match.group(1)) + name = obj.get("name", "") + args = obj.get("arguments", {}) + if isinstance(args, str): + try: + args = json.loads(args) + except Exception: + args = {"text": args} + if name: + calls.append(ToolCall(name=name, arguments=args)) + except json.JSONDecodeError: + logger.warning("mlx: failed to parse tool_call JSON: %s", match.group(1)) + return calls + + +class MlxProvider: + """In-process Gemma inference on Apple Silicon via mlx-lm. + + Lazy-loads the model on first call. The model stays resident in memory + for the lifetime of the process — no cold-start on subsequent calls. + """ + + def __init__(self, model_id: str | None = None): + self._model_id = model_id or os.environ.get("MLX_MODEL", "mlx-community/gemma-3-4b-it-4bit") + self._model = None + self._tokenizer = None + + @property + def name(self) -> str: + return f"mlx/{self._model_id}" + + def _ensure_loaded(self) -> None: + """Load model + tokenizer on first use (synchronous, called from thread).""" + if self._model is not None: + return + from mlx_lm import load + + logger.info("mlx: loading model %s (first call, this may take a moment)", self._model_id) + self._model, self._tokenizer = load(self._model_id) + logger.info("mlx: model loaded successfully") + + def _generate_sync( + self, + messages: list[dict], + tools: list[dict] | None, + system: str, + ) -> ProviderResponse: + """Run generation synchronously (meant to be called via asyncio.to_thread).""" + from mlx_lm import generate + + self._ensure_loaded() + + # Build messages list with system prompt + msgs = list(messages) + system_parts: list[str] = [] + if system: + system_parts.append(system) + if tools: + 
system_parts.append(_format_tools_for_prompt(tools)) + if system_parts: + msgs = [{"role": "system", "content": "\n\n".join(system_parts)}] + msgs + + # Apply chat template + prompt = self._tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False) + + max_tokens = int(os.environ.get("MLX_MAX_TOKENS", "512")) + raw_output = generate( + self._model, + self._tokenizer, + prompt=prompt, + max_tokens=max_tokens, + ) + + # Parse tool calls from output + tool_calls = _parse_tool_calls(raw_output) + + # Strip tool_call blocks from the text to get any remaining content + text = _TOOL_CALL_RE.sub("", raw_output).strip() + + return ProviderResponse( + tool_calls=tool_calls, + text=text, + raw={"model": self._model_id, "output": raw_output}, + ) + + async def chat( + self, + messages: list[dict], + tools: list[dict] | None = None, + system: str = "", + ) -> ProviderResponse: + return await asyncio.to_thread(self._generate_sync, messages, tools, system) + + +# --------------------------------------------------------------------------- +# Ollama provider +# --------------------------------------------------------------------------- + + +class OllamaProvider: + """Ollama inference with native tool calling (validated with Gemma 4 E4B).""" + + def __init__( + self, + endpoint: str | None = None, + model: str | None = None, + ): + self._endpoint = endpoint or os.environ.get("OLLAMA_ENDPOINT", "http://localhost:11434") + self._model = model or os.environ.get("OLLAMA_MODEL", "gemma4:e4b") + + @property + def name(self) -> str: + return f"ollama/{self._model}" + + async def chat( + self, + messages: list[dict], + tools: list[dict] | None = None, + system: str = "", + ) -> ProviderResponse: + msgs = list(messages) + if system: + msgs = [{"role": "system", "content": system}] + msgs + + body: dict[str, Any] = { + "model": self._model, + "messages": msgs, + "stream": False, + "think": False, + } + if tools: + body["tools"] = tools + + async with 
httpx.AsyncClient(timeout=300.0) as client: + resp = await client.post(f"{self._endpoint}/api/chat", json=body) + resp.raise_for_status() + data = resp.json() + + msg = data.get("message", {}) + raw_tool_calls = msg.get("tool_calls", []) + content = msg.get("content", "") + + tool_calls = [] + for tc in raw_tool_calls: + fn = tc.get("function", {}) + name = fn.get("name", "") + args = fn.get("arguments", {}) + if isinstance(args, str): + import json + + try: + args = json.loads(args) + except Exception: + args = {"text": args} + if name: + tool_calls.append(ToolCall(name=name, arguments=args)) + + return ProviderResponse(tool_calls=tool_calls, text=content, raw=data) + + +# --------------------------------------------------------------------------- +# CogOS provider (OpenAI-compatible SSE) +# --------------------------------------------------------------------------- + + +class CogOSProvider: + """CogOS kernel — OpenAI-compatible chat/completions with tool support.""" + + def __init__(self, endpoint: str | None = None): + self._endpoint = endpoint or os.environ.get("COGOS_ENDPOINT", "http://localhost:5100") + + @property + def name(self) -> str: + return "cogos" + + async def chat( + self, + messages: list[dict], + tools: list[dict] | None = None, + system: str = "", + ) -> ProviderResponse: + msgs = list(messages) + if system: + msgs = [{"role": "system", "content": system}] + msgs + + body: dict[str, Any] = { + "model": "cogos/auto", + "messages": msgs, + "stream": False, + } + if tools: + body["tools"] = tools + + headers = { + "X-UCP-Identity": '{"name":"cog"}', + "X-Session-ID": "mod3-dashboard", + "X-Origin": "mod3-dashboard", + } + + async with httpx.AsyncClient(timeout=60.0) as client: + resp = await client.post( + f"{self._endpoint}/v1/chat/completions", + json=body, + headers=headers, + ) + resp.raise_for_status() + data = resp.json() + + choice = data.get("choices", [{}])[0] + msg = choice.get("message", {}) + content = msg.get("content", "") + 
raw_tool_calls = msg.get("tool_calls", []) + + tool_calls = [] + for tc in raw_tool_calls: + fn = tc.get("function", {}) + name = fn.get("name", "") + args = fn.get("arguments", {}) + if isinstance(args, str): + import json + + try: + args = json.loads(args) + except Exception: + args = {"text": args} + if name: + tool_calls.append(ToolCall(name=name, arguments=args)) + + return ProviderResponse(tool_calls=tool_calls, text=content, raw=data) + + +# --------------------------------------------------------------------------- +# Auto-detection +# --------------------------------------------------------------------------- + + +async def _probe(url: str, timeout: float = 2.0) -> bool: + try: + async with httpx.AsyncClient(timeout=timeout) as client: + r = await client.get(url) + return r.status_code < 500 + except Exception: + return False + + +def _mlx_available() -> bool: + """Check whether mlx-lm is importable (Apple Silicon with mlx installed).""" + try: + import mlx_lm # noqa: F401 + + return True + except ImportError: + return False + + +async def auto_detect_provider_async() -> InferenceProvider: + """Probe available backends: MLX > CogOS > Ollama.""" + # MLX — in-process, fastest, no network overhead + if _mlx_available(): + logger.info("auto-detect: using MLX (in-process)") + return MlxProvider() + + # CogOS — local kernel with OpenAI-compat API + cogos_endpoint = os.environ.get("COGOS_ENDPOINT", "http://localhost:5100") + if await _probe(f"{cogos_endpoint}/health"): + logger.info("auto-detect: using CogOS at %s", cogos_endpoint) + return CogOSProvider(endpoint=cogos_endpoint) + + # Ollama — local daemon + ollama_endpoint = os.environ.get("OLLAMA_ENDPOINT", "http://localhost:11434") + if await _probe(f"{ollama_endpoint}/api/tags"): + logger.info("auto-detect: using Ollama at %s", ollama_endpoint) + return OllamaProvider(endpoint=ollama_endpoint) + + logger.warning("auto-detect: no provider found, defaulting to Ollama") + return OllamaProvider() + + +def 
auto_detect_provider() -> InferenceProvider: + """Synchronous wrapper for auto-detection.""" + import asyncio + + # Fast path: MLX doesn't need async probing + if _mlx_available(): + logger.info("auto-detect: using MLX (in-process)") + return MlxProvider() + + try: + asyncio.get_running_loop() + except RuntimeError: + return asyncio.run(auto_detect_provider_async()) + + # If called from an async context, can't use asyncio.run — default to Ollama + logger.info("auto-detect: async context, defaulting to Ollama") + return OllamaProvider() diff --git a/requirements.txt b/requirements.txt index 772b8ea..b927da7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ espeakng-loader>=0.2.0 setuptools fastapi>=0.115.0 uvicorn>=0.30.0 +httpx>=0.24.0 diff --git a/server.py b/server.py index 690156d..4d18002 100644 --- a/server.py +++ b/server.py @@ -20,6 +20,7 @@ import json import logging +import os import threading import time import uuid @@ -359,6 +360,53 @@ async def _filter_read_stream(): pipeline_state = PipelineState() +# --------------------------------------------------------------------------- +# Barge-in file watcher — monitors /tmp/mod3-barge-in.json for pause signals +# --------------------------------------------------------------------------- + +_BARGEIN_SIGNAL = "/tmp/mod3-barge-in.json" +_bargein_last_mtime: float = 0.0 + + +def _bargein_watcher(): + """Background thread that watches for barge-in signal file changes.""" + global _bargein_last_mtime + import json as _json + + while True: + try: + import os + + if os.path.exists(_BARGEIN_SIGNAL): + mtime = os.path.getmtime(_BARGEIN_SIGNAL) + if mtime > _bargein_last_mtime: + _bargein_last_mtime = mtime + with open(_BARGEIN_SIGNAL) as f: + signal = _json.load(f) + if signal.get("event") == "user_speaking_start": + if pipeline_state.is_speaking: + info = pipeline_state.interrupt(reason="barge_in") + if info: + # Write interrupt context back to signal file + signal["interrupted"] = { + 
"spoken_pct": info.spoken_pct, + "delivered_text": info.delivered_text, + "full_text": info.full_text, + } + with open(_BARGEIN_SIGNAL, "w") as f: + _json.dump(signal, f, indent=2) + logging.info( + "Barge-in: paused playback (%.0f%% delivered)", info.spoken_pct * 100 if info else 0 + ) + except Exception as e: + logging.debug("Barge-in watcher error: %s", e) + time.sleep(0.1) # 100ms poll + + +_bargein_thread = threading.Thread(target=_bargein_watcher, daemon=True) +_bargein_thread.start() + + async def _emit_interruption(info: InterruptInfo): """Emit a channel notification when playback is interrupted. @@ -702,6 +750,34 @@ def speak( if not text.strip(): return json.dumps({"status": "error", "error": "Nothing to say"}) + # Check if user is currently speaking (barge-in signal file) + user_state = "idle" + try: + if os.path.exists(_BARGEIN_SIGNAL): + with open(_BARGEIN_SIGNAL) as _bf: + _bsig = json.load(_bf) + if _bsig.get("event") == "user_speaking_start": + user_state = "recording" + except Exception: + pass # signal file missing or corrupt — assume idle + + # If user is currently recording, don't play — just inform the agent. + # The agent is responsible for re-calling speak() after the user finishes. + # We intentionally do NOT enqueue the job or create a _jobs entry, because + # a "held" job in the queue becomes a zombie: the drain thread tries to play + # it immediately (ignoring the hold), and if anything goes wrong the job + # can't be cleared by stop(). 
+ if user_state == "recording": + est_duration = _estimate_duration_sec(text, speed) + return json.dumps( + { + "status": "held", + "reason": "User is currently speaking — re-send this speak() call after user finishes.", + "user_state": "recording", + "estimated_duration_sec": round(est_duration, 1), + } + ) + try: job_id, position = _start_speech(text, voice, stream=stream, speed=speed, emotion=emotion) except ValueError as e: @@ -714,7 +790,8 @@ def speak( if currently_playing is None or currently_playing["job_id"] == job_id: # Playing immediately (no queue ahead) - return json.dumps({"status": "speaking", "job_id": job_id}) + result = {"status": "speaking", "job_id": job_id} + return json.dumps(result) # Something is already playing — return enriched queue status queue_snapshot = _speech_queue.get_queue_snapshot() @@ -756,6 +833,8 @@ def speak( "To cancel all and speak immediately, call stop() then speak()." ), } + if user_state != "idle": + result["user_state"] = user_state return json.dumps(result) @@ -797,7 +876,7 @@ def speech_status(job_id: str = "", verbose: bool = False) -> str: if entry["job_id"] == job_id: result["queue_position"] = i + 1 break - if job["metrics"]: + if job.get("metrics"): metrics = job["metrics"] if not verbose and "chunks" in metrics: chunks = metrics["chunks"]["per_chunk"] @@ -811,7 +890,7 @@ def speech_status(job_id: str = "", verbose: bool = False) -> str: }, } result["metrics"] = metrics - if job["error"]: + if job.get("error"): result["error"] = job["error"] # Always include queue state @@ -882,9 +961,9 @@ def stop(job_id: str = "") -> str: # No job_id: stop everything — interrupt current + clear queue cleared = _speech_queue.cancel_all_queued() - # Mark all cleared queued jobs + # Mark all cleared queued and held jobs as cancelled for jid, jdata in _jobs.items(): - if jdata["status"] == "queued": + if jdata["status"] in ("queued", "held"): jdata["status"] = "cancelled" with _current_player_lock: @@ -1097,6 +1176,7 @@ def 
_run_http(host: str = "0.0.0.0", port: int = 7860): parser.add_argument("--http", action="store_true", help="Run HTTP API only") parser.add_argument("--all", action="store_true", help="Run both MCP (stdio) and HTTP") parser.add_argument("--channel", action="store_true", help="Run as channel server with voice input") + parser.add_argument("--dashboard", action="store_true", help="Run HTTP API with voice/text dashboard (no MCP)") parser.add_argument("--port", type=int, default=7860, help="HTTP port (default: 7860)") parser.add_argument("--host", type=str, default="0.0.0.0", help="HTTP bind address") args = parser.parse_args() @@ -1112,6 +1192,18 @@ def _run_http(host: str = "0.0.0.0", port: int = 7860): ) http_thread.start() mcp.run() + elif args.dashboard: + # Dashboard mode: HTTP server with WebSocket voice/text chat + # Swap PlaceholderDecoder → WhisperDecoder for real STT + from modules.text import TextModule + from modules.voice import VoiceModule, WhisperDecoder + + _bus._modules.clear() + _bus.register(VoiceModule(decoder=WhisperDecoder())) + _bus.register(TextModule()) + logging.basicConfig(level=logging.INFO) + logger.info("Starting dashboard mode (WhisperDecoder enabled)") + _run_http(host=args.host, port=args.port) elif args.channel: # Channel mode: MCP on stdio + inbound voice pipeline from bus import ModalityBus