Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions skills/voice.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,20 @@ Use voice responses when:
Do NOT use voice responses when:
- The response contains code, links, or structured data.
- The response is long or complex.

## Writing for voice

When you add `[respond_with_voice]`, write the whole message to be *spoken*, not
read. The medium changed, so the style changes with it. Before deciding on voice,
ask: does this content even work aloud? If it only makes sense on screen, reply
with text instead.

A voice reply must contain only plain, speakable words:
- No emojis, no symbols (`*`, `#`, `~`, `>`, etc.) — say the meaning instead.
- No URLs — describe the link ("I sent the booking page") or send it as text.
- No code snippets, tables, or structured/markdown formatting.
- No bullet points or dashes as list markers — speak it as flowing sentences
("First… then… finally…").
- Spell awkward things out: say "version one point two", not "v1.2".

Keep it short and conversational, the way you'd actually say it out loud.
32 changes: 32 additions & 0 deletions tests/test_voice_clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from voice.pipeline import clean_for_speech


def test_strips_emoji():
    """Emoji are removed and the whitespace they leave behind is trimmed."""
    spoken = clean_for_speech("Done 👍 ✅")
    assert spoken == "Done"


def test_strips_urls():
    """Neither scheme-prefixed nor bare ``www.`` links survive cleaning."""
    with_scheme = clean_for_speech("See https://example.com/x for more")
    bare_www = clean_for_speech("Visit www.example.com now")
    assert "http" not in with_scheme
    assert "www" not in bare_www


def test_strips_code():
    """Inline code spans and fenced code blocks are both dropped."""
    inline = clean_for_speech("Run `npm install` then go")
    fenced = clean_for_speech("Code:\n```\nx = 1\n```\ndone")
    assert inline == "Run then go"
    assert fenced == "Code:\ndone"


def test_strips_markdown_symbols():
    """Emphasis and heading markers vanish while the words themselves stay."""
    spoken = clean_for_speech("**bold** and #heading")
    assert spoken == "bold and heading"


def test_strips_bullets():
    """Leading list markers are removed; one item per line is preserved."""
    spoken = clean_for_speech("- first\n- second")
    assert spoken == "first\nsecond"


def test_dash_separator_becomes_pause():
    """A free-standing dash turns into a comma; in-word hyphens are kept."""
    separated = clean_for_speech("yes — really")
    hyphenated = clean_for_speech("e-mail stays")
    assert separated == "yes, really"
    assert hyphenated == "e-mail stays"


def test_plain_text_untouched():
    """Already-speakable text passes through unchanged."""
    sentence = "Hello there, how are you?"
    assert clean_for_speech(sentence) == sentence
30 changes: 30 additions & 0 deletions voice/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import io
import logging
import re
import unicodedata
from functools import partial
from typing import TYPE_CHECKING

Expand All @@ -15,6 +17,33 @@

log = logging.getLogger(__name__)

_CODE_BLOCK_RE = re.compile(r"```.*?```", re.DOTALL)  # fenced code
_INLINE_CODE_RE = re.compile(r"`[^`]*`")
_URL_RE = re.compile(r"\b(?:https?://|www\.)\S+", re.IGNORECASE)
_LIST_MARKER_RE = re.compile(r"^[ \t]*[-*•‣◦]+[ \t]+", re.MULTILINE)  # leading bullets
_DASH_SEPARATOR_RE = re.compile(r"\s[-–—]+\s")  # free-standing dash, not an in-word hyphen
_MD_SYMBOLS_RE = re.compile(r"[*#_~>`|]")  # markdown emphasis/heading/table chars
_WS_RE = re.compile(r"[ \t]{2,}")
_SPACE_BEFORE_COMMA_RE = re.compile(r"[ \t]+,")

# Code points that only exist to decorate an emoji and are not category "So":
# text/emoji variation selectors, the combining keycap, the five skin-tone
# modifiers and the tag characters used by subdivision flags.
_EMOJI_MODIFIERS = frozenset(
    "\ufe0e\ufe0f\u20e3"
    + "".join(map(chr, range(0x1F3FB, 0x1F400)))
    + "".join(map(chr, range(0xE0020, 0xE0080)))
)
_ZWJ = "\u200d"  # zero-width joiner gluing multi-person / profession emoji together


def _strip_emoji(text: str) -> str:
    """Remove pictographic symbols together with their invisible modifiers."""
    kept = []
    prev_dropped = False
    for ch in text:
        drop = (
            unicodedata.category(ch) == "So"
            or ch in _EMOJI_MODIFIERS
            # A ZWJ is meaningful in some scripts, so only drop it when it is
            # part of an emoji sequence that is being removed.
            or (ch == _ZWJ and prev_dropped)
        )
        if not drop:
            kept.append(ch)
        prev_dropped = drop
    return "".join(kept)


def clean_for_speech(text: str) -> str:
    """Strip anything that reads badly when spoken: code, URLs, emojis, markdown.

    Voice replies should be plain speakable text — no emojis, bullets, code
    snippets, URLs, or symbols like * and #. See issue #10.

    The steps run in a fixed order: fenced code is removed before inline code
    (so fences are not mistaken for inline spans), and markdown symbols are
    stripped only after list markers and dash separators have been handled.

    Args:
        text: Raw reply text, possibly containing markdown, links or emoji.

    Returns:
        The cleaned text with runs of spaces collapsed, every line stripped and
        empty lines dropped. May be the empty string if nothing speakable
        remains.
    """
    text = _CODE_BLOCK_RE.sub(" ", text)
    text = _INLINE_CODE_RE.sub(" ", text)
    text = _URL_RE.sub(" ", text)
    text = _LIST_MARKER_RE.sub("", text)
    # dashes used as separators → pause; keep hyphens inside words
    text = _DASH_SEPARATOR_RE.sub(", ", text)
    text = _MD_SYMBOLS_RE.sub("", text)
    # drop emoji & other pictographic symbols, including skin-tone modifiers,
    # variation selectors and joiners that category "So" alone would miss
    text = _strip_emoji(text)
    text = _WS_RE.sub(" ", text)
    # an element removed just before a dash separator leaves "word , next"
    text = _SPACE_BEFORE_COMMA_RE.sub(",", text)
    lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in lines if line)


class VoicePipeline:
"""Speech-to-text via faster-whisper, text-to-speech via edge-tts."""
Expand Down Expand Up @@ -62,6 +91,7 @@ async def synthesize(self, text: str) -> bytes:
if not self.tts_enabled:
raise RuntimeError("TTS is disabled in config")

text = clean_for_speech(text)
communicate = edge_tts.Communicate(text, self.tts_voice)
buf = io.BytesIO()
async for chunk in communicate.stream():
Expand Down
Loading