"""Tests for ``voice.pipeline.clean_for_speech``.

Voice replies are cleaned before synthesis so that only plain, speakable text
reaches the TTS engine: emojis, URLs, code snippets, bullets, dash separators
and markdown symbols (*, #, _, ~, >, |) are stripped. See issue #10.
"""
from voice.pipeline import clean_for_speech


def test_strips_emoji():
    # Pictographs disappear and the whitespace they leave behind is trimmed.
    spoken = clean_for_speech("Done 👍 ✅")
    assert spoken == "Done"


def test_strips_urls():
    # Both scheme-prefixed and bare "www." addresses must be removed.
    with_scheme = clean_for_speech("See https://example.com/x for more")
    assert "http" not in with_scheme

    bare_www = clean_for_speech("Visit www.example.com now")
    assert "www" not in bare_www


def test_strips_code():
    # Inline code spans vanish without leaving a double space.
    inline = "Run `npm install` then go"
    assert clean_for_speech(inline) == "Run then go"

    # Fenced blocks vanish together with the now-empty line they occupied.
    fenced = "Code:\n```\nx = 1\n```\ndone"
    assert clean_for_speech(fenced) == "Code:\ndone"


def test_strips_markdown_symbols():
    markdown = "**bold** and #heading"
    assert clean_for_speech(markdown) == "bold and heading"


def test_strips_bullets():
    bulleted = "- first\n- second"
    assert clean_for_speech(bulleted) == "first\nsecond"


def test_dash_separator_becomes_pause():
    # A dash surrounded by whitespace is a separator and turns into a comma...
    separated = clean_for_speech("yes — really")
    assert separated == "yes, really"

    # ...while a hyphen inside a word is left alone.
    hyphenated = clean_for_speech("e-mail stays")
    assert hyphenated == "e-mail stays"


def test_plain_text_untouched():
    sentence = "Hello there, how are you?"
    assert clean_for_speech(sentence) == sentence
# voice/pipeline.py: text clean-up applied to every TTS request.
# VoicePipeline.synthesize() calls clean_for_speech(text) right before the
# text is handed to edge_tts.Communicate, so every caller gets cleaned text.
import re
import unicodedata

_CODE_BLOCK_RE = re.compile(r"```.*?```", re.DOTALL)  # fenced code
_INLINE_CODE_RE = re.compile(r"`[^`]*`")
# [label](target) and ![alt](target): keep the label/alt text, drop the target
_MD_LINK_RE = re.compile(r"!?\[([^\]]*)\]\([^)]*\)")
_URL_RE = re.compile(r"\b(?:https?://|www\.)\S+", re.IGNORECASE)
_LIST_MARKER_RE = re.compile(r"^[ \t]*[-*•‣◦]+[ \t]+", re.MULTILINE)  # leading bullets
# dash (hyphen, en dash, em dash) with whitespace on both sides = separator
_DASH_SEPARATOR_RE = re.compile(r"\s[-–—]+\s")
_MD_SYMBOLS_RE = re.compile(r"[*#_~>`|]")  # markdown emphasis/heading/table chars
_WS_RE = re.compile(r"[ \t]{2,}")

_ZWJ = "\u200d"  # zero-width joiner, glues emoji into one sequence
# Code points that only exist to modify a neighbouring emoji. None of them is
# in Unicode category "So", so a category check alone leaves them in the text.
_EMOJI_GLUE = frozenset(
    ["\ufe0e", "\ufe0f", "\u20e3"]  # presentation selectors, keycap mark
    + [chr(cp) for cp in range(0x1F3FB, 0x1F400)]  # skin-tone modifiers
    + [chr(cp) for cp in range(0xE0020, 0xE0080)]  # tag chars (subdivision flags)
)


def _is_pictographic(ch: str) -> bool:
    """Return True for a pictographic symbol ("So") or an emoji-only modifier."""
    return ch in _EMOJI_GLUE or unicodedata.category(ch) == "So"


def _strip_pictographs(text: str) -> str:
    """Remove emoji and other pictographs, including their invisible glue.

    A zero-width joiner is dropped only when it touches a pictograph: the same
    code point also shapes ordinary text in some scripts, and there it must
    survive.
    """
    kept = []
    for i, ch in enumerate(text):
        if _is_pictographic(ch):
            continue
        if ch == _ZWJ:
            neighbours = text[max(i - 1, 0):i] + text[i + 1:i + 2]
            if any(_is_pictographic(n) for n in neighbours):
                continue
        kept.append(ch)
    return "".join(kept)


def clean_for_speech(text: str) -> str:
    """Strip anything that reads badly when spoken: code, URLs, emojis, markdown.

    Voice replies should be plain speakable text — no emojis, bullets, code
    snippets, URLs, or symbols like * and #. See issue #10.

    The steps run in a fixed order because later patterns rely on earlier
    ones: code is removed before anything inspects its contents, markdown
    links are reduced to their label before bare URLs are dropped, and list
    markers are removed before the remaining ``*``/``-`` characters are
    treated as emphasis or separators.

    Args:
        text: Raw reply text, possibly containing markdown, URLs or emoji.

    Returns:
        The cleaned text. Runs of spaces/tabs are collapsed, every line is
        stripped and empty lines are dropped; the result is ``""`` when
        nothing speakable is left.
    """
    text = _CODE_BLOCK_RE.sub(" ", text)
    text = _INLINE_CODE_RE.sub(" ", text)
    # "[the docs](https://…)" -> "the docs"; without this the URL pattern eats
    # the closing ")" and leaves "[the docs](" behind.
    text = _MD_LINK_RE.sub(r"\1", text)
    text = _URL_RE.sub(" ", text)
    text = _LIST_MARKER_RE.sub("", text)
    # dashes used as separators → pause; hyphens inside words are kept
    text = _DASH_SEPARATOR_RE.sub(", ", text)
    text = _MD_SYMBOLS_RE.sub("", text)
    text = _strip_pictographs(text)
    text = _WS_RE.sub(" ", text)
    lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in lines if line)
From: Matteo Merola Date: Thu, 25 Jun 2026 14:04:38 +0200 Subject: [PATCH 2/2] docs(voice): instruct the model to write speakable text for voice replies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The post-hoc cleaner is a safety net; intent belongs at the source. When the model chooses [respond_with_voice], the skill now tells it to write the whole message to be spoken — no emojis, symbols, URLs, code, or bullets — and to fall back to text when content only works on screen. Relates to #10 Co-Authored-By: Claude Opus 4.8 (1M context) --- skills/voice.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/skills/voice.md b/skills/voice.md index 66ba4c3..6b0c909 100644 --- a/skills/voice.md +++ b/skills/voice.md @@ -17,3 +17,20 @@ Use voice responses when: Do NOT use voice responses when: - The response contains code, links, or structured data. - The response is long or complex. + +## Writing for voice + +When you add `[respond_with_voice]`, write the whole message to be *spoken*, not +read. The medium changed, so the style changes with it. Before deciding on voice, +ask: does this content even work aloud? If it only makes sense on screen, reply +with text instead. + +A voice reply must contain only plain, speakable words: +- No emojis, no symbols (`*`, `#`, `~`, `>`, etc.) — say the meaning instead. +- No URLs — describe the link ("I sent the booking page") or send it as text. +- No code snippets, tables, or structured/markdown formatting. +- No bullet points or dashes as list markers — speak it as flowing sentences + ("First… then… finally…"). +- Spell awkward things out: say "version one point two", not "v1.2". + +Keep it short and conversational, the way you'd actually say it out loud.