From d3f7ee1530a41d8e1d1b3df7a4e4c0c4d6805ba6 Mon Sep 17 00:00:00 2001
From: Matteo Merola <mattmezza@gmail.com>
Date: Thu, 25 Jun 2026 13:52:18 +0200
Subject: [PATCH 1/2] feat(voice): clean non-pronounceable chars from TTS
 replies

Voice responses now strip emojis, URLs, code snippets, list bullets, and
markdown symbols (*, #, _, ~, >, |, `) before synthesis, and turn dashes
used as separators into a comma pause, leaving plain speakable text.
Cleaning lives at the synthesize() chokepoint so every caller benefits.

Closes #10

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/test_voice_clean.py | 32 ++++++++++++++++++++++++++++++++
 voice/pipeline.py         | 30 ++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 tests/test_voice_clean.py

diff --git a/tests/test_voice_clean.py b/tests/test_voice_clean.py
new file mode 100644
index 0000000..d5b6018
--- /dev/null
+++ b/tests/test_voice_clean.py
@@ -0,0 +1,32 @@
+from voice.pipeline import clean_for_speech
+
+
+def test_strips_emoji():  # emoji vanish and the whitespace they leave is trimmed
+    assert clean_for_speech("Done 👍 ✅") == "Done"
+
+
+def test_strips_urls():  # pin the full result so an over-eager strip cannot pass
+    assert clean_for_speech("See https://example.com/x for more") == "See for more"
+    assert clean_for_speech("Visit www.example.com now") == "Visit now"
+
+
+def test_strips_code():  # inline spans and fenced blocks are dropped, not read out
+    assert clean_for_speech("Run `npm install` then go") == "Run then go"
+    assert clean_for_speech("Code:\n```\nx = 1\n```\ndone") == "Code:\ndone"
+
+
+def test_strips_markdown_symbols():  # markers go, the words they wrapped stay
+    assert clean_for_speech("**bold** and #heading") == "bold and heading"
+
+
+def test_strips_bullets():  # leading "- " is removed; items stay one per line
+    assert clean_for_speech("- first\n- second") == "first\nsecond"
+
+
+def test_dash_separator_becomes_pause():  # spaced dash → comma; in-word hyphen kept
+    assert clean_for_speech("yes — really") == "yes, really"
+    assert clean_for_speech("e-mail stays") == "e-mail stays"
+
+
+def test_plain_text_untouched():  # ordinary punctuation must survive unchanged
+    assert clean_for_speech("Hello there, how are you?") == "Hello there, how are you?"
diff --git a/voice/pipeline.py b/voice/pipeline.py
index 2aa09b6..71fba29 100644
--- a/voice/pipeline.py
+++ b/voice/pipeline.py
@@ -4,6 +4,8 @@
 
 import io
 import logging
+import re
+import unicodedata
 from functools import partial
 from typing import TYPE_CHECKING
 
@@ -15,6 +17,33 @@
 
 log = logging.getLogger(__name__)
 
+_CODE_BLOCK_RE = re.compile(r"```.*?```", re.DOTALL)  # fenced code, may span lines
+_INLINE_CODE_RE = re.compile(r"`[^`]*`")  # text between a pair of single backticks
+_URL_RE = re.compile(r"\b(?:https?://|www\.)\S+", re.IGNORECASE)  # up to next whitespace
+_LIST_MARKER_RE = re.compile(r"^[ \t]*[-*•‣◦]+[ \t]+", re.MULTILINE)  # leading bullets
+_MD_SYMBOLS_RE = re.compile(r"[*#_~>`|]")  # markdown emphasis/heading/table chars
+_WS_RE = re.compile(r"[ \t]{2,}")  # runs of spaces/tabs only; newlines are left alone
+
+
+def clean_for_speech(text: str) -> str:
+    """Return *text* reduced to plain, speakable words for TTS (see issue #10).
+
+    Drops code, URLs, bullets, markdown symbols and emoji; links keep their label.
+    """
+    text = _INLINE_CODE_RE.sub(" ", _CODE_BLOCK_RE.sub(" ", text))  # fenced first
+    # [label](target) / ![alt](src) → label, else the URL pass leaves "[label]("
+    text = re.sub(r"!?\[([^\]]*)\]\([^)]*\)", r"\1", text)
+    text = _URL_RE.sub(" ", text)
+    text = _LIST_MARKER_RE.sub("", text)
+    # dashes used as separators → pause; keep hyphens inside words
+    text = re.sub(r"\s[-–—]+\s", ", ", text)
+    text = _MD_SYMBOLS_RE.sub("", text)
+    # drop emoji (category "So") and the joiners/selectors/skin tones they leave behind
+    text = re.sub(r"[\u200d\ufe0e\ufe0f\u20e3\U0001f3fb-\U0001f3ff]", "", text)
+    text = "".join(ch for ch in text if unicodedata.category(ch) != "So")
+    text = _WS_RE.sub(" ", text)
+    return "\n".join(part for part in map(str.strip, text.splitlines()) if part)
+
 
 class VoicePipeline:
     """Speech-to-text via faster-whisper, text-to-speech via edge-tts."""
@@ -62,6 +91,7 @@ async def synthesize(self, text: str) -> bytes:
         if not self.tts_enabled:
             raise RuntimeError("TTS is disabled in config")
 
+        text = clean_for_speech(text)
         communicate = edge_tts.Communicate(text, self.tts_voice)
         buf = io.BytesIO()
         async for chunk in communicate.stream():

From 57d880aa84909c8d74d34b063e235a793d703db8 Mon Sep 17 00:00:00 2001
From: Matteo Merola <mattmezza@gmail.com>
Date: Thu, 25 Jun 2026 14:04:38 +0200
Subject: [PATCH 2/2] docs(voice): instruct the model to write speakable text
 for voice replies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The post-hoc cleaner is a safety net; intent belongs at the source. When
the model chooses [respond_with_voice], the skill now tells it to write the
whole message to be spoken — no emojis, symbols, URLs, code, or bullets —
and to fall back to text when content only works on screen.

Relates to #10

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 skills/voice.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/skills/voice.md b/skills/voice.md
index 66ba4c3..6b0c909 100644
--- a/skills/voice.md
+++ b/skills/voice.md
@@ -17,3 +17,20 @@ Use voice responses when:
 Do NOT use voice responses when:
 - The response contains code, links, or structured data.
 - The response is long or complex.
+
+## Writing for voice
+
+When you add `[respond_with_voice]`, write the whole message to be *spoken*, not
+read. The medium changed, so the style changes with it. Before deciding on voice,
+ask: does this content even work aloud? If it only makes sense on screen, reply
+with text instead.
+
+A voice reply must contain only plain, speakable words:
+- No emojis, no symbols (`*`, `#`, `~`, `>`, etc.) — say the meaning instead.
+- No URLs — describe the link ("I sent the booking page") or send it as text.
+- No code snippets, tables, or structured/markdown formatting.
+- No bullet points or dashes as list markers — speak it as flowing sentences
+  ("First… then… finally…").
+- Spell awkward things out: say "version one point two", not "v1.2".
+
+Keep it short and conversational, the way you'd actually say it out loud.