-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathengine.py
More file actions
200 lines (169 loc) · 6.18 KB
/
engine.py
File metadata and controls
200 lines (169 loc) · 6.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""Mod³ inference core — model registry, loading, and audio generation.
No MCP or playback dependencies. Takes text + params, yields numpy audio chunks.
"""
import threading
from dataclasses import dataclass
from typing import Iterator
import numpy as np
import pysbd
# Shared sentence segmenter (language="en", cleaning disabled), built once at
# import time and reused by split_sentences() instead of per call.
_segmenter = pysbd.Segmenter(language="en", clean=False)
# ---------------------------------------------------------------------------
# Model registry
# ---------------------------------------------------------------------------
MODELS = {
"voxtral": {
"id": "mlx-community/Voxtral-4B-TTS-2603-mlx-4bit",
"voices": [
"casual_male",
"casual_female",
"cheerful_female",
"neutral_male",
"neutral_female",
"fr_male",
"fr_female",
"es_male",
"es_female",
"de_male",
"de_female",
"it_male",
"it_female",
"pt_male",
"pt_female",
"nl_male",
"nl_female",
"ar_male",
"hi_male",
"hi_female",
],
"default_voice": "casual_male",
},
"kokoro": {
"id": "mlx-community/Kokoro-82M-bf16",
"voices": [
"af_heart",
"af_bella",
"af_nicole",
"af_sarah",
"af_sky",
"am_adam",
"am_michael",
"bf_emma",
"bf_isabella",
"bm_george",
"bm_lewis",
],
"default_voice": "af_heart",
"supports_speed": True,
},
"chatterbox": {
"id": "mlx-community/chatterbox-4bit",
"voices": ["chatterbox"],
"default_voice": "chatterbox",
"supports_exaggeration": True,
},
"spark": {
"id": "mlx-community/Spark-TTS-0.5B-bf16",
"voices": ["spark_male", "spark_female"],
"default_voice": "spark_male",
"supports_pitch": True,
"supports_speed": True,
},
}
_models: dict = {}
_model_lock = threading.Lock()
def split_sentences(text: str) -> list[str]:
    """Segment *text* with the shared pysbd segmenter.

    The input is stripped before segmentation; each resulting segment is
    stripped as well, and segments that end up empty are dropped.
    """
    cleaned: list[str] = []
    for raw in _segmenter.segment(text.strip()):
        piece = raw.strip()
        if piece:
            cleaned.append(piece)
    return cleaned
def resolve_model(voice: str) -> tuple[str, str]:
    """Map a voice name to the engine that provides it.

    Returns:
        ``(engine_name, voice)`` for the first registry entry, in MODELS
        order, whose "voices" list contains *voice*.

    Raises:
        ValueError: if no registered engine lists *voice*.
    """
    owner = next(
        (name for name, spec in MODELS.items() if voice in spec["voices"]),
        None,
    )
    if owner is None:
        raise ValueError(f"Unknown voice '{voice}'. Use list_voices() to see options.")
    return owner, voice
def get_model(engine: str):
    """Return the model for *engine*, loading and caching it on first use.

    An already-cached engine is returned without taking the lock. Otherwise
    the cache is checked again while holding ``_model_lock`` so that
    concurrent callers do not each load the same model.
    """
    # Fast path: nothing to do once the engine has been loaded.
    if engine in _models:
        return _models[engine]

    with _model_lock:
        # Another thread may have finished loading while we waited for the lock.
        if engine not in _models:
            # Imported lazily so the loader is only pulled in when a model is needed.
            from mlx_audio.tts import load

            _models[engine] = load(MODELS[engine]["id"])
        return _models[engine]
def get_loaded_engines() -> list[str]:
    """List the engine names that currently have a cached model."""
    return [*_models]
# ---------------------------------------------------------------------------
# Audio chunk
# ---------------------------------------------------------------------------
@dataclass
class AudioChunk:
    """One piece of generated audio together with its sample rate and metadata."""

    # Audio samples; generate_audio() produces 1-D float32 arrays here.
    samples: np.ndarray
    # Sample rate in Hz that `samples` should be played back at.
    sample_rate: int
    # Per-chunk generation statistics; empty for the silent inter-sentence gaps.
    metadata: dict
# ---------------------------------------------------------------------------
# Generation
# ---------------------------------------------------------------------------
def generate_audio(
    text: str,
    voice: str = "bm_lewis",
    speed: float = 1.25,
    emotion: float = 0.5,
    stream: bool = True,
    streaming_interval: float = 1.0,
) -> Iterator[AudioChunk]:
    """Yield AudioChunks for the given text. Core generation pipeline.

    The text is split into sentences and each sentence is generated
    separately. The last chunk of every sentence gets a short fade-out, and
    a chunk of silence is yielded between consecutive sentences.

    Args:
        text: Text to speak.
        voice: Voice name; it also selects the engine via resolve_model().
        speed: Forwarded as ``speed`` for "spark" and for engines whose
            registry entry sets "supports_speed".
        emotion: Forwarded as ``exaggeration`` for the "chatterbox" engine.
        stream: Forwarded as ``stream`` for "chatterbox" and for voice-based
            engines that do not set "supports_speed".
        streaming_interval: Forwarded alongside ``stream``.

    Yields:
        AudioChunk with float32 samples. Generated chunks carry metadata keys
        "gen_time_sec", "rtf", "samples", "tokens", "is_final", "sentence"
        and "peak_memory_gb"; silence chunks carry an empty metadata dict.

    Raises:
        ValueError: if *voice* is not known to any registered engine.
    """
    engine, voice = resolve_model(voice)
    model = get_model(engine)
    sample_rate = model.sample_rate
    sentences = split_sentences(text)

    # The engine-specific arguments do not depend on the sentence, so they are
    # assembled once instead of on every loop iteration.
    base_kwargs: dict[str, object] = {"verbose": False}
    if engine == "chatterbox":
        base_kwargs["exaggeration"] = emotion
        base_kwargs["stream"] = stream
        base_kwargs["streaming_interval"] = streaming_interval
    elif engine == "spark":
        base_kwargs["gender"] = "female" if voice == "spark_female" else "male"
        base_kwargs["speed"] = speed
    else:
        base_kwargs["voice"] = voice
        if MODELS[engine].get("supports_speed"):
            base_kwargs["speed"] = speed
        else:
            base_kwargs["stream"] = stream
            base_kwargs["streaming_interval"] = streaming_interval

    # Length of the fade-out applied to the tail of each sentence (0.02 s).
    feather = int(sample_rate * 0.02)
    # A zero-length fade must be skipped: audio[-0:] would select the whole
    # array and fail to broadcast against an empty ramp.
    fade_out = np.linspace(1, 0, feather, dtype=np.float32) if feather > 0 else None
    last_index = len(sentences) - 1

    for si, sentence in enumerate(sentences):
        for result in model.generate(text=sentence, **base_kwargs):
            # flatten() always returns a fresh array, so the buffer can be
            # faded in place without an extra copy.
            audio = np.array(result.audio).flatten().astype(np.float32)
            metadata = {
                "gen_time_sec": round(result.processing_time_seconds, 4),
                "rtf": round(result.real_time_factor, 2),
                "samples": int(result.samples),
                "tokens": result.token_count,
                "is_final": result.is_final_chunk,
                "sentence": si,
                "peak_memory_gb": round(result.peak_memory_usage, 2),
            }
            if fade_out is not None and result.is_final_chunk and len(audio) > feather:
                audio[-feather:] *= fade_out
            yield AudioChunk(samples=audio, sample_rate=sample_rate, metadata=metadata)

        # Adaptive sentence gap: longer sentences get a longer pause, capped at 0.2 s.
        if si < last_index:
            gap_sec = min(0.2, 0.05 + len(sentence) * 0.001)
            gap = np.zeros(int(sample_rate * gap_sec), dtype=np.float32)
            yield AudioChunk(samples=gap, sample_rate=sample_rate, metadata={})
def synthesize(
    text: str,
    voice: str = "bm_lewis",
    speed: float = 1.25,
    emotion: float = 0.5,
) -> tuple[np.ndarray, int]:
    """Generate complete audio. Returns (concatenated_samples, sample_rate).

    Runs generate_audio() with ``stream=False``, collects every chunk
    (including the silent gaps between sentences) and joins them into one
    array.

    Args:
        text: Text to speak.
        voice: Voice name; it also selects the engine.
        speed: Passed through to generate_audio().
        emotion: Passed through to generate_audio().

    Returns:
        ``(samples, sample_rate)``. When the text yields no sentences,
        ``samples`` is an empty float32 array and ``sample_rate`` is the
        rate of the model resolved for *voice*.

    Raises:
        ValueError: if *voice* is not known to any registered engine.
    """
    chunks = list(generate_audio(text, voice=voice, speed=speed, emotion=emotion, stream=False))
    if not chunks:
        # Report the resolved model's own rate instead of assuming 24 kHz.
        # generate_audio() has already resolved and loaded this model, so
        # this is only a cache lookup.
        engine, _ = resolve_model(voice)
        return np.array([], dtype=np.float32), get_model(engine).sample_rate

    sample_rate = chunks[0].sample_rate
    all_samples = np.concatenate([c.samples for c in chunks])
    return all_samples, sample_rate