
Commit 3b55753

feat: phase 3 — ACE-Step music, Chatterbox TTS+cloning, PySceneDetect
Music generation: added ACE-Step 1.5 — full songs with vocals+lyrics, 10x faster than MusicGen, <4GB VRAM, Apache 2.0. New /audio/music-ai/ace-step route.

TTS + voice cloning: added Chatterbox (Resemble AI, MIT) as premium engine — zero-shot voice cloning from 5s audio reference, emotion control, 23 languages. Three-tier TTS: edge-tts (free), Kokoro (CPU), Chatterbox (premium+cloning).

Scene detection: added PySceneDetect as fast heuristic method alongside FFmpeg threshold and TransNetV2 ML. ContentDetector with configurable threshold. New "pyscenedetect" method in /video/scenes route.
1 parent d1949fe commit 3b55753

6 files changed

Lines changed: 310 additions & 7 deletions


CLAUDE.md

Lines changed: 4 additions & 4 deletions
@@ -628,13 +628,13 @@ enhance = ["resemble-enhance>=0.0.1"]
 - [x] **Face detection**: Added InsightFace `buffalo_l` as `"insightface"` detector option in face_tools (highest accuracy). Route allowlists updated.
 
 ### Phase 3 — New Features (Higher Effort)
-- [ ] **Music generation**: Add `ACE-Step 1.5` — full songs WITH vocals+lyrics, 10x faster than MusicGen, 4x less VRAM, Apache 2.0
-- [ ] **TTS tiers**: Add `Kokoro` (82M params, CPU, fast) + `Chatterbox` (voice cloning, emotion, 23 langs, MIT) alongside edge-tts
-- [ ] **Voice cloning**: Via Chatterbox — zero-shot from 5s audio, emotion control, paralinguistic tags
+- [x] **Music generation**: Added `ACE-Step 1.5` — full songs WITH vocals+lyrics, `/audio/music-ai/ace-step` route
+- [x] **TTS tiers**: Kokoro already existed; added `Chatterbox` (voice cloning, emotion, 23 langs, MIT) as `"chatterbox"` engine in `/audio/tts/generate`
+- [x] **Voice cloning**: Via Chatterbox `voice_ref` param — zero-shot from 5s audio, emotion control
 - [ ] **AI color grading**: Add `Image-Adaptive-3DLUT` — learned 3D LUTs, <2ms on 4K, replaces histogram matching
 - [ ] **Motion graphics**: Add `Remotion` render service — React-based, After Effects quality titles/animations vs FFmpeg drawtext
 - [ ] **Video denoising**: Add `BasicVSR++` as GPU option — temporal propagation across frames vs spatial-only nlmeans
-- [ ] **Scene detection**: Add `PySceneDetect` as fast complement to TransNetV2 — 4.6k stars, actively maintained
+- [x] **Scene detection**: Added `PySceneDetect` as `"pyscenedetect"` method in `/video/scenes` — heuristic, fast, ContentDetector
 - [ ] **Neural LUT blending**: Add `NILUT` for continuous style blending — single slider between any two color grades
 - [ ] **Translation**: Add `SeamlessM4T v2` as "High Quality" option — 20% BLEU improvement, multimodal
 - [ ] **Caption NLP emphasis**: Auto-detect important words, apply different highlight colors/sizes in Pillow renderer

opencut/core/music_ai.py

Lines changed: 86 additions & 0 deletions
@@ -245,9 +245,95 @@ def continue_audio(
 ]
 
 
+def check_ace_step_available() -> bool:
+    try:
+        import ace_step  # noqa: F401
+        return True
+    except ImportError:
+        return False
+
+
+# ---------------------------------------------------------------------------
+# ACE-Step Music Generation (full songs with vocals + lyrics)
+# ---------------------------------------------------------------------------
+def generate_music_ace_step(
+    prompt: str,
+    lyrics: str = "",
+    output_path: Optional[str] = None,
+    output_dir: str = "",
+    duration: float = 30.0,
+    on_progress: Optional[Callable] = None,
+) -> str:
+    """
+    Generate music (with optional vocals + lyrics) using ACE-Step 1.5.
+
+    Superior to MusicGen: full songs with vocals+lyrics, 10x faster,
+    4x less VRAM (<4GB), 1000+ styles, 19 languages. Apache 2.0.
+
+    Args:
+        prompt: Style/genre description (e.g. "upbeat pop, female vocalist, catchy melody").
+        lyrics: Optional lyrics for vocal generation. Empty = instrumental.
+        duration: Song length in seconds (10-600).
+    """
+    if not ensure_package("ace_step", "ace-step", on_progress):
+        raise RuntimeError("ACE-Step not installed. Run: pip install ace-step")
+
+    if on_progress:
+        on_progress(5, "Loading ACE-Step model...")
+
+    import torch
+    from ace_step import ACEStep
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = ACEStep.from_pretrained(device=device)
+
+    if output_path is None:
+        directory = output_dir or tempfile.gettempdir()
+        import re
+        safe_prompt = re.sub(r'[^\w\-]', '_', prompt[:30]).strip('_')
+        output_path = os.path.join(directory, f"ace_step_{safe_prompt}.wav")
+
+    duration = max(10.0, min(600.0, duration))
+
+    if on_progress:
+        on_progress(20, f"Generating music: '{prompt[:50]}'...")
+
+    with torch.inference_mode():
+        result = model.generate(
+            prompt=prompt,
+            lyrics=lyrics or None,
+            duration=duration,
+        )
+
+    if on_progress:
+        on_progress(80, "Saving audio...")
+
+    # Save output
+    import soundfile as sf
+    audio_data = result["audio"]
+    if hasattr(audio_data, "cpu"):
+        audio_data = audio_data.cpu().numpy()
+    if audio_data.ndim == 2:
+        audio_data = audio_data.T
+    sf.write(output_path, audio_data, result.get("sample_rate", 44100))
+
+    # Free GPU
+    try:
+        del model
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except Exception:
+        pass
+
+    if on_progress:
+        on_progress(100, "Music generated with ACE-Step!")
+    return output_path
+
+
 def get_music_ai_capabilities() -> Dict:
     return {
         "audiocraft": check_audiocraft_available(),
+        "ace_step": check_ace_step_available(),
         "cuda": check_torch_cuda(),
         "models": MUSICGEN_MODELS,
     }

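Usage sketch for the new core helper (illustrative only; assumes the ace-step and soundfile packages are installed, and uses only names defined in the diff above):

from opencut.core.music_ai import check_ace_step_available, generate_music_ace_step

if check_ace_step_available():
    path = generate_music_ace_step(
        prompt="upbeat pop, female vocalist, catchy melody",
        lyrics="",      # empty lyrics -> instrumental track
        duration=45.0,  # clamped internally to 10-600 s
        on_progress=lambda pct, msg: print(f"{pct:3d}% {msg}"),
    )
    print("Wrote", path)
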
opencut/core/scene_detect.py

Lines changed: 69 additions & 0 deletions
@@ -457,6 +457,75 @@ def detect_scenes_ml(
     )
 
 
+# ---------------------------------------------------------------------------
+# PySceneDetect (fast, heuristic-based)
+# ---------------------------------------------------------------------------
+def detect_scenes_pyscenedetect(
+    input_path: str,
+    threshold: float = 27.0,
+    min_scene_length: float = 2.0,
+    on_progress: Optional[Callable] = None,
+) -> SceneInfo:
+    """
+    Detect scene boundaries using PySceneDetect (heuristic, fast).
+
+    Faster than the TransNetV2 ML approach. Good for rough cuts and long videos.
+    Uses ContentDetector (HSV color histogram + delta analysis).
+
+    Args:
+        threshold: ContentDetector threshold (default 27.0, range ~15-50).
+        min_scene_length: Minimum scene duration in seconds.
+    """
+    try:
+        from scenedetect import SceneManager, open_video
+        from scenedetect.detectors import ContentDetector
+    except ImportError:
+        raise RuntimeError("PySceneDetect not installed. Run: pip install scenedetect[opencv]")
+
+    if on_progress:
+        on_progress(10, "Opening video with PySceneDetect...")
+
+    video = open_video(input_path)
+    fps = video.frame_rate
+    duration = video.duration.get_seconds() if hasattr(video.duration, 'get_seconds') else 0.0
+
+    scene_manager = SceneManager()
+    min_frames = max(1, int(min_scene_length * fps))
+    scene_manager.add_detector(ContentDetector(threshold=threshold, min_scene_len=min_frames))
+
+    if on_progress:
+        on_progress(20, "Detecting scenes...")
+
+    scene_manager.detect_scenes(video, show_progress=False)
+    scene_list = scene_manager.get_scene_list()
+
+    if on_progress:
+        on_progress(80, "Building scene boundaries...")
+
+    boundaries = []
+    for i, (start, end) in enumerate(scene_list):
+        start_sec = start.get_seconds()
+        boundaries.append(SceneBoundary(
+            time=round(start_sec, 3),
+            frame=start.get_frames(),
+            score=1.0,
+            label=f"Scene {i + 1}",
+        ))
+
+    total_scenes = len(boundaries)
+    avg_scene = duration / total_scenes if total_scenes > 0 else duration
+
+    if on_progress:
+        on_progress(100, f"Found {total_scenes} scenes (PySceneDetect)")
+
+    return SceneInfo(
+        boundaries=boundaries,
+        total_scenes=total_scenes,
+        duration=duration,
+        avg_scene_length=round(avg_scene, 2),
+    )
+
+
 SPEED_RAMP_PRESETS = [
     {"name": "dramatic_pause", "label": "Dramatic Pause", "description": "Slow down in the middle for impact"},
     {"name": "smooth_ramp_up", "label": "Smooth Ramp Up", "description": "Gradually accelerate"},

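A quick sketch of calling the new detector directly (clip.mp4 is a hypothetical test file; SceneInfo and SceneBoundary fields are those used above). Higher thresholds merge similar shots into fewer scenes; lower values cut more aggressively:

from opencut.core.scene_detect import detect_scenes_pyscenedetect

info = detect_scenes_pyscenedetect("clip.mp4", threshold=27.0, min_scene_length=2.0)
print(f"{info.total_scenes} scenes, avg {info.avg_scene_length}s")
for b in info.boundaries[:5]:
    print(f"{b.label}: frame {b.frame} at {b.time}s")
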
opencut/core/voice_gen.py

Lines changed: 80 additions & 1 deletion
@@ -303,7 +303,86 @@ def kokoro_generate(
 
 
 # ---------------------------------------------------------------------------
-# Voice Cloning Stub (future: OpenVoice / F5-TTS)
+# Chatterbox TTS + Voice Cloning (Resemble AI, MIT)
+# ---------------------------------------------------------------------------
+def check_chatterbox_available() -> bool:
+    try:
+        import chatterbox  # noqa: F401
+        return True
+    except ImportError:
+        return False
+
+
+def chatterbox_generate(
+    text: str,
+    voice_ref: Optional[str] = None,
+    output_path: Optional[str] = None,
+    output_dir: str = "",
+    exaggeration: float = 0.5,
+    cfg_weight: float = 0.5,
+    on_progress: Optional[Callable] = None,
+) -> str:
+    """
+    Generate speech using Chatterbox (Resemble AI).
+
+    Premium TTS with zero-shot voice cloning from 5s audio reference,
+    emotion control, and 23 language support. MIT licensed.
+
+    Args:
+        text: Text to synthesize.
+        voice_ref: Path to reference audio for voice cloning (5-30s WAV).
+            If None, uses default voice.
+        exaggeration: Emotion/expressiveness (0.0=neutral, 1.0=dramatic).
+        cfg_weight: Classifier-free guidance weight (0.0-1.0).
+    """
+    if not ensure_package("chatterbox", "chatterbox-tts", on_progress):
+        raise RuntimeError("Failed to install chatterbox-tts. Install: pip install chatterbox-tts")
+
+    if output_path is None:
+        directory = output_dir or tempfile.gettempdir()
+        output_path = os.path.join(directory, f"chatterbox_{hash(text[:20]) & 0xFFFF:04x}.wav")
+
+    if on_progress:
+        on_progress(10, "Loading Chatterbox model...")
+
+    import torch
+    from chatterbox.tts import ChatterboxTTS
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = ChatterboxTTS.from_pretrained(device=device)
+
+    if on_progress:
+        on_progress(40, "Synthesizing speech...")
+
+    wav = model.generate(
+        text,
+        audio_prompt_path=voice_ref,
+        exaggeration=exaggeration,
+        cfg_weight=cfg_weight,
+    )
+
+    if on_progress:
+        on_progress(80, "Saving audio...")
+
+    import torchaudio
+    torchaudio.save(output_path, wav, model.sr)
+
+    # Free GPU memory
+    try:
+        del model
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except Exception:
+        pass
+
+    if on_progress:
+        on_progress(100, "Chatterbox speech generated!")
+
+    return output_path
+
+
+# ---------------------------------------------------------------------------
+# Voice list
 # ---------------------------------------------------------------------------
 KOKORO_VOICES = [
     {"id": "af_heart", "label": "Heart (Female, Warm)", "lang": "en", "gender": "female"},

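A minimal cloning sketch for the new engine (assumes chatterbox-tts is installed and a 5-30s reference clip exists at the hypothetical path ref.wav):

from opencut.core.voice_gen import check_chatterbox_available, chatterbox_generate

if check_chatterbox_available():
    out = chatterbox_generate(
        "Welcome back to the channel!",
        voice_ref="ref.wav",  # reference audio to clone; None keeps the default voice
        exaggeration=0.7,     # above the 0.5 neutral default for livelier delivery
    )
    print("Saved:", out)
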
opencut/routes/audio.py

Lines changed: 63 additions & 1 deletion
@@ -1154,7 +1154,22 @@ def _on_progress(pct, msg):
 
     effective_dir = output_dir or tempfile.gettempdir()
 
-    if engine == "kokoro":
+    if engine == "chatterbox":
+        from opencut.core.voice_gen import chatterbox_generate
+        voice_ref = data.get("voice_ref", "")
+        if voice_ref:
+            try:
+                voice_ref = validate_filepath(voice_ref)
+            except ValueError:
+                voice_ref = None
+        else:
+            voice_ref = None
+        exaggeration = safe_float(data.get("exaggeration", 0.5), 0.5, min_val=0.0, max_val=1.0)
+        out = chatterbox_generate(
+            text, voice_ref=voice_ref, output_dir=effective_dir,
+            exaggeration=exaggeration, on_progress=_on_progress,
+        )
+    elif engine == "kokoro":
         from opencut.core.voice_gen import kokoro_generate
         out = kokoro_generate(
             text, voice=voice, output_dir=effective_dir,
@@ -1697,6 +1712,53 @@ def _p(pct, msg):
     return jsonify({"job_id": job_id, "status": "running"})
 
 
+@audio_bp.route("/audio/music-ai/ace-step", methods=["POST"])
+@require_csrf
+def music_ai_ace_step():
+    """Generate music with vocals + lyrics using ACE-Step 1.5."""
+    data = request.get_json(force=True)
+    prompt = data.get("prompt", "").strip()
+    lyrics = data.get("lyrics", "").strip()
+    if not prompt:
+        return jsonify({"error": "No prompt"}), 400
+    if len(lyrics) > 10000:
+        return jsonify({"error": "Lyrics too long (max 10000 chars)"}), 400
+
+    job_id = _new_job("ace-step", prompt[:40])
+
+    def _process():
+        try:
+            from opencut.core.music_ai import generate_music_ace_step
+
+            def _p(pct, msg):
+                _update_job(job_id, progress=pct, message=msg)
+
+            d = data.get("output_dir", "")
+            if d:
+                try:
+                    d = validate_path(d)
+                except ValueError as e:
+                    _update_job(job_id, status="error", message=str(e))
+                    return
+            else:
+                d = tempfile.gettempdir()
+            out = generate_music_ace_step(
+                prompt, lyrics=lyrics, output_dir=d,
+                duration=safe_float(data.get("duration", 30), 30.0, min_val=10.0, max_val=600.0),
+                on_progress=_p,
+            )
+            _update_job(job_id, status="complete", progress=100, result={"output_path": out})
+        except Exception as e:
+            _update_job(job_id, status="error", error=str(e), message=f"Error: {e}")
+
+    thread = threading.Thread(target=_process, daemon=True)
+    thread.start()
+    with job_lock:
+        if job_id in jobs:
+            jobs[job_id]["_thread"] = thread
+    return jsonify({"job_id": job_id, "status": "running"})
+
+
 @audio_bp.route("/audio/music-ai/melody", methods=["POST"])
 @require_csrf
 def music_ai_melody():

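Client-side sketches for the two audio route changes. The base URL, CSRF header name, and job-polling endpoint are assumptions (not shown in this diff); the JSON keys match the handlers above:

import time
import requests

BASE = "http://localhost:5000"     # assumed dev server address
HEADERS = {"X-CSRF-Token": "..."}  # hypothetical; use whatever @require_csrf expects

# Three-tier TTS: pick the new premium engine and pass a cloning reference.
requests.post(f"{BASE}/audio/tts/generate", headers=HEADERS, json={
    "text": "Hello world",
    "engine": "chatterbox",
    "voice_ref": "/media/refs/host.wav",  # validated server-side via validate_filepath
    "exaggeration": 0.7,
})

# Async music generation: returns a job id immediately, then poll.
job = requests.post(f"{BASE}/audio/music-ai/ace-step", headers=HEADERS, json={
    "prompt": "lofi hip hop, mellow piano",
    "lyrics": "",    # empty -> instrumental
    "duration": 60,
}).json()

while True:          # a /jobs/<id> status endpoint is assumed, not shown here
    status = requests.get(f"{BASE}/jobs/{job['job_id']}").json()
    if status["status"] in ("complete", "error"):
        print(status)
        break
    time.sleep(2)
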
opencut/routes/video.py

Lines changed: 8 additions & 1 deletion
@@ -421,7 +421,7 @@ def video_scenes():
     job_id = _new_job("scenes", filepath)
 
     method = data.get("method", "ffmpeg").strip().lower()
-    if method not in ("ffmpeg", "ml"):
+    if method not in ("ffmpeg", "ml", "pyscenedetect"):
         method = "ffmpeg"
 
     def _process():
@@ -438,6 +438,13 @@ def _on_progress(pct, msg):
                 min_scene_length=min_scene,
                 on_progress=_on_progress,
             )
+        elif method == "pyscenedetect":
+            from opencut.core.scene_detect import detect_scenes_pyscenedetect
+            info = detect_scenes_pyscenedetect(
+                filepath, threshold=safe_float(data.get("threshold", 27.0), 27.0, min_val=5.0, max_val=80.0),
+                min_scene_length=min_scene,
+                on_progress=_on_progress,
+            )
         else:
             info = detect_scenes(
                 filepath, threshold=threshold,

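And the scene route with the new method (same assumed base URL and CSRF header; min_scene is read outside this hunk, so it is omitted here):

import requests

resp = requests.post(
    "http://localhost:5000/video/scenes",
    headers={"X-CSRF-Token": "..."},  # hypothetical CSRF header, as above
    json={
        "filepath": "/media/input.mp4",
        "method": "pyscenedetect",    # unknown values fall back to "ffmpeg"
        "threshold": 27.0,            # clamped server-side to 5.0-80.0
    },
)
print(resp.json())  # {"job_id": "...", "status": "running"}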