
Commit 3b55753

feat: phase 3 — ACE-Step music, Chatterbox TTS+cloning, PySceneDetect
Music generation: added ACE-Step 1.5 — full songs with vocals+lyrics, 10x faster than MusicGen, <4GB VRAM, Apache 2.0. New /audio/music-ai/ace-step route.

TTS + voice cloning: added Chatterbox (Resemble AI, MIT) as premium engine — zero-shot voice cloning from 5s audio reference, emotion control, 23 languages. Three-tier TTS: edge-tts (free), Kokoro (CPU), Chatterbox (premium+cloning).

Scene detection: added PySceneDetect as fast heuristic method alongside FFmpeg threshold and TransNetV2 ML. ContentDetector with configurable threshold. New "pyscenedetect" method in /video/scenes route.
1 parent d1949fe commit 3b55753

6 files changed

Lines changed: 310 additions & 7 deletions


CLAUDE.md

Lines changed: 4 additions & 4 deletions
@@ -628,13 +628,13 @@ enhance = ["resemble-enhance>=0.0.1"]
 - [x] **Face detection**: Added InsightFace `buffalo_l` as `"insightface"` detector option in face_tools (highest accuracy). Route allowlists updated.
 
 ### Phase 3 — New Features (Higher Effort)
-- [ ] **Music generation**: Add `ACE-Step 1.5` — full songs WITH vocals+lyrics, 10x faster than MusicGen, 4x less VRAM, Apache 2.0
-- [ ] **TTS tiers**: Add `Kokoro` (82M params, CPU, fast) + `Chatterbox` (voice cloning, emotion, 23 langs, MIT) alongside edge-tts
-- [ ] **Voice cloning**: Via Chatterbox — zero-shot from 5s audio, emotion control, paralinguistic tags
+- [x] **Music generation**: Added `ACE-Step 1.5` — full songs WITH vocals+lyrics, `/audio/music-ai/ace-step` route
+- [x] **TTS tiers**: Kokoro already existed; added `Chatterbox` (voice cloning, emotion, 23 langs, MIT) as `"chatterbox"` engine in `/audio/tts/generate`
+- [x] **Voice cloning**: Via Chatterbox `voice_ref` param — zero-shot from 5s audio, emotion control
 - [ ] **AI color grading**: Add `Image-Adaptive-3DLUT` — learned 3D LUTs, <2ms on 4K, replaces histogram matching
 - [ ] **Motion graphics**: Add `Remotion` render service — React-based, After Effects quality titles/animations vs FFmpeg drawtext
 - [ ] **Video denoising**: Add `BasicVSR++` as GPU option — temporal propagation across frames vs spatial-only nlmeans
-- [ ] **Scene detection**: Add `PySceneDetect` as fast complement to TransNetV2 — 4.6k stars, actively maintained
+- [x] **Scene detection**: Added `PySceneDetect` as `"pyscenedetect"` method in `/video/scenes` — heuristic, fast, ContentDetector
 - [ ] **Neural LUT blending**: Add `NILUT` for continuous style blending — single slider between any two color grades
 - [ ] **Translation**: Add `SeamlessM4T v2` as "High Quality" option — 20% BLEU improvement, multimodal
 - [ ] **Caption NLP emphasis**: Auto-detect important words, apply different highlight colors/sizes in Pillow renderer

opencut/core/music_ai.py

Lines changed: 86 additions & 0 deletions
@@ -245,9 +245,95 @@ def continue_audio(
 ]
 
 
+def check_ace_step_available() -> bool:
+    try:
+        import ace_step  # noqa: F401
+        return True
+    except ImportError:
+        return False
+
+
+# ---------------------------------------------------------------------------
+# ACE-Step Music Generation (full songs with vocals + lyrics)
+# ---------------------------------------------------------------------------
+def generate_music_ace_step(
+    prompt: str,
+    lyrics: str = "",
+    output_path: Optional[str] = None,
+    output_dir: str = "",
+    duration: float = 30.0,
+    on_progress: Optional[Callable] = None,
+) -> str:
+    """
+    Generate music (with optional vocals + lyrics) using ACE-Step 1.5.
+
+    Superior to MusicGen: full songs with vocals+lyrics, 10x faster,
+    4x less VRAM (<4GB), 1000+ styles, 19 languages. Apache 2.0.
+
+    Args:
+        prompt: Style/genre description (e.g. "upbeat pop, female vocalist, catchy melody").
+        lyrics: Optional lyrics for vocal generation. Empty = instrumental.
+        duration: Song length in seconds (10-600).
+    """
+    if not ensure_package("ace_step", "ace-step", on_progress):
+        raise RuntimeError("ACE-Step not installed. Run: pip install ace-step")
+
+    if on_progress:
+        on_progress(5, "Loading ACE-Step model...")
+
+    import torch
+    from ace_step import ACEStep
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = ACEStep.from_pretrained(device=device)
+
+    if output_path is None:
+        directory = output_dir or tempfile.gettempdir()
+        import re
+        safe_prompt = re.sub(r'[^\w\-]', '_', prompt[:30]).strip('_')
+        output_path = os.path.join(directory, f"ace_step_{safe_prompt}.wav")
+
+    duration = max(10.0, min(600.0, duration))
+
+    if on_progress:
+        on_progress(20, f"Generating music: '{prompt[:50]}'...")
+
+    with torch.inference_mode():
+        result = model.generate(
+            prompt=prompt,
+            lyrics=lyrics or None,
+            duration=duration,
+        )
+
+    if on_progress:
+        on_progress(80, "Saving audio...")
+
+    # Save output
+    import soundfile as sf
+    audio_data = result["audio"]
+    if hasattr(audio_data, "cpu"):
+        audio_data = audio_data.cpu().numpy()
+    if audio_data.ndim == 2:
+        audio_data = audio_data.T
+    sf.write(output_path, audio_data, result.get("sample_rate", 44100))
+
+    # Free GPU
+    try:
+        del model
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except Exception:
+        pass
+
+    if on_progress:
+        on_progress(100, "Music generated with ACE-Step!")
+    return output_path
+
+
 def get_music_ai_capabilities() -> Dict:
     return {
         "audiocraft": check_audiocraft_available(),
+        "ace_step": check_ace_step_available(),
         "cuda": check_torch_cuda(),
         "models": MUSICGEN_MODELS,
     }

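Usage sketch for the new core helper (illustrative only; assumes the ace-step and soundfile packages are installed, and uses only names defined in the diff above):

from opencut.core.music_ai import check_ace_step_available, generate_music_ace_step

if check_ace_step_available():
    path = generate_music_ace_step(
        prompt="upbeat pop, female vocalist, catchy melody",
        lyrics="",      # empty lyrics -> instrumental track
        duration=45.0,  # clamped internally to 10-600 s
        on_progress=lambda pct, msg: print(f"{pct:3d}% {msg}"),
    )
    print("Wrote", path)
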
opencut/core/scene_detect.py

Lines changed: 69 additions & 0 deletions
@@ -457,6 +457,75 @@ def detect_scenes_ml(
     )
 
 
+# ---------------------------------------------------------------------------
+# PySceneDetect (fast, heuristic-based)
+# ---------------------------------------------------------------------------
+def detect_scenes_pyscenedetect(
+    input_path: str,
+    threshold: float = 27.0,
+    min_scene_length: float = 2.0,
+    on_progress: Optional[Callable] = None,
+) -> SceneInfo:
+    """
+    Detect scene boundaries using PySceneDetect (heuristic, fast).
+
+    Faster than the TransNetV2 ML approach. Good for rough cuts and long videos.
+    Uses ContentDetector (HSV color histogram + delta analysis).
+
+    Args:
+        threshold: ContentDetector threshold (default 27.0, range ~15-50).
+        min_scene_length: Minimum scene duration in seconds.
+    """
+    try:
+        from scenedetect import SceneManager, open_video
+        from scenedetect.detectors import ContentDetector
+    except ImportError:
+        raise RuntimeError("PySceneDetect not installed. Run: pip install scenedetect[opencv]")
+
+    if on_progress:
+        on_progress(10, "Opening video with PySceneDetect...")
+
+    video = open_video(input_path)
+    fps = video.frame_rate
+    duration = video.duration.get_seconds() if hasattr(video.duration, 'get_seconds') else 0.0
+
+    scene_manager = SceneManager()
+    min_frames = max(1, int(min_scene_length * fps))
+    scene_manager.add_detector(ContentDetector(threshold=threshold, min_scene_len=min_frames))
+
+    if on_progress:
+        on_progress(20, "Detecting scenes...")
+
+    scene_manager.detect_scenes(video, show_progress=False)
+    scene_list = scene_manager.get_scene_list()
+
+    if on_progress:
+        on_progress(80, "Building scene boundaries...")
+
+    boundaries = []
+    for i, (start, end) in enumerate(scene_list):
+        start_sec = start.get_seconds()
+        boundaries.append(SceneBoundary(
+            time=round(start_sec, 3),
+            frame=start.get_frames(),
+            score=1.0,
+            label=f"Scene {i + 1}",
+        ))
+
+    total_scenes = len(boundaries)
+    avg_scene = duration / total_scenes if total_scenes > 0 else duration
+
+    if on_progress:
+        on_progress(100, f"Found {total_scenes} scenes (PySceneDetect)")
+
+    return SceneInfo(
+        boundaries=boundaries,
+        total_scenes=total_scenes,
+        duration=duration,
+        avg_scene_length=round(avg_scene, 2),
+    )
+
+
 SPEED_RAMP_PRESETS = [
     {"name": "dramatic_pause", "label": "Dramatic Pause", "description": "Slow down in the middle for impact"},
     {"name": "smooth_ramp_up", "label": "Smooth Ramp Up", "description": "Gradually accelerate"},

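A quick sketch of calling the new detector directly (clip.mp4 is a hypothetical test file; SceneInfo and SceneBoundary fields are those used above). Higher thresholds merge similar shots into fewer scenes; lower values cut more aggressively:

from opencut.core.scene_detect import detect_scenes_pyscenedetect

info = detect_scenes_pyscenedetect("clip.mp4", threshold=27.0, min_scene_length=2.0)
print(f"{info.total_scenes} scenes, avg {info.avg_scene_length}s")
for b in info.boundaries[:5]:
    print(f"{b.label}: frame {b.frame} at {b.time}s")
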
opencut/core/voice_gen.py

Lines changed: 80 additions & 1 deletion
@@ -303,7 +303,86 @@ def kokoro_generate(
 
 
 # ---------------------------------------------------------------------------
-# Voice Cloning Stub (future: OpenVoice / F5-TTS)
+# Chatterbox TTS + Voice Cloning (Resemble AI, MIT)
+# ---------------------------------------------------------------------------
+def check_chatterbox_available() -> bool:
+    try:
+        import chatterbox  # noqa: F401
+        return True
+    except ImportError:
+        return False
+
+
+def chatterbox_generate(
+    text: str,
+    voice_ref: Optional[str] = None,
+    output_path: Optional[str] = None,
+    output_dir: str = "",
+    exaggeration: float = 0.5,
+    cfg_weight: float = 0.5,
+    on_progress: Optional[Callable] = None,
+) -> str:
+    """
+    Generate speech using Chatterbox (Resemble AI).
+
+    Premium TTS with zero-shot voice cloning from 5s audio reference,
+    emotion control, and 23 language support. MIT licensed.
+
+    Args:
+        text: Text to synthesize.
+        voice_ref: Path to reference audio for voice cloning (5-30s WAV).
+            If None, uses default voice.
+        exaggeration: Emotion/expressiveness (0.0=neutral, 1.0=dramatic).
+        cfg_weight: Classifier-free guidance weight (0.0-1.0).
+    """
+    if not ensure_package("chatterbox", "chatterbox-tts", on_progress):
+        raise RuntimeError("Failed to install chatterbox-tts. Install: pip install chatterbox-tts")
+
+    if output_path is None:
+        directory = output_dir or tempfile.gettempdir()
+        output_path = os.path.join(directory, f"chatterbox_{hash(text[:20]) & 0xFFFF:04x}.wav")
+
+    if on_progress:
+        on_progress(10, "Loading Chatterbox model...")
+
+    import torch
+    from chatterbox.tts import ChatterboxTTS
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = ChatterboxTTS.from_pretrained(device=device)
+
+    if on_progress:
+        on_progress(40, "Synthesizing speech...")
+
+    wav = model.generate(
+        text,
+        audio_prompt_path=voice_ref,
+        exaggeration=exaggeration,
+        cfg_weight=cfg_weight,
+    )
+
+    if on_progress:
+        on_progress(80, "Saving audio...")
+
+    import torchaudio
+    torchaudio.save(output_path, wav, model.sr)
+
+    # Free GPU memory
+    try:
+        del model
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except Exception:
+        pass
+
+    if on_progress:
+        on_progress(100, "Chatterbox speech generated!")
+
+    return output_path
+
+
+# ---------------------------------------------------------------------------
+# Voice list
 # ---------------------------------------------------------------------------
 KOKORO_VOICES = [
     {"id": "af_heart", "label": "Heart (Female, Warm)", "lang": "en", "gender": "female"},

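A minimal cloning sketch for the new engine (assumes chatterbox-tts is installed and a 5-30s reference clip exists at the hypothetical path ref.wav):

from opencut.core.voice_gen import check_chatterbox_available, chatterbox_generate

if check_chatterbox_available():
    out = chatterbox_generate(
        "Welcome back to the channel!",
        voice_ref="ref.wav",  # reference audio to clone; None keeps the default voice
        exaggeration=0.7,     # above the 0.5 neutral default for livelier delivery
    )
    print("Saved:", out)
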
opencut/routes/audio.py

Lines changed: 63 additions & 1 deletion
@@ -1154,7 +1154,22 @@ def _on_progress(pct, msg):
 
     effective_dir = output_dir or tempfile.gettempdir()
 
-    if engine == "kokoro":
+    if engine == "chatterbox":
+        from opencut.core.voice_gen import chatterbox_generate
+        voice_ref = data.get("voice_ref", "")
+        if voice_ref:
+            try:
+                voice_ref = validate_filepath(voice_ref)
+            except ValueError:
+                voice_ref = None
+        else:
+            voice_ref = None
+        exaggeration = safe_float(data.get("exaggeration", 0.5), 0.5, min_val=0.0, max_val=1.0)
+        out = chatterbox_generate(
+            text, voice_ref=voice_ref, output_dir=effective_dir,
+            exaggeration=exaggeration, on_progress=_on_progress,
+        )
+    elif engine == "kokoro":
         from opencut.core.voice_gen import kokoro_generate
         out = kokoro_generate(
             text, voice=voice, output_dir=effective_dir,
@@ -1697,6 +1712,53 @@ def _p(pct, msg):
     return jsonify({"job_id": job_id, "status": "running"})
 
 
+@audio_bp.route("/audio/music-ai/ace-step", methods=["POST"])
+@require_csrf
+def music_ai_ace_step():
+    """Generate music with vocals + lyrics using ACE-Step 1.5."""
+    data = request.get_json(force=True)
+    prompt = data.get("prompt", "").strip()
+    lyrics = data.get("lyrics", "").strip()
+    if not prompt:
+        return jsonify({"error": "No prompt"}), 400
+    if len(lyrics) > 10000:
+        return jsonify({"error": "Lyrics too long (max 10000 chars)"}), 400
+
+    job_id = _new_job("ace-step", prompt[:40])
+
+    def _process():
+        try:
+            from opencut.core.music_ai import generate_music_ace_step
+
+            def _p(pct, msg):
+                _update_job(job_id, progress=pct, message=msg)
+
+            d = data.get("output_dir", "")
+            if d:
+                try:
+                    d = validate_path(d)
+                except ValueError as e:
+                    _update_job(job_id, status="error", message=str(e))
+                    return
+            else:
+                d = tempfile.gettempdir()
+            out = generate_music_ace_step(
+                prompt, lyrics=lyrics, output_dir=d,
+                duration=safe_float(data.get("duration", 30), 30.0, min_val=10.0, max_val=600.0),
+                on_progress=_p,
+            )
+            _update_job(job_id, status="complete", progress=100, result={"output_path": out})
+        except Exception as e:
+            _update_job(job_id, status="error", error=str(e), message=f"Error: {e}")
+
+    thread = threading.Thread(target=_process, daemon=True)
+    thread.start()
+    with job_lock:
+        if job_id in jobs:
+            jobs[job_id]["_thread"] = thread
+    return jsonify({"job_id": job_id, "status": "running"})
+
+
 @audio_bp.route("/audio/music-ai/melody", methods=["POST"])
 @require_csrf
 def music_ai_melody():

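Client-side sketches for the two audio route changes. The base URL, CSRF header name, and job-polling endpoint are assumptions (not shown in this diff); the JSON keys match the handlers above:

import time
import requests

BASE = "http://localhost:5000"     # assumed dev server address
HEADERS = {"X-CSRF-Token": "..."}  # hypothetical; use whatever @require_csrf expects

# Three-tier TTS: pick the new premium engine and pass a cloning reference.
requests.post(f"{BASE}/audio/tts/generate", headers=HEADERS, json={
    "text": "Hello world",
    "engine": "chatterbox",
    "voice_ref": "/media/refs/host.wav",  # validated server-side via validate_filepath
    "exaggeration": 0.7,
})

# Async music generation: returns a job id immediately, then poll.
job = requests.post(f"{BASE}/audio/music-ai/ace-step", headers=HEADERS, json={
    "prompt": "lofi hip hop, mellow piano",
    "lyrics": "",    # empty -> instrumental
    "duration": 60,
}).json()

while True:          # a /jobs/<id> status endpoint is assumed, not shown here
    status = requests.get(f"{BASE}/jobs/{job['job_id']}").json()
    if status["status"] in ("complete", "error"):
        print(status)
        break
    time.sleep(2)
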
opencut/routes/video.py

Lines changed: 8 additions & 1 deletion
@@ -421,7 +421,7 @@ def video_scenes():
     job_id = _new_job("scenes", filepath)
 
     method = data.get("method", "ffmpeg").strip().lower()
-    if method not in ("ffmpeg", "ml"):
+    if method not in ("ffmpeg", "ml", "pyscenedetect"):
         method = "ffmpeg"
 
     def _process():
@@ -438,6 +438,13 @@ def _on_progress(pct, msg):
                 min_scene_length=min_scene,
                 on_progress=_on_progress,
             )
+        elif method == "pyscenedetect":
+            from opencut.core.scene_detect import detect_scenes_pyscenedetect
+            info = detect_scenes_pyscenedetect(
+                filepath, threshold=safe_float(data.get("threshold", 27.0), 27.0, min_val=5.0, max_val=80.0),
+                min_scene_length=min_scene,
+                on_progress=_on_progress,
+            )
         else:
             info = detect_scenes(
                 filepath, threshold=threshold,

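And the scene route with the new method (same assumed base URL and CSRF header; min_scene is read outside this hunk, so it is omitted here):

import requests

resp = requests.post(
    "http://localhost:5000/video/scenes",
    headers={"X-CSRF-Token": "..."},  # hypothetical CSRF header, as above
    json={
        "filepath": "/media/input.mp4",
        "method": "pyscenedetect",    # unknown values fall back to "ffmpeg"
        "threshold": 27.0,            # clamped server-side to 5.0-80.0
    },
)
print(resp.json())  # {"job_id": "...", "status": "running"}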