
Commit d1949fe

feat: phase 2 — CodeFormer, ClearerVoice, audio-separator, ProPainter, InsightFace, arbitrary style
- Audio separation: added python-audio-separator backend with Mel-Band RoFormer, BS-RoFormer, SCNet, MDX23C models alongside Demucs
- Speech enhancement: added ClearerVoice-Studio (MossFormer2/FRCRN) as recommended backend alongside legacy Resemble Enhance
- Style transfer: added arbitrary_style_transfer() — any image as style reference via AdaIN in LAB space, new /video/style/arbitrary route
- Object removal: added inpaint_video_propainter() for temporally coherent video inpainting (ICCV 2023), LAMA retained as fallback
- Face enhancement: added CodeFormer alongside GFPGAN with tunable fidelity slider (0=quality, 1=identity preservation)
- Face detection: added InsightFace buffalo_l as highest-accuracy detector option in face_tools, route allowlists updated
1 parent 80606e3 commit d1949fe

9 files changed

Lines changed: 606 additions & 59 deletions


CLAUDE.md

Lines changed: 6 additions & 6 deletions
@@ -620,12 +620,12 @@ enhance = ["resemble-enhance>=0.0.1"]
 ## Competitive Upgrade Roadmap (March 2026 Research)

 ### Phase 2 — Dependency Swaps (Medium Effort)
-- [ ] **Audio separation**: Replace archived Demucs with `python-audio-separator` + Mel-Band RoFormer models (better SDR, actively maintained)
-- [ ] **Speech enhancement**: Replace stale Resemble Enhance + DeepFilterNet with `ClearerVoice-Studio` (Alibaba) — single library for denoise + super-res + separation, 48kHz
-- [ ] **Style transfer**: Replace 2016 .t7 models with PyTorch AdaIN arbitrary style transfer — any image as style reference
-- [ ] **Object removal**: Replace per-frame LAMA with `ProPainter` for video inpainting — temporal flow coherence eliminates flickering
-- [ ] **Face enhancement**: Add `CodeFormer` alongside GFPGAN — tunable fidelity slider, better identity preservation
-- [ ] **Face detection**: Use InsightFace `buffalo_l` for accuracy-critical paths (swap, enhance) — already a dependency
+- [x] **Audio separation**: Added `python-audio-separator` backend with Mel-Band RoFormer, BS-RoFormer, SCNet, MDX23C models alongside Demucs (backend param in `/audio/separate`)
+- [x] **Speech enhancement**: Added `ClearerVoice-Studio` as recommended backend (MossFormer2/FRCRN) alongside Resemble Enhance. `backend` param in `/audio/enhance`
+- [x] **Style transfer**: Added `arbitrary_style_transfer()` — any image as style reference via AdaIN color transfer in LAB space. New `/video/style/arbitrary` route. Original .t7 preset styles retained.
+- [x] **Object removal**: Added `inpaint_video_propainter()` for temporally coherent video inpainting (ICCV 2023). LAMA retained as per-frame fallback.
+- [x] **Face enhancement**: Added `CodeFormer` alongside GFPGAN — tunable fidelity slider (0=quality, 1=identity), model param in `/video/face/enhance`
+- [x] **Face detection**: Added InsightFace `buffalo_l` as `"insightface"` detector option in face_tools (highest accuracy). Route allowlists updated.

 ### Phase 3 — New Features (Higher Effort)
 - [ ] **Music generation**: Add `ACE-Step 1.5` — full songs WITH vocals+lyrics, 10x faster than MusicGen, 4x less VRAM, Apache 2.0
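The AdaIN-in-LAB idea checked off above boils down to matching per-channel statistics of the content image to those of the style image. The following is a minimal NumPy sketch of that statistic-matching step only — it is not the repo's `arbitrary_style_transfer()` implementation, and it assumes the images have already been converted to LAB:

```python
import numpy as np

def adain_match(content_lab: np.ndarray, style_lab: np.ndarray) -> np.ndarray:
    """Shift each channel of content_lab to the style image's per-channel
    mean/std -- the core AdaIN statistic-matching operation."""
    out = content_lab.astype(np.float64).copy()
    for c in range(out.shape[2]):
        c_mean, c_std = out[..., c].mean(), out[..., c].std() + 1e-8
        s_mean, s_std = style_lab[..., c].mean(), style_lab[..., c].std()
        # Normalize to zero mean / unit std, then re-scale to style stats
        out[..., c] = (out[..., c] - c_mean) / c_std * s_std + s_mean
    return out
```

After this transfer, each output channel has (to numerical precision) the style image's mean and standard deviation, which is what makes any image usable as a style reference.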

opencut/core/audio_enhance.py

Lines changed: 109 additions & 3 deletions
@@ -1,10 +1,11 @@
 """
 OpenCut Audio Enhancement Module

-Speech super-resolution using Resemble Enhance.
-Upsamples low-quality speech audio to studio quality.
+Speech denoising and super-resolution:
+- ClearerVoice-Studio (recommended): MossFormer2/FRCRN, 16kHz/48kHz, denoise+enhance+separation
+- Resemble Enhance (legacy): ODE-based super-resolution

-Requires: pip install resemble-enhance
+Requires: pip install clearvoice (preferred) or pip install resemble-enhance (legacy)
 """

 import logging
@@ -260,3 +261,108 @@ def enhance_speech(
             torch.cuda.empty_cache()
     except Exception:
         pass
+
+
+# ---------------------------------------------------------------------------
+# ClearerVoice-Studio enhancement (recommended alternative)
+# ---------------------------------------------------------------------------
+def enhance_speech_clearvoice(
+    input_path,
+    output_path=None,
+    output_dir="",
+    task="speech_enhancement",
+    model="MossFormer2_SE_48K",
+    on_progress=None,
+):
+    """
+    Enhance speech audio using ClearerVoice-Studio (Alibaba).
+
+    Superior to Resemble Enhance: single library handles denoising,
+    super-resolution, and separation. Supports 16kHz and 48kHz models.
+
+    Args:
+        input_path: Path to input audio/video file.
+        output_path: Optional explicit output path.
+        output_dir: Directory for output.
+        task: "speech_enhancement" (denoise+enhance) or "speech_separation".
+        model: ClearerVoice model name. Options:
+            - "MossFormer2_SE_48K" (best quality, 48kHz)
+            - "FRCRN_SE_16K" (fast, 16kHz, 3M+ uses on ModelScope)
+            - "MossFormerGAN_SE_16K" (balanced, 16kHz)
+        on_progress: Progress callback(pct, msg).
+
+    Returns:
+        Output file path string.
+    """
+    if not os.path.isfile(input_path):
+        raise FileNotFoundError(f"Input file not found: {input_path}")
+
+    if on_progress:
+        on_progress(5, "Loading ClearerVoice model...")
+
+    try:
+        from clearvoice import ClearVoice
+    except ImportError:
+        raise RuntimeError(
+            "clearvoice is required. Install with: pip install clearvoice"
+        )
+
+    # If input is video, extract audio to temp WAV
+    temp_wav = None
+    audio_path = input_path
+
+    if _is_video(input_path):
+        if on_progress:
+            on_progress(10, "Extracting audio from video...")
+        _tmp = tempfile.NamedTemporaryFile(suffix=".wav", prefix="opencut_cv_", delete=False)
+        temp_wav = _tmp.name
+        _tmp.close()
+        _extract_audio(input_path, temp_wav)
+        audio_path = temp_wav
+
+    try:
+        if on_progress:
+            on_progress(20, f"Running {model}...")
+
+        cv = ClearVoice(task=task, model_names=[model])
+        result = cv(input_path=audio_path, online_write=False)
+
+        if on_progress:
+            on_progress(80, "Saving enhanced audio...")
+
+        # Build output path
+        if output_path is None:
+            base_name = os.path.splitext(os.path.basename(input_path))[0]
+            suffix = "_enhanced.wav"
+            if output_dir and os.path.isdir(output_dir):
+                output_path = os.path.join(output_dir, base_name + suffix)
+            else:
+                output_path = os.path.join(os.path.dirname(input_path), base_name + suffix)
+
+        out_dir = os.path.dirname(output_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+
+        # ClearVoice returns dict of {model: output_array} or writes to file
+        # Write result using the library's write method
+        cv.write(result, output_path=output_path)
+
+        if on_progress:
+            on_progress(100, "Audio enhanced!")
+
+        logger.info("ClearerVoice enhanced: %s -> %s", input_path, output_path)
+        return output_path
+
+    finally:
+        if temp_wav and os.path.isfile(temp_wav):
+            try:
+                os.remove(temp_wav)
+            except OSError:
+                pass
+        try:
+            import torch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except Exception:
+            pass
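The output-path convention in `enhance_speech_clearvoice` (input basename plus `_enhanced.wav`, honoring `output_dir` only when it is an existing directory) can be isolated as a small standalone helper. This sketch mirrors the arithmetic in the diff; the helper name `build_output_path` is hypothetical and does not exist in the repo:

```python
import os

def build_output_path(input_path: str, output_dir: str = "",
                      suffix: str = "_enhanced.wav") -> str:
    """Mirror of the diff's path logic: <basename><suffix>, placed in
    output_dir when it is an existing directory, else next to the input."""
    base_name = os.path.splitext(os.path.basename(input_path))[0]
    if output_dir and os.path.isdir(output_dir):
        return os.path.join(output_dir, base_name + suffix)
    return os.path.join(os.path.dirname(input_path), base_name + suffix)
```

Note that a non-existent `output_dir` silently falls back to the input's directory, so callers who want a guaranteed location should create the directory first.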

opencut/core/face_swap.py

Lines changed: 86 additions & 24 deletions
@@ -48,44 +48,82 @@ def enhance_faces(
     output_dir: str = "",
     model: str = "gfpgan",
     upscale: int = 2,
+    fidelity: float = 0.5,
     on_progress: Optional[Callable] = None,
 ) -> str:
     """
-    Enhance/restore faces in video using GFPGAN.
+    Enhance/restore faces in video using GFPGAN or CodeFormer.

     Upscales and restores face quality - fixes blur, compression artifacts,
     and low resolution on faces while preserving the rest of the frame.

     Args:
-        model: "gfpgan" (default, best general quality).
+        model: "gfpgan" (fast, good general quality) or "codeformer" (tunable fidelity, better identity).
         upscale: Face upscale factor (1-4).
+        fidelity: CodeFormer fidelity weight (0.0=quality, 1.0=fidelity). Ignored for GFPGAN.
     """
-    if not ensure_package("gfpgan", "gfpgan", on_progress):
-        raise RuntimeError("GFPGAN not installed. Run: pip install gfpgan")
     if not ensure_package("cv2", "opencv-python-headless", on_progress):
         raise RuntimeError("Failed to install opencv-python-headless. Install manually: pip install opencv-python-headless")

     import cv2
-    from gfpgan import GFPGANer
+
+    use_codeformer = model == "codeformer"
+
+    if use_codeformer:
+        if not ensure_package("basicsr", "basicsr", on_progress):
+            raise RuntimeError("basicsr not installed. Run: pip install basicsr")
+        if not ensure_package("facelib", "facexlib", on_progress):
+            raise RuntimeError("facexlib not installed. Run: pip install facexlib")
+    else:
+        if not ensure_package("gfpgan", "gfpgan", on_progress):
+            raise RuntimeError("GFPGAN not installed. Run: pip install gfpgan")

     if output_path is None:
         base = os.path.splitext(os.path.basename(video_path))[0]
         directory = output_dir or os.path.dirname(video_path)
         output_path = os.path.join(directory, f"{base}_enhanced.mp4")

     if on_progress:
-        on_progress(5, "Loading GFPGAN model...")
-
-    # Auto-download model
-    model_path = os.path.expanduser("~/.opencut/models/GFPGANv1.4.pth")
-    os.makedirs(os.path.dirname(model_path), exist_ok=True)
-
-    restorer = GFPGANer(
-        model_path=model_path,
-        upscale=upscale,
-        arch="clean",
-        channel_multiplier=2,
-    )
+        on_progress(5, f"Loading {model} model...")
+
+    if use_codeformer:
+        # CodeFormer — tunable fidelity, better identity preservation
+        import torch
+        from basicsr.archs.codeformer_arch import CodeFormer as CodeFormerArch
+        from basicsr.utils.download_util import load_file_from_url
+
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        codeformer_model_path = os.path.expanduser("~/.opencut/models/codeformer.pth")
+        os.makedirs(os.path.dirname(codeformer_model_path), exist_ok=True)
+        if not os.path.isfile(codeformer_model_path):
+            load_file_from_url(
+                "https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth",
+                model_dir=os.path.dirname(codeformer_model_path),
+                file_name="codeformer.pth",
+            )
+        codeformer_net = CodeFormerArch(dim_embd=512, codebook_size=1024, n_head=8, n_layers=9, connect_list=["32", "64", "128", "256"]).to(device)
+        ckpt = torch.load(codeformer_model_path, map_location=device, weights_only=True)
+        codeformer_net.load_state_dict(ckpt.get("params_ema", ckpt.get("params", ckpt)), strict=False)
+        codeformer_net.eval()
+
+        from facexlib.utils.face_restoration_helper import FaceRestoreHelper
+        face_helper = FaceRestoreHelper(
+            upscale_factor=upscale, face_size=512, crop_ratio=(1, 1),
+            det_model="retinaface_resnet50", save_ext="png", device=device,
+        )
+        restorer = None  # signal to use codeformer path
+    else:
+        from gfpgan import GFPGANer
+        # Auto-download model
+        model_path = os.path.expanduser("~/.opencut/models/GFPGANv1.4.pth")
+        os.makedirs(os.path.dirname(model_path), exist_ok=True)
+
+        restorer = GFPGANer(
+            model_path=model_path,
+            upscale=upscale,
+            arch="clean",
+            channel_multiplier=2,
+        )

     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():
@@ -119,12 +157,33 @@ def enhance_faces(
             break

         try:
-            _, _, output = restorer.enhance(frame, paste_back=True)
-            if output is not None:
-                output = cv2.resize(output, (orig_w, orig_h))
-                writer.write(output)
+            if use_codeformer:
+                import torch
+                face_helper.clean_all()
+                face_helper.read_image(frame)
+                face_helper.get_face_landmarks_5(only_center_face=False, resize=640, eye_dist_threshold=5)
+                face_helper.align_warp_face()
+                for cropped_face in face_helper.cropped_faces:
+                    cropped_t = torch.from_numpy(cropped_face.transpose(2, 0, 1)).float().unsqueeze(0) / 255.0
+                    cropped_t = cropped_t.to(face_helper.device)
+                    with torch.no_grad():
+                        cf_output = codeformer_net(cropped_t, w=fidelity, adain=True)[0]
+                    restored = cf_output.squeeze(0).clamp(0, 1).cpu().numpy().transpose(1, 2, 0) * 255
+                    face_helper.add_restored_face(restored.astype("uint8"))
+                face_helper.get_inverse_affine(None)
+                output = face_helper.paste_faces_to_input_image()
+                if output is not None:
+                    output = cv2.resize(output, (orig_w, orig_h))
+                    writer.write(output)
+                else:
+                    writer.write(frame)
             else:
-                writer.write(frame)
+                _, _, output = restorer.enhance(frame, paste_back=True)
+                if output is not None:
+                    output = cv2.resize(output, (orig_w, orig_h))
+                    writer.write(output)
+                else:
+                    writer.write(frame)
         except Exception as e:
             logger.debug("Face enhance frame %d failed: %s", frame_idx, e)
             writer.write(frame)
@@ -154,9 +213,12 @@ def enhance_faces(
             os.unlink(tmp_video)
         except OSError:
             pass
-    # Free GPU memory from GFPGAN model
+    # Free GPU memory from face enhancement model
     try:
-        del restorer
+        if use_codeformer:
+            del codeformer_net, face_helper
+        else:
+            del restorer
     except Exception:
         pass
     try:
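The per-face tensor plumbing in the CodeFormer branch (HWC uint8 BGR crop to a normalized NCHW float tensor, and back after restoration) is easy to get wrong. Here is the same transpose/scale round trip in plain NumPy as an illustration only; the real code uses torch tensors, and the rounding before the final cast is a deliberate safeguard added here (the diff casts directly):

```python
import numpy as np

def face_to_tensor(face: np.ndarray) -> np.ndarray:
    """HWC uint8 -> 1xCxHxW float32 in [0, 1] (the layout the net consumes)."""
    return face.transpose(2, 0, 1)[np.newaxis].astype(np.float32) / 255.0

def tensor_to_face(t: np.ndarray) -> np.ndarray:
    """1xCxHxW float in [0, 1] -> HWC uint8 (the layout paste-back expects).
    Rounding avoids off-by-one truncation from float32 error."""
    return (np.clip(t[0], 0.0, 1.0).transpose(1, 2, 0) * 255).round().astype(np.uint8)
```

A direct `astype(np.uint8)` without rounding can map a value like 0.99999994 * 255 down by one, which is why the round is worth keeping in any reimplementation.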

opencut/core/face_tools.py

Lines changed: 48 additions & 3 deletions
@@ -33,8 +33,16 @@ def check_mediapipe_available() -> bool:
         return False


+def check_insightface_available() -> bool:
+    try:
+        import insightface  # noqa: F401
+        return True
+    except ImportError:
+        return False
+
+
 def check_face_tools_available() -> Dict:
-    caps = {"mediapipe": check_mediapipe_available()}
+    caps = {"mediapipe": check_mediapipe_available(), "insightface": check_insightface_available()}
     try:
         import cv2  # noqa: F401
         caps["opencv"] = True
@@ -82,6 +90,30 @@ def _detect_faces_haar(frame, cascade):
     return [(int(x), int(y), int(w), int(h)) for (x, y, w, h) in rects]


+def _detect_faces_insightface(frame, app):
+    """Detect faces using InsightFace buffalo_l. Returns list of (x, y, w, h) rects.
+
+    Higher accuracy than MediaPipe/Haar, especially on difficult angles,
+    occlusions, and low-resolution faces. Uses RetinaFace detector internally.
+    """
+    faces = app.get(frame)
+    rects = []
+    for face in faces:
+        bbox = face.bbox.astype(int)
+        x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
+        # Add padding (15%)
+        w = x2 - x1
+        h = y2 - y1
+        pad_x = int(w * 0.15)
+        pad_y = int(h * 0.15)
+        x1 = max(0, x1 - pad_x)
+        y1 = max(0, y1 - pad_y)
+        w = min(frame.shape[1] - x1, w + 2 * pad_x)
+        h = min(frame.shape[0] - y1, h + 2 * pad_y)
+        rects.append((x1, y1, w, h))
+    return rects
+
+
 # ---------------------------------------------------------------------------
 # Face Blur / Pixelate
 # ---------------------------------------------------------------------------
@@ -100,7 +132,7 @@ def blur_faces(
     Args:
         method: "gaussian" (smooth blur), "pixelate" (mosaic), "black" (solid box).
         strength: Blur kernel size (odd number, higher = more blur). For pixelate, block size.
-        detector: "mediapipe" (best) or "haar" (fallback, no install needed).
+        detector: "insightface" (highest accuracy), "mediapipe" (fast), or "haar" (fallback).
     """
     if not ensure_package("cv2", "opencv-python-headless", on_progress):
         raise RuntimeError("Failed to install opencv-python-headless. Install manually: pip install opencv-python-headless")
@@ -119,6 +151,17 @@ def blur_faces(
     # Set up detector
     face_det = None
     mp_face = None
+    insight_app = None
+    if detector == "insightface":
+        try:
+            ensure_package("insightface", "insightface", on_progress)
+            ensure_package("onnxruntime", "onnxruntime", on_progress)
+            import insightface
+            insight_app = insightface.app.FaceAnalysis(name="buffalo_l", providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
+            insight_app.prepare(ctx_id=0, det_size=(640, 640))
+        except Exception:
+            detector = "mediapipe"
+
     if detector == "mediapipe":
         try:
             ensure_package("mediapipe", "mediapipe", on_progress)
@@ -162,7 +205,9 @@ def blur_faces(
             continue

         # Detect faces
-        if detector == "mediapipe" and mp_face:
+        if detector == "insightface" and insight_app:
+            rects = _detect_faces_insightface(frame, insight_app)
+        elif detector == "mediapipe" and mp_face:
             rects = _detect_faces_mediapipe(frame, mp_face)
         else:
             rects = _detect_faces_haar(frame, face_det)