fix: batch 27 — 17 audit fixes for v1.4.0 new code

SysAdminDoc · SysAdminDoc · commit aad18a1d6c76 · 2026-03-19T02:37:50.000-04:00
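MCP server: job_id path injection blocked (regex validation), filepath
traversal blocked (defense-in-depth), CSRF token refreshed and request
retried once on 403 (stale token after backend restart), stale version
fixed (serverInfo now uses __version__), JSON-RPC parse error (-32700)
response returned for malformed input per spec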

SeamlessM4T: model loaded ONCE for all segments (was loading 2.3GB
per-segment causing OOM). GPU cleanup in finally block.

Transcript cache: only used for SRT/VTT/JSON export (not styled
captions). Cached segments now have .start/.end/.text/.words attrs.

Queue allowlist: added 4 missing v1.4.0 endpoints (ace-step,
ai-denoise, ai-lut, lut-blend)

Also: TTS engine allowlist, ACE-Step result validation, BasicVSR++
missing-weights error, LUT blend size-mismatch error, Chatterbox
sample-rate fallback, ClearerVoice output validation, audio-separator
stems filtering, waveform button type, PySceneDetect start boundary,
Remotion template component, LUT name reserved-word validation
diff --git a/extension/com.opencut.panel/client/main.js b/extension/com.opencut.panel/client/main.js
@@ -5268,6 +5268,7 @@
             parent = parent.parentNode;
             if (!parent || parent.querySelector(".waveform-audio-btn")) continue;
             var btn = document.createElement("button");
+            btn.type = "button";
             btn.className = "btn-outline btn-sm waveform-audio-btn";
             btn.textContent = "Preview Waveform";
             btn.style.marginBottom = "6px";
diff --git a/opencut/core/audio_enhance.py b/opencut/core/audio_enhance.py
@@ -348,6 +348,9 @@ def enhance_speech_clearvoice(
         # Write result using the library's write method
         cv.write(result, output_path=output_path)
 
+        if not os.path.isfile(output_path) or os.path.getsize(output_path) == 0:
+            raise RuntimeError("ClearerVoice produced empty or missing output file")
+
         if on_progress:
             on_progress(100, "Audio enhanced!")
 
diff --git a/opencut/core/lut_library.py b/opencut/core/lut_library.py
@@ -849,14 +849,18 @@ def _parse_cube(path):
                 ba = vals_a[i][2] * (1 - blend) + vals_b[i][2] * blend
                 f.write(f"{_clamp(ra):.6f} {_clamp(ga):.6f} {_clamp(ba):.6f}\n")
         else:
-            # Sizes differ — generate identity-blended output
+            raise ValueError(
+                f"LUT size mismatch: {lut_a_name} has {len(vals_a)} entries, "
+                f"{lut_b_name} has {len(vals_b)} entries (need {total_entries} for size {size}). "
+                f"Both LUTs must have the same cube size for blending."
+            )
+            # Unreachable: the raise above always fires; legacy identity fallback left below
             for b_i in range(size):
                 for g_i in range(size):
                     for r_i in range(size):
                         r = r_i / (size - 1)
                         g = g_i / (size - 1)
                         b = b_i / (size - 1)
-                        # Just write identity when sizes mismatch (safe fallback)
                         f.write(f"{r:.6f} {g:.6f} {b:.6f}\n")
 
     if on_progress:
diff --git a/opencut/core/motion_graphics.py b/opencut/core/motion_graphics.py
@@ -421,4 +421,28 @@ def _generate_default_template(template_dir: str, template_name: str):
     with open(os.path.join(template_dir, "package.json"), "w") as f:
         json.dump(package, f, indent=2)
 
+    # Create a minimal Main composition component
+    comp_code = """import {Composition, useCurrentFrame, useVideoConfig, interpolate} from 'remotion';
+import React from 'react';
+
+const Main = () => {
+  const frame = useCurrentFrame();
+  const {fps, durationInFrames, width, height} = useVideoConfig();
+  const opacity = interpolate(frame, [0, fps * 0.5], [0, 1], {extrapolateRight: 'clamp'});
+  const outOpacity = interpolate(frame, [durationInFrames - fps * 0.5, durationInFrames], [1, 0], {extrapolateLeft: 'clamp'});
+  return (
+    <div style={{width, height, display: 'flex', alignItems: 'center', justifyContent: 'center', background: '#1e1e2e', opacity: Math.min(opacity, outOpacity)}}>
+      <h1 style={{color: '#cdd6f4', fontSize: 64, fontFamily: 'sans-serif', textAlign: 'center', padding: 40}}>Title</h1>
+    </div>
+  );
+};
+
+export const RemotionRoot = () => (
+  <Composition id="Main" component={Main} durationInFrames={150} fps={30} width={1920} height={1080} />
+);
+"""
+    os.makedirs(os.path.join(template_dir, "src"), exist_ok=True)
+    with open(os.path.join(template_dir, "src", "index.tsx"), "w") as f:
+        f.write(comp_code)
+
     logger.info("Generated default Remotion template: %s", template_dir)
diff --git a/opencut/core/music_ai.py b/opencut/core/music_ai.py
@@ -310,12 +310,20 @@ def generate_music_ace_step(
 
     # Save output
     import soundfile as sf
-    audio_data = result["audio"]
+    if isinstance(result, dict):
+        audio_data = result.get("audio")
+    elif hasattr(result, "cpu"):
+        audio_data = result  # tensor returned directly
+    else:
+        raise RuntimeError(f"Unexpected ACE-Step result type: {type(result)}")
+    if audio_data is None:
+        raise RuntimeError("ACE-Step produced no audio output")
     if hasattr(audio_data, "cpu"):
         audio_data = audio_data.cpu().numpy()
     if audio_data.ndim == 2:
         audio_data = audio_data.T
-    sf.write(output_path, audio_data, result.get("sample_rate", 44100))
+    sr = result.get("sample_rate", 44100) if isinstance(result, dict) else 44100
+    sf.write(output_path, audio_data, sr)
 
     # Free GPU
     try:
diff --git a/opencut/core/scene_detect.py b/opencut/core/scene_detect.py
@@ -502,15 +502,16 @@ def detect_scenes_pyscenedetect(
     if on_progress:
         on_progress(80, "Building scene boundaries...")
 
-    boundaries = []
+    boundaries = [SceneBoundary(time=0.0, frame=0, score=1.0, label="Start")]
     for i, (start, end) in enumerate(scene_list):
         start_sec = start.get_seconds()
-        boundaries.append(SceneBoundary(
-            time=round(start_sec, 3),
-            frame=start.get_frames(),
-            score=1.0,
-            label=f"Scene {i + 1}",
-        ))
+        if start_sec > 0.01:  # Skip if scene starts at 0 (duplicate of Start)
+            boundaries.append(SceneBoundary(
+                time=round(start_sec, 3),
+                frame=start.get_frames(),
+                score=1.0,
+                label=f"Scene {i + 1}",
+            ))
 
     total_scenes = len(boundaries)
     avg_scene = duration / total_scenes if total_scenes > 0 else duration
diff --git a/opencut/core/video_ai.py b/opencut/core/video_ai.py
@@ -486,7 +486,11 @@ def _denoise_basicvsr(
         ckpt = torch.load(weights_path, map_location=device, weights_only=True)
         model.load_state_dict(ckpt.get("params", ckpt.get("params_ema", ckpt)), strict=False)
     else:
-        logger.warning("BasicVSR++ weights not found at %s — using untrained model", weights_path)
+        raise RuntimeError(
+            f"BasicVSR++ weights not found at {weights_path}. "
+            "Download from https://github.com/ckkelvinchan/BasicVSR_PlusPlus "
+            "and place as ~/.opencut/models/basicvsrpp_denoise.pth"
+        )
 
     model.eval()
 
diff --git a/opencut/core/voice_gen.py b/opencut/core/voice_gen.py
@@ -365,7 +365,8 @@ def chatterbox_generate(
         on_progress(80, "Saving audio...")
 
     import torchaudio
-    torchaudio.save(output_path, wav, model.sr)
+    sample_rate = getattr(model, "sr", getattr(model, "sample_rate", 24000))
+    torchaudio.save(output_path, wav, sample_rate)
 
     # Free GPU memory
     try:
diff --git a/opencut/mcp_server.py b/opencut/mcp_server.py
@@ -15,29 +15,37 @@
 
 import json
 import logging
+import re
 import sys
 import urllib.error
 import urllib.request
 
+from opencut import __version__
+
 logger = logging.getLogger("opencut.mcp")
 
 BACKEND_URL = "http://127.0.0.1:5679"
 _csrf_token = ""
 
 
+def _refresh_csrf():
+    """Fetch fresh CSRF token from backend."""
+    global _csrf_token
+    try:
+        req = urllib.request.Request(f"{BACKEND_URL}/health")
+        with urllib.request.urlopen(req, timeout=5) as resp:
+            body = json.loads(resp.read())
+            _csrf_token = body.get("csrf_token", "")
+    except Exception:
+        pass
+
+
 def _api(method, path, data=None):
     """Call the OpenCut Flask backend."""
     global _csrf_token
 
-    # Get CSRF token if we don't have one
     if not _csrf_token:
-        try:
-            req = urllib.request.Request(f"{BACKEND_URL}/health")
-            with urllib.request.urlopen(req, timeout=5) as resp:
-                body = json.loads(resp.read())
-                _csrf_token = body.get("csrf_token", "")
-        except Exception:
-            pass
+        _refresh_csrf()
 
     url = f"{BACKEND_URL}{path}"
     headers = {"Content-Type": "application/json"}
@@ -51,6 +59,16 @@ def _api(method, path, data=None):
         with urllib.request.urlopen(req, timeout=120) as resp:
             return json.loads(resp.read())
     except urllib.error.HTTPError as e:
+        # Retry once on 403 (stale CSRF token after backend restart)
+        if e.code == 403 and _csrf_token:
+            _refresh_csrf()
+            headers["X-OpenCut-Token"] = _csrf_token
+            req2 = urllib.request.Request(url, data=body, headers=headers, method=method)
+            try:
+                with urllib.request.urlopen(req2, timeout=120) as resp2:
+                    return json.loads(resp2.read())
+            except Exception:
+                pass
         error_body = e.read().decode(errors="replace")
         try:
             return json.loads(error_body)
@@ -214,16 +232,34 @@ def _api(method, path, data=None):
 }
 
 
+def _validate_mcp_filepath(args, key="filepath"):
+    """Validate filepath arguments at MCP layer (defense-in-depth)."""
+    path = args.get(key, "")
+    if not isinstance(path, str):
+        return False
+    if ".." in path or "\x00" in path:
+        return False
+    return True
+
+
 def handle_tool_call(tool_name, arguments):
     """Execute an MCP tool call by proxying to the Flask backend."""
     if tool_name not in _TOOL_ROUTES:
         return {"error": f"Unknown tool: {tool_name}"}
 
+    # Validate filepath arguments at MCP layer
+    for key in ("filepath", "style_image", "voice_ref"):
+        if key in arguments and not _validate_mcp_filepath(arguments, key):
+            return {"error": f"Invalid {key}: path traversal or null bytes detected"}
+
     method, path = _TOOL_ROUTES[tool_name]
 
     # Handle special routing
     if tool_name == "opencut_job_status":
         job_id = arguments.get("job_id", "")
+        # Validate job_id format (lowercase hex digits and hyphens only, as in a UUID)
+        if not re.match(r'^[a-f0-9-]+$', job_id):
+            return {"error": "Invalid job_id format"}
         path = f"/status/{job_id}"
         return _api("GET", path)
 
@@ -255,6 +291,9 @@ def run_mcp_stdio():
         try:
             msg = json.loads(line)
         except json.JSONDecodeError:
+            err = {"jsonrpc": "2.0", "id": None, "error": {"code": -32700, "message": "Parse error"}}
+            sys.stdout.write(json.dumps(err) + "\n")
+            sys.stdout.flush()
             continue
 
         msg_id = msg.get("id")
@@ -270,7 +309,7 @@ def run_mcp_stdio():
                     "capabilities": {"tools": {}},
                     "serverInfo": {
                         "name": "opencut",
-                        "version": "1.3.1",
+                        "version": __version__,
                     },
                 },
             }
diff --git a/opencut/routes/audio.py b/opencut/routes/audio.py
@@ -538,9 +538,14 @@ def _process():
                 output_files = separator.separate(input_audio)
 
                 output_paths = []
+                # Filter to requested stems only
+                requested = set(stems)
                 for f in output_files:
                     if os.path.isfile(f):
-                        output_paths.append(f)
+                        fname = os.path.splitext(os.path.basename(f))[0].lower()
+                        # Match if any requested stem appears in filename
+                        if not requested or any(s in fname for s in requested):
+                            output_paths.append(f)
 
                 if temp_audio and os.path.exists(temp_audio):
                     try:
@@ -1124,6 +1129,8 @@ def tts_generate():
     data = request.get_json(force=True)
     text = data.get("text", "").strip()
     engine = data.get("engine", "edge")
+    if engine not in ("edge", "kokoro", "chatterbox"):
+        engine = "edge"
     voice = data.get("voice", "en-US-AriaNeural")
     import re as _re_tts
     rate = data.get("rate", "+0%")
diff --git a/opencut/routes/captions.py b/opencut/routes/captions.py
@@ -162,11 +162,22 @@ def _process():
                 word_timestamps=word_timestamps,
             )
 
-            # Check transcript cache first (avoids re-transcribing)
-            cached = get_cached_transcript(filepath, model=model)
-            if cached and not data.get("force_retranscribe", False):
+            # Check transcript cache (only for SRT/VTT/JSON export — styled captions need Segment objects)
+            _use_cache = sub_format in ("srt", "vtt", "json") and not data.get("force_retranscribe", False)
+            cached = get_cached_transcript(filepath, model=model) if _use_cache else None
+            if cached:
                 _update_job(job_id, progress=20, message="Using cached transcript...")
-                result = type("TranscriptResult", (), {"segments": cached, "language": language or "en"})()
+                # Build minimal objects with .start/.end/.text for export functions
+                _CachedSeg = type("CachedSeg", (), {})
+                cached_segs = []
+                for cs in cached:
+                    seg_obj = _CachedSeg()
+                    seg_obj.start = cs.get("start", 0)
+                    seg_obj.end = cs.get("end", 0)
+                    seg_obj.text = cs.get("text", "")
+                    seg_obj.words = []
+                    cached_segs.append(seg_obj)
+                result = type("TranscriptResult", (), {"segments": cached_segs, "language": language or "en"})()
             else:
                 _update_job(job_id, progress=20, message="Transcribing audio (this takes a while for long files)...")
                 result = transcribe(filepath, config=config)
@@ -910,21 +921,38 @@ def _on_progress(pct, msg):
                 _update_job(job_id, progress=pct, message=msg)
 
             if backend == "seamless":
-                # SeamlessM4T v2 — higher quality, per-segment translation
-                from opencut.core.captions_enhanced import translate_text_seamless
-                translated = []
-                total = len(segments)
-                for i, seg in enumerate(segments):
-                    text = seg.get("text", "").strip()
-                    if not text:
-                        translated.append(seg.copy())
-                        continue
-                    t = translate_text_seamless(text, source_lang=source_lang, target_lang=target_lang)
-                    new_seg = seg.copy()
-                    new_seg["text"] = t
-                    translated.append(new_seg)
-                    if i % 10 == 0:
-                        _on_progress(10 + int((i / total) * 85), f"Translating {i+1}/{total}...")
+                # SeamlessM4T v2 — load model ONCE, translate all segments
+                _on_progress(10, "Loading SeamlessM4T v2 model...")
+                import torch
+                from transformers import AutoProcessor, SeamlessM4Tv2ForTextToText
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+                model_id = "facebook/seamless-m4t-v2-large"
+                processor = AutoProcessor.from_pretrained(model_id)
+                s_model = SeamlessM4Tv2ForTextToText.from_pretrained(model_id).to(device)
+                try:
+                    translated = []
+                    total = len(segments)
+                    for i, seg in enumerate(segments):
+                        text = seg.get("text", "").strip()
+                        if not text:
+                            translated.append(seg.copy())
+                            continue
+                        inputs = processor(text=text, src_lang=source_lang, return_tensors="pt").to(device)
+                        with torch.inference_mode():
+                            tokens = s_model.generate(**inputs, tgt_lang=target_lang, max_new_tokens=512)
+                        t = processor.decode(tokens[0].tolist(), skip_special_tokens=True)
+                        new_seg = seg.copy()
+                        new_seg["text"] = t
+                        translated.append(new_seg)
+                        if i % 10 == 0:
+                            _on_progress(10 + int((i / total) * 85), f"Translating {i+1}/{total}...")
+                finally:
+                    del s_model
+                    try:
+                        if torch.cuda.is_available():
+                            torch.cuda.empty_cache()
+                    except Exception:
+                        pass
             else:
                 from opencut.core.captions_enhanced import translate_segments
                 translated = translate_segments(
diff --git a/opencut/routes/jobs_routes.py b/opencut/routes/jobs_routes.py
@@ -156,7 +156,10 @@ def generate():
     "/video/pip", "/video/blend", "/video/merge", "/video/trim",
     "/video/object/remove", "/video/watermark",
     "/export-video",
-    "/video/ai/upscale", "/video/style/apply", "/video/style/arbitrary", "/video/ai/rembg",
+    "/video/ai/upscale", "/video/ai/denoise",
+    "/video/style/apply", "/video/style/arbitrary", "/video/ai/rembg",
+    "/video/lut/generate-ai", "/video/lut/blend",
+    "/audio/music-ai/ace-step",
 })
 
 
diff --git a/opencut/routes/video.py b/opencut/routes/video.py
@@ -3640,6 +3640,10 @@ def video_lut_ai():
         return jsonify({"error": str(e)}), 400
 
     lut_name = data.get("lut_name", "").strip() or ""
+    # Defense-in-depth: reject Windows reserved names and path chars
+    _RESERVED = {"CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "LPT1", "LPT2", "LPT3"}
+    if lut_name and (lut_name.upper().split(".")[0] in _RESERVED or re.search(r'[<>:"/\\|?*]', lut_name)):
+        return jsonify({"error": "Invalid LUT name"}), 400
 
     job_id = _new_job("lut-gen-ai", reference_path)