[detector] Add new detector interface and implement it for ContentDetector

Breakthrough · Breakthrough · commit 3dfeb9ad0ad9 · 2025-03-07T21:35:59.000-05:00
Fixing VFR videos requires that detectors have knowledge of timestamps, so we should tackle the detector API overhaul now.

This commit introduces the new API as well as a basic implementation for ContentDetector to show what needs to be changed.

The new detector is used when specifying `detect-content -b`.  It does not yet enforce min_scene_len, and may still have bugs.
diff --git a/scenedetect/_cli/__init__.py b/scenedetect/_cli/__init__.py
@@ -41,6 +41,7 @@
 from scenedetect.detectors import (
     AdaptiveDetector,
     ContentDetector,
+    ContentDetector2,
     HashDetector,
     HistogramDetector,
     ThresholdDetector,
@@ -561,6 +562,12 @@ def time_command(
         USER_CONFIG.get_help_string("detect-content", "filter-mode"),
     ),
 )
+@click.option(
+    "-b",
+    "--beta",
+    is_flag=True,
+    flag_value=True,
+)
 @click.pass_context
 def detect_content_command(
     ctx: click.Context,
@@ -570,6 +577,7 @@ def detect_content_command(
     kernel_size: ty.Optional[int],
     min_scene_len: ty.Optional[str],
     filter_mode: ty.Optional[str],
+    beta,
 ):
     ctx = ctx.obj
     assert isinstance(ctx, CliContext)
@@ -581,7 +589,7 @@ def detect_content_command(
         kernel_size=kernel_size,
         filter_mode=filter_mode,
     )
-    ctx.add_detector(ContentDetector, detector_args)
+    ctx.add_detector(ContentDetector if not beta else ContentDetector2, detector_args)
 
 
 DETECT_ADAPTIVE_HELP = """Find fast cuts using diffs in HSL colorspace (rolling average).
diff --git a/scenedetect/detector.py b/scenedetect/detector.py
@@ -0,0 +1,63 @@
+#
+#            PySceneDetect: Python-Based Video Scene Detector
+#   -------------------------------------------------------------------
+#     [  Site:    https://scenedetect.com                           ]
+#     [  Docs:    https://scenedetect.com/docs/                     ]
+#     [  Github:  https://github.com/Breakthrough/PySceneDetect/    ]
+#
+# Copyright (C) 2014-2025 Brandon Castellano <http://www.bcastell.com>.
+# PySceneDetect is licensed under the BSD 3-Clause License; see the
+# included LICENSE file, or visit one of the above pages for details.
+#
+
+"""``scenedetect.detector`` Module
+
+This module contains the :class:`Detector` interface which all detectors must implement (e.g. those
+in the :mod:`scenedetect.detectors` module)."""
+
+import typing as ty
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from enum import Enum
+
+import numpy
+
+from scenedetect.frame_timecode import FrameTimecode
+from scenedetect.stats_manager import StatsManager
+
+# TODO: Documentation.
+
+
+class EventType(Enum):
+    CUT = 0
+    FADE_IN = 1
+    FADE_OUT = 2
+
+
+@dataclass
+class Event:
+    type: EventType
+    time: FrameTimecode
+    data: ty.Dict[str, ty.Any] = field(default_factory=dict)
+
+
+class DetectorBase:
+    def __init__(self):
+        self._stats = None
+
+    @property
+    def stats(self) -> ty.Optional[StatsManager]:
+        return self._stats
+
+    # For use by SceneManager to register stats handler with this detector.
+    def _set_stats_manager(self, stats: StatsManager):
+        assert self._stats is None
+        self._stats = stats
+
+
+class Detector(ABC, DetectorBase):
+    @abstractmethod
+    def process(self, frame: numpy.ndarray, timecode: FrameTimecode) -> ty.List[Event]: ...
+
+    def postprocess(self) -> ty.List[Event]:
+        return []
diff --git a/scenedetect/detectors/__init__.py b/scenedetect/detectors/__init__.py
@@ -35,7 +35,7 @@
 processing videos, however they can also be used to process frames directly.
 """
 
-from scenedetect.detectors.content_detector import ContentDetector  # noqa: I001
+from scenedetect.detectors.content_detector import ContentDetector, ContentDetector2  # noqa: I001
 from scenedetect.detectors.threshold_detector import ThresholdDetector
 from scenedetect.detectors.adaptive_detector import AdaptiveDetector
 from scenedetect.detectors.hash_detector import HashDetector
diff --git a/scenedetect/detectors/content_detector.py b/scenedetect/detectors/content_detector.py
@@ -16,13 +16,17 @@
 """
 
 import math
+import typing as ty
 from dataclasses import dataclass
-from typing import List, NamedTuple, Optional
+from typing import NamedTuple
 
 import cv2
 import numpy
 
+from scenedetect.detector import Detector, Event, EventType
+from scenedetect.frame_timecode import FrameTimecode
 from scenedetect.scene_detector import FlashFilter, SceneDetector
+from scenedetect.stats_manager import StatsManager
 
 
 def _mean_pixel_distance(left: numpy.ndarray, right: numpy.ndarray) -> float:
@@ -97,7 +101,7 @@ class _FrameData:
         """Frame saturation map [2D 8-bit]."""
         lum: numpy.ndarray
         """Frame luma/brightness map [2D 8-bit]."""
-        edges: Optional[numpy.ndarray]
+        edges: ty.Optional[numpy.ndarray]
         """Frame edge map [2D 8-bit, edges are 255, non edges 0]. Affected by `kernel_size`."""
 
     def __init__(
@@ -106,7 +110,7 @@ def __init__(
         min_scene_len: int = 15,
         weights: "ContentDetector.Components" = DEFAULT_COMPONENT_WEIGHTS,
         luma_only: bool = False,
-        kernel_size: Optional[int] = None,
+        kernel_size: ty.Optional[int] = None,
         filter_mode: FlashFilter.Mode = FlashFilter.Mode.MERGE,
     ):
         """
@@ -126,17 +130,17 @@ def __init__(
         super().__init__()
         self._threshold: float = threshold
         self._min_scene_len: int = min_scene_len
-        self._last_above_threshold: Optional[int] = None
-        self._last_frame: Optional[ContentDetector._FrameData] = None
+        self._last_above_threshold: ty.Optional[int] = None
+        self._last_frame: ty.Optional[ContentDetector._FrameData] = None
         self._weights: ContentDetector.Components = weights
         if luma_only:
             self._weights = ContentDetector.LUMA_ONLY_WEIGHTS
-        self._kernel: Optional[numpy.ndarray] = None
+        self._kernel: ty.Optional[numpy.ndarray] = None
         if kernel_size is not None:
             if kernel_size < 3 or kernel_size % 2 == 0:
                 raise ValueError("kernel_size must be odd integer >= 3")
             self._kernel = numpy.ones((kernel_size, kernel_size), numpy.uint8)
-        self._frame_score: Optional[float] = None
+        self._frame_score: ty.Optional[float] = None
         self._flash_filter = FlashFilter(mode=filter_mode, length=min_scene_len)
 
     def get_metrics(self):
@@ -187,7 +191,7 @@ def _calculate_frame_score(self, frame_num: int, frame_img: numpy.ndarray) -> fl
         self._last_frame = ContentDetector._FrameData(hue, sat, lum, edges)
         return frame_score
 
-    def process_frame(self, frame_num: int, frame_img: numpy.ndarray) -> List[int]:
+    def process_frame(self, frame_num: int, frame_img: numpy.ndarray) -> ty.List[int]:
         """Process the next frame. `frame_num` is assumed to be sequential.
 
         Args:
@@ -196,7 +200,7 @@ def process_frame(self, frame_num: int, frame_img: numpy.ndarray) -> List[int]:
             frame_img (numpy.ndarray or None): Video frame corresponding to `frame_img`.
 
         Returns:
-            List[int]: List of frames where scene cuts have been detected. There may be 0
+            ty.List[int]: List of frames where scene cuts have been detected. There may be 0
             or more frames in the list, and not necessarily the same as frame_num.
         """
         self._frame_score = self._calculate_frame_score(frame_num, frame_img)
@@ -237,3 +241,205 @@ def _detect_edges(self, lum: numpy.ndarray) -> numpy.ndarray:
     @property
     def event_buffer_length(self) -> int:
         return self._flash_filter.max_behind
+
+
+# TODO: Make ContentDetector implement both interfaces once ContentDetector2 is at feature parity.
+# ContentDetector2 does not yet enforce `min_scene_len`, as FlashFilter needs to be transitioned
+# to work off of time instead of frames.
+class ContentDetector2(Detector):
+    """Detects fast cuts using changes in colour and intensity between frames.
+
+    The difference is calculated in the HSV color space, and compared against a set threshold to
+    determine when a fast cut has occurred.
+    """
+
+    # TODO: Come up with some good weights for a new default if there is one that can pass
+    # a wider variety of test cases.
+    class Components(NamedTuple):
+        """Components that make up a frame's score, and their default values."""
+
+        delta_hue: float = 1.0
+        """Difference between pixel hue values of adjacent frames."""
+        delta_sat: float = 1.0
+        """Difference between pixel saturation values of adjacent frames."""
+        delta_lum: float = 1.0
+        """Difference between pixel luma (brightness) values of adjacent frames."""
+        delta_edges: float = 0.0
+        """Difference between calculated edges of adjacent frames.
+
+        Edge differences are typically larger than the other components, so the detection
+        threshold may need to be adjusted accordingly."""
+
+    DEFAULT_COMPONENT_WEIGHTS = Components()
+    """Default component weights. Actual default values are specified in :class:`Components`
+    to allow adding new components without breaking existing usage."""
+
+    LUMA_ONLY_WEIGHTS = Components(
+        delta_hue=0.0,
+        delta_sat=0.0,
+        delta_lum=1.0,
+        delta_edges=0.0,
+    )
+    """Component weights to use if `luma_only` is set."""
+
+    FRAME_SCORE_KEY = "content_val"
+    """Statsfile key for the final frame score, after weighting by the specified components."""
+
+    METRIC_KEYS = [FRAME_SCORE_KEY, *Components._fields]
+    """All statsfile keys this detector produces."""
+
+    @dataclass
+    class _FrameData:
+        """Data calculated for a given frame."""
+
+        hue: numpy.ndarray
+        """Frame hue map [2D 8-bit]."""
+        sat: numpy.ndarray
+        """Frame saturation map [2D 8-bit]."""
+        lum: numpy.ndarray
+        """Frame luma/brightness map [2D 8-bit]."""
+        edges: ty.Optional[numpy.ndarray]
+        """Frame edge map [2D 8-bit, edges are 255, non edges 0]. Affected by `kernel_size`."""
+
+    def __init__(
+        self,
+        threshold: float = 27.0,
+        min_scene_len: FrameTimecode = 15,
+        weights: "ContentDetector2.Components" = DEFAULT_COMPONENT_WEIGHTS,
+        luma_only: bool = False,
+        kernel_size: ty.Optional[int] = None,
+        filter_mode: FlashFilter.Mode = FlashFilter.Mode.MERGE,
+    ):
+        """
+        Arguments:
+            threshold: Threshold the average change in pixel intensity must exceed to trigger a cut.
+            min_scene_len: Once a cut is detected, this many frames must pass before a new one can
+                be added to the scene list. Can be an int or FrameTimecode. Not yet enforced.
+            weights: Weight to place on each component when calculating frame score
+                (`content_val` in a statsfile, the value `threshold` is compared against).
+            luma_only: If True, only considers changes in the luminance channel of the video.
+                Equivalent to specifying `weights` as :data:`ContentDetector2.LUMA_ONLY_WEIGHTS`.
+                Overrides `weights` if both are set.
+            kernel_size: Size of kernel for expanding detected edges. Must be odd integer
+                greater than or equal to 3. If None, automatically set using video resolution.
+            filter_mode: Mode to use when filtering cuts to meet `min_scene_len`.
+        """
+        super().__init__()
+        self._threshold: float = threshold
+        self._min_scene_len: FrameTimecode = min_scene_len
+        self._last_above_threshold: ty.Optional[FrameTimecode] = None
+        self._last_frame: ty.Optional[ContentDetector2._FrameData] = None
+        self._weights: ContentDetector2.Components = weights
+        if luma_only:
+            self._weights = ContentDetector2.LUMA_ONLY_WEIGHTS
+        self._kernel: ty.Optional[numpy.ndarray] = None
+        if kernel_size is not None:
+            if kernel_size < 3 or kernel_size % 2 == 0:
+                raise ValueError("kernel_size must be odd integer >= 3")
+            self._kernel = numpy.ones((kernel_size, kernel_size), numpy.uint8)
+        self._frame_score: ty.Optional[float] = None
+        self._flash_filter = FlashFilter(mode=filter_mode, length=min_scene_len)
+        self._stats: ty.Optional[StatsManager] = None
+
+    def get_metrics(self):
+        return ContentDetector2.METRIC_KEYS
+
+    def set_stats_manager(self, stats: StatsManager):
+        self._stats = stats
+
+    def _calculate_frame_score(self, frame: numpy.ndarray, timecode: FrameTimecode) -> float:
+        """Calculate score representing relative amount of motion in `frame` compared to
+        the last time the function was called (returns 0.0 on the first call)."""
+        # TODO: Add option to enable motion estimation before calculating score components.
+        # TODO: Investigate methods of performing cheaper alternatives, e.g. shifting or resizing
+        # the frame to simulate camera movement, using optical flow, etc...
+
+        # Convert image into HSV colorspace.
+        hue, sat, lum = cv2.split(cv2.cvtColor(frame, cv2.COLOR_BGR2HSV))
+
+        # Performance: Only calculate edges if we have to.
+        calculate_edges: bool = (self._weights.delta_edges > 0.0) or self._stats is not None
+        edges = self._detect_edges(lum) if calculate_edges else None
+
+        if self._last_frame is None:
+            # Need another frame to compare with for score calculation.
+            self._last_frame = ContentDetector2._FrameData(hue, sat, lum, edges)
+            return 0.0
+
+        score_components = ContentDetector2.Components(
+            delta_hue=_mean_pixel_distance(hue, self._last_frame.hue),
+            delta_sat=_mean_pixel_distance(sat, self._last_frame.sat),
+            delta_lum=_mean_pixel_distance(lum, self._last_frame.lum),
+            delta_edges=(
+                0.0 if edges is None else _mean_pixel_distance(edges, self._last_frame.edges)
+            ),
+        )
+
+        frame_score: float = sum(
+            component * weight for (component, weight) in zip(score_components, self._weights)
+        ) / sum(abs(weight) for weight in self._weights)
+
+        # Record components and frame score if needed for analysis.
+        if self._stats is not None:
+            metrics = {self.FRAME_SCORE_KEY: frame_score}
+            metrics.update(score_components._asdict())
+            self._stats.set_metrics(timecode.frame_num, metrics)
+
+        # Store all data required to calculate the next frame's score.
+        self._last_frame = ContentDetector2._FrameData(hue, sat, lum, edges)
+        return frame_score
+
+    def process(self, frame: numpy.ndarray, timecode: FrameTimecode) -> ty.List[Event]:
+        """Process the next frame. Each frame is scored against the previously processed one.
+
+        Args:
+            frame (numpy.ndarray): Video frame to process. Converted from BGR to HSV before
+                scoring.
+            timecode (FrameTimecode): Timecode of `frame`, used as the time of any emitted event.
+
+        Returns:
+            ty.List[Event]: A list holding one `EventType.CUT` event at `timecode` if the frame
+            score meets `threshold`, otherwise an empty list.
+        """
+        self._frame_score = self._calculate_frame_score(frame, timecode)
+        if self._frame_score is None:
+            return []
+
+        above_threshold: bool = self._frame_score >= self._threshold
+        # TODO: Need to fix FlashFilter so we can enforce `min_scene_len`. We should be able to
+        # just return `self._flash_filter.filter(timecode, above_threshold)` here.
+        if above_threshold:
+            return [Event(type=EventType.CUT, time=timecode)]
+        return []
+
+    def _detect_edges(self, lum: numpy.ndarray) -> numpy.ndarray:
+        """Detect edges using the luma channel of a frame.
+
+        Arguments:
+            lum: 2D 8-bit image representing the luma channel of a frame.
+
+        Returns:
+            2D 8-bit image of the same size as the input, where pixels with values of 255
+            represent edges, and all other pixels are 0.
+        """
+        # Initialize kernel.
+        if self._kernel is None:
+            kernel_size = _estimated_kernel_size(lum.shape[1], lum.shape[0])
+            self._kernel = numpy.ones((kernel_size, kernel_size), numpy.uint8)
+
+        # Estimate levels for thresholding.
+        # TODO: Add config file entries for sigma, aperture/kernel size, etc.
+        sigma: float = 1.0 / 3.0
+        median = numpy.median(lum)
+        low = int(max(0, (1.0 - sigma) * median))
+        high = int(min(255, (1.0 + sigma) * median))
+
+        # Calculate edges using Canny algorithm, and reduce noise by dilating the edges.
+        # This increases edge overlap leading to improved robustness against noise and slow
+        # camera movement. Note that very large kernel sizes can negatively affect accuracy.
+        edges = cv2.Canny(lum, low, high)
+        return cv2.dilate(edges, self._kernel)
+
+    @property
+    def event_buffer_length(self) -> int:
+        return self._flash_filter.max_behind
diff --git a/scenedetect/scene_detector.py b/scenedetect/scene_detector.py
diff --git a/scenedetect/scene_manager.py b/scenedetect/scene_manager.py