feat: add support for video deserialization with torchcodec when torchvision>0.25 (#802)

deependujha · web-flow · commit 90bd40444c3f · 2026-03-26T13:14:18.000+01:00
diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml
@@ -34,6 +34,15 @@ jobs:
       UV_TORCH_BACKEND: "cpu"
 
     steps:
+      # FFmpeg is required for Video Deserializer tests
+      - name: Setup FFmpeg (shared libs, Linux)
+        if: runner.os == 'Linux'
+        run: sudo apt-get update && sudo apt-get install -y ffmpeg
+
+      - name: Setup FFmpeg (shared libs, macOS)
+        if: runner.os == 'macOS'
+        run: brew install ffmpeg
+
       - uses: actions/checkout@v6
       - name: Install uv and setup python ${{ matrix.python-version }}
         uses: astral-sh/setup-uv@v7
diff --git a/requirements/test.txt b/requirements/test.txt
@@ -16,3 +16,4 @@ lightning
 transformers >=4.51.0
 zstd; python_version < "3.14"
 soundfile >=0.13.0
+torchcodec >=0.1.0 # check compatibility table: https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec
diff --git a/src/litdata/constants.py b/src/litdata/constants.py
@@ -49,6 +49,7 @@
 _POLARS_AVAILABLE = RequirementCache("polars>1.0.0")
 _PIL_AVAILABLE = RequirementCache("PIL")
 _TORCH_VISION_AVAILABLE = RequirementCache("torchvision")
+_TORCH_VISION_LESS_THAN_0_26 = RequirementCache("torchvision<0.26.0")
 _AV_AVAILABLE = RequirementCache("av")
 _OBSTORE_AVAILABLE = RequirementCache("obstore")
 
diff --git a/src/litdata/streaming/serializers.py b/src/litdata/streaming/serializers.py
@@ -20,6 +20,7 @@
 from collections import OrderedDict
 from contextlib import suppress
 from copy import deepcopy
+from dataclasses import asdict
 from itertools import chain
 from typing import Any
 
@@ -32,6 +33,7 @@
     _NUMPY_DTYPES_MAPPING,
     _PIL_AVAILABLE,
     _TORCH_DTYPES_MAPPING,
+    _TORCH_VISION_LESS_THAN_0_26,
 )
 
 
@@ -403,6 +405,13 @@ def serialize(self, filepath: str) -> tuple[bytes, str | None]:
             return f.read(), f"video:{file_extension}"
 
     def deserialize(self, data: bytes) -> Any:
+        # Dispatch on the installed torchvision: anything not matching `torchvision<0.26.0`
+        # is decoded with torchcodec; releases below 0.26 keep using torchvision.io.
+        if not _TORCH_VISION_LESS_THAN_0_26:
+            return self._deserialize_with_torchcodec(data)
+        return self._deserialize_with_torchvision_io(data)
+
+    def _deserialize_with_torchvision_io(self, data: bytes) -> Any:
         if not _AV_AVAILABLE:
             raise ModuleNotFoundError("av is required. Run `pip install av`")
 
@@ -416,6 +425,29 @@ def deserialize(self, data: bytes) -> Any:
                 stream.write(data)
             return torchvision.io.read_video(fname, pts_unit="sec")
 
+    def _deserialize_with_torchcodec(self, data: bytes) -> Any:
+        """Decode video bytes with torchcodec and return ``(video, audio, metadata)``.
+
+        Audio falls back to an empty ``(1, 0)`` tensor when audio decoding raises ``ValueError``.
+        """
+        try:
+            import torch
+            from torchcodec.decoders import AudioDecoder, VideoDecoder
+        except ImportError as err:
+            # The requirement is quoted: an unquoted `>` is treated as a redirection by the shell.
+            raise ModuleNotFoundError('torchcodec is required. Run `pip install "torchcodec>0.11"`') from err
+
+        dec = VideoDecoder(data, dimension_order="NHWC")  # frames come back channels-last
+        metadata = asdict(dec.metadata) if dec.metadata is not None else {}
+        video = dec.get_all_frames().data  # FrameBatch.data: (T, H, W, C) with NHWC
+
+        try:
+            audio = AudioDecoder(data).get_all_samples().data  # (num_channels, num_samples)
+        except ValueError:
+            audio = torch.zeros(1, 0)  # old torchvision path returns aframes with shape (1, 0) for no-audio videos.
+
+        return video, audio, metadata
+
     def can_serialize(self, data: Any) -> bool:
         return isinstance(data, str) and os.path.isfile(data) and any(data.endswith(ext) for ext in self._EXTENSIONS)
 
diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py
@@ -277,6 +277,7 @@ def test_assert_no_header_numpy_serializer():
     np.testing.assert_equal(t, new_t)
 
 
+@pytest.mark.skipif(sys.platform == "win32", reason="Not tested on windows")
 @pytest.mark.skipif(condition=not _AV_AVAILABLE, reason="Requires: 'av'")
 def test_wav_deserialization(tmpdir):
     from torch.hub import download_url_to_file
@@ -293,7 +294,14 @@ def test_wav_deserialization(tmpdir):
     vframes, aframes, info = serializer.deserialize(data)
     assert vframes.shape == torch.Size([301, 512, 512, 3])
     assert aframes.shape == torch.Size([1, 0])
-    assert info == {"video_fps": 25.0}
+    # The metadata key holding the frame rate depends on which backend decoded the video:
+    # the `torchvision` path reports it as `video_fps`, while the `torchcodec` path reports
+    # it as `average_fps`. Both represent the same fps value, so accept either key and
+    # check the value itself.
+    assert "video_fps" in info or "average_fps" in info
+    fps = info.get("video_fps", info.get("average_fps"))
+    assert fps is not None
+    assert fps == 25.0
 
 
 def test_get_serializers():