iejMac · MattUnderscoreZhang · Jan 18, 2024 · Jan 18, 2024 · Jan 18, 2024 · Jan 18, 2024
diff --git a/tests/test_subsamplers.py b/tests/test_subsamplers.py
@@ -211,15 +211,14 @@ def test_audio_rate_subsampler(sample_rate, n_audio_channels):
     "cut_detection_mode,framerates", [("longest", []), ("longest", [1]), ("all", []), ("all", [1])]
 )
 def test_cut_detection_subsampler(cut_detection_mode, framerates):
-    current_folder = os.path.dirname(__file__)
-    video = os.path.join(current_folder, "test_files/test_video.mp4")
-    with open(video, "rb") as vid_f:
-        video_bytes = vid_f.read()
-
     subsampler = CutDetectionSubsampler(cut_detection_mode, framerates, threshold=5)
 
-    streams = {"video": [video_bytes]}
-    streams, cuts, err_msg = subsampler(streams)
+    current_folder = os.path.dirname(__file__)
+    video_filepath = os.path.join(current_folder, "test_files/test_video.mp4")
+    metadata, error_message = subsampler(video_filepath)
+    assert error_message is None
+    cuts = metadata["cuts"]
+
     if cut_detection_mode == "longest":
         assert len(cuts["cuts_original_fps"]) == 1
         assert cuts["cuts_original_fps"][0] == [0, 2096]
@@ -276,17 +275,11 @@ def test_optical_flow_subsampler(detector, fps, params):
 
 @pytest.mark.parametrize("extract_keyframes", [False, True])
 def test_ffprobe_subsampler(extract_keyframes):
-    current_folder = os.path.dirname(__file__)
-    # video length - 2:02, 1080x1920, 30 fps
-    video = os.path.join(current_folder, "test_files/test_video.mp4")
-    with open(video, "rb") as vid_f:
-        video_bytes = vid_f.read()
-
     subsampler = FFProbeSubsampler(extract_keyframes)
 
-    streams = {"video": [video_bytes]}
-    metadata = {}
-    subsampled_streams, metadata, error_message = subsampler(streams, metadata)
+    current_folder = os.path.dirname(__file__)
+    video_filepath = os.path.join(current_folder, "test_files/test_video.mp4")
+    metadata, error_message = subsampler(video_filepath)
     assert error_message is None
     assert metadata is not None
     assert "video_metadata" in metadata

diff --git a/video2dataset/subsamplers/cut_detection_subsampler.py b/video2dataset/subsamplers/cut_detection_subsampler.py
@@ -3,10 +3,10 @@
 """
 import numpy as np
 from scenedetect import ContentDetector, SceneManager, open_video
-import os
-import tempfile
+from typing import Tuple, List, Optional, Literal
 
-from .subsampler import Subsampler
+from video2dataset.subsamplers.subsampler import Subsampler
+from video2dataset.types import Metadata, Error
 
 # TODO: this can be done more elegantly:
 # from scenedetect import scene_manager and set that in correct namespace
@@ -45,53 +45,52 @@ class CutDetectionSubsampler(Subsampler):
     - min_scene_len - minimum scene length to not drop a scene (see pyscenedeteect docs for more explanation)
     """
 
-    def __init__(self, cut_detection_mode="all", framerates=None, threshold=27, min_scene_len=15):
-        self.framerates = framerates
+    def __init__(
+        self,
+        cut_detection_mode: Literal["all", "longest"] = "all",
+        framerates: Optional[List[int]] = None,
+        threshold: int = 27,
+        min_scene_len: int = 15,
+    ):
+        self.framerates = framerates if framerates is not None else []
         self.cut_detection_mode = cut_detection_mode
         self.threshold = threshold
         self.min_scene_len = min_scene_len
 
-    def __call__(self, streams, metadata=None):
-        video_bytes = streams["video"][0]
-
+    def __call__(self, video_filepath: str, metadata: Optional[Metadata] = None) -> Tuple[Metadata, Error]:
+        metadata = metadata if metadata is not None else {}
         try:
-            with tempfile.TemporaryDirectory() as tmpdir:
-                video_path = os.path.join(tmpdir, "input.mp4")
-                with open(video_path, "wb") as f:
-                    f.write(video_bytes)
-
-                video = open_video(video_path)
+            # find scene changes
+            video = open_video(video_filepath)
+            detector = ContentDetector(threshold=self.threshold, min_scene_len=self.min_scene_len)
+            scene_manager = SceneManager()
+            scene_manager.add_detector(detector)
+            scene_manager.auto_downscale = False
+            scene_manager.downscale = video.frame_size[0] // DEFAULT_MIN_WIDTH
+            scene_manager.detect_scenes(video=video)
+
+            # extract cuts in both original fps and target fps
+            cuts = {}
+            original_fps = video.frame_rate
+            cuts["original_fps"] = original_fps
+            cuts["cuts_original_fps"] = get_scenes_from_scene_manager(scene_manager, self.cut_detection_mode)
+            for target_fps in self.framerates:
+                video.reset()
 
                 detector = ContentDetector(threshold=self.threshold, min_scene_len=self.min_scene_len)
                 scene_manager = SceneManager()
                 scene_manager.add_detector(detector)
-                scene_manager.auto_downscale = False
-                scene_manager.downscale = video.frame_size[0] // DEFAULT_MIN_WIDTH
-
-                cuts = {}
-                original_fps = video.frame_rate
-                cuts["original_fps"] = original_fps
-
-                scene_manager.detect_scenes(video=video)
-                cuts["cuts_original_fps"] = get_scenes_from_scene_manager(scene_manager, self.cut_detection_mode)
-                if self.framerates is not None:
-                    for target_fps in self.framerates:
-                        video.reset()
-
-                        detector = ContentDetector(threshold=self.threshold, min_scene_len=self.min_scene_len)
-                        scene_manager = SceneManager()
-                        scene_manager.add_detector(detector)
-                        frame_skip = max(
-                            int(original_fps // target_fps) - 1, 0
-                        )  # if we take 1 frame and skip N frames we're sampling 1/N+1 % of the video
-                        # so if we desire to sample 1/N of the video, we need to subtract one when doing frame skipping
-
-                        scene_manager.detect_scenes(video=video, frame_skip=frame_skip)
-                        cuts[f"cuts_{target_fps}"] = get_scenes_from_scene_manager(
-                            scene_manager, self.cut_detection_mode
-                        )
-                        scene_manager.clear()
-        except Exception as err:  # pylint: disable=broad-except
-            return {}, None, str(err)
+                frame_skip = max(
+                    int(original_fps // target_fps) - 1, 0
+                )  # if we take 1 frame and skip N frames we're sampling 1/N+1 % of the video
+                # so if we desire to sample 1/N of the video, we need to subtract one when doing frame skipping
 
-        return streams, cuts, None
+                scene_manager.detect_scenes(video=video, frame_skip=frame_skip)
+                cuts[f"cuts_{target_fps}"] = get_scenes_from_scene_manager(scene_manager, self.cut_detection_mode)
+                scene_manager.clear()
+
+            # save and return metadata
+            metadata["cuts"] = cuts
+        except Exception as err:  # pylint: disable=broad-except
+            return metadata, str(err)
+        return metadata, None
diff --git a/video2dataset/subsamplers/ffprobe_subsampler.py b/video2dataset/subsamplers/ffprobe_subsampler.py
@@ -1,10 +1,10 @@
 """extracts basic video compression metadata."""
-import os
 import json
 import subprocess
-import tempfile
+from typing import Tuple, Optional
 
-from .subsampler import Subsampler
+from video2dataset.subsamplers.subsampler import Subsampler
+from video2dataset.types import Metadata, Error
 
 
 # TODO: figuer out why this is so slow (12 samples/s)
@@ -18,41 +18,38 @@ class FFProbeSubsampler(Subsampler):
     def __init__(self, extract_keyframes=False):
         self.extract_keyframes = extract_keyframes
 
-    def __call__(self, streams, metadata):
-        # TODO: this should also work for audio (maybe others)
-        video_bytes = streams["video"][0]
-        with tempfile.TemporaryDirectory() as tmpdir:
-            with open(os.path.join(tmpdir, "input.mp4"), "wb") as f:
-                f.write(video_bytes)
-            try:
-                command = [
-                    "ffprobe",
-                    "-v",
-                    "quiet",
-                    "-print_format",
-                    "json",
-                    "-show_format",
-                    "-show_streams",
-                    f"{tmpdir}/input.mp4",
+    def __call__(self, video_filepath: str, metadata: Optional[Metadata] = None) -> Tuple[Metadata, Error]:
+        metadata = metadata if metadata is not None else {}
+        try:
+            # extract video metadata
+            command = [
+                "ffprobe",
+                "-v",
+                "quiet",
+                "-print_format",
+                "json",
+                "-show_format",
+                "-show_streams",
+                f"{video_filepath}",
+            ]
+            if self.extract_keyframes:
+                command.extend(["-select_streams", "v:0", "-show_entries", "packet=pts_time,flags"])
+            process = subprocess.run(command, capture_output=True, text=True, check=True)
+            video_metadata = json.loads(process.stdout)
+
+            # extract keyframe timestamps if requested
+            if self.extract_keyframes:
+                keyframe_timestamps = [
+                    float(packet["pts_time"]) for packet in video_metadata["packets"] if "K" in packet.get("flags", "")
                 ]
-
-                if self.extract_keyframes:
-                    command.extend(["-select_streams", "v:0", "-show_entries", "packet=pts_time,flags"])
-
-                process = subprocess.run(command, capture_output=True, text=True, check=True)
-                video_metadata = json.loads(process.stdout)
-
-                if self.extract_keyframes:
-                    keyframe_info = [entry for entry in video_metadata["packets"] if "K" in entry.get("flags", "")]
-                    keyframe_timestamps = [float(entry["pts_time"]) for entry in keyframe_info]
-                    if "duration" in video_metadata["format"]:
-                        duration = float(video_metadata["format"]["duration"])
-                        keyframe_timestamps.append(duration)
-                    video_metadata["keyframe_timestamps"] = keyframe_timestamps
-                    video_metadata.pop("packets")  # Don't need it anymore
-                metadata["video_metadata"] = video_metadata
-
-            except Exception as err:  # pylint: disable=broad-except
-                return streams, metadata, str(err)
-
-        return streams, metadata, None
+                if "duration" in video_metadata["format"]:
+                    duration = float(video_metadata["format"]["duration"])
+                    keyframe_timestamps.append(duration)
+                video_metadata["keyframe_timestamps"] = keyframe_timestamps
+                video_metadata.pop("packets")  # Don't need it anymore
+
+            # save and return metadata
+            metadata["video_metadata"] = video_metadata
+        except Exception as err:  # pylint: disable=broad-except
+            return metadata, str(err)
+        return metadata, None
diff --git a/video2dataset/subsamplers/noop_subsampler.py b/video2dataset/subsamplers/noop_subsampler.py
@@ -1,11 +1,13 @@
 """No operation subsampler"""
+from typing import List, Tuple
 
-from .subsampler import Subsampler
+from video2dataset.subsamplers.subsampler import Subsampler
+from video2dataset.types import Metadata, Error, TempFilepaths
 
 
 class NoOpSubsampler(Subsampler):
     def __init__(self):
         pass
 
-    def __call__(self, streams, metadata):
-        return streams, [metadata], None
+    def __call__(self, filepaths: TempFilepaths, metadata: Metadata) -> Tuple[TempFilepaths, List[Metadata], Error]:
+        return filepaths, [metadata], None
diff --git a/video2dataset/types.py b/video2dataset/types.py
@@ -1,5 +1,5 @@
 """Type definitions for video2dataset."""
-from typing import List, TypedDict
+from typing import List, TypedDict, Optional
 
 
 class EncodeFormats(TypedDict, total=False):
@@ -14,3 +14,12 @@ class Streams(TypedDict, total=False):
 
 # TODO: make more structured
 Metadata = dict
+
+
+Error = Optional[str]
+
+
+# TODO: remove after refactoring is complete
+class TempFilepaths(TypedDict, total=False):
+    video: List[str]
+    audio: List[str]