Skip to content

Commit

Permalink
Update dev deps (#283)
Browse files Browse the repository at this point in the history
  • Loading branch information
rom1504 authored Jan 21, 2024
1 parent bd0088b commit 527245c
Show file tree
Hide file tree
Showing 22 changed files with 31 additions and 98 deletions.
63 changes: 0 additions & 63 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@
# pygtk.require().
#init-hook=

# Profiled execution.
profile=no

# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS
Expand Down Expand Up @@ -41,22 +38,13 @@ enable=indexing-exception,old-raise-syntax
disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager,no-else-return,wrong-import-order,unnecessary-pass,logging-fstring-interpolation,logging-format-interpolation,C0330


# Set the cache size for astng objects.
cache-size=500


[REPORTS]

# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, eg
# mypackage.mymodule.MyReporterClass.
output-format=text

# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]".
files-output=no

# Tells whether to display a full report or only the messages
reports=no

Expand All @@ -67,10 +55,6 @@ reports=no
# (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)

# Add a comment according to your evaluation note. This is used by the global
# evaluation report (RP0004).
comment=no

# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
Expand All @@ -86,10 +70,6 @@ ignore-mixin-members=yes
# (useful for classes with attributes dynamically set).
ignored-classes=SQLObject

# When zope mode is activated, add a predefined set of Zope acquired attributes
# to generated-members.
zope=no

# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E0201 when accessed. Python regular
# expressions are accepted.
Expand All @@ -116,17 +96,6 @@ additional-builtins=

[BASIC]

# Required attributes for module, separated by a comma
required-attributes=

# List of builtins function names that should not be used, separated by a comma
bad-functions=apply,input,reduce


# Disable the report(s) with the given id(s).
# All non-Google reports are disabled by default.
disable-report=R0001,R0002,R0003,R0004,R0101,R0102,R0201,R0202,R0220,R0401,R0402,R0701,R0801,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0921,R0922,R0923

# Regular expression which should only match correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$

Expand Down Expand Up @@ -196,9 +165,6 @@ ignore-long-lines=(?x)
# else.
single-line-if-stmt=y

# List of optional constructs for which whitespace checking is disabled
no-space-check=

# Maximum number of lines in a module
max-module-lines=99999

Expand Down Expand Up @@ -250,10 +216,6 @@ extension-pkg-whitelist=_jsonnet

[CLASSES]

# List of interface methods to ignore, separated by a comma. This is used for
# instance to not check methods defines in Zope's Interface base class.
ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by

# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp

Expand Down Expand Up @@ -298,31 +260,6 @@ min-public-methods=2
max-public-methods=20


[EXCEPTIONS]

# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception,StandardError,BaseException


[AST]

# Maximum line length for lambdas
short-func-length=1

# List of module members that should be marked as deprecated.
# All of the string functions are listed in 4.1.4 Deprecated string functions
# in the Python 2.4 docs.
deprecated-members=string.atof,string.atoi,string.atol,string.capitalize,string.expandtabs,string.find,string.rfind,string.index,string.rindex,string.count,string.lower,string.split,string.rsplit,string.splitfields,string.join,string.joinfields,string.lstrip,string.rstrip,string.strip,string.swapcase,string.translate,string.upper,string.ljust,string.rjust,string.center,string.zfill,string.replace,sys.exitfunc


[DOCSTRING]

# List of exceptions that do not need to be mentioned in the Raises section of
# a docstring.
ignore-exceptions=AssertionError,NotImplementedError,StopIteration,TypeError



[TOKENS]

Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ install-dev: ## [Local development] Install test requirements
lint: ## [Local development] Run mypy, pylint and black
python -m mypy video2dataset
python -m pylint video2dataset
python -m black --check -l 120 video2dataset
python -m black --check -l 120 .

black: ## [Local development] Auto-format python code using black
python -m black -l 120 .
Expand Down
2 changes: 2 additions & 0 deletions benchmark/benchmark_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

# Benchmark videos are the WebVid validation split (5000 videos)
SHARDS = "examples/dataset/{00000..00004}.tar"


def benchmark_train_dl(num_frames, num_workers, bs=1, num_threads=4, resize_size=None, crop_size=None):
from argparse import Namespace
from webdataset import WebLoader
Expand Down
1 change: 1 addition & 0 deletions benchmark/benchmark_subsamplers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from video2dataset import subsamplers
from video2dataset.dataloader import get_video_dataset


# Add this function to gather system information
def gather_system_info():
cpu_count = os.cpu_count()
Expand Down
12 changes: 6 additions & 6 deletions requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
black==22.3.0
mypy==1.6.0
black==23.12.1
mypy==1.8.0
pylint==3.0.3
pytest-cov==4.1.0
pytest-xdist==3.5.0
pytest==7.4.4
types-requests==2.31.0.20240106
types-PyYAML==6.0.12.12
pylint==2.13.4
pytest-cov==3.0.0
pytest-xdist==2.5.0
pytest==7.0.1
types-requests
webvtt-py
tensorflow
Expand Down
1 change: 0 additions & 1 deletion tests/test_data_writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def test_writer(modalities, writer_type, tmp_path):
writer.close()

if writer_type != "dummy":

df = pd.read_parquet(output_folder + "/00000.parquet")

expected_columns = [
Expand Down
3 changes: 0 additions & 3 deletions tests/test_dataloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

@pytest.mark.parametrize("batch_size", [1, 4])
def test_return_all(batch_size):

decoder_kwargs = {"n_frames": 10, "fps": None, "num_threads": 1}

dset = get_video_dataset(
Expand Down Expand Up @@ -47,7 +46,6 @@ def test_return_all(batch_size):

@pytest.mark.parametrize("batch_size", [1, 2])
def test_default(batch_size):

decoder_kwargs = {"n_frames": 10, "fps": None, "num_threads": 1}

dset = get_video_dataset(
Expand Down Expand Up @@ -80,7 +78,6 @@ def test_default(batch_size):
[(1, ["0000008_00001", "0000030_00005", "0000038_00003"]), (2, ["0000008_00001", "0000030_00005"])],
)
def test_drop_last(batch_size, expected_keys):

decoder_kwargs = {"n_frames": 10, "fps": None, "num_threads": 1}

dset = get_video_dataset(
Expand Down
2 changes: 1 addition & 1 deletion tests/test_subsamplers.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def test_resolution_subsampler_video_size(size, resize_mode):
assert w_vid == size


@pytest.mark.parametrize("height,width,resize_mode", [(-1,128, ["scale"]), (1620,1620, ["scale", "crop", "pad"])])
@pytest.mark.parametrize("height,width,resize_mode", [(-1, 128, ["scale"]), (1620, 1620, ["scale", "crop", "pad"])])
def test_resolution_subsampler_height_and_width(height, width, resize_mode):
current_folder = os.path.dirname(__file__)
    # video length - 2:02, 1080x1920
Expand Down
3 changes: 1 addition & 2 deletions video2dataset/data_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,10 @@ def get_yt_meta(url, yt_metadata_args: dict) -> dict:
info_dict, sub_dict = None, None

with yt_dlp.YoutubeDL(yt_metadata_args) as yt:

info_dict = yt.extract_info(url, download=False)
if write_subs:
sub_url = info_dict["requested_subtitles"][yt_metadata_args["subtitleslangs"][0]]["url"]
res = requests.get(sub_url)
res = requests.get(sub_url, timeout=10)
sub = io.TextIOWrapper(io.BytesIO(res.content)).read()
sub_dict = sub_to_dict(sub)

Expand Down
4 changes: 3 additions & 1 deletion video2dataset/dataloader/audio_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ def __call__(self, key, data):
if pad_start < 0:
waveform = waveform[:, : self.max_length * self.sample_rate]
if pad_start > 0:
waveform = F.pad(waveform, (0, self.max_length * self.sample_rate - waveform.shape[1]), "constant")
waveform = F.pad( # pylint: disable=not-callable
waveform, (0, self.max_length * self.sample_rate - waveform.shape[1]), "constant"
)
pad_masks[:pad_start] = 1.0

additional_info["audio_pad_masks"] = pad_masks
Expand Down
1 change: 0 additions & 1 deletion video2dataset/dataloader/custom_wds.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,6 @@ def refill_prefix(self, prefix):

def __iter__(self):
while self.it < self.__len__():

# sample prefix with corresponding probs
prefix_id = np.random.choice(len(self.ps), 1, p=list(self.ps.values())).item()
prefix = list(self.ps.keys())[prefix_id]
Expand Down
1 change: 0 additions & 1 deletion video2dataset/dataloader/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ def get_video_dataset(
return_always: bool = False,
handler=wds.reraise_exception,
):

"""
Generates a webdataset given the specified parameters.
Parameters:
Expand Down
1 change: 0 additions & 1 deletion video2dataset/dataloader/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ def __call__(self, data):
reference = self._get_reference_frame(resize_size, h, w)

for frame in frames:

if resize_size is not None:
frame = cv2.resize(
frame,
Expand Down
2 changes: 1 addition & 1 deletion video2dataset/dataloader/video_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def get_frames(self, reader, n_frames, stride, **kwargs): # pylint: disable=arg
# can just output first_pad_index or a mask or something
pad_start = len(frames)
if self.pad_frames and frames.shape[0] < self.n_frames:
frames = F.pad(frames, (0, 0) * 3 + (0, self.n_frames - frames.shape[0]))
frames = F.pad(frames, (0, 0) * 3 + (0, self.n_frames - frames.shape[0])) # pylint: disable=not-callable

return frames, frame_start, pad_start

Expand Down
5 changes: 2 additions & 3 deletions video2dataset/distributor.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def multiprocessing_distributor(processes_count, worker, input_sharder, _, max_s

def run(gen):
failed_shards = []
for (status, row) in tqdm(process_pool.imap_unordered(worker, gen)):
for status, row in tqdm(process_pool.imap_unordered(worker, gen)):
if status is False:
failed_shards.append(row)
return failed_shards
Expand Down Expand Up @@ -76,7 +76,7 @@ def run(gen):
failed_shards = []
for batch in batcher(gen, subjob_size):
rdd = spark.sparkContext.parallelize(batch, len(batch))
for (status, row) in rdd.map(worker).collect():
for status, row in rdd.map(worker).collect():
if status is False:
failed_shards.append(row)
return failed_shards
Expand Down Expand Up @@ -199,7 +199,6 @@ def __init__(
print(f"Wrote sbatch to {self.sbatch_path}")

def _make_sbatch(self):

nodelist = ("#SBATCH --nodelist " + self.nodelist) if self.nodelist is not None else ""
exclude = ("#SBATCH --exclude " + self.exclude) if self.exclude is not None else ""
account = ("#SBATCH --account " + self.account) if self.account is not None else ""
Expand Down
4 changes: 2 additions & 2 deletions video2dataset/input_sharder.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __init__(
if fs.isdir(url_path):
self.input_files = sorted(fs.glob(url_path + "/*." + input_format))
if len(self.input_files) == 0:
raise Exception(f"No file found at path {url_path} with extension {input_format}")
raise ValueError(f"No file found at path {url_path} with extension {input_format}")
else:
self.input_files = [url_path]

Expand Down Expand Up @@ -142,7 +142,7 @@ def write_shard(t):
else:
raise e
# can't reach here
raise Exception("Failed to write to file.")
raise ValueError("Failed to write to file.")

for i in range(10):
shards = []
Expand Down
2 changes: 1 addition & 1 deletion video2dataset/output_sharder.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(self, shard_list, input_format, done_shards, sampler=lambda x: x) -
if "s3://" in shard_list:
self.shard_list = ["s3://" + s for s in self.shard_list]
if len(self.shard_list) == 0:
raise Exception(f"No file found at path {url_path} with extension {input_format}")
raise ValueError(f"No file found at path {url_path} with extension {input_format}")
else:
self.shard_list = list(braceexpand.braceexpand(shard_list))

Expand Down
1 change: 1 addition & 0 deletions video2dataset/subsamplers/ffprobe_subsampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from .subsampler import Subsampler


# TODO: figure out why this is so slow (12 samples/s)
class FFProbeSubsampler(Subsampler):
"""
Expand Down
1 change: 0 additions & 1 deletion video2dataset/subsamplers/optical_flow_subsampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ class RAFTDetector:
"""

def __init__(self, args, downsample_size=None):

self.device = args.get("device", "cuda")
self.downsample_size = downsample_size

Expand Down
2 changes: 1 addition & 1 deletion video2dataset/subsamplers/resolution_subsampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __init__(
encode_format: str = "mp4",
):
if video_size > 0 and (height > 0 or width > 0):
raise Exception("Either set video_size, or set height and/or width")
raise ValueError("Either set video_size, or set height and/or width")
self.resize_mode = resize_mode
self.height = height if video_size < 0 else video_size
self.width = width if video_size < 0 else video_size
Expand Down
8 changes: 4 additions & 4 deletions video2dataset/workers/download_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def data_generator():
print(error_message)
if "[youtube]" in error_message: # video-specific error, remove videoID
error_message = "ERROR: [youtube]:" + error_message.split(":")[-1]
raise Exception("failed_to_download")
raise ValueError("failed_to_download")

for stream in streams.values():
bytes_downloaded += len(stream)
Expand All @@ -203,7 +203,7 @@ def data_generator():
if self.ffprobe_subsampler is not None:
streams, meta, error_message = self.ffprobe_subsampler(streams, meta)
if error_message is not None:
raise Exception("failed_to_subsample")
raise ValueError("failed_to_subsample")

if self.config["storage"]["captions_are_subtitles"]: # create clips
subtitles = meta["yt_meta_dict"]["subtitles"]
Expand All @@ -212,7 +212,7 @@ def data_generator():
streams, cuts, error_message = self.cut_detector(streams)

if error_message is not None:
raise Exception("failed_to_subsample")
raise ValueError("failed_to_subsample")

meta["cuts"] = cuts

Expand All @@ -239,7 +239,7 @@ def data_generator():

if error_message is not None:
meta["clips"] = []
raise Exception("failed_to_subsample")
raise ValueError("failed_to_subsample")

successes += 1
status = "success"
Expand Down
Loading

0 comments on commit 527245c

Please sign in to comment.