diff --git a/.pylintrc b/.pylintrc
index 237a57e8..711d32e5 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -7,9 +7,6 @@
 # pygtk.require().
 #init-hook=
 
-# Profiled execution.
-profile=no
-
 # Add files or directories to the blacklist. They should be base names, not
 # paths.
 ignore=CVS
@@ -41,10 +38,6 @@ enable=indexing-exception,old-raise-syntax
 disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager,no-else-return,wrong-import-order,unnecessary-pass,logging-fstring-interpolation,logging-format-interpolation,C0330
 
-# Set the cache size for astng objects.
-cache-size=500
-
-
 [REPORTS]
 
 # Set the output format. Available formats are text, parseable, colorized, msvs
@@ -52,11 +45,6 @@ cache-size=500
 # mypackage.mymodule.MyReporterClass.
 output-format=text
 
-# Put messages in a separate file for each module / package specified on the
-# command line instead of printing them on stdout. Reports (if any) will be
-# written in a file name "pylint_global.[txt|html]".
-files-output=no
-
 # Tells whether to display a full report or only the messages
 reports=no
@@ -67,10 +55,6 @@ reports=no
 # (RP0004).
 evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
 
-# Add a comment according to your evaluation note. This is used by the global
-# evaluation report (RP0004).
-comment=no
-
 # Template used to display messages. This is a python new-style format string
 # used to format the message information. See doc for all details
 #msg-template=
@@ -86,10 +70,6 @@ ignore-mixin-members=yes
 # (useful for classes with attributes dynamically set).
 ignored-classes=SQLObject
 
-# When zope mode is activated, add a predefined set of Zope acquired attributes
-# to generated-members.
-zope=no
-
 # List of members which are set dynamically and missed by pylint inference
 # system, and so shouldn't trigger E0201 when accessed. Python regular
 # expressions are accepted.
@@ -116,17 +96,6 @@ additional-builtins=
 
 [BASIC]
 
-# Required attributes for module, separated by a comma
-required-attributes=
-
-# List of builtins function names that should not be used, separated by a comma
-bad-functions=apply,input,reduce
-
-
-# Disable the report(s) with the given id(s).
-# All non-Google reports are disabled by default.
-disable-report=R0001,R0002,R0003,R0004,R0101,R0102,R0201,R0202,R0220,R0401,R0402,R0701,R0801,R0901,R0902,R0903,R0904,R0911,R0912,R0913,R0914,R0915,R0921,R0922,R0923
-
 # Regular expression which should only match correct module names
 module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
@@ -196,9 +165,6 @@ ignore-long-lines=(?x)
 # else.
 single-line-if-stmt=y
 
-# List of optional constructs for which whitespace checking is disabled
-no-space-check=
-
 # Maximum number of lines in a module
 max-module-lines=99999
@@ -250,10 +216,6 @@ extension-pkg-whitelist=_jsonnet
 
 [CLASSES]
 
-# List of interface methods to ignore, separated by a comma. This is used for
-# instance to not check methods defines in Zope's Interface base class.
-ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by
-
 # List of method names used to declare (i.e. assign) instance attributes.
 defining-attr-methods=__init__,__new__,setUp
@@ -298,31 +260,6 @@ min-public-methods=2
 max-public-methods=20
 
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=Exception,StandardError,BaseException
-
-
-[AST]
-
-# Maximum line length for lambdas
-short-func-length=1
-
-# List of module members that should be marked as deprecated.
-# All of the string functions are listed in 4.1.4 Deprecated string functions
-# in the Python 2.4 docs.
-deprecated-members=string.atof,string.atoi,string.atol,string.capitalize,string.expandtabs,string.find,string.rfind,string.index,string.rindex,string.count,string.lower,string.split,string.rsplit,string.splitfields,string.join,string.joinfields,string.lstrip,string.rstrip,string.strip,string.swapcase,string.translate,string.upper,string.ljust,string.rjust,string.center,string.zfill,string.replace,sys.exitfunc
-
-
-[DOCSTRING]
-
-# List of exceptions that do not need to be mentioned in the Raises section of
-# a docstring.
-ignore-exceptions=AssertionError,NotImplementedError,StopIteration,TypeError
-
-
 [TOKENS]
diff --git a/Makefile b/Makefile
index a7052d08..5dbd2d28 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ install-dev: ## [Local development] Install test requirements
 lint: ## [Local development] Run mypy, pylint and black
 	python -m mypy video2dataset
 	python -m pylint video2dataset
-	python -m black --check -l 120 video2dataset
+	python -m black --check -l 120 .
 
 black: ## [Local development] Auto-format python code using black
 	python -m black -l 120 .
diff --git a/benchmark/benchmark_dataloader.py b/benchmark/benchmark_dataloader.py
index 21ba8b0c..2d9fe3f2 100644
--- a/benchmark/benchmark_dataloader.py
+++ b/benchmark/benchmark_dataloader.py
@@ -8,6 +8,8 @@
 # Benchmark videos are the WebVid validation split (5000 videos)
 SHARDS = "examples/dataset/{00000..00004}.tar"
+
+
 def benchmark_train_dl(num_frames, num_workers, bs=1, num_threads=4, resize_size=None, crop_size=None):
     from argparse import Namespace
     from webdataset import WebLoader
diff --git a/benchmark/benchmark_subsamplers.py b/benchmark/benchmark_subsamplers.py
index efd34d34..2f25b04b 100644
--- a/benchmark/benchmark_subsamplers.py
+++ b/benchmark/benchmark_subsamplers.py
@@ -14,6 +14,7 @@
 from video2dataset import subsamplers
 from video2dataset.dataloader import get_video_dataset
 
+
 # Add this function to gather system information
 def gather_system_info():
     cpu_count = os.cpu_count()
diff --git a/requirements-test.txt b/requirements-test.txt
index aa43cd2d..52d39b1e 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,11 +1,11 @@
-black==22.3.0
-mypy==1.6.0
+black==23.12.1
+mypy==1.8.0
+pylint==3.0.3
+pytest-cov==4.1.0
+pytest-xdist==3.5.0
+pytest==7.4.4
 types-requests==2.31.0.20240106
 types-PyYAML==6.0.12.12
-pylint==2.13.4
-pytest-cov==3.0.0
-pytest-xdist==2.5.0
-pytest==7.0.1
 types-requests
 webvtt-py
 tensorflow
diff --git a/tests/test_data_writers.py b/tests/test_data_writers.py
index ca6f606f..d3c573bf 100644
--- a/tests/test_data_writers.py
+++ b/tests/test_data_writers.py
@@ -72,7 +72,6 @@ def test_writer(modalities, writer_type, tmp_path):
     writer.close()
 
     if writer_type != "dummy":
-
         df = pd.read_parquet(output_folder + "/00000.parquet")
 
         expected_columns = [
diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index 98d8d0d8..aa2bba6e 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -11,7 +11,6 @@
 
 @pytest.mark.parametrize("batch_size", [1, 4])
 def test_return_all(batch_size):
-
     decoder_kwargs = {"n_frames": 10, "fps": None, "num_threads": 1}
 
     dset = get_video_dataset(
@@ -47,7 +46,6 @@
 
 @pytest.mark.parametrize("batch_size", [1, 2])
 def test_default(batch_size):
-
     decoder_kwargs = {"n_frames": 10, "fps": None, "num_threads": 1}
 
     dset = get_video_dataset(
@@ -80,7 +78,6 @@
     [(1, ["0000008_00001", "0000030_00005", "0000038_00003"]), (2, ["0000008_00001", "0000030_00005"])],
 )
 def test_drop_last(batch_size, expected_keys):
-
     decoder_kwargs = {"n_frames": 10, "fps": None, "num_threads": 1}
 
     dset = get_video_dataset(
diff --git a/tests/test_subsamplers.py b/tests/test_subsamplers.py
index e6a5b5f0..7a5e706b 100644
--- a/tests/test_subsamplers.py
+++ b/tests/test_subsamplers.py
@@ -125,7 +125,7 @@ def test_resolution_subsampler_video_size(size, resize_mode):
     assert w_vid == size
 
 
-@pytest.mark.parametrize("height,width,resize_mode", [(-1,128, ["scale"]), (1620,1620, ["scale", "crop", "pad"])])
+@pytest.mark.parametrize("height,width,resize_mode", [(-1, 128, ["scale"]), (1620, 1620, ["scale", "crop", "pad"])])
 def test_resolution_subsampler_height_and_width(height, width, resize_mode):
     current_folder = os.path.dirname(__file__)
     # video lenght - 2:02, 1080x1920
diff --git a/video2dataset/data_reader.py b/video2dataset/data_reader.py
index 03d14c8d..afce685f 100644
--- a/video2dataset/data_reader.py
+++ b/video2dataset/data_reader.py
@@ -77,11 +77,10 @@ def get_yt_meta(url, yt_metadata_args: dict) -> dict:
     info_dict, sub_dict = None, None
     with yt_dlp.YoutubeDL(yt_metadata_args) as yt:
-
         info_dict = yt.extract_info(url, download=False)
 
     if write_subs:
         sub_url = info_dict["requested_subtitles"][yt_metadata_args["subtitleslangs"][0]]["url"]
-        res = requests.get(sub_url)
+        res = requests.get(sub_url, timeout=10)
         sub = io.TextIOWrapper(io.BytesIO(res.content)).read()
         sub_dict = sub_to_dict(sub)
diff --git a/video2dataset/dataloader/audio_decode.py b/video2dataset/dataloader/audio_decode.py
index d62c9ac0..f0dcfcb0 100644
--- a/video2dataset/dataloader/audio_decode.py
+++ b/video2dataset/dataloader/audio_decode.py
@@ -37,7 +37,9 @@ def __call__(self, key, data):
             if pad_start < 0:
                 waveform = waveform[:, : self.max_length * self.sample_rate]
             if pad_start > 0:
-                waveform = F.pad(waveform, (0, self.max_length * self.sample_rate - waveform.shape[1]), "constant")
+                waveform = F.pad(  # pylint: disable=not-callable
+                    waveform, (0, self.max_length * self.sample_rate - waveform.shape[1]), "constant"
+                )
                 pad_masks[:pad_start] = 1.0
 
         additional_info["audio_pad_masks"] = pad_masks
diff --git a/video2dataset/dataloader/custom_wds.py b/video2dataset/dataloader/custom_wds.py
index f1dd6ae5..4f720524 100644
--- a/video2dataset/dataloader/custom_wds.py
+++ b/video2dataset/dataloader/custom_wds.py
@@ -371,7 +371,6 @@ def refill_prefix(self, prefix):
 
     def __iter__(self):
         while self.it < self.__len__():
-
             # sample prefix with corresponding probs
             prefix_id = np.random.choice(len(self.ps), 1, p=list(self.ps.values())).item()
             prefix = list(self.ps.keys())[prefix_id]
diff --git a/video2dataset/dataloader/dataloader.py b/video2dataset/dataloader/dataloader.py
index df452e42..64d2d15c 100644
--- a/video2dataset/dataloader/dataloader.py
+++ b/video2dataset/dataloader/dataloader.py
@@ -67,7 +67,6 @@ def get_video_dataset(
     return_always: bool = False,
     handler=wds.reraise_exception,
 ):
-
     """
     Generates a webdataset given the specified parameters.
     Parameters:
diff --git a/video2dataset/dataloader/transform.py b/video2dataset/dataloader/transform.py
index 6bd72a60..e1e49acb 100644
--- a/video2dataset/dataloader/transform.py
+++ b/video2dataset/dataloader/transform.py
@@ -125,7 +125,6 @@ def __call__(self, data):
         reference = self._get_reference_frame(resize_size, h, w)
 
         for frame in frames:
-
             if resize_size is not None:
                 frame = cv2.resize(
                     frame,
diff --git a/video2dataset/dataloader/video_decode.py b/video2dataset/dataloader/video_decode.py
index 14ebbc5c..d302ce82 100644
--- a/video2dataset/dataloader/video_decode.py
+++ b/video2dataset/dataloader/video_decode.py
@@ -115,7 +115,7 @@ def get_frames(self, reader, n_frames, stride, **kwargs):  # pylint: disable=arg
 
         # can just output first_pad_index or a mask or something
         pad_start = len(frames)
         if self.pad_frames and frames.shape[0] < self.n_frames:
-            frames = F.pad(frames, (0, 0) * 3 + (0, self.n_frames - frames.shape[0]))
+            frames = F.pad(frames, (0, 0) * 3 + (0, self.n_frames - frames.shape[0]))  # pylint: disable=not-callable
 
         return frames, frame_start, pad_start
diff --git a/video2dataset/distributor.py b/video2dataset/distributor.py
index 4dc24236..bce8d3c9 100644
--- a/video2dataset/distributor.py
+++ b/video2dataset/distributor.py
@@ -48,7 +48,7 @@ def multiprocessing_distributor(processes_count, worker, input_sharder, _, max_s
     def run(gen):
         failed_shards = []
-        for (status, row) in tqdm(process_pool.imap_unordered(worker, gen)):
+        for status, row in tqdm(process_pool.imap_unordered(worker, gen)):
             if status is False:
                 failed_shards.append(row)
         return failed_shards
@@ -76,7 +76,7 @@ def run(gen):
         failed_shards = []
         for batch in batcher(gen, subjob_size):
             rdd = spark.sparkContext.parallelize(batch, len(batch))
-            for (status, row) in rdd.map(worker).collect():
+            for status, row in rdd.map(worker).collect():
                 if status is False:
                     failed_shards.append(row)
         return failed_shards
@@ -199,7 +199,6 @@ def __init__(
         print(f"Wrote sbatch to {self.sbatch_path}")
 
     def _make_sbatch(self):
-
         nodelist = ("#SBATCH --nodelist " + self.nodelist) if self.nodelist is not None else ""
         exclude = ("#SBATCH --exclude " + self.exclude) if self.exclude is not None else ""
         account = ("#SBATCH --account " + self.account) if self.account is not None else ""
diff --git a/video2dataset/input_sharder.py b/video2dataset/input_sharder.py
index 74031dac..5f0f9dc0 100644
--- a/video2dataset/input_sharder.py
+++ b/video2dataset/input_sharder.py
@@ -54,7 +54,7 @@ def __init__(
         if fs.isdir(url_path):
             self.input_files = sorted(fs.glob(url_path + "/*." + input_format))
             if len(self.input_files) == 0:
-                raise Exception(f"No file found at path {url_path} with extension {input_format}")
+                raise ValueError(f"No file found at path {url_path} with extension {input_format}")
         else:
             self.input_files = [url_path]
@@ -142,7 +142,7 @@ def write_shard(t):
             else:
                 raise e
         # can't reach here
-        raise Exception("Failed to write to file.")
+        raise ValueError("Failed to write to file.")
 
     for i in range(10):
         shards = []
diff --git a/video2dataset/output_sharder.py b/video2dataset/output_sharder.py
index a8f81c3c..b929d0d8 100644
--- a/video2dataset/output_sharder.py
+++ b/video2dataset/output_sharder.py
@@ -26,7 +26,7 @@ def __init__(self, shard_list, input_format, done_shards, sampler=lambda x: x) -
             if "s3://" in shard_list:
                 self.shard_list = ["s3://" + s for s in self.shard_list]
             if len(self.shard_list) == 0:
-                raise Exception(f"No file found at path {url_path} with extension {input_format}")
+                raise ValueError(f"No file found at path {url_path} with extension {input_format}")
         else:
             self.shard_list = list(braceexpand.braceexpand(shard_list))
diff --git a/video2dataset/subsamplers/ffprobe_subsampler.py b/video2dataset/subsamplers/ffprobe_subsampler.py
index 8e2ebeb4..52c5b61e 100644
--- a/video2dataset/subsamplers/ffprobe_subsampler.py
+++ b/video2dataset/subsamplers/ffprobe_subsampler.py
@@ -6,6 +6,7 @@
 from .subsampler import Subsampler
 
+
 # TODO: figuer out why this is so slow (12 samples/s)
 class FFProbeSubsampler(Subsampler):
     """
diff --git a/video2dataset/subsamplers/optical_flow_subsampler.py b/video2dataset/subsamplers/optical_flow_subsampler.py
index b8844336..837f3fb9 100644
--- a/video2dataset/subsamplers/optical_flow_subsampler.py
+++ b/video2dataset/subsamplers/optical_flow_subsampler.py
@@ -85,7 +85,6 @@ class RAFTDetector:
     """
 
     def __init__(self, args, downsample_size=None):
-
        self.device = args.get("device", "cuda")
        self.downsample_size = downsample_size
diff --git a/video2dataset/subsamplers/resolution_subsampler.py b/video2dataset/subsamplers/resolution_subsampler.py
index e2be0fb5..9f95969b 100644
--- a/video2dataset/subsamplers/resolution_subsampler.py
+++ b/video2dataset/subsamplers/resolution_subsampler.py
@@ -35,7 +35,7 @@ def __init__(
         encode_format: str = "mp4",
     ):
         if video_size > 0 and (height > 0 or width > 0):
-            raise Exception("Either set video_size, or set height and/or width")
+            raise ValueError("Either set video_size, or set height and/or width")
         self.resize_mode = resize_mode
         self.height = height if video_size < 0 else video_size
         self.width = width if video_size < 0 else video_size
diff --git a/video2dataset/workers/download_worker.py b/video2dataset/workers/download_worker.py
index 96d51619..7f37ee7a 100644
--- a/video2dataset/workers/download_worker.py
+++ b/video2dataset/workers/download_worker.py
@@ -193,7 +193,7 @@ def data_generator():
                     print(error_message)
                     if "[youtube]" in error_message:  # video-specific error, remove videoID
                         error_message = "ERROR: [youtube]:" + error_message.split(":")[-1]
-                    raise Exception("failed_to_download")
+                    raise ValueError("failed_to_download")
 
                 for stream in streams.values():
                     bytes_downloaded += len(stream)
@@ -203,7 +203,7 @@ def data_generator():
                 if self.ffprobe_subsampler is not None:
                     streams, meta, error_message = self.ffprobe_subsampler(streams, meta)
                     if error_message is not None:
-                        raise Exception("failed_to_subsample")
+                        raise ValueError("failed_to_subsample")
 
                 if self.config["storage"]["captions_are_subtitles"]:  # create clips
                     subtitles = meta["yt_meta_dict"]["subtitles"]
@@ -212,7 +212,7 @@ def data_generator():
                     streams, cuts, error_message = self.cut_detector(streams)
                     if error_message is not None:
-                        raise Exception("failed_to_subsample")
+                        raise ValueError("failed_to_subsample")
 
                     meta["cuts"] = cuts
@@ -239,7 +239,7 @@ def data_generator():
                 if error_message is not None:
                     meta["clips"] = []
-                    raise Exception("failed_to_subsample")
+                    raise ValueError("failed_to_subsample")
 
                 successes += 1
                 status = "success"
diff --git a/video2dataset/workers/subset_worker.py b/video2dataset/workers/subset_worker.py
index 20d77eb8..06383074 100644
--- a/video2dataset/workers/subset_worker.py
+++ b/video2dataset/workers/subset_worker.py
@@ -162,7 +162,7 @@ def process_shard(
             if self.ffprobe_subsampler is not None:
                 streams, meta, error_message = self.ffprobe_subsampler(streams, meta)
                 if error_message is not None:
-                    raise Exception("failed_to_subsample")
+                    raise ValueError("failed_to_subsample")
 
             if self.config["storage"]["captions_are_subtitles"]:  # create clips
                 subtitles = meta["yt_meta_dict"]["subtitles"]
@@ -170,7 +170,7 @@ def process_shard(
             elif self.cut_detector is not None:  # apply cut detection to get clips
                 streams, cuts, error_message = self.cut_detector(streams)
                 if error_message is not None:
-                    raise Exception("failed_to_subsample")
+                    raise ValueError("failed_to_subsample")
 
                 meta["cuts"] = cuts
@@ -188,14 +188,14 @@ def process_shard(
             subsampled_streams, metas, error_message = broadcast_subsampler(streams, meta)
             if error_message is not None:
                 meta["clips"] = []
-                raise Exception("failed_to_subsample")
+                raise ValueError("failed_to_subsample")
 
             for modality in list(subsampled_streams.keys()):
                 for modality_subsampler in self.subsamplers[modality]:
                     subsampled_streams, metas, error_message = modality_subsampler(subsampled_streams, metas)
 
                     if error_message is not None:
-                        raise Exception("failed_to_subsample")
+                        raise ValueError("failed_to_subsample")
 
             successes += 1
             status = "success"