From 7331d881a3b743a19d9b868abdc9d4ec478954be Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 1 Apr 2024 20:52:06 +0200
Subject: [PATCH] [pre-commit.ci] pre-commit autoupdate (#532)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml                       |  4 +-
 data/check_pandas.py                          |  1 +
 data/check_smiles_split.py                    |  1 +
 data/natural/preprocess_europepmc.py          |  1 +
 data/natural/preprocess_msds.py               |  1 +
 data/natural/preprocess_nougat.py             |  1 +
 data/postprocess_split.py                     |  1 +
 data/tabular/check_smiles_split.py            |  1 +
 data/tabular/odd_one_out/transform.py         |  8 ++--
 data/tabular/train_test_split.py              |  1 +
 data/text_sampling/text_sampling.py           | 42 +++++++++----------
 data/train_test_split.py                      |  1 +
 experiments/data/merge_epmc_to_jsonl.py       |  1 +
 experiments/data/prepare_gptneox_chemrxiv.py  |  1 +
 experiments/data/prepare_hf_dataset.py        |  1 +
 .../scripts/eval_create_batch_configs.py      |  6 +--
 experiments/scripts/run_tune.py               | 15 ++++---
 src/chemnlp/data_val/config.py                |  6 +--
 src/chemnlp/trainer.py                        |  1 +
 19 files changed, 56 insertions(+), 38 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index dc61d41d3..c382fca0a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,13 +21,13 @@ repos:
         exclude: ^experiments/configs

   - repo: https://github.com/psf/black
-    rev: 23.12.1
+    rev: 24.3.0
     hooks:
       - id: black
         language_version: python3 # Should be a command that runs python3.6+

   - repo: https://github.com/PyCQA/flake8
-    rev: 6.1.0
+    rev: 7.0.0
     hooks:
       - id: flake8
         args: [--count, --show-source, --statistics]
diff --git a/data/check_pandas.py b/data/check_pandas.py
index 4526bc08f..5df3390b0 100644
--- a/data/check_pandas.py
+++ b/data/check_pandas.py
@@ -8,6 +8,7 @@
 This script has a command line interface. You can run it using `python check_pandas <data_dir>`,
 where `<data_dir>` points to a nested set of directories with `data_clean.csv` files.
 """
+
 import os
 from glob import glob
 from pathlib import Path
diff --git a/data/check_smiles_split.py b/data/check_smiles_split.py
index df6df7ffe..e3f3c1bc7 100644
--- a/data/check_smiles_split.py
+++ b/data/check_smiles_split.py
@@ -6,6 +6,7 @@
 This script uses dask. This might cause some errors with mismatching data types,
 for which there are currently a few fallbacks.
 """
+
 import os
 from glob import glob
 from pathlib import Path
diff --git a/data/natural/preprocess_europepmc.py b/data/natural/preprocess_europepmc.py
index e2a9b1b7c..92b96bdee 100644
--- a/data/natural/preprocess_europepmc.py
+++ b/data/natural/preprocess_europepmc.py
@@ -5,6 +5,7 @@

 Before running this scripts, the filepaths need to be changed.
 """
+
 import json
 import os
 import re
diff --git a/data/natural/preprocess_msds.py b/data/natural/preprocess_msds.py
index 30f21a269..c28f2d7d8 100644
--- a/data/natural/preprocess_msds.py
+++ b/data/natural/preprocess_msds.py
@@ -3,6 +3,7 @@

 You need to change filepaths before running this script
 """
+
 import json
 import os

diff --git a/data/natural/preprocess_nougat.py b/data/natural/preprocess_nougat.py
index 93fe85cef..f664b5096 100644
--- a/data/natural/preprocess_nougat.py
+++ b/data/natural/preprocess_nougat.py
@@ -6,6 +6,7 @@

 The filepaths need to be updated before running the script.
""" + import glob import json import os diff --git a/data/postprocess_split.py b/data/postprocess_split.py index dc2353330..ef8999b0b 100644 --- a/data/postprocess_split.py +++ b/data/postprocess_split.py @@ -5,6 +5,7 @@ This script needs to be run after the splitting script. """ + import os from glob import glob from pathlib import Path diff --git a/data/tabular/check_smiles_split.py b/data/tabular/check_smiles_split.py index a16030c1a..b7149bfaf 100644 --- a/data/tabular/check_smiles_split.py +++ b/data/tabular/check_smiles_split.py @@ -1,4 +1,5 @@ """This script checks for data leakage in the splits of a tabular dataset.""" + import os from glob import glob from pathlib import Path diff --git a/data/tabular/odd_one_out/transform.py b/data/tabular/odd_one_out/transform.py index a529046d6..f79219da1 100644 --- a/data/tabular/odd_one_out/transform.py +++ b/data/tabular/odd_one_out/transform.py @@ -104,9 +104,11 @@ def transform_dataset(dataset, n_permutations): "smi_4": smis[smi_idx_arr[:, 3]], "odd_one_out_idx": odd_one_out_idx, "odd_one_out_mol": [ - smis[smi_idx_arr[i, int(odd_one_out_idx[i])]] - if not np.isnan(odd_one_out_idx[i]) - else np.nan + ( + smis[smi_idx_arr[i, int(odd_one_out_idx[i])]] + if not np.isnan(odd_one_out_idx[i]) + else np.nan + ) for i in range(len(odd_one_out_idx)) ], # "similarity_list": similarity_list, diff --git a/data/tabular/train_test_split.py b/data/tabular/train_test_split.py index a015e62f3..4ec096e89 100644 --- a/data/tabular/train_test_split.py +++ b/data/tabular/train_test_split.py @@ -20,6 +20,7 @@ - Some CSV files contain complicated strings. We cannot parse them in a chunked manner. In this case, we set blocksize=None and read the whole file into memory. """ + import logging import os import random diff --git a/data/text_sampling/text_sampling.py b/data/text_sampling/text_sampling.py index 681ba3b60..94dadbe33 100644 --- a/data/text_sampling/text_sampling.py +++ b/data/text_sampling/text_sampling.py @@ -942,9 +942,9 @@ def export(self, fn_suffix: str = None): inplace=True, ) if self.multiple_choice_benchmarking_templates: - df_out[ - ["output", "answer_choices", "correct_output_index"] - ] = df_out["output"].str.split(pat="", n=2, expand=True) + df_out[["output", "answer_choices", "correct_output_index"]] = ( + df_out["output"].str.split(pat="", n=2, expand=True) + ) df_out["answer_choices"] = df_out["answer_choices"].apply( lambda x: x.split("|") ) @@ -982,15 +982,15 @@ def export(self, fn_suffix: str = None): os.makedirs(output_path_dir, exist_ok=True) output_path = output_path_dir + f"{split}.jsonl" - lm_eval_yaml_template_multiple_choice[ - "task" - ] = self.path_data_dir.split("/")[-1] - lm_eval_yaml_template_multiple_choice[ - "dataset_path" - ] = output_path_dir - lm_eval_yaml_template_multiple_choice[ - "dataset_name" - ] = self.path_data_dir.split("/")[-1] + lm_eval_yaml_template_multiple_choice["task"] = ( + self.path_data_dir.split("/")[-1] + ) + lm_eval_yaml_template_multiple_choice["dataset_path"] = ( + output_path_dir + ) + lm_eval_yaml_template_multiple_choice["dataset_name"] = ( + self.path_data_dir.split("/")[-1] + ) fn_lm_eval_yaml = output_path_dir + "/config.yaml" with open(fn_lm_eval_yaml, "w") as f: @@ -1005,15 +1005,15 @@ def export(self, fn_suffix: str = None): os.makedirs(output_path_dir, exist_ok=True) output_path = output_path_dir + f"{split}_{fn_suffix}.jsonl" - lm_eval_yaml_template_loglikelihood[ - "task" - ] = self.path_data_dir.split("/")[-1] - lm_eval_yaml_template_loglikelihood[ - "dataset_path" - ] = 
output_path_dir - lm_eval_yaml_template_loglikelihood[ - "dataset_name" - ] = self.path_data_dir.split("/")[-1] + lm_eval_yaml_template_loglikelihood["task"] = ( + self.path_data_dir.split("/")[-1] + ) + lm_eval_yaml_template_loglikelihood["dataset_path"] = ( + output_path_dir + ) + lm_eval_yaml_template_loglikelihood["dataset_name"] = ( + self.path_data_dir.split("/")[-1] + ) fn_lm_eval_yaml = output_path_dir + "/config.yaml" with open(fn_lm_eval_yaml, "w") as f: diff --git a/data/train_test_split.py b/data/train_test_split.py index 650586512..79691c03b 100644 --- a/data/train_test_split.py +++ b/data/train_test_split.py @@ -20,6 +20,7 @@ - Some CSV files contain complicated strings. We cannot parse them in a chunked manner. In this case, we set blocksize=None and read the whole file into memory. """ + import logging import os import random diff --git a/experiments/data/merge_epmc_to_jsonl.py b/experiments/data/merge_epmc_to_jsonl.py index 87632c23b..1c361dc7b 100644 --- a/experiments/data/merge_epmc_to_jsonl.py +++ b/experiments/data/merge_epmc_to_jsonl.py @@ -7,6 +7,7 @@ /2022_05_25/file2.jsonl ... """ + import multiprocessing import os from typing import List diff --git a/experiments/data/prepare_gptneox_chemrxiv.py b/experiments/data/prepare_gptneox_chemrxiv.py index 328d04f74..fc25c3d6b 100644 --- a/experiments/data/prepare_gptneox_chemrxiv.py +++ b/experiments/data/prepare_gptneox_chemrxiv.py @@ -5,6 +5,7 @@ Example usage: python experiments/chem_data_prep.py /fsx/proj-chemnlp/data/ chemnlp/gpt-neox/ """ + import argparse import os diff --git a/experiments/data/prepare_hf_dataset.py b/experiments/data/prepare_hf_dataset.py index 7a6bbe710..0a0a12c03 100644 --- a/experiments/data/prepare_hf_dataset.py +++ b/experiments/data/prepare_hf_dataset.py @@ -5,6 +5,7 @@ Example Usage: python prepare_hf_dataset.py full_path/config.yml """ + import argparse import json import os diff --git a/experiments/scripts/eval_create_batch_configs.py b/experiments/scripts/eval_create_batch_configs.py index c2c1bb274..da0749307 100644 --- a/experiments/scripts/eval_create_batch_configs.py +++ b/experiments/scripts/eval_create_batch_configs.py @@ -20,9 +20,9 @@ def run( ] for model_name in model_names: - raw_config[ - "model_args" - ] = f"pretrained={root_models_path}/{model_name}/{CHECKPOINT_DIR}" + raw_config["model_args"] = ( + f"pretrained={root_models_path}/{model_name}/{CHECKPOINT_DIR}" + ) raw_config["wandb_run_name"] = model_name with open( diff --git a/experiments/scripts/run_tune.py b/experiments/scripts/run_tune.py index d12dc51b5..ec68a0ea8 100644 --- a/experiments/scripts/run_tune.py +++ b/experiments/scripts/run_tune.py @@ -3,6 +3,7 @@ Usage: python run_tune.py """ + import argparse import json import os @@ -98,9 +99,9 @@ def run(config_path: str, config_overrides: Optional[Dict] = None) -> None: model_ref = getattr(transformers, config.model.base) model = model_ref.from_pretrained( pretrained_model_name_or_path=config.model.checkpoint_path or config.model.name, - revision=config.model.revision - if config.model.checkpoint_path is None - else None, + revision=( + config.model.revision if config.model.checkpoint_path is None else None + ), ) if config.prompt_tuning.enabled: @@ -171,9 +172,11 @@ def run(config_path: str, config_overrides: Optional[Dict] = None) -> None: **config.trainer.dict(exclude={"deepspeed_config", "restart_checkpoint"}), report_to="wandb" if config.wandb.enabled else "none", local_rank=local_rank, - deepspeed=CONFIG_DIR / f"deepspeed/{config.trainer.deepspeed_config}" - if 
config.trainer.deepspeed_config - else None, + deepspeed=( + CONFIG_DIR / f"deepspeed/{config.trainer.deepspeed_config}" + if config.trainer.deepspeed_config + else None + ), ) print_zero_rank(local_rank, training_args) diff --git a/src/chemnlp/data_val/config.py b/src/chemnlp/data_val/config.py index f63ff3c11..1888016ee 100644 --- a/src/chemnlp/data_val/config.py +++ b/src/chemnlp/data_val/config.py @@ -10,9 +10,9 @@ class Data(BaseModel): path: Union[List[str], str] # can be local or S3 directory validation_size: Union[List[float], float] = 0.05 interleave_probs: Optional[List[float]] = None - sampling_criterion: Optional[ - Literal["first_exhausted", "all_exhausted"] - ] = None # as of v2.10.1 + sampling_criterion: Optional[Literal["first_exhausted", "all_exhausted"]] = ( + None # as of v2.10.1 + ) @validator("validation_size") def small_positive_validation_sizes(cls, value_orig): diff --git a/src/chemnlp/trainer.py b/src/chemnlp/trainer.py index 31c0042b2..20fa0a05a 100644 --- a/src/chemnlp/trainer.py +++ b/src/chemnlp/trainer.py @@ -1,4 +1,5 @@ """A custom trainer for modifying data sampling behaviour""" + from typing import Optional import datasets
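
Note for review (appended after the patch, not part of it): apart from the two
`rev` bumps in `.pre-commit-config.yaml` (black 23.12.1 to 24.3.0, flake8
6.1.0 to 7.0.0), every hunk is mechanical restyling from black's 24.x stable
style. Two rules account for all of it: a blank line is now enforced after a
module docstring (the lone `+` lines before the imports), and a long
right-hand side or conditional expression is wrapped in parentheses rather
than splitting the subscript target or the `if`/`else` across lines. A
minimal sketch of that second rule, with invented names used purely for
illustration, assuming the unwrapped lines would exceed black's default
88-character limit:

    # Hypothetical example; not code from this repository.
    raw_config = {}

    def lookup(name: str) -> str:
        # Stand-in for an expression long enough to force line wrapping.
        return f"pretrained=/fsx/models/{name}/checkpoint"

    # black 23.x split the subscript target:
    #     raw_config[
    #         "model_args"
    #     ] = lookup("a-model-name-long-enough-to-force-line-wrapping")
    # black 24.x keeps the target intact and parenthesizes the value:
    raw_config["model_args"] = (
        lookup("a-model-name-long-enough-to-force-line-wrapping")
    )

    # Conditional expressions get the same treatment, as one parenthesized unit:
    checkpoint_path = None
    revision = (
        lookup("fallback-model-name") if checkpoint_path is None else None
    )

Because both styles parse to the same AST, the patch is behavior-preserving;
it only changes formatting.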