From 7331d881a3b743a19d9b868abdc9d4ec478954be Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 1 Apr 2024 20:52:06 +0200
Subject: [PATCH] [pre-commit.ci] pre-commit autoupdate (#532)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml                       |  4 +-
 data/check_pandas.py                          |  1 +
 data/check_smiles_split.py                    |  1 +
 data/natural/preprocess_europepmc.py          |  1 +
 data/natural/preprocess_msds.py               |  1 +
 data/natural/preprocess_nougat.py             |  1 +
 data/postprocess_split.py                     |  1 +
 data/tabular/check_smiles_split.py            |  1 +
 data/tabular/odd_one_out/transform.py         |  8 ++--
 data/tabular/train_test_split.py              |  1 +
 data/text_sampling/text_sampling.py           | 42 +++++++++----------
 data/train_test_split.py                      |  1 +
 experiments/data/merge_epmc_to_jsonl.py       |  1 +
 experiments/data/prepare_gptneox_chemrxiv.py  |  1 +
 experiments/data/prepare_hf_dataset.py        |  1 +
 .../scripts/eval_create_batch_configs.py      |  6 +--
 experiments/scripts/run_tune.py               | 15 ++++---
 src/chemnlp/data_val/config.py                |  6 +--
 src/chemnlp/trainer.py                        |  1 +
 19 files changed, 56 insertions(+), 38 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index dc61d41d3..c382fca0a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,13 +21,13 @@ repos:
         exclude: ^experiments/configs

   - repo: https://github.com/psf/black
-    rev: 23.12.1
+    rev: 24.3.0
     hooks:
       - id: black
         language_version: python3 # Should be a command that runs python3.6+

   - repo: https://github.com/PyCQA/flake8
-    rev: 6.1.0
+    rev: 7.0.0
     hooks:
       - id: flake8
         args: [--count, --show-source, --statistics]
diff --git a/data/check_pandas.py b/data/check_pandas.py
index 4526bc08f..5df3390b0 100644
--- a/data/check_pandas.py
+++ b/data/check_pandas.py
@@ -8,6 +8,7 @@
 This script has a command line interface. You can run it using `python check_pandas <data_dir>`,
 where `<data_dir>` points to a nested set of directories with `data_clean.csv` files.
 """
+
 import os
 from glob import glob
 from pathlib import Path
diff --git a/data/check_smiles_split.py b/data/check_smiles_split.py
index df6df7ffe..e3f3c1bc7 100644
--- a/data/check_smiles_split.py
+++ b/data/check_smiles_split.py
@@ -6,6 +6,7 @@
 This script uses dask. This might cause some errors with mismatching data types,
 for which there are currently a few fallbacks.
 """
+
 import os
 from glob import glob
 from pathlib import Path
diff --git a/data/natural/preprocess_europepmc.py b/data/natural/preprocess_europepmc.py
index e2a9b1b7c..92b96bdee 100644
--- a/data/natural/preprocess_europepmc.py
+++ b/data/natural/preprocess_europepmc.py
@@ -5,6 +5,7 @@

 Before running this scripts, the filepaths need to be changed.
 """
+
 import json
 import os
 import re
diff --git a/data/natural/preprocess_msds.py b/data/natural/preprocess_msds.py
index 30f21a269..c28f2d7d8 100644
--- a/data/natural/preprocess_msds.py
+++ b/data/natural/preprocess_msds.py
@@ -3,6 +3,7 @@

 You need to change filepaths before running this script
 """
+
 import json
 import os

diff --git a/data/natural/preprocess_nougat.py b/data/natural/preprocess_nougat.py
index 93fe85cef..f664b5096 100644
--- a/data/natural/preprocess_nougat.py
+++ b/data/natural/preprocess_nougat.py
@@ -6,6 +6,7 @@

 The filepaths need to be updated before running the script.
""" + import glob import json import os diff --git a/data/postprocess_split.py b/data/postprocess_split.py index dc2353330..ef8999b0b 100644 --- a/data/postprocess_split.py +++ b/data/postprocess_split.py @@ -5,6 +5,7 @@ This script needs to be run after the splitting script. """ + import os from glob import glob from pathlib import Path diff --git a/data/tabular/check_smiles_split.py b/data/tabular/check_smiles_split.py index a16030c1a..b7149bfaf 100644 --- a/data/tabular/check_smiles_split.py +++ b/data/tabular/check_smiles_split.py @@ -1,4 +1,5 @@ """This script checks for data leakage in the splits of a tabular dataset.""" + import os from glob import glob from pathlib import Path diff --git a/data/tabular/odd_one_out/transform.py b/data/tabular/odd_one_out/transform.py index a529046d6..f79219da1 100644 --- a/data/tabular/odd_one_out/transform.py +++ b/data/tabular/odd_one_out/transform.py @@ -104,9 +104,11 @@ def transform_dataset(dataset, n_permutations): "smi_4": smis[smi_idx_arr[:, 3]], "odd_one_out_idx": odd_one_out_idx, "odd_one_out_mol": [ - smis[smi_idx_arr[i, int(odd_one_out_idx[i])]] - if not np.isnan(odd_one_out_idx[i]) - else np.nan + ( + smis[smi_idx_arr[i, int(odd_one_out_idx[i])]] + if not np.isnan(odd_one_out_idx[i]) + else np.nan + ) for i in range(len(odd_one_out_idx)) ], # "similarity_list": similarity_list, diff --git a/data/tabular/train_test_split.py b/data/tabular/train_test_split.py index a015e62f3..4ec096e89 100644 --- a/data/tabular/train_test_split.py +++ b/data/tabular/train_test_split.py @@ -20,6 +20,7 @@ - Some CSV files contain complicated strings. We cannot parse them in a chunked manner. In this case, we set blocksize=None and read the whole file into memory. """ + import logging import os import random diff --git a/data/text_sampling/text_sampling.py b/data/text_sampling/text_sampling.py index 681ba3b60..94dadbe33 100644 --- a/data/text_sampling/text_sampling.py +++ b/data/text_sampling/text_sampling.py @@ -942,9 +942,9 @@ def export(self, fn_suffix: str = None): inplace=True, ) if self.multiple_choice_benchmarking_templates: - df_out[ - ["output", "answer_choices", "correct_output_index"] - ] = df_out["output"].str.split(pat="", n=2, expand=True) + df_out[["output", "answer_choices", "correct_output_index"]] = ( + df_out["output"].str.split(pat="", n=2, expand=True) + ) df_out["answer_choices"] = df_out["answer_choices"].apply( lambda x: x.split("|") ) @@ -982,15 +982,15 @@ def export(self, fn_suffix: str = None): os.makedirs(output_path_dir, exist_ok=True) output_path = output_path_dir + f"{split}.jsonl" - lm_eval_yaml_template_multiple_choice[ - "task" - ] = self.path_data_dir.split("/")[-1] - lm_eval_yaml_template_multiple_choice[ - "dataset_path" - ] = output_path_dir - lm_eval_yaml_template_multiple_choice[ - "dataset_name" - ] = self.path_data_dir.split("/")[-1] + lm_eval_yaml_template_multiple_choice["task"] = ( + self.path_data_dir.split("/")[-1] + ) + lm_eval_yaml_template_multiple_choice["dataset_path"] = ( + output_path_dir + ) + lm_eval_yaml_template_multiple_choice["dataset_name"] = ( + self.path_data_dir.split("/")[-1] + ) fn_lm_eval_yaml = output_path_dir + "/config.yaml" with open(fn_lm_eval_yaml, "w") as f: @@ -1005,15 +1005,15 @@ def export(self, fn_suffix: str = None): os.makedirs(output_path_dir, exist_ok=True) output_path = output_path_dir + f"{split}_{fn_suffix}.jsonl" - lm_eval_yaml_template_loglikelihood[ - "task" - ] = self.path_data_dir.split("/")[-1] - lm_eval_yaml_template_loglikelihood[ - "dataset_path" - ] = 
output_path_dir - lm_eval_yaml_template_loglikelihood[ - "dataset_name" - ] = self.path_data_dir.split("/")[-1] + lm_eval_yaml_template_loglikelihood["task"] = ( + self.path_data_dir.split("/")[-1] + ) + lm_eval_yaml_template_loglikelihood["dataset_path"] = ( + output_path_dir + ) + lm_eval_yaml_template_loglikelihood["dataset_name"] = ( + self.path_data_dir.split("/")[-1] + ) fn_lm_eval_yaml = output_path_dir + "/config.yaml" with open(fn_lm_eval_yaml, "w") as f: diff --git a/data/train_test_split.py b/data/train_test_split.py index 650586512..79691c03b 100644 --- a/data/train_test_split.py +++ b/data/train_test_split.py @@ -20,6 +20,7 @@ - Some CSV files contain complicated strings. We cannot parse them in a chunked manner. In this case, we set blocksize=None and read the whole file into memory. """ + import logging import os import random diff --git a/experiments/data/merge_epmc_to_jsonl.py b/experiments/data/merge_epmc_to_jsonl.py index 87632c23b..1c361dc7b 100644 --- a/experiments/data/merge_epmc_to_jsonl.py +++ b/experiments/data/merge_epmc_to_jsonl.py @@ -7,6 +7,7 @@ /2022_05_25/file2.jsonl ... """ + import multiprocessing import os from typing import List diff --git a/experiments/data/prepare_gptneox_chemrxiv.py b/experiments/data/prepare_gptneox_chemrxiv.py index 328d04f74..fc25c3d6b 100644 --- a/experiments/data/prepare_gptneox_chemrxiv.py +++ b/experiments/data/prepare_gptneox_chemrxiv.py @@ -5,6 +5,7 @@ Example usage: python experiments/chem_data_prep.py /fsx/proj-chemnlp/data/ chemnlp/gpt-neox/ """ + import argparse import os diff --git a/experiments/data/prepare_hf_dataset.py b/experiments/data/prepare_hf_dataset.py index 7a6bbe710..0a0a12c03 100644 --- a/experiments/data/prepare_hf_dataset.py +++ b/experiments/data/prepare_hf_dataset.py @@ -5,6 +5,7 @@ Example Usage: python prepare_hf_dataset.py full_path/config.yml """ + import argparse import json import os diff --git a/experiments/scripts/eval_create_batch_configs.py b/experiments/scripts/eval_create_batch_configs.py index c2c1bb274..da0749307 100644 --- a/experiments/scripts/eval_create_batch_configs.py +++ b/experiments/scripts/eval_create_batch_configs.py @@ -20,9 +20,9 @@ def run( ] for model_name in model_names: - raw_config[ - "model_args" - ] = f"pretrained={root_models_path}/{model_name}/{CHECKPOINT_DIR}" + raw_config["model_args"] = ( + f"pretrained={root_models_path}/{model_name}/{CHECKPOINT_DIR}" + ) raw_config["wandb_run_name"] = model_name with open( diff --git a/experiments/scripts/run_tune.py b/experiments/scripts/run_tune.py index d12dc51b5..ec68a0ea8 100644 --- a/experiments/scripts/run_tune.py +++ b/experiments/scripts/run_tune.py @@ -3,6 +3,7 @@ Usage: python run_tune.py """ + import argparse import json import os @@ -98,9 +99,9 @@ def run(config_path: str, config_overrides: Optional[Dict] = None) -> None: model_ref = getattr(transformers, config.model.base) model = model_ref.from_pretrained( pretrained_model_name_or_path=config.model.checkpoint_path or config.model.name, - revision=config.model.revision - if config.model.checkpoint_path is None - else None, + revision=( + config.model.revision if config.model.checkpoint_path is None else None + ), ) if config.prompt_tuning.enabled: @@ -171,9 +172,11 @@ def run(config_path: str, config_overrides: Optional[Dict] = None) -> None: **config.trainer.dict(exclude={"deepspeed_config", "restart_checkpoint"}), report_to="wandb" if config.wandb.enabled else "none", local_rank=local_rank, - deepspeed=CONFIG_DIR / f"deepspeed/{config.trainer.deepspeed_config}" - if 
config.trainer.deepspeed_config - else None, + deepspeed=( + CONFIG_DIR / f"deepspeed/{config.trainer.deepspeed_config}" + if config.trainer.deepspeed_config + else None + ), ) print_zero_rank(local_rank, training_args) diff --git a/src/chemnlp/data_val/config.py b/src/chemnlp/data_val/config.py index f63ff3c11..1888016ee 100644 --- a/src/chemnlp/data_val/config.py +++ b/src/chemnlp/data_val/config.py @@ -10,9 +10,9 @@ class Data(BaseModel): path: Union[List[str], str] # can be local or S3 directory validation_size: Union[List[float], float] = 0.05 interleave_probs: Optional[List[float]] = None - sampling_criterion: Optional[ - Literal["first_exhausted", "all_exhausted"] - ] = None # as of v2.10.1 + sampling_criterion: Optional[Literal["first_exhausted", "all_exhausted"]] = ( + None # as of v2.10.1 + ) @validator("validation_size") def small_positive_validation_sizes(cls, value_orig): diff --git a/src/chemnlp/trainer.py b/src/chemnlp/trainer.py index 31c0042b2..20fa0a05a 100644 --- a/src/chemnlp/trainer.py +++ b/src/chemnlp/trainer.py @@ -1,4 +1,5 @@ """A custom trainer for modifying data sampling behaviour""" + from typing import Optional import datasets
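
Note for review (appended after the patch, not part of it): apart from the two
`rev` bumps in `.pre-commit-config.yaml` (black 23.12.1 to 24.3.0, flake8
6.1.0 to 7.0.0), every hunk is mechanical restyling from black's 24.x stable
style. Two rules account for all of it: a blank line is now enforced after a
module docstring (the lone `+` lines before the imports), and a long
right-hand side or conditional expression is wrapped in parentheses rather
than splitting the subscript target or the `if`/`else` across lines. A
minimal sketch of that second rule, with invented names used purely for
illustration, assuming the unwrapped lines would exceed black's default
88-character limit:

    # Hypothetical example; not code from this repository.
    raw_config = {}

    def lookup(name: str) -> str:
        # Stand-in for an expression long enough to force line wrapping.
        return f"pretrained=/fsx/models/{name}/checkpoint"

    # black 23.x split the subscript target:
    #     raw_config[
    #         "model_args"
    #     ] = lookup("a-model-name-long-enough-to-force-line-wrapping")
    # black 24.x keeps the target intact and parenthesizes the value:
    raw_config["model_args"] = (
        lookup("a-model-name-long-enough-to-force-line-wrapping")
    )

    # Conditional expressions get the same treatment, as one parenthesized unit:
    checkpoint_path = None
    revision = (
        lookup("fallback-model-name") if checkpoint_path is None else None
    )

Because both styles parse to the same AST, the patch is behavior-preserving;
it only changes formatting.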