[pre-commit.ci] pre-commit autoupdate (#532)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
pre-commit-ci[bot] committed Apr 1, 2024
1 parent 7612d91 commit 7331d88
Showing 19 changed files with 56 additions and 38 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -21,13 +21,13 @@ repos:
       exclude: ^experiments/configs

   - repo: https://github.com/psf/black
-    rev: 23.12.1
+    rev: 24.3.0
     hooks:
       - id: black
         language_version: python3 # Should be a command that runs python3.6+

   - repo: https://github.com/PyCQA/flake8
-    rev: 6.1.0
+    rev: 7.0.0
     hooks:
       - id: flake8
         args: [--count, --show-source, --statistics]
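Note: the Black bump from 23.12.1 to 24.3.0 switches the repository to Black's 2024 stable style, which accounts for nearly all of the reformatting hunks below: a blank line is now enforced between a module docstring and the first import, and over-long conditional expressions and right-hand sides are wrapped in their own parentheses. A minimal sketch of both rules, with hypothetical names (nothing here is taken from the repository):

    """Example module docstring."""

    import os  # Black 24 enforces the blank line between the docstring and this import

    USE_VERBOSE_LOGGING = "CHEMNLP_VERBOSE" in os.environ

    # Under the 2024 style, a conditional expression that no longer fits on
    # one line is parenthesized rather than split bare across lines:
    default_logging_configuration = (
        "debug-with-full-tracebacks" if USE_VERBOSE_LOGGING else "warnings-and-errors-only"
    )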
1 change: 1 addition & 0 deletions data/check_pandas.py
@@ -8,6 +8,7 @@
 This script has a command line interface. You can run it using `python check_pandas <data_dir>`,
 where `<data_dir>` points to a nested set of directories with `data_clean.csv` files.
 """
+
 import os
 from glob import glob
 from pathlib import Path
1 change: 1 addition & 0 deletions data/check_smiles_split.py
@@ -6,6 +6,7 @@
 This script uses dask. This might cause some errors with mismatching data types,
 for which there are currently a few fallbacks.
 """
+
 import os
 from glob import glob
 from pathlib import Path
1 change: 1 addition & 0 deletions data/natural/preprocess_europepmc.py
@@ -5,6 +5,7 @@
 Before running this scripts, the filepaths need to be changed.
 """
+
 import json
 import os
 import re
1 change: 1 addition & 0 deletions data/natural/preprocess_msds.py
@@ -3,6 +3,7 @@
 You need to change filepaths before running this script
 """
+
 import json
 import os
1 change: 1 addition & 0 deletions data/natural/preprocess_nougat.py
@@ -6,6 +6,7 @@
 The filepaths need to be updated before running the script.
 """
+
 import glob
 import json
 import os
1 change: 1 addition & 0 deletions data/postprocess_split.py
@@ -5,6 +5,7 @@
 This script needs to be run after the splitting script.
 """
+
 import os
 from glob import glob
 from pathlib import Path
1 change: 1 addition & 0 deletions data/tabular/check_smiles_split.py
@@ -1,4 +1,5 @@
 """This script checks for data leakage in the splits of a tabular dataset."""
+
 import os
 from glob import glob
 from pathlib import Path
8 changes: 5 additions & 3 deletions data/tabular/odd_one_out/transform.py
@@ -104,9 +104,11 @@ def transform_dataset(dataset, n_permutations):
             "smi_4": smis[smi_idx_arr[:, 3]],
             "odd_one_out_idx": odd_one_out_idx,
             "odd_one_out_mol": [
-                smis[smi_idx_arr[i, int(odd_one_out_idx[i])]]
-                if not np.isnan(odd_one_out_idx[i])
-                else np.nan
+                (
+                    smis[smi_idx_arr[i, int(odd_one_out_idx[i])]]
+                    if not np.isnan(odd_one_out_idx[i])
+                    else np.nan
+                )
                 for i in range(len(odd_one_out_idx))
             ],
             # "similarity_list": similarity_list,
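This hunk only re-wraps the conditional expression inside the list comprehension; behaviour is unchanged. A runnable sketch of the same pattern, reusing the hunk's variable names but with invented toy data and direct indexing in place of smi_idx_arr:

    import numpy as np

    smis = ["CCO", "CCN"]
    odd_one_out_idx = np.array([0.0, np.nan, 1.0])

    # The parentheses around the conditional are what Black 24 adds when the
    # expression spans multiple lines (shown expanded here for illustration).
    odd_one_out_mol = [
        (smis[int(i)] if not np.isnan(i) else np.nan)
        for i in odd_one_out_idx
    ]
    print(odd_one_out_mol)  # ['CCO', nan, 'CCN']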
1 change: 1 addition & 0 deletions data/tabular/train_test_split.py
@@ -20,6 +20,7 @@
 - Some CSV files contain complicated strings. We cannot parse them in a chunked manner.
   In this case, we set blocksize=None and read the whole file into memory.
 """
+
 import logging
 import os
 import random
42 changes: 21 additions & 21 deletions data/text_sampling/text_sampling.py
@@ -942,9 +942,9 @@ def export(self, fn_suffix: str = None):
                 inplace=True,
             )
             if self.multiple_choice_benchmarking_templates:
-                df_out[
-                    ["output", "answer_choices", "correct_output_index"]
-                ] = df_out["output"].str.split(pat="<MC>", n=2, expand=True)
+                df_out[["output", "answer_choices", "correct_output_index"]] = (
+                    df_out["output"].str.split(pat="<MC>", n=2, expand=True)
+                )
                 df_out["answer_choices"] = df_out["answer_choices"].apply(
                     lambda x: x.split("|")
                 )
@@ -982,15 +982,15 @@ def export(self, fn_suffix: str = None):
                 os.makedirs(output_path_dir, exist_ok=True)
                 output_path = output_path_dir + f"{split}.jsonl"

-                lm_eval_yaml_template_multiple_choice[
-                    "task"
-                ] = self.path_data_dir.split("/")[-1]
-                lm_eval_yaml_template_multiple_choice[
-                    "dataset_path"
-                ] = output_path_dir
-                lm_eval_yaml_template_multiple_choice[
-                    "dataset_name"
-                ] = self.path_data_dir.split("/")[-1]
+                lm_eval_yaml_template_multiple_choice["task"] = (
+                    self.path_data_dir.split("/")[-1]
+                )
+                lm_eval_yaml_template_multiple_choice["dataset_path"] = (
+                    output_path_dir
+                )
+                lm_eval_yaml_template_multiple_choice["dataset_name"] = (
+                    self.path_data_dir.split("/")[-1]
+                )

                 fn_lm_eval_yaml = output_path_dir + "/config.yaml"
                 with open(fn_lm_eval_yaml, "w") as f:
@@ -1005,15 +1005,15 @@ def export(self, fn_suffix: str = None):
                 os.makedirs(output_path_dir, exist_ok=True)
                 output_path = output_path_dir + f"{split}_{fn_suffix}.jsonl"

-                lm_eval_yaml_template_loglikelihood[
-                    "task"
-                ] = self.path_data_dir.split("/")[-1]
-                lm_eval_yaml_template_loglikelihood[
-                    "dataset_path"
-                ] = output_path_dir
-                lm_eval_yaml_template_loglikelihood[
-                    "dataset_name"
-                ] = self.path_data_dir.split("/")[-1]
+                lm_eval_yaml_template_loglikelihood["task"] = (
+                    self.path_data_dir.split("/")[-1]
+                )
+                lm_eval_yaml_template_loglikelihood["dataset_path"] = (
+                    output_path_dir
+                )
+                lm_eval_yaml_template_loglikelihood["dataset_name"] = (
+                    self.path_data_dir.split("/")[-1]
+                )

                 fn_lm_eval_yaml = output_path_dir + "/config.yaml"
                 with open(fn_lm_eval_yaml, "w") as f:
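These hunks all show the same 2024-style rule: keep a long subscript assignment's left side intact and parenthesize the right-hand side instead. A runnable sketch of the multiple-choice split with toy data (column names and the <MC> separator come from the hunk; the data row is invented):

    import pandas as pd

    df_out = pd.DataFrame({"output": ["yes<MC>yes|no<MC>0"]})

    # Black 24 keeps the subscript on one line and parenthesizes the
    # right-hand side, exactly as in the first hunk above.
    df_out[["output", "answer_choices", "correct_output_index"]] = (
        df_out["output"].str.split(pat="<MC>", n=2, expand=True)
    )
    df_out["answer_choices"] = df_out["answer_choices"].apply(lambda x: x.split("|"))
    print(df_out.iloc[0].tolist())  # ['yes', ['yes', 'no'], '0']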
1 change: 1 addition & 0 deletions data/train_test_split.py
@@ -20,6 +20,7 @@
 - Some CSV files contain complicated strings. We cannot parse them in a chunked manner.
   In this case, we set blocksize=None and read the whole file into memory.
 """
+
 import logging
 import os
 import random
1 change: 1 addition & 0 deletions experiments/data/merge_epmc_to_jsonl.py
@@ -7,6 +7,7 @@
     <dir>/2022_05_25/file2.jsonl
     ...
 """
+
 import multiprocessing
 import os
 from typing import List
1 change: 1 addition & 0 deletions experiments/data/prepare_gptneox_chemrxiv.py
@@ -5,6 +5,7 @@
 Example usage:
     python experiments/chem_data_prep.py /fsx/proj-chemnlp/data/ chemnlp/gpt-neox/
 """
+
 import argparse
 import os
1 change: 1 addition & 0 deletions experiments/data/prepare_hf_dataset.py
@@ -5,6 +5,7 @@
 Example Usage:
     python prepare_hf_dataset.py full_path/config.yml
 """
+
 import argparse
 import json
 import os
6 changes: 3 additions & 3 deletions experiments/scripts/eval_create_batch_configs.py
@@ -20,9 +20,9 @@ def run(
     ]

     for model_name in model_names:
-        raw_config[
-            "model_args"
-        ] = f"pretrained={root_models_path}/{model_name}/{CHECKPOINT_DIR}"
+        raw_config["model_args"] = (
+            f"pretrained={root_models_path}/{model_name}/{CHECKPOINT_DIR}"
+        )
         raw_config["wandb_run_name"] = model_name

         with open(
15 changes: 9 additions & 6 deletions experiments/scripts/run_tune.py
@@ -3,6 +3,7 @@
 Usage: python run_tune.py <path-to-config-yml>
 """
+
 import argparse
 import json
 import os
@@ -98,9 +99,9 @@ def run(config_path: str, config_overrides: Optional[Dict] = None) -> None:
     model_ref = getattr(transformers, config.model.base)
     model = model_ref.from_pretrained(
         pretrained_model_name_or_path=config.model.checkpoint_path or config.model.name,
-        revision=config.model.revision
-        if config.model.checkpoint_path is None
-        else None,
+        revision=(
+            config.model.revision if config.model.checkpoint_path is None else None
+        ),
     )

     if config.prompt_tuning.enabled:
@@ -171,9 +172,11 @@ def run(config_path: str, config_overrides: Optional[Dict] = None) -> None:
         **config.trainer.dict(exclude={"deepspeed_config", "restart_checkpoint"}),
         report_to="wandb" if config.wandb.enabled else "none",
         local_rank=local_rank,
-        deepspeed=CONFIG_DIR / f"deepspeed/{config.trainer.deepspeed_config}"
-        if config.trainer.deepspeed_config
-        else None,
+        deepspeed=(
+            CONFIG_DIR / f"deepspeed/{config.trainer.deepspeed_config}"
+            if config.trainer.deepspeed_config
+            else None
+        ),
     )
     print_zero_rank(local_rank, training_args)
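The revision= and deepspeed= hunks apply the same rule to keyword arguments: a conditional expression passed as an argument value now gets its own parentheses. A minimal runnable sketch with hypothetical names (build_args stands in for the real TrainingArguments call):

    from pathlib import Path

    CONFIG_DIR = Path("configs")  # hypothetical stand-in for the real constant
    deepspeed_config = None  # e.g. "zero3.json" to enable

    def build_args(deepspeed=None):
        return {"deepspeed": deepspeed}

    # Black 24 parenthesizes the conditional argument value, mirroring the
    # deepspeed= hunk above.
    training_args = build_args(
        deepspeed=(
            CONFIG_DIR / f"deepspeed/{deepspeed_config}"
            if deepspeed_config
            else None
        ),
    )
    print(training_args)  # {'deepspeed': None}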
6 changes: 3 additions & 3 deletions src/chemnlp/data_val/config.py
@@ -10,9 +10,9 @@ class Data(BaseModel):
     path: Union[List[str], str]  # can be local or S3 directory
     validation_size: Union[List[float], float] = 0.05
     interleave_probs: Optional[List[float]] = None
-    sampling_criterion: Optional[
-        Literal["first_exhausted", "all_exhausted"]
-    ] = None  # as of v2.10.1
+    sampling_criterion: Optional[Literal["first_exhausted", "all_exhausted"]] = (
+        None  # as of v2.10.1
+    )

     @validator("validation_size")
     def small_positive_validation_sizes(cls, value_orig):
1 change: 1 addition & 0 deletions src/chemnlp/trainer.py
@@ -1,4 +1,5 @@
 """A custom trainer for modifying data sampling behaviour"""
+
 from typing import Optional

 import datasets
