[pre-commit.ci] pre-commit autoupdate (#532)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
pre-commit-ci[bot] committed Apr 1, 2024
1 parent 7612d91 commit 7331d88
Showing 19 changed files with 56 additions and 38 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -21,13 +21,13 @@ repos:
       exclude: ^experiments/configs

   - repo: https://github.com/psf/black
-    rev: 23.12.1
+    rev: 24.3.0
     hooks:
       - id: black
         language_version: python3 # Should be a command that runs python3.6+

   - repo: https://github.com/PyCQA/flake8
-    rev: 6.1.0
+    rev: 7.0.0
     hooks:
       - id: flake8
         args: [--count, --show-source, --statistics]
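Note: the Black bump from 23.12.1 to 24.3.0 switches the repository to Black's 2024 stable style, which accounts for nearly all of the reformatting hunks below: a blank line is now enforced between a module docstring and the first import, and over-long conditional expressions and right-hand sides are wrapped in their own parentheses. A minimal sketch of both rules, with hypothetical names (nothing here is taken from the repository):

    """Example module docstring."""

    import os  # Black 24 enforces the blank line between the docstring and this import

    USE_VERBOSE_LOGGING = "CHEMNLP_VERBOSE" in os.environ

    # Under the 2024 style, a conditional expression that no longer fits on
    # one line is parenthesized rather than split bare across lines:
    default_logging_configuration = (
        "debug-with-full-tracebacks" if USE_VERBOSE_LOGGING else "warnings-and-errors-only"
    )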
1 change: 1 addition & 0 deletions data/check_pandas.py
@@ -8,6 +8,7 @@
 This script has a command line interface. You can run it using `python check_pandas <data_dir>`,
 where `<data_dir>` points to a nested set of directories with `data_clean.csv` files.
 """
+
 import os
 from glob import glob
 from pathlib import Path
1 change: 1 addition & 0 deletions data/check_smiles_split.py
@@ -6,6 +6,7 @@
 This script uses dask. This might cause some errors with mismatching data types,
 for which there are currently a few fallbacks.
 """
+
 import os
 from glob import glob
 from pathlib import Path
1 change: 1 addition & 0 deletions data/natural/preprocess_europepmc.py
@@ -5,6 +5,7 @@
 Before running this scripts, the filepaths need to be changed.
 """
+
 import json
 import os
 import re
1 change: 1 addition & 0 deletions data/natural/preprocess_msds.py
@@ -3,6 +3,7 @@
 You need to change filepaths before running this script
 """
+
 import json
 import os
1 change: 1 addition & 0 deletions data/natural/preprocess_nougat.py
@@ -6,6 +6,7 @@
 The filepaths need to be updated before running the script.
 """
+
 import glob
 import json
 import os
1 change: 1 addition & 0 deletions data/postprocess_split.py
@@ -5,6 +5,7 @@
 This script needs to be run after the splitting script.
 """
+
 import os
 from glob import glob
 from pathlib import Path
1 change: 1 addition & 0 deletions data/tabular/check_smiles_split.py
@@ -1,4 +1,5 @@
 """This script checks for data leakage in the splits of a tabular dataset."""
+
 import os
 from glob import glob
 from pathlib import Path
8 changes: 5 additions & 3 deletions data/tabular/odd_one_out/transform.py
@@ -104,9 +104,11 @@ def transform_dataset(dataset, n_permutations):
             "smi_4": smis[smi_idx_arr[:, 3]],
             "odd_one_out_idx": odd_one_out_idx,
             "odd_one_out_mol": [
-                smis[smi_idx_arr[i, int(odd_one_out_idx[i])]]
-                if not np.isnan(odd_one_out_idx[i])
-                else np.nan
+                (
+                    smis[smi_idx_arr[i, int(odd_one_out_idx[i])]]
+                    if not np.isnan(odd_one_out_idx[i])
+                    else np.nan
+                )
                 for i in range(len(odd_one_out_idx))
             ],
             # "similarity_list": similarity_list,
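This hunk only re-wraps the conditional expression inside the list comprehension; behaviour is unchanged. A runnable sketch of the same pattern, reusing the hunk's variable names but with invented toy data and direct indexing in place of smi_idx_arr:

    import numpy as np

    smis = ["CCO", "CCN"]
    odd_one_out_idx = np.array([0.0, np.nan, 1.0])

    # The parentheses around the conditional are what Black 24 adds when the
    # expression spans multiple lines (shown expanded here for illustration).
    odd_one_out_mol = [
        (smis[int(i)] if not np.isnan(i) else np.nan)
        for i in odd_one_out_idx
    ]
    print(odd_one_out_mol)  # ['CCO', nan, 'CCN']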
1 change: 1 addition & 0 deletions data/tabular/train_test_split.py
@@ -20,6 +20,7 @@
 - Some CSV files contain complicated strings. We cannot parse them in a chunked manner.
   In this case, we set blocksize=None and read the whole file into memory.
 """
+
 import logging
 import os
 import random
42 changes: 21 additions & 21 deletions data/text_sampling/text_sampling.py
@@ -942,9 +942,9 @@ def export(self, fn_suffix: str = None):
                 inplace=True,
             )
             if self.multiple_choice_benchmarking_templates:
-                df_out[
-                    ["output", "answer_choices", "correct_output_index"]
-                ] = df_out["output"].str.split(pat="<MC>", n=2, expand=True)
+                df_out[["output", "answer_choices", "correct_output_index"]] = (
+                    df_out["output"].str.split(pat="<MC>", n=2, expand=True)
+                )
                 df_out["answer_choices"] = df_out["answer_choices"].apply(
                     lambda x: x.split("|")
                 )
@@ -982,15 +982,15 @@ def export(self, fn_suffix: str = None):
                 os.makedirs(output_path_dir, exist_ok=True)
                 output_path = output_path_dir + f"{split}.jsonl"

-                lm_eval_yaml_template_multiple_choice[
-                    "task"
-                ] = self.path_data_dir.split("/")[-1]
-                lm_eval_yaml_template_multiple_choice[
-                    "dataset_path"
-                ] = output_path_dir
-                lm_eval_yaml_template_multiple_choice[
-                    "dataset_name"
-                ] = self.path_data_dir.split("/")[-1]
+                lm_eval_yaml_template_multiple_choice["task"] = (
+                    self.path_data_dir.split("/")[-1]
+                )
+                lm_eval_yaml_template_multiple_choice["dataset_path"] = (
+                    output_path_dir
+                )
+                lm_eval_yaml_template_multiple_choice["dataset_name"] = (
+                    self.path_data_dir.split("/")[-1]
+                )

                 fn_lm_eval_yaml = output_path_dir + "/config.yaml"
                 with open(fn_lm_eval_yaml, "w") as f:
@@ -1005,15 +1005,15 @@ def export(self, fn_suffix: str = None):
                 os.makedirs(output_path_dir, exist_ok=True)
                 output_path = output_path_dir + f"{split}_{fn_suffix}.jsonl"

-                lm_eval_yaml_template_loglikelihood[
-                    "task"
-                ] = self.path_data_dir.split("/")[-1]
-                lm_eval_yaml_template_loglikelihood[
-                    "dataset_path"
-                ] = output_path_dir
-                lm_eval_yaml_template_loglikelihood[
-                    "dataset_name"
-                ] = self.path_data_dir.split("/")[-1]
+                lm_eval_yaml_template_loglikelihood["task"] = (
+                    self.path_data_dir.split("/")[-1]
+                )
+                lm_eval_yaml_template_loglikelihood["dataset_path"] = (
+                    output_path_dir
+                )
+                lm_eval_yaml_template_loglikelihood["dataset_name"] = (
+                    self.path_data_dir.split("/")[-1]
+                )

                 fn_lm_eval_yaml = output_path_dir + "/config.yaml"
                 with open(fn_lm_eval_yaml, "w") as f:
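These hunks all show the same 2024-style rule: keep a long subscript assignment's left side intact and parenthesize the right-hand side instead. A runnable sketch of the multiple-choice split with toy data (column names and the <MC> separator come from the hunk; the data row is invented):

    import pandas as pd

    df_out = pd.DataFrame({"output": ["yes<MC>yes|no<MC>0"]})

    # Black 24 keeps the subscript on one line and parenthesizes the
    # right-hand side, exactly as in the first hunk above.
    df_out[["output", "answer_choices", "correct_output_index"]] = (
        df_out["output"].str.split(pat="<MC>", n=2, expand=True)
    )
    df_out["answer_choices"] = df_out["answer_choices"].apply(lambda x: x.split("|"))
    print(df_out.iloc[0].tolist())  # ['yes', ['yes', 'no'], '0']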
1 change: 1 addition & 0 deletions data/train_test_split.py
@@ -20,6 +20,7 @@
 - Some CSV files contain complicated strings. We cannot parse them in a chunked manner.
   In this case, we set blocksize=None and read the whole file into memory.
 """
+
 import logging
 import os
 import random
1 change: 1 addition & 0 deletions experiments/data/merge_epmc_to_jsonl.py
@@ -7,6 +7,7 @@
     <dir>/2022_05_25/file2.jsonl
     ...
 """
+
 import multiprocessing
 import os
 from typing import List
1 change: 1 addition & 0 deletions experiments/data/prepare_gptneox_chemrxiv.py
@@ -5,6 +5,7 @@
 Example usage:
     python experiments/chem_data_prep.py /fsx/proj-chemnlp/data/ chemnlp/gpt-neox/
 """
+
 import argparse
 import os
1 change: 1 addition & 0 deletions experiments/data/prepare_hf_dataset.py
@@ -5,6 +5,7 @@
 Example Usage:
     python prepare_hf_dataset.py full_path/config.yml
 """
+
 import argparse
 import json
 import os
6 changes: 3 additions & 3 deletions experiments/scripts/eval_create_batch_configs.py
@@ -20,9 +20,9 @@ def run(
     ]

     for model_name in model_names:
-        raw_config[
-            "model_args"
-        ] = f"pretrained={root_models_path}/{model_name}/{CHECKPOINT_DIR}"
+        raw_config["model_args"] = (
+            f"pretrained={root_models_path}/{model_name}/{CHECKPOINT_DIR}"
+        )
         raw_config["wandb_run_name"] = model_name

         with open(
15 changes: 9 additions & 6 deletions experiments/scripts/run_tune.py
@@ -3,6 +3,7 @@
 Usage: python run_tune.py <path-to-config-yml>
 """
+
 import argparse
 import json
 import os
@@ -98,9 +99,9 @@ def run(config_path: str, config_overrides: Optional[Dict] = None) -> None:
     model_ref = getattr(transformers, config.model.base)
     model = model_ref.from_pretrained(
         pretrained_model_name_or_path=config.model.checkpoint_path or config.model.name,
-        revision=config.model.revision
-        if config.model.checkpoint_path is None
-        else None,
+        revision=(
+            config.model.revision if config.model.checkpoint_path is None else None
+        ),
     )

     if config.prompt_tuning.enabled:
@@ -171,9 +172,11 @@ def run(config_path: str, config_overrides: Optional[Dict] = None) -> None:
         **config.trainer.dict(exclude={"deepspeed_config", "restart_checkpoint"}),
         report_to="wandb" if config.wandb.enabled else "none",
         local_rank=local_rank,
-        deepspeed=CONFIG_DIR / f"deepspeed/{config.trainer.deepspeed_config}"
-        if config.trainer.deepspeed_config
-        else None,
+        deepspeed=(
+            CONFIG_DIR / f"deepspeed/{config.trainer.deepspeed_config}"
+            if config.trainer.deepspeed_config
+            else None
+        ),
     )
     print_zero_rank(local_rank, training_args)
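The revision= and deepspeed= hunks apply the same rule to keyword arguments: a conditional expression passed as an argument value now gets its own parentheses. A minimal runnable sketch with hypothetical names (build_args stands in for the real TrainingArguments call):

    from pathlib import Path

    CONFIG_DIR = Path("configs")  # hypothetical stand-in for the real constant
    deepspeed_config = None  # e.g. "zero3.json" to enable

    def build_args(deepspeed=None):
        return {"deepspeed": deepspeed}

    # Black 24 parenthesizes the conditional argument value, mirroring the
    # deepspeed= hunk above.
    training_args = build_args(
        deepspeed=(
            CONFIG_DIR / f"deepspeed/{deepspeed_config}"
            if deepspeed_config
            else None
        ),
    )
    print(training_args)  # {'deepspeed': None}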
6 changes: 3 additions & 3 deletions src/chemnlp/data_val/config.py
@@ -10,9 +10,9 @@ class Data(BaseModel):
     path: Union[List[str], str]  # can be local or S3 directory
     validation_size: Union[List[float], float] = 0.05
     interleave_probs: Optional[List[float]] = None
-    sampling_criterion: Optional[
-        Literal["first_exhausted", "all_exhausted"]
-    ] = None  # as of v2.10.1
+    sampling_criterion: Optional[Literal["first_exhausted", "all_exhausted"]] = (
+        None  # as of v2.10.1
+    )

     @validator("validation_size")
     def small_positive_validation_sizes(cls, value_orig):
1 change: 1 addition & 0 deletions src/chemnlp/trainer.py
@@ -1,4 +1,5 @@
 """A custom trainer for modifying data sampling behaviour"""
+
 from typing import Optional

 import datasets
