Commit

test processing script
kjappelbaum committed Aug 14, 2024
1 parent 9a9709d commit a491182
Showing 3 changed files with 122 additions and 176 deletions.
17 changes: 2 additions & 15 deletions data/tabular/bicerano_dataset/meta.yaml
@@ -1,19 +1,6 @@
bibtex:
- "@article{afzal2021,
author = {Afzal, Mohammad Atif Faiz and Browning, Andrea R. and Goldberg, Alexander
and Halls, Mathew D. and Gavartin, Jacob L. and Morisato,
Tsuguo and Hughes, Thomas F. and Giesen, David J. and Goose, Joseph E.},
title = {High-Throughput Molecular Dynamics Simulations and Validation of Thermophysical
Properties of Polymers for Various Applications},
journal = {ACS Applied Polymer Materials},
volume = {3},
number = {2},
pages = {620-630},
year = {2021},
doi = {10.1021/acsapm.0c00524}}"
description:
"This paper outlines a MD simulation workflow based on GPU MD simulation
and the refined optimized potentials for liquid simulation (OPLS) OPLS3e force field to calculate glass transition temperatures (Tgs) of 315 polymers for which Bicerano reported experimental values."
- "@article{afzal2021, author = {Afzal, Mohammad Atif Faiz and Browning, Andrea R. and Goldberg, Alexander and Halls, Mathew D. and Gavartin, Jacob L. and Morisato, Tsuguo and Hughes, Thomas F. and Giesen, David J. and Goose, Joseph E.}, title = {High-Throughput Molecular Dynamics Simulations and Validation of Thermophysical Properties of Polymers for Various Applications}, journal = {ACS Applied Polymer Materials}, volume = {3}, number = {2}, pages = {620-630}, year = {2021}, doi = {10.1021/acsapm.0c00524}}"
description: "This paper outlines a MD simulation workflow based on GPU MD simulation and the refined optimized potentials for liquid simulation (OPLS) OPLS3e force field to calculate glass transition temperatures (Tgs) of 315 polymers for which Bicerano reported experimental values."
identifiers:
- description: PSMILES
id: PSMILES
162 changes: 1 addition & 161 deletions src/chemnlp/data/utils.py
@@ -1,169 +1,9 @@
import itertools
import random
from typing import Dict, List, Optional

from datasets import concatenate_datasets
from datasets.formatting.formatting import LazyBatch
from transformers import PreTrainedTokenizer

import chemnlp.data.hf_datasets as hf_datasets
from typing import List

import yaml
from typing import Any


def sample_dataset(dataset, num_samples):
n = len(dataset)
num_samples = min(num_samples, n)
return dataset.select(random.sample(range(n), k=num_samples))
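
# A minimal usage sketch (illustrative), assuming a small in-memory
# datasets.Dataset: the helper draws num_samples rows at random without
# replacement and caps the request at the dataset size.
#
#     >>> from datasets import Dataset
#     >>> ds = Dataset.from_dict({"text": ["a", "b", "c", "d"]})
#     >>> len(sample_dataset(ds, num_samples=2))
#     2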


def get_datasets(config, tokenizer):
train_datasets, val_datasets = [], []
for dataset_name in config.data.datasets:
dataset_fn_ref = getattr(hf_datasets, dataset_name)
train_tokenized, val_tokenized = dataset_fn_ref(tokenizer)
train_datasets.append(train_tokenized)
val_datasets.append(val_tokenized)
return concatenate_datasets(train_datasets), concatenate_datasets(val_datasets)
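
# A hedged usage sketch: get_datasets expects a config whose config.data.datasets
# lists names of functions defined in chemnlp.data.hf_datasets, each returning a
# (train, validation) pair of tokenised datasets. The dataset name below is
# hypothetical.
#
#     >>> from types import SimpleNamespace
#     >>> config = SimpleNamespace(data=SimpleNamespace(datasets=["some_hf_dataset"]))
#     >>> train, val = get_datasets(config, tokenizer)  # tokenizer: a PreTrainedTokenizer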


def chunks(lst, n):
"""
Yield successive n-sized chunks from lst.
NOTE Hugging face truncates any large samples -> 1 sample
"""
for i in range(0, len(lst), n):
        yield lst[i : i + n]


def pad_sequence(sequence, max_len, pad_token_id):
"""Pad a input sequence"""
num_pad_tokens = max_len - len(sequence)
attention_mask = [1] * len(sequence) + [0] * num_pad_tokens
sequence += [pad_token_id] * num_pad_tokens
return sequence, attention_mask
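
# Worked example (illustrative): padding a three-token sequence to max_len=5
# with pad_token_id=0 returns the padded ids together with a matching
# attention mask.
#
#     >>> pad_sequence([11, 12, 13], max_len=5, pad_token_id=0)
#     ([11, 12, 13, 0, 0], [1, 1, 1, 0, 0])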


def tokenise(
batch: LazyBatch,
tokenizer: PreTrainedTokenizer,
max_length: int,
string_key: str,
keep_columns: Optional[List[str]] = None,
) -> Dict[str, List]:
"""Tokenise a batch of data using sample chunking"""
tok_articles = [tokenizer(x)["input_ids"] for x in batch[string_key]]
flattened_tokens = list(itertools.chain.from_iterable(tok_articles))
chunked_tokens = list(chunks(flattened_tokens, max_length))
padded_sample_tokens = _pad_batched_data(
dataset=chunked_tokens,
tokenizer=tokenizer,
max_length=max_length,
)

if keep_columns:
# augment with token-level metadata
sample_metadata = [
{meta: batch[meta][i] or "" for meta in keep_columns}
for i, _ in enumerate(batch[string_key])
]
tok_metadata = [
[sample_meta] * len(x)
for sample_meta, x in zip(sample_metadata, tok_articles)
]
tok_metadata = list(itertools.chain.from_iterable(tok_metadata))
tok_metadata = list(chunks(tok_metadata, max_length))
padded_sample_tokens["metadata"] = tok_metadata

return padded_sample_tokens
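
# Usage sketch (illustrative; assumes the Hugging Face GPT-2 tokenizer and a
# dataset with a "text" column). tokenise is meant to be mapped over a dataset
# in batched mode, so one input row may yield several fixed-length chunks.
#
#     >>> from datasets import Dataset
#     >>> from transformers import AutoTokenizer
#     >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
#     >>> tokenizer.pad_token = tokenizer.eos_token
#     >>> ds = Dataset.from_dict({"text": ["first document", "second document"]})
#     >>> tokenised = ds.map(
#     ...     tokenise,
#     ...     batched=True,
#     ...     remove_columns=ds.column_names,
#     ...     fn_kwargs={"tokenizer": tokenizer, "max_length": 32, "string_key": "text"},
#     ... )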


def get_tokenised_data_minimum_padding(
dataset: List,
tokenizer: PreTrainedTokenizer,
max_length: int,
eos_string: str,
) -> Dict[str, List]:
batched_data = _concatenate_samples_without_splitting(
dataset=dataset,
tokenizer=tokenizer,
max_length=max_length,
eos_string=eos_string,
)

return _pad_batched_data(
dataset=batched_data,
tokenizer=tokenizer,
max_length=max_length,
)
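
# Usage sketch (illustrative; assumes the GPT-2 tokenizer, whose end-of-sequence
# string is "<|endoftext|>"). Samples longer than max_length are dropped, the
# rest are packed so that no single sample is split across two batches, and
# every returned row is padded up to max_length.
#
#     >>> from transformers import AutoTokenizer
#     >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
#     >>> tokenizer.pad_token = tokenizer.eos_token
#     >>> texts = ["first sample<|endoftext|>", "second sample<|endoftext|>"]
#     >>> batch = get_tokenised_data_minimum_padding(
#     ...     dataset=texts, tokenizer=tokenizer, max_length=16, eos_string="<|endoftext|>",
#     ... )
#     >>> all(len(row) == 16 for row in batch["input_ids"])
#     True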


def _concatenate_samples_without_splitting(
dataset: List,
tokenizer: PreTrainedTokenizer,
max_length: int,
eos_string: str,
):
"""concatenate samples into batches upto max_length without
splitting any of the individual samples between batches"""

tok_articles = [tokenizer(x)["input_ids"] for x in dataset]
tok_articles = [sample for sample in tok_articles if len(sample) <= max_length]
tok_articles = list(itertools.chain.from_iterable(tok_articles))
eos_token = tokenizer.encode(eos_string)[0]

concatenated_articles = []
p0, p1, last_eos = 0, 1, 0
while p1 < len(tok_articles):
if tok_articles[p1] == eos_token:
if (p1 - p0 + 1) < max_length:
# keep track of most recent eos index, continue exploring
last_eos = p1

elif (p1 - p0 + 1) == max_length:
# collect whole pointer window
concatenated_articles.append(tok_articles[p0 : p1 + 1])
last_eos = p1
p0 = p1 + 1
p1 = p0
else:
# max_length exceeded, collect only up to last eos
concatenated_articles.append(tok_articles[p0 : last_eos + 1])
p0 = last_eos + 1
p1 = p0
p1 += 1

# collect final batch
concatenated_articles.append(tok_articles[p0:])
return concatenated_articles


def _pad_batched_data(
dataset: List,
tokenizer: PreTrainedTokenizer,
max_length: int,
):
padded_sequences_all = []
attention_masks_all = []

for article in dataset:
if len(article) < max_length:
article, attention_masks = pad_sequence(
article, max_length, tokenizer.pad_token_id
)
else:
attention_masks = [1] * max_length
padded_sequences_all.append(article)
attention_masks_all.append(attention_masks)

return {
"input_ids": padded_sequences_all,
"token_type_ids": [[0] * max_length] * len(padded_sequences_all),
"attention_mask": attention_masks_all,
}
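
# Worked example (illustrative): with max_length=4 and a tokenizer whose
# pad_token_id is 0, a two-token article is padded while a full four-token
# article passes through unchanged.
#
#     >>> _pad_batched_data([[7, 8], [1, 2, 3, 4]], tokenizer, max_length=4)
#     {'input_ids': [[7, 8, 0, 0], [1, 2, 3, 4]],
#      'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0]],
#      'attention_mask': [[1, 1, 0, 0], [1, 1, 1, 1]]}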


def oxford_comma_join(items: List[str]) -> str:
"""Join a list of items with Oxford comma"""
if len(items) == 1:
119 changes: 119 additions & 0 deletions tests/data/test_sampler_cli.py
@@ -0,0 +1,119 @@
import pytest
import os
import pandas as pd
import yaml
import json
from chemnlp.data.sampler_cli import process_dataset

@pytest.fixture
def temp_data_dir(tmp_path):
data_dir = tmp_path / "data"
data_dir.mkdir()

# Create meta.yaml
meta = {
'identifiers': [{'id': 'SMILES', 'type': 'SMILES'}],
'targets': [{'id': 'property', 'type': 'continuous'}],
'templates': [
'The molecule with SMILES {SMILES#} has property {property#}.',
'What is the property of the molecule with SMILES {SMILES#}?<EOI>{property#}'
]
}
with open(data_dir / "meta.yaml", "w") as f:
yaml.dump(meta, f)

# Create data_clean.csv
df = pd.DataFrame({
'SMILES': ['CC', 'CCC', 'CCCC'],
'property': [1.0, 2.0, 3.0],
'split': ['train', 'test', 'valid']
})
df.to_csv(data_dir / "data_clean.csv", index=False)

return data_dir
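
# For orientation (illustrative; the real placeholder grammar is handled by the
# chemnlp sampler): a "{SMILES#}"-style field is filled from the matching CSV
# column, so the first template rendered for the first row reads roughly like the
# plain str.format call below, and the "<EOI>" marker in the second template is
# what benchmarking mode uses to split a sample into an input/output pair.
#
#     >>> row = {"SMILES": "CC", "property": 1.0}
#     >>> "The molecule with SMILES {SMILES} has property {property}.".format(**row)
#     'The molecule with SMILES CC has property 1.0.'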

@pytest.fixture
def temp_output_dir(tmp_path):
output_dir = tmp_path / "output"
output_dir.mkdir()
return output_dir

def test_process_dataset(temp_data_dir, temp_output_dir):
process_dataset(
data_dir=str(temp_data_dir),
output_dir=str(temp_output_dir),
chunksize=1000,
class_balanced=False,
benchmarking=False,
multiple_choice=False
)

# Check that output files were created
chunk_dir = temp_output_dir / "chunk_0"
template_dir = chunk_dir / "template_0"
assert template_dir.exists()

# Check the content of the output files
for split in ['train', 'test', 'valid']:
with open(template_dir / f"{split}.jsonl", "r") as f:
lines = f.readlines()
assert len(lines) == 1 # One sample per split
sample = json.loads(lines[0])
assert "text" in sample
assert "SMILES" in sample["text"]
assert "property" in sample["text"]

def test_process_dataset_benchmarking(temp_data_dir, temp_output_dir):
process_dataset(
data_dir=str(temp_data_dir),
output_dir=str(temp_output_dir),
chunksize=1000,
class_balanced=False,
benchmarking=True,
multiple_choice=False
)

# Check that output files were created
chunk_dir = temp_output_dir / "chunk_0"
template_dir = chunk_dir / "template_0"
assert template_dir.exists()

# Check the content of the output files
for split in ['train', 'test', 'valid']:
with open(template_dir / f"{split}.jsonl", "r") as f:
lines = f.readlines()
assert len(lines) == 1 # One sample per split
sample = json.loads(lines[0])
assert "input" in sample
assert "output" in sample
assert "SMILES" in sample["input"]
# assert that we can convert the output to a float
            try:
                float(sample["output"])
            except ValueError:
                pytest.fail("output is not parseable as a float")

def test_process_dataset_class_balanced(temp_data_dir, temp_output_dir):
process_dataset(
data_dir=str(temp_data_dir),
output_dir=str(temp_output_dir),
chunksize=1000,
class_balanced=True,
benchmarking=False,
multiple_choice=False
)

# Check that output files were created
chunk_dir = temp_output_dir / "chunk_0"
template_dir = chunk_dir / "template_0"
assert template_dir.exists()

# Check the content of the output files
for split in ['train', 'test', 'valid']:
with open(template_dir / f"{split}.jsonl", "r") as f:
lines = f.readlines()
assert len(lines) == 1 # One sample per split
sample = json.loads(lines[0])
assert "text" in sample
assert "SMILES" in sample["text"]
assert "property" in sample["text"]
