from datasets import load_dataset
import json

test_set_names = ["wsj", "legal", "stories"]


def load_shuffle_test_set(dataset_name):
    """Load one of the coherence (shuffle) test sets by name: "wsj", "legal", or "stories"."""
    if dataset_name == "wsj":
        # This dataset is not freely available, as it is based on the test portion of the Penn Treebank.
        # See "Extending the Entity Grid with Entity-Specific Features" (Elsner et al.) for more information:
        # https://aclanthology.org/P11-2022.pdf
        # It must be obtained from the Linguistic Data Consortium (LDC):
        # https://catalog.ldc.upenn.edu/LDC99T42
        with open("/home/phillab/data/coherence_test_set.json", "r") as f:
            return json.load(f)
    elif dataset_name == "legal":
        # First 1,000 documents of the billsum test split
        return list(load_dataset("billsum")["test"])[:1000]
    elif dataset_name == "stories":
        # Taking the last 1,000 elements of reddit_tifu (short) to create the test set
        data = list(load_dataset("reddit_tifu", "short")["train"])[-1000:]
        dataset = [{"text": d["documents"]} for d in data]
        return dataset
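

# Example usage (illustrative sketch; assumes the `datasets` library is installed and,
# for "wsj", that the LDC-derived JSON file referenced above exists locally):
if __name__ == "__main__":
    for name in ["legal", "stories"]:
        test_set = load_shuffle_test_set(name)
        print(f"{name}: {len(test_set)} documents")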