-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_data.py
42 lines (33 loc) · 1.45 KB
/
make_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import os
import json
import click
import datetime
import pickle
from vocab import TableVocab, WordVocab
from utils import make_table, make_text
def _stamp(message):
    """Return *message* prefixed with the current local timestamp."""
    return str(datetime.datetime.now()) + " " + message


@click.command()
@click.argument("dir_path", type=click.Path(exists=True))
@click.argument("annotation_path", type=click.Path(exists=True))
@click.argument("out_path")
def prep(dir_path, annotation_path, out_path):
    """Build the preprocessed dataset and pickle it to OUT_PATH.

    Reads train.json from DIR_PATH, builds one table per instance with
    make_table and the texts with make_text (which also receives
    ANNOTATION_PATH), builds the table and word vocabularies, and writes
    everything to OUT_PATH as a single pickle. If OUT_PATH already exists
    the user is asked for confirmation first; anything other than "Y"
    aborts without touching the file.
    """
    desc = _stamp(
        "Overwrite the preprocessed data: {}? Y/n (default: n)".format(out_path)
    )
    if os.path.exists(out_path) and input(desc) != "Y":
        print(_stamp("Exit."))
        # Plain return instead of the site-provided exit() helper, which is
        # intended for interactive sessions and is absent under `python -S`.
        return

    print(_stamp("Building dataset from " + dir_path))
    # Context manager so the handle is closed promptly; JSON is UTF-8 by spec,
    # so do not depend on the platform's default encoding.
    with open(os.path.join(dir_path, "train.json"), encoding="utf-8") as src:
        train = json.load(src)

    tables = [make_table(ins) for ins in train]
    texts = list(make_text(train, annotation_path))
    authors = {ins.get("author", "UNK") for ins in train}
    assert len(tables) == len(texts), (
        "make_table produced {} tables but make_text produced {} texts".format(
            len(tables), len(texts)
        )
    )

    # One table vocabulary per record type found in every table.
    tv = {
        k: TableVocab([t[k] for t in tables], key=k)
        for k in ("team", "player")
    }
    # Each text is iterated as (sentence, _) pairs; collect every distinct word.
    wv = WordVocab({w for doc in texts for sent, _ in doc for w in sent})

    print(_stamp("Saving dataset to " + out_path))
    # Build the payload before opening the output so that a failure here does
    # not truncate an existing file.
    payload = {
        "data": {"text": texts, "table": tables},
        "vocab": {
            "word": wv.__dict__,
            "table": {k: v.__dict__ for k, v in tv.items()},
        },
        # Set iteration order of strings changes between interpreter runs
        # (hash randomization); sort so author ids are reproducible.
        # key=str keeps the sort safe if an author value is not a string.
        "author": {k: i for i, k in enumerate(sorted(authors, key=str))},
    }
    with open(out_path, "wb") as dst:
        pickle.dump(payload, dst)
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    prep()