-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_imdb_dataset.py
45 lines (34 loc) · 1.52 KB
/
create_imdb_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python3
from argparse import ArgumentParser
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
def main(source, target):
    """Convert the aclImdb directory tree into tab-separated files.

    Reads every ``*.txt`` file under ``<source>/train/{neg,pos}`` and
    ``<source>/test/{neg,pos}`` and writes ``train.tsv`` and ``test.tsv``
    into ``target``. Each output has two columns: ``label`` (0 for "neg",
    1 for "pos") and ``text``.

    Args:
        source: Path to the dataset root directory; ``~`` is expanded.
        target: Output directory; ``~`` is expanded and the directory
            (including parents) is created if it does not exist.
    """
    source = Path(source).expanduser()
    target = Path(target).expanduser()
    target.mkdir(parents=True, exist_ok=True)

    # The integer label of a class is its position in this list.
    CLASSES = ["neg", "pos"]

    def get_texts(path):
        """Load all reviews below ``path`` into a ``label``/``text`` DataFrame."""
        # Sort each class's files so the row order does not depend on the
        # filesystem's directory-listing order; materializing the list also
        # lets tqdm display a total.
        files = [
            (idx, fname)
            for idx, label in enumerate(CLASSES)
            for fname in sorted((path / label).glob("*.txt"))
        ]
        texts, labels = [], []
        for idx, fname in tqdm(files, desc=f"Loading {path}", ncols=80, position=0):
            # Collapse the HTML double line break into a single newline.
            text = fname.read_text(encoding="utf-8")
            texts.append(text.replace("<br /><br />", "\n"))
            labels.append(idx)
        return pd.DataFrame({"label": labels, "text": texts})

    train = get_texts(source / "train")
    test = get_texts(source / "test")

    # Optional dev split (disabled):
    # dev, test = train_test_split(test, test_size=.5, random_state=42)

    # `sep` must be passed by keyword: recent pandas versions accept only
    # `path_or_buf` positionally in DataFrame.to_csv.
    train.to_csv(target / "train.tsv", sep="\t", index=False)
    # dev.to_csv(target / "dev.tsv", sep="\t", index=False)
    test.to_csv(target / "test.tsv", sep="\t", index=False)
if __name__ == "__main__":
    # Command-line entry point: parse the two directory options and run the
    # conversion with them.
    cli = ArgumentParser()
    cli.add_argument(
        "--source",
        "-s",
        type=str,
        default="./aclImdb",
        help="Source IMDB directory (path to aclImdb)",
    )
    cli.add_argument(
        "--target",
        "-t",
        type=str,
        default="./aclImdb",
        help="Target directory (ex: imdb_data)",
    )
    options = cli.parse_args()
    main(source=options.source, target=options.target)