-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_test_data.py
61 lines (47 loc) · 1.63 KB
/
generate_test_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#%%
from utils import get_article
from os import listdir
from os.path import isfile, join
import re
import pandas as pd
# Build the task-1 test table: one row per article line longer than one
# character, written to data/task1_test.csv.
mypath = "data/test-articles/"
test_articles = [mypath + f for f in listdir(mypath) if isfile(join(mypath, f))]
# Raw string so that "\d" reaches the regex engine untouched instead of being
# parsed as an invalid string escape (a SyntaxWarning on recent Pythons).
id_extractor = re.compile(r"\d+")
row = []
for path in test_articles:
    # The first run of digits found in the path is used as the article id.
    article_id = id_extractor.findall(path)[0]
    text = get_article(path)
    lines = text.split("\n")
    start_idx = 0
    for line in lines:
        # NOTE(review): the running offset advances by len(line) + 3 per line
        # (+2 here, +1 at the bottom of the loop). Kept exactly as written;
        # confirm this matches the offsets expected for get_article() output.
        end_idx = len(line) + start_idx + 2
        if len(line) > 1:
            row.append({
                "article_id": article_id,
                # The former '"[SKIP]" if len(line) < 1' fallback could never
                # fire under the len(line) > 1 guard, so it has been dropped.
                "text": line.strip(),
                "article_path": path,
                "start_idx": start_idx,
                "end_idx": end_idx,
            })
        # NOTE(review): assumed to run for every line (including skipped
        # ones) so offsets keep advancing -- confirm against the original
        # indentation.
        start_idx = end_idx + 1
df = pd.DataFrame(row)
df.to_csv("data/task1_test.csv")
# %%
# Build the task-1 dev table: one row per article line longer than one
# character, written to data/task1_dev.csv.
mypath = "data/dev-articles/"
articles = [mypath + f for f in listdir(mypath) if isfile(join(mypath, f))]
# Raw string so that "\d" reaches the regex engine untouched instead of being
# parsed as an invalid string escape (a SyntaxWarning on recent Pythons).
id_extractor = re.compile(r"\d+")
row = []
for path in articles:
    # The first run of digits found in the path is used as the article id.
    article_id = id_extractor.findall(path)[0]
    text = get_article(path)
    lines = text.split("\n")
    start_idx = 0
    for line in lines:
        # Fix: end_idx was never computed in this loop, so it either reused a
        # stale value left over from an earlier cell or raised NameError when
        # this cell ran on its own. Compute it per line, using the same
        # formula as the test-articles cell.
        # NOTE(review): the running offset advances by len(line) + 3 per line
        # (+2 here, +1 at the bottom of the loop); confirm this matches the
        # offsets expected for get_article() output.
        end_idx = len(line) + start_idx + 2
        if len(line) > 1:
            row.append({
                "article_id": article_id,
                # The former '"[SKIP]" if len(line) < 1' fallback could never
                # fire under the len(line) > 1 guard, so it has been dropped.
                "text": line.strip(),
                "article_path": path,
                "start_idx": start_idx,
                "end_idx": end_idx,
            })
        # NOTE(review): assumed to run for every line (including skipped
        # ones) so offsets keep advancing -- confirm against the original
        # indentation.
        start_idx = end_idx + 1
df = pd.DataFrame(row)
df.to_csv("data/task1_dev.csv")
# %%