forked from TeamLIR/SigmaLaw-PBSA-Final
-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_input.py
84 lines (75 loc) · 3.26 KB
/
process_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import re
import csv
import string
import pandas as pd
def remove_punct(text):
    """Return *text* with every ASCII punctuation character deleted.

    The characters removed are exactly those listed in
    ``string.punctuation``; letters, digits and whitespace are untouched.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)
def find_nth(sent, aspect, n):
    """Return the character offset of the *n*-th whole-word match of *aspect*.

    *sent* is split on single spaces and each resulting token is compared
    for exact equality with *aspect*. Occurrences are counted from 1. The
    returned offset is the position in *sent* where the matching token
    starts. ``None`` is returned when fewer than *n* matches exist.
    """
    offset = 0  # start position of the current token inside ``sent``
    seen = 0    # matches encountered so far
    for token in sent.split(' '):
        if token == aspect:
            seen += 1
            if seen == n:
                return offset
        # Advance past this token and the single space that followed it.
        offset += len(token) + 1
    return None
# Characters that wrap the stringified list stored in the aspect column
# (quotes and square brackets); they are deleted before further cleaning.
_LIST_SYNTAX_CHARS = str.maketrans("", "", "'[\"]")


def _parse_aspect_list(field):
    """Split a stringified list such as ``"['a', 'b']"`` into clean, lowercase, non-empty items."""
    cleaned = (part.translate(_LIST_SYNTAX_CHARS).strip().lower()
               for part in field.split(","))
    return [item for item in cleaned if item]


def _normalise(text):
    """Remove punctuation from *text* and trim leading/trailing whitespace."""
    return re.sub(r"^\s+|\s+$", "", remove_punct(text))


def _locate_aspect(sent, raw_aspect):
    """Return ``(aspect_text, start, end)`` for *raw_aspect* in *sent*, or ``None`` if it is not found."""
    cleaned = _normalise(raw_aspect)

    if len(cleaned.split()) > 1:
        # Multi-word aspect: plain substring search on the cleaned sentence.
        start_index = sent.find(cleaned)
        if start_index == -1:
            print(sent + " - " + cleaned)
            return None
        return cleaned, start_index, start_index + len(cleaned)

    if not any(char.isdigit() for char in raw_aspect):
        # Plain single-word aspect: use its first whole-word occurrence.
        target, occurrence = cleaned, 1
    else:
        # The raw aspect carries a digit: its last character selects which
        # occurrence to use and the last two characters (separator + digit)
        # are cut off to recover the word itself.
        target, occurrence = raw_aspect[:-2], int(raw_aspect[-1])

    start_index = find_nth(sent, target, occurrence)
    if start_index is None:
        return None
    return target, start_index, start_index + len(target)


def process_input(raw_input, output_path='./datasets/semeval14/processed.csv'):
    """Convert a raw sentence/aspect CSV into a per-aspect CSV with character spans.

    The first row of *raw_input* is treated as a header and skipped. For
    every following row, column 0 is the sentence and column 1 is a
    comma-separated, list-like string of aspect terms. The sentence is
    lowercased, stripped of punctuation and trimmed; each aspect is then
    located in it and one output record is produced per located aspect.
    Aspects that cannot be located are skipped.

    The output CSV has the columns ``content``, ``aspect``, ``sentimet``,
    ``from`` and ``to``. ``sentimet`` is always ``0``.

    NOTE(review): the column name ``sentimet`` is misspelled but is kept
    unchanged because readers of the generated file may depend on it.
    NOTE(review): earlier revisions called ``str.replace('\\d+', '')`` on the
    sentence and the aspect and discarded the result; that never had any
    effect (``str.replace`` is not regex-aware), so digits are still kept.

    Args:
        raw_input: Path of the UTF-8 encoded CSV file to read.
        output_path: Path of the CSV file to write. Defaults to the
            location that was previously hard-coded.

    Raises:
        ValueError: If a single-word aspect contains a digit but its last
            character cannot be converted with ``int``.
    """
    content, aspect, start, end = [], [], [], []

    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(raw_input, 'r', encoding="utf8", newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the header row
        for row in reader:
            # Blank lines come back as [] and short rows lack the aspect
            # column; neither can be processed.
            if len(row) < 2:
                continue

            sent = row[0].lower()
            print(sent)
            sent = _normalise(sent)

            for raw_aspect in _parse_aspect_list(row[1]):
                located = _locate_aspect(sent, raw_aspect)
                if located is None:
                    continue
                aspect_text, start_index, end_index = located
                content.append(sent)
                aspect.append(aspect_text)
                start.append(start_index)
                end.append(end_index)

    sentiment = 0
    test_data = {'content': content, 'aspect': aspect, 'sentimet': sentiment, 'from': start, 'to': end}
    print(aspect)
    frame = pd.DataFrame(test_data, columns=test_data.keys())
    frame.to_csv(output_path, index=False)