-
Notifications
You must be signed in to change notification settings - Fork 1
/
extractlabeledtweets.py
126 lines (101 loc) · 5.87 KB
/
extractlabeledtweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import codecs, os
from operator import countOf
from collections import namedtuple
from emotionslist import *
class MySentences(object):
    """Re-iterable view of a UTF-8 text file as whitespace-split token lists.

    Each iteration pass re-opens ``file_name`` and yields one list of tokens
    per line (the result of ``str.split()`` with no arguments, so blank lines
    yield an empty list).
    """

    def __init__(self, file_name):
        """Remember the path of the file to read; the file is not opened here."""
        self.file_name = file_name

    def __iter__(self):
        """Yield ``line.split()`` for every line of the file, in order."""
        # The context manager guarantees the handle is released when the
        # generator is exhausted or discarded, instead of leaking one open
        # file per iteration pass.
        with codecs.open(self.file_name, 'r', 'utf-8') as handle:
            for line in handle:
                yield line.split()
class ExtractLabeledTweets(object):
    """Select emotion-labeled tweets from a raw tweet file using hashtags.

    ``extract_labels`` makes two passes over ``target_file``:

    1. Tweets whose hashtags hit exactly one root-emotion category are used
       to count the *other* hashtags they carry ("child" emotions), per
       category.  Child hashtags seen more than ``limit`` times are kept and
       then de-duplicated so each one belongs to at most one category.
    2. Every tweet whose root and derived child hashtags together point at
       exactly one category is written to ``<base>_e<ext>``; up to
       ``no_emotion_limit`` tweets carrying no emotion hashtag at all are
       written as well.

    NOTE(review): ``emotion_categories``, ``root_emotions_container`` and
    ``root_emotions_categorised`` come from the ``emotionslist`` module, which
    is not visible here.  This class only assumes that
    ``root_emotions_categorised`` can be indexed by category number and that
    both root collections support ``in`` -- confirm against that module.
    """

    # Published as a module-level global (not a class attribute) because the
    # methods below look it up by its bare name.
    global categories_length
    categories_length = len(emotion_categories)

    # Result record types, created once rather than on every call.
    _SingleEmotionProperty = namedtuple('single_emotion_property',
                                        'status tags category')
    _LabeledDataset = namedtuple('labeled_dataset',
                                 'file_name derived_emotions_categorised')

    def __init__(self, target_file, limit):
        """Store the input path and the child-hashtag frequency threshold.

        Args:
            target_file: path of the UTF-8 tweet file, one tweet per line.
            limit: a child hashtag is kept only if it was counted strictly
                more than ``limit`` times within a category.
        """
        self.target_file = target_file
        self.limit = limit
        self.line_count = 0
        self.no_emotion_count = 0
        self.no_emotion_limit = 20000
        # One {hashtag: count} dict per category index.
        self.child_emotions_categorised = [{} for _ in range(categories_length)]
        self.limited_child_emotions_categorised = [{} for _ in range(categories_length)]
        self.unduplicated_emotions_categorised = [{} for _ in range(categories_length)]
        # Flat {hashtag: count} over all categories; filled by extract_labels.
        self.limited_child_emotions_container = {}

    @staticmethod
    def _extract_hashtags(line):
        """Return the hashtags of ``line`` with the leading '#' removed."""
        return [word[1:] for word in line.split() if word.startswith('#')]

    @staticmethod
    def _categories_of(tags, categorised):
        """Return the set of category indices that contain any of ``tags``."""
        return {idx
                for idx in range(categories_length)
                for tag in tags
                if tag in categorised[idx]}

    @staticmethod
    def _report_progress(label, done, total):
        """Redraw a one-line progress indicator on stdout.

        The carriage return is written first so the label and percentage
        overwrite the previous indicator; writing it after the percentage
        would let the label immediately overwrite the number.
        """
        import sys  # local import keeps this block independent of file-level imports
        sys.stdout.write("\r{}{:.2f}".format(label, done * 100.0 / total))

    def _check_single_root_emotion(self, hashtags):
        """Classify ``hashtags`` against the root emotions.

        Returns:
            A record with fields ``status`` (True when the hashtags hit
            exactly one root category), ``tags`` (the hashtags that are root
            emotions) and ``category`` (sorted list of matched category
            indices).
        """
        tags = [tag for tag in hashtags if tag in root_emotions_container]
        category = sorted(self._categories_of(hashtags, root_emotions_categorised))
        return self._SingleEmotionProperty(len(category) == 1, tags, category)

    def _check_single_emotion(self, root_emotion, child_emotion):
        """Return True when the tags point at exactly one emotion category.

        Args:
            root_emotion: hashtags of the tweet that are root emotions.
            child_emotion: hashtags of the tweet that are derived child
                emotions.

        The categories of both tag lists are pooled, so a tweet is rejected
        as soon as root and child tags together span more than one category
        (e.g. two root categories plus one child, or one root category plus
        children from two categories).
        """
        categories = self._categories_of(root_emotion, root_emotions_categorised)
        categories |= self._categories_of(child_emotion,
                                          self.limited_child_emotions_categorised)
        return len(categories) == 1

    def _add_child_emotions(self, hashtags, category):
        """Increment the per-category occurrence count of each hashtag."""
        counts = self.child_emotions_categorised[category]
        for tag in hashtags:
            counts[tag] = counts.get(tag, 0) + 1
        return

    def _remove_duplicate_emotions(self, duplicated_emotions):
        """Keep each emotion only in the category where it is strictly most frequent.

        Args:
            duplicated_emotions: list (one entry per category index) of
                ``{emotion: count}`` dicts.

        Returns:
            A new list of dicts of the same shape.  An emotion is dropped
            from a category when any other category counts it at least as
            often, so an emotion tied between categories is dropped from all
            of them.  The result is also stored on
            ``self.unduplicated_emotions_categorised``.
        """
        # Built from scratch on every call so stale entries from a previous
        # run can never leak into the result.
        unduplicated = [{} for _ in range(categories_length)]
        for idx in range(categories_length):
            for emotion, count in duplicated_emotions[idx].items():
                beaten = any(
                    emotion in duplicated_emotions[other]
                    and count <= duplicated_emotions[other][emotion]
                    for other in range(categories_length)
                    if other != idx
                )
                if not beaten:
                    unduplicated[idx][emotion] = count
        self.unduplicated_emotions_categorised = unduplicated
        return unduplicated

    def _file_len(self, fname):
        """Return the number of lines in ``fname`` (0 for an empty file)."""
        count = 0
        # Binary mode: only newlines are counted, so no decoding can fail.
        with open(fname, 'rb') as f:
            for count, _ in enumerate(f, 1):
                pass
        return count

    def extract_labels(self):
        """Run both passes and write the labeled subset of the tweets.

        Returns:
            A record with fields ``file_name`` (path of the written file,
            ``<base>_e<ext>`` next to ``target_file``) and
            ``derived_emotions_categorised`` (the per-category
            ``{hashtag: count}`` dicts of derived child emotions).
        """
        with codecs.open(self.target_file, 'r', 'utf-8') as infile:
            raw_tweets = infile.readlines()
        # Use the number of lines actually iterated so the percentage always
        # ends at exactly 100 and the file is not read a second time.
        total_lines = len(raw_tweets)

        # Pass 1: count child hashtags of tweets with a single root category.
        self.line_count = 0
        for line in raw_tweets:
            self.line_count += 1
            self._report_progress("Data processed for root emotions. . . ",
                                  self.line_count, total_lines)
            hashtags = self._extract_hashtags(line)
            if not hashtags:
                continue
            single_emotion_result = self._check_single_root_emotion(hashtags)
            if single_emotion_result.status:
                child_hashtags = [tag for tag in hashtags
                                  if tag not in single_emotion_result.tags]
                self._add_child_emotions(child_hashtags,
                                         single_emotion_result.category[0])

        # Keep only sufficiently frequent child hashtags, then make every
        # surviving hashtag belong to at most one category.
        frequent = [
            {tag: count for tag, count in counts.items() if count > self.limit}
            for counts in self.child_emotions_categorised
        ]
        self.limited_child_emotions_categorised = self._remove_duplicate_emotions(frequent)
        self.limited_child_emotions_container = {
            tag: count
            for category in self.limited_child_emotions_categorised
            for tag, count in category.items()
        }
        self.cnt = 0
        self._end_progress()

        # splitext already yields the suffix-free base; str.rstrip(ext) would
        # strip a *set of characters* and mangle names such as "test.txt".
        base, ext = os.path.splitext(self.target_file)
        out_path = base + '_e' + ext

        # Pass 2: write single-emotion tweets plus a capped number of
        # emotion-free tweets.  The output is opened as UTF-8 because the
        # lines are decoded unicode text, and the context manager closes it
        # even if a write fails.
        self.line_count = 0
        with codecs.open(out_path, 'w', 'utf-8') as write_file:
            for line in raw_tweets:
                self.line_count += 1
                self._report_progress("Data processed for child emotions. . . ",
                                      self.line_count, total_lines)
                hashtags = self._extract_hashtags(line)
                root_emotion = [tag for tag in hashtags
                                if tag in root_emotions_container]
                child_emotion = [tag for tag in hashtags
                                 if tag in self.limited_child_emotions_container]
                if self._check_single_emotion(root_emotion, child_emotion):
                    write_file.write(line)
                elif self.no_emotion_count < self.no_emotion_limit:
                    if not root_emotion and not child_emotion:
                        self.no_emotion_count += 1
                        write_file.write(line)

        return self._LabeledDataset(out_path,
                                    self.limited_child_emotions_categorised)

    @staticmethod
    def _end_progress():
        """Terminate the progress line and leave one blank line after it."""
        import sys  # local import keeps this block independent of file-level imports
        sys.stdout.write("\n\n")