-
Notifications
You must be signed in to change notification settings - Fork 42
/
filter_tweets.py
28 lines (22 loc) · 907 Bytes
/
filter_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Load the spam lexicon once at import time: one term per line of
# 'spam_lexicon.txt', with blank and whitespace-only lines dropped.
# Terms are kept exactly as written (not stripped). The `with` block
# closes the file handle as soon as the contents have been read.
with open('spam_lexicon.txt', encoding='utf-8') as lexicon_file:
    spam = [word for word in lexicon_file.read().split('\n') if word.strip()]
def has_spam(tweet, lexicon=None):
    """Return True if *tweet* contains any spam term as a substring.

    Args:
        tweet: Text to inspect.
        lexicon: Optional iterable of spam terms to check against. When
            omitted (``None``), the module-level ``spam`` list is used.

    Returns:
        True if at least one term occurs anywhere in ``tweet``,
        otherwise False. An empty lexicon always yields False.

    Matching is plain, case-sensitive substring containment, so a term
    also matches when it appears inside a longer word.
    """
    terms = spam if lexicon is None else lexicon
    return any(term in tweet for term in terms)
def filter_tweets(infile, outfile, label):
    """Copy the non-blank, non-spam tweets of *infile* to *outfile* as TSV.

    Args:
        infile: Path of a UTF-8 text file holding one tweet per line.
        outfile: Path of the UTF-8 file to write; it is overwritten.
        label: String written in the first column of every output row.

    Each kept tweet is written as ``label<TAB>tweet`` followed by a
    newline. Lines that are empty or whitespace-only, and lines for which
    ``has_spam`` returns True, are skipped.
    """
    # The input is opened first so that a missing or unreadable input file
    # raises before the output file is created or truncated. Both handles
    # are closed (and the output flushed) when the `with` block exits.
    with open(infile, encoding='utf-8') as source, \
            open(outfile, encoding='utf-8', mode='w') as sink:
        for line in source:
            # Iterating a text file splits on '\n' only, so dropping the
            # trailing newline yields the same tweets as
            # read().split('\n') without loading the whole file.
            tweet = line.rstrip('\n')
            if not tweet.strip() or has_spam(tweet):
                continue
            sink.write(label + '\t' + tweet + '\n')
if __name__ == '__main__':
    # One (input text file, output TSV file, label) entry per sentiment
    # class; the entries are processed in the order listed.
    jobs = (
        ('arabic_tweets_txt/positive_tweets_arabic_20181206_1k.txt',
         'arabic_tweets_tsv/pos_20181206_1k.tsv',
         'pos'),
        ('arabic_tweets_txt/negative_tweets_arabic_20181206_1k.txt',
         'arabic_tweets_tsv/neg_20181206_1k.tsv',
         'neg'),
    )
    for source_path, target_path, sentiment in jobs:
        filter_tweets(source_path, target_path, sentiment)
    print('all done')