get_scores.py
import re
import unicodedata

import inflect
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# NLTK data needed at runtime: the tokenizer models for word_tokenize,
# the stopword list, WordNet for the lemmatizer, and the VADER lexicon
# for SentimentIntensityAnalyzer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words
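
# For reference, inflect spells out digit-only tokens as English words,
# e.g. number_to_words("3") -> "three" and number_to_words("42") -> "forty-two".
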
def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    stop_words = set(stopwords.words('english'))  # build the set once, not once per word
    new_words = []
    for word in words:
        if word not in stop_words:
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    """Run the full cleaning pipeline over a list of tokenized words"""
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    return words
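
# A quick illustration of what the pipeline above produces:
#   normalize(['The', 'café', 'scored', '9', '!'])
#   -> ['cafe', 'scored', 'nine']
# ('café' is ASCII-folded, '!' is stripped, '9' becomes 'nine',
# and 'the' is dropped as a stopword).
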
def get_scores(data):
    """Returns Sentiment Analysis Scores for each row's title in a 'scores' column"""
    sia = SentimentIntensityAnalyzer()
    lst_scores = []
    for index, row in data.iterrows():
        title = row['title']
        text_tokens = word_tokenize(title)
        cleaned_tokens = normalize(text_tokens)
        lemmas = lemmatize_verbs(cleaned_tokens)
        cleaned_sentence = " ".join(lemmas)  # rebuild the filtered, lemmatized sentence
        score = sia.polarity_scores(cleaned_sentence)
        lst_scores.append(score)
    data['scores'] = lst_scores
    return data

if __name__ == "__main__":
    # pass encoding='unicode_escape' to read_csv if encoding/decoding errors occur
    data = pd.read_csv("test.csv")
    data = get_scores(data)
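
# Each entry in the resulting 'scores' column is VADER's polarity dict with
# keys 'neg', 'neu', 'pos', and 'compound'. test.csv is assumed here to
# contain a 'title' column holding the text to score.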