# article_preprocessing.py
import pandas as pd
import nltk
import re
from re import sub
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import spacy
from spacy.lang.hi import Hindi
from spacy import displacy
from collections import Counter
import en_core_web_sm
from google.cloud import translate_v2 as translate
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# removing named entities
def remove_named_entities(text):
    nlp = en_core_web_sm.load()
    doc = nlp(text)
    words = text.split()
    named_entities = [X.text for X in doc.ents]
    words = [i for i in words if i not in named_entities]
    result = ' '.join(words)
    return result
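# Illustrative usage sketch (not from the original file; entity detection
# depends on the en_core_web_sm model, so the output is approximate). Note
# that the membership test compares whole entity strings against single
# words, so only single-token entities are actually removed here:
#   remove_named_entities("The match was played in Mumbai")
#   -> "The match was played in"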
# normalizing punctuation and whitespace (digits and basic punctuation are
# kept; other symbols become spaces)
def preprocess(text):
    text = text.lower()
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)
    return text
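# Illustrative example (not from the original file; exact spacing follows the
# regex rules above):
#   preprocess("Wow!!! C++ is hard, isn't it?")
#   -> roughly "wow ! ! ! c plus plus is hard isn t it ? "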
# tokenize and remove left over named entities as well as lemmatize
def tokenize_named_entities_removal(text):
    nlp = en_core_web_sm.load()
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc]
    tokens = [i for i in tokens if i != '-PRON-']
    named_entities = [X.text for X in doc.ents]
    words = [i for i in tokens if i not in named_entities]
    return words
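# Illustrative example (model-dependent, so treat the output as approximate):
#   tokenize_named_entities_removal("She was reading books in Delhi")
#   -> ['be', 'read', 'book', 'in']   # lemmas, with pronouns and entities dropped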
# removing stop words
def remove_stopwords(article):
    stopwords = nltk.corpus.stopwords.words('english')
    result = [i for i in article if i not in stopwords]
    return result
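# Example (uses NLTK's English stop word list downloaded above):
#   remove_stopwords(['this', 'is', 'a', 'clean', 'river'])
#   -> ['clean', 'river']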
# Preprocessing hindi
def preprocess_hin(article):
    text = sub(r'[a-zA-Z]', '', article)
    text = sub(r'[0-9][0-9]', '', text)
    text = sub(r'\n', '', text)
    text = sub(r',', '', text)
    text = sub(r':', '', text)
    text = sub(r'[0-9]', '', text)
    text = sub(r'।', '', text)
    text = sub(r'-', '', text)
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    # text = emoji_pattern.sub(r'', text)
    text = text.strip(string.punctuation)
    return text
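# Illustrative example (not from the original file): Latin letters, digits,
# the danda (।), commas, colons and hyphens are stripped, e.g.
#   preprocess_hin("यह 2020 की बात है।")
#   -> "यह  की बात है"   # internal spacing may vary slightly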
# tokenizing hindi data
def tokenize_hin(article):
    nlp = Hindi()
    doc = nlp(article)
    tokens = [token.text for token in doc]
    return tokens
# Getting Synonyms for a given word with polarity, and word origin
def get_synonyms_with_polarity(word, polarity):
    words = []
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            syn = {}
            syn['origin'] = word
            syn['word'] = lemma.name()
            syn['polarity'] = polarity
            words.append(syn)
    return words
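# Example sketch (WordNet-dependent; the exact synonym list will vary):
#   get_synonyms_with_polarity('happy', 'positive')
#   -> [{'origin': 'happy', 'word': 'happy', 'polarity': 'positive'},
#       {'origin': 'happy', 'word': 'felicitous', 'polarity': 'positive'}, ...]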
# Get translation
def translator(word, lang_code):
    translate_client = translate.Client()
    translation = translate_client.translate(word, target_language=lang_code)['translatedText']
    return translation
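# Usage sketch (requires Google Cloud Translation credentials, e.g. via the
# GOOGLE_APPLICATION_CREDENTIALS environment variable, and network access):
#   translator('peace', 'hi')   # -> a Hindi translation such as 'शांति'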
# Get the synonyms
def get_synonyms(word):
    syn = []
    for synset in wordnet.synsets(word['SynsetTerms']):
        for lemma in synset.lemmas():
            syn.append(lemma.name())
    syn = set(syn)
    synonyms = []
    try:
        syn.remove(word['SynsetTerms'])
    except KeyError:
        pass
    for i in syn:
        article = {}
        article['parent'] = word['SynsetTerms']
        article['word'] = i
        article['pos'] = word['PosScore']
        article['neg'] = word['NegScore']
        synonyms.append(article)
    synonyms = pd.DataFrame(synonyms)
    return synonyms
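# Example sketch: the argument is a dict shaped like one row of a
# SentiWordNet-style lexicon (the keys 'SynsetTerms', 'PosScore', 'NegScore'
# are assumed from the code above); the synonyms inherit the parent word's
# polarity scores:
#   get_synonyms({'SynsetTerms': 'happy', 'PosScore': 0.875, 'NegScore': 0.0})
#   -> DataFrame with columns ['parent', 'word', 'pos', 'neg']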
# loading the dictionary of ground positive and negative words
def load_dicts(filepath, lang_code):
    lang_dict = {
        'hi': 'hindi',
        'mr': 'Marathi',
        'ml': 'Malayalam',
        'gu': 'Gujarati',
        'kn': 'Kannada',
        'ur': 'Urdu',
        'bn': 'Bangla',
        'te': 'Telugu',
        'ta': 'Tamil',
        'or': 'Oriya',
        'pa': 'Punjabi'
    }
    df = pd.read_csv(filepath)
    word_map = pd.DataFrame()
    word_map['word'] = df[lang_dict[lang_code]]
    word_map['pos'] = df['PosScore']
    word_map['neg'] = df['NegScore']
    word_map = word_map.to_dict('records')
    return word_map
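# Usage sketch (assumes a CSV with 'PosScore' and 'NegScore' columns plus one
# column per language named as in lang_dict above; the file path here is
# hypothetical):
#   word_map = load_dicts('ground_truth_words.csv', 'hi')
#   # -> [{'word': '...', 'pos': 0.5, 'neg': 0.0}, ...]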
# loading the simple spacy tokenizer for english
def spacy_tokenizer(text):
    nlp = en_core_web_sm.load()
    doc = nlp(text)
    tokens = [token.text for token in doc]
    tokens = [i for i in tokens if i != '-PRON-']
    return tokens
# Sentence Segmentation using spacy for different languages
# Returns the array of strings each being a sentence in a whole article
def sentence_segmentation(article, lang_code):
    if lang_code == 'en':
        nlp = en_core_web_sm.load()
    elif lang_code == 'hi':
        nlp = Hindi()
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
    else:
        raise ValueError('Unsupported language code: ' + lang_code)
    doc = nlp(article)
    sentences = [i for i in doc.sents]
    sentences = [str(i) for i in sentences]
    return sentences
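# Example (segmentation is model/sentencizer dependent):
#   sentence_segmentation("First sentence. Second sentence.", 'en')
#   -> ['First sentence.', 'Second sentence.']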
# loading hindi stopwords
def load_hin_stopwords(filepath):
    with open(filepath, encoding='utf-8') as f:
        stopword = f.read().strip('\ufeff')
    stopword = stopword.split(", ")
    stopwords = [i.strip("'") for i in stopword]
    return stopwords
# Removing hindi stopwords from tokenized text
def remove_hin_stopwords(text, filepath):
    stopwords = load_hin_stopwords(filepath)
    text = [i for i in text if i not in stopwords]
    return text
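# Minimal end-to-end sketch of the English pipeline (not part of the original
# module). The sample text is hypothetical, and the spaCy model plus the NLTK
# downloads above must already be available.
if __name__ == '__main__':
    sample = "The rescue teams worked through the night to help the villagers."
    cleaned = preprocess(remove_named_entities(sample))
    tokens = remove_stopwords(tokenize_named_entities_removal(cleaned))
    print(tokens)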