-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_words.py
49 lines (40 loc) · 1.49 KB
/
preprocess_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import re
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
stemmer = SnowballStemmer("english")
# Load the stopword list once at import time. Use a context manager so the
# file handle is closed promptly (the original left it open), and pin the
# encoding so the read does not depend on the platform default.
with open(r'stopwords.txt', 'r', encoding='utf-8') as _stopwords_file:
    stopwords = _stopwords_file.read().splitlines()
def process_text(text):
    '''Return a deduplicated list of processed words plus the URLs found in text.

    Pipeline: URLs are extracted first and preserved verbatim; the remaining
    text has punctuation replaced by spaces, is tokenized, lowercased,
    stripped of stopwords and too-short words, then stemmed.

    NOTE: result order is unspecified because duplicates are removed via set().
    '''
    from constants import URL_REGEX
    # Pull URLs out first so the punctuation stripping below cannot mangle
    # them; they are re-appended untouched at the end.
    urls = re.findall(URL_REGEX, text)
    text = re.sub(URL_REGEX, '', text)
    # Replace every punctuation character with a space in a single C-level
    # pass (equivalent to the chained per-character .replace() calls, but
    # O(len(text)) instead of O(len(punctuation) * len(text))).
    text = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    # Tokenize and lowercase.
    words = map(str.lower, word_tokenize(text))
    # Drop common English stopwords.
    words = filter(not_stopword, words)
    # Drop words below MIN_WORD_LEN. This runs before stemming, so the
    # final stems themselves may be shorter than the threshold.
    words = filter(not_too_short, words)
    # Stem what remains.
    words = map(stemmer.stem, words)
    # Deduplicate both lists and combine.
    return list(set(words)) + list(set(urls))
def keep_only_letters(word):
    '''Return word with every non-ASCII-letter character removed.

    Given a string like "ana1.bbc", return "anabbc".

    Bug fix: the previous version returned the raw filter iterator
    (Python 3 `filter` is lazy), not the string the docstring promises;
    join the kept characters back into a str.
    '''
    return ''.join(c for c in word if 'a' <= c <= 'z' or 'A' <= c <= 'Z')
def not_stopword(word):
    '''Predicate: True when word is absent from the loaded stopword list.'''
    is_stop = word in stopwords
    return not is_stop
def not_too_short(word):
    '''Predicate: True when word meets the configured minimum length.'''
    from constants import MIN_WORD_LEN
    return not (len(word) < MIN_WORD_LEN)