-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_utils.py
20 lines (17 loc) · 934 Bytes
/
text_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import re
# Common English function words (plus the fragments "s", "t" and "don" left
# behind when contractions are split on the apostrophe) that are dropped
# during stopword filtering. Listed alphabetically, one row per initial.
STOPWORDS = {
    "a", "about", "after", "again", "am", "an", "and", "are", "as", "at",
    "be", "because", "before", "being", "but", "by",
    "can",
    "did", "do", "does", "doing", "don", "down",
    "for", "from",
    "had", "has", "have", "here", "how",
    "i", "if", "in", "is", "it", "its",
    "just",
    "nor",
    "of", "on", "or",
    "s", "should", "so",
    "t", "than", "that", "the", "then", "there", "these", "those", "to", "too",
    "until",
    "was", "were", "what", "when", "where", "which", "while", "who", "whom",
    "why", "will",
}
def word_tokenise(raw_text):
    """Split text into a list of lowercase, ASCII-letter-only words.

    Every character outside ``a-z`` / ``A-Z`` (digits, punctuation,
    whitespace, and any non-ASCII character) acts as a word separator, so
    ``"don't"`` is tokenised as ``["don", "t"]``.

    Args:
        raw_text: The string to tokenise.

    Returns:
        The words in order of appearance, lowercased. The list is empty
        when the text contains no ASCII letters.
    """
    # Anything that is not an ASCII letter becomes a space, so the
    # whitespace split below yields only unbroken runs of letters.
    letters_only = re.sub(r"[^a-zA-Z]", " ", raw_text)
    return letters_only.lower().split()
def remove_stopwords(word_list, stopwords=None):
    """Return the words from ``word_list`` that are not stopwords.

    Matching is by exact equality, so words are compared as given (no
    case folding is applied here). Order and duplicates of the remaining
    words are preserved.

    Args:
        word_list: Iterable of words to filter.
        stopwords: Optional container of words to drop. When omitted (or
            ``None``), the module-level ``STOPWORDS`` set is used.

    Returns:
        A new list holding the words that are not in the stopword
        container.
    """
    # Compare against None rather than truthiness so that an explicitly
    # passed empty container means "filter nothing".
    if stopwords is None:
        stopwords = STOPWORDS
    return [word for word in word_list if word not in stopwords]