forked from UTSAVS26/PyVerse
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnltk_utils.py
50 lines (42 loc) · 1.41 KB
/
nltk_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import numpy as np
import nltk
# nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# def tokenize(sentence):
# """
# split sentence into array of words/tokens
# a token can be a word or punctuation character, or number
# """
# return nltk.word_tokenize(sentence)
def tokenize(sentence):
if sentence is None or sentence.strip() == "":
# If the input sentence is None or empty, return an empty list or handle accordingly
return []
return nltk.word_tokenize(sentence)
def stem(word):
"""
stemming = find the root form of the word
examples:
words = ["organize", "organizes", "organizing"]
words = [stem(w) for w in words]
-> ["organ", "organ", "organ"]
"""
return stemmer.stem(word.lower())
def bag_of_words(tokenized_sentence, words):
"""
return bag of words array:
1 for each known word that exists in the sentence, 0 otherwise
example:
sentence = ["hello", "how", "are", "you"]
words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
bog = [ 0 , 1 , 0 , 1 , 0 , 0 , 0]
"""
# stem each word
sentence_words = [stem(word) for word in tokenized_sentence]
# initialize bag with 0 for each word
bag = np.zeros(len(words), dtype=np.float32)
for idx, w in enumerate(words):
if w in sentence_words:
bag[idx] = 1
return bag