-
Notifications
You must be signed in to change notification settings - Fork 0
/
nltk_utils.py
52 lines (41 loc) · 1.22 KB
/
nltk_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import nltk
import numpy as np
# One-time setup: uncomment the next line to download the Punkt tokenizer data.
#nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
# Module-level Porter stemmer, created once at import time and reused by
# stemming(); the Porter algorithm is rule-based, so no training is needed.
stemmer = PorterStemmer()
def tokenization(sentence):
    """
    Split a sentence into its individual tokens.

    sentence : String
    Returns the result of nltk.word_tokenize for that sentence.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens
def stemming(word):
    """
    Lower-case a word and reduce it to its stem, so that different forms
    of the same word are grouped together.

    word : String
    Returns the stem produced by the module-level Porter stemmer.
    """
    lowered = word.lower()
    return stemmer.stem(lowered)
def matrice_of_word(tokenization_sentence, all_words):
    """
    Build a bag-of-words vector for an already tokenized sentence.

    Every token of the sentence is passed through stemming(); the entries
    of all_words are compared exactly as given (they are not stemmed here).

    tokenization_sentence : iterable of String tokens
    all_words : sequence of String, the vocabulary
    Returns a 1-D numpy float array of length len(all_words), holding 1.0
    at each position whose vocabulary word is among the stemmed tokens and
    0.0 everywhere else.

    Example (assuming each token stems to itself):
    tokenization_sentence = ["hy","who","are","you"]
    all_words = ["bonjour","hy","I","who","are","you"]
    result    = [0.0, 1.0, 0.0, 1.0, 1.0, 1.0]
    """
    # Build the set once: O(1) membership tests instead of rescanning the
    # token list for every vocabulary word.
    stemmed_tokens = {stemming(token) for token in tokenization_sentence}
    bin_list = np.zeros(len(all_words))
    # Named `index` rather than `id` so the builtin id() is not shadowed.
    for index, word in enumerate(all_words):
        if word in stemmed_tokens:
            bin_list[index] = 1.0
    return bin_list
# Manual smoke tests (uncomment to run):
# a = "How long does shipping take?"
# print(a)
# a = tokenization(a)
# print(a)
# words = ['Organize','organizes','organizing','penis']
# stemmed_word = [stemming(w) for w in words]
# print(stemmed_word)
# sentence = ["hy","who","are","you"]
# words = ["bonjour","hy","I","who","are","you"]
# print(matrice_of_word(sentence,words))