-
Notifications
You must be signed in to change notification settings - Fork 5
/
_NaturalLanguageProcessing.py
115 lines (102 loc) · 5.34 KB
/
_NaturalLanguageProcessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# TODO: Implementation of the AI
from nltk.stem import \
PorterStemmer # this is the steamer which will be used for the Steaming of the Tokenized Words
from nltk.tokenize import word_tokenize, sent_tokenize, \
PunktSentenceTokenizer # importing both of the tokenization packages form the modules
from nltk.corpus import \
state_union # this imports the state union function which need to be taken in the corpus as the state union of the words.
import nltk # importing the natural lagnuage processing package
from nltk.corpus import stopwords # locating the stop words.
class NaturalProcessing:
    """Convenience wrapper around a handful of NLTK text-processing calls.

    Groups word/sentence tokenization, Porter stemming, stop-word filtering,
    and part-of-speech / named-entity tagging. Instances hold no state.
    """

    def __init__(self):
        print("NONCE!!!!")

    def recognize_text(self):
        """Placeholder; currently only prints an empty line."""
        print("")

    def steam_word_port(self, text=""):
        """Return the Porter stem of every word token in *text*.

        Args:
            text: Raw text to tokenize and stem.

        Returns:
            A list holding one stemmed token per word token, in input order.
            An empty list when *text* is the empty string.
        """
        if text == "":
            return []
        stemmer = PorterStemmer()
        # The stems used to be collected and then discarded by a bare
        # ``return``; hand them back to the caller instead.
        return [stemmer.stem(token) for token in word_tokenize(text)]

    def word_tokeniztion(self, text="", sent_tokenized=True):
        """Split *text* into sentences or into words.

        Args:
            text: Raw text to tokenize.
            sent_tokenized: When truthy, split into sentences; otherwise
                split into word tokens.

        Returns:
            The list produced by ``sent_tokenize`` or ``word_tokenize``, or
            ``None`` when *text* is the empty string.
        """
        if text == "":
            return None
        if sent_tokenized:
            return sent_tokenize(text)
        return word_tokenize(text)

    def stop_words_exclude(self, sentences=''):
        """Return the word tokens of *sentences* that are not stop words.

        Tokens are compared against the NLTK English stop-word list by exact
        string membership, so a token whose casing differs from the list
        entry is kept.

        Args:
            sentences: Raw text to tokenize and filter.

        Returns:
            A list of the remaining tokens, in their original order.
        """
        stop_words = set(stopwords.words('english'))
        return [w for w in word_tokenize(sentences) if w not in stop_words]

    # NLTK can label each word with its part of speech (noun, adjective,
    # verb, ...). POS tag list:
    #
    #   CC    coordinating conjunction
    #   CD    cardinal digit
    #   DT    determiner
    #   EX    existential there (like: "there is" ... think "there exists")
    #   FW    foreign word
    #   IN    preposition/subordinating conjunction
    #   JJ    adjective 'big'
    #   JJR   adjective, comparative 'bigger'
    #   JJS   adjective, superlative 'biggest'
    #   LS    list marker 1)
    #   MD    modal could, will
    #   NN    noun, singular 'desk'
    #   NNS   noun plural 'desks'
    #   NNP   proper noun, singular 'Harrison'
    #   NNPS  proper noun, plural 'Americans'
    #   PDT   predeterminer 'all the kids'
    #   POS   possessive ending parent's
    #   PRP   personal pronoun I, he, she
    #   PRP$  possessive pronoun my, his, hers
    #   RB    adverb very, silently,
    #   RBR   adverb, comparative better
    #   RBS   adverb, superlative best
    #   RP    particle give up
    #   TO    to go 'to' the store.
    #   UH    interjection errrrrrrrm
    #   VB    verb, base form take
    #   VBD   verb, past tense took
    #   VBG   verb, gerund/present participle taking
    #   VBN   verb, past participle taken
    #   VBP   verb, sing. present, non-3d take
    #   VBZ   verb, 3rd person sing. present takes
    #   WDT   wh-determiner which
    #   WP    wh-pronoun who, what
    #   WP$   possessive wh-pronoun whose
    #   WRB   wh-adverb where, when

    def process_content(self, tokenized='', limit=5):
        """POS-tag, named-entity chunk, draw, and print leading sentences.

        For each of the first *limit* items of *tokenized*: word-tokenize it,
        tag the words, build the named-entity tree, call ``draw()`` on that
        tree, and print the tagged ``(word, tag)`` list.

        Args:
            tokenized: Sequence of sentence strings.
            limit: Maximum number of leading sentences to process. ``None``
                processes all of them. Defaults to 5.

        Any exception raised while processing is printed rather than
        propagated, and stops the remaining sentences from being processed.
        """
        try:
            for sentence in tokenized[:limit]:
                words = nltk.word_tokenize(sentence)
                tagged = nltk.pos_tag(words)
                named_ent = nltk.ne_chunk(tagged, binary=False)
                named_ent.draw()
                print(tagged)
        except Exception as e:
            # Deliberate best-effort: report the failure and carry on.
            print(str(e))

    def partofspeechtag(self, sentences=''):
        """Sentence-split *sentences* and run ``process_content`` on them.

        A ``PunktSentenceTokenizer`` is built from the ``2005-GWBUSH.txt``
        State of the Union text and then used to split *sentences*.

        Args:
            sentences: Raw text to split and tag.
        """
        train_text = state_union.raw("2005-GWBUSH.txt")
        # The 2006 sample text was previously loaded here but never used;
        # the caller-supplied text is what gets tokenized.
        custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
        tokenized = custom_sent_tokenizer.tokenize(sentences)
        self.process_content(tokenized)