-
Notifications
You must be signed in to change notification settings - Fork 1
/
basics.py
86 lines (77 loc) · 3.35 KB
/
basics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import nltk,re,string
from textblob import TextBlob as tb
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize,word_tokenize,wordpunct_tokenize,ne_chunk,pos_tag
# Tokenization demo: split one sample string three different ways and show
# each result right after it is computed.
sample = 'Hello world. This is a sample text by Kimaru Thagana'

# Sentence-level split of the text.
tokenized_sentence = sent_tokenize(sample)
print(tokenized_sentence)

# Word-level split of the text.
tokenized_words = word_tokenize(sample)
print(tokenized_words)

# Word-level split that also separates out punctuation.
words = wordpunct_tokenize(sample)
print(words)

# Part-of-speech tagging of the word tokens.
print(pos_tag(tokenized_words))
def entities(text):
    """Run named-entity chunking over *text*.

    The text is word-tokenized, the tokens are part-of-speech tagged, and the
    tagged tokens are handed to ``ne_chunk``.

    Args:
        text: Raw text to analyse.

    Returns:
        The result of ``ne_chunk`` for the tagged tokens; the caller in this
        script invokes ``.pprint()`` on it.
    """
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    return ne_chunk(tagged_tokens)
# Chunk a sample news sentence into named entities and pretty-print the result.
comment = entities(
    "When asked about the comments, Obama told the BBC: 'The UK will not be able to negotiate something with the US;"
)
comment.pprint()
# Using TextBlob to capture sentiment.
#
# Polarity     - how positive or negative the sentiment is
#                (1.0 is positive, -1.0 is negative).
# Subjectivity - a measure of how subjective the text is, i.e. how much it is
#                influenced by emotions and opinions and is open to
#                interpretation.
text1 = "The food at radison was not so good"
text2 = "I hate you"
# One sentiment per line, in the same order as the sample texts.
print(tb(text1).sentiment, tb(text2).sentiment, sep="\n")
####################
# Text Pre-Processing Techniques
####################
# The first step in text processing is normalization. This involves:
# 1. Converting text to lower or upper case with the .lower()/.upper()
#    string methods.
# 2. Removing numbers when they are not important to the analysis.
input_str = "Box A contains 3 red and 5 white balls, while Box B contains 4 red and 2 blue balls."
# Every run of digits is replaced by the empty string; the surrounding spaces
# are left as they are.
result = re.compile(r"\d+").sub("", input_str)
print(result)
# 3. Removing punctuation.
# The literal previously ended with a typographic quote, which left the string
# open and pulled the trailing "# Sample string" comment into the data.
input_str = "This &is [an] example? {of} string. with.? punctuation!!!!"  # Sample string
# str.maketrans with a third argument builds a table that deletes those
# characters; string.maketrans and the two-argument translate() call exist
# only on Python 2.
result = input_str.translate(str.maketrans("", "", string.punctuation))
print(result)
# 4. Removing stop words (they do not convey meaning, hence are not useful in
#    processing).
input_str = "NLTK is a leading platform for building Python programs to work with human language data."
tokens = word_tokenize(input_str)
# A set gives constant-time membership checks while filtering the tokens.
stop_words = set(stopwords.words("english"))
result = [token for token in tokens if token not in stop_words]
print(result)
# 5. Stemming. See stemming.py for an example.
# 6. Lemmatization. Works the same way as stemming but uses lexical knowledge
#    bases to find the correct root form of a word.
lemmatizer = WordNetLemmatizer()
input_str = "been had done languages cities mice"
# input_str is rebound from the raw string to its list of word tokens.
input_str = word_tokenize(input_str)
for word in input_str:
    # NOTE(review): lemmatize() is called without a part-of-speech argument;
    # per the NLTK docs it then treats the word as a noun, so verb forms such
    # as "been"/"had"/"done" are presumably printed unchanged - confirm.
    print(lemmatizer.lemmatize(word))
#7 Chunking. An example can be found in chunking.py
'''
text1 = "I love the food at radison but the waiters were not good"
text2 = "I do not hate you"
text1 = "I love the food at radison but the waiters were not good"
text2 = "I do not like you"
text1 = "I love the food at radison but the waiters were not good"
text2 = "I hate you"
text1 = "I love the food at radison but the waiters were rude"
text2 = "I hate you"
text1 = "I love the food at radison. It was nice"
text2 = "I hate you"
text1 = "The food at radison was awsome"
text2 = "I hate you"
text1 = "The food at radison was good"
text2 = "I hate you"
text1 = "The food at radison was really awsome"
text2 = "I hate you"
text1 = "The food at radison was not so good"
text2 = "I hate you"
text1 = "The food at radison was not so bad"
text2 = "I hate you"
'''