SentimentAnalysis_FoodReview_R
# Install packages if needed
install.packages("tm")           # for text mining
install.packages("SnowballC")    # for text stemming
install.packages("wordcloud2")   # word-cloud generator
install.packages("RColorBrewer") # color palettes
install.packages("syuzhet")      # sentiment analysis (emotion scores for plotting)
install.packages("sentimentr")   # sentiment analysis per sentence
install.packages("ggplot2")      # for plotting graphs
# Load the packages
library("tm")
library("SnowballC")
library("wordcloud2")
library("RColorBrewer")
library("syuzhet")
library("sentimentr")
library("ggplot2")
library("dplyr")
# Read the review data from an Excel file on the local machine
library("readxl")
text <- read_excel("FoodReviews.xlsx")
# Load the data as a corpus
TextDoc <- Corpus(VectorSource(text$Text))
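# Optional sanity check: inspect the first two documents to confirm the
# corpus loaded as expected (the indices here are illustrative)
inspect(TextDoc[1:2])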
##### Cleaning up Text Data #####################################################
# Get the default English stopword list (174 words in tm)
default_stopwords <- stopwords()
# Keep negations out of the stopword list so they survive cleaning
words_to_exclude <- c("not", "doesn't")
# Create a custom stopword list by removing the words to exclude
custom_stopwords <- setdiff(default_stopwords, words_to_exclude)
custom_stopwords
# Update the TextDoc by removing the custom stopwords
TextDoc <- tm_map(TextDoc, removeWords, custom_stopwords)
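# Quick check (optional): the excluded negations should be absent from the
# custom list, so they remain in the corpus
c("not", "doesn't") %in% custom_stopwords # expect FALSE FALSE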
#####################################################################################
# Replace "/", "@", "|", and "!" with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
TextDoc <- tm_map(TextDoc, toSpace, "/")
TextDoc <- tm_map(TextDoc, toSpace, "@")
TextDoc <- tm_map(TextDoc, toSpace, "\\|")
TextDoc <- tm_map(TextDoc, toSpace, "!")
# Convert the text to lower case
TextDoc <- tm_map(TextDoc, content_transformer(tolower))
# Remove numbers
TextDoc <- tm_map(TextDoc, removeNumbers)
# Remove common English stopwords again after lowercasing, reusing the custom
# list so that "not" and "doesn't" are not stripped back out
TextDoc <- tm_map(TextDoc, removeWords, custom_stopwords)
# Remove your own stopwords - specify domain-specific words as a character vector
TextDoc <- tm_map(TextDoc, removeWords, c("service", "dogs", "food")) # example words
# Remove punctuation
TextDoc <- tm_map(TextDoc, removePunctuation)
# Eliminate extra white spaces
TextDoc <- tm_map(TextDoc, stripWhitespace)
# Text stemming - reduce words to their root form
TextDoc <- tm_map(TextDoc, stemDocument)
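# Optional: print one cleaned review to verify the transformations
# (lowercased, no numbers or punctuation, stemmed)
writeLines(as.character(TextDoc[[1]]))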
##########Building the term document matrix#######
# Build a term-document matrix
TextDoc_dtm <- TermDocumentMatrix(TextDoc)
dtm_m <- as.matrix(TextDoc_dtm)
# Sort by descending frequency
dtm_v <- sort(rowSums(dtm_m), decreasing = TRUE)
dtm_d <- data.frame(words = names(dtm_v),freq = dtm_v)
# Display the top 20 most frequent words
head(dtm_d, 20)
# Display the 10 least frequent words
# tail(dtm_d, 10)
dtm_d[1:20,]$freq
dtm_d[1:20,]$words
# Plotting the top 20 most frequent words as a bar chart is a good basic way
# to visualize the word frequency data.
# Plot the most frequent words
barplot(dtm_d[1:20,]$freq, las = 2, names.arg = dtm_d[1:20,]$words,
        col = rainbow(20), main = "Top 20 most frequent words",
        ylab = "WORD FREQUENCIES") # las = 2 draws axis labels perpendicular to the axes
# Generate the word cloud
wordcloud2(dtm_d,
size = 0.4,
shape = 'triangle',
rotateRatio = 0.9,
minSize = 1)
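# wordcloud2 returns an HTML widget rather than a base plot; to keep a copy,
# it can be saved via htmlwidgets (a wordcloud2 dependency). The file name
# below is illustrative.
wc <- wordcloud2(dtm_d, size = 0.4)
htmlwidgets::saveWidget(wc, "wordcloud.html")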
# Word association
# Find terms correlated with selected (stemmed) words
findAssocs(TextDoc_dtm, terms = c("like","flavor", "tast"), corlimit = 0.15)
# Find associations for words that occur at least 1000 times
findAssocs(TextDoc_dtm, terms = findFreqTerms(TextDoc_dtm, lowfreq = 1000), corlimit = 0.20)
# The top 10 words from dtm_d (head(dtm_d, 10)$words) can be passed directly
# as terms to findAssocs
findAssocs(TextDoc_dtm, terms = head(dtm_d, 10)$words, corlimit = 0.15)
# Sentiment scores (syuzhet was loaded above)
input <- iconv(text$Text) # normalize encoding; get_nrc_sentiment expects a character vector
sentiment <- get_nrc_sentiment(input)
head(sentiment, 20)
barplot(colSums(sentiment),
las = 2,
col = rainbow(10),
ylab = 'Count',
main = 'Sentiment Scores on Reviews')
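# Beyond the NRC emotion counts plotted above, syuzhet can also score overall
# polarity per review; a brief optional sketch using the default "syuzhet" method
syuzhet_scores <- get_sentiment(input, method = "syuzhet")
summary(syuzhet_scores)
hist(syuzhet_scores, main = "Distribution of review polarity", xlab = "Syuzhet score")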
# Plot two - share of words associated with each emotion, expressed as a percentage
barplot(
  sort(colSums(prop.table(sentiment[, 1:8]))) * 100,
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in Text", xlab = "Percentage"
)
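# sentimentr was loaded above but not yet used; as noted at installation, it
# scores sentiment sentence by sentence. A minimal sketch on the raw reviews
# (column names follow the sentimentr documentation).
sentences <- get_sentences(input)
sentence_scores <- sentiment(sentences)
head(sentence_scores, 10) # element_id, sentence_id, word_count, sentiment
# Aggregate to one average score per review
review_scores <- sentiment_by(sentences)
head(review_scores, 10) # ave_sentiment gives the review-level polarity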