-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrude-oil-sentiment-analysis.R
343 lines (260 loc) · 10.1 KB
/
crude-oil-sentiment-analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
## ----message=FALSE-------------------------------------------------------
library(knitr) # Export R Notebook to R Script
library(qdap) # Quantitative Discourse Analysis Package
library(RCurl) # For HTTP Requests
library(stringr) # For String Manipulation
library(tidytext) # Natural Language Processing package in R
library(tidyverse) # For data wrangling
library(tm) # Text Mining
library(wordcloud) # Generation of Word Clouds
## ----read_data, message=FALSE--------------------------------------------
# Load the collected #crudeoil tweets into a tibble
crude_df <- read_csv(file = "tweets.csv")
## ----preprocess_text-----------------------------------------------------
# Clean the raw tweet text before tokenization and sentiment scoring.
# NOTE(review): order matters in this pipeline -- URLs and contractions
# must be handled BEFORE punctuation is stripped, otherwise "http://t.co/x"
# and "don't" lose the characters those steps rely on.
# Remove the "RT" retweet marker (word-boundary match, so words that merely
# contain the letters "rt" are left intact)
crude_df$text <- gsub(pattern = "\\bRT\\b", replacement = "", crude_df$text)
# Remove all Twitter user handles
crude_df$text <- gsub(pattern = "@\\w+", replacement = "", crude_df$text)
# Remove URL links while the "://" separator is still present
crude_df$text <- gsub("http[^[:space:]]*", "", crude_df$text)
# Replace contractions (e.g. "don't" -> "do not") while apostrophes survive
crude_df$text <- replace_contraction(crude_df$text)
# Convert all tweets to lower case
crude_df$text <- tolower(crude_df$text)
# Remove ASCII punctuation
crude_df$text <- removePunctuation(crude_df$text)
# Remove curly quotation marks (both opening and closing), which
# removePunctuation can leave behind
crude_df$text <- gsub(pattern = '[“”]', replacement = "", crude_df$text)
# Define a list of uninformative stop-words for removal; the text is
# already lower-cased here, so only lower-case variants are needed
crude_stopwords <- c("crudeoil", "crude", "oil", "oott", stopwords("en"))
# Remove all stop words
crude_df$text <- removeWords(crude_df$text, crude_stopwords)
# Collapse the runs of whitespace left behind by the removals above
crude_df$text <- stripWhitespace(crude_df$text)
# Examine the cleaned text
crude_df$text[1:10]
## ----plot_wordfreqs------------------------------------------------------
# Visualize the ten most frequent tokens across all cleaned tweets
crude_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  head(n = 10) %>%
  ggplot(mapping = aes(x = reorder(word, desc(n)), y = n, fill = word)) +
  geom_col() +
  labs(
    title = "Word Frequency in #crudeoil tweets",
    subtitle = paste("Data collected on", Sys.Date()),
    caption = "Source: Twitter API",
    x = "Word",
    y = "Frequency") +
  theme_minimal() +
  theme(legend.position = "none")
## ----plot_wordcloud, warning=FALSE---------------------------------------
# Draw a word cloud of the cleaned tweet text (wordcloud() computes term
# frequencies from the raw strings itself)
cloud_palette <- brewer.pal(8, "Dark2")
wordcloud(
  words = crude_df$text,
  max.words = 250,
  min.freq = 15,
  random.color = TRUE,
  colors = cloud_palette
)
## ----compute_polarity----------------------------------------------------
# Score each tweet with qdap's dictionary-based polarity measure
crude_polarity <- polarity(text.var = crude_df$text)
# Summarize the distribution of per-tweet polarity scores
summary(crude_polarity$all$polarity)
## ----plot_polarity, warning=FALSE----------------------------------------
# Visualize the polarity scores as a density-scaled histogram with a
# kernel density curve overlaid.
# FIX: `..density..` notation is deprecated in ggplot2 -- use
# after_stat(density) -- and the y axis is a density, not a raw frequency,
# so label it accordingly.
ggplot(crude_polarity$all, aes(x = polarity, y = after_stat(density))) +
  geom_histogram(binwidth = 0.25, fill = "#bada55", colour = "grey60") +
  geom_density(color = "darkblue") +
  labs(
    title = "qdap Polarity Scores of #crudeoil tweets",
    subtitle = paste("As of", Sys.Date()),
    caption = "Source: Twitter API",
    x = "Polarity Score",
    y = "Density"
  )
## ----store_vector--------------------------------------------------------
# Pull the cleaned tweets out into a plain character vector
tweet_vector <- crude_df[["text"]]
# Peek at the first few tweets
head(tweet_vector)
## ----create_corpus-------------------------------------------------------
# Wrap the tweet vector in a Volatile (in-memory) Corpus for the tm package
tweet_corpus <- VCorpus(VectorSource(tweet_vector))
# Build a Document-Term Matrix: one row per tweet, one column per term
tweet_dtm <- DocumentTermMatrix(tweet_corpus)
# Densify for inspection -- DTMs are stored sparse; fine at tweet scale,
# but memory-heavy for large corpora
tweet_dtm_matrix <- as.matrix(tweet_dtm)
# Examine the structure of the dense matrix
str(tweet_dtm_matrix)
## ----tidy_data-----------------------------------------------------------
# Reshape the sparse DTM into tidy (document, term, count) triples,
# one row per non-zero cell, for tidytext-style joins below
tidy_tweets <- tidy(tweet_dtm)
# Examine the tidied dataset
head(tidy_tweets)
## ----def_bing_polarity---------------------------------------------------
# Store the "Bing" lexicon from tidytext
bing <- get_sentiments(lexicon = "bing")
# Join the Bing lexicon onto the tidied DTM and compute a per-document
# polarity score (positive term occurrences minus negative ones).
# FIX: weight by the term's in-document count (`wt = count`) -- counting
# rows alone ignores repeated terms within a tweet. pivot_wider() replaces
# the superseded spread().
tweet_polarity_bing <-
  tidy_tweets %>%
  inner_join(bing, by = c("term" = "word")) %>%
  mutate(index = as.numeric(document)) %>%
  count(sentiment, index, wt = count) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(polarity = positive - negative)
# Examine the mutated dataset
str(tweet_polarity_bing)
## ----plot_bing-----------------------------------------------------------
# Plot the smoothed polarity trend over the collection window.
# FIX: `aes(rev(index), polarity)` reverses the x vector independently of
# y, scrambling the (x, y) pairing whenever index has gaps. Reverse the
# axis itself instead -- tweets arrive newest-first, so a reversed index
# axis reads left-to-right as time.
ggplot(data = tweet_polarity_bing, aes(index, polarity)) +
  geom_smooth() +
  scale_x_reverse() +
  labs(
    title = "Polarity of #CrudeOil tweets (With Bing Lexicon)",
    caption = "Source: Twitter API, Bing Lexicon",
    x = "Time",
    y = "Polarity"
  ) +
  theme_minimal()
## ----def_afinn_polarity--------------------------------------------------
# Store the "AFINN" lexicon from tidytext
afinn <- get_sentiments(lexicon = "afinn")
# Join the AFINN lexicon onto the tidied DTM and compute a per-document
# polarity score: sum of (AFINN value * number of occurrences).
# FIX: weight by the term's in-document count (`wt = count`) -- counting
# rows alone ignores repeated terms within a tweet.
tweet_polarity_afinn <-
  tidy_tweets %>%
  inner_join(afinn, by = c("term" = "word")) %>%
  mutate(index = as.numeric(document)) %>%
  count(value, index, wt = count) %>%
  group_by(index) %>%
  summarize(polarity = sum(value * n))
# Examine the mutated dataset
head(tweet_polarity_afinn)
## ----plot_afinn----------------------------------------------------------
# Plot the smoothed AFINN polarity trend over the collection window.
# FIX: `aes(rev(index), polarity)` reverses the x vector independently of
# y, scrambling the (x, y) pairing whenever index has gaps. Reverse the
# axis itself instead (tweets are ordered newest-first).
ggplot(data = tweet_polarity_afinn, aes(index, polarity)) +
  geom_smooth() +
  scale_x_reverse() +
  labs(
    title = "Polarity of #CrudeOil tweets (With AFINN Lexicon)",
    caption = "Source: Twitter API, AFINN Lexicon",
    x = "Time",
    y = "Polarity"
  ) +
  theme_minimal()
## ----def_loughran_polarity-----------------------------------------------
# Store the "Loughran-McDonald" sentiment lexicon from tidytext
loughran <- get_sentiments(lexicon = "loughran")
# Join the Loughran lexicon onto the tidied DTM and total the term
# occurrences falling into each of its sentiment categories.
# FIX: the original comment said "Bing" (copy-paste); also dropped the
# unused mutate(index = ...) -- summarize() discarded that column anyway.
tweet_polarity_loughran <-
  tidy_tweets %>%
  inner_join(loughran, by = c("term" = "word")) %>%
  group_by(sentiment) %>%
  summarize(count = sum(count))
# Examine the aggregated dataset
head(tweet_polarity_loughran)
## ----plot_loughran-------------------------------------------------------
# Plot the distribution of Loughran-McDonald sentiment categories.
# FIX: theme_minimal() is a COMPLETE theme and replaces any theme()
# tweaks applied before it, so the legend suppression was being silently
# discarded -- it must come after the complete theme.
ggplot(tweet_polarity_loughran,
       aes(x = reorder(sentiment, desc(count)), y = count, fill = sentiment)) +
  geom_col() +
  labs(
    title = "Polarity of #CrudeOil tweets (With Loughran-Macdonald Lexicon)",
    caption = "Source: Twitter API, Loughran-Macdonald Lexicon",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
## ----segregate_sentiment-------------------------------------------------
# Score every tweet, then collapse the positively-scored tweets into one
# long space-separated string (which() drops NA scores, matching filter())
pos_scores <- polarity(crude_df$text)$all$polarity
pos_terms <- paste(crude_df$text[which(pos_scores > 0)], collapse = " ")
# View the start of the joined string
substr(pos_terms, 1, 100)
## ----get_negs------------------------------------------------------------
# Score every tweet, then collapse the negatively-scored tweets into one
# long space-separated string (which() drops NA scores, matching filter())
neg_scores <- polarity(crude_df$text)$all$polarity
neg_terms <- paste(crude_df$text[which(neg_scores < 0)], collapse = " ")
# View the start of the joined string
substr(neg_terms, 1, 100)
## ----combine_two---------------------------------------------------------
# Build a two-document corpus: document 1 holds all positive text,
# document 2 holds all negative text
all_terms <- c(pos_terms, neg_terms)
corpus_all_terms <- VCorpus(VectorSource(all_terms))
# Term-frequency weighted TDM, with punctuation stripped again in case
# the collapse reintroduced any
tdm_all_terms <- TermDocumentMatrix(
  corpus_all_terms,
  control = list(
    removePunctuation = TRUE,
    weighting = weightTf
  )
)
# Densify and attach readable column names for the cloud plots below
matrix_all_terms <- as.matrix(tdm_all_terms)
colnames(matrix_all_terms) <- c("Positive", "Negative")
# View the tail of the dense TDM
tail(matrix_all_terms)
## ----plot_comparison, warning=FALSE--------------------------------------
# Comparison cloud: terms over-represented in one sentiment vs the other
sentiment_colors <- c("green", "red")
comparison.cloud(matrix_all_terms, colors = sentiment_colors)
## ----plot_commonality, warning=FALSE-------------------------------------
# Commonality cloud: terms shared by the positive and negative documents
commonality.cloud(matrix_all_terms, colors = "steelblue1")
## ----def_comparison------------------------------------------------------
# Keep terms that occur in BOTH the positive and negative documents and
# rank them by how much more often they appear in positive text.
# FIX: the original comments said "25" terms but the code keeps 15; the
# filter now targets the two numeric count columns explicitly --
# filter_all() was also comparing the character `word` column against 0.
top15_terms <-
  matrix_all_terms %>%
  as_tibble(rownames = "word") %>%
  filter(Positive > 0, Negative > 0) %>%
  mutate(difference = Positive - Negative) %>%
  top_n(15, wt = difference) %>%
  arrange(desc(difference))
# Examine the top 15 terms
head(top15_terms)
## ----plot_pyramid--------------------------------------------------------
# Back-to-back bar chart ("pyramid") contrasting each shared word's
# frequency in the positive vs negative document
library(plotrix)
pyramid.plot(
  lx = top15_terms$Positive,
  rx = top15_terms$Negative,
  labels = top15_terms$word,
  top.labels = c("Positive", "Word", "Negative"),
  main = "Words in Common",
  unit = NULL,
  gap = 10,
  space = 0.2
)
## ----plot_network--------------------------------------------------------
# Word network / cloud of terms that co-occur with "trump" in the tweets
trump_stopwords <- c("crude", stopwords("en"))
word_associate(
  crude_df$text,
  match.string = "trump",
  stopwords = trump_stopwords,
  wordcloud = TRUE,
  cloud.colors = c("gray85", "darkred"),
  nw.label.proportional = TRUE
)
title(main = "Words Associated with Trump in #CrudeOil Tweets")
## ----plot_network2-------------------------------------------------------
# Word network / cloud of terms that co-occur with "fed" in the tweets
fed_stopwords <- c("crude", stopwords("en"))
word_associate(
  crude_df$text,
  match.string = "fed",
  stopwords = fed_stopwords,
  wordcloud = TRUE,
  cloud.colors = c("gray85", "darkred"),
  nw.label.proportional = TRUE
)
title(main = "Words Associated with Fed in #CrudeOil Tweets")
## ----export_to_script----------------------------------------------------
# Regenerate this .R script from the R Notebook source via knitr::purl()
purl(input = "crude-oil-sentiment-analysis.Rmd")