-
Notifications
You must be signed in to change notification settings - Fork 1
/
ADSreport_part1&3.Rmd
391 lines (300 loc) · 16.3 KB
/
ADSreport_part1&3.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
---
title: "Analysis on Wine Reviews"
output:
prettydoc::html_pretty:
theme: cayman
highlight: github
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r library, include=FALSE}
library(data.table)
library(tidyverse)
library(tm)
library(stringr)
library(quanteda)
library(irlba)
library(ggplot2)
library(caret)
library(tidytext)
library(Hmisc)
library(wordcloud)
library(wordcloud2)
library(DT)
library(png)
library(recommenderlab)
```
```{r read data, echo=FALSE,warning=FALSE}
dat <- fread(input = "wines.csv",verbose = FALSE,na.strings=c(""))
dat=dat[,-1]
```
```{r functions}
percentage.table <- function(x, digits = 1){
tab <- table(x)
percentage.tab <- 100*tab/(sum(tab))
rounded.tab <- round(x = percentage.tab, digits = digits)
return(rounded.tab)
}
round.numerics <- function(x, digits){
if(is.numeric(x)){
x <- round(x = x, digits = digits)
}
return(x)
}
miss.value <- function(x){
miss.rate <- round(100*mean(is.na(x)),2)
return(miss.rate)
}
get.unique = function(x){
ux=length(unique(x))
return(ux)
}
## create formula
create.formula <- function(outcome.name, input.names, input.patterns=NA, all.data.names=NA, return.as="character"){
variable.names.from.patterns <- c()
if (!is.na(input.patterns[1]) & !is.na(all.data.names[1])) {
pattern <- paste(input.patterns, collapse = "|")
variable.names.from.patterns <- all.data.names[grep(pattern = pattern, x = all.data.names)]
}
all.input.names <- unique(c(input.names, variable.names.from.patterns))
all.input.names <- all.input.names[all.input.names !=outcome.name]
if (!is.na(all.data.names[1])) {
all.input.names <- all.input.names[all.input.names %in% all.data.names]
}
input.names.delineated <- sprintf("`%s`", all.input.names)
the.formula <- sprintf("`%s` ~ %s", outcome.name, paste(input.names.delineated, collapse = " + "))
if (return.as == "formula") {
return(as.formula(the.formula))
}
if (return.as != "formula") {
return(the.formula)
}
}
reduce.formula <- function(dat, the.initial.formula, max.categories = NA) {
require(data.table)
dat <- setDT(dat)
the.sides <- strsplit(x = the.initial.formula, split = "~")[[1]]
lhs <- trimws(x = the.sides[1], which = "both")
lhs.original <- gsub(pattern = "`", replacement = "", x = lhs)
if (!(lhs.original %in% names(dat))) {
return("Error: Outcome variable is not in names(dat).")
}
the.pieces.untrimmed <- strsplit(x = the.sides[2], split = "+", fixed = TRUE)[[1]]
the.pieces.untrimmed.2 <- gsub(pattern = "`", replacement = "", x = the.pieces.untrimmed, fixed = TRUE)
the.pieces.in.names <- trimws(x = the.pieces.untrimmed.2, which = "both")
the.pieces <- the.pieces.in.names[the.pieces.in.names %in% names(dat)]
num.variables <- length(the.pieces)
include.pieces <- logical(num.variables)
for (i in 1:num.variables) {
unique.values <- dat[, unique(get(the.pieces[i]))]
num.unique.values <- length(unique.values)
if (num.unique.values >= 2) {
include.pieces[i] <- TRUE
}
if (!is.na(max.categories)) {
if (dat[, is.character(get(the.pieces[i])) | is.factor(get(the.pieces[i]))] == TRUE) {
if (num.unique.values > max.categories) {
include.pieces[i] <- FALSE
}
}
}
}
pieces.rhs <- sprintf("`%s`", the.pieces[include.pieces == TRUE])
rhs <- paste(pieces.rhs, collapse = " + ")
the.formula <- sprintf("%s ~ %s", lhs, rhs)
return(the.formula)
}
# calculating relative term frequency (TF)
term.frequency <- function(row) {
row / sum(row)
}
# calculating inverse document frequency (IDF)
inverse.doc.freq <- function(col) {
corpus.size <- length(col)
doc.count <- length(which(col > 0))
log10(corpus.size / doc.count)
}
# calculating TF-IDF.
tf.idf <- function(x, idf) {
x * idf
}
text.prep=function(dat,var.name){
# tokenization
dat.token=tokens(dat[,get(var.name)],what="word", remove_punct=T, remove_symbols=T, remove_hyphens = T)
# lower case the tokens
dat.token=tokens_tolower(dat.token)
# stopword removal
dat.token=tokens_select(dat.token,stopwords(),selection = "remove")
# stemming
dat.token=tokens_wordstem(dat.token,language = "english")
return(dat.token)
}
```
```{r constants, echo=F}
description.name="description"
points.name="points"
points.group.name="points.group"
country.name <- "country"
description.name <- "description"
designation.name <- "designation"
points.name <- "points"
price.name <- "price"
province.name <- "province"
region_1.name <- "region_1"
region_2.name <- "region_2"
taster.name <- "taster_name"
twitter.name <- "taster_twitter_handle"
title.name <- "title"
variaty.name <- "variety"
winery.name <- "winery"
dat[,eval(points.name)] <- as.numeric(dat[,get(points.name)])
dat[,eval(price.name)] <- as.numeric(dat[,get(price.name)])
single.var <- c(points.name, price.name, country.name)
```
# Introduction
<p>One of the most important research questions in the wine related area is the ranking, rating and judging of wine[1]. In the past, the research on these problems manily utilized samll dataset[2]. However, with data science techniques, we are able to investigate the problem using large amount of data. </p>
<p>Majority of the studies focus on analyzing physicochemical laboratory wine data, while little research has been conducted on the content of wine reviews. Mining useful information from wine reviews and tasters preferences can provide new insights from a consumer perspective. Therefore, it is meaningful to untilize these data and generate better wine recommendations.</p>
# Source of Data
<body>
We used <b> <a href = "https://www.kaggle.com/zynicide/wine-reviews" > Wine Reviews</a></b> data which was scraped from <i>WineEntusiast</i> website on November 22nd, 2017 by @zackthoutt on Kaggle. The orginial data also had an index variable which we ignored here since it has no impact on our project. There are 13 variables in total:
<ul>
<li><b>Country</b>: The country that the wine is from; </li>
<li><b>Description</b>: wine reviews; </li>
<li><b>Designation</b>: The vineyard within the winery where the grapes that made the wine are from;</li>
<li><b>Points</b>: The number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80);</li>
<li><b>Price</b>: The cost for a bottle of the wine; </li>
<li><b>Province</b>: The province or state that the wine is from;</li>
<li><b>Region_1</b>: The wine growing area in a province or state (ie Napa) ;</li>
<li><b>Region_2</b>: Sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank ;</li>
<li><b>Taster_name</b>: Name of the taster(reviewer);</li>
<li><b>Taster_twitter_handle</b>;</li>
<li><b>Title</b>: The title of the wine review, which often contains the vintage if you're interested in extracting that feature ;</li>
<li><b>Variety</b>: The type of grapes used to make the wine (ie Pinot Noir) ;</li>
<li><b>Winery</b>: The winery that made the wine.</li>
</ul>
</body>
```{r,include=F}
datatable(dat[1:5,])
```
# Examination of the Data
<p>We first checked the missing vlaues in the data and visualize the percentage of that for each varible. **Country, province, region_1 and region_2** are varibales giving geographic information about the wine, since the majority of **region_2** and 16.4% of **region_1** are missing, we choose to focus on **country** unless more detailed geographic information are needed.</p>
We then examined the unique values for each varible. **Designation, winery, title** have many unique values, hence we decided to treat them as random effects but not predictors when modeling.<br>
**Price** of the wine ranges from 4 to 3300 with some extreme values. We then visualized **price** in the report enginee and recommended a way to define price range (<10, 10~30, 30~50, 50~200, 200~500, >500)
Variable **points** ranges from 80 to 100. Its distribution was close to a bell shape. <br>
**Description, taster_name** and **taster_twitter_handle** provides information about the taster and their reviews on the wine. We examined these variables later based on the needs for text anlysis and recommendation systems building.
```{r percentage missing}
tab <- dat[,lapply(X=.SD,FUN="miss.value"),.SD=names(dat)[-1]]
tab2 <- data.table(var=names(dat[,-1]),miss=as.numeric(tab))
# create a plot of all the missing rate
barplot(height=tab2$miss, names.arg=tab2$var,space=0.01, las=1, main="Percentage of Missing Value",ylab="Percentage", ylim=c(0,80),col=rainbow(13),border="white")
text(x = -0.5 + 1.02*1:tab2[, .N], y = -15, labels = tab2$var,srt = 45)
space_val=0
text(x=-0.4+1:length(tab)*(1+space_val),y=tab,labels=sprintf("%.1f%%",tab),pos=3)
#number of unique values for each variable
unitab=dat[,lapply(.SD,FUN="get.unique")]
datatable(unitab)
#range of price
summary(dat$price)
hist(dat[,price],xlab="price", col= "#85C1E9", border="white", main="Histogram of price")
#points plot
hist(dat$points,xlab="points", col= "#85C1E9", border="white", main="Histogram of points")
```
# Investigation:
#1. Sentiment Analysis
#2. Text Analysis
The sentiment analysis shows there exits relationship between reviews and points, which is consistent with our common sense. In this part, we continued analyzing wines' review.
##1.1 The length of reviews
The plot below shows that the wines with longer reviews tend to have higher points. So texters usually have more opinions to share when he/she thinks the wine is great. It is also worth to notice that the variance of text length tend to be bigger if the wine has higher points. It may imply that even the good wines usually have longer reviews, the length is also influenced by many other factors, such as ttesters' habits. But if the wine is not so good, testers always have less to write down.
```{r text_length, warning=FALSE}
textdat <- dat[,.(description=as.character(get(description.name)),
points=as.numeric(get(points.name)))]
textdat <- textdat[!(is.na(get(points.name))),]
textdat=textdat[,textlen:=as.numeric(nchar(description))]
# quantile(textdat$points,probs=seq(0,1,0.25))
cuts.points <- c(80,86,88,91,100)
textdat[, eval(points.group.name):=cut2(x=get(points.name), cuts=cuts.points)]
ggplot(textdat,aes(x=textlen,fill=points.group))+
theme_classic()+
geom_histogram(binwidth = 5,stat="count")
```
##2.2 Data Pipeline
Before analysis on text data, we did some preparation.
- **Tokenization**: Cut the reviews into single words to calculate the frequency of their appearance.
- **Remove "stopwords"**: Remove "stopwords" such as "the", "a", "are", which are meaningless but necessary in sentence.
- **Stem the words**: Stem the words in order to reduce the number of total different words appeare in the reviews. For example, this step change "fruit", "fruity", "fruits" all into "fruiti". Even though "fruiti" is not actually a word, it is still understandable.
```{r}
text.tokens=text.prep(textdat,description.name)
```
- **Create a DFM**: Create a document-feature matrix for later analysis
```{r pipeline ,echo = TRUE}
text.tokens.dfm <- dfm(text.tokens, tolower = FALSE)
```
- **WordCloud**: Use WorldCloud to see the results. "fruit", "acid", "tanni", "cherry", "dry" tend to appear more frequently than others. We should set those words as new variables.
```{r wordcloud}
wordcloudData <- data.table(word=colnames(text.tokens.dfm),freq=colSums(text.tokens.dfm))
setorderv(wordcloudData,cols="freq",order=-1)
wordcloud2(data=wordcloudData,size=1,minSize=0.1)
```
##2.3 Term Frequency-Inverse Document Frequency (TF-IDF)
TF-IDF is a powerful technique for enhancing the information contained within our document-frequency matrix. The tf-idf weight measures how important a word is to a document in a collection or corpus. The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus (data-set). To complete this process, we need two steps listed below:
- Normalize all documents in the corpus to be length independent (TF)
- Penalize terms occur frequency across corpus and inverse document frequency (IDF)
```{r TF-IDF}
var.chosen<-c('fruit','acid','palat','aroma','cherri','tannin','ripe','dri','spice','rich','fresh','berri','oak','plum','textur','sweet','appl','full','blend','balanc','bodi','blackberri','light','soft','age','structur','white','crisp','fruiti','citrus','dark','miner','herb','Cabernet','raspberri','vanilla','bright','pepper','firm','green','lemon','juici','peach','concentr','pear','chocol','currant','Pinot','smooth','spici','wood','lime','intens','tart','tannic','tight','herbal','orang')
# Transform to a matrix and inspect.
text.tokens.matrix <- as.matrix(text.tokens.dfm[,var.chosen])
# First step, normalize all documents via TF.
train.tokens.df.norm <- apply(text.tokens.matrix, 1, term.frequency)
# Second step, calculate the IDF vector that we will use - both
# for training data and for test data!
text.tokens.idf <- apply(text.tokens.matrix, 2, inverse.doc.freq)
# Lastly, calculate TF-IDF for our training corpus.
text.tokens.tfidf <- apply(train.tokens.df.norm, 2, tf.idf, idf = text.tokens.idf)
round(text.tokens.tfidf[1:10, 1:6],2)
```
For example, in the text 1, fruit has a lower score than acid. We believe that "fruit" has less information because most of reviews include "fruit". On the contrary, acidity is more useful in this case, since only part of the wines are acid, which makes it more unique and characteristic, and more suitable as a variable.
##2.4 Prediction
Based on wordcloud plot and TF-IDF, we selected top 100 frequetent words. Then we delected some meaningless words for this project, such as "wine","well","show" and etc. In the end, we chose 58 words as our new variables.
```{r train_test}
#dim(text.tokens.matrix)
text.tokens.df <- cbind(points = textdat$points, data.frame(text.tokens.matrix))
set.seed(423)
indexes=createDataPartition(text.tokens.df$points,times=1,p=0.7,list=F) # 70% as train data
train=text.tokens.df[indexes,]
test=text.tokens.df[-indexes,]
```
```{r model}
formula.points <- create.formula(outcome.name="points",input.names=var.chosen)
the.formula <- reduce.formula(dat = train, the.initial.formula = formula.points)
model <- lm(formula = the.formula, data=train)
summary(model)
```
F statistic is 483.7 and p-value is smaller than $2.2*10^{-16}$. Thus, the model is significant. Also, most variables are significant in this model. But adjusted R-squared is 0.2353, which is not really high. Considering that testers may not write down all aspect of a wine and the standard of valuating a wine is rather compelecated, we still think this is a nice try.
```{r predict}
test$points.est <- predict(model,newdata=test)
test$points.diff <- test$points.est-test$points
perc <- round(100*sum(abs(test$points.diff)<=2,na.rm=TRUE)/nrow(test),2)
```
Since points is continuous, we allow a samll difference between actual points and estimated points (eg:abs(diff)=2), the accuracy rate is `r perc`%.
```{r check_performance}
ggplot(data = test,aes(x= points.diff,y=..density..)) +
geom_histogram(na.rm=TRUE,fill="royalblue4",bins=30) +
geom_vline(xintercept =-2, col="red",lty=2,lwd=1.5) +
geom_vline(xintercept = 2, col="red",lty=2,lwd=1.5) +
theme_classic() +
ggtitle('Histogram of Residuals') +
xlab('Residuals') +
ylab('Probability') +
theme(plot.title = element_text(hjust = 0.5))
```
#3. Recommendation System
# The Results
# Interpretation
# Assummptions
# Limitations and Uncertainties
# Areas of Future Investigation
# References
[1]Bernard Chen, Valentin V., James P., and Travis A. Wineinformatics: A Quantitative Analysis of Wine Reviewers. <i> Fermentation </i>. 2018 <br>
[2]Quandt, R.E. A note on a test for the sum of ranksums. <i>J. Wine Econ</i>. 2007, 2, 98-102. <br>
[3]tf-idf Model for Page Ranking. Retrieved from https://www.geeksforgeeks.org/tf-idf-model-for-page-ranking/.