ADSreport_part1&3.Rmd

---
title: "Analysis on Wine Reviews"
output:
  prettydoc::html_pretty:
    theme: cayman
    highlight: github
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r library, include=FALSE}
library(data.table)
library(tidyverse)
library(tm)
library(stringr)
library(quanteda)
library(irlba)
library(ggplot2)
library(caret)
library(tidytext)
library(Hmisc)
library(wordcloud)
library(wordcloud2)
library(DT)
library(png)
library(recommenderlab)
```

```{r read data, echo=FALSE,warning=FALSE}
dat <- fread(input = "wines.csv",verbose = FALSE,na.strings=c(""))
dat=dat[,-1]
```


```{r functions}
percentage.table <- function(x, digits = 1){
  tab <- table(x)
  percentage.tab <- 100*tab/(sum(tab))
  rounded.tab <- round(x = percentage.tab, digits = digits)
  return(rounded.tab)
}
round.numerics <- function(x, digits){
  if(is.numeric(x)){
    x <- round(x = x, digits = digits)
  }
  return(x)
}
miss.value <- function(x){
  miss.rate <- round(100*mean(is.na(x)),2)
  return(miss.rate)
}
get.unique = function(x){
  ux=length(unique(x))
  return(ux)
}
## create formula
create.formula <- function(outcome.name, input.names, input.patterns=NA, all.data.names=NA, return.as="character"){
  variable.names.from.patterns <- c()
  if (!is.na(input.patterns[1]) & !is.na(all.data.names[1])) {
    pattern <- paste(input.patterns, collapse = "|")
    variable.names.from.patterns <- all.data.names[grep(pattern = pattern, x = all.data.names)]
  }
  all.input.names <- unique(c(input.names, variable.names.from.patterns))
  all.input.names <- all.input.names[all.input.names !=outcome.name]
  if (!is.na(all.data.names[1])) {
    all.input.names <- all.input.names[all.input.names %in% all.data.names]
  }
  input.names.delineated <- sprintf("`%s`", all.input.names)
  the.formula <- sprintf("`%s` ~ %s", outcome.name, paste(input.names.delineated, collapse = " + "))
  if (return.as == "formula") {
    return(as.formula(the.formula))
  }
  if (return.as != "formula") {
    return(the.formula)
  }
}

reduce.formula <- function(dat, the.initial.formula, max.categories = NA) {
  require(data.table)
  dat <- setDT(dat)
  
  the.sides <- strsplit(x = the.initial.formula, split = "~")[[1]]
  lhs <- trimws(x = the.sides[1], which = "both")
  lhs.original <- gsub(pattern = "`", replacement = "", x = lhs)
  if (!(lhs.original %in% names(dat))) {
    return("Error: Outcome variable is not in names(dat).")
  }
  the.pieces.untrimmed <- strsplit(x = the.sides[2], split = "+", fixed = TRUE)[[1]]
  the.pieces.untrimmed.2 <- gsub(pattern = "`", replacement = "", x = the.pieces.untrimmed, fixed = TRUE)
  the.pieces.in.names <- trimws(x = the.pieces.untrimmed.2, which = "both")

  the.pieces <- the.pieces.in.names[the.pieces.in.names %in% names(dat)]
  num.variables <- length(the.pieces)
  include.pieces <- logical(num.variables)
  
  for (i in 1:num.variables) {
    unique.values <- dat[, unique(get(the.pieces[i]))]
    num.unique.values <- length(unique.values)
    if (num.unique.values >= 2) {
      include.pieces[i] <- TRUE
    }
    if (!is.na(max.categories)) {
      if (dat[, is.character(get(the.pieces[i])) | is.factor(get(the.pieces[i]))] == TRUE) {
        if (num.unique.values > max.categories) {
          include.pieces[i] <- FALSE
        }
      }
    }
  }
  pieces.rhs <- sprintf("`%s`", the.pieces[include.pieces == TRUE])
  rhs <- paste(pieces.rhs, collapse = " + ")
  the.formula <- sprintf("%s ~ %s", lhs, rhs)
  return(the.formula)
}

# calculating relative term frequency (TF)
term.frequency <- function(row) {
  row / sum(row)
}

# calculating inverse document frequency (IDF)
inverse.doc.freq <- function(col) {
  corpus.size <- length(col)
  doc.count <- length(which(col > 0))

  log10(corpus.size / doc.count)
}

# calculating TF-IDF.
tf.idf <- function(x, idf) {
  x * idf
}
text.prep=function(dat,var.name){
  # tokenization
  dat.token=tokens(dat[,get(var.name)],what="word", remove_punct=T, remove_symbols=T, remove_hyphens = T)
  # lower case the tokens
  dat.token=tokens_tolower(dat.token)
  # stopword removal
  dat.token=tokens_select(dat.token,stopwords(),selection = "remove")
  # stemming
  dat.token=tokens_wordstem(dat.token,language = "english")
  return(dat.token)
}
```


```{r constants, echo=F}
description.name="description"
points.name="points"
points.group.name="points.group"

country.name <- "country"
description.name <- "description"           
designation.name <- "designation"           
points.name <- "points"               
price.name <- "price"                 
province.name <- "province"              
region_1.name <- "region_1"              
region_2.name <- "region_2"             
taster.name <- "taster_name"           
twitter.name <- "taster_twitter_handle" 
title.name <- "title"                 
variaty.name <- "variety"              
winery.name <- "winery"               

dat[,eval(points.name)] <- as.numeric(dat[,get(points.name)])
dat[,eval(price.name)] <- as.numeric(dat[,get(price.name)])

single.var <- c(points.name, price.name, country.name)
```

# Introduction

<p>One of the most important research questions in the wine related area is the ranking, rating and judging of wine[1]. In the past, the research on these problems manily utilized samll dataset[2]. However, with data science techniques, we are able to investigate the problem using large amount of data. </p>

<p>Majority of the studies focus on analyzing physicochemical laboratory wine data, while little research has been conducted on the content of wine reviews. Mining useful information from wine reviews and tasters preferences can provide new insights from a consumer perspective. Therefore, it is meaningful to untilize these data and generate better wine recommendations.</p>


# Source of Data
<body>
We used <b> <a href = "https://www.kaggle.com/zynicide/wine-reviews" > Wine Reviews</a></b> data which was scraped from <i>WineEntusiast</i> website on November 22nd, 2017 by @zackthoutt on Kaggle. The orginial data also had an index variable which we ignored here since it has no impact on our project. There are 13 variables in total:

<ul>
<li><b>Country</b>: The country that the wine is from; </li>

<li><b>Description</b>: wine reviews; </li>

<li><b>Designation</b>: The vineyard within the winery where the grapes that made the wine are from;</li>

<li><b>Points</b>: The number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80);</li>

<li><b>Price</b>: The cost for a bottle of the wine; </li>

<li><b>Province</b>: The province or state that the wine is from;</li>

<li><b>Region_1</b>: The wine growing area in a province or state (ie Napa) ;</li>

<li><b>Region_2</b>: Sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this value can sometimes be blank ;</li>

<li><b>Taster_name</b>: Name of the taster(reviewer);</li>

<li><b>Taster_twitter_handle</b>;</li>

<li><b>Title</b>: The title of the wine review, which often contains the vintage if you're interested in extracting that feature ;</li>

<li><b>Variety</b>: The type of grapes used to make the wine (ie Pinot Noir) ;</li>

<li><b>Winery</b>: The winery that made the wine.</li>

</ul>
</body>

```{r,include=F}
datatable(dat[1:5,])
```

# Examination of the Data

<p>We first checked the missing vlaues in the data and visualize the percentage of that for each varible. **Country, province, region_1 and region_2** are varibales giving geographic information about the wine, since the majority of **region_2** and 16.4% of **region_1** are missing, we choose to focus on **country** unless more detailed geographic information are needed.</p>

We then examined the unique values for each varible. **Designation, winery, title** have many unique values, hence we decided to treat them as random effects but not predictors when modeling.<br>

**Price** of the wine ranges from 4 to 3300 with some extreme values. We then visualized **price** in the report enginee and recommended a way to define price range (<10, 10~30, 30~50, 50~200, 200~500, >500)

Variable **points** ranges from 80 to 100. Its distribution was close to a bell shape. <br>

**Description, taster_name** and **taster_twitter_handle** provides information about the taster and their reviews on the wine. We examined these variables later based on the needs for text anlysis and recommendation systems building.

```{r percentage missing}
tab <- dat[,lapply(X=.SD,FUN="miss.value"),.SD=names(dat)[-1]]
tab2 <- data.table(var=names(dat[,-1]),miss=as.numeric(tab))
# create a plot of all the missing rate 
barplot(height=tab2$miss, names.arg=tab2$var,space=0.01, las=1, main="Percentage of Missing Value",ylab="Percentage", ylim=c(0,80),col=rainbow(13),border="white")
text(x = -0.5 + 1.02*1:tab2[, .N], y = -15, labels = tab2$var,srt = 45)
space_val=0
text(x=-0.4+1:length(tab)*(1+space_val),y=tab,labels=sprintf("%.1f%%",tab),pos=3)

#number of unique values for each variable
unitab=dat[,lapply(.SD,FUN="get.unique")]
datatable(unitab)
#range of price
summary(dat$price)
hist(dat[,price],xlab="price", col= "#85C1E9", border="white", main="Histogram of price")
#points plot
hist(dat$points,xlab="points", col= "#85C1E9", border="white", main="Histogram of points")
```


# Investigation:
#1. Sentiment Analysis 

#2. Text Analysis

The sentiment analysis shows there exits relationship between reviews and points, which is consistent with our common sense. In this part, we continued analyzing wines' review.

##1.1 The length of reviews

The plot below shows that the wines with longer reviews tend to have higher points. So texters usually have more opinions to share when he/she thinks the wine is great. It is also worth to notice that the variance of text length tend to be bigger if the wine has higher points. It may imply that even the good wines usually have longer reviews, the length is also influenced by many other factors, such as ttesters' habits. But if the wine is not so good, testers always have less to write down.

```{r text_length, warning=FALSE}
textdat <- dat[,.(description=as.character(get(description.name)),
                  points=as.numeric(get(points.name)))]
textdat <- textdat[!(is.na(get(points.name))),]

textdat=textdat[,textlen:=as.numeric(nchar(description))]
# quantile(textdat$points,probs=seq(0,1,0.25))
cuts.points <- c(80,86,88,91,100)
textdat[, eval(points.group.name):=cut2(x=get(points.name), cuts=cuts.points)]

ggplot(textdat,aes(x=textlen,fill=points.group))+
  theme_classic()+
  geom_histogram(binwidth = 5,stat="count")
```

##2.2 Data Pipeline

Before analysis on text data, we did some preparation. 
  
- **Tokenization**: Cut the reviews into single words to calculate the frequency of their appearance.

- **Remove "stopwords"**: Remove "stopwords" such as "the", "a", "are", which are meaningless but necessary in sentence. 

- **Stem the words**: Stem the words in order to reduce the number of total different words appeare in the reviews. For example, this step change "fruit", "fruity", "fruits" all into "fruiti". Even though "fruiti" is not actually a word, it is still understandable.


```{r}
text.tokens=text.prep(textdat,description.name)
```

- **Create a DFM**: Create a document-feature matrix for later analysis

```{r pipeline ,echo = TRUE}
text.tokens.dfm <- dfm(text.tokens, tolower = FALSE)
```

- **WordCloud**: Use WorldCloud to see the results. "fruit", "acid", "tanni", "cherry", "dry" tend to appear more frequently than others. We should set those words as new variables.

```{r wordcloud}
wordcloudData <- data.table(word=colnames(text.tokens.dfm),freq=colSums(text.tokens.dfm))
setorderv(wordcloudData,cols="freq",order=-1)
wordcloud2(data=wordcloudData,size=1,minSize=0.1)
```

##2.3 Term Frequency-Inverse Document Frequency (TF-IDF)

TF-IDF is a powerful technique for enhancing the information contained within our document-frequency matrix. The tf-idf weight measures how important a word is to a document in a collection or corpus. The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus (data-set). To complete this process, we need two steps listed below:

- Normalize all documents in the corpus to be length independent (TF)

- Penalize terms occur frequency across corpus and inverse document frequency (IDF) 

```{r TF-IDF}
var.chosen<-c('fruit','acid','palat','aroma','cherri','tannin','ripe','dri','spice','rich','fresh','berri','oak','plum','textur','sweet','appl','full','blend','balanc','bodi','blackberri','light','soft','age','structur','white','crisp','fruiti','citrus','dark','miner','herb','Cabernet','raspberri','vanilla','bright','pepper','firm','green','lemon','juici','peach','concentr','pear','chocol','currant','Pinot','smooth','spici','wood','lime','intens','tart','tannic','tight','herbal','orang')

# Transform to a matrix and inspect.
text.tokens.matrix <- as.matrix(text.tokens.dfm[,var.chosen])

# First step, normalize all documents via TF.
train.tokens.df.norm <- apply(text.tokens.matrix, 1, term.frequency)

# Second step, calculate the IDF vector that we will use - both
# for training data and for test data!
text.tokens.idf <- apply(text.tokens.matrix, 2, inverse.doc.freq)

# Lastly, calculate TF-IDF for our training corpus.
text.tokens.tfidf <-  apply(train.tokens.df.norm, 2, tf.idf, idf = text.tokens.idf)

round(text.tokens.tfidf[1:10, 1:6],2)
```

For example, in the text 1, fruit has a lower score than acid. We believe that "fruit" has less information because most of reviews include "fruit". On the contrary, acidity is more useful in this case, since only part of the wines are acid, which makes it more unique and characteristic, and more suitable as a variable.

##2.4 Prediction

Based on wordcloud plot and TF-IDF, we selected top 100 frequetent words. Then we delected some meaningless words for this project, such as "wine","well","show" and etc. In the end, we chose 58 words as our new variables. 

```{r train_test}
#dim(text.tokens.matrix)
text.tokens.df <- cbind(points = textdat$points, data.frame(text.tokens.matrix))

set.seed(423)
indexes=createDataPartition(text.tokens.df$points,times=1,p=0.7,list=F) # 70% as train data 
train=text.tokens.df[indexes,]
test=text.tokens.df[-indexes,]
```

```{r model}
formula.points <- create.formula(outcome.name="points",input.names=var.chosen)
the.formula <- reduce.formula(dat = train, the.initial.formula = formula.points)
model <- lm(formula = the.formula, data=train)
summary(model)
```

F statistic is 483.7 and p-value is smaller than $2.2*10^{-16}$. Thus, the model is significant. Also, most variables are significant in this model. But adjusted R-squared is 0.2353, which is not really high. Considering that testers may not write down all aspect of a wine and the standard of valuating a wine is rather compelecated, we still think this is a nice try.

```{r predict}
test$points.est <- predict(model,newdata=test)
test$points.diff <- test$points.est-test$points
perc <- round(100*sum(abs(test$points.diff)<=2,na.rm=TRUE)/nrow(test),2)
```

Since points is continuous, we allow a samll difference between actual points and estimated points (eg:abs(diff)=2), the accuracy rate is `r perc`%.

```{r check_performance}
ggplot(data = test,aes(x= points.diff,y=..density..)) + 
  geom_histogram(na.rm=TRUE,fill="royalblue4",bins=30) +
  geom_vline(xintercept =-2, col="red",lty=2,lwd=1.5) +
  geom_vline(xintercept = 2, col="red",lty=2,lwd=1.5) +
  theme_classic() +
  ggtitle('Histogram of Residuals') + 
  xlab('Residuals') + 
  ylab('Probability') + 
  theme(plot.title = element_text(hjust = 0.5)) 
```


#3. Recommendation System

# The Results
# Interpretation
# Assummptions
# Limitations and Uncertainties
# Areas of Future Investigation

# References
[1]Bernard Chen, Valentin V., James P., and Travis A. Wineinformatics: A Quantitative Analysis of Wine Reviewers. <i> Fermentation </i>. 2018 <br>
[2]Quandt, R.E. A note on a test for the sum of ranksums. <i>J. Wine Econ</i>. 2007, 2, 98-102. <br>
[3]tf-idf Model for Page Ranking. Retrieved from https://www.geeksforgeeks.org/tf-idf-model-for-page-ranking/.