Machine_learning_Classification.html

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />


<title>AIV classification with machine learning</title>

<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/bootstrap.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/default.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>

<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
// Start highlight.js syntax highlighting, but only when the library was loaded.
(function () {
  if (!window.hljs) {
    return;
  }

  // Configure highlight.js with an empty `languages` list, then register
  // highlighting for the page-load event.
  hljs.configure({languages: []});
  hljs.initHighlightingOnLoad();

  // When the document has already finished loading, also trigger
  // highlighting directly on the next tick.
  var alreadyLoaded = document.readyState === "complete";
  if (alreadyLoaded) {
    window.setTimeout(function () {
      hljs.initHighlighting();
    }, 0);
  }
})();
</script>


<style type="text/css">
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
.table th:not([align]) {
  text-align: left;
}
</style>


<style type = "text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
img {
  max-width:100%;
}
.tabbed-pane {
  padding-top: 12px;
}
.html-widget {
  margin-bottom: 20px;
}
button.code-folding-btn:focus {
  outline: none;
}
summary {
  display: list-item;
}
</style>


<style type="text/css">
/* padding for bootstrap navbar */
body {
  padding-top: 51px;
  padding-bottom: 40px;
}
/* offset scroll position for anchor links (for fixed navbar)  */
.section h1 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h2 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h3 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h4 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h5 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h6 {
  padding-top: 56px;
  margin-top: -56px;
}
.dropdown-submenu {
  position: relative;
}
.dropdown-submenu>.dropdown-menu {
  top: 0;
  left: 100%;
  margin-top: -6px;
  margin-left: -1px;
  border-radius: 0 6px 6px 6px;
}
.dropdown-submenu:hover>.dropdown-menu {
  display: block;
}
.dropdown-submenu>a:after {
  display: block;
  content: " ";
  float: right;
  width: 0;
  height: 0;
  border-color: transparent;
  border-style: solid;
  border-width: 5px 0 5px 5px;
  border-left-color: #cccccc;
  margin-top: 5px;
  margin-right: -10px;
}
.dropdown-submenu:hover>a:after {
  border-left-color: #ffffff;
}
.dropdown-submenu.pull-left {
  float: none;
}
.dropdown-submenu.pull-left>.dropdown-menu {
  left: -100%;
  margin-left: 10px;
  border-radius: 6px 0 6px 6px;
}
</style>

<script>
// Manage the active state of the navbar menu based on the current page.
$(document).ready(function () {
  // File name of the current page (last path segment); a bare directory
  // URL ends with "/" and is treated as index.html.
  // Declared with `var` so the handler no longer creates an implicit
  // global named `href`.
  var path = window.location.pathname;
  var href = path.substring(path.lastIndexOf('/') + 1);
  if (href === "") {
    href = "index.html";
  }

  // Match anchors by comparing the attribute value instead of concatenating
  // the file name into a selector string, so unusual characters in the file
  // name cannot produce an invalid selector.
  var menuAnchor = $('a').filter(function () {
    return $(this).attr('href') === href;
  });

  // mark it active
  menuAnchor.parent().addClass('active');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>

<!-- tabsets -->

<style type="text/css">
.tabset-dropdown > .nav-tabs {
  display: inline-table;
  max-height: 500px;
  min-height: 44px;
  overflow-y: auto;
  background: white;
  border: 1px solid #ddd;
  border-radius: 4px;
}

.tabset-dropdown > .nav-tabs > li.active:before {
  content: "";
  font-family: 'Glyphicons Halflings';
  display: inline-block;
  padding: 10px;
  border-right: 1px solid #ddd;
}

.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
  /* Icon-font code point U+E258 written as a CSS escape. HTML character
     references are not decoded inside a style element, so the previous
     entity form was rendered as literal text instead of the glyph. */
  content: "\e258";
  border: none;
}

.tabset-dropdown > .nav-tabs.nav-tabs-open:before {
  content: "";
  font-family: 'Glyphicons Halflings';
  display: inline-block;
  padding: 10px;
  border-right: 1px solid #ddd;
}

.tabset-dropdown > .nav-tabs > li.active {
  display: block;
}

.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
  border: none;
  display: inline-block;
  border-radius: 4px;
  background-color: transparent;
}

.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
  display: block;
  float: none;
}

.tabset-dropdown > .nav-tabs > li {
  display: none;
}
</style>

<!-- code folding -->


<style type="text/css">

#TOC {
  margin: 25px 0px 20px 0px;
}
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}

@media print {
.toc-content {
  /* see https://github.com/w3c/csswg-drafts/issues/4434 */
  float: right;
}
}

.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

div.main-container {
  max-width: 1200px;
}

div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

.tocify-subheader .tocify-item {
  font-size: 0.90em;
}

.tocify .list-group-item {
  border-radius: 0px;
}

.tocify-subheader {
  display: inline;
}
.tocify-subheader .tocify-item {
  font-size: 0.95em;
}

</style>


</head>

<body>


<div class="container-fluid main-container">


<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row-fluid">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">


<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-label="Toggle navigation">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html"></a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="index.html">Home</a>
</li>
<li>
  <a href="method-development-and-validation.html">LC-MS/MS</a>
</li>
<li>
  <a href="AIV_profile_analysis.html">AIV amino acids</a>
</li>
<li>
  <a href="Machine_learning_Classification.html">Machine learning</a>
</li>
<li>
  <a href="ShinyML.html">Interactive Shiny</a>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<div class="fluid-row" id="header">


<h1 class="title toc-ignore">AIV classification with machine learning</h1>

</div>


<p><br/></p>
<p><strong>Object naming</strong>:</p>
<p>df.XXX: a data.frame or tibble object; e.g., df.all.data<br />
df.modelname…: a data frame or tibble containing tidied results from a model prediction vs actual<br />
mat.XXX: a matrix object; e.g., mat.content<br />
pltXXX: a plot object; e.g., plt.univariate.boxplot.dot<br />
mdl.trained.XXXX: model from training set; e.g., mdl.trained.LDA<br />
mdl.fitted.XXX: object with fitted results output from a trained model; e.g., mdl.fitted.LDA<br />
mdl.XXX: a model object; e.g., mdl.LDA<br />
fitted.XXX: predicted class, e.g. fitted.logistic.elasticNet<br />
cf.XXX: confusion table; e.g., cf.counts.LDA<br />
func.XXX: defined functions; e.g., func.boxplot.Site()</p>
<p><br/></p>
<div id="packages" class="section level1">
<h1><span class="header-section-number">1</span> Packages</h1>
<pre class="r"><code># functional packages
library(readxl)

# Machine learning packages
library(MASS)
library(rsample)
library(viridis)
library(glmnet)
library(caret)
library(rpart.plot) # for classification tree plot
library(randomForest)
library(e1071)
library(ComplexHeatmap)

# The core package collection
# load last, so as the key functions are not masked by others; but instead masking others if any
library(tidyverse)</code></pre>
<pre class="r"><code>set.seed(19911110)</code></pre>
</div>
<div id="data" class="section level1">
<h1><span class="header-section-number">2</span> Data</h1>
<pre class="r"><code># Read amino acid content dataset
path = &quot;/Users/Boyuan/Desktop/My publication/16. HILIC amino acid machine learning to J. Chroma A/Publish-ready files/AIV free amino acids content.xlsx&quot;

df.content= read_excel(path , sheet = &quot;content in AIV (mg.100g DW-1)&quot;) 

# select needed columns and tidy up
df.content = df.content %&gt;% select(Name, Category, Site) %&gt;%
  mutate(strata.group = paste(Category, Site, sep = &quot;_&quot;)) %&gt;%
  cbind(df.content %&gt;% select(23:ncol(df.content))) %&gt;%
  as_tibble()

# categories
unique.categories = df.content$Category %&gt;% unique()

# category colors
color.category = c(&quot;Black&quot;, &quot;Steelblue&quot;, &quot;Firebrick&quot; , &quot;Darkgreen&quot;) 
names(color.category) = unique.categories</code></pre>
</div>
<div id="define-functions" class="section level1">
<h1><span class="header-section-number">3</span> Define functions</h1>
<div id="train-test-split" class="section level2">
<h2><span class="header-section-number">3.1</span> Train-test split</h2>
<div id="statified-sampling" class="section level3">
<h3><span class="header-section-number">3.1.1</span> Stratified sampling</h3>
<pre class="r"><code># strata group
unique.categories.site = df.content$strata.group %&gt;% unique()

# Define function doing stratified sampling based on category-site combination 
# Split df.content into a training and a test set, sampling separately within
# every strata.group value listed in unique.categories.site.
#   trainingRatio: fraction of each stratum drawn into the training set; the
#                  per-stratum count is rounded down with floor()
#   returns:       list(df.train, df.test)
# Rows are assigned to train/test by their Name value, so this presumably
# relies on Name being unique per row -- TODO confirm against the dataset.
func.stratifiedSampling = function(trainingRatio = 0.7){
  index.train = c()
  
  for (a in unique.categories.site){
    # rows belonging to the current stratum
    df.content.i = df.content %&gt;% filter(strata.group == a)
    # randomly pick floor(trainingRatio * stratum size) sample names
    index.train.i = sample(df.content.i$Name, size = (trainingRatio * nrow(df.content.i)) %&gt;% floor())
    index.train = c(index.train, index.train.i) 
  }
  
  # everything not drawn for training becomes the test set
  df.train = df.content %&gt;% filter(Name %in% index.train)
  df.test = df.content %&gt;% filter(! Name %in% index.train)
  list.learn = list(df.train, df.test)
  return(list.learn)
}

# demo
# df.learn = func.stratifiedSampling(trainingRatio = .5)
# In practice, the trainingRatio is changed manually by passing the argument from the higher-level function (see below)
# for convenience, the split ratio should never be edited inside this sampling function itself</code></pre>
</div>
<div id="normalization" class="section level3">
<h3><span class="header-section-number">3.1.2</span> Normalization</h3>
<pre class="r"><code># input list: 1st: training set; 2nd, test set; i.e., the output of func.stratifiedSampling
# Standardize the numeric columns of a train/test pair using statistics
# computed from the training set only.
#   list:    list(training set, test set), i.e. the output of func.stratifiedSampling
#   returns: list(df.train.scaled, df.test.scaled); each is a tibble with the
#            Category column first, followed by the scaled columns
func.normalize.trainTest = function(list) { 
  
  # drop the identifier / label columns and keep the remaining columns as a matrix
  mat.train = list[[1]] %&gt;% dplyr::select(-c(Name, Category, Site, strata.group, `Data File`)) %&gt;% as.matrix()
  mat.test = list[[2]] %&gt;% dplyr::select(-c(Name, Category, Site, strata.group, `Data File`)) %&gt;% as.matrix()
  
  # mean vector computed from training set (as single column matrix)
  meanVector.train = apply(mat.train, 2, mean) %&gt;% as.matrix()
  
  # diagonal matrix holding the inverse of each training-set standard deviation
  mat.inverse.sd.diaganol = apply(mat.train, 2, sd)  %&gt;% diag() %&gt;% solve()
  # Preserve column names
  colnames(mat.inverse.sd.diaganol) = colnames(mat.train) 
  
  # ones vector, as single column matrix, length = # observation units of TRAINING set
  vector.ones.train = rep(1, nrow(mat.train)) %&gt;% as.matrix()
  
  # Compute normalized training dataset: subtract the column means, then divide by the column standard deviations
  mat.train.scaled = (mat.train - vector.ones.train %*% t(meanVector.train)) %*% mat.inverse.sd.diaganol
  
  
  # Use built-in scale function to double check computation
  # Used for troubleshooting purposes; commented out to keep the console clean when running code
  
  # mat.train.scaled.test = mat.train %&gt;% scale(center = T, scale = T)
  
  # if ((mat.train.scaled - mat.train.scaled.test ) %&gt;% sum() %&gt;% round(10) == 0){
  #   cat(&quot;Computation is correct!&quot;) 
  # } else{
  #   cat(&quot;Computation may be incorrect. Further examination is required!&quot;)
  # }
  
  
  # Normalize test dataset using training-set mean vector and standard deviation diagonal matrix
  # ones vector, as single column matrix, length = # observation units of TESTING set 
  vector.ones.test = rep(1, nrow(mat.test)) %&gt;% as.matrix()
  mat.test.scaled = (mat.test - vector.ones.test %*% t(meanVector.train)) %*% mat.inverse.sd.diaganol
  
  # Complete two matrices with category labels, and convert to tibble
  df.train.scaled = cbind(data.frame(Category = list[[1]]$Category), mat.train.scaled) %&gt;% as_tibble()
  df.test.scaled  = cbind(data.frame(Category = list[[2]]$Category), mat.test.scaled) %&gt;% as_tibble()
  return(list(df.train.scaled, df.test.scaled))
}

# demo
# func.stratifiedSampling(trainingRatio = .6) %&gt;% func.normalize.trainTest()</code></pre>
</div>
<div id="chaining-prior-two-functions" class="section level3">
<h3><span class="header-section-number">3.1.3</span> Chaining prior two functions</h3>
<pre class="r"><code># Define a third function: chaining the prior two functions together
# 1) stratified sampling into training and test set
# 2) normalize training set, and normalize test set based on training mean vector and standard deviation 
# Stratified train/test split, optionally followed by normalization.
#   trainingRatio: passed through to func.stratifiedSampling
#   scaleData:     when TRUE the split is handed to func.normalize.trainTest;
#                  otherwise the unscaled split is returned as is
#   returns:       list(training set, test set)
func.strata.Norm.trainTest = function(trainingRatio = 0.7, scaleData = T){
  if (scaleData == T) {
    return(func.normalize.trainTest(func.stratifiedSampling(trainingRatio = trainingRatio)))
  }
  return(func.stratifiedSampling(trainingRatio = trainingRatio))
}

# demo: 1&gt; func.strata.Norm.trainTest(); # default 0.7 train-test split ratio
#       2&gt; func.strata.Norm.trainTest(trainingRatio = .5) # manually change train-test split ratio</code></pre>
</div>
</div>
<div id="prediction-table-tidy-up" class="section level2">
<h2><span class="header-section-number">3.2</span> Prediction table tidy up</h2>
<div id="confusion-matrix" class="section level3">
<h3><span class="header-section-number">3.2.1</span> Confusion matrix</h3>
<pre class="r"><code># Define two functions: tidy up confusion tables (contingency table and stats) 
# 1) For contingency table
# Reshape a confusion (contingency) table into wide form and tag it with the model name.
#   inputTable: table with Reference and Freq columns once converted to a data frame
#   ModelName:  label written into the added Model column
#   returns:    data frame with one column per Reference level plus Model
func.tidy.cf.contigencyTable = function(inputTable, ModelName){
  df.counts = as.data.frame(inputTable)
  df.wide = spread(df.counts, key = Reference, value = Freq)
  df.wide = mutate(df.wide, Model = ModelName)
  return(df.wide)
}

# demo: cf.counts.LDA %&gt;% func.tidy.cf.contigencyTable()</code></pre>
</div>
<div id="summary-statistics" class="section level3">
<h3><span class="header-section-number">3.2.2</span> Summary statistics</h3>
<pre class="r"><code># 2) For stats table based on each category
# Turn a per-class statistics matrix into a data frame with Category and Model columns.
#   inputTable: matrix whose row names carry the class labels
#   ModelName:  label written into the added Model column
#   returns:    data frame; the row names become the Category column, with the
#               "Class: " text removed
func.tidy.cf.statsTable = function(inputTable, ModelName){
  df.stats = cbind(Category = rownames(inputTable), as_tibble(inputTable))
  df.stats = mutate(df.stats, Category = str_remove(Category, pattern = &quot;Class: &quot;))
  df.stats = mutate(df.stats, Model = ModelName)
  return(df.stats)
}
<pre class="r"><code># 3) For overall stats table
# Collect overall accuracy figures into a one-row tibble tagged with the model name.
#   vector:    positions 1, 3 and 4 are read as Accuracy, AccuracyLower and AccuracyUpper
#              (presumably the overall element of a caret confusionMatrix -- confirm)
#   modelName: label written into the Model column
func.tidy.cf.statsOveral = function(vector, modelName){
  df.overall = data.frame(Accuracy = vector[1], AccuracyLower = vector[3], 
                          AccuracyUpper = vector[4], Model = modelName)
  return(as_tibble(df.overall))
}
</div>
<div id="most-votes" class="section level3">
<h3><span class="header-section-number">3.2.3</span> Most votes</h3>
<pre class="r"><code>func.mostVotes = function(vector){
  x = vector %&gt;% table() %&gt;% sort() %&gt;% rev() 
  names(x)[1] %&gt;% return()
}</code></pre>
</div>
</div>
</div>
<div id="machine-learning" class="section level1">
<h1><span class="header-section-number">4</span> Machine learning</h1>
<div id="traintest-split" class="section level2">
<h2><span class="header-section-number">4.1</span> 70/30 train/test split</h2>
<pre class="r"><code># Now, let&#39;s set up the training and testing set. 
# When hyper-parameters need to be tuned for a certain algorithm, the training set is further split into validation sets using cross-validation. 

df.learn = func.strata.Norm.trainTest(trainingRatio = .7, scaleData = T) 
df.train = df.learn[[1]]
df.test = df.learn[[2]]</code></pre>
<p><strong>Note here that the training set is scaled, and the testing set is also scaled using the mean vector and standard deviations of the training set!</strong></p>
<p>Despite differences in the algorithms per se, the workflow is roughly the same: train the model, test performance on the test set, and tidy up the confusion matrix. Regardless of such similarity and the possibility of wrapping different algorithms into one overall function, in this work each algorithm is written in a rather independent manner. Redundant as this truly is, this practice is rewarded by easy understanding of each algorithm section, which could be read as a more or less standalone method; meanwhile, this practice provides rapid access for troubleshooting.</p>
</div>
<div id="linear-discriminant-analysis-lda" class="section level2">
<h2><span class="header-section-number">4.2</span> Linear discriminant analysis (LDA)</h2>
<pre class="r"><code># model train
mdl.trained.LDA = lda(Category ~., data = df.train)
# predict on test set, with equal prior probability
mdl.fitted.LDA = predict(mdl.trained.LDA, newdata = df.test, prior = rep(1/4, 4))
fitted.LDA = mdl.fitted.LDA$class # here we overwrite the prior fitted.LDA object
cf.LDA = confusionMatrix(data = fitted.LDA, reference = df.test$Category, mode = &quot;everything&quot;)

# confusion table
cf.counts.LDA = cf.LDA$table %&gt;% func.tidy.cf.contigencyTable(ModelName = &quot;LDA&quot;)
# Summary stats table
cf.stats.LDA = cf.LDA$byClass %&gt;% func.tidy.cf.statsTable(ModelName = &quot;LDA&quot;) </code></pre>
</div>
<div id="quadratic-discriminant-analysis" class="section level2">
<h2><span class="header-section-number">4.3</span> Quadratic discriminant analysis</h2>
<pre class="r"><code># model train
mdl.QDA = qda(Category ~., data = df.train)

# predict on test set, with equal prior probability
mdl.fitted.QDA = predict(mdl.QDA, newdata = df.test, prior = rep(1/4, 4))
fitted.QDA = mdl.fitted.QDA$class
cf.QDA = confusionMatrix(data = fitted.QDA, reference = df.test$Category, mode = &quot;everything&quot;)

# confusion matrix
cf.counts.QDA = cf.QDA$table %&gt;% func.tidy.cf.contigencyTable(ModelName = &quot;QDA&quot;) 
# summary results
cf.stats.QDA = cf.QDA$byClass %&gt;% func.tidy.cf.statsTable(ModelName = &quot;QDA&quot;)</code></pre>
<p>With a 0.5 split ratio, the console would raise the error “rank deficiency in group Mustard”. QDA estimates a covariance matrix for each population, and requires more data input. Mustard is the category with the smallest number of observation units. LDA, in contrast, assumes an equal covariance matrix for all populations and uses a pooled covariance matrix, and thus requires much less data input for parameter estimation. In fact, LDA does a fairly nice job when trained with only 10% of data.</p>
</div>
<div id="regularized-logistic-regression" class="section level2">
<h2><span class="header-section-number">4.4</span> Regularized logistic regression</h2>
<pre class="r"><code># cross validation
cv.mdl.logistic.ridge = cv.glmnet(x = df.train[, -1] %&gt;% as.matrix(), y = df.train$Category, 
                                  family = &quot;multinomial&quot;, alpha =  0)

cv.mdl.logistic.elasticNet = cv.glmnet(x = df.train[, -1] %&gt;% as.matrix(), y = df.train$Category, 
                                       family = &quot;multinomial&quot;, alpha =  0.5)

cv.mdl.logistic.lasso = cv.glmnet(x = df.train[, -1] %&gt;% as.matrix(), y = df.train$Category, 
                                  family = &quot;multinomial&quot;, alpha =  1)

par(mfrow = c(1, 3))
plot(cv.mdl.logistic.ridge, main = &quot;Ridge&quot;, line = 2) 
plot(cv.mdl.logistic.elasticNet, main = &quot;Elastic Net (alpha = 0.5)&quot;, line = 2)
plot(cv.mdl.logistic.lasso, main = &quot;Lasso&quot;, line = 2)</code></pre>
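<p><img src="Machine_learning_Classification_files/figure-html/unnamed-chunk-14-1.png" width="672" alt="Cross-validation curves from cv.glmnet for ridge, elastic net (alpha = 0.5) and lasso multinomial logistic regression" /></p>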
<pre class="r"><code>par(mfrow = c(1, 1))</code></pre>
<pre class="r"><code># check model coefficients
x = coef(cv.mdl.logistic.elasticNet, s = &quot;lambda.min&quot;)

df.logisticNets.coefficients = 
  data.frame(x[[1]] %&gt;% as.matrix(), x[[2]] %&gt;% as.matrix(),
             x[[3]] %&gt;% as.matrix(), x[[4]] %&gt;% as.matrix()) 
colnames(df.logisticNets.coefficients) = names(x)
# df.logisticNets.coefficients</code></pre>
<pre class="r"><code># Prediction with train-test split, a formal test of model efficiency

# Define function for performing regularized logistic with different alpha values
# Fit a cross-validated multinomial glmnet model on df.train, predict df.test,
# and tidy the resulting confusion matrix.
# Reads df.train, df.test and unique.categories from the enclosing environment;
# the first column of df.train / df.test is treated as the label and dropped
# from the predictor matrix.
# Returns list(confusion counts, per-class stats, predicted classes, confusionMatrix object).
func.regularizedLogistic = function(
  input.alpha, # control ridge, lasso, or between
  ModelName    # model type as extra column note in the confusion table output
){
  
  # train with 10-fold cross validation
  cv.mdl.logistic = cv.glmnet(x = df.train[, -1] %&gt;% as.matrix(), y = df.train$Category, 
                              family = &quot;multinomial&quot;, alpha = input.alpha, nfolds = 10)
  # predict with test set, using the lambda.1se value stored in the fitted CV object
  fitted.logistic =
    predict(cv.mdl.logistic, newx = df.test[, -1] %&gt;% as.matrix(), 
            s = cv.mdl.logistic$lambda.1se, type = &quot;class&quot;) %&gt;% c() %&gt;% 
    factor(levels = sort(unique.categories), ordered = T) # Note: important to sort unique.categories!
  
  # wrap up prediction results
  cf.logistic = confusionMatrix(data = fitted.logistic, reference = df.test$Category)
  cf.counts.logistic = cf.logistic$table %&gt;% func.tidy.cf.contigencyTable(ModelName = ModelName)
  cf.stats.logistic = cf.logistic$byClass %&gt;% func.tidy.cf.statsTable(ModelName = ModelName)
  
  return(list(cf.counts.logistic, cf.stats.logistic, fitted.logistic, cf.logistic))
  
}

# Test upon different alpha values (important to note that alpha is not a hyper-parameter to optimize!)
# func.regularizedLogistic(input.alpha = 0, ModelName = &quot;Ridge&quot;)
# func.regularizedLogistic(input.alpha = 1, ModelName = &quot;Lasso&quot;)
# func.regularizedLogistic(input.alpha = 0.5, ModelName = paste(&quot;ElasticNet, α = 0.5&quot;) )

# We&#39;re more interested in the elastic net results
list.logistic = func.regularizedLogistic(input.alpha = 0.5, ModelName = paste(&quot;EN&quot;) )

cf.counts.ElasticNet =list.logistic[[1]]
cf.stats.ElasticNet = list.logistic[[2]]
fitted.ElasticNet = list.logistic[[3]]
cf.EN = list.logistic[[4]]</code></pre>
</div>
<div id="random-forest" class="section level2">
<h2><span class="header-section-number">4.5</span> Random forest</h2>
<pre class="r"><code># Predict with train-test split
colnames(df.train) = make.names(colnames(df.train))
colnames(df.test) = make.names(colnames(df.test))

# set up training model and test accuracy
mdl.randomForest = randomForest(Category ~., data = df.train, ntree = 500, mtry = 5) 
fitted.randomForest = predict(mdl.randomForest, newdata = df.test)

# set up confusion table
cf.randomForest = confusionMatrix(data = fitted.randomForest, 
                                  reference = df.test$Category, 
                                  mode = &quot;everything&quot;)

# confusion matrix
cf.counts.randomForest = cf.randomForest$table %&gt;%
  func.tidy.cf.contigencyTable(ModelName = &quot;RF&quot;)
# Summary stats
cf.stats.randomForest = cf.randomForest$byClass %&gt;% 
  func.tidy.cf.statsTable(ModelName = &quot;RF&quot;)

# cf.counts.randomForest
# cf.stats.randomForest</code></pre>
</div>
<div id="support-vector-machine" class="section level2">
<h2><span class="header-section-number">4.6</span> Support vector machine</h2>
<div id="cross-validation" class="section level3">
<h3><span class="header-section-number">4.6.1</span> Cross-validation</h3>
<pre class="r"><code># Tune hyper-parameters

svm.gamma = 10^(seq(-5, 2, by = 1))
svm.cost = 10^(seq(-2, 5, by = 1))

cv.svm = df.train %&gt;% 
  vfold_cv(strata = Category, v = 5) %&gt;%
  mutate(train = map(.x = splits, .f = ~training(.x) ),
         validate = map(.x = splits, .f = ~testing(.x) ))  %&gt;%
  select(-splits) %&gt;%
  crossing(gamma = svm.gamma, cost = svm.cost) %&gt;%
  mutate(hyper = map2(.x = gamma, .y = cost, .f = ~list(.x, .y))) %&gt;% # 1st gamma; 2nd cost
  # set up model CV tuning hyper-params
  mutate(model = map2(.x = train, .y = hyper, 
                      .f = ~svm(x = .x[, -1], y = .x[[1]],  gamma = .y[[1]], cost = .y[[2]] ) )) %&gt;%
  # predict on validation set
  # note that here the CV is performed in a somewhat loose manner as the train-validation folds have been normalized upstream prior to train-validation split
  mutate(fitted.validate = map2(.x = model, .y = validate, 
                                .f = ~predict(.x,  newdata = .y[, -1] )),
         actual.validate = map(.x = validate, .f = ~.x[[1]] %&gt;% factor(ordered = F)),
         accuracy = map2_dbl(.x = fitted.validate , .y = actual.validate, 
                             .f = ~sum(.x == .y)/length(.x)))

cv.svm.summary = cv.svm %&gt;%
  group_by(gamma, cost) %&gt;%
  summarise(accuracy.mean = mean(accuracy) * 100,
            accuracy.sd = sd(accuracy) * 100 ) %&gt;%
  arrange(desc(accuracy.mean))</code></pre>
<pre class="r"><code>cv.svm.summary %&gt;% 
  ggplot(aes(x = gamma, y = cost, z = accuracy.mean)) +
  geom_tile(aes(fill = accuracy.mean)) + 
  scale_fill_viridis(option = &quot;A&quot;, alpha = .9)  + 
  # stat_contour(color = &quot;grey&quot;, size = .5) +
  coord_fixed() +
  theme(panel.grid.minor = element_line(colour = &quot;black&quot;, size = 2),
        panel.grid.major = element_blank()) +
  scale_x_log10(breaks = svm.gamma, labels = log10(svm.gamma)) + 
  scale_y_log10(breaks = svm.cost, labels = log10(svm.cost)) +
  labs(x = &quot;gamma, 10 ^ X&quot;, y = &quot;cost, 10 ^ X&quot;, title = &quot;SVM Radial Kernel&quot;) +
  
  geom_text(aes(label = accuracy.mean %&gt;% round(1) ), color = &quot;black&quot;)</code></pre>
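<p><img src="Machine_learning_Classification_files/figure-html/unnamed-chunk-19-1.png" width="768" alt="Heatmap of mean cross-validated accuracy of the radial-kernel SVM across the gamma and cost grid" /></p>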
</div>
<div id="train-by-entire-training-set" class="section level3">
<h3><span class="header-section-number">4.6.2</span> Train by entire training set</h3>
<pre class="r"><code>mdl.svm = svm(x = df.train[, -1], y = df.train$Category, 
              gamma = cv.svm.summary$gamma[1],
              cost = cv.svm.summary$cost[1])

# predict
fitted.svm = predict(mdl.svm, newdata = df.test[, -1])

# confusion matrix
cf.svm = confusionMatrix(
  data = fitted.svm, reference = df.test$Category, mode = &quot;everything&quot;)

# confusion matrix
cf.counts.svm = cf.svm$table %&gt;%
  func.tidy.cf.contigencyTable(ModelName = &quot;SVM&quot;)

# summary stats
cf.stats.svm = cf.svm$byClass %&gt;% 
  func.tidy.cf.statsTable(ModelName = &quot;SVM&quot;)</code></pre>
</div>
</div>
<div id="naive-bayes-benchmark" class="section level2">
<h2><span class="header-section-number">4.7</span> Naive Bayes (benchmark)</h2>
<pre class="r"><code># train
mdl.Bayes = naiveBayes(x = df.train[, -1], y = df.train$Category)
# predict
fitted.Bayes = predict(mdl.Bayes, newdata = df.test, type = &quot;class&quot;)

# confusion matrix
cf.Bayes = confusionMatrix(
  data = fitted.Bayes, reference = df.test$Category, mode = &quot;everything&quot;)

# confusion matrix
cf.counts.Bayes = cf.Bayes$table %&gt;%
  func.tidy.cf.contigencyTable(ModelName = &quot;NB&quot;)

# summary stats
cf.stats.Bayes = cf.Bayes$byClass %&gt;% 
  func.tidy.cf.statsTable(ModelName = &quot;NB&quot;)</code></pre>
</div>
</div>
<div id="test-results" class="section level1">
<h1><span class="header-section-number">5</span> Test results</h1>
<div id="wrap-up" class="section level2">
<h2><span class="header-section-number">5.1</span> Wrap up</h2>
<pre class="r"><code>unique.models = factor(
  c(&quot;LDA&quot;, &quot;QDA&quot;, &quot;EN&quot;, &quot;RF&quot;, &quot;SVM&quot;, &quot;NB&quot;, &quot;Most voted&quot;), ordered = T)</code></pre>
<pre class="r"><code># Summary of all machine learning techniques ----

# Sample wise prediction of all models and most voted 
df.actual.vs.fit = data.frame(
  &quot;Actual&quot; = df.test$Category, 
  &quot;LDA&quot; = fitted.LDA, 
  &quot;QDA&quot; = fitted.QDA, 
  &quot;EN&quot; = fitted.ElasticNet, 
  &quot;RF&quot; =  fitted.randomForest, 
  &quot;SVM&quot; =  fitted.svm, 
  &quot;NB&quot; = fitted.Bayes)

df.actual.vs.fit = df.actual.vs.fit %&gt;% as_tibble() %&gt;%
  mutate(Actual = factor(Actual, ordered = F))

df.actual.vs.fit = df.actual.vs.fit %&gt;% 
  mutate(most.voted =  apply(df.actual.vs.fit %&gt;% select(-Actual),
                             MARGIN = 1, func.mostVotes))


# Most voted confusion matrix
cf.mostVoted = confusionMatrix(data = df.actual.vs.fit$most.voted %&gt;% factor(), 
                               reference = df.actual.vs.fit$Actual %&gt;% factor(), mode = &quot;everything&quot;)

cf.counts.MostVoted = cf.mostVoted$table %&gt;%
  func.tidy.cf.contigencyTable(ModelName = &quot;Most voted&quot;)

cf.stats.MostVoted = cf.mostVoted$byClass %&gt;%
  func.tidy.cf.statsTable(ModelName = &quot;Most voted&quot;)</code></pre>
<pre class="r"><code># Summary of all machine learning techniques
df.confusionMatrix.all = cf.counts.LDA %&gt;% rbind(cf.counts.QDA) %&gt;%
  rbind(cf.counts.ElasticNet) %&gt;% # rbind(cf.counts.CART) %&gt;%
  rbind(cf.counts.randomForest) %&gt;% rbind(cf.counts.svm) %&gt;%
  rbind(cf.counts.Bayes) %&gt;% rbind(cf.counts.MostVoted) %&gt;% as_tibble()

# tidy up the confusion matrix combined
df.confusionMatrix.all.tidy = df.confusionMatrix.all %&gt;%
  # tidy up
  gather(-c(Prediction, Model), key = reference, value = counts) %&gt;%
  
  # convert AIVs category into ordered factor
  mutate(reference = factor(reference, levels = unique.categories, ordered = T),
         Prediction = factor(Prediction, levels = unique.categories %&gt;% rev(), ordered = T)) %&gt;%
  
  # change model order in the dataset
  mutate(Model = factor(Model, levels = unique.models, ordered = T)) %&gt;%
  arrange(Model, reference, Prediction) %&gt;%
  mutate(Diaganol = Prediction == reference)</code></pre>
<pre class="r"><code># assign color to correct / incorrect prediction
diag = df.confusionMatrix.all.tidy %&gt;% 
  filter(Diaganol == F)

df.confusionMatrix.all.tidy = diag %&gt;% 
  mutate(color = ifelse(counts == 0, &quot;Grey&quot;, &quot;Firebrick&quot;)) %&gt;%
  rbind(df.confusionMatrix.all.tidy %&gt;% filter(Diaganol == T) %&gt;%
          mutate(color = &quot;steelblue&quot;))</code></pre>
</div>
<div id="confusion-matrix-1" class="section level2">
<h2><span class="header-section-number">5.2</span> Confusion matrix</h2>
<pre class="r"><code># Visualize the confusion matrix: one panel per model, reference on x, prediction on y
df.confusionMatrix.all.tidy %&gt;%
  ggplot(aes(x = reference, y = Prediction)) + 
  facet_wrap(~Model, nrow = 2) +
  
  # off-diagonal cells with non-zero counts: incorrect predictions
  # (the filter string must match the color column value exactly)
  geom_label(data = df.confusionMatrix.all.tidy %&gt;% filter(color == &quot;Firebrick&quot;),
             aes(label = counts), 
             fill = &quot;firebrick&quot;, alpha = .3, size = 6) +
  
  # diagonal cells: correct predictions
  geom_label(data = df.confusionMatrix.all.tidy %&gt;% filter(color == &quot;steelblue&quot;),
             aes(label = counts),
             fill = &quot;Steelblue&quot;, alpha = .3, size = 6) +
  
  # off-diagonal cells with zero counts, drawn with grey text and no fill override
  geom_label(data = df.confusionMatrix.all.tidy %&gt;% filter(color == &quot;Grey&quot;),
             aes(label = counts), 
             size = 6, color = &quot;grey&quot;)  +
  theme_bw() +
  
  # rotate the x labels, drop the facet strip background, bold the titles
  theme(axis.text.x = element_text(angle = 90, vjust = .8, hjust = .8, color = &quot;black&quot;, size = 12),
        axis.text.y = element_text(color = &quot;black&quot;, size = 12),
        axis.title = element_text(size = 14, colour = &quot;black&quot;),
        strip.background = element_blank(),
        strip.text = element_text(face = &quot;bold&quot;, size = 14),
        panel.border = element_rect(color = &quot;black&quot;, size = 1),
        title = element_text(face = &quot;bold&quot;)) + 
  # the newlines in the axis titles add space between the title and the axis
  labs(x = &quot;\nReference&quot;, y = &quot;Prediction\n&quot;) +
ggtitle(&quot;Confusion Matrix&quot;) +
  # keep the cells square
  coord_fixed(ratio = 1)</code></pre>
<p><img src="Machine_learning_Classification_files/figure-html/unnamed-chunk-26-1.png" alt="Confusion matrices of predicted versus reference categories, one panel per model" width="1152" /></p>
</div>
<div id="sample-wise-heatmap" class="section level2">
<h2><span class="header-section-number">5.3</span> Sample-wise heatmap</h2>
<pre class="r"><code># Sample-wise comparison between models
# call the regularized-logistic helper with alpha = 0.5 and keep the third element
# of the list it returns -- presumably the fitted categories; TODO confirm against
# the definition of func.regularizedLogistic
fitted.ElasticNet = func.regularizedLogistic(input.alpha = 0.5, ModelName = &quot;Elastic Net&quot;)[[3]]

# put the actual category of df.test next to the fitted values of every model
df.actual.vs.fit = data.frame(
  &quot;Actual&quot; = df.test$Category, 
  &quot;Linear discriminant&quot; = fitted.LDA, 
  &quot;Quadratic discriminant&quot; = fitted.QDA, 
  &quot;Elastic net&quot; = fitted.ElasticNet, 
  # &quot;CART&quot; = fitted.CART,
  &quot;Random forest&quot; =  fitted.randomForest, 
  &quot;Support vector machine&quot; =  fitted.svm, 
  &quot;Naive Bayes&quot; = fitted.Bayes)

# convert to a tibble and store Actual as an unordered factor
df.actual.vs.fit = df.actual.vs.fit %&gt;% as_tibble() %&gt;%
  mutate(Actual = factor(Actual, ordered = F))
# df.actual.vs.fit


# Majority vote: return the label that occurs most often in `vector`
# (as a character string).
func.mostVotes = function(vector){
  # tabulate the labels, sort ascending by count, then reverse so that the
  # most frequent label comes first
  vote.counts = rev(sort(table(vector)))
  names(vote.counts)[1]
}

# add a consensus column: for each row, the most frequent prediction across
# all model columns (the Actual column is excluded from the vote)
df.actual.vs.fit = df.actual.vs.fit %&gt;% 
  mutate(&#39;Most voted&#39; =  apply(df.actual.vs.fit %&gt;% select(-Actual),
                               MARGIN = 1, func.mostVotes))</code></pre>
<pre class="r"><code># Heatmap of sample-wise predicted result
# rows are sorted by actual category, then the table is transposed so that each
# heatmap row is one column of df.actual.vs.fit (Actual, the models, Most voted)
# and each heatmap column is one sample
plt.heatmap.machineLearning = 
  df.actual.vs.fit %&gt;% arrange(Actual) %&gt;%
  as.matrix() %&gt;% t() %&gt;%
  # color.category is defined elsewhere in the document -- presumably a named
  # vector mapping each category to a color; TODO confirm
  Heatmap(col = color.category,
          heatmap_legend_param = list(
            title = &quot;Category&quot;, title_position = &quot;leftcenter&quot;,
            nrow = 1,
            labels_gp = gpar(fontsize = 11)), 
          rect_gp = gpar(col = &quot;white&quot;, lwd = 0.1))

# render the heatmap with the legend placed below it
draw(plt.heatmap.machineLearning,
     heatmap_legend_side = &quot;bottom&quot;)</code></pre>
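<p><img src="Machine_learning_Classification_files/figure-html/unnamed-chunk-28-1.png" alt="Heatmap of actual and predicted categories for each sample across models, including the most-voted prediction" width="960" /></p>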
</div>
<div id="summary-statistics-1" class="section level2">
<h2><span class="header-section-number">5.4</span> Summary statistics</h2>
<pre class="r"><code># Set up summary statistics
# per-category statistics: stack the tables of all models (including the
# most-voted consensus), keep precision / recall / F1, and reshape to long
# format with one row per Category x Model x metric
df.stats = cf.stats.LDA %&gt;% rbind(cf.stats.QDA) %&gt;%
  rbind(cf.stats.ElasticNet) %&gt;% rbind(cf.stats.randomForest) %&gt;%
  rbind(cf.stats.svm) %&gt;% rbind(cf.stats.Bayes) %&gt;% rbind(cf.stats.MostVoted) %&gt;%
  select(Category, Model, Precision, Recall, F1) %&gt;% 
  gather(-c(1:2), key = metrics, value = values) %&gt;%
  mutate(Model = factor(Model, levels = unique.models, ordered = T))


# overall statistics: tidy the overall element of each model's cf.* object with
# func.tidy.cf.statsOveral and stack the results; the model names passed here
# are presumably expected to match the entries of unique.models -- TODO confirm
df.stats.overal = func.tidy.cf.statsOveral(cf.LDA$overall, modelName = &quot;LDA&quot;) %&gt;%
  rbind(func.tidy.cf.statsOveral(cf.QDA$overall, modelName = &quot;QDA&quot;)) %&gt;%
  rbind(func.tidy.cf.statsOveral(cf.EN$overall, modelName = &quot;EN&quot;)) %&gt;%
  rbind(func.tidy.cf.statsOveral(cf.randomForest$overall, modelName = &quot;RF&quot;)) %&gt;%
  rbind(func.tidy.cf.statsOveral(cf.svm$overall, modelName = &quot;SVM&quot;)) %&gt;%
  rbind(func.tidy.cf.statsOveral(cf.Bayes$overall, modelName = &quot;NB&quot;)) %&gt;%
  rbind(func.tidy.cf.statsOveral(cf.mostVoted$overall, modelName = &quot;Most voted&quot;)) %&gt;%
  mutate(Model = factor(Model, levels = unique.models, ordered = T))</code></pre>
<p>The overall accuracy of each model is shown in bold, with the corresponding 95% confidence interval on the line below it.</p>
<pre class="r"><code>plt.summaryStats = 
  # lollipop chart of the per-category metrics, one panel per model
  df.stats %&gt;% ggplot(aes(x = Category, y = values, color = metrics)) +
  # stems start at y = 0.5, which lies below the visible range set by coord_cartesian
  # NOTE(review): the stems are dodged by 0.5 but the points by .3, so stems and
  # points may not line up horizontally -- confirm this is intended
  geom_segment(aes( xend = Category, y = 0.5, yend = values), 
               position = position_dodge(0.5)) +
  geom_point(size = 4, position = position_dodge(.3), alpha = .9) +
  facet_wrap(~Model, nrow = 1) +
  theme_bw() +
  theme(legend.position = &quot;bottom&quot;,
        legend.title = element_text(size = 11),
        legend.text = element_text(size = 11),
        
        strip.text = element_text(face = &quot;bold&quot;, size = 12),
        strip.background = element_blank(),
        
        axis.text.x = element_text(angle = 90, hjust = 1, colour = &quot;black&quot;, size = 11),
        axis.text.y = element_text(colour = &quot;black&quot;, size = 11),
        axis.title = element_text(size = 11)) +
  # zoom the y axis to 0.65 - 1 without dropping data outside that range
  coord_cartesian(ylim = c(0.65, 1)) +
  scale_color_brewer(palette = &quot;Accent&quot;) +
  
  # overall accuracy of each model as a percentage, in bold, at a fixed position
  geom_text(data = df.stats.overal, 
            aes(x = 2.5, y = 0.7, label = round(Accuracy, 3) * 100), 
            color = &quot;black&quot;, fontface = &quot;bold&quot;, size = 5) +
  
  # lower ~ upper bound of the accuracy interval as percentages, just below the accuracy
  geom_text(data = df.stats.overal, 
            aes(x = 2.5, y = 0.66, 
                label = paste(round(AccuracyLower, 3) * 100, &quot; ~ &quot;,
                              round(AccuracyUpper, 3) * 100)), 
            color = &quot;black&quot;, size = 5) 

plt.summaryStats </code></pre>
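<p><img src="Machine_learning_Classification_files/figure-html/unnamed-chunk-30-1.png" alt="Precision, recall and F1 by category for each model, with overall accuracy and its 95% confidence interval" width="1152" /></p>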
</div>
</div>


</div>
</div>

</div>

<script>

// Give pandoc-generated tables Bootstrap styling: every <table> whose direct
// <thead> child contains a direct `tr.header` row gets the `table` and
// `table-condensed` classes.
function bootstrapStylePandocTables() {
  var headerRows = $('tr.header');
  var pandocTables = headerRows.parent('thead').parent('table');
  pandocTables.addClass('table table-condensed');
}

// Apply the styling once the DOM is ready.
$(document).ready(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- tabsets -->

<script>
// Once the DOM is ready, hand the "TOC" id to the tabset builder
// (window.buildTabsets is provided by another script on the page).
$(document).ready(function () {
  window.buildTabsets("TOC");
});

// Clicking a tab of a dropdown-style tabset toggles the `nav-tabs-open`
// class on the tab list that contains it.
$(document).ready(function () {
  var dropdownTabs = $('.tabset-dropdown > .nav-tabs > li');
  dropdownTabs.on('click', function () {
    var tabList = $(this).parent();
    tabList.toggleClass('nav-tabs-open');
  });
});
</script>

<!-- code folding -->

<script>
$(document).ready(function ()  {

    // The ignore selector below is matched against headings, so move the
    // `toc-ignore` class from each section div onto its direct h1-h5 children.
    var ignoredSections = $('div.section.toc-ignore');
    ignoredSections.removeClass('toc-ignore');
    ignoredSections.children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // Build an anchor hash from heading text: drop the characters
    // . \ / ? & ! # < >, turn each whitespace character into an underscore,
    // and lower-case the result.
    function headingToHash(text) {
      var withoutPunctuation = text.replace(/[.\\/?&!#<>]/g, '');
      return withoutPunctuation.replace(/\s/g, '_').toLowerCase();
    }

    // tocify configuration
    var options = {
      selectors: "h1,h2,h3,h4",
      theme: "bootstrap3",
      context: '.toc-content',
      hashGenerator: headingToHash,
      ignoreSelector: ".toc-ignore",
      scrollTo: 0,
      showAndHide: false,
      smoothScroll: false
    };

    // build the table of contents inside #TOC
    var toc = $("#TOC").tocify(options).data("toc-tocify");
});
</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    // Create a <script> element pointing at MathJax and append it to <head>,
    // so the library is fetched at runtime rather than embedded in the page.
    var mathJaxUrl = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    var loader = document.createElement("script");
    loader.type = "text/javascript";
    loader.src = mathJaxUrl;

    var head = document.getElementsByTagName("head")[0];
    head.appendChild(loader);
  })();
</script>

</body>
</html>