.Rhistory

validate.fitted =  map2(.x = model, .y = validate.scaled, .f = ~predict(.x, .y)))
# Linear-kernel SVM: run the cross-validation prediction helper (defined elsewhere;
# presumably it adds the per-fold `accuracy` column used below -- confirm).
d.CV.SVM.linear <- func.cv.prediction(d.CV.SVM.linear)
# Mean and standard deviation of the fold accuracy per cost value, best cost first
d.tune.svm.linear <- arrange(
  summarise(
    group_by(d.CV.SVM.linear, cost),
    accuracy.mean = mean(accuracy),
    accuracy.sd = sd(accuracy)
  ),
  desc(accuracy.mean)
)
d.tune.svm.linear
# Mean cross-validation accuracy against cost, with cost on a log10 axis
ggplot(d.tune.svm.linear, aes(x = cost, y = accuracy.mean)) +
  geom_bar(stat = "identity", alpha = .8) +
  geom_point() +
  geom_line() +
  scale_x_log10()
# Best (first) row of each kernel's tuning table, keeping only the accuracy columns
k1 <- mutate(d.tune.svm.radial[1, 3:4], kernel = "radial")
k2 <- mutate(d.tune.svm.polynomial[1, 3:4], kernel = "polynomial") # best degree 3
k3 <- mutate(d.tune.svm.linear[1, 2:3], kernel = "linear")
rbind(k1, k2, k3)
# The radial kernel gives the best cross-validation result: up to 90.4% (sd 6.08) overall accuracy,
# with hyper-parameters gamma 0.1 and cost 10. The linear kernel gives 82.1% accuracy, and the
# polynomial kernel with optimal degree 3 achieves 82.4%. Both the linear and the polynomial
# kernel show a much higher validation standard deviation.
cv.svm <- mutate(k1, model = "SVM")
# Final SVM with radial kernel, fitted on the entire scaled training set.
# The tuned hyper-parameters are kept commented out, so gamma and cost are not passed explicitly:
# gamma = d.tune.svm.radial$gamma[1], cost = d.tune.svm.radial$cost[1],
mdl.svm <- svm(type ~ ., data = trainingSet.scaled,
               kernel = "radial", type = "C-classification")
# Share (in %) of training examples whose prediction matches the recorded type
accuracy.training.svm <- sum(predict(mdl.svm) == trainingSet.scaled$type) /
  nrow(trainingSet.scaled) * 100
cat("Accuracy on the training set is", accuracy.training.svm, "%.")
# Test-set predictions: computed once, used for the accuracy here and the confusion matrix later
predict.SVM <- predict(mdl.svm, newdata = testingSet.scaled)
accuracy.testing.svm <- sum(predict.SVM == testingSet.scaled$type) /
  nrow(testingSet.scaled) * 100
cat("Accuracy on the testing set is", accuracy.testing.svm, "%.")
# Def. func: turn a two-way confusion table into a wide data frame tagged with the model name.
#   table     : contingency table (predicted vs. actual); its second dimension must surface as
#               the column `Var2` after as.data.frame()
#   modelName : label stored in the added `model` column
# Returns one row per level of the first dimension and one count column per `Var2` level.
func.tidyConfusionTable <- function(table, modelName) {
  wide <- spread(as.data.frame(table), Var2, value = Freq)
  wide <- mutate(wide, model = modelName)
  # keep only the leading run of word characters of every column name
  colnames(wide) <- str_extract(colnames(wide), pattern = one_or_more(WRD))
  wide
}
# Tidy confusion table of the SVM test-set predictions against the actual types
cf.svm <- func.tidyConfusionTable(
  table(predict.SVM, testingSet.scaled$type),
  modelName = "SVM"
)
# LDA -----
# Cross validation performance (checking performance only, not for hyper-param tune):
# fit one LDA per training fold with equal class priors, predict the class of the validation fold
d.CV.LDA <- func.cv.prediction(
  mutate(
    trainingSet.cv.scaled,
    model = map(train.scaled, function(fold) lda(type ~ ., data = fold, prior = rep(1/3, 3))),
    validate.fitted = map2(model, validate.scaled,
                           function(fit, fold) predict(fit, newdata = fold)$class)
  )
)
# Mean and sd of the fold accuracies
cv.LDA <- mutate(
  data.frame(accuracy.mean = mean(d.CV.LDA$accuracy),
             accuracy.sd = sd(d.CV.LDA$accuracy)),
  model = "LDA"
)
# set up model on entire training set
mdl.lda <- lda(type ~ ., data = trainingSet.scaled, prior = rep(1/3, 3))
# Prediction on the training set
accuracy.training.LDA <- sum(predict(mdl.lda)$class == trainingSet.scaled$type) /
  nrow(trainingSet.scaled) * 100
cat("Accuracy on the training set by Linear Discriminant Analysis is", accuracy.training.LDA, "%." )
# Prediction on the testing set (classes and posterior probabilities from a single predict call)
fitted.lda <- predict(mdl.lda, newdata = testingSet.scaled)
predict.LDA <- fitted.lda$class
cf.lda <- func.tidyConfusionTable(table(predict.LDA, testingSet.scaled$type), modelName = "LDA")
accuracy.testing.lda <- sum(predict.LDA == testingSet.scaled$type) /
  nrow(testingSet.scaled) * 100
cat("Accuracy on the testing set by Linear Discriminant Analysis is", accuracy.testing.lda, "%.")
# probability distribution sample-wise
d.prob.lda <- mutate(as_tibble(fitted.lda$posterior), model = "LDA")
# random forest -----
# Hyper-parameter grid: number of features tried at each split (mtry) and number of trees (ntree)
featuresTune <- 2:8
treesTune <- seq(from = 100, to = 1000, by = 100)
# One random forest per (cross-validation fold, features, trees) combination
d.CV.RF <- trainingSet.cv.scaled %>%
  crossing(features = featuresTune, trees = treesTune) %>%
  mutate(
    # No. of features 1st; No. trees 2nd
    parameters = map2(.x = features, .y = trees, .f = ~ list(.x, .y)),
    model = map2(
      .x = train.scaled, .y = parameters,
      # randomForest() calls the forest-size argument `ntree`. The previous spelling `ntrees`
      # matched no argument and was silently absorbed by `...`, so the tree grid had no effect.
      .f = ~ randomForest(data = .x, type ~ .,
                          mtry = .y[[1]], ntree = .y[[2]])
    )
  )
d.CV.RF <- d.CV.RF %>%
  mutate(
    # prediction of the validate fold
    validate.fitted = map2(.x = model, .y = validate.scaled, .f = ~ predict(.x, .y)),
    # actual validation result
    validate.actual = map(.x = validate.scaled, .f = ~ .x$type %>% as.factor),
    # actual vs. predicted of the validation set
    validate.fitted.vs.actual = map2(.x = validate.fitted, .y = validate.actual, .f = ~ .x == .y),
    # share of correct predictions per fold, in %
    accuracy = map_dbl(.x = validate.fitted.vs.actual,
                       .f = ~ round(sum(.x) / length(.x) * 100, 3))
  )
# Mean and sd of the fold accuracy for every grid cell, best combination first
d.tune.RF <- d.CV.RF %>%
  group_by(trees, features) %>%
  summarise(accuracy.mean = mean(accuracy),
            accuracy.sd = sd(accuracy)) %>%
  arrange(desc(accuracy.mean))
d.tune.RF
# Tuning overview drawn by func.plot.tune.HyperParam (defined elsewhere)
plt.RF.tune <- d.tune.RF %>%
  func.plot.tune.HyperParam(hyper1 = "trees", hyper2 = "features") +
  coord_fixed(ratio = 100) + # an arbitrary ratio for nice display
  scale_x_continuous(breaks = treesTune) +
  scale_y_continuous(breaks = featuresTune)
plt.RF.tune
# Accuracy columns of the best grid cell, labelled for the model comparison table
cv.RF <- d.tune.RF[1, ] %>%
  ungroup() %>%
  select(contains("accuracy")) %>%
  mutate(model = "RF")
# train model using entire training set
# `ntree` is the randomForest() argument for the forest size; the former `num.trees` matched no
# argument and was silently ignored, leaving the forest at its default size.
mdl.rf <- randomForest(data = trainingSet.scaled, type ~ ., ntree = 900, mtry = 2)
# Prediction on the training set
# predict.randomForest() takes the data through `newdata`; the former `data =` was ignored, so the
# reported figure was the out-of-bag accuracy rather than the accuracy on the training examples.
accuracy.training.RF <-
  sum(predict(mdl.rf, newdata = trainingSet.scaled) == trainingSet.scaled$type) /
  nrow(trainingSet.scaled) * 100
cat("Accuracy on the training set by Random Forest is", accuracy.training.RF, "%")
# Prediction on the testing set by RF
predict.RF <- predict(mdl.rf, testingSet.scaled, type = "response")
cf.RF <- table(predict.RF, testingSet.scaled$type) %>%
  func.tidyConfusionTable(modelName = "RF")
accuracy.testing.RF <- sum(predict.RF == testingSet.scaled$type) / nrow(testingSet.scaled) * 100
cat("Accuracy on the testing set using Random Forest is", accuracy.testing.RF, "%")
# Probability distribution of predicted test set
d.prob.RF <- predict(mdl.rf, testingSet.scaled, type = "prob") %>%
  as_tibble() %>%
  mutate(model = "RF")
# Naive Bayes
# cross validation to evaluate model performance (not for tune of hyper-param)
# NOTE(review): `prior` is passed to naiveBayes() here and below; confirm that e1071 actually
# uses this argument rather than dropping it through `...`.
d.CV.NB <- func.cv.prediction(
  mutate(
    trainingSet.cv.scaled,
    model = map(train.scaled,
                function(fold) naiveBayes(type ~ ., data = fold, prior = rep(1/3, 3))),
    validate.fitted = map2(model, validate.scaled,
                           function(fit, fold) predict(fit, newdata = fold))
  )
)
# Mean and sd of the fold accuracies
cv.NB <- mutate(
  data.frame(accuracy.mean = mean(d.CV.NB$accuracy),
             accuracy.sd = sd(d.CV.NB$accuracy)),
  model = "NB"
)
# Set up model on entire training set (first column dropped from the predictors)
mdl.nb <- naiveBayes(x = trainingSet.scaled[, -1],
                     y = as.factor(trainingSet.scaled$type), # y has to be factor
                     prior = c(1/3, 1/3, 1/3))
accuracy.training.NB <-
  sum(predict(mdl.nb, newdata = trainingSet.scaled[, -1]) == trainingSet.scaled$type) /
  nrow(trainingSet.scaled) * 100
cat("Accuracy on the training set using Naive Bayes is", accuracy.training.NB, "%.")
# Test-set predictions, confusion table and accuracy
predict.NB <- predict(mdl.nb, testingSet.scaled[, -1])
cf.NB <- func.tidyConfusionTable(table(predict.NB, testingSet.scaled$type), modelName = "NB")
accuracy.testing.NB <- sum(predict.NB == testingSet.scaled$type) / nrow(testingSet.scaled) * 100
cat("Accuracy on the testing set using Naive Bayes is", accuracy.testing.NB, "%.")
# Raw per-class output of the test-set prediction
d.prob.NB <- mutate(
  as_tibble(predict(mdl.nb, testingSet.scaled[, -1], type = "raw")),
  model = "NB"
)
# d.prob.NB
# regularized logistic (softmax) regression
# cross validation to check model performance.
d.CV.LR <- trainingSet.cv.scaled %>%
  mutate(
    # note that in train and validate folds, the type is the last column
    model = map(
      .x = train.scaled,
      # important that input x has to be matrix!
      .f = ~ cv.glmnet(x = .x[, -ncol(.x)] %>% as.matrix(), y = .x$type,
                       family = "multinomial", alpha = 1)
    ),
    # predicted class of each validation fold at lambda.1se, flattened to a plain vector
    validate.fitted = map2(
      .x = model, .y = validate.scaled,
      .f = ~ predict(.x, newx = .y[, -ncol(.y)] %>% as.matrix(),
                     type = "class", s = .x$lambda.1se) %>% c()
    )
  ) %>%
  func.cv.prediction()
# Mean and sd of the fold accuracies
cv.LR <- data.frame(accuracy.mean = d.CV.LR$accuracy %>% mean(),
                    accuracy.sd = d.CV.LR$accuracy %>% sd()) %>%
  mutate(model = "LR")
# set up model on entire training set (first column dropped from the predictors)
softmax.cv <- cv.glmnet(x = trainingSet.scaled[, -1] %>% as.matrix(),
                        y = trainingSet.scaled$type, family = "multinomial", alpha = 1)
plot(softmax.cv)
# Prediction on the training set
fitted.softmax.train <- predict(softmax.cv, newx = trainingSet.scaled[, -1] %>% as.matrix(),
                                s = softmax.cv$lambda.1se, type = "class") %>% c()
accuracy.training.LR <- sum(fitted.softmax.train == trainingSet.scaled$type) /
  nrow(trainingSet.scaled) * 100
cat("Accuracy on the training set using lasso-regularized softmax regression is", accuracy.training.LR, "%.")
# Prediction on the testing set
predict.softmax <- predict(softmax.cv, newx = testingSet.scaled[, -1] %>% as.matrix(),
                           s = softmax.cv$lambda.1se, type = "class") %>% c()
cf.LR <- table(predict.softmax, testingSet.scaled$type) %>%
  func.tidyConfusionTable(modelName = "LR")
accuracy.testing.LR <- sum(predict.softmax == testingSet.scaled$type) /
  nrow(testingSet.scaled) * 100
# The message previously said "training set" although the testing accuracy is printed
cat("Accuracy on the testing set using lasso-regularized softmax regression is", accuracy.testing.LR, "%.")
table(predict.softmax, testingSet.scaled$type)
# Predicted probability distribution on the test set
d.prob.LR <- predict(softmax.cv, newx = testingSet.scaled[, -1] %>% as.matrix(),
                     s = softmax.cv$lambda.1se, type = "response") %>%
  as_tibble() %>%
  mutate(model = "LR")
# keep only the leading run of word characters of every column name
colnames(d.prob.LR) <- colnames(d.prob.LR) %>% str_extract(one_or_more(WRD))
# d.prob.LR
## All model comparison
# prob distribution
# Append the sample identity columns (code, Sample, type, character) of the global
# `testingSet.copy` to `dataset`; rows are bound side by side in their existing order.
func.addSampleInfo <- function(dataset) {
  sampleInfo <- select(testingSet.copy, code, Sample, type, character)
  cbind(dataset, sampleInfo)
}
# Attach the sample identity columns to every model's probability table
d.prob.lda <- func.addSampleInfo(d.prob.lda)
d.prob.NB <- func.addSampleInfo(d.prob.NB)
d.prob.LR <- func.addSampleInfo(d.prob.LR)
d.prob.RF <- func.addSampleInfo(d.prob.RF)
# Stack the four tables in the order LDA, NB, LR, RF
d.prob <- Reduce(rbind, list(d.prob.lda, d.prob.NB, d.prob.LR, d.prob.RF))
# plot sample-model wise probability distribution:
# stacked horizontal bars of the predicted class probabilities, one facet per model
plt.probabilityDistribution <- ggplot(
  gather(d.prob, c(adulterated_L_J, authentic_L_J, lemonade), key = type, value = prob),
  aes(x = code, y = prob, fill = type)
) +
  geom_bar(stat = "identity", alpha = .8, color = "white", size = .1, position = "stack") +
  facet_wrap(~model, nrow = 1) +
  coord_flip() +
  scale_fill_startrek() +
  theme(
    panel.border = element_blank(),
    panel.grid = element_blank(),
    # the vertical axis title and text are left to the identity prediction plot
    axis.title.y = element_blank(),
    axis.text.y = element_blank()
  ) +
  scale_y_continuous(breaks = seq(0, 1, by = 1)) +
  labs(y = "Prediction probability", x = "Sample code")
# plt.probabilityDistribution
# Samplewise identity prediction result: one column of predicted types per model
d.fittedTestingset <- data.frame(LDA = predict.LDA, LR = predict.softmax, NB = predict.NB,
                                 RF = predict.RF, SVM = predict.SVM)
# add the sample identity columns; the recorded type becomes the `Actual` column
d.fittedTestingset <- func.addSampleInfo(d.fittedTestingset)
d.fittedTestingset <- rename(d.fittedTestingset, Actual = type)
d.fittedTestingset <- as_tibble(d.fittedTestingset)
# long format: one row per sample and source (five models plus the actual identity)
d.fittedTestingset.tidy <- gather(d.fittedTestingset,
                                  c(LDA, LR, NB, RF, SVM, Actual),
                                  key = model, value = fittedType)
# One short coloured segment per sample and facet, coloured by the (predicted or actual) type
plt.predictionResult <- ggplot(d.fittedTestingset.tidy,
                               aes(x = code, y = 1, color = fittedType)) +
  geom_segment(aes(xend = code, y = .95, yend = 1), size = 4, alpha = .8) +
  facet_wrap(~model, nrow = 1) +
  coord_flip() +
  theme(
    strip.text = element_text(face = "bold", size = 8),
    panel.background = element_blank(),
    panel.border = element_blank(),
    panel.grid = element_blank(),
    panel.spacing = unit(0, "lines"), # facet gap size
    # x axis text and title in white color as placeholders for plot alignment
    axis.text.x = element_text(colour = "white"),
    axis.title.x = element_text(colour = "white"),
    axis.text = element_text(size = 10),
    axis.ticks = element_blank(),
    legend.position = "none"
  ) +
  scale_color_startrek() +
  labs(x = "Sample code")
plt.predictionResult
# Identity predictions (A) next to the probability distribution (B)
plt.samplewisePrediction <- plot_grid(
  plt.predictionResult, plt.probabilityDistribution,
  labels = c("A", "B"), label_size = 18, rel_widths = c(2, 4), nrow = 1
)
plt.samplewisePrediction
# Confusion matrix
# Stack the per-model confusion tables (LDA, LR, NB, RF, SVM), then reshape to long format:
# one row per model, predicted type and actual type
d.cf.tidy <- gather(
  Reduce(rbind, list(cf.lda, cf.LR, cf.NB, cf.RF, cf.svm)),
  c(adulterated_L_J, authentic_L_J, lemonade),
  key = actual, value = count
)
# Def. func. abbreviating sample types (for display in confusion matrix figure).
# Replaces the first occurrence of each full type name in every element of `vector`
# with its four-letter code and returns the modified vector.
func.abreviateTypes <- function(vector) {
  abbreviations <- c(adulterated_L_J = "ADLJ", authentic_L_J = "AULJ", lemonade = "LMND")
  for (fullName in names(abbreviations)) {
    vector <- str_replace(vector, pattern = fullName, replacement = abbreviations[[fullName]])
  }
  vector
}
# Abbreviated type labels for both confusion-matrix axes
d.cf.tidy$predict <- func.abreviateTypes(d.cf.tidy$predict)
d.cf.tidy$actual <- func.abreviateTypes(d.cf.tidy$actual)
types <- factor(c("LMND", "ADLJ", "AULJ"), ordered = TRUE)
# ordered axis: `actual` uses the reversed order of `predict`
d.cf.tidy$predict <- factor(d.cf.tidy$predict, levels = types, ordered = TRUE)
d.cf.tidy$actual <- factor(d.cf.tidy$actual, levels = rev(types), ordered = TRUE)
# define color: `judge` encodes "<non-empty correct cell>_<non-empty incorrect cell>"
d.cf.tidy <- mutate(
  d.cf.tidy,
  CorrectOrNot = predict == actual,
  diagnal = count != 0 & CorrectOrNot,
  offDiag.incorrect = !diagnal & count > 0,
  judge = str_c(diagnal, "_", offDiag.incorrect)
)
# Confusion matrix as labelled counts, one facet per model; the fill colour follows `judge`
plt.confusionMatrix <- ggplot(d.cf.tidy, aes(x = actual, y = predict, fill = judge)) +
  geom_label(aes(label = count), alpha = .5, fontface = "bold", size = 5) +
  facet_wrap(~model, nrow = 1) +
  scale_fill_manual(
    values = c(
      "FALSE_FALSE" = "lightgrey",
      "FALSE_TRUE" = "tomato",
      "TRUE_FALSE" = "Steelblue"
    )
  ) +
  theme(
    legend.position = "",
    axis.text = element_text(face = "bold"),
    strip.text = element_text(size = 12)
  ) +
  labs(x = "\nActual identity", y = "Prediction\n")
plt.confusionMatrix
# grid.arrange(plt.confusionMatrix, plt.samplewisePrediction, nrow = 2)
# Cross-validation result
# Stack the per-model accuracy summaries (the SVM row without its `kernel` column) and add a
# display label of the form "mean ± sd", both rounded to one decimal
cv.accuracy <- mutate(
  Reduce(rbind, list(cv.LDA, cv.LR, cv.NB, cv.RF, select(cv.svm, -kernel))),
  Accuracy = paste(round(accuracy.mean, 1), "±", round(accuracy.sd, 1))
)
# set up theme for pure text
# Text elements are kept in white as placeholders so the axes align with the confusion matrix
theme.pureText <- theme_void() +
  theme(
    axis.text = element_text(colour = "white"), # y
    # large size helps the text align with the confusion matrix (title with row gap)
    axis.title = element_text(colour = "white", size = 32),
    # x title and text blank to reduce the gap between text rows
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.ticks = element_blank()
  )
# Ensure the model order is the same as shown in the confusion matrix
# Text-only row showing the cross-validation accuracy label of each model
plt.accuracy.cv <- ggplot(cv.accuracy, aes(x = model, y = 1)) +
  geom_text(aes(label = Accuracy, fontface = "bold")) +
  theme.pureText
plt.accuracy.cv
# Accuracy on the entire training and testing dataset, in the same model order for both vectors
model <- c("LDA", "LR", "NB", "RF", "SVM")
training <- c(accuracy.training.LDA, accuracy.training.LR, accuracy.training.NB,
              accuracy.training.RF, accuracy.training.svm)
testing <- c(accuracy.testing.lda, accuracy.testing.LR, accuracy.testing.NB,
             accuracy.testing.RF, accuracy.testing.svm)
d.accuracy.train.test <- data.frame(model = model,
                                    accuracy.training = training,
                                    accuracy.testing = testing)
# Text-only row: training accuracy rounded to one decimal
plt.accuracy.Training <- ggplot(d.accuracy.train.test, aes(x = model, y = 1)) +
  geom_text(aes(label = round(accuracy.training, 1), fontface = "bold")) +
  theme.pureText
plt.accuracy.Training
# Accuracy on the testing set: text-only row, rounded to two decimals
plt.accuracy.Testing <- ggplot(d.accuracy.train.test, aes(x = model, y = 1)) +
  geom_text(aes(label = round(accuracy.testing, 2)),
            fontface = "bold") +
  theme.pureText
plt.accuracy.Testing
# PLOT
# 7.15 X 3.06 on big screen for optimal output!!
# Three text rows of accuracies stacked above the confusion matrix
plt.accuracy.confusionMatrix <- plot_grid(
  plt.accuracy.cv, plt.accuracy.Training, plt.accuracy.Testing, plt.confusionMatrix,
  rel_heights = c(1, 1, 1, 7), nrow = 4,
  labels = c("A", "B", "C", "D"),
  label_size = 15, label_x = .03,
  label_colour = "black"
)
plt.accuracy.confusionMatrix
# A, prediction accuracy of the 5-fold cross-validation within the training set; B, prediction
# accuracy on the training set using models based on the entire training set; C, accuracy on the
# testing set using models based on the entire training set.
# Version for paper: temporarily hide the legend for optimal layout, then add it manually in PPT
# Note 7.0 X 4.5 dimension on big screen !!
plt.samplewisePrediction.paperVersion <- plot_grid(
  plt.predictionResult,
  plt.probabilityDistribution + theme(legend.position = "none"),
  labels = c("E", "F"), label_size = 15, rel_widths = c(2.5, 4),
  label_x = .03,
  nrow = 1
)
# Prediction result all in all
# 7 X 7 on big screen for optimal layout
plot_grid(
  plt.accuracy.confusionMatrix,
  plt.samplewisePrediction.paperVersion,
  nrow = 2, rel_heights = c(2.5, 4)
)
# model interpretation
# Random forest
# Individual conditional expectation (ICE) plot of the global random forest `mdl.rf`.
#   feature : name (string) of a column of the global `trainingSet.scaled`
# For every training example, `feature` is swept over 100 equally spaced values between its
# observed minimum and maximum while all other columns keep their values; the predicted class
# probabilities are drawn as one thin line per example plus a thick mean line per actual type.
# Returns the ggplot object.
func.plot.ICE.RF <- function(feature) {
  lowerBound <- min(trainingSet.scaled[[feature]])
  upperBound <- max(trainingSet.scaled[[feature]])
  # unique instance code for each training example, placed as the first column
  ICE <- trainingSet.scaled %>%
    mutate(instance = seq_len(nrow(trainingSet.scaled))) %>%
    select(instance, everything())
  # every instance paired with every grid value, joined back to its feature values
  ICE.grid <- expand.grid(instance = ICE$instance,
                          grid = seq(lowerBound, upperBound, length.out = 100)) %>%
    left_join(ICE, by = "instance") %>%
    as_tibble() %>%
    rename(actual.type = type)
  # update feature of interest without changing feature column order
  ICE.grid[[feature]] <- ICE.grid$grid
  feature.grid <- ICE.grid %>% select(-c(grid, instance))
  # Random forest class probabilities for every (instance, grid value) row
  ICE.fitted <- predict(mdl.rf, newdata = feature.grid, type = "prob") %>% as_tibble()
  # Individual instance: long format over the three probability columns
  ICE.fitted.tidy <- ICE.fitted %>%
    mutate(instance = ICE.grid$instance, grid = ICE.grid$grid,
           actual.type = ICE.grid$actual.type,
           instance = as.numeric(instance)) %>%
    gather(1:3, key = predicted.type, value = fitted.prob)
  # the overall trend: mean probability per actual type, predicted type and grid value
  ICE.fitted.tidy.OVERAL <- ICE.fitted.tidy %>%
    group_by(actual.type, predicted.type, grid) %>%
    summarise(fitted.prob = mean(fitted.prob))
  # plot
  plt.ICE <- ICE.fitted.tidy %>%
    ggplot(aes(x = grid, y = fitted.prob, color = actual.type)) +
    geom_line(aes(group = instance), alpha = .3) +
    facet_wrap(~predicted.type, nrow = 1) +
    scale_color_manual(values = color.types) +
    labs(title = paste0(feature, " (Random Forest)"),
         x = "Standard deviation grids",
         y = "Predicted probability for each class",
         caption = "color by actual type, faceted by predicted type") +
    # overall trend as top layer
    geom_line(data = ICE.fitted.tidy.OVERAL, size = 2) +
    # rug of the observed feature values
    geom_rug(data = trainingSet.scaled, aes_string(x = feature),
             inherit.aes = FALSE, alpha = .3) +
    # Turning point is usually much earlier than grid sd 2, so the upper display limit is
    # fixed at 2 manually instead of using the automatic range given by `upperBound`
    coord_cartesian(xlim = c(lowerBound, 2)) +
    scale_y_continuous(breaks = seq(0, 1, by = .2))
  plt.ICE
}
# logistic (softmax) regression
# Individual conditional expectation (ICE) plot of the global cv.glmnet fit `softmax.cv`.
#   feature : name (string) of a column of the global `trainingSet.scaled`
# For every training example, `feature` is swept over 100 equally spaced values between its
# observed minimum and maximum while all other columns keep their values; the predicted class
# probabilities (at lambda.1se) are drawn as one thin line per example plus a thick mean line
# per actual type. Returns the ggplot object.
func.plot.ICE.logistic <- function(feature) {
  lowerBound <- min(trainingSet.scaled[[feature]])
  upperBound <- max(trainingSet.scaled[[feature]])
  # unique instance code for each training example, placed as the first column
  ICE <- trainingSet.scaled %>%
    mutate(instance = seq_len(nrow(trainingSet.scaled))) %>%
    select(instance, everything())
  # every instance paired with every grid value, joined back to its feature values
  ICE.grid <- expand.grid(instance = ICE$instance,
                          grid = seq(lowerBound, upperBound, length.out = 100)) %>%
    left_join(ICE, by = "instance") %>%
    as_tibble() %>%
    rename(actual.type = type)
  # update feature of interest without changing feature column order
  ICE.grid[[feature]] <- ICE.grid$grid
  feature.grid <- ICE.grid %>% select(-c(grid, instance))
  # logistic regression: the first column (actual.type) is dropped to leave the predictor matrix.
  # The stray empty argument (",,") of the former call is removed and the deprecated
  # as.tibble() is replaced by as_tibble().
  ICE.fitted <- predict(softmax.cv, newx = feature.grid[, -1] %>% as.matrix(),
                        s = softmax.cv$lambda.1se, type = "response") %>%
    as_tibble() %>%
    rename(adulterated_L_J = adulterated_L_J.1,
           authentic_L_J = authentic_L_J.1,
           lemonade = lemonade.1)
  # Individual instance: long format over the three probability columns
  ICE.fitted.tidy <- ICE.fitted %>%
    mutate(instance = ICE.grid$instance, grid = ICE.grid$grid,
           actual.type = ICE.grid$actual.type,
           instance = as.numeric(instance)) %>%
    gather(1:3, key = predicted.type, value = fitted.prob)
  # the overall trend: mean probability per actual type, predicted type and grid value
  ICE.fitted.tidy.OVERAL <- ICE.fitted.tidy %>%
    group_by(actual.type, predicted.type, grid) %>%
    summarise(fitted.prob = mean(fitted.prob))
  # plot
  plt.ICE <- ICE.fitted.tidy %>%
    ggplot(aes(x = grid, y = fitted.prob, color = actual.type)) +
    geom_line(aes(group = instance), alpha = .3) +
    facet_wrap(~predicted.type, nrow = 1) +
    scale_color_manual(values = color.types) +
    labs(title = paste0(feature, " (Logistic regression)"),
         x = "Standard deviation grids",
         y = "Predicted probability for each class",
         caption = "color by actual type, faceted by predicted type") +
    # overall trend as top layer
    geom_line(data = ICE.fitted.tidy.OVERAL, size = 2) +
    # rug of the observed feature values
    geom_rug(data = trainingSet.scaled, aes_string(x = feature),
             inherit.aes = FALSE, alpha = .3) +
    # Turning point is usually much earlier than grid sd 2, so the upper display limit is
    # fixed at 2 manually instead of using the automatic range given by `upperBound`
    coord_cartesian(xlim = c(lowerBound, 2)) +
    scale_y_continuous(breaks = seq(0, 1, by = .2))
  plt.ICE
}
# Model interpretation comparison: RF vs. LR
# Three-row figure for one feature: (A) ICE plot of the logistic regression, (B) ICE plot of the
# random forest, (C) density plots of the feature in the global data `d`.
#   featureCode : position of the feature among the columns of `trainingSet` after the first one
func.plt.ICE.modelComparison.distribution <- function(featureCode = 1) {
  lemonFeatures <- colnames(trainingSet)[-1]
  featureName <- lemonFeatures[featureCode]
  panelLogistic <- func.plot.ICE.logistic(feature = featureName)
  panelForest <- func.plot.ICE.RF(feature = featureName)
  # authentic vs. adulterated
  densityWithoutLemonade <- ggplot(
    filter(d, type != "lemonade"),
    aes_string(x = featureName, fill = "type", color = "type")
  ) +
    geom_density(alpha = .2, position = "dodge") +
    scale_color_manual(values = color.types) +
    scale_fill_manual(values = color.types) +
    theme(legend.position = "none")
  # all three classes
  densityAllTypes <- ggplot(
    d,
    aes_string(x = featureName, fill = "type", color = "type")
  ) +
    geom_density(alpha = .2, position = "dodge") +
    scale_color_manual(values = color.types) +
    scale_fill_manual(values = color.types)
  # distribution row
  panelDensity <- plot_grid(densityWithoutLemonade, densityAllTypes,
                            nrow = 1, rel_widths = c(4, 5))
  # layout
  plot_grid(
    panelLogistic,
    panelForest,
    panelDensity,
    nrow = 3, rel_heights = c(1, 1, .7), labels = c("A", "B", "C"), label_size = 17
  )
}
# Draw the model comparison figure for the second feature
func.plt.ICE.modelComparison.distribution(featureCode = 2)
library(readxl)
library(rebus)
library(stringr)
library(ggrepel)
library(gridExtra)
library(cowplot)
library(RColorBrewer)
library(viridis)
library(ggcorrplot)
library(ggsci)
library(plotly)
# machine learning packages
library(glmnet)
library(MASS)
library(e1071)
library(rsample)
library(randomForest)
# finally load tidyverse avoiding key functions from being masked
library(tidyverse)
# Fixed seed for the random number generator
set.seed(2020)
# Default ggplot theme for the session: black-and-white base with bold facet strips
theme_set(
  theme_bw() +
    theme(
      strip.background = element_blank(),
      strip.text = element_text(face = "bold", size = 11),
      legend.text = element_text(size = 10),
      legend.title = element_blank(),
      axis.text = element_text(size = 11, colour = "black"),
      title = element_text(colour = "black", face = "bold"),
      axis.title = element_text(size = 12)
    )
)
# global color set, named by sample type
color.types <- c(
  adulterated_L_J = "firebrick",
  authentic_L_J = "steelblue",
  lemonade = "darkgreen"
)
# Workbook with the measurements (absolute path on the author's machine).
# Two earlier attempts used a path with an accidentally duplicated prefix; only the
# well-formed path is kept, so the script no longer stops at a failing read_excel() call.
path <- "/Users/Boyuan/Desktop/My publication/14. Lemon juice (Weiting)/publish ready files/Lemon paper data_B.Y.xlsx"
d <- read_excel(path, sheet = "March")
# No. 63-66 belong to commercially sourced lemon juices and are excluded
d <- d %>% filter(!code %in% 63:66)
# Replace special values
# Substitute one kind of non-numeric placeholder in a vector of measurements.
#   x             : vector of values (numbers stored as text, possibly with placeholders / NA)
#   searchPattern : placeholder to replace, matched literally; one of
#                     "T."   trace level     -> one fifth of the smallest numeric value in x
#                     "n.d." not detected    -> "0"
#                     "LOD"  beyond the instrument limit -> double the largest numeric value in x
# Returns a character vector with the placeholder replaced. Any other searchPattern, or a
# "T." / "LOD" request on a vector without numeric values, returns x unchanged.
vectorReplace <- function(x, searchPattern) {
  # placeholders are expected to fail numeric conversion, so the coercion warning is silenced
  numericValues <- suppressWarnings(as.numeric(x))
  numericValues <- numericValues[!is.na(numericValues)]
  hasNumbers <- length(numericValues) > 0
  replaceWith <- switch(
    searchPattern,
    # arbitrarily replace trace level with one fifth of the minimum
    "T." = if (hasNumbers) as.character(min(numericValues) / 5) else NA_character_,
    # arbitrarily set non-detected level as content being zero
    "n.d." = "0",
    # for content whose UV absorption is beyond the instrument limit, use double the maximum
    "LOD" = if (hasNumbers) as.character(max(numericValues) * 2) else NA_character_,
    NA_character_
  )
  # only perform the replacement for the special values handled above
  if (is.na(replaceWith)) {
    return(x)
  }
  # fixed = TRUE: the "." in "T." and "n.d." is a literal dot, not a regex wildcard
  gsub(searchPattern, replaceWith, x, fixed = TRUE)
}
# Measurement columns only (the first four columns hold the sample id information)
dd <- d[, -c(1:4)]
# Replace the special values column by column: trace level, then not detected, then beyond limit
dd <- apply(dd, 2, vectorReplace, searchPattern = "T.")
dd <- apply(dd, 2, vectorReplace, searchPattern = "n.d.")
dd <- as_tibble(apply(dd, 2, vectorReplace, searchPattern = "LOD"))
# Sample id information next to the content converted to numeric values
d <- as_tibble(
  cbind(
    d[, c(1:4)],
    as_tibble(apply(dd, 2, as.numeric))
  )
)
# convert code into an ordered factor whose levels are the codes in reversed row order
d$code <- factor(d$code, levels = rev(d$code), ordered = TRUE)
# Launch the Shiny classification app once. The call is namespace-qualified because shiny is
# never attached in this file, so a bare runApp() would not be found.
shiny::runApp('~/Desktop/My publication/14. Lemon juice (Weiting)/Shiny/LemonClassification')