gbm-state.Rmd

---
title: "Gradient Boosting Machines by State"
description: |
 This R script trains gradient boosting machines (GBM) models to perform the state classification task (i.e., predict homeostatic vs. intermediate vs. reactive).
author:
  - first_name: "Ayush"
    last_name: "Noori"
    url: https://www.github.com/ayushnoori
    affiliation: Massachusetts General Hospital
    affiliation_url: https://www.serranopozolab.org
    orcid_id: 0000-0003-1420-1236
output:
  distill::distill_article:
    toc: true
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(eval = FALSE)
```

# Dependencies

Load requisite packages and define directories.

```{r load-packages, message=FALSE, warning=FALSE}

# data manipulation
library(data.table)
library(purrr)
library(magrittr)

# data visualization
library(ggplot2)

# fast file system operations
library(fs)

# gradient boosted machines
library(caret)
library(gbm)

# ROC curve
library(pROC)
library(plotROC)

# utility functions
library(brainstorm)

```

Note that directories are relative to the R project path.

```{r define-directores}

# create file structure
celltypes = c("Astrocyte", "Microglia", "Vessel") %>% purrr::set_names()

# set directories
ddir = file.path("Results", "CNN", "1.2 - State Partition")
dir4 = file.path("Results", "4 - Spectral Clustering")
dir7 = file.path("Results", "7 - Gradient Boosting Machines", "State")

# create directory
if(!dir.exists(dir7)) {dir.create(dir7)}

# set seed
set.seed(1234)

```

# Load Data

Load processed ROI measurement data from the `4 - Spectral Clustering` directory. **Note that this script uses the same train/test/validation split as the convolutional neural network (CNN) by loading the data object created by the `CNN/1 - Partition ROIs` script.**

```{r load-data}

all = readRDS(file.path(dir4, "Z-Score Data.rds"))
split = readRDS(file.path(ddir, "ROI Partition by State.rds"))

```

# Merge Data

Define function to merge ROI measurement data with the predetermined train/test/validation split.

```{r merge-data}

# function to parse and merge data
merge_data = function(allx, splitx) {
  
  # parse split data
  splitx = splitx %>%
    .[, ID := gsub("(\\.tif|AD_|CTRL_)", "", Name)] %>%
    .[, .(ID, Partition)]

  # merge data
  allx = merge(allx, splitx, by = "ID", all.x = TRUE, all.y = FALSE)
  return(allx)
  
}

```

Map function over data objects for cell-types with CNN output data.

```{r map-merge}

# function to parse and merge data
all = map(celltypes, ~merge_data(all[[.x]], split[[.x]]))

```

# Train GBM Model

Define function to plot variable importance scores.

```{r plot-imp}

# function to plot variable importance
plot_imp = function(imp) {
  
  # convert to data table
  imp = as.data.table(imp$importance, keep.rownames = "Marker") %>%
    setnames("Overall", "Importance") %>%
    .[order(-Importance)] %>%
    .[, Marker := factor(Marker, levels = rev(Marker))]
  
  # plot variable importance
  p = ggplot(imp, aes(x = Importance, y = Marker, fill = Importance,
                      label = round(Importance, 2))) +
    geom_bar(stat = "identity", width = 0.7, color = "black") +
    scale_fill_gradient(low = "#EADA86", high = "#B67977") +
    geom_text(size = 3, hjust = 1.2, fontface = "bold")+
    scale_x_continuous(limits = c(-0.4, max(imp$Importance)), 
                       expand = expansion(mult = c(0, 0.05))) +
    theme_bw() +
    theme(plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
          axis.title.y = element_blank(),
          axis.text.y = element_text(size = 10, color = "black"),
          axis.title.x = element_text(size = 12, face = "bold"),
          axis.ticks.x = element_line(color = "black"),
          axis.ticks.y = element_blank(),
          panel.border = element_rect(color = "black", fill = NA, size = 1),
          legend.position = "none")
  
  return(p)
  
}

```

Define function to train the gradient boosting machines (GBM) model and save output

```{r train-gbm}

train_gbm = function(dat, lab, mx) {
  
  # create subdirectory if needed
  wdir = file.path(dir7, lab)
  if(!dir.exists(wdir)) {dir.create(wdir)}
  
  # partition data into training/test
  train_dat =  dat[Partition %in% c("Train", "Validation"), ]
  test_dat = dat[Partition == "Test", ]
  
  # establish 10-fold cross validation to determine the out-of-sample error
  tC = trainControl(method = "cv", number = 10, savePredictions = TRUE,
                    classProbs = TRUE, verboseIter = TRUE)
  
  # estimate pre-processing transformation (centering, scaling, remove zero
  # variance) from training data, apply to all data
  normalize = preProcess(train_dat[, ..mx],
                         method = c("center", "scale", "zv"), verbose = TRUE)
  train_dat[, (mx) := predict(normalize, train_dat[, mx, with = FALSE])]
  test_dat[, (mx) := predict(normalize, test_dat[, mx, with = FALSE])]
  
  # train model on training set (80%)
  gbm_model = train(x = train_dat[, ..mx], y = train_dat[, State],
                    method = "gbm", trControl = tC)
  gbm_imp = varImp(gbm_model, scale = FALSE)
  
  # plot variable importance
  ggsave(file.path(wdir, "Variable Importance.pdf"), plot_imp(gbm_imp),
         width = 8, height = 2.5 + length(mx)/2)
  
  # test model on test set (20%)
  gbm_pred = predict(gbm_model, test_dat[, ..mx])
  gbm_cm = confusionMatrix(gbm_pred, test_dat[, State])
  gbm_prob = data.table(predict(gbm_model, test_dat[, ..mx], type = "prob"),
                        test_dat[, .(State)])
  
  # write GBM probabilities to file
  fwrite(gbm_prob, file.path(wdir, "GBM Probabilities.csv"))
  
  # calculate multiclass AUC
  roc_calc = multiclass.roc(response = gbm_prob$State, predictor = gbm_prob[, .(Homeostatic, Intermediate, Reactive)])
  
  # define aes values
  h_i = copy(gbm_prob)[State %in% c("Homeostatic", "Intermediate")][, State := droplevels(State)]
  h_r = copy(gbm_prob)[State %in% c("Homeostatic", "Reactive")][, State := droplevels(State)]
  i_r = copy(gbm_prob)[State %in% c("Intermediate", "Reactive")][, State := droplevels(State)]
  
  # plot multiclass AUC curve
  roc_plot = ggplot() +
    ggtitle("Gradient Boosting Machines ROC") +
    geom_abline(aes(intercept = 0, slope = 1), linetype = "dashed", size = 1, color = "gray") +
    geom_roc(data = h_i, aes(d = State, m = Intermediate, color = "Homeostatic/Intermediate"), labels = FALSE, pointsize = 0) +
    geom_roc(data = i_r, aes(d = State, m = Reactive, color = "Intermediate/Reactive"), labels = FALSE, pointsize = 0) +
    geom_roc(data = h_r, aes(d = State, m = Reactive, color = "Reactive/Homeostatic"), labels = FALSE, pointsize = 0) +
    scale_colour_manual(values = c("#39B200", "#F0C808", "#960200")) +
    labs(x = "1 - Specificity", y = "Sensitivity", color = "Comparison", subtitle = paste("Multi-Class AUC =", round(as.numeric(roc_calc$auc), 4))) + theme_bw() +
    theme(plot.title = element_text(size = 16, face = "bold"),
          plot.subtitle = element_text(face = "italic"),
          axis.title.x = element_text(size=14, face="bold"),
          axis.title.y = element_text(size=14, face="bold"),
          legend.title = element_text(size=12, face="bold"),
          legend.text = element_text(size=12),
          legend.position = c(0.72, 0.17),
          legend.background = element_rect(fill = "white", color = "black"),
          panel.border = element_rect(color = "black", fill = NA, size = 1))
  
  # save ROC curve
  ggsave(file.path(wdir, "Multi-Class ROC Curve.pdf"), roc_plot, width = 6, height = 6.2)
  
  # send output to external file
  logf = file.path(wdir, "GBM Log.txt")
  if (file.exists(logf)) { file.remove(logf) }
  
  # establish sink
  sink(logf)
  gbm_model %>%
    list(., .$results, .$finalModel, gbm_imp,
         as.matrix(gbm_imp$importance), gbm_cm) %>%
    walk(., print)
  cat(paste0("\nMulti-Class AUC = ", round(as.numeric(roc_calc$auc), 4)))
  sink()
  
  # return full list of output
  return(list(Test = test_dat, Train = train_dat, Model = gbm_model,
              Prediction = gbm_pred, CM = gbm_cm, Scores = gbm_prob,
              ROC = roc_calc, Importance = gbm_imp))
  
}

```

# Perform Machine Learning

Train the GBM model by mapping over `all`.

```{r perform-ml}

# define markers of interest
markers = list(Astrocyte = c("GFAP", "YKL40", "VIM", "TSPO",
                             "EAAT1", "EAAT2", "GS"),
               Microglia = c("MHC2", "CD68", "TMEM119", "TSPO", "FTL"),
               Vessel = c("GFAP", "YKL40", "VIM", "TSPO", "EAAT1", "EAAT2", "GS"))

# train GBM model
gbm_results = imap(all, ~train_gbm(.x, .y, markers[[.y]]))

# save output
saveRDS(gbm_results, file.path(dir7, "GBM Results.rds"))

```