From d190bc877f55e01d39f19b99b68eb944e49e650f Mon Sep 17 00:00:00 2001
From: CamiloPosso <camilo.posso@pnnl.gov>
Date: Tue, 1 Nov 2022 09:16:29 -0700
Subject: [PATCH] Small updates to plots + subtype AUC interaction

Added a few small changes to plots, like removing vitalStatus from the NMF heatmaps.

Also included the subtype + AUC interaction script.
---
 .gitignore                                    |   6 +
 cohort_summary/NMF/NMF_raw_results.Rmd        |   2 +-
 cohort_summary/NMF/NMF_summary.Rmd            |  14 +-
 ...net_multinomial_predictions_cell_lines.Rmd |  14 +-
 ...k_4_enet_multinomial_predictions_pilot.Rmd |  13 +
 cohort_summary/NMF/subtype_drivers.Rmd        |   8 +-
 cohort_summary/NMF/subtype_drivers_159.Rmd    | 412 ++++++++++++++++
 cohort_summary/NMF/subtype_drivers_alt.Rmd    | 417 ++++++++++++++++
 .../subtype_added_value.Rmd                   | 452 ++++++++++++++++++
 proteomics/1-process_global_data.Rmd          |   3 +-
 util/mutational_analysis_helper.R             |   2 +-
 11 files changed, 1325 insertions(+), 18 deletions(-)
 create mode 100644 cohort_summary/NMF/subtype_drivers_159.Rmd
 create mode 100644 cohort_summary/NMF/subtype_drivers_alt.Rmd
 create mode 100644 cohort_summary/NMF/subtype_mutation_AUC/subtype_added_value.Rmd

diff --git a/.gitignore b/.gitignore
index 6a9de60..402206c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,9 @@ cohort_summary/NMF/multinomial_models_k_4_elastic_net/cell_line_data_notes.txt
 cohort_summary/NMF/peptide_subtype_interaction/Protein peptide deviation Vlad.pptx
 cohort_summary/NMF/peptide_subtype_interaction/Protein Peptide Deviation PTRC tuesday.pptx
 cohort_summary/supplemental_table_1.txt
+cohort_summary/NMF/cophenetic_correlation.txt
+cohort_summary/NMF/logFC.txt
+cohort_summary/NMF/subtype_mutation_AUC/drug_response.csv
+cohort_summary/NMF/subtype_mutation_AUC/Figures
+Figures
+cohort_summary/NMF/diving_into_subtype_drivers.Rmd
diff --git a/cohort_summary/NMF/NMF_raw_results.Rmd b/cohort_summary/NMF/NMF_raw_results.Rmd
index 1496920..3aeeff7 100644
--- a/cohort_summary/NMF/NMF_raw_results.Rmd
+++ b/cohort_summary/NMF/NMF_raw_results.Rmd
@@ -12,12 +12,12 @@ library(dplyr)
 library(tibble)
 library(tidyr)
 
-
 source("../../util/synapseUtil.R")
 source("../../util/loading_data.R")
 source("NMF_helper.R")
 load.combined.data()
 
+
 ```
 
 
diff --git a/cohort_summary/NMF/NMF_summary.Rmd b/cohort_summary/NMF/NMF_summary.Rmd
index 80715d5..3cd9912 100644
--- a/cohort_summary/NMF/NMF_summary.Rmd
+++ b/cohort_summary/NMF/NMF_summary.Rmd
@@ -120,6 +120,9 @@ coph.correlation <- sapply(results, cophcor) %>%
   data.frame(coph.cor = ., k = 2:8) %>%
   filter(k <= 8)
 
+write.table(coph.correlation, file = "cophenetic_correlation.txt", 
+            sep = "\t", quote = F, row.names = F)
+
 p <- ggplot(coph.correlation, aes(x = k, y = coph.cor)) + geom_line() + 
   ylab("Cophenetic correlation") + scale_x_continuous(breaks = 2:8)
 
@@ -229,7 +232,7 @@ fab_annotations <- summary.table %>%
 ## pheatmap column labels
 ann.labels <- meta %>%
   filter(Barcode.ID %in% samples.common) %>%
-  select(FLT3.ITD, PostChemotherapy, vitalStatus) %>%
+  select(FLT3.ITD, PostChemotherapy) %>%
   mutate(FLT3.ITD = as.factor(FLT3.ITD),
          PostChemotherapy = as.factor(PostChemotherapy),
          Barcode.ID = rownames(.))
@@ -260,12 +263,12 @@ ann.labels <- ann.labels %>%
   mutate(FAB = factor(FAB, levels = c("M0", "M1", "M2", "M3", "M4", "M5", "None"))) %>%
   column_to_rownames("Barcode.ID") %>%
   as.data.frame() %>%
-  select(PostChemotherapy, NPM1, NPM1_clinical, RAS, vitalStatus, FLT3.ITD, FLT3_MAF, FLT3_AR, FAB) %>%
+  select(PostChemotherapy, NPM1, NPM1_clinical, RAS, FLT3.ITD, FLT3_MAF, FLT3_AR, FAB) %>%
   dplyr::rename(`FLT3_MAF (percentage)` = FLT3_MAF,
                 `RAS vaf` = RAS)
 
 ## Slimming down annotations
-ann.labels <- ann.labels[, c("RAS vaf", "FLT3.ITD", "NPM1_clinical", "FAB",  "vitalStatus")]
+ann.labels <- ann.labels[, c("RAS vaf", "FLT3.ITD", "NPM1_clinical", "FAB")] #,  "vitalStatus")]
 
 ## White to dark purple (like the snow storms on weather maps)
 colors.wp <- scales::seq_gradient_pal("#FFFFFF", "#4A0078", "Lab")(seq(0,1,length.out = p.celing*10 + 1))
@@ -282,9 +285,10 @@ ann.colors <- lapply(names(ann.labels), function(x){
       "None" = "white")
   } else if (length(levels(ann.labels[[x]])) == 2) {
     c(`FALSE` = "white", `TRUE` = "darkgrey")
-  } else {
-    c("Alive" = "white", "Dead" = "darkgrey", "Unknown" = "brown")
   }
+  # } else {
+  #   c("Alive" = "white", "Dead" = "darkgrey", "Unknown" = "brown")
+  # }
   })
 names(ann.colors) <- names(ann.labels)
 
diff --git a/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_cell_lines.Rmd b/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_cell_lines.Rmd
index 7a853cc..4c3cdd4 100644
--- a/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_cell_lines.Rmd
+++ b/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_cell_lines.Rmd
@@ -269,6 +269,7 @@ combined <- get_probabilities(tram$`Global msnset`, tram_comp$`Global msnset`) %
 
 
 ```{r Quizartinib Resistance setup}
+##Updating for paper
 
 ## Quizartinib Resistance Proteomics Data
 quizProtData <- querySynapseTable('syn23595222') %>%
@@ -621,6 +622,7 @@ ggsave(plot = p, filename = path_str, height = 5, width = 7)
 
 
 ```{r Quizartinib Resistance}
+##Updating for paper
 m <- quiz$`Global msnset`
 plot_title <- "Quizartinib Resistance"
 path_str <- tolower(plot_title) %>% gsub(" ", "_", .)
@@ -680,10 +682,9 @@ alluvial_df <- quiz_meta %>%
 p3 <- ggplot(alluvial_df, 
              aes(x = CellLine, y = Proportion, alluvium = Subtype)) +
   geom_alluvium(aes(fill = Subtype)) + scale_fill_manual(values = subtype_colors) +
-  scale_x_discrete(expand = c(0.15, 0)) +
-  ggtitle(plot_title) + scale_x_discrete(expand = c(0.3, 0)) + 
+  ggtitle(plot_title) +
   theme(axis.title.x = element_blank(),
-        axis.text.x = element_text(size = 7))
+        axis.text.x = element_text(size = 9, angle = 45, vjust = .65))
 
 path_str <- sub(".pdf$", "_no_ligand.pdf", path_str)
 ggsave(plot = p3, filename = path_str, height = 5.2, width = 10)
@@ -708,13 +709,12 @@ alluvial_df <- quiz_meta %>%
 p4 <- ggplot(alluvial_df, 
              aes(x = CellLine, y = Proportion, alluvium = Subtype)) +
   geom_alluvium(aes(fill = Subtype)) + scale_fill_manual(values = subtype_colors) +
-  scale_x_discrete(expand = c(0.3, 0)) +
-  ggtitle(plot_title) + scale_x_discrete(expand = c(0.3, 0)) + 
+  ggtitle(plot_title) +
   theme(axis.title.x = element_blank(),
-        axis.text.x = element_text(size = 7))
+        axis.text.x = element_text(size = 9, angle = 45, vjust = .65))
 
 path_str <- sub(".pdf$", "_v2.pdf", path_str)
-ggsave(plot = p4, filename = path_str, height = 5.2, width = 7)
+ggsave(plot = p4, filename = path_str, height = 5.2, width = 8)
 
 
 ```
diff --git a/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_pilot.Rmd b/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_pilot.Rmd
index acb3bce..8f542e1 100644
--- a/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_pilot.Rmd
+++ b/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_pilot.Rmd
@@ -275,6 +275,19 @@ drug_data <- auc.dat %>%
 
 ```
 
+
+```{r}
+table_df <- pilot_meta %>%
+  filter(exp_3_new) %>%
+  select(Sample = sample, Subtype, `New to 210-patient cohort` = exp_3_new)
+write.table(table_df, "enet_multinomial_data/experiment_3_new_samples.txt", sep = "\t", 
+            col.names = TRUE, row.names = FALSE, quote = F)
+
+
+```
+
+
+
 ```{r}
 cohort_subtypes <- read.table(syn$get("syn30030154")$path) %>%
   dplyr::rename(Subtype = Cluster) %>%
diff --git a/cohort_summary/NMF/subtype_drivers.Rmd b/cohort_summary/NMF/subtype_drivers.Rmd
index 3804a22..16da916 100644
--- a/cohort_summary/NMF/subtype_drivers.Rmd
+++ b/cohort_summary/NMF/subtype_drivers.Rmd
@@ -176,9 +176,10 @@ ksea_wrap <- function(m, var_name, top_n = 7, font_size = 7, ending = "", ...){
     plot_path <- paste0("Data/subtype_drivers/ksea_", var_name, ".pdf")
   }
   
+  colnames(ksea_mat) <- sub("^Cluster", "Subtype", colnames(ksea_mat))
   heatmap_breaks <- seq(-5, 5, by = 0.2)
   heatmap_colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(length(heatmap_breaks))
-  p <- pheatmap(ksea_mat, display_numbers = ksea_mat_p2, cluster_cols = F, fontsize = 9,
+  p <- pheatmap(ksea_mat, display_numbers = ksea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15,
                 fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, 
                 breaks = heatmap_breaks, color = heatmap_colors, width = 6, 
                 filename = plot_path, ...)
@@ -274,10 +275,11 @@ rank_gsea <- function(m, var_name, t2g, wrap_width = 40, font_size = 7,
     plot_path <- paste0("Data/subtype_drivers/rank_gsea_", var_name, ".pdf")
   }
   
-  p <- pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9,
+  colnames(gsea_mat) <- sub("^Cluster", "Subtype", colnames(gsea_mat))
+  p <- pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15,
                 fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, 
                 width = 6, filename = plot_path, ...)
-  pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9,
+  pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15,
            fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, 
            width = 6, filename = sub(".pdf$", ".png", plot_path), ...)
            
diff --git a/cohort_summary/NMF/subtype_drivers_159.Rmd b/cohort_summary/NMF/subtype_drivers_159.Rmd
new file mode 100644
index 0000000..c427650
--- /dev/null
+++ b/cohort_summary/NMF/subtype_drivers_159.Rmd
@@ -0,0 +1,412 @@
+---
+title: "Subtype drivers"
+author: "Camilo Posso"
+date: "06/13/2022"
+output: 
+  html_document:
+    code_folding: hide
+    toc: true
+editor_options: 
+  chunk_output_type: inline
+---
+
+## Goal
+
+
+The goal of this markdown is to find biological identifiers for the subtypes.
+
+
+```{r include=FALSE}
+library(clusterProfiler)
+library(dplyr)
+library(ggpubr)
+library(ggplot2)
+library(kableExtra)
+library(ggplotify)
+library(KSEAapp)
+
+source("../../util/synapseUtil.R")
+source("../../util/loading_data.R")
+source("../../util/mutational_analysis_helper.R")
+source("../../util/make_plots_util.R")
+
+syn <- synapseLogin()
+
+load("../../Misc/load.combined.data 3-09-2022.RData")
+# load.combined.data()
+metadata <- load.metadata()
+clusters <- read.table(syn$get("syn30030154")$path, sep = "\t")
+rownames(clusters) <- clusters$Barcode.ID
+metadata$Cluster <- clusters[rownames(metadata), "Cluster"]
+
+## Restricting to 159 RNA samples here
+rna_samples <- unique(RNA.data$Barcode.ID) 
+global.data <- global.data %>%
+  filter(Barcode.ID %in% rna_samples)
+phospho.data <- phospho.data %>%
+  filter(Barcode.ID %in% rna_samples)
+
+global_mat <- pivot_wider(global.data, names_from = "Barcode.ID", 
+                          values_from = "LogRatio") %>%
+  column_to_rownames("Gene")
+m_global <- MSnSet(exprs = global_mat %>% as.matrix(), 
+                   pData = metadata[colnames(global_mat), ])
+
+rna_mat <- pivot_wider(RNA.data, names_from = "Barcode.ID", 
+                          values_from = "RNA counts") %>%
+  column_to_rownames("Gene")
+m_rna <- MSnSet(exprs = rna_mat %>% as.matrix(), 
+                pData = metadata[colnames(rna_mat), ])
+
+phospho_mat <- pivot_wider(phospho.data %>% select(-Gene), names_from = "Barcode.ID", 
+                           values_from = "LogRatio") %>%
+  column_to_rownames("SiteID")
+m_phospho <- MSnSet(exprs = phospho_mat %>% as.matrix(), 
+                    pData = metadata[colnames(phospho_mat), ])
+
+```
+
+
+```{r}
+library(clusterProfiler)
+library(msigdbr)
+
+t2g_hallmark <- msigdbr(species = "Homo sapiens", category = "H") %>% 
+  dplyr::select(gs_name, gene_symbol)
+## 50 unique terms
+# unique(t2g_hallmark$gs_name)
+
+t2g_gobp <- msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP") %>% 
+  dplyr::select(gs_name, gene_symbol) %>%
+  mutate(gs_name = gsub("_", " ", gs_name))
+
+t2g_reactome <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
+  dplyr::filter(gs_subcat == "CP:REACTOME") %>%
+  dplyr::select(gs_name, gene_symbol) 
+
+t2g_biocarta <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
+  dplyr::filter(gs_subcat == "CP:BIOCARTA") %>%
+  dplyr::select(gs_name, gene_symbol) 
+
+t2g_kegg <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
+  dplyr::filter(gs_subcat == "CP:KEGG") %>%
+  dplyr::select(gs_name, gene_symbol) 
+
+t2g_wikipathways <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
+  dplyr::filter(gs_subcat == "CP:WIKIPATHWAYS") %>%
+  dplyr::select(gs_name, gene_symbol)
+
+t2g_PathwayInteractionDB <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
+  dplyr::filter(gs_subcat == "CP:PID") %>%
+  dplyr::select(gs_name, gene_symbol) 
+
+KSDB <- read.csv(system.file('PSP&NetworKIN_Kinase_Substrate_Dataset_July2016.csv',
+                             package='amlresistancenetworks'),stringsAsFactors = FALSE)
+
+
+```
+
+
+
+```{r}
+ksea_wrap <- function(m, var_name, top_n = 7, font_size = 7, ending = "", ...){
+  var_levels <- unique(pData(m)[[var_name]]) %>% sort()
+  combined_ksea <- data.frame()
+  exprs(m) <- sweep(exprs(m), 1, apply(exprs(m), 1, mean, na.rm = T), FUN = '-')
+    
+  for (level in var_levels){
+    fold_change_ingroup <- apply(exprs(m)[, m[[var_name]] == level], 1, mean, na.rm = T)
+    # fold_change_outgroup <- apply(exprs(m)[, m[[var_name]] != level], 1, mean, na.rm = T)
+    fold_change <- 2**fold_change_ingroup
+    PX <- data.frame(Protein = "NULL", Gene = names(fold_change), Peptide = "NULL", 
+                     Residue.Both = names(fold_change), p = "NULL", FC = fold_change) %>%
+      dplyr::mutate(Residue.Both = sub("^.*-", "", Residue.Both)) %>%
+      dplyr::mutate(Residue.Both = gsub("[a-z]", ";", Residue.Both)) %>%
+      dplyr::mutate(Residue.Both = gsub(";$", "", Residue.Both),
+                    Gene = sub("^(.*)-.*$", "\\1", Gene))
+  
+    combined_ksea <- KSEA.Scores(KSDB, PX, NetworKIN = TRUE, NetworKIN.cutoff = 5) %>%
+      dplyr::select(Kinase.Gene, m, FDR, z.score) %>%
+      dplyr::rename(pathway = Kinase.Gene, enrichment = z.score,
+                    adj_p_val = FDR, set_size = m) %>%
+      dplyr::mutate(data_type = "phospho",
+                    var_name = level,
+                    DB = "KSDB") %>%
+      filter(set_size >= 3) %>%
+      rbind(combined_ksea)
+  }
+  
+  combined_ksea$pathway_subtype_id <- paste(combined_ksea$pathway, 
+                                            combined_ksea$var_name, sep = " -- ")
+  rownames(combined_ksea) <- combined_ksea$pathway_subtype_id
+  write.table(combined_ksea, paste0("Data/subtype_drivers/ksea_159_", var_name, "_", 
+                                    ending, "_results.txt"), sep = "\t")
+  
+  chosen_pathways <- combined_ksea %>%
+    dplyr::group_by(var_name) %>%
+    dplyr::arrange(adj_p_val) %>%
+    dplyr::slice(1:top_n) %>%
+    dplyr::pull(pathway)
+  
+  combined_ksea_small <- combined_ksea %>%
+    dplyr::filter(pathway %in% chosen_pathways) %>%
+    dplyr::mutate(pathway = str_wrap(pathway, width = 50))
+  
+  ksea_mat <- combined_ksea_small %>%
+    dplyr::select(pathway, enrichment, var_name) %>%
+    pivot_wider(names_from = var_name, values_from = enrichment) %>%
+    column_to_rownames("pathway")
+  
+  ksea_mat_p <- combined_ksea_small %>%
+    dplyr::select(pathway, adj_p_val, var_name) %>%
+    pivot_wider(names_from = var_name, values_from = adj_p_val) %>%
+    column_to_rownames("pathway")
+  
+  ksea_mat_p <- ksea_mat_p %>% as.matrix()
+  ksea_mat_p[is.na(ksea_mat_p)] <- 1
+  ksea_mat_p <- apply(ksea_mat_p, 2, signif, digits = 3)
+  
+  ksea_mat_p <- ksea_mat_p[, var_levels]
+  ksea_mat_p2 <- ksea_mat_p
+  ksea_mat_p2[ksea_mat_p >= 0.01] <- ""
+  ksea_mat_p2[ksea_mat_p < 0.01] <- "*"
+  ksea_mat <- ksea_mat[, var_levels]
+  
+  title <- paste("KSEA", var_name, ending)
+  
+  if (nchar(ending) > 0){
+    ending <- tolower(ending)
+    var_name <- tolower(var_name)
+    plot_path <- paste0("Data/subtype_drivers/ksea_159_", var_name, "_", ending, ".pdf") %>%
+      gsub(" ", "_", .)
+  } else {
+    plot_path <- paste0("Data/subtype_drivers/ksea_159_", var_name, ".pdf")
+  }
+  
+  colnames(ksea_mat) <- sub("^Cluster", "Subtype", colnames(ksea_mat))
+  heatmap_breaks <- seq(-5, 5, by = 0.2)
+  heatmap_colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(length(heatmap_breaks))
+  p <- pheatmap(ksea_mat, display_numbers = ksea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15,
+                fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, 
+                breaks = heatmap_breaks, color = heatmap_colors, width = 6, 
+                filename = plot_path, ...)
+  return(p)
+  
+}
+
+
+```
+
+
+
+```{r}
+rank_gsea <- function(m, var_name, t2g, wrap_width = 40, font_size = 7, 
+                      top_n = 7, ending = "", ...){
+  var_levels <- unique(pData(m)[[var_name]]) %>% sort()
+  combined_gsea <- data.frame()
+  exprs(m) <- sweep(exprs(m), 1, apply(exprs(m), 1, mean, na.rm = T), FUN = '-')
+  
+  if (file.exists(paste0("Data/subtype_drivers/rank_gsea_159_", var_name, "_", 
+                         ending, "_results.txt"))){
+    print("Found GSEA results table.")
+    combined_gsea <- read.table(paste0("Data/subtype_drivers/rank_gsea_159_", var_name, "_", 
+                                       ending, "_results.txt"))
+  } else {
+    for (level in var_levels){
+      fold_change_ingroup <- apply(exprs(m)[, m[[var_name]] == level], 1, mean, na.rm = T)
+      # fold_change_outgroup <- apply(exprs(m)[, m[[var_name]] != level], 1, mean, na.rm = T)
+      fold_change <- fold_change_ingroup
+      fold_change <- sort(fold_change, decreasing = TRUE)
+      set.seed(69)
+      ## For some pathways, gsea is unable to assess a pvalue. These are removed by the function
+      ## Internally. This is why some pathways (like "GOBP NCRNA METABOLISM") are excluded from
+      ## the result.
+      combined_gsea <- GSEA(fold_change, eps = 1e-16, minGSSize = 10, 
+                            pvalueCutoff = 1, TERM2GENE = t2g)@result %>%
+        dplyr::select(Description, setSize, NES, pvalue, p.adjust, core_enrichment) %>%
+        dplyr::mutate(var_name = level) %>%
+        rbind(combined_gsea)
+    }
+  
+    combined_gsea$pathway_subtype_id <- paste(combined_gsea$Description, 
+                                              combined_gsea$var_name, sep = " -- ")
+    rownames(combined_gsea) <- combined_gsea$pathway_subtype_id
+    write.table(combined_gsea, paste0("Data/subtype_drivers/rank_gsea_159_", var_name, "_",
+                                      ending, "_results.txt"), sep = "\t")
+  }
+  
+  combined_gsea <- combined_gsea %>%
+    dplyr::mutate(Description = sub("^[A-Z]+_", "", Description)) %>%
+    dplyr::mutate(Description = sub("^GOBP ", "", Description))
+  
+  chosen_pathways <- combined_gsea %>%
+    dplyr::group_by(var_name) %>%
+    dplyr::arrange(p.adjust) %>%
+    dplyr::slice(1:top_n) %>%
+    dplyr::pull(Description)
+  
+  combined_gsea_small <- combined_gsea %>%
+    dplyr::filter(Description %in% chosen_pathways) %>%
+    dplyr::mutate(Description = str_wrap(Description, width = 50))
+  
+  gsea_mat <- combined_gsea_small %>%
+    dplyr::select(Description, NES, var_name) %>%
+    dplyr::mutate(Description_og = Description) %>%
+    dplyr::mutate(Description = str_wrap(gsub("_", " ", Description), width = wrap_width)) %>%
+    pivot_wider(names_from = var_name, values_from = NES) %>%
+    column_to_rownames("Description")
+  
+  gsea_mat_p <- combined_gsea_small %>%
+    dplyr::select(Description, p.adjust, var_name) %>%
+    pivot_wider(names_from = var_name, values_from = p.adjust) %>%
+    column_to_rownames("Description")
+  
+  gsea_mat_p <- gsea_mat_p %>% as.matrix()
+  gsea_mat_p[is.na(gsea_mat_p)] <- 1
+  gsea_mat_p <- apply(gsea_mat_p, 2, signif, digits = 2)
+  
+  gsea_mat_p <- gsea_mat_p[, var_levels]
+  gsea_mat_p2 <- gsea_mat_p
+  gsea_mat_p2[gsea_mat_p >= 0.01] <- ""
+  gsea_mat_p2[gsea_mat_p < 0.01] <- "*"
+  gsea_mat <- gsea_mat[, var_levels]
+  
+  title <- paste("GSEA", var_name, ending)
+  
+  if (nchar(ending) > 0){
+    ending <- tolower(ending)
+    var_name <- tolower(var_name)
+    plot_path <- paste0("Data/subtype_drivers/rank_gsea_159_", var_name, "_", ending, ".pdf") %>%
+      gsub(" ", "_", .)
+  } else {
+    plot_path <- paste0("Data/subtype_drivers/rank_gsea_159_", var_name, ".pdf")
+  }
+  
+  colnames(gsea_mat) <- sub("^Cluster", "Subtype", colnames(gsea_mat))
+  p <- pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15,
+                fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, 
+                width = 6, filename = plot_path, ...)
+  pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15,
+           fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, 
+           width = 6, filename = sub(".pdf$", ".png", plot_path), ...)
+           
+  return(p)
+  
+}
+
+```
+
+
+
+```{r Hallmarks}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_hallmark, top_n = 7, ending = "Global - Hallmarks")
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_hallmark, top_n = 7, ending = "RNA - Hallmarks")
+p3 <- ksea_wrap(m_phospho, var_name = "Cluster", top_n = 6, ending = "Phospho - PhosphoSitePlus & NetworKIN")
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_hallmarks.pdf", width = 16, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_hallmarks.png", width = 16, height = 6)
+                       
+```
+
+
+
+```{r GO BP}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_gobp, top_n = 7, 
+                ending = "Global - GO Biological Process", font_size = 6, wrap_width = 50)
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_gobp, top_n = 7, 
+                ending = "RNA - GO Biological Process", font_size = 6, wrap_width = 50)
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_gsea_gobp.pdf", width = 19, height = 5.5)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_gsea_gobp.png", width = 19, height = 5.5)
+                       
+```
+
+
+
+```{r Reactome}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_reactome, top_n = 7, ending = "Global - Reactome", 
+                font_size = 7, wrap_width = 50)
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_reactome, top_n = 7, ending = "RNA - Reactome", 
+                font_size = 6, wrap_width = 50)
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_reactome.pdf", width = 18, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_reactome.png", width = 18, height = 6)
+                       
+```
+
+
+
+```{r Biocarta}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_biocarta, top_n = 7, ending = "Global - Biocarta")
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_biocarta, top_n = 7, ending = "RNA - Biocarta")
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_biocarta.pdf", width = 16, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_biocarta.png", width = 16, height = 6)
+                       
+```
+
+
+
+```{r KEGG}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_kegg, top_n = 7, ending = "Global - KEGG")
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_kegg, top_n = 7, ending = "RNA - KEGG")
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_kegg.pdf", width = 17, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_kegg.png", width = 17, height = 6)
+                       
+```
+
+
+
+```{r Wikipathways}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_wikipathways, top_n = 7, ending = "Global - Wiki Pathways")
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_wikipathways, top_n = 7, ending = "RNA - Wiki Pathways")
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_wikipathways.pdf", width = 16, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_wikipathways.png", width = 16, height = 6)
+                       
+```
+
+
+
+```{r Pathway Interaction DB}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_PathwayInteractionDB, top_n = 7, ending = "Global - PID")
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_PathwayInteractionDB, top_n = 7, ending = "RNA - PID")
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_pathway_interaction_db.pdf", width = 16, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_pathway_interaction_db.png", width = 16, height = 6)
+                       
+```
+
+
+
+```{r}
+ggsave(plot = p3[[4]], filename = "Data/subtype_drivers/ksea_159_by_cluster.pdf", height = 6, width = 6)
+ggsave(plot = p3[[4]], filename = "Data/subtype_drivers/ksea_159_by_cluster.png", height = 6, width = 6)
+
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/cohort_summary/NMF/subtype_drivers_alt.Rmd b/cohort_summary/NMF/subtype_drivers_alt.Rmd
new file mode 100644
index 0000000..687102c
--- /dev/null
+++ b/cohort_summary/NMF/subtype_drivers_alt.Rmd
@@ -0,0 +1,417 @@
+---
+title: "Subtype drivers"
+author: "Camilo Posso"
+date: "06/13/2022"
+output: 
+  html_document:
+    code_folding: hide
+    toc: true
+editor_options: 
+  chunk_output_type: inline
+---
+
+## Goal
+
+
+The goal of this markdown is to find biological identifiers for the subtypes.
+
+
+```{r include=FALSE}
+library(clusterProfiler)
+library(dplyr)
+library(ggpubr)
+library(ggplot2)
+library(kableExtra)
+library(ggplotify)
+library(KSEAapp)
+
+source("../../util/synapseUtil.R")
+source("../../util/loading_data.R")
+source("../../util/mutational_analysis_helper.R")
+source("../../util/make_plots_util.R")
+
+syn <- synapseLogin()
+
+load("../../Misc/load.combined.data 3-09-2022.RData")
+# load.combined.data()
+metadata <- load.metadata()
+clusters <- read.table(syn$get("syn30030154")$path, sep = "\t")
+rownames(clusters) <- clusters$Barcode.ID
+metadata$Cluster <- clusters[rownames(metadata), "Cluster"]
+
+global_mat <- pivot_wider(global.data, names_from = "Barcode.ID", 
+                          values_from = "LogRatio") %>%
+  column_to_rownames("Gene")
+m_global <- MSnSet(exprs = global_mat %>% as.matrix(), 
+                   pData = metadata[colnames(global_mat), ])
+
+rna_mat <- pivot_wider(RNA.data, names_from = "Barcode.ID", 
+                          values_from = "RNA counts") %>%
+  column_to_rownames("Gene")
+m_rna <- MSnSet(exprs = rna_mat %>% as.matrix(), 
+                pData = metadata[colnames(rna_mat), ])
+
+phospho_mat <- pivot_wider(phospho.data %>% select(-Gene), names_from = "Barcode.ID", 
+                           values_from = "LogRatio") %>%
+  column_to_rownames("SiteID")
+m_phospho <- MSnSet(exprs = phospho_mat %>% as.matrix(), 
+                    pData = metadata[colnames(phospho_mat), ])
+
+```
+
+
+```{r}
+library(clusterProfiler)
+library(msigdbr)
+
+t2g_hallmark <- msigdbr(species = "Homo sapiens", category = "H") %>% 
+  dplyr::select(gs_name, gene_symbol)
+## 50 unique terms
+# unique(t2g_hallmark$gs_name)
+
+t2g_gobp <- msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP") %>% 
+  dplyr::select(gs_name, gene_symbol) %>%
+  mutate(gs_name = gsub("_", " ", gs_name))
+
+t2g_reactome <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
+  dplyr::filter(gs_subcat == "CP:REACTOME") %>%
+  dplyr::select(gs_name, gene_symbol) 
+
+t2g_biocarta <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
+  dplyr::filter(gs_subcat == "CP:BIOCARTA") %>%
+  dplyr::select(gs_name, gene_symbol) 
+
+t2g_kegg <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
+  dplyr::filter(gs_subcat == "CP:KEGG") %>%
+  dplyr::select(gs_name, gene_symbol) 
+
+t2g_wikipathways <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
+  dplyr::filter(gs_subcat == "CP:WIKIPATHWAYS") %>%
+  dplyr::select(gs_name, gene_symbol)
+
+t2g_PathwayInteractionDB <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
+  dplyr::filter(gs_subcat == "CP:PID") %>%
+  dplyr::select(gs_name, gene_symbol) 
+
+KSDB <- read.csv(system.file('PSP&NetworKIN_Kinase_Substrate_Dataset_July2016.csv',
+                             package='amlresistancenetworks'),stringsAsFactors = FALSE)
+
+
+```
+
+
+
+```{r}
+ksea_wrap <- function(m, var_name, top_n = 7, font_size = 7, ending = "", ...){
+  var_levels <- unique(pData(m)[[var_name]]) %>% sort()
+  combined_ksea <- data.frame()
+  # exprs(m) <- sweep(exprs(m), 1, apply(exprs(m), 1, mean, na.rm = T), FUN = '-')
+    
+  for (level in var_levels){
+    fold_change_ingroup <- apply(exprs(m)[, m[[var_name]] == level], 1, mean, na.rm = T)
+    all_means <- data.frame(mean_grp = fold_change_ingroup)
+    for (other_level in setdiff(var_levels, c(level))){
+      all_means[other_level] <- apply(exprs(m)[, m[[var_name]] == other_level], 1, mean, na.rm = T)
+    }
+    other_mean <- all_means[, -1] %>%
+      rowSums()
+    other_mean <- other_mean/(ncol(all_means) - 1)
+    fold_change <- all_means[, 1] - other_mean
+    # fold_change_outgroup <- apply(exprs(m)[, m[[var_name]] != level], 1, mean, na.rm = T)
+    fold_change <- 2**fold_change
+    PX <- data.frame(Protein = "NULL", Gene = names(fold_change), Peptide = "NULL", 
+                     Residue.Both = names(fold_change), p = "NULL", FC = fold_change) %>%
+      dplyr::mutate(Residue.Both = sub("^.*-", "", Residue.Both)) %>%
+      dplyr::mutate(Residue.Both = gsub("[a-z]", ";", Residue.Both)) %>%
+      dplyr::mutate(Residue.Both = gsub(";$", "", Residue.Both),
+                    Gene = sub("^(.*)-.*$", "\\1", Gene))
+  
+    combined_ksea <- KSEA.Scores(KSDB, PX, NetworKIN = TRUE, NetworKIN.cutoff = 5) %>%
+      dplyr::select(Kinase.Gene, m, FDR, z.score) %>%
+      dplyr::rename(pathway = Kinase.Gene, enrichment = z.score,
+                    adj_p_val = FDR, set_size = m) %>%
+      dplyr::mutate(data_type = "phospho",
+                    var_name = level,
+                    DB = "KSDB") %>%
+      filter(set_size >= 3) %>%
+      rbind(combined_ksea)
+  }
+  
+  combined_ksea$pathway_subtype_id <- paste(combined_ksea$pathway, 
+                                            combined_ksea$var_name, sep = " -- ")
+  rownames(combined_ksea) <- combined_ksea$pathway_subtype_id
+  write.table(combined_ksea, paste0("Data/subtype_drivers/ksea_v2_", var_name, "_", 
+                                    ending, "_results.txt"), sep = "\t")
+  
+  chosen_pathways <- combined_ksea %>%
+    dplyr::group_by(var_name) %>%
+    dplyr::arrange(adj_p_val) %>%
+    dplyr::slice(1:top_n) %>%
+    dplyr::pull(pathway)
+  
+  combined_ksea_small <- combined_ksea %>%
+    dplyr::filter(pathway %in% chosen_pathways) %>%
+    dplyr::mutate(pathway = str_wrap(pathway, width = 50))
+  
+  ksea_mat <- combined_ksea_small %>%
+    dplyr::select(pathway, enrichment, var_name) %>%
+    pivot_wider(names_from = var_name, values_from = enrichment) %>%
+    column_to_rownames("pathway")
+  
+  ksea_mat_p <- combined_ksea_small %>%
+    dplyr::select(pathway, adj_p_val, var_name) %>%
+    pivot_wider(names_from = var_name, values_from = adj_p_val) %>%
+    column_to_rownames("pathway")
+  
+  ksea_mat_p <- ksea_mat_p %>% as.matrix()
+  ksea_mat_p[is.na(ksea_mat_p)] <- 1
+  ksea_mat_p <- apply(ksea_mat_p, 2, signif, digits = 3)
+  
+  ksea_mat_p <- ksea_mat_p[, var_levels]
+  ksea_mat_p2 <- ksea_mat_p
+  ksea_mat_p2[ksea_mat_p >= 0.01] <- ""
+  ksea_mat_p2[ksea_mat_p < 0.01] <- "*"
+  ksea_mat <- ksea_mat[, var_levels]
+  
+  title <- paste("KSEA", var_name, ending)
+  
+  if (nchar(ending) > 0){
+    ending <- tolower(ending)
+    var_name <- tolower(var_name)
+    plot_path <- paste0("Data/subtype_drivers/ksea_v2_", var_name, "_", ending, ".pdf") %>%
+      gsub(" ", "_", .)
+  } else {
+    plot_path <- paste0("Data/subtype_drivers/ksea_v2_", var_name, ".pdf")
+  }
+  
+  heatmap_breaks <- seq(-5, 5, by = 0.2)
+  heatmap_colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(length(heatmap_breaks))
+  p <- pheatmap(ksea_mat, display_numbers = ksea_mat_p2, cluster_cols = F, fontsize = 9,
+                fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, 
+                breaks = heatmap_breaks, color = heatmap_colors, width = 6, 
+                filename = plot_path, ...)
+  return(p)
+  
+}
+
+
+```
+
+
+
+```{r}
+rank_gsea <- function(m, var_name, t2g, wrap_width = 40, font_size = 7, 
+                      top_n = 7, ending = "", ...){
+  var_levels <- unique(pData(m)[[var_name]]) %>% sort()
+  combined_gsea <- data.frame()
+  # exprs(m) <- sweep(exprs(m), 1, apply(exprs(m), 1, mean, na.rm = T), FUN = '-')
+  
+  if (file.exists(paste0("Data/subtype_drivers/rank_gsea_v2_", var_name, "_", 
+                         ending, "_results.txt"))){
+    print("Found GSEA results table.")
+    combined_gsea <- read.table(paste0("Data/subtype_drivers/rank_gsea_v2_", var_name, "_", 
+                                       ending, "_results.txt"))
+  } else {
+    for (level in var_levels){
+      fold_change_ingroup <- apply(exprs(m)[, m[[var_name]] == level], 1, mean, na.rm = T)
+      all_means <- data.frame(mean_grp = fold_change_ingroup)
+      for (other_level in setdiff(var_levels, c(level))){
+        all_means[other_level] <- apply(exprs(m)[, m[[var_name]] == other_level], 1, mean, na.rm = T)
+      }
+      other_mean <- all_means[, -1] %>%
+        rowSums()
+      other_mean <- other_mean/(ncol(all_means) - 1)
+      fold_change <- all_means[, 1] - other_mean
+      fold_change <- sort(fold_change, decreasing = TRUE)
+      set.seed(69)
+      ## For some pathways, gsea is unable to assess a pvalue. These are removed by the function
+      ## Internally. This is why some pathways (like "GOBP NCRNA METABOLISM") are excluded from
+      ## the result.
+      combined_gsea <- GSEA(fold_change, eps = 1e-16, minGSSize = 10, 
+                            pvalueCutoff = 1, TERM2GENE = t2g)@result %>%
+        dplyr::select(Description, setSize, NES, pvalue, p.adjust, core_enrichment) %>%
+        dplyr::mutate(var_name = level) %>%
+        rbind(combined_gsea)
+    }
+  
+    combined_gsea$pathway_subtype_id <- paste(combined_gsea$Description, 
+                                              combined_gsea$var_name, sep = " -- ")
+    rownames(combined_gsea) <- combined_gsea$pathway_subtype_id
+    write.table(combined_gsea, paste0("Data/subtype_drivers/rank_gsea_v2_", var_name, "_",
+                                      ending, "_results.txt"), sep = "\t")
+  }
+  
+  combined_gsea <- combined_gsea %>%
+    dplyr::mutate(Description = sub("^[A-Z]+_", "", Description)) %>%
+    dplyr::mutate(Description = sub("^GOBP ", "", Description))
+  
+  chosen_pathways <- combined_gsea %>%
+    dplyr::group_by(var_name) %>%
+    dplyr::arrange(p.adjust) %>%
+    dplyr::slice(1:top_n) %>%
+    dplyr::pull(Description)
+  
+  combined_gsea_small <- combined_gsea %>%
+    dplyr::filter(Description %in% chosen_pathways) %>%
+    dplyr::mutate(Description = str_wrap(Description, width = 50))
+  
+  gsea_mat <- combined_gsea_small %>%
+    dplyr::select(Description, NES, var_name) %>%
+    dplyr::mutate(Description_og = Description) %>%
+    dplyr::mutate(Description = str_wrap(gsub("_", " ", Description), width = wrap_width)) %>%
+    pivot_wider(names_from = var_name, values_from = NES) %>%
+    column_to_rownames("Description")
+  
+  gsea_mat_p <- combined_gsea_small %>%
+    dplyr::select(Description, p.adjust, var_name) %>%
+    pivot_wider(names_from = var_name, values_from = p.adjust) %>%
+    column_to_rownames("Description")
+  
+  gsea_mat_p <- gsea_mat_p %>% as.matrix()
+  gsea_mat_p[is.na(gsea_mat_p)] <- 1
+  gsea_mat_p <- apply(gsea_mat_p, 2, signif, digits = 2)
+  
+  gsea_mat_p <- gsea_mat_p[, var_levels]
+  gsea_mat_p2 <- gsea_mat_p
+  gsea_mat_p2[gsea_mat_p >= 0.01] <- ""
+  gsea_mat_p2[gsea_mat_p < 0.01] <- "*"
+  gsea_mat <- gsea_mat[, var_levels]
+  
+  title <- paste("GSEA", var_name, ending)
+  
+  if (nchar(ending) > 0){
+    ending <- tolower(ending)
+    var_name <- tolower(var_name)
+    plot_path <- paste0("Data/subtype_drivers/rank_gsea_v2_", var_name, "_", ending, ".pdf") %>%
+      gsub(" ", "_", .)
+  } else {
+    plot_path <- paste0("Data/subtype_drivers/rank_gsea_v2_", var_name, ".pdf")
+  }
+  
+  p <- pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9,
+                fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, 
+                width = 6, filename = plot_path, ...)
+  pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9,
+           fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, 
+           width = 6, filename = sub(".pdf$", ".png", plot_path), ...)
+           
+  return(p)
+  
+}
+
+```
+
+
+
+```{r Hallmarks}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_hallmark, top_n = 7, ending = "Global - Hallmarks")
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_hallmark, top_n = 7, ending = "RNA - Hallmarks")
+p3 <- ksea_wrap(m_phospho, var_name = "Cluster", top_n = 6, ending = "Phospho - PhosphoSitePlus & NetworKIN")
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_hallmarks.pdf", width = 16, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_hallmarks.png", width = 16, height = 6)
+                       
+```
+
+
+
+```{r GO BP}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_gobp, top_n = 7, 
+                ending = "Global - GO Biological Process", font_size = 6, wrap_width = 50)
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_gobp, top_n = 7, 
+                ending = "RNA - GO Biological Process", font_size = 6, wrap_width = 50)
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_gsea_gobp.pdf", width = 19, height = 5.5)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_gsea_gobp.png", width = 19, height = 5.5)
+                       
+```
+
+
+
+```{r Reactome}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_reactome, top_n = 7, ending = "Global - Reactome", 
+                font_size = 7, wrap_width = 50)
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_reactome, top_n = 7, ending = "RNA - Reactome", 
+                font_size = 6, wrap_width = 50)
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_reactome.pdf", width = 18, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_reactome.png", width = 18, height = 6)
+                       
+```
+
+
+
+```{r Biocarta}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_biocarta, top_n = 7, ending = "Global - Biocarta")
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_biocarta, top_n = 7, ending = "RNA - Biocarta")
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_biocarta.pdf", width = 16, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_biocarta.png", width = 16, height = 6)
+                       
+```
+
+
+
+```{r KEGG}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_kegg, top_n = 7, ending = "Global - KEGG")
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_kegg, top_n = 7, ending = "RNA - KEGG")
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_kegg.pdf", width = 17, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_kegg.png", width = 17, height = 6)
+                       
+```
+
+
+
+```{r Wikipathways}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_wikipathways, top_n = 7, ending = "Global - Wiki Pathways")
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_wikipathways, top_n = 7, ending = "RNA - Wiki Pathways")
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_wikipathways.pdf", width = 16, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_wikipathways.png", width = 16, height = 6)
+                       
+```
+
+
+
+```{r Pathway Interaction DB}
+p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_PathwayInteractionDB, top_n = 7, ending = "Global - PID")
+p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_PathwayInteractionDB, top_n = 7, ending = "RNA - PID")
+
+p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) 
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_pathway_interaction_db.pdf", width = 16, height = 6)
+ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_pathway_interaction_db.png", width = 16, height = 6)
+                       
+```
+
+
+
+```{r}
+ggsave(plot = p3[[4]], filename = "Data/subtype_drivers/ksea_by_cluster.pdf", height = 6, width = 6)
+ggsave(plot = p3[[4]], filename = "Data/subtype_drivers/ksea_by_cluster.png", height = 6, width = 6)
+
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/cohort_summary/NMF/subtype_mutation_AUC/subtype_added_value.Rmd b/cohort_summary/NMF/subtype_mutation_AUC/subtype_added_value.Rmd
new file mode 100644
index 0000000..e93278a
--- /dev/null
+++ b/cohort_summary/NMF/subtype_mutation_AUC/subtype_added_value.Rmd
@@ -0,0 +1,452 @@
+---
+title: "Subtype drivers"
+author: "Camilo Posso"
+date: "08/29/2022"
+output: 
+  html_document:
+    code_folding: hide
+    toc: true
+editor_options: 
+  chunk_output_type: inline
+---
+
+## Goal
+
+
+The goal of this markdown is to find biological identifiers for the subtypes.
+
+
+```{r include=FALSE}
+library(dplyr)
+library(ggplot2)
+library(broom)
+
+source("../../../util/synapseUtil.R")
+source("../../../util/loading_data.R")
+source("../../../util/mutational_analysis_helper.R")
+source("../../../util/make_plots_util.R")
+
+syn <- synapseLogin()
+
+load("../../../Misc/load.combined.data 3-09-2022.RData")
+# load.combined.data()
+metadata <- load.metadata()
+clusters <- read.table(syn$get("syn30030154")$path, sep = "\t")
+rownames(clusters) <- clusters$Barcode.ID
+metadata$Cluster <- clusters[rownames(metadata), "Cluster"]
+
+global_mat <- pivot_wider(global.data, names_from = "Barcode.ID", 
+                          values_from = "LogRatio") %>%
+  column_to_rownames("Gene")
+m_global <- MSnSet(exprs = global_mat %>% as.matrix(), 
+                   pData = metadata[colnames(global_mat), ])
+
+rna_mat <- pivot_wider(RNA.data, names_from = "Barcode.ID", 
+                          values_from = "RNA counts") %>%
+  column_to_rownames("Gene")
+m_rna <- MSnSet(exprs = rna_mat %>% as.matrix(), 
+                pData = metadata[colnames(rna_mat), ])
+
+phospho_mat <- pivot_wider(phospho.data %>% select(-Gene), names_from = "Barcode.ID", 
+                           values_from = "LogRatio") %>%
+  column_to_rownames("SiteID")
+m_phospho <- MSnSet(exprs = phospho_mat %>% as.matrix(), 
+                    pData = metadata[colnames(phospho_mat), ])
+
+mutation_data <- load_mutational_sample_data()
+
+NPM1_samples <- mutation_data %>%
+  filter(Gene == "NPM1_clinical") %>%
+  pull(Barcode.ID) %>% unique()
+
+DNMT3A_samples <- mutation_data %>%
+  filter(Gene == "DNMT3A") %>%
+  pull(Barcode.ID) %>% unique()
+
+metadata <- metadata %>%
+  mutate(NPM1 = case_when(Barcode.ID %in% NPM1_samples ~ "TRUE",
+                          TRUE ~ "FALSE"),
+         DNMT3A = case_when(Barcode.ID %in% DNMT3A_samples ~ "TRUE",
+                          TRUE ~ "FALSE"),
+         #mut_col = paste(FLT3.ITD, NPM1),
+         mut_col = FLT3.ITD, 
+         shuffle = sample(Cluster),
+         Cluster = sub("Cluster ", "", Cluster),
+         Cluster_1 = case_when(Cluster == "1" ~ TRUE,
+                               TRUE ~ FALSE),
+         Cluster_2 = case_when(Cluster == "2" ~ TRUE,
+                               TRUE ~ FALSE),
+         Cluster_3 = case_when(Cluster == "3" ~ TRUE,
+                               TRUE ~ FALSE),
+         Cluster_4 = case_when(Cluster == "4" ~ TRUE,
+                               TRUE ~ FALSE))
+
+latest_AUC <- read.table("drug_response.csv", sep = ",")
+latest_AUC <- latest_AUC[-1, -1]
+
+colnames(latest_AUC) <- c("Barcode.ID", "Inhibitor", "AUC")
+latest_AUC$AUC <- as.numeric(latest_AUC$AUC)
+
+
+```
+
+
+
+```{r}
+## Setting up matrices
+AUC_mat_old <- pivot_wider(functional.data.sensitive.family %>% select(Inhibitor, Barcode.ID, AUC), names_from = "Barcode.ID",
+                       values_from = "AUC", values_fn = mean) %>%
+  column_to_rownames("Inhibitor")
+
+## Using drugs from James table
+AUC_mat <- pivot_wider(latest_AUC %>% select(Inhibitor, Barcode.ID, AUC), names_from = "Barcode.ID",
+                       values_from = "AUC", values_fn = mean) %>%
+  column_to_rownames("Inhibitor")
+
+chosen_drugs <- c('Venetoclax', 'Vargetef', 'Trametinib (GSK1120212)',
+       'Tivozanib (AV-951)', 'Sorafenib', 'Selumetinib (AZD6244)',
+       'Selinexor', 'SNS-032 (BMS-387032)', 'Rapamycin',
+       'RAF265 (CHIR-265)', 'Quizartinib (AC220)', 'Ponatinib (AP24534)',
+       'Pelitinib (EKB-569)', 'Panobinostat', 'PP242', 'PI-103',
+       'PD173955', 'OTX-015', 'Neratinib (HKI-272)', 'NVP-TAE684',
+       'NF-kB Activation Inhibitor', 'Midostaurin', 'KW-2449', 'KI20227',
+       'JQ1', 'JNJ-28312141', 'JAK Inhibitor I', 'INK-128',
+       'Gilteritinib', 'GDC-0941', 'Foretinib (XL880)', 'Flavopiridol',
+       'Entospletinib (GS-9973)', 'Elesclomol', 'Dovitinib (CHIR-258)',
+       'Doramapimod (BIRB 796)', 'Dasatinib', 'Cabozantinib', 'CYT387',
+       'CI-1040 (PD184352)', 'Bortezomib (Velcade)', 'BEZ235',
+       'Afatinib (BIBW-2992)', 'AT7519', 'A-674563',
+       '17-AAG (Tanespimycin)')
+
+## Subset to those drugs which we have used before. Can change later.
+AUC_mat <- AUC_mat[chosen_drugs, ]
+
+m_auc <- MSnSet(exprs = AUC_mat %>% as.matrix(), 
+                pData = metadata[colnames(AUC_mat), ])
+
+```
+
+
+## Using limma and F test
+```{r eval=FALSE, include=FALSE}
+## Using limma and F test
+mut_cluster <- limma_gen(m_auc, "~ mut_col + Cluster", "Cluster")
+
+mut_shuffle <- limma_gen(m_auc, "~ mut_col + shuffle", "shuffle")
+
+limma_cluster <- limma_gen(m_auc, "~ Cluster", "Cluster")
+limma_shuffle <- limma_gen(m_auc, "~ shuffle", "shuffle")
+
+limma_mut <- limma_gen(m_auc, "~mut_col", "mut_col")
+
+cluster_mut <- limma_gen(m_auc, "~mut_col + Cluster", "mut_col")
+
+summary_df <- mut_cluster %>%
+  select(adj.P.Val3 = adj.P.Val)
+summary_df$Inhibitor <- rownames(summary_df)
+
+summary_df <- limma_cluster %>%
+  select(adj.P.Val2 = adj.P.Val) %>%
+  mutate(Inhibitor = rownames(.)) %>%
+  merge(summary_df, by = "Inhibitor")
+summary_df <- limma_mut %>%
+  select(adj.P.Val1 = adj.P.Val) %>%
+  mutate(Inhibitor = rownames(.)) %>%
+  merge(summary_df, by = "Inhibitor")
+
+colnames(summary_df) <- c("Inhibitor",
+                          "FLT3.ITD significant", 
+                          "Subtype significant", 
+                          "Subtype improves FLT3 model")
+
+summary_df <- cluster_mut %>%
+  select(adj.P.Val_flt3 = adj.P.Val) %>%
+  mutate(Inhibitor = rownames(.)) %>%
+  merge(summary_df, by = "Inhibitor")
+
+colnames(summary_df)[2] <- "FLT3 improves subtype model"
+rownames(summary_df) <- summary_df$Inhibitor
+
+pheatmap(summary_df[, c(3,5)], color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(2),
+         breaks = c(0, 0.05, 1), legend = FALSE, filename = "subtype_auc_model_summary.png", height = 10, 
+         treeheight_row = 0, treeheight_col = 0, angle_col = 45)
+
+pheatmap(summary_df[, c(4,2)], color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(2),
+         breaks = c(0, 0.05, 1), legend = FALSE, filename = "subtype_auc_model_summary_2.png", height = 10, 
+         treeheight_row = 0, treeheight_col = 0, angle_col = 45)
+
+pheatmap(summary_df[, c(3,5)], color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(2),
+         breaks = c(0, 0.05, 1), legend = FALSE, filename = "subtype_auc_model_summary_3.png", height = 10, 
+         treeheight_row = 0, treeheight_col = 0, angle_col = 45)
+
+```
+
+
+
+
+```{r eval=FALSE, include=FALSE}
+## scratch work
+lm_df <- data_df_cluster %>%
+  filter(Inhibitor == chosen_drug)
+model_og <- lm("value ~ Cluster + FLT3.ITD + FLT3.ITD:Cluster", data = lm_df)
+
+auc_mean <- mean(lm_df[lm_df$FLT3.ITD == "FALSE", "value"], na.rm = T)
+lm_df_std <- lm_df %>%
+  mutate(value = value - auc_mean)
+model <- lm("value ~ Cluster + FLT3.ITD + FLT3.ITD:Cluster", data = lm_df_std)
+model1 <- lm("value ~ -1 + Cluster + FLT3.ITD + FLT3.ITD:Cluster", data = lm_df_std)
+
+
+```
+
+
+```{r eval=FALSE, include=FALSE}
+## Little interaction, subtype important
+# chosen_drug <- "Selumetinib (AZD6244)"  # subtype predicts AUC response, little interaction.
+
+## Interaction present, subtype important
+# chosen_drug <- "Venetoclax"    ## affected little by FLT3, subtype interaction with FLT3 seen in cluster 3. 
+
+## Purely FLT3 effect, harder to discern for sure
+# chosen_drug <- "KW-2449"
+# chosen_drug <- "Crenolanib"
+chosen_drug <- "Elesclomol"  ## affected a lot by FLT3, maybe negative subtype interaction? in the sense that 
+                                 ## subtype 1 FLT3.ITD is not lower, but higher                                     
+                                 ## than FLT3.ITD false. This is a FLT3 inhibitor
+
+
+lm_df <- data_df_cluster %>%
+  filter(Inhibitor == chosen_drug)
+model_og <- lm("value ~ Cluster + FLT3.ITD:Cluster", data = lm_df)
+
+auc_mean <- mean(lm_df[lm_df$FLT3.ITD == "FALSE", "value"], na.rm = T)
+lm_df_std <- lm_df %>%
+  mutate(value = value - auc_mean)
+model <- lm("value ~ Cluster + FLT3.ITD:Cluster", data = lm_df_std)
+model1 <- lm("value ~ -1 + Cluster + FLT3.ITD:Cluster", data = lm_df_std)
+
+plot_df <- data.frame(AUC = AUC_mat[chosen_drug, ] %>% as.numeric(), Barcode.ID = colnames(AUC_mat)) %>%
+  merge(metadata %>% select(Barcode.ID, FLT3.ITD, Cluster), by = "Barcode.ID")
+AUC_mean = mean(plot_df[plot_df$FLT3.ITD == "FALSE", "AUC"], na.rm = T)
+
+ggplot(plot_df, aes(x = FLT3.ITD, y = AUC, color = FLT3.ITD)) + geom_boxplot() + 
+  geom_jitter(position = position_jitter()) +
+  geom_hline(aes(yintercept = AUC_mean)) + ggtitle(chosen_drug)
+
+ggplot(plot_df, aes(x = Cluster, y = AUC, color = Cluster)) + geom_boxplot() + 
+  geom_jitter(position = position_jitter()) +
+  geom_hline(aes(yintercept = AUC_mean)) + ggtitle(chosen_drug)
+
+ggplot(plot_df, aes(x = paste(Cluster, FLT3.ITD), y = AUC, color = FLT3.ITD)) + geom_boxplot() + 
+  geom_jitter(position = position_jitter()) +
+  geom_hline(aes(yintercept = AUC_mean)) + ggtitle(chosen_drug)
+
+
+```
+
+
+### Trying with boolean variables, gives the same behavior, and same coefficients and p-values
+
+```{r eval=FALSE, include=FALSE}
+data_df_cluster <- exprs(m_auc) %>% as.data.frame()
+data_df_cluster$Inhibitor <- rownames(data_df_cluster)
+data_df_cluster <- data_df_cluster %>%
+  pivot_longer(-Inhibitor) %>%
+  merge(metadata %>% select(name = Barcode.ID, FLT3.ITD, Cluster_1, Cluster_2, Cluster_3, Cluster_4))
+
+set.seed(117)
+data_df_shuffle <- exprs(m_auc) %>% as.data.frame()
+data_df_shuffle$Inhibitor <- rownames(data_df_shuffle)
+data_df_shuffle <- data_df_shuffle %>%
+  pivot_longer(-Inhibitor) %>%
+  merge(metadata %>% select(name = Barcode.ID, FLT3.ITD, Cluster = shuffle))
+
+auc_model_v2 <- function(chosen_drug, data_df){
+  lm_df <- data_df %>%
+    filter(Inhibitor == chosen_drug)
+  auc_mean <- mean(lm_df$value, na.rm = T)
+  # lm_df <- lm_df %>%
+  #   mutate(value = value - auc_mean)
+  model <- lm("value ~ FLT3.ITD + Cluster_1 + Cluster_2 + Cluster_3 + Cluster_4 + 
+              Cluster_1:FLT3.ITD + Cluster_2:FLT3.ITD +
+              Cluster_3:FLT3.ITD + Cluster_4:FLT3.ITD", data = lm_df)
+  out_df <- data.frame(Inhibitor = chosen_drug,
+                       term = tidy(summary(model))[, 1],
+                       p.value = tidy(summary(model))[, 5]) %>%
+    rbind(data.frame(Inhibitor = chosen_drug,
+                     term = "F_stat", 
+                     p.value = pf(summary(model)$fstatistic[[1]], summary(model)$fstatistic[[2]], 
+                                  summary(model)$fstatistic[[3]], lower.tail = FALSE)))
+  return(out_df)
+}
+
+
+```
+
+
+
+
+```{r}
+data_df_cluster <- exprs(m_auc) %>% as.data.frame()
+data_df_cluster$Inhibitor <- rownames(data_df_cluster)
+data_df_cluster <- data_df_cluster %>%
+  pivot_longer(-Inhibitor) %>%
+  merge(metadata %>% select(name = Barcode.ID, FLT3.ITD, Cluster))
+data_df_cluster$Cluster <- factor(data_df_cluster$Cluster, levels = c("1", "2", "3", "4"))
+
+data_df_shuffle <- exprs(m_auc) %>% as.data.frame()
+data_df_shuffle$Inhibitor <- rownames(data_df_shuffle)
+data_df_shuffle <- data_df_shuffle %>%
+  pivot_longer(-Inhibitor) %>%
+  merge(metadata %>% select(name = Barcode.ID, FLT3.ITD, Cluster = shuffle))
+
+auc_model <- function(chosen_drug, data_df){
+  lm_df <- data_df %>%
+    filter(Inhibitor == chosen_drug)
+  ## Normalize using FLT3.ITD = FALSE as the control group. 
+  ## Basically, we take away the mean of the FLT3.ITD = FALSE group
+  auc_mean <- mean(lm_df[lm_df$FLT3.ITD == "FALSE", "value"], na.rm = T)
+  lm_df <- lm_df %>%
+    mutate(value = value - auc_mean)
+  model <- lm("value ~ -1 + Cluster + Cluster:FLT3.ITD", data = lm_df)
+  out_df <- data.frame(Inhibitor = chosen_drug,
+                       term = tidy(summary(model))[, 1],
+                       p.value = tidy(summary(model))[, 5],
+                       coeff = model$coefficients %>% as.numeric()) %>%
+    rbind(data.frame(Inhibitor = chosen_drug,
+                     term = "F_stat", 
+                     p.value = pf(summary(model)$fstatistic[[1]], summary(model)$fstatistic[[2]], 
+                                  summary(model)$fstatistic[[3]], lower.tail = FALSE),
+                     coeff = NA))
+  return(out_df)
+}
+
+
+```
+
+
+
+```{r}
+epsi <- 0.0000001
+all_results <- lapply(rownames(m_auc), auc_model, data_df_cluster) %>% do.call("rbind", .) %>%
+  mutate(adj_p_val = p.adjust(p.value, method = "BH"),
+         xx = case_when(term != "F_stat" ~ (p.value < 0.05) * coeff,
+                        term == "F_stat" & p.value > 0.05 ~ 0)) %>%
+  select(-adj_p_val, -p.value, -coeff)
+
+mat_results <- pivot_wider(all_results, values_from = "xx", names_from = "term") %>%
+  select(Inhibitor, everything()) %>%
+  column_to_rownames("Inhibitor") %>% as.data.frame()
+
+colnames(mat_results) <- c("Subtype 1", "Subtype 2", "Subtype 3", "Subtype 4",
+                           "Subtype 1 and FLT3", "Subtype 2 and FLT3", 
+                           "Subtype 3 and FLT3", "Subtype 4 and FLT3", "F tatistic")
+
+
+color_pal <- c(rev(colorRampPalette(brewer.pal(n = 8, name = "Blues"))(15)), 
+               "grey", 
+               colorRampPalette(brewer.pal(n = 8, name = "Reds"))(15))
+
+breaks_buddy <- c(seq(min(mat_results, na.rm = T), -2*epsi, length = 15), 
+                  -epsi, epsi,
+                  seq(2*epsi, max(mat_results, na.rm = T), length = 15))
+
+pheatmap(mat_results, color = color_pal, breaks = breaks_buddy, na_col = "#815F8A",
+         legend = TRUE, filename = "subtype_auc_model_summary_interaction_v2.png", fontsize = 13,
+         treeheight_col = 0, angle_col = 315, cluster_cols = F, width = 8.5, height = 15)
+
+pheatmap(mat_results, color = color_pal, breaks = breaks_buddy, na_col = "#815F8A",
+         legend = TRUE, filename = "subtype_auc_model_summary_interaction_v2.pdf", fontsize = 13,
+         treeheight_col = 0, angle_col = 315, cluster_cols = F, width = 8.5, height = 15)
+
+```
+
+
+
+```{r}
+
+
+
+
+```
+
+
+
+```{r}
+
+all_results <- lapply(rownames(m_auc), auc_model, data_df_shuffle) %>% do.call("rbind", .) %>%
+  mutate(adj_p_val = p.adjust(p.value, method = "BH")) %>%
+  select(-p.value)
+
+mat_results <- pivot_wider(all_results, values_from = "adj_p_val", names_from = "term") %>%
+  select(Inhibitor, everything()) %>%
+  column_to_rownames("Inhibitor") %>% as.data.frame()
+
+
+pheatmap(t(mat_results), color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(2),
+         breaks = c(0, 0.05, 1), legend = FALSE, filename = "subtype_auc_model_summary_interaction_shuffled.png", 
+         treeheight_row = 0, treeheight_col = 0, angle_col = 315, cluster_rows = F, width = 20, height = 7)
+
+pheatmap(t(mat_results), color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(2),
+         breaks = c(0, 0.05, 1), legend = FALSE, filename = "subtype_auc_model_summary_interaction_shuffled.pdf", 
+         treeheight_row = 0, treeheight_col = 0, angle_col = 315, cluster_rows = F, width = 20, height = 7)
+
+```
+
+
+
+```{r}
+chosen_drug <- "JAK Inhibitor"
+plot_df <- data.frame(AUC = AUC_mat[chosen_drug, ] %>% as.numeric(), Barcode.ID = colnames(AUC_mat)) %>%
+  merge(metadata %>% select(Barcode.ID, FLT3.ITD, Cluster), by = "Barcode.ID")
+AUC_mean = mean(plot_df[plot_df$FLT3.ITD == "FALSE", "AUC"], na.rm = T)
+
+meta_shuffled <- data_df_shuffle %>%
+  select(Barcode.ID = name, FLT3.ITD, Cluster) %>%
+  unique()
+
+plot_df_shuffled <- plot_df %>% select(-Cluster, -FLT3.ITD) %>%
+  merge(meta_shuffled, by = "Barcode.ID")
+
+## Original cluster
+ggplot(plot_df, aes(x = FLT3.ITD, y = AUC, color = FLT3.ITD)) + geom_boxplot() + 
+  geom_jitter(position = position_jitter()) +
+  geom_hline(aes(yintercept = AUC_mean)) + ggtitle("original")
+
+ggplot(plot_df, aes(x = Cluster, y = AUC, color = Cluster)) + geom_boxplot() + 
+  geom_jitter(position = position_jitter()) +
+  geom_hline(aes(yintercept = AUC_mean)) + ggtitle("original")
+
+ggplot(plot_df, aes(x = paste(Cluster, FLT3.ITD), 
+                    y = AUC, color = FLT3.ITD)) + geom_boxplot() + 
+  geom_jitter(position = position_jitter()) +
+  geom_hline(aes(yintercept = AUC_mean)) + ggtitle("original")
+
+## Shuffled cluster labels
+ggplot(plot_df_shuffled, aes(x = FLT3.ITD, y = AUC, color = FLT3.ITD)) + geom_boxplot() + 
+  geom_jitter(position = position_jitter()) +
+  geom_hline(aes(yintercept = AUC_mean)) + ggtitle("shuffled")
+
+ggplot(plot_df_shuffled, aes(x = Cluster, y = AUC, color = Cluster)) + geom_boxplot() + 
+  geom_jitter(position = position_jitter()) +
+  geom_hline(aes(yintercept = AUC_mean)) + ggtitle("shuffled")
+
+ggplot(plot_df_shuffled, aes(x = paste(Cluster, FLT3.ITD), 
+                             y = AUC, color = FLT3.ITD)) + geom_boxplot() + 
+  geom_jitter(position = position_jitter()) +
+  geom_hline(aes(yintercept = AUC_mean)) + ggtitle("shuffled")
+
+```
+
+
+
+```{r}
+pheatmap(exprs(m_auc), na_col = "white", cluster_rows = FALSE, cluster_cols = FALSE,
+         show_colnames = FALSE, height = 10, filename = "auc_data_updated.pdf")
+
+```
+
+
+
+
+
diff --git a/proteomics/1-process_global_data.Rmd b/proteomics/1-process_global_data.Rmd
index eac822f..2a21cd5 100644
--- a/proteomics/1-process_global_data.Rmd
+++ b/proteomics/1-process_global_data.Rmd
@@ -17,6 +17,7 @@ t0 <- Sys.time(); print(t0)
 
 ```{r setup}
 library(PlexedPiper)
+library(PNNL.DMS.utils)
 library(dplyr)
 data_package_num <- 3676
 data_folder <- "data/Ex10_global_data/"
@@ -49,7 +50,7 @@ msgf_data_path <- file.path(data_folder, "msgfData_original.RData")
 if (file.exists(msgf_data_path)) {
   load(msgf_data_path)
 } else {
-  msnid <- read_msms_data_from_DMS(data_package_num)
+  msnid <- read_msgf_data_from_DMS(data_package_num)
   save(msnid, file=msgf_data_path)
 }
 show(msnid)
diff --git a/util/mutational_analysis_helper.R b/util/mutational_analysis_helper.R
index 7c1de59..af80817 100644
--- a/util/mutational_analysis_helper.R
+++ b/util/mutational_analysis_helper.R
@@ -185,7 +185,7 @@ compress_enrichment <- function(enrichment_array, threshold = .75, colname='Coun
   
   # sort enrichment array by attribute of choice
   enrichment_array <- enrichment_array%>%
-    dplyr::mutate(sortval=colname)
+    dplyr::mutate(sortval := !! sym(colname))
   
   if (descending){
     enrichment_array <- enrichment_array %>%