From d190bc877f55e01d39f19b99b68eb944e49e650f Mon Sep 17 00:00:00 2001 From: CamiloPosso Date: Tue, 1 Nov 2022 09:16:29 -0700 Subject: [PATCH] Small updates to plots + subtype AUC interaction Added a few small changes to plots, like removing vitalStatus from the NMF heatmaps. Also included the subtype + AUC interaction script. --- .gitignore | 6 + cohort_summary/NMF/NMF_raw_results.Rmd | 2 +- cohort_summary/NMF/NMF_summary.Rmd | 14 +- ...net_multinomial_predictions_cell_lines.Rmd | 14 +- ...k_4_enet_multinomial_predictions_pilot.Rmd | 13 + cohort_summary/NMF/subtype_drivers.Rmd | 8 +- cohort_summary/NMF/subtype_drivers_159.Rmd | 412 ++++++++++++++++ cohort_summary/NMF/subtype_drivers_alt.Rmd | 417 ++++++++++++++++ .../subtype_added_value.Rmd | 452 ++++++++++++++++++ proteomics/1-process_global_data.Rmd | 3 +- util/mutational_analysis_helper.R | 2 +- 11 files changed, 1325 insertions(+), 18 deletions(-) create mode 100644 cohort_summary/NMF/subtype_drivers_159.Rmd create mode 100644 cohort_summary/NMF/subtype_drivers_alt.Rmd create mode 100644 cohort_summary/NMF/subtype_mutation_AUC/subtype_added_value.Rmd diff --git a/.gitignore b/.gitignore index 6a9de60..402206c 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,9 @@ cohort_summary/NMF/multinomial_models_k_4_elastic_net/cell_line_data_notes.txt cohort_summary/NMF/peptide_subtype_interaction/Protein peptide deviation Vlad.pptx cohort_summary/NMF/peptide_subtype_interaction/Protein Peptide Deviation PTRC tuesday.pptx cohort_summary/supplemental_table_1.txt +cohort_summary/NMF/cophenetic_correlation.txt +cohort_summary/NMF/logFC.txt +cohort_summary/NMF/subtype_mutation_AUC/drug_response.csv +cohort_summary/NMF/subtype_mutation_AUC/Figures +Figures +cohort_summary/NMF/diving_into_subtype_drivers.Rmd diff --git a/cohort_summary/NMF/NMF_raw_results.Rmd b/cohort_summary/NMF/NMF_raw_results.Rmd index 1496920..3aeeff7 100644 --- a/cohort_summary/NMF/NMF_raw_results.Rmd +++ b/cohort_summary/NMF/NMF_raw_results.Rmd @@ -12,12 +12,12 @@ library(dplyr) library(tibble) library(tidyr) - source("../../util/synapseUtil.R") source("../../util/loading_data.R") source("NMF_helper.R") load.combined.data() + ``` diff --git a/cohort_summary/NMF/NMF_summary.Rmd b/cohort_summary/NMF/NMF_summary.Rmd index 80715d5..3cd9912 100644 --- a/cohort_summary/NMF/NMF_summary.Rmd +++ b/cohort_summary/NMF/NMF_summary.Rmd @@ -120,6 +120,9 @@ coph.correlation <- sapply(results, cophcor) %>% data.frame(coph.cor = ., k = 2:8) %>% filter(k <= 8) +write.table(coph.correlation, file = "cophenetic_correlation.txt", + sep = "\t", quote = F, row.names = F) + p <- ggplot(coph.correlation, aes(x = k, y = coph.cor)) + geom_line() + ylab("Cophenetic correlation") + scale_x_continuous(breaks = 2:8) @@ -229,7 +232,7 @@ fab_annotations <- summary.table %>% ## pheatmap column labels ann.labels <- meta %>% filter(Barcode.ID %in% samples.common) %>% - select(FLT3.ITD, PostChemotherapy, vitalStatus) %>% + select(FLT3.ITD, PostChemotherapy) %>% mutate(FLT3.ITD = as.factor(FLT3.ITD), PostChemotherapy = as.factor(PostChemotherapy), Barcode.ID = rownames(.)) @@ -260,12 +263,12 @@ ann.labels <- ann.labels %>% mutate(FAB = factor(FAB, levels = c("M0", "M1", "M2", "M3", "M4", "M5", "None"))) %>% column_to_rownames("Barcode.ID") %>% as.data.frame() %>% - select(PostChemotherapy, NPM1, NPM1_clinical, RAS, vitalStatus, FLT3.ITD, FLT3_MAF, FLT3_AR, FAB) %>% + select(PostChemotherapy, NPM1, NPM1_clinical, RAS, FLT3.ITD, FLT3_MAF, FLT3_AR, FAB) %>% dplyr::rename(`FLT3_MAF (percentage)` = FLT3_MAF, `RAS vaf` = RAS) ## Slimming down annotations -ann.labels <- ann.labels[, c("RAS vaf", "FLT3.ITD", "NPM1_clinical", "FAB", "vitalStatus")] +ann.labels <- ann.labels[, c("RAS vaf", "FLT3.ITD", "NPM1_clinical", "FAB")] #, "vitalStatus")] ## White to dark purple (like the snow storms on weather maps) colors.wp <- scales::seq_gradient_pal("#FFFFFF", "#4A0078", "Lab")(seq(0,1,length.out = p.celing*10 + 1)) @@ -282,9 +285,10 @@ ann.colors <- lapply(names(ann.labels), function(x){ "None" = "white") } else if (length(levels(ann.labels[[x]])) == 2) { c(`FALSE` = "white", `TRUE` = "darkgrey") - } else { - c("Alive" = "white", "Dead" = "darkgrey", "Unknown" = "brown") } + # } else { + # c("Alive" = "white", "Dead" = "darkgrey", "Unknown" = "brown") + # } }) names(ann.colors) <- names(ann.labels) diff --git a/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_cell_lines.Rmd b/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_cell_lines.Rmd index 7a853cc..4c3cdd4 100644 --- a/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_cell_lines.Rmd +++ b/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_cell_lines.Rmd @@ -269,6 +269,7 @@ combined <- get_probabilities(tram$`Global msnset`, tram_comp$`Global msnset`) % ```{r Quizartinib Resistance setup} +##Updating for paper ## Quizartinib Resistance Proteomics Data quizProtData <- querySynapseTable('syn23595222') %>% @@ -621,6 +622,7 @@ ggsave(plot = p, filename = path_str, height = 5, width = 7) ```{r Quizartinib Resistance} +##Updating for paper m <- quiz$`Global msnset` plot_title <- "Quizartinib Resistance" path_str <- tolower(plot_title) %>% gsub(" ", "_", .) @@ -680,10 +682,9 @@ alluvial_df <- quiz_meta %>% p3 <- ggplot(alluvial_df, aes(x = CellLine, y = Proportion, alluvium = Subtype)) + geom_alluvium(aes(fill = Subtype)) + scale_fill_manual(values = subtype_colors) + - scale_x_discrete(expand = c(0.15, 0)) + - ggtitle(plot_title) + scale_x_discrete(expand = c(0.3, 0)) + + ggtitle(plot_title) + theme(axis.title.x = element_blank(), - axis.text.x = element_text(size = 7)) + axis.text.x = element_text(size = 9, angle = 45, vjust = .65)) path_str <- sub(".pdf$", "_no_ligand.pdf", path_str) ggsave(plot = p3, filename = path_str, height = 5.2, width = 10) @@ -708,13 +709,12 @@ alluvial_df <- quiz_meta %>% p4 <- ggplot(alluvial_df, aes(x = CellLine, y = Proportion, alluvium = Subtype)) + geom_alluvium(aes(fill = Subtype)) + scale_fill_manual(values = subtype_colors) + - scale_x_discrete(expand = c(0.3, 0)) + - ggtitle(plot_title) + scale_x_discrete(expand = c(0.3, 0)) + + ggtitle(plot_title) + theme(axis.title.x = element_blank(), - axis.text.x = element_text(size = 7)) + axis.text.x = element_text(size = 9, angle = 45, vjust = .65)) path_str <- sub(".pdf$", "_v2.pdf", path_str) -ggsave(plot = p4, filename = path_str, height = 5.2, width = 7) +ggsave(plot = p4, filename = path_str, height = 5.2, width = 8) ``` diff --git a/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_pilot.Rmd b/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_pilot.Rmd index acb3bce..8f542e1 100644 --- a/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_pilot.Rmd +++ b/cohort_summary/NMF/multinomial_models_k_4_elastic_net/k_4_enet_multinomial_predictions_pilot.Rmd @@ -275,6 +275,19 @@ drug_data <- auc.dat %>% ``` + +```{r} +table_df <- pilot_meta %>% + filter(exp_3_new) %>% + select(Sample = sample, Subtype, `New to 210-patient cohort` = exp_3_new) +write.table(table_df, "enet_multinomial_data/experiment_3_new_samples.txt", sep = "\t", + col.names = TRUE, row.names = FALSE, quote = F) + + +``` + + + ```{r} cohort_subtypes <- read.table(syn$get("syn30030154")$path) %>% dplyr::rename(Subtype = Cluster) %>% diff --git a/cohort_summary/NMF/subtype_drivers.Rmd b/cohort_summary/NMF/subtype_drivers.Rmd index 3804a22..16da916 100644 --- a/cohort_summary/NMF/subtype_drivers.Rmd +++ b/cohort_summary/NMF/subtype_drivers.Rmd @@ -176,9 +176,10 @@ ksea_wrap <- function(m, var_name, top_n = 7, font_size = 7, ending = "", ...){ plot_path <- paste0("Data/subtype_drivers/ksea_", var_name, ".pdf") } + colnames(ksea_mat) <- sub("^Cluster", "Subtype", colnames(ksea_mat)) heatmap_breaks <- seq(-5, 5, by = 0.2) heatmap_colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(length(heatmap_breaks)) - p <- pheatmap(ksea_mat, display_numbers = ksea_mat_p2, cluster_cols = F, fontsize = 9, + p <- pheatmap(ksea_mat, display_numbers = ksea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15, fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, breaks = heatmap_breaks, color = heatmap_colors, width = 6, filename = plot_path, ...) @@ -274,10 +275,11 @@ rank_gsea <- function(m, var_name, t2g, wrap_width = 40, font_size = 7, plot_path <- paste0("Data/subtype_drivers/rank_gsea_", var_name, ".pdf") } - p <- pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, + colnames(gsea_mat) <- sub("^Cluster", "Subtype", colnames(gsea_mat)) + p <- pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15, fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, width = 6, filename = plot_path, ...) - pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, + pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15, fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, width = 6, filename = sub(".pdf$", ".png", plot_path), ...) diff --git a/cohort_summary/NMF/subtype_drivers_159.Rmd b/cohort_summary/NMF/subtype_drivers_159.Rmd new file mode 100644 index 0000000..c427650 --- /dev/null +++ b/cohort_summary/NMF/subtype_drivers_159.Rmd @@ -0,0 +1,412 @@ +--- +title: "Subtype drivers" +author: "Camilo Posso" +date: "06/13/2022" +output: + html_document: + code_folding: hide + toc: true +editor_options: + chunk_output_type: inline +--- + +## Goal + + +The goal of this markdown is to find biological identifiers for the subtypes. + + +```{r include=FALSE} +library(clusterProfiler) +library(dplyr) +library(ggpubr) +library(ggplot2) +library(kableExtra) +library(ggplotify) +library(KSEAapp) + +source("../../util/synapseUtil.R") +source("../../util/loading_data.R") +source("../../util/mutational_analysis_helper.R") +source("../../util/make_plots_util.R") + +syn <- synapseLogin() + +load("../../Misc/load.combined.data 3-09-2022.RData") +# load.combined.data() +metadata <- load.metadata() +clusters <- read.table(syn$get("syn30030154")$path, sep = "\t") +rownames(clusters) <- clusters$Barcode.ID +metadata$Cluster <- clusters[rownames(metadata), "Cluster"] + +## Restricting to 159 RNA samples here +rna_samples <- unique(RNA.data$Barcode.ID) +global.data <- global.data %>% + filter(Barcode.ID %in% rna_samples) +phospho.data <- phospho.data %>% + filter(Barcode.ID %in% rna_samples) + +global_mat <- pivot_wider(global.data, names_from = "Barcode.ID", + values_from = "LogRatio") %>% + column_to_rownames("Gene") +m_global <- MSnSet(exprs = global_mat %>% as.matrix(), + pData = metadata[colnames(global_mat), ]) + +rna_mat <- pivot_wider(RNA.data, names_from = "Barcode.ID", + values_from = "RNA counts") %>% + column_to_rownames("Gene") +m_rna <- MSnSet(exprs = rna_mat %>% as.matrix(), + pData = metadata[colnames(rna_mat), ]) + +phospho_mat <- pivot_wider(phospho.data %>% select(-Gene), names_from = "Barcode.ID", + values_from = "LogRatio") %>% + column_to_rownames("SiteID") +m_phospho <- MSnSet(exprs = phospho_mat %>% as.matrix(), + pData = metadata[colnames(phospho_mat), ]) + +``` + + +```{r} +library(clusterProfiler) +library(msigdbr) + +t2g_hallmark <- msigdbr(species = "Homo sapiens", category = "H") %>% + dplyr::select(gs_name, gene_symbol) +## 50 unique terms +# unique(t2g_hallmark$gs_name) + +t2g_gobp <- msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP") %>% + dplyr::select(gs_name, gene_symbol) %>% + mutate(gs_name = gsub("_", " ", gs_name)) + +t2g_reactome <- msigdbr(species = "Homo sapiens", category = "C2") %>% + dplyr::filter(gs_subcat == "CP:REACTOME") %>% + dplyr::select(gs_name, gene_symbol) + +t2g_biocarta <- msigdbr(species = "Homo sapiens", category = "C2") %>% + dplyr::filter(gs_subcat == "CP:BIOCARTA") %>% + dplyr::select(gs_name, gene_symbol) + +t2g_kegg <- msigdbr(species = "Homo sapiens", category = "C2") %>% + dplyr::filter(gs_subcat == "CP:KEGG") %>% + dplyr::select(gs_name, gene_symbol) + +t2g_wikipathways <- msigdbr(species = "Homo sapiens", category = "C2") %>% + dplyr::filter(gs_subcat == "CP:WIKIPATHWAYS") %>% + dplyr::select(gs_name, gene_symbol) + +t2g_PathwayInteractionDB <- msigdbr(species = "Homo sapiens", category = "C2") %>% + dplyr::filter(gs_subcat == "CP:PID") %>% + dplyr::select(gs_name, gene_symbol) + +KSDB <- read.csv(system.file('PSP&NetworKIN_Kinase_Substrate_Dataset_July2016.csv', + package='amlresistancenetworks'),stringsAsFactors = FALSE) + + +``` + + + +```{r} +ksea_wrap <- function(m, var_name, top_n = 7, font_size = 7, ending = "", ...){ + var_levels <- unique(pData(m)[[var_name]]) %>% sort() + combined_ksea <- data.frame() + exprs(m) <- sweep(exprs(m), 1, apply(exprs(m), 1, mean, na.rm = T), FUN = '-') + + for (level in var_levels){ + fold_change_ingroup <- apply(exprs(m)[, m[[var_name]] == level], 1, mean, na.rm = T) + # fold_change_outgroup <- apply(exprs(m)[, m[[var_name]] != level], 1, mean, na.rm = T) + fold_change <- 2**fold_change_ingroup + PX <- data.frame(Protein = "NULL", Gene = names(fold_change), Peptide = "NULL", + Residue.Both = names(fold_change), p = "NULL", FC = fold_change) %>% + dplyr::mutate(Residue.Both = sub("^.*-", "", Residue.Both)) %>% + dplyr::mutate(Residue.Both = gsub("[a-z]", ";", Residue.Both)) %>% + dplyr::mutate(Residue.Both = gsub(";$", "", Residue.Both), + Gene = sub("^(.*)-.*$", "\\1", Gene)) + + combined_ksea <- KSEA.Scores(KSDB, PX, NetworKIN = TRUE, NetworKIN.cutoff = 5) %>% + dplyr::select(Kinase.Gene, m, FDR, z.score) %>% + dplyr::rename(pathway = Kinase.Gene, enrichment = z.score, + adj_p_val = FDR, set_size = m) %>% + dplyr::mutate(data_type = "phospho", + var_name = level, + DB = "KSDB") %>% + filter(set_size >= 3) %>% + rbind(combined_ksea) + } + + combined_ksea$pathway_subtype_id <- paste(combined_ksea$pathway, + combined_ksea$var_name, sep = " -- ") + rownames(combined_ksea) <- combined_ksea$pathway_subtype_id + write.table(combined_ksea, paste0("Data/subtype_drivers/ksea_159_", var_name, "_", + ending, "_results.txt"), sep = "\t") + + chosen_pathways <- combined_ksea %>% + dplyr::group_by(var_name) %>% + dplyr::arrange(adj_p_val) %>% + dplyr::slice(1:top_n) %>% + dplyr::pull(pathway) + + combined_ksea_small <- combined_ksea %>% + dplyr::filter(pathway %in% chosen_pathways) %>% + dplyr::mutate(pathway = str_wrap(pathway, width = 50)) + + ksea_mat <- combined_ksea_small %>% + dplyr::select(pathway, enrichment, var_name) %>% + pivot_wider(names_from = var_name, values_from = enrichment) %>% + column_to_rownames("pathway") + + ksea_mat_p <- combined_ksea_small %>% + dplyr::select(pathway, adj_p_val, var_name) %>% + pivot_wider(names_from = var_name, values_from = adj_p_val) %>% + column_to_rownames("pathway") + + ksea_mat_p <- ksea_mat_p %>% as.matrix() + ksea_mat_p[is.na(ksea_mat_p)] <- 1 + ksea_mat_p <- apply(ksea_mat_p, 2, signif, digits = 3) + + ksea_mat_p <- ksea_mat_p[, var_levels] + ksea_mat_p2 <- ksea_mat_p + ksea_mat_p2[ksea_mat_p >= 0.01] <- "" + ksea_mat_p2[ksea_mat_p < 0.01] <- "*" + ksea_mat <- ksea_mat[, var_levels] + + title <- paste("KSEA", var_name, ending) + + if (nchar(ending) > 0){ + ending <- tolower(ending) + var_name <- tolower(var_name) + plot_path <- paste0("Data/subtype_drivers/ksea_159_", var_name, "_", ending, ".pdf") %>% + gsub(" ", "_", .) + } else { + plot_path <- paste0("Data/subtype_drivers/ksea_159_", var_name, ".pdf") + } + + colnames(ksea_mat) <- sub("^Cluster", "Subtype", colnames(ksea_mat)) + heatmap_breaks <- seq(-5, 5, by = 0.2) + heatmap_colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(length(heatmap_breaks)) + p <- pheatmap(ksea_mat, display_numbers = ksea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15, + fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, + breaks = heatmap_breaks, color = heatmap_colors, width = 6, + filename = plot_path, ...) + return(p) + +} + + +``` + + + +```{r} +rank_gsea <- function(m, var_name, t2g, wrap_width = 40, font_size = 7, + top_n = 7, ending = "", ...){ + var_levels <- unique(pData(m)[[var_name]]) %>% sort() + combined_gsea <- data.frame() + exprs(m) <- sweep(exprs(m), 1, apply(exprs(m), 1, mean, na.rm = T), FUN = '-') + + if (file.exists(paste0("Data/subtype_drivers/rank_gsea_159_", var_name, "_", + ending, "_results.txt"))){ + print("Found GSEA results table.") + combined_gsea <- read.table(paste0("Data/subtype_drivers/rank_gsea_159_", var_name, "_", + ending, "_results.txt")) + } else { + for (level in var_levels){ + fold_change_ingroup <- apply(exprs(m)[, m[[var_name]] == level], 1, mean, na.rm = T) + # fold_change_outgroup <- apply(exprs(m)[, m[[var_name]] != level], 1, mean, na.rm = T) + fold_change <- fold_change_ingroup + fold_change <- sort(fold_change, decreasing = TRUE) + set.seed(69) + ## For some pathways, gsea is unable to assess a pvalue. These are removed by the function + ## Internally. This is why some pathways (like "GOBP NCRNA METABOLISM") are excluded from + ## the result. + combined_gsea <- GSEA(fold_change, eps = 1e-16, minGSSize = 10, + pvalueCutoff = 1, TERM2GENE = t2g)@result %>% + dplyr::select(Description, setSize, NES, pvalue, p.adjust, core_enrichment) %>% + dplyr::mutate(var_name = level) %>% + rbind(combined_gsea) + } + + combined_gsea$pathway_subtype_id <- paste(combined_gsea$Description, + combined_gsea$var_name, sep = " -- ") + rownames(combined_gsea) <- combined_gsea$pathway_subtype_id + write.table(combined_gsea, paste0("Data/subtype_drivers/rank_gsea_159_", var_name, "_", + ending, "_results.txt"), sep = "\t") + } + + combined_gsea <- combined_gsea %>% + dplyr::mutate(Description = sub("^[A-Z]+_", "", Description)) %>% + dplyr::mutate(Description = sub("^GOBP ", "", Description)) + + chosen_pathways <- combined_gsea %>% + dplyr::group_by(var_name) %>% + dplyr::arrange(p.adjust) %>% + dplyr::slice(1:top_n) %>% + dplyr::pull(Description) + + combined_gsea_small <- combined_gsea %>% + dplyr::filter(Description %in% chosen_pathways) %>% + dplyr::mutate(Description = str_wrap(Description, width = 50)) + + gsea_mat <- combined_gsea_small %>% + dplyr::select(Description, NES, var_name) %>% + dplyr::mutate(Description_og = Description) %>% + dplyr::mutate(Description = str_wrap(gsub("_", " ", Description), width = wrap_width)) %>% + pivot_wider(names_from = var_name, values_from = NES) %>% + column_to_rownames("Description") + + gsea_mat_p <- combined_gsea_small %>% + dplyr::select(Description, p.adjust, var_name) %>% + pivot_wider(names_from = var_name, values_from = p.adjust) %>% + column_to_rownames("Description") + + gsea_mat_p <- gsea_mat_p %>% as.matrix() + gsea_mat_p[is.na(gsea_mat_p)] <- 1 + gsea_mat_p <- apply(gsea_mat_p, 2, signif, digits = 2) + + gsea_mat_p <- gsea_mat_p[, var_levels] + gsea_mat_p2 <- gsea_mat_p + gsea_mat_p2[gsea_mat_p >= 0.01] <- "" + gsea_mat_p2[gsea_mat_p < 0.01] <- "*" + gsea_mat <- gsea_mat[, var_levels] + + title <- paste("GSEA", var_name, ending) + + if (nchar(ending) > 0){ + ending <- tolower(ending) + var_name <- tolower(var_name) + plot_path <- paste0("Data/subtype_drivers/rank_gsea_159_", var_name, "_", ending, ".pdf") %>% + gsub(" ", "_", .) + } else { + plot_path <- paste0("Data/subtype_drivers/rank_gsea_159_", var_name, ".pdf") + } + + colnames(gsea_mat) <- sub("^Cluster", "Subtype", colnames(gsea_mat)) + p <- pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15, + fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, + width = 6, filename = plot_path, ...) + pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, fontsize_number = 15, + fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, + width = 6, filename = sub(".pdf$", ".png", plot_path), ...) + + return(p) + +} + +``` + + + +```{r Hallmarks} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_hallmark, top_n = 7, ending = "Global - Hallmarks") +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_hallmark, top_n = 7, ending = "RNA - Hallmarks") +p3 <- ksea_wrap(m_phospho, var_name = "Cluster", top_n = 6, ending = "Phospho - PhosphoSitePlus & NetworKIN") + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_hallmarks.pdf", width = 16, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_hallmarks.png", width = 16, height = 6) + +``` + + + +```{r GO BP} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_gobp, top_n = 7, + ending = "Global - GO Biological Process", font_size = 6, wrap_width = 50) +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_gobp, top_n = 7, + ending = "RNA - GO Biological Process", font_size = 6, wrap_width = 50) + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_gsea_gobp.pdf", width = 19, height = 5.5) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_gsea_gobp.png", width = 19, height = 5.5) + +``` + + + +```{r Reactome} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_reactome, top_n = 7, ending = "Global - Reactome", + font_size = 7, wrap_width = 50) +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_reactome, top_n = 7, ending = "RNA - Reactome", + font_size = 6, wrap_width = 50) + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_reactome.pdf", width = 18, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_reactome.png", width = 18, height = 6) + +``` + + + +```{r Biocarta} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_biocarta, top_n = 7, ending = "Global - Biocarta") +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_biocarta, top_n = 7, ending = "RNA - Biocarta") + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_biocarta.pdf", width = 16, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_biocarta.png", width = 16, height = 6) + +``` + + + +```{r KEGG} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_kegg, top_n = 7, ending = "Global - KEGG") +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_kegg, top_n = 7, ending = "RNA - KEGG") + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_kegg.pdf", width = 17, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_kegg.png", width = 17, height = 6) + +``` + + + +```{r Wikipathways} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_wikipathways, top_n = 7, ending = "Global - Wiki Pathways") +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_wikipathways, top_n = 7, ending = "RNA - Wiki Pathways") + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_wikipathways.pdf", width = 16, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_wikipathways.png", width = 16, height = 6) + +``` + + + +```{r Pathway Interaction DB} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_PathwayInteractionDB, top_n = 7, ending = "Global - PID") +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_PathwayInteractionDB, top_n = 7, ending = "RNA - PID") + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_pathway_interaction_db.pdf", width = 16, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_159_pathway_interaction_db.png", width = 16, height = 6) + +``` + + + +```{r} +ggsave(plot = p3[[4]], filename = "Data/subtype_drivers/ksea_159_by_cluster.pdf", height = 6, width = 6) +ggsave(plot = p3[[4]], filename = "Data/subtype_drivers/ksea_159_by_cluster.png", height = 6, width = 6) + +``` + + + + + + + + + + + + + + + + + + diff --git a/cohort_summary/NMF/subtype_drivers_alt.Rmd b/cohort_summary/NMF/subtype_drivers_alt.Rmd new file mode 100644 index 0000000..687102c --- /dev/null +++ b/cohort_summary/NMF/subtype_drivers_alt.Rmd @@ -0,0 +1,417 @@ +--- +title: "Subtype drivers" +author: "Camilo Posso" +date: "06/13/2022" +output: + html_document: + code_folding: hide + toc: true +editor_options: + chunk_output_type: inline +--- + +## Goal + + +The goal of this markdown is to find biological identifiers for the subtypes. + + +```{r include=FALSE} +library(clusterProfiler) +library(dplyr) +library(ggpubr) +library(ggplot2) +library(kableExtra) +library(ggplotify) +library(KSEAapp) + +source("../../util/synapseUtil.R") +source("../../util/loading_data.R") +source("../../util/mutational_analysis_helper.R") +source("../../util/make_plots_util.R") + +syn <- synapseLogin() + +load("../../Misc/load.combined.data 3-09-2022.RData") +# load.combined.data() +metadata <- load.metadata() +clusters <- read.table(syn$get("syn30030154")$path, sep = "\t") +rownames(clusters) <- clusters$Barcode.ID +metadata$Cluster <- clusters[rownames(metadata), "Cluster"] + +global_mat <- pivot_wider(global.data, names_from = "Barcode.ID", + values_from = "LogRatio") %>% + column_to_rownames("Gene") +m_global <- MSnSet(exprs = global_mat %>% as.matrix(), + pData = metadata[colnames(global_mat), ]) + +rna_mat <- pivot_wider(RNA.data, names_from = "Barcode.ID", + values_from = "RNA counts") %>% + column_to_rownames("Gene") +m_rna <- MSnSet(exprs = rna_mat %>% as.matrix(), + pData = metadata[colnames(rna_mat), ]) + +phospho_mat <- pivot_wider(phospho.data %>% select(-Gene), names_from = "Barcode.ID", + values_from = "LogRatio") %>% + column_to_rownames("SiteID") +m_phospho <- MSnSet(exprs = phospho_mat %>% as.matrix(), + pData = metadata[colnames(phospho_mat), ]) + +``` + + +```{r} +library(clusterProfiler) +library(msigdbr) + +t2g_hallmark <- msigdbr(species = "Homo sapiens", category = "H") %>% + dplyr::select(gs_name, gene_symbol) +## 50 unique terms +# unique(t2g_hallmark$gs_name) + +t2g_gobp <- msigdbr(species = "Homo sapiens", category = "C5", subcategory = "BP") %>% + dplyr::select(gs_name, gene_symbol) %>% + mutate(gs_name = gsub("_", " ", gs_name)) + +t2g_reactome <- msigdbr(species = "Homo sapiens", category = "C2") %>% + dplyr::filter(gs_subcat == "CP:REACTOME") %>% + dplyr::select(gs_name, gene_symbol) + +t2g_biocarta <- msigdbr(species = "Homo sapiens", category = "C2") %>% + dplyr::filter(gs_subcat == "CP:BIOCARTA") %>% + dplyr::select(gs_name, gene_symbol) + +t2g_kegg <- msigdbr(species = "Homo sapiens", category = "C2") %>% + dplyr::filter(gs_subcat == "CP:KEGG") %>% + dplyr::select(gs_name, gene_symbol) + +t2g_wikipathways <- msigdbr(species = "Homo sapiens", category = "C2") %>% + dplyr::filter(gs_subcat == "CP:WIKIPATHWAYS") %>% + dplyr::select(gs_name, gene_symbol) + +t2g_PathwayInteractionDB <- msigdbr(species = "Homo sapiens", category = "C2") %>% + dplyr::filter(gs_subcat == "CP:PID") %>% + dplyr::select(gs_name, gene_symbol) + +KSDB <- read.csv(system.file('PSP&NetworKIN_Kinase_Substrate_Dataset_July2016.csv', + package='amlresistancenetworks'),stringsAsFactors = FALSE) + + +``` + + + +```{r} +ksea_wrap <- function(m, var_name, top_n = 7, font_size = 7, ending = "", ...){ + var_levels <- unique(pData(m)[[var_name]]) %>% sort() + combined_ksea <- data.frame() + # exprs(m) <- sweep(exprs(m), 1, apply(exprs(m), 1, mean, na.rm = T), FUN = '-') + + for (level in var_levels){ + fold_change_ingroup <- apply(exprs(m)[, m[[var_name]] == level], 1, mean, na.rm = T) + all_means <- data.frame(mean_grp = fold_change_ingroup) + for (other_level in setdiff(var_levels, c(level))){ + all_means[other_level] <- apply(exprs(m)[, m[[var_name]] == other_level], 1, mean, na.rm = T) + } + other_mean <- all_means[, -1] %>% + rowSums() + other_mean <- other_mean/(ncol(all_means) - 1) + fold_change <- all_means[, 1] - other_mean + # fold_change_outgroup <- apply(exprs(m)[, m[[var_name]] != level], 1, mean, na.rm = T) + fold_change <- 2**fold_change + PX <- data.frame(Protein = "NULL", Gene = names(fold_change), Peptide = "NULL", + Residue.Both = names(fold_change), p = "NULL", FC = fold_change) %>% + dplyr::mutate(Residue.Both = sub("^.*-", "", Residue.Both)) %>% + dplyr::mutate(Residue.Both = gsub("[a-z]", ";", Residue.Both)) %>% + dplyr::mutate(Residue.Both = gsub(";$", "", Residue.Both), + Gene = sub("^(.*)-.*$", "\\1", Gene)) + + combined_ksea <- KSEA.Scores(KSDB, PX, NetworKIN = TRUE, NetworKIN.cutoff = 5) %>% + dplyr::select(Kinase.Gene, m, FDR, z.score) %>% + dplyr::rename(pathway = Kinase.Gene, enrichment = z.score, + adj_p_val = FDR, set_size = m) %>% + dplyr::mutate(data_type = "phospho", + var_name = level, + DB = "KSDB") %>% + filter(set_size >= 3) %>% + rbind(combined_ksea) + } + + combined_ksea$pathway_subtype_id <- paste(combined_ksea$pathway, + combined_ksea$var_name, sep = " -- ") + rownames(combined_ksea) <- combined_ksea$pathway_subtype_id + write.table(combined_ksea, paste0("Data/subtype_drivers/ksea_v2_", var_name, "_", + ending, "_results.txt"), sep = "\t") + + chosen_pathways <- combined_ksea %>% + dplyr::group_by(var_name) %>% + dplyr::arrange(adj_p_val) %>% + dplyr::slice(1:top_n) %>% + dplyr::pull(pathway) + + combined_ksea_small <- combined_ksea %>% + dplyr::filter(pathway %in% chosen_pathways) %>% + dplyr::mutate(pathway = str_wrap(pathway, width = 50)) + + ksea_mat <- combined_ksea_small %>% + dplyr::select(pathway, enrichment, var_name) %>% + pivot_wider(names_from = var_name, values_from = enrichment) %>% + column_to_rownames("pathway") + + ksea_mat_p <- combined_ksea_small %>% + dplyr::select(pathway, adj_p_val, var_name) %>% + pivot_wider(names_from = var_name, values_from = adj_p_val) %>% + column_to_rownames("pathway") + + ksea_mat_p <- ksea_mat_p %>% as.matrix() + ksea_mat_p[is.na(ksea_mat_p)] <- 1 + ksea_mat_p <- apply(ksea_mat_p, 2, signif, digits = 3) + + ksea_mat_p <- ksea_mat_p[, var_levels] + ksea_mat_p2 <- ksea_mat_p + ksea_mat_p2[ksea_mat_p >= 0.01] <- "" + ksea_mat_p2[ksea_mat_p < 0.01] <- "*" + ksea_mat <- ksea_mat[, var_levels] + + title <- paste("KSEA", var_name, ending) + + if (nchar(ending) > 0){ + ending <- tolower(ending) + var_name <- tolower(var_name) + plot_path <- paste0("Data/subtype_drivers/ksea_v2_", var_name, "_", ending, ".pdf") %>% + gsub(" ", "_", .) + } else { + plot_path <- paste0("Data/subtype_drivers/ksea_v2_", var_name, ".pdf") + } + + heatmap_breaks <- seq(-5, 5, by = 0.2) + heatmap_colors <- colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(length(heatmap_breaks)) + p <- pheatmap(ksea_mat, display_numbers = ksea_mat_p2, cluster_cols = F, fontsize = 9, + fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, + breaks = heatmap_breaks, color = heatmap_colors, width = 6, + filename = plot_path, ...) + return(p) + +} + + +``` + + + +```{r} +rank_gsea <- function(m, var_name, t2g, wrap_width = 40, font_size = 7, + top_n = 7, ending = "", ...){ + var_levels <- unique(pData(m)[[var_name]]) %>% sort() + combined_gsea <- data.frame() + # exprs(m) <- sweep(exprs(m), 1, apply(exprs(m), 1, mean, na.rm = T), FUN = '-') + + if (file.exists(paste0("Data/subtype_drivers/rank_gsea_v2_", var_name, "_", + ending, "_results.txt"))){ + print("Found GSEA results table.") + combined_gsea <- read.table(paste0("Data/subtype_drivers/rank_gsea_v2_", var_name, "_", + ending, "_results.txt")) + } else { + for (level in var_levels){ + fold_change_ingroup <- apply(exprs(m)[, m[[var_name]] == level], 1, mean, na.rm = T) + all_means <- data.frame(mean_grp = fold_change_ingroup) + for (other_level in setdiff(var_levels, c(level))){ + all_means[other_level] <- apply(exprs(m)[, m[[var_name]] == other_level], 1, mean, na.rm = T) + } + other_mean <- all_means[, -1] %>% + rowSums() + other_mean <- other_mean/(ncol(all_means) - 1) + fold_change <- all_means[, 1] - other_mean + fold_change <- sort(fold_change, decreasing = TRUE) + set.seed(69) + ## For some pathways, gsea is unable to assess a pvalue. These are removed by the function + ## Internally. This is why some pathways (like "GOBP NCRNA METABOLISM") are excluded from + ## the result. + combined_gsea <- GSEA(fold_change, eps = 1e-16, minGSSize = 10, + pvalueCutoff = 1, TERM2GENE = t2g)@result %>% + dplyr::select(Description, setSize, NES, pvalue, p.adjust, core_enrichment) %>% + dplyr::mutate(var_name = level) %>% + rbind(combined_gsea) + } + + combined_gsea$pathway_subtype_id <- paste(combined_gsea$Description, + combined_gsea$var_name, sep = " -- ") + rownames(combined_gsea) <- combined_gsea$pathway_subtype_id + write.table(combined_gsea, paste0("Data/subtype_drivers/rank_gsea_v2_", var_name, "_", + ending, "_results.txt"), sep = "\t") + } + + combined_gsea <- combined_gsea %>% + dplyr::mutate(Description = sub("^[A-Z]+_", "", Description)) %>% + dplyr::mutate(Description = sub("^GOBP ", "", Description)) + + chosen_pathways <- combined_gsea %>% + dplyr::group_by(var_name) %>% + dplyr::arrange(p.adjust) %>% + dplyr::slice(1:top_n) %>% + dplyr::pull(Description) + + combined_gsea_small <- combined_gsea %>% + dplyr::filter(Description %in% chosen_pathways) %>% + dplyr::mutate(Description = str_wrap(Description, width = 50)) + + gsea_mat <- combined_gsea_small %>% + dplyr::select(Description, NES, var_name) %>% + dplyr::mutate(Description_og = Description) %>% + dplyr::mutate(Description = str_wrap(gsub("_", " ", Description), width = wrap_width)) %>% + pivot_wider(names_from = var_name, values_from = NES) %>% + column_to_rownames("Description") + + gsea_mat_p <- combined_gsea_small %>% + dplyr::select(Description, p.adjust, var_name) %>% + pivot_wider(names_from = var_name, values_from = p.adjust) %>% + column_to_rownames("Description") + + gsea_mat_p <- gsea_mat_p %>% as.matrix() + gsea_mat_p[is.na(gsea_mat_p)] <- 1 + gsea_mat_p <- apply(gsea_mat_p, 2, signif, digits = 2) + + gsea_mat_p <- gsea_mat_p[, var_levels] + gsea_mat_p2 <- gsea_mat_p + gsea_mat_p2[gsea_mat_p >= 0.01] <- "" + gsea_mat_p2[gsea_mat_p < 0.01] <- "*" + gsea_mat <- gsea_mat[, var_levels] + + title <- paste("GSEA", var_name, ending) + + if (nchar(ending) > 0){ + ending <- tolower(ending) + var_name <- tolower(var_name) + plot_path <- paste0("Data/subtype_drivers/rank_gsea_v2_", var_name, "_", ending, ".pdf") %>% + gsub(" ", "_", .) + } else { + plot_path <- paste0("Data/subtype_drivers/rank_gsea_v2_", var_name, ".pdf") + } + + p <- pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, + fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, + width = 6, filename = plot_path, ...) + pheatmap(gsea_mat, display_numbers = gsea_mat_p2, cluster_cols = F, fontsize = 9, + fontsize_row = font_size, main = title, treeheight_row = 0, cellwidth = 33, + width = 6, filename = sub(".pdf$", ".png", plot_path), ...) + + return(p) + +} + +``` + + + +```{r Hallmarks} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_hallmark, top_n = 7, ending = "Global - Hallmarks") +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_hallmark, top_n = 7, ending = "RNA - Hallmarks") +p3 <- ksea_wrap(m_phospho, var_name = "Cluster", top_n = 6, ending = "Phospho - PhosphoSitePlus & NetworKIN") + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_hallmarks.pdf", width = 16, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_hallmarks.png", width = 16, height = 6) + +``` + + + +```{r GO BP} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_gobp, top_n = 7, + ending = "Global - GO Biological Process", font_size = 6, wrap_width = 50) +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_gobp, top_n = 7, + ending = "RNA - GO Biological Process", font_size = 6, wrap_width = 50) + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_gsea_gobp.pdf", width = 19, height = 5.5) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_gsea_gobp.png", width = 19, height = 5.5) + +``` + + + +```{r Reactome} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_reactome, top_n = 7, ending = "Global - Reactome", + font_size = 7, wrap_width = 50) +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_reactome, top_n = 7, ending = "RNA - Reactome", + font_size = 6, wrap_width = 50) + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_reactome.pdf", width = 18, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_reactome.png", width = 18, height = 6) + +``` + + + +```{r Biocarta} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_biocarta, top_n = 7, ending = "Global - Biocarta") +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_biocarta, top_n = 7, ending = "RNA - Biocarta") + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_biocarta.pdf", width = 16, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_biocarta.png", width = 16, height = 6) + +``` + + + +```{r KEGG} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_kegg, top_n = 7, ending = "Global - KEGG") +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_kegg, top_n = 7, ending = "RNA - KEGG") + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_kegg.pdf", width = 17, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_kegg.png", width = 17, height = 6) + +``` + + + +```{r Wikipathways} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_wikipathways, top_n = 7, ending = "Global - Wiki Pathways") +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_wikipathways, top_n = 7, ending = "RNA - Wiki Pathways") + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_wikipathways.pdf", width = 16, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_wikipathways.png", width = 16, height = 6) + +``` + + + +```{r Pathway Interaction DB} +p1 <- rank_gsea(m_global, "Cluster", t2g = t2g_PathwayInteractionDB, top_n = 7, ending = "Global - PID") +p2 <- rank_gsea(m_rna, "Cluster", t2g = t2g_PathwayInteractionDB, top_n = 7, ending = "RNA - PID") + +p <- arrangeGrob(p1[[4]], p2[[4]], p3[[4]], ncol = 3) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_pathway_interaction_db.pdf", width = 16, height = 6) +ggsave(plot = p, "Data/subtype_drivers/combined_gsea_ksea_v2_pathway_interaction_db.png", width = 16, height = 6) + +``` + + + +```{r} +ggsave(plot = p3[[4]], filename = "Data/subtype_drivers/ksea_by_cluster.pdf", height = 6, width = 6) +ggsave(plot = p3[[4]], filename = "Data/subtype_drivers/ksea_by_cluster.png", height = 6, width = 6) + +``` + + + + + + + + + + + + + + + + + + diff --git a/cohort_summary/NMF/subtype_mutation_AUC/subtype_added_value.Rmd b/cohort_summary/NMF/subtype_mutation_AUC/subtype_added_value.Rmd new file mode 100644 index 0000000..e93278a --- /dev/null +++ b/cohort_summary/NMF/subtype_mutation_AUC/subtype_added_value.Rmd @@ -0,0 +1,452 @@ +--- +title: "Subtype drivers" +author: "Camilo Posso" +date: "08/29/2022" +output: + html_document: + code_folding: hide + toc: true +editor_options: + chunk_output_type: inline +--- + +## Goal + + +The goal of this markdown is to find biological identifiers for the subtypes. + + +```{r include=FALSE} +library(dplyr) +library(ggplot2) +library(broom) + +source("../../../util/synapseUtil.R") +source("../../../util/loading_data.R") +source("../../../util/mutational_analysis_helper.R") +source("../../../util/make_plots_util.R") + +syn <- synapseLogin() + +load("../../../Misc/load.combined.data 3-09-2022.RData") +# load.combined.data() +metadata <- load.metadata() +clusters <- read.table(syn$get("syn30030154")$path, sep = "\t") +rownames(clusters) <- clusters$Barcode.ID +metadata$Cluster <- clusters[rownames(metadata), "Cluster"] + +global_mat <- pivot_wider(global.data, names_from = "Barcode.ID", + values_from = "LogRatio") %>% + column_to_rownames("Gene") +m_global <- MSnSet(exprs = global_mat %>% as.matrix(), + pData = metadata[colnames(global_mat), ]) + +rna_mat <- pivot_wider(RNA.data, names_from = "Barcode.ID", + values_from = "RNA counts") %>% + column_to_rownames("Gene") +m_rna <- MSnSet(exprs = rna_mat %>% as.matrix(), + pData = metadata[colnames(rna_mat), ]) + +phospho_mat <- pivot_wider(phospho.data %>% select(-Gene), names_from = "Barcode.ID", + values_from = "LogRatio") %>% + column_to_rownames("SiteID") +m_phospho <- MSnSet(exprs = phospho_mat %>% as.matrix(), + pData = metadata[colnames(phospho_mat), ]) + +mutation_data <- load_mutational_sample_data() + +NPM1_samples <- mutation_data %>% + filter(Gene == "NPM1_clinical") %>% + pull(Barcode.ID) %>% unique() + +DNMT3A_samples <- mutation_data %>% + filter(Gene == "DNMT3A") %>% + pull(Barcode.ID) %>% unique() + +metadata <- metadata %>% + mutate(NPM1 = case_when(Barcode.ID %in% NPM1_samples ~ "TRUE", + TRUE ~ "FALSE"), + DNMT3A = case_when(Barcode.ID %in% DNMT3A_samples ~ "TRUE", + TRUE ~ "FALSE"), + #mut_col = paste(FLT3.ITD, NPM1), + mut_col = FLT3.ITD, + shuffle = sample(Cluster), + Cluster = sub("Cluster ", "", Cluster), + Cluster_1 = case_when(Cluster == "1" ~ TRUE, + TRUE ~ FALSE), + Cluster_2 = case_when(Cluster == "2" ~ TRUE, + TRUE ~ FALSE), + Cluster_3 = case_when(Cluster == "3" ~ TRUE, + TRUE ~ FALSE), + Cluster_4 = case_when(Cluster == "4" ~ TRUE, + TRUE ~ FALSE)) + +latest_AUC <- read.table("drug_response.csv", sep = ",") +latest_AUC <- latest_AUC[-1, -1] + +colnames(latest_AUC) <- c("Barcode.ID", "Inhibitor", "AUC") +latest_AUC$AUC <- as.numeric(latest_AUC$AUC) + + +``` + + + +```{r} +## Setting up matrices +AUC_mat_old <- pivot_wider(functional.data.sensitive.family %>% select(Inhibitor, Barcode.ID, AUC), names_from = "Barcode.ID", + values_from = "AUC", values_fn = mean) %>% + column_to_rownames("Inhibitor") + +## Using drugs from James table +AUC_mat <- pivot_wider(latest_AUC %>% select(Inhibitor, Barcode.ID, AUC), names_from = "Barcode.ID", + values_from = "AUC", values_fn = mean) %>% + column_to_rownames("Inhibitor") + +chosen_drugs <- c('Venetoclax', 'Vargetef', 'Trametinib (GSK1120212)', + 'Tivozanib (AV-951)', 'Sorafenib', 'Selumetinib (AZD6244)', + 'Selinexor', 'SNS-032 (BMS-387032)', 'Rapamycin', + 'RAF265 (CHIR-265)', 'Quizartinib (AC220)', 'Ponatinib (AP24534)', + 'Pelitinib (EKB-569)', 'Panobinostat', 'PP242', 'PI-103', + 'PD173955', 'OTX-015', 'Neratinib (HKI-272)', 'NVP-TAE684', + 'NF-kB Activation Inhibitor', 'Midostaurin', 'KW-2449', 'KI20227', + 'JQ1', 'JNJ-28312141', 'JAK Inhibitor I', 'INK-128', + 'Gilteritinib', 'GDC-0941', 'Foretinib (XL880)', 'Flavopiridol', + 'Entospletinib (GS-9973)', 'Elesclomol', 'Dovitinib (CHIR-258)', + 'Doramapimod (BIRB 796)', 'Dasatinib', 'Cabozantinib', 'CYT387', + 'CI-1040 (PD184352)', 'Bortezomib (Velcade)', 'BEZ235', + 'Afatinib (BIBW-2992)', 'AT7519', 'A-674563', + '17-AAG (Tanespimycin)') + +## Subset to those drugs which we have used before. Can change later. +AUC_mat <- AUC_mat[chosen_drugs, ] + +m_auc <- MSnSet(exprs = AUC_mat %>% as.matrix(), + pData = metadata[colnames(AUC_mat), ]) + +``` + + +## Using limma and F test +```{r eval=FALSE, include=FALSE} +## Using limma and F test +mut_cluster <- limma_gen(m_auc, "~ mut_col + Cluster", "Cluster") + +mut_shuffle <- limma_gen(m_auc, "~ mut_col + shuffle", "shuffle") + +limma_cluster <- limma_gen(m_auc, "~ Cluster", "Cluster") +limma_shuffle <- limma_gen(m_auc, "~ shuffle", "shuffle") + +limma_mut <- limma_gen(m_auc, "~mut_col", "mut_col") + +cluster_mut <- limma_gen(m_auc, "~mut_col + Cluster", "mut_col") + +summary_df <- mut_cluster %>% + select(adj.P.Val3 = adj.P.Val) +summary_df$Inhibitor <- rownames(summary_df) + +summary_df <- limma_cluster %>% + select(adj.P.Val2 = adj.P.Val) %>% + mutate(Inhibitor = rownames(.)) %>% + merge(summary_df, by = "Inhibitor") +summary_df <- limma_mut %>% + select(adj.P.Val1 = adj.P.Val) %>% + mutate(Inhibitor = rownames(.)) %>% + merge(summary_df, by = "Inhibitor") + +colnames(summary_df) <- c("Inhibitor", + "FLT3.ITD significant", + "Subtype significant", + "Subtype improves FLT3 model") + +summary_df <- cluster_mut %>% + select(adj.P.Val_flt3 = adj.P.Val) %>% + mutate(Inhibitor = rownames(.)) %>% + merge(summary_df, by = "Inhibitor") + +colnames(summary_df)[2] <- "FLT3 improves subtype model" +rownames(summary_df) <- summary_df$Inhibitor + +pheatmap(summary_df[, c(3,5)], color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(2), + breaks = c(0, 0.05, 1), legend = FALSE, filename = "subtype_auc_model_summary.png", height = 10, + treeheight_row = 0, treeheight_col = 0, angle_col = 45) + +pheatmap(summary_df[, c(4,2)], color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(2), + breaks = c(0, 0.05, 1), legend = FALSE, filename = "subtype_auc_model_summary_2.png", height = 10, + treeheight_row = 0, treeheight_col = 0, angle_col = 45) + +pheatmap(summary_df[, c(3,5)], color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(2), + breaks = c(0, 0.05, 1), legend = FALSE, filename = "subtype_auc_model_summary_3.png", height = 10, + treeheight_row = 0, treeheight_col = 0, angle_col = 45) + +``` + + + + +```{r eval=FALSE, include=FALSE} +## scratch work +lm_df <- data_df_cluster %>% + filter(Inhibitor == chosen_drug) +model_og <- lm("value ~ Cluster + FLT3.ITD + FLT3.ITD:Cluster", data = lm_df) + +auc_mean <- mean(lm_df[lm_df$FLT3.ITD == "FALSE", "value"], na.rm = T) +lm_df_std <- lm_df %>% + mutate(value = value - auc_mean) +model <- lm("value ~ Cluster + FLT3.ITD + FLT3.ITD:Cluster", data = lm_df_std) +model1 <- lm("value ~ -1 + Cluster + FLT3.ITD + FLT3.ITD:Cluster", data = lm_df_std) + + +``` + + +```{r eval=FALSE, include=FALSE} +## Little interaction, subtype important +# chosen_drug <- "Selumetinib (AZD6244)" # subtype predicts AUC response, little interaction. + +## Interaction present, subtype important +# chosen_drug <- "Venetoclax" ## affected little by FLT3, subtype interaction with FLT3 seen in cluster 3. + +## Purely FLT3 effect, harder to discern for sure +# chosen_drug <- "KW-2449" +# chosen_drug <- "Crenolanib" +chosen_drug <- "Elesclomol" ## affected a lot by FLT3, maybe negative subtype interaction? in the sense that + ## subtype 1 FLT3.ITD is not lower, but higher + ## than FLT3.ITD false. This is a FLT3 inhibitor + + +lm_df <- data_df_cluster %>% + filter(Inhibitor == chosen_drug) +model_og <- lm("value ~ Cluster + FLT3.ITD:Cluster", data = lm_df) + +auc_mean <- mean(lm_df[lm_df$FLT3.ITD == "FALSE", "value"], na.rm = T) +lm_df_std <- lm_df %>% + mutate(value = value - auc_mean) +model <- lm("value ~ Cluster + FLT3.ITD:Cluster", data = lm_df_std) +model1 <- lm("value ~ -1 + Cluster + FLT3.ITD:Cluster", data = lm_df_std) + +plot_df <- data.frame(AUC = AUC_mat[chosen_drug, ] %>% as.numeric(), Barcode.ID = colnames(AUC_mat)) %>% + merge(metadata %>% select(Barcode.ID, FLT3.ITD, Cluster), by = "Barcode.ID") +AUC_mean = mean(plot_df[plot_df$FLT3.ITD == "FALSE", "AUC"], na.rm = T) + +ggplot(plot_df, aes(x = FLT3.ITD, y = AUC, color = FLT3.ITD)) + geom_boxplot() + + geom_jitter(position = position_jitter()) + + geom_hline(aes(yintercept = AUC_mean)) + ggtitle(chosen_drug) + +ggplot(plot_df, aes(x = Cluster, y = AUC, color = Cluster)) + geom_boxplot() + + geom_jitter(position = position_jitter()) + + geom_hline(aes(yintercept = AUC_mean)) + ggtitle(chosen_drug) + +ggplot(plot_df, aes(x = paste(Cluster, FLT3.ITD), y = AUC, color = FLT3.ITD)) + geom_boxplot() + + geom_jitter(position = position_jitter()) + + geom_hline(aes(yintercept = AUC_mean)) + ggtitle(chosen_drug) + + +``` + + +### Trying with boolean variables, gives the same behavior, and same coefficients and p-values + +```{r eval=FALSE, include=FALSE} +data_df_cluster <- exprs(m_auc) %>% as.data.frame() +data_df_cluster$Inhibitor <- rownames(data_df_cluster) +data_df_cluster <- data_df_cluster %>% + pivot_longer(-Inhibitor) %>% + merge(metadata %>% select(name = Barcode.ID, FLT3.ITD, Cluster_1, Cluster_2, Cluster_3, Cluster_4)) + +set.seed(117) +data_df_shuffle <- exprs(m_auc) %>% as.data.frame() +data_df_shuffle$Inhibitor <- rownames(data_df_shuffle) +data_df_shuffle <- data_df_shuffle %>% + pivot_longer(-Inhibitor) %>% + merge(metadata %>% select(name = Barcode.ID, FLT3.ITD, Cluster = shuffle)) + +auc_model_v2 <- function(chosen_drug, data_df){ + lm_df <- data_df %>% + filter(Inhibitor == chosen_drug) + auc_mean <- mean(lm_df$value, na.rm = T) + # lm_df <- lm_df %>% + # mutate(value = value - auc_mean) + model <- lm("value ~ FLT3.ITD + Cluster_1 + Cluster_2 + Cluster_3 + Cluster_4 + + Cluster_1:FLT3.ITD + Cluster_2:FLT3.ITD + + Cluster_3:FLT3.ITD + Cluster_4:FLT3.ITD", data = lm_df) + out_df <- data.frame(Inhibitor = chosen_drug, + term = tidy(summary(model))[, 1], + p.value = tidy(summary(model))[, 5]) %>% + rbind(data.frame(Inhibitor = chosen_drug, + term = "F_stat", + p.value = pf(summary(model)$fstatistic[[1]], summary(model)$fstatistic[[2]], + summary(model)$fstatistic[[3]], lower.tail = FALSE))) + return(out_df) +} + + +``` + + + + +```{r} +data_df_cluster <- exprs(m_auc) %>% as.data.frame() +data_df_cluster$Inhibitor <- rownames(data_df_cluster) +data_df_cluster <- data_df_cluster %>% + pivot_longer(-Inhibitor) %>% + merge(metadata %>% select(name = Barcode.ID, FLT3.ITD, Cluster)) +data_df_cluster$Cluster <- factor(data_df_cluster$Cluster, levels = c("1", "2", "3", "4")) + +data_df_shuffle <- exprs(m_auc) %>% as.data.frame() +data_df_shuffle$Inhibitor <- rownames(data_df_shuffle) +data_df_shuffle <- data_df_shuffle %>% + pivot_longer(-Inhibitor) %>% + merge(metadata %>% select(name = Barcode.ID, FLT3.ITD, Cluster = shuffle)) + +auc_model <- function(chosen_drug, data_df){ + lm_df <- data_df %>% + filter(Inhibitor == chosen_drug) + ## Normalize using FLT3.ITD = FALSE as the control group. + ## Basically, we take away the mean of the FLT3.ITD = FALSE group + auc_mean <- mean(lm_df[lm_df$FLT3.ITD == "FALSE", "value"], na.rm = T) + lm_df <- lm_df %>% + mutate(value = value - auc_mean) + model <- lm("value ~ -1 + Cluster + Cluster:FLT3.ITD", data = lm_df) + out_df <- data.frame(Inhibitor = chosen_drug, + term = tidy(summary(model))[, 1], + p.value = tidy(summary(model))[, 5], + coeff = model$coefficients %>% as.numeric()) %>% + rbind(data.frame(Inhibitor = chosen_drug, + term = "F_stat", + p.value = pf(summary(model)$fstatistic[[1]], summary(model)$fstatistic[[2]], + summary(model)$fstatistic[[3]], lower.tail = FALSE), + coeff = NA)) + return(out_df) +} + + +``` + + + +```{r} +epsi <- 0.0000001 +all_results <- lapply(rownames(m_auc), auc_model, data_df_cluster) %>% do.call("rbind", .) %>% + mutate(adj_p_val = p.adjust(p.value, method = "BH"), + xx = case_when(term != "F_stat" ~ (p.value < 0.05) * coeff, + term == "F_stat" & p.value > 0.05 ~ 0)) %>% + select(-adj_p_val, -p.value, -coeff) + +mat_results <- pivot_wider(all_results, values_from = "xx", names_from = "term") %>% + select(Inhibitor, everything()) %>% + column_to_rownames("Inhibitor") %>% as.data.frame() + +colnames(mat_results) <- c("Subtype 1", "Subtype 2", "Subtype 3", "Subtype 4", + "Subtype 1 and FLT3", "Subtype 2 and FLT3", + "Subtype 3 and FLT3", "Subtype 4 and FLT3", "F tatistic") + + +color_pal <- c(rev(colorRampPalette(brewer.pal(n = 8, name = "Blues"))(15)), + "grey", + colorRampPalette(brewer.pal(n = 8, name = "Reds"))(15)) + +breaks_buddy <- c(seq(min(mat_results, na.rm = T), -2*epsi, length = 15), + -epsi, epsi, + seq(2*epsi, max(mat_results, na.rm = T), length = 15)) + +pheatmap(mat_results, color = color_pal, breaks = breaks_buddy, na_col = "#815F8A", + legend = TRUE, filename = "subtype_auc_model_summary_interaction_v2.png", fontsize = 13, + treeheight_col = 0, angle_col = 315, cluster_cols = F, width = 8.5, height = 15) + +pheatmap(mat_results, color = color_pal, breaks = breaks_buddy, na_col = "#815F8A", + legend = TRUE, filename = "subtype_auc_model_summary_interaction_v2.pdf", fontsize = 13, + treeheight_col = 0, angle_col = 315, cluster_cols = F, width = 8.5, height = 15) + +``` + + + +```{r} + + + + +``` + + + +```{r} + +all_results <- lapply(rownames(m_auc), auc_model, data_df_shuffle) %>% do.call("rbind", .) %>% + mutate(adj_p_val = p.adjust(p.value, method = "BH")) %>% + select(-p.value) + +mat_results <- pivot_wider(all_results, values_from = "adj_p_val", names_from = "term") %>% + select(Inhibitor, everything()) %>% + column_to_rownames("Inhibitor") %>% as.data.frame() + + +pheatmap(t(mat_results), color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(2), + breaks = c(0, 0.05, 1), legend = FALSE, filename = "subtype_auc_model_summary_interaction_shuffled.png", + treeheight_row = 0, treeheight_col = 0, angle_col = 315, cluster_rows = F, width = 20, height = 7) + +pheatmap(t(mat_results), color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(2), + breaks = c(0, 0.05, 1), legend = FALSE, filename = "subtype_auc_model_summary_interaction_shuffled.pdf", + treeheight_row = 0, treeheight_col = 0, angle_col = 315, cluster_rows = F, width = 20, height = 7) + +``` + + + +```{r} +chosen_drug <- "JAK Inhibitor" +plot_df <- data.frame(AUC = AUC_mat[chosen_drug, ] %>% as.numeric(), Barcode.ID = colnames(AUC_mat)) %>% + merge(metadata %>% select(Barcode.ID, FLT3.ITD, Cluster), by = "Barcode.ID") +AUC_mean = mean(plot_df[plot_df$FLT3.ITD == "FALSE", "AUC"], na.rm = T) + +meta_shuffled <- data_df_shuffle %>% + select(Barcode.ID = name, FLT3.ITD, Cluster) %>% + unique() + +plot_df_shuffled <- plot_df %>% select(-Cluster, -FLT3.ITD) %>% + merge(meta_shuffled, by = "Barcode.ID") + +## Original cluster +ggplot(plot_df, aes(x = FLT3.ITD, y = AUC, color = FLT3.ITD)) + geom_boxplot() + + geom_jitter(position = position_jitter()) + + geom_hline(aes(yintercept = AUC_mean)) + ggtitle("original") + +ggplot(plot_df, aes(x = Cluster, y = AUC, color = Cluster)) + geom_boxplot() + + geom_jitter(position = position_jitter()) + + geom_hline(aes(yintercept = AUC_mean)) + ggtitle("original") + +ggplot(plot_df, aes(x = paste(Cluster, FLT3.ITD), + y = AUC, color = FLT3.ITD)) + geom_boxplot() + + geom_jitter(position = position_jitter()) + + geom_hline(aes(yintercept = AUC_mean)) + ggtitle("original") + +## Shuffled cluster labels +ggplot(plot_df_shuffled, aes(x = FLT3.ITD, y = AUC, color = FLT3.ITD)) + geom_boxplot() + + geom_jitter(position = position_jitter()) + + geom_hline(aes(yintercept = AUC_mean)) + ggtitle("shuffled") + +ggplot(plot_df_shuffled, aes(x = Cluster, y = AUC, color = Cluster)) + geom_boxplot() + + geom_jitter(position = position_jitter()) + + geom_hline(aes(yintercept = AUC_mean)) + ggtitle("shuffled") + +ggplot(plot_df_shuffled, aes(x = paste(Cluster, FLT3.ITD), + y = AUC, color = FLT3.ITD)) + geom_boxplot() + + geom_jitter(position = position_jitter()) + + geom_hline(aes(yintercept = AUC_mean)) + ggtitle("shuffled") + +``` + + + +```{r} +pheatmap(exprs(m_auc), na_col = "white", cluster_rows = FALSE, cluster_cols = FALSE, + show_colnames = FALSE, height = 10, filename = "auc_data_updated.pdf") + +``` + + + + + diff --git a/proteomics/1-process_global_data.Rmd b/proteomics/1-process_global_data.Rmd index eac822f..2a21cd5 100644 --- a/proteomics/1-process_global_data.Rmd +++ b/proteomics/1-process_global_data.Rmd @@ -17,6 +17,7 @@ t0 <- Sys.time(); print(t0) ```{r setup} library(PlexedPiper) +library(PNNL.DMS.utils) library(dplyr) data_package_num <- 3676 data_folder <- "data/Ex10_global_data/" @@ -49,7 +50,7 @@ msgf_data_path <- file.path(data_folder, "msgfData_original.RData") if (file.exists(msgf_data_path)) { load(msgf_data_path) } else { - msnid <- read_msms_data_from_DMS(data_package_num) + msnid <- read_msgf_data_from_DMS(data_package_num) save(msnid, file=msgf_data_path) } show(msnid) diff --git a/util/mutational_analysis_helper.R b/util/mutational_analysis_helper.R index 7c1de59..af80817 100644 --- a/util/mutational_analysis_helper.R +++ b/util/mutational_analysis_helper.R @@ -185,7 +185,7 @@ compress_enrichment <- function(enrichment_array, threshold = .75, colname='Coun # sort enrichment array by attribute of choice enrichment_array <- enrichment_array%>% - dplyr::mutate(sortval=colname) + dplyr::mutate(sortval := !! sym(colname)) if (descending){ enrichment_array <- enrichment_array %>%