diff --git a/.lintr b/.lintr new file mode 100644 index 0000000..d496296 --- /dev/null +++ b/.lintr @@ -0,0 +1,4 @@ +linters: all_linters() +exclusions: list("projectR/tests/testthat/test_projectR.R", + "projectR/tests/testthat.R") +encoding: "UTF-8" diff --git a/DESCRIPTION b/DESCRIPTION index c509f99..4805ad7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -4,7 +4,14 @@ Title: Functions for the projection of weights from PCA, CoGAPS, NMF, correlation, and clustering Version: 1.19.01 Author: Gaurav Sharma, Charles Shin, Jared Slosberg, Loyal Goff, Genevieve Stein-O'Brien -Maintainer: Genevieve Stein-O'Brien +Authors@R: c( + person("Gaurav", "Sharma", role = c("aut")), + person("Charles", "Shin", role = c("aut")), + person("Jared", "Slosberg", role = c("aut")), + person("Loyal", "Goff", role = c("aut")), + person("Ryan", "Palaganas", role = c("aut")), + person("Genevieve", "Stein-O'Brien", role = c("aut", "cre" ), email = "gsteinobrien@gmail.com") + ) Description: Functions for the projection of data into the spaces defined by PCA, CoGAPS, NMF, correlation, and clustering. License: GPL (==2) Imports: @@ -17,11 +24,13 @@ Imports: ggalluvial, RColorBrewer, dplyr, + fgsea, reshape2, viridis, scales, Matrix, MatrixModels, + msigdbr, ggplot2, cowplot, ggrepel, @@ -41,7 +50,7 @@ Suggests: SeuratObject LazyData: TRUE LazyDataCompression: gzip -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Encoding: UTF-8 VignetteBuilder: knitr biocViews: FunctionalPrediction, GeneRegulation, BiologicalQuestion, Software diff --git a/NAMESPACE b/NAMESPACE index e05318d..273c371 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,7 +10,9 @@ export(getTSNE) export(getUMAP) export(intersectoR) export(multivariateAnalysisR) +export(pdVolcano) export(plotConfidenceIntervals) +export(plotVolcano) export(projectR) export(projectionDriveR) export(rotatoR) @@ -21,14 +23,17 @@ import(MatrixModels) import(RColorBrewer) import(cluster) import(dplyr) +import(fgsea) import(ggalluvial) import(ggplot2) import(limma) +import(msigdbr) import(reshape2) import(scales, except = viridis_pal) import(tsne) import(umap) import(viridis) +importFrom(Matrix,as.matrix) importFrom(NMF,fcnnls) importFrom(ROCR,performance) importFrom(ROCR,prediction) @@ -37,6 +42,7 @@ importFrom(dplyr,"%>%") importFrom(dplyr,dense_rank) importFrom(dplyr,mutate) importFrom(ggrepel,geom_label_repel) +importFrom(ggrepel,geom_text_repel) importFrom(grDevices,colorRampPalette) importFrom(methods,callNextMethod) importFrom(scales,squish) diff --git a/R/package.R b/R/package.R index d9c261b..367306e 100644 --- a/R/package.R +++ b/R/package.R @@ -95,6 +95,16 @@ #' @format A CogapsResult object "CR.RNAseq6l3c3t" +#' CogapsResult object for microglial_counts +#' +#' cr_microglia contains the output of the CoGAPS function in the +#' CoGAPS package for data = microglial_counts +#' +#' @name cr_microglial +#' @docType data +#' @format A CogapsResult object +"cr_microglial" + #' CoGAPS patterns learned from the developing mouse retina. #' #' @references diff --git a/R/plotting.R b/R/plotting.R index 28e02dc..31bbfe4 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -1,135 +1,368 @@ -# Example plots to be functionalized -# devtools::install_github('rlbarter/superheat') -# -# test<-projectR(data=p.ESepiGen4c1l$mRNA.Seq,Patterns=AP.RNAseq6l3c3t$Amean,AnnotionObj=map.ESepiGen4c1l,IDcol="GeneSymbols",full=TRUE) -# -# tmp<-matrix("",nrow = 5,ncol=9) -# tmp[test$pval<0.01]<-"*" -# -# superheat(test$projection,row.dendrogram=TRUE, pretty.order.cols = TRUE, -# heat.pal.values = c(0, 0.5, 1),yt=colSums(test$projection),yt.plot.type='scatterline',yt.axis.name="Sum of\nProjections",X.text=tmp,X.text.size=8,bottom.label.text.angle = 90) -# - - - - -####################################################################################################################################### -#' +################################################################################ #' plotConfidenceIntervals -#' -#' Generate point and line confidence intervals from provided estimates. -#' +#' +#' Generate point and line confidence intervals from provided estimates. #' @import ggplot2 #' @import viridis #' @importFrom scales squish #' @importFrom dplyr %>% mutate dense_rank #' @importFrom cowplot plot_grid -#' @param confidence_intervals A dataframe of features x estimates. -#' @param interval_name names of columns that contain the low and high estimates, respectively. Default: c("low","high") +#' @param confidence_intervals A dataframe of features x estimates. +#' @param interval_name Estimate column names. Default: c("low","high") #' @param pattern_name string to use as the title for plots. -#' @param sort Boolean. Whether or not to sort genes by their estimates (default = T) -#' @param genes a vector with names of genes to include in plot. If sort=F, estimates will be plotted in this order. -#' @param weights optional. weights of features to include as annotation. -#' @param weights_clip optional. quantile of data to clip color scale for improved visualization. Default: 0.99 -#' @param weights_vis_norm Which processed version of weights to visualize as a heatmap. -#' Options are "none" (which uses provided weights) or "quantiles". Default: none -#' @return A list with pointrange estimates and, if requested, a heatmap of pattern weights. +#' @param sort Boolean. Sort genes by their estimates (default = TRUE) +#' @param genes a vector with names of genes to include in plot. +#' If sort=F, estimates will be plotted in this order. +#' @param weights optional. weights of features to include as annotation. +#' @param weights_clip optional. quantile of data to clip color scale for +#' improved visualization. Default: 0.99 +#' @param weights_vis_norm Which version of weights to visualize as a heatmap. +#' Options are "none" (uses provided weights) or "quantiles". Default: none +#' @param weighted specifies whether the confidence intervals in use are +#' weighted by the pattern and labels plots accordingly +#' @return A list with pointrange estimates and a heatmap of pattern weights. #' @export plotConfidenceIntervals <- function( - confidence_intervals, #confidence_interval is a data.frame or matrix with two columns (low, high). Genes must be rownames - interval_name = c("low","high"), - pattern_name = NULL, - sort = T, - genes = NULL, - weights = NULL, - weights_clip = 0.99, - weights_vis_norm = "none"){ - - if(weights_clip < 0 | weights_clip > 1){ + confidence_intervals, + interval_name = c("low", "high"), + pattern_name = NULL, + sort = TRUE, + genes = NULL, + weights = NULL, + weights_clip = 0.99, + weights_vis_norm = "none", + weighted = FALSE) { + + #bind variables locally to the function + mid <- idx <- low <- high <- positive <- NULL + if (weights_clip < 0L || weights_clip > 1L) { stop("weights_clip must be numeric between 0 and 1") } - - if(!(weights_vis_norm %in% c("none","quantiles"))){ + + if (!(weights_vis_norm %in% c("none", "quantiles"))) { stop("weights_vis_norm must be either 'none' or 'quantiles'") } - - #gene names were stored as rownames, make sure high and low estimates are stored + + if (weighted) { + lab <- "Weighted" + } else { + lab <- "Unweighted" + } + #gene names were stored as rownames, store high and low estimates confidence_intervals$gene_names <- rownames(confidence_intervals) - confidence_intervals$low <- confidence_intervals[,interval_name[1]] - confidence_intervals$high <- confidence_intervals[,interval_name[2]] - - - n <- dim(confidence_intervals)[1] + confidence_intervals$low <- confidence_intervals[, interval_name[1L]] + confidence_intervals$high <- confidence_intervals[, interval_name[2L]] + + n <- dim(confidence_intervals)[1L] confidence_intervals <- confidence_intervals %>% mutate( - mid = (high+low)/2, #estimate, used for point position - positive = mid > 0) #upregulated, used for color scheme - - if(!is.null(genes)){ + mid = (high + low) / 2L, #estimate, used for point position + positive = mid > 0L) #upregulated, used for color scheme + + if (!is.null(genes)) { #select genes provided and get them in that order - if(!(is.character(genes))){ stop("Genes must be provided as a character vector") } + if (!(is.character(genes))) { + stop("Genes must be provided as a character vector") + } n <- length(genes) - message(paste0("Selecting ", n, " features")) - confidence_intervals <- confidence_intervals[genes,] - + message("Selecting ", n, " features") + confidence_intervals <- confidence_intervals[genes, ] + } - - if(sort){ + + if (sort) { #order in increasing order on estimates, and create index variable message("sorting genes in increasing order of estimates...") - confidence_intervals <- confidence_intervals %>% - mutate( - idx = dense_rank(mid)) %>% + confidence_intervals <- confidence_intervals %>% mutate( + idx = dense_rank(mid)) %>% arrange(mid) - - } else{ - #if not sorted, create index variable for current order - confidence_intervals <- confidence_intervals %>% - mutate(idx = 1:n) + + } else { + #if not sorted, create index variable for current order + confidence_intervals <- confidence_intervals %>% mutate(idx = 1L:n) } - - #genereate point range plot - ci_plot <- ggplot(data = confidence_intervals, aes(y = idx, x = mid)) + geom_pointrange(aes(xmin = low, xmax = high, color = positive)) + - geom_point(aes(x = mid, y = idx), fill ="black",color = "black") + - theme_minimal() + - xlab("Difference in group means") + - ylab("Genes") + - geom_vline(xintercept = 0, color = "black", linetype = "dashed") + - theme(legend.position = "none") + - ggtitle(pattern_name) - + + #generate point range plot + ci_plot <- ggplot(data = confidence_intervals, + aes(y = idx, x = mid)) + + geom_pointrange(aes(xmin = low, + xmax = high, + color = positive)) + + geom_point(aes(x = mid, + y = idx), + fill = "black", + color = "black") + + theme_minimal() + + xlab("Difference in group means") + + ylab("Genes") + + geom_vline(xintercept = 0L, color = "black", linetype = "dashed") + + theme(legend.position = "none") + + ggtitle(lab) + #if provided, create heatmap for pattern weights - if(!is.null(weights)){ - - #check that weights are formatted as a named vector - if(!(is.numeric(weights))){ stop("Weights must be provided as a numeric vector") } - if(is.null(names(weights))){ stop("Weights must have names that match estimates")} - - #either use pattern_name, or if not provided, just label heatmap with "weights" + if (!is.null(weights)) { + + #label with pattern name if provided hm_name <- ifelse(is.null(pattern_name), "weights", pattern_name) - + #maintain established order from the pointrange plot ordered_weights <- weights[rownames(confidence_intervals)] - - if(weights_vis_norm == "quantiles"){ - #transform to percentiles from 0 to 1 - ordered_weights <- trunc(rank(ordered_weights))/length(ordered_weights) - hm_name <- paste0(hm_name, " (quantiles)") #append quantile to plot name - } - confidence_intervals$weights <- ordered_weights #generate heatmap - wt_heatmap <- ggplot(data = confidence_intervals) + - geom_tile(aes(x = 1, y = 1:n, fill = weights)) + - scale_fill_viridis(limits=c(0, quantile(ordered_weights,weights_clip )), - oob=squish, + wt_heatmap <- ggplot2::ggplot(data = confidence_intervals) + + geom_tile(aes(x = 1L, y = 1L:n, fill = weights)) + + scale_fill_viridis(limits = c(0L, quantile(ordered_weights, weights_clip)), + oob = scales::squish, name = hm_name) + - theme_void() - - } else{ wt_heatmap = NULL} #if weights aren't provided, return NULL - + theme_void() + + #check that weights are formatted as a named vector + if (!(is.numeric(weights))) { + stop("Weights must be provided as a numeric vector") + } + if (is.null(names(weights))) { + stop("Weights must have names that match estimates") + } + + if (weights_vis_norm == "quantiles") { + #transform to percentiles from 0 to 1 + ordered_weights <- trunc(rank(ordered_weights)) / length(ordered_weights) + hm_name <- paste0(hm_name, " (quantiles)") #append quantile to plot name + } + + } else { + #if weights aren't provided, return NULL + return(list("ci_estimates_plot" = ci_plot, + "feature_order" = rownames(confidence_intervals), + "weights_heatmap" = NULL)) + } + return(list("ci_estimates_plot" = ci_plot, "feature_order" = rownames(confidence_intervals), "weights_heatmap" = wt_heatmap)) -} +} +################################################################################ +#' plotVolcano +#' +#' Volcano plotting function +#' @param stats data frame with differential expression statistics +#' @param metadata #metadata from pdVolcano +#' @param FC Fold change threshold +#' @param pvalue p value threshold +#' @param title plot title +#' @export +plotVolcano <- function( + stats, + metadata, + FC, + pvalue, + title +) { + #bind variables locally + mean_diff <- welch_padj <- Color <- NULL + #set custom colors + myColors <- c("gray", "red", "dodgerblue") + names(myColors) <- levels(stats$Color) + custom_colors <- scale_color_manual(values = myColors, drop = FALSE) + + #plot + volcano <- ggplot(data = stats, + aes(x = mean_diff, y = -log10(welch_padj), + color = Color, + label = stats$label)) + + geom_vline(xintercept = c(FC, -FC), lty = "dashed") + + geom_hline(yintercept = -log10(pvalue), lty = "dashed") + + geom_point(na.rm = TRUE) + + custom_colors + + coord_cartesian(ylim = c(0L, 350L), + xlim = c(min(stats$mean_diff), max(stats$mean_diff))) + + ggrepel::geom_text_repel(size = 3L, point.padding = 1L, color = "black", + min.segment.length = 0.1, box.padding = 0.15, + max.overlaps = Inf, na.rm = TRUE) + + labs(x = "FC", + y = "Significance (-log10pval)", + color = NULL) + + ggtitle(paste(title)) + + theme_bw() + + theme(plot.title = element_text(size = 16L), + legend.position = "bottom", + axis.title = element_text(size = 14L), + legend.text = element_text(size = 12L)) + return(volcano) +} + +################################################################################ +#' pdVolcano +#' +#' Generate volcano plot and gate genes based on fold change and pvalue, +#' includes vectors that can be used with fast gene set enrichment (fgsea) +#' @param result result output from projectionDriveR function in PV mode +#' @param FC fold change threshold, default at 0.2 +#' @param pvalue significance threshold, default set stored pvalue +#' @param subset vector of gene names to subset the plot by +#' @param filter.inf remove genes that have pvalues below machine double minimum value +#' @param label.num Number of genes to label on either side of the volcano plot, default 5 +#' @param display boolean. Whether or not to plot and display volcano plots +#' @importFrom stats var +#' @importFrom ggrepel geom_label_repel +#' @importFrom ggrepel geom_text_repel +#' @import msigdbr +#' @import fgsea +#' @import dplyr +#' @return A list with weighted and unweighted differential expression metrics +#' @export +#plot FC, weighted and unweighted. Designed for use with the output of projectionDriveRs +pdVolcano <- function( + result, + FC = 0.2, + pvalue = NULL, + subset = NULL, + filter.inf = FALSE, + label.num = 5L, + display = TRUE) { + + #bind local variables + welch_padj <- Color <- NULL + #if a genelist is provided, use them to subset the output of projectiondrivers + if (!is.null(subset)) { + #subset the mean_stats object by provided gene list + result$mean_stats <- result$mean_stats[(which(rownames(result$mean_stats) %in% subset)), ] + #subset the weighted_mean_stats object by provided gene list + result$weighted_mean_stats <- result$weighted_mean_stats[(which(rownames(result$weighted_mean_stats) %in% subset)), ] + + } + + if (filter.inf) { + #remove p values below the machine limit representation for plotting purposes + cat("Filtering", length(which(result$mean_stats$welch_padj <= .Machine$double.xmin)), "unweighted genes and", + length(which(result$weighted_mean_stats$welch_padj <= .Machine$double.xmin)), "weighted genes", "\n") + result$mean_stats <- subset(result$mean_stats, welch_padj > .Machine$double.xmin) + result$weighted_mean_stats <- subset(result$weighted_mean_stats, welch_padj > .Machine$double.xmin) + } + + if (!is.numeric(FC)) { + stop('FC must be a number') + } + + if (!is.null(pvalue)) { + + if (!is.numeric(pvalue)) { + stop('p value must be a number') + } + + message("Updating sig_genes...") + #update previously stored pvalue + pvalue <- pvalue + result$meta_data$pvalue <- pvalue + #update sig_genes with new pvalue + #recreate vector of significant genes from weighted and unweighted tests + weighted_PV_sig <- rownames(result$weighted_mean_stats[which(result$weighted_mean_stats$welch_padj <= pvalue), ]) + PV_sig <- rownames(result$mean_stats[which(result$mean_stats$welch_padj <= pvalue), ]) + #create vector of significant genes shared between weighted and unweighted tests + shared_genes_PV <- base::intersect( + PV_sig, weighted_PV_sig) + result$sig_genes <- list(PV_sig = PV_sig, + weighted_PV_sig = weighted_PV_sig, + PV_significant_shared_genes = shared_genes_PV) + } else { + pvalue <- result$meta_data$pvalue + } + + #extract object meta data + metadata <- result$meta_data + + #volcano plotting unweighted + #extract unweighted confidence intervals / statistics + mean_stats <- result$mean_stats + #fold change / significance calls + mean_stats$Color <- paste("NS or FC <", FC) + mean_stats$Color[mean_stats$welch_padj < pvalue & mean_stats$mean_diff > FC] <- paste("Enriched in", metadata$test_matrix) + mean_stats$Color[mean_stats$welch_padj < pvalue & mean_stats$mean_diff < -FC] <- paste("Enriched in", metadata$reference_matrix) + mean_stats$Color <- factor(mean_stats$Color, + levels = c(paste("NS or FC <", FC), + paste("Enriched in", metadata$reference_matrix), + paste("Enriched in", metadata$test_matrix))) + + #label the most significant genes for enrichment + mean_stats$invert_P <- (-log10(mean_stats$welch_padj)) * (mean_stats$mean_diff) + + top_indices <- order(mean_stats$invert_P, decreasing = TRUE)[1L:label.num] + bottom_indices <- order(mean_stats$invert_P)[1L:label.num] + + # Add labels to the dataframe + mean_stats$label <- NA + mean_stats$label[top_indices] <- paste(rownames(mean_stats)[top_indices]) + mean_stats$label[bottom_indices] <- paste(rownames(mean_stats)[bottom_indices]) + #unweighted volcano plot + unweightedvolcano <- plotVolcano(stats = mean_stats, + metadata = metadata, + FC = FC, + pvalue = pvalue, + title = "Differential Enrichment") + #weighted volcano plot + weighted_mean_stats <- result$weighted_mean_stats + weighted_mean_stats$Color <- paste("NS or FC <", FC) + weighted_mean_stats$Color[weighted_mean_stats$welch_padj < pvalue & weighted_mean_stats$mean_diff > FC] <- paste("Enriched in", metadata$test_matrix) + weighted_mean_stats$Color[weighted_mean_stats$welch_padj < pvalue & weighted_mean_stats$mean_diff < -FC] <- paste("Enriched in", metadata$reference_matrix) + weighted_mean_stats$Color <- factor(weighted_mean_stats$Color, + levels = c(paste("NS or FC <", FC), + paste("Enriched in", metadata$reference_matrix), + paste("Enriched in", metadata$test_matrix))) + + weighted_mean_stats$invert_P <- (-log10(weighted_mean_stats$welch_padj)) * (weighted_mean_stats$mean_diff) + + top_indices_w <- order(weighted_mean_stats$invert_P, decreasing = TRUE)[1L:label.num] + bottom_indices_w <- order(weighted_mean_stats$invert_P)[1L:label.num] + + # Add labels to the dataframe + weighted_mean_stats$label <- NA + weighted_mean_stats$label[top_indices_w] <- paste(rownames(weighted_mean_stats)[top_indices_w]) + weighted_mean_stats$label[bottom_indices_w] <- paste(rownames(weighted_mean_stats)[bottom_indices_w]) + + #weighted volcano plot + weightedvolcano <- plotVolcano(stats = weighted_mean_stats, + FC = FC, + pvalue = pvalue, + title = "Weighted Differential Enrichment") + + #return a list of genes that can be used as input to fgsea + difexdf <- subset(mean_stats, + Color == paste("Enriched in", metadata$reference_matrix) | Color == paste("Enriched in", metadata$test_matrix)) + vec <- difexdf$estimate + names(vec) <- rownames(difexdf) + + weighted_difexdf <- subset(weighted_mean_stats, + Color == paste("Enriched in", metadata$reference_matrix) | Color == paste("Enriched in", metadata$test_matrix)) + weighted_vec <- weighted_difexdf$estimate + names(weighted_vec) <- rownames(weighted_difexdf) + names(vec) <- rownames(difexdf) + vol_result <- list(mean_stats = mean_stats, + weighted_mean_stats = weighted_mean_stats, + normalized_weights = result$normalized_weights, + sig_genes = result$sig_genes, + difexpgenes = difexdf, + weighted_difexpgenes = weighted_difexdf, + fgseavecs = list(unweightedvec = vec, + weightedvec = weighted_vec), + meta_data = metadata, + plt = list(differential_expression = unweightedvolcano, + weighted_differential_expression = weightedvolcano)) + if (display) { + #print volcano plots + pltgrid <- cowplot::plot_grid(vol_result$plt$differential_expression + + theme(legend.position = "none"), + vol_result$plt$weighted_differential_expression + + theme(legend.position = "none"), + ncol = 2L, align = "h") + legend <- cowplot::get_legend(vol_result$plt$differential_expression + + guides(color = guide_legend(nrow = 1L)) + + theme(legend.position = "bottom")) + plt <- cowplot::plot_grid(pltgrid, + legend, + ncol = 1L, + rel_heights = c(1.0, 0.1)) + print(plt) + } + return(vol_result) +} diff --git a/R/projectionDriveR.R b/R/projectionDriveR.R deleted file mode 100644 index 30c37bb..0000000 --- a/R/projectionDriveR.R +++ /dev/null @@ -1,245 +0,0 @@ -####################################################################################################################################### -#' bonferroniCorrectedDifferences -#' -#' Calculate the (weighted) difference in means for each measurement between two groups. -#' @param group1 count matrix 1 -#' @param group2 count matrix 2 -#' @param diff_weights oadings to weight the differential expression between the groups -#' @param pvalue significance value to threshold at -#' -#' @importFrom stats var -#' @importFrom ggrepel geom_label_repel -bonferroniCorrectedDifferences <- function( - group1, #count matrix 1 - group2, #count matrix 2 - diff_weights = NULL, #loadings to weight the differential expression between the groups - pvalue) #signficance value to threshold at - { - - #if passed from projectionDrivers, cellgroup1 and cellgroup 1 will have the same rows (genes) - - if(!(dim(group1)[1] == dim(group2)[1])){ - stop("Rows of two cell group matrices are not identical") - } - - if(any(is.na(group1)) | any(is.na(group2))){ - stop("NA values in count matrices not allowed") - } - - ##Take means over all genes and calculate differences - group1_mean <- apply(group1, 1, mean) - group2_mean <- apply(group2, 1, mean) - mean_diff <- group1_mean - group2_mean - - - #if weights are provided, use them to weight the difference in means - if(!is.null(diff_weights)){ - - #check that genes exactly match between difference vector and weight vector - if(!(all(names(mean_diff) == names(diff_weights)))){ - stop("Names of loadings and counts do not match") - } - - mean_diff <- mean_diff * diff_weights - } - - - - ##Stats and corrections beginning here - dimensionality <- length(mean_diff) #number of measurements (genes) - - n1_samples <- dim(group1)[2] #number of samples (cells) - n2_samples <- dim(group2)[2] - bon_correct <- pvalue / (2*dimensionality) #bonferroni correction - qval <- 1 - bon_correct - - tval <- qt(p = qval, df = n1_samples + n2_samples -2) #critical value - - group1_var <- apply(group1, 1, var) - group2_var <- apply(group2, 1, var) - - pooled <- ((n1_samples-1)*group1_var + (n2_samples-1)*group2_var) / (n1_samples+n2_samples-2) - - #establish dataframe to populate in the following for loop - plusminus = data.frame(low = rep(NA_integer_, dimensionality), high = rep(NA_integer_, dimensionality)) - rownames(plusminus) <- names(mean_diff) - - - #for each gene, calculate confidence interval around mean - for(i in 1:dimensionality){ - - scale = tval * sqrt(pooled[i] * (1/n1_samples + 1/n2_samples)) - - plusminus[i, "low"] <- mean_diff[i] - scale #low estimate - plusminus[i, "high"] <- mean_diff[i] + scale #high estimate - - } - - - return(plusminus) -} - - - -####################################################################################################################################### -#' projectionDriveR -#' -#' Calculate the weighted difference in expression between two groups (group1 - group2) -#' -#' @importFrom cowplot plot_grid -#' @param cellgroup1 gene x cell count matrix for cell group 1 -#' @param cellgroup2 gene x cell count matrix for cell group 2 -#' @param loadings A matrix of continuous values defining the features -#' @param pattern_name column of loadings for which drivers will be calculated. -#' @param pvalue confidence level for the bonferroni confidence intervals. Default 1e-5 -#' @param loadingsNames a vector with names of loading rows. Defaults to rownames. -#' @param display boolean. Whether or not to plot and display confidence intervals -#' @param normalize_pattern Boolean. Whether or not to normalize pattern weights. -#' @return A list with weighted mean differences, mean differences, and differential genes that meet the provided signficance threshold. -#' @export -#' -#' -projectionDriveR<-function( - cellgroup1, #gene x cell count matrix for cell group 1 - cellgroup2, #gene x cell count matrix for cell group 2 - loadings, # a matrix of continous values to be projected with unique rownames - loadingsNames = NULL, # a vector with names of loadings rows - pattern_name, - pvalue = 1e-5, - display = TRUE, - normalize_pattern = TRUE -){ - - #Count matrices can be class matrix, data.frame, sparse.matrix, ... anything that is coercible by as.matrix() - - #TODO: assert rownames and colnames exist where needed, and that things are matrices (or can be cast to) - - - #check that alpha significance level is appropriate - if(pvalue <= 0 | pvalue >= 1){ - stop("pvalue must be numeric between 0 and 1") - } - - #Make sure provided pattern string is a character vector of length one - if(length(pattern_name) != 1 | !is.character(pattern_name)){ - stop("provided pattern_name must be a character vector of length one") - } - - #set loadings rownames if provided - if(!is.null(loadingsNames)){ - rownames(loadings) <- loadingsNames - } - - #pattern weights must be formatted as a matrix for normalization - if(pattern_name %in% colnames(loadings)){ - pattern <- loadings[,pattern_name, drop = F] #data.frame - pattern <- as.matrix(pattern) - } else { - stop(paste0("Provided pattern_name ",pattern_name, " is not a column in provided loadings")) - } - - #Filter the two count matrices and the pattern weights to include the intersection of their features - #shared rows in two data matrices - filtered_data <-geneMatchR(data1=cellgroup1, data2=cellgroup2, data1Names=NULL, data2Names=NULL, merge=FALSE) - print(paste(as.character(dim(filtered_data[[2]])[1]),'row names matched between datasets')) - - cellgroup1 <- filtered_data[[2]] #geneMatchR flips the indexes - cellgroup2 <- filtered_data[[1]] - - - #shared rows in data matrices and loadings - filtered_weights <- geneMatchR(data1 = cellgroup1, data2 = pattern, data1Names = NULL, data2Names = NULL, merge = F) - dimensionality_final <- dim(filtered_weights[[2]])[1] - - print(paste(as.character(dimensionality_final,'row names matched between data and loadings'))) - print(paste('Updated dimension of data:',as.character(paste(dimensionality_final, collapse = ' ')))) - - if(dimensionality_final == 0){ - stop("No features matched by rownames of count matrix and rownames of loadings") - } - - pattern_filtered <- filtered_weights[[1]] - - cellgroup1_filtered <- filtered_weights[[2]] - #do second filtering on other cell group so all genes are consistent - cellgroup2_filtered <- cellgroup2[rownames(cellgroup1_filtered),] - - - #normalize pattern weights - if(normalize_pattern){ - weight_norm <- norm(pattern_filtered) #square of sums of squares (sum for all positive values) - num_nonzero <- sum(pattern_filtered > 0) #number of nonzero weights - pattern_filtered <- pattern_filtered * num_nonzero / weight_norm - } - - #cast feature weights to a named vector - pattern_normalized_vec <- pattern_filtered[,1] - names(pattern_normalized_vec) <- rownames(pattern_filtered) - - - - - #weighted confidence intervals of differences in cluster means - weighted_drivers_bonferroni <- bonferroniCorrectedDifferences(group1 = cellgroup1_filtered, - group2 = cellgroup2_filtered, - diff_weights = pattern_normalized_vec, - pvalue = pvalue) - - #unweighted confidence intervals of difference in cluster means - mean_bonferroni <- bonferroniCorrectedDifferences(group1 = cellgroup1_filtered, - group2 = cellgroup2_filtered, - diff_weights = NULL, - pvalue = pvalue) - - #Determine which genes have significantly non-zero mean difference and weighted mean difference - #significant - weighted_sig_idx <- apply(weighted_drivers_bonferroni, 1, function(interval){ - (interval[1] > 0 & interval[2] > 0) | (interval[1] < 0 & interval[2] < 0) - }) - - mean_sig_idx <- apply(mean_bonferroni, 1, function(interval){ - (interval[1] > 0 & interval[2] > 0) | (interval[1] < 0 & interval[2] < 0) - }) - - #genes that are collectively either up or down - shared_genes <- base::intersect( - rownames(weighted_drivers_bonferroni)[weighted_sig_idx], - rownames(mean_bonferroni)[mean_sig_idx]) - - if(length(shared_genes) == 0){ - #no genes were significant. Return info we have and skip plotting. - warning("No features (and weighted features) were significantly differentially used between the two groups") - return(list( - mean_ci = mean_bonferroni, - weighted_mean_ci = weighted_drivers_bonferroni, - normalized_weights = pattern_normalized_vec, - significant_genes = shared_genes, - plotted_ci = NULL)) - } - - - conf_intervals <- mean_bonferroni[shared_genes,] - sig_weights <- pattern_normalized_vec[shared_genes] - - #create confidence interval plot - pl <- plotConfidenceIntervals(conf_intervals, - weights = sig_weights, - pattern_name = pattern_name) - - if(display){ - #print confidence interval pointrange plot - print(cowplot::plot_grid(pl[["ci_estimates_plot"]], - pl[["weights_heatmap"]], - ncol = 2, - align = "h", - rel_widths = c(1,.3))) - } - - return(list( - mean_ci = mean_bonferroni, - weighted_mean_ci = weighted_drivers_bonferroni, - normalized_weights = pattern_normalized_vec, - significant_genes = shared_genes, - plotted_ci = pl)) -} - diff --git a/R/projectionDriveRfun.R b/R/projectionDriveRfun.R new file mode 100644 index 0000000..91dd4e2 --- /dev/null +++ b/R/projectionDriveRfun.R @@ -0,0 +1,353 @@ +################################################################################ +#' bonferroniCorrectedDifferences +#' +#' Calculate weighted/unweighted mean difference for each gene between 2 groups +#' @param group1 count matrix 1 +#' @param group2 count matrix 2 +#' @param pvalue significance value to threshold +#' @param diff_weights loadings to weight the differential expression +#' @param mode statistical approach, confidence intervals(CI) or pvalues(PV) +#' @importFrom stats var +#' @importFrom ggrepel geom_label_repel +#' @import dplyr +bonferroniCorrectedDifferences <- function( + group1, + group2, + pvalue, + diff_weights = NULL, + mode = "CI") { + if (!(dim(group1)[1L] == dim(group2)[1L])) { + #if passed from projectionDrivers, cellgroups will have the same rows + stop("Rows of two cell group matrices are not identical") + } + + if (anyNA(group1) || anyNA(group2)) { + stop("NA values in count matrices not allowed") + } + + ##Take means over all genes and calculate differences + group1_mean <- apply(group1, 1L, mean) + group2_mean <- apply(group2, 1L, mean) + mean_diff <- group1_mean - group2_mean + + + #if weights are provided, use them to weight the difference in means + if (!is.null(diff_weights)) { + + #check that genes exactly match between difference vector and weight vector + if (!(all(names(mean_diff) == names(diff_weights)))) { + stop("Names of loadings and counts do not match") + } + + mean_diff <- mean_diff * diff_weights + } + + ##Stats and corrections beginning here + #calculate confidence intervals + dimensionality <- length(mean_diff) #number of measurements (genes) + + n1_samples <- dim(group1)[2L] #number of samples (cells) + n2_samples <- dim(group2)[2L] + bon_correct <- pvalue / (2L * dimensionality) #bonferroni correction + qval <- 1L - bon_correct + + tval <- qt(p = qval, df = n1_samples + n2_samples - 2L) #critical value + + group1_var <- apply(group1, 1L, var) #variance of genes across group 1 + group2_var <- apply(group2, 1L, var) #variance of genes across group 2 + + if (mode == "CI") { + #pooled standard deviation + pool <- ((n1_samples - 1L) * group1_var + (n2_samples - 1L) * group2_var) / (n1_samples + n2_samples - 2L) + plusminus <- data.frame(low = mean_diff - tval * sqrt(pool * (1L / n1_samples + 1L / n2_samples)), + high = mean_diff + tval * sqrt(pool * (1L / n1_samples + 1L / n2_samples)), + gene = names(mean_diff)) + rownames(plusminus) <- names(mean_diff) + + } else if (mode == "PV") { + #welch t test + #variance calculation + delta_s <- sqrt((group1_var / n1_samples) + (group2_var / n2_samples)) + #welch t statistic, rounded to 10 digits to avoid infinite decimals + welch_estimate <- round(mean_diff / delta_s, digits = 10L) + #Welch-Satterthwaite equation for degrees of freedom + df <- (((group1_var / n1_samples) + (group2_var / n2_samples)) ^ 2L) / + ((((group1_var / n1_samples) ^ 2L) / (n1_samples - 1L)) + + (((group2_var / n2_samples) ^ 2L) / (n2_samples - 1L))) + #calculate p value from estimate/tvalue + welch_pvalue <- 2L * pt(-abs(welch_estimate), df = df) + #bonferroni correction + welch_padj <- p.adjust(welch_pvalue, + method = "bonferroni", + n = dimensionality) + #replace p values equal to zero with the smallest machine value possible + if (min(welch_padj, na.rm=TRUE) <= .Machine$double.xmin) { + zp <- length(which(welch_padj <= .Machine$double.xmin)) + message(zp, " P value(s) equal 0. Converting values less than ", .Machine$double.xmin, " to minimum possible value...", call. = FALSE) + welch_padj[welch_padj <= .Machine$double.xmin] <- .Machine$double.xmin + } + plusminus <- data.frame( + ref_mean = group2_mean, + test_mean = group1_mean, + mean_diff = mean_diff, + estimate = welch_estimate, + welch_pvalue = welch_pvalue, + welch_padj = welch_padj, + gene = names(mean_diff) + ) + } else { + stop("Invalid mode selection") + } + return(plusminus) +} + +################################################################################ +#' projectionDriveR +#' +#' Calculate weighted expression difference between two groups (group1 - group2) +#' +#' @importFrom cowplot plot_grid +#' @importFrom Matrix as.matrix +#' @param cellgroup1 gene x cell count matrix for cell group 1 +#' @param cellgroup2 gene x cell count matrix for cell group 2 +#' @param loadings A matrix of continuous values defining the features +#' @param pattern_name column of loadings for which drivers will be calculated +#' @param pvalue confidence level. Default 1e-5 +#' @param loadingsNames a vector with names of loading rows defaults to rownames +#' @param display boolean. Whether or not to display confidence intervals +#' @param normalize_pattern Boolean. Whether or not to normalize pattern weights +#' @param mode statistical approach, confidence intervals or pvalues. default CI +#' @return A list with unweighted/weighted mean differences and differential +#' genes that meet the provided signficance threshold. +#' @export +#' +#' +projectionDriveR <- function( + cellgroup1, #gene x cell count matrix for cell group 1 + cellgroup2, #gene x cell count matrix for cell group 2 + loadings, # a matrix of continous values to be projected with unique rownames + pattern_name, + loadingsNames = NULL, # a vector with names of loadings rows + pvalue = 1e-5, + display = TRUE, + normalize_pattern = TRUE, + mode = "CI" +) { + message("Mode: ", mode) + #Count matrices can be anything that is coercible by as.matrix() + #check that alpha significance level is appropriate + if (pvalue <= 0L || pvalue >= 1L) { + stop("pvalue must be numeric between 0 and 1") + } + + #Make sure provided pattern string is a character vector of length one + if (length(pattern_name) != 1L || !is.character(pattern_name)) { + stop("provided pattern_name must be a character vector of length one") + } + + #set loadings rownames if provided + if (!is.null(loadingsNames)) { + rownames(loadings) <- loadingsNames + } + + #pattern weights must be formatted as a matrix for normalization + if (pattern_name %in% colnames(loadings)) { + pattern <- loadings[, pattern_name, drop = FALSE] #data.frame + pattern <- Matrix::as.matrix(pattern) + } else { + stop("Provided pattern_name ", pattern_name, " is not a column in provided loadings") + } + + #extract names of data objects + group1name <- deparse(substitute(cellgroup1)) + + group2name <- deparse(substitute(cellgroup2)) + + #Filter the two count matrices and the pattern weights to include + #the intersection of their features + #shared rows in two data matrices + filtered_data <- geneMatchR(data1 = cellgroup1, + data2 = cellgroup2, + data1Names = NULL, + data2Names = NULL, + merge = FALSE) + message(as.character(dim(filtered_data[[2L]])[1L]), + " row names matched between datasets") + + cellgroup1 <- filtered_data[[2L]] #geneMatchR flips the indexes + cellgroup2 <- filtered_data[[1L]] + + #shared rows in data matrices and loadings + filtered_weights <- geneMatchR(data1 = cellgroup1, + data2 = pattern, + data1Names = NULL, + data2Names = NULL, + merge = FALSE) + + dimensionality_final <- dim(filtered_weights[[2L]])[1L] + + message("Updated dimension of data: ", + as.character(paste(dimensionality_final, collapse = " "))) + + if (dimensionality_final == 0L) { + stop("No features matched by rownames of count matrix and rownames of loadings") + } + + pattern_filtered <- filtered_weights[[1L]] + + cellgroup1_filtered <- filtered_weights[[2L]] + #do second filtering on other cell group so all genes are consistent + cellgroup2_filtered <- cellgroup2[rownames(cellgroup1_filtered), ] + + + #normalize pattern weights + if (normalize_pattern) { + weight_norm <- norm(pattern_filtered) #square of sums of squares + num_nonzero <- sum(pattern_filtered > 0L) #number of nonzero weights + pattern_filtered <- pattern_filtered * num_nonzero / weight_norm + } + #cast feature weights to a named vector + pattern_normalized_vec <- pattern_filtered[, 1L] + names(pattern_normalized_vec) <- rownames(pattern_filtered) + + #weighted confidence intervals of differences in cluster means + weighted_bonferroni <- bonferroniCorrectedDifferences( + group1 = cellgroup1_filtered, + group2 = cellgroup2_filtered, + diff_weights = pattern_normalized_vec, + pvalue = pvalue, + mode = mode) + #unweighted confidence intervals of difference in cluster means + mean_bonferroni <- bonferroniCorrectedDifferences( + group1 = cellgroup1_filtered, + group2 = cellgroup2_filtered, + diff_weights = NULL, + pvalue = pvalue, + mode = mode) +#generate confidence interval mode + if (mode == "CI") { + #Determine which genes have unweighted/ weighted mean difference + weighted_sig_idx <- apply(weighted_bonferroni[, 1L:2L], 1L, function(interval) { + (interval[1L] > 0L & interval[2L] > 0L) | (interval[1L] < 0L & interval[2L] < 0L) + }) + + mean_sig_idx <- apply(mean_bonferroni[, 1L:2L], 1L, function(interval) { + (interval[1L] > 0L & interval[2L] > 0L) | (interval[1] < 0L & interval[2L] < 0L) + }) + + weighted_sig_genes <- weighted_bonferroni[weighted_sig_idx,] + unweighted_sig_genes <- mean_bonferroni[mean_sig_idx,] + #genes that are collectively either up or down + shared_genes <- base::intersect( + rownames(weighted_bonferroni)[weighted_sig_idx], + rownames(mean_bonferroni)[mean_sig_idx]) + message("the length of shared genes are: ", length(shared_genes)) + conf_intervals <- mean_bonferroni[shared_genes, ] + sig_weights <- pattern_normalized_vec[shared_genes] + + weighted_conf_intervals <- weighted_bonferroni[shared_genes, ] + #create confidence interval plot (unweighted) + pl <- plotConfidenceIntervals(conf_intervals, + weights = sig_weights, + pattern_name = pattern_name, + weighted = FALSE) + #weighted + pl_w <- plotConfidenceIntervals(weighted_conf_intervals, + weights = sig_weights, + pattern_name = pattern_name, + weighted = TRUE) + + plots <- list(unweighted = pl,weighted = pl_w) + if (display) { + #print confidence interval pointrange plot + pl1_u <- (cowplot::plot_grid(pl[["ci_estimates_plot"]], + pl[["weights_heatmap"]], + ncol = 2L, + align = "h", + rel_widths = c(1.0,0.3))) + pl2_w <- (cowplot::plot_grid(pl_w[["ci_estimates_plot"]], + pl_w[["weights_heatmap"]], + ncol = 2L, + align = "h", + rel_widths = c(1.0,0.3))) + plt <- cowplot::plot_grid(pl1_u, pl2_w, ncol = 2L, align = "h") + print(plt) + } + + if (length(shared_genes) == 0) { + #no genes were significant. Return info we have and skip plotting. + warning("No features were significantly differentially used", + call. = FALSE) + + result <- list(mean_ci = mean_bonferroni, + weighted_mean_ci = weighted_bonferroni, + normalized_weights = pattern_normalized_vec, + significant_shared_genes = shared_genes, + plotted_ci = NULL, + meta_data = list(reference_matrix = paste0(group2name), + test_matrix = paste0(group1name)) + ) + return(result) + } + + result <- list( + mean_ci = mean_bonferroni, + weighted_mean_ci = weighted_bonferroni, + normalized_weights = pattern_normalized_vec, + sig_genes = list(unweighted_sig_genes = rownames(unweighted_sig_genes), + weighted_sig_genes = rownames(weighted_sig_genes), + significant_shared_genes = shared_genes), + plotted_ci = plots, + meta_data = list(reference_matrix = paste0(group2name), + test_matrix = paste0(group1name)) + ) + } else if (mode == "PV") { + #create vector of significant genes from weighted and unweighted tests + weighted_PV_sig <- rownames(weighted_bonferroni[which(weighted_bonferroni$welch_padj <= pvalue),]) + PV_sig <- rownames(mean_bonferroni[which(mean_bonferroni$welch_padj <= pvalue),]) + #create vector of significant genes shared between weighted and unweighted tests + shared_genes_PV <- base::intersect( + PV_sig, weighted_PV_sig) + if (length(shared_genes_PV) == 0L){ + #no genes were significant. Return info we have and skip plotting. + warning("No features were significantly differentially used ", + call. = FALSE) + result <- list(mean_stats = mean_bonferroni, + weighted_mean_stats = weighted_bonferroni, + normalized_weights = pattern_normalized_vec, + meta_data = list(reference_matrix = paste0(group2name), + test_matrix = paste0(group1name), + pvalue = pvalue) + ) + return(result) + } + result <- list(mean_stats = mean_bonferroni, + weighted_mean_stats = weighted_bonferroni, + normalized_weights = pattern_normalized_vec, + sig_genes = list(PV_sig = PV_sig, + weighted_PV_sig = weighted_PV_sig, + PV_significant_shared_genes = shared_genes_PV), + meta_data = list(reference_matrix = paste0(group2name), + test_matrix = paste0(group1name), + pvalue = pvalue) + ) + #apply pdVolcano function to result + result <- pdVolcano(result, display = FALSE) + if (display) { + #print volcano plots + pltgrid <- cowplot::plot_grid(result$plt$differential_expression + + theme(legend.position = "none"), + result$plt$weighted_differential_expression + + theme(legend.position = "none"), + ncol = 2L, align = "h") + legend <- cowplot::get_legend(result$plt$differential_expression + + guides(color = guide_legend(nrow = 1L)) + + theme(legend.position = "bottom")) + plt <- cowplot::plot_grid(pltgrid, legend, ncol = 1L, rel_heights = c(1.0, 0.1)) + print(plt) + } + } else { + stop("Invalid mode selection") + } + return(result) +} diff --git a/data/cr_microglial.rda b/data/cr_microglial.rda new file mode 100644 index 0000000..6d7f94b Binary files /dev/null and b/data/cr_microglial.rda differ diff --git a/man/bonferroniCorrectedDifferences.Rd b/man/bonferroniCorrectedDifferences.Rd index 74f5f97..2c8cebb 100644 --- a/man/bonferroniCorrectedDifferences.Rd +++ b/man/bonferroniCorrectedDifferences.Rd @@ -1,20 +1,28 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/projectionDriveR.R +% Please edit documentation in R/projectionDriveRfun.R \name{bonferroniCorrectedDifferences} \alias{bonferroniCorrectedDifferences} \title{bonferroniCorrectedDifferences} \usage{ -bonferroniCorrectedDifferences(group1, group2, diff_weights = NULL, pvalue) +bonferroniCorrectedDifferences( + group1, + group2, + pvalue, + diff_weights = NULL, + mode = "CI" +) } \arguments{ \item{group1}{count matrix 1} \item{group2}{count matrix 2} -\item{diff_weights}{oadings to weight the differential expression between the groups} +\item{pvalue}{significance value to threshold} -\item{pvalue}{significance value to threshold at} +\item{diff_weights}{loadings to weight the differential expression} + +\item{mode}{statistical approach, confidence intervals(CI) or pvalues(PV)} } \description{ -Calculate the (weighted) difference in means for each measurement between two groups. +Calculate weighted/unweighted mean difference for each gene between 2 groups } diff --git a/man/cr_microglial.Rd b/man/cr_microglial.Rd new file mode 100644 index 0000000..80d80bb --- /dev/null +++ b/man/cr_microglial.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/package.R +\docType{data} +\name{cr_microglial} +\alias{cr_microglial} +\title{CogapsResult object for microglial_counts} +\format{ +A CogapsResult object +} +\usage{ +cr_microglial +} +\description{ +cr_microglia contains the output of the CoGAPS function in the +CoGAPS package for data = microglial_counts +} +\keyword{datasets} diff --git a/man/pdVolcano.Rd b/man/pdVolcano.Rd new file mode 100644 index 0000000..fd80d50 --- /dev/null +++ b/man/pdVolcano.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{pdVolcano} +\alias{pdVolcano} +\title{pdVolcano} +\usage{ +pdVolcano( + result, + FC = 0.2, + pvalue = NULL, + subset = NULL, + filter.inf = FALSE, + label.num = 5L, + display = TRUE +) +} +\arguments{ +\item{result}{result output from projectionDriveR function in PV mode} + +\item{FC}{fold change threshold, default at 0.2} + +\item{pvalue}{significance threshold, default set stored pvalue} + +\item{subset}{vector of gene names to subset the plot by} + +\item{filter.inf}{remove genes that have pvalues below machine double minimum value} + +\item{label.num}{Number of genes to label on either side of the volcano plot, default 5} + +\item{display}{boolean. Whether or not to plot and display volcano plots} +} +\value{ +A list with weighted and unweighted differential expression metrics +} +\description{ +Generate volcano plot and gate genes based on fold change and pvalue, +includes vectors that can be used with fast gene set enrichment (fgsea) +} diff --git a/man/plotConfidenceIntervals.Rd b/man/plotConfidenceIntervals.Rd index ec1c5a5..bdbc9f0 100644 --- a/man/plotConfidenceIntervals.Rd +++ b/man/plotConfidenceIntervals.Rd @@ -8,33 +8,39 @@ plotConfidenceIntervals( confidence_intervals, interval_name = c("low", "high"), pattern_name = NULL, - sort = T, + sort = TRUE, genes = NULL, weights = NULL, weights_clip = 0.99, - weights_vis_norm = "none" + weights_vis_norm = "none", + weighted = FALSE ) } \arguments{ \item{confidence_intervals}{A dataframe of features x estimates.} -\item{interval_name}{names of columns that contain the low and high estimates, respectively. Default: c("low","high")} +\item{interval_name}{Estimate column names. Default: c("low","high")} \item{pattern_name}{string to use as the title for plots.} -\item{sort}{Boolean. Whether or not to sort genes by their estimates (default = T)} +\item{sort}{Boolean. Sort genes by their estimates (default = TRUE)} -\item{genes}{a vector with names of genes to include in plot. If sort=F, estimates will be plotted in this order.} +\item{genes}{a vector with names of genes to include in plot. +If sort=F, estimates will be plotted in this order.} \item{weights}{optional. weights of features to include as annotation.} -\item{weights_clip}{optional. quantile of data to clip color scale for improved visualization. Default: 0.99} +\item{weights_clip}{optional. quantile of data to clip color scale for +improved visualization. Default: 0.99} -\item{weights_vis_norm}{Which processed version of weights to visualize as a heatmap. -Options are "none" (which uses provided weights) or "quantiles". Default: none} +\item{weights_vis_norm}{Which version of weights to visualize as a heatmap. +Options are "none" (uses provided weights) or "quantiles". Default: none} + +\item{weighted}{specifies whether the confidence intervals in use are +weighted by the pattern and labels plots accordingly} } \value{ -A list with pointrange estimates and, if requested, a heatmap of pattern weights. +A list with pointrange estimates and a heatmap of pattern weights. } \description{ Generate point and line confidence intervals from provided estimates. diff --git a/man/plotVolcano.Rd b/man/plotVolcano.Rd new file mode 100644 index 0000000..8af23e5 --- /dev/null +++ b/man/plotVolcano.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plotting.R +\name{plotVolcano} +\alias{plotVolcano} +\title{plotVolcano} +\usage{ +plotVolcano(stats, metadata, FC, pvalue, title) +} +\arguments{ +\item{stats}{data frame with differential expression statistics} + +\item{metadata}{#metadata from pdVolcano} + +\item{FC}{Fold change threshold} + +\item{pvalue}{p value threshold} + +\item{title}{plot title} +} +\description{ +Volcano plotting function +} diff --git a/man/projectionDriveR.Rd b/man/projectionDriveR.Rd index e562c2d..8d3ad41 100644 --- a/man/projectionDriveR.Rd +++ b/man/projectionDriveR.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/projectionDriveR.R +% Please edit documentation in R/projectionDriveRfun.R \name{projectionDriveR} \alias{projectionDriveR} \title{projectionDriveR} @@ -8,11 +8,12 @@ projectionDriveR( cellgroup1, cellgroup2, loadings, - loadingsNames = NULL, pattern_name, + loadingsNames = NULL, pvalue = 1e-05, display = TRUE, - normalize_pattern = TRUE + normalize_pattern = TRUE, + mode = "CI" ) } \arguments{ @@ -22,19 +23,22 @@ projectionDriveR( \item{loadings}{A matrix of continuous values defining the features} -\item{loadingsNames}{a vector with names of loading rows. Defaults to rownames.} +\item{pattern_name}{column of loadings for which drivers will be calculated} + +\item{loadingsNames}{a vector with names of loading rows defaults to rownames} -\item{pattern_name}{column of loadings for which drivers will be calculated.} +\item{pvalue}{confidence level. Default 1e-5} -\item{pvalue}{confidence level for the bonferroni confidence intervals. Default 1e-5} +\item{display}{boolean. Whether or not to display confidence intervals} -\item{display}{boolean. Whether or not to plot and display confidence intervals} +\item{normalize_pattern}{Boolean. Whether or not to normalize pattern weights} -\item{normalize_pattern}{Boolean. Whether or not to normalize pattern weights.} +\item{mode}{statistical approach, confidence intervals or pvalues. default CI} } \value{ -A list with weighted mean differences, mean differences, and differential genes that meet the provided signficance threshold. +A list with unweighted/weighted mean differences and differential +genes that meet the provided signficance threshold. } \description{ -Calculate the weighted difference in expression between two groups (group1 - group2) +Calculate weighted expression difference between two groups (group1 - group2) } diff --git a/tests/testthat/test_projectR.R b/tests/testthat/test_projectR.R index 3d7e072..3acd3d4 100644 --- a/tests/testthat/test_projectR.R +++ b/tests/testthat/test_projectR.R @@ -65,3 +65,139 @@ test_that("results are as expected",{ expect_true("CI" %in% names(output[[1]])) }) + +#projectionDriveR check +#test that expected output is present and in correct format + +test_that("results are correctly formatted for confidence interval mode",{ + + pattern_to_weight <- "Pattern.24" + drivers <- projectionDriveR(microglial_counts, #expression matrix + glial_counts, #expression matrix + loadings = retinal_patterns, #feature x pattern dataframe + loadingsNames = NULL, + pattern_name = pattern_to_weight, #column name + pvalue = 1e-5, #pvalue before bonferroni correction + display = T, + normalize_pattern = T, #normalize feature weights + mode = "CI") #set to confidence interval mode +#check output is in list format +expect_is(drivers, "list") + +#check length of dfs +expect_length(drivers, 6) +expect_length(drivers$mean_ci, 3) +expect_length(drivers$weighted_mean_ci, 3) + +#check that genes used for calculations overlap both datasets and loadings +expect_true(unique(drivers$mean_ci$gene %in% rownames(microglial_counts))) +expect_true(unique(drivers$mean_ci$gene %in% rownames(glial_counts))) +expect_true(unique(drivers$mean_ci$gene %in% rownames(retinal_patterns))) +expect_true(unique(drivers$weighted_mean_ci$gene %in% rownames(microglial_counts))) +expect_true(unique(drivers$weighted_mean_ci$gene %in% rownames(glial_counts))) +expect_true(unique(drivers$weighted_mean_ci$gene %in% rownames(retinal_patterns))) + +#name and class checks +expect_true("mean_ci" %in% names(drivers)) +expect_is(drivers$mean_ci, "data.frame") + +expect_true("weighted_mean_ci" %in% names(drivers)) +expect_is(drivers$mean_ci, "data.frame") + +expect_true("normalized_weights" %in% names(drivers)) +expect_is(drivers$normalized_weights, "numeric") + +expect_true("sig_genes" %in% names(drivers)) +expect_is(drivers$sig_genes, "list") +expect_length(drivers$sig_genes, 3) + +expect_true(unique(c("unweighted_sig_genes", "weighted_sig_genes", "significant_shared_genes") %in% names(drivers$sig_genes))) + +expect_is(drivers$sig_genes$unweighted_sig_genes, "character") + +expect_is(drivers$sig_genes$weighted_sig_genes, "character") + +expect_is(drivers$sig_genes$significant_shared_genes, "character") + +expect_true("meta_data" %in% names(drivers)) +expect_is(drivers$meta_data, "list") +expect_length(drivers$meta_data, 2) + +#check that matrix names are proper and match source names +expect_true(deparse(substitute(microglial_counts)) == drivers$meta_data$test_matrix) +expect_type(drivers$meta_data$test_matrix, "character") + +expect_true(deparse(substitute(glial_counts)) == drivers$meta_data$reference_matrix) +expect_type(drivers$meta_data$reference_matrix, "character") + +#check that plot length is correct +expect_true("plotted_ci" %in% names(drivers)) +expect_length(drivers$plotted_ci, 2) +}) + +test_that("results are correctly formatted for P value mode",{ + + pattern_to_weight <- "Pattern.24" + drivers <- projectionDriveR(microglial_counts, #expression matrix + glial_counts, #expression matrix + loadings = retinal_patterns, #feature x pattern dataframe + loadingsNames = NULL, + pattern_name = pattern_to_weight, #column name + pvalue = 1e-5, #pvalue before bonferroni correction + display = T, + normalize_pattern = T, #normalize feature weights + mode = "PV") #set to p value mode + #check output is in list format + expect_is(drivers, "list") + + #check length of dfs + expect_length(drivers, 9) + expect_length(drivers$mean_stats, 10) + expect_length(drivers$weighted_mean_stats, 10) + + #check that genes used for calculations overlap both datasets and loadings + expect_true(unique(drivers$mean_stats$gene %in% rownames(microglial_counts))) + expect_true(unique(drivers$mean_stats$gene %in% rownames(glial_counts))) + expect_true(unique(drivers$mean_stats$gene %in% rownames(retinal_patterns))) + expect_true(unique(drivers$weighted_mean_stats$gene %in% rownames(microglial_counts))) + expect_true(unique(drivers$weighted_mean_stats$gene %in% rownames(glial_counts))) + expect_true(unique(drivers$weighted_mean_stats$gene %in% rownames(retinal_patterns))) + + #name and class checks + expect_true("mean_stats" %in% names(drivers)) + expect_is(drivers$mean_stats, "data.frame") + + expect_true("weighted_mean_stats" %in% names(drivers)) + expect_is(drivers$mean_stats, "data.frame") + + expect_true("normalized_weights" %in% names(drivers)) + expect_is(drivers$normalized_weights, "numeric") + + expect_true("sig_genes" %in% names(drivers)) + expect_is(drivers$sig_genes, "list") + expect_length(drivers$sig_genes, 3) + + expect_true(unique(c("PV_sig", "weighted_PV_sig", "PV_significant_shared_genes") %in% names(drivers$sig_genes))) + + expect_is(drivers$sig_genes$PV_sig, "character") + + expect_is(drivers$sig_genes$weighted_PV_sig, "character") + + expect_is(drivers$sig_genes$PV_significant_shared_genes, "character") + + expect_true("meta_data" %in% names(drivers)) + expect_is(drivers$meta_data, "list") + expect_length(drivers$meta_data, 3) + expect_true("pvalue" %in% names(drivers$meta_data)) + expect_is(drivers$meta_data$pvalue, "numeric") + + #check that matrix names are proper and match source names + expect_true(deparse(substitute(microglial_counts)) == drivers$meta_data$test_matrix) + expect_type(drivers$meta_data$test_matrix, "character") + + expect_true(deparse(substitute(glial_counts)) == drivers$meta_data$reference_matrix) + expect_type(drivers$meta_data$reference_matrix, "character") + +}) + + diff --git a/vignettes/projectR.Rmd b/vignettes/projectR.Rmd index af6ccba..1401e7f 100644 --- a/vignettes/projectR.Rmd +++ b/vignettes/projectR.Rmd @@ -9,6 +9,7 @@ author: date: "`r BiocStyle::doc_date()`" output: BiocStyle::html_document +fig_crop: false bibliography: projectR.bib description: | Functions for the Projection of Weights from PCA, CoGAPS, NMF, Correlation, and Clustering @@ -22,6 +23,7 @@ vignette: > knitr::opts_chunk$set(echo = TRUE) options(scipen = 1, digits = 2) set.seed(1234) +library(projectR) ``` # Introduction @@ -33,7 +35,7 @@ Technological advances continue to spur the exponential growth of biological dat For automatic Bioconductor package installation, start R, and run: ``` -BiocManager::install("projectR") +BiocManager::install("genesofeve/projectR@projectionDriveR") ``` ## Methods @@ -44,7 +46,7 @@ Projection can roughly be defined as a mapping or transformation of points from The generic projectR function is executed as follows: ``` -library(projectR) + projectR(data, loadings, dataNames=NULL, loadingsNames=NULL, NP = NULL, full = false) ``` @@ -70,7 +72,7 @@ Projection of principal components is achieved by matrix multiplication of a new ## Obtaining PCs to project. ```{r prcomp, warning=FALSE} # data to define PCs -library(projectR) +library(ggplot2) data(p.RNAseq6l3c3t) # do PCA on RNAseq6l3c3t expression data @@ -79,7 +81,7 @@ pcVAR <- round(((pc.RNAseq6l3c3t$sdev)^2/sum(pc.RNAseq6l3c3t$sdev^2))*100,2) dPCA <- data.frame(cbind(pc.RNAseq6l3c3t$x,pd.RNAseq6l3c3t)) #plot pca -library(ggplot2) + setCOL <- scale_colour_manual(values = c("blue","black","red"), name="Condition:") setFILL <- scale_fill_manual(values = c("blue","black","red"),guide = FALSE) setPCH <- scale_shape_manual(values=c(23,22,25,25,21,24),name="Cell Line:") @@ -103,8 +105,8 @@ pPCA <- ggplot(dPCA, aes(x=PC1, y=PC2, colour=ID.cond, shape=ID.line, ```{r projectR.prcomp, warning=FALSE} # data to project into PCs from RNAseq6l3c3t expression data data(p.ESepiGen4c1l) +library(ggplot2) -library(projectR) PCA2ESepi <- projectR(data = p.ESepiGen4c1l$mRNA.Seq,loadings=pc.RNAseq6l3c3t, full=TRUE, dataNames=map.ESepiGen4c1l[["GeneSymbols"]]) @@ -148,7 +150,7 @@ The number of rows in ${\bf{P}}$ (columns in ${\bf{A}}$) defines the number of b Projection of CoGAPS/GWCoGAPS patterns is implemented by solving the factorization in (1) for the new data matrix where ${\bf{A}}$ is the fixed nonorthogonal basis vectors comprising the average of the posterior mean for the CoGAPS/GWCoGAPS simulations performed on the original data. The patterns ${\bf{P}}$ in the new data associated with this amplitude matrix is estimated using the least-squares fit to the new data implemented with the `lmFit` function in the `r BiocStyle::Biocpkg("limma")` package. The `projectR` function has S4 method for class `Linear Embedding Matrix, LME`. ``` -library(projectR) + projectR(data, loadings,dataNames = NULL, loadingsNames = NULL, NP = NA, full = FALSE) ``` @@ -173,7 +175,7 @@ The basic output of the base projectR function, i.e. `full=FALSE`, returns `proj ```{r} # get data -library(projectR) + AP <- get(data("AP.RNAseq6l3c3t")) #CoGAPS run data AP <- AP$Amean # heatmap of gene weights for CoGAPs patterns @@ -189,7 +191,7 @@ pNMF<-heatmap.2(as.matrix(AP),col=bluered, trace='none', ## Projecting CoGAPS objects ```{r} # data to project into PCs from RNAseq6l3c3t expression data -library(projectR) + data('p.ESepiGen4c1l4') data('p.RNAseq6l3c3t') @@ -223,7 +225,7 @@ As canonical projection is not defined for clustering objects, the projectR pack `cluster2pattern` uses the corelation of each genes expression to the mean of each cluster to define continuous weights. ``` -library(projectR) + data(p.RNAseq6l3c3t) @@ -254,7 +256,6 @@ The output of the `cluster2pattern` function is a `pclust` class object; specifi `intersectoR` function can be used to test for significant overlap between two clustering objects. The base function finds and tests the intersecting values of two sets of lists, presumably the genes associated with patterns in two different datasets. S4 class methods for `hclust` and `kmeans` objects are also available. ``` -library(projectR) intersectoR(pSet1 = NA, pSet2 = NA, pval = 0.05, full = FALSE, k = NULL) ``` @@ -280,7 +281,7 @@ Correlation based projection requires a matrix of gene-wise correlation values t ## correlateR ``` -library(projectR) + correlateR(genes = NA, dat = NA, threshtype = "R", threshold = 0.7, absR = FALSE, ...) ``` @@ -304,7 +305,7 @@ The output of the `correlateR` function is a `correlateR` class object. Specific ```{r correlateR-exp} # data to -library(projectR) + data("p.RNAseq6l3c3t") # get genes correlated to T @@ -351,7 +352,7 @@ pCorT # data to project into from RNAseq6l3c3t expression data data(p.ESepiGen4c1l) -library(projectR) + cor2ESepi <- projectR(p.ESepiGen4c1l$mRNA.Seq,loadings=cor2T[[1]],full=FALSE, dataNames=map.ESepiGen4c1l$GeneSymbols) @@ -365,9 +366,9 @@ cor2ESepi <- projectR(p.ESepiGen4c1l$mRNA.Seq,loadings=cor2T[[1]],full=FALSE, Given loadings that define the weight of features (genes) in a given latent space (e.g. PCA, NMF), and the use of these patterns in samples, it is of interest to look at differential usage of these features between conditions. These conditions may be defined by user-defined annotations of cell type or by differential usage of a (projected) pattern. By examining differences in gene expression, weighted by the loadings that define their importance in a specific latent space, a unique understanding of differential expression in that context can be gained. This approach was originally proposed and developed in (Baraban et al, 2021), which demonstrates its utility in cross-celltype and cross-species interpretation of pattern usages. ``` -library(projectR) + projectionDriveR(cellgroup1, cellgroup2, loadings, loadingsNames = NULL, - pvalue, pattern_name, display = T, normalize_pattern = T) + pvalue, pattern_name, display = T, normalize_pattern = T, mode = "CI") ``` @@ -381,46 +382,62 @@ The arguments for projectionDriveR are: **`loadings`** Matrix or dataframe with features as rows, columns as patterns. Values define feature weights in that space **`loadingsNames`** Vector of names corresponding to rows of loadings. By default the rownames of loadings will be used **`pattern_name`** the column name of the loadings by which the features will be weighted -**`pvalue`** Determines the significance of the confidence interval to be calculated between the difference of means +**`pvalue`** Determines the significance of the confidence interval to be calculated between the difference of means. Default 1e-5 **`display`** Boolean. Whether or not to plot the estimates of significant features. Default = T **`normalize_pattern`** Boolean. Whether or not to normalize the average feature weight. Default = T - +**`mode`** 'CI' or 'PV'. Specifies whether to run projectionDriveR in confidence interval mode or to generate p values. Default = "CI" ### Output -The output of `projectionDriveR` is a list of length five `mean_ci` holds the confidence intervals for the difference in means for all features, `weighted_ci` holds the confidence intervals for the weighted difference in means for all features, and normalized_weights are the weights themselves. In addition, `significant_genes` is a vector of gene names that are significantly different at the threshold provided. `plotted_ci` returns the ggplot figure of the confidence intervals, see `plotConfidenceIntervals` for documentation. +The output of `projectionDriveR` in confidence interval mode ('CI') is a list of length six `mean_ci` holds the confidence intervals for the difference in means for all features, `weighted_mean_ci` holds the confidence intervals for the weighted difference in means for all features, and `normalized_weights` are the weights themselves. In addition, `sig_genes` is a list of three vectors of gene names that are significantly different at the threshold provided generated from the mean confidence intervals (`unweighted_sig_genes`), the weighted mean confidence intervals (`weighted_sig_genes`) and genes shared between the two (`significant_shared_genes`) . `plotted_ci` returns the ggplot figure of the confidence intervals, see `plotConfidenceIntervals` for documentation. `meta_data` contains matrix names and pvalue thresholds. The output of `projectionDriveR` in p value mode ('PV') is a list of length nine. `meta_data`, `sig_genes` and `normalized_weights` are similar between modes. `mean_stats` and `weighted_mean_stats` contains summary information for welch t-tests. `difexpgenes` and `weighted_difexpgenes` are filtered dataframes containing differentially expressed genes at a FC and pvalue cut off of 0.2 and 1e-5 respectively. `fgseavecs` contain unweighted and weighted named vectors of welch-t test estimates that can be used with fgsea. `plt` returns the volcano ggplot figure. See `pdVolcano` for documentation. FC and pvalue can be manually altered by calling pdVolcano on projectionDriveR result. ### Identifying differential features associated with learned patterns - ```{r projectionDriver, message = F, out.width="100%"} options(width = 60) -library(projectR) library(dplyr, warn.conflicts = F) -#gene weights x pattern -data("retinal_patterns") - #size-normed, log expression data("microglial_counts") #size-normed, log expression data("glial_counts") +#5 pattern cogaps object generated on microglial_counts +data("cr_microglial") +microglial_fl <- cr_microglial@featureLoadings + #the features by which to weight the difference in expression -pattern_to_weight <- "Pattern.24" -drivers <- projectionDriveR(microglial_counts, #expression matrix +pattern_to_weight <- "Pattern_1" +drivers_ci <- projectionDriveR(microglial_counts, #expression matrix glial_counts, #expression matrix - loadings = retinal_patterns, #feature x pattern dataframe + loadings = microglial_fl, #feature x pattern dataframe loadingsNames = NULL, pattern_name = pattern_to_weight, #column name pvalue = 1e-5, #pvalue before bonferroni correction display = T, - normalize_pattern = T) #normalize feature weights + normalize_pattern = T, #normalize feature weights + mode = "CI") #confidence interval mode + + +conf_intervals <- drivers_ci$mean_ci[drivers_ci$sig_genes$significant_shared_genes,] -conf_intervals <- drivers$mean_ci[drivers$significant_genes,] str(conf_intervals) +drivers_pv <- projectionDriveR(microglial_counts, #expression matrix + glial_counts, #expression matrix + loadings = microglial_fl, #feature x pattern dataframe + loadingsNames = NULL, + pattern_name = pattern_to_weight, #column name + pvalue = 1e-5, #pvalue before bonferroni correction + display = T, + normalize_pattern = T, #normalize feature weights + mode = "PV") #confidence interval mode + +difexp <- drivers_pv$difexpgenes +str(difexp) + + ``` ## plotConfidenceIntervals @@ -436,7 +453,7 @@ The arguments for plotConfidenceIntervals are: **`weights`** weights of features to include as annotation (default = NULL will not include heatmap) **`weights_clip`** quantile of data to clip color scale for improved visualization (default: 0.99) **`weights_vis_norm`** Which processed version of weights to visualize as a heatmap. One of c("none", "quantile"). default = "none" - +**`weighted`** Boolean. Specifies whether confidence intervals are weighted by a pattern or not. Default = "F" ### Output A list of the length three that includes confidence interval plots and relevant info. `ci_estimates_plot` is the point-range plot for the provided estimates. If called from within `projectionDriveR`, the unweighted estimates are used. `feature_order` is the vector of gene names in the order shown in the figure. `weights_heatmap` is a heatmap annotation of the gene loadings, in the same order as above. @@ -459,7 +476,7 @@ conf_intervals$label_name[idx] <- gene_ids #the labels above can now be used as ggplot aesthetics plots_list <- plotConfidenceIntervals(conf_intervals, #mean difference in expression confidence intervals sort = F, #should genes be sorted by estimates - weights = drivers$normalized_weights[rownames(conf_intervals)], + weights = drivers_ci$normalized_weights[rownames(conf_intervals)], pattern_name = pattern_to_weight, weights_clip = 0.99, weights_vis_norm = "none") @@ -470,10 +487,10 @@ pl1 <- plots_list[["ci_estimates_plot"]] + pl2 <- plots_list[["weights_heatmap"]] #now plot the weighted differences -weighted_conf_intervals <- drivers$weighted_mean_ci[gene_order,] +weighted_conf_intervals <- drivers_ci$weighted_mean_ci[gene_order,] plots_list_weighted <- plotConfidenceIntervals(weighted_conf_intervals, sort = F, - pattern_name = pattern_to_weight) + weighted = T) pl3 <- plots_list_weighted[["ci_estimates_plot"]] + xlab("Difference in weighted group means") + @@ -484,13 +501,86 @@ cowplot::plot_grid(pl1, pl2, pl3, align = "h", rel_widths = c(1,.4, 1), ncol = 3 ``` +##pdVolcano -## multivariateAnalysisR +### Input +The arguments for pdVolcano are: + +**`result`** Output from projectionDriveR function with PV mode selected +**`FC`** fold change threshold, default at 0.2 +**`pvalue`** significance threshold, default set to pvalue stored in projectionDriveR output +**`subset`** optional vector of gene names to subset the result by +**`filter.inf`** Boolean. If TRUE will remove genes that have pvalues below machine double minimum value +**`label.num`** number of genes to label on either end of volcano plot, default to 5 (10 total) +**`display`** Boolean. Default TRUE, will print volcano plots using cowplot grid_arrange + +### Output +Generates the same output as projectionDriveR. Allows manual updates to pvalue and FC thresholds and can accept gene lists of interest to subset results. + +### Customize volcano plot and run FGSEA + +```{r fig.width=9, fig.height=10} +suppressWarnings(library(cowplot)) +library(fgsea) +library(msigdbr) +#manually change FC and pvalue threshold from defaults 0.2, 1e-5 +drivers_pv_mod <- pdVolcano(drivers_pv, FC = 0.5, pvalue = 1e-7) + +difexp_mod <- drivers_pv_mod$difexpgenes +str(difexp_mod) + +#fgsea application + +#extract unweighted fgsea vector +fgseavec <- drivers_pv$fgseavecs$unweightedvec +#split into enrichment groups, negative estimates are enriched in the reference matrix (glial), positive are enriched in the test matrix (microglial), take abs value +glial_fgsea_vec <- abs(fgseavec[which(fgseavec < 0)]) +microglial_fgsea_vec <- abs(fgseavec[which(fgseavec > 0)]) + +#get FGSEA pathways - selecting subcategory C8 for cell types +msigdbr_list = msigdbr::msigdbr(species = "Mus musculus", category = "C8") +pathways = split(x = msigdbr_list$ensembl_gene, f = msigdbr_list$gs_name) + +#run fgsea scoreType positive, all values will be positive +glial_fgsea <- fgsea::fgsea(pathways, glial_fgsea_vec, scoreType = "pos") +#tidy +glial_fgseaResTidy <- glial_fgsea %>% + subset(padj <= 0.05 & size >= 10 & size <= 500) %>% + as_tibble() %>% + dplyr::arrange(desc(size)) +#plot +glial_EnrichmentPlot <- ggplot2::ggplot(glial_fgseaResTidy, + ggplot2::aes(reorder(pathway, NES), NES)) + + ggplot2::geom_point(aes(size=size, color = padj)) + + coord_flip() + + labs(x="Pathway", y="Normalized Enrichment Score", title="Pathway NES from GSEA") + + theme_minimal() +glial_EnrichmentPlot + +microglial_fgsea <- fgsea::fgsea(pathways, microglial_fgsea_vec, scoreType = "pos") +#tidy +microglial_fgseaResTidy <- microglial_fgsea %>% + subset(padj <= 0.05 & size >= 10 & size <= 500) %>% + as_tibble() %>% + dplyr::arrange(desc(size)) + +microglial_EnrichmentPlot <- ggplot2::ggplot(microglial_fgseaResTidy, + ggplot2::aes(reorder(pathway, NES), NES)) + + ggplot2::geom_point(aes(size=size, color = padj)) + + coord_flip() + + labs(x="Pathway", y="Normalized Enrichment Score", title="Pathway NES from GSEA") + + theme_minimal() +microglial_EnrichmentPlot + + + +``` +##multivariateAnalysisR This function performs multivariate analysis on different clusters within a dataset, which in this case is restricted to Seurat Object. Clusters are identified as those satisfying the conditions specified in their corresponding dictionaries. `multivariateAnalysisR` performs two tests: Analysis of Variance (ANOVA) and Confidence Interval (CI) evaluations. ANOVA is performed to understand the general differentiation between clusters through the lens of a specified pattern. CI is to visualize pair-wise differential expressions between two clusters for each pattern. Researchers can visually understand both the macroscopic and microscopic differential uses of each pattern across different clusters through this function. ``` -library(projectR) + multivariateAnalysisR <- function(significanceLevel = 0.05, patternKeys, seuratobj, dictionaries, customNames = NULL, exclusive = TRUE, exportFolder = "", ANOVAwidth = 1000, diff --git a/vignettes/projectR.html b/vignettes/projectR.html index 90bada9..98ac29b 100644 --- a/vignettes/projectR.html +++ b/vignettes/projectR.html @@ -15,7 +15,7 @@ - + projectR Vignette @@ -249,6 +249,7 @@ div.csl-bib-body { } div.csl-entry { clear: both; +margin-bottom: 0em; } .hanging div.csl-entry { margin-left:2em; @@ -729,7 +730,7 @@

projectR Vignette

Gaurav Sharma, Charles Shin, Jared N. Slosberg, Loyal A. Goff and Genevieve L. Stein-O'Brien

-

16 November 2023

+

21 February 2024

@@ -795,12 +796,12 @@

Contents

  • 7.2.1 Input
  • 7.2.2 Output
  • 7.2.3 Customize plotting of confidence intervals
  • - -
  • 7.3 multivariateAnalysisR -
  • References
  • @@ -816,16 +817,16 @@

    2 Getting started with projectR

    2.1 Installation Instructions

    For automatic Bioconductor package installation, start R, and run:

    -
    BiocManager::install("projectR")
    +
    BiocManager::install("genesofeve/projectR@projectionDriveR")

    2.2 Methods

    -

    Projection can roughly be defined as a mapping or transformation of points from one space to another often lower dimensional space. Mathematically, this can described as a function \(\varphi(x)=y : \Re^{D} \mapsto \Re^{d}\) s.t. \(d \leq D\) for \(x \in \Re^{D}, y \in \Re^{d}\) Barbakh, Wu, and Fyfe (2009) . The projectR package uses projection functions defined in a training dataset to interrogate related biological phenomena in an entirely new data set. These functions can be the product of any one of several methods common to “omic” analyses including regression, PCA, NMF, clustering. Individual sections focusing on one specific method are included in the vignette. However, the general design of the projectR function is the same regardless.

    +

    Projection can roughly be defined as a mapping or transformation of points from one space to another often lower dimensional space. Mathematically, this can described as a function \(\varphi(x)=y : \Re^{D} \mapsto \Re^{d}\) s.t. \(d \leq D\) for \(x \in \Re^{D}, y \in \Re^{d}\) Barbakh, Wu, and Fyfe (2009) . The projectR package uses projection functions defined in a training dataset to interrogate related biological phenomena in an entirely new data set. These functions can be the product of any one of several methods common to “omic” analyses including regression, PCA, NMF, clustering. Individual sections focusing on one specific method are included in the vignette. However, the general design of the projectR function is the same regardless.

    2.3 The base projectR function

    The generic projectR function is executed as follows:

    -
    library(projectR)
    +
    
     projectR(data, loadings, dataNames=NULL, loadingsNames=NULL, NP = NULL, full = false)

    2.3.1 Input Arguments

    @@ -851,7 +852,7 @@

    3 PCA projection

    3.1 Obtaining PCs to project.

    # data to define PCs
    -library(projectR)
    +
     data(p.RNAseq6l3c3t)
     
     # do PCA on RNAseq6l3c3t expression data
    @@ -860,7 +861,7 @@ 

    3.1 Obtaining PCs to project.

    3.2 Projecting prcomp objects# data to project into PCs from RNAseq6l3c3t expression data data(p.ESepiGen4c1l) -library(projectR) + PCA2ESepi <- projectR(data = p.ESepiGen4c1l$mRNA.Seq,loadings=pc.RNAseq6l3c3t, full=TRUE, dataNames=map.ESepiGen4c1l[["GeneSymbols"]])
    ## [1] "93 row names matched between data and loadings"
    @@ -921,7 +922,7 @@ 

    4 NMF projection

    \end{equation}\] The number of rows in \({\bf{P}}\) (columns in \({\bf{A}}\)) defines the number of biological patterns (k) that CoGAPS/GWCoGAPS will infer from the number of nonorthogonal basis vectors required to span the data space. As in the Bayesian Decomposition algorithm Wang, Kossenkov, and Ochs (2006), the matrices \({\bf{A}}\) and \({\bf{P}}\) in CoGAPS are assumed to have the atomic prior described in Sibisi and Skilling (1997). In the CoGAPS/GWCoGAPS implementation, \(\alpha_{A}\) and \(\alpha_{P}\) are corresponding parameters for the expected number of atoms which map to each matrix element in \({\bf{A}}\) and \({\bf{P}}\), respectively. The corresponding matrices \({\bf{A}}\) and \({\bf{P}}\) are found by MCMC sampling.

    Projection of CoGAPS/GWCoGAPS patterns is implemented by solving the factorization in (1) for the new data matrix where \({\bf{A}}\) is the fixed nonorthogonal basis vectors comprising the average of the posterior mean for the CoGAPS/GWCoGAPS simulations performed on the original data. The patterns \({\bf{P}}\) in the new data associated with this amplitude matrix is estimated using the least-squares fit to the new data implemented with the lmFit function in the limma package. The projectR function has S4 method for class Linear Embedding Matrix, LME.

    -
    library(projectR)
    +
    
     projectR(data, loadings,dataNames = NULL, loadingsNames = NULL,
          NP = NA, full = FALSE)
    @@ -942,7 +943,7 @@

    4.0.2 Output

    4.1 Obtaining CoGAPS patterns to project.

    # get data
    -library(projectR)
    +
     AP <- get(data("AP.RNAseq6l3c3t")) #CoGAPS run data
     AP <- AP$Amean
     # heatmap of gene weights for CoGAPs patterns
    @@ -958,12 +959,12 @@ 

    4.1 Obtaining CoGAPS patterns to cexCol=1,cexRow=.5,scale = "row", hclustfun=function(x) hclust(x, method="average") )

    -

    +

    4.2 Projecting CoGAPS objects

    # data to project into PCs from RNAseq6l3c3t expression data
    -library(projectR)
    +
     data('p.ESepiGen4c1l4')
    ## Warning in data("p.ESepiGen4c1l4"): data set 'p.ESepiGen4c1l4' not found
    data('p.RNAseq6l3c3t')
    @@ -989,10 +990,10 @@ 

    4.2 Projecting CoGAPS objects

    ## $x
    -## [1] "Projected PC1 (18.32% of varience)"
    +## [1] "Projected PC1 (18.37% of varience)"
     ## 
     ## $y
    -## [1] "Projected PC2 (17.12% of varience)"
    +## [1] "Projected PC2 (17.16% of varience)"
     ## 
     ## $title
     ## [1] "Encode RNAseq in target PC1 & PC2"
    @@ -1007,7 +1008,7 @@ 

    5 Clustering projection

    5.1 cluster2pattern

    cluster2pattern uses the corelation of each genes expression to the mean of each cluster to define continuous weights.

    -
    library(projectR)
    +
    
     data(p.RNAseq6l3c3t)
     
     
    @@ -1033,8 +1034,7 @@ 

    5.1.2 Output

    5.2 intersectoR

    intersectoR function can be used to test for significant overlap between two clustering objects. The base function finds and tests the intersecting values of two sets of lists, presumably the genes associated with patterns in two different datasets. S4 class methods for hclust and kmeans objects are also available.

    -
    library(projectR)
    -intersectoR(pSet1 = NA, pSet2 = NA, pval = 0.05, full = FALSE, k = NULL)
    +
    intersectoR(pSet1 = NA, pSet2 = NA, pval = 0.05, full = FALSE, k = NULL)

    5.2.1 Input Arguments

    The inputs that must be set each time are the clusters and data.

    @@ -1056,7 +1056,7 @@

    6 Correlation based projectionCorrelation based projection requires a matrix of gene-wise correlation values to serve as the Pattern input to the projectR function. This matrix can be user-generated or the result of the correlateR function included in the projectR package. User-generated matrixes with each row corresponding to an individual gene can be input to the generic projectR function. The correlateR function allows users to create a weight matrix for projection with values quantifying the within dataset correlation of each genes expression to the expression pattern of a particular gene or set of genes as follows.

    6.1 correlateR

    -
    library(projectR)
    +
    
     correlateR(genes = NA, dat = NA, threshtype = "R", threshold = 0.7, absR = FALSE, ...)

    6.1.1 Input Arguments

    @@ -1077,7 +1077,7 @@

    6.1.2 Output

    6.2 Obtaining and visualizing correlateR objects.

    # data to
    -library(projectR)
    +
     data("p.RNAseq6l3c3t")
     
     # get genes correlated to T
    @@ -1124,14 +1124,14 @@ 

    6.2 Obtaining and visualizing

    -

    +

    6.3 Projecting correlateR objects.

    # data to project into from RNAseq6l3c3t expression data
     data(p.ESepiGen4c1l)
     
    -library(projectR)
    +
     cor2ESepi <- projectR(p.ESepiGen4c1l$mRNA.Seq,loadings=cor2T[[1]],full=FALSE,
         dataNames=map.ESepiGen4c1l$GeneSymbols)
    ## [1] "9 row names matched between data and loadings"
    @@ -1143,9 +1143,9 @@ 

    7 Differential features identific

    7.1 projectionDriveR

    Given loadings that define the weight of features (genes) in a given latent space (e.g. PCA, NMF), and the use of these patterns in samples, it is of interest to look at differential usage of these features between conditions. These conditions may be defined by user-defined annotations of cell type or by differential usage of a (projected) pattern. By examining differences in gene expression, weighted by the loadings that define their importance in a specific latent space, a unique understanding of differential expression in that context can be gained. This approach was originally proposed and developed in (Baraban et al, 2021), which demonstrates its utility in cross-celltype and cross-species interpretation of pattern usages.

    -
    library(projectR)
    +
    
     projectionDriveR(cellgroup1, cellgroup2, loadings, loadingsNames = NULL,
    -                 pvalue, pattern_name, display = T, normalize_pattern = T)
    +                 pvalue, pattern_name, display = T, normalize_pattern = T, mode = "CI")
     

    7.1.1 Input Arguments

    @@ -1156,49 +1156,73 @@

    7.1.1 Input Arguments

    loadings Matrix or dataframe with features as rows, columns as patterns. Values define feature weights in that space
    loadingsNames Vector of names corresponding to rows of loadings. By default the rownames of loadings will be used
    pattern_name the column name of the loadings by which the features will be weighted
    -pvalue Determines the significance of the confidence interval to be calculated between the difference of means
    +pvalue Determines the significance of the confidence interval to be calculated between the difference of means. Default 1e-5 display Boolean. Whether or not to plot the estimates of significant features. Default = T
    -normalize_pattern Boolean. Whether or not to normalize the average feature weight. Default = T

    +normalize_pattern Boolean. Whether or not to normalize the average feature weight. Default = T
    +mode ‘CI’ or ‘PV’. Specifies whether to run projectionDriveR in confidence interval mode or to generate p values. Default = “CI”

    7.1.2 Output

    -

    The output of projectionDriveR is a list of length five mean_ci holds the confidence intervals for the difference in means for all features, weighted_ci holds the confidence intervals for the weighted difference in means for all features, and normalized_weights are the weights themselves. In addition, significant_genes is a vector of gene names that are significantly different at the threshold provided. plotted_ci returns the ggplot figure of the confidence intervals, see plotConfidenceIntervals for documentation.

    +

    The output of projectionDriveR in confidence interval mode (‘CI’) is a list of length six mean_ci holds the confidence intervals for the difference in means for all features, weighted_mean_ci holds the confidence intervals for the weighted difference in means for all features, and normalized_weights are the weights themselves. In addition, sig_genes is a list of three vectors of gene names that are significantly different at the threshold provided generated from the mean confidence intervals (unweighted_sig_genes), the weighted mean confidence intervals (weighted_sig_genes) and genes shared between the two (significant_shared_genes) . plotted_ci returns the ggplot figure of the confidence intervals, see plotConfidenceIntervals for documentation. meta_data contains matrix names and pvalue thresholds. The output of projectionDriveR in p value mode (‘PV’) is a list of length nine. meta_data, sig_genes and normalized_weights are similar between modes. mean_stats and weighted_mean_stats contains summary information for welch t-tests. difexpgenes and weighted_difexpgenes are filtered dataframes containing differentially expressed genes at a FC and pvalue cut off of 0.2 and 1e-5 respectively. fgseavecs contain unweighted and weighted named vectors of welch-t test estimates that can be used with fgsea. plt returns the volcano ggplot figure. See pdVolcano for documentation. FC and pvalue can be manually altered by calling pdVolcano on projectionDriveR result.

    7.1.3 Identifying differential features associated with learned patterns

    options(width = 60)
    -library(projectR)
     library(dplyr, warn.conflicts = F)
     
    -#gene weights x pattern
    -data("retinal_patterns")
    -
     #size-normed, log expression
     data("microglial_counts")
     
     #size-normed, log expression
     data("glial_counts")
     
    +#5 pattern cogaps object generated on microglial_counts
    +data("cr_microglial")
    +microglial_fl <- cr_microglial@featureLoadings
    +
     #the features by which to weight the difference in expression 
    -pattern_to_weight <- "Pattern.24"
    -drivers <- projectionDriveR(microglial_counts, #expression matrix
    +pattern_to_weight <- "Pattern_1"
    +drivers_ci <- projectionDriveR(microglial_counts, #expression matrix
                                            glial_counts, #expression matrix
    -                                       loadings = retinal_patterns, #feature x pattern dataframe
    +                                       loadings = microglial_fl, #feature x pattern dataframe
                                            loadingsNames = NULL,
                                            pattern_name = pattern_to_weight, #column name
                                            pvalue = 1e-5, #pvalue before bonferroni correction
                                            display = T,
    -                                       normalize_pattern = T) #normalize feature weights
    -
    ## [1] "2996 row names matched between datasets"
    -## [1] "2996"
    -## [1] "Updated dimension of data: 2996"
    -

    -
    conf_intervals <- drivers$mean_ci[drivers$significant_genes,]
    +                                       normalize_pattern = T, #normalize feature weights
    +                                       mode = "CI") #confidence interval mode
    +

    +
    conf_intervals <- drivers_ci$mean_ci[drivers_ci$sig_genes$significant_shared_genes,]
    +
     
     str(conf_intervals)
    -
    ## 'data.frame':    253 obs. of  2 variables:
    -##  $ low : num  1.86 0.158 -0.562 -0.756 0.155 ...
    -##  $ high: num  2.03943 0.26729 -0.00197 -0.18521 0.23239 ...
    +
    ## 'data.frame':    330 obs. of  3 variables:
    +##  $ low : num  -1.009 0.102 1.86 -2.089 -0.791 ...
    +##  $ high: num  -0.35 0.356 2.039 -1.359 -0.28 ...
    +##  $ gene: chr  "ENSMUSG00000067879" "ENSMUSG00000026158" "ENSMUSG00000026126" "ENSMUSG00000060424" ...
    +
    drivers_pv <- projectionDriveR(microglial_counts, #expression matrix
    +                                       glial_counts, #expression matrix
    +                                       loadings = microglial_fl, #feature x pattern dataframe
    +                                       loadingsNames = NULL,
    +                                       pattern_name = pattern_to_weight, #column name
    +                                       pvalue = 1e-5, #pvalue before bonferroni correction
    +                                       display = T,
    +                                       normalize_pattern = T, #normalize feature weights
    +                                       mode = "PV") #confidence interval mode
    +

    +
    difexp <- drivers_pv$difexpgenes
    +str(difexp)
    +
    ## 'data.frame':    440 obs. of  10 variables:
    +##  $ ref_mean    : num  0.3193 0.7124 0.108 0.0145 1.9462 ...
    +##  $ test_mean   : num  0.0127 0.0331 0.3367 1.964 0.2223 ...
    +##  $ mean_diff   : num  -0.307 -0.679 0.229 1.949 -1.724 ...
    +##  $ estimate    : num  -27.46 -35.77 7.81 41.72 -51.84 ...
    +##  $ welch_pvalue: num  1.45e-150 2.97e-234 4.14e-14 1.06e-150 3.99e-253 ...
    +##  $ welch_padj  : num  4.36e-147 8.91e-231 1.24e-10 3.17e-147 1.20e-249 ...
    +##  $ gene        : chr  "ENSMUSG00000002459" "ENSMUSG00000067879" "ENSMUSG00000026158" "ENSMUSG00000026126" ...
    +##  $ Color       : Factor w/ 3 levels "NS or FC < 0.2",..: 2 2 3 3 2 2 3 2 2 2 ...
    +##  $ invert_P    : num  -44.87 -156.26 2.26 285.6 -429.14 ...
    +##  $ label       : chr  NA NA NA NA ...
    @@ -1213,7 +1237,8 @@

    7.2.1 Input

    genes a vector with names of genes to include in plot. If sort=F, estimates will be plotted in this order (default = NULL will include all genes.)
    weights weights of features to include as annotation (default = NULL will not include heatmap)
    weights_clip quantile of data to clip color scale for improved visualization (default: 0.99)
    -weights_vis_norm Which processed version of weights to visualize as a heatmap. One of c(“none”, “quantile”). default = “none”

    +weights_vis_norm Which processed version of weights to visualize as a heatmap. One of c(“none”, “quantile”). default = “none”
    +weighted Boolean. Specifies whether confidence intervals are weighted by a pattern or not. Default = “F”

    7.2.2 Output

    @@ -1236,7 +1261,7 @@

    7.2.3 Customize plotting of confi #the labels above can now be used as ggplot aesthetics plots_list <- plotConfidenceIntervals(conf_intervals, #mean difference in expression confidence intervals sort = F, #should genes be sorted by estimates - weights = drivers$normalized_weights[rownames(conf_intervals)], + weights = drivers_ci$normalized_weights[rownames(conf_intervals)], pattern_name = pattern_to_weight, weights_clip = 0.99, weights_vis_norm = "none") @@ -1247,32 +1272,112 @@

    7.2.3 Customize plotting of confi pl2 <- plots_list[["weights_heatmap"]] #now plot the weighted differences -weighted_conf_intervals <- drivers$weighted_mean_ci[gene_order,] +weighted_conf_intervals <- drivers_ci$weighted_mean_ci[gene_order,] plots_list_weighted <- plotConfidenceIntervals(weighted_conf_intervals, sort = F, - pattern_name = pattern_to_weight) + weighted = T) pl3 <- plots_list_weighted[["ci_estimates_plot"]] + xlab("Difference in weighted group means") + theme(axis.title.y = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank()) cowplot::plot_grid(pl1, pl2, pl3, align = "h", rel_widths = c(1,.4, 1), ncol = 3)

    -
    ## Warning: Removed 249 rows containing missing values
    +
    ## Warning: Removed 326 rows containing missing values
     ## (`geom_label_repel()`).
    -

    +

    +## pdVolcano

    +
    +
    +

    7.2.4 Input

    +

    The arguments for pdVolcano are:

    +

    result Output from projectionDriveR function with PV mode selected +FC fold change threshold, default at 0.2 +pvalue significance threshold, default set to pvalue stored in projectionDriveR output +subset optional vector of gene names to subset the result by +filter.inf Boolean. If TRUE will remove genes that have pvalues below machine double minimum value
    +label.num number of genes to label on either end of volcano plot, default to 5 (10 total) +display Boolean. Default TRUE, will print volcano plots using cowplot grid_arrange

    +
    +

    7.2.5 Output

    +

    Generates the same output as projectionDriveR. Allows manual updates to pvalue and FC thresholds and can accept gene lists of interest to subset results.

    -
    -

    7.3 multivariateAnalysisR

    +
    +

    7.2.6 Customize volcano plot and run FGSEA

    +
    suppressWarnings(library(cowplot))
    +library(fgsea)
    +library(msigdbr)
    +#manually change FC and pvalue threshold from defaults 0.2, 1e-5
    +drivers_pv_mod <- pdVolcano(drivers_pv, FC = 0.5, pvalue = 1e-7)
    +
    ## Updating sig_genes...
    +

    +
    difexp_mod <- drivers_pv_mod$difexpgenes
    +str(difexp_mod)
    +
    ## 'data.frame':    213 obs. of  10 variables:
    +##  $ ref_mean    : num  0.7124 0.0145 1.9462 0.6037 0.742 ...
    +##  $ test_mean   : num  0.0331 1.964 0.2223 0.0681 0.1416 ...
    +##  $ mean_diff   : num  -0.679 1.949 -1.724 -0.536 -0.6 ...
    +##  $ estimate    : num  -35.8 41.7 -51.8 -28.8 -23.5 ...
    +##  $ welch_pvalue: num  2.97e-234 1.06e-150 3.99e-253 5.10e-139 5.41e-92 ...
    +##  $ welch_padj  : num  8.91e-231 3.17e-147 1.20e-249 1.53e-135 1.62e-88 ...
    +##  $ gene        : chr  "ENSMUSG00000067879" "ENSMUSG00000026126" "ENSMUSG00000060424" "ENSMUSG00000045515" ...
    +##  $ Color       : Factor w/ 3 levels "NS or FC < 0.5",..: 2 3 2 2 2 3 2 2 2 2 ...
    +##  $ invert_P    : num  -156.3 285.6 -429.1 -72.2 -52.7 ...
    +##  $ label       : chr  NA NA NA NA ...
    +
    #fgsea application 
    +
    +#extract unweighted fgsea vector
    +fgseavec <- drivers_pv$fgseavecs$unweightedvec
    +#split into enrichment groups, negative estimates are enriched in the reference matrix (glial), positive are enriched in the test matrix (microglial), take abs value 
    +glial_fgsea_vec <- abs(fgseavec[which(fgseavec < 0)])
    +microglial_fgsea_vec <- abs(fgseavec[which(fgseavec > 0)])
    +
    +#get FGSEA pathways - selecting subcategory C8 for cell types
    +msigdbr_list =  msigdbr::msigdbr(species = "Mus musculus", category = "C8")
    +pathways = split(x = msigdbr_list$ensembl_gene, f = msigdbr_list$gs_name)
    +
    +#run fgsea scoreType positive, all values will be positive
    +glial_fgsea <- fgsea::fgsea(pathways, glial_fgsea_vec, scoreType = "pos")
    +#tidy 
    +glial_fgseaResTidy <- glial_fgsea %>% 
    +  subset(padj <= 0.05 & size >= 10 & size <= 500) %>%
    +    as_tibble() %>%
    +    dplyr::arrange(desc(size))
    +#plot 
    +glial_EnrichmentPlot <- ggplot2::ggplot(glial_fgseaResTidy, 
    +                                        ggplot2::aes(reorder(pathway, NES), NES)) + 
    +  ggplot2::geom_point(aes(size=size, color = padj)) + 
    +  coord_flip() + 
    +  labs(x="Pathway", y="Normalized Enrichment Score", title="Pathway NES from GSEA") + 
    +  theme_minimal()
    +glial_EnrichmentPlot
    +

    +
    microglial_fgsea <- fgsea::fgsea(pathways, microglial_fgsea_vec, scoreType = "pos")
    +#tidy 
    +microglial_fgseaResTidy <- microglial_fgsea %>% 
    +  subset(padj <= 0.05 & size >= 10 & size <= 500) %>%
    +    as_tibble() %>%
    +    dplyr::arrange(desc(size))
    +
    +microglial_EnrichmentPlot <- ggplot2::ggplot(microglial_fgseaResTidy, 
    +                                             ggplot2::aes(reorder(pathway, NES), NES)) +
    +  ggplot2::geom_point(aes(size=size, color = padj)) + 
    +  coord_flip() + 
    +  labs(x="Pathway", y="Normalized Enrichment Score", title="Pathway NES from GSEA") + 
    +  theme_minimal()
    +microglial_EnrichmentPlot
    +

    +## multivariateAnalysisR

    This function performs multivariate analysis on different clusters within a dataset, which in this case is restricted to Seurat Object. Clusters are identified as those satisfying the conditions specified in their corresponding dictionaries. multivariateAnalysisR performs two tests: Analysis of Variance (ANOVA) and Confidence Interval (CI) evaluations. ANOVA is performed to understand the general differentiation between clusters through the lens of a specified pattern. CI is to visualize pair-wise differential expressions between two clusters for each pattern. Researchers can visually understand both the macroscopic and microscopic differential uses of each pattern across different clusters through this function.

    -
    library(projectR)
    +
    
     multivariateAnalysisR <- function(significanceLevel = 0.05, patternKeys, seuratobj,
                                       dictionaries, customNames = NULL, exclusive = TRUE,
                                       exportFolder = "", ANOVAwidth = 1000,
                                       ANOVAheight = 1000, CIwidth = 1000, CIheight = 1000,
                                       CIspacing = 1)
    -
    -

    7.3.1 Input Arguments

    +
    +
    +

    7.2.7 Input Arguments

    The required inputs are patternKeys (list of strings indicating the patterns to be evaluated), seuratobj (the Seurat Object data containing both clusters and patterns), and dictionaries (list of dictionary where each dictionary indicates the conditions each corresponding cluster has to satisfy).

    The arguments for multivariateAnalysisR are:

    significanceLevel Double value for testing significance in ANOVA test.
    @@ -1288,19 +1393,19 @@

    7.3.1 Input Arguments

    CIheight Height of CI png. CIspacing Spacing between each CI in CI graph.

    -
    -

    7.3.2 Output

    +
    +

    7.2.8 Output

    multivariateAnalysisR returns a sorted list of the generated ANOVA and CI values. It also exports two pairs of exported PNG/CSV files: one for ANOVA analysis, another for CI. From the ANOVA analysis, researchers can see the general ranking of differential uses of patterns across the specified clusters. From the CI analysis, researchers can identify the specific differential use cases between every pair of clusters.

    -
    -

    7.3.3 Comparing differential uses of patterns across different clusters

    +
    +

    7.2.9 Comparing differential uses of patterns across different clusters

    Demonstrative example will be added soon.

    References

    -
    +
    Barbakh, Wesam Ashour, Ying Wu, and Colin Fyfe. 2009. Review of Linear Projection Methods.” In Non-Standard Parameter Adaptation for Exploratory Data Analysis, 29–48. Berlin, Heidelberg: Springer Berlin Heidelberg.