Merge pull request #73 from AnthonyChristidis/main

Add option to specify assay.
ccb-hms · Sep 7, 2024 · 9045bcf · 9045bcf
2 parents c581546 + a92f82b
commit 9045bcf
Show file tree

Hide file tree

Showing 46 changed files with 319 additions and 2,533 deletions.
diff --git a/R/argumentCheck.R b/R/argumentCheck.R
@@ -35,6 +35,7 @@
 #' @param pc_subset_ref A numeric vector specifying the principal components to be used for the reference data. If `NULL`, no check is performed.
 #' @param common_rotation_genes If TRUE, check the rotation matrices of the reference and query data and ensure they have the same genes.
 #' Default is FALSE.
+#' @param assay_name Name of the assay on which to perform computations. If `NULL`, no check is performed.
 #' 
 #' @keywords internal
 #' 
@@ -54,32 +55,34 @@ argumentCheck <- function(query_data = NULL,
                           cell_names_ref = NULL,
                           pc_subset_query = NULL,
                           pc_subset_ref = NULL,
-                          common_rotation_genes = FALSE) {
+                          common_rotation_genes = FALSE,
+                          assay_name = NULL) {
 
     # Check if query_data is a SingleCellExperiment object
     if (!is.null(query_data)) {
+
         if (!is(query_data, "SingleCellExperiment")) {
             stop("'query_data' must be a SingleCellExperiment object.")
         }
 
-        if (!("logcounts" %in% SummarizedExperiment::assayNames(query_data))) {
-            stop("'query_data' does not contain 'logcounts' in its assays.")
-        }
+        if(!is.null(assay_name) && !(assay_name %in% SummarizedExperiment::assayNames(query_data)))
+            stop("\'query_data\' does not contain the specified assay.")
     }
 
     # Check if reference_data is a SingleCellExperiment object
     if (!is.null(reference_data)) {
+
         if (!is(reference_data, "SingleCellExperiment")) {
             stop("'reference_data' must be a SingleCellExperiment object.")
         }
 
-        if (!("logcounts" %in% SummarizedExperiment::assayNames(reference_data))) {
-            stop("'reference_data' does not contain 'logcounts' in its assays.")
-        }
+        if(!is.null(assay_name) && !(assay_name %in% SummarizedExperiment::assayNames(reference_data)))
+            stop("\'reference_data\' does not contain the specified assay.")
     }
 
     # Check if query_cell_type_col is a character string of length 1 and exists in query_data
     if (!is.null(query_cell_type_col)) {
+
         if (!is.null(query_data)) {
             if (!is.character(query_cell_type_col) || 
                 length(query_cell_type_col) != 1) {
@@ -94,6 +97,7 @@ argumentCheck <- function(query_data = NULL,
 
     # Check if ref_cell_type_col is a character string of length 1 and exists in reference_data
     if (!is.null(ref_cell_type_col)) {
+
         if (!is.null(reference_data)) {
             if (!is.character(ref_cell_type_col) || 
                 length(ref_cell_type_col) != 1) {
@@ -108,6 +112,7 @@ argumentCheck <- function(query_data = NULL,
 
     # Check if cell_types are available in the SingleCellExperiment object(s)
     if (!is.null(cell_types)) {
+
         if (!is.null(query_data)) {
             if (!all(cell_types %in% 
                      unique(query_data[[query_cell_type_col]]))) {
@@ -125,13 +130,15 @@ argumentCheck <- function(query_data = NULL,
 
     # Check that the SingleCellExperiment object(s) have a unique cell type
     if (isTRUE(unique_cell_type)) {
+
         if (!is.null(query_data)) {
             if (length(unique(query_data[[query_cell_type_col]])) > 1) {
                 stop("This function should be used when there is only one cell type in 'query_data'.")
             }
         }
 
         if (!is.null(reference_data)) {
+
             if (length(unique(reference_data[[ref_cell_type_col]])) > 1) {
                 stop("This function should be used when there is only one cell type in 'reference_data'.")
             }
@@ -146,27 +153,31 @@ argumentCheck <- function(query_data = NULL,
 
     # Check the number of cell types for plot function
     if (plot_function == TRUE) {
+
         if (length(unique(cell_types)) > 10) {
             stop("The maximum number of cell types for plotting is 10.")
         }
     }
 
     # Check cell_names contain valid cell names in query_data
     if (!is.null(cell_names_query)) {
+
         if (!all(cell_names_query %in% colnames(query_data))) {
             stop("'cell_names' contains one or more cells that are not available in 'query_data'.")
         }
     }
 
     # Check cell_names contain valid cell names in reference_data
     if (!is.null(cell_names_ref)) {
+
         if (!all(cell_names_ref %in% colnames(reference_data))) {
             stop("'cell_names' contains one or more cells that are not available in 'reference_data'.")
         }
     }
 
     # Check PC subset for query_data
     if (!is.null(pc_subset_query)) {
+
         # Check if "PCA" is present in query's reduced dimensions
         if (!"PCA" %in% names(reducedDims(query_data))) {
             stop("'query_data' must have pre-computed PCA in 'reducedDims'.")
@@ -180,6 +191,7 @@ argumentCheck <- function(query_data = NULL,
 
     # Check PC subset for reference_data
     if (!is.null(pc_subset_ref)) {
+
         # Check if "PCA" is present in reference's reduced dimensions
         if (!"PCA" %in% names(reducedDims(reference_data))) {
             stop("Reference data must have pre-computed PCA in 'reducedDims'.")
@@ -193,6 +205,7 @@ argumentCheck <- function(query_data = NULL,
 
     # Check if the rotation matrices have the same genes in the same order
     if (common_rotation_genes == TRUE) {
+
         # Check if the rotation matrices have the same number of genes
         if (ncol(attributes(reducedDim(query_data, "PCA"))[["rotation"]]) !=
             ncol(attributes(reducedDim(reference_data, "PCA"))[["rotation"]])) {

diff --git a/R/boxplotPCA.R b/R/boxplotPCA.R
@@ -16,6 +16,7 @@
 #' @param ref_cell_type_col The column name in the \code{colData} of \code{reference_data} that identifies the cell types.
 #' @param cell_types A character vector specifying the cell types to include in the plot. If NULL, all cell types are included.
 #' @param pc_subset A numeric vector specifying which principal components to include in the plot. Default is PC1 to PC5.
+#' @param assay_name Name of the assay on which to perform computations. Default is "logcounts".
 #'
 #' @return A ggplot object representing the boxplots of specified principal components for the given cell types and datasets.
 #'
@@ -46,15 +47,17 @@ boxplotPCA <- function(query_data,
                        query_cell_type_col, 
                        ref_cell_type_col, 
                        cell_types = NULL,
-                       pc_subset = 1:5){
+                       pc_subset = 1:5,
+                       assay_name = "logcounts"){
 
     # Check standard input arguments
     argumentCheck(query_data = query_data,
                   reference_data = reference_data,
                   query_cell_type_col = query_cell_type_col,
                   ref_cell_type_col = ref_cell_type_col,
                   cell_types = cell_types,
-                  pc_subset_ref = pc_subset)
+                  pc_subset_ref = pc_subset,
+                  assay_name = assay_name)
 
     # Get common cell types if they are not specified by user
     if(is.null(cell_types)){
@@ -67,7 +70,8 @@ boxplotPCA <- function(query_data,
                              reference_data = reference_data, 
                              query_cell_type_col = query_cell_type_col, 
                              ref_cell_type_col = ref_cell_type_col,
-                             pc_subset = pc_subset)
+                             pc_subset = pc_subset,
+                             assay_name = assay_name)
 
     # Create the long format data frame manually
     pca_output <- pca_output[!is.na(pca_output[["cell_type"]]),]

diff --git a/R/calculateAveragePairwiseCorrelation.R b/R/calculateAveragePairwiseCorrelation.R
@@ -19,6 +19,7 @@
 #' @param pc_subset A numeric vector specifying which principal components to use in the analysis. Default is 1:10.
 #' If set to \code{NULL} then no dimensionality reduction is performed and the assay data is used directly for computations.
 #' @param correlation_method The correlation method to use for calculating pairwise correlations.
+#' @param assay_name Name of the assay on which to perform computations. Default is "logcounts".
 #'
 #' @return A matrix containing the average pairwise correlation values. 
 #'         Rows and columns are labeled with the cell types. Each element 
@@ -57,7 +58,8 @@ calculateAveragePairwiseCorrelation <- function(
         ref_cell_type_col, 
         cell_types = NULL, 
         pc_subset = 1:10,
-        correlation_method = c("spearman", "pearson")) {
+        correlation_method = c("spearman", "pearson"),
+        assay_name = "logcounts") {
 
     # Match correlation method argument
     correlation_method <- match.arg(correlation_method)
@@ -68,7 +70,8 @@ calculateAveragePairwiseCorrelation <- function(
                   query_cell_type_col = query_cell_type_col,
                   ref_cell_type_col = ref_cell_type_col,
                   cell_types = cell_types,
-                  pc_subset_ref = pc_subset)
+                  pc_subset_ref = pc_subset,
+                  assay_name = assay_name)
 
     # Get common cell types if they are not specified by user
     if(is.null(cell_types)){
@@ -86,7 +89,8 @@ calculateAveragePairwiseCorrelation <- function(
                 reference_data = reference_data, 
                 query_cell_type_col = query_cell_type_col,
                 ref_cell_type_col = ref_cell_type_col,
-                pc_subset = pc_subset)
+                pc_subset = pc_subset,
+                assay_name = assay_name)
             ref_mat <- pca_output[which(
                 pca_output[["dataset"]] == "Reference" &
                     pca_output[["cell_type"]] == type2), 
@@ -103,8 +107,8 @@ calculateAveragePairwiseCorrelation <- function(
             ref_subset <- reference_data[, which(
                 reference_data[[ref_cell_type_col]] == type2), drop = FALSE]
 
-            query_mat <- t(as.matrix(assay(query_subset, "logcounts")))
-            ref_mat <- t(as.matrix(assay(ref_subset, "logcounts")))
+            query_mat <- t(as.matrix(assay(query_subset, assay_name)))
+            ref_mat <- t(as.matrix(assay(ref_subset, assay_name)))
         }
 
         cor_matrix <- cor(t(query_mat), t(ref_mat), 

diff --git a/R/calculateCellDistances.R b/R/calculateCellDistances.R
@@ -17,6 +17,7 @@
 #' that identifies the cell types.
 #' @param cell_types A character vector specifying the cell types to include in the plot. If NULL, all cell types are included.
 #' @param pc_subset A numeric vector specifying which principal components to include in the plot. Default is 1:5.
+#' @param assay_name Name of the assay on which to perform computations. Default is "logcounts".
 #'
 #' @return A list containing distance data for each cell type. Each entry in the list contains:
 #' \describe{
@@ -62,15 +63,17 @@ calculateCellDistances <- function(query_data,
                                    query_cell_type_col, 
                                    ref_cell_type_col,
                                    cell_types = NULL,
-                                   pc_subset = 1:5) {
+                                   pc_subset = 1:5,
+                                   assay_name = "logcounts") {
 
     # Check standard input arguments
     argumentCheck(query_data = query_data,
                   reference_data = reference_data,
                   query_cell_type_col = query_cell_type_col,
                   ref_cell_type_col = ref_cell_type_col,
                   cell_types = cell_types,
-                  pc_subset_ref = pc_subset)
+                  pc_subset_ref = pc_subset,
+                  assay_name = assay_name)
 
     # Get common cell types if they are not specified by user
     if(is.null(cell_types)){
@@ -83,7 +86,8 @@ calculateCellDistances <- function(query_data,
                              reference_data = reference_data, 
                              query_cell_type_col = query_cell_type_col, 
                              ref_cell_type_col = ref_cell_type_col,
-                             pc_subset = pc_subset)
+                             pc_subset = pc_subset,
+                             assay_name = assay_name)
 
     # Create a list to store distance data for each cell type
     distance_data <- vector("list", length = length(cell_types))

diff --git a/R/calculateCellDistancesSimilarity.R b/R/calculateCellDistancesSimilarity.R
@@ -25,7 +25,8 @@
 #' that identifies the cell types.
 #' @param cell_names A character vector specifying the names of the query cells for which to compute distance measures.
 #' @param pc_subset A numeric vector specifying which principal components to include in the plot. Default is 1:5.
-#'
+#' @param assay_name Name of the assay on which to perform computations. Default is "logcounts".
+#' 
 #' @return A list containing distance data for each cell type. Each entry in the list contains:
 #' \describe{
 #'   \item{ref_distances}{A vector of all pairwise distances within the reference subset for the cell type.}
@@ -73,15 +74,17 @@ calculateCellDistancesSimilarity <- function(query_data,
                                              query_cell_type_col, 
                                              ref_cell_type_col,
                                              cell_names,
-                                             pc_subset = 1:5) {
+                                             pc_subset = 1:5,
+                                             assay_name = "logcounts") {
 
     # Check standard input arguments
     argumentCheck(query_data = query_data,
                   reference_data = reference_data,
                   query_cell_type_col = query_cell_type_col,
                   ref_cell_type_col = ref_cell_type_col,
                   cell_names_query = cell_names,
-                  pc_subset_ref = pc_subset)
+                  pc_subset_ref = pc_subset,
+                  assay_name = assay_name)
 
     # Compute distance data
     query_data_subset <- query_data[, cell_names, drop = FALSE]
@@ -90,7 +93,8 @@ calculateCellDistancesSimilarity <- function(query_data,
         reference_data = reference_data, 
         query_cell_type_col = query_cell_type_col, 
         ref_cell_type_col = ref_cell_type_col, 
-        pc_subset = pc_subset)
+        pc_subset = pc_subset,
+        assay_name = assay_name)
 
     # Initialize empty lists to store results
     bhattacharyya_list <- hellinger_list <- 
@@ -114,7 +118,8 @@ calculateCellDistancesSimilarity <- function(query_data,
         for (i in seq_len(length(cell_names))) {
 
             # Extract distances from the current cell to reference cells
-            cell_distances <- distance_data[[cell_type]][["query_to_ref_distances"]][cell_names[i], , drop = FALSE]
+            cell_distances <- 
+                distance_data[[cell_type]][["query_to_ref_distances"]][cell_names[i], , drop = FALSE]
 
             # Compute density of cell distances
             cell_density <- density(cell_distances)

diff --git a/R/calculateCellSimilarityPCA.R b/R/calculateCellSimilarityPCA.R
@@ -14,6 +14,7 @@
 #' @param cell_names A character vector specifying the cell names for which to compute the similarity.
 #' @param pc_subset A numeric vector specifying the subset of principal components to consider. Default is 1:5.
 #' @param n_top_vars An integer indicating the number of top loading variables to consider for each PC. Default is 50.
+#' @param assay_name Name of the assay on which to perform computations. Default is "logcounts".
 #'
 #' @return A data frame containing cosine similarity values between cells for each selected principal component.
 #'
@@ -53,12 +54,14 @@
 calculateCellSimilarityPCA <- function(se_object, 
                                        cell_names, 
                                        pc_subset = 1:5, 
-                                       n_top_vars = 50){
+                                       n_top_vars = 50,
+                                       assay_name = "logcounts"){
 
     # Check standard input arguments
     argumentCheck(query_data = se_object,
                   cell_names_query = cell_names,
-                  pc_subset_query = pc_subset)
+                  pc_subset_query = pc_subset,
+                  assay_name = assay_name)
 
     # Check if n_top_vars is a positive integer
     if (!is.numeric(n_top_vars) || n_top_vars <= 0 || 
@@ -109,7 +112,7 @@ calculateCellSimilarityPCA <- function(se_object,
 
     # Calculate similarities
     assay_mat <- t(as.matrix(assay(se_object[, cell_names, drop = FALSE], 
-                                   "logcounts")))
+                                   assay_name)))
     similarities <- .computeCosineSimilarity(assay_mat, rotation_mat, 
                                              high_loading_vars)
 

diff --git a/R/calculateCramerPValue.R b/R/calculateCramerPValue.R
@@ -19,6 +19,7 @@
 #' @param query_cell_type_col The column name in the \code{colData} of \code{query_data} that identifies the cell types.
 #' @param cell_types A character vector specifying the cell types to include in the plot. If NULL, all cell types are included.
 #' @param pc_subset A numeric vector specifying which principal components to include in the plot. Default is PC1 to PC5.
+#' @param assay_name Name of the assay on which to perform computations. Default is "logcounts".
 #'
 #' @return A named vector of p-values from the Cramer test for each cell type.
 #'
@@ -49,15 +50,17 @@ calculateCramerPValue <- function(reference_data,
                                   ref_cell_type_col, 
                                   query_cell_type_col = NULL, 
                                   cell_types = NULL,
-                                  pc_subset = 1:5) {
+                                  pc_subset = 1:5,
+                                  assay_name = "logcounts") {
 
     # Check standard input arguments
     argumentCheck(query_data = query_data,
                   reference_data = reference_data,
                   query_cell_type_col = query_cell_type_col,
                   ref_cell_type_col = ref_cell_type_col,
                   cell_types = cell_types,
-                  pc_subset_ref = pc_subset)
+                  pc_subset_ref = pc_subset,
+                  assay_name = assay_name)
 
     # Get common cell types if they are not specified by user
     if(is.null(cell_types)){
@@ -76,7 +79,8 @@ calculateCramerPValue <- function(reference_data,
                              reference_data = reference_data, 
                              query_cell_type_col = query_cell_type_col, 
                              ref_cell_type_col = ref_cell_type_col,
-                             pc_subset = pc_subset)
+                             pc_subset = pc_subset,
+                             assay_name = assay_name)
     pca_output <- pca_output[pca_output[["cell_type"]] %in% cell_types,]
 
     # Set data for Cramer test