Merge pull request #26 from ccb-hms/nt_fix_package

Fix package and get it ready for submission
ccb-hms · Apr 14, 2024 · ec2977c · ec2977c
2 parents 7940774 + e41b0ea
commit ec2977c
Show file tree

Hide file tree

Showing 92 changed files with 5,318 additions and 1,695 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -3,3 +3,5 @@
 ^doc$
 ^Meta$
 ^\.github$
+^docs$
+^pkgdown$
diff --git a/.gitignore b/.gitignore
@@ -13,7 +13,6 @@
 .Rcheck/
 *.log
 *.pdf
-*.html
 *.tex
 *.docx
 

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -15,23 +15,28 @@ Description: The scDiagnostics package provides diagnostic plots to
     expression patterns, and explore relationships between different
     cell types in query and reference datasets allowing users to
     detect potential misalignments between reference and query
-    datasets.
+    datasets. The package also provides visualization capabilities for
+    diagnositics purposes.    
 License: Artistic-2.0
 URL: https://github.com/ccb-hms/scDiagnostics
 BugReports: https://github.com/ccb-hms/scDiagnostics/issues
 Depends:
-    R (>= 4.2.0),
-    SingleCellExperiment
+    R (>= 4.2.0)
 Imports:
+    SingleCellExperiment,
     isotree,
+    methods,
+    rlang,
     ggplot2,
     gridExtra,
     scater,
     SummarizedExperiment,
-    AUCell,
     BiocGenerics,
-    S4Vectors
+    S4Vectors,
+    stats,
+    utils
 Suggests:
+    AUCell,
     BiocStyle,
     corrplot,
     knitr,
@@ -41,9 +46,12 @@ Suggests:
     scran,
     scRNAseq,
     SingleR,
-    celldex
+    celldex,
+    testthat (>= 3.0.0)
 VignetteBuilder: 
     knitr
+biocViews: SingleCell, Software
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.3.1
+Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -20,18 +20,30 @@ importFrom(SingleCellExperiment,logcounts)
 importFrom(SingleCellExperiment,reducedDim)
 importFrom(SingleCellExperiment,reducedDimNames)
 importFrom(SummarizedExperiment,assay)
+importFrom(SummarizedExperiment,colData)
+importFrom(ggplot2,aes)
 importFrom(ggplot2,element_blank)
+importFrom(ggplot2,geom_density)
 importFrom(ggplot2,geom_histogram)
 importFrom(ggplot2,geom_line)
 importFrom(ggplot2,geom_point)
 importFrom(ggplot2,ggplot)
+importFrom(ggplot2,guide_legend)
+importFrom(ggplot2,guides)
 importFrom(ggplot2,labs)
+importFrom(ggplot2,scale_color_gradient)
+importFrom(ggplot2,scale_color_manual)
 importFrom(ggplot2,scale_x_continuous)
 importFrom(ggplot2,theme)
 importFrom(ggplot2,theme_bw)
+importFrom(ggplot2,theme_minimal)
+importFrom(ggplot2,xlab)
+importFrom(ggplot2,ylab)
 importFrom(ggplot2,ylim)
 importFrom(gridExtra,grid.arrange)
 importFrom(isotree,isolation.forest)
+importFrom(methods,is)
+importFrom(rlang,.data)
 importFrom(scater,plotPCA)
 importFrom(scater,plotReducedDim)
 importFrom(scater,plotTSNE)
@@ -41,3 +53,5 @@ importFrom(stats,cmdscale)
 importFrom(stats,cor)
 importFrom(stats,dist)
 importFrom(stats,lm)
+importFrom(stats,qnorm)
+importFrom(utils,tail)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,3 @@
+# scDiagnostics 0.99.0
+
+* Initial CRAN submission.
diff --git a/R/calculateCategorizationEntropy.R b/R/calculateCategorizationEntropy.R
@@ -23,7 +23,7 @@
 #'     -sum(p*log(p))} . If that input vector p is a uniform
 #'     distribution over the \code{length(p)} categories, the entropy
 #'     will be a high as possible.
-#' 
+#'
 #' @param X a matrix of category scores
 #'
 #' @param inverse_normal_transform if TRUE, apply
@@ -33,47 +33,49 @@
 #' @param plot if TRUE, plot a histogram of the entropies
 #'
 #' @return A vector of entropy values for each column in X.
-#' 
+#'
 #' @examples
 #' # Simulate 500 cells with scores on 4 possible cell types
 #' X <- rnorm(500 * 4) |> matrix(nrow = 4)
 #'
 #' # Make the first category highly scored in the first 250 cells
-#' X[1, 1:250] <- X[1, 1:250] + 5 
+#' X[1, 1:250] <- X[1, 1:250] + 5
 #'
 #' # The function will issue a message about softmaxing the scores,
 #' # and the entropy histogram will be bimodal since we made half of
 #' # the cells clearly category 1 while the other half are roughly
 #' # even.
 #' # entropy_scores <- calculateCategorizationEntropy(X)
-#' 
+#'
 #' @importFrom ggplot2 geom_histogram theme_bw
 #'
 #' @export
-calculateCategorizationEntropy <-
+calculateCategorizationEntropy <- 
     function(X,
              inverse_normal_transform = FALSE,
              plot = TRUE,
-             verbose = TRUE) {    
+             verbose = TRUE) 
+{
     if (inverse_normal_transform) {
-        # https://cran.r-project.org/web/packages/RNOmni/vignettes/RNOmni.html#inverse-normal-transformation
-        if (verbose) message("Applying global inverse normal transformation.")
-
-        # You can't do the INT column-wise (by cell) because it will
-        # set a constant "range" to the probabilities, eliminating the
-        # differences in confidence across methods we're trying to
-        # quantify.
-
-        # You can't do the INT row-wise (by cell-type) because even
-        # though different cell types exhibit different marginal
-        # distributions of scores (in SingleR at least), doing the
-        # transformation row-wise would eliminate any differences in
-        # which cell types are "hard to predict".  You don't want a
-        # score of .5 for cytotoxic T cells (hard to predict type) to
-        # overwhelm a score of .62 from erythroid type 2 (easy to
-        # predict), even though the first would be extraordinary
-        # within its cell type and the latter unexceptional within its
-        # cell type.
+        ## https://cran.r-project.org/web/packages/RNOmni/vignettes/RNOmni.html#inverse-normal-transformation
+        if (verbose)
+            message("Applying global inverse normal transformation.")
+
+        ## You can't do the INT column-wise (by cell) because it will
+        ## set a constant "range" to the probabilities, eliminating
+        ## the differences in confidence across methods we're trying
+        ## to quantify.
+
+        ## You can't do the INT row-wise (by cell-type) because even
+        ## though different cell types exhibit different marginal
+        ## distributions of scores (in SingleR at least), doing the
+        ## transformation row-wise would eliminate any differences in
+        ## which cell types are "hard to predict".  You don't want a
+        ## score of .5 for cytotoxic T cells (hard to predict type) to
+        ## overwhelm a score of .62 from erythroid type 2 (easy to
+        ## predict), even though the first would be extraordinary
+        ## within its cell type and the latter unexceptional within
+        ## its cell type.
 
         X <- inverse_normal_trans(X)
     }
@@ -95,11 +97,11 @@ calculateCategorizationEntropy <-
 
         X <- sweep(expX, MARGIN = 2, STATS = colSums(expX), FUN = "/")
     }
-
+    
     ncat <- nrow(X)
 
     max_ent <- calculate_entropy(rep(1 / ncat, ncat))
-
+    
     if (verbose) {
         message(
             "Max possible entropy given ", ncat, " categories: ",
@@ -125,20 +127,28 @@ calculateCategorizationEntropy <-
 }
 
 
+#' @noRd
+#' @keywords internal
 calculate_entropy <- function(p) {
     # p is one column of X, a vector of probabilities summing to 1.
-
     nonzeros <- p != 0
-    
+
     -sum(p[nonzeros] * log(p[nonzeros]))
 }
 
+
+#' @noRd
+#' @keywords internal
 n_elements <- function(X) {
     ifelse(is.matrix(X),
            prod(dim(X)),
            length(X))
 }
 
+
+#' @noRd
+#' @keywords internal
+#' @importFrom stats qnorm
 inverse_normal_trans <- function(X, constant = 3 / 8) {
     n <- n_elements(X)
 

diff --git a/R/calculateHVGOverlap.R b/R/calculateHVGOverlap.R
@@ -43,7 +43,10 @@
 #' 
 #' # Divide the data into reference and query datasets
 #' set.seed(100)
-#' indices <- sample(ncol(assay(sce)), size = floor(0.7 * ncol(assay(sce))), replace = FALSE)
+#'
+#' indices <- sample(ncol(assay(sce)), size = floor(0.7 *
+#'                   ncol(assay(sce))), replace = FALSE)
+#'
 #' ref_data <- sce[, indices]
 #' query_data <- sce[, -indices]
 #' 
@@ -56,14 +59,17 @@
 #' ref_var <- getTopHVGs(ref_data, n=2000)
 #' query_var <- getTopHVGs(query_data, n=2000)
 #' 
-#' overlap_coefficient <- calculateHVGOverlap(reference_genes = ref_var, 
-#'                                           query_genes = query_var)
+#' overlap_coefficient <- calculateHVGOverlap(
+#'     reference_genes = ref_var, 
+#'     query_genes = query_var
+#' )
 #' 
 #' @export                                       
-calculateHVGOverlap <- function(reference_genes, query_genes) {
-
+calculateHVGOverlap <- 
+    function(reference_genes, query_genes) 
+{
     ## Sanity checks
-    ## FIXME: Use BiocUtils
+    ## FIXME: Use BiocBaseUtils
     if (!is.vector(reference_genes) || !is.character(reference_genes)) {
         stop("reference_genes must be a character vector.")
     }

diff --git a/R/calculatePairwiseDistancesAndPlotDensity.R b/R/calculatePairwiseDistancesAndPlotDensity.R
@@ -68,7 +68,8 @@
 #' ref_data <- logNormCounts(ref_data)
 #' query_data <- logNormCounts(query_data)
 #'
-#' # Get cell type scores using SingleR (or any other cell type annotation method)
+#' # Get cell type scores using SingleR (or any other cell type
+#' # annotation method)
 #' scores <- SingleR(query_data, ref_data, labels = ref_data$reclustered.broad)
 #'
 #' # Add labels to query object
@@ -84,19 +85,24 @@
 #' ref_data_subset <- ref_data[common_genes, ]
 #' query_data_subset <- query_data[common_genes, ]
 #'
-#' # Example usage of the function
-#' calculatePairwiseDistancesAndPlotDensity(query_data = query_data_subset, 
-#'                                          reference_data = ref_data_subset, 
-#'                                          query_cell_type_col = "labels", 
-#'                                          ref_cell_type_col = "reclustered.broad", 
-#'                                          cell_type_query = "CD8", 
-#'                                          cell_type_reference = "CD8", 
-#'                                          distance_metric = "euclidean")
+#' ## Example usage of the function
+#'
+#' calculatePairwiseDistancesAndPlotDensity(
+#'     query_data = query_data_subset, 
+#'     reference_data = ref_data_subset,
+#'     query_cell_type_col = "labels", 
+#'     ref_cell_type_col = "reclustered.broad", 
+#'     cell_type_query = "CD8", 
+#'     cell_type_reference = "CD8", 
+#'     distance_metric = "euclidean"
+#' )
 #' 
 #' @importFrom ggplot2 ggplot
+#' @importFrom rlang .data
 #' @importFrom stats cor dist
 #' @importFrom SingleCellExperiment SingleCellExperiment
-#' @importFrom SummarizedExperiment assay                                       
+#' @importFrom SummarizedExperiment assay      
+#' @importFrom methods is
 #'
 #' @export
 calculatePairwiseDistancesAndPlotDensity <-
@@ -107,24 +113,27 @@ calculatePairwiseDistancesAndPlotDensity <-
              cell_type_query, 
              cell_type_reference, 
              distance_metric, 
-             correlation_method = "pearson") {
+             correlation_method = c("pearson", "spearman"))
+{
     ## Sanity checks
 
     ## Check if query_data is a SingleCellExperiment object
     if (!is(query_data, "SingleCellExperiment")) {
         stop("query_data must be a SingleCellExperiment object.")
     }
-
+        
     ## Check if reference_data is a SingleCellExperiment object
     if (!is(reference_data, "SingleCellExperiment")) {
         stop("reference_data must be a SingleCellExperiment object.")
     }
 
     ## Subset query and reference data to the specified cell type
-    query_data_subset <- query_data[, !is.na(query_data[[query_cell_type_col]]) & 
-                                      query_data[[query_cell_type_col]] == cell_type_query]
-    ref_data_subset <- reference_data[, !is.na(reference_data[[ref_cell_type_col]]) & 
-                                        reference_data[[ref_cell_type_col]] == cell_type_reference]
+    query_data_subset <-
+        query_data[, !is.na(query_data[[query_cell_type_col]]) & 
+                     query_data[[query_cell_type_col]] == cell_type_query]
+    ref_data_subset <-
+        reference_data[, !is.na(reference_data[[ref_cell_type_col]]) & 
+                         reference_data[[ref_cell_type_col]] == cell_type_reference]
 
     ## Convert to matrix
     query_mat <- t(as.matrix(assay(query_data_subset, "logcounts")))
@@ -134,29 +143,28 @@ calculatePairwiseDistancesAndPlotDensity <-
     combined_mat <- rbind(query_mat, ref_mat)
 
     ## Calculate pairwise distances or correlations for all comparisons
+    correlation_method <- match.arg(correlation_method)
     if (distance_metric == "correlation") {
-        if (correlation_method == "pearson") {
-            dist_matrix <- cor(t(combined_mat), method = "pearson")
-        } else if (correlation_method == "spearman") {
-            dist_matrix <- cor(t(combined_mat), method = "spearman")
-        } else {
-            stop("Invalid correlation method. Available options: 'pearson', 'spearman'")
-        }
+        dist_matrix <- cor(t(combined_mat), method = correlation_method)
     } else {
         dist_matrix <- dist(combined_mat, method = distance_metric)
     }
 
     ## Convert dist_matrix to a square matrix
     dist_matrix <- as.matrix(dist_matrix)
 
-    ## Extract the distances or correlations for the different pairwise comparisons
+    ## Extract the distances or correlations for the different pairwise
+    ## comparisons
     num_query_cells <- nrow(query_mat)
     num_ref_cells <- nrow(ref_mat)
-    dist_query_query <- dist_matrix[1:num_query_cells, 1:num_query_cells]
-    dist_ref_ref <- dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), 
-                              (num_query_cells + 1):(num_query_cells + num_ref_cells)]
-    dist_query_ref <- dist_matrix[1:num_query_cells,
-                                  (num_query_cells + 1):(num_query_cells + num_ref_cells)]
+    dist_query_query <-
+        dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)]
+    dist_ref_ref <-
+        dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), 
+                    (num_query_cells + 1):(num_query_cells + num_ref_cells)]
+    dist_query_ref <-
+        dist_matrix[seq_len(num_query_cells),
+                    (num_query_cells + 1):(num_query_cells + num_ref_cells)]
 
     ## Create data frame for plotting
     dist_df <- data.frame(
@@ -169,7 +177,7 @@ calculatePairwiseDistancesAndPlotDensity <-
     )
 
     ## Plot density plots
-    ggplot(dist_df, aes(x = Distance, color = Comparison)) +
+    ggplot(dist_df, aes(x = .data$Distance, color = .data$Comparison)) +
         geom_density() +
         labs(x = ifelse(distance_metric == "correlation", 
                         paste(correlation_method, "correlation"),
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,3 +3,5 @@ @@
     ^doc$
     ^Meta$
     ^\.github$
+    ^docs$
+    ^pkgdown$
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# scDiagnostics 0.99.0

		* Initial CRAN submission.