Skip to content

Commit

Permalink
Merge pull request #26 from ccb-hms/nt_fix_package
Browse files Browse the repository at this point in the history
Fix package and get it ready for submission
  • Loading branch information
lgeistlinger authored Apr 14, 2024
2 parents 7940774 + e41b0ea commit ec2977c
Show file tree
Hide file tree
Showing 92 changed files with 5,318 additions and 1,695 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@
^doc$
^Meta$
^\.github$
^docs$
^pkgdown$
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
.Rcheck/
*.log
*.pdf
*.html
*.tex
*.docx

Expand Down
20 changes: 14 additions & 6 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,28 @@ Description: The scDiagnostics package provides diagnostic plots to
expression patterns, and explore relationships between different
cell types in query and reference datasets allowing users to
detect potential misalignments between reference and query
datasets.
datasets. The package also provides visualization capabilities for
diagnositics purposes.
License: Artistic-2.0
URL: https://github.com/ccb-hms/scDiagnostics
BugReports: https://github.com/ccb-hms/scDiagnostics/issues
Depends:
R (>= 4.2.0),
SingleCellExperiment
R (>= 4.2.0)
Imports:
SingleCellExperiment,
isotree,
methods,
rlang,
ggplot2,
gridExtra,
scater,
SummarizedExperiment,
AUCell,
BiocGenerics,
S4Vectors
S4Vectors,
stats,
utils
Suggests:
AUCell,
BiocStyle,
corrplot,
knitr,
Expand All @@ -41,9 +46,12 @@ Suggests:
scran,
scRNAseq,
SingleR,
celldex
celldex,
testthat (>= 3.0.0)
VignetteBuilder:
knitr
biocViews: SingleCell, Software
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.1
Config/testthat/edition: 3
14 changes: 14 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,30 @@ importFrom(SingleCellExperiment,logcounts)
importFrom(SingleCellExperiment,reducedDim)
importFrom(SingleCellExperiment,reducedDimNames)
importFrom(SummarizedExperiment,assay)
importFrom(SummarizedExperiment,colData)
importFrom(ggplot2,aes)
importFrom(ggplot2,element_blank)
importFrom(ggplot2,geom_density)
importFrom(ggplot2,geom_histogram)
importFrom(ggplot2,geom_line)
importFrom(ggplot2,geom_point)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,guide_legend)
importFrom(ggplot2,guides)
importFrom(ggplot2,labs)
importFrom(ggplot2,scale_color_gradient)
importFrom(ggplot2,scale_color_manual)
importFrom(ggplot2,scale_x_continuous)
importFrom(ggplot2,theme)
importFrom(ggplot2,theme_bw)
importFrom(ggplot2,theme_minimal)
importFrom(ggplot2,xlab)
importFrom(ggplot2,ylab)
importFrom(ggplot2,ylim)
importFrom(gridExtra,grid.arrange)
importFrom(isotree,isolation.forest)
importFrom(methods,is)
importFrom(rlang,.data)
importFrom(scater,plotPCA)
importFrom(scater,plotReducedDim)
importFrom(scater,plotTSNE)
Expand All @@ -41,3 +53,5 @@ importFrom(stats,cmdscale)
importFrom(stats,cor)
importFrom(stats,dist)
importFrom(stats,lm)
importFrom(stats,qnorm)
importFrom(utils,tail)
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# scDiagnostics 0.99.0

* Initial CRAN submission.
66 changes: 38 additions & 28 deletions R/calculateCategorizationEntropy.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#' -sum(p*log(p))} . If that input vector p is a uniform
#' distribution over the \code{length(p)} categories, the entropy
#' will be a high as possible.
#'
#'
#' @param X a matrix of category scores
#'
#' @param inverse_normal_transform if TRUE, apply
Expand All @@ -33,47 +33,49 @@
#' @param plot if TRUE, plot a histogram of the entropies
#'
#' @return A vector of entropy values for each column in X.
#'
#'
#' @examples
#' # Simulate 500 cells with scores on 4 possible cell types
#' X <- rnorm(500 * 4) |> matrix(nrow = 4)
#'
#' # Make the first category highly scored in the first 250 cells
#' X[1, 1:250] <- X[1, 1:250] + 5
#' X[1, 1:250] <- X[1, 1:250] + 5
#'
#' # The function will issue a message about softmaxing the scores,
#' # and the entropy histogram will be bimodal since we made half of
#' # the cells clearly category 1 while the other half are roughly
#' # even.
#' # entropy_scores <- calculateCategorizationEntropy(X)
#'
#'
#' @importFrom ggplot2 geom_histogram theme_bw
#'
#' @export
calculateCategorizationEntropy <-
calculateCategorizationEntropy <-
function(X,
inverse_normal_transform = FALSE,
plot = TRUE,
verbose = TRUE) {
verbose = TRUE)
{
if (inverse_normal_transform) {
# https://cran.r-project.org/web/packages/RNOmni/vignettes/RNOmni.html#inverse-normal-transformation
if (verbose) message("Applying global inverse normal transformation.")

# You can't do the INT column-wise (by cell) because it will
# set a constant "range" to the probabilities, eliminating the
# differences in confidence across methods we're trying to
# quantify.

# You can't do the INT row-wise (by cell-type) because even
# though different cell types exhibit different marginal
# distributions of scores (in SingleR at least), doing the
# transformation row-wise would eliminate any differences in
# which cell types are "hard to predict". You don't want a
# score of .5 for cytotoxic T cells (hard to predict type) to
# overwhelm a score of .62 from erythroid type 2 (easy to
# predict), even though the first would be extraordinary
# within its cell type and the latter unexceptional within its
# cell type.
## https://cran.r-project.org/web/packages/RNOmni/vignettes/RNOmni.html#inverse-normal-transformation
if (verbose)
message("Applying global inverse normal transformation.")

## You can't do the INT column-wise (by cell) because it will
## set a constant "range" to the probabilities, eliminating
## the differences in confidence across methods we're trying
## to quantify.

## You can't do the INT row-wise (by cell-type) because even
## though different cell types exhibit different marginal
## distributions of scores (in SingleR at least), doing the
## transformation row-wise would eliminate any differences in
## which cell types are "hard to predict". You don't want a
## score of .5 for cytotoxic T cells (hard to predict type) to
## overwhelm a score of .62 from erythroid type 2 (easy to
## predict), even though the first would be extraordinary
## within its cell type and the latter unexceptional within
## its cell type.

X <- inverse_normal_trans(X)
}
Expand All @@ -95,11 +97,11 @@ calculateCategorizationEntropy <-

X <- sweep(expX, MARGIN = 2, STATS = colSums(expX), FUN = "/")
}

ncat <- nrow(X)

max_ent <- calculate_entropy(rep(1 / ncat, ncat))

if (verbose) {
message(
"Max possible entropy given ", ncat, " categories: ",
Expand All @@ -125,20 +127,28 @@ calculateCategorizationEntropy <-
}


#' @noRd
#' @keywords internal
calculate_entropy <- function(p) {
# p is one column of X, a vector of probabilities summing to 1.

nonzeros <- p != 0

-sum(p[nonzeros] * log(p[nonzeros]))
}


#' @noRd
#' @keywords internal
n_elements <- function(X) {
ifelse(is.matrix(X),
prod(dim(X)),
length(X))
}


#' @noRd
#' @keywords internal
#' @importFrom stats qnorm
inverse_normal_trans <- function(X, constant = 3 / 8) {
n <- n_elements(X)

Expand Down
18 changes: 12 additions & 6 deletions R/calculateHVGOverlap.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@
#'
#' # Divide the data into reference and query datasets
#' set.seed(100)
#' indices <- sample(ncol(assay(sce)), size = floor(0.7 * ncol(assay(sce))), replace = FALSE)
#'
#' indices <- sample(ncol(assay(sce)), size = floor(0.7 *
#' ncol(assay(sce))), replace = FALSE)
#'
#' ref_data <- sce[, indices]
#' query_data <- sce[, -indices]
#'
Expand All @@ -56,14 +59,17 @@
#' ref_var <- getTopHVGs(ref_data, n=2000)
#' query_var <- getTopHVGs(query_data, n=2000)
#'
#' overlap_coefficient <- calculateHVGOverlap(reference_genes = ref_var,
#' query_genes = query_var)
#' overlap_coefficient <- calculateHVGOverlap(
#' reference_genes = ref_var,
#' query_genes = query_var
#' )
#'
#' @export
calculateHVGOverlap <- function(reference_genes, query_genes) {

calculateHVGOverlap <-
function(reference_genes, query_genes)
{
## Sanity checks
## FIXME: Use BiocUtils
## FIXME: Use BiocBaseUtils
if (!is.vector(reference_genes) || !is.character(reference_genes)) {
stop("reference_genes must be a character vector.")
}
Expand Down
68 changes: 38 additions & 30 deletions R/calculatePairwiseDistancesAndPlotDensity.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@
#' ref_data <- logNormCounts(ref_data)
#' query_data <- logNormCounts(query_data)
#'
#' # Get cell type scores using SingleR (or any other cell type annotation method)
#' # Get cell type scores using SingleR (or any other cell type
#' # annotation method)
#' scores <- SingleR(query_data, ref_data, labels = ref_data$reclustered.broad)
#'
#' # Add labels to query object
Expand All @@ -84,19 +85,24 @@
#' ref_data_subset <- ref_data[common_genes, ]
#' query_data_subset <- query_data[common_genes, ]
#'
#' # Example usage of the function
#' calculatePairwiseDistancesAndPlotDensity(query_data = query_data_subset,
#' reference_data = ref_data_subset,
#' query_cell_type_col = "labels",
#' ref_cell_type_col = "reclustered.broad",
#' cell_type_query = "CD8",
#' cell_type_reference = "CD8",
#' distance_metric = "euclidean")
#' ## Example usage of the function
#'
#' calculatePairwiseDistancesAndPlotDensity(
#' query_data = query_data_subset,
#' reference_data = ref_data_subset,
#' query_cell_type_col = "labels",
#' ref_cell_type_col = "reclustered.broad",
#' cell_type_query = "CD8",
#' cell_type_reference = "CD8",
#' distance_metric = "euclidean"
#' )
#'
#' @importFrom ggplot2 ggplot
#' @importFrom rlang .data
#' @importFrom stats cor dist
#' @importFrom SingleCellExperiment SingleCellExperiment
#' @importFrom SummarizedExperiment assay
#' @importFrom SummarizedExperiment assay
#' @importFrom methods is
#'
#' @export
calculatePairwiseDistancesAndPlotDensity <-
Expand All @@ -107,24 +113,27 @@ calculatePairwiseDistancesAndPlotDensity <-
cell_type_query,
cell_type_reference,
distance_metric,
correlation_method = "pearson") {
correlation_method = c("pearson", "spearman"))
{
## Sanity checks

## Check if query_data is a SingleCellExperiment object
if (!is(query_data, "SingleCellExperiment")) {
stop("query_data must be a SingleCellExperiment object.")
}

## Check if reference_data is a SingleCellExperiment object
if (!is(reference_data, "SingleCellExperiment")) {
stop("reference_data must be a SingleCellExperiment object.")
}

## Subset query and reference data to the specified cell type
query_data_subset <- query_data[, !is.na(query_data[[query_cell_type_col]]) &
query_data[[query_cell_type_col]] == cell_type_query]
ref_data_subset <- reference_data[, !is.na(reference_data[[ref_cell_type_col]]) &
reference_data[[ref_cell_type_col]] == cell_type_reference]
query_data_subset <-
query_data[, !is.na(query_data[[query_cell_type_col]]) &
query_data[[query_cell_type_col]] == cell_type_query]
ref_data_subset <-
reference_data[, !is.na(reference_data[[ref_cell_type_col]]) &
reference_data[[ref_cell_type_col]] == cell_type_reference]

## Convert to matrix
query_mat <- t(as.matrix(assay(query_data_subset, "logcounts")))
Expand All @@ -134,29 +143,28 @@ calculatePairwiseDistancesAndPlotDensity <-
combined_mat <- rbind(query_mat, ref_mat)

## Calculate pairwise distances or correlations for all comparisons
correlation_method <- match.arg(correlation_method)
if (distance_metric == "correlation") {
if (correlation_method == "pearson") {
dist_matrix <- cor(t(combined_mat), method = "pearson")
} else if (correlation_method == "spearman") {
dist_matrix <- cor(t(combined_mat), method = "spearman")
} else {
stop("Invalid correlation method. Available options: 'pearson', 'spearman'")
}
dist_matrix <- cor(t(combined_mat), method = correlation_method)
} else {
dist_matrix <- dist(combined_mat, method = distance_metric)
}

## Convert dist_matrix to a square matrix
dist_matrix <- as.matrix(dist_matrix)

## Extract the distances or correlations for the different pairwise comparisons
## Extract the distances or correlations for the different pairwise
## comparisons
num_query_cells <- nrow(query_mat)
num_ref_cells <- nrow(ref_mat)
dist_query_query <- dist_matrix[1:num_query_cells, 1:num_query_cells]
dist_ref_ref <- dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells),
(num_query_cells + 1):(num_query_cells + num_ref_cells)]
dist_query_ref <- dist_matrix[1:num_query_cells,
(num_query_cells + 1):(num_query_cells + num_ref_cells)]
dist_query_query <-
dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)]
dist_ref_ref <-
dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells),
(num_query_cells + 1):(num_query_cells + num_ref_cells)]
dist_query_ref <-
dist_matrix[seq_len(num_query_cells),
(num_query_cells + 1):(num_query_cells + num_ref_cells)]

## Create data frame for plotting
dist_df <- data.frame(
Expand All @@ -169,7 +177,7 @@ calculatePairwiseDistancesAndPlotDensity <-
)

## Plot density plots
ggplot(dist_df, aes(x = Distance, color = Comparison)) +
ggplot(dist_df, aes(x = .data$Distance, color = .data$Comparison)) +
geom_density() +
labs(x = ifelse(distance_metric == "correlation",
paste(correlation_method, "correlation"),
Expand Down
Loading

0 comments on commit ec2977c

Please sign in to comment.