Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Entropy #27

Closed
wants to merge 11 commits into from
7 changes: 0 additions & 7 deletions .Rbuildignore

This file was deleted.

7 changes: 0 additions & 7 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,8 @@ export(plotQCvsAnnotation)
export(regressPC)
export(visualizeCellTypeMDS)
import(SingleCellExperiment)
importFrom(BiocGenerics,t)
importFrom(S4Vectors,metadata)
importFrom(SingleCellExperiment,SingleCellExperiment)
importFrom(SingleCellExperiment,logcounts)
importFrom(SingleCellExperiment,reducedDim)
importFrom(SingleCellExperiment,reducedDimNames)
importFrom(SummarizedExperiment,assay)
importFrom(SummarizedExperiment,colData)
importFrom(ggplot2,aes)
Expand All @@ -41,14 +37,11 @@ importFrom(ggplot2,xlab)
importFrom(ggplot2,ylab)
importFrom(ggplot2,ylim)
importFrom(gridExtra,grid.arrange)
importFrom(isotree,isolation.forest)
importFrom(methods,is)
importFrom(rlang,.data)
importFrom(scater,plotPCA)
importFrom(scater,plotReducedDim)
importFrom(scater,plotTSNE)
importFrom(scater,plotUMAP)
importFrom(scater,runPCA)
importFrom(stats,cmdscale)
importFrom(stats,cor)
importFrom(stats,dist)
Expand Down
138 changes: 69 additions & 69 deletions R/calculatePairwiseDistancesAndPlotDensity.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,74 +114,74 @@ calculatePairwiseDistancesAndPlotDensity <-
cell_type_reference,
distance_metric,
correlation_method = c("pearson", "spearman"))
{
## Sanity checks

## Check if query_data is a SingleCellExperiment object
if (!is(query_data, "SingleCellExperiment")) {
stop("query_data must be a SingleCellExperiment object.")
}
{
## Sanity checks

## Check if reference_data is a SingleCellExperiment object
if (!is(reference_data, "SingleCellExperiment")) {
stop("reference_data must be a SingleCellExperiment object.")
}

## Subset query and reference data to the specified cell type
query_data_subset <-
query_data[, !is.na(query_data[[query_cell_type_col]]) &
query_data[[query_cell_type_col]] == cell_type_query]
ref_data_subset <-
reference_data[, !is.na(reference_data[[ref_cell_type_col]]) &
reference_data[[ref_cell_type_col]] == cell_type_reference]

## Convert to matrix
query_mat <- t(as.matrix(assay(query_data_subset, "logcounts")))
ref_mat <- t(as.matrix(assay(ref_data_subset, "logcounts")))

## Combine query and reference matrices
combined_mat <- rbind(query_mat, ref_mat)

## Calculate pairwise distances or correlations for all comparisons
correlation_method <- match.arg(correlation_method)
if (distance_metric == "correlation") {
dist_matrix <- cor(t(combined_mat), method = correlation_method)
} else {
dist_matrix <- dist(combined_mat, method = distance_metric)
## Check if query_data is a SingleCellExperiment object
if (!is(query_data, "SingleCellExperiment")) {
stop("query_data must be a SingleCellExperiment object.")
}

## Check if reference_data is a SingleCellExperiment object
if (!is(reference_data, "SingleCellExperiment")) {
stop("reference_data must be a SingleCellExperiment object.")
}

## Subset query and reference data to the specified cell type
query_data_subset <-
query_data[, !is.na(query_data[[query_cell_type_col]]) &
query_data[[query_cell_type_col]] == cell_type_query]
ref_data_subset <-
reference_data[, !is.na(reference_data[[ref_cell_type_col]]) &
reference_data[[ref_cell_type_col]] == cell_type_reference]

## Convert to matrix
query_mat <- t(as.matrix(assay(query_data_subset, "logcounts")))
ref_mat <- t(as.matrix(assay(ref_data_subset, "logcounts")))

## Combine query and reference matrices
combined_mat <- rbind(query_mat, ref_mat)

## Calculate pairwise distances or correlations for all comparisons
correlation_method <- match.arg(correlation_method)
if (distance_metric == "correlation") {
dist_matrix <- cor(t(combined_mat), method = correlation_method)
} else {
dist_matrix <- dist(combined_mat, method = distance_metric)
}

## Convert dist_matrix to a square matrix
dist_matrix <- as.matrix(dist_matrix)

## Extract the distances or correlations for the different pairwise
## comparisons
num_query_cells <- nrow(query_mat)
num_ref_cells <- nrow(ref_mat)
dist_query_query <-
dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)]
dist_ref_ref <-
dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells),
(num_query_cells + 1):(num_query_cells + num_ref_cells)]
dist_query_ref <-
dist_matrix[seq_len(num_query_cells),
(num_query_cells + 1):(num_query_cells + num_ref_cells)]

## Create data frame for plotting
dist_df <- data.frame(
Comparison = c(rep("Query vs Query", length(dist_query_query)),
rep("Reference vs Reference", length(dist_ref_ref)),
rep("Query vs Reference", length(dist_query_ref))),
Distance = c(as.vector(dist_query_query),
as.vector(dist_ref_ref),
as.vector(dist_query_ref))
)

## Plot density plots
ggplot(dist_df, aes(x = .data$Distance, color = .data$Comparison)) +
geom_density() +
labs(x = ifelse(distance_metric == "correlation",
paste(correlation_method, "correlation"),
"Distance"), y = "Density",
title = "Pairwise Distance Analysis and Density Visualization") +
theme_bw()
}

## Convert dist_matrix to a square matrix
dist_matrix <- as.matrix(dist_matrix)

## Extract the distances or correlations for the different pairwise
## comparisons
num_query_cells <- nrow(query_mat)
num_ref_cells <- nrow(ref_mat)
dist_query_query <-
dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)]
dist_ref_ref <-
dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells),
(num_query_cells + 1):(num_query_cells + num_ref_cells)]
dist_query_ref <-
dist_matrix[seq_len(num_query_cells),
(num_query_cells + 1):(num_query_cells + num_ref_cells)]

## Create data frame for plotting
dist_df <- data.frame(
Comparison = c(rep("Query vs Query", length(dist_query_query)),
rep("Reference vs Reference", length(dist_ref_ref)),
rep("Query vs Reference", length(dist_query_ref))),
Distance = c(as.vector(dist_query_query),
as.vector(dist_ref_ref),
as.vector(dist_query_ref))
)

## Plot density plots
ggplot(dist_df, aes(x = .data$Distance, color = .data$Comparison)) +
geom_density() +
labs(x = ifelse(distance_metric == "correlation",
paste(correlation_method, "correlation"),
"Distance"), y = "Density",
title = "Pairwise Distance Analysis and Density Visualization") +
theme_bw()
}
159 changes: 79 additions & 80 deletions R/outlierDetection.R
Original file line number Diff line number Diff line change
@@ -1,47 +1,32 @@
#' Calculate cell outlier scores using isolation forests
#'
#' @description This function calculates outlier scores for each cell
#' using \link[isotree]{isolation.forest}. Isolation forests
#' explicitly try to estimate the isolation of datapoints, in
#' contrast to estimating the full distribution of the data. To
#' quote the isotree documentation: "Isolation Forest is an
#' algorithm originally developed for outlier detection that
#' consists in splitting sub-samples of the data according to some
#' attribute/feature/column at random". Basically, it calculates
#' how quickly each cell gets split off from the rest of the group
#' when the dataset is randomly partitioned by a hyperplane. There
#' are many adjustable parameters (e.g. number of trees,
#' dimensionality of the splits, etc), but the defaults usually
#' work pretty well.
#'
#' @details If \code{use_pcs} is turned on, it will use all available
#' PCs in the PCA reduced dimension slot if it exists, and run
#' \link[scater]{runPCA} with default settings and use that if it
#' does not.
#'
#' \code{prediction_thresh} is the threshold to use on the
#' isoforest score predictions
#'
#' @param sce a SingleCellExperiment object containing a logcounts
#' matrix
#'
#' @param dimred if specified, plot the specified reduced dimension
#' with \link[scater]{plotReducedDim}, colored by outlier score
#'
#' @param use_pcs if TRUE, run the outlier detection in principal
#' component space. Otherwise, use logcounts.
#'
#' @param prediction_thresh threshold to use for binary outlier
#' calling
#'
#' @param plot if TRUE and a dimred is specified, plot the isoscore
#' against the specified dimred
#'
#' @description This function calculates outlier scores for each cell using
#' \link[isotree]{isolation.forest}. Isolation forests explicitly try to
#' estimate the isolation of datapoints, in contrast to estimating the full
#' distribution of the data. To quote the isotree documentation: "Isolation
#' Forest is an algorithm originally developed for outlier detection that
#' consists in splitting sub-samples of the data according to some
#' attribute/feature/column at random." Basically, it calculates how quickly
#' each cell gets split off from the rest of the group when the dataset is
#' randomly partitioned by a hyperplane. There are many adjustable parameters
#' (e.g. number of trees, dimensionality of the splits, etc), but the defaults
#' usually work pretty well.
#' @param sce a SingleCellExperiment object containing a logcounts matrix
#' @param group.var if specified, run the outlier scoring by group (e.g. cell
#' cluster) instead of across all cells globally.
#' @param use_pcs if TRUE, run the outlier detection in principal component
#' space. Otherwise, use logcounts.
#' @param prediction_thresh threshold to use for binary outlier calling
#' @param ... arguments to pass to \link[isotree]{isolation.forest}
#' @details If \code{use_pcs} is turned on, it will use all available PCs in the
#' PCA reduced dimension slot if it exists, and run \link[scater]{runPCA} with
#' default settings and use that if it does not.
#'
#' @return the SingleCellExperiment object with \code{outlier_score}
#' and \code{is_outlier} added to the colData. Also, the resulting
#' isotree model is attached to the object's metadata.
#' \code{prediction_thresh} is the threshold to use on the isoforest score
#' predictions
#' @returns the SingleCellExperiment object with \code{outlier_score} and
#' \code{is_outlier} added to the colData. Also, the resulting isotree model
#' is attached to the object's metadata.
#'
#' @examples
#' library(scater)
Expand All @@ -57,31 +42,56 @@
#'
#' query_data <- runPCA(query_data)
#'
#' query_data <- calculateOutlierScore(
#' query_data,
#' plot = TRUE,
#' dimred = "PCA",
#' use_pcs = TRUE
#' )
#'
#' @importFrom scater runPCA plotReducedDim
#' @importFrom SingleCellExperiment SingleCellExperiment
#' reducedDimNames reducedDim logcounts
#' @importFrom S4Vectors metadata
#' @importFrom BiocGenerics t
#' @importFrom isotree isolation.forest
#'
#' query_data <- calculateOutlierScore(query_data, use_pcs = TRUE)
#' @export
calculateOutlierScore <-
function(sce,
dimred = NULL,
use_pcs = FALSE,
plot = TRUE,
prediction_thresh = 0.5,
...)
{
calculateOutlierScore <- function(sce,
group.var = NULL,
use_pcs = FALSE,
prediction_thresh = 0.5,
...) {
if (!is.null(group.var)) {
if (is.null(colnames(sce))) stop("This function requires the input SCE to have unique column names when calculating group-wise outlier scores.")

# There's probably a better way to split an sce by a factor...
# Anyway, manually passing it through factor() like this way retains NAs.
cluster_list <- lapply(
split(
colData(sce),
factor(colData(sce)[[group.var]],
exclude = NULL
)
),
\(x) sce[, rownames(x)]
)

# TODO: assess if it would be worth making the threshold adaptive. Different
# clusters can have different distributions of outlier scores.
score_list <- lapply(cluster_list,
calculateOutlierScore,
group.var = NULL,
use_pcs = use_pcs,
prediction_thresh = prediction_thresh
)

score_df <- Reduce(
lapply(
score_list,
\(x) colData(x)[, c("outlier_score", "is_outlier")]
),
f = rbind
)

# If we were using an ID column instead of column names of the input we could
# do a join.
score_df <- score_df[rownames(colData(sce)), ]
colData(sce) <- cbind(colData(sce), score_df)

S4Vectors::metadata(sce)$cluster_isotree_model <- lapply(score_list, S4Vectors::metadata)

return(sce)
}

if (use_pcs) {

if (!("PCA" %in% SingleCellExperiment::reducedDimNames(sce))) {
sce <- scater::runPCA(sce)
}
Expand All @@ -93,29 +103,18 @@ calculateOutlierScore <-
SingleCellExperiment::logcounts() |>
BiocGenerics::t()
}

isotree_res <- X |>
isotree::isolation.forest(
output_score = TRUE,
...
)

## V These scores are the same as running
## predict(isotree_res$model, X)

# V These scores are the same as running predict(isotree_res$model, X)
sce$outlier_score <- isotree_res$scores
sce$is_outlier <- isotree_res$scores >= prediction_thresh

if (!is.null(dimred) && plot) {
## TODO: make this plot work when scater is only in the
## Suggests, or re-do it manually with ggplot.
p <- scater::plotReducedDim(sce, dimred,
color_by = "outlier_score",
shape_by = "is_outlier"
)
print(p)
}


S4Vectors::metadata(sce)$isotree_model <- isotree_res$model

sce
return(sce)
}
Loading
Loading