diff --git a/.Rbuildignore b/.Rbuildignore deleted file mode 100644 index 8b06f02..0000000 --- a/.Rbuildignore +++ /dev/null @@ -1,7 +0,0 @@ -^.*\.Rproj$ -^\.Rproj\.user$ -^doc$ -^Meta$ -^\.github$ -^docs$ -^pkgdown$ diff --git a/NAMESPACE b/NAMESPACE index 1ede8e8..5d06f0e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,12 +13,8 @@ export(plotQCvsAnnotation) export(regressPC) export(visualizeCellTypeMDS) import(SingleCellExperiment) -importFrom(BiocGenerics,t) -importFrom(S4Vectors,metadata) importFrom(SingleCellExperiment,SingleCellExperiment) -importFrom(SingleCellExperiment,logcounts) importFrom(SingleCellExperiment,reducedDim) -importFrom(SingleCellExperiment,reducedDimNames) importFrom(SummarizedExperiment,assay) importFrom(SummarizedExperiment,colData) importFrom(ggplot2,aes) @@ -41,14 +37,11 @@ importFrom(ggplot2,xlab) importFrom(ggplot2,ylab) importFrom(ggplot2,ylim) importFrom(gridExtra,grid.arrange) -importFrom(isotree,isolation.forest) importFrom(methods,is) importFrom(rlang,.data) importFrom(scater,plotPCA) -importFrom(scater,plotReducedDim) importFrom(scater,plotTSNE) importFrom(scater,plotUMAP) -importFrom(scater,runPCA) importFrom(stats,cmdscale) importFrom(stats,cor) importFrom(stats,dist) diff --git a/R/calculatePairwiseDistancesAndPlotDensity.R b/R/calculatePairwiseDistancesAndPlotDensity.R index 393b14a..2cafcf2 100644 --- a/R/calculatePairwiseDistancesAndPlotDensity.R +++ b/R/calculatePairwiseDistancesAndPlotDensity.R @@ -114,74 +114,74 @@ calculatePairwiseDistancesAndPlotDensity <- cell_type_reference, distance_metric, correlation_method = c("pearson", "spearman")) -{ - ## Sanity checks - - ## Check if query_data is a SingleCellExperiment object - if (!is(query_data, "SingleCellExperiment")) { - stop("query_data must be a SingleCellExperiment object.") - } + { + ## Sanity checks - ## Check if reference_data is a SingleCellExperiment object - if (!is(reference_data, "SingleCellExperiment")) { - stop("reference_data must be a SingleCellExperiment object.") - } - - ## Subset query and reference data to the specified cell type - query_data_subset <- - query_data[, !is.na(query_data[[query_cell_type_col]]) & - query_data[[query_cell_type_col]] == cell_type_query] - ref_data_subset <- - reference_data[, !is.na(reference_data[[ref_cell_type_col]]) & - reference_data[[ref_cell_type_col]] == cell_type_reference] - - ## Convert to matrix - query_mat <- t(as.matrix(assay(query_data_subset, "logcounts"))) - ref_mat <- t(as.matrix(assay(ref_data_subset, "logcounts"))) - - ## Combine query and reference matrices - combined_mat <- rbind(query_mat, ref_mat) - - ## Calculate pairwise distances or correlations for all comparisons - correlation_method <- match.arg(correlation_method) - if (distance_metric == "correlation") { - dist_matrix <- cor(t(combined_mat), method = correlation_method) - } else { - dist_matrix <- dist(combined_mat, method = distance_metric) + ## Check if query_data is a SingleCellExperiment object + if (!is(query_data, "SingleCellExperiment")) { + stop("query_data must be a SingleCellExperiment object.") + } + + ## Check if reference_data is a SingleCellExperiment object + if (!is(reference_data, "SingleCellExperiment")) { + stop("reference_data must be a SingleCellExperiment object.") + } + + ## Subset query and reference data to the specified cell type + query_data_subset <- + query_data[, !is.na(query_data[[query_cell_type_col]]) & + query_data[[query_cell_type_col]] == cell_type_query] + ref_data_subset <- + reference_data[, !is.na(reference_data[[ref_cell_type_col]]) & + reference_data[[ref_cell_type_col]] == cell_type_reference] + + ## Convert to matrix + query_mat <- t(as.matrix(assay(query_data_subset, "logcounts"))) + ref_mat <- t(as.matrix(assay(ref_data_subset, "logcounts"))) + + ## Combine query and reference matrices + combined_mat <- rbind(query_mat, ref_mat) + + ## Calculate pairwise distances or correlations for all comparisons + correlation_method <- match.arg(correlation_method) + if (distance_metric == "correlation") { + dist_matrix <- cor(t(combined_mat), method = correlation_method) + } else { + dist_matrix <- dist(combined_mat, method = distance_metric) + } + + ## Convert dist_matrix to a square matrix + dist_matrix <- as.matrix(dist_matrix) + + ## Extract the distances or correlations for the different pairwise + ## comparisons + num_query_cells <- nrow(query_mat) + num_ref_cells <- nrow(ref_mat) + dist_query_query <- + dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)] + dist_ref_ref <- + dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), + (num_query_cells + 1):(num_query_cells + num_ref_cells)] + dist_query_ref <- + dist_matrix[seq_len(num_query_cells), + (num_query_cells + 1):(num_query_cells + num_ref_cells)] + + ## Create data frame for plotting + dist_df <- data.frame( + Comparison = c(rep("Query vs Query", length(dist_query_query)), + rep("Reference vs Reference", length(dist_ref_ref)), + rep("Query vs Reference", length(dist_query_ref))), + Distance = c(as.vector(dist_query_query), + as.vector(dist_ref_ref), + as.vector(dist_query_ref)) + ) + + ## Plot density plots + ggplot(dist_df, aes(x = .data$Distance, color = .data$Comparison)) + + geom_density() + + labs(x = ifelse(distance_metric == "correlation", + paste(correlation_method, "correlation"), + "Distance"), y = "Density", + title = "Pairwise Distance Analysis and Density Visualization") + + theme_bw() } - - ## Convert dist_matrix to a square matrix - dist_matrix <- as.matrix(dist_matrix) - - ## Extract the distances or correlations for the different pairwise - ## comparisons - num_query_cells <- nrow(query_mat) - num_ref_cells <- nrow(ref_mat) - dist_query_query <- - dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)] - dist_ref_ref <- - dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), - (num_query_cells + 1):(num_query_cells + num_ref_cells)] - dist_query_ref <- - dist_matrix[seq_len(num_query_cells), - (num_query_cells + 1):(num_query_cells + num_ref_cells)] - - ## Create data frame for plotting - dist_df <- data.frame( - Comparison = c(rep("Query vs Query", length(dist_query_query)), - rep("Reference vs Reference", length(dist_ref_ref)), - rep("Query vs Reference", length(dist_query_ref))), - Distance = c(as.vector(dist_query_query), - as.vector(dist_ref_ref), - as.vector(dist_query_ref)) - ) - - ## Plot density plots - ggplot(dist_df, aes(x = .data$Distance, color = .data$Comparison)) + - geom_density() + - labs(x = ifelse(distance_metric == "correlation", - paste(correlation_method, "correlation"), - "Distance"), y = "Density", - title = "Pairwise Distance Analysis and Density Visualization") + - theme_bw() -} diff --git a/R/outlierDetection.R b/R/outlierDetection.R index 0b5ff83..176c4f5 100644 --- a/R/outlierDetection.R +++ b/R/outlierDetection.R @@ -1,47 +1,32 @@ #' Calculate cell outlier scores using isolation forests #' -#' @description This function calculates outlier scores for each cell -#' using \link[isotree]{isolation.forest}. Isolation forests -#' explicitly try to estimate the isolation of datapoints, in -#' contrast to estimating the full distribution of the data. To -#' quote the isotree documentation: "Isolation Forest is an -#' algorithm originally developed for outlier detection that -#' consists in splitting sub-samples of the data according to some -#' attribute/feature/column at random". Basically, it calculates -#' how quickly each cell gets split off from the rest of the group -#' when the dataset is randomly partitioned by a hyperplane. There -#' are many adjustable parameters (e.g. number of trees, -#' dimensionality of the splits, etc), but the defaults usually -#' work pretty well. -#' -#' @details If \code{use_pcs} is turned on, it will use all available -#' PCs in the PCA reduced dimension slot if it exists, and run -#' \link[scater]{runPCA} with default settings and use that if it -#' does not. -#' -#' \code{prediction_thresh} is the threshold to use on the -#' isoforest score predictions -#' -#' @param sce a SingleCellExperiment object containing a logcounts -#' matrix -#' -#' @param dimred if specified, plot the specified reduced dimension -#' with \link[scater]{plotReducedDim}, colored by outlier score -#' -#' @param use_pcs if TRUE, run the outlier detection in principal -#' component space. Otherwise, use logcounts. -#' -#' @param prediction_thresh threshold to use for binary outlier -#' calling -#' -#' @param plot if TRUE and a dimred is specified, plot the isoscore -#' against the specified dimred -#' +#' @description This function calculates outlier scores for each cell using +#' \link[isotree]{isolation.forest}. Isolation forests explicitly try to +#' estimate the isolation of datapoints, in contrast to estimating the full +#' distribution of the data. To quote the isotree documentation: "Isolation +#' Forest is an algorithm originally developed for outlier detection that +#' consists in splitting sub-samples of the data according to some +#' attribute/feature/column at random." Basically, it calculates how quickly +#' each cell gets split off from the rest of the group when the dataset is +#' randomly partitioned by a hyperplane. There are many adjustable parameters +#' (e.g. number of trees, dimensionality of the splits, etc), but the defaults +#' usually work pretty well. +#' @param sce a SingleCellExperiment object containing a logcounts matrix +#' @param group.var if specified, run the outlier scoring by group (e.g. cell +#' cluster) instead of across all cells globally. +#' @param use_pcs if TRUE, run the outlier detection in principal component +#' space. Otherwise, use logcounts. +#' @param prediction_thresh threshold to use for binary outlier calling #' @param ... arguments to pass to \link[isotree]{isolation.forest} +#' @details If \code{use_pcs} is turned on, it will use all available PCs in the +#' PCA reduced dimension slot if it exists, and run \link[scater]{runPCA} with +#' default settings and use that if it does not. #' -#' @return the SingleCellExperiment object with \code{outlier_score} -#' and \code{is_outlier} added to the colData. Also, the resulting -#' isotree model is attached to the object's metadata. +#' \code{prediction_thresh} is the threshold to use on the isoforest score +#' predictions +#' @returns the SingleCellExperiment object with \code{outlier_score} and +#' \code{is_outlier} added to the colData. Also, the resulting isotree model +#' is attached to the object's metadata. #' #' @examples #' library(scater) @@ -57,31 +42,56 @@ #' #' query_data <- runPCA(query_data) #' -#' query_data <- calculateOutlierScore( -#' query_data, -#' plot = TRUE, -#' dimred = "PCA", -#' use_pcs = TRUE -#' ) -#' -#' @importFrom scater runPCA plotReducedDim -#' @importFrom SingleCellExperiment SingleCellExperiment -#' reducedDimNames reducedDim logcounts -#' @importFrom S4Vectors metadata -#' @importFrom BiocGenerics t -#' @importFrom isotree isolation.forest -#' +#' query_data <- calculateOutlierScore(query_data, use_pcs = TRUE) #' @export -calculateOutlierScore <- - function(sce, - dimred = NULL, - use_pcs = FALSE, - plot = TRUE, - prediction_thresh = 0.5, - ...) -{ +calculateOutlierScore <- function(sce, + group.var = NULL, + use_pcs = FALSE, + prediction_thresh = 0.5, + ...) { + if (!is.null(group.var)) { + if (is.null(colnames(sce))) stop("This function requires the input SCE to have unique column names when calculating group-wise outlier scores.") + + # There's probably a better way to split an sce by a factor... + # Anyway, manually passing it through factor() like this way retains NAs. + cluster_list <- lapply( + split( + colData(sce), + factor(colData(sce)[[group.var]], + exclude = NULL + ) + ), + \(x) sce[, rownames(x)] + ) + + # TODO: assess if it would be worth making the threshold adaptive. Different + # clusters can have different distributions of outlier scores. + score_list <- lapply(cluster_list, + calculateOutlierScore, + group.var = NULL, + use_pcs = use_pcs, + prediction_thresh = prediction_thresh + ) + + score_df <- Reduce( + lapply( + score_list, + \(x) colData(x)[, c("outlier_score", "is_outlier")] + ), + f = rbind + ) + + # If we were using an ID column instead of column names of the input we could + # do a join. + score_df <- score_df[rownames(colData(sce)), ] + colData(sce) <- cbind(colData(sce), score_df) + + S4Vectors::metadata(sce)$cluster_isotree_model <- lapply(score_list, S4Vectors::metadata) + + return(sce) + } + if (use_pcs) { - if (!("PCA" %in% SingleCellExperiment::reducedDimNames(sce))) { sce <- scater::runPCA(sce) } @@ -93,29 +103,18 @@ calculateOutlierScore <- SingleCellExperiment::logcounts() |> BiocGenerics::t() } - + isotree_res <- X |> isotree::isolation.forest( output_score = TRUE, ... ) - - ## V These scores are the same as running - ## predict(isotree_res$model, X) + + # V These scores are the same as running predict(isotree_res$model, X) sce$outlier_score <- isotree_res$scores sce$is_outlier <- isotree_res$scores >= prediction_thresh - - if (!is.null(dimred) && plot) { - ## TODO: make this plot work when scater is only in the - ## Suggests, or re-do it manually with ggplot. - p <- scater::plotReducedDim(sce, dimred, - color_by = "outlier_score", - shape_by = "is_outlier" - ) - print(p) - } - + S4Vectors::metadata(sce)$isotree_model <- isotree_res$model - - sce + + return(sce) } diff --git a/man/calculateOutlierScore.Rd b/man/calculateOutlierScore.Rd index 7ac6b8d..0df0361 100644 --- a/man/calculateOutlierScore.Rd +++ b/man/calculateOutlierScore.Rd @@ -6,59 +6,50 @@ \usage{ calculateOutlierScore( sce, - dimred = NULL, + group.var = NULL, use_pcs = FALSE, - plot = TRUE, prediction_thresh = 0.5, ... ) } \arguments{ -\item{sce}{a SingleCellExperiment object containing a logcounts -matrix} +\item{sce}{a SingleCellExperiment object containing a logcounts matrix} -\item{dimred}{if specified, plot the specified reduced dimension -with \link[scater]{plotReducedDim}, colored by outlier score} +\item{group.var}{if specified, run the outlier scoring by group (e.g. cell +cluster) instead of across all cells globally.} -\item{use_pcs}{if TRUE, run the outlier detection in principal -component space. Otherwise, use logcounts.} +\item{use_pcs}{if TRUE, run the outlier detection in principal component +space. Otherwise, use logcounts.} -\item{plot}{if TRUE and a dimred is specified, plot the isoscore -against the specified dimred} - -\item{prediction_thresh}{threshold to use for binary outlier -calling} +\item{prediction_thresh}{threshold to use for binary outlier calling} \item{...}{arguments to pass to \link[isotree]{isolation.forest}} } \value{ -the SingleCellExperiment object with \code{outlier_score} - and \code{is_outlier} added to the colData. Also, the resulting - isotree model is attached to the object's metadata. +the SingleCellExperiment object with \code{outlier_score} and + \code{is_outlier} added to the colData. Also, the resulting isotree model + is attached to the object's metadata. } \description{ -This function calculates outlier scores for each cell - using \link[isotree]{isolation.forest}. Isolation forests - explicitly try to estimate the isolation of datapoints, in - contrast to estimating the full distribution of the data. To - quote the isotree documentation: "Isolation Forest is an - algorithm originally developed for outlier detection that - consists in splitting sub-samples of the data according to some - attribute/feature/column at random". Basically, it calculates - how quickly each cell gets split off from the rest of the group - when the dataset is randomly partitioned by a hyperplane. There - are many adjustable parameters (e.g. number of trees, - dimensionality of the splits, etc), but the defaults usually - work pretty well. +This function calculates outlier scores for each cell using + \link[isotree]{isolation.forest}. Isolation forests explicitly try to + estimate the isolation of datapoints, in contrast to estimating the full + distribution of the data. To quote the isotree documentation: "Isolation + Forest is an algorithm originally developed for outlier detection that + consists in splitting sub-samples of the data according to some + attribute/feature/column at random." Basically, it calculates how quickly + each cell gets split off from the rest of the group when the dataset is + randomly partitioned by a hyperplane. There are many adjustable parameters + (e.g. number of trees, dimensionality of the splits, etc), but the defaults + usually work pretty well. } \details{ -If \code{use_pcs} is turned on, it will use all available - PCs in the PCA reduced dimension slot if it exists, and run - \link[scater]{runPCA} with default settings and use that if it - does not. +If \code{use_pcs} is turned on, it will use all available PCs in the + PCA reduced dimension slot if it exists, and run \link[scater]{runPCA} with + default settings and use that if it does not. - \code{prediction_thresh} is the threshold to use on the - isoforest score predictions + \code{prediction_thresh} is the threshold to use on the isoforest score + predictions } \examples{ library(scater) @@ -74,11 +65,5 @@ query_data <- logNormCounts(sce) query_data <- runPCA(query_data) -query_data <- calculateOutlierScore( - query_data, - plot = TRUE, - dimred = "PCA", - use_pcs = TRUE -) - +query_data <- calculateOutlierScore(query_data, use_pcs = TRUE) } diff --git a/vignettes/scDiagnostics.Rmd b/vignettes/scDiagnostics.Rmd index 525f85e..e10ae23 100644 --- a/vignettes/scDiagnostics.Rmd +++ b/vignettes/scDiagnostics.Rmd @@ -709,18 +709,23 @@ PCs). ```{r} query_data <- calculateOutlierScore(query_data, - use_pcs = TRUE, - plot = TRUE, - dimred = "PCA") + use_pcs = TRUE) + +scater::plotReducedDim(query_data, + dimred = "PCA", + color_by = "outlier_score", + shape_by = "is_outlier") ``` +By default this evaluates an isolation forest across all cells in the object. One can optionally pass a `group.var` (a column in the colData) to evaluate the outlier detection by groups (e.g. cell type). This can help assess whether a cell is atypical amongst those it was clustered with. + Columns `outlier_score` and `is_outlier` are added to the colData of the output. The isotree model object itself is added to the metadata. One can pass different `dimred` names to change the plot, but running an isolation in forest in e.g. tSNE or UMAP space does not make sense due to the non-linearity of these approaches. Hence this is not -allowed - when `use_pcs = FALSE`, the logcounts are used instead. +allowed - when `use_pcs = FALSE`, the raw logcounts are used instead. ## Annotation entropy