From 79c959a2d67278bfd92fa80bb9263174b97889e9 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Tue, 16 Apr 2024 08:21:30 -0400 Subject: [PATCH 01/11] fix outlier score example, start of by group function --- R/outlierDetection.R | 45 ++++++++++++++++++++ man/calculateOutlierScoreByCluster.Rd | 59 +++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 man/calculateOutlierScoreByCluster.Rd diff --git a/R/outlierDetection.R b/R/outlierDetection.R index 0b5ff83..7cb1cc3 100644 --- a/R/outlierDetection.R +++ b/R/outlierDetection.R @@ -119,3 +119,48 @@ calculateOutlierScore <- sce } + +#' Calculate cell outlier scores using isolation forests by group +#' +#' @description This function calculates outlier scores for each cell BY CLUSTER +#' using \link[isotree]{isolation.forest}. See [calculateOutlierScore()] for +#' more detail on isoforests. This function calls that one on each cluster +#' separately. So instead of outlierness from the full dataset overall, this +#' function captures outlierness from the cell's cluster. +#' @inheritParams calculateOutlierScore +#' @param clusterVar the name of the cluster variable in the colData of the sce +#' input. +#' @examples +#' # TODO update this example. +#' library(scater) +#' library(scran) +#' library(scRNAseq) +#' +#' # Load data +#' sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE) +#' +#' # Divide the data into reference and query datasets +#' set.seed(100) +#' query_data <- logNormCounts(query_data) +#' +#' query_data <- runPCA(query_data) +#' +#' query_data <- calculateOutlierScore(query_data, plot = TRUE, dimred = "PCA", use_pcs = TRUE) +#' @export +calculateOutlierScoreByCluster <- function( + sce, + clusterVar, + dimred = NULL, + use_pcs = FALSE, + plot = TRUE, + prediction_thresh = 0.5, + ...) { + + # TODO implement the functionality. + + lapply(cluster_levels, calculateOutlierScore) + + # Combine results + + return(sce) +} diff --git a/man/calculateOutlierScoreByCluster.Rd b/man/calculateOutlierScoreByCluster.Rd new file mode 100644 index 0000000..32995f2 --- /dev/null +++ b/man/calculateOutlierScoreByCluster.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/outlierDetection.R +\name{calculateOutlierScoreByCluster} +\alias{calculateOutlierScoreByCluster} +\title{Calculate cell outlier scores using isolation forests by group} +\usage{ +calculateOutlierScoreByCluster( + sce, + clusterVar, + dimred = NULL, + use_pcs = FALSE, + plot = TRUE, + prediction_thresh = 0.5, + ... +) +} +\arguments{ +\item{sce}{a SingleCellExperiment object containing a logcounts matrix} + +\item{clusterVar}{the name of the cluster variable in the colData of the sce +input.} + +\item{dimred}{if specified, plot the specified reduced dimension with +\link[scater]{plotReducedDim}, colored by outlier score} + +\item{use_pcs}{if TRUE, run the outlier detection in principal component +space. Otherwise, use logcounts.} + +\item{plot}{if TRUE and a dimred is specified, plot the isoscore against the +specified dimred} + +\item{prediction_thresh}{threshold to use for binary outlier calling} + +\item{...}{arguments to pass to \link[isotree]{isolation.forest}} +} +\description{ +This function calculates outlier scores for each cell BY CLUSTER + using \link[isotree]{isolation.forest}. See [calculateOutlierScore()] for + more detail on isoforests. This function calls that one on each cluster + separately. So instead of outlierness from the full dataset overall, this + function captures outlierness from the cell's cluster. +} +\examples{ +# TODO update this example. +library(scater) +library(scran) +library(scRNAseq) + +# Load data +sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE) + +# Divide the data into reference and query datasets +set.seed(100) +query_data <- logNormCounts(query_data) + +query_data <- runPCA(query_data) + +query_data <- calculateOutlierScore(query_data, plot = TRUE, dimred = "PCA", use_pcs = TRUE) +} From a71cdbfb4868e12dc66fee195e5ed1aabeae907c Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Tue, 16 Apr 2024 09:15:45 -0400 Subject: [PATCH 02/11] outlier detection by group --- NAMESPACE | 1 + R/outlierDetection.R | 50 ++++++++++++++++++++++++--- man/calculateOutlierScoreByCluster.Rd | 13 +++++-- 3 files changed, 57 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 1ede8e8..c15f736 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ export(calculateCategorizationEntropy) export(calculateHVGOverlap) export(calculateOutlierScore) +export(calculateOutlierScoreByCluster) export(calculatePairwiseDistancesAndPlotDensity) export(computeAveragePairwiseCorrelation) export(histQCvsAnnotation) diff --git a/R/outlierDetection.R b/R/outlierDetection.R index 7cb1cc3..4050bf9 100644 --- a/R/outlierDetection.R +++ b/R/outlierDetection.R @@ -130,6 +130,10 @@ calculateOutlierScore <- #' @inheritParams calculateOutlierScore #' @param clusterVar the name of the cluster variable in the colData of the sce #' input. +#' @details This function treats NAs in clusterVar as a separate level and +#' applies the outlier scoring to them accordingly. The isolation forest model +#' objects are returned as a list in the metadata. +#' #' @examples #' # TODO update this example. #' library(scater) @@ -141,11 +145,15 @@ calculateOutlierScore <- #' #' # Divide the data into reference and query datasets #' set.seed(100) -#' query_data <- logNormCounts(query_data) +#' query_data <- logNormCounts(sce) #' #' query_data <- runPCA(query_data) #' -#' query_data <- calculateOutlierScore(query_data, plot = TRUE, dimred = "PCA", use_pcs = TRUE) +#' query_data <- calculateOutlierScoreByCluster(query_data, +#' clusterVar = "reclustered.broad", +#' plot = TRUE, +#' dimred = "PCA", +#' use_pcs = TRUE) #' @export calculateOutlierScoreByCluster <- function( sce, @@ -156,11 +164,43 @@ calculateOutlierScoreByCluster <- function( prediction_thresh = 0.5, ...) { - # TODO implement the functionality. + if (is.null(colnames(sce))) stop("This function requires the input SCE to have column names.") + + # There's probably a better way to split an sce by a factor... + # Anyway, manually passing it through factor() like this way retains NAs. + cluster_list = split(colData(sce), + factor(colData(sce)[[clusterVar]], + exclude = NULL)) |> + lapply(\(x) sce[,rownames(x)]) + + # TODO: assess if it would be worth making the threshold adaptive. Different + # clusters can have different distributions of outlier scores. + score_list = lapply(cluster_list, calculateOutlierScore, + plot = FALSE, + dimred = dimred, + use_pcs = use_pcs, + prediction_thresh = prediction_thresh) + + score_df = score_list |> + lapply(\(x) colData(x)[,c("outlier_score", "is_outlier")]) |> + Reduce(f = rbind) + + # If we were using an ID column instead of column names of the input we could + # do a join. + score_df = score_df[rownames(colData(sce)),] + colData(sce) = cbind(colData(sce), score_df) - lapply(cluster_levels, calculateOutlierScore) + S4Vectors::metadata(sce)$cluster_isotree_model = score_list |> lapply(S4Vectors::metadata) - # Combine results + if (!is.null(dimred) && plot) { + # TODO make this plot work when scater is only in the Suggests, or re-do + # it manually with ggplot. + p <- scater::plotReducedDim(sce, dimred, + color_by = "outlier_score", + shape_by = "is_outlier" + ) + print(p) + } return(sce) } diff --git a/man/calculateOutlierScoreByCluster.Rd b/man/calculateOutlierScoreByCluster.Rd index 32995f2..54388b1 100644 --- a/man/calculateOutlierScoreByCluster.Rd +++ b/man/calculateOutlierScoreByCluster.Rd @@ -40,6 +40,11 @@ This function calculates outlier scores for each cell BY CLUSTER separately. So instead of outlierness from the full dataset overall, this function captures outlierness from the cell's cluster. } +\details{ +This function treats NAs in clusterVar as a separate level and + applies the outlier scoring to them accordingly. The isolation forest model + objects are returned as a list in the metadata. +} \examples{ # TODO update this example. library(scater) @@ -51,9 +56,13 @@ sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE) # Divide the data into reference and query datasets set.seed(100) -query_data <- logNormCounts(query_data) +query_data <- logNormCounts(sce) query_data <- runPCA(query_data) -query_data <- calculateOutlierScore(query_data, plot = TRUE, dimred = "PCA", use_pcs = TRUE) +query_data <- calculateOutlierScoreByCluster(query_data, + clusterVar = "reclustered.broad", + plot = TRUE, + dimred = "PCA", + use_pcs = TRUE) } From 93bcd3a77ae9092a671bc9091c6f71bf55b9c78d Mon Sep 17 00:00:00 2001 From: Nitesh Turaga Date: Sun, 14 Apr 2024 09:24:09 -1000 Subject: [PATCH 03/11] rebase entropy --- .Rbuildignore | 7 - DESCRIPTION | 68 --- vignettes/scDiagnostics.Rmd | 795 ------------------------------------ 3 files changed, 870 deletions(-) delete mode 100644 .Rbuildignore delete mode 100644 DESCRIPTION delete mode 100644 vignettes/scDiagnostics.Rmd diff --git a/.Rbuildignore b/.Rbuildignore deleted file mode 100644 index 8b06f02..0000000 --- a/.Rbuildignore +++ /dev/null @@ -1,7 +0,0 @@ -^.*\.Rproj$ -^\.Rproj\.user$ -^doc$ -^Meta$ -^\.github$ -^docs$ -^pkgdown$ diff --git a/DESCRIPTION b/DESCRIPTION deleted file mode 100644 index 0cbd320..0000000 --- a/DESCRIPTION +++ /dev/null @@ -1,68 +0,0 @@ -Type: Package -Package: scDiagnostics -Title: Cell type annotation diagnostics -Version: 0.99.0 -Authors@R: c( - person("Anthony", "Christidis", role = c("aut", "cre"), - email = "anthony-alexander_christidis@hms.harvard.edu"), - person("Andrew", "Ghazi", role = "aut"), - person("Smriti", "Chawla", role = "aut"), - person("Nitesh", "Turaga", role = "ctb"), - person("Ludwig", "Geistlinger", role = "aut"), - person("Robert", "Gentleman", role = "aut") - ) -Description: The scDiagnostics package provides diagnostic plots to - assess the quality of cell type assignments from single cell gene - expression profiles. The implemented functionality allows to - assess the reliability of cell type annotations, investigate gene - expression patterns, and explore relationships between different - cell types in query and reference datasets allowing users to - detect potential misalignments between reference and query - datasets. The package also provides visualization capabilities for - diagnositics purposes. -License: Artistic-2.0 -URL: https://github.com/ccb-hms/scDiagnostics -BugReports: https://github.com/ccb-hms/scDiagnostics/issues -Depends: - R (>= 4.3.0) -Imports: - SingleCellExperiment, - isotree, - methods, - rlang, - ggplot2, - gridExtra, - scater, - SummarizedExperiment, - BiocGenerics, - S4Vectors, - stats, - utils -Suggests: - AUCell, - BiocStyle, - corrplot, - knitr, - Matrix, - RColorBrewer, - rmarkdown, - scran, - scRNAseq, - SingleR, - celldex, - testthat (>= 3.0.0) -VignetteBuilder: - knitr -biocViews: - Annotation, - Classification, - Clustering, - GeneExpression, - RNASeq, - SingleCell, - Software, - Transcriptomics -Encoding: UTF-8 -LazyData: true -RoxygenNote: 7.3.1 -Config/testthat/edition: 3 diff --git a/vignettes/scDiagnostics.Rmd b/vignettes/scDiagnostics.Rmd deleted file mode 100644 index 525f85e..0000000 --- a/vignettes/scDiagnostics.Rmd +++ /dev/null @@ -1,795 +0,0 @@ ---- -title: "scDiagnostics: diagnostic functions to assess the quality of cell type annotations in single-cell RNA-seq data" -author: - - name: Anthony Christidis - affiliation: Center for Computational Biomedicine, Harvard Medical School - email: anthony-alexander_christidis@hms.harvard.edu - - name: Andrew Ghazi - affiliation: Harvard Stem Cell Institute, Harvard Medical School - - name: Smriti Chawla - affiliation: Center for Computational Biomedicine, Harvard Medical School - - name: Nitesh Turaga - affiliation: Center for Computational Biomedicine, Harvard Medical School - - name: Ludwig Geistlinger - affiliation: Center for Computational Biomedicine, Harvard Medical School - - name: Robert Gentleman - affiliation: Center for Computational Biomedicine, Harvard Medical School -package: scDiagnostics -output: - BiocStyle::html_document: - toc: true - toc_float: true -vignette: > - %\VignetteIndexEntry{scDiagnostics} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -# Purpose - -Annotation transfer from a reference dataset for the cell type -annotation of a new query single-cell RNA-sequencing (scRNA-seq) -experiment is an integral component of the typical analysis -workflow. The approach provides a fast, automated, and reproducible -alternative to the manual annotation of cell clusters based on marker -gene expression. However, dataset imbalance and undiagnosed -incompatibilities between query and reference dataset can lead to -erroneous annotation and distort downstream applications. - -The `scDiagnostics` package provides functionality for the systematic -evaluation of cell type assignments in scRNA-seq data. -`scDiagnostics` offers a suite of diagnostic functions to assess -whether both (query and reference) datasets are aligned, ensuring that -annotations can be transferred reliably. `scDiagnostics` also provides -functionality to assess annotation ambiguity, cluster heterogeneity, -and marker gene alignment. The implemented functionality helps -researchers to determine how accurately cells from a new scRNA-seq -experiment can be assigned to known cell types. - -# Installation - -To install the development version of the package from Github, use the following -command: - -```{r dev_version_install, eval = FALSE} -BiocManager::install("ccb-hms/scDiagnostics") -``` - -NOTE: you will need the [remotes](https://cran.r-project.org/web/packages/remotes/index.html) -package to install from GitHub. - -To build the package vignettes upon installation use: - -```{r build_vignettes, eval=FALSE} -BiocManager::install("ccb-hms/scDiagnostics", - build_vignettes = TRUE, - dependencies = TRUE) -``` - -# Usage - -To explore the capabilities of the scDiagnostics package, you can load -your own data or utilize publicly available datasets obtained from the -scRNAseq R package. In this guide, we will demonstrate how to use -scDiagnostics with such datasets, which serve as valuable resources -for exploring the package and assessing the appropriateness of cell -type assignments. - -```{r libraries, message = FALSE} -library(scDiagnostics) -library(celldex) -library(corrplot) -library(scater) -library(scran) -library(scRNAseq) -library(AUCell) -library(RColorBrewer) -library(SingleR) -``` - -## Scatter Plot: QC stats vs. Annotation Scores - -Here, we will consider the Human Primary Cell Atlas (Mabbott et -al. 2013) as a reference dataset and our query dataset consists of -Haematopoietic stem and progenitor cells from (Bunis DG et al. 2021). - -In scRNA-seq studies, assessing the quality of cells is important for -accurate downstream analyses. At the same time, assigning accurate -cell type labels based on gene expression profiles is an integral -aspect of scRNA-seq data interpretation. Generally, these two are -performed independently of each other. The rationale behind this -function is to inspect whether certain QC (Quality Control) criteria -impact the confidence level of cell type annotations. - -For instance, it is reasonable to hypothesize that higher library -sizes could contribute to increased annotation confidence due to -enhanced statistical power for identifying cell type-specific gene -expression patterns, as evident in the scatter plot below. - -```{r Scatter-Plot-LibrarySize-Vs-Annotation-Scores, warning=FALSE, message=FALSE} - -# load reference dataset -ref_data <- celldex::HumanPrimaryCellAtlasData() - -# Load query dataset (Bunis haematopoietic stem and progenitor cell -# data) from Bunis DG et al. (2021). Single-Cell Mapping of -# Progressive Fetal-to-Adult Transition in Human Naive T Cells Cell -# Rep. 34(1): 108573 - -query_data <- BunisHSPCData() -rownames(query_data) <- rowData(query_data)$Symbol - -# Add QC metrics to query data -query_data <- addPerCellQCMetrics(query_data) - -# Log transform query dataset -query_data <- logNormCounts(query_data) - -# Run SingleR to predict cell types -pred <- SingleR(query_data, ref_data, labels = ref_data$label.main) - -# Assign predicted labels to query data -colData(query_data)$pred.labels <- pred$labels - -# Get annotation scores -scores <- apply(pred$scores, 1, max) - -# Assign scores to query data -colData(query_data)$cell_scores <- scores - -# Create a scatter plot between library size and annotation scores -p1 <- plotQCvsAnnotation( - query_data = query_data, - qc_col = "total", - label_col = "pred.labels", - score_col = "cell_scores", - label = NULL -) -p1 + xlab("Library Size") -``` - -However, certain QC metrics, such as the proportion of mitochondrial -genes, may require careful consideration as they can sometimes be -associated with cellular states or functions rather than noise. The -interpretation of mitochondrial content should be context-specific and -informed by biological knowledge. - -In next analysis, we investigated the relationship between -mitochondrial percentage and cell type annotation scores using liver -tissue data from He S et al. 2020. Notably, we observed high -annotation scores for macrophages and monocytes. These findings align -with the established biological characteristic of high mitochondrial -activity in macrophages and monocytes, adding biological context to -our results. - -```{r QC-Annotation-Scatter-Mito, warning=FALSE, message=FALSE} -# load query dataset -query_data <- HeOrganAtlasData( - tissue = c("Liver"), - ensembl = FALSE, - location = TRUE -) - -# Add QC metrics to query data - -mito_genes <- rownames(query_data)[grep("^MT-", rownames(query_data))] -query_data <- unfiltered <- addPerCellQC(query_data,subsets = list(mt = mito_genes)) -qc <- quickPerCellQC(colData(query_data), sub.fields = "subsets_mt_percent") -query_data <- query_data[,!qc$discard] - -# Log transform query dataset -query_data <- logNormCounts(query_data) - -# Run SingleR to predict cell types -pred <- SingleR(query_data, ref_data, labels = ref_data$label.main) - -# Assign predicted labels to query data -colData(query_data)$pred.labels <- pred$labels - -# Get annotation scores -scores <- apply(pred$scores, 1, max) - -# Assign scores to query data -colData(query_data)$cell_scores <- scores - -# Create a new column for the labels so it is easy to distinguish -# between Macrophoges, Monocytes and other cells -query_data$label_category <- - ifelse(query_data$pred.labels %in% c("Macrophage", "Monocyte"), - query_data$pred.labels, - "Other cells") - - -# Define custom colors for cell type labels -cols <- c("Other cells" = "grey", "Macrophage" = "green", "Monocyte" = "red") - -# Generate scatter plot for all cell types -p1 <- plotQCvsAnnotation( - query_data = query_data, - qc_col = "subsets_mt_percent", - label_col = "label_category", - score_col = "cell_scores", - label = NULL) + - scale_color_manual(values = cols) + - xlab("subsets_mt_percent") -p1 -``` - -## Examining Distribution of QC stats and Annotation Scores - -In addition to the scatter plot, we can gain further insights into the -gene expression profiles by visualizing the distribution of user -defined QC stats and annotation scores for all the cell types or -specific cell types. This allows us to examine the variation and -patterns in expression levels and scores across cells assigned to the -cell type of interest. - -To accomplish this, we create two separate histograms. The first -histogram displays the distribution of the annotation scores. - -The second histogram visualizes the distribution of QC stats. This -provides insights into the overall gene expression levels for the -specific cell type. Here in this particular example we are -investigating percentage of mitochondrial genes. - -By examining the histograms, we can observe the range, shape, and -potential outliers in the distribution of both annotation scores and -QC stats. This allows us to assess the appropriateness of the cell -type assignments and identify any potential discrepancies or patterns -in the gene expression profiles for the specific cell type. - -```{r Mito-Genes-Vs-Annotation, warning=FALSE, message=FALSE} -# Generate histogram -histQCvsAnnotation(query_data = query_data, qc_col = "subsets_mt_percent", - label_col = "pred.labels", - score_col = "cell_scores", - label = NULL) -``` - -The right-skewed distribution for mitochondrial percentages and a -left-skewed distribution for annotation scores in above histograms -suggest that most cells have lower mitochondrial contamination and -higher confidence in their assigned cell types. - -## Exploring Gene Expression Distribution - -This function helps user to explore the distribution of gene -expression values for a specific gene of interest across all the cells -in both reference and query datasets and within specific cell -types. This helps to evaluate whether the distributions are similar or -aligned between the datasets. Discrepancies in distribution patterns -may indicate potential incompatibilities or differences between the -datasets. - -The function also allows users to narrow down their analysis to -specific cell types of interest. This enables investigation of whether -alignment between the query and reference datasets is consistent not -only at a global level but also within specific cell types. - -```{r Gene-Expression-Histogram, warning=FALSE, message=FALSE} - -# Load data -sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE) - -# Divide the data into reference and query datasets -set.seed(100) -indices <- sample(ncol(assay(sce)), size = floor(0.7 * ncol(assay(sce))), replace = FALSE) -ref_data <- sce[, indices] -query_data <- sce[, -indices] - -# Log-transform datasets -ref_data <- logNormCounts(ref_data) -query_data <- logNormCounts(query_data) - -# Run PCA -ref_data <- runPCA(ref_data) -query_data <- runPCA(query_data) - -# Get cell type scores using SingleR -pred <- SingleR(query_data, ref_data, labels = ref_data$reclustered.broad) -pred <- as.data.frame(pred) - -# Assign labels to query data -colData(query_data)$labels <- pred$labels - -# Generate density plots -plotMarkerExpression(reference_data = ref_data, - query_data = query_data, - reference_cell_labels = "reclustered.broad", - query_cell_labels = "labels", - gene_name = "MS4A1", - label = "B_and_plasma") -``` - -In the provided example, we examined the distribution of expression -values for the gene MS4A1, a marker for naive B cells, in both the -query and reference datasets. Additionally, we also looked at the -distribution of MS4A1 expression in the B_and_plasma cell type. We -observed overlapping distributions in both cases, suggesting alignment -between the reference and query datasets. - -## Evaluating Alignment Between Reference and Query Datasets in Terms of Highly Variable Genes - -We are assessing the similarity or alignment between two datasets, the -reference dataset, and the query dataset, in terms of highly variable -genes (HVGs). We calculate the overlap coefficient between the sets of -highly variable genes in the reference and query datasets. The overlap -coefficient quantifies the degree of overlap or similarity between -these two sets of genes. A value closer to 1 indicates a higher degree -of overlap, while a value closer to 0 suggests less overlap. The -computed overlap coefficient is printed, providing a numerical measure -of how well the highly variable genes in the reference and query -datasets align. In this case, the overlap coefficient is 0.62, -indicating a moderate level of overlap. - -```{r HVG overlap, warning=FALSE, message=FALSE} - -# Selecting highly variable genes -ref_var <- getTopHVGs(ref_data, n=2000) -query_var <- getTopHVGs(query_data, n=2000) - -# Compute the overlap coefficient -overlap_coefficient <- calculateHVGOverlap(reference_genes = ref_var, - query_genes = query_var) -print(overlap_coefficient) -``` - -In the provided example, we examined the distribution of expression -values for the gene MS4A1, a marker for naive B cells, in both the -query and reference datasets. Additionally, we also looked at the -distribution of MS4A1 expression in the B_and_plasma cell type. We -observed overlapping distributions in both cases, suggesting alignment -between the reference and query datasets. - -## Visualize Gene Expression on Dimensional Reduction Plot - -To gain insights into the gene expression patterns and their -representation in a dimensional reduction space, we can utilize the -plotGeneExpressionDimred function. This function allows us to plot the -gene expression values of a specific gene on a dimensional reduction -plot generated using methods like t-SNE, UMAP, or PCA. Each single -cell is color-coded based on its expression level of the gene of -interest. - -In the provided example, we are visualizing the gene expression values -of the gene "VPREB3" on a PCA plot. The PCA plot represents the cells -in a lower-dimensional space, where the x-axis corresponds to the -first principal component (Dimension 1) and the y-axis corresponds to -the second principal component (Dimension 2). Each cell is represented -as a point on the plot, and its color reflects the expression level of -the gene "VPREB3," ranging from low (lighter color) to high (darker -color). - -```{r Gene-Expression-Scatter, warning=FALSE, message=FALSE} -# Generate dimension reduction plot color code by gene expression -plotGeneExpressionDimred(se_object = query_data, - method = "PCA", - n_components = c(1, 2), - feature = "VPREB3") -``` - -The dimensional reduction plot allows us to observe how the gene -expression of VPREB3 is distributed across the cells and whether any -clusters or patterns emerge in the data. - -## Visualize Gene Sets or Pathway Scores on Dimensional Reduction Plot - -In addition to examining individual gene expression patterns, it is -often useful to assess the collective activity of gene sets or -pathways within single cells. This can provide insights into the -functional states or biological processes associated with specific -cell types or conditions. To facilitate this analysis, the -scDiagnostics package includes a function called plotGeneSetScores -that enables the visualization of gene set or pathway scores on a -dimensional reduction plot. - -The plotGeneSetScores function allows you to plot gene set or pathway -scores on a dimensional reduction plot generated using methods such as -PCA, t-SNE, or UMAP. Each single cell is color-coded based on its -scores for specific gene sets or pathways. This visualization helps -identify the heterogeneity and patterns of gene set or pathway -activity within the dataset, potentially revealing subpopulations with -distinct functional characteristics. - -```{r Pathway-Scores-on-Dimensional-Reduction-Scatter, warning=FALSE, message=FALSE} - -# Compute scores using AUCell -expression_matrix <- assay(query_data, "logcounts") -cells_rankings <- AUCell_buildRankings(expression_matrix, plotStats = FALSE) - -# Generate gene sets -gene_set1 <- sample(rownames(expression_matrix), 10) -gene_set2 <- sample(rownames(expression_matrix), 20) - -gene_sets <- list(geneSet1 = gene_set1, - geneSet2 = gene_set2) - -# Calculate AUC scores for gene sets -cells_AUC <- AUCell_calcAUC(gene_sets, cells_rankings) - -# Assign scores to colData -colData(query_data)$geneSetScores <- assay(cells_AUC)["geneSet1", ] - -# Plot gene set scores on PCA -plotGeneSetScores(se_object = query_data, - method = "PCA", - feature = "geneSetScores") -``` - -In the provided example, we demonstrate the usage of the -plotGeneSetScores function using the AUCell package to compute gene -set or pathway scores. Custom gene sets are generated for -demonstration purposes, but users can provide their own gene set -scores using any method of their choice. It is important to ensure -that the scores are assigned to the colData of the reference or query -object and specify the correct feature name for visualization. - -By visualizing gene set or pathway scores on a dimensional reduction -plot, you can gain a comprehensive understanding of the functional -landscape within your single-cell gene expression dataset and explore -the relationships between gene set activities and cellular phenotypes. - -## Visualizing Reference and Query Cell Types using Multidimensional Scaling (MDS) - -This function performs Multidimensional Scaling (MDS) analysis on the -query and reference datasets to examine their similarity. The -dissimilarity matrix is calculated based on the correlation between -the datasets, representing the distances between cells in terms of -gene expression patterns. MDS is then applied to derive -low-dimensional coordinates for each cell. Subsequently, a scatter -plot is generated, where each data point represents a cell, and cell -types are color-coded using custom colors provided by the user. This -visualization enables the comparison of cell type distributions -between the query and reference datasets in a reduced-dimensional -space. - -The rationale behind this function is to visually assess the alignment -and relationships between cell types in the query and reference -datasets. - -```{r CMD-Scatter-Plot, warning=FALSE, message=FALSE} - -# Intersect the gene symbols to obtain common genes -common_genes <- intersect(ref_var, query_var) - -# Select desired cell types -selected_cell_types <- c("CD4", "CD8", "B_and_plasma") -ref_data_subset <- ref_data[common_genes, ref_data$reclustered.broad %in% selected_cell_types] -query_data_subset <- query_data[common_genes, query_data$labels %in% selected_cell_types] - -# Extract cell types for visualization -ref_labels <- ref_data_subset$reclustered.broad -query_labels <- query_data_subset$labels - -# Combine the cell type labels from both datasets -mdata <- c(paste("Query", query_labels), paste("Reference", ref_labels)) - -## Define the cell types and legend order -cell_types <- c("Query CD8", "Reference CD8", "Query CD4", "Reference CD4", "Query B_and_plasma", "Reference B_and_plasma") -legend_order <- cell_types - -## Define the colors for cell types -color_palette <- brewer.pal(length(cell_types), "Paired") -color_mapping <- setNames(color_palette, cell_types) -cell_type_colors <- color_mapping[cell_types] - -## Generate the MDS scatter plot with cell type coloring -visualizeCellTypeMDS(query_data = query_data_subset, - reference_data = ref_data_subset, - mdata = mdata, - cell_type_colors = cell_type_colors, - legend_order = legend_order) -``` - -Upon examining the MDS scatter plot, we observe that the CD4 and CD8 -cell types overlap to some extent.By observing the proximity or -overlap of different cell types, one can gain insights into their -potential relationships or shared characteristics. - -The selection of custom genes and desired cell types depends on the -user's research interests and goals. It allows for flexibility in -focusing on specific genes and examining particular cell types of -interest in the visualization. - -## Cell Type-specific Pairwise Correlation Analysis and Visualization - -This analysis aims to explore the correlation patterns between -different cell types in a single-cell gene expression dataset. The -goal is to compare the gene expression profiles of cells from a -reference dataset and a query dataset to understand the relationships -and similarities between various cell types. - -To perform the analysis, we start by computing the pairwise -correlations between the query and reference cells for selected cell -types ("CD4", "CD8", "B_and_plasma"). The Spearman correlation method -is used, user can also use Pearsons correlation coeefficient. - -This will return average correlation matrix which can be visulaized by -user's method of choice. Here, the results are visualized as a -correlation plot using the corrplot package. - - -```{r Cell-Type-Correlation-Analysis-Visualization, warning=FALSE, message=FALSE} -selected_cell_types <- c("CD4", "CD8", "B_and_plasma") -cor_matrix_avg <- computeAveragePairwiseCorrelation(query_data = query_data_subset, - reference_data = ref_data_subset, - query_cell_type_col = "labels", - ref_cell_type_col = "reclustered.broad", - cell_types = selected_cell_types, - correlation_method = "spearman") - -# Plot the pairwise average correlations using corrplot -corrplot(cor_matrix_avg, method = "number", tl.col = "black") -``` - -In this case, users have the flexibility to extract the gene -expression profiles of specific cell types from the reference and -query datasets and provide these profiles as input to the -function. Additionally, they can select their own set of genes that -they consider relevant for computing the pairwise correlations. For -demonstartion we have used common highly variable genes from reference -and query dataset. - -By providing their own gene expression profiles and choosing specific -genes, users can focus the analysis on the cell types and genes of -interest to their research question. - -## Pairwise Distance Analysis and Density Visualization - -This function serves to conduct a analysis of pairwise distances or -correlations between cells of specific cell types within a single-cell -gene expression dataset. By calculating these distances or -correlations, users can gain insights into the relationships and -differences in gene expression profiles between different cell -types. The function facilitates this analysis by generating density -plots, allowing users to visualize the distribution of distances or -correlations for various pairwise comparisons. - -The analysis offers the flexibility to select a particular cell type -for examination, and users can choose between different distance -metrics, such as "euclidean" or "manhattan," to calculate pairwise -distances. - -To illustrate, the function is applied to the cell type CD8 using the -euclidean distance metric in the example below. - -```{r Pairwise-Distance-Analysis-Density-Visualization, fig.width=8, message=FALSE, warning=FALSE} -calculatePairwiseDistancesAndPlotDensity(query_data = query_data_subset, - reference_data = ref_data_subset, - query_cell_type_col = "labels", - ref_cell_type_col = "reclustered.broad", - cell_type_query = "CD8", - cell_type_reference = "CD8", - distance_metric = "euclidean") -``` - -Alternatively, users can opt for the "correlation" distance metric, -which measures the similarity in gene expression profiles between -cells. - -To illustrate, the function is applied to the cell type CD8 using the -correlation distance metric in the example below. By selecting either -the "pearson" or "spearman" correlation method, users can emphasize -either linear or rank-based associations, respectively. - -```{r Pairwise-Distance-Correlation-Based-Density-Visualization, warning=FALSE, message=FALSE, fig.width=8} -calculatePairwiseDistancesAndPlotDensity(query_data = query_data_subset, - reference_data = ref_data_subset, - query_cell_type_col = "labels", - ref_cell_type_col = "reclustered.broad", - cell_type_query = "CD8", - cell_type_reference = "CD8", - distance_metric = "correlation", - correlation_method = "spearman") -``` - -By utilizing this function, users can explore the pairwise distances -between query and reference cells of a specific cell type and gain -insights into the distribution of distances through density -plots. This analysis aids in understanding the similarities and -differences in gene expression profiles for the selected cell type -within the query and reference datasets. - -## PC regression analysis - -Performing PC regression analysis on a SingleCellExperiment object -enables users to examine the relationship between a principal -component (PC) from the dimension reduction slot and an independent -variable of interest. By specifying the desired dependent variable as -one of the principal components (e.g., "PC1", "PC2", etc.) and -providing the corresponding independent variable from the colData slot -(e.g. "cell_type"), users can explore the associations between linear -structure in the single-cell gene expression dataset (reference and -query) and an independent variable of interest (e.g. cell type or -batch). - -The function prints two diagnostic plots by default: - -* a plot of the two PCs with the highest R^2^ with the specified independent variable -* a dot plot showing the R^2^ of each consecutive PC ~ indep.var regression - * Generally you should expect this plot to die off to near 0 before ~PC10 - * Interpretation example: If the R^2^ values are high (>=50%) - anywhere in PCs 1-5 and your independent variable is "batch", you - have batch effects! - -```{r Regression, warning=FALSE, message=FALSE} - -# Specify the dependent variables (principal components) and -# independent variable (e.g., "labels") -dep.vars <- paste0("PC", 1:12) -indep.var <- "labels" - -# Perform linear regression on multiple principal components -result <- regressPC(sce = query_data, - dep.vars = dep.vars, - indep.var = indep.var) - -# Print the summaries of the linear regression models and R-squared -# values - -# Summaries of the linear regression models -result$regression.summaries[[1]] - -# R-squared values -result$rsquared - -# Variance contributions for each principal component -result$var.contributions - -# Total variance explained -result$total.variance.explained -``` - -This analysis helps uncover whether there is a systematic variation in -PC values across different cell types. In the example above, we can -see that the four cell types are spread out across both PC1 and -PC2. Digging into the genes with high loadins on these PCs can help -explain the biological or technical factors driving cellular -heterogeneity. It can help identify PC dimensions that capture -variation specific to certain cell types or distinguish different -cellular states. - -Let's look at the genes driving PC1 by ordering the rotation matrix by -the absolute gene loadings for PC1: - -```{r} -pc_df <- attr(reducedDims(query_data)$PCA, "rotation")[,1:5] |> - as.data.frame() - -pc_df[order(abs(pc_df$PC1)),] |> - tail() -``` - -PC1 is mostly driven by NKG7 - Natural Killer Cell Granule Protein -7. This gene is important in CD8+ T cells, so that makes sense that -it's distinguishing the cell types shown. - -> Exercise: What genes are driving PC2? Do they make sense? - -```{r echo = FALSE, eval = FALSE} -pc_df[order(abs(pc_df$PC2)),] |> - tail() - -# It's IL32 mostly. -``` - -> Exercise: Try to use the command below to examine the spike on - PC5. What's going on there? - -`plotPCA(query_data, ncomponents = c(1,5), color_by = "labels")` - -```{r eval=FALSE, echo=FALSE} -plotPCA(query_data, ncomponents = c(1,5), color_by = "labels") -# The myeloid cells are shifted off from the other types. - -pc_df[order(abs(pc_df$PC5))] |> - tail() -# It's mostly driven by low GNLY expression in the myeloid cells. -``` - -## Outlier detection - -Isolation forests are a fast and effective method for detecting -outliers in high-dimensional data without relying on density -estimation. `calculateOutlierScore()` is a wrapper around the main -function of the [isotree -package](https://cran.r-project.org/web/packages/isotree/index.html). Here -we run it on the PC embeddings but, it can also be informative to run -it on the raw counts. Here we can see some outliers in PC1/2 space -flagged by the default `score > 0.5` threshold due to their clear -isolation (as well as a few that are isolated on the non-visualized -PCs). - -```{r} -query_data <- calculateOutlierScore(query_data, - use_pcs = TRUE, - plot = TRUE, - dimred = "PCA") -``` - -Columns `outlier_score` and `is_outlier` are added to the colData of -the output. The isotree model object itself is added to the metadata. - -One can pass different `dimred` names to change the plot, but running -an isolation in forest in e.g. tSNE or UMAP space does not make sense -due to the non-linearity of these approaches. Hence this is not -allowed - when `use_pcs = FALSE`, the logcounts are used instead. - -## Annotation entropy - -In order to assess the confidence of cell type predictions, we can use -the function `calculateCategorizationEntropy()`. This function -calculates the information entropy of assignment probabilities across -a set of cell types for each cell. If a set of class probabilities are -confident, the entropies will be low. - -This can be used to compare two sets of cell type assignments -(e.g. from different type assignment methods) to compare their -relative confidence. **Please note that this has nothing to do with -their accuracy!** Computational methods can sometimes be confidently -incorrect. - -The cell type probabilities should be passed as a matrix with cell -types as rows and cells as columns. If the columns of the matrix are -not valid probability distributions (i.e. don't sum to 1 as in the -below example), the function will perform a column-wise softmax to -convert them to a probability scale. This may or may not work well -depending on the distribution of the inputs, so if at all possible try -to pass probabilities instead of arbitrary scores. - -In this example, we create 500 random cells with random normal cell -type "scores" across 4 cell types. For demonstration we make the score -of the first class much higher in the first 250 cells. After the -softmax, this will equate to a very high probability of cell type -1. The remaining 250 will have assignments that are roughly even -across the four cell types (i.e. high entropy). - -```{r} -X <- rnorm(500 * 4) |> matrix(nrow = 4) -X[1, 1:250] <- X[1, 1:250] + 5 - -entropy_scores <- calculateCategorizationEntropy(X) -``` - -From the plot we can see that half of the cells (the first half we -shifted to class 1) have low entropy, and half have high entropy. - -# Conclusion - -In this analysis, we have demonstrated the capabilities of the -scDiagnostics package for assessing the appropriateness of cell -assignments in single-cell gene expression profiles. By utilizing -various diagnostic functions and visualization techniques, we have -explored different aspects of the data, including total UMI counts, -annotation scores, gene expression distributions, dimensional -reduction plots, gene set scores, pairwise correlations, pairwise -distances, and linear regression analysis. - -Through the scatter plots, histograms, and dimensional reduction -plots, we were able to gain insights into the relationships between -gene expression patterns, cell types, and the distribution of cells in -a reduced-dimensional space. The examination of gene expression -distributions, gene sets, and pathways allowed us to explore the -functional landscape and identify subpopulations with distinct -characteristics within the dataset. Additionally, the pairwise -correlation and distance analyses provided a deeper understanding of -the similarities and differences between cell types, highlighting -potential relationships and patterns. - -*** - -## R.session Info - -```{r SessionInfo, echo=FALSE, message=FALSE, warning=FALSE, comment=NA} -options(width = 80) #reset to 'default' width - -sessionInfo() #record the R and package versions used -``` - From 218b2044b08ba82df0a21660c7d2cd2ec9b1e60d Mon Sep 17 00:00:00 2001 From: Nitesh Turaga Date: Sun, 14 Apr 2024 09:58:00 -1000 Subject: [PATCH 04/11] add git merge files --- DESCRIPTION | 57 ++++++++++++++++++++ R/calculatePairwiseDistancesAndPlotDensity.R | 8 +++ 2 files changed, 65 insertions(+) create mode 100644 DESCRIPTION diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..52581a2 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,57 @@ +Type: Package +Package: scDiagnostics +Title: Cell type annotation diagnostics +Version: 0.99.0 +Authors@R: c( + person("Smriti", "Chawla", , "smriti_chawla@g.harvard.edu", role = c("aut", "cre")), + person("Andrew", "Ghazi", role = "aut"), + person("Ludwig", "Geistlinger", role = c("aut", "ctb")), + person("Robert", "Gentleman", role = "aut") + ) +Description: The scDiagnostics package provides diagnostic plots to + assess the quality of cell type assignments from single cell gene + expression profiles. The implemented functionality allows to + assess the reliability of cell type annotations, investigate gene + expression patterns, and explore relationships between different + cell types in query and reference datasets allowing users to + detect potential misalignments between reference and query + datasets. The package also provides visualization capabilities for + diagnositics purposes. +License: Artistic-2.0 +URL: https://github.com/ccb-hms/scDiagnostics +BugReports: https://github.com/ccb-hms/scDiagnostics/issues +Depends: + R (>= 4.2.0) +Imports: + SingleCellExperiment, + isotree, + methods, + rlang, + ggplot2, + gridExtra, + scater, + SummarizedExperiment, + BiocGenerics, + S4Vectors, + stats, + utils +Suggests: + AUCell, + BiocStyle, + corrplot, + knitr, + Matrix, + RColorBrewer, + rmarkdown, + scran, + scRNAseq, + SingleR, + celldex, + testthat (>= 3.0.0) +VignetteBuilder: + knitr +biocView: SingleCell, Software +Encoding: UTF-8 +LazyData: true +RoxygenNote: 7.3.1 +Config/testthat/edition: 3 diff --git a/R/calculatePairwiseDistancesAndPlotDensity.R b/R/calculatePairwiseDistancesAndPlotDensity.R index 393b14a..ca5c6dc 100644 --- a/R/calculatePairwiseDistancesAndPlotDensity.R +++ b/R/calculatePairwiseDistancesAndPlotDensity.R @@ -157,6 +157,7 @@ calculatePairwiseDistancesAndPlotDensity <- ## comparisons num_query_cells <- nrow(query_mat) num_ref_cells <- nrow(ref_mat) +<<<<<<< HEAD dist_query_query <- dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)] dist_ref_ref <- @@ -165,6 +166,13 @@ calculatePairwiseDistancesAndPlotDensity <- dist_query_ref <- dist_matrix[seq_len(num_query_cells), (num_query_cells + 1):(num_query_cells + num_ref_cells)] +======= + dist_query_query <- dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)] + dist_ref_ref <- dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), + (num_query_cells + 1):(num_query_cells + num_ref_cells)] + dist_query_ref <- dist_matrix[seq_len(num_query_cells), + (num_query_cells + 1):(num_query_cells + num_ref_cells)] +>>>>>>> 716f604 (Modify DESCRIPTION and fix couple of BiocCheck issues) ## Create data frame for plotting dist_df <- data.frame( From a1f5e85213556c840836578e58426e6dc861f786 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Wed, 17 Apr 2024 15:47:23 -0400 Subject: [PATCH 05/11] remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style --- NAMESPACE | 8 - R/outlierDetection.R | 244 +++++++++----------------- man/calculateOutlierScore.Rd | 69 +++----- man/calculateOutlierScoreByCluster.Rd | 68 ------- 4 files changed, 106 insertions(+), 283 deletions(-) delete mode 100644 man/calculateOutlierScoreByCluster.Rd diff --git a/NAMESPACE b/NAMESPACE index c15f736..5d06f0e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,6 @@ export(calculateCategorizationEntropy) export(calculateHVGOverlap) export(calculateOutlierScore) -export(calculateOutlierScoreByCluster) export(calculatePairwiseDistancesAndPlotDensity) export(computeAveragePairwiseCorrelation) export(histQCvsAnnotation) @@ -14,12 +13,8 @@ export(plotQCvsAnnotation) export(regressPC) export(visualizeCellTypeMDS) import(SingleCellExperiment) -importFrom(BiocGenerics,t) -importFrom(S4Vectors,metadata) importFrom(SingleCellExperiment,SingleCellExperiment) -importFrom(SingleCellExperiment,logcounts) importFrom(SingleCellExperiment,reducedDim) -importFrom(SingleCellExperiment,reducedDimNames) importFrom(SummarizedExperiment,assay) importFrom(SummarizedExperiment,colData) importFrom(ggplot2,aes) @@ -42,14 +37,11 @@ importFrom(ggplot2,xlab) importFrom(ggplot2,ylab) importFrom(ggplot2,ylim) importFrom(gridExtra,grid.arrange) -importFrom(isotree,isolation.forest) importFrom(methods,is) importFrom(rlang,.data) importFrom(scater,plotPCA) -importFrom(scater,plotReducedDim) importFrom(scater,plotTSNE) importFrom(scater,plotUMAP) -importFrom(scater,runPCA) importFrom(stats,cmdscale) importFrom(stats,cor) importFrom(stats,dist) diff --git a/R/outlierDetection.R b/R/outlierDetection.R index 4050bf9..176c4f5 100644 --- a/R/outlierDetection.R +++ b/R/outlierDetection.R @@ -1,47 +1,32 @@ #' Calculate cell outlier scores using isolation forests #' -#' @description This function calculates outlier scores for each cell -#' using \link[isotree]{isolation.forest}. Isolation forests -#' explicitly try to estimate the isolation of datapoints, in -#' contrast to estimating the full distribution of the data. To -#' quote the isotree documentation: "Isolation Forest is an -#' algorithm originally developed for outlier detection that -#' consists in splitting sub-samples of the data according to some -#' attribute/feature/column at random". Basically, it calculates -#' how quickly each cell gets split off from the rest of the group -#' when the dataset is randomly partitioned by a hyperplane. There -#' are many adjustable parameters (e.g. number of trees, -#' dimensionality of the splits, etc), but the defaults usually -#' work pretty well. -#' -#' @details If \code{use_pcs} is turned on, it will use all available -#' PCs in the PCA reduced dimension slot if it exists, and run -#' \link[scater]{runPCA} with default settings and use that if it -#' does not. -#' -#' \code{prediction_thresh} is the threshold to use on the -#' isoforest score predictions -#' -#' @param sce a SingleCellExperiment object containing a logcounts -#' matrix -#' -#' @param dimred if specified, plot the specified reduced dimension -#' with \link[scater]{plotReducedDim}, colored by outlier score -#' -#' @param use_pcs if TRUE, run the outlier detection in principal -#' component space. Otherwise, use logcounts. -#' -#' @param prediction_thresh threshold to use for binary outlier -#' calling -#' -#' @param plot if TRUE and a dimred is specified, plot the isoscore -#' against the specified dimred -#' +#' @description This function calculates outlier scores for each cell using +#' \link[isotree]{isolation.forest}. Isolation forests explicitly try to +#' estimate the isolation of datapoints, in contrast to estimating the full +#' distribution of the data. To quote the isotree documentation: "Isolation +#' Forest is an algorithm originally developed for outlier detection that +#' consists in splitting sub-samples of the data according to some +#' attribute/feature/column at random." Basically, it calculates how quickly +#' each cell gets split off from the rest of the group when the dataset is +#' randomly partitioned by a hyperplane. There are many adjustable parameters +#' (e.g. number of trees, dimensionality of the splits, etc), but the defaults +#' usually work pretty well. +#' @param sce a SingleCellExperiment object containing a logcounts matrix +#' @param group.var if specified, run the outlier scoring by group (e.g. cell +#' cluster) instead of across all cells globally. +#' @param use_pcs if TRUE, run the outlier detection in principal component +#' space. Otherwise, use logcounts. +#' @param prediction_thresh threshold to use for binary outlier calling #' @param ... arguments to pass to \link[isotree]{isolation.forest} +#' @details If \code{use_pcs} is turned on, it will use all available PCs in the +#' PCA reduced dimension slot if it exists, and run \link[scater]{runPCA} with +#' default settings and use that if it does not. #' -#' @return the SingleCellExperiment object with \code{outlier_score} -#' and \code{is_outlier} added to the colData. Also, the resulting -#' isotree model is attached to the object's metadata. +#' \code{prediction_thresh} is the threshold to use on the isoforest score +#' predictions +#' @returns the SingleCellExperiment object with \code{outlier_score} and +#' \code{is_outlier} added to the colData. Also, the resulting isotree model +#' is attached to the object's metadata. #' #' @examples #' library(scater) @@ -57,31 +42,56 @@ #' #' query_data <- runPCA(query_data) #' -#' query_data <- calculateOutlierScore( -#' query_data, -#' plot = TRUE, -#' dimred = "PCA", -#' use_pcs = TRUE -#' ) -#' -#' @importFrom scater runPCA plotReducedDim -#' @importFrom SingleCellExperiment SingleCellExperiment -#' reducedDimNames reducedDim logcounts -#' @importFrom S4Vectors metadata -#' @importFrom BiocGenerics t -#' @importFrom isotree isolation.forest -#' +#' query_data <- calculateOutlierScore(query_data, use_pcs = TRUE) #' @export -calculateOutlierScore <- - function(sce, - dimred = NULL, - use_pcs = FALSE, - plot = TRUE, - prediction_thresh = 0.5, - ...) -{ +calculateOutlierScore <- function(sce, + group.var = NULL, + use_pcs = FALSE, + prediction_thresh = 0.5, + ...) { + if (!is.null(group.var)) { + if (is.null(colnames(sce))) stop("This function requires the input SCE to have unique column names when calculating group-wise outlier scores.") + + # There's probably a better way to split an sce by a factor... + # Anyway, manually passing it through factor() like this way retains NAs. + cluster_list <- lapply( + split( + colData(sce), + factor(colData(sce)[[group.var]], + exclude = NULL + ) + ), + \(x) sce[, rownames(x)] + ) + + # TODO: assess if it would be worth making the threshold adaptive. Different + # clusters can have different distributions of outlier scores. + score_list <- lapply(cluster_list, + calculateOutlierScore, + group.var = NULL, + use_pcs = use_pcs, + prediction_thresh = prediction_thresh + ) + + score_df <- Reduce( + lapply( + score_list, + \(x) colData(x)[, c("outlier_score", "is_outlier")] + ), + f = rbind + ) + + # If we were using an ID column instead of column names of the input we could + # do a join. + score_df <- score_df[rownames(colData(sce)), ] + colData(sce) <- cbind(colData(sce), score_df) + + S4Vectors::metadata(sce)$cluster_isotree_model <- lapply(score_list, S4Vectors::metadata) + + return(sce) + } + if (use_pcs) { - if (!("PCA" %in% SingleCellExperiment::reducedDimNames(sce))) { sce <- scater::runPCA(sce) } @@ -93,114 +103,18 @@ calculateOutlierScore <- SingleCellExperiment::logcounts() |> BiocGenerics::t() } - + isotree_res <- X |> isotree::isolation.forest( output_score = TRUE, ... ) - - ## V These scores are the same as running - ## predict(isotree_res$model, X) + + # V These scores are the same as running predict(isotree_res$model, X) sce$outlier_score <- isotree_res$scores sce$is_outlier <- isotree_res$scores >= prediction_thresh - - if (!is.null(dimred) && plot) { - ## TODO: make this plot work when scater is only in the - ## Suggests, or re-do it manually with ggplot. - p <- scater::plotReducedDim(sce, dimred, - color_by = "outlier_score", - shape_by = "is_outlier" - ) - print(p) - } - + S4Vectors::metadata(sce)$isotree_model <- isotree_res$model - - sce -} - -#' Calculate cell outlier scores using isolation forests by group -#' -#' @description This function calculates outlier scores for each cell BY CLUSTER -#' using \link[isotree]{isolation.forest}. See [calculateOutlierScore()] for -#' more detail on isoforests. This function calls that one on each cluster -#' separately. So instead of outlierness from the full dataset overall, this -#' function captures outlierness from the cell's cluster. -#' @inheritParams calculateOutlierScore -#' @param clusterVar the name of the cluster variable in the colData of the sce -#' input. -#' @details This function treats NAs in clusterVar as a separate level and -#' applies the outlier scoring to them accordingly. The isolation forest model -#' objects are returned as a list in the metadata. -#' -#' @examples -#' # TODO update this example. -#' library(scater) -#' library(scran) -#' library(scRNAseq) -#' -#' # Load data -#' sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE) -#' -#' # Divide the data into reference and query datasets -#' set.seed(100) -#' query_data <- logNormCounts(sce) -#' -#' query_data <- runPCA(query_data) -#' -#' query_data <- calculateOutlierScoreByCluster(query_data, -#' clusterVar = "reclustered.broad", -#' plot = TRUE, -#' dimred = "PCA", -#' use_pcs = TRUE) -#' @export -calculateOutlierScoreByCluster <- function( - sce, - clusterVar, - dimred = NULL, - use_pcs = FALSE, - plot = TRUE, - prediction_thresh = 0.5, - ...) { - - if (is.null(colnames(sce))) stop("This function requires the input SCE to have column names.") - - # There's probably a better way to split an sce by a factor... - # Anyway, manually passing it through factor() like this way retains NAs. - cluster_list = split(colData(sce), - factor(colData(sce)[[clusterVar]], - exclude = NULL)) |> - lapply(\(x) sce[,rownames(x)]) - - # TODO: assess if it would be worth making the threshold adaptive. Different - # clusters can have different distributions of outlier scores. - score_list = lapply(cluster_list, calculateOutlierScore, - plot = FALSE, - dimred = dimred, - use_pcs = use_pcs, - prediction_thresh = prediction_thresh) - - score_df = score_list |> - lapply(\(x) colData(x)[,c("outlier_score", "is_outlier")]) |> - Reduce(f = rbind) - - # If we were using an ID column instead of column names of the input we could - # do a join. - score_df = score_df[rownames(colData(sce)),] - colData(sce) = cbind(colData(sce), score_df) - - S4Vectors::metadata(sce)$cluster_isotree_model = score_list |> lapply(S4Vectors::metadata) - - if (!is.null(dimred) && plot) { - # TODO make this plot work when scater is only in the Suggests, or re-do - # it manually with ggplot. - p <- scater::plotReducedDim(sce, dimred, - color_by = "outlier_score", - shape_by = "is_outlier" - ) - print(p) - } - - return(sce) + + return(sce) } diff --git a/man/calculateOutlierScore.Rd b/man/calculateOutlierScore.Rd index 7ac6b8d..0df0361 100644 --- a/man/calculateOutlierScore.Rd +++ b/man/calculateOutlierScore.Rd @@ -6,59 +6,50 @@ \usage{ calculateOutlierScore( sce, - dimred = NULL, + group.var = NULL, use_pcs = FALSE, - plot = TRUE, prediction_thresh = 0.5, ... ) } \arguments{ -\item{sce}{a SingleCellExperiment object containing a logcounts -matrix} +\item{sce}{a SingleCellExperiment object containing a logcounts matrix} -\item{dimred}{if specified, plot the specified reduced dimension -with \link[scater]{plotReducedDim}, colored by outlier score} +\item{group.var}{if specified, run the outlier scoring by group (e.g. cell +cluster) instead of across all cells globally.} -\item{use_pcs}{if TRUE, run the outlier detection in principal -component space. Otherwise, use logcounts.} +\item{use_pcs}{if TRUE, run the outlier detection in principal component +space. Otherwise, use logcounts.} -\item{plot}{if TRUE and a dimred is specified, plot the isoscore -against the specified dimred} - -\item{prediction_thresh}{threshold to use for binary outlier -calling} +\item{prediction_thresh}{threshold to use for binary outlier calling} \item{...}{arguments to pass to \link[isotree]{isolation.forest}} } \value{ -the SingleCellExperiment object with \code{outlier_score} - and \code{is_outlier} added to the colData. Also, the resulting - isotree model is attached to the object's metadata. +the SingleCellExperiment object with \code{outlier_score} and + \code{is_outlier} added to the colData. Also, the resulting isotree model + is attached to the object's metadata. } \description{ -This function calculates outlier scores for each cell - using \link[isotree]{isolation.forest}. Isolation forests - explicitly try to estimate the isolation of datapoints, in - contrast to estimating the full distribution of the data. To - quote the isotree documentation: "Isolation Forest is an - algorithm originally developed for outlier detection that - consists in splitting sub-samples of the data according to some - attribute/feature/column at random". Basically, it calculates - how quickly each cell gets split off from the rest of the group - when the dataset is randomly partitioned by a hyperplane. There - are many adjustable parameters (e.g. number of trees, - dimensionality of the splits, etc), but the defaults usually - work pretty well. +This function calculates outlier scores for each cell using + \link[isotree]{isolation.forest}. Isolation forests explicitly try to + estimate the isolation of datapoints, in contrast to estimating the full + distribution of the data. To quote the isotree documentation: "Isolation + Forest is an algorithm originally developed for outlier detection that + consists in splitting sub-samples of the data according to some + attribute/feature/column at random." Basically, it calculates how quickly + each cell gets split off from the rest of the group when the dataset is + randomly partitioned by a hyperplane. There are many adjustable parameters + (e.g. number of trees, dimensionality of the splits, etc), but the defaults + usually work pretty well. } \details{ -If \code{use_pcs} is turned on, it will use all available - PCs in the PCA reduced dimension slot if it exists, and run - \link[scater]{runPCA} with default settings and use that if it - does not. +If \code{use_pcs} is turned on, it will use all available PCs in the + PCA reduced dimension slot if it exists, and run \link[scater]{runPCA} with + default settings and use that if it does not. - \code{prediction_thresh} is the threshold to use on the - isoforest score predictions + \code{prediction_thresh} is the threshold to use on the isoforest score + predictions } \examples{ library(scater) @@ -74,11 +65,5 @@ query_data <- logNormCounts(sce) query_data <- runPCA(query_data) -query_data <- calculateOutlierScore( - query_data, - plot = TRUE, - dimred = "PCA", - use_pcs = TRUE -) - +query_data <- calculateOutlierScore(query_data, use_pcs = TRUE) } diff --git a/man/calculateOutlierScoreByCluster.Rd b/man/calculateOutlierScoreByCluster.Rd deleted file mode 100644 index 54388b1..0000000 --- a/man/calculateOutlierScoreByCluster.Rd +++ /dev/null @@ -1,68 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/outlierDetection.R -\name{calculateOutlierScoreByCluster} -\alias{calculateOutlierScoreByCluster} -\title{Calculate cell outlier scores using isolation forests by group} -\usage{ -calculateOutlierScoreByCluster( - sce, - clusterVar, - dimred = NULL, - use_pcs = FALSE, - plot = TRUE, - prediction_thresh = 0.5, - ... -) -} -\arguments{ -\item{sce}{a SingleCellExperiment object containing a logcounts matrix} - -\item{clusterVar}{the name of the cluster variable in the colData of the sce -input.} - -\item{dimred}{if specified, plot the specified reduced dimension with -\link[scater]{plotReducedDim}, colored by outlier score} - -\item{use_pcs}{if TRUE, run the outlier detection in principal component -space. Otherwise, use logcounts.} - -\item{plot}{if TRUE and a dimred is specified, plot the isoscore against the -specified dimred} - -\item{prediction_thresh}{threshold to use for binary outlier calling} - -\item{...}{arguments to pass to \link[isotree]{isolation.forest}} -} -\description{ -This function calculates outlier scores for each cell BY CLUSTER - using \link[isotree]{isolation.forest}. See [calculateOutlierScore()] for - more detail on isoforests. This function calls that one on each cluster - separately. So instead of outlierness from the full dataset overall, this - function captures outlierness from the cell's cluster. -} -\details{ -This function treats NAs in clusterVar as a separate level and - applies the outlier scoring to them accordingly. The isolation forest model - objects are returned as a list in the metadata. -} -\examples{ -# TODO update this example. -library(scater) -library(scran) -library(scRNAseq) - -# Load data -sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE) - -# Divide the data into reference and query datasets -set.seed(100) -query_data <- logNormCounts(sce) - -query_data <- runPCA(query_data) - -query_data <- calculateOutlierScoreByCluster(query_data, - clusterVar = "reclustered.broad", - plot = TRUE, - dimred = "PCA", - use_pcs = TRUE) -} From 51dad318092e71ab5ba9f7d3a67e82823c4074d2 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Wed, 17 Apr 2024 15:44:11 -0400 Subject: [PATCH 06/11] add group.var argument, remove plotting/pipes/separate function --- man/calculateOutlierScore.Rd | 48 ++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/man/calculateOutlierScore.Rd b/man/calculateOutlierScore.Rd index 0df0361..0138565 100644 --- a/man/calculateOutlierScore.Rd +++ b/man/calculateOutlierScore.Rd @@ -15,20 +15,54 @@ calculateOutlierScore( \arguments{ \item{sce}{a SingleCellExperiment object containing a logcounts matrix} +<<<<<<< HEAD \item{group.var}{if specified, run the outlier scoring by group (e.g. cell cluster) instead of across all cells globally.} +======= +<<<<<<< HEAD +\item{dimred}{if specified, plot the specified reduced dimension +with \link[scater]{plotReducedDim}, colored by outlier score} +======= +\item{group.var}{if specified, run the outlier scoring by group (e.g. cell +cluster) instead of across all cells globally.} +>>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style) +>>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function) \item{use_pcs}{if TRUE, run the outlier detection in principal component space. Otherwise, use logcounts.} +<<<<<<< HEAD \item{prediction_thresh}{threshold to use for binary outlier calling} +======= +<<<<<<< HEAD +\item{plot}{if TRUE and a dimred is specified, plot the isoscore +against the specified dimred} + +\item{prediction_thresh}{threshold to use for binary outlier +calling} +======= +\item{prediction_thresh}{threshold to use for binary outlier calling} +>>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style) +>>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function) \item{...}{arguments to pass to \link[isotree]{isolation.forest}} } \value{ +<<<<<<< HEAD +the SingleCellExperiment object with \code{outlier_score} and + \code{is_outlier} added to the colData. Also, the resulting isotree model + is attached to the object's metadata. +======= +<<<<<<< HEAD +the SingleCellExperiment object with \code{outlier_score} + and \code{is_outlier} added to the colData. Also, the resulting + isotree model is attached to the object's metadata. +======= the SingleCellExperiment object with \code{outlier_score} and \code{is_outlier} added to the colData. Also, the resulting isotree model is attached to the object's metadata. +>>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style) +>>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function) } \description{ This function calculates outlier scores for each cell using @@ -65,5 +99,19 @@ query_data <- logNormCounts(sce) query_data <- runPCA(query_data) +<<<<<<< HEAD +query_data <- calculateOutlierScore(query_data, use_pcs = TRUE) +======= +<<<<<<< HEAD +query_data <- calculateOutlierScore( + query_data, + plot = TRUE, + dimred = "PCA", + use_pcs = TRUE +) + +======= query_data <- calculateOutlierScore(query_data, use_pcs = TRUE) +>>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style) +>>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function) } From 915b1f0291865ea11c78b43c108ec873baf1fe25 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Wed, 17 Apr 2024 16:03:37 -0400 Subject: [PATCH 07/11] fix git errors --- man/calculateOutlierScore.Rd | 48 ------------------------------------ 1 file changed, 48 deletions(-) diff --git a/man/calculateOutlierScore.Rd b/man/calculateOutlierScore.Rd index 0138565..0df0361 100644 --- a/man/calculateOutlierScore.Rd +++ b/man/calculateOutlierScore.Rd @@ -15,54 +15,20 @@ calculateOutlierScore( \arguments{ \item{sce}{a SingleCellExperiment object containing a logcounts matrix} -<<<<<<< HEAD \item{group.var}{if specified, run the outlier scoring by group (e.g. cell cluster) instead of across all cells globally.} -======= -<<<<<<< HEAD -\item{dimred}{if specified, plot the specified reduced dimension -with \link[scater]{plotReducedDim}, colored by outlier score} -======= -\item{group.var}{if specified, run the outlier scoring by group (e.g. cell -cluster) instead of across all cells globally.} ->>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style) ->>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function) \item{use_pcs}{if TRUE, run the outlier detection in principal component space. Otherwise, use logcounts.} -<<<<<<< HEAD \item{prediction_thresh}{threshold to use for binary outlier calling} -======= -<<<<<<< HEAD -\item{plot}{if TRUE and a dimred is specified, plot the isoscore -against the specified dimred} - -\item{prediction_thresh}{threshold to use for binary outlier -calling} -======= -\item{prediction_thresh}{threshold to use for binary outlier calling} ->>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style) ->>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function) \item{...}{arguments to pass to \link[isotree]{isolation.forest}} } \value{ -<<<<<<< HEAD -the SingleCellExperiment object with \code{outlier_score} and - \code{is_outlier} added to the colData. Also, the resulting isotree model - is attached to the object's metadata. -======= -<<<<<<< HEAD -the SingleCellExperiment object with \code{outlier_score} - and \code{is_outlier} added to the colData. Also, the resulting - isotree model is attached to the object's metadata. -======= the SingleCellExperiment object with \code{outlier_score} and \code{is_outlier} added to the colData. Also, the resulting isotree model is attached to the object's metadata. ->>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style) ->>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function) } \description{ This function calculates outlier scores for each cell using @@ -99,19 +65,5 @@ query_data <- logNormCounts(sce) query_data <- runPCA(query_data) -<<<<<<< HEAD -query_data <- calculateOutlierScore(query_data, use_pcs = TRUE) -======= -<<<<<<< HEAD -query_data <- calculateOutlierScore( - query_data, - plot = TRUE, - dimred = "PCA", - use_pcs = TRUE -) - -======= query_data <- calculateOutlierScore(query_data, use_pcs = TRUE) ->>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style) ->>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function) } From 91362e30019e56f3f09195f36bb43ab17c29c63b Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Wed, 17 Apr 2024 16:26:32 -0400 Subject: [PATCH 08/11] fix git bug --- R/calculatePairwiseDistancesAndPlotDensity.R | 146 +++++++++---------- 1 file changed, 69 insertions(+), 77 deletions(-) diff --git a/R/calculatePairwiseDistancesAndPlotDensity.R b/R/calculatePairwiseDistancesAndPlotDensity.R index ca5c6dc..2cafcf2 100644 --- a/R/calculatePairwiseDistancesAndPlotDensity.R +++ b/R/calculatePairwiseDistancesAndPlotDensity.R @@ -114,82 +114,74 @@ calculatePairwiseDistancesAndPlotDensity <- cell_type_reference, distance_metric, correlation_method = c("pearson", "spearman")) -{ - ## Sanity checks - - ## Check if query_data is a SingleCellExperiment object - if (!is(query_data, "SingleCellExperiment")) { - stop("query_data must be a SingleCellExperiment object.") - } + { + ## Sanity checks - ## Check if reference_data is a SingleCellExperiment object - if (!is(reference_data, "SingleCellExperiment")) { - stop("reference_data must be a SingleCellExperiment object.") - } - - ## Subset query and reference data to the specified cell type - query_data_subset <- - query_data[, !is.na(query_data[[query_cell_type_col]]) & - query_data[[query_cell_type_col]] == cell_type_query] - ref_data_subset <- - reference_data[, !is.na(reference_data[[ref_cell_type_col]]) & - reference_data[[ref_cell_type_col]] == cell_type_reference] - - ## Convert to matrix - query_mat <- t(as.matrix(assay(query_data_subset, "logcounts"))) - ref_mat <- t(as.matrix(assay(ref_data_subset, "logcounts"))) - - ## Combine query and reference matrices - combined_mat <- rbind(query_mat, ref_mat) - - ## Calculate pairwise distances or correlations for all comparisons - correlation_method <- match.arg(correlation_method) - if (distance_metric == "correlation") { - dist_matrix <- cor(t(combined_mat), method = correlation_method) - } else { - dist_matrix <- dist(combined_mat, method = distance_metric) + ## Check if query_data is a SingleCellExperiment object + if (!is(query_data, "SingleCellExperiment")) { + stop("query_data must be a SingleCellExperiment object.") + } + + ## Check if reference_data is a SingleCellExperiment object + if (!is(reference_data, "SingleCellExperiment")) { + stop("reference_data must be a SingleCellExperiment object.") + } + + ## Subset query and reference data to the specified cell type + query_data_subset <- + query_data[, !is.na(query_data[[query_cell_type_col]]) & + query_data[[query_cell_type_col]] == cell_type_query] + ref_data_subset <- + reference_data[, !is.na(reference_data[[ref_cell_type_col]]) & + reference_data[[ref_cell_type_col]] == cell_type_reference] + + ## Convert to matrix + query_mat <- t(as.matrix(assay(query_data_subset, "logcounts"))) + ref_mat <- t(as.matrix(assay(ref_data_subset, "logcounts"))) + + ## Combine query and reference matrices + combined_mat <- rbind(query_mat, ref_mat) + + ## Calculate pairwise distances or correlations for all comparisons + correlation_method <- match.arg(correlation_method) + if (distance_metric == "correlation") { + dist_matrix <- cor(t(combined_mat), method = correlation_method) + } else { + dist_matrix <- dist(combined_mat, method = distance_metric) + } + + ## Convert dist_matrix to a square matrix + dist_matrix <- as.matrix(dist_matrix) + + ## Extract the distances or correlations for the different pairwise + ## comparisons + num_query_cells <- nrow(query_mat) + num_ref_cells <- nrow(ref_mat) + dist_query_query <- + dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)] + dist_ref_ref <- + dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), + (num_query_cells + 1):(num_query_cells + num_ref_cells)] + dist_query_ref <- + dist_matrix[seq_len(num_query_cells), + (num_query_cells + 1):(num_query_cells + num_ref_cells)] + + ## Create data frame for plotting + dist_df <- data.frame( + Comparison = c(rep("Query vs Query", length(dist_query_query)), + rep("Reference vs Reference", length(dist_ref_ref)), + rep("Query vs Reference", length(dist_query_ref))), + Distance = c(as.vector(dist_query_query), + as.vector(dist_ref_ref), + as.vector(dist_query_ref)) + ) + + ## Plot density plots + ggplot(dist_df, aes(x = .data$Distance, color = .data$Comparison)) + + geom_density() + + labs(x = ifelse(distance_metric == "correlation", + paste(correlation_method, "correlation"), + "Distance"), y = "Density", + title = "Pairwise Distance Analysis and Density Visualization") + + theme_bw() } - - ## Convert dist_matrix to a square matrix - dist_matrix <- as.matrix(dist_matrix) - - ## Extract the distances or correlations for the different pairwise - ## comparisons - num_query_cells <- nrow(query_mat) - num_ref_cells <- nrow(ref_mat) -<<<<<<< HEAD - dist_query_query <- - dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)] - dist_ref_ref <- - dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), - (num_query_cells + 1):(num_query_cells + num_ref_cells)] - dist_query_ref <- - dist_matrix[seq_len(num_query_cells), - (num_query_cells + 1):(num_query_cells + num_ref_cells)] -======= - dist_query_query <- dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)] - dist_ref_ref <- dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), - (num_query_cells + 1):(num_query_cells + num_ref_cells)] - dist_query_ref <- dist_matrix[seq_len(num_query_cells), - (num_query_cells + 1):(num_query_cells + num_ref_cells)] ->>>>>>> 716f604 (Modify DESCRIPTION and fix couple of BiocCheck issues) - - ## Create data frame for plotting - dist_df <- data.frame( - Comparison = c(rep("Query vs Query", length(dist_query_query)), - rep("Reference vs Reference", length(dist_ref_ref)), - rep("Query vs Reference", length(dist_query_ref))), - Distance = c(as.vector(dist_query_query), - as.vector(dist_ref_ref), - as.vector(dist_query_ref)) - ) - - ## Plot density plots - ggplot(dist_df, aes(x = .data$Distance, color = .data$Comparison)) + - geom_density() + - labs(x = ifelse(distance_metric == "correlation", - paste(correlation_method, "correlation"), - "Distance"), y = "Density", - title = "Pairwise Distance Analysis and Density Visualization") + - theme_bw() -} From 14621120b1a5896e1c3187e08a26ea49f9f0ccab Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Wed, 17 Apr 2024 16:42:21 -0400 Subject: [PATCH 09/11] re-add mysteriously missing vignettes directory... --- vignettes/scDiagnostics.Rmd | 795 ++++++++++++++++++++++++++++++++++++ 1 file changed, 795 insertions(+) create mode 100644 vignettes/scDiagnostics.Rmd diff --git a/vignettes/scDiagnostics.Rmd b/vignettes/scDiagnostics.Rmd new file mode 100644 index 0000000..525f85e --- /dev/null +++ b/vignettes/scDiagnostics.Rmd @@ -0,0 +1,795 @@ +--- +title: "scDiagnostics: diagnostic functions to assess the quality of cell type annotations in single-cell RNA-seq data" +author: + - name: Anthony Christidis + affiliation: Center for Computational Biomedicine, Harvard Medical School + email: anthony-alexander_christidis@hms.harvard.edu + - name: Andrew Ghazi + affiliation: Harvard Stem Cell Institute, Harvard Medical School + - name: Smriti Chawla + affiliation: Center for Computational Biomedicine, Harvard Medical School + - name: Nitesh Turaga + affiliation: Center for Computational Biomedicine, Harvard Medical School + - name: Ludwig Geistlinger + affiliation: Center for Computational Biomedicine, Harvard Medical School + - name: Robert Gentleman + affiliation: Center for Computational Biomedicine, Harvard Medical School +package: scDiagnostics +output: + BiocStyle::html_document: + toc: true + toc_float: true +vignette: > + %\VignetteIndexEntry{scDiagnostics} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +# Purpose + +Annotation transfer from a reference dataset for the cell type +annotation of a new query single-cell RNA-sequencing (scRNA-seq) +experiment is an integral component of the typical analysis +workflow. The approach provides a fast, automated, and reproducible +alternative to the manual annotation of cell clusters based on marker +gene expression. However, dataset imbalance and undiagnosed +incompatibilities between query and reference dataset can lead to +erroneous annotation and distort downstream applications. + +The `scDiagnostics` package provides functionality for the systematic +evaluation of cell type assignments in scRNA-seq data. +`scDiagnostics` offers a suite of diagnostic functions to assess +whether both (query and reference) datasets are aligned, ensuring that +annotations can be transferred reliably. `scDiagnostics` also provides +functionality to assess annotation ambiguity, cluster heterogeneity, +and marker gene alignment. The implemented functionality helps +researchers to determine how accurately cells from a new scRNA-seq +experiment can be assigned to known cell types. + +# Installation + +To install the development version of the package from Github, use the following +command: + +```{r dev_version_install, eval = FALSE} +BiocManager::install("ccb-hms/scDiagnostics") +``` + +NOTE: you will need the [remotes](https://cran.r-project.org/web/packages/remotes/index.html) +package to install from GitHub. + +To build the package vignettes upon installation use: + +```{r build_vignettes, eval=FALSE} +BiocManager::install("ccb-hms/scDiagnostics", + build_vignettes = TRUE, + dependencies = TRUE) +``` + +# Usage + +To explore the capabilities of the scDiagnostics package, you can load +your own data or utilize publicly available datasets obtained from the +scRNAseq R package. In this guide, we will demonstrate how to use +scDiagnostics with such datasets, which serve as valuable resources +for exploring the package and assessing the appropriateness of cell +type assignments. + +```{r libraries, message = FALSE} +library(scDiagnostics) +library(celldex) +library(corrplot) +library(scater) +library(scran) +library(scRNAseq) +library(AUCell) +library(RColorBrewer) +library(SingleR) +``` + +## Scatter Plot: QC stats vs. Annotation Scores + +Here, we will consider the Human Primary Cell Atlas (Mabbott et +al. 2013) as a reference dataset and our query dataset consists of +Haematopoietic stem and progenitor cells from (Bunis DG et al. 2021). + +In scRNA-seq studies, assessing the quality of cells is important for +accurate downstream analyses. At the same time, assigning accurate +cell type labels based on gene expression profiles is an integral +aspect of scRNA-seq data interpretation. Generally, these two are +performed independently of each other. The rationale behind this +function is to inspect whether certain QC (Quality Control) criteria +impact the confidence level of cell type annotations. + +For instance, it is reasonable to hypothesize that higher library +sizes could contribute to increased annotation confidence due to +enhanced statistical power for identifying cell type-specific gene +expression patterns, as evident in the scatter plot below. + +```{r Scatter-Plot-LibrarySize-Vs-Annotation-Scores, warning=FALSE, message=FALSE} + +# load reference dataset +ref_data <- celldex::HumanPrimaryCellAtlasData() + +# Load query dataset (Bunis haematopoietic stem and progenitor cell +# data) from Bunis DG et al. (2021). Single-Cell Mapping of +# Progressive Fetal-to-Adult Transition in Human Naive T Cells Cell +# Rep. 34(1): 108573 + +query_data <- BunisHSPCData() +rownames(query_data) <- rowData(query_data)$Symbol + +# Add QC metrics to query data +query_data <- addPerCellQCMetrics(query_data) + +# Log transform query dataset +query_data <- logNormCounts(query_data) + +# Run SingleR to predict cell types +pred <- SingleR(query_data, ref_data, labels = ref_data$label.main) + +# Assign predicted labels to query data +colData(query_data)$pred.labels <- pred$labels + +# Get annotation scores +scores <- apply(pred$scores, 1, max) + +# Assign scores to query data +colData(query_data)$cell_scores <- scores + +# Create a scatter plot between library size and annotation scores +p1 <- plotQCvsAnnotation( + query_data = query_data, + qc_col = "total", + label_col = "pred.labels", + score_col = "cell_scores", + label = NULL +) +p1 + xlab("Library Size") +``` + +However, certain QC metrics, such as the proportion of mitochondrial +genes, may require careful consideration as they can sometimes be +associated with cellular states or functions rather than noise. The +interpretation of mitochondrial content should be context-specific and +informed by biological knowledge. + +In next analysis, we investigated the relationship between +mitochondrial percentage and cell type annotation scores using liver +tissue data from He S et al. 2020. Notably, we observed high +annotation scores for macrophages and monocytes. These findings align +with the established biological characteristic of high mitochondrial +activity in macrophages and monocytes, adding biological context to +our results. + +```{r QC-Annotation-Scatter-Mito, warning=FALSE, message=FALSE} +# load query dataset +query_data <- HeOrganAtlasData( + tissue = c("Liver"), + ensembl = FALSE, + location = TRUE +) + +# Add QC metrics to query data + +mito_genes <- rownames(query_data)[grep("^MT-", rownames(query_data))] +query_data <- unfiltered <- addPerCellQC(query_data,subsets = list(mt = mito_genes)) +qc <- quickPerCellQC(colData(query_data), sub.fields = "subsets_mt_percent") +query_data <- query_data[,!qc$discard] + +# Log transform query dataset +query_data <- logNormCounts(query_data) + +# Run SingleR to predict cell types +pred <- SingleR(query_data, ref_data, labels = ref_data$label.main) + +# Assign predicted labels to query data +colData(query_data)$pred.labels <- pred$labels + +# Get annotation scores +scores <- apply(pred$scores, 1, max) + +# Assign scores to query data +colData(query_data)$cell_scores <- scores + +# Create a new column for the labels so it is easy to distinguish +# between Macrophoges, Monocytes and other cells +query_data$label_category <- + ifelse(query_data$pred.labels %in% c("Macrophage", "Monocyte"), + query_data$pred.labels, + "Other cells") + + +# Define custom colors for cell type labels +cols <- c("Other cells" = "grey", "Macrophage" = "green", "Monocyte" = "red") + +# Generate scatter plot for all cell types +p1 <- plotQCvsAnnotation( + query_data = query_data, + qc_col = "subsets_mt_percent", + label_col = "label_category", + score_col = "cell_scores", + label = NULL) + + scale_color_manual(values = cols) + + xlab("subsets_mt_percent") +p1 +``` + +## Examining Distribution of QC stats and Annotation Scores + +In addition to the scatter plot, we can gain further insights into the +gene expression profiles by visualizing the distribution of user +defined QC stats and annotation scores for all the cell types or +specific cell types. This allows us to examine the variation and +patterns in expression levels and scores across cells assigned to the +cell type of interest. + +To accomplish this, we create two separate histograms. The first +histogram displays the distribution of the annotation scores. + +The second histogram visualizes the distribution of QC stats. This +provides insights into the overall gene expression levels for the +specific cell type. Here in this particular example we are +investigating percentage of mitochondrial genes. + +By examining the histograms, we can observe the range, shape, and +potential outliers in the distribution of both annotation scores and +QC stats. This allows us to assess the appropriateness of the cell +type assignments and identify any potential discrepancies or patterns +in the gene expression profiles for the specific cell type. + +```{r Mito-Genes-Vs-Annotation, warning=FALSE, message=FALSE} +# Generate histogram +histQCvsAnnotation(query_data = query_data, qc_col = "subsets_mt_percent", + label_col = "pred.labels", + score_col = "cell_scores", + label = NULL) +``` + +The right-skewed distribution for mitochondrial percentages and a +left-skewed distribution for annotation scores in above histograms +suggest that most cells have lower mitochondrial contamination and +higher confidence in their assigned cell types. + +## Exploring Gene Expression Distribution + +This function helps user to explore the distribution of gene +expression values for a specific gene of interest across all the cells +in both reference and query datasets and within specific cell +types. This helps to evaluate whether the distributions are similar or +aligned between the datasets. Discrepancies in distribution patterns +may indicate potential incompatibilities or differences between the +datasets. + +The function also allows users to narrow down their analysis to +specific cell types of interest. This enables investigation of whether +alignment between the query and reference datasets is consistent not +only at a global level but also within specific cell types. + +```{r Gene-Expression-Histogram, warning=FALSE, message=FALSE} + +# Load data +sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE) + +# Divide the data into reference and query datasets +set.seed(100) +indices <- sample(ncol(assay(sce)), size = floor(0.7 * ncol(assay(sce))), replace = FALSE) +ref_data <- sce[, indices] +query_data <- sce[, -indices] + +# Log-transform datasets +ref_data <- logNormCounts(ref_data) +query_data <- logNormCounts(query_data) + +# Run PCA +ref_data <- runPCA(ref_data) +query_data <- runPCA(query_data) + +# Get cell type scores using SingleR +pred <- SingleR(query_data, ref_data, labels = ref_data$reclustered.broad) +pred <- as.data.frame(pred) + +# Assign labels to query data +colData(query_data)$labels <- pred$labels + +# Generate density plots +plotMarkerExpression(reference_data = ref_data, + query_data = query_data, + reference_cell_labels = "reclustered.broad", + query_cell_labels = "labels", + gene_name = "MS4A1", + label = "B_and_plasma") +``` + +In the provided example, we examined the distribution of expression +values for the gene MS4A1, a marker for naive B cells, in both the +query and reference datasets. Additionally, we also looked at the +distribution of MS4A1 expression in the B_and_plasma cell type. We +observed overlapping distributions in both cases, suggesting alignment +between the reference and query datasets. + +## Evaluating Alignment Between Reference and Query Datasets in Terms of Highly Variable Genes + +We are assessing the similarity or alignment between two datasets, the +reference dataset, and the query dataset, in terms of highly variable +genes (HVGs). We calculate the overlap coefficient between the sets of +highly variable genes in the reference and query datasets. The overlap +coefficient quantifies the degree of overlap or similarity between +these two sets of genes. A value closer to 1 indicates a higher degree +of overlap, while a value closer to 0 suggests less overlap. The +computed overlap coefficient is printed, providing a numerical measure +of how well the highly variable genes in the reference and query +datasets align. In this case, the overlap coefficient is 0.62, +indicating a moderate level of overlap. + +```{r HVG overlap, warning=FALSE, message=FALSE} + +# Selecting highly variable genes +ref_var <- getTopHVGs(ref_data, n=2000) +query_var <- getTopHVGs(query_data, n=2000) + +# Compute the overlap coefficient +overlap_coefficient <- calculateHVGOverlap(reference_genes = ref_var, + query_genes = query_var) +print(overlap_coefficient) +``` + +In the provided example, we examined the distribution of expression +values for the gene MS4A1, a marker for naive B cells, in both the +query and reference datasets. Additionally, we also looked at the +distribution of MS4A1 expression in the B_and_plasma cell type. We +observed overlapping distributions in both cases, suggesting alignment +between the reference and query datasets. + +## Visualize Gene Expression on Dimensional Reduction Plot + +To gain insights into the gene expression patterns and their +representation in a dimensional reduction space, we can utilize the +plotGeneExpressionDimred function. This function allows us to plot the +gene expression values of a specific gene on a dimensional reduction +plot generated using methods like t-SNE, UMAP, or PCA. Each single +cell is color-coded based on its expression level of the gene of +interest. + +In the provided example, we are visualizing the gene expression values +of the gene "VPREB3" on a PCA plot. The PCA plot represents the cells +in a lower-dimensional space, where the x-axis corresponds to the +first principal component (Dimension 1) and the y-axis corresponds to +the second principal component (Dimension 2). Each cell is represented +as a point on the plot, and its color reflects the expression level of +the gene "VPREB3," ranging from low (lighter color) to high (darker +color). + +```{r Gene-Expression-Scatter, warning=FALSE, message=FALSE} +# Generate dimension reduction plot color code by gene expression +plotGeneExpressionDimred(se_object = query_data, + method = "PCA", + n_components = c(1, 2), + feature = "VPREB3") +``` + +The dimensional reduction plot allows us to observe how the gene +expression of VPREB3 is distributed across the cells and whether any +clusters or patterns emerge in the data. + +## Visualize Gene Sets or Pathway Scores on Dimensional Reduction Plot + +In addition to examining individual gene expression patterns, it is +often useful to assess the collective activity of gene sets or +pathways within single cells. This can provide insights into the +functional states or biological processes associated with specific +cell types or conditions. To facilitate this analysis, the +scDiagnostics package includes a function called plotGeneSetScores +that enables the visualization of gene set or pathway scores on a +dimensional reduction plot. + +The plotGeneSetScores function allows you to plot gene set or pathway +scores on a dimensional reduction plot generated using methods such as +PCA, t-SNE, or UMAP. Each single cell is color-coded based on its +scores for specific gene sets or pathways. This visualization helps +identify the heterogeneity and patterns of gene set or pathway +activity within the dataset, potentially revealing subpopulations with +distinct functional characteristics. + +```{r Pathway-Scores-on-Dimensional-Reduction-Scatter, warning=FALSE, message=FALSE} + +# Compute scores using AUCell +expression_matrix <- assay(query_data, "logcounts") +cells_rankings <- AUCell_buildRankings(expression_matrix, plotStats = FALSE) + +# Generate gene sets +gene_set1 <- sample(rownames(expression_matrix), 10) +gene_set2 <- sample(rownames(expression_matrix), 20) + +gene_sets <- list(geneSet1 = gene_set1, + geneSet2 = gene_set2) + +# Calculate AUC scores for gene sets +cells_AUC <- AUCell_calcAUC(gene_sets, cells_rankings) + +# Assign scores to colData +colData(query_data)$geneSetScores <- assay(cells_AUC)["geneSet1", ] + +# Plot gene set scores on PCA +plotGeneSetScores(se_object = query_data, + method = "PCA", + feature = "geneSetScores") +``` + +In the provided example, we demonstrate the usage of the +plotGeneSetScores function using the AUCell package to compute gene +set or pathway scores. Custom gene sets are generated for +demonstration purposes, but users can provide their own gene set +scores using any method of their choice. It is important to ensure +that the scores are assigned to the colData of the reference or query +object and specify the correct feature name for visualization. + +By visualizing gene set or pathway scores on a dimensional reduction +plot, you can gain a comprehensive understanding of the functional +landscape within your single-cell gene expression dataset and explore +the relationships between gene set activities and cellular phenotypes. + +## Visualizing Reference and Query Cell Types using Multidimensional Scaling (MDS) + +This function performs Multidimensional Scaling (MDS) analysis on the +query and reference datasets to examine their similarity. The +dissimilarity matrix is calculated based on the correlation between +the datasets, representing the distances between cells in terms of +gene expression patterns. MDS is then applied to derive +low-dimensional coordinates for each cell. Subsequently, a scatter +plot is generated, where each data point represents a cell, and cell +types are color-coded using custom colors provided by the user. This +visualization enables the comparison of cell type distributions +between the query and reference datasets in a reduced-dimensional +space. + +The rationale behind this function is to visually assess the alignment +and relationships between cell types in the query and reference +datasets. + +```{r CMD-Scatter-Plot, warning=FALSE, message=FALSE} + +# Intersect the gene symbols to obtain common genes +common_genes <- intersect(ref_var, query_var) + +# Select desired cell types +selected_cell_types <- c("CD4", "CD8", "B_and_plasma") +ref_data_subset <- ref_data[common_genes, ref_data$reclustered.broad %in% selected_cell_types] +query_data_subset <- query_data[common_genes, query_data$labels %in% selected_cell_types] + +# Extract cell types for visualization +ref_labels <- ref_data_subset$reclustered.broad +query_labels <- query_data_subset$labels + +# Combine the cell type labels from both datasets +mdata <- c(paste("Query", query_labels), paste("Reference", ref_labels)) + +## Define the cell types and legend order +cell_types <- c("Query CD8", "Reference CD8", "Query CD4", "Reference CD4", "Query B_and_plasma", "Reference B_and_plasma") +legend_order <- cell_types + +## Define the colors for cell types +color_palette <- brewer.pal(length(cell_types), "Paired") +color_mapping <- setNames(color_palette, cell_types) +cell_type_colors <- color_mapping[cell_types] + +## Generate the MDS scatter plot with cell type coloring +visualizeCellTypeMDS(query_data = query_data_subset, + reference_data = ref_data_subset, + mdata = mdata, + cell_type_colors = cell_type_colors, + legend_order = legend_order) +``` + +Upon examining the MDS scatter plot, we observe that the CD4 and CD8 +cell types overlap to some extent.By observing the proximity or +overlap of different cell types, one can gain insights into their +potential relationships or shared characteristics. + +The selection of custom genes and desired cell types depends on the +user's research interests and goals. It allows for flexibility in +focusing on specific genes and examining particular cell types of +interest in the visualization. + +## Cell Type-specific Pairwise Correlation Analysis and Visualization + +This analysis aims to explore the correlation patterns between +different cell types in a single-cell gene expression dataset. The +goal is to compare the gene expression profiles of cells from a +reference dataset and a query dataset to understand the relationships +and similarities between various cell types. + +To perform the analysis, we start by computing the pairwise +correlations between the query and reference cells for selected cell +types ("CD4", "CD8", "B_and_plasma"). The Spearman correlation method +is used, user can also use Pearsons correlation coeefficient. + +This will return average correlation matrix which can be visulaized by +user's method of choice. Here, the results are visualized as a +correlation plot using the corrplot package. + + +```{r Cell-Type-Correlation-Analysis-Visualization, warning=FALSE, message=FALSE} +selected_cell_types <- c("CD4", "CD8", "B_and_plasma") +cor_matrix_avg <- computeAveragePairwiseCorrelation(query_data = query_data_subset, + reference_data = ref_data_subset, + query_cell_type_col = "labels", + ref_cell_type_col = "reclustered.broad", + cell_types = selected_cell_types, + correlation_method = "spearman") + +# Plot the pairwise average correlations using corrplot +corrplot(cor_matrix_avg, method = "number", tl.col = "black") +``` + +In this case, users have the flexibility to extract the gene +expression profiles of specific cell types from the reference and +query datasets and provide these profiles as input to the +function. Additionally, they can select their own set of genes that +they consider relevant for computing the pairwise correlations. For +demonstartion we have used common highly variable genes from reference +and query dataset. + +By providing their own gene expression profiles and choosing specific +genes, users can focus the analysis on the cell types and genes of +interest to their research question. + +## Pairwise Distance Analysis and Density Visualization + +This function serves to conduct a analysis of pairwise distances or +correlations between cells of specific cell types within a single-cell +gene expression dataset. By calculating these distances or +correlations, users can gain insights into the relationships and +differences in gene expression profiles between different cell +types. The function facilitates this analysis by generating density +plots, allowing users to visualize the distribution of distances or +correlations for various pairwise comparisons. + +The analysis offers the flexibility to select a particular cell type +for examination, and users can choose between different distance +metrics, such as "euclidean" or "manhattan," to calculate pairwise +distances. + +To illustrate, the function is applied to the cell type CD8 using the +euclidean distance metric in the example below. + +```{r Pairwise-Distance-Analysis-Density-Visualization, fig.width=8, message=FALSE, warning=FALSE} +calculatePairwiseDistancesAndPlotDensity(query_data = query_data_subset, + reference_data = ref_data_subset, + query_cell_type_col = "labels", + ref_cell_type_col = "reclustered.broad", + cell_type_query = "CD8", + cell_type_reference = "CD8", + distance_metric = "euclidean") +``` + +Alternatively, users can opt for the "correlation" distance metric, +which measures the similarity in gene expression profiles between +cells. + +To illustrate, the function is applied to the cell type CD8 using the +correlation distance metric in the example below. By selecting either +the "pearson" or "spearman" correlation method, users can emphasize +either linear or rank-based associations, respectively. + +```{r Pairwise-Distance-Correlation-Based-Density-Visualization, warning=FALSE, message=FALSE, fig.width=8} +calculatePairwiseDistancesAndPlotDensity(query_data = query_data_subset, + reference_data = ref_data_subset, + query_cell_type_col = "labels", + ref_cell_type_col = "reclustered.broad", + cell_type_query = "CD8", + cell_type_reference = "CD8", + distance_metric = "correlation", + correlation_method = "spearman") +``` + +By utilizing this function, users can explore the pairwise distances +between query and reference cells of a specific cell type and gain +insights into the distribution of distances through density +plots. This analysis aids in understanding the similarities and +differences in gene expression profiles for the selected cell type +within the query and reference datasets. + +## PC regression analysis + +Performing PC regression analysis on a SingleCellExperiment object +enables users to examine the relationship between a principal +component (PC) from the dimension reduction slot and an independent +variable of interest. By specifying the desired dependent variable as +one of the principal components (e.g., "PC1", "PC2", etc.) and +providing the corresponding independent variable from the colData slot +(e.g. "cell_type"), users can explore the associations between linear +structure in the single-cell gene expression dataset (reference and +query) and an independent variable of interest (e.g. cell type or +batch). + +The function prints two diagnostic plots by default: + +* a plot of the two PCs with the highest R^2^ with the specified independent variable +* a dot plot showing the R^2^ of each consecutive PC ~ indep.var regression + * Generally you should expect this plot to die off to near 0 before ~PC10 + * Interpretation example: If the R^2^ values are high (>=50%) + anywhere in PCs 1-5 and your independent variable is "batch", you + have batch effects! + +```{r Regression, warning=FALSE, message=FALSE} + +# Specify the dependent variables (principal components) and +# independent variable (e.g., "labels") +dep.vars <- paste0("PC", 1:12) +indep.var <- "labels" + +# Perform linear regression on multiple principal components +result <- regressPC(sce = query_data, + dep.vars = dep.vars, + indep.var = indep.var) + +# Print the summaries of the linear regression models and R-squared +# values + +# Summaries of the linear regression models +result$regression.summaries[[1]] + +# R-squared values +result$rsquared + +# Variance contributions for each principal component +result$var.contributions + +# Total variance explained +result$total.variance.explained +``` + +This analysis helps uncover whether there is a systematic variation in +PC values across different cell types. In the example above, we can +see that the four cell types are spread out across both PC1 and +PC2. Digging into the genes with high loadins on these PCs can help +explain the biological or technical factors driving cellular +heterogeneity. It can help identify PC dimensions that capture +variation specific to certain cell types or distinguish different +cellular states. + +Let's look at the genes driving PC1 by ordering the rotation matrix by +the absolute gene loadings for PC1: + +```{r} +pc_df <- attr(reducedDims(query_data)$PCA, "rotation")[,1:5] |> + as.data.frame() + +pc_df[order(abs(pc_df$PC1)),] |> + tail() +``` + +PC1 is mostly driven by NKG7 - Natural Killer Cell Granule Protein +7. This gene is important in CD8+ T cells, so that makes sense that +it's distinguishing the cell types shown. + +> Exercise: What genes are driving PC2? Do they make sense? + +```{r echo = FALSE, eval = FALSE} +pc_df[order(abs(pc_df$PC2)),] |> + tail() + +# It's IL32 mostly. +``` + +> Exercise: Try to use the command below to examine the spike on + PC5. What's going on there? + +`plotPCA(query_data, ncomponents = c(1,5), color_by = "labels")` + +```{r eval=FALSE, echo=FALSE} +plotPCA(query_data, ncomponents = c(1,5), color_by = "labels") +# The myeloid cells are shifted off from the other types. + +pc_df[order(abs(pc_df$PC5))] |> + tail() +# It's mostly driven by low GNLY expression in the myeloid cells. +``` + +## Outlier detection + +Isolation forests are a fast and effective method for detecting +outliers in high-dimensional data without relying on density +estimation. `calculateOutlierScore()` is a wrapper around the main +function of the [isotree +package](https://cran.r-project.org/web/packages/isotree/index.html). Here +we run it on the PC embeddings but, it can also be informative to run +it on the raw counts. Here we can see some outliers in PC1/2 space +flagged by the default `score > 0.5` threshold due to their clear +isolation (as well as a few that are isolated on the non-visualized +PCs). + +```{r} +query_data <- calculateOutlierScore(query_data, + use_pcs = TRUE, + plot = TRUE, + dimred = "PCA") +``` + +Columns `outlier_score` and `is_outlier` are added to the colData of +the output. The isotree model object itself is added to the metadata. + +One can pass different `dimred` names to change the plot, but running +an isolation in forest in e.g. tSNE or UMAP space does not make sense +due to the non-linearity of these approaches. Hence this is not +allowed - when `use_pcs = FALSE`, the logcounts are used instead. + +## Annotation entropy + +In order to assess the confidence of cell type predictions, we can use +the function `calculateCategorizationEntropy()`. This function +calculates the information entropy of assignment probabilities across +a set of cell types for each cell. If a set of class probabilities are +confident, the entropies will be low. + +This can be used to compare two sets of cell type assignments +(e.g. from different type assignment methods) to compare their +relative confidence. **Please note that this has nothing to do with +their accuracy!** Computational methods can sometimes be confidently +incorrect. + +The cell type probabilities should be passed as a matrix with cell +types as rows and cells as columns. If the columns of the matrix are +not valid probability distributions (i.e. don't sum to 1 as in the +below example), the function will perform a column-wise softmax to +convert them to a probability scale. This may or may not work well +depending on the distribution of the inputs, so if at all possible try +to pass probabilities instead of arbitrary scores. + +In this example, we create 500 random cells with random normal cell +type "scores" across 4 cell types. For demonstration we make the score +of the first class much higher in the first 250 cells. After the +softmax, this will equate to a very high probability of cell type +1. The remaining 250 will have assignments that are roughly even +across the four cell types (i.e. high entropy). + +```{r} +X <- rnorm(500 * 4) |> matrix(nrow = 4) +X[1, 1:250] <- X[1, 1:250] + 5 + +entropy_scores <- calculateCategorizationEntropy(X) +``` + +From the plot we can see that half of the cells (the first half we +shifted to class 1) have low entropy, and half have high entropy. + +# Conclusion + +In this analysis, we have demonstrated the capabilities of the +scDiagnostics package for assessing the appropriateness of cell +assignments in single-cell gene expression profiles. By utilizing +various diagnostic functions and visualization techniques, we have +explored different aspects of the data, including total UMI counts, +annotation scores, gene expression distributions, dimensional +reduction plots, gene set scores, pairwise correlations, pairwise +distances, and linear regression analysis. + +Through the scatter plots, histograms, and dimensional reduction +plots, we were able to gain insights into the relationships between +gene expression patterns, cell types, and the distribution of cells in +a reduced-dimensional space. The examination of gene expression +distributions, gene sets, and pathways allowed us to explore the +functional landscape and identify subpopulations with distinct +characteristics within the dataset. Additionally, the pairwise +correlation and distance analyses provided a deeper understanding of +the similarities and differences between cell types, highlighting +potential relationships and patterns. + +*** + +## R.session Info + +```{r SessionInfo, echo=FALSE, message=FALSE, warning=FALSE, comment=NA} +options(width = 80) #reset to 'default' width + +sessionInfo() #record the R and package versions used +``` + From b4f1127543c1c5de11fed23bc5830dc27a562ef9 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Wed, 17 Apr 2024 16:48:08 -0400 Subject: [PATCH 10/11] git rebase refuses the rebase this --- DESCRIPTION | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 52581a2..0cbd320 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,9 +3,12 @@ Package: scDiagnostics Title: Cell type annotation diagnostics Version: 0.99.0 Authors@R: c( - person("Smriti", "Chawla", , "smriti_chawla@g.harvard.edu", role = c("aut", "cre")), + person("Anthony", "Christidis", role = c("aut", "cre"), + email = "anthony-alexander_christidis@hms.harvard.edu"), person("Andrew", "Ghazi", role = "aut"), - person("Ludwig", "Geistlinger", role = c("aut", "ctb")), + person("Smriti", "Chawla", role = "aut"), + person("Nitesh", "Turaga", role = "ctb"), + person("Ludwig", "Geistlinger", role = "aut"), person("Robert", "Gentleman", role = "aut") ) Description: The scDiagnostics package provides diagnostic plots to @@ -21,7 +24,7 @@ License: Artistic-2.0 URL: https://github.com/ccb-hms/scDiagnostics BugReports: https://github.com/ccb-hms/scDiagnostics/issues Depends: - R (>= 4.2.0) + R (>= 4.3.0) Imports: SingleCellExperiment, isotree, @@ -50,7 +53,15 @@ Suggests: testthat (>= 3.0.0) VignetteBuilder: knitr -biocView: SingleCell, Software +biocViews: + Annotation, + Classification, + Clustering, + GeneExpression, + RNASeq, + SingleCell, + Software, + Transcriptomics Encoding: UTF-8 LazyData: true RoxygenNote: 7.3.1 From ba2f71716448d76671293e91add7273445ed8380 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi Date: Wed, 17 Apr 2024 16:57:48 -0400 Subject: [PATCH 11/11] update vignette --- vignettes/scDiagnostics.Rmd | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vignettes/scDiagnostics.Rmd b/vignettes/scDiagnostics.Rmd index 525f85e..e10ae23 100644 --- a/vignettes/scDiagnostics.Rmd +++ b/vignettes/scDiagnostics.Rmd @@ -709,18 +709,23 @@ PCs). ```{r} query_data <- calculateOutlierScore(query_data, - use_pcs = TRUE, - plot = TRUE, - dimred = "PCA") + use_pcs = TRUE) + +scater::plotReducedDim(query_data, + dimred = "PCA", + color_by = "outlier_score", + shape_by = "is_outlier") ``` +By default this evaluates an isolation forest across all cells in the object. One can optionally pass a `group.var` (a column in the colData) to evaluate the outlier detection by groups (e.g. cell type). This can help assess whether a cell is atypical amongst those it was clustered with. + Columns `outlier_score` and `is_outlier` are added to the colData of the output. The isotree model object itself is added to the metadata. One can pass different `dimred` names to change the plot, but running an isolation in forest in e.g. tSNE or UMAP space does not make sense due to the non-linearity of these approaches. Hence this is not -allowed - when `use_pcs = FALSE`, the logcounts are used instead. +allowed - when `use_pcs = FALSE`, the raw logcounts are used instead. ## Annotation entropy