From 79c959a2d67278bfd92fa80bb9263174b97889e9 Mon Sep 17 00:00:00 2001
From: Andrew Ghazi <andghazi@gmail.com>
Date: Tue, 16 Apr 2024 08:21:30 -0400
Subject: [PATCH 01/11] fix outlier score example, start of by group function

---
 R/outlierDetection.R                  | 45 ++++++++++++++++++++
 man/calculateOutlierScoreByCluster.Rd | 59 +++++++++++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 man/calculateOutlierScoreByCluster.Rd

diff --git a/R/outlierDetection.R b/R/outlierDetection.R
index 0b5ff83..7cb1cc3 100644
--- a/R/outlierDetection.R
+++ b/R/outlierDetection.R
@@ -119,3 +119,48 @@ calculateOutlierScore <-
 
     sce
 }
+
+#' Calculate cell outlier scores using isolation forests by group
+#'
+#' @description This function calculates outlier scores for each cell BY CLUSTER
+#'   using \link[isotree]{isolation.forest}. See [calculateOutlierScore()] for
+#'   more detail on isoforests. This function calls that one on each cluster
+#'   separately. So instead of outlierness from the full dataset overall, this
+#'   function captures outlierness from the cell's cluster.
+#' @inheritParams calculateOutlierScore
+#' @param clusterVar the name of the cluster variable in the colData of the sce
+#'   input.
+#' @examples
+#' # TODO update this example.
+#' library(scater)
+#' library(scran)
+#' library(scRNAseq)
+#'
+#' # Load data
+#' sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE)
+#'
+#' # Divide the data into reference and query datasets
+#' set.seed(100)
+#' query_data <- logNormCounts(query_data)
+#'
+#' query_data <- runPCA(query_data)
+#'
+#' query_data <- calculateOutlierScore(query_data, plot = TRUE, dimred = "PCA", use_pcs = TRUE)
+#' @export
+calculateOutlierScoreByCluster <- function(
+    sce, 
+    clusterVar,
+    dimred = NULL,
+    use_pcs = FALSE,
+    plot = TRUE,
+    prediction_thresh = 0.5,
+    ...) {
+  
+  # TODO implement the functionality.
+  
+  lapply(cluster_levels, calculateOutlierScore)
+  
+  # Combine results
+  
+  return(sce)
+}
diff --git a/man/calculateOutlierScoreByCluster.Rd b/man/calculateOutlierScoreByCluster.Rd
new file mode 100644
index 0000000..32995f2
--- /dev/null
+++ b/man/calculateOutlierScoreByCluster.Rd
@@ -0,0 +1,59 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/outlierDetection.R
+\name{calculateOutlierScoreByCluster}
+\alias{calculateOutlierScoreByCluster}
+\title{Calculate cell outlier scores using isolation forests by group}
+\usage{
+calculateOutlierScoreByCluster(
+  sce,
+  clusterVar,
+  dimred = NULL,
+  use_pcs = FALSE,
+  plot = TRUE,
+  prediction_thresh = 0.5,
+  ...
+)
+}
+\arguments{
+\item{sce}{a SingleCellExperiment object containing a logcounts matrix}
+
+\item{clusterVar}{the name of the cluster variable in the colData of the sce
+input.}
+
+\item{dimred}{if specified, plot the specified reduced dimension with
+\link[scater]{plotReducedDim}, colored by outlier score}
+
+\item{use_pcs}{if TRUE, run the outlier detection in principal component
+space. Otherwise, use logcounts.}
+
+\item{plot}{if TRUE and a dimred is specified, plot the isoscore against the
+specified dimred}
+
+\item{prediction_thresh}{threshold to use for binary outlier calling}
+
+\item{...}{arguments to pass to \link[isotree]{isolation.forest}}
+}
+\description{
+This function calculates outlier scores for each cell BY CLUSTER
+  using \link[isotree]{isolation.forest}. See [calculateOutlierScore()] for
+  more detail on isoforests. This function calls that one on each cluster
+  separately. So instead of outlierness from the full dataset overall, this
+  function captures outlierness from the cell's cluster.
+}
+\examples{
+# TODO update this example.
+library(scater)
+library(scran)
+library(scRNAseq)
+
+# Load data
+sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE)
+
+# Divide the data into reference and query datasets
+set.seed(100)
+query_data <- logNormCounts(query_data)
+
+query_data <- runPCA(query_data)
+
+query_data <- calculateOutlierScore(query_data, plot = TRUE, dimred = "PCA", use_pcs = TRUE)
+}

From a71cdbfb4868e12dc66fee195e5ed1aabeae907c Mon Sep 17 00:00:00 2001
From: Andrew Ghazi <andghazi@gmail.com>
Date: Tue, 16 Apr 2024 09:15:45 -0400
Subject: [PATCH 02/11] outlier detection by group

---
 NAMESPACE                             |  1 +
 R/outlierDetection.R                  | 50 ++++++++++++++++++++++++---
 man/calculateOutlierScoreByCluster.Rd | 13 +++++--
 3 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 1ede8e8..c15f736 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,6 +3,7 @@
 export(calculateCategorizationEntropy)
 export(calculateHVGOverlap)
 export(calculateOutlierScore)
+export(calculateOutlierScoreByCluster)
 export(calculatePairwiseDistancesAndPlotDensity)
 export(computeAveragePairwiseCorrelation)
 export(histQCvsAnnotation)
diff --git a/R/outlierDetection.R b/R/outlierDetection.R
index 7cb1cc3..4050bf9 100644
--- a/R/outlierDetection.R
+++ b/R/outlierDetection.R
@@ -130,6 +130,10 @@ calculateOutlierScore <-
 #' @inheritParams calculateOutlierScore
 #' @param clusterVar the name of the cluster variable in the colData of the sce
 #'   input.
+#' @details This function treats NAs in clusterVar as a separate level and
+#'   applies the outlier scoring to them accordingly. The isolation forest model
+#'   objects are returned as a list in the metadata.
+#'
 #' @examples
 #' # TODO update this example.
 #' library(scater)
@@ -141,11 +145,15 @@ calculateOutlierScore <-
 #'
 #' # Divide the data into reference and query datasets
 #' set.seed(100)
-#' query_data <- logNormCounts(query_data)
+#' query_data <- logNormCounts(sce)
 #'
 #' query_data <- runPCA(query_data)
 #'
-#' query_data <- calculateOutlierScore(query_data, plot = TRUE, dimred = "PCA", use_pcs = TRUE)
+#' query_data <- calculateOutlierScoreByCluster(query_data,
+#'                                              clusterVar = "reclustered.broad",
+#'                                              plot = TRUE,
+#'                                              dimred = "PCA",
+#'                                              use_pcs = TRUE)
 #' @export
 calculateOutlierScoreByCluster <- function(
     sce, 
@@ -156,11 +164,43 @@ calculateOutlierScoreByCluster <- function(
     prediction_thresh = 0.5,
     ...) {
   
-  # TODO implement the functionality.
+  if (is.null(colnames(sce))) stop("This function requires the input SCE to have column names.")
+  
+  # There's probably a better way to split an sce by a factor...
+  # Anyway, manually passing it through factor() like this way retains NAs.
+  cluster_list = split(colData(sce), 
+                       factor(colData(sce)[[clusterVar]],
+                              exclude = NULL)) |> 
+    lapply(\(x) sce[,rownames(x)]) 
+  
+  # TODO: assess if it would be worth making the threshold adaptive. Different
+  # clusters can have different distributions of outlier scores.
+  score_list = lapply(cluster_list, calculateOutlierScore, 
+                      plot = FALSE,
+                      dimred = dimred,
+                      use_pcs = use_pcs, 
+                      prediction_thresh = prediction_thresh)
+  
+  score_df = score_list |>
+    lapply(\(x) colData(x)[,c("outlier_score", "is_outlier")]) |> 
+    Reduce(f = rbind)
+  
+  # If we were using an ID column instead of column names of the input we could
+  # do a join.
+  score_df = score_df[rownames(colData(sce)),]
+  colData(sce) = cbind(colData(sce), score_df)
   
-  lapply(cluster_levels, calculateOutlierScore)
+  S4Vectors::metadata(sce)$cluster_isotree_model = score_list |> lapply(S4Vectors::metadata)
   
-  # Combine results
+  if (!is.null(dimred) && plot) {
+    # TODO make this plot work when scater is only in the Suggests, or re-do
+    # it manually with ggplot.
+    p <- scater::plotReducedDim(sce, dimred,
+                                color_by = "outlier_score",
+                                shape_by = "is_outlier"
+    )
+    print(p)
+  }
   
   return(sce)
 }
diff --git a/man/calculateOutlierScoreByCluster.Rd b/man/calculateOutlierScoreByCluster.Rd
index 32995f2..54388b1 100644
--- a/man/calculateOutlierScoreByCluster.Rd
+++ b/man/calculateOutlierScoreByCluster.Rd
@@ -40,6 +40,11 @@ This function calculates outlier scores for each cell BY CLUSTER
   separately. So instead of outlierness from the full dataset overall, this
   function captures outlierness from the cell's cluster.
 }
+\details{
+This function treats NAs in clusterVar as a separate level and
+  applies the outlier scoring to them accordingly. The isolation forest model
+  objects are returned as a list in the metadata.
+}
 \examples{
 # TODO update this example.
 library(scater)
@@ -51,9 +56,13 @@ sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE)
 
 # Divide the data into reference and query datasets
 set.seed(100)
-query_data <- logNormCounts(query_data)
+query_data <- logNormCounts(sce)
 
 query_data <- runPCA(query_data)
 
-query_data <- calculateOutlierScore(query_data, plot = TRUE, dimred = "PCA", use_pcs = TRUE)
+query_data <- calculateOutlierScoreByCluster(query_data,
+                                             clusterVar = "reclustered.broad",
+                                             plot = TRUE,
+                                             dimred = "PCA",
+                                             use_pcs = TRUE)
 }

From 93bcd3a77ae9092a671bc9091c6f71bf55b9c78d Mon Sep 17 00:00:00 2001
From: Nitesh Turaga <nitesh.turaga@gmail.com>
Date: Sun, 14 Apr 2024 09:24:09 -1000
Subject: [PATCH 03/11] rebase entropy

---
 .Rbuildignore               |   7 -
 DESCRIPTION                 |  68 ---
 vignettes/scDiagnostics.Rmd | 795 ------------------------------------
 3 files changed, 870 deletions(-)
 delete mode 100644 .Rbuildignore
 delete mode 100644 DESCRIPTION
 delete mode 100644 vignettes/scDiagnostics.Rmd

diff --git a/.Rbuildignore b/.Rbuildignore
deleted file mode 100644
index 8b06f02..0000000
--- a/.Rbuildignore
+++ /dev/null
@@ -1,7 +0,0 @@
-^.*\.Rproj$
-^\.Rproj\.user$
-^doc$
-^Meta$
-^\.github$
-^docs$
-^pkgdown$
diff --git a/DESCRIPTION b/DESCRIPTION
deleted file mode 100644
index 0cbd320..0000000
--- a/DESCRIPTION
+++ /dev/null
@@ -1,68 +0,0 @@
-Type: Package
-Package: scDiagnostics
-Title: Cell type annotation diagnostics
-Version: 0.99.0
-Authors@R: c(
-    person("Anthony", "Christidis", role = c("aut", "cre"), 
-            email = "anthony-alexander_christidis@hms.harvard.edu"),
-    person("Andrew", "Ghazi", role = "aut"),
-    person("Smriti", "Chawla", role = "aut"),
-    person("Nitesh", "Turaga", role = "ctb"),
-    person("Ludwig", "Geistlinger", role = "aut"),
-    person("Robert", "Gentleman", role = "aut")
-  )
-Description: The scDiagnostics package provides diagnostic plots to
-    assess the quality of cell type assignments from single cell gene
-    expression profiles. The implemented functionality allows to
-    assess the reliability of cell type annotations, investigate gene
-    expression patterns, and explore relationships between different
-    cell types in query and reference datasets allowing users to
-    detect potential misalignments between reference and query
-    datasets. The package also provides visualization capabilities for
-    diagnositics purposes.    
-License: Artistic-2.0
-URL: https://github.com/ccb-hms/scDiagnostics
-BugReports: https://github.com/ccb-hms/scDiagnostics/issues
-Depends:
-    R (>= 4.3.0)
-Imports:
-    SingleCellExperiment,
-    isotree,
-    methods,
-    rlang,
-    ggplot2,
-    gridExtra,
-    scater,
-    SummarizedExperiment,
-    BiocGenerics,
-    S4Vectors,
-    stats,
-    utils
-Suggests:
-    AUCell,
-    BiocStyle,
-    corrplot,
-    knitr,
-    Matrix,
-    RColorBrewer,
-    rmarkdown,
-    scran,
-    scRNAseq,
-    SingleR,
-    celldex,
-    testthat (>= 3.0.0)
-VignetteBuilder: 
-    knitr
-biocViews:
-    Annotation,
-    Classification,
-    Clustering,
-    GeneExpression,
-    RNASeq,
-    SingleCell,
-    Software,
-    Transcriptomics
-Encoding: UTF-8
-LazyData: true
-RoxygenNote: 7.3.1
-Config/testthat/edition: 3
diff --git a/vignettes/scDiagnostics.Rmd b/vignettes/scDiagnostics.Rmd
deleted file mode 100644
index 525f85e..0000000
--- a/vignettes/scDiagnostics.Rmd
+++ /dev/null
@@ -1,795 +0,0 @@
----
-title: "scDiagnostics: diagnostic functions to assess the quality of cell type annotations in single-cell RNA-seq data"
-author:
-    - name: Anthony Christidis
-      affiliation: Center for Computational Biomedicine, Harvard Medical School
-      email: anthony-alexander_christidis@hms.harvard.edu
-    - name: Andrew Ghazi
-      affiliation: Harvard Stem Cell Institute, Harvard Medical School
-    - name: Smriti Chawla
-      affiliation: Center for Computational Biomedicine, Harvard Medical School
-    - name: Nitesh Turaga
-      affiliation: Center for Computational Biomedicine, Harvard Medical School
-    - name: Ludwig Geistlinger
-      affiliation: Center for Computational Biomedicine, Harvard Medical School
-    - name: Robert Gentleman
-      affiliation: Center for Computational Biomedicine, Harvard Medical School
-package: scDiagnostics
-output: 
-  BiocStyle::html_document:
-    toc: true
-    toc_float: true
-vignette: >
-  %\VignetteIndexEntry{scDiagnostics}
-  %\VignetteEngine{knitr::rmarkdown}
-  %\VignetteEncoding{UTF-8}
----
-
-```{r setup, include = FALSE}
-knitr::opts_chunk$set(
-  collapse = TRUE,
-  comment = "#>"
-)
-```
-
-# Purpose
-
-Annotation transfer from a reference dataset for the cell type
-annotation of a new query single-cell RNA-sequencing (scRNA-seq)
-experiment is an integral component of the typical analysis
-workflow. The approach provides a fast, automated, and reproducible
-alternative to the manual annotation of cell clusters based on marker
-gene expression. However, dataset imbalance and undiagnosed
-incompatibilities between query and reference dataset can lead to
-erroneous annotation and distort downstream applications.
-
-The `scDiagnostics` package provides functionality for the systematic
-evaluation of cell type assignments in scRNA-seq data.
-`scDiagnostics` offers a suite of diagnostic functions to assess
-whether both (query and reference) datasets are aligned, ensuring that
-annotations can be transferred reliably. `scDiagnostics` also provides
-functionality to assess annotation ambiguity, cluster heterogeneity,
-and marker gene alignment. The implemented functionality helps
-researchers to determine how accurately cells from a new scRNA-seq
-experiment can be assigned to known cell types.
-
-# Installation
-
-To install the development version of the package from Github, use the following
-command:
-
-```{r dev_version_install, eval = FALSE}
-BiocManager::install("ccb-hms/scDiagnostics")
-```
-
-NOTE: you will need the [remotes](https://cran.r-project.org/web/packages/remotes/index.html)
-package to install from GitHub.
-
-To build the package vignettes upon installation use:
-
-```{r build_vignettes, eval=FALSE}
-BiocManager::install("ccb-hms/scDiagnostics",
-                     build_vignettes = TRUE,
-                     dependencies = TRUE)
-```
-
-# Usage
-
-To explore the capabilities of the scDiagnostics package, you can load
-your own data or utilize publicly available datasets obtained from the
-scRNAseq R package. In this guide, we will demonstrate how to use
-scDiagnostics with such datasets, which serve as valuable resources
-for exploring the package and assessing the appropriateness of cell
-type assignments.
-
-```{r libraries, message = FALSE}
-library(scDiagnostics)
-library(celldex)
-library(corrplot)
-library(scater)
-library(scran)
-library(scRNAseq)
-library(AUCell)
-library(RColorBrewer)
-library(SingleR)
-```
-
-## Scatter Plot: QC stats vs. Annotation Scores
-
-Here, we will consider the Human Primary Cell Atlas (Mabbott et
-al. 2013) as a reference dataset and our query dataset consists of
-Haematopoietic stem and progenitor cells from (Bunis DG et al. 2021).
-
-In scRNA-seq studies, assessing the quality of cells is important for
-accurate downstream analyses. At the same time, assigning accurate
-cell type labels based on gene expression profiles is an integral
-aspect of scRNA-seq data interpretation. Generally, these two are
-performed independently of each other. The rationale behind this
-function is to inspect whether certain QC (Quality Control) criteria
-impact the confidence level of cell type annotations.
-
-For instance, it is reasonable to hypothesize that higher library
-sizes could contribute to increased annotation confidence due to
-enhanced statistical power for identifying cell type-specific gene
-expression patterns, as evident in the scatter plot below.
-
-```{r Scatter-Plot-LibrarySize-Vs-Annotation-Scores, warning=FALSE, message=FALSE}
-
-# load reference dataset
-ref_data <- celldex::HumanPrimaryCellAtlasData()
-
-# Load query dataset (Bunis haematopoietic stem and progenitor cell
-# data) from Bunis DG et al. (2021). Single-Cell Mapping of
-# Progressive Fetal-to-Adult Transition in Human Naive T Cells Cell
-# Rep. 34(1): 108573
-
-query_data <- BunisHSPCData()
-rownames(query_data) <- rowData(query_data)$Symbol
-
-# Add QC metrics to query data
-query_data <- addPerCellQCMetrics(query_data)
-
-# Log transform query dataset
-query_data <- logNormCounts(query_data)
-
-# Run SingleR to predict cell types
-pred <- SingleR(query_data, ref_data, labels = ref_data$label.main)
-
-# Assign predicted labels to query data
-colData(query_data)$pred.labels <- pred$labels
-
-# Get annotation scores
-scores <- apply(pred$scores, 1, max)
-
-# Assign scores to query data
-colData(query_data)$cell_scores <- scores
-
-# Create a scatter plot between library size and annotation scores
-p1 <- plotQCvsAnnotation(
-    query_data = query_data,
-    qc_col = "total",
-    label_col = "pred.labels",
-    score_col = "cell_scores",
-    label = NULL
-)
-p1 + xlab("Library Size")
-```
-
-However, certain QC metrics, such as the proportion of mitochondrial
-genes, may require careful consideration as they can sometimes be
-associated with cellular states or functions rather than noise. The
-interpretation of mitochondrial content should be context-specific and
-informed by biological knowledge.
-
-In next analysis, we investigated the relationship between
-mitochondrial percentage and cell type annotation scores using liver
-tissue data from He S et al. 2020. Notably, we observed high
-annotation scores for macrophages and monocytes. These findings align
-with the established biological characteristic of high mitochondrial
-activity in macrophages and monocytes, adding biological context to
-our results.
-
-```{r QC-Annotation-Scatter-Mito, warning=FALSE, message=FALSE}
-# load query dataset
-query_data <- HeOrganAtlasData(
-    tissue = c("Liver"),
-    ensembl = FALSE,
-    location = TRUE
-)
-
-# Add QC metrics to query data
-
-mito_genes <- rownames(query_data)[grep("^MT-", rownames(query_data))]
-query_data <- unfiltered <- addPerCellQC(query_data,subsets = list(mt = mito_genes))
-qc <- quickPerCellQC(colData(query_data), sub.fields = "subsets_mt_percent")
-query_data <- query_data[,!qc$discard]
-
-# Log transform query dataset
-query_data <- logNormCounts(query_data)
-
-# Run SingleR to predict cell types
-pred <- SingleR(query_data, ref_data, labels = ref_data$label.main)
-
-# Assign predicted labels to query data
-colData(query_data)$pred.labels <- pred$labels
-
-# Get annotation scores
-scores <- apply(pred$scores, 1, max)
-
-# Assign scores to query data
-colData(query_data)$cell_scores <- scores
-
-# Create a new column for the labels so it is easy to distinguish
-#  between Macrophoges, Monocytes and other cells
-query_data$label_category <-
-    ifelse(query_data$pred.labels %in% c("Macrophage", "Monocyte"),
-           query_data$pred.labels,
-           "Other cells")
-
-
-# Define custom colors for cell type labels
-cols <- c("Other cells" = "grey", "Macrophage" = "green", "Monocyte" = "red")
-
-# Generate scatter plot for all cell types
-p1 <- plotQCvsAnnotation(
-    query_data = query_data,
-    qc_col = "subsets_mt_percent",
-    label_col = "label_category",
-    score_col = "cell_scores",
-    label = NULL) + 
-    scale_color_manual(values = cols) +
-    xlab("subsets_mt_percent")
-p1
-```
-
-## Examining Distribution of QC stats and Annotation Scores
-
-In addition to the scatter plot, we can gain further insights into the
-gene expression profiles by visualizing the distribution of user
-defined QC stats and annotation scores for all the cell types or
-specific cell types. This allows us to examine the variation and
-patterns in expression levels and scores across cells assigned to the
-cell type of interest.
-
-To accomplish this, we create two separate histograms. The first
-histogram displays the distribution of the annotation scores.
-
-The second histogram visualizes the distribution of QC stats. This
-provides insights into the overall gene expression levels for the
-specific cell type. Here in this particular example we are
-investigating percentage of mitochondrial genes.
-
-By examining the histograms, we can observe the range, shape, and
-potential outliers in the distribution of both annotation scores and
-QC stats. This allows us to assess the appropriateness of the cell
-type assignments and identify any potential discrepancies or patterns
-in the gene expression profiles for the specific cell type.
-
-```{r Mito-Genes-Vs-Annotation, warning=FALSE, message=FALSE}
-# Generate histogram
-histQCvsAnnotation(query_data = query_data, qc_col = "subsets_mt_percent", 
-                   label_col = "pred.labels", 
-                   score_col = "cell_scores", 
-                   label = NULL)
-```
-
-The right-skewed distribution for mitochondrial percentages and a
-left-skewed distribution for annotation scores in above histograms
-suggest that most cells have lower mitochondrial contamination and
-higher confidence in their assigned cell types.
-
-## Exploring Gene Expression Distribution
-
-This function helps user to explore the distribution of gene
-expression values for a specific gene of interest across all the cells
-in both reference and query datasets and within specific cell
-types. This helps to evaluate whether the distributions are similar or
-aligned between the datasets. Discrepancies in distribution patterns
-may indicate potential incompatibilities or differences between the
-datasets.
-
-The function also allows users to narrow down their analysis to
-specific cell types of interest. This enables investigation of whether
-alignment between the query and reference datasets is consistent not
-only at a global level but also within specific cell types.
-
-```{r Gene-Expression-Histogram, warning=FALSE, message=FALSE}
-
-# Load data
-sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE)
-
-# Divide the data into reference and query datasets
-set.seed(100)
-indices <- sample(ncol(assay(sce)), size = floor(0.7 * ncol(assay(sce))), replace = FALSE)
-ref_data <- sce[, indices]
-query_data <- sce[, -indices]
-
-# Log-transform datasets
-ref_data <- logNormCounts(ref_data)
-query_data <- logNormCounts(query_data)
-
-# Run PCA
-ref_data <- runPCA(ref_data)
-query_data <- runPCA(query_data)
-
-# Get cell type scores using SingleR
-pred <- SingleR(query_data, ref_data, labels = ref_data$reclustered.broad)
-pred <- as.data.frame(pred)
-   
-# Assign labels to query data
-colData(query_data)$labels <- pred$labels
-   
-# Generate density plots
-plotMarkerExpression(reference_data = ref_data, 
-                     query_data = query_data, 
-                     reference_cell_labels = "reclustered.broad", 
-                     query_cell_labels = "labels", 
-                     gene_name = "MS4A1", 
-                     label = "B_and_plasma")
-```
-
-In the provided example, we examined the distribution of expression
-values for the gene MS4A1, a marker for naive B cells, in both the
-query and reference datasets. Additionally, we also looked at the
-distribution of MS4A1 expression in the B_and_plasma cell type. We
-observed overlapping distributions in both cases, suggesting alignment
-between the reference and query datasets.
-
-## Evaluating Alignment Between Reference and Query Datasets in Terms of Highly Variable Genes
-
-We are assessing the similarity or alignment between two datasets, the
-reference dataset, and the query dataset, in terms of highly variable
-genes (HVGs). We calculate the overlap coefficient between the sets of
-highly variable genes in the reference and query datasets. The overlap
-coefficient quantifies the degree of overlap or similarity between
-these two sets of genes. A value closer to 1 indicates a higher degree
-of overlap, while a value closer to 0 suggests less overlap. The
-computed overlap coefficient is printed, providing a numerical measure
-of how well the highly variable genes in the reference and query
-datasets align. In this case, the overlap coefficient is 0.62,
-indicating a moderate level of overlap.
-
-```{r HVG overlap, warning=FALSE, message=FALSE}
-
-# Selecting highly variable genes
-ref_var <- getTopHVGs(ref_data, n=2000)
-query_var <- getTopHVGs(query_data, n=2000)
-
-# Compute the overlap coefficient
-overlap_coefficient <- calculateHVGOverlap(reference_genes = ref_var, 
-                                           query_genes = query_var)
-print(overlap_coefficient)
-```
-
-In the provided example, we examined the distribution of expression
-values for the gene MS4A1, a marker for naive B cells, in both the
-query and reference datasets. Additionally, we also looked at the
-distribution of MS4A1 expression in the B_and_plasma cell type. We
-observed overlapping distributions in both cases, suggesting alignment
-between the reference and query datasets.
-
-## Visualize Gene Expression on Dimensional Reduction Plot
-
-To gain insights into the gene expression patterns and their
-representation in a dimensional reduction space, we can utilize the
-plotGeneExpressionDimred function. This function allows us to plot the
-gene expression values of a specific gene on a dimensional reduction
-plot generated using methods like t-SNE, UMAP, or PCA. Each single
-cell is color-coded based on its expression level of the gene of
-interest.
-
-In the provided example, we are visualizing the gene expression values
-of the gene "VPREB3" on a PCA plot. The PCA plot represents the cells
-in a lower-dimensional space, where the x-axis corresponds to the
-first principal component (Dimension 1) and the y-axis corresponds to
-the second principal component (Dimension 2). Each cell is represented
-as a point on the plot, and its color reflects the expression level of
-the gene "VPREB3," ranging from low (lighter color) to high (darker
-color).
-
-```{r Gene-Expression-Scatter, warning=FALSE, message=FALSE}
-# Generate dimension reduction plot color code by gene expression
-plotGeneExpressionDimred(se_object = query_data, 
-                         method = "PCA", 
-                         n_components = c(1, 2), 
-                         feature = "VPREB3")
-```
-
-The dimensional reduction plot allows us to observe how the gene
-expression of VPREB3 is distributed across the cells and whether any
-clusters or patterns emerge in the data.
-
-## Visualize Gene Sets or Pathway Scores on Dimensional Reduction Plot
-
-In addition to examining individual gene expression patterns, it is
-often useful to assess the collective activity of gene sets or
-pathways within single cells. This can provide insights into the
-functional states or biological processes associated with specific
-cell types or conditions. To facilitate this analysis, the
-scDiagnostics package includes a function called plotGeneSetScores
-that enables the visualization of gene set or pathway scores on a
-dimensional reduction plot.
-
-The plotGeneSetScores function allows you to plot gene set or pathway
-scores on a dimensional reduction plot generated using methods such as
-PCA, t-SNE, or UMAP. Each single cell is color-coded based on its
-scores for specific gene sets or pathways. This visualization helps
-identify the heterogeneity and patterns of gene set or pathway
-activity within the dataset, potentially revealing subpopulations with
-distinct functional characteristics.
-
-```{r Pathway-Scores-on-Dimensional-Reduction-Scatter, warning=FALSE, message=FALSE}
-
-# Compute scores using AUCell
-expression_matrix <- assay(query_data, "logcounts")
-cells_rankings <- AUCell_buildRankings(expression_matrix, plotStats = FALSE)
-
-# Generate gene sets
-gene_set1 <- sample(rownames(expression_matrix), 10)
-gene_set2 <- sample(rownames(expression_matrix), 20)
-
-gene_sets <- list(geneSet1 = gene_set1,
-                  geneSet2 = gene_set2)
-
-# Calculate AUC scores for gene sets
-cells_AUC <- AUCell_calcAUC(gene_sets, cells_rankings)
-
-# Assign scores to colData
-colData(query_data)$geneSetScores <- assay(cells_AUC)["geneSet1", ]
-
-# Plot gene set scores on PCA
-plotGeneSetScores(se_object = query_data, 
-                  method = "PCA", 
-                  feature = "geneSetScores")
-```
-
-In the provided example, we demonstrate the usage of the
-plotGeneSetScores function using the AUCell package to compute gene
-set or pathway scores. Custom gene sets are generated for
-demonstration purposes, but users can provide their own gene set
-scores using any method of their choice. It is important to ensure
-that the scores are assigned to the colData of the reference or query
-object and specify the correct feature name for visualization.
-
-By visualizing gene set or pathway scores on a dimensional reduction
-plot, you can gain a comprehensive understanding of the functional
-landscape within your single-cell gene expression dataset and explore
-the relationships between gene set activities and cellular phenotypes.
-
-## Visualizing Reference and Query Cell Types using Multidimensional Scaling (MDS)
-
-This function performs Multidimensional Scaling (MDS) analysis on the
-query and reference datasets to examine their similarity. The
-dissimilarity matrix is calculated based on the correlation between
-the datasets, representing the distances between cells in terms of
-gene expression patterns. MDS is then applied to derive
-low-dimensional coordinates for each cell. Subsequently, a scatter
-plot is generated, where each data point represents a cell, and cell
-types are color-coded using custom colors provided by the user. This
-visualization enables the comparison of cell type distributions
-between the query and reference datasets in a reduced-dimensional
-space.
-
-The rationale behind this function is to visually assess the alignment
-and relationships between cell types in the query and reference
-datasets.
-
-```{r CMD-Scatter-Plot, warning=FALSE, message=FALSE}
-
-# Intersect the gene symbols to obtain common genes
-common_genes <- intersect(ref_var, query_var)
-
-# Select desired cell types
-selected_cell_types <- c("CD4", "CD8", "B_and_plasma")
-ref_data_subset <- ref_data[common_genes, ref_data$reclustered.broad %in% selected_cell_types]
-query_data_subset <- query_data[common_genes, query_data$labels %in% selected_cell_types]
-
-# Extract cell types for visualization
-ref_labels <- ref_data_subset$reclustered.broad
-query_labels <- query_data_subset$labels
-
-# Combine the cell type labels from both datasets
-mdata <- c(paste("Query", query_labels), paste("Reference", ref_labels))
-
-## Define the cell types and legend order
-cell_types <- c("Query CD8", "Reference CD8", "Query CD4", "Reference CD4", "Query B_and_plasma", "Reference B_and_plasma")
-legend_order <- cell_types
-
-## Define the colors for cell types
-color_palette <- brewer.pal(length(cell_types), "Paired")
-color_mapping <- setNames(color_palette, cell_types)
-cell_type_colors <- color_mapping[cell_types]
-
-## Generate the MDS scatter plot with cell type coloring
-visualizeCellTypeMDS(query_data = query_data_subset, 
-                     reference_data = ref_data_subset, 
-                     mdata = mdata, 
-                     cell_type_colors = cell_type_colors, 
-                     legend_order = legend_order)
-```
-
-Upon examining the MDS scatter plot, we observe that the CD4 and CD8
-cell types overlap to some extent.By observing the proximity or
-overlap of different cell types, one can gain insights into their
-potential relationships or shared characteristics.
-
-The selection of custom genes and desired cell types depends on the
-user's research interests and goals. It allows for flexibility in
-focusing on specific genes and examining particular cell types of
-interest in the visualization.
-
-## Cell Type-specific Pairwise Correlation Analysis and Visualization
-
-This analysis aims to explore the correlation patterns between
-different cell types in a single-cell gene expression dataset. The
-goal is to compare the gene expression profiles of cells from a
-reference dataset and a query dataset to understand the relationships
-and similarities between various cell types.
-
-To perform the analysis, we start by computing the pairwise
-correlations between the query and reference cells for selected cell
-types ("CD4", "CD8", "B_and_plasma"). The Spearman correlation method
-is used, user can also use Pearsons correlation coeefficient.
-
-This will return average correlation matrix which can be visulaized by
-user's method of choice. Here, the results are visualized as a
-correlation plot using the corrplot package.
-
-
-```{r Cell-Type-Correlation-Analysis-Visualization, warning=FALSE, message=FALSE}
-selected_cell_types <- c("CD4", "CD8", "B_and_plasma")
-cor_matrix_avg <- computeAveragePairwiseCorrelation(query_data = query_data_subset, 
-                                                    reference_data = ref_data_subset, 
-                                                    query_cell_type_col = "labels", 
-                                                    ref_cell_type_col = "reclustered.broad", 
-                                                    cell_types = selected_cell_types, 
-                                                    correlation_method = "spearman")
-
-# Plot the pairwise average correlations using corrplot
-corrplot(cor_matrix_avg, method = "number", tl.col = "black")
-```
-
-In this case, users have the flexibility to extract the gene
-expression profiles of specific cell types from the reference and
-query datasets and provide these profiles as input to the
-function. Additionally, they can select their own set of genes that
-they consider relevant for computing the pairwise correlations. For
-demonstartion we have used common highly variable genes from reference
-and query dataset.
-
-By providing their own gene expression profiles and choosing specific
-genes, users can focus the analysis on the cell types and genes of
-interest to their research question.
-
-## Pairwise Distance Analysis and Density Visualization
-
-This function serves to conduct a analysis of pairwise distances or
-correlations between cells of specific cell types within a single-cell
-gene expression dataset. By calculating these distances or
-correlations, users can gain insights into the relationships and
-differences in gene expression profiles between different cell
-types. The function facilitates this analysis by generating density
-plots, allowing users to visualize the distribution of distances or
-correlations for various pairwise comparisons.
-
-The analysis offers the flexibility to select a particular cell type
-for examination, and users can choose between different distance
-metrics, such as "euclidean" or "manhattan," to calculate pairwise
-distances.
-
-To illustrate, the function is applied to the cell type CD8 using the
-euclidean distance metric in the example below.
-
-```{r Pairwise-Distance-Analysis-Density-Visualization, fig.width=8, message=FALSE, warning=FALSE}
-calculatePairwiseDistancesAndPlotDensity(query_data = query_data_subset, 
-                                         reference_data = ref_data_subset, 
-                                         query_cell_type_col = "labels", 
-                                         ref_cell_type_col = "reclustered.broad", 
-                                         cell_type_query = "CD8", 
-                                         cell_type_reference = "CD8", 
-                                         distance_metric = "euclidean")
-```
-
-Alternatively, users can opt for the "correlation" distance metric,
-which measures the similarity in gene expression profiles between
-cells.
-
-To illustrate, the function is applied to the cell type CD8 using the
-correlation distance metric in the example below. By selecting either
-the "pearson" or "spearman" correlation method, users can emphasize
-either linear or rank-based associations, respectively.
-
-```{r Pairwise-Distance-Correlation-Based-Density-Visualization, warning=FALSE, message=FALSE, fig.width=8}
-calculatePairwiseDistancesAndPlotDensity(query_data = query_data_subset, 
-                                         reference_data = ref_data_subset, 
-                                         query_cell_type_col = "labels", 
-                                         ref_cell_type_col = "reclustered.broad", 
-                                         cell_type_query = "CD8", 
-                                         cell_type_reference = "CD8", 
-                                         distance_metric = "correlation",
-                                         correlation_method = "spearman")
-```
-
-By utilizing this function, users can explore the pairwise distances
-between query and reference cells of a specific cell type and gain
-insights into the distribution of distances through density
-plots. This analysis aids in understanding the similarities and
-differences in gene expression profiles for the selected cell type
-within the query and reference datasets.
-
-## PC regression analysis
-
-Performing PC regression analysis on a SingleCellExperiment object
-enables users to examine the relationship between a principal
-component (PC) from the dimension reduction slot and an independent
-variable of interest. By specifying the desired dependent variable as
-one of the principal components (e.g., "PC1", "PC2", etc.) and
-providing the corresponding independent variable from the colData slot
-(e.g. "cell_type"), users can explore the associations between linear
-structure in the single-cell gene expression dataset (reference and
-query) and an independent variable of interest (e.g. cell type or
-batch).
-
-The function prints two diagnostic plots by default:
-
-* a plot of the two PCs with the highest R^2^ with the specified independent variable
-* a dot plot showing the R^2^ of each consecutive PC ~ indep.var regression
-  * Generally you should expect this plot to die off to near 0 before ~PC10
-  * Interpretation example: If the R^2^ values are high (>=50%)
-    anywhere in PCs 1-5 and your independent variable is "batch", you
-    have batch effects!
-
-```{r Regression, warning=FALSE, message=FALSE}
-
-# Specify the dependent variables (principal components) and
-#  independent variable (e.g., "labels")
-dep.vars <- paste0("PC", 1:12)
-indep.var <- "labels"
-
-# Perform linear regression on multiple principal components
-result <- regressPC(sce = query_data,
-                    dep.vars = dep.vars, 
-                    indep.var = indep.var)
-
-# Print the summaries of the linear regression models and R-squared
-#  values
-
-# Summaries of the linear regression models
-result$regression.summaries[[1]]
-
-# R-squared values
-result$rsquared
-
-# Variance contributions for each principal component
-result$var.contributions
-
-# Total variance explained
-result$total.variance.explained
-```
-
-This analysis helps uncover whether there is a systematic variation in
-PC values across different cell types. In the example above, we can
-see that the four cell types are spread out across both PC1 and
-PC2. Digging into the genes with high loadins on these PCs can help
-explain the biological or technical factors driving cellular
-heterogeneity. It can help identify PC dimensions that capture
-variation specific to certain cell types or distinguish different
-cellular states.
-
-Let's look at the genes driving PC1 by ordering the rotation matrix by
-the absolute gene loadings for PC1:
-
-```{r}
-pc_df <-  attr(reducedDims(query_data)$PCA, "rotation")[,1:5] |> 
-  as.data.frame()
-
-pc_df[order(abs(pc_df$PC1)),] |> 
-  tail()
-```
-
-PC1 is mostly driven by NKG7 - Natural Killer Cell Granule Protein
-7. This gene is important in CD8+ T cells, so that makes sense that
-it's distinguishing the cell types shown.
-
-> Exercise: What genes are driving PC2? Do they make sense?
-
-```{r echo = FALSE, eval = FALSE}
-pc_df[order(abs(pc_df$PC2)),] |> 
-  tail()
-
-# It's IL32 mostly.
-```
-
-> Exercise: Try to use the command below to examine the spike on
-  PC5. What's going on there?
-
-`plotPCA(query_data, ncomponents = c(1,5), color_by = "labels")`
-
-```{r eval=FALSE, echo=FALSE}
-plotPCA(query_data, ncomponents = c(1,5), color_by = "labels")
-# The myeloid cells are shifted off from the other types.
-
-pc_df[order(abs(pc_df$PC5))] |> 
-  tail()
-# It's mostly driven by low GNLY expression in the myeloid cells.
-```
-
-## Outlier detection
-
-Isolation forests are a fast and effective method for detecting
-outliers in high-dimensional data without relying on density
-estimation. `calculateOutlierScore()` is a wrapper around the main
-function of the [isotree
-package](https://cran.r-project.org/web/packages/isotree/index.html). Here
-we run it on the PC embeddings but, it can also be informative to run
-it on the raw counts. Here we can see some outliers in PC1/2 space
-flagged by the default `score > 0.5` threshold due to their clear
-isolation (as well as a few that are isolated on the non-visualized
-PCs).
-
-```{r}
-query_data <- calculateOutlierScore(query_data, 
-                                    use_pcs = TRUE, 
-                                    plot = TRUE,
-                                    dimred = "PCA")
-```
-
-Columns `outlier_score` and `is_outlier` are added to the colData of
-the output. The isotree model object itself is added to the metadata.
-
-One can pass different `dimred` names to change the plot, but running
-an isolation in forest in e.g. tSNE or UMAP space does not make sense
-due to the non-linearity of these approaches. Hence this is not
-allowed - when `use_pcs = FALSE`, the logcounts are used instead.
-
-## Annotation entropy
-
-In order to assess the confidence of cell type predictions, we can use
-the function `calculateCategorizationEntropy()`. This function
-calculates the information entropy of assignment probabilities across
-a set of cell types for each cell. If a set of class probabilities are
-confident, the entropies will be low.
-
-This can be used to compare two sets of cell type assignments
-(e.g. from different type assignment methods) to compare their
-relative confidence. **Please note that this has nothing to do with
-their accuracy!** Computational methods can sometimes be confidently
-incorrect.
-
-The cell type probabilities should be passed as a matrix with cell
-types as rows and cells as columns. If the columns of the matrix are
-not valid probability distributions (i.e. don't sum to 1 as in the
-below example), the function will perform a column-wise softmax to
-convert them to a probability scale. This may or may not work well
-depending on the distribution of the inputs, so if at all possible try
-to pass probabilities instead of arbitrary scores.
-
-In this example, we create 500 random cells with random normal cell
-type "scores" across 4 cell types. For demonstration we make the score
-of the first class much higher in the first 250 cells. After the
-softmax, this will equate to a very high probability of cell type
-1. The remaining 250 will have assignments that are roughly even
-across the four cell types (i.e. high entropy).
-
-```{r}
-X <- rnorm(500 * 4) |> matrix(nrow = 4)
-X[1, 1:250] <- X[1, 1:250] + 5 
-
-entropy_scores <- calculateCategorizationEntropy(X)
-```
-
-From the plot we can see that half of the cells (the first half we
-shifted to class 1) have low entropy, and half have high entropy.
-
-# Conclusion
-
-In this analysis, we have demonstrated the capabilities of the
-scDiagnostics package for assessing the appropriateness of cell
-assignments in single-cell gene expression profiles. By utilizing
-various diagnostic functions and visualization techniques, we have
-explored different aspects of the data, including total UMI counts,
-annotation scores, gene expression distributions, dimensional
-reduction plots, gene set scores, pairwise correlations, pairwise
-distances, and linear regression analysis.
-
-Through the scatter plots, histograms, and dimensional reduction
-plots, we were able to gain insights into the relationships between
-gene expression patterns, cell types, and the distribution of cells in
-a reduced-dimensional space. The examination of gene expression
-distributions, gene sets, and pathways allowed us to explore the
-functional landscape and identify subpopulations with distinct
-characteristics within the dataset. Additionally, the pairwise
-correlation and distance analyses provided a deeper understanding of
-the similarities and differences between cell types, highlighting
-potential relationships and patterns.
-
-***
-
-## R.session Info
-
-```{r SessionInfo, echo=FALSE, message=FALSE, warning=FALSE, comment=NA}
-options(width = 80) #reset to 'default' width
-
-sessionInfo() #record the R and package versions used
-```
-

From 218b2044b08ba82df0a21660c7d2cd2ec9b1e60d Mon Sep 17 00:00:00 2001
From: Nitesh Turaga <nitesh.turaga@gmail.com>
Date: Sun, 14 Apr 2024 09:58:00 -1000
Subject: [PATCH 04/11] add git merge files

---
 DESCRIPTION                                  | 57 ++++++++++++++++++++
 R/calculatePairwiseDistancesAndPlotDensity.R |  8 +++
 2 files changed, 65 insertions(+)
 create mode 100644 DESCRIPTION

diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..52581a2
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,57 @@
+Type: Package
+Package: scDiagnostics
+Title: Cell type annotation diagnostics
+Version: 0.99.0
+Authors@R: c(
+    person("Smriti", "Chawla", , "smriti_chawla@g.harvard.edu", role = c("aut", "cre")),
+    person("Andrew", "Ghazi", role = "aut"),
+    person("Ludwig", "Geistlinger", role = c("aut", "ctb")),
+    person("Robert", "Gentleman", role = "aut")
+  )
+Description: The scDiagnostics package provides diagnostic plots to
+    assess the quality of cell type assignments from single cell gene
+    expression profiles. The implemented functionality allows to
+    assess the reliability of cell type annotations, investigate gene
+    expression patterns, and explore relationships between different
+    cell types in query and reference datasets allowing users to
+    detect potential misalignments between reference and query
+    datasets. The package also provides visualization capabilities for
+    diagnositics purposes.    
+License: Artistic-2.0
+URL: https://github.com/ccb-hms/scDiagnostics
+BugReports: https://github.com/ccb-hms/scDiagnostics/issues
+Depends:
+    R (>= 4.2.0)
+Imports:
+    SingleCellExperiment,
+    isotree,
+    methods,
+    rlang,
+    ggplot2,
+    gridExtra,
+    scater,
+    SummarizedExperiment,
+    BiocGenerics,
+    S4Vectors,
+    stats,
+    utils
+Suggests:
+    AUCell,
+    BiocStyle,
+    corrplot,
+    knitr,
+    Matrix,
+    RColorBrewer,
+    rmarkdown,
+    scran,
+    scRNAseq,
+    SingleR,
+    celldex,
+    testthat (>= 3.0.0)
+VignetteBuilder: 
+    knitr
+biocView: SingleCell, Software
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.3.1
+Config/testthat/edition: 3
diff --git a/R/calculatePairwiseDistancesAndPlotDensity.R b/R/calculatePairwiseDistancesAndPlotDensity.R
index 393b14a..ca5c6dc 100644
--- a/R/calculatePairwiseDistancesAndPlotDensity.R
+++ b/R/calculatePairwiseDistancesAndPlotDensity.R
@@ -157,6 +157,7 @@ calculatePairwiseDistancesAndPlotDensity <-
     ## comparisons
     num_query_cells <- nrow(query_mat)
     num_ref_cells <- nrow(ref_mat)
+<<<<<<< HEAD
     dist_query_query <-
         dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)]
     dist_ref_ref <-
@@ -165,6 +166,13 @@ calculatePairwiseDistancesAndPlotDensity <-
     dist_query_ref <-
         dist_matrix[seq_len(num_query_cells),
                     (num_query_cells + 1):(num_query_cells + num_ref_cells)]
+=======
+    dist_query_query <- dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)]
+    dist_ref_ref <- dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), 
+                              (num_query_cells + 1):(num_query_cells + num_ref_cells)]
+    dist_query_ref <- dist_matrix[seq_len(num_query_cells),
+                                  (num_query_cells + 1):(num_query_cells + num_ref_cells)]
+>>>>>>> 716f604 (Modify DESCRIPTION and fix couple of BiocCheck issues)
 
     ## Create data frame for plotting
     dist_df <- data.frame(

From a1f5e85213556c840836578e58426e6dc861f786 Mon Sep 17 00:00:00 2001
From: Andrew Ghazi <andghazi@gmail.com>
Date: Wed, 17 Apr 2024 15:47:23 -0400
Subject: [PATCH 05/11] remove pipes, remove plotting behavior, add group.var
 argument, use biocthis::bioc_style

---
 NAMESPACE                             |   8 -
 R/outlierDetection.R                  | 244 +++++++++-----------------
 man/calculateOutlierScore.Rd          |  69 +++-----
 man/calculateOutlierScoreByCluster.Rd |  68 -------
 4 files changed, 106 insertions(+), 283 deletions(-)
 delete mode 100644 man/calculateOutlierScoreByCluster.Rd

diff --git a/NAMESPACE b/NAMESPACE
index c15f736..5d06f0e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,7 +3,6 @@
 export(calculateCategorizationEntropy)
 export(calculateHVGOverlap)
 export(calculateOutlierScore)
-export(calculateOutlierScoreByCluster)
 export(calculatePairwiseDistancesAndPlotDensity)
 export(computeAveragePairwiseCorrelation)
 export(histQCvsAnnotation)
@@ -14,12 +13,8 @@ export(plotQCvsAnnotation)
 export(regressPC)
 export(visualizeCellTypeMDS)
 import(SingleCellExperiment)
-importFrom(BiocGenerics,t)
-importFrom(S4Vectors,metadata)
 importFrom(SingleCellExperiment,SingleCellExperiment)
-importFrom(SingleCellExperiment,logcounts)
 importFrom(SingleCellExperiment,reducedDim)
-importFrom(SingleCellExperiment,reducedDimNames)
 importFrom(SummarizedExperiment,assay)
 importFrom(SummarizedExperiment,colData)
 importFrom(ggplot2,aes)
@@ -42,14 +37,11 @@ importFrom(ggplot2,xlab)
 importFrom(ggplot2,ylab)
 importFrom(ggplot2,ylim)
 importFrom(gridExtra,grid.arrange)
-importFrom(isotree,isolation.forest)
 importFrom(methods,is)
 importFrom(rlang,.data)
 importFrom(scater,plotPCA)
-importFrom(scater,plotReducedDim)
 importFrom(scater,plotTSNE)
 importFrom(scater,plotUMAP)
-importFrom(scater,runPCA)
 importFrom(stats,cmdscale)
 importFrom(stats,cor)
 importFrom(stats,dist)
diff --git a/R/outlierDetection.R b/R/outlierDetection.R
index 4050bf9..176c4f5 100644
--- a/R/outlierDetection.R
+++ b/R/outlierDetection.R
@@ -1,47 +1,32 @@
 #' Calculate cell outlier scores using isolation forests
 #'
-#' @description This function calculates outlier scores for each cell
-#'     using \link[isotree]{isolation.forest}. Isolation forests
-#'     explicitly try to estimate the isolation of datapoints, in
-#'     contrast to estimating the full distribution of the data. To
-#'     quote the isotree documentation: "Isolation Forest is an
-#'     algorithm originally developed for outlier detection that
-#'     consists in splitting sub-samples of the data according to some
-#'     attribute/feature/column at random". Basically, it calculates
-#'     how quickly each cell gets split off from the rest of the group
-#'     when the dataset is randomly partitioned by a hyperplane. There
-#'     are many adjustable parameters (e.g. number of trees,
-#'     dimensionality of the splits, etc), but the defaults usually
-#'     work pretty well.
-#'
-#' @details If \code{use_pcs} is turned on, it will use all available
-#'     PCs in the PCA reduced dimension slot if it exists, and run
-#'     \link[scater]{runPCA} with default settings and use that if it
-#'     does not.
-#'
-#'     \code{prediction_thresh} is the threshold to use on the
-#'     isoforest score predictions
-#'
-#' @param sce a SingleCellExperiment object containing a logcounts
-#'     matrix
-#'
-#' @param dimred if specified, plot the specified reduced dimension
-#'     with \link[scater]{plotReducedDim}, colored by outlier score
-#'
-#' @param use_pcs if TRUE, run the outlier detection in principal
-#'     component space. Otherwise, use logcounts.
-#'
-#' @param prediction_thresh threshold to use for binary outlier
-#'     calling
-#'
-#' @param plot if TRUE and a dimred is specified, plot the isoscore
-#'     against the specified dimred
-#'
+#' @description This function calculates outlier scores for each cell using
+#'   \link[isotree]{isolation.forest}. Isolation forests explicitly try to
+#'   estimate the isolation of datapoints, in contrast to estimating the full
+#'   distribution of the data. To quote the isotree documentation: "Isolation
+#'   Forest is an algorithm originally developed for outlier detection that
+#'   consists in splitting sub-samples of the data according to some
+#'   attribute/feature/column at random." Basically, it calculates how quickly
+#'   each cell gets split off from the rest of the group when the dataset is
+#'   randomly partitioned by a hyperplane. There are many adjustable parameters
+#'   (e.g. number of trees, dimensionality of the splits, etc), but the defaults
+#'   usually work pretty well.
+#' @param sce a SingleCellExperiment object containing a logcounts matrix
+#' @param group.var if specified, run the outlier scoring by group (e.g. cell
+#'   cluster) instead of across all cells globally.
+#' @param use_pcs if TRUE, run the outlier detection in principal component
+#'   space. Otherwise, use logcounts.
+#' @param prediction_thresh threshold to use for binary outlier calling
 #' @param ... arguments to pass to \link[isotree]{isolation.forest}
+#' @details If \code{use_pcs} is turned on, it will use all available PCs in the
+#'   PCA reduced dimension slot if it exists, and run \link[scater]{runPCA} with
+#'   default settings and use that if it does not.
 #'
-#' @return the SingleCellExperiment object with \code{outlier_score}
-#'     and \code{is_outlier} added to the colData. Also, the resulting
-#'     isotree model is attached to the object's metadata.
+#'   \code{prediction_thresh} is the threshold to use on the isoforest score
+#'   predictions
+#' @returns the SingleCellExperiment object with \code{outlier_score} and
+#'   \code{is_outlier} added to the colData. Also, the resulting isotree model
+#'   is attached to the object's metadata.
 #'
 #' @examples
 #' library(scater)
@@ -57,31 +42,56 @@
 #'
 #' query_data <- runPCA(query_data)
 #'
-#' query_data <- calculateOutlierScore(
-#'     query_data,
-#'     plot = TRUE,
-#'     dimred = "PCA",
-#'     use_pcs = TRUE
-#' )
-#'
-#' @importFrom scater runPCA plotReducedDim
-#' @importFrom SingleCellExperiment SingleCellExperiment
-#'     reducedDimNames reducedDim logcounts
-#' @importFrom S4Vectors metadata
-#' @importFrom BiocGenerics t
-#' @importFrom isotree isolation.forest
-#' 
+#' query_data <- calculateOutlierScore(query_data, use_pcs = TRUE)
 #' @export
-calculateOutlierScore <- 
-    function(sce,
-             dimred = NULL,
-             use_pcs = FALSE,
-             plot = TRUE,
-             prediction_thresh = 0.5,
-             ...) 
-{
+calculateOutlierScore <- function(sce,
+                                  group.var = NULL,
+                                  use_pcs = FALSE,
+                                  prediction_thresh = 0.5,
+                                  ...) {
+    if (!is.null(group.var)) {
+        if (is.null(colnames(sce))) stop("This function requires the input SCE to have unique column names when calculating group-wise outlier scores.")
+        
+        # There's probably a better way to split an sce by a factor...
+        # Anyway, manually passing it through factor() like this way retains NAs.
+        cluster_list <- lapply(
+            split(
+                colData(sce),
+                factor(colData(sce)[[group.var]],
+                       exclude = NULL
+                )
+            ),
+            \(x) sce[, rownames(x)]
+        )
+        
+        # TODO: assess if it would be worth making the threshold adaptive. Different
+        # clusters can have different distributions of outlier scores.
+        score_list <- lapply(cluster_list,
+                             calculateOutlierScore,
+                             group.var = NULL,
+                             use_pcs = use_pcs,
+                             prediction_thresh = prediction_thresh
+        )
+        
+        score_df <- Reduce(
+            lapply(
+                score_list,
+                \(x) colData(x)[, c("outlier_score", "is_outlier")]
+            ),
+            f = rbind
+        )
+        
+        # If we were using an ID column instead of column names of the input we could
+        # do a join.
+        score_df <- score_df[rownames(colData(sce)), ]
+        colData(sce) <- cbind(colData(sce), score_df)
+        
+        S4Vectors::metadata(sce)$cluster_isotree_model <- lapply(score_list, S4Vectors::metadata)
+        
+        return(sce)
+    }
+    
     if (use_pcs) {
-
         if (!("PCA" %in% SingleCellExperiment::reducedDimNames(sce))) {
             sce <- scater::runPCA(sce)
         }
@@ -93,114 +103,18 @@ calculateOutlierScore <-
             SingleCellExperiment::logcounts() |>
             BiocGenerics::t()
     }
-
+    
     isotree_res <- X |>
         isotree::isolation.forest(
             output_score = TRUE,
             ...
         )
-
-    ## V These scores are the same as running
-    ## predict(isotree_res$model, X)
+    
+    # V These scores are the same as running predict(isotree_res$model, X)
     sce$outlier_score <- isotree_res$scores
     sce$is_outlier <- isotree_res$scores >= prediction_thresh
-
-    if (!is.null(dimred) && plot) {
-        ## TODO: make this plot work when scater is only in the
-        ## Suggests, or re-do it manually with ggplot.
-        p <- scater::plotReducedDim(sce, dimred,
-                                    color_by = "outlier_score",
-                                    shape_by = "is_outlier"
-                                    )
-        print(p)
-    }
-
+    
     S4Vectors::metadata(sce)$isotree_model <- isotree_res$model
-
-    sce
-}
-
-#' Calculate cell outlier scores using isolation forests by group
-#'
-#' @description This function calculates outlier scores for each cell BY CLUSTER
-#'   using \link[isotree]{isolation.forest}. See [calculateOutlierScore()] for
-#'   more detail on isoforests. This function calls that one on each cluster
-#'   separately. So instead of outlierness from the full dataset overall, this
-#'   function captures outlierness from the cell's cluster.
-#' @inheritParams calculateOutlierScore
-#' @param clusterVar the name of the cluster variable in the colData of the sce
-#'   input.
-#' @details This function treats NAs in clusterVar as a separate level and
-#'   applies the outlier scoring to them accordingly. The isolation forest model
-#'   objects are returned as a list in the metadata.
-#'
-#' @examples
-#' # TODO update this example.
-#' library(scater)
-#' library(scran)
-#' library(scRNAseq)
-#'
-#' # Load data
-#' sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE)
-#'
-#' # Divide the data into reference and query datasets
-#' set.seed(100)
-#' query_data <- logNormCounts(sce)
-#'
-#' query_data <- runPCA(query_data)
-#'
-#' query_data <- calculateOutlierScoreByCluster(query_data,
-#'                                              clusterVar = "reclustered.broad",
-#'                                              plot = TRUE,
-#'                                              dimred = "PCA",
-#'                                              use_pcs = TRUE)
-#' @export
-calculateOutlierScoreByCluster <- function(
-    sce, 
-    clusterVar,
-    dimred = NULL,
-    use_pcs = FALSE,
-    plot = TRUE,
-    prediction_thresh = 0.5,
-    ...) {
-  
-  if (is.null(colnames(sce))) stop("This function requires the input SCE to have column names.")
-  
-  # There's probably a better way to split an sce by a factor...
-  # Anyway, manually passing it through factor() like this way retains NAs.
-  cluster_list = split(colData(sce), 
-                       factor(colData(sce)[[clusterVar]],
-                              exclude = NULL)) |> 
-    lapply(\(x) sce[,rownames(x)]) 
-  
-  # TODO: assess if it would be worth making the threshold adaptive. Different
-  # clusters can have different distributions of outlier scores.
-  score_list = lapply(cluster_list, calculateOutlierScore, 
-                      plot = FALSE,
-                      dimred = dimred,
-                      use_pcs = use_pcs, 
-                      prediction_thresh = prediction_thresh)
-  
-  score_df = score_list |>
-    lapply(\(x) colData(x)[,c("outlier_score", "is_outlier")]) |> 
-    Reduce(f = rbind)
-  
-  # If we were using an ID column instead of column names of the input we could
-  # do a join.
-  score_df = score_df[rownames(colData(sce)),]
-  colData(sce) = cbind(colData(sce), score_df)
-  
-  S4Vectors::metadata(sce)$cluster_isotree_model = score_list |> lapply(S4Vectors::metadata)
-  
-  if (!is.null(dimred) && plot) {
-    # TODO make this plot work when scater is only in the Suggests, or re-do
-    # it manually with ggplot.
-    p <- scater::plotReducedDim(sce, dimred,
-                                color_by = "outlier_score",
-                                shape_by = "is_outlier"
-    )
-    print(p)
-  }
-  
-  return(sce)
+    
+    return(sce)
 }
diff --git a/man/calculateOutlierScore.Rd b/man/calculateOutlierScore.Rd
index 7ac6b8d..0df0361 100644
--- a/man/calculateOutlierScore.Rd
+++ b/man/calculateOutlierScore.Rd
@@ -6,59 +6,50 @@
 \usage{
 calculateOutlierScore(
   sce,
-  dimred = NULL,
+  group.var = NULL,
   use_pcs = FALSE,
-  plot = TRUE,
   prediction_thresh = 0.5,
   ...
 )
 }
 \arguments{
-\item{sce}{a SingleCellExperiment object containing a logcounts
-matrix}
+\item{sce}{a SingleCellExperiment object containing a logcounts matrix}
 
-\item{dimred}{if specified, plot the specified reduced dimension
-with \link[scater]{plotReducedDim}, colored by outlier score}
+\item{group.var}{if specified, run the outlier scoring by group (e.g. cell
+cluster) instead of across all cells globally.}
 
-\item{use_pcs}{if TRUE, run the outlier detection in principal
-component space. Otherwise, use logcounts.}
+\item{use_pcs}{if TRUE, run the outlier detection in principal component
+space. Otherwise, use logcounts.}
 
-\item{plot}{if TRUE and a dimred is specified, plot the isoscore
-against the specified dimred}
-
-\item{prediction_thresh}{threshold to use for binary outlier
-calling}
+\item{prediction_thresh}{threshold to use for binary outlier calling}
 
 \item{...}{arguments to pass to \link[isotree]{isolation.forest}}
 }
 \value{
-the SingleCellExperiment object with \code{outlier_score}
-    and \code{is_outlier} added to the colData. Also, the resulting
-    isotree model is attached to the object's metadata.
+the SingleCellExperiment object with \code{outlier_score} and
+  \code{is_outlier} added to the colData. Also, the resulting isotree model
+  is attached to the object's metadata.
 }
 \description{
-This function calculates outlier scores for each cell
-    using \link[isotree]{isolation.forest}. Isolation forests
-    explicitly try to estimate the isolation of datapoints, in
-    contrast to estimating the full distribution of the data. To
-    quote the isotree documentation: "Isolation Forest is an
-    algorithm originally developed for outlier detection that
-    consists in splitting sub-samples of the data according to some
-    attribute/feature/column at random". Basically, it calculates
-    how quickly each cell gets split off from the rest of the group
-    when the dataset is randomly partitioned by a hyperplane. There
-    are many adjustable parameters (e.g. number of trees,
-    dimensionality of the splits, etc), but the defaults usually
-    work pretty well.
+This function calculates outlier scores for each cell using
+  \link[isotree]{isolation.forest}. Isolation forests explicitly try to
+  estimate the isolation of datapoints, in contrast to estimating the full
+  distribution of the data. To quote the isotree documentation: "Isolation
+  Forest is an algorithm originally developed for outlier detection that
+  consists in splitting sub-samples of the data according to some
+  attribute/feature/column at random." Basically, it calculates how quickly
+  each cell gets split off from the rest of the group when the dataset is
+  randomly partitioned by a hyperplane. There are many adjustable parameters
+  (e.g. number of trees, dimensionality of the splits, etc), but the defaults
+  usually work pretty well.
 }
 \details{
-If \code{use_pcs} is turned on, it will use all available
-    PCs in the PCA reduced dimension slot if it exists, and run
-    \link[scater]{runPCA} with default settings and use that if it
-    does not.
+If \code{use_pcs} is turned on, it will use all available PCs in the
+  PCA reduced dimension slot if it exists, and run \link[scater]{runPCA} with
+  default settings and use that if it does not.
 
-    \code{prediction_thresh} is the threshold to use on the
-    isoforest score predictions
+  \code{prediction_thresh} is the threshold to use on the isoforest score
+  predictions
 }
 \examples{
 library(scater)
@@ -74,11 +65,5 @@ query_data <- logNormCounts(sce)
 
 query_data <- runPCA(query_data)
 
-query_data <- calculateOutlierScore(
-    query_data,
-    plot = TRUE,
-    dimred = "PCA",
-    use_pcs = TRUE
-)
-
+query_data <- calculateOutlierScore(query_data, use_pcs = TRUE)
 }
diff --git a/man/calculateOutlierScoreByCluster.Rd b/man/calculateOutlierScoreByCluster.Rd
deleted file mode 100644
index 54388b1..0000000
--- a/man/calculateOutlierScoreByCluster.Rd
+++ /dev/null
@@ -1,68 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/outlierDetection.R
-\name{calculateOutlierScoreByCluster}
-\alias{calculateOutlierScoreByCluster}
-\title{Calculate cell outlier scores using isolation forests by group}
-\usage{
-calculateOutlierScoreByCluster(
-  sce,
-  clusterVar,
-  dimred = NULL,
-  use_pcs = FALSE,
-  plot = TRUE,
-  prediction_thresh = 0.5,
-  ...
-)
-}
-\arguments{
-\item{sce}{a SingleCellExperiment object containing a logcounts matrix}
-
-\item{clusterVar}{the name of the cluster variable in the colData of the sce
-input.}
-
-\item{dimred}{if specified, plot the specified reduced dimension with
-\link[scater]{plotReducedDim}, colored by outlier score}
-
-\item{use_pcs}{if TRUE, run the outlier detection in principal component
-space. Otherwise, use logcounts.}
-
-\item{plot}{if TRUE and a dimred is specified, plot the isoscore against the
-specified dimred}
-
-\item{prediction_thresh}{threshold to use for binary outlier calling}
-
-\item{...}{arguments to pass to \link[isotree]{isolation.forest}}
-}
-\description{
-This function calculates outlier scores for each cell BY CLUSTER
-  using \link[isotree]{isolation.forest}. See [calculateOutlierScore()] for
-  more detail on isoforests. This function calls that one on each cluster
-  separately. So instead of outlierness from the full dataset overall, this
-  function captures outlierness from the cell's cluster.
-}
-\details{
-This function treats NAs in clusterVar as a separate level and
-  applies the outlier scoring to them accordingly. The isolation forest model
-  objects are returned as a list in the metadata.
-}
-\examples{
-# TODO update this example.
-library(scater)
-library(scran)
-library(scRNAseq)
-
-# Load data
-sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE)
-
-# Divide the data into reference and query datasets
-set.seed(100)
-query_data <- logNormCounts(sce)
-
-query_data <- runPCA(query_data)
-
-query_data <- calculateOutlierScoreByCluster(query_data,
-                                             clusterVar = "reclustered.broad",
-                                             plot = TRUE,
-                                             dimred = "PCA",
-                                             use_pcs = TRUE)
-}

From 51dad318092e71ab5ba9f7d3a67e82823c4074d2 Mon Sep 17 00:00:00 2001
From: Andrew Ghazi <andghazi@gmail.com>
Date: Wed, 17 Apr 2024 15:44:11 -0400
Subject: [PATCH 06/11] add group.var argument, remove plotting/pipes/separate
 function

---
 man/calculateOutlierScore.Rd | 48 ++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/man/calculateOutlierScore.Rd b/man/calculateOutlierScore.Rd
index 0df0361..0138565 100644
--- a/man/calculateOutlierScore.Rd
+++ b/man/calculateOutlierScore.Rd
@@ -15,20 +15,54 @@ calculateOutlierScore(
 \arguments{
 \item{sce}{a SingleCellExperiment object containing a logcounts matrix}
 
+<<<<<<< HEAD
 \item{group.var}{if specified, run the outlier scoring by group (e.g. cell
 cluster) instead of across all cells globally.}
+=======
+<<<<<<< HEAD
+\item{dimred}{if specified, plot the specified reduced dimension
+with \link[scater]{plotReducedDim}, colored by outlier score}
+=======
+\item{group.var}{if specified, run the outlier scoring by group (e.g. cell
+cluster) instead of across all cells globally.}
+>>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style)
+>>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function)
 
 \item{use_pcs}{if TRUE, run the outlier detection in principal component
 space. Otherwise, use logcounts.}
 
+<<<<<<< HEAD
 \item{prediction_thresh}{threshold to use for binary outlier calling}
+=======
+<<<<<<< HEAD
+\item{plot}{if TRUE and a dimred is specified, plot the isoscore
+against the specified dimred}
+
+\item{prediction_thresh}{threshold to use for binary outlier
+calling}
+=======
+\item{prediction_thresh}{threshold to use for binary outlier calling}
+>>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style)
+>>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function)
 
 \item{...}{arguments to pass to \link[isotree]{isolation.forest}}
 }
 \value{
+<<<<<<< HEAD
+the SingleCellExperiment object with \code{outlier_score} and
+  \code{is_outlier} added to the colData. Also, the resulting isotree model
+  is attached to the object's metadata.
+=======
+<<<<<<< HEAD
+the SingleCellExperiment object with \code{outlier_score}
+    and \code{is_outlier} added to the colData. Also, the resulting
+    isotree model is attached to the object's metadata.
+=======
 the SingleCellExperiment object with \code{outlier_score} and
   \code{is_outlier} added to the colData. Also, the resulting isotree model
   is attached to the object's metadata.
+>>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style)
+>>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function)
 }
 \description{
 This function calculates outlier scores for each cell using
@@ -65,5 +99,19 @@ query_data <- logNormCounts(sce)
 
 query_data <- runPCA(query_data)
 
+<<<<<<< HEAD
+query_data <- calculateOutlierScore(query_data, use_pcs = TRUE)
+=======
+<<<<<<< HEAD
+query_data <- calculateOutlierScore(
+    query_data,
+    plot = TRUE,
+    dimred = "PCA",
+    use_pcs = TRUE
+)
+
+=======
 query_data <- calculateOutlierScore(query_data, use_pcs = TRUE)
+>>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style)
+>>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function)
 }

From 915b1f0291865ea11c78b43c108ec873baf1fe25 Mon Sep 17 00:00:00 2001
From: Andrew Ghazi <andghazi@gmail.com>
Date: Wed, 17 Apr 2024 16:03:37 -0400
Subject: [PATCH 07/11] fix git errors

---
 man/calculateOutlierScore.Rd | 48 ------------------------------------
 1 file changed, 48 deletions(-)

diff --git a/man/calculateOutlierScore.Rd b/man/calculateOutlierScore.Rd
index 0138565..0df0361 100644
--- a/man/calculateOutlierScore.Rd
+++ b/man/calculateOutlierScore.Rd
@@ -15,54 +15,20 @@ calculateOutlierScore(
 \arguments{
 \item{sce}{a SingleCellExperiment object containing a logcounts matrix}
 
-<<<<<<< HEAD
 \item{group.var}{if specified, run the outlier scoring by group (e.g. cell
 cluster) instead of across all cells globally.}
-=======
-<<<<<<< HEAD
-\item{dimred}{if specified, plot the specified reduced dimension
-with \link[scater]{plotReducedDim}, colored by outlier score}
-=======
-\item{group.var}{if specified, run the outlier scoring by group (e.g. cell
-cluster) instead of across all cells globally.}
->>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style)
->>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function)
 
 \item{use_pcs}{if TRUE, run the outlier detection in principal component
 space. Otherwise, use logcounts.}
 
-<<<<<<< HEAD
 \item{prediction_thresh}{threshold to use for binary outlier calling}
-=======
-<<<<<<< HEAD
-\item{plot}{if TRUE and a dimred is specified, plot the isoscore
-against the specified dimred}
-
-\item{prediction_thresh}{threshold to use for binary outlier
-calling}
-=======
-\item{prediction_thresh}{threshold to use for binary outlier calling}
->>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style)
->>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function)
 
 \item{...}{arguments to pass to \link[isotree]{isolation.forest}}
 }
 \value{
-<<<<<<< HEAD
-the SingleCellExperiment object with \code{outlier_score} and
-  \code{is_outlier} added to the colData. Also, the resulting isotree model
-  is attached to the object's metadata.
-=======
-<<<<<<< HEAD
-the SingleCellExperiment object with \code{outlier_score}
-    and \code{is_outlier} added to the colData. Also, the resulting
-    isotree model is attached to the object's metadata.
-=======
 the SingleCellExperiment object with \code{outlier_score} and
   \code{is_outlier} added to the colData. Also, the resulting isotree model
   is attached to the object's metadata.
->>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style)
->>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function)
 }
 \description{
 This function calculates outlier scores for each cell using
@@ -99,19 +65,5 @@ query_data <- logNormCounts(sce)
 
 query_data <- runPCA(query_data)
 
-<<<<<<< HEAD
-query_data <- calculateOutlierScore(query_data, use_pcs = TRUE)
-=======
-<<<<<<< HEAD
-query_data <- calculateOutlierScore(
-    query_data,
-    plot = TRUE,
-    dimred = "PCA",
-    use_pcs = TRUE
-)
-
-=======
 query_data <- calculateOutlierScore(query_data, use_pcs = TRUE)
->>>>>>> b3fabc5 (remove pipes, remove plotting behavior, add group.var argument, use biocthis::bioc_style)
->>>>>>> 58dcf42 (add group.var argument, remove plotting/pipes/separate function)
 }

From 91362e30019e56f3f09195f36bb43ab17c29c63b Mon Sep 17 00:00:00 2001
From: Andrew Ghazi <andghazi@gmail.com>
Date: Wed, 17 Apr 2024 16:26:32 -0400
Subject: [PATCH 08/11] fix git bug

---
 R/calculatePairwiseDistancesAndPlotDensity.R | 146 +++++++++----------
 1 file changed, 69 insertions(+), 77 deletions(-)

diff --git a/R/calculatePairwiseDistancesAndPlotDensity.R b/R/calculatePairwiseDistancesAndPlotDensity.R
index ca5c6dc..2cafcf2 100644
--- a/R/calculatePairwiseDistancesAndPlotDensity.R
+++ b/R/calculatePairwiseDistancesAndPlotDensity.R
@@ -114,82 +114,74 @@ calculatePairwiseDistancesAndPlotDensity <-
              cell_type_reference, 
              distance_metric, 
              correlation_method = c("pearson", "spearman"))
-{
-    ## Sanity checks
-  
-    ## Check if query_data is a SingleCellExperiment object
-    if (!is(query_data, "SingleCellExperiment")) {
-        stop("query_data must be a SingleCellExperiment object.")
-    }
+    {
+        ## Sanity checks
         
-    ## Check if reference_data is a SingleCellExperiment object
-    if (!is(reference_data, "SingleCellExperiment")) {
-        stop("reference_data must be a SingleCellExperiment object.")
-    }
-  
-    ## Subset query and reference data to the specified cell type
-    query_data_subset <-
-        query_data[, !is.na(query_data[[query_cell_type_col]]) & 
-                     query_data[[query_cell_type_col]] == cell_type_query]
-    ref_data_subset <-
-        reference_data[, !is.na(reference_data[[ref_cell_type_col]]) & 
-                         reference_data[[ref_cell_type_col]] == cell_type_reference]
-
-    ## Convert to matrix
-    query_mat <- t(as.matrix(assay(query_data_subset, "logcounts")))
-    ref_mat <- t(as.matrix(assay(ref_data_subset, "logcounts")))
-
-    ## Combine query and reference matrices
-    combined_mat <- rbind(query_mat, ref_mat)
-
-    ## Calculate pairwise distances or correlations for all comparisons
-    correlation_method <- match.arg(correlation_method)
-    if (distance_metric == "correlation") {
-        dist_matrix <- cor(t(combined_mat), method = correlation_method)
-    } else {
-        dist_matrix <- dist(combined_mat, method = distance_metric)
+        ## Check if query_data is a SingleCellExperiment object
+        if (!is(query_data, "SingleCellExperiment")) {
+            stop("query_data must be a SingleCellExperiment object.")
+        }
+        
+        ## Check if reference_data is a SingleCellExperiment object
+        if (!is(reference_data, "SingleCellExperiment")) {
+            stop("reference_data must be a SingleCellExperiment object.")
+        }
+        
+        ## Subset query and reference data to the specified cell type
+        query_data_subset <-
+            query_data[, !is.na(query_data[[query_cell_type_col]]) & 
+                           query_data[[query_cell_type_col]] == cell_type_query]
+        ref_data_subset <-
+            reference_data[, !is.na(reference_data[[ref_cell_type_col]]) & 
+                               reference_data[[ref_cell_type_col]] == cell_type_reference]
+        
+        ## Convert to matrix
+        query_mat <- t(as.matrix(assay(query_data_subset, "logcounts")))
+        ref_mat <- t(as.matrix(assay(ref_data_subset, "logcounts")))
+        
+        ## Combine query and reference matrices
+        combined_mat <- rbind(query_mat, ref_mat)
+        
+        ## Calculate pairwise distances or correlations for all comparisons
+        correlation_method <- match.arg(correlation_method)
+        if (distance_metric == "correlation") {
+            dist_matrix <- cor(t(combined_mat), method = correlation_method)
+        } else {
+            dist_matrix <- dist(combined_mat, method = distance_metric)
+        }
+        
+        ## Convert dist_matrix to a square matrix
+        dist_matrix <- as.matrix(dist_matrix)
+        
+        ## Extract the distances or correlations for the different pairwise
+        ## comparisons
+        num_query_cells <- nrow(query_mat)
+        num_ref_cells <- nrow(ref_mat)
+        dist_query_query <-
+            dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)]
+        dist_ref_ref <-
+            dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), 
+                        (num_query_cells + 1):(num_query_cells + num_ref_cells)]
+        dist_query_ref <-
+            dist_matrix[seq_len(num_query_cells),
+                        (num_query_cells + 1):(num_query_cells + num_ref_cells)]
+        
+        ## Create data frame for plotting
+        dist_df <- data.frame(
+            Comparison = c(rep("Query vs Query", length(dist_query_query)),
+                           rep("Reference vs Reference", length(dist_ref_ref)),
+                           rep("Query vs Reference", length(dist_query_ref))),
+            Distance = c(as.vector(dist_query_query),
+                         as.vector(dist_ref_ref),
+                         as.vector(dist_query_ref))
+        )
+        
+        ## Plot density plots
+        ggplot(dist_df, aes(x = .data$Distance, color = .data$Comparison)) +
+            geom_density() +
+            labs(x = ifelse(distance_metric == "correlation", 
+                            paste(correlation_method, "correlation"), 
+                            "Distance"), y = "Density", 
+                 title = "Pairwise Distance Analysis and Density Visualization") +
+            theme_bw()
     }
-
-    ## Convert dist_matrix to a square matrix
-    dist_matrix <- as.matrix(dist_matrix)
-
-    ## Extract the distances or correlations for the different pairwise
-    ## comparisons
-    num_query_cells <- nrow(query_mat)
-    num_ref_cells <- nrow(ref_mat)
-<<<<<<< HEAD
-    dist_query_query <-
-        dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)]
-    dist_ref_ref <-
-        dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), 
-                    (num_query_cells + 1):(num_query_cells + num_ref_cells)]
-    dist_query_ref <-
-        dist_matrix[seq_len(num_query_cells),
-                    (num_query_cells + 1):(num_query_cells + num_ref_cells)]
-=======
-    dist_query_query <- dist_matrix[seq_len(num_query_cells), seq_len(num_query_cells)]
-    dist_ref_ref <- dist_matrix[(num_query_cells + 1):(num_query_cells + num_ref_cells), 
-                              (num_query_cells + 1):(num_query_cells + num_ref_cells)]
-    dist_query_ref <- dist_matrix[seq_len(num_query_cells),
-                                  (num_query_cells + 1):(num_query_cells + num_ref_cells)]
->>>>>>> 716f604 (Modify DESCRIPTION and fix couple of BiocCheck issues)
-
-    ## Create data frame for plotting
-    dist_df <- data.frame(
-        Comparison = c(rep("Query vs Query", length(dist_query_query)),
-                       rep("Reference vs Reference", length(dist_ref_ref)),
-                       rep("Query vs Reference", length(dist_query_ref))),
-        Distance = c(as.vector(dist_query_query),
-                     as.vector(dist_ref_ref),
-                     as.vector(dist_query_ref))
-    )
-    
-    ## Plot density plots
-    ggplot(dist_df, aes(x = .data$Distance, color = .data$Comparison)) +
-        geom_density() +
-        labs(x = ifelse(distance_metric == "correlation", 
-                        paste(correlation_method, "correlation"), 
-                        "Distance"), y = "Density", 
-             title = "Pairwise Distance Analysis and Density Visualization") +
-        theme_bw()
-}

From 14621120b1a5896e1c3187e08a26ea49f9f0ccab Mon Sep 17 00:00:00 2001
From: Andrew Ghazi <andghazi@gmail.com>
Date: Wed, 17 Apr 2024 16:42:21 -0400
Subject: [PATCH 09/11] re-add mysteriously missing vignettes directory...

---
 vignettes/scDiagnostics.Rmd | 795 ++++++++++++++++++++++++++++++++++++
 1 file changed, 795 insertions(+)
 create mode 100644 vignettes/scDiagnostics.Rmd

diff --git a/vignettes/scDiagnostics.Rmd b/vignettes/scDiagnostics.Rmd
new file mode 100644
index 0000000..525f85e
--- /dev/null
+++ b/vignettes/scDiagnostics.Rmd
@@ -0,0 +1,795 @@
+---
+title: "scDiagnostics: diagnostic functions to assess the quality of cell type annotations in single-cell RNA-seq data"
+author:
+    - name: Anthony Christidis
+      affiliation: Center for Computational Biomedicine, Harvard Medical School
+      email: anthony-alexander_christidis@hms.harvard.edu
+    - name: Andrew Ghazi
+      affiliation: Harvard Stem Cell Institute, Harvard Medical School
+    - name: Smriti Chawla
+      affiliation: Center for Computational Biomedicine, Harvard Medical School
+    - name: Nitesh Turaga
+      affiliation: Center for Computational Biomedicine, Harvard Medical School
+    - name: Ludwig Geistlinger
+      affiliation: Center for Computational Biomedicine, Harvard Medical School
+    - name: Robert Gentleman
+      affiliation: Center for Computational Biomedicine, Harvard Medical School
+package: scDiagnostics
+output: 
+  BiocStyle::html_document:
+    toc: true
+    toc_float: true
+vignette: >
+  %\VignetteIndexEntry{scDiagnostics}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+
+# Purpose
+
+Annotation transfer from a reference dataset for the cell type
+annotation of a new query single-cell RNA-sequencing (scRNA-seq)
+experiment is an integral component of the typical analysis
+workflow. The approach provides a fast, automated, and reproducible
+alternative to the manual annotation of cell clusters based on marker
+gene expression. However, dataset imbalance and undiagnosed
+incompatibilities between query and reference dataset can lead to
+erroneous annotation and distort downstream applications.
+
+The `scDiagnostics` package provides functionality for the systematic
+evaluation of cell type assignments in scRNA-seq data.
+`scDiagnostics` offers a suite of diagnostic functions to assess
+whether both (query and reference) datasets are aligned, ensuring that
+annotations can be transferred reliably. `scDiagnostics` also provides
+functionality to assess annotation ambiguity, cluster heterogeneity,
+and marker gene alignment. The implemented functionality helps
+researchers to determine how accurately cells from a new scRNA-seq
+experiment can be assigned to known cell types.
+
+# Installation
+
+To install the development version of the package from Github, use the following
+command:
+
+```{r dev_version_install, eval = FALSE}
+BiocManager::install("ccb-hms/scDiagnostics")
+```
+
+NOTE: you will need the [remotes](https://cran.r-project.org/web/packages/remotes/index.html)
+package to install from GitHub.
+
+To build the package vignettes upon installation use:
+
+```{r build_vignettes, eval=FALSE}
+BiocManager::install("ccb-hms/scDiagnostics",
+                     build_vignettes = TRUE,
+                     dependencies = TRUE)
+```
+
+# Usage
+
+To explore the capabilities of the scDiagnostics package, you can load
+your own data or utilize publicly available datasets obtained from the
+scRNAseq R package. In this guide, we will demonstrate how to use
+scDiagnostics with such datasets, which serve as valuable resources
+for exploring the package and assessing the appropriateness of cell
+type assignments.
+
+```{r libraries, message = FALSE}
+library(scDiagnostics)
+library(celldex)
+library(corrplot)
+library(scater)
+library(scran)
+library(scRNAseq)
+library(AUCell)
+library(RColorBrewer)
+library(SingleR)
+```
+
+## Scatter Plot: QC stats vs. Annotation Scores
+
+Here, we will consider the Human Primary Cell Atlas (Mabbott et
+al. 2013) as a reference dataset and our query dataset consists of
+Haematopoietic stem and progenitor cells from (Bunis DG et al. 2021).
+
+In scRNA-seq studies, assessing the quality of cells is important for
+accurate downstream analyses. At the same time, assigning accurate
+cell type labels based on gene expression profiles is an integral
+aspect of scRNA-seq data interpretation. Generally, these two are
+performed independently of each other. The rationale behind this
+function is to inspect whether certain QC (Quality Control) criteria
+impact the confidence level of cell type annotations.
+
+For instance, it is reasonable to hypothesize that higher library
+sizes could contribute to increased annotation confidence due to
+enhanced statistical power for identifying cell type-specific gene
+expression patterns, as evident in the scatter plot below.
+
+```{r Scatter-Plot-LibrarySize-Vs-Annotation-Scores, warning=FALSE, message=FALSE}
+
+# load reference dataset
+ref_data <- celldex::HumanPrimaryCellAtlasData()
+
+# Load query dataset (Bunis haematopoietic stem and progenitor cell
+# data) from Bunis DG et al. (2021). Single-Cell Mapping of
+# Progressive Fetal-to-Adult Transition in Human Naive T Cells Cell
+# Rep. 34(1): 108573
+
+query_data <- BunisHSPCData()
+rownames(query_data) <- rowData(query_data)$Symbol
+
+# Add QC metrics to query data
+query_data <- addPerCellQCMetrics(query_data)
+
+# Log transform query dataset
+query_data <- logNormCounts(query_data)
+
+# Run SingleR to predict cell types
+pred <- SingleR(query_data, ref_data, labels = ref_data$label.main)
+
+# Assign predicted labels to query data
+colData(query_data)$pred.labels <- pred$labels
+
+# Get annotation scores
+scores <- apply(pred$scores, 1, max)
+
+# Assign scores to query data
+colData(query_data)$cell_scores <- scores
+
+# Create a scatter plot between library size and annotation scores
+p1 <- plotQCvsAnnotation(
+    query_data = query_data,
+    qc_col = "total",
+    label_col = "pred.labels",
+    score_col = "cell_scores",
+    label = NULL
+)
+p1 + xlab("Library Size")
+```
+
+However, certain QC metrics, such as the proportion of mitochondrial
+genes, may require careful consideration as they can sometimes be
+associated with cellular states or functions rather than noise. The
+interpretation of mitochondrial content should be context-specific and
+informed by biological knowledge.
+
+In next analysis, we investigated the relationship between
+mitochondrial percentage and cell type annotation scores using liver
+tissue data from He S et al. 2020. Notably, we observed high
+annotation scores for macrophages and monocytes. These findings align
+with the established biological characteristic of high mitochondrial
+activity in macrophages and monocytes, adding biological context to
+our results.
+
+```{r QC-Annotation-Scatter-Mito, warning=FALSE, message=FALSE}
+# load query dataset
+query_data <- HeOrganAtlasData(
+    tissue = c("Liver"),
+    ensembl = FALSE,
+    location = TRUE
+)
+
+# Add QC metrics to query data
+
+mito_genes <- rownames(query_data)[grep("^MT-", rownames(query_data))]
+query_data <- unfiltered <- addPerCellQC(query_data,subsets = list(mt = mito_genes))
+qc <- quickPerCellQC(colData(query_data), sub.fields = "subsets_mt_percent")
+query_data <- query_data[,!qc$discard]
+
+# Log transform query dataset
+query_data <- logNormCounts(query_data)
+
+# Run SingleR to predict cell types
+pred <- SingleR(query_data, ref_data, labels = ref_data$label.main)
+
+# Assign predicted labels to query data
+colData(query_data)$pred.labels <- pred$labels
+
+# Get annotation scores
+scores <- apply(pred$scores, 1, max)
+
+# Assign scores to query data
+colData(query_data)$cell_scores <- scores
+
+# Create a new column for the labels so it is easy to distinguish
+#  between Macrophoges, Monocytes and other cells
+query_data$label_category <-
+    ifelse(query_data$pred.labels %in% c("Macrophage", "Monocyte"),
+           query_data$pred.labels,
+           "Other cells")
+
+
+# Define custom colors for cell type labels
+cols <- c("Other cells" = "grey", "Macrophage" = "green", "Monocyte" = "red")
+
+# Generate scatter plot for all cell types
+p1 <- plotQCvsAnnotation(
+    query_data = query_data,
+    qc_col = "subsets_mt_percent",
+    label_col = "label_category",
+    score_col = "cell_scores",
+    label = NULL) + 
+    scale_color_manual(values = cols) +
+    xlab("subsets_mt_percent")
+p1
+```
+
+## Examining Distribution of QC stats and Annotation Scores
+
+In addition to the scatter plot, we can gain further insights into the
+gene expression profiles by visualizing the distribution of user
+defined QC stats and annotation scores for all the cell types or
+specific cell types. This allows us to examine the variation and
+patterns in expression levels and scores across cells assigned to the
+cell type of interest.
+
+To accomplish this, we create two separate histograms. The first
+histogram displays the distribution of the annotation scores.
+
+The second histogram visualizes the distribution of QC stats. This
+provides insights into the overall gene expression levels for the
+specific cell type. Here in this particular example we are
+investigating percentage of mitochondrial genes.
+
+By examining the histograms, we can observe the range, shape, and
+potential outliers in the distribution of both annotation scores and
+QC stats. This allows us to assess the appropriateness of the cell
+type assignments and identify any potential discrepancies or patterns
+in the gene expression profiles for the specific cell type.
+
+```{r Mito-Genes-Vs-Annotation, warning=FALSE, message=FALSE}
+# Generate histogram
+histQCvsAnnotation(query_data = query_data, qc_col = "subsets_mt_percent", 
+                   label_col = "pred.labels", 
+                   score_col = "cell_scores", 
+                   label = NULL)
+```
+
+The right-skewed distribution for mitochondrial percentages and a
+left-skewed distribution for annotation scores in above histograms
+suggest that most cells have lower mitochondrial contamination and
+higher confidence in their assigned cell types.
+
+## Exploring Gene Expression Distribution
+
+This function helps user to explore the distribution of gene
+expression values for a specific gene of interest across all the cells
+in both reference and query datasets and within specific cell
+types. This helps to evaluate whether the distributions are similar or
+aligned between the datasets. Discrepancies in distribution patterns
+may indicate potential incompatibilities or differences between the
+datasets.
+
+The function also allows users to narrow down their analysis to
+specific cell types of interest. This enables investigation of whether
+alignment between the query and reference datasets is consistent not
+only at a global level but also within specific cell types.
+
+```{r Gene-Expression-Histogram, warning=FALSE, message=FALSE}
+
+# Load data
+sce <- HeOrganAtlasData(tissue = c("Marrow"), ensembl = FALSE)
+
+# Divide the data into reference and query datasets
+set.seed(100)
+indices <- sample(ncol(assay(sce)), size = floor(0.7 * ncol(assay(sce))), replace = FALSE)
+ref_data <- sce[, indices]
+query_data <- sce[, -indices]
+
+# Log-transform datasets
+ref_data <- logNormCounts(ref_data)
+query_data <- logNormCounts(query_data)
+
+# Run PCA
+ref_data <- runPCA(ref_data)
+query_data <- runPCA(query_data)
+
+# Get cell type scores using SingleR
+pred <- SingleR(query_data, ref_data, labels = ref_data$reclustered.broad)
+pred <- as.data.frame(pred)
+   
+# Assign labels to query data
+colData(query_data)$labels <- pred$labels
+   
+# Generate density plots
+plotMarkerExpression(reference_data = ref_data, 
+                     query_data = query_data, 
+                     reference_cell_labels = "reclustered.broad", 
+                     query_cell_labels = "labels", 
+                     gene_name = "MS4A1", 
+                     label = "B_and_plasma")
+```
+
+In the provided example, we examined the distribution of expression
+values for the gene MS4A1, a marker for naive B cells, in both the
+query and reference datasets. Additionally, we also looked at the
+distribution of MS4A1 expression in the B_and_plasma cell type. We
+observed overlapping distributions in both cases, suggesting alignment
+between the reference and query datasets.
+
+## Evaluating Alignment Between Reference and Query Datasets in Terms of Highly Variable Genes
+
+We are assessing the similarity or alignment between two datasets, the
+reference dataset, and the query dataset, in terms of highly variable
+genes (HVGs). We calculate the overlap coefficient between the sets of
+highly variable genes in the reference and query datasets. The overlap
+coefficient quantifies the degree of overlap or similarity between
+these two sets of genes. A value closer to 1 indicates a higher degree
+of overlap, while a value closer to 0 suggests less overlap. The
+computed overlap coefficient is printed, providing a numerical measure
+of how well the highly variable genes in the reference and query
+datasets align. In this case, the overlap coefficient is 0.62,
+indicating a moderate level of overlap.
+
+```{r HVG overlap, warning=FALSE, message=FALSE}
+
+# Selecting highly variable genes
+ref_var <- getTopHVGs(ref_data, n=2000)
+query_var <- getTopHVGs(query_data, n=2000)
+
+# Compute the overlap coefficient
+overlap_coefficient <- calculateHVGOverlap(reference_genes = ref_var, 
+                                           query_genes = query_var)
+print(overlap_coefficient)
+```
+
+In the provided example, we examined the distribution of expression
+values for the gene MS4A1, a marker for naive B cells, in both the
+query and reference datasets. Additionally, we also looked at the
+distribution of MS4A1 expression in the B_and_plasma cell type. We
+observed overlapping distributions in both cases, suggesting alignment
+between the reference and query datasets.
+
+## Visualize Gene Expression on Dimensional Reduction Plot
+
+To gain insights into the gene expression patterns and their
+representation in a dimensional reduction space, we can utilize the
+plotGeneExpressionDimred function. This function allows us to plot the
+gene expression values of a specific gene on a dimensional reduction
+plot generated using methods like t-SNE, UMAP, or PCA. Each single
+cell is color-coded based on its expression level of the gene of
+interest.
+
+In the provided example, we are visualizing the gene expression values
+of the gene "VPREB3" on a PCA plot. The PCA plot represents the cells
+in a lower-dimensional space, where the x-axis corresponds to the
+first principal component (Dimension 1) and the y-axis corresponds to
+the second principal component (Dimension 2). Each cell is represented
+as a point on the plot, and its color reflects the expression level of
+the gene "VPREB3," ranging from low (lighter color) to high (darker
+color).
+
+```{r Gene-Expression-Scatter, warning=FALSE, message=FALSE}
+# Generate dimension reduction plot color code by gene expression
+plotGeneExpressionDimred(se_object = query_data, 
+                         method = "PCA", 
+                         n_components = c(1, 2), 
+                         feature = "VPREB3")
+```
+
+The dimensional reduction plot allows us to observe how the gene
+expression of VPREB3 is distributed across the cells and whether any
+clusters or patterns emerge in the data.
+
+## Visualize Gene Sets or Pathway Scores on Dimensional Reduction Plot
+
+In addition to examining individual gene expression patterns, it is
+often useful to assess the collective activity of gene sets or
+pathways within single cells. This can provide insights into the
+functional states or biological processes associated with specific
+cell types or conditions. To facilitate this analysis, the
+scDiagnostics package includes a function called plotGeneSetScores
+that enables the visualization of gene set or pathway scores on a
+dimensional reduction plot.
+
+The plotGeneSetScores function allows you to plot gene set or pathway
+scores on a dimensional reduction plot generated using methods such as
+PCA, t-SNE, or UMAP. Each single cell is color-coded based on its
+scores for specific gene sets or pathways. This visualization helps
+identify the heterogeneity and patterns of gene set or pathway
+activity within the dataset, potentially revealing subpopulations with
+distinct functional characteristics.
+
+```{r Pathway-Scores-on-Dimensional-Reduction-Scatter, warning=FALSE, message=FALSE}
+
+# Compute scores using AUCell
+expression_matrix <- assay(query_data, "logcounts")
+cells_rankings <- AUCell_buildRankings(expression_matrix, plotStats = FALSE)
+
+# Generate gene sets
+gene_set1 <- sample(rownames(expression_matrix), 10)
+gene_set2 <- sample(rownames(expression_matrix), 20)
+
+gene_sets <- list(geneSet1 = gene_set1,
+                  geneSet2 = gene_set2)
+
+# Calculate AUC scores for gene sets
+cells_AUC <- AUCell_calcAUC(gene_sets, cells_rankings)
+
+# Assign scores to colData
+colData(query_data)$geneSetScores <- assay(cells_AUC)["geneSet1", ]
+
+# Plot gene set scores on PCA
+plotGeneSetScores(se_object = query_data, 
+                  method = "PCA", 
+                  feature = "geneSetScores")
+```
+
+In the provided example, we demonstrate the usage of the
+plotGeneSetScores function using the AUCell package to compute gene
+set or pathway scores. Custom gene sets are generated for
+demonstration purposes, but users can provide their own gene set
+scores using any method of their choice. It is important to ensure
+that the scores are assigned to the colData of the reference or query
+object and specify the correct feature name for visualization.
+
+By visualizing gene set or pathway scores on a dimensional reduction
+plot, you can gain a comprehensive understanding of the functional
+landscape within your single-cell gene expression dataset and explore
+the relationships between gene set activities and cellular phenotypes.
+
+## Visualizing Reference and Query Cell Types using Multidimensional Scaling (MDS)
+
+This function performs Multidimensional Scaling (MDS) analysis on the
+query and reference datasets to examine their similarity. The
+dissimilarity matrix is calculated based on the correlation between
+the datasets, representing the distances between cells in terms of
+gene expression patterns. MDS is then applied to derive
+low-dimensional coordinates for each cell. Subsequently, a scatter
+plot is generated, where each data point represents a cell, and cell
+types are color-coded using custom colors provided by the user. This
+visualization enables the comparison of cell type distributions
+between the query and reference datasets in a reduced-dimensional
+space.
+
+The rationale behind this function is to visually assess the alignment
+and relationships between cell types in the query and reference
+datasets.
+
+```{r CMD-Scatter-Plot, warning=FALSE, message=FALSE}
+
+# Intersect the gene symbols to obtain common genes
+common_genes <- intersect(ref_var, query_var)
+
+# Select desired cell types
+selected_cell_types <- c("CD4", "CD8", "B_and_plasma")
+ref_data_subset <- ref_data[common_genes, ref_data$reclustered.broad %in% selected_cell_types]
+query_data_subset <- query_data[common_genes, query_data$labels %in% selected_cell_types]
+
+# Extract cell types for visualization
+ref_labels <- ref_data_subset$reclustered.broad
+query_labels <- query_data_subset$labels
+
+# Combine the cell type labels from both datasets
+mdata <- c(paste("Query", query_labels), paste("Reference", ref_labels))
+
+## Define the cell types and legend order
+cell_types <- c("Query CD8", "Reference CD8", "Query CD4", "Reference CD4", "Query B_and_plasma", "Reference B_and_plasma")
+legend_order <- cell_types
+
+## Define the colors for cell types
+color_palette <- brewer.pal(length(cell_types), "Paired")
+color_mapping <- setNames(color_palette, cell_types)
+cell_type_colors <- color_mapping[cell_types]
+
+## Generate the MDS scatter plot with cell type coloring
+visualizeCellTypeMDS(query_data = query_data_subset, 
+                     reference_data = ref_data_subset, 
+                     mdata = mdata, 
+                     cell_type_colors = cell_type_colors, 
+                     legend_order = legend_order)
+```
+
+Upon examining the MDS scatter plot, we observe that the CD4 and CD8
+cell types overlap to some extent.By observing the proximity or
+overlap of different cell types, one can gain insights into their
+potential relationships or shared characteristics.
+
+The selection of custom genes and desired cell types depends on the
+user's research interests and goals. It allows for flexibility in
+focusing on specific genes and examining particular cell types of
+interest in the visualization.
+
+## Cell Type-specific Pairwise Correlation Analysis and Visualization
+
+This analysis aims to explore the correlation patterns between
+different cell types in a single-cell gene expression dataset. The
+goal is to compare the gene expression profiles of cells from a
+reference dataset and a query dataset to understand the relationships
+and similarities between various cell types.
+
+To perform the analysis, we start by computing the pairwise
+correlations between the query and reference cells for selected cell
+types ("CD4", "CD8", "B_and_plasma"). The Spearman correlation method
+is used, user can also use Pearsons correlation coeefficient.
+
+This will return average correlation matrix which can be visulaized by
+user's method of choice. Here, the results are visualized as a
+correlation plot using the corrplot package.
+
+
+```{r Cell-Type-Correlation-Analysis-Visualization, warning=FALSE, message=FALSE}
+selected_cell_types <- c("CD4", "CD8", "B_and_plasma")
+cor_matrix_avg <- computeAveragePairwiseCorrelation(query_data = query_data_subset, 
+                                                    reference_data = ref_data_subset, 
+                                                    query_cell_type_col = "labels", 
+                                                    ref_cell_type_col = "reclustered.broad", 
+                                                    cell_types = selected_cell_types, 
+                                                    correlation_method = "spearman")
+
+# Plot the pairwise average correlations using corrplot
+corrplot(cor_matrix_avg, method = "number", tl.col = "black")
+```
+
+In this case, users have the flexibility to extract the gene
+expression profiles of specific cell types from the reference and
+query datasets and provide these profiles as input to the
+function. Additionally, they can select their own set of genes that
+they consider relevant for computing the pairwise correlations. For
+demonstartion we have used common highly variable genes from reference
+and query dataset.
+
+By providing their own gene expression profiles and choosing specific
+genes, users can focus the analysis on the cell types and genes of
+interest to their research question.
+
+## Pairwise Distance Analysis and Density Visualization
+
+This function serves to conduct a analysis of pairwise distances or
+correlations between cells of specific cell types within a single-cell
+gene expression dataset. By calculating these distances or
+correlations, users can gain insights into the relationships and
+differences in gene expression profiles between different cell
+types. The function facilitates this analysis by generating density
+plots, allowing users to visualize the distribution of distances or
+correlations for various pairwise comparisons.
+
+The analysis offers the flexibility to select a particular cell type
+for examination, and users can choose between different distance
+metrics, such as "euclidean" or "manhattan," to calculate pairwise
+distances.
+
+To illustrate, the function is applied to the cell type CD8 using the
+euclidean distance metric in the example below.
+
+```{r Pairwise-Distance-Analysis-Density-Visualization, fig.width=8, message=FALSE, warning=FALSE}
+calculatePairwiseDistancesAndPlotDensity(query_data = query_data_subset, 
+                                         reference_data = ref_data_subset, 
+                                         query_cell_type_col = "labels", 
+                                         ref_cell_type_col = "reclustered.broad", 
+                                         cell_type_query = "CD8", 
+                                         cell_type_reference = "CD8", 
+                                         distance_metric = "euclidean")
+```
+
+Alternatively, users can opt for the "correlation" distance metric,
+which measures the similarity in gene expression profiles between
+cells.
+
+To illustrate, the function is applied to the cell type CD8 using the
+correlation distance metric in the example below. By selecting either
+the "pearson" or "spearman" correlation method, users can emphasize
+either linear or rank-based associations, respectively.
+
+```{r Pairwise-Distance-Correlation-Based-Density-Visualization, warning=FALSE, message=FALSE, fig.width=8}
+calculatePairwiseDistancesAndPlotDensity(query_data = query_data_subset, 
+                                         reference_data = ref_data_subset, 
+                                         query_cell_type_col = "labels", 
+                                         ref_cell_type_col = "reclustered.broad", 
+                                         cell_type_query = "CD8", 
+                                         cell_type_reference = "CD8", 
+                                         distance_metric = "correlation",
+                                         correlation_method = "spearman")
+```
+
+By utilizing this function, users can explore the pairwise distances
+between query and reference cells of a specific cell type and gain
+insights into the distribution of distances through density
+plots. This analysis aids in understanding the similarities and
+differences in gene expression profiles for the selected cell type
+within the query and reference datasets.
+
+## PC regression analysis
+
+Performing PC regression analysis on a SingleCellExperiment object
+enables users to examine the relationship between a principal
+component (PC) from the dimension reduction slot and an independent
+variable of interest. By specifying the desired dependent variable as
+one of the principal components (e.g., "PC1", "PC2", etc.) and
+providing the corresponding independent variable from the colData slot
+(e.g. "cell_type"), users can explore the associations between linear
+structure in the single-cell gene expression dataset (reference and
+query) and an independent variable of interest (e.g. cell type or
+batch).
+
+The function prints two diagnostic plots by default:
+
+* a plot of the two PCs with the highest R^2^ with the specified independent variable
+* a dot plot showing the R^2^ of each consecutive PC ~ indep.var regression
+  * Generally you should expect this plot to die off to near 0 before ~PC10
+  * Interpretation example: If the R^2^ values are high (>=50%)
+    anywhere in PCs 1-5 and your independent variable is "batch", you
+    have batch effects!
+
+```{r Regression, warning=FALSE, message=FALSE}
+
+# Specify the dependent variables (principal components) and
+#  independent variable (e.g., "labels")
+dep.vars <- paste0("PC", 1:12)
+indep.var <- "labels"
+
+# Perform linear regression on multiple principal components
+result <- regressPC(sce = query_data,
+                    dep.vars = dep.vars, 
+                    indep.var = indep.var)
+
+# Print the summaries of the linear regression models and R-squared
+#  values
+
+# Summaries of the linear regression models
+result$regression.summaries[[1]]
+
+# R-squared values
+result$rsquared
+
+# Variance contributions for each principal component
+result$var.contributions
+
+# Total variance explained
+result$total.variance.explained
+```
+
+This analysis helps uncover whether there is a systematic variation in
+PC values across different cell types. In the example above, we can
+see that the four cell types are spread out across both PC1 and
+PC2. Digging into the genes with high loadins on these PCs can help
+explain the biological or technical factors driving cellular
+heterogeneity. It can help identify PC dimensions that capture
+variation specific to certain cell types or distinguish different
+cellular states.
+
+Let's look at the genes driving PC1 by ordering the rotation matrix by
+the absolute gene loadings for PC1:
+
+```{r}
+pc_df <-  attr(reducedDims(query_data)$PCA, "rotation")[,1:5] |> 
+  as.data.frame()
+
+pc_df[order(abs(pc_df$PC1)),] |> 
+  tail()
+```
+
+PC1 is mostly driven by NKG7 - Natural Killer Cell Granule Protein
+7. This gene is important in CD8+ T cells, so that makes sense that
+it's distinguishing the cell types shown.
+
+> Exercise: What genes are driving PC2? Do they make sense?
+
+```{r echo = FALSE, eval = FALSE}
+pc_df[order(abs(pc_df$PC2)),] |> 
+  tail()
+
+# It's IL32 mostly.
+```
+
+> Exercise: Try to use the command below to examine the spike on
+  PC5. What's going on there?
+
+`plotPCA(query_data, ncomponents = c(1,5), color_by = "labels")`
+
+```{r eval=FALSE, echo=FALSE}
+plotPCA(query_data, ncomponents = c(1,5), color_by = "labels")
+# The myeloid cells are shifted off from the other types.
+
+pc_df[order(abs(pc_df$PC5))] |> 
+  tail()
+# It's mostly driven by low GNLY expression in the myeloid cells.
+```
+
+## Outlier detection
+
+Isolation forests are a fast and effective method for detecting
+outliers in high-dimensional data without relying on density
+estimation. `calculateOutlierScore()` is a wrapper around the main
+function of the [isotree
+package](https://cran.r-project.org/web/packages/isotree/index.html). Here
+we run it on the PC embeddings but, it can also be informative to run
+it on the raw counts. Here we can see some outliers in PC1/2 space
+flagged by the default `score > 0.5` threshold due to their clear
+isolation (as well as a few that are isolated on the non-visualized
+PCs).
+
+```{r}
+query_data <- calculateOutlierScore(query_data, 
+                                    use_pcs = TRUE, 
+                                    plot = TRUE,
+                                    dimred = "PCA")
+```
+
+Columns `outlier_score` and `is_outlier` are added to the colData of
+the output. The isotree model object itself is added to the metadata.
+
+One can pass different `dimred` names to change the plot, but running
+an isolation in forest in e.g. tSNE or UMAP space does not make sense
+due to the non-linearity of these approaches. Hence this is not
+allowed - when `use_pcs = FALSE`, the logcounts are used instead.
+
+## Annotation entropy
+
+In order to assess the confidence of cell type predictions, we can use
+the function `calculateCategorizationEntropy()`. This function
+calculates the information entropy of assignment probabilities across
+a set of cell types for each cell. If a set of class probabilities are
+confident, the entropies will be low.
+
+This can be used to compare two sets of cell type assignments
+(e.g. from different type assignment methods) to compare their
+relative confidence. **Please note that this has nothing to do with
+their accuracy!** Computational methods can sometimes be confidently
+incorrect.
+
+The cell type probabilities should be passed as a matrix with cell
+types as rows and cells as columns. If the columns of the matrix are
+not valid probability distributions (i.e. don't sum to 1 as in the
+below example), the function will perform a column-wise softmax to
+convert them to a probability scale. This may or may not work well
+depending on the distribution of the inputs, so if at all possible try
+to pass probabilities instead of arbitrary scores.
+
+In this example, we create 500 random cells with random normal cell
+type "scores" across 4 cell types. For demonstration we make the score
+of the first class much higher in the first 250 cells. After the
+softmax, this will equate to a very high probability of cell type
+1. The remaining 250 will have assignments that are roughly even
+across the four cell types (i.e. high entropy).
+
+```{r}
+X <- rnorm(500 * 4) |> matrix(nrow = 4)
+X[1, 1:250] <- X[1, 1:250] + 5 
+
+entropy_scores <- calculateCategorizationEntropy(X)
+```
+
+From the plot we can see that half of the cells (the first half we
+shifted to class 1) have low entropy, and half have high entropy.
+
+# Conclusion
+
+In this analysis, we have demonstrated the capabilities of the
+scDiagnostics package for assessing the appropriateness of cell
+assignments in single-cell gene expression profiles. By utilizing
+various diagnostic functions and visualization techniques, we have
+explored different aspects of the data, including total UMI counts,
+annotation scores, gene expression distributions, dimensional
+reduction plots, gene set scores, pairwise correlations, pairwise
+distances, and linear regression analysis.
+
+Through the scatter plots, histograms, and dimensional reduction
+plots, we were able to gain insights into the relationships between
+gene expression patterns, cell types, and the distribution of cells in
+a reduced-dimensional space. The examination of gene expression
+distributions, gene sets, and pathways allowed us to explore the
+functional landscape and identify subpopulations with distinct
+characteristics within the dataset. Additionally, the pairwise
+correlation and distance analyses provided a deeper understanding of
+the similarities and differences between cell types, highlighting
+potential relationships and patterns.
+
+***
+
+## R.session Info
+
+```{r SessionInfo, echo=FALSE, message=FALSE, warning=FALSE, comment=NA}
+options(width = 80) #reset to 'default' width
+
+sessionInfo() #record the R and package versions used
+```
+

From b4f1127543c1c5de11fed23bc5830dc27a562ef9 Mon Sep 17 00:00:00 2001
From: Andrew Ghazi <andghazi@gmail.com>
Date: Wed, 17 Apr 2024 16:48:08 -0400
Subject: [PATCH 10/11] git rebase refuses the rebase this

---
 DESCRIPTION | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 52581a2..0cbd320 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -3,9 +3,12 @@ Package: scDiagnostics
 Title: Cell type annotation diagnostics
 Version: 0.99.0
 Authors@R: c(
-    person("Smriti", "Chawla", , "smriti_chawla@g.harvard.edu", role = c("aut", "cre")),
+    person("Anthony", "Christidis", role = c("aut", "cre"), 
+            email = "anthony-alexander_christidis@hms.harvard.edu"),
     person("Andrew", "Ghazi", role = "aut"),
-    person("Ludwig", "Geistlinger", role = c("aut", "ctb")),
+    person("Smriti", "Chawla", role = "aut"),
+    person("Nitesh", "Turaga", role = "ctb"),
+    person("Ludwig", "Geistlinger", role = "aut"),
     person("Robert", "Gentleman", role = "aut")
   )
 Description: The scDiagnostics package provides diagnostic plots to
@@ -21,7 +24,7 @@ License: Artistic-2.0
 URL: https://github.com/ccb-hms/scDiagnostics
 BugReports: https://github.com/ccb-hms/scDiagnostics/issues
 Depends:
-    R (>= 4.2.0)
+    R (>= 4.3.0)
 Imports:
     SingleCellExperiment,
     isotree,
@@ -50,7 +53,15 @@ Suggests:
     testthat (>= 3.0.0)
 VignetteBuilder: 
     knitr
-biocView: SingleCell, Software
+biocViews:
+    Annotation,
+    Classification,
+    Clustering,
+    GeneExpression,
+    RNASeq,
+    SingleCell,
+    Software,
+    Transcriptomics
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.3.1

From ba2f71716448d76671293e91add7273445ed8380 Mon Sep 17 00:00:00 2001
From: Andrew Ghazi <andghazi@gmail.com>
Date: Wed, 17 Apr 2024 16:57:48 -0400
Subject: [PATCH 11/11] update vignette

---
 vignettes/scDiagnostics.Rmd | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/vignettes/scDiagnostics.Rmd b/vignettes/scDiagnostics.Rmd
index 525f85e..e10ae23 100644
--- a/vignettes/scDiagnostics.Rmd
+++ b/vignettes/scDiagnostics.Rmd
@@ -709,18 +709,23 @@ PCs).
 
 ```{r}
 query_data <- calculateOutlierScore(query_data, 
-                                    use_pcs = TRUE, 
-                                    plot = TRUE,
-                                    dimred = "PCA")
+                                    use_pcs = TRUE)
+
+scater::plotReducedDim(query_data, 
+                       dimred = "PCA", 
+                       color_by = "outlier_score",
+                       shape_by = "is_outlier")
 ```
 
+By default this evaluates an isolation forest across all cells in the object. One can optionally pass a `group.var` (a column in the colData) to evaluate the outlier detection by groups (e.g. cell type). This can help assess whether a cell is atypical amongst those it was clustered with.
+
 Columns `outlier_score` and `is_outlier` are added to the colData of
 the output. The isotree model object itself is added to the metadata.
 
 One can pass different `dimred` names to change the plot, but running
 an isolation in forest in e.g. tSNE or UMAP space does not make sense
 due to the non-linearity of these approaches. Hence this is not
-allowed - when `use_pcs = FALSE`, the logcounts are used instead.
+allowed - when `use_pcs = FALSE`, the raw logcounts are used instead.
 
 ## Annotation entropy