diff --git a/DESCRIPTION b/DESCRIPTION index 5fb3f22..5e36344 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rliger -Version: 2.1.0.9001 -Date: 2024-11-13 +Version: 2.1.0.9002 +Date: 2024-11-15 Type: Package Title: Linked Inference of Genomic Experimental Relationships Description: Uses an extension of nonnegative matrix factorization to identify shared and dataset-specific factors. See Welch J, Kozareva V, et al (2019) , and Liu J, Gao C, Sodicoff J, et al (2020) for more details. diff --git a/NAMESPACE b/NAMESPACE index f6197d9..1e952ae 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -51,6 +51,9 @@ S3method(scaleNotCenter,dgCMatrix) S3method(scaleNotCenter,liger) S3method(scaleNotCenter,ligerDataset) S3method(scaleNotCenter,ligerMethDataset) +S3method(selectBatchHVG,dgCMatrix) +S3method(selectBatchHVG,liger) +S3method(selectBatchHVG,ligerDataset) S3method(selectGenes,Seurat) S3method(selectGenes,liger) S3method(writeH5,default) @@ -212,6 +215,7 @@ export(runWilcoxon) export(scaleData) export(scaleNotCenter) export(scaleUnsharedData) +export(selectBatchHVG) export(selectGenes) export(selectGenesVST) export(seuratToLiger) diff --git a/NEWS.md b/NEWS.md index ac1d5d0..daa94ff 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,8 +13,9 @@ - Pseudo-bulk should be easy because we are just aggregating cells. - Wilcoxon might be a bit harder because ranks are calculated per gene but the H5 sparse data is column majored. Might need to find a fast on-disk transposition method, which would also enhance RcppPlanc performance when running ANLS on H5 data. -## rliger 2.1.0.9001 +## rliger 2.1.0.9002 +- Added `selectBatchHVG()` which implements another HVG selection strategy, credit to SCIB - Fixed Wilcoxon rank-sum test bug when using ATAC peak counts ## rliger 2.1.0 diff --git a/R/selectBatchHVG.R b/R/selectBatchHVG.R new file mode 100644 index 0000000..20b91e6 --- /dev/null +++ b/R/selectBatchHVG.R @@ -0,0 +1,226 @@ +#' `r lifecycle::badge("experimental")` Batch-aware highly variable gene selection +#' @rdname selectBatchHVG +#' @description +#' Method to select HVGs based on mean dispersions of genes that are highly +#' variable genes in all batches. Using a the top target_genes per batch by +#' average normalize dispersion. If target genes still hasn't been reached, +#' then HVGs in all but one batches are used to fill up. This is continued +#' until HVGs in a single batch are considered. +#' +#' This is an *rliger* implementation of the method originally published in +#' [SCIB](https://scib.readthedocs.io/en/latest/api/scib.preprocessing.hvg_batch.html). +#' We found the potential that it can improve integration under some +#' circumstances, and is currently testing it. +#' +#' This function currently only works for shared features across all datasets. +#' For selection from only part of the datasets and selection for +#' dataset-specific unshared features, please use \code{\link{selectGenes}()}. +#' @references +#' Luecken, M.D., Büttner, M., Chaichoompu, K. et al. (2022), Benchmarking +#' atlas-level data integration in single-cell genomics. *Nat Methods*, 19, +#' 41–50. https://doi.org/10.1038/s41592-021-01336-8. +#' @seealso [selectGenes()] +#' @param object A \code{\linkS4class{liger}} object, +#' \code{\linkS4class{ligerDataset}} object or a sparse/dense matrix. The liger +#' objects must have raw counts available. A direct matrix input is preferably +#' log-1p transformed from CPM normalized counts in cell per column orientation. +#' @param nGenes Integer number of target genes to select. Default \code{2000}. +#' @param features For ligerDataset method, the feature subset to limit the +#' selection to, due to liger downstream requires non-zero features to be +#' considered. Default \code{NULL} selects from all features. +#' @param verbose Logical. Whether to show a progress bar. Default +#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. +#' @param returnStats Logical, for dgCMatrix-method, whether to return a data +#' frame of statistics for all features, or by default \code{FALSE} just return +#' a character vector of selected features. +#' @param ... Arguments passed to S3 methods. +#' @return +#' \itemize{ +#' \item{liger-method: Returns the input liger object with the selected genes +#' updated in \code{varFeatures} slot, which can be accessed with +#' \code{varFeatures(object)}. Additionally, the statistics are updated in +#' the \code{featureMeta} slot of each ligerDataset object within the +#' \code{datasets} slot of the \code{object}.} +#' \item{ligerDataset-method: Returns the input ligerDataset object with the +#' statistics updated in the \code{featureMeta} slot.} +#' \item{dgCMatrix-method: By default returns a character vector of selected +#' variable features. If \code{returnStats = TRUE}, returns a data.frame of the +#' statistics.} +#' } +#' @export +#' @examples +#' pbmc <- selectBatchHVG(pbmc, nGenes = 10) +#' varFeatures(pbmc) +selectBatchHVG <- function( + object, + ... +) { + UseMethod("selectBatchHVG", object) +} + +#' @export +#' @method selectBatchHVG liger +#' @rdname selectBatchHVG +selectBatchHVG.liger <- function( + object, + nGenes = 2000, + verbose = getOption("ligerVerbose", TRUE), + ... +) { + object <- recordCommand(object, ...) + sharedFeature <- Reduce(intersect, lapply(datasets(object), rownames)) + hvgDFList <- list() + for (d in names(object)) { + if (isTRUE(verbose)) + cli::cli_alert_info("Selecting variable features for dataset {.val {d}}") + ld <- dataset(object, d) + ld <- selectBatchHVG(ld, nGenes = nGenes, features = sharedFeature, + verbose = verbose, ...) + object@datasets[[d]] <- ld + fm <- featureMeta(ld)[sharedFeature, , drop = FALSE] + fm$dataset <- d + fm$feature <- rownames(fm) + hvgDFList[[d]] <- as.data.frame(fm) + } + hvgDFAgg <- Reduce(rbind, hvgDFList) + hvgDFAgg <- hvgDFAgg %>% + dplyr::mutate(feature = factor(.data[['feature']], levels = sharedFeature)) %>% + dplyr::group_by(.data[['feature']]) %>% + dplyr::summarise( + means = mean(.data[['means']]), + dispersion = mean(.data[['dispersion']]), + dispersion_norm = mean(.data[['dispersion_norm']]), + n_batch = sum(.data[['highly_variable']]) + ) + res <- character() + enough <- FALSE + nSelected <- 0 + nNeeded <- nGenes + n_batch_look <- length(object) + while (!enough && n_batch_look > 0) { + hvgDFSub <- hvgDFAgg %>% + dplyr::filter(.data[['n_batch']] == n_batch_look) %>% + dplyr::arrange(-.data[['dispersion_norm']]) + if (nrow(hvgDFSub) < nNeeded) { + if (isTRUE(verbose)) { + cli::cli_alert_info( + "Selected {nrow(hvgDFSub)} gene{?s} that {?is/are} highly variable in {n_batch_look} batch{?es}" + ) + } + nSelected <- nrow(hvgDFSub) + nNeeded <- nGenes - nSelected + n_batch_look <- n_batch_look - 1 + } else { + if (isTRUE(verbose)) { + cli::cli_alert_info( + "Selected {nNeeded} gene{?s} that {?is/are} highly variable in {n_batch_look} batch{?es}" + ) + } + hvgDFSub <- hvgDFSub %>% dplyr::slice_head(n = nNeeded) + enough <- TRUE + } + res <- c(res, as.character(hvgDFSub$feature)) + } + if (isTRUE(verbose)) { + if (!enough) { + cli::cli_alert_warning("Only {length(res)} gene{?s} {?is/are} selected while {nGenes} {?is/are} requested.") + } else { + cli::cli_alert_success("Totally {length(res)} gene{?s} {?is/are} selected.") + } + } + varFeatures(object) <- res + return(object) +} + +#' @export +#' @method selectBatchHVG ligerDataset +#' @rdname selectBatchHVG +selectBatchHVG.ligerDataset <- function( + object, + nGenes = 2000, + features = NULL, + verbose = getOption("ligerVerbose", TRUE), + ... +) { + features <- .idxCheck(object, features, orient = "feature") + mat <- rawData(object) + mat <- normalize(mat, log = TRUE, scaleFactor = 1e6) + fm <- featureMeta(object) + idx <- rep(FALSE, nrow(object)) + idx[features] <- TRUE + idx <- idx & (fm$nCell > 0) + mat <- mat[idx, , drop = FALSE] + stats <- selectBatchHVG(mat, nGenes = nGenes, verbose = verbose, + returnStats = TRUE, ...) + fm$means <- 0 + fm$dispersion <- 0 + fm$dispersion_norm <- 0 + fm$highly_variable <- FALSE + fm[stats$feature, "means"] <- stats$means + fm[stats$feature, "dispersion"] <- stats$dispersion + fm[stats$feature, "dispersion_norm"] <- stats$dispersion_norm + fm[stats$feature, "highly_variable"] <- stats$highly_variable + featureMeta(object) <- fm + return(object) +} + +#' @export +#' @method selectBatchHVG dgCMatrix +#' @rdname selectBatchHVG +selectBatchHVG.dgCMatrix <- function( + object, + nGenes = 2000, + returnStats = FALSE, + verbose = getOption("ligerVerbose", TRUE), + ... +) { + # Get mean and var + means <- Matrix::rowMeans(object) + vars <- rowVars_sparse_rcpp(object, means) + stats <- .selectBatchHVG.by.metric( + feature = rownames(object), + means = means, + vars = vars, + nGenes = nGenes, + verbose = verbose + ) + if (isTRUE(returnStats)) { + return(stats) + } else { + stats %>% + dplyr::arrange(-.data[['dispersion_norm']]) %>% + dplyr::filter(.data[['highly_variable']]) %>% + dplyr::pull(.data[['feature']]) + } +} + +.selectBatchHVG.by.metric <- function( + feature, + means, + vars, + nGenes, + verbose +) { + means[means == 0] <- 1e-12 + dispersion <- vars / means + df <- data.frame( + feature = feature, + means = means, + dispersion = dispersion + ) + bins <- c(-Inf, stats::quantile(means, seq(0.1, 1, 0.05)), Inf) + df$mean_bin <- cut(means, breaks = bins) + df <- df %>% + dplyr::group_by(.data[['mean_bin']]) %>% + dplyr::mutate( + dispersion_norm = (.data[['dispersion']] - stats::median(.data[['dispersion']]))/.mad(.data[['dispersion']]) + ) + if (nGenes > nrow(df)) nGenes <- nrow(df) + df$highly_variable <- FALSE + df$highly_variable[order(df$dispersion_norm, decreasing = TRUE)[seq(nGenes)]] <- TRUE + df +} + +.mad <- function(x) { + stats::median(abs(x - stats::median(x)))/0.6744897501960817 +} diff --git a/man/selectBatchHVG.Rd b/man/selectBatchHVG.Rd new file mode 100644 index 0000000..aa0f350 --- /dev/null +++ b/man/selectBatchHVG.Rd @@ -0,0 +1,97 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/selectBatchHVG.R +\name{selectBatchHVG} +\alias{selectBatchHVG} +\alias{selectBatchHVG.liger} +\alias{selectBatchHVG.ligerDataset} +\alias{selectBatchHVG.dgCMatrix} +\title{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Batch-aware highly variable gene selection} +\usage{ +selectBatchHVG(object, ...) + +\method{selectBatchHVG}{liger}( + object, + nGenes = 2000, + verbose = getOption("ligerVerbose", TRUE), + ... +) + +\method{selectBatchHVG}{ligerDataset}( + object, + nGenes = 2000, + features = NULL, + verbose = getOption("ligerVerbose", TRUE), + ... +) + +\method{selectBatchHVG}{dgCMatrix}( + object, + nGenes = 2000, + returnStats = FALSE, + verbose = getOption("ligerVerbose", TRUE), + ... +) +} +\arguments{ +\item{object}{A \code{\linkS4class{liger}} object, +\code{\linkS4class{ligerDataset}} object or a sparse/dense matrix. The liger +objects must have raw counts available. A direct matrix input is preferably +log-1p transformed from CPM normalized counts in cell per column orientation.} + +\item{...}{Arguments passed to S3 methods.} + +\item{nGenes}{Integer number of target genes to select. Default \code{2000}.} + +\item{verbose}{Logical. Whether to show a progress bar. Default +\code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} + +\item{features}{For ligerDataset method, the feature subset to limit the +selection to, due to liger downstream requires non-zero features to be +considered. Default \code{NULL} selects from all features.} + +\item{returnStats}{Logical, for dgCMatrix-method, whether to return a data +frame of statistics for all features, or by default \code{FALSE} just return +a character vector of selected features.} +} +\value{ +\itemize{ +\item{liger-method: Returns the input liger object with the selected genes +updated in \code{varFeatures} slot, which can be accessed with +\code{varFeatures(object)}. Additionally, the statistics are updated in +the \code{featureMeta} slot of each ligerDataset object within the +\code{datasets} slot of the \code{object}.} +\item{ligerDataset-method: Returns the input ligerDataset object with the +statistics updated in the \code{featureMeta} slot.} +\item{dgCMatrix-method: By default returns a character vector of selected +variable features. If \code{returnStats = TRUE}, returns a data.frame of the +statistics.} +} +} +\description{ +Method to select HVGs based on mean dispersions of genes that are highly +variable genes in all batches. Using a the top target_genes per batch by +average normalize dispersion. If target genes still hasn't been reached, +then HVGs in all but one batches are used to fill up. This is continued +until HVGs in a single batch are considered. + +This is an \emph{rliger} implementation of the method originally published in +\href{https://scib.readthedocs.io/en/latest/api/scib.preprocessing.hvg_batch.html}{SCIB}. +We found the potential that it can improve integration under some +circumstances, and is currently testing it. + +This function currently only works for shared features across all datasets. +For selection from only part of the datasets and selection for +dataset-specific unshared features, please use \code{\link{selectGenes}()}. +} +\examples{ +pbmc <- selectBatchHVG(pbmc, nGenes = 10) +varFeatures(pbmc) +} +\references{ +Luecken, M.D., Büttner, M., Chaichoompu, K. et al. (2022), Benchmarking +atlas-level data integration in single-cell genomics. \emph{Nat Methods}, 19, +41–50. https://doi.org/10.1038/s41592-021-01336-8. +} +\seealso{ +\code{\link[=selectGenes]{selectGenes()}} +} diff --git a/tests/testthat/test_preprocessing.R b/tests/testthat/test_preprocessing.R index f861156..b64f166 100644 --- a/tests/testthat/test_preprocessing.R +++ b/tests/testthat/test_preprocessing.R @@ -242,6 +242,18 @@ test_that("selectGenes - on disk", { ) }) +test_that("selectBatchHVG", { + expect_no_error(pbmc <- selectBatchHVG(pbmc, nGenes = 100)) + expect_length(varFeatures(pbmc), 100) + expect_message(pbmc <- selectBatchHVG(pbmc, nGenes = 300), + "Only 249 genes") + ctrl.raw <- rawData(pbmc, "ctrl") + sel1 <- selectBatchHVG(ctrl.raw, nGenes = 100) + expect_length(sel1, 100) + sel1 <- selectBatchHVG(ctrl.raw, nGenes = 100, returnStats = TRUE) + expect_true(any(is.nan(sel1$dispersion_norm))) +}) + #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Scaling #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%