Fix bugs when running wilcoxon for ATAC peaks

welch-lab · Nov 13, 2024 · 8d36995 · 8d36995
1 parent 991d926
commit 8d36995
Show file tree

Hide file tree

Showing 12 changed files with 92 additions and 76 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: rliger
-Version: 2.1.0
-Date: 2024-10-29
+Version: 2.1.0.9001
+Date: 2024-11-13
 Type: Package
 Title: Linked Inference of Genomic Experimental Relationships
 Description: Uses an extension of nonnegative matrix factorization to identify shared and dataset-specific factors. See Welch J, Kozareva V, et al (2019) <doi:10.1016/j.cell.2019.05.006>, and Liu J, Gao C, Sodicoff J, et al (2020) <doi:10.1038/s41596-020-0391-8> for more details.

diff --git a/NEWS.md b/NEWS.md
@@ -13,6 +13,10 @@
   - Pseudo-bulk should be easy because we are just aggregating cells.
   - Wilcoxon might be a bit harder because ranks are calculated per gene but the H5 sparse data is column majored. Might need to find a fast on-disk transposition method, which would also enhance RcppPlanc performance when running ANLS on H5 data.
 
+## rliger 2.1.0.9001
+
+- Fixed Wilcoxon rank-sum test bug when using ATAC peak counts
+
 ## rliger 2.1.0
 
 - Added `centroidAlign()` for new cell factor loading alignment method

diff --git a/R/DEG_marker.R b/R/DEG_marker.R
@@ -445,11 +445,12 @@ runWilcoxon <- function(
             end <- min(i * chunk, length(features))
             mat <- extractMergedNormData(
                 object,
+                slot = slot,
                 cellIdx = allCellBC,
                 featureIdx = features[start:end]
             )
             mat <- log1p(1e10*mat)
-            resultList[[i]] <- wilcoxauc(mat, var)
+            resultList[[i]] <- wilcoxauc(mat, var, verbose = verbose)
             if (nchunk > 1) gc()
             cli::cli_progress_update(set = i)
         }
@@ -840,53 +841,6 @@ calcPctInOut <- function(
     return(nCellExpr)
 }
 
-# makePseudoBulkOld <- function(mat, replicateAnn, minCellPerRep, verbose = TRUE) {
-#     # mat - Extracted and contatenated matrix. intersection of genes by
-#     #       c(groupTest, groupCtrl) cells
-#     # groups - list of groups
-#     # replicateAnn - data.frame of replicate annotation, with rownames as
-#     #                barcodes and columns as variables
-#
-#     # Check whether enough replicates per condition
-#     for (gr in levels(replicateAnn$groups)) {
-#         subrep <- replicateAnn[replicateAnn$groups == gr,]
-#         splitLabel <- interaction(subrep, drop = TRUE)
-#         if (nlevels(splitLabel) < 2) {
-#             cli::cli_abort(
-#                 c("Too few replicates for condition {.val {gr}}. Cannot create pseudo-bulks.",
-#                   "i" = "Please consider creating pseudo-replicates or using {.code method = 'wilcoxon'} instead.")
-#             )
-#         }
-#     }
-#     splitLabel <- interaction(replicateAnn, drop = TRUE)
-#     repSizeTab <- table(splitLabel)
-#     if (verbose) {
-#         cli::cli_alert_info("Replicate sizes:")
-#         print(repSizeTab)
-#     }
-#     labelCounts <- table(splitLabel)
-#     ignored <- names(labelCounts)[labelCounts < minCellPerRep]
-#     if (length(ignored) > 0) {
-#         cli::cli_alert_warning(
-#             "Ignoring replicates (size in bracket) with too few cells: {.val {paste0(ignored, ' (', repSizeTab[ignored], ')')}}"
-#         )
-#         cli::cli_alert_info(
-#             "Consider decrease {.field minCellPerRep} to exclude less replicates or/and lower {.field nPsdRep} to generate larger pseudo-replicates."
-#         )
-#     }
-#     keep <- names(labelCounts)[labelCounts >= minCellPerRep]
-#     idx <- splitLabel %in% keep
-#     splitLabel <- splitLabel[idx, drop = TRUE]
-#     mat <- mat[, idx, drop = FALSE]
-#     replicateAnn <- replicateAnn[idx, , drop = FALSE]
-#
-#     pseudoBulks <- colAggregateSums_sparse(mat, as.integer(splitLabel) - 1,
-#                                            nlevels(splitLabel))
-#     dimnames(pseudoBulks) <- list(rownames(mat), levels(splitLabel))
-#     pseudoBulks <- pseudoBulks[rowSums(pseudoBulks) > 0,]
-#     return(list(pseudoBulks, replicateAnn))
-# }
-
 .callDESeq2 <- function(pseudoBulks, groups,
                          verbose = getOption("ligerVerbose", TRUE)) {
     # DESeq2 workflow
@@ -926,37 +880,54 @@ calcPctInOut <- function(
 
 extractMergedNormData <- function(
         object,
+        slot,
         cellIdx = NULL,
         featureIdx = NULL
 ) {
     cellIdx <- .idxCheck(object, cellIdx, "cell")
     datasetInvolved <- unique(object$dataset[cellIdx])
     cellID <- colnames(object)[cellIdx]
+    getter <- switch(
+        slot,
+        normData = normData,
+        normPeak = normPeak
+    )
     if (is.null(featureIdx)) {
         # Use intersection by default
         featureIdx <- Reduce(
             intersect,
             lapply(
-                normData(object, datasetInvolved),
+                getter(object, datasetInvolved),
                 rownames
             )
         )
     }
     out <- NULL
     for (i in seq_along(datasetInvolved)) {
         dn <- datasetInvolved[i]
-        ldFeatureIdx <- .idxCheck(dataset(object, dn), featureIdx, "feature")
+        ld <- dataset(object, dn)
+        mat <- getter(ld)
+        allFeature <- rownames(mat)
+        ldFeatureIdx <- match(featureIdx, allFeature)
+        ldFeatureIdx <- ldFeatureIdx[!is.na(ldFeatureIdx)]
+        # if (slot == "normData") {
+        #     ldFeatureIdx <- .idxCheck(dataset(object, dn), featureIdx, "feature")
+        # } else {
+        #     allPeaks <- rownames(getter())
+        # }
+
         ldCellIdx <- match(cellID, colnames(dataset(object, dn)))
         ldCellIdx <- ldCellIdx[!is.na(ldCellIdx)]
-        out <- cbind(out, normData(object, dn)[ldFeatureIdx, ldCellIdx, drop = FALSE])
+        out <- cbind(out, mat[ldFeatureIdx, ldCellIdx, drop = FALSE])
+        # out <- cbind(out, getter(object, dn)[ldFeatureIdx, ldCellIdx, drop = FALSE])
     }
     out[, cellID]
 }
 
 # X: matrix of data to be tested
 # y: grouping label of columns of X
 # Rcpp source code located in src/wilcoxon.cpp
-wilcoxauc <- function(x, clusterVar) {
+wilcoxauc <- function(x, clusterVar, verbose = verbose) {
     if (methods::is(x, 'dgTMatrix')) x <- methods::as(x, 'CsparseMatrix') # nocov start
     if (methods::is(x, 'TsparseMatrix')) x <- methods::as(x, 'CsparseMatrix')
     if (is.null(row.names(x))) {
@@ -972,7 +943,7 @@ wilcoxauc <- function(x, clusterVar) {
     xRanked <- Matrix::t(x)
     # This computes the ranking of non-zero values and the ties
     ties <- cpp_rank_matrix_dgc(xRanked@x, xRanked@p,
-                                nrow(xRanked), ncol(xRanked))
+                                nrow(xRanked), ncol(xRanked), showProgress = verbose)
     # ranksRes <- list(X_ranked = xT, ties = ties)
 
     # rankRes <- colRanking(x)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -115,8 +115,8 @@ DirectSNNToFile <- function(nn_ranked, prune, display_progress, filename) {
     .Call(`_rliger_DirectSNNToFile`, nn_ranked, prune, display_progress, filename)
 }
 
-cpp_rank_matrix_dgc <- function(x, p, nrow, ncol) {
-    .Call(`_rliger_cpp_rank_matrix_dgc`, x, p, nrow, ncol)
+cpp_rank_matrix_dgc <- function(x, p, nrow, ncol, showProgress = FALSE) {
+    .Call(`_rliger_cpp_rank_matrix_dgc`, x, p, nrow, ncol, showProgress)
 }
 
 rowAggregateSum_sparse <- function(X, groups, ngroups) {

diff --git a/R/factorMarker.R b/R/factorMarker.R
@@ -135,7 +135,7 @@ getFactorMarkers <- function(
                           normDataList[[2]][vargene, labels[[2]] == i])
         cellLabel <- rep(c(dataset1, dataset2),
                          c(sum(labels[[1]] == i), sum(labels[[2]] == i)))
-        wilcoxResult <- wilcoxauc(log1p(1e10*normData), cellLabel)
+        wilcoxResult <- wilcoxauc(log1p(1e10*normData), cellLabel, verbose = FALSE)
 
         log2fc <- wilcoxResult[wilcoxResult$group == dataset1, ]$logFC
         names(log2fc) <- wilcoxResult[wilcoxResult$group == dataset1, ]$feature

diff --git a/R/liger-methods.R b/R/liger-methods.R
@@ -927,7 +927,7 @@ setMethod(
     function(x,
              slot = c("rawData", "normData", "scaleData",
                       "scaleUnsharedData", "H", "V", "U", "A", "B",
-                      "W", "H.norm"),
+                      "W", "H.norm", "rawPeak", "normPeak"),
              dataset = NULL,
              returnList = FALSE) {
         slot <- match.arg(slot)

diff --git a/R/ligerDataset_subclass-methods.R b/R/ligerDataset_subclass-methods.R
@@ -64,6 +64,21 @@ setReplaceMethod(
         x
     })
 
+#' @export
+#' @rdname ligerDataset-class
+setMethod(
+    "getMatrix", signature(x = "ligerATACDataset", dataset = "missing",
+                           returnList = "missing"),
+    function(x,
+             slot = c("rawData", "normData", "scaleData",
+                      "scaleUnsharedData", "H", "V", "U", "A", "B", "rawPeak", "normPeak"),
+             dataset = NULL) {
+        # TODO: Currently directly find the data with slot, but need to
+        # think about maintainability when we need to change slot name.
+        slot <- match.arg(slot)
+        methods::slot(x, slot)
+    })
+
 #' @rdname coordinate
 #' @export
 setMethod("coordinate", signature(x = "ligerSpatialDataset", dataset = "missing"),

diff --git a/man/liger-class.Rd b/man/liger-class.Rd
diff --git a/man/ligerDataset-class.Rd b/man/ligerDataset-class.Rd
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -330,16 +330,17 @@ BEGIN_RCPP
 END_RCPP
 }
 // cpp_rank_matrix_dgc
-std::vector<std::list<float> > cpp_rank_matrix_dgc(arma::vec& x, const arma::vec& p, int nrow, int ncol);
-RcppExport SEXP _rliger_cpp_rank_matrix_dgc(SEXP xSEXP, SEXP pSEXP, SEXP nrowSEXP, SEXP ncolSEXP) {
+std::vector<std::list<float> > cpp_rank_matrix_dgc(arma::vec& x, const arma::vec& p, int nrow, int ncol, bool showProgress);
+RcppExport SEXP _rliger_cpp_rank_matrix_dgc(SEXP xSEXP, SEXP pSEXP, SEXP nrowSEXP, SEXP ncolSEXP, SEXP showProgressSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< arma::vec& >::type x(xSEXP);
     Rcpp::traits::input_parameter< const arma::vec& >::type p(pSEXP);
     Rcpp::traits::input_parameter< int >::type nrow(nrowSEXP);
     Rcpp::traits::input_parameter< int >::type ncol(ncolSEXP);
-    rcpp_result_gen = Rcpp::wrap(cpp_rank_matrix_dgc(x, p, nrow, ncol));
+    Rcpp::traits::input_parameter< bool >::type showProgress(showProgressSEXP);
+    rcpp_result_gen = Rcpp::wrap(cpp_rank_matrix_dgc(x, p, nrow, ncol, showProgress));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -422,7 +423,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_rliger_ComputeSNN", (DL_FUNC) &_rliger_ComputeSNN, 2},
     {"_rliger_WriteEdgeFile", (DL_FUNC) &_rliger_WriteEdgeFile, 3},
     {"_rliger_DirectSNNToFile", (DL_FUNC) &_rliger_DirectSNNToFile, 4},
-    {"_rliger_cpp_rank_matrix_dgc", (DL_FUNC) &_rliger_cpp_rank_matrix_dgc, 4},
+    {"_rliger_cpp_rank_matrix_dgc", (DL_FUNC) &_rliger_cpp_rank_matrix_dgc, 5},
     {"_rliger_rowAggregateSum_sparse", (DL_FUNC) &_rliger_rowAggregateSum_sparse, 3},
     {"_rliger_colAggregateSum_sparse", (DL_FUNC) &_rliger_colAggregateSum_sparse, 3},
     {"_rliger_colNNZAggr_sparse", (DL_FUNC) &_rliger_colNNZAggr_sparse, 3},

diff --git a/src/wilcoxon.cpp b/src/wilcoxon.cpp
@@ -1,5 +1,10 @@
+// [[Rcpp::depends(RcppArmadillo)]]
 #include <RcppArmadillo.h>
 
+// [[Rcpp::depends(RcppProgress)]]
+#include <progress.hpp>
+#include <progress_bar.hpp>
+
 using namespace std;
 using namespace Rcpp;
 using namespace arma;
@@ -54,18 +59,28 @@ std::list<float> cpp_in_place_rank_mean(arma::vec& v_temp, int idx_begin,
 
 // [[Rcpp::export]]
 std::vector<std::list<float> > cpp_rank_matrix_dgc(
-        arma::vec& x, const arma::vec& p, int nrow, int ncol) {
+        arma::vec& x, const arma::vec& p, int nrow, int ncol,
+        bool showProgress = false
+) {
     vector<list<float> > ties(ncol);
     int n_zero;
+    if (showProgress) {
+        Rcpp::Rcerr << "Wilcoxon rank-sum test over " << ncol << " features across " << nrow << " cells" << std::endl;
+    }
+    Progress pb(ncol, showProgress);
     for (int i = 0; i < ncol; i++) {
+        if (Progress::check_abort()) return ties;
+
         n_zero = nrow - (p[i+1] - p[i]);
         if (p[i+1] == p[i]) {
             ties[i].push_back(n_zero);
+            pb.increment();
             continue;
         }
         ties[i] = cpp_in_place_rank_mean(x, p[i], p[i + 1] - 1);
         ties[i].push_back(n_zero);
         x.rows(p[i], p[i + 1] - 1) += n_zero;
+        pb.increment();
     }
     return ties;
 }