Skip to content

Commit

Permalink
Fix bugs when running wilcoxon for ATAC peaks
Browse files Browse the repository at this point in the history
  • Loading branch information
mvfki committed Nov 13, 2024
1 parent 991d926 commit 8d36995
Show file tree
Hide file tree
Showing 12 changed files with 92 additions and 76 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: rliger
Version: 2.1.0
Date: 2024-10-29
Version: 2.1.0.9001
Date: 2024-11-13
Type: Package
Title: Linked Inference of Genomic Experimental Relationships
Description: Uses an extension of nonnegative matrix factorization to identify shared and dataset-specific factors. See Welch J, Kozareva V, et al (2019) <doi:10.1016/j.cell.2019.05.006>, and Liu J, Gao C, Sodicoff J, et al (2020) <doi:10.1038/s41596-020-0391-8> for more details.
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
- Pseudo-bulk should be easy because we are just aggregating cells.
- Wilcoxon might be a bit harder because ranks are calculated per gene but the H5 sparse data is column majored. Might need to find a fast on-disk transposition method, which would also enhance RcppPlanc performance when running ANLS on H5 data.

## rliger 2.1.0.9001

- Fixed Wilcoxon rank-sum test bug when using ATAC peak counts

## rliger 2.1.0

- Added `centroidAlign()` for new cell factor loading alignment method
Expand Down
77 changes: 24 additions & 53 deletions R/DEG_marker.R
Original file line number Diff line number Diff line change
Expand Up @@ -445,11 +445,12 @@ runWilcoxon <- function(
end <- min(i * chunk, length(features))
mat <- extractMergedNormData(
object,
slot = slot,
cellIdx = allCellBC,
featureIdx = features[start:end]
)
mat <- log1p(1e10*mat)
resultList[[i]] <- wilcoxauc(mat, var)
resultList[[i]] <- wilcoxauc(mat, var, verbose = verbose)
if (nchunk > 1) gc()
cli::cli_progress_update(set = i)
}
Expand Down Expand Up @@ -840,53 +841,6 @@ calcPctInOut <- function(
return(nCellExpr)
}

# makePseudoBulkOld <- function(mat, replicateAnn, minCellPerRep, verbose = TRUE) {
# # mat - Extracted and contatenated matrix. intersection of genes by
# # c(groupTest, groupCtrl) cells
# # groups - list of groups
# # replicateAnn - data.frame of replicate annotation, with rownames as
# # barcodes and columns as variables
#
# # Check whether enough replicates per condition
# for (gr in levels(replicateAnn$groups)) {
# subrep <- replicateAnn[replicateAnn$groups == gr,]
# splitLabel <- interaction(subrep, drop = TRUE)
# if (nlevels(splitLabel) < 2) {
# cli::cli_abort(
# c("Too few replicates for condition {.val {gr}}. Cannot create pseudo-bulks.",
# "i" = "Please consider creating pseudo-replicates or using {.code method = 'wilcoxon'} instead.")
# )
# }
# }
# splitLabel <- interaction(replicateAnn, drop = TRUE)
# repSizeTab <- table(splitLabel)
# if (verbose) {
# cli::cli_alert_info("Replicate sizes:")
# print(repSizeTab)
# }
# labelCounts <- table(splitLabel)
# ignored <- names(labelCounts)[labelCounts < minCellPerRep]
# if (length(ignored) > 0) {
# cli::cli_alert_warning(
# "Ignoring replicates (size in bracket) with too few cells: {.val {paste0(ignored, ' (', repSizeTab[ignored], ')')}}"
# )
# cli::cli_alert_info(
# "Consider decrease {.field minCellPerRep} to exclude less replicates or/and lower {.field nPsdRep} to generate larger pseudo-replicates."
# )
# }
# keep <- names(labelCounts)[labelCounts >= minCellPerRep]
# idx <- splitLabel %in% keep
# splitLabel <- splitLabel[idx, drop = TRUE]
# mat <- mat[, idx, drop = FALSE]
# replicateAnn <- replicateAnn[idx, , drop = FALSE]
#
# pseudoBulks <- colAggregateSums_sparse(mat, as.integer(splitLabel) - 1,
# nlevels(splitLabel))
# dimnames(pseudoBulks) <- list(rownames(mat), levels(splitLabel))
# pseudoBulks <- pseudoBulks[rowSums(pseudoBulks) > 0,]
# return(list(pseudoBulks, replicateAnn))
# }

.callDESeq2 <- function(pseudoBulks, groups,
verbose = getOption("ligerVerbose", TRUE)) {
# DESeq2 workflow
Expand Down Expand Up @@ -926,37 +880,54 @@ calcPctInOut <- function(

extractMergedNormData <- function(
object,
slot,
cellIdx = NULL,
featureIdx = NULL
) {
cellIdx <- .idxCheck(object, cellIdx, "cell")
datasetInvolved <- unique(object$dataset[cellIdx])
cellID <- colnames(object)[cellIdx]
getter <- switch(
slot,
normData = normData,
normPeak = normPeak
)
if (is.null(featureIdx)) {
# Use intersection by default
featureIdx <- Reduce(
intersect,
lapply(
normData(object, datasetInvolved),
getter(object, datasetInvolved),
rownames
)
)
}
out <- NULL
for (i in seq_along(datasetInvolved)) {
dn <- datasetInvolved[i]
ldFeatureIdx <- .idxCheck(dataset(object, dn), featureIdx, "feature")
ld <- dataset(object, dn)
mat <- getter(ld)
allFeature <- rownames(mat)
ldFeatureIdx <- match(featureIdx, allFeature)
ldFeatureIdx <- ldFeatureIdx[!is.na(ldFeatureIdx)]
# if (slot == "normData") {
# ldFeatureIdx <- .idxCheck(dataset(object, dn), featureIdx, "feature")
# } else {
# allPeaks <- rownames(getter())
# }

ldCellIdx <- match(cellID, colnames(dataset(object, dn)))
ldCellIdx <- ldCellIdx[!is.na(ldCellIdx)]
out <- cbind(out, normData(object, dn)[ldFeatureIdx, ldCellIdx, drop = FALSE])
out <- cbind(out, mat[ldFeatureIdx, ldCellIdx, drop = FALSE])
# out <- cbind(out, getter(object, dn)[ldFeatureIdx, ldCellIdx, drop = FALSE])
}
out[, cellID]
}

# X: matrix of data to be tested
# y: grouping label of columns of X
# Rcpp source code located in src/wilcoxon.cpp
wilcoxauc <- function(x, clusterVar) {
wilcoxauc <- function(x, clusterVar, verbose = verbose) {
if (methods::is(x, 'dgTMatrix')) x <- methods::as(x, 'CsparseMatrix') # nocov start
if (methods::is(x, 'TsparseMatrix')) x <- methods::as(x, 'CsparseMatrix')
if (is.null(row.names(x))) {
Expand All @@ -972,7 +943,7 @@ wilcoxauc <- function(x, clusterVar) {
xRanked <- Matrix::t(x)
# This computes the ranking of non-zero values and the ties
ties <- cpp_rank_matrix_dgc(xRanked@x, xRanked@p,
nrow(xRanked), ncol(xRanked))
nrow(xRanked), ncol(xRanked), showProgress = verbose)
# ranksRes <- list(X_ranked = xT, ties = ties)

# rankRes <- colRanking(x)
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ DirectSNNToFile <- function(nn_ranked, prune, display_progress, filename) {
.Call(`_rliger_DirectSNNToFile`, nn_ranked, prune, display_progress, filename)
}

cpp_rank_matrix_dgc <- function(x, p, nrow, ncol) {
.Call(`_rliger_cpp_rank_matrix_dgc`, x, p, nrow, ncol)
cpp_rank_matrix_dgc <- function(x, p, nrow, ncol, showProgress = FALSE) {
.Call(`_rliger_cpp_rank_matrix_dgc`, x, p, nrow, ncol, showProgress)
}

rowAggregateSum_sparse <- function(X, groups, ngroups) {
Expand Down
2 changes: 1 addition & 1 deletion R/factorMarker.R
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ getFactorMarkers <- function(
normDataList[[2]][vargene, labels[[2]] == i])
cellLabel <- rep(c(dataset1, dataset2),
c(sum(labels[[1]] == i), sum(labels[[2]] == i)))
wilcoxResult <- wilcoxauc(log1p(1e10*normData), cellLabel)
wilcoxResult <- wilcoxauc(log1p(1e10*normData), cellLabel, verbose = FALSE)

log2fc <- wilcoxResult[wilcoxResult$group == dataset1, ]$logFC
names(log2fc) <- wilcoxResult[wilcoxResult$group == dataset1, ]$feature
Expand Down
2 changes: 1 addition & 1 deletion R/liger-methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -927,7 +927,7 @@ setMethod(
function(x,
slot = c("rawData", "normData", "scaleData",
"scaleUnsharedData", "H", "V", "U", "A", "B",
"W", "H.norm"),
"W", "H.norm", "rawPeak", "normPeak"),
dataset = NULL,
returnList = FALSE) {
slot <- match.arg(slot)
Expand Down
15 changes: 15 additions & 0 deletions R/ligerDataset_subclass-methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,21 @@ setReplaceMethod(
x
})

#' @export
#' @rdname ligerDataset-class
setMethod(
"getMatrix", signature(x = "ligerATACDataset", dataset = "missing",
returnList = "missing"),
function(x,
slot = c("rawData", "normData", "scaleData",
"scaleUnsharedData", "H", "V", "U", "A", "B", "rawPeak", "normPeak"),
dataset = NULL) {
# TODO: Currently directly find the data with slot, but need to
# think about maintainability when we need to change slot name.
slot <- match.arg(slot)
methods::slot(x, slot)
})

#' @rdname coordinate
#' @export
setMethod("coordinate", signature(x = "ligerSpatialDataset", dataset = "missing"),
Expand Down
2 changes: 1 addition & 1 deletion man/liger-class.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion man/ligerDataset-class.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,16 +330,17 @@ BEGIN_RCPP
END_RCPP
}
// cpp_rank_matrix_dgc
std::vector<std::list<float> > cpp_rank_matrix_dgc(arma::vec& x, const arma::vec& p, int nrow, int ncol);
RcppExport SEXP _rliger_cpp_rank_matrix_dgc(SEXP xSEXP, SEXP pSEXP, SEXP nrowSEXP, SEXP ncolSEXP) {
std::vector<std::list<float> > cpp_rank_matrix_dgc(arma::vec& x, const arma::vec& p, int nrow, int ncol, bool showProgress);
RcppExport SEXP _rliger_cpp_rank_matrix_dgc(SEXP xSEXP, SEXP pSEXP, SEXP nrowSEXP, SEXP ncolSEXP, SEXP showProgressSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< arma::vec& >::type x(xSEXP);
Rcpp::traits::input_parameter< const arma::vec& >::type p(pSEXP);
Rcpp::traits::input_parameter< int >::type nrow(nrowSEXP);
Rcpp::traits::input_parameter< int >::type ncol(ncolSEXP);
rcpp_result_gen = Rcpp::wrap(cpp_rank_matrix_dgc(x, p, nrow, ncol));
Rcpp::traits::input_parameter< bool >::type showProgress(showProgressSEXP);
rcpp_result_gen = Rcpp::wrap(cpp_rank_matrix_dgc(x, p, nrow, ncol, showProgress));
return rcpp_result_gen;
END_RCPP
}
Expand Down Expand Up @@ -422,7 +423,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_rliger_ComputeSNN", (DL_FUNC) &_rliger_ComputeSNN, 2},
{"_rliger_WriteEdgeFile", (DL_FUNC) &_rliger_WriteEdgeFile, 3},
{"_rliger_DirectSNNToFile", (DL_FUNC) &_rliger_DirectSNNToFile, 4},
{"_rliger_cpp_rank_matrix_dgc", (DL_FUNC) &_rliger_cpp_rank_matrix_dgc, 4},
{"_rliger_cpp_rank_matrix_dgc", (DL_FUNC) &_rliger_cpp_rank_matrix_dgc, 5},
{"_rliger_rowAggregateSum_sparse", (DL_FUNC) &_rliger_rowAggregateSum_sparse, 3},
{"_rliger_colAggregateSum_sparse", (DL_FUNC) &_rliger_colAggregateSum_sparse, 3},
{"_rliger_colNNZAggr_sparse", (DL_FUNC) &_rliger_colNNZAggr_sparse, 3},
Expand Down
17 changes: 16 additions & 1 deletion src/wilcoxon.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

// [[Rcpp::depends(RcppProgress)]]
#include <progress.hpp>
#include <progress_bar.hpp>

using namespace std;
using namespace Rcpp;
using namespace arma;
Expand Down Expand Up @@ -54,18 +59,28 @@ std::list<float> cpp_in_place_rank_mean(arma::vec& v_temp, int idx_begin,

// [[Rcpp::export]]
std::vector<std::list<float> > cpp_rank_matrix_dgc(
arma::vec& x, const arma::vec& p, int nrow, int ncol) {
arma::vec& x, const arma::vec& p, int nrow, int ncol,
bool showProgress = false
) {
vector<list<float> > ties(ncol);
int n_zero;
if (showProgress) {
Rcpp::Rcerr << "Wilcoxon rank-sum test over " << ncol << " features across " << nrow << " cells" << std::endl;
}
Progress pb(ncol, showProgress);
for (int i = 0; i < ncol; i++) {
if (Progress::check_abort()) return ties;

n_zero = nrow - (p[i+1] - p[i]);
if (p[i+1] == p[i]) {
ties[i].push_back(n_zero);
pb.increment();
continue;
}
ties[i] = cpp_in_place_rank_mean(x, p[i], p[i + 1] - 1);
ties[i].push_back(n_zero);
x.rows(p[i], p[i + 1] - 1) += n_zero;
pb.increment();
}
return ties;
}
Expand Down
Loading

0 comments on commit 8d36995

Please sign in to comment.