From e939e112d5130c839e1e89ecfceb70e7dc7c6f7e Mon Sep 17 00:00:00 2001 From: Yichen Wang Date: Tue, 19 Mar 2024 14:41:40 -0400 Subject: [PATCH] New article; with fix to h5/h5ad support; add more inmf abort condition --- DESCRIPTION | 4 +- NAMESPACE | 8 + NEWS.md | 9 +- R/classes.R | 22 +- R/h5Utility.R | 227 ++++++++++++++++- R/import.R | 188 +++++++++----- R/integration.R | 27 +- R/liger-methods.R | 4 +- R/preprocess.R | 22 +- R/subsetObject.R | 6 +- README.md | 4 +- man/closeAllH5.Rd | 8 +- man/createH5LigerDataset.Rd | 39 ++- man/createLiger.Rd | 4 + man/liger-class.Rd | 4 +- man/restoreH5Liger.Rd | 2 +- man/writeH5.Rd | 87 +++++++ tests/testthat/test_factorization.R | 2 +- tests/testthat/test_object.R | 4 +- vignettes/articles/import_data.Rmd | 365 ++++++++++++++++++++++++++++ 20 files changed, 920 insertions(+), 116 deletions(-) create mode 100644 man/writeH5.Rd create mode 100644 vignettes/articles/import_data.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index c5ac6535..5db63db2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rliger -Version: 1.99.1 -Date: 2023-11-09 +Version: 2.0.0 +Date: 2024-03-19 Type: Package Title: Linked Inference of Genomic Experimental Relationships Description: Uses an extension of nonnegative matrix factorization to identify shared and dataset-specific factors. See Welch J, Kozareva V, et al (2019) , and Liu J, Gao C, Sodicoff J, et al (2020) for more details. diff --git a/NAMESPACE b/NAMESPACE index 1c3a6532..69f7d8b3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,6 +18,8 @@ S3method(as.ligerDataset,ligerDataset) S3method(as.ligerDataset,matrix) S3method(c,liger) S3method(cbind,ligerDataset) +S3method(closeAllH5,liger) +S3method(closeAllH5,ligerDataset) S3method(fortify,liger) S3method(length,liger) S3method(lengths,liger) @@ -46,6 +48,10 @@ S3method(scaleNotCenter,ligerDataset) S3method(scaleNotCenter,ligerMethDataset) S3method(selectGenes,Seurat) S3method(selectGenes,liger) +S3method(writeH5,default) +S3method(writeH5,dgCMatrix) +S3method(writeH5,liger) +S3method(writeH5,ligerDataset) export("%>%") export("cellMeta<-") export("coordinate<-") @@ -78,6 +84,7 @@ export(commandDiff) export(commands) export(convertOldLiger) export(coordinate) +export(createH5LigerDataset) export(createLiger) export(createLigerDataset) export(dataset) @@ -199,6 +206,7 @@ export(subsetLigerDataset) export(subsetMemLigerDataset) export(varFeatures) export(varUnsharedFeatures) +export(writeH5) exportClasses(ligerATACDataset) exportClasses(ligerCommand) exportClasses(ligerDataset) diff --git a/NEWS.md b/NEWS.md index 349da770..dafbdaa4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,16 +1,17 @@ -## rliger 1.9.9 +## rliger 2.0.0 - Added `ligerDataset` class for per-dataset information storage, with inheritance for specific modalities - Added a number of plotting functions with clear function names and useful functionality - Added Leiden clustering method, now as default rather than Louvain - Added pseudo-bulk DEG method +- Added DEG analysis with one-vs-rest marker detection in `runMarkerDEG()` and pairwise comparison in `runPairwiseDEG()` - Added gene name pattern for expression percentage QC metric - Added native Seurat object support for the core integration workflow - Added a documentation website built with pkgdown +- Added new iNMF variant method, consensus iNMF (c-iNMF), in `runCINMF()` - Changed `liger` object class structure -- Changed iNMF (previously `optimizeALS()`), UINMF (previously `optimizeALS(unshared = TRUE)`) and online iNMF (previously `online_iNMF()`) implementation with vastly improved performance. -Now named by `runINMF()`, `runUINMF()` and `runOnlineINMF()` respectively, and wrapped in -`runIntegration()`. +- Changed iNMF (previously `optimizeALS()`), UINMF (previously `optimizeALS(unshared = TRUE)`) and online iNMF (previously `online_iNMF()`) implementation with vastly improved performance. Now named by `runINMF()`, `runUINMF()` and `runOnlineINMF()` respectively, and wrapped in `runIntegration()`. +- Updated H5AD support to match up with Python anndata package 0.8.0 specs ## rliger 1.0.1 diff --git a/R/classes.R b/R/classes.R index 3f768d24..7de8d112 100644 --- a/R/classes.R +++ b/R/classes.R @@ -206,14 +206,23 @@ liger <- setClass( } .checkDatasetVar <- function(x) { + if (!"dataset" %in% names(cellMeta(x))) { + return("`datasets` variable missing in cellMeta(x)") + } if (!is.factor(x@cellMeta$dataset)) { return("\"dataset\" variable in cellMeta is not a factor") } - ds_cm <- levels(droplevels(x@cellMeta[["dataset"]])) - ds_ds <- names(x@datasets) + ds_cm <- as.character(levels(droplevels(x@cellMeta[["dataset"]]))) + ds_ds <- as.character(names(x@datasets)) if (!identical(ds_cm, ds_ds)) { return("`levels(x$dataset)` does not match `names(x)`.") } + + datasetNamesFromDatasets <- as.character(rep(names(x), lapply(datasets(x), ncol))) + names(datasetNamesFromDatasets) <- NULL + if (!identical(datasetNamesFromDatasets, as.character(x$dataset))) { + return("names of datasets do not match \"datasets\" variable in cellMeta") + } return(NULL) } @@ -242,16 +251,7 @@ liger <- setClass( return("H.norm barcodes do not match to barcodes in datasets.") } } - if (!"dataset" %in% names(cellMeta(x))) { - return("`datasets` variable missing in cellMeta(x)") - } - datasetNamesFromDatasets <- rep(names(x), lapply(datasets(x), ncol)) - names(datasetNamesFromDatasets) <- NULL - if (!identical(datasetNamesFromDatasets, as.character(x$dataset))) { - return("names of datasets do not match - `datasets` variable in cellMeta") - } return(NULL) } diff --git a/R/h5Utility.R b/R/h5Utility.R index 161ea944..d0e43b9b 100644 --- a/R/h5Utility.R +++ b/R/h5Utility.R @@ -183,7 +183,7 @@ safeH5Create <- function(object, #' lig <- createLiger(list(ctrl = h5Path)) #' # Now it is actually an invalid object! which is equivalent to what users #' # will get with `saveRDS(lig, "object.rds"); lig <- readRDS("object.rds")`` -#' lig <- closeAllH5(lig) +#' closeAllH5(lig) #' lig <- restoreH5Liger(lig) restoreH5Liger <- function(object, filePath = NULL) { if (!inherits(object, "liger") && !inherits(object, "ligerDataset")) { @@ -298,16 +298,34 @@ restoreOnlineLiger <- function(object, file.path = NULL) { #' of the currect R session, the HDF5 files has to be closed in order to be #' available to other processes. #' @param object liger object. -#' @return \code{object} with links closed. +#' @return Nothing is returned. #' @export +#' @rdname closeAllH5 closeAllH5 <- function(object) { - if (!isH5Liger(object)) return(object) + UseMethod("closeAllH5", object) +} + +#' @rdname closeAllH5 +#' @export +#' @method closeAllH5 liger +closeAllH5.liger <- function(object) { for (dn in names(object)) { - if (!isH5Liger(object, dn)) next - h5f <- getH5File(object, dn) - h5f$close_all() + ld <- dataset(object, dn) + closeAllH5(ld) } - return(object) +} + +#' @rdname closeAllH5 +#' @export +#' @method closeAllH5 ligerDataset +closeAllH5.ligerDataset <- function(object) { + h5file <- getH5File(object) + if (!is.null(h5file)) { + path <- h5file$filename + cli::cli_alert_info("Closing H5 file: {.file {path}}") + h5file$close_all() + } + return(invisible(NULL)) } .H5GroupToH5SpMat <- function(obj, dims) { @@ -323,3 +341,198 @@ closeAllH5 <- function(object) { # RcppPlanc::H5Mat(filename = obj$get_filename(), # dataPath = obj$get_obj_name()) # } + + + + + +#' Write in-memory data into H5 file +#' @rdname writeH5 +#' @description +#' This function writes in-memory data into H5 file by default in 10x cellranger +#' HDF5 output format. The main goal of this function is to allow users to +#' integrate large H5-based dataset, that cannot be fully loaded into memory, +#' with other data already loaded in memory using \code{\link{runOnlineINMF}}. +#' In this case, users can write the smaller in-memory data to H5 file instead +#' of loading subset of the large H5-based dataset into memory, where +#' information might be lost. +#' +#' Basing on the goal of the whole workflow, the data will always be written +#' in a CSC matrix format and colnames/rownames are always required. +#' +#' The default method coerces the input to a \linkS4class{dgCMatrix}. Methods +#' for other container classes tries to extract proper data and calls the +#' default method. +#' @param x An object with in-memory data to be written into H5 file. +#' @param file A character string of the file path to be written. +#' @param overwrite Logical, whether to overwrite the file if it already exists. +#' Default \code{FALSE}. +#' @param indicesPath,indptrPath,dataPath The paths inside the H5 file where +#' the \linkS4class{dgCMatrix} constructor \code{i}, \code{p}, and \code{x} will +#' be written to, respectively. Default using cellranger convention +#' \code{"matrix/indices"}, \code{"matrix/indptr"}, and \code{"matrix/data"}. +#' @param shapePath The path inside the H5 file where the shape of the matrix +#' will be written to. Default \code{"matrix/shape"}. +#' @param barcodesPath The path inside the H5 file where the barcodes/colnames +#' will be written to. Default \code{"matrix/barcodes"}. Skipped if the object +#' does not have colnames. +#' @param featuresPath The path inside the H5 file where the features/rownames +#' will be written to. Default \code{"matrix/features/name"}. Skipped if the +#' object does not have rownames. +#' @param useDatasets For liger method. Names or indices of datasets to be +#' written to H5 files. Required. +#' @param ... Arguments passed to other S3 methods. +#' @export +#' @seealso +#' \href{https://www.10xgenomics.com/cn/support/software/cell-ranger/latest/analysis/outputs/cr-outputs-h5-matrices}{10X cellranger H5 matrix detail} +#' @return Nothing is returned. H5 file will be created on disk. +#' @examples +#' raw <- rawData(pbmc, "ctrl") +#' writeH5(raw, "ctrl.h5") +#' unlink("ctrl.h5") +writeH5 <- function(x, file, ...) { + UseMethod("writeH5", x) +} + +#' @rdname writeH5 +#' @export +#' @method writeH5 default +writeH5.default <- function(x, file, ...) { + x <- methods::as(x, "CsparseMatrix") + writeH5(x, file, ...) +} + +#' @rdname writeH5 +#' @export +#' @method writeH5 dgCMatrix +writeH5.dgCMatrix <- function( + x, + file, + overwrite = FALSE, + indicesPath = "matrix/indices", + indptrPath = "matrix/indptr", + dataPath = "matrix/data", + shapePath = "matrix/shape", + barcodesPath = "matrix/barcodes", + featuresPath = "matrix/features/name", + ... +) { + if (file.exists(file)) { + if (isFALSE(overwrite)) { + cli::cli_abort( + c("x" = "File already exists at: {.file {normalizePath(file)}}.", + "i" = "Use {.code overwrite = TRUE} to overwrite.") + ) + } else { + file.remove(file) + } + } + + if (any(sapply(dimnames(x), is.null))) { + cli::cli_abort("Both rownames and colnames are required.") + } + + dataType <- typeof(x@x) + dataDtype <- switch( + dataType, + double = "double", + integer = "uint32_t", + cli::cli_abort( + "Unsupported data type in the sparse matrix: {.val {dataType}}" + ) + ) + h5file <- hdf5r::H5File$new(file, mode = "w") + safeH5Create( + object = h5file, + dataPath = indicesPath, + dims = length(x@i), + dtype = "uint32_t", + chunkSize = 4096 + ) + safeH5Create( + object = h5file, + dataPath = indptrPath, + dims = length(x@p), + dtype = "uint32_t", + chunkSize = 2048 + ) + safeH5Create( + object = h5file, + dataPath = dataPath, + dims = length(x@x), + dtype = dataDtype, + chunkSize = 4096 + ) + safeH5Create( + object = h5file, + dataPath = shapePath, + dims = 2, + dtype = "uint64_t" + ) + h5file[[indicesPath]][] <- x@i + h5file[[indptrPath]][] <- x@p + h5file[[dataPath]][] <- x@x + h5file[[shapePath]][] <- dim(x) + + safeH5Create( + object = h5file, + dataPath = barcodesPath, + dims = ncol(x), + dtype = "char" + ) + h5file[[barcodesPath]][] <- colnames(x) + + safeH5Create( + object = h5file, + dataPath = featuresPath, + dims = nrow(x), + dtype = "char" + ) + h5file[[featuresPath]][] <- rownames(x) + + h5file$close() + invisible(NULL) +} + +#' @rdname writeH5 +#' @export +#' @method writeH5 ligerDataset +writeH5.ligerDataset <- function(x, file, ...) { + raw <- rawData(x) + if (is.null(raw)) { + cli::cli_abort("No {.code rawData(x)} available.") + } + writeH5(raw, file, ...) +} + +#' @rdname writeH5 +#' @export +#' @method writeH5 liger +writeH5.liger <- function(x, file, useDatasets, ...) { + useDatasets <- .checkUseDatasets(x, useDatasets) + file <- .checkArgLen(file, n = length(useDatasets), repN = FALSE, + class = "character", .stop = TRUE) + for (i in seq_along(useDatasets)) { + if (isH5Liger(x, i)) { + cli::cli_alert_warning( + "Dataset {.val {useDatasets[i]}} is H5 based, file located at: {.file {getH5File(x, useDatasets[i])$filename}}. Skipped." + ) + next + } + raw <- rawData(x, useDatasets[i]) + if (is.null(raw)) { + cli::cli_abort("No {.code rawData(x, '{useDatasets[i]}')} available.") + } + tryCatch( + expr = { + writeH5(raw, file[i], ...) + }, + error = function(e) { + cli::cli_alert_danger("Failed to write dataset {.val {useDatasets[i]}} to H5 file at {.file {file[i]}}. Continuing with the others.") + msg <- format(e) + cli::cli_alert_danger("Call {.code {msg[['call']]}}: {msg[['message']]}") + } + ) + } + invisible(NULL) +} diff --git a/R/import.R b/R/import.R index 0f97dd69..67844529 100644 --- a/R/import.R +++ b/R/import.R @@ -22,6 +22,8 @@ #' options are \code{"10X"} and \code{"AnnData"}. Can be either a single #' specification for all datasets or a character vector that match with each #' dataset. +#' @param anndataX The HDF5 path to the raw count data in an H5AD file. See +#' \code{\link{createH5LigerDataset}} Details. Default \code{"X"}. #' @param dataName,indicesName,indptrName The path in a H5 file for the raw #' sparse matrix data. These three types of data stands for the \code{x}, #' \code{i}, and \code{p} slots of a \code{\link[Matrix]{dgCMatrix-class}} @@ -61,6 +63,7 @@ createLiger <- function( removeMissing = TRUE, addPrefix = "auto", formatType = "10X", + anndataX = "X", dataName = NULL, indicesName = NULL, indptrName = NULL, @@ -99,9 +102,10 @@ createLiger <- function( data <- rawData[[i]] if (is.character(data)) { # Assuming character input is a filename - datasets[[dname]] <- createH5LigerDataset( + ld <- createH5LigerDataset( h5file = data, formatType = formatType, + anndataX = anndataX, rawData = dataName, barcodesName = barcodesName, genesName = genesName, @@ -110,9 +114,10 @@ createLiger <- function( modal = modal[i] ) } else { - datasets[[dname]] <- as.ligerDataset(data, modal = modal[i]) + ld <- as.ligerDataset(data, modal = modal[i]) } - colnameOrig <- colnames(datasets[[dname]]) + if (is.null(ld)) next + colnameOrig <- colnames(ld) prefix <- paste0(dname, "_") .addPrefix <- FALSE if (addPrefix == "auto") { @@ -121,8 +126,9 @@ createLiger <- function( } barcodesOrig <- c(barcodesOrig, colnameOrig) if (.addPrefix) { - colnames(datasets[[dname]]) <- paste0(prefix, colnameOrig) + colnames(ld) <- paste0(prefix, colnameOrig) } + datasets[[dname]] <- ld } datasets <- lapply(datasets, function(ld) { @@ -230,25 +236,53 @@ createLigerDataset <- function( } #' Create on-disk ligerDataset Object +#' @description +#' For convenience, the default \code{formatType = "10x"} directly fits the +#' structure of cellranger output. \code{formatType = "anndata"} works for +#' current AnnData H5AD file specification (see Details). If there a customized +#' H5 file structure is presented, any of the \code{rawData}, +#' \code{indicesName}, \code{indptrName}, \code{genesName}, \code{barcodesName} +#' should be specified accordingly to override the \code{formatType} preset. +#' @details +#' For H5AD file written from an AnnData object, we allow using +#' \code{formatType = "anndata"} for the function to infer the proper structure. +#' However, while a typical AnnData-based analysis tends to in-place update the +#' \code{adata.X} attribute and there is no standard/forced convention for where +#' the raw count data, as needed from LIGER, is stored. Therefore, we expose +#' argument \code{anndataX} for specifying this information. The default value +#' \code{"X"} looks for \code{adata.X}. If the raw data is stored in a layer, +#' e.g. \code{adata.layers['count']}, then \code{anndataX = "layers/count"}. +#' If it is stored to \code{adata.raw.X}, then \code{anndataX = "raw/X"}. If +#' your AnnData object does not have the raw count retained, you will have to +#' go back to the Python work flow to have it inserted at desired object space +#' and re-write the H5AD file, or just go from upstream source files with which +#' the AnnData was originally created. #' @param h5file Filename of an H5 file #' @param formatType Select preset of H5 file structure. Default \code{"10X"}. -#' Current available options are \code{"10X"} and \code{"AnnData"}. +#' Current available option is only \code{"10X"}. #' @param rawData,indicesName,indptrName The path in a H5 file for the raw #' sparse matrix data. These three types of data stands for the \code{x}, #' \code{i}, and \code{p} slots of a \code{\link[Matrix]{dgCMatrix-class}} #' object. Default \code{NULL} uses \code{formatType} preset. #' @param normData The path in a H5 file for the "x" vector of the normalized #' sparse matrix. Default \code{NULL}. -#' @param scaleData The path in a H5 file for the dense 2D scaled matrix. -#' Default \code{NULL}. +#' @param scaleData The path in a H5 file for the Group that contains the sparse +#' matrix constructing information for the scaled data. Default \code{NULL}. #' @param genesName,barcodesName The path in a H5 file for the gene names and #' cell barcodes. Default \code{NULL} uses \code{formatType} preset. +#' @param anndataX The HDF5 path to the raw count data in an H5AD file. See +#' Details. Default \code{"X"}. #' @param modal Name of modality for this dataset. Currently options of #' \code{"default"}, \code{"rna"}, \code{"atac"}, \code{"spatial"} and #' \code{"meth"} are supported. Default \code{"default"}. #' @param featureMeta Data frame for feature metadata. Default \code{NULL}. #' @param ... Additional slot data. See \linkS4class{ligerDataset} for detail. #' Given values will be directly placed at corresponding slots. +#' @export +#' @return H5-based \linkS4class{ligerDataset} object +#' @examples +#' h5Path <- system.file("extdata/ctrl.h5", package = "rliger") +#' ld <- createH5LigerDataset(h5Path) createH5LigerDataset <- function( h5file, formatType = "10X", @@ -259,6 +293,7 @@ createH5LigerDataset <- function( genesName = NULL, indicesName = NULL, indptrName = NULL, + anndataX = "X", modal = c("default", "rna", "atac", "spatial", "meth"), featureMeta = NULL, ... @@ -272,61 +307,94 @@ createH5LigerDataset <- function( modal <- match.arg(modal) additional <- list(...) h5file <- hdf5r::H5File$new(h5file, mode = "r+") - if (!is.null(formatType)) { - if (formatType == "10X") { - barcodesName <- "matrix/barcodes" - barcodes <- h5file[[barcodesName]][] - rawData <- "matrix/data" - indicesName <- "matrix/indices" - indptrName <- "matrix/indptr" - genesName <- "matrix/features/name" - genes <- h5file[[genesName]][] - } else if (formatType == "AnnData") { - barcodesName <- "obs" - barcodes <- h5file[[barcodesName]][]$cell - rawData <- "raw.X/data" - indicesName <- "raw.X/indices" - indptrName <- "raw.X/indptr" - genesName <- "raw.var" - genes <- h5file[[genesName]][] - } else { - cli::cli_abort("Specified {.var formatType} ({.val {formatType}}) is not supported for now.") + tryCatch( + { + if (!is.null(formatType)) { + formatType <- tolower(formatType) + if (formatType == "10x") { + barcodesName <- barcodesName %||% "matrix/barcodes" + barcodes <- h5file[[barcodesName]][] + rawData <- rawData %||% "matrix/data" + indicesName <- indicesName %||% "matrix/indices" + indptrName <- indptrName %||% "matrix/indptr" + genesName <- genesName %||% "matrix/features/name" + genes <- h5file[[genesName]][] + } else if (formatType == "anndata") { + # AnnData specs changed around 0.7.0 + if (inherits(h5file[["obs"]], "H5Group")) { + barcodesName <- paste0("obs/", hdf5r::h5attr(h5file[["obs"]], "_index")) + barcodes <- h5file[[barcodesName]][] + genesName <- paste0("var/", hdf5r::h5attr(h5file[["var"]], "_index")) + genes <- h5file[[genesName]][] + } else if (inherits(h5file[["obs"]], "H5D")) { + barcodesName <- "obs" + barcodes <- h5file[["obs"]][]$cell + genesName <- genesName %||% "raw.var" + genes <- h5file[[genesName]][] + } + rawData <- rawData %||% paste0(anndataX, "/data") + indicesName <- indicesName %||% paste0(anndataX, "/indices") + indptrName <- indptrName %||% paste0(anndataX, "/indptr") + if (!inherits(h5file[[rawData]]$key_info$type, "H5T_INTEGER")) { + cli::cli_alert_warning( + "Warning: H5AD matrix data detected is not of integer dtype while {.pkg rliger} requires raw counts input." + ) + cli::cli_alert_warning( + "Use caution when using this dataset if the data really does not have integer values." + ) + cli::cli_alert_info( + "A different matrix can be selected using argument {.var anndataX}." + ) + cli::cli_alert_info( + "See {.code ?createH5LigerDataset} {.emph Details} for more information." + ) + } + } else { + cli::cli_abort("Specified {.var formatType} ({.val {formatType}}) is not supported for now.") + } + } else { + barcodes <- h5file[[barcodesName]][] + genes <- h5file[[genesName]][] + } + # The order of list elements matters. Put "paths" together so easier for + # checking link existence. + h5.meta <- list( + H5File = h5file, + filename = h5file$filename, + formatType = formatType, + indicesName = indicesName, + indptrName = indptrName, + barcodesName = barcodesName, + genesName = genesName, + rawData = rawData, + normData = normData, + scaleData = scaleData + ) + if (!is.null(rawData)) rawData <- h5file[[rawData]] + if (!is.null(normData)) normData <- h5file[[normData]] + if (!is.null(scaleData)) scaleData <- h5file[[scaleData]] + if (is.null(featureMeta)) + featureMeta <- S4Vectors::DataFrame(row.names = genes) + else if (!inherits(featureMeta, "DFrame")) + featureMeta <- S4Vectors::DataFrame(featureMeta) + allData <- list(Class = .modalClassDict[[modal]], + rawData = rawData, + normData = normData, + scaleData = scaleData, + #H = H, V = V, A = A, B = B, U = U, + h5fileInfo = h5.meta, featureMeta = featureMeta, + colnames = barcodes, rownames = genes) + allData <- c(allData, additional) + x <- do.call("new", allData) + return(x) + }, + error = function(e) { + message(format(e)) + cli::cli_alert_warning("Closing all linking to H5 file: {.file {h5file$filename}}") + h5file$close_all() + return(invisible(NULL)) } - } else { - barcodes <- h5file[[barcodesName]][] - genes <- h5file[[genesName]][] - } - # The order of list elements matters. Put "paths" together so easier for - # checking link existence. - h5.meta <- list( - H5File = h5file, - filename = h5file$filename, - formatType = formatType, - indicesName = indicesName, - indptrName = indptrName, - barcodesName = barcodesName, - genesName = genesName, - rawData = rawData, - normData = normData, - scaleData = scaleData ) - if (!is.null(rawData)) rawData <- h5file[[rawData]] - if (!is.null(normData)) normData <- h5file[[normData]] - if (!is.null(scaleData)) scaleData <- h5file[[scaleData]] - if (is.null(featureMeta)) - featureMeta <- S4Vectors::DataFrame(row.names = genes) - else if (!inherits(featureMeta, "DFrame")) - featureMeta <- S4Vectors::DataFrame(featureMeta) - allData <- list(.modalClassDict[[modal]], - rawData = rawData, - normData = normData, - scaleData = scaleData, - #H = H, V = V, A = A, B = B, U = U, - h5fileInfo = h5.meta, featureMeta = featureMeta, - colnames = barcodes, rownames = genes) - allData <- c(allData, additional) - x <- do.call("new", allData) - x } diff --git a/R/integration.R b/R/integration.R index f5a688d5..967d2773 100644 --- a/R/integration.R +++ b/R/integration.R @@ -390,6 +390,13 @@ runINMF.Seurat <- function( allFeatures <- lapply(object, rownames) features <- Reduce(.same, allFeatures) + if (min(lengths(barcodeList)) < k) { + cli::cli_abort("Number of factors (k={k}) should be less than the number of cells in the smallest dataset ({min(lengths(barcodeList))}).") + } + if (length(features) < k) { + cli::cli_abort("Number of factors (k={k}) should be less than the number of shared features ({length(features)}).") + } + bestResult <- list() bestObj <- Inf bestSeed <- seed @@ -751,7 +758,7 @@ runOnlineINMF.liger <- function( } } } - object <- closeAllH5(object) + closeAllH5(object) res <- .runOnlineINMF.list(Es, newDatasets = newDatasets, projection = projection, k = k, lambda = lambda, maxEpochs = maxEpochs, @@ -822,6 +829,17 @@ runOnlineINMF.liger <- function( allFeatures <- c(lapply(object, rownames), lapply(newDatasets, rownames)) features <- Reduce(.same, allFeatures) + # In the case for H5 liger, we don't have dimnames associated with input + # NOTE: RcppPlanc got dim method for the H5SpMat class. + ncellPerDataset <- unlist(c(sapply(object, ncol), sapply(newDatasets, ncol))) + nFeaturePerDataset <- unlist(c(sapply(object, nrow), sapply(newDatasets, nrow))) + if (min(ncellPerDataset) < k) { + cli::cli_abort("Number of factors (k={k}) should be less than the number of cells in the smallest dataset ({min(ncellPerDataset)}).") + } + if (nFeaturePerDataset[1] < k) { + cli::cli_abort("Number of factors (k={k}) should be less than the number of shared features ({nFeaturePerDataset}).") + } + if (!is.null(seed)) set.seed(seed) res <- RcppPlanc::onlineINMF(objectList = object, newDatasets = newDatasets, @@ -1194,6 +1212,13 @@ runUINMF.liger <- function( allFeatures <- lapply(object, rownames) features <- Reduce(.same, allFeatures) + if (min(lengths(barcodeList)) < k) { + cli::cli_abort("Number of factors (k={k}) should be less than the number of cells in the smallest dataset ({min(lengths(barcodeList))}).") + } + if (length(features) < k) { + cli::cli_abort("Number of factors (k={k}) should be less than the number of shared features ({length(features)}).") + } + bestObj <- Inf bestRes <- NULL bestSeed <- NULL diff --git a/R/liger-methods.R b/R/liger-methods.R index ff2bb9b5..c6e32129 100644 --- a/R/liger-methods.R +++ b/R/liger-methods.R @@ -68,7 +68,9 @@ is.newLiger <- function(object) { #' ## Retrieving cell metadata, replacement methods available #' cellMeta(pbmcPlot) #' head(pbmcPlot[["nUMI"]]) -#' head(pbmcPlot$UMAP) +#' +#' ## Retrieving dimemtion reduction matrix +#' head(dimRed(pbmcPlot, "UMAP")) #' #' ## Retrieving variable features, replacement methods available #' varFeatures(pbmcPlot) diff --git a/R/preprocess.R b/R/preprocess.R index 95b94399..275e6712 100644 --- a/R/preprocess.R +++ b/R/preprocess.R @@ -292,8 +292,10 @@ removeMissing <- function( rmCellDataset <- length(cellIdx) != ncol(ld) subsetted <- c(subsetted, any(c(rmFeatureDataset, rmCellDataset))) if (any(c(rmFeatureDataset, rmCellDataset))) { - if (isTRUE(verbose)) + if (isTRUE(verbose)) { cli::cli_alert_info("Removing missing in dataset: {.val {d}}") + } + datasets.new[[d]] <- subsetLigerDataset( ld, featureIdx = featureIdx, @@ -309,7 +311,7 @@ removeMissing <- function( } if (any(subsetted)) { allCells <- unlist(lapply(datasets.new, colnames), use.names = FALSE) - methods::new( + object <- methods::new( "liger", datasets = datasets.new, cellMeta = cellMeta(object, cellIdx = allCells, @@ -317,9 +319,8 @@ removeMissing <- function( varFeatures = character(), H.norm = object@H.norm[allCells, , drop = FALSE] ) - } else { - object } + return(object) } #' @rdname removeMissing @@ -435,11 +436,6 @@ normalize.ligerDataset <- function( safeH5Create(object = object, dataPath = resultH5Path, dims = rawData(object)$dims, dtype = "double", chunkSize = rawData(object)$chunk_dims) - ## These are for feature level metadata, to be used in downstream steps - safeH5Create(object = object, dataPath = "gene_means", - dims = nrow(object), dtype = "double") - safeH5Create(object = object, dataPath = "gene_sum_sq", - dims = nrow(object), dtype = "double") # Chunk run results <- H5Apply( object, @@ -456,9 +452,7 @@ normalize.ligerDataset <- function( chunkSize = chunk, verbose = verbose ) results$geneMeans <- results$geneMeans / ncol(object) - h5file[["gene_means"]][seq_along(results$geneMeans)] <- results$geneMeans featureMeta(object, check = FALSE)$geneMeans <- results$geneMeans - h5file[["gene_sum_sq"]][seq_along(results$geneSumSq)] <- results$geneSumSq featureMeta(object, check = FALSE)$geneSumSq <- results$geneSumSq normData(object, check = FALSE) <- h5file[[resultH5Path]] h5fileInfo(object, "normData", check = FALSE) <- resultH5Path @@ -780,11 +774,8 @@ selectGenes.liger <- function( #' @noRd calcGeneVars.H5 <- function(object, chunkSize = 1000, verbose = getOption("ligerVerbose", TRUE)) { - h5file <- getH5File(object) geneVars <- rep(0, nrow(object)) - geneMeans <- h5file[["gene_means"]][] - safeH5Create(object = object, dataPath = "gene_vars", - dims = nrow(object), dtype = "double") + geneMeans <- featureMeta(object)$geneMeans geneVars <- H5Apply( object, function(chunk, sparseXIdx, cellIdx, values) { @@ -796,7 +787,6 @@ calcGeneVars.H5 <- function(object, chunkSize = 1000, verbose = verbose ) geneVars <- geneVars / (ncol(object) - 1) - h5file[["gene_vars"]][seq_along(geneVars)] <- geneVars featureMeta(object, check = FALSE)$geneVars <- geneVars object } diff --git a/R/subsetObject.R b/R/subsetObject.R index cff80f1d..6ac65971 100644 --- a/R/subsetObject.R +++ b/R/subsetObject.R @@ -331,7 +331,7 @@ subsetH5LigerDataset <- function( featureIdx = featureIdx, useSlot = useSlot, chunkSize = chunkSize, verbose = verbose ) - }, error=function(e) { + }, error = function(e) { cli::cli_alert_danger( "An error occurred during subseting from H5 to H5." ) @@ -340,13 +340,15 @@ subsetH5LigerDataset <- function( stop(e) } ) - } else if (isFALSE(newH5)) { newObj <- subsetH5LigerDatasetToMem( object, cellIdx = cellIdx, featureIdx = featureIdx, useSlot = useSlot, chunkSize = chunkSize, verbose = verbose, returnObject = returnObject) } + cli::cli_alert_warning( + "The original H5 file is not modified and stays open. Use {.fn gc} to close." + ) newObj } diff --git a/README.md b/README.md index 1dd0be2c..a73022ab 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ R -codecov +codecov cran # LIGER (Linked Inference of Genomic Experimental Relationships) @@ -66,4 +66,4 @@ data("bmmc") ``` We also provide a set of datasets for real-world style demos, including scRNAseq, scATACseq, spatial transcriptomics and DNA methylation data. -They are described in detail in the articles that use of them. Please check them out from the links above. +They are described in detail in the articles that make use of them. Please check them out from the links above. diff --git a/man/closeAllH5.Rd b/man/closeAllH5.Rd index 5be7bf7d..eff8ef0c 100644 --- a/man/closeAllH5.Rd +++ b/man/closeAllH5.Rd @@ -2,15 +2,21 @@ % Please edit documentation in R/h5Utility.R \name{closeAllH5} \alias{closeAllH5} +\alias{closeAllH5.liger} +\alias{closeAllH5.ligerDataset} \title{Close all links (to HDF5 files) of a liger object} \usage{ closeAllH5(object) + +\method{closeAllH5}{liger}(object) + +\method{closeAllH5}{ligerDataset}(object) } \arguments{ \item{object}{liger object.} } \value{ -\code{object} with links closed. +Nothing is returned. } \description{ When need to interact with the data embedded in HDF5 files out diff --git a/man/createH5LigerDataset.Rd b/man/createH5LigerDataset.Rd index 29319bc5..d0a7801d 100644 --- a/man/createH5LigerDataset.Rd +++ b/man/createH5LigerDataset.Rd @@ -14,6 +14,7 @@ createH5LigerDataset( genesName = NULL, indicesName = NULL, indptrName = NULL, + anndataX = "X", modal = c("default", "rna", "atac", "spatial", "meth"), featureMeta = NULL, ... @@ -23,7 +24,7 @@ createH5LigerDataset( \item{h5file}{Filename of an H5 file} \item{formatType}{Select preset of H5 file structure. Default \code{"10X"}. -Current available options are \code{"10X"} and \code{"AnnData"}.} +Current available option is only \code{"10X"}.} \item{rawData, indicesName, indptrName}{The path in a H5 file for the raw sparse matrix data. These three types of data stands for the \code{x}, @@ -33,12 +34,15 @@ object. Default \code{NULL} uses \code{formatType} preset.} \item{normData}{The path in a H5 file for the "x" vector of the normalized sparse matrix. Default \code{NULL}.} -\item{scaleData}{The path in a H5 file for the dense 2D scaled matrix. -Default \code{NULL}.} +\item{scaleData}{The path in a H5 file for the Group that contains the sparse +matrix constructing information for the scaled data. Default \code{NULL}.} \item{genesName, barcodesName}{The path in a H5 file for the gene names and cell barcodes. Default \code{NULL} uses \code{formatType} preset.} +\item{anndataX}{The HDF5 path to the raw count data in an H5AD file. See +Details. Default \code{"X"}.} + \item{modal}{Name of modality for this dataset. Currently options of \code{"default"}, \code{"rna"}, \code{"atac"}, \code{"spatial"} and \code{"meth"} are supported. Default \code{"default"}.} @@ -48,6 +52,33 @@ cell barcodes. Default \code{NULL} uses \code{formatType} preset.} \item{...}{Additional slot data. See \linkS4class{ligerDataset} for detail. Given values will be directly placed at corresponding slots.} } +\value{ +H5-based \linkS4class{ligerDataset} object +} \description{ -Create on-disk ligerDataset Object +For convenience, the default \code{formatType = "10x"} directly fits the +structure of cellranger output. \code{formatType = "anndata"} works for +current AnnData H5AD file specification (see Details). If there a customized +H5 file structure is presented, any of the \code{rawData}, +\code{indicesName}, \code{indptrName}, \code{genesName}, \code{barcodesName} +should be specified accordingly to override the \code{formatType} preset. +} +\details{ +For H5AD file written from an AnnData object, we allow using +\code{formatType = "anndata"} for the function to infer the proper structure. +However, while a typical AnnData-based analysis tends to in-place update the +\code{adata.X} attribute and there is no standard/forced convention for where +the raw count data, as needed from LIGER, is stored. Therefore, we expose +argument \code{anndataX} for specifying this information. The default value +\code{"X"} looks for \code{adata.X}. If the raw data is stored in a layer, +e.g. \code{adata.layers['count']}, then \code{anndataX = "layers/count"}. +If it is stored to \code{adata.raw.X}, then \code{anndataX = "raw/X"}. If +your AnnData object does not have the raw count retained, you will have to +go back to the Python work flow to have it inserted at desired object space +and re-write the H5AD file, or just go from upstream source files with which +the AnnData was originally created. +} +\examples{ +h5Path <- system.file("extdata/ctrl.h5", package = "rliger") +ld <- createH5LigerDataset(h5Path) } diff --git a/man/createLiger.Rd b/man/createLiger.Rd index 80a92882..da0f9979 100644 --- a/man/createLiger.Rd +++ b/man/createLiger.Rd @@ -11,6 +11,7 @@ createLiger( removeMissing = TRUE, addPrefix = "auto", formatType = "10X", + anndataX = "X", dataName = NULL, indicesName = NULL, indptrName = NULL, @@ -56,6 +57,9 @@ options are \code{"10X"} and \code{"AnnData"}. Can be either a single specification for all datasets or a character vector that match with each dataset.} +\item{anndataX}{The HDF5 path to the raw count data in an H5AD file. See +\code{\link{createH5LigerDataset}} Details. Default \code{"X"}.} + \item{dataName, indicesName, indptrName}{The path in a H5 file for the raw sparse matrix data. These three types of data stands for the \code{x}, \code{i}, and \code{p} slots of a \code{\link[Matrix]{dgCMatrix-class}} diff --git a/man/liger-class.Rd b/man/liger-class.Rd index ba7eff03..8c14288d 100644 --- a/man/liger-class.Rd +++ b/man/liger-class.Rd @@ -597,7 +597,9 @@ dataset(pbmcPlot, 2) ## Retrieving cell metadata, replacement methods available cellMeta(pbmcPlot) head(pbmcPlot[["nUMI"]]) -head(pbmcPlot$UMAP) + +## Retrieving dimemtion reduction matrix +head(dimRed(pbmcPlot, "UMAP")) ## Retrieving variable features, replacement methods available varFeatures(pbmcPlot) diff --git a/man/restoreH5Liger.Rd b/man/restoreH5Liger.Rd index 1d8112e7..62173c90 100644 --- a/man/restoreH5Liger.Rd +++ b/man/restoreH5Liger.Rd @@ -37,6 +37,6 @@ h5Path <- system.file("extdata/ctrl.h5", package = "rliger") lig <- createLiger(list(ctrl = h5Path)) # Now it is actually an invalid object! which is equivalent to what users # will get with `saveRDS(lig, "object.rds"); lig <- readRDS("object.rds")`` -lig <- closeAllH5(lig) +closeAllH5(lig) lig <- restoreH5Liger(lig) } diff --git a/man/writeH5.Rd b/man/writeH5.Rd new file mode 100644 index 00000000..53dfb58b --- /dev/null +++ b/man/writeH5.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/h5Utility.R +\name{writeH5} +\alias{writeH5} +\alias{writeH5.default} +\alias{writeH5.dgCMatrix} +\alias{writeH5.ligerDataset} +\alias{writeH5.liger} +\title{Write in-memory data into H5 file} +\usage{ +writeH5(x, file, ...) + +\method{writeH5}{default}(x, file, ...) + +\method{writeH5}{dgCMatrix}( + x, + file, + overwrite = FALSE, + indicesPath = "matrix/indices", + indptrPath = "matrix/indptr", + dataPath = "matrix/data", + shapePath = "matrix/shape", + barcodesPath = "matrix/barcodes", + featuresPath = "matrix/features/name", + ... +) + +\method{writeH5}{ligerDataset}(x, file, ...) + +\method{writeH5}{liger}(x, file, useDatasets, ...) +} +\arguments{ +\item{x}{An object with in-memory data to be written into H5 file.} + +\item{file}{A character string of the file path to be written.} + +\item{...}{Arguments passed to other S3 methods.} + +\item{overwrite}{Logical, whether to overwrite the file if it already exists. +Default \code{FALSE}.} + +\item{indicesPath, indptrPath, dataPath}{The paths inside the H5 file where +the \linkS4class{dgCMatrix} constructor \code{i}, \code{p}, and \code{x} will +be written to, respectively. Default using cellranger convention +\code{"matrix/indices"}, \code{"matrix/indptr"}, and \code{"matrix/data"}.} + +\item{shapePath}{The path inside the H5 file where the shape of the matrix +will be written to. Default \code{"matrix/shape"}.} + +\item{barcodesPath}{The path inside the H5 file where the barcodes/colnames +will be written to. Default \code{"matrix/barcodes"}. Skipped if the object +does not have colnames.} + +\item{featuresPath}{The path inside the H5 file where the features/rownames +will be written to. Default \code{"matrix/features/name"}. Skipped if the +object does not have rownames.} + +\item{useDatasets}{For liger method. Names or indices of datasets to be +written to H5 files. Required.} +} +\value{ +Nothing is returned. H5 file will be created on disk. +} +\description{ +This function writes in-memory data into H5 file by default in 10x cellranger +HDF5 output format. The main goal of this function is to allow users to +integrate large H5-based dataset, that cannot be fully loaded into memory, +with other data already loaded in memory using \code{\link{runOnlineINMF}}. +In this case, users can write the smaller in-memory data to H5 file instead +of loading subset of the large H5-based dataset into memory, where +information might be lost. + +Basing on the goal of the whole workflow, the data will always be written +in a CSC matrix format and colnames/rownames are always required. + +The default method coerces the input to a \linkS4class{dgCMatrix}. Methods +for other container classes tries to extract proper data and calls the +default method. +} +\examples{ +raw <- rawData(pbmc, "ctrl") +writeH5(raw, "ctrl.h5") +unlink("ctrl.h5") +} +\seealso{ +\href{https://www.10xgenomics.com/cn/support/software/cell-ranger/latest/analysis/outputs/cr-outputs-h5-matrices}{10X cellranger H5 matrix detail} +} diff --git a/tests/testthat/test_factorization.R b/tests/testthat/test_factorization.R index 2199d35d..b884ef96 100644 --- a/tests/testthat/test_factorization.R +++ b/tests/testthat/test_factorization.R @@ -60,7 +60,7 @@ test_that("iNMF - in-memory", { "Scaled data not available. ") pbmc <- scaleNotCenter(pbmc) expect_error(pbmc <- runINMF(pbmc, k = 5000), - "k must be <= m") + "Number of factors") pbmc <- runIntegration(pbmc, k = 10, nIteration = 2) expect_no_error(.checkValidFactorResult(pbmc)) diff --git a/tests/testthat/test_object.R b/tests/testthat/test_object.R index 207ffb55..397e47b6 100644 --- a/tests/testthat/test_object.R +++ b/tests/testthat/test_object.R @@ -82,7 +82,7 @@ test_that("liger object creation - in memory", { test_that("liger object creation - on disk", { withNewH5Copy( function(rawList) { - expect_error(createLiger(rawList, formatType = "Hello"), + expect_message(createLiger(rawList, formatType = "Hello"), "Specified `formatType`") # Customized paths @@ -358,7 +358,7 @@ test_that("H5 ligerDataset methods", { "Cannot replace slot with in-memory") expect_is(h5fileInfo(ctrl), "list") - expect_identical(h5fileInfo(ctrl, "formatType"), "10X") + expect_identical(h5fileInfo(ctrl, "formatType"), "10x") expect_identical(h5fileInfo(ctrl, c("indicesName", "indptrName")), list(indicesName = "matrix/indices", indptrName = "matrix/indptr")) diff --git a/vignettes/articles/import_data.Rmd b/vignettes/articles/import_data.Rmd new file mode 100644 index 00000000..f9465e9b --- /dev/null +++ b/vignettes/articles/import_data.Rmd @@ -0,0 +1,365 @@ +--- +title: "Import Data from Various Source" +author: "Yichen Wang" +date: "2024-03-16" +output: + html_document: + toc: 3 + toc_float: true +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) +``` + +This article only focuses on reading in data presented in various forms and provides detailed examples and explanations. Currently, we support importing the following forms of input data into a [liger](../reference/liger-class.html) object: + +- [Flat MTX files](#data-presented-with-mtx-files) containing sparse matrices, together with files containing barcodes and feature names. These are mostly the 10X cellranger output. +- [H5 files](#data-presented-with-h5-files) containing the sparse data, barcodes, and feature names. These can be found also in 10X cellranger output, as well as [H5AD files](#importing-from-h5ad-file-python-anndata-object) written from a Python AnnData object. +- Loaded R objects of the following classes: + - [dgCMatrix](#importing-r-object---dgcmatrix) + - [Seurat, preferably of V5 structure](#importing-r-object---seurat) + - [SingleCellExperiment](#importing-r-object---singlecellexperiment) + +Whichever type of data you want to bring to a LIGER analysis, we generally recommend that it represents the raw information, such as RNA raw counts, peak counts. LIGER performs optimally with its own normalization (library-size/L1 normalization only) and scaling (no centering) procedures, and other types of transformation may not be compatible with LIGER's assumptions. + +The data imported from flat MTX files will be read into memory, internally converted into dgCMatrix objects. Data from other R object classes will also be converted into dgCMatrix objects. Data stored in H5 files will not be read into memory, as they are employed for helping with large datasets that cannot be loaded into memory. The main liger object constructor function, `createLiger()`, requires a **named list** as its first argument, where each element is a dataset to be integrated. Each element of the named list can be of different types from described above and below. However, it is limited that we don't currently provide integration method that integrates H5 data with other types of data. That is, either you can integrate H5 data with H5 data, or you can integrate in-memory data with in-memory data. If integration under this situation is needed, we still provide some solution. + +If you need to import data presented in other forms but encounter problems coercing them to supported formats, please feel free to [open an Issue on our GitHub repository](https://github.com/welch-lab/liger/issues/new) to request for support. + +## Importing 10X Cellranger Output + +Given the prevalence of 10X Genomics' single-cell sequencing technology, the [cellranger output](https://www.10xgenomics.com/cn/support/software/cell-ranger/latest/analysis/outputs/cr-outputs-overview) has been the most commonly seen data source for real-world analyses. We provide the following examples showing how to read multi-sample datasets from cellranger output into a liger object. + +### Data presented with MTX files + +MTX files are proposed for efficiently storing sparse matrix data into flat text files. Cellranger output MEX format data that contains a MTX file together with a text file with barcodes and a text file with feature names. For example, the following is the structure of the cellranger output directory of one certain sample. + +``` +# shell commands in unix terminal +$ cd /home/jdoe/runs/sample345/outs +$ tree filtered_feature_bc_matrix +filtered_feature_bc_matrix + ├── barcodes.tsv.gz + ├── features.tsv.gz + └── matrix.mtx.gz +0 directories, 3 files +``` + +We provide a function `read10X()` for identifying proper files to use and loading them into memory. The function works for either just one sample or the root directory that conatins multiple samples. + +#### Reading a single sample + +To load a single sample: + +```{R importMTX_single, eval=FALSE} +sample1 <- read10X("path/to/filtered_feature_bc_matrix") +``` + +Depending on whether the cellranger version is before 3.0 or after, the returned object `sample1` varies in structure. For cellranger version 3.0 and later, the returned object is a named list where each element is for a sample, even if we opt to only read one sample. And this element would be another named list for available feature types. For example when doing scRNAseq, it will be `"Gene Expression"`. + +``` +> sample1 +$sampleName +$sampleName$`Gene Expression` + [... dgCMatrix] +``` + +For older cellranger, `sample1` will a named list, named by sample names, where each element is directly a dgCMatrix object. + +``` +> sample1 +$sampleName + [... dgCMatrix] +``` + +#### Reading multiple samples from root directory + +The root directory that `read10X()` expects is the directory where you can see subdirecories named by sample names. + +``` +# shell commands in unix terminal +$ cd /home/jdoe/ +$ tree runs +runs + ├── sample123 + │ └── outs + │ └── filtered_feature_bc_matrix + │ ├── barcodes.tsv.gz + │ ├── features.tsv.gz + │ └── matrix.mtx.gz + └── sample456 + └── outs + └── filtered_feature_bc_matrix + ├── barcodes.tsv.gz + ├── features.tsv.gz + └── matrix.mtx.gz +``` + +```{R importMTX_multi, eval=FALSE} +multiSample <- read10X("path/to/runs") +``` + +The returned object `multiSample` is a named list for each sample available in the `runs/` folder. Similar to the case of reading a single sample, the element for each sample will be a list for available feature types for cellranger>=3.0 or a dgCMatrix object for older cellranger. + +#### Creating a liger object from loaded data + +The required structure for the input of `createLiger()` is a named list of matrices. For cellranger>=3.0 case where the loaded data contains substructure for potentially multiple feature types, we need to extract the desired data and organize them a bit. The following example shows how to take only the gene expression out of the loaded data and organize them into the desired format to finally create a liger object. + +```{R createLiger, eval=FALSE} +multiSample <- read10X("path/to/runs") +multiSample +## $sample123 +## $sample123$`Gene Expression` +## [... dgCMatrix] +## $sample123$`Antibody Capture` +## [... dgCMatrix] +## $sample456 +## $sample456$`Gene Expression` +## [... dgCMatrix] +## $sample456$`Antibody Capture` +## [... dgCMatrix] +multiSample <- lapply(multiSample, `[[`, i = "Gene Expression") +multiSample +## $sample123 +## [... dgCMatrix] +## $sample456 +## [... dgCMatrix] +ligerObj <- createLiger(multiSample) +``` + +For convenience, if you only want to work on RNA data, you can directly load with `read10XRNA()`. + +```{R importMTX_RNA, eval=FALSE} +multiSample <- read10XRNA("path/to/runs") +multiSample +## $sample123 +## [... dgCMatrix] +## $sample456 +## [... dgCMatrix] +ligerObj <- createLiger(multiSample) +``` + +We also provide `read10XATAC()` for loading only the ATAC data from cellranger output. Note here that 10X provides different platforms that contain ATAC modality, namingly "cellranger atac" and "cellranger arc". Users need to specify argument `pipeline` for the function to correctly infer the file structure. + +### Data presented with H5 files + +Cellranger outputs identical information into H5 files, which contain the same sparse matrix, barcodes and feature names. H5 files are based on the HDF5 library, which is designed for storing and managing large and complex data. They can be found at the same folder where you see the `filtered_feature_bc_matrix/` folder, and the file would be named by `filtered_feature_bc_matrix.h5`. + +``` +# shell commands in unix terminal +$ cd /home/jdoe/runs/sample345/ +$ tree outs +outs + ├── filtered_feature_bc_matrix + │ ├── barcodes.tsv.gz + │ ├── features.tsv.gz + │ └── matrix.mtx.gz + └── filtered_feature_bc_matrix.h5 + ... +``` + +To create a liger object with H5 files, you simply need to provide the path to each H5 files in a named list. + +```{R importH5, eval=FALSE} +H5PathList <- list( + sample123 = "/home/jdoe/runs/sample123/outs/filtered_feature_bc_matrix.h5", + sample456 = "/home/jdoe/runs/sample456/outs/filtered_feature_bc_matrix.h5" +) +ligerObj <- createLiger(H5PathList) +``` + +## Importing R object - dgCMatrix + +"dgCMatrix", supported by package [Matrix](https://CRAN.R-project.org/package=Matrix), is the core class we adopt for holding the cell feature matrix in rliger. It represents CSC (Compressed Sparse Column) matrix, which is the most common form of sparse matrix in R given its efficiency in both memory usage and computation. + +We expect that users have each dgCMatrix object representing a single dataset to be integrated. The main object constructor function, `createLiger()`, expects datasets are organized in a **named list** and being passed to the first argument. + +```{importDGCMATRIX, eval=FALSE} +# NOT RUN, for demonstration only +datasetList <- list( + "name1" = dgCMatrix1, + "name2" = dgCMatrix2, + "name3" = dgCMatrix3 +) +ligerObj <- createLiger(datasetList) +``` + +If your matrix is a combination of multiple datasets to be integrated, we suggest using `as.liger()` with argument `datasetVar` supplied with a factor object as the labeling to indicate the proper splitting behavior. + +```{importDGCMATRIX2, eval=FALSE} +# NOT RUN, for demonstration only +datasetVar +## [1] sample1 sample1 sample1 sample1 ... +## Levels: sample1 sample2 sample3 +ligerObj <- as.liger(dgCMatrix, datasetVar = datasetVar) +``` + +## Importing R object - Seurat + +Seurat has been the most popular R package for single-cell analysis. We directly support [running rliger functions on a Seurat object](liger_with_seurat.html) without having to convert a Seurat object into a liger object. Please click on the link to see a detailed example, together with many use cases of interoperation between Seurat and rliger. + +## Importing R object - SingleCellExperiment + +[SingleCellExperiment](https://bioconductor.org/packages/SingleCellExperiment/) (SCE) has also been a popular class for single-cell analysis. Here's how we bring raw data from SCEs. + +### Multiple SCEs, each a dataset + +This is the most preferable case. Most of the times, people do QC and filterings to remove unexpressing genes and low quality cells. If a SCE is presented as a combined representation of QC'ed datasets, it's likely that unwanted zero entries would be introduced to keep the dimensionality consistent, and this would unnecessarily increase the memory usage and computation time. If you have multiple SCEs, each representing a dataset, we recommend to pass them as a named list to `createLiger()`. + +```{importSCE, eval=FALSE} +# NOT RUN, for demonstration only +sceList <- list( + "name1" = sce1, + "name2" = sce2, + "name3" = sce3 +) +ligerObj <- createLiger(sceList) +``` + +### A single SCE, including many datasets + +If it happens that your single SCE object includes multiple datasets that need to be integrated. We suggest using `as.liger()` to convert it into a liger object. In this case, the second argument `datasetVar` must be provided for inferring how to split the data. Assume that the example `sce` object has a column `sample` in its `colData`: + +```{importSCE2, eval=FALSE} +# NOT RUN, for demonstration only +ligerObj <- as.liger(sce, datasetVar = "sample") +``` + +## Importing from H5AD file, Python AnnData object + +>**WARNING**: We strongly suggest that you make a COPY of the H5AD file for rliger analyis. rliger load the HDF5 file with read-and-write mode and occasionally writes to the file, which might cause the file unable to be loaded back to Python AnnData object. This will be improved in the future. + +[AnnData](https://anndata.readthedocs.io/en/latest/) is a popular Python class for single-cell analysis. When storing the object, it is written to an H5AD file, which is also based on HDF5 library, and is totally supported to be loaded into a liger object. Note that the H5AD specification has been changed since anndata 0.8.0, and we only support the new format. If you have H5AD file written by an older version of anndata, you need to convert it to the new format using an upgraded version of Python anndata package. + +```{importH5AD, eval=FALSE} +# NOT RUN, for demonstration only +ligerObj <- createLiger( + list( + sample1 = "path/to/sample1.h5ad", + sample2 = "path/to/sample2.h5ad" + ), + formatType = "anndata" +) +``` + +A few limitations to note here are that: + +1. We by default read the data located at `adata.X`. However, due to the fact that people might be storing data at `adata.layers['key']` or `adata.raw.X`, and LIGER requires the raw data as the start point, we provide argument `anndataX` to allow setting the path to the data. See `createH5LigerDataset()` for more details. +2. LIGER expects that each H5AD file contains only one dataset to be integrated, because we are not loading the H5 based data into memory and hence don't yet have method to split the data by a given variable. This will be updated in the future. + +## Working with mixed types of data + +The `createLiger()` constructor function expects a named list as its first input, while the element in this list can be of various forms. + +### Mixing cellranger output H5 files with anndata H5AD files + +When importing a number of H5-based datasets that are of the same specification, one can tweak the additional arguments of the function to apply the same settings to all these datasets. However, if you want to integrate H5 datasets of different types (e.g. cellranger output and H5AD), you will need to load each dataset into ligerDataset object individually, and then call the `createLiger()` function with a list of these ligerDataset objects. + +```{importMixed, eval=FALSE} +# NOT RUN, for demonstration only +ligerDataset1 <- createH5LigerDataset("path/to/runs/sample/outs/filtered_feature_bc_matrix.h5", formatType = "10x") +ligerDataset2 <- createH5LigerDataset("path/to/sample2.h5ad", formatType = "anndata", anndataX = "raw/X") +ligerObj <- createLiger(list(ligerDataset1, ligerDataset2)) +``` + +### Mixing all kinds of R in-memory objects + +If each of your R objects contains a single dataset to be integrated, you can simply pass them to the named list. + +```{importMixed2, eval=FALSE} +# NOT RUN, for demonstration only +ligerObj <- createLiger( + list( + sample1 = dgCMatrix1, + seurat2 = seuratObj, + sce3 = sce, + ) +) +``` + +Function `as.ligerDataset` is called under the hook to convert each R object to a ligerDataset object, and it directly fetches the `'counts'` layer/slot of the default assay from a Seurat object, or the `'counts'` assay from an SCE object. If this is not the desired behavior, you can manually extract the correct matrix. + +```{importMixed3, eval=FALSE} +# NOT RUN, for demonstration only +ligerObj <- createLiger( + list( + sample1 = dgCMatrix1, + seurat2 = SeuratObject::LayerData(seuratObj, layer = "somelayer", assay = "someassay"), + sce3 = SummarizedExperiment::assay(sce, "someassay"), + ) +) +``` + +If the given Seurat or SCE objects contain multiple datasets, you can use `as.liger()` to convert them into a liger object, with `datasetVar` properly specified to have the datasets split. Then `c()` method can be applied to merge multiple liger objects into one, analogous to how one would concatenate R list objects. + +```{importMixed4, eval=FALSE} +# NOT RUN, for demonstration only +ligerObj1 <- as.liger(seuratObj, datasetVar = "sample") +ligerObj1 +## An object of class liger with XXXXX cells +## datasets(2): patient001 (XXXX cells), patient002 (XXXX cells) +## cellMeta(10): dataset, barcode, group, nUMI, ..., hemo +## varFeatures(0): +## dimReds(0): +ligerObj2 <- as.liger(sce, datasetVar = "batch") +ligerObj2 +## An object of class liger with XXXXX cells +## datasets(2): someid101 (XXXX cells), someid103 (XXXX cells) +## cellMeta(10): dataset, barcode, group, nUMI, ..., hemo +## varFeatures(0): +## dimReds(0): +ligerObj <- c(ligerObj1, ligerObj2) +# You can even insert more datasets with `dataset<-()` +dataset(ligerObj, "pbmc") <- dgCMatrix1 +ligerObj +## An object of class liger with XXXXX cells +## datasets(5): patient001 (XXXX cells), patient002 (XXXX cells), someid101 (XXXX cells), someid103 (XXXX cells), pbmc (XXXX cells) +## cellMeta(10): dataset, barcode, group, nUMI, ..., hemo +## varFeatures(0): +## dimReds(0): +``` + +### Mixing H5 files with in-memory objects + +Technically, with the hints provided above, you can create a liger object with both in-memory and H5-based datasets presented at the same time. However, the integration methods will not work in this situation. Generally, users will have to either write the in-memory part to H5 files and replace the datasets with the H5 copies accordingly, or read subsampled data from H5 files to memory and replace the datasets with the in-memory copies. Note that we assume that H5 files are employed for holding large datasets that cannot be fit into the memory and this is why we suggest subsampling them. If they are indeed okay to be loaded into memory, you can totally load them with `subsetLigerDataset(..., newH5 = FALSE)` and replace the datasets with the full in-memory copies. + +It is becoming more common that users have large datasets stored in H5 files and are expensive to be all loaded into memory. `runOnlineINMF()` runs highly efficient methods to integrate datasets presented in this form. If you want to bring a rather smaller in-memory dataset to the pool and have it integrated together, you can write the dataset to an H5 file and then create a liger object with it. In this way, the integration will have the maximum information captured from all datasets. + +```{writeH5, eval=FALSE} +# NOT RUN, for demonstration only +class(smallData) +## [1] "dgCMatrix" +write(smallData, "path/to/smallData.h5") + +largeLiger +## An object of class liger with XXXXXX cells +## datasets(n): sample1 (XXXXX cells), sample2 (XXXXX cells), ... +## cellMeta(10): dataset, barcode, group, nUMI, ..., hemo +## varFeatures(6000): AAA1, AAA2, BBB1, BBB2, ... +## dimReds(0): +smallDataset <- createH5LigerDataset("path/to/smallData.h5") +dataset(largeLiger, "sampleSmall") <- smallDataset + +largeLiger +## An object of class liger with XXXXXX cells +## datasets(n): sample1 (XXXXX cells), sample2 (XXXXX cells), ..., sampleSmall (XXXX cells) +## cellMeta(10): dataset, barcode, group, nUMI, ..., hemo +## varFeatures(6000): AAA1, AAA2, BBB1, BBB2, ... +## dimReds(0): +``` + +However, after integration, downstream analyses that rely on gene/feature expression values (e.g. differential expression analysis) would still need in-memory form of the data. We provide `downsample()` to sub-sample data from H5 files and read the subset to memory, with fraction of dataset source and/or cell type balanced. This might be improved in the future. + +The following example would randomly sample 4,000 cells from each dataset in `largeLiger` and load only the normalized data into memory. The `newH5` argument is set to `FALSE` to load the data into memory, and the `useSlot` argument is set to `"normData"` to load only the normalized data. The `maxCells` argument is set to `4000` to sample 4,000 cells from each dataset. The resulting `downsampledData` is a liger object with the same structure as `largeLiger`, but with only the downsampled normalized in-memory data. + +```{downsampleH5, eval=FALSE} +# NOT RUN, for demonstration only +downsampledData <- downsample(largeLiger, newH5 = FALSE, maxCells = 4000, useSlot = "normData") +downsampledData +## An object of class liger with XXXXXX cells +## datasets(n): sample1 (4000 cells), sample2 (4000 cells), ..., sampleSmall (4000 cells) +## cellMeta(10): dataset, barcode, group, nUMI, ..., leiden_cluster +## varFeatures(6000): AAA1, AAA2, BBB1, BBB2, ... +## dimReds(1): UMAP +```