diff --git a/R/ATAC.R b/R/ATAC.R index 89bebb0d..41da2a09 100644 --- a/R/ATAC.R +++ b/R/ATAC.R @@ -356,6 +356,7 @@ linkGenesAndPeaks <- function( #' @return No return value. A file located at \code{outputPath} will be created. #' @export #' @examples +#' \donttest{ #' bmmc <- normalize(bmmc) #' bmmc <- selectGenes(bmmc) #' bmmc <- scaleNotCenter(bmmc) @@ -376,6 +377,7 @@ linkGenesAndPeaks <- function( #' ) #' head(read.table(resultPath, skip = 1)) #' } +#' } exportInteractTrack <- function( corrMat, pathToCoords, diff --git a/R/ggplotting.R b/R/ggplotting.R index aae9cc3d..180b01a7 100644 --- a/R/ggplotting.R +++ b/R/ggplotting.R @@ -588,8 +588,9 @@ plotCellViolin <- function( #' @param colorLabels,colorValues Each a vector with as many values as the #' number of categories for the categorical coloring aesthetics. Labels will be #' the shown text and values will be the color code. These are passed to -#' \code{\link[ggplot2]{scale_color_manual}}. Default uses \code{scPalette} and -#' plot original labels (levels of the factor). +#' \code{\link[ggplot2]{scale_color_manual}}. Default uses an internal selected +#' palette if there are <= 26 colors needed, or ggplot hues otherwise, and plot +#' original labels (levels of the factor). #' @param legendNRow,legendNCol Integer, when too many categories in one #' variable, arranges number of rows or columns. Default \code{NULL}, #' automatically split to \code{ceiling(levels(variable)/10)} columns. @@ -641,7 +642,7 @@ plotCellViolin <- function( legendNCol = NULL, # Coloring colorLabels = NULL, - colorValues = scPalette, + colorValues = NULL, colorPalette = "magma", colorDirection = -1, naColor = "#DEDEDE", @@ -741,9 +742,13 @@ plotCellViolin <- function( colorLabels <- levels(plot$data[[varName]]) } if (is.null(colorValues)) { - colorValues <- scales::hue_pal()( - length(levels(plot$data[[varName]])) - ) + if (nlevels(plot$data[[varName]]) <= length(scPalette)) + colorValues <- scPalette[seq_len(nlevels(plot$data[[varName]]))] + else { + colorValues <- scales::hue_pal()( + length(levels(plot$data[[varName]])) + ) + } } if (a %in% c("colour", "fill")) { plot <- plot + diff --git a/R/liger-methods.R b/R/liger-methods.R index ece44f81..ff2bb9b5 100644 --- a/R/liger-methods.R +++ b/R/liger-methods.R @@ -595,14 +595,14 @@ setReplaceMethod( # No name matching, require exact length value <- .checkArgLen(value, n = length(cellIdx), class = c("vector", "factor")) } else { - if (!all(barcodes %in% names(value))) { + if (!all(endsWith(barcodes, names(value)))) { cli::cli_abort( - c("{.code names(value)} do not contain all cells selected. ", - "These are not involved: ", - "{.val {barcodes[!barcodes %in% names(value)]}}") + c("x" = "{.code names(value)} do match to cells selected. ", + "i" = "The first three given names: {.val {names(value)[1:3]}}", + "i" = "The first three selected names: {.val {barocdes[1:3]}}") ) } - value <- value[barcodes] + names(value) <- barcodes } } else { # matrix like @@ -873,9 +873,9 @@ setReplaceMethod( function(x, dataset = NULL, check = TRUE, value) { dataset <- .checkUseDatasets(x, dataset) if (length(dataset) != 1) cli::cli_abort("Need to specify one dataset to insert.") - if (isH5Liger(x, dataset)) - cli::cli_abort("Cannot replace slot with in-memory data for H5 based object.") - x@datasets[[dataset]]@scaleUnsharedData <- value + ld <- dataset(x, dataset) + scaleUnsharedData(ld) <- value + x@datasets[[dataset]] <- ld if (isTRUE(check)) methods::validObject(x) x } @@ -889,9 +889,9 @@ setReplaceMethod( function(x, dataset = NULL, check = TRUE, value) { dataset <- .checkUseDatasets(x, dataset) if (length(dataset) != 1) cli::cli_abort("Need to specify one dataset to insert.") - if (!isH5Liger(x, dataset)) - cli::cli_abort("Cannot replace slot with on-disk data for in-memory object.") - x@datasets[[dataset]]@scaleUnsharedData <- value + ld <- dataset(x, dataset) + scaleUnsharedData(ld) <- value + x@datasets[[dataset]] <- ld if (isTRUE(check)) methods::validObject(x) x } @@ -907,7 +907,9 @@ setReplaceMethod( if (length(dataset) != 1) cli::cli_abort("Need to specify one dataset to insert.") if (!isH5Liger(x, dataset)) cli::cli_abort("Cannot replace slot with on-disk data for in-memory object.") - x@datasets[[dataset]]@scaleUnsharedData <- value + ld <- dataset(x, dataset) + scaleUnsharedData(ld) <- value + x@datasets[[dataset]] <- ld if (isTRUE(check)) methods::validObject(x) x } @@ -1140,7 +1142,7 @@ setReplaceMethod( signature(x = "liger", value = "list"), function(x, value) { x@dimReds <- value - validObject(x) + methods::validObject(x) return(x) } ) diff --git a/R/ligerDataset-methods.R b/R/ligerDataset-methods.R index 7ee9d8c2..866e8684 100644 --- a/R/ligerDataset-methods.R +++ b/R/ligerDataset-methods.R @@ -408,6 +408,15 @@ setReplaceMethod( function(x, check = TRUE, value) { if (isH5Liger(x)) cli::cli_abort("Cannot replace slot with in-memory data for H5 based object.") + if (!is.null(value)) { + bc <- x@colnames + if (!all(endsWith(bc, colnames(value)))) { + cli::cli_abort( + "Column names of {.var value} do not match to those of the object." + ) + } + colnames(value) <- bc + } x@scaleUnsharedData <- value if (isTRUE(check)) methods::validObject(x) x diff --git a/R/subsetObject.R b/R/subsetObject.R index 953cbc6a..cff80f1d 100644 --- a/R/subsetObject.R +++ b/R/subsetObject.R @@ -320,9 +320,6 @@ subsetH5LigerDataset <- function( path <- dirname(oldFN) filename <- tempfile(pattern = paste0(bn, ".subset_"), fileext = ".h5", tmpdir = path) - # filename <- paste0(oldFN, ".subset_", - # format(Sys.time(), "%y%m%d_%H%M%S"), - # ".h5") } else if (is.null(filename) && !is.null(filenameSuffix)) { oldFN <- h5fileInfo(object, "filename") filename <- paste0(oldFN, ".", filenameSuffix, ".h5") @@ -374,6 +371,8 @@ subsetH5LigerDatasetToMem <- function( } modal <- modalOf(object) cellIdx <- .idxCheck(object, cellIdx, "cell") + if (is.null(featureIdx)) noFeatureIdx <- TRUE + else noFeatureIdx <- FALSE featureIdx <- .idxCheck(object, featureIdx, "feature") # Having `useSlot` as a record of user specification, to know whether it's # a `NULL` but involve everything, or just a few slots. @@ -418,9 +417,16 @@ subsetH5LigerDatasetToMem <- function( # Process scaled data #### secondIdx <- NULL if (!is.null(scaleData(object))) { - # See comments in h5ToH5 for what the following two mean + # scaledFeatureIdx indicates where each feature in scaled data is from + # the dataset. scaledFeatureIdx <- scaleData(object)[["featureIdx"]][] - secondIdx <- as.numeric(stats::na.omit(match(featureIdx, scaledFeatureIdx))) + # `secondIdx` is used to subscribe scaleData and V, so that result + # after subset match with the order requested by `featureIdx` + if (noFeatureIdx) { + secondIdx <- seq_along(scaledFeatureIdx) + } else { + secondIdx <- as.numeric(stats::na.omit(match(featureIdx, scaledFeatureIdx))) + } } if ("scaleData" %in% slotInvolved & !is.null(scaleData(object))) { if (isTRUE(verbose)) @@ -538,6 +544,8 @@ subsetH5LigerDatasetToH5 <- function( } modal <- modalOf(object) cellIdx <- .idxCheck(object, cellIdx, "cell") + if (is.null(featureIdx)) noFeatureIdx <- TRUE + else noFeatureIdx <- FALSE featureIdx <- .idxCheck(object, featureIdx, "feature") useSlot <- .checkLDSlot(object, useSlot) @@ -672,26 +680,17 @@ subsetH5LigerDatasetToH5 <- function( } # Process Scaled Data #### secondIdx <- NULL - # if (getH5File(object)$exists("scaleData.featureIdx")) { - # scaledFeatureIdx <- getH5File(object)[["scaleData.featureIdx"]][] - # } else if ("selected" %in% names(featureMeta(object))) { - # scaledFeatureIdx <- which(featureMeta(object)$selected) - # } else if (!is.null(object@V)) { - # scaleFeatureIdx <- rownames(object@V) %in% rownames(object)[featureIdx] - # } else { - # if ("scaleData" %in% useSlot & !is.null(scaleData(object))) { - # warning("Unable to know what features are included scaled data. ", - # "Skipped.") - # } - # } if (!is.null(scaleData(object))) { - # This asserts that - # identical(rownames(object)[scaledFeatureIdx], rownames(scaleData)) + # scaledFeatureIdx indicates where each feature in scaled data is from + # the dataset. scaledFeatureIdx <- scaleData(object)[["featureIdx"]][] - # This asserts that - # rownames(scaleData)[secondIdx] returns a subset that follows the order - # specified by featureIdx - secondIdx <- as.numeric(stats::na.omit(match(featureIdx, scaledFeatureIdx))) + # `secondIdx` is used to subscribe scaleData and V, so that result + # after subset match with the order requested by `featureIdx` + if (noFeatureIdx) { + secondIdx <- seq_along(scaledFeatureIdx) + } else { + secondIdx <- as.numeric(stats::na.omit(match(featureIdx, scaledFeatureIdx))) + } } if ("scaleData" %in% useSlot & !is.null(scaleData(object))) { scaledFeatureIdxNew <- which(featureIdx %in% scaledFeatureIdx) diff --git a/_pkgdown.yml b/_pkgdown.yml index ab43faf9..8e13ffa8 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,6 +1,15 @@ template: - params: - bootswatch: flatly + bootstrap: 5 + theme: flatly + bslib: + bg: "#202123" + fg: "#B8BCC2" + primary: "#306cc9" + border-radius: 0.5rem + btn-border-radius: 0.25rem + includes: + in_header: + reference: - title: "Read Data" diff --git a/docs/articles/img/liger_class_structure.png b/docs/articles/img/liger_class_structure.png index 290fc225..99768487 100644 Binary files a/docs/articles/img/liger_class_structure.png and b/docs/articles/img/liger_class_structure.png differ diff --git a/man/dot-ggplotLigerTheme.Rd b/man/dot-ggplotLigerTheme.Rd index a2d2c5be..533dfabb 100644 --- a/man/dot-ggplotLigerTheme.Rd +++ b/man/dot-ggplotLigerTheme.Rd @@ -32,7 +32,7 @@ legendNRow = NULL, legendNCol = NULL, colorLabels = NULL, - colorValues = scPalette, + colorValues = NULL, colorPalette = "magma", colorDirection = -1, naColor = "#DEDEDE", @@ -88,8 +88,9 @@ automatically split to \code{ceiling(levels(variable)/10)} columns.} \item{colorLabels, colorValues}{Each a vector with as many values as the number of categories for the categorical coloring aesthetics. Labels will be the shown text and values will be the color code. These are passed to -\code{\link[ggplot2]{scale_color_manual}}. Default uses \code{scPalette} and -plot original labels (levels of the factor).} +\code{\link[ggplot2]{scale_color_manual}}. Default uses an internal selected +palette if there are <= 26 colors needed, or ggplot hues otherwise, and plot +original labels (levels of the factor).} \item{colorPalette}{For continuous coloring, an index or a palette name to select from available options from ggplot diff --git a/man/exportInteractTrack.Rd b/man/exportInteractTrack.Rd index 247a648b..01fc88f7 100644 --- a/man/exportInteractTrack.Rd +++ b/man/exportInteractTrack.Rd @@ -34,6 +34,7 @@ which is compatible with \href{https://genome.ucsc.edu/cgi-bin/hgCustom}{UCSC Genome Browser}. } \examples{ +\donttest{ bmmc <- normalize(bmmc) bmmc <- selectGenes(bmmc) bmmc <- scaleNotCenter(bmmc) @@ -55,3 +56,4 @@ if (requireNamespace("RcppPlanc", quietly = TRUE)) { head(read.table(resultPath, skip = 1)) } } +} diff --git a/tests/testthat/test_object.R b/tests/testthat/test_object.R index 7bba33b5..207ffb55 100644 --- a/tests/testthat/test_object.R +++ b/tests/testthat/test_object.R @@ -424,7 +424,7 @@ test_that("as.liger methods", { expect_message(lig <- as.liger(seu)) expect_true(all.equal(sapply(datasets(lig), ncol), c(a = 150, b = 150))) - expect_in(paste0("pca.", 1:10), colnames(cellMeta(lig, as.data.frame = TRUE))) + expect_in("pca", names(dimReds(lig))) } }) diff --git a/vignettes/articles/Integrating_multi_scRNA_data.rmd b/vignettes/articles/Integrating_multi_scRNA_data.rmd index 7c8452c3..58393aec 100644 --- a/vignettes/articles/Integrating_multi_scRNA_data.rmd +++ b/vignettes/articles/Integrating_multi_scRNA_data.rmd @@ -10,7 +10,7 @@ output: --- ```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE) +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) ``` This guide will demonstrate the usage of the Liger package in the style of the R Console, which can be accessed through an R development environment (e.g., RStudio) or directly from the R command line. diff --git a/vignettes/articles/Integrating_scRNA_and_scATAC_data.Rmd b/vignettes/articles/Integrating_scRNA_and_scATAC_data.Rmd index 57921ce3..90809205 100644 --- a/vignettes/articles/Integrating_scRNA_and_scATAC_data.Rmd +++ b/vignettes/articles/Integrating_scRNA_and_scATAC_data.Rmd @@ -9,7 +9,7 @@ output: --- ```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE) +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) ``` In this tutorial, we will demonstrate LIGER’s ability to jointly define cell types by leveraging multiple single-cell modalities. For example, the integration of single-cell RNA-seq and single-cell ATAC-seq enables cell type definitions that incorporate both gene expression and chromatin accessibility data. Such joint analysis allows not only the taxonomic categorization of cell types, but also a deeper understanding of their underlying regulatory networks. The pipeline for jointly analyzing scRNA-seq and scATAC-seq is similar to that for integrating multiple scRNA-seq datasets in that both rely on joint matrix factorization and quantile normalization. The main differences are: (1) scATAC-seq data needs to be processed into gene-level values; (2) gene selection is performed on the scRNA-seq data only; and (3) downstream analyses can use both gene-level and intergenic information. diff --git a/vignettes/articles/SNAREseq_walkthrough.Rmd b/vignettes/articles/SNAREseq_walkthrough.Rmd index 9bb1ca40..ce58c806 100644 --- a/vignettes/articles/SNAREseq_walkthrough.Rmd +++ b/vignettes/articles/SNAREseq_walkthrough.Rmd @@ -10,7 +10,7 @@ output: html_document knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) ``` -Here we integrate the scATAC and scRNA reads from the dual-omics dataset SNARE-seq as an illustration of how UINMF can be used to integrate cross-modality data. For this tutorial, we will use three matrices, which can all be downloaded at https://www.dropbox.com/sh/y9kjoum8u469nj1/AADik2b2-Qo3os2QSWXdIAbna?dl=0 +Here we integrate the scATAC and scRNA reads from the dual-omics dataset SNARE-seq as an illustration of how UINMF can be used to integrate cross-modality data. For this tutorial, we will use three matrices, which can all be downloaded from our [Dropbox folder](https://www.dropbox.com/scl/fo/ea666n049h762lla8c5iz/h?rlkey=796slf2ybnv2kmcxxbwluvv2b&dl=0) - The transcriptomic measures `SNAREseq_RNA.RDS` is the SNARE-seq scRNA dataset (31,367 genes by 10,309 cells). - For the shared epigenomic features `SNARE_seq_shared_chromatin_features.RDS`, we create a gene-centric matrix, such that we sum of the number of accessibiltiy peaks that occur over the gene body and promoter regions for each gene. For a detailed walkthough of how to generate such a matrix, please see our [Integrating scRNA and scATAC data vignette](Integrating_scRNA_and_scATAC_data.html). The resulting matrix of gene-centric chromatin accessibility is 22,379 genes by 10,309 cells @@ -24,20 +24,19 @@ library(rliger) rna <- readRDS("SNAREseq_data/SNAREseq_RNA.RDS") atac_shared <- readRDS("SNAREseq_data/SNAREseq_chromatin_accessibility_shared.RDS") atac_unshared <- readRDS("SNAREseq_data/SNARE_seq_unshared_chromatin_features.RDS") -atac_unshared <- as(atac_unshared, "CsparseMatrix") ``` ## Step 2: Preprocessing Integrative non-negative matrix factorization with unshared features (UINMF) performs factorization using shared feature matrix of all datasets and includes unshared features from dataset(s) with such information. In this tutorial, we plan to use the variable feature selected from the scRNA dataset. Since we prepared the gene-centric matrix from the scATAC dataset, these genes can be accounted as the shared features. For the unshared features, normally we select variable genes that are out of the intersection of gene sets from all datasets. In this tutorial, we directly use the peaks that are identified variable. Therefore, special processing will be needed compared to the other UINMF tutorials. -### 1. Create liger object for shared genes +### 2.1: Create liger object for shared genes ```{r create} lig <- createLiger(list(rna = rna, atac = atac_shared)) ``` -### 2. Normalize and select shared variable features +### 2.2: Normalize and select shared variable features ```{r norm} lig <- normalize(lig) %>% @@ -45,7 +44,7 @@ lig <- normalize(lig) %>% scaleNotCenter() ``` -## 3. Selecting the unshared features +## Step 3: Selecting the unshared features When selecting unshared features for the UINMF integration, it is critical to consider the type of data you are working with. For unshared features that gene-centric, the user should follow the feature selection process outlined in the ['Integrating unshared features with UINMF' tutorial](UINMF_vignette.html). @@ -77,12 +76,10 @@ Add the unshared features that have been properly selected, such that they are a ```{r addunshared} varUnsharedFeatures(lig, "atac") <- top2000 -scaleUnsharedData(lig, "atac", check = FALSE) <- unshareScaled +scaleUnsharedData(lig, "atac") <- unshareScaled ``` ->The latest version of rliger package is equipped with strict checkpoints on object validity, including the matching of features and cells between matrices belonging to the same dataset. Given that the special processing method we are introducing in this vignette, that we use the intergenic bins as unshared features, we have to turn the checks off with `check = FALSE` when inserting the scaled unshared features into the object. - -## 4. Joint Matrix Factorization +## Step 4: Joint Matrix Factorization To factorize the datasets and include the unshared datasets, set `useUnshared = TRUE`. @@ -90,7 +87,7 @@ To factorize the datasets and include the unshared datasets, set `useUnshared = lig <- runUINMF(lig, k = 30, nIteration = 30) ``` -## 5. Quantile Normalization and Joint Clustering +## Step 5: Quantile Normalization and Joint Clustering After factorization, the resulting Liger object can be used in all downstream LIGER functions without adjustment. The default reference dataset for quantile normalization is the larger dataset, but the user should select the higher quality dataset as the reference dataset, even if it is the smaller dataset. @@ -100,7 +97,7 @@ lig <- quantileNorm(lig) lig <- runCluster(lig, resolution = 0.8, nNeighbors = 30) ``` -## 6. Visualizations and Downstream processing +## Step 6: Visualizations and Downstream processing ```{r runumap} lig <- runUMAP(lig, nNeighbors = 30) diff --git a/vignettes/articles/STARmap_dropviz_vig.Rmd b/vignettes/articles/STARmap_dropviz_vig.Rmd index bfb70038..921123f8 100644 --- a/vignettes/articles/STARmap_dropviz_vig.Rmd +++ b/vignettes/articles/STARmap_dropviz_vig.Rmd @@ -12,17 +12,15 @@ options(rgl.useNULL = TRUE) knitr::knit_hooks$set(webgl = hook_webgl) ``` -Here we demonstrate integrating a scRNA (Dropviz) dataset of the mouse frontal cortex and a spatial transcriptomics (STARmap) dataset of the same region. For this tutorial, we prepared the two datasets at https://www.dropbox.com/sh/y9kjoum8u469nj1/AADik2b2-Qo3os2QSWXdIAbna?dl=0 +Here we demonstrate integrating a scRNA (Dropviz) dataset of the mouse frontal cortex and a spatial transcriptomics (STARmap) dataset of the same region. For this tutorial, we prepared the two datasets at a [Dropbox folder](https://www.dropbox.com/sh/y9kjoum8u469nj1/AADik2b2-Qo3os2QSWXdIAbna?dl=0) -- The scRNA mouse dataset (Dropviz_starmap_vig.RDS) is 28,366 genes by 71,639 cells. -- The spatial transcriptomic STARmap dataset (STARmap_vig.RDS) is 28 genes by 31,294 cells. -- The annotation data of the Dropviz mouse data (Dropviz_general_annotations.RDS). -- The 3D coordinate data of the STARmap dataset (STARmap_3D_Annotations.RDS) +- The scRNA mouse dataset `Dropviz_starmap_vig.RDS` is 28,366 genes by 71,639 cells. +- The spatial transcriptomic STARmap dataset `STARmap_vig.RDS` is 28 genes by 31,294 cells. +- The annotation data of the Dropviz mouse data `Dropviz_general_annotations.RDS`. +- The 3D coordinate data of the STARmap dataset `STARmap_3D_Annotations.RDS` ## Step 1: Load the data -### 1. Load data and create liger object - The datasets provided are presented in "dgCMatrix" class, which is a common form of sparse matrix in R. We can then create a liger object with a named list of the matrices. The unshared features should not be subsetted out, or submitted separately. Rather, they should be included in the matrix submitted for that dataset. This helps ensure proper normalization. Different from other UINMF tutorials, we set the modality of each dataset with argument `modal`, in this way, a slot for spatial coordinates is preserved for the STARmap dataset. ```{r load} @@ -37,7 +35,7 @@ lig <- createLiger(list(starmap = starmap, dropviz = dropviz), ## Step 2: Preprocessing -### 2. Normalization +### 2.1 Normalization Liger simply normalizes the matrices by library size, without multiplying a scale factor or applying "log1p" transformation. @@ -45,7 +43,7 @@ Liger simply normalizes the matrices by library size, without multiplying a scal lig <- normalize(lig) ``` -### 3. Select variable genes +### 2.2 Select variable genes Given that the STARmap dataset has only 28 genes available and these are also presented in the Dropviz dataset, we directly set all of them as shared variable genes. Please use `varFeatures<-()` method for accessing the shared variable genes. For the Dropviz dataset, we further select variable genes from the unshared part of it. To enable selection of unshared genes, users need to specify the name(s) of dataset(s) where unshared genes should be chosen from to `useUnsharedDatasets`. @@ -54,7 +52,7 @@ lig <- selectGenes(lig, thresh = 0.3, useUnsharedDatasets = "dropviz", unsharedT varFeatures(lig) <- rownames(dataset(lig, "starmap")) ``` -### 4. Scale not center +### 2.3 Scale not center Then we scale the dataset. Three new matrices will be created under the hook, two containing the 28 shared variable genes for both datasets, and one for the unshared variable genes of the Dropviz data. Note that LIGER does not center the data before scaling because iNMF/UINMF methods require non-negative input. @@ -64,8 +62,6 @@ lig <- scaleNotCenter(lig) ## Step 3: Joint Matrix Factorization -### 5. Mosaic integrative NMF - Unshared Integrative Non-negative Matrix Factorization (UINMF) can be applied with `runIntegration(..., method = "UINMF")`. A standalone function `runUINMF()` is also provided with more detailed documentation and initialization setup. This step produces factor gene loading matrices of the shared genes: $W$ for shared information across datasets, and $V$ for dataset specific information. Specific to UINMF method, additional factor gene loading matrix of unshared features, $U$ is also produced. $H$ matrices, the cell factor loading matrices are produced for each dataset and can be interpreted as low-rank representation of the cells. In this tutorial, we set dataset specific lambda (regularization parameter) values to penalize the dataset specific effect differently. @@ -75,9 +71,10 @@ Another noteworthy advantage of UINMF is that we are able to use a larger number ```{r factorization} lig <- runUINMF(lig, k = 40, lambda = c(10, 1)) ``` + ## Step 4: Quantile Normalization and Joint Clustering -### 6. Quantile normalization +### 4.1 Quantile normalization The default reference dataset for quantile normalization is the larger dataset, but users should select the higher quality dataset as the reference dataset, even if it is the smaller dataset. In this case, the Dropviz dataset is considered higher quality than the STARmap dataset, so we set the Dropviz dataset to be the reference dataset. @@ -87,7 +84,7 @@ After this step, the low-rank cell factor loading matrices, $H$, are aligned and lig <- quantileNorm(lig, reference = "dropviz") ``` -### 7. Leiden clustering +### 4.2 Leiden clustering With the aligned cell factor loading information, we next apply Leiden community detection algorithm to identify cell clusters. @@ -97,7 +94,7 @@ lig <- runCluster(lig) ## Step 5: Visualizations and Downstream processing -### 8. Dimensionality reduction +### 5.1 Dimensionality reduction We create a UMAP with the quantile normalized cell factor loading. @@ -105,7 +102,7 @@ We create a UMAP with the quantile normalized cell factor loading. lig <- runUMAP(lig) ``` -### 9. Plot UMAP +### 5.2 Plot UMAP Next, we can visualize our factorized object by dataset to check the integration between datasets, and also the cluster labeling on the global population. @@ -135,7 +132,7 @@ starmap_annies <- readRDS("starmap_data/STARmap_3D_Annotations.RDS") dplyr::glimpse(starmap_annies) ``` -### 10. Manage ligerSpatialDataset class +### 6.1 Manage ligerSpatialDataset class (Optional) Recall that we set the "modal" of the STARmap dataset to "spatial" when we created the liger object, here we can insert the coordinate information into the object. @@ -147,7 +144,7 @@ coordinate(lig, "starmap") <- coords LIGER does not yet have any analysis method that incorporate the spatial coordinate information. Given that most of the spatial transcriptomics technologies only support 2D spatial information, LIGER only has 2D plotting (See `plotSpatial2D()`) method implemented for the spatial coordinate at this point. -### 11. 3D visualization +### 6.2 3D visualization The STARmap technology we work with in this tutorial is special for 3D information. LIGER does not provide any function for the visualization of 3D coordinates, but here we provide a simple guide on it using package [*rgl*](https://CRAN.R-project.org/package=rgl). diff --git a/vignettes/articles/UINMF_vignette.Rmd b/vignettes/articles/UINMF_vignette.Rmd index eb1d0654..fb6a4f91 100644 --- a/vignettes/articles/UINMF_vignette.Rmd +++ b/vignettes/articles/UINMF_vignette.Rmd @@ -16,12 +16,14 @@ Unshared integrative non-negative matrix factorization (UINMF) [[Kriebel and Wel 3. UINMF can utilize all of the information present in single-cell multimodal datasets when integrating with single-modality datasets. 4. UINMF can accommodate non-orthologous genes in cross-species integration analyses. -For this tutorial, we prepared example osmFISH dataset (33 genes by 5,219 cells) from the mouse somatosensory cortex, and a downsampled single-cell RNA-seq dataset (29,463 genes by 15,000 cells) from the frontal cortex. The datasets can be downloaded at https://www.dropbox.com/sh/y9kjoum8u469nj1/AADik2b2-Qo3os2QSWXdIAbna?dl=0 +For this tutorial, we prepared example input data which can be found in the [Dropbox folder](https://www.dropbox.com/scl/fo/npj3x6ygp5bkl702gycnu/h?rlkey=llaana00wg81tgzp3oodhhe7p&dl=0) +- osmFISH dataset `OSMFISH.vin.RDS`, 33 genes by 5,219 cells, in dense matrix form, rom the mouse somatosensory cortex +- A downsampled single-cell RNA-seq dataset `DROPVIZ.vin.RDS`, 29,463 genes by 15,000 cells, from the frontal cortex. ## Step 1: Preprocessing and Normalization -### 1. Loading the data +### 1.1 Loading the data The prepared RDS files can be directly loaded into the R environment. They are presented in dense matrix (R default) form. We recommend converting them to sparse matrix form ("dgCMatrix" class supported by "Matrix" package) in order to reduce memory usage and speed up downstream analysis. @@ -36,11 +38,10 @@ options(ligerDotSize = 0.5) osmFISH <- readRDS("uinmf_vignette_data/OSMFISH.vin.RDS") osmFISH <- as(osmFISH, "CsparseMatrix") rna <- readRDS("uinmf_vignette_data/DROPVIZ.vin.RDS") -rna <- as(rna, "CsparseMatrix") lig <- createLiger(list(osmFISH = osmFISH, rna = rna)) ``` -### 2. Preprocess the datasets +### 1.2 Preprocess the datasets The normalization is applied to the datasets in their entirety. @@ -67,16 +68,15 @@ The scaled matrix for shared features can be accessed with `scaleData()`. The sc ## Step 2: Joint Matrix Factorization -### 3. Run UINMF - We provide function `runIntegration()` for a general wrapper of all integration method we developed. Users can do `runIntegration(..., method = "UINMF")` to perform the analysis. Alternatively, stand-alone function `runUINMF()` is also provided with more specific parameter setitng guide. ```{r factorization} lig <- runIntegration(lig, k = 30, method = "UINMF") ``` + ## Step 3: Quantile Normalization and Joint Clustering -### 4. Quantile normalize the factor loading +### 3.1 Quantile normalize the factor loading After factorization, the resulting liger object can be used in all downstream LIGER functions without adjustment. The default reference dataset for quantile normalization is the larger dataset, but the user should select the higher quality dataset as the reference dataset, even if it is the smaller dataset. @@ -84,7 +84,7 @@ After factorization, the resulting liger object can be used in all downstream LI lig <- quantileNorm(lig, reference = "rna") ``` -### 5. Leiden clustering +### 3.2 Leiden clustering With the quantile normalized cell factor loading, we can then apply Leiden graph-based community detection method on it. @@ -94,13 +94,13 @@ lig <- runCluster(lig) ## Step 4: Visualizations and Downstream processing -### 6. Create dimension reduction embedding +### 4.1 Create dimension reduction embedding ```{r runumap} lig <- runUMAP(lig) ``` -### 7. Visualize +### 4.2 Visualize Next, we can visualize our returned factorized object by dataset to check the alignment between datasets, as well as by cluster determined in the factorization. diff --git a/vignettes/articles/installation.Rmd b/vignettes/articles/installation.Rmd index 40ef7bc6..29255a5e 100644 --- a/vignettes/articles/installation.Rmd +++ b/vignettes/articles/installation.Rmd @@ -33,7 +33,7 @@ devtools::install_github("welch-lab/liger@newObj") If you encounter issues when building from source, please see below sections for details about some dependencies. -**Please note that the latest version (>=1.9.9) is massively updated compared to prior versions, and functions are not directly compatible with the old version objects which users might possess. Please use `readLiger()` to load old RDS files or `convertOldLiger()` to convert a loaded object into the up-to-date structure. You might need to be careful when overwriting existing analysis because we don't provide methods to convert new the structure backwards.** +**Please note that the latest version (>=1.99.0) is massively updated compared to prior versions, and functions are not directly compatible with the old version objects which users might possess. Please use `readLiger()` to load old RDS files or `convertOldLiger()` to convert a loaded object into the up-to-date structure. You might need to be careful when overwriting existing analysis because we don't provide methods to convert new the structure backwards.** ### Compiler setup diff --git a/vignettes/articles/liger_object.Rmd b/vignettes/articles/liger_object.Rmd index 0e8902e2..ecbab0d1 100644 --- a/vignettes/articles/liger_object.Rmd +++ b/vignettes/articles/liger_object.Rmd @@ -24,10 +24,11 @@ On the left hand side of the figure, we briefly illustrate how the old structure In the new version, on the right side, we first introduce another new class, [ligerDataset](../reference/ligerDataset-class.html), to serve as a container for all matrices belonging to the same specific dataset. In this way, we have easy and safe control on data matching thanks to a number of object oriented accessor methods and validity checks. As for the [liger](../reference/liger-class.html) class, there are some main differences: - Feature (gene expression) matrices are now wrapped in ligerDataset objects, and all put in the `datasets(obj)` slot. Click here for how to [access the raw counts, normalized counts, scaled data](#access-feature-matrices). -- Cell metadata including dataset belonging, study design conditions, quality control (QC) metrics are now stored in slot `cellMeta(obj)`. Meanwhile, we moved cluster labeling result also to this slot, in order to have multiple cluster variables existing at the same time. Additionally, by introducing `S4Vectors::DataFrame` class for the metadata, we are now storing low-dimensional representation (e.g. UMAP, t-SNE) in the cell metadata. Click here for how to [access cell metadata](#access-cell-metadata). +- Cell metadata including dataset belonging, study design conditions, quality control (QC) metrics are now stored in slot `cellMeta(obj)`. Meanwhile, we moved cluster labeling result also to this slot, in order to have multiple cluster variables existing at the same time. Additionally, by introducing `S4Vectors::DataFrame` class for the metadata for flexibility and tidy display. Click here for how to [access cell metadata](#access-cell-metadata). +- Dimensionality reductions are now expanded to a list of low-dimensional representation (e.g. UMAP, t-SNE) in slot `dimReds(obj)`. Click here for how to [access dimensionality reductions](#access-dimensionality-reductions). - Variable features identified are now stored in `varFeatures(obj)`, with no structural change. -- A new slot `@uns` is added for storing miscellaneous unstructured information. -- We also added the feature to record the commands applied to the liger object and allow retrieving the records with `commands(obj)`. See more about liger command recording and retrieving. +- A new slot `@uns` is added for storing miscellaneous unstructured information, including default setting for cluster labeling and dimension reduction, for faster visualization calls. +- We also added the feature to record the commands applied to the liger object and allow retrieving the records with `commands(obj)`. See more about liger [command recording and retrieving](#check-the-records-of-run-commands). We demonstrate examples below with example dataset "[pbmc](../reference/pbmc.html)", which is a minimal subset from the study conducted in Hyun Min Kang and et. al., Nature Biotechnology, 2018. The data is a ready to use (new) liger object with only raw counts data. We quickly process it here so that we can show how to retrieve all kinds of data in sections below. @@ -60,6 +61,12 @@ names(pbmc) length(pbmc) ``` +- To list out number of cells in each dataset + +```{R} +lengths(pbmc) +``` + - To access the [ligerDataset](../reference/ligerDataset-class.html) object for a specific dataset ```{R} @@ -68,7 +75,7 @@ ctrlLD <- dataset(pbmc, dataset = "ctrl") ctrlLD <- dataset(pbmc, 1) ``` -In other LIGER functions where the argument `useDatasets` is allowed, users can always use the exact character name(s) or the numeric index to specify the datasets to be involved in the analysis. Moreoever, a logical vector of index is also allowed and could ease the usage in some cases. +In any other *rliger* functions where the argument `useDatasets` is exposed, users can always use the exact character name(s) or the numeric index to specify the datasets to be involved in the analysis. Moreoever, a logical vector of index is also allowed and could ease the usage in some cases. ```{R, eval = FALSE} # Not run, just for example, assuming we've got the clustering for such an object @@ -89,7 +96,7 @@ ldList <- datasets(pbmc) ## Access feature matrices -We have three main generics for accessing feature matrices, namingly `rawData()`, `normData()` and `scaleData()`. For scaled unshared features, used for UINMF, we also have `scaleUnsharedData()`. The logistics of the accessor to all these feature matrices are the same, so we only present the case for raw counts. +We have three main generics for accessing feature matrices, namingly `rawData()`, `normData()` and `scaleData()`. For scaled unshared features, used for UINMF, we also have `scaleUnsharedData()`. Additionally, we provide `rawPeak()` and `normPeak()` for accessing the peak counts in a ATACseq dataset. The logistics of the accessor to all these feature matrices are the same, so we only present the case for raw counts. - To get a list of the raw counts from all datasets: @@ -141,7 +148,7 @@ dim(pbmc) ## Access cell metadata -As previously descibed at the top of this page, cell metadata including dataset origin, QC metrics, cluster labeling and dimension reduction are all stored in `cellMeta(obj)`. +As previously descibed at the top of this page, cell metadata including dataset origin, study metadata, QC metrics and cluster labeling are all stored in `cellMeta(obj)`. - To have a look at the full metadata table: @@ -166,22 +173,11 @@ nUMI <- pbmc[["nUMI"]] cellMeta(pbmc, c("nUMI", "nGene")) ``` -- To retrieve the matrix of a dimension reduction result - -```{R} -umap <- pbmc$UMAP -class(umap) -dim(umap) -# Alternatively -umap <- cellMeta(pbmc, "UMAP") -``` - -- To get a variable for only a subset of cells (e.g. from a specific dataset), use the argument `cellIdx` with any valid indexing value (numerics, barcodes, logicals) that applies to all cells: +- To get a variable for only a subset of cells (e.g. from a specific dataset), use the argument `useDatasets`, or alternatively `cellIdx` that subscribes cells explicitly: ```{R} -umapCtrl <- cellMeta(pbmc, "UMAP", useDataset = "ctrl") -class(umapCtrl) -dim(umapCtrl) +nUMI <- cellMeta(pbmc, "nUMI", useDatasets = "ctrl") +length(nUMI) ``` - Add or replace a variable. If the new variable being added has a matching size (`length()` or `nrow()`) with the number of all cells (`ncol(pbmc)`): @@ -200,6 +196,58 @@ ctrlBar <- seq_len(ncol(dataset(pbmc, "ctrl"))) cellMeta(pbmc, "ctrl_bar", useDataset = "ctrl") <- ctrlBar ``` +## Access dimensionality reductions + +- To get a list of all dimensionality reductions: + +```{R} +allDimReds <- dimReds(pbmc) +class(allDimReds) +length(allDimReds) +``` + +- To get a specific dimensionality reduction: + +```{R} +umap <- dimRed(pbmc, "UMAP") +class(umap) +dim(umap) +``` + +- To get a specific dimensionality reduction for a specific dataset, again, use the argument `useDatasets`. + +```{R} +ctrlUMAP <- dimRed(pbmc, "UMAP", useDatasets = "ctrl") +dim(ctrlUMAP) +``` + +- Setting an existing dimensionality reduction as the default for visualization + +```{R} +defaultDimRed(pbmc) <- "UMAP" +``` + +Every time when `runUMAP()` or `runTSNE()` is called, the new result will be set as default. When default dimensionality reduction is set, any plotting function that shall work with it will use it by default without the need to specify it explicitly. And the dimensionality reduction accessor function also returns the default one if no specific one is requested. + +```{R} +umap <- dimRed(pbmc) +``` + +- Add a new matrix into the object + +```{R} +dimRed(pbmc, "newUMAP") <- umap +``` + +Cell identifiers on `rownames(value)` will be checked for matching is present. The check is aware of that a dataset name prefix is added to the object cell IDs. + +- Adding a dimensionality reduction matrix for only one certain dataset + +```{R} +ctrlUMAP <- dimRed(pbmc, "UMAP", useDatasets = "ctrl") +dimRed(pbmc, "ctrlUMAP", useDatasets = "ctrl") <- ctrlUMAP +``` + ## Access factorization result We suggest using `getMatrix()` for all matrices involved in the factorization, including: diff --git a/vignettes/articles/walkthrough_pbmc.Rmd b/vignettes/articles/walkthrough_pbmc.Rmd deleted file mode 100644 index 746784d1..00000000 --- a/vignettes/articles/walkthrough_pbmc.Rmd +++ /dev/null @@ -1,164 +0,0 @@ ---- -title: "Walkthrough -- Aligning PBMC Data" -author: "Velina Kozareva" -date: "8/2/2019" -output: - pdf_document: default - html_document: default ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` -This walkthrough steps through a basic analysis and alignment of two datasets of peripheral blood mononuclear cells (PBMCs), -provided by 10X and [Gierahn et al., 2017](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5376227/). We show how LIGER can be used to identify trends and clusters across both datasets, in addition to dataset-specific differences. We also demonstrate some -of the package's visualization capabilities while reproducing some of the figure panels from the manuscript (Fig. 2). - -The data can be downloaded from `vignettes/pbmc_alignment.zip`. First we set up the liger object using the 10X and Seqwell datasets provided in this directory. - -```{r load, results='hide', message=F, warning=F} -library(liger) -library(cowplot) - -pbmc.10x <- read.table('~/Downloads/pbmc_alignment/pbmc_10X.expressionMatrix.txt', - sep="\t",stringsAsFactors=F,header=T,row.names = 1) -pbmc.seqwell <- read.table('~/Downloads/pbmc_alignment/pbmc_SeqWell.expressionMatrix.txt', - sep="\t",stringsAsFactors=F,header=T,row.names = 1) -pbmc.data = list(tenx=pbmc.10x, seqwell=pbmc.seqwell) - -# Create liger object -a.pbmc <- createLiger(pbmc.data) -``` - -###Data Preprocessing -Before running the factorization, we need to normalize the data to account for differences in sequencing depth and capture efficiency across cells, select variable genes, and scale the data. Note that we do not center the data when scaling because non-negative matrix factorization accepts only non-negative values. - -The `selectGenes` function performs variable gene selection on each of the datasets separately, then takes the union of the result. Highly variable genes are selected by comparing the variance of each gene's expression to its mean expression and keeping those with a ratio above a specified threshold. - -We can also simply pass pre-selected variable genes to the object if we've determined them from another -source. In this case, we are using the variable genes determined by the Seurat package for their -analysis of these two datasets (this was done in the manuscript for a cleaner comparison of the -methods). - -```{r preprocess} -a.pbmc <- normalize(a.pbmc) -# Can pass different var.thresh values to each dataset if one seems to be contributing significantly -# more genes than the other -a.pbmc <- selectGenes(a.pbmc, var.thresh = c(0.3, 0.875), do.plot = F) - -# In this case, we have a precomputed set of variable genes -s.var.genes <- readRDS('~/Downloads/pbmc_alignment/var_genes.RDS') -a.pbmc@var.genes <- s.var.genes -a.pbmc <- scaleNotCenter(a.pbmc) -``` - -###Factorization -Next we perform integrative non-negative matrix factorization in order to identify shared and distinct metagenes across the datasets and the corresponding factor/metagene loadings for each cell. The most important parameters in the factorization are `k` (the number of factors) and `lambda` (the penalty parameter which limits the dataset-specific component of the factorization). The default value of `lambda=5.0` usually provides reasonable results for most analyses, although the `suggestLambda` function can be used to determine a more appropriate `lambda` value for the desired level of dataset alignment. - -To determine the appropriate number of factors to use, we can use the `suggestK` function which plots median K-L divergence from the uniform distribution in the factor loadings as a function of `k`. We want to look for the section of the plot where this metric stops increasing as sharply (the "elbow" of the plot). In general, we should expect a positive correlation between the number of subgroups we expect to find in the analysis and the appropriate number of factors to use. Since the `suggestK` function can take more than 10 minutes to run, it can sometimes be useful to run a quick preliminary analysis with `k=20` to get an idea of whether a much higher number of factors is needed. (Note that the command provided below requests 5 cores – adjust this value if you have fewer cores available.) - -``` {r kselection, eval=F} -# running suggestK on multiple cores can greatly decrease the runtime -k.suggest <- suggestK(a.pbmc, num.cores = 5, gen.new = T, return.results = T, plot.log2 = F, - nrep = 5) -``` - -For this analysis, we select a `k` of 22, though you can try various values in that range for similar results. We select the default `lambda=5.0`. (You can also run `suggestLambda` to get a plot of expected alignment vs. lambda for a given number of factors.) -```{r factorization, results='hide'} -# Take the lowest objective of three factorizations with different initializations -# Multiple restarts are recommended for initial analyses since iNMF is non-deterministic -a.pbmc <- optimizeALS(a.pbmc, k=22, thresh = 5e-5, nrep = 3) -``` - -After the factorization, we still need to quantile align the factor loadings across the datasets. Notice that if we plot a t-SNE representation of the factor loadings, the data still cluster mainly by dataset. -```{r unaligned} -a.pbmc <- runTSNE(a.pbmc, use.raw = T) -p1 <- plotByDatasetAndCluster(a.pbmc, return.plots = T) -# Plot by dataset -print(p1[[1]]) -``` - -To better integrate the datasets, we perform a quantile alignment step. This process first identifies similarly loading cells across datasets by building a similarity graph based on shared factor neighborhoods. Using Louvain community detection, we then identify clusters shared across datasets, and align quantiles within each cluster and factor. The key parameters in this step are the `resolution` (increasing this increases the number of communities detected) and `knn_k` (the number of dataset neighbors used in generating the shared factor neighborhood). In general, lowering `knn_k` will allow for more fine-grained identification of smaller groups with shared factor neighborhoods. - -We can also try to extract even smaller clusters by setting the `small_clust_thresh` parameter equal to the current `knn_k`; we do this here since we expect a small group of megakaryocytes in the 10X dataset. We set the `resolution` to 0.4 to identify larger clusters, and use the default settings for the other parameters. (For a simpler analysis, and to perfectly reproduce the manuscript figures, you can use the default `small.clust.thresh = 0`, though this will prevent separation of the megakaryocytes.) - -```{r aligned, results='hide'} -a.pbmc <- quantileAlignSNF(a.pbmc, resolution = 0.4, small.clust.thresh = 20) -# Could also forgo small cluster extraction if desired -# a.pbmc <- quantileAlignSNF(a.pbmc, resolution = 0.4) - -``` - -###Visualization -Now we can visualize the integrated data, and determine identities of the detected clusters. -```{r visualize, fig.width=6, fig.height=4.5} -a.pbmc <- runTSNE(a.pbmc) -p_a <- plotByDatasetAndCluster(a.pbmc, return.plots = T) -# Modify plot output slightly -p_a[[1]] <- p_a[[1]] + theme_classic() + theme(legend.position = c(0.85, 0.15)) + - guides(col=guide_legend(title = '', override.aes = list(size = 4))) - -print(p_a[[1]]) - -``` - -Since we have cluster information from the original publications/analyses for each of these two datasets, we -can easily see how our clustering compares using a river (i.e. Sankey) plot. Note that these clusterings were generated by individual (not joint) analyses of each dataset. The 10X labels come from the Seurat analysis -detailed [here](https://satijalab.org/seurat/pbmc3k_tutorial.html) and the SeqWell labels from the -original SeqWell publication. - -```{r clustering, message=F, warning=F} -# load cluster labels from Seurat 10X PBMC analysis and original SeqWell publication -clusters_prior <- readRDS('~/Downloads/pbmc_alignment/tenx_seqwell_clusters.RDS') -tenx_c <- droplevels(clusters_prior[rownames(a.pbmc@H[[1]])]) -seqwell_c <- droplevels(clusters_prior[rownames(a.pbmc@H[[2]])]) - -# Set specific node order for cleaner visualization (can also leave to default value for -# automatic ordering) -set_node_order = list(c(2, 6, 3, 4, 8, 1, 5, 7), c(1, 6, 2, 3, 4, 5, 7, 8, 9), c(5, 2, 3, 6, 1, 4)) -makeRiverplot(a.pbmc, tenx_c, seqwell_c, min.frac = 0.05, node.order = set_node_order, - river.usr = c(0, 1, -0.6, 1.6)) -# If using object with default small.clust.thresh settings, use this node order instead -# set_node_order = list(c(7, 2, 6, 3, 4, 8, 1, 5), c(1, 6, 2, 3, 4, 5, 7, 8), rev(c(4, 1, 6, 3, 2, 5))) - -# plot t-SNE plot colored by clusters -print(p_a[[2]]) - -``` - -We see a good correspondance between analogous cluster labels from the two datasets and have successfully identified the small group of megakaryocytes! - -We can also identify some shared and dataset-specific markers for each factor and plot them to help in cluster annotation. This information can aid in finding cell-type specific dataset differences. We can return these in table format with `getFactorMarkers`, though an easy way to visualize these markers (and distribution of factor loadings) is with the package's `plotWordClouds` function. Here we can notice that in the plot of factor 9, which seems to correspond to the NK cell cluster, one of the most highly-loading dataset-specific markers is CD16 (FCGR3A). -```{r markers, message=F, fig.width=5, fig.height=6} -# This function uses the V loadings to identify dataset-specific genes and a combination of the W -# and V loadings to identify shared markers -markers <- getFactorMarkers(a.pbmc, dataset1='tenx', dataset2='seqwell', num.genes = 10) -# The first three elements in this list are the dataset1-specific, shared, and dataset2-specific -# markers respectively -# Let's take a look at factor 9 -head(markers$tenx[markers$tenx$factor_num == 9, ]) -``` - -```{r word clouds, results='hide', fig.width=5, fig.height=6} -# Show top 10 highly loading dataset specific and shared genes -# Prevent text progress bar from printing -word_clouds <- plotWordClouds(a.pbmc, num.genes = 10, do.spec.plot = F, return.plots = T) -print(word_clouds[[9]]) -``` - -A related type of visualization is a gene loadings plot, which may allow for easier comparison of the most highly-loading genes: - -```{r gene loadings, results='hide', fig.width=5, fig.height=6} -# Show top 10 highly loading dataset specific and shared genes -# with plots of gene loading values -gene_loadings <- plotGeneLoadings(a.pbmc, num.genes = 10, do.spec.plot = F, return.plots = T) -print(gene_loadings[[9]]) -``` - -We can now plot this gene to better observe finer-grained dataset specific differences. -```{r plot_gene, fig.width=9, fig.height=4.5} -p_g2 <- plotGene(a.pbmc, 'FCGR3A', return.plots = T) -plot_grid(plotlist = p_g2) -``` - -Note that with these plots, we can confirm that the NK cluster in the seqwell dataset does not seem to express CD16 as highly as that from the 10X dataset (although as expected many of the myeloid cells express this marker). This might suggest some additional cytokine activation in the 10X patient.