From 4f8e761837ee5e98ddd97e11da16fd2a9e23e6eb Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Tue, 7 Jun 2016 11:03:42 -0700 Subject: [PATCH 01/27] Added option to not return normalized data --- R/scone_main.R | 181 +++++++++++++++++++++++++++++++------------------ 1 file changed, 116 insertions(+), 65 deletions(-) diff --git a/R/scone_main.R b/R/scone_main.R index c4b5528..9aaeaa9 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -1,79 +1,116 @@ -#' scone main wrapper: function to apply and evaluate all the normalization schemes +#' scone main wrapper: function to apply and evaluate all the normalization +#' schemes #' -#' This function is a high-level wrapper to apply and evaluate a variety of normalization -#' schemes to a specified expression matrix. +#' This function is a high-level wrapper to apply and evaluate a variety of +#' normalization schemes to a specified expression matrix. #' -#' The function consists of four main steps: imputation, scaling normalization, adjusting for batch effects, -#' and evaluation of the normalization performance. +#' The function consists of four main steps: imputation, scaling normalization, +#' adjusting for batch effects, and evaluation of the normalization performance. #' -#' @details Note that if one wants to include the unnormalized data in the comparison, the identity function -#' must be included in the scaling list. Analogously, if one wants to avoid the imputation and/or include the -#' non-imputed data in the comparison, the identity function must be included. +#' @details Note that if one wants to include the unnormalized data in the +#' comparison, the identity function must be included in the scaling list. +#' Analogously, if one wants to avoid the imputation and/or include the +#' non-imputed data in the comparison, the identity function must be included. #' #' @param expr matrix. The data matrix (genes in rows, cells in columns). -#' @param imputation list or function. (A list of) function(s) to be used for imputation. -#' @param scaling list or function. (A list of) function(s) to be used for scaling normalization. -#' @param k_ruv numeric. The maximum number of factors of unwanted variation (the function will adjust for 1 to k_ruv factors of unwanted variation). -#' If 0, RUV will not be performed. -#' @param k_qc numeric. The maximum number of quality metrics PCs (the function will adjust for 1 to k_qc PCs). -#' If 0, QC adjustment will not be performed. -#' @param ruv_negcon character. The genes to be used as negative controls for RUV. Ignored if k_ruv=0. -#' @param qc matrix. The QC metrics to be used for QC adjustment. Ignored if k_qc=0. -#' @param adjust_bio character. If 'no' it will not be included in the model; if 'yes', both models with and without 'bio' will be run; -#' if 'force', only models with 'bio' will be run. -#' @param adjust_batch character. If 'no' it will not be included in the model; if 'yes', both models with and without 'batch' will be run; -#' if 'force', only models with 'batch' will be run. -#' @param bio factor. The biological condition to be included in the adjustment model (variation to be preserved). If adjust_bio="no", it will not be used for normalization, but only for evaluation. -#' @param batch factor. The known batch variable to be included in the adjustment model (variation to be removed). If adjust_batch="no", it will not be used for normalization, but only for evaluation. -#' @param evaluate logical. If FALSE the normalization methods will not be evaluated (faster). -#' @param eval_pcs numeric. The number of principal components to use for evaluation. Ignored if evaluation=FALSE. -#' @param eval_proj function. Projection function for evaluation (Inputs: e = genes in rows, cells in columns. eval_proj_args. Output: cells in rows, factors in columns). -#' If NULL, PCA is used for projection -#' @param eval_proj_args list. List of args passed to projection function as eval_proj_args. -#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. -#' If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA. -#' @param eval_negcon character. The genes to be used as negative controls for evaluation. These genes should -#' be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. -#' If NULL, correlations with negative controls will be returned NA. -#' @param eval_poscon character. The genes to be used as positive controls for evaluation. These genes should -#' be expected to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. -#' If NULL, correlations with positive controls will be returned NA. -#' @param params matrix or data.frame. If given, the algorithm will bypass creating the matrix of possible -#' parameters, and will use the given matrix. There are basically no checks as to whether this matrix is in the -#' right format, and is only intended to be used to feed the results of setting run=FALSE -#' back into the algorithm (see example). +#' @param imputation list or function. (A list of) function(s) to be used for +#' imputation. +#' @param scaling list or function. (A list of) function(s) to be used for +#' scaling normalization. +#' @param k_ruv numeric. The maximum number of factors of unwanted variation +#' (the function will adjust for 1 to k_ruv factors of unwanted variation). If +#' 0, RUV will not be performed. +#' @param k_qc numeric. The maximum number of quality metrics PCs (the function +#' will adjust for 1 to k_qc PCs). If 0, QC adjustment will not be performed. +#' @param ruv_negcon character. The genes to be used as negative controls for +#' RUV. Ignored if k_ruv=0. +#' @param qc matrix. The QC metrics to be used for QC adjustment. Ignored if +#' k_qc=0. +#' @param adjust_bio character. If 'no' it will not be included in the model; if +#' 'yes', both models with and without 'bio' will be run; if 'force', only +#' models with 'bio' will be run. +#' @param adjust_batch character. If 'no' it will not be included in the model; +#' if 'yes', both models with and without 'batch' will be run; if 'force', +#' only models with 'batch' will be run. +#' @param bio factor. The biological condition to be included in the adjustment +#' model (variation to be preserved). If adjust_bio="no", it will not be used +#' for normalization, but only for evaluation. +#' @param batch factor. The known batch variable to be included in the +#' adjustment model (variation to be removed). If adjust_batch="no", it will +#' not be used for normalization, but only for evaluation. +#' @param evaluate logical. If FALSE the normalization methods will not be +#' evaluated (faster). +#' @param eval_pcs numeric. The number of principal components to use for +#' evaluation. Ignored if evaluation=FALSE. +#' @param eval_proj function. Projection function for evaluation (Inputs: e = +#' genes in rows, cells in columns. eval_proj_args. Output: cells in rows, +#' factors in columns). If NULL, PCA is used for projection +#' @param eval_proj_args list. List of args passed to projection function as +#' eval_proj_args. +#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam +#' tightness and stability evaluation. If an array of integers, largest +#' average silhoutte width (tightness) / maximum co-clustering stability will +#' be reported. If NULL, tightness and stability will be returned NA. +#' @param eval_negcon character. The genes to be used as negative controls for +#' evaluation. These genes should be expected not to change according to the +#' biological phenomenon of interest. Ignored if evaluation=FALSE. If NULL, +#' correlations with negative controls will be returned NA. +#' @param eval_poscon character. The genes to be used as positive controls for +#' evaluation. These genes should be expected to change according to the +#' biological phenomenon of interest. Ignored if evaluation=FALSE. If NULL, +#' correlations with positive controls will be returned NA. +#' @param params matrix or data.frame. If given, the algorithm will bypass +#' creating the matrix of possible parameters, and will use the given matrix. +#' There are basically no checks as to whether this matrix is in the right +#' format, and is only intended to be used to feed the results of setting +#' run=FALSE back into the algorithm (see example). #' @param verbose logical. If TRUE some messagges are printed. -#' @param conditional_pam logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), and a weighted average is returned as PAM_SIL. -#' @param run logical. If FALSE the normalization and evaluation are not run, but the function returns a data.frame -#' of parameters that will be run for inspection by the user. +#' @param conditional_pam logical. If TRUE then maximum ASW is separately +#' computed for each biological condition (including NA), and a weighted +#' average is returned as PAM_SIL. +#' @param run logical. If FALSE the normalization and evaluation are not run, +#' but the function returns a data.frame of parameters that will be run for +#' inspection by the user. +#' @param return_norm logical. If FALSE the normalization will not be returned, +#' but only evaluate. This will create a much smaller object and may be useful +#' for large datasets and/or when many combinations are compared. #' #' @importFrom RUVSeq RUVg #' @importFrom matrixStats rowMedians #' @import BiocParallel #' @export #' -#' @details If both \code{run=FALSE} the normalization and evaluation are not run, but the function returns a matrix of parameters that will be run for inspection by the user. +#' @details If both \code{run=FALSE} the normalization and evaluation are not +#' run, but the function returns a matrix of parameters that will be run for +#' inspection by the user. #' -#' @return A list with the following elements: -#' \itemize{ -#' \item{normalized_data}{ A list containing the normalized data matrix, log-scaled. NULL when evaluate = TRUE.} -#' \item{evaluation}{ A matrix containing raw evaluation metrics for each normalization method. Rows are sorted in the same order as in the ranks output matrix. NULL when evaluate = FALSE.} -#' \item{ranks}{ A matrix containing rank-scores for each normalization, including median rank across all scores. Rows are sorted by increasing median rank. NULL when evaluate = FALSE.} -#' \item{params}{ A data.frame with each row corresponding to a set of normalization parameters.} -#' } -#' @return If \code{run=FALSE} a \code{data.frame} -#' with each row corresponding to a set of normalization parameters. +#' @return A list with the following elements: \itemize{ \item{normalized_data}{ +#' A list containing the normalized data matrix, log-scaled. NULL when +#' evaluate = TRUE.} \item{evaluation}{ A matrix containing raw evaluation +#' metrics for each normalization method. Rows are sorted in the same order as +#' in the ranks output matrix. NULL when evaluate = FALSE.} \item{ranks}{ A +#' matrix containing rank-scores for each normalization, including median rank +#' across all scores. Rows are sorted by increasing median rank. NULL when +#' evaluate = FALSE.} \item{params}{ A data.frame with each row corresponding +#' to a set of normalization parameters.} } +#' @return If \code{run=FALSE} a \code{data.frame} with each row corresponding +#' to a set of normalization parameters. #' -#' @details Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to rank-score: -#' Positive-signature metrics increase with improving performance, including BIO_SIL,PAM_SIL, EXP_WV_COR, PAM_STAB, and VAR_PRES. -#' Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. -#' Rank-scores are computed so that higer-performing methods are assigned a lower-rank. +#' @details Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. +#' Each metric is assigned a signature for conversion to rank-score: +#' Positive-signature metrics increase with improving performance, including +#' BIO_SIL,PAM_SIL, EXP_WV_COR, PAM_STAB, and VAR_PRES. Negative-signature +#' metrics decrease with improving performance, including BATCH_SIL, +#' EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. Rank-scores are computed so that +#' higer-performing methods are assigned a lower-rank. #' -scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, - qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"), - bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,eval_proj_args = NULL, - eval_kclust=2:10, eval_negcon=NULL, eval_poscon=NULL, - params=NULL, verbose=FALSE, conditional_pam = FALSE) { +scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5, + ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"), + adjust_batch=c("no", "yes", "force"), bio=NULL, batch=NULL, + run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL, + eval_proj_args = NULL, eval_kclust=2:10, eval_negcon=NULL, + eval_poscon=NULL, params=NULL, verbose=FALSE, + conditional_pam = FALSE, return_norm = TRUE) { if(!is.matrix(expr)) { stop("'expr' must be a matrix.") @@ -338,11 +375,20 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, } else { score <- NULL } - return(list(score=score, adjusted=adjusted)) + + if(return_norm) { + return(list(score=score, adjusted=adjusted)) + } else { + return(list(score=score)) + } }) - adjusted <- lapply(outlist, function(x) x$adjusted) - names(adjusted) <- apply(params, 1, paste, collapse=',') + if(return_norm) { + adjusted <- lapply(outlist, function(x) x$adjusted) + names(adjusted) <- apply(params, 1, paste, collapse=',') + } else { + adjusted <- NULL + } if(evaluate) { evaluation <- lapply(outlist, function(x) x$score) @@ -361,8 +407,13 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, } evaluation <- t(evaluation[,order(med_rank), drop=FALSE]) - adjusted <- adjusted[order(med_rank)] + + if(return_norm) { + adjusted <- adjusted[order(med_rank)] + } + params <- params[order(med_rank),] + } else { evaluation <- ranks <- NULL } From 11f0195d549d448bfc03e21f40297a3dde3ce115 Mon Sep 17 00:00:00 2001 From: mbcole Date: Tue, 14 Jun 2016 17:13:19 -0700 Subject: [PATCH 02/27] Develop 0.0.5-9000 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4338105..f6f8ee5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.5 +Version: 0.0.5-9000 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", From 5debdfc6927a1178187e6625696512f56133ba57 Mon Sep 17 00:00:00 2001 From: John Blischak Date: Sat, 25 Jun 2016 16:57:46 -0700 Subject: [PATCH 03/27] Restore original par settings upon exiting function. --- R/sample_filtering.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/sample_filtering.R b/R/sample_filtering.R index 483d597..f19a265 100644 --- a/R/sample_filtering.R +++ b/R/sample_filtering.R @@ -254,7 +254,8 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL, is_bad = rep(FALSE,dim(expr)[2]) - par(mfcol = c(criterion_count,2)) + op <- par(mfcol = c(criterion_count,2)) + on.exit(par(op)) if(!is.null(nreads)){ is_bad = filtered_nreads @@ -417,8 +418,9 @@ factor_sample_filter = function(expr, qual, gene_filter = NULL, max_exp_pcs = 5, num_qual_pcs = which(csum > min_qual_variance)[1] if(plot){ + op <- par(mfrow = c(2,1)) + on.exit(par(op)) for (i in 1:num_qual_pcs){ - par(mfrow = c(2,1)) hist(qpc$x[,i],breaks = hist_breaks, main = paste0("Distribution of Quality PC ",i), xlab = paste0("Qual PC",i)) barplot(abs(qpc$rotation[,i]),col = c("red","green")[1 + (qpc$rotation[,i] > 0)], cex.names = .25,horiz = T, las=1, main = "Loadings") } From 6ffeb74bf91948c9825e1d947df9a4f78a6a24a0 Mon Sep 17 00:00:00 2001 From: mbcole Date: Thu, 7 Jul 2016 13:27:02 -0700 Subject: [PATCH 04/27] Replaced conditional pam with new stratified pam + updated NEWS. --- R/scone_eval.R | 45 +++++++++++++++++------------------ R/scone_main.R | 18 +++++++------- inst/NEWS | 47 +++++++++++++++++++++---------------- man/scone.Rd | 4 ++-- man/score_matrix.Rd | 6 ++--- tests/testthat/test_scone.R | 18 +++++++------- 6 files changed, 72 insertions(+), 66 deletions(-) diff --git a/R/scone_eval.R b/R/scone_eval.R index 646a07a..448957f 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -29,7 +29,7 @@ #' @param wv_factors Factors of wanted variation derived from positive control genes (evaluation set). #' If NULL, wv correlations, EXP_WV_COR, will be returned NA. #' @param is_log logical. If TRUE the expr matrix is already logged and log transformation will not be carried out prior to projection. -#' @param conditional_pam logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological condition (including NA), +#' @param stratified_pam logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological-cross-batch condition (accepting NAs), #' and a weighted average silhouette width is returned. #' #' @importFrom class knn @@ -43,7 +43,7 @@ #' \itemize{ #' \item{BIO_SIL}{ Average silhouette width by biological condition.} #' \item{BATCH_SIL}{ Average silhouette width by batch condition.} -#' \item{PAM_SIL}{ Maximum average silhouette width from pam clustering (see conditional_pam argument).} +#' \item{PAM_SIL}{ Maximum average silhouette width from pam clustering (see stratified_pam argument).} #' \item{EXP_QC_COR}{ Maximum squared spearman correlation between pcs and quality factors.} #' \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.} #' \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.} @@ -57,7 +57,7 @@ score_matrix <- function(expr, eval_pcs = 3, eval_kclust = NULL, bio = NULL, batch = NULL, qc_factors = NULL,uv_factors = NULL, wv_factors = NULL, - is_log=FALSE, conditional_pam = FALSE){ + is_log=FALSE, stratified_pam = FALSE){ if(any(is.na(expr) | is.infinite(expr) | is.nan(expr))){ stop("NA/Inf/NaN Expression Values.") @@ -102,36 +102,35 @@ score_matrix <- function(expr, eval_pcs = 3, if ( !is.null(eval_kclust) ){ - # "Conditional" PAM - if(conditional_pam){ + # "Stratified" PAM + if(stratified_pam){ + biobatch = as.factor(paste(bio,batch,sep = "_")) PAM_SIL = 0 - # Max Average Sil Width per Biological Condition - for (cond in levels(bio)){ - is_cond = which(bio == cond) + # Max Average Sil Width per bio-cross-batch Condition + for (cond in levels(biobatch)){ + is_cond = which(biobatch == cond) cond_w = length(is_cond) if(cond_w > max(eval_kclust)){ pamk_object = pamk(proj[is_cond,],krange = eval_kclust) + + # Despite krange excluding nc = 1, if asw is negative, nc = 1 will be selected + if(is.null(pamk_object$pamobject$silinfo$avg.width) ){ + if (!1 %in% eval_kclust) { + pamk_object$pamobject$silinfo$avg.width = max(pamk_object$crit[-1]) + }else{ + stop("nc = 1 was selected by Duda-Hart, exclude 1 from eval_kclust.") + } + } + PAM_SIL = PAM_SIL + cond_w*pamk_object$pamobject$silinfo$avg.width }else{ - stop("Number of clusters 'k' must be smaller than bio class size") + stop(paste("Number of clusters 'k' must be smaller than bio-cross-batch stratum size:", + paste(levels(biobatch),table(biobatch), sep = " = ",collapse = ", "))) } } - - # Max Average Sil Width for Unclassified Condition - is_na = is.na(bio) - if (any(is_na)){ - cond_w = sum(is_na) - if(cond_w > max(eval_kclust)){ - pamk_object = pamk(proj[is_na,],krange = eval_kclust) - PAM_SIL = PAM_SIL + cond_w*pamk_object$pamobject$silinfo$avg.width - }else{ - stop("Number of clusters 'k' must be smaller than unclassified bio set size") - } - } - - PAM_SIL = PAM_SIL/length(bio) + PAM_SIL = PAM_SIL/length(biobatch) # Traditional PAM }else{ diff --git a/R/scone_main.R b/R/scone_main.R index e4f3496..f84f497 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -43,7 +43,7 @@ #' right format, and is only intended to be used to feed the results of setting run=FALSE #' back into the algorithm (see example). #' @param verbose logical. If TRUE some messagges are printed. -#' @param conditional_pam logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), and a weighted average is returned as PAM_SIL. +#' @param stratified_pam logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological-cross-batch condition (accepting NAs), and a weighted average is returned as PAM_SIL. #' @param run logical. If FALSE the normalization and evaluation are not run, but the function returns a data.frame #' of parameters that will be run for inspection by the user. #' @@ -77,7 +77,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"), bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,eval_proj_args = NULL, eval_kclust=2:10, eval_negcon=ruv_negcon, eval_poscon=NULL, - params=NULL, verbose=FALSE, conditional_pam = FALSE) { + params=NULL, verbose=FALSE, stratified_pam = FALSE) { if(!is.matrix(expr)) { stop("'expr' must be a matrix.") @@ -203,11 +203,13 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 stop("'eval_kclust' must be less than the number of samples.") } - if(conditional_pam) { - if(is.null(bio)) { - stop("If `bio` is null, `conditional_pam` cannot be TRUE.") - } else if(max(eval_kclust) >= min(table(bio))) { - stop("For conditional_pam, max 'eval_kclust' must be smaller than min bio class size") + if(stratified_pam) { + if(is.null(bio) & is.null(batch)){ + stop("For stratified_pam, bio and/or batch must be specified") + } + biobatch = as.factor(paste(bio,batch,sep = "_")) + if(max(eval_kclust) >= min(table(biobatch))) { + stop("For stratified_pam, max 'eval_kclust' must be smaller than bio-cross-batch stratum size") } } } @@ -334,7 +336,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 score <- score_matrix(expr=adjusted, eval_pcs = eval_pcs, eval_proj = eval_proj, eval_proj_args = eval_proj_args, eval_kclust = eval_kclust, bio = bio, batch = batch, qc_factors = qc_pcs, uv_factors = uv_factors, wv_factors = wv_factors, - is_log = TRUE, conditional_pam = conditional_pam) + is_log = TRUE, stratified_pam = stratified_pam) } else { score <- NULL } diff --git a/inst/NEWS b/inst/NEWS index 57761c3..109aedf 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,33 +1,40 @@ +Changes in version 0.0.6 +======================== + +* Fixed bug when using plot functionality of filtering functions. +* "Conditional" pam replaced with "Stratified" pam, clustering each bio-cross-batch condition separately, rather than simply each bio condition. + Changes in version 0.0.5 ======================== + * Modified biplot to handle general coloring schemes. * Limit number of WV and UV factors to eval_pcs in computing WV and UV scores. -* Updated depencencies. +* Updated dependencies. * Added error-handling to sample filter. -* Removed var preserved measure do to length of running time. +* Removed “var preserved” measure do to length of running time. Changes in version 0.0.4 ======================== -* Fixed a few bugs and documentation mismatches -* Removed stability evaluation (redundant with sil width and slow) -* Removed clusterExperiment dependency -* Removed RUV correlation score. UV correlation now takes ruv control genes as default -* Added RLE measures to scone evaluation -* Added FQT_FN to implement careful ties handling by FQ -* Better handling of plots in sample filter functions -* Mean score rather than median rank to evaluate normalizations -* Default value for imputation -* Minor optimizations to evaluations +* Fixed a few bugs and documentation mismatches. +* Removed stability evaluation (redundant with sil width and slow). +* Removed clusterExperiment dependency. +* Removed RUV correlation score. UV correlation now takes ruv control genes as default. +* Added RLE measures to scone evaluation. +* Added FQT_FN to implement careful ties handling by FQ. +* Better handling of plots in sample filter functions. +* Mean score rather than median rank to evaluate normalizations. +* Default value for imputation. +* Minor optimizations to evaluation functions. Changes in version 0.0.3 ======================== -* Fixed various bugs -* Added compactness measure for stability evaluation -* Compute RUV factors only when needed -* Fixed Github issues #11, #12, #13, #14, #21, #28 -* zinb now works for non-integer whole numbers -* Updated tests -* Added documentation for datasets -* Added biplot_colored function +* Fixed various bugs. +* Added “compactness” measure for stability evaluation. +* Compute RUV factors only when needed. +* Fixed Github issues #11, #12, #13, #14, #21, #28. +* zinb now works for non-integer whole numbers. +* Updated tests. +* Added documentation for datasets. +* Added biplot_colored function. diff --git a/man/scone.Rd b/man/scone.Rd index cefb163..20e59fc 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -10,7 +10,7 @@ scone(expr, imputation = list(none = identity), scaling, k_ruv = 5, batch = NULL, run = TRUE, evaluate = TRUE, eval_pcs = 3, eval_proj = NULL, eval_proj_args = NULL, eval_kclust = 2:10, eval_negcon = ruv_negcon, eval_poscon = NULL, params = NULL, - verbose = FALSE, conditional_pam = FALSE) + verbose = FALSE, stratified_pam = FALSE) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} @@ -69,7 +69,7 @@ back into the algorithm (see example).} \item{verbose}{logical. If TRUE some messagges are printed.} -\item{conditional_pam}{logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), and a weighted average is returned as PAM_SIL.} +\item{stratified_pam}{logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological-cross-batch condition (accepting NAs), and a weighted average is returned as PAM_SIL.} } \value{ A list with the following elements: diff --git a/man/score_matrix.Rd b/man/score_matrix.Rd index bbac848..945bf70 100644 --- a/man/score_matrix.Rd +++ b/man/score_matrix.Rd @@ -7,7 +7,7 @@ score_matrix(expr, eval_pcs = 3, eval_proj = NULL, eval_proj_args = NULL, eval_kclust = NULL, bio = NULL, batch = NULL, qc_factors = NULL, uv_factors = NULL, wv_factors = NULL, is_log = FALSE, - conditional_pam = FALSE) + stratified_pam = FALSE) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} @@ -40,7 +40,7 @@ If NULL, wv correlations, EXP_WV_COR, will be returned NA.} \item{is_log}{logical. If TRUE the expr matrix is already logged and log transformation will not be carried out prior to projection.} -\item{conditional_pam}{logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological condition (including NA), +\item{stratified_pam}{logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological-cross-batch condition (accepting NAs), and a weighted average silhouette width is returned.} } \value{ @@ -48,7 +48,7 @@ A list with the following elements: \itemize{ \item{BIO_SIL}{ Average silhouette width by biological condition.} \item{BATCH_SIL}{ Average silhouette width by batch condition.} -\item{PAM_SIL}{ Maximum average silhouette width from pam clustering (see conditional_pam argument).} +\item{PAM_SIL}{ Maximum average silhouette width from pam clustering (see stratified_pam argument).} \item{EXP_QC_COR}{ Maximum squared spearman correlation between pcs and quality factors.} \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.} \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.} diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R index 48a69a0..6743a98 100644 --- a/tests/testthat/test_scone.R +++ b/tests/testthat/test_scone.R @@ -202,11 +202,10 @@ test_that("conditional PAM",{ res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), - k_ruv=0, k_qc=0, adjust_bio="yes", bio=bio, - adjust_batch="yes", batch=batch, run=FALSE, + k_ruv=0, k_qc=0, adjust_bio="yes", bio=bio, run=FALSE, evaluate=TRUE, eval_negcon=as.character(11:20), eval_poscon=as.character(21:30), - eval_kclust = 2, conditional_pam = TRUE) + eval_kclust = 2, stratified_pam = TRUE) expect_error(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), @@ -214,17 +213,16 @@ test_that("conditional PAM",{ adjust_batch="yes", batch=batch, run=FALSE, evaluate=TRUE, eval_negcon=as.character(11:20), eval_poscon=as.character(21:30), - eval_kclust = 6, conditional_pam = TRUE), - "For conditional_pam, max 'eval_kclust' must be smaller than min bio class size") + eval_kclust = 6, stratified_pam = TRUE), + "For stratified_pam, max 'eval_kclust' must be smaller than bio-cross-batch stratum size") expect_error(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), - k_ruv=0, k_qc=0, - adjust_batch="yes", batch=batch, run=FALSE, - evaluate=TRUE, eval_negcon=as.character(11:20), + k_ruv=0, k_qc=0, run = FALSE, + evaluate = TRUE, eval_negcon=as.character(11:20), eval_poscon=as.character(21:30), - eval_kclust = 6, conditional_pam = TRUE), - "If `bio` is null, `conditional_pam` cannot be TRUE") + eval_kclust = 6, stratified_pam = TRUE), + "For stratified_pam, bio and/or batch must be specified") }) From 6c00bff3f64480ef38cac26c5fe72c2ee8f473a8 Mon Sep 17 00:00:00 2001 From: mbcole Date: Thu, 7 Jul 2016 13:57:20 -0700 Subject: [PATCH 05/27] Fixed bug in PAM_SIL --- R/scone_eval.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/scone_eval.R b/R/scone_eval.R index 448957f..fd421fa 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -118,7 +118,7 @@ score_matrix <- function(expr, eval_pcs = 3, # Despite krange excluding nc = 1, if asw is negative, nc = 1 will be selected if(is.null(pamk_object$pamobject$silinfo$avg.width) ){ if (!1 %in% eval_kclust) { - pamk_object$pamobject$silinfo$avg.width = max(pamk_object$crit[-1]) + pamk_object$pamobject$silinfo$avg.width = max(pamk_object$crit[1:max(eval_kclust) %in% eval_kclust]) }else{ stop("nc = 1 was selected by Duda-Hart, exclude 1 from eval_kclust.") } From 8412f0b6742aee29ff24aa18edf2af1856cba92c Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 7 Jul 2016 14:37:58 -0700 Subject: [PATCH 06/27] Check memory usage and add option to write on file --- .Rbuildignore | 4 +++- R/scone_main.R | 27 +++++++++++++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 3e3b853..0fdadad 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -33,4 +33,6 @@ README[.]md #---------------------------- # Temp scripts #---------------------------- -^old_scripts/* \ No newline at end of file +^old_scripts/* +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/R/scone_main.R b/R/scone_main.R index 9aaeaa9..d8eb5f5 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -71,13 +71,18 @@ #' @param run logical. If FALSE the normalization and evaluation are not run, #' but the function returns a data.frame of parameters that will be run for #' inspection by the user. -#' @param return_norm logical. If FALSE the normalization will not be returned, -#' but only evaluate. This will create a much smaller object and may be useful -#' for large datasets and/or when many combinations are compared. +#' @param return_norm character. If "no" the normalized values will not be +#' returned. This will create a much smaller object and may be useful for +#' large datasets and/or when many combinations are compared. If "in_memory" +#' the normalized values will be returned as part of the output. If "hdf5" +#' they will be written on file using the \code{rhdf5} package. #' #' @importFrom RUVSeq RUVg #' @importFrom matrixStats rowMedians #' @import BiocParallel +#' @imporFrom pryr mem_used +#' @importFrom gdata humanReadable +#' #' @export #' #' @details If both \code{run=FALSE} the normalization and evaluation are not @@ -110,7 +115,9 @@ scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL, eval_proj_args = NULL, eval_kclust=2:10, eval_negcon=NULL, eval_poscon=NULL, params=NULL, verbose=FALSE, - conditional_pam = FALSE, return_norm = TRUE) { + conditional_pam = FALSE, return_norm = c("no", "in_memory", "hdf5")) { + + return_norm <- match.arg(return_norm) if(!is.matrix(expr)) { stop("'expr' must be a matrix.") @@ -313,7 +320,7 @@ scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5, if(verbose) message("Imputation step...") im_params <- unique(params[,1]) - imputed <- lapply(1:length(im_params), function(i) imputation[[i]](expr)) + imputed <- lapply(seq_along(im_params), function(i) imputation[[i]](expr)) names(imputed) <- im_params # output: a list of imputed matrices @@ -321,7 +328,7 @@ scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5, if(verbose) message("Scaling step...") sc_params <- unique(params[,1:2]) - scaled <- bplapply(1:nrow(sc_params), function(i) scaling[[sc_params[i,2]]](imputed[[sc_params[i,1]]])) + scaled <- bplapply(seq_len(NROW(sc_params)), function(i) scaling[[sc_params[i,2]]](imputed[[sc_params[i,1]]])) names(scaled) <- apply(sc_params, 1, paste, collapse="_") # output: a list of normalized expression matrices @@ -359,7 +366,7 @@ scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5, if(verbose) message("Factor adjustment and evaluation...") - outlist <- bplapply(1:nrow(params), function(i) { + outlist <- bplapply(seq_len(NROW(params)), function(i) { parsed <- parse_row(params[i,], bio, batch, ruv_factors, qc_pcs) design_mat <- make_design(parsed$bio, parsed$batch, parsed$W, nested=(nested & !is.null(parsed$bio) & !is.null(parsed$batch))) @@ -376,9 +383,13 @@ scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5, score <- NULL } - if(return_norm) { + if(verbose) message(paste0("Processed: ", rownames(params)[i], "\n", "Mem used: ", gdata::humanReadable(pryr::mem_used()))) + if(return_norm == "in_memory") { return(list(score=score, adjusted=adjusted)) } else { + if(return_norm == "hdf5") { + ## write on file + } return(list(score=score)) } }) From 324858ead500ad9a6429a43af118410a3fcfc4e5 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 7 Jul 2016 17:32:47 -0700 Subject: [PATCH 07/27] Support for hdf5 --- R/scone_main.R | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/R/scone_main.R b/R/scone_main.R index 476bf0d..59ddc4d 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -76,6 +76,8 @@ #' large datasets and/or when many combinations are compared. If "in_memory" #' the normalized values will be returned as part of the output. If "hdf5" #' they will be written on file using the \code{rhdf5} package. +#' @param hdf5file character. If \code{return_norm="hdf5"}, the name of the file +#' onto which to save the normalized matrices. #' #' @importFrom RUVSeq RUVg #' @importFrom matrixStats rowMedians @@ -87,6 +89,7 @@ #' dnbinom fitted.values glm mad median model.matrix na.omit p.adjust pnorm #' prcomp quantile quasibinomial sd #' @importFrom utils capture.output +#' @importFrom rhdf5 h5createFile h5write #' @export #' #' @details If \code{run=FALSE} the normalization and evaluation are not run, @@ -114,16 +117,31 @@ #' RLE_MED, and RLE_IQR. Scores are computed so that higer-performing methods #' are assigned a higher scores. #' +#' @details If \code{return_norm="hdf5"}, the normalized matrices will be +#' written to the \code{hdf5file} file. This must be a string specifying (a +#' path to) a new file. If the file already exists, it will return error. +#' scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"), bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL, eval_proj_args = NULL, eval_kclust=2:10, eval_negcon=ruv_negcon, eval_poscon=NULL, params=NULL, verbose=FALSE, - stratified_pam = FALSE, return_norm = c("no", "in_memory", "hdf5")) { + stratified_pam = FALSE, return_norm = c("no", "in_memory", "hdf5"), + hdf5file) { return_norm <- match.arg(return_norm) + if(return_norm == "hdf5") { + if(missing(hdf5file)) { + stop("If `return_norm='hdf5'`, `hdf5file` must be specified.") + } else { + stopifnot(h5createFile(hdf5file)) + h5write(rownames(expr), hdf5file, "genes") + h5write(colnames(expr), hdf5file, "samples") + } + } + if(!is.matrix(expr)) { stop("'expr' must be a matrix.") } else if(is.null(rownames(expr))) { @@ -391,7 +409,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 return(list(score=score, adjusted=adjusted)) } else { if(return_norm == "hdf5") { - ## write on file + h5write(adjusted, hdf5file, rownames(params)[i]) } return(list(score=score)) } From 8516e799efa016d47534fd20b77228205d5aead9 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 7 Jul 2016 17:55:31 -0700 Subject: [PATCH 08/27] Added wrapper function (issue #9) --- R/scone_main.R | 31 ++++++++++++++++++++++++++++++- man/get_normalized.Rd | 27 +++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 man/get_normalized.Rd diff --git a/R/scone_main.R b/R/scone_main.R index 59ddc4d..2fb007c 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -82,7 +82,7 @@ #' @importFrom RUVSeq RUVg #' @importFrom matrixStats rowMedians #' @import BiocParallel -#' @imporFrom pryr mem_used +#' @importFrom pryr mem_used #' @importFrom gdata humanReadable #' @importFrom graphics abline arrows barplot hist par plot text #' @importFrom stats approx as.formula binomial coefficients contr.sum cor dist @@ -449,3 +449,32 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 return(list(normalized_data=adjusted, metrics=evaluation, scores=scores, params=params)) } + +#' Retrieve normalized matrix +#' +#' Given a string representing a normalization scheme and an HDF5 file created by +#' scone, it will return a matrix of normalized counts (in log scale). +#' +#' @details The string must be of of the row.names of the \code{params} +#' component of the scone output. +#' +#' @param normalization character. A string identifying the normalization scheme +#' to be retrieved. +#' @param hdf5file character. The name of the file in which the data have been +#' stored. +#' +#' @return A matrix of normalized counts in log-scale. +#' +#' @importFrom rhdf5 h5ls h5read +#' @export +get_normalized <- function(normalization, hdf5file) { + + if(!normalization %in% h5ls(hdf5file)$name) { + stop(paste0(normalization, " does not match any object in ", hdf5file)) + } else { + retval <- h5read(hdf5file, normalization) + rownames(retval) <- h5read(hdf5file, "genes") + colnames(retval) <- h5read(hdf5file, "samples") + return(retval) + } +} diff --git a/man/get_normalized.Rd b/man/get_normalized.Rd new file mode 100644 index 0000000..a285b47 --- /dev/null +++ b/man/get_normalized.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/scone_main.R +\name{get_normalized} +\alias{get_normalized} +\title{Retrieve normalized matrix} +\usage{ +get_normalized(normalization, hdf5file) +} +\arguments{ +\item{normalization}{character. A string identifying the normalization scheme +to be retrieved.} + +\item{hdf5file}{character. The name of the file in which the data have been +stored.} +} +\value{ +A matrix of normalized counts in log-scale. +} +\description{ +Given a string representing a normalization scheme and an HDF5 file created by +scone, it will return a matrix of normalized counts (in log scale). +} +\details{ +The string must be of of the row.names of the \code{params} + component of the scone output. +} + From fc8c8c373e1ea1bb3dc02164f6d404720b897ec3 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 7 Jul 2016 17:55:46 -0700 Subject: [PATCH 09/27] Updated documentation --- NAMESPACE | 7 +++ man/scone.Rd | 164 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 113 insertions(+), 58 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 3fc881a..5531cec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(biplot_colored) export(estimate_ziber) export(estimate_zinb) export(factor_sample_filter) +export(get_normalized) export(impute_ziber_simp) export(impute_zinb) export(lm_adjust) @@ -31,6 +32,7 @@ importFrom(cluster,silhouette) importFrom(diptest,dip.test) importFrom(edgeR,calcNormFactors) importFrom(fpc,pamk) +importFrom(gdata,humanReadable) importFrom(grDevices,colorRampPalette) importFrom(graphics,abline) importFrom(graphics,arrows) @@ -44,6 +46,11 @@ importFrom(matrixStats,colIQRs) importFrom(matrixStats,colMedians) importFrom(matrixStats,rowMedians) importFrom(mixtools,normalmixEM) +importFrom(pryr,mem_used) +importFrom(rhdf5,h5createFile) +importFrom(rhdf5,h5ls) +importFrom(rhdf5,h5read) +importFrom(rhdf5,h5write) importFrom(stats,approx) importFrom(stats,as.formula) importFrom(stats,binomial) diff --git a/man/scone.Rd b/man/scone.Rd index 20e59fc..df7ad4a 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/scone_main.R \name{scone} \alias{scone} -\title{scone main wrapper: function to apply and evaluate all the normalization schemes} +\title{scone main wrapper: function to apply and evaluate all the normalization +schemes} \usage{ scone(expr, imputation = list(none = identity), scaling, k_ruv = 5, k_qc = 5, ruv_negcon = NULL, qc = NULL, adjust_bio = c("no", "yes", @@ -10,98 +11,145 @@ scone(expr, imputation = list(none = identity), scaling, k_ruv = 5, batch = NULL, run = TRUE, evaluate = TRUE, eval_pcs = 3, eval_proj = NULL, eval_proj_args = NULL, eval_kclust = 2:10, eval_negcon = ruv_negcon, eval_poscon = NULL, params = NULL, - verbose = FALSE, stratified_pam = FALSE) + verbose = FALSE, stratified_pam = FALSE, return_norm = c("no", + "in_memory", "hdf5"), hdf5file) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} -\item{imputation}{list or function. (A list of) function(s) to be used for imputation.} +\item{imputation}{list or function. (A list of) function(s) to be used for +imputation.} -\item{scaling}{list or function. (A list of) function(s) to be used for scaling normalization.} +\item{scaling}{list or function. (A list of) function(s) to be used for +scaling normalization.} -\item{k_ruv}{numeric. The maximum number of factors of unwanted variation (the function will adjust for 1 to k_ruv factors of unwanted variation). -If 0, RUV will not be performed.} +\item{k_ruv}{numeric. The maximum number of factors of unwanted variation +(the function will adjust for 1 to k_ruv factors of unwanted variation). If +0, RUV will not be performed.} -\item{k_qc}{numeric. The maximum number of quality metrics PCs (the function will adjust for 1 to k_qc PCs). -If 0, QC adjustment will not be performed.} +\item{k_qc}{numeric. The maximum number of quality metrics PCs (the function +will adjust for 1 to k_qc PCs). If 0, QC adjustment will not be performed.} -\item{ruv_negcon}{character. The genes to be used as negative controls for RUV. Ignored if k_ruv=0.} +\item{ruv_negcon}{character. The genes to be used as negative controls for +RUV. Ignored if k_ruv=0.} -\item{qc}{matrix. The QC metrics to be used for QC adjustment. Ignored if k_qc=0.} +\item{qc}{matrix. The QC metrics to be used for QC adjustment. Ignored if +k_qc=0.} -\item{adjust_bio}{character. If 'no' it will not be included in the model; if 'yes', both models with and without 'bio' will be run; -if 'force', only models with 'bio' will be run.} +\item{adjust_bio}{character. If 'no' it will not be included in the model; if +'yes', both models with and without 'bio' will be run; if 'force', only +models with 'bio' will be run.} -\item{adjust_batch}{character. If 'no' it will not be included in the model; if 'yes', both models with and without 'batch' will be run; -if 'force', only models with 'batch' will be run.} +\item{adjust_batch}{character. If 'no' it will not be included in the model; +if 'yes', both models with and without 'batch' will be run; if 'force', +only models with 'batch' will be run.} -\item{bio}{factor. The biological condition to be included in the adjustment model (variation to be preserved). If adjust_bio="no", it will not be used for normalization, but only for evaluation.} +\item{bio}{factor. The biological condition to be included in the adjustment +model (variation to be preserved). If adjust_bio="no", it will not be used +for normalization, but only for evaluation.} -\item{batch}{factor. The known batch variable to be included in the adjustment model (variation to be removed). If adjust_batch="no", it will not be used for normalization, but only for evaluation.} +\item{batch}{factor. The known batch variable to be included in the +adjustment model (variation to be removed). If adjust_batch="no", it will +not be used for normalization, but only for evaluation.} -\item{run}{logical. If FALSE the normalization and evaluation are not run, but the function returns a data.frame -of parameters that will be run for inspection by the user.} +\item{run}{logical. If FALSE the normalization and evaluation are not run, +but the function returns a data.frame of parameters that will be run for +inspection by the user.} -\item{evaluate}{logical. If FALSE the normalization methods will not be evaluated (faster).} +\item{evaluate}{logical. If FALSE the normalization methods will not be +evaluated (faster).} -\item{eval_pcs}{numeric. The number of principal components to use for evaluation. Ignored if evaluation=FALSE.} +\item{eval_pcs}{numeric. The number of principal components to use for +evaluation. Ignored if evaluation=FALSE.} -\item{eval_proj}{function. Projection function for evaluation (see \code{\link{score_matrix}} for details). -If NULL, PCA is used for projection} +\item{eval_proj}{function. Projection function for evaluation (see +\code{\link{score_matrix}} for details). If NULL, PCA is used for +projection.} -\item{eval_proj_args}{list. List passed to the eval_proj function as eval_proj_args.} +\item{eval_proj_args}{list. List of args passed to projection function as +eval_proj_args.} -\item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam tightness evaluation. -If an array of integers, largest average silhouette width (tightness) will be reported. If NULL, tightness will be returned NA.} +\item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam +tightness evaluation. If an array of integers, largest average silhouette +width (tightness) will be reported. If NULL, tightness will be returned NA.} -\item{eval_negcon}{character. The genes to be used as negative controls for evaluation. These genes should -be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. -Default is ruv_negcon argument. If NULL, correlations with negative controls will be returned NA.} +\item{eval_negcon}{character. The genes to be used as negative controls for +evaluation. These genes should be expected not to change according to the +biological phenomenon of interest. Ignored if evaluation=FALSE. Default is +ruv_negcon argument. If NULL, correlations with negative controls will be +returned NA.} -\item{eval_poscon}{character. The genes to be used as positive controls for evaluation. These genes should -be expected to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. -If NULL, correlations with positive controls will be returned NA.} +\item{eval_poscon}{character. The genes to be used as positive controls for +evaluation. These genes should be expected to change according to the +biological phenomenon of interest. Ignored if evaluation=FALSE. If NULL, +correlations with positive controls will be returned NA.} -\item{params}{matrix or data.frame. If given, the algorithm will bypass creating the matrix of possible -parameters, and will use the given matrix. There are basically no checks as to whether this matrix is in the -right format, and is only intended to be used to feed the results of setting run=FALSE -back into the algorithm (see example).} +\item{params}{matrix or data.frame. If given, the algorithm will bypass +creating the matrix of possible parameters, and will use the given matrix. +There are basically no checks as to whether this matrix is in the right +format, and is only intended to be used to feed the results of setting +run=FALSE back into the algorithm (see example).} \item{verbose}{logical. If TRUE some messagges are printed.} -\item{stratified_pam}{logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological-cross-batch condition (accepting NAs), and a weighted average is returned as PAM_SIL.} +\item{stratified_pam}{logical. If TRUE then maximum ASW for PAM_SIL is +separately computed for each biological-cross-batch condition (accepting +NAs), and a weighted average is returned as PAM_SIL.} + +\item{return_norm}{character. If "no" the normalized values will not be +returned. This will create a much smaller object and may be useful for +large datasets and/or when many combinations are compared. If "in_memory" +the normalized values will be returned as part of the output. If "hdf5" +they will be written on file using the \code{rhdf5} package.} + +\item{hdf5file}{character. If \code{return_norm="hdf5"}, the name of the file +onto which to save the normalized matrices.} } \value{ -A list with the following elements: -\itemize{ -\item{normalized_data}{ A list containing the normalized data matrix, log-scaled. NULL when evaluate = TRUE.} -\item{metrics}{ A matrix containing raw evaluation metrics for each normalization method (see \code{\link{score_matrix}} for details). Rows are sorted in the same order as in the scores output matrix. NULL when evaluate = FALSE.} -\item{scores}{ A matrix containing scores for each normalization, including average score. Rows are sorted by increasing mean score. NULL when evaluate = FALSE.} -\item{params}{ A data.frame with each row corresponding to a set of normalization parameters.} -} - -If \code{run=FALSE} a \code{data.frame} -with each row corresponding to a set of normalization parameters. +A list with the following elements: \itemize{ \item{normalized_data}{ + A list containing the normalized data matrix, log-scaled. NULL when + \code{return_norm} is not "in_memory".} \item{metrics}{ A matrix containing + raw evaluation metrics for each normalization method (see + \code{\link{score_matrix}} for details). Rows are sorted in the same order + as in the scores output matrix. NULL when evaluate = FALSE.} \item{scores}{ + A matrix containing scores for each normalization, including average score. + Rows are sorted by increasing mean score. NULL when evaluate = FALSE.} + \item{params}{ A data.frame with each row corresponding to a set of + normalization parameters.} } + +If \code{run=FALSE} a \code{data.frame} with each row corresponding + to a set of normalization parameters. } \description{ -This function is a high-level wrapper to apply and evaluate a variety of normalization -schemes to a specified expression matrix. +This function is a high-level wrapper to apply and evaluate a variety of +normalization schemes to a specified expression matrix. } \details{ -The function consists of four main steps: imputation, scaling normalization, adjusting for batch effects, -and evaluation of the normalization performance. +The function consists of four main steps: imputation, scaling normalization, +adjusting for batch effects, and evaluation of the normalization performance. + +Note that if one wants to include the unnormalized data in the + comparison, the identity function must be included in the scaling list. + Analogously, if one wants to avoid the imputation and/or include the + non-imputed data in the comparison, the identity function must be included. + -Note that if one wants to include the unnormalized data in the comparison, the identity function -must be included in the scaling list. Analogously, if one wants to avoid the imputation and/or include the -non-imputed data in the comparison, the identity function must be included. +If \code{run=FALSE} the normalization and evaluation are not run, + but the function returns a matrix of parameters that will be run for + inspection by the user. -If \code{run=FALSE} the normalization and evaluation are not run, but the function returns a matrix of parameters that will be run for inspection by the user. +Evaluation metrics are defined in \code{\link{score_matrix}}. + Each metric is assigned a signature for conversion to scores: + Positive-signature metrics increase with improving performance, including + BIO_SIL, PAM_SIL, and EXP_WV_COR. Negative-signature metrics decrease with + improving performance, including BATCH_SIL, EXP_QC_COR, EXP_UV_COR, + RLE_MED, and RLE_IQR. Scores are computed so that higer-performing methods + are assigned a higher scores. -Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to scores: -Positive-signature metrics increase with improving performance, including BIO_SIL, PAM_SIL, and EXP_WV_COR. -Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_UV_COR, RLE_MED, and RLE_IQR. -Scores are computed so that higer-performing methods are assigned a higher scores. +If \code{return_norm="hdf5"}, the normalized matrices will be + written to the \code{hdf5file} file. This must be a string specifying (a + path to) a new file. If the file already exists, it will return error. } From 79738f58b58f4e2f321807a26776164f37f6e955 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 7 Jul 2016 18:05:36 -0700 Subject: [PATCH 10/27] Updated NEWS --- .Rbuildignore | 2 ++ .gitignore | 2 +- inst/NEWS | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 0fdadad..306b778 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -29,6 +29,8 @@ README[.]md ^[.]devel ^[.]test ^[.]check +.Rhistory +R/.Rhistory #---------------------------- # Temp scripts diff --git a/.gitignore b/.gitignore index b8c17c0..c6778e3 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,4 @@ vignettes/figure *.Rproj vignettes/R_cache vignettes/R_figure -old_scripts \ No newline at end of file +old_scripts diff --git a/inst/NEWS b/inst/NEWS index 109aedf..56a0bcf 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,8 +1,10 @@ -Changes in version 0.0.6 +Changes in version 0.0.6 (2016-07-07) ======================== * Fixed bug when using plot functionality of filtering functions. * "Conditional" pam replaced with "Stratified" pam, clustering each bio-cross-batch condition separately, rather than simply each bio condition. +* Added option to write normalized matrices to HDF5 file. +* Added wrapper function get_normalized() to retrieve normalized data. Changes in version 0.0.5 ======================== From 1a02565231b6f718663efa34730a8d7569d3e22b Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 7 Jul 2016 18:08:00 -0700 Subject: [PATCH 11/27] Updated DESCRIPTION file --- DESCRIPTION | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f6f8ee5..2f1e14b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,22 +13,25 @@ License: Artistic-2.0 Depends: R (>= 3.3) Imports: + aroma.light, BiocParallel, + class, cluster, DESeq, - EDASeq, - MASS, - RUVSeq, - aroma.light, - class, diptest, + EDASeq, edgeR, fpc, + gdata, gplots, + grDevices, limma, + MASS, matrixStats, mixtools, - grDevices + pryr, + rhdf5, + RUVSeq Suggests: knitr, rmarkdown, From 8e5d7a78936dfdf4e00e069200965c12e61b3018 Mon Sep 17 00:00:00 2001 From: mbcole Date: Thu, 7 Jul 2016 18:24:26 -0700 Subject: [PATCH 12/27] Bug fix to metric sample filter / redefinition of simple FNR estimation --- DESCRIPTION | 5 ++- NAMESPACE | 2 + R/sample_filtering.R | 87 ++++++++++++++++++++++--------------- inst/NEWS | 2 + man/metric_sample_filter.Rd | 18 ++++---- man/simple_FNR_params.Rd | 10 +++-- 6 files changed, 73 insertions(+), 51 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f6f8ee5..b998287 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.5-9000 +Version: 0.0.6-9000 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", @@ -28,7 +28,8 @@ Imports: limma, matrixStats, mixtools, - grDevices + grDevices, + boot Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 3fc881a..4050ac6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,6 +26,8 @@ importFrom(EDASeq,betweenLaneNormalization) importFrom(MASS,glm.nb) importFrom(RUVSeq,RUVg) importFrom(aroma.light,normalizeQuantileRank.matrix) +importFrom(boot,inv.logit) +importFrom(boot,logit) importFrom(class,knn) importFrom(cluster,silhouette) importFrom(diptest,dip.test) diff --git a/R/sample_filtering.R b/R/sample_filtering.R index f19a265..803e629 100644 --- a/R/sample_filtering.R +++ b/R/sample_filtering.R @@ -1,32 +1,53 @@ #' Fit Logistic Regression Model of FNR against set of positive control (ubiquitously expressed) genes #' -#' @details logit(Probability of False Negative) ~ a + b*(mean log10p1 expression) . +#' @details logit(Probability of False Negative) ~ a + b*(median log-expression) . #' #' @param expr matrix The data matrix in transcript-proportional units (genes in rows, cells in columns). #' @param pos_controls A logical vector indexing positive control genes that will be used to compute false-negative rate characteristics. +#' User must provide at least 2 positive control genes. #' @param fn_tresh Inclusive threshold for negative detection. Default 0.01. +#' fn_tresh must be non-negative. #' -#' @return A list of logistic regression coefficients corresponding to glm fits in each sample. If a fit did not converge, the result reported is NA. +#' @return A matrix of logistic regression coefficients corresponding to glm fits in each sample (a and b in columns 1 and 2 respectively). If the a & b fit does not converge, b is set to zero and only a is estimated. +#' +#' @importFrom boot logit +#' @importFrom matrixStats rowMedians #' simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){ - # Mean log10p1 expression - mu_obs = rowMeans(log10(expr[pos_controls,]+1)) - - # Drop-outs - drop_outs = 0 + (expr[pos_controls,] <= fn_tresh) + stopifnot(!any(is.na(pos_controls))) + + if (sum(pos_controls) < 2){ + stop("User must provide at least 2 positive control genes") + } + + if (fn_tresh < 0){ + stop("fn_tresh must be non-negative") + } + + pos_expr = expr[pos_controls,] # Selecting positive-control genes + is_drop = pos_expr <= fn_tresh # Identify false negatives + pos_expr[is_drop] = NA # Set false negatives to NA + drop_outs = 0 + is_drop # Numeric drop-out state + drop_rate = colMeans(drop_outs) # Total drop-out rate per sample + + # Median log-expression in positive observations + mu_obs = log(rowMedians(pos_expr,na.rm = TRUE)) + if(any(is.na(mu_obs))){ + stop("Median log-expression in positive observations NA for some positive control gene/s") + } # Logistic Regression Model of FNR - ref.glms = list() - for (si in 1:dim(drop_outs)[2]){ + logistic_coef = matrix(0,ncol(drop_outs),2) + for (si in seq_len(ncol(drop_outs))){ fit = suppressWarnings(glm(cbind(drop_outs[,si],1 - drop_outs[,si]) ~ mu_obs,family=binomial(logit))) if(fit$converged){ - ref.glms[[si]] = fit$coefficients + logistic_coef[si,] = fit$coefficients } else { - ref.glms[[si]] = NA + logistic_coef[si,1] = logit(drop_rate[si]) } } - return(ref.glms) + return(logistic_coef) } #' metric-based sample filtering: function to filter single-cell RNA-Seq libraries. @@ -51,7 +72,7 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){ #' If NULL, filtered_fnr will be returned NA. #' @param scale. logical. Will expression be scaled by total expression for FNR computation? Default = FALSE #' @param glen Gene lengths for gene-length normalization (normalized data used in FNR computation). -#' @param AUC_range An array of two values, representing range over which FNR AUC will be computed (log10(expr_units + 1)). Default c(0,6) +#' @param AUC_range An array of two values, representing range over which FNR AUC will be computed (log(expr_units)). Default c(0,15) #' @param zcut A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1. #' If NULL, only hard threshold sub-criteria will be applied. #' @param mixture A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (metric). @@ -61,11 +82,11 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){ #' @param hard_nreads numeric. Hard (lower bound on) nreads threshold. Default 25000. #' @param hard_ralign numeric. Hard (lower bound on) ralign threshold. Default 15. #' @param hard_breadth numeric. Hard (lower bound on) breadth threshold. Default 0.2. -#' @param hard_fnr numeric. Hard (upper bound on) fnr threshold. Default 3. -#' @param suff_nreads numeric. If not null, serves as an upper bound on nreads threshold. -#' @param suff_ralign numeric. If not null, serves as an upper bound on ralign threshold. Default 65. -#' @param suff_breadth numeric. If not null, serves as an upper bound on breadth threshold. Default 0.8. -#' @param suff_fnr numeric. If not null, serves as an lower bound on fnr threshold. +#' @param hard_auc numeric. Hard (upper bound on) fnr auc threshold. Default 10. +#' @param suff_nreads numeric. If not null, serves as an overriding upper bound on nreads threshold. +#' @param suff_ralign numeric. If not null, serves as an overriding upper bound on ralign threshold. +#' @param suff_breadth numeric. If not null, serves as an overriding upper bound on breadth threshold. +#' @param suff_auc numeric. If not null, serves as an overriding lower bound on fnr auc threshold. #' @param plot logical. Should a plot be produced? #' @param hist_breaks hist() breaks argument. Ignored if `plot=FALSE`. #' @@ -79,15 +100,16 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){ #' #'@importFrom mixtools normalmixEM #'@importFrom diptest dip.test +#'@importFrom boot inv.logit #'@export #' #' metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL, gene_filter = NULL, pos_controls = NULL,scale. = FALSE,glen = NULL, - AUC_range = c(0,6), zcut = 1, + AUC_range = c(0,15), zcut = 1, mixture = TRUE, dip_thresh = 0.05, - hard_nreads = 25000, hard_ralign = 15, hard_breadth = 0.2, hard_fnr = 3, - suff_nreads = NULL, suff_ralign = 65, suff_breadth = 0.8, suff_fnr = NULL, + hard_nreads = 25000, hard_ralign = 15, hard_breadth = 0.2, hard_auc = 10, + suff_nreads = NULL, suff_ralign = NULL, suff_breadth = NULL, suff_auc = NULL, plot = FALSE, hist_breaks = 10){ criterion_count = 0 @@ -210,17 +232,17 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL, } # Compute FNR AUC - ref.glms = simple_FNR_params(expr = nexpr, pos_controls = pos_controls) + logistic_coef = simple_FNR_params(expr = nexpr, pos_controls = pos_controls) AUC = NULL for (si in 1:dim(expr)[2]){ - if(!any(is.na(ref.glms[[si]]))){ - AUC[si] = log(exp(ref.glms[[si]][1] + ref.glms[[si]][2] * AUC_range[2]) + 1)/ref.glms[[si]][2] - log(exp(ref.glms[[si]][1] + ref.glms[[si]][2] * AUC_range[1]) + 1)/ref.glms[[si]][2] + if(logistic_coef[si,2] != 0){ + AUC[si] = log(exp(logistic_coef[si,1] + logistic_coef[si,2] * AUC_range[2]) + 1)/logistic_coef[si,2] - log(exp(logistic_coef[si,1] + logistic_coef[si,2] * AUC_range[1]) + 1)/logistic_coef[si,2] } else { - stop("glm fit did not converge") + AUC[si] = inv.logit(logistic_coef[si,1])*(AUC_range[2] - AUC_range[1]) } } - AUC_CUTOFF = hard_fnr + AUC_CUTOFF = hard_auc if (!is.null(zcut)){ @@ -239,8 +261,8 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL, } } - if(!is.null(suff_fnr)){ - AUC_CUTOFF = max(AUC_CUTOFF,suff_fnr) + if(!is.null(suff_auc)){ + AUC_CUTOFF = max(AUC_CUTOFF,suff_auc) } } filtered_fnr = AUC > AUC_CUTOFF @@ -305,7 +327,7 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL, }else{ hist(AUC, main = paste0("auc: Thresh = ",signif(AUC_CUTOFF,3)," , Rm = ",sum(filtered_fnr)," , Tot_Rm = ",sum(is_bad)), xlab = "FNR AUC", breaks = hist_breaks) } - abline(v = hard_fnr, col = "yellow", lty = 1) + abline(v = hard_auc, col = "yellow", lty = 1) abline(v = AUC_CUTOFF, col = "blue", lty = 2) } @@ -321,13 +343,6 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL, if(!is.null(pos_controls)){ hist(AUC[!is_bad], main = paste0("auc: Tot = ",sum(!is_bad)), xlab = "FNR AUC", breaks = hist_breaks) } - - # v = rbind(filtered_nreads,filtered_ralign,filtered_breadth,filtered_fnr) - # rownames(v) = c("nreads","ralign","breadth","fnr") - # v = na.omit(v) - # m = v %*% t(v) - # - # barplot(m, beside = TRUE, legend.text = TRUE) } return(list(filtered_nreads = filtered_nreads, diff --git a/inst/NEWS b/inst/NEWS index 109aedf..475b526 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -3,6 +3,8 @@ Changes in version 0.0.6 * Fixed bug when using plot functionality of filtering functions. * "Conditional" pam replaced with "Stratified" pam, clustering each bio-cross-batch condition separately, rather than simply each bio condition. +* Simple FNR for filtering is now based on medians of natural log expression in "expressing" cells / robust to convergence issues. +* Removed all sufficient thresholds for metric sample filter. Changes in version 0.0.5 ======================== diff --git a/man/metric_sample_filter.Rd b/man/metric_sample_filter.Rd index 6a00bc2..125aad8 100644 --- a/man/metric_sample_filter.Rd +++ b/man/metric_sample_filter.Rd @@ -6,10 +6,10 @@ \usage{ metric_sample_filter(expr, nreads = colSums(expr), ralign = NULL, gene_filter = NULL, pos_controls = NULL, scale. = FALSE, glen = NULL, - AUC_range = c(0, 6), zcut = 1, mixture = TRUE, dip_thresh = 0.05, + AUC_range = c(0, 15), zcut = 1, mixture = TRUE, dip_thresh = 0.05, hard_nreads = 25000, hard_ralign = 15, hard_breadth = 0.2, - hard_fnr = 3, suff_nreads = NULL, suff_ralign = 65, - suff_breadth = 0.8, suff_fnr = NULL, plot = FALSE, hist_breaks = 10) + hard_auc = 10, suff_nreads = NULL, suff_ralign = NULL, + suff_breadth = NULL, suff_auc = NULL, plot = FALSE, hist_breaks = 10) } \arguments{ \item{expr}{matrix The data matrix (genes in rows, cells in columns).} @@ -29,7 +29,7 @@ If NULL, filtered_fnr will be returned NA.} \item{glen}{Gene lengths for gene-length normalization (normalized data used in FNR computation).} -\item{AUC_range}{An array of two values, representing range over which FNR AUC will be computed (log10(expr_units + 1)). Default c(0,6)} +\item{AUC_range}{An array of two values, representing range over which FNR AUC will be computed (log(expr_units)). Default c(0,15)} \item{zcut}{A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1. If NULL, only hard threshold sub-criteria will be applied.} @@ -46,15 +46,15 @@ Samples deviating zcut sd's from optimal mean (in the inferior direction), have \item{hard_breadth}{numeric. Hard (lower bound on) breadth threshold. Default 0.2.} -\item{hard_fnr}{numeric. Hard (upper bound on) fnr threshold. Default 3.} +\item{hard_auc}{numeric. Hard (upper bound on) fnr auc threshold. Default 10.} -\item{suff_nreads}{numeric. If not null, serves as an upper bound on nreads threshold.} +\item{suff_nreads}{numeric. If not null, serves as an overriding upper bound on nreads threshold.} -\item{suff_ralign}{numeric. If not null, serves as an upper bound on ralign threshold. Default 65.} +\item{suff_ralign}{numeric. If not null, serves as an overriding upper bound on ralign threshold.} -\item{suff_breadth}{numeric. If not null, serves as an upper bound on breadth threshold. Default 0.8.} +\item{suff_breadth}{numeric. If not null, serves as an overriding upper bound on breadth threshold.} -\item{suff_fnr}{numeric. If not null, serves as an lower bound on fnr threshold.} +\item{suff_auc}{numeric. If not null, serves as an overriding lower bound on fnr auc threshold.} \item{plot}{logical. Should a plot be produced?} diff --git a/man/simple_FNR_params.Rd b/man/simple_FNR_params.Rd index 86e4de0..039e668 100644 --- a/man/simple_FNR_params.Rd +++ b/man/simple_FNR_params.Rd @@ -9,17 +9,19 @@ simple_FNR_params(expr, pos_controls, fn_tresh = 0.01) \arguments{ \item{expr}{matrix The data matrix in transcript-proportional units (genes in rows, cells in columns).} -\item{pos_controls}{A logical vector indexing positive control genes that will be used to compute false-negative rate characteristics.} +\item{pos_controls}{A logical vector indexing positive control genes that will be used to compute false-negative rate characteristics. +User must provide at least 2 positive control genes.} -\item{fn_tresh}{Inclusive threshold for negative detection. Default 0.01.} +\item{fn_tresh}{Inclusive threshold for negative detection. Default 0.01. +fn_tresh must be non-negative.} } \value{ -A list of logistic regression coefficients corresponding to glm fits in each sample. If a fit did not converge, the result reported is NA. +A matrix of logistic regression coefficients corresponding to glm fits in each sample (a and b in columns 1 and 2 respectively). If the a & b fit does not converge, b is set to zero and only a is estimated. } \description{ Fit Logistic Regression Model of FNR against set of positive control (ubiquitously expressed) genes } \details{ -logit(Probability of False Negative) ~ a + b*(mean log10p1 expression) . +logit(Probability of False Negative) ~ a + b*(median log-expression) . } From a7031885ee75d3c5123ae24e1d6c4d5e1aa0c941 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 7 Jul 2016 18:29:00 -0700 Subject: [PATCH 13/27] Fixed tests --- DESCRIPTION | 6 +++--- tests/testthat/test_norm.R | 5 +++-- tests/testthat/test_scone.R | 23 ++++++++++++++--------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2f1e14b..b29f22b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,10 +5,10 @@ Description: scone is a package to compare and rank the performance of different Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", role = c("aut", "cre", "cph")), person("Davide", "Risso", email = "risso.davide@gmail.com", - role = c("aut"))) -Author: Michael Cole [aut, cre, cph], Davide Risso [aut] + role = c("aut", "cph"))) +Author: Michael Cole [aut, cre, cph], Davide Risso [aut, cph] Maintainer: Michael Cole -Date: 2016-05-12 +Date: 2016-07-07 License: Artistic-2.0 Depends: R (>= 3.3) diff --git a/tests/testthat/test_norm.R b/tests/testthat/test_norm.R index 368be51..61dc207 100644 --- a/tests/testthat/test_norm.R +++ b/tests/testthat/test_norm.R @@ -8,7 +8,8 @@ test_that("Upper-quartile normalization works the same as in the EDASeq package" # UQ + RUV res <- scone(e, imputation=identity, scaling=UQ_FN, k_ruv=5, k_qc=0, - evaluate=FALSE, run=TRUE, ruv_negcon=as.character(1:100)) + evaluate=FALSE, run=TRUE, ruv_negcon=as.character(1:100), + return_norm = "in_memory") uq <- EDASeq::betweenLaneNormalization(e, which="upper", round=FALSE) rs <- lapply(1:5, function(i) RUVSeq::RUVg(log1p(uq), as.character(1:100), k=i, round=FALSE, isLog=TRUE)$norm) @@ -22,7 +23,7 @@ test_that("Upper-quartile normalization works the same as in the EDASeq package" qc_mat <- matrix(rnorm(20), nrow=10) res <- scone(e, imputation=identity, scaling=UQ_FN, k_ruv=0, k_qc=2, - evaluate=FALSE, run=TRUE, qc=qc_mat) + evaluate=FALSE, run=TRUE, qc=qc_mat, return_norm = "in_memory") pca_qc <- prcomp(qc_mat, center=TRUE, scale=TRUE) qs <- lapply(1:2, function(i) { diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R index 6743a98..aa36cd2 100644 --- a/tests/testthat/test_scone.R +++ b/tests/testthat/test_scone.R @@ -8,12 +8,12 @@ test_that("Test with no real method (only identity)", { # one combination res <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, - evaluate=FALSE, run=TRUE) + evaluate=FALSE, run=TRUE, return_norm = "in_memory") expect_equal(res$normalized_data[[1]], log1p(e)) # add more imputations res <- scone(e, imputation=list(a=identity, b=identity), scaling=identity, - k_ruv=0, k_qc=0, evaluate=FALSE, run=TRUE) + k_ruv=0, k_qc=0, evaluate=FALSE, run=TRUE, return_norm = "in_memory") expect_equal(res$normalized_data[[1]], log1p(e)) expect_equal(res$normalized_data[[2]], log1p(e)) @@ -159,7 +159,8 @@ test_that("Test imputation and scaling", { adjust_batch="yes", batch=batch, run=TRUE, evaluate=TRUE, eval_negcon=as.character(11:20), eval_poscon=as.character(21:30), - eval_kclust = 2, verbose=TRUE)) + eval_kclust = 2, verbose=FALSE, + return_norm = "in_memory")) expect_equal(rownames(res$metrics), rownames(res$scores)) expect_equal(rownames(res$metrics), rownames(res$params)) @@ -172,7 +173,7 @@ test_that("Test imputation and scaling", { adjust_batch="yes", batch=batch, run=TRUE, evaluate=FALSE, eval_negcon=as.character(11:20), eval_poscon=as.character(21:30), - eval_kclust = 2, verbose=TRUE) + eval_kclust = 2, verbose=FALSE, return_norm = "in_memory") norm_ordered <- res2$normalized_data[names(res$normalized_data)] expect_equal(norm_ordered, res$normalized_data) @@ -187,7 +188,7 @@ test_that("scone works with only one normalization",{ res <- scone(e, imputation=list(none=identity), scaling=list(none=identity), k_ruv=0, k_qc=0, run=TRUE, - evaluate=TRUE, eval_kclust = 2) + evaluate=TRUE, eval_kclust = 2, return_norm = "in_memory") expect_equal(res$normalized_data[[1]], log1p(e)) }) @@ -232,10 +233,12 @@ test_that("if bio=no bio is ignored", { rownames(e) <- as.character(1:nrow(e)) res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, - adjust_bio = "no", bio=gl(2, 5), eval_kclust = 3) + adjust_bio = "no", bio=gl(2, 5), eval_kclust = 3, + return_norm = "in_memory") res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, - adjust_bio = "no", eval_kclust = 3) + adjust_bio = "no", eval_kclust = 3, + return_norm = "in_memory") expect_equal(res1$normalized_data, res2$normalized_data) }) @@ -245,10 +248,12 @@ test_that("if batch=no batch is ignored", { rownames(e) <- as.character(1:nrow(e)) res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, - adjust_batch = "no", batch=gl(2, 5), eval_kclust = 3) + adjust_batch = "no", batch=gl(2, 5), eval_kclust = 3, + return_norm = "in_memory") res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, - adjust_batch = "no", eval_kclust = 3) + adjust_batch = "no", eval_kclust = 3, + return_norm = "in_memory") expect_equal(res1$normalized_data, res2$normalized_data) }) From ec8826f70c5965b2a5b359b3164e18c867d31629 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 7 Jul 2016 18:46:01 -0700 Subject: [PATCH 14/27] mem_used() is very costly; removed --- DESCRIPTION | 4 +--- NAMESPACE | 2 -- R/scone_main.R | 4 +--- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b29f22b..e6b66a2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.5-9000 +Version: 0.0.5-9001 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", @@ -22,14 +22,12 @@ Imports: EDASeq, edgeR, fpc, - gdata, gplots, grDevices, limma, MASS, matrixStats, mixtools, - pryr, rhdf5, RUVSeq Suggests: diff --git a/NAMESPACE b/NAMESPACE index 5531cec..05269de 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -32,7 +32,6 @@ importFrom(cluster,silhouette) importFrom(diptest,dip.test) importFrom(edgeR,calcNormFactors) importFrom(fpc,pamk) -importFrom(gdata,humanReadable) importFrom(grDevices,colorRampPalette) importFrom(graphics,abline) importFrom(graphics,arrows) @@ -46,7 +45,6 @@ importFrom(matrixStats,colIQRs) importFrom(matrixStats,colMedians) importFrom(matrixStats,rowMedians) importFrom(mixtools,normalmixEM) -importFrom(pryr,mem_used) importFrom(rhdf5,h5createFile) importFrom(rhdf5,h5ls) importFrom(rhdf5,h5read) diff --git a/R/scone_main.R b/R/scone_main.R index 2fb007c..6b2f626 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -82,8 +82,6 @@ #' @importFrom RUVSeq RUVg #' @importFrom matrixStats rowMedians #' @import BiocParallel -#' @importFrom pryr mem_used -#' @importFrom gdata humanReadable #' @importFrom graphics abline arrows barplot hist par plot text #' @importFrom stats approx as.formula binomial coefficients contr.sum cor dist #' dnbinom fitted.values glm mad median model.matrix na.omit p.adjust pnorm @@ -404,7 +402,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 score <- NULL } - if(verbose) message(paste0("Processed: ", rownames(params)[i], "\n", "Mem used: ", gdata::humanReadable(pryr::mem_used()))) + if(verbose) message(paste0("Processed: ", rownames(params)[i])) if(return_norm == "in_memory") { return(list(score=score, adjusted=adjusted)) } else { From ac6774ec8595af52e37ccc1322fbf01b95ccf71b Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 14 Jul 2016 16:55:41 -0700 Subject: [PATCH 15/27] Import correct hdf5 functions --- NAMESPACE | 1 + R/scone_main.R | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 05269de..1cf4b58 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,6 +49,7 @@ importFrom(rhdf5,h5createFile) importFrom(rhdf5,h5ls) importFrom(rhdf5,h5read) importFrom(rhdf5,h5write) +importFrom(rhdf5,h5write.default) importFrom(stats,approx) importFrom(stats,as.formula) importFrom(stats,binomial) diff --git a/R/scone_main.R b/R/scone_main.R index 6b2f626..079deaf 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -87,7 +87,7 @@ #' dnbinom fitted.values glm mad median model.matrix na.omit p.adjust pnorm #' prcomp quantile quasibinomial sd #' @importFrom utils capture.output -#' @importFrom rhdf5 h5createFile h5write +#' @importFrom rhdf5 h5createFile h5write.default h5write #' @export #' #' @details If \code{run=FALSE} the normalization and evaluation are not run, From 05319c075dc18773190c817e7329e4fffdff59e3 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 14 Jul 2016 18:06:05 -0700 Subject: [PATCH 16/27] Interactive biplot --- .Rbuildignore | 4 ++- DESCRIPTION | 8 +++-- NAMESPACE | 10 ++++++ R/biplot_interactive.R | 70 +++++++++++++++++++++++++++++++++++++++ inst/NEWS | 7 +++- man/biplot_interactive.Rd | 29 ++++++++++++++++ 6 files changed, 123 insertions(+), 5 deletions(-) create mode 100644 R/biplot_interactive.R create mode 100644 man/biplot_interactive.Rd diff --git a/.Rbuildignore b/.Rbuildignore index 3e3b853..0fdadad 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -33,4 +33,6 @@ README[.]md #---------------------------- # Temp scripts #---------------------------- -^old_scripts/* \ No newline at end of file +^old_scripts/* +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/DESCRIPTION b/DESCRIPTION index b998287..b69a813 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.6-9000 +Version: 0.0.6-9001 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", @@ -8,7 +8,7 @@ Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", role = c("aut"))) Author: Michael Cole [aut, cre, cph], Davide Risso [aut] Maintainer: Michael Cole -Date: 2016-05-12 +Date: 2016-07-14 License: Artistic-2.0 Depends: R (>= 3.3) @@ -29,7 +29,9 @@ Imports: matrixStats, mixtools, grDevices, - boot + boot, + shiny, + miniUI Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 4050ac6..1832316 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(TMM_FN) export(UQ_FN) export(UQ_FN_POS) export(biplot_colored) +export(biplot_interactive) export(estimate_ziber) export(estimate_zinb) export(factor_sample_filter) @@ -45,7 +46,16 @@ importFrom(limma,lmFit) importFrom(matrixStats,colIQRs) importFrom(matrixStats,colMedians) importFrom(matrixStats,rowMedians) +importFrom(miniUI,gadgetTitleBar) +importFrom(miniUI,miniContentPanel) +importFrom(miniUI,miniPage) importFrom(mixtools,normalmixEM) +importFrom(shiny,brushedPoints) +importFrom(shiny,observeEvent) +importFrom(shiny,plotOutput) +importFrom(shiny,renderPlot) +importFrom(shiny,runGadget) +importFrom(shiny,verbatimTextOutput) importFrom(stats,approx) importFrom(stats,as.formula) importFrom(stats,binomial) diff --git a/R/biplot_interactive.R b/R/biplot_interactive.R new file mode 100644 index 0000000..c3a79a0 --- /dev/null +++ b/R/biplot_interactive.R @@ -0,0 +1,70 @@ +#' Interactive biplot +#' +#' This is a wrapper around \code{\link{biplot_colored}}, which creates a shiny +#' gadget to allow the user to select specific points in the graph. +#' +#' @details Since this is based on the shiny gadget feature, it will not work in +#' static documents, such as vignettes or markdown / knitr documents. +#' See \code{biplot_colored} for more details on the internals. +#' +#' @param data a data.frame containing the data to be plotted. +#' @param scores a numeric vector used to color the points. +#' +#' @importFrom miniUI gadgetTitleBar miniContentPanel miniPage gadgetTitleBar +#' @importFrom shiny plotOutput renderPlot observeEvent brushedPoints runGadget verbatimTextOutput +#' +#' @export +#' +#' @examples +#' mat <- matrix(rnorm(1000), ncol=10) +#' colnames(mat) <- paste("X", 1:ncol(mat), sep="") +#' +#' biplot_interactive(mat, mat[,1]) +biplot_interactive <- function(data, scores) { + + data <- as.data.frame(data) + scores <- as.numeric(scores) + + ui <- miniPage( + gadgetTitleBar("Drag to select points"), + miniContentPanel( + # The brush="brush" argument means we can listen for + # brush events on the plot using input$brush. + plotOutput("plot1", height = "80%", brush = "plot_brush"), + verbatimTextOutput("info") + ) + ) + + server <- function(input, output, session) { + + # Compute PCA + pc_obj <- prcomp(data, center = TRUE, scale = FALSE) + bp_obj <- biplot_colored(pc_obj, y = scores) + + # Render the plot + output$plot1 <- renderPlot({ + # Biplot + biplot_colored(pc_obj, y = scores) + }) + + data_out <- cbind(data, bp_obj) + + output$info <- renderText({ + xy_range_str <- function(e) { + if(is.null(e)) return("NULL\n") + idx <- which(bp_obj[,1] >= e$xmin & bp_obj[,1] <= e$xmax & + bp_obj[,2] >= e$ymin & bp_obj[,2] <= e$ymax) + paste0(rownames(data)[idx], collapse = "\n") + } + xy_range_str(input$plot_brush) + }) + + # Handle the Done button being pressed. + observeEvent(input$done, { + # Return the brushed points. See ?shiny::brushedPoints. + stopApp(brushedPoints(data_out, input$plot_brush, xvar="PC1", yvar="PC2")) + }) + } + + runGadget(ui, server) +} diff --git a/inst/NEWS b/inst/NEWS index 475b526..1e8d854 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,10 +1,15 @@ +Changes in version 0.0.7 +======================== + +* New biplot_interactive function to explore the results. + Changes in version 0.0.6 ======================== * Fixed bug when using plot functionality of filtering functions. * "Conditional" pam replaced with "Stratified" pam, clustering each bio-cross-batch condition separately, rather than simply each bio condition. * Simple FNR for filtering is now based on medians of natural log expression in "expressing" cells / robust to convergence issues. -* Removed all sufficient thresholds for metric sample filter. +* Removed all sufficient thresholds for metric sample filter. Changes in version 0.0.5 ======================== diff --git a/man/biplot_interactive.Rd b/man/biplot_interactive.Rd new file mode 100644 index 0000000..ae974d5 --- /dev/null +++ b/man/biplot_interactive.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/biplot_interactive.R +\name{biplot_interactive} +\alias{biplot_interactive} +\title{Interactive biplot} +\usage{ +biplot_interactive(data, scores) +} +\arguments{ +\item{data}{a data.frame containing the data to be plotted.} + +\item{scores}{a numeric vector used to color the points.} +} +\description{ +This is a wrapper around \code{\link{biplot_colored}}, which creates a shiny +gadget to allow the user to select specific points in the graph. +} +\details{ +Since this is based on the shiny gadget feature, it will not work in + static documents, such as vignettes or markdown / knitr documents. + See \code{biplot_colored} for more details on the internals. +} +\examples{ +mat <- matrix(rnorm(1000), ncol=10) +colnames(mat) <- paste("X", 1:ncol(mat), sep="") + +biplot_interactive(mat, mat[,1]) +} + From dbc16ec487280486d026bee192c0887b471d2f0a Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 14 Jul 2016 18:14:16 -0700 Subject: [PATCH 17/27] Travis with codecov --- .travis.yml | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..3d1b594 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,42 @@ +#---------------------------------------------------------------- +# Travis-CI configuration for Bioconductor packages +# +# REFERENCES: +# * Travis CI: https://travis-ci.org/ +# * tutorial: https://docs.travis-ci.com/user/languages/r +# * see also: https://blog.rstudio.org/2016/03/09/r-on-travis-ci/ +# * covr: https://github.com/jimhester/covr +# * Coveralls: https://coveralls.io/ +# +# Validate your .travis.yml file at http://lint.travis-ci.org/ +#---------------------------------------------------------------- +language: r +cache: packages + +# R versions to be tested on +r: + - bioc-release + - bioc-devel + +## Turn this to true before submission to CRAN/Bioconductor +warnings_are_errors: false + +r_build_args: "--no-build-vignettes" +r_check_args: "--no-vignettes" + +notifications: + email: + on_success: change + on_failure: change + +## Code coverage +r_packages: + - covr + +## BiocCheck +bioc_packages: + - BiocCheck + +after_success: + - bash <(curl -s https://codecov.io/bash) + - R CMD BiocCheck . From 0b5c3507231370e80b32b6acd431fa6785d275a9 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Thu, 14 Jul 2016 20:49:37 -0700 Subject: [PATCH 18/27] Do not run interactive example (duh!) --- NAMESPACE | 2 ++ R/biplot_interactive.R | 4 +++- man/biplot_interactive.Rd | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 1832316..1c5199e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -54,7 +54,9 @@ importFrom(shiny,brushedPoints) importFrom(shiny,observeEvent) importFrom(shiny,plotOutput) importFrom(shiny,renderPlot) +importFrom(shiny,renderText) importFrom(shiny,runGadget) +importFrom(shiny,stopApp) importFrom(shiny,verbatimTextOutput) importFrom(stats,approx) importFrom(stats,as.formula) diff --git a/R/biplot_interactive.R b/R/biplot_interactive.R index c3a79a0..8e6b111 100644 --- a/R/biplot_interactive.R +++ b/R/biplot_interactive.R @@ -11,15 +11,17 @@ #' @param scores a numeric vector used to color the points. #' #' @importFrom miniUI gadgetTitleBar miniContentPanel miniPage gadgetTitleBar -#' @importFrom shiny plotOutput renderPlot observeEvent brushedPoints runGadget verbatimTextOutput +#' @importFrom shiny plotOutput renderPlot observeEvent brushedPoints runGadget verbatimTextOutput stopApp renderText #' #' @export #' #' @examples +#' \dontrun{ #' mat <- matrix(rnorm(1000), ncol=10) #' colnames(mat) <- paste("X", 1:ncol(mat), sep="") #' #' biplot_interactive(mat, mat[,1]) +#' } biplot_interactive <- function(data, scores) { data <- as.data.frame(data) diff --git a/man/biplot_interactive.Rd b/man/biplot_interactive.Rd index ae974d5..32b5f78 100644 --- a/man/biplot_interactive.Rd +++ b/man/biplot_interactive.Rd @@ -21,9 +21,11 @@ Since this is based on the shiny gadget feature, it will not work in See \code{biplot_colored} for more details on the internals. } \examples{ +\dontrun{ mat <- matrix(rnorm(1000), ncol=10) colnames(mat) <- paste("X", 1:ncol(mat), sep="") biplot_interactive(mat, mat[,1]) } +} From 17c1042ad54f82fb7324e7bf147a5c6e057ab638 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Fri, 15 Jul 2016 10:37:38 -0700 Subject: [PATCH 19/27] Add par control to interactive biplot --- R/biplot_interactive.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/biplot_interactive.R b/R/biplot_interactive.R index 8e6b111..38c3eda 100644 --- a/R/biplot_interactive.R +++ b/R/biplot_interactive.R @@ -22,7 +22,7 @@ #' #' biplot_interactive(mat, mat[,1]) #' } -biplot_interactive <- function(data, scores) { +biplot_interactive <- function(data, scores, ...) { data <- as.data.frame(data) scores <- as.numeric(scores) @@ -46,7 +46,7 @@ biplot_interactive <- function(data, scores) { # Render the plot output$plot1 <- renderPlot({ # Biplot - biplot_colored(pc_obj, y = scores) + biplot_colored(pc_obj, y = scores, ...) }) data_out <- cbind(data, bp_obj) From bcd05f0b9310b1dd3e0529eeaca85dd50c64c808 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Fri, 15 Jul 2016 13:24:29 -0700 Subject: [PATCH 20/27] Update docs --- man/biplot_interactive.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/biplot_interactive.Rd b/man/biplot_interactive.Rd index 32b5f78..04eb7b8 100644 --- a/man/biplot_interactive.Rd +++ b/man/biplot_interactive.Rd @@ -4,7 +4,7 @@ \alias{biplot_interactive} \title{Interactive biplot} \usage{ -biplot_interactive(data, scores) +biplot_interactive(data, scores, ...) } \arguments{ \item{data}{a data.frame containing the data to be plotted.} From 4918f6ae5f1e62b6e736269f9ef1d9b3ed489c92 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Fri, 15 Jul 2016 14:36:11 -0700 Subject: [PATCH 21/27] Added check for hdf5 and parallel --- R/scone_main.R | 3 ++ tests/testthat/test_hdf5.R | 67 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 tests/testthat/test_hdf5.R diff --git a/R/scone_main.R b/R/scone_main.R index 079deaf..1666c50 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -134,6 +134,9 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 if(missing(hdf5file)) { stop("If `return_norm='hdf5'`, `hdf5file` must be specified.") } else { + if(BiocParallel::bpnworkers(BiocParallel::registered()[[1]]) > 1) { + stop("At the moment, `return_norm='hdf5'` does not support multicores.") + } stopifnot(h5createFile(hdf5file)) h5write(rownames(expr), hdf5file, "genes") h5write(colnames(expr), hdf5file, "samples") diff --git a/tests/testthat/test_hdf5.R b/tests/testthat/test_hdf5.R new file mode 100644 index 0000000..6822211 --- /dev/null +++ b/tests/testthat/test_hdf5.R @@ -0,0 +1,67 @@ +context("Test the different return_norm options") +set.seed(13124323) +BiocParallel::register(bpparam("SerialParam")) + + +test_that("hd5 checks", { + e <- matrix(rpois(1000, lambda = 5), ncol=10) + rownames(e) <- as.character(1:nrow(e)) + + qc_mat <- matrix(rnorm(20), nrow=10) + bio <- gl(2, 5) + batch <- as.factor(rep(1:2, 5)) + + # factorial + expect_error(scone(e, imputation=list(none=identity, zinb=impute_zinb), + scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), + k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat, + adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, + evaluate=FALSE, run=TRUE, return_norm = "hdf5"), + "must be specified") + + BiocParallel::register(bpparam("MulticoreParam")) + + expect_error(scone(e, imputation=list(none=identity, zinb=impute_zinb), + scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), + k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat, + adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, + evaluate=FALSE, run=TRUE, return_norm = "hdf5", hdf5file = "/tmp/tmp"), + "does not support multicores") + + BiocParallel::register(bpparam("SerialParam")) +}) + +test_that("return_norm in memory", { + e <- matrix(rpois(1000, lambda = 5), ncol=10) + rownames(e) <- as.character(1:nrow(e)) + + qc_mat <- matrix(rnorm(20), nrow=10) + bio <- gl(2, 5) + batch <- as.factor(rep(1:2, 5)) + + # factorial + res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), + k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat, + adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, + evaluate=FALSE, run=TRUE, return_norm = "in_memory") + expect_is(res$normalized_data, "list") + expect_true(all(sapply(res$normalized_data, is.matrix))) +}) + +test_that("do not return_norm", { + e <- matrix(rpois(1000, lambda = 5), ncol=10) + rownames(e) <- as.character(1:nrow(e)) + + qc_mat <- matrix(rnorm(20), nrow=10) + bio <- gl(2, 5) + batch <- as.factor(rep(1:2, 5)) + + # factorial + res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), + k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat, + adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, + evaluate=FALSE, run=TRUE, return_norm = "no") + expect_null(res$normalized_data) +}) From 9087b7c57e2b42e3cf2ac38dcac836bf00cbb1a9 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Fri, 15 Jul 2016 14:58:42 -0700 Subject: [PATCH 22/27] Do not compute dist twice --- R/scone_eval.R | 53 +++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/R/scone_eval.R b/R/scone_eval.R index fd421fa..371704a 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -3,13 +3,13 @@ #' This function evaluates an expression matrix using SCONE criteria, producing a number of scores based on #' projections of the normalized data, correlations, and RLE metrics. #' -#' @details The eval_proj function argument must have 2 inputs: +#' @details The eval_proj function argument must have 2 inputs: #' \itemize{ #' \item{e}{ matrix. log-transformed expression (genes in rows, cells in columns).} #' \item{eval_proj_args}{ list. additional function arguments, e.g. prior data weights.} #' } #' and it must output a matrix representation of the original data (cells in rows, factors in columns). -#' +#' #' @param expr matrix. The data matrix (genes in rows, cells in columns). #' @param eval_pcs numeric. The number of principal components to use for evaluation. #' Ignored if !is.null(eval_proj). @@ -52,11 +52,11 @@ #' } #' -score_matrix <- function(expr, eval_pcs = 3, +score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL, eval_proj_args = NULL, eval_kclust = NULL, bio = NULL, batch = NULL, - qc_factors = NULL,uv_factors = NULL, wv_factors = NULL, + qc_factors = NULL,uv_factors = NULL, wv_factors = NULL, is_log=FALSE, stratified_pam = FALSE){ if(any(is.na(expr) | is.infinite(expr) | is.nan(expr))){ @@ -75,33 +75,34 @@ score_matrix <- function(expr, eval_pcs = 3, } ## ------ Bio and Batch Tightness ----- + dd <- as.matrix(dist(proj)) - if( !is.null(bio) ) { - if(!all(is.na(bio))) { - BIO_SIL = summary(cluster::silhouette(as.numeric(na.omit(bio)),dist(proj[!is.na(bio),])))$avg.width - } else { - BIO_SIL = NA - warning("bio is all NA!") - } + if( !is.null(bio) ) { + if(!all(is.na(bio))) { + BIO_SIL = summary(cluster::silhouette(as.numeric(na.omit(bio)),dd[!is.na(bio),]))$avg.width } else { BIO_SIL = NA + warning("bio is all NA!") } + } else { + BIO_SIL = NA + } - if(!is.null(batch)) { - if(!all(is.na(batch))) { - BATCH_SIL <- summary(cluster::silhouette(as.numeric(na.omit(batch)),dist(proj[!is.na(batch),])))$avg.width - } else{ - BATCH_SIL <- NA - warning("batch is all NA!") - } - } else { + if(!is.null(batch)) { + if(!all(is.na(batch))) { + BATCH_SIL <- summary(cluster::silhouette(as.numeric(na.omit(batch)),dd[!is.na(batch),]))$avg.width + } else{ BATCH_SIL <- NA + warning("batch is all NA!") } + } else { + BATCH_SIL <- NA + } ## ------ PAM Tightness ----- if ( !is.null(eval_kclust) ){ - + # "Stratified" PAM if(stratified_pam){ @@ -114,7 +115,7 @@ score_matrix <- function(expr, eval_pcs = 3, cond_w = length(is_cond) if(cond_w > max(eval_kclust)){ pamk_object = pamk(proj[is_cond,],krange = eval_kclust) - + # Despite krange excluding nc = 1, if asw is negative, nc = 1 will be selected if(is.null(pamk_object$pamobject$silinfo$avg.width) ){ if (!1 %in% eval_kclust) { @@ -123,7 +124,7 @@ score_matrix <- function(expr, eval_pcs = 3, stop("nc = 1 was selected by Duda-Hart, exclude 1 from eval_kclust.") } } - + PAM_SIL = PAM_SIL + cond_w*pamk_object$pamobject$silinfo$avg.width }else{ stop(paste("Number of clusters 'k' must be smaller than bio-cross-batch stratum size:", @@ -131,7 +132,7 @@ score_matrix <- function(expr, eval_pcs = 3, } } PAM_SIL = PAM_SIL/length(biobatch) - + # Traditional PAM }else{ pamk_object = pamk(proj,krange = eval_kclust) @@ -170,10 +171,10 @@ score_matrix <- function(expr, eval_pcs = 3, RLE_MED <- mean(colMedians(rle)^2) RLE_IQR <- mean(colIQRs(rle)) - scores = c(BIO_SIL, BATCH_SIL, PAM_SIL, - EXP_QC_COR, EXP_UV_COR, EXP_WV_COR, + scores = c(BIO_SIL, BATCH_SIL, PAM_SIL, + EXP_QC_COR, EXP_UV_COR, EXP_WV_COR, RLE_MED, RLE_IQR) - names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL", + names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL", "EXP_QC_COR", "EXP_UV_COR", "EXP_WV_COR", "RLE_MED", "RLE_IQR") return(scores) From 80cdc51f173d29d76bf2b19cb40613f63f95d5c9 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Fri, 15 Jul 2016 15:06:53 -0700 Subject: [PATCH 23/27] Fix bug introduced in previous commit --- R/scone_eval.R | 6 ++++-- tests/testthat/test_scone.R | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/R/scone_eval.R b/R/scone_eval.R index 371704a..9f654a1 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -79,7 +79,8 @@ score_matrix <- function(expr, eval_pcs = 3, if( !is.null(bio) ) { if(!all(is.na(bio))) { - BIO_SIL = summary(cluster::silhouette(as.numeric(na.omit(bio)),dd[!is.na(bio),]))$avg.width + BIO_SIL = summary(cluster::silhouette(as.numeric(na.omit(bio)), + dd[!is.na(bio), !is.na(bio)]))$avg.width } else { BIO_SIL = NA warning("bio is all NA!") @@ -90,7 +91,8 @@ score_matrix <- function(expr, eval_pcs = 3, if(!is.null(batch)) { if(!all(is.na(batch))) { - BATCH_SIL <- summary(cluster::silhouette(as.numeric(na.omit(batch)),dd[!is.na(batch),]))$avg.width + BATCH_SIL <- summary(cluster::silhouette(as.numeric(na.omit(batch)), + dd[!is.na(batch),!is.na(batch)]))$avg.width } else{ BATCH_SIL <- NA warning("batch is all NA!") diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R index aa36cd2..9ce9857 100644 --- a/tests/testthat/test_scone.R +++ b/tests/testthat/test_scone.R @@ -270,3 +270,25 @@ test_that("batch and bio can be confounded if at least one of adjust_bio or adju adjust_bio = "no", batch=gl(2, 5), bio=gl(2, 5), eval_kclust = 3), "Biological conditions and batches are confounded.") }) + +test_that("batch and bio can contain NA", { + e <- matrix(rpois(10000, lambda = 5), ncol=10) + rownames(e) <- as.character(1:nrow(e)) + + batch <- gl(2, 5) + bio <- gl(5, 2) + + res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE, + adjust_batch = "no", batch=batch, bio=bio, eval_kclust = 3) + + batch[1] <- NA + bio[2] <- NA + + res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE, + adjust_batch = "no", batch=batch, bio=bio, eval_kclust = 3) + + expect_true(!is.na(res2$metrics[,"BIO_SIL"])) + expect_true(!is.na(res2$metrics[,"BATCH_SIL"])) + expect_false(all(res1$metrics[,"BIO_SIL"] == res2$metrics[,"BIO_SIL"])) + expect_false(all(res1$metrics[,"BATCH_SIL"] == res2$metrics[,"BATCH_SIL"])) +}) From 6e63d95cbbb603b75ff62317d20034b776c649fa Mon Sep 17 00:00:00 2001 From: mbcole Date: Mon, 18 Jul 2016 18:57:36 -0700 Subject: [PATCH 24/27] Major imputation modifications --- NAMESPACE | 3 +- R/scone_main.R | 11 +++- R/ziber.R | 101 +++++++++++++++++++----------------- R/zinb.R | 5 +- inst/NEWS | 3 ++ man/estimate_ziber.Rd | 3 +- man/impute_expectation.Rd | 29 +++++++++++ man/impute_null.Rd | 20 +++++++ man/impute_ziber_simp.Rd | 24 --------- man/impute_zinb.Rd | 4 +- man/scone.Rd | 19 ++++--- tests/testthat/test_norm.R | 4 +- tests/testthat/test_scone.R | 66 +++++++++++------------ 13 files changed, 169 insertions(+), 123 deletions(-) create mode 100644 man/impute_expectation.Rd create mode 100644 man/impute_null.Rd delete mode 100644 man/impute_ziber_simp.Rd diff --git a/NAMESPACE b/NAMESPACE index 4050ac6..bd1015c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,7 +12,8 @@ export(biplot_colored) export(estimate_ziber) export(estimate_zinb) export(factor_sample_filter) -export(impute_ziber_simp) +export(impute_expectation) +export(impute_null) export(impute_zinb) export(lm_adjust) export(make_design) diff --git a/R/scone_main.R b/R/scone_main.R index f84f497..1b21bf9 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -12,6 +12,8 @@ #' #' @param expr matrix. The data matrix (genes in rows, cells in columns). #' @param imputation list or function. (A list of) function(s) to be used for imputation. +#' @param impute_args arguments for imputation functions. +#' @param rezero Restore zeroes following scaling step? Default FALSE. #' @param scaling list or function. (A list of) function(s) to be used for scaling normalization. #' @param k_ruv numeric. The maximum number of factors of unwanted variation (the function will adjust for 1 to k_ruv factors of unwanted variation). #' If 0, RUV will not be performed. @@ -73,7 +75,7 @@ #' Scores are computed so that higer-performing methods are assigned a higher scores. #' -scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, +scone <- function(expr, imputation=list(none=impute_null), impute_args = NULL, rezero = FALSE, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"), bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,eval_proj_args = NULL, eval_kclust=2:10, eval_negcon=ruv_negcon, eval_poscon=NULL, @@ -282,7 +284,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 if(verbose) message("Imputation step...") im_params <- unique(params[,1]) - imputed <- lapply(1:length(im_params), function(i) imputation[[i]](expr)) + imputed <- lapply(1:length(im_params), function(i) imputation[[i]](expr,impute_args)) names(imputed) <- im_params # output: a list of imputed matrices @@ -291,6 +293,11 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 sc_params <- unique(params[,1:2]) scaled <- bplapply(1:nrow(sc_params), function(i) scaling[[sc_params[i,2]]](imputed[[sc_params[i,1]]])) + if(rezero){ + if(verbose) message("Re-zero step...") + toz = expr <= 0 + scaled <- lapply(scaled,function(x) x - x*toz) + } names(scaled) <- apply(sc_params, 1, paste, collapse="_") # output: a list of normalized expression matrices diff --git a/R/ziber.R b/R/ziber.R index cff93d3..6e7281d 100644 --- a/R/ziber.R +++ b/R/ziber.R @@ -15,7 +15,9 @@ likfn = function(Z,X,Beta){ #' @param Beta gene-level sample coefficients. #' pzfn = function(Y,W,Alpha,X,Beta){ - return(likfn(Y,W,Alpha)*likfn(1,X,Beta)/(likfn(Y,W,Alpha)*likfn(1,X,Beta) + (Y == 0)*likfn(0,X,Beta))) + matA = likfn(1,X,Beta) + matB = matA*likfn(Y,W,Alpha) + return(matB/(matB + (Y == 0)*(1-matA))) } #' Parameter estimation of zero-inflated bernoulli model @@ -34,7 +36,6 @@ pzfn = function(Y,W,Alpha,X,Beta){ #' @param bulk_model logical. Use median log-expression of gene in detected fraction as sole gene-level feature. Default FALSE. #' Ignored if gfeatM is specified. #' @param pos_controls logical. TRUE for all genes that are known to be expressed in all cells. -#' If specified, then drop-out characteristic is fit only once to control genes. #' @param em_tol numeric. Convergence treshold on log-likelihood. #' @param maxiter numeric. The maximum number of iterations. Default 100. #' @param verbose logical. Whether or not to print the value of the likelihood at each iteration. @@ -53,10 +54,10 @@ pzfn = function(Y,W,Alpha,X,Beta){ #' \item{loglik}{ the log-likelihood} #' \item{convergence}{ 0 if the algorithm converged and 1 if maxiter was reached} #' } + estimate_ziber = function(x, fp_tresh = 0, gfeatM = NULL, bulk_model = FALSE, - pos_controls = NULL, - + pos_controls = NULL, em_tol = 0.01, maxiter = 100, verbose = FALSE){ @@ -85,18 +86,22 @@ estimate_ziber = function(x, fp_tresh = 0, # Initialize Z Z = pzfn(Y,W,Alpha,X,Beta) - # If control genes are supplied, fit W once using control genes - if(!is.null(pos_controls)){ - cY = Y[,pos_controls] - cAlpha = Alpha[,pos_controls] - for (i in 1:dim(Z)[1]){ - W[i,] = glm(cbind(cY[i,],1-cY[i,]) ~ 0 + t(cAlpha),quasibinomial)$coefficients - } - Beta[,pos_controls] = NA - Z[,pos_controls] = 1 + if(is.null(pos_controls)){ + stop("Must supply positive controls genes to fit FNR characteristic") } - EL2 = sum(na.omit(as.vector(Z*log(likfn(Y,W,Alpha))))) + sum(na.omit(as.vector(Z*log(likfn(1,X,Beta))))) + sum(na.omit(as.vector((1-Z)*log(likfn(0,X,Beta))))) + # Fit W once using control genes + + cY = Y[,pos_controls] + cAlpha = Alpha[,pos_controls] + for (i in 1:dim(Z)[1]){ + W[i,] = glm(cbind(cY[i,],1-cY[i,]) ~ 0 + t(cAlpha),quasibinomial)$coefficients + } + Beta[,pos_controls] = NA + Z[,pos_controls] = 1 + + matA = likfn(1,X,Beta) + EL2 = sum(Z*log(likfn(Y,W,Alpha)),na.rm = TRUE) + sum(Z*log( matA ),na.rm = TRUE) + sum((1-Z)*log(1- matA ),na.rm = TRUE) if(verbose){print(EL2)} @@ -107,33 +112,14 @@ estimate_ziber = function(x, fp_tresh = 0, EL1 = EL2 - if(is.null(pos_controls)){ - - # Update Beta - Beta = t(matrix(log(colMeans(Z)/(1-colMeans(Z))))) - - # Update Z - Z = pzfn(Y,W,Alpha,X,Beta) - - # Update W - for (i in 1:dim(Z)[1]){ - W[i,] = glm(cbind(Y[i,],(1-Y[i,])*Z[i,]) ~ 0 + t(Alpha),quasibinomial)$coefficients - } - - # Update Z - Z = pzfn(Y,W,Alpha,X,Beta) - - }else{ - - # Update Beta - Beta[,!pos_controls] = t(matrix(log(colMeans(Z[,!pos_controls])/(1-colMeans(Z[,!pos_controls]))))) - - # Update Z - Z[,!pos_controls] = pzfn(Y[,!pos_controls],W,Alpha[,!pos_controls],X,Beta[,!pos_controls]) - - } + # Update Beta + cmz = colMeans(Z[,!pos_controls]) + Beta[,!pos_controls] = t(matrix(log(cmz) - log(1-cmz))) + # Update Z + Z[,!pos_controls] = pzfn(Y[,!pos_controls],W,Alpha[,!pos_controls],X,Beta[,!pos_controls]) - EL2 = sum(na.omit(as.vector(Z*log(likfn(Y,W,Alpha))))) + sum(na.omit(as.vector(Z*log(likfn(1,X,Beta))))) + sum(na.omit(as.vector((1-Z)*log(likfn(0,X,Beta))))) + matA = likfn(1,X,Beta) + EL2 = sum(Z*log(likfn(Y,W,Alpha)),na.rm = TRUE) + sum(Z*log( matA ),na.rm = TRUE) + sum((1-Z)*log(1- matA ),na.rm = TRUE) if(verbose){print(EL2)} niter = niter + 1 if(niter >= maxiter){ @@ -147,23 +133,40 @@ estimate_ziber = function(x, fp_tresh = 0, expected_state = t(Z), loglik = EL2, convergence = 0)) } -#' Imputation of zero abundance based on zero-inflated bernoulli model +#' Imputation of zero abundance based on general zero-inflated model #' -#' This function is used to impute the data, by weighting the observed detection failure by their -#' probability of coming from the zero-inflation part of the distribution. Median positive expression -#' is used as the sole gene-level determinant of drop-out, and the data is replaced with this expression value. +#' This function is used to impute the data, weighted by probability of +#' data coming from the zero-inflation part of the distribution. #' #' @details The imputation is carried out with the following formula: -#' y_{ij}* = y_{ij} * Pr( No Drop | D_{ij}) + pos_median_{i} * Pr( Drop | D_{ij}). +#' y_{ij}* = y_{ij} * Pr( No Drop | y_{ij}) + mu_{i} * Pr( Drop | y_{ij}). +#' +#' impute_args must contain two elements: +#' 1) p_nodrop = posterior probability of data not having resulted from drop-out (genes in rows, cells in columns) +#' 2) mu = expected expression of dropped data (genes in rows, cells in columns) #' #' @export #' #' @param expression the data matrix (genes in rows, cells in columns) +#' @param impute_args arguments for imputation (see details) #' #' @return the imputed expression matrix. -impute_ziber_simp <- function(expression) { - pars <- estimate_ziber(expression,bulk_model = TRUE) - w <- pars$p_nodrop - imputed <- expression * w + exp(pars$Alpha[1,]) * (1 - w) +#' + +impute_expectation <- function(expression,impute_args) { + imputed <- expression * impute_args$p_nodrop + impute_args$mu * (1 - impute_args$p_nodrop) return(imputed) } + +#' Null or no-op imputation +#' @export +#' +#' @param expression the data matrix (genes in rows, cells in columns) +#' @param impute_args arguments for imputation (not used) +#' +#' @return the imputed expression matrix. +#' + +impute_null <- function(expression,impute_args) { + return(expression) +} diff --git a/R/zinb.R b/R/zinb.R index 1dc7d70..e2953cc 100644 --- a/R/zinb.R +++ b/R/zinb.R @@ -158,9 +158,10 @@ loglik_small <- function(parms, Y, Y1, X, W, kx, kw, offsetx, offsetw, linkobj) #' @export #' #' @param expression the data matrix (genes in rows, cells in columns) -#' +#' @param impute_args arguments for imputation (not used) +#' #' @return the imputed expression matrix. -impute_zinb <- function(expression) { +impute_zinb <- function(expression, impute_args) { pars <- estimate_zinb(expression) w <- pars$p_z imputed <- expression * w + pars$mu * (1 - w) diff --git a/inst/NEWS b/inst/NEWS index 475b526..b4a7a93 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,6 +1,9 @@ Changes in version 0.0.6 ======================== +* Added option for restoring zeroes after scaling step. +* New argument format for imputation via impute_args. +* Simplified ziber fnr estimation function - requires control genes. * Fixed bug when using plot functionality of filtering functions. * "Conditional" pam replaced with "Stratified" pam, clustering each bio-cross-batch condition separately, rather than simply each bio condition. * Simple FNR for filtering is now based on medians of natural log expression in "expressing" cells / robust to convergence issues. diff --git a/man/estimate_ziber.Rd b/man/estimate_ziber.Rd index a339a25..8eea68d 100644 --- a/man/estimate_ziber.Rd +++ b/man/estimate_ziber.Rd @@ -17,8 +17,7 @@ estimate_ziber(x, fp_tresh = 0, gfeatM = NULL, bulk_model = FALSE, \item{bulk_model}{logical. Use median log-expression of gene in detected fraction as sole gene-level feature. Default FALSE. Ignored if gfeatM is specified.} -\item{pos_controls}{logical. TRUE for all genes that are known to be expressed in all cells. -If specified, then drop-out characteristic is fit only once to control genes.} +\item{pos_controls}{logical. TRUE for all genes that are known to be expressed in all cells.} \item{em_tol}{numeric. Convergence treshold on log-likelihood.} diff --git a/man/impute_expectation.Rd b/man/impute_expectation.Rd new file mode 100644 index 0000000..037040f --- /dev/null +++ b/man/impute_expectation.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ziber.R +\name{impute_expectation} +\alias{impute_expectation} +\title{Imputation of zero abundance based on general zero-inflated model} +\usage{ +impute_expectation(expression, impute_args) +} +\arguments{ +\item{expression}{the data matrix (genes in rows, cells in columns)} + +\item{impute_args}{arguments for imputation (see details)} +} +\value{ +the imputed expression matrix. +} +\description{ +This function is used to impute the data, weighted by probability of +data coming from the zero-inflation part of the distribution. +} +\details{ +The imputation is carried out with the following formula: +y_{ij}* = y_{ij} * Pr( No Drop | y_{ij}) + mu_{i} * Pr( Drop | y_{ij}). + +impute_args must contain two elements: +1) p_nodrop = posterior probability of data not having resulted from drop-out (genes in rows, cells in columns) +2) mu = expected expression of dropped data (genes in rows, cells in columns) +} + diff --git a/man/impute_null.Rd b/man/impute_null.Rd new file mode 100644 index 0000000..8711d49 --- /dev/null +++ b/man/impute_null.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ziber.R +\name{impute_null} +\alias{impute_null} +\title{Null or no-op imputation} +\usage{ +impute_null(expression, impute_args) +} +\arguments{ +\item{expression}{the data matrix (genes in rows, cells in columns)} + +\item{impute_args}{arguments for imputation (not used)} +} +\value{ +the imputed expression matrix. +} +\description{ +Null or no-op imputation +} + diff --git a/man/impute_ziber_simp.Rd b/man/impute_ziber_simp.Rd deleted file mode 100644 index 20ddfe6..0000000 --- a/man/impute_ziber_simp.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ziber.R -\name{impute_ziber_simp} -\alias{impute_ziber_simp} -\title{Imputation of zero abundance based on zero-inflated bernoulli model} -\usage{ -impute_ziber_simp(expression) -} -\arguments{ -\item{expression}{the data matrix (genes in rows, cells in columns)} -} -\value{ -the imputed expression matrix. -} -\description{ -This function is used to impute the data, by weighting the observed detection failure by their -probability of coming from the zero-inflation part of the distribution. Median positive expression -is used as the sole gene-level determinant of drop-out, and the data is replaced with this expression value. -} -\details{ -The imputation is carried out with the following formula: -y_{ij}* = y_{ij} * Pr( No Drop | D_{ij}) + pos_median_{i} * Pr( Drop | D_{ij}). -} - diff --git a/man/impute_zinb.Rd b/man/impute_zinb.Rd index 7fd45c8..64a9b6a 100644 --- a/man/impute_zinb.Rd +++ b/man/impute_zinb.Rd @@ -4,10 +4,12 @@ \alias{impute_zinb} \title{Imputation of zero counts based on zero-inflated negative binomial model} \usage{ -impute_zinb(expression) +impute_zinb(expression, impute_args) } \arguments{ \item{expression}{the data matrix (genes in rows, cells in columns)} + +\item{impute_args}{arguments for imputation (not used)} } \value{ the imputed expression matrix. diff --git a/man/scone.Rd b/man/scone.Rd index 20e59fc..d922df6 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -4,19 +4,24 @@ \alias{scone} \title{scone main wrapper: function to apply and evaluate all the normalization schemes} \usage{ -scone(expr, imputation = list(none = identity), scaling, k_ruv = 5, - k_qc = 5, ruv_negcon = NULL, qc = NULL, adjust_bio = c("no", "yes", - "force"), adjust_batch = c("no", "yes", "force"), bio = NULL, - batch = NULL, run = TRUE, evaluate = TRUE, eval_pcs = 3, - eval_proj = NULL, eval_proj_args = NULL, eval_kclust = 2:10, - eval_negcon = ruv_negcon, eval_poscon = NULL, params = NULL, - verbose = FALSE, stratified_pam = FALSE) +scone(expr, imputation = list(none = impute_null), impute_args = NULL, + rezero = FALSE, scaling, k_ruv = 5, k_qc = 5, ruv_negcon = NULL, + qc = NULL, adjust_bio = c("no", "yes", "force"), adjust_batch = c("no", + "yes", "force"), bio = NULL, batch = NULL, run = TRUE, + evaluate = TRUE, eval_pcs = 3, eval_proj = NULL, + eval_proj_args = NULL, eval_kclust = 2:10, eval_negcon = ruv_negcon, + eval_poscon = NULL, params = NULL, verbose = FALSE, + stratified_pam = FALSE) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} \item{imputation}{list or function. (A list of) function(s) to be used for imputation.} +\item{impute_args}{arguments for imputation functions.} + +\item{rezero}{Restore zeroes following scaling step? Default FALSE.} + \item{scaling}{list or function. (A list of) function(s) to be used for scaling normalization.} \item{k_ruv}{numeric. The maximum number of factors of unwanted variation (the function will adjust for 1 to k_ruv factors of unwanted variation). diff --git a/tests/testthat/test_norm.R b/tests/testthat/test_norm.R index 368be51..2f22bcc 100644 --- a/tests/testthat/test_norm.R +++ b/tests/testthat/test_norm.R @@ -7,7 +7,7 @@ test_that("Upper-quartile normalization works the same as in the EDASeq package" rownames(e) <- as.character(1:nrow(e)) # UQ + RUV - res <- scone(e, imputation=identity, scaling=UQ_FN, k_ruv=5, k_qc=0, + res <- scone(e, imputation=impute_null, scaling=UQ_FN, k_ruv=5, k_qc=0, evaluate=FALSE, run=TRUE, ruv_negcon=as.character(1:100)) uq <- EDASeq::betweenLaneNormalization(e, which="upper", round=FALSE) @@ -21,7 +21,7 @@ test_that("Upper-quartile normalization works the same as in the EDASeq package" # UQ + QC qc_mat <- matrix(rnorm(20), nrow=10) - res <- scone(e, imputation=identity, scaling=UQ_FN, k_ruv=0, k_qc=2, + res <- scone(e, imputation=impute_null, scaling=UQ_FN, k_ruv=0, k_qc=2, evaluate=FALSE, run=TRUE, qc=qc_mat) pca_qc <- prcomp(qc_mat, center=TRUE, scale=TRUE) diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R index 6743a98..38a4a0d 100644 --- a/tests/testthat/test_scone.R +++ b/tests/testthat/test_scone.R @@ -7,33 +7,33 @@ test_that("Test with no real method (only identity)", { rownames(e) <- as.character(1:nrow(e)) # one combination - res <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + res <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0, evaluate=FALSE, run=TRUE) expect_equal(res$normalized_data[[1]], log1p(e)) # add more imputations - res <- scone(e, imputation=list(a=identity, b=identity), scaling=identity, + res <- scone(e, imputation=list(a=impute_null, b=impute_null), scaling=identity, k_ruv=0, k_qc=0, evaluate=FALSE, run=TRUE) expect_equal(res$normalized_data[[1]], log1p(e)) expect_equal(res$normalized_data[[2]], log1p(e)) # add more scaling - res <- scone(e, imputation=list(a=identity, b=identity), + res <- scone(e, imputation=list(a=impute_null, b=impute_null), scaling=list(a=identity, b=identity, c=identity), k_ruv=0, k_qc=0, evaluate=FALSE, run=TRUE) # add ruv (the first two should not work) - expect_error(scone(e, imputation=list(identity, identity), + expect_error(scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, k_qc=0, evaluate=FALSE, run=FALSE), "ruv_negcon must be specified") - expect_error(scone(e, imputation=list(identity, identity), + expect_error(scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=1:100, k_qc=0, evaluate=FALSE, run=FALSE), "must be a character vector") - params <- scone(e, imputation=list(identity, identity), + params <- scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=0, evaluate=FALSE, run=FALSE) @@ -41,17 +41,17 @@ test_that("Test with no real method (only identity)", { # add qc (the first two should not work) qc_mat <- matrix(rnorm(20), nrow=10) - expect_error(scone(e, imputation=list(identity, identity), + expect_error(scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=5, evaluate=FALSE, run=FALSE), "qc must be specified") - expect_error(scone(e, imputation=list(identity, identity), + expect_error(scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=5, qc=qc_mat, evaluate=FALSE, run=FALSE), "k_qc must be less or equal than the number of columns") - res <- scone(e, imputation=list(identity, identity), + res <- scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat, evaluate=FALSE, run=TRUE) @@ -59,24 +59,24 @@ test_that("Test with no real method (only identity)", { # add bio (the first two should not work) bio <- rep(1:2, each=5) - expect_error(scone(e, imputation=list(identity, identity), + expect_error(scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat, adjust_bio="yes", evaluate=FALSE, run=FALSE), "if adjust_bio is 'yes' or 'force', 'bio' must be specified") - expect_error(scone(e, imputation=list(identity, identity), + expect_error(scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat, adjust_bio="yes", bio=bio, evaluate=FALSE, run=FALSE), "'bio' must be a factor") bio <- as.factor(bio) - res <- scone(e, imputation=list(identity, identity), + res <- scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat, adjust_bio="yes", bio=bio, evaluate=FALSE, run=TRUE) - res <- scone(e, imputation=list(identity, identity), + res <- scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat, adjust_bio="force", bio=bio, evaluate=FALSE, run=TRUE) @@ -84,14 +84,14 @@ test_that("Test with no real method (only identity)", { # add batch (the first three should not work) batch <- rep(1:2, each=5) - expect_error(scone(e, imputation=list(identity, identity), + expect_error(scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", evaluate=FALSE, run=FALSE), "if adjust_batch is 'yes' or 'force', 'batch' must be specified") - expect_error(scone(e, imputation=list(identity, identity), + expect_error(scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", @@ -99,7 +99,7 @@ test_that("Test with no real method (only identity)", { "'batch' must be a factor") batch <- as.factor(batch) - expect_error(scone(e, imputation=list(identity, identity), + expect_error(scone(e, imputation=list(impute_null,impute_null), scaling=list(identity, identity, identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", @@ -107,14 +107,14 @@ test_that("Test with no real method (only identity)", { "Biological conditions and batches are confounded") batch <- as.factor(rep(1:2, 5)) - res <- scone(e, imputation=list(a=identity, b=identity), + res <- scone(e, imputation=list(a=impute_null, b=impute_null), scaling=list(a=identity, b=identity, c=identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, evaluate=FALSE, run=TRUE) ## add evaluation - res <- scone(e, imputation=list(a=identity, b=identity), + res <- scone(e, imputation=list(a=impute_null, b=impute_null), scaling=list(a=identity, b=identity, c=identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, @@ -131,7 +131,7 @@ test_that("Test imputation and scaling", { batch <- as.factor(rep(1:2, 5)) # factorial - res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, @@ -139,20 +139,20 @@ test_that("Test imputation and scaling", { # nested batch <- as.factor(c(1, 2, 1, 2, 1, 3, 4, 3, 4, 3)) - params <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + params <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=3, k_qc=2, ruv_negcon=as.character(1:10), qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, evaluate=FALSE, run=FALSE) params <- params[-(1:5),] - res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=3, k_qc=2, ruv_negcon=as.character(1:10), qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, evaluate=FALSE, run=TRUE, params=params) # evaluation - system.time(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + system.time(res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=5, k_qc=2, ruv_negcon=as.character(1:10), qc=qc_mat, adjust_bio="yes", bio=bio, @@ -165,7 +165,7 @@ test_that("Test imputation and scaling", { expect_equal(rownames(res$metrics), rownames(res$params)) expect_equal(rownames(res$metrics), names(res$normalized_data)) - res2 <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + res2 <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=5, k_qc=2, ruv_negcon=as.character(1:10), qc=qc_mat, adjust_bio="yes", bio=bio, @@ -184,7 +184,7 @@ test_that("scone works with only one normalization",{ e <- matrix(rpois(1000, lambda = 5), ncol=10) rownames(e) <- as.character(1:nrow(e)) - res <- scone(e, imputation=list(none=identity), + res <- scone(e, imputation=list(none=impute_null), scaling=list(none=identity), k_ruv=0, k_qc=0, run=TRUE, evaluate=TRUE, eval_kclust = 2) @@ -200,14 +200,14 @@ test_that("conditional PAM",{ bio <- gl(2, 5) batch <- gl(5, 2) - res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=0, k_qc=0, adjust_bio="yes", bio=bio, run=FALSE, evaluate=TRUE, eval_negcon=as.character(11:20), eval_poscon=as.character(21:30), eval_kclust = 2, stratified_pam = TRUE) - expect_error(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + expect_error(res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=0, k_qc=0, adjust_bio="yes", bio=bio, adjust_batch="yes", batch=batch, run=FALSE, @@ -216,7 +216,7 @@ test_that("conditional PAM",{ eval_kclust = 6, stratified_pam = TRUE), "For stratified_pam, max 'eval_kclust' must be smaller than bio-cross-batch stratum size") - expect_error(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + expect_error(res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=0, k_qc=0, run = FALSE, evaluate = TRUE, eval_negcon=as.character(11:20), @@ -231,10 +231,10 @@ test_that("if bio=no bio is ignored", { e <- matrix(rpois(10000, lambda = 5), ncol=10) rownames(e) <- as.character(1:nrow(e)) - res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + res1 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0, adjust_bio = "no", bio=gl(2, 5), eval_kclust = 3) - res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + res2 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0, adjust_bio = "no", eval_kclust = 3) expect_equal(res1$normalized_data, res2$normalized_data) @@ -244,10 +244,10 @@ test_that("if batch=no batch is ignored", { e <- matrix(rpois(10000, lambda = 5), ncol=10) rownames(e) <- as.character(1:nrow(e)) - res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + res1 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0, adjust_batch = "no", batch=gl(2, 5), eval_kclust = 3) - res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + res2 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0, adjust_batch = "no", eval_kclust = 3) expect_equal(res1$normalized_data, res2$normalized_data) @@ -257,11 +257,11 @@ test_that("batch and bio can be confounded if at least one of adjust_bio or adju e <- matrix(rpois(10000, lambda = 5), ncol=10) rownames(e) <- as.character(1:nrow(e)) - expect_warning(scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + expect_warning(scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0, adjust_batch = "no", batch=gl(2, 5), bio=gl(2, 5), eval_kclust = 3), "Biological conditions and batches are confounded.") - expect_warning(scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + expect_warning(scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0, adjust_bio = "no", batch=gl(2, 5), bio=gl(2, 5), eval_kclust = 3), "Biological conditions and batches are confounded.") }) From 412cd2d21ae18296907fe91d191240989146f0f9 Mon Sep 17 00:00:00 2001 From: mbcole Date: Tue, 19 Jul 2016 11:14:49 -0700 Subject: [PATCH 25/27] Fixed test cases and documentation --- NAMESPACE | 2 +- man/scone.Rd | 17 +++++++++-------- man/score_matrix.Rd | 2 +- tests/testthat/test_hdf5.R | 8 ++++---- tests/testthat/test_scone.R | 4 ++-- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 0e76c26..edfffde 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,9 +13,9 @@ export(biplot_interactive) export(estimate_ziber) export(estimate_zinb) export(factor_sample_filter) +export(get_normalized) export(impute_expectation) export(impute_null) -export(get_normalized) export(impute_zinb) export(lm_adjust) export(make_design) diff --git a/man/scone.Rd b/man/scone.Rd index eb42fdb..e891376 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -5,14 +5,15 @@ \title{scone main wrapper: function to apply and evaluate all the normalization schemes} \usage{ -scone(expr, imputation = list(none = impute_null), impute_args = NULL, rezero = FALSE, scaling, k_ruv = 5, - k_qc = 5, ruv_negcon = NULL, qc = NULL, adjust_bio = c("no", "yes", - "force"), adjust_batch = c("no", "yes", "force"), bio = NULL, - batch = NULL, run = TRUE, evaluate = TRUE, eval_pcs = 3, - eval_proj = NULL, eval_proj_args = NULL, eval_kclust = 2:10, - eval_negcon = ruv_negcon, eval_poscon = NULL, params = NULL, - verbose = FALSE, stratified_pam = FALSE, return_norm = c("no", - "in_memory", "hdf5"), hdf5file) +scone(expr, imputation = list(none = impute_null), impute_args = NULL, + rezero = FALSE, scaling, k_ruv = 5, k_qc = 5, ruv_negcon = NULL, + qc = NULL, adjust_bio = c("no", "yes", "force"), adjust_batch = c("no", + "yes", "force"), bio = NULL, batch = NULL, run = TRUE, + evaluate = TRUE, eval_pcs = 3, eval_proj = NULL, + eval_proj_args = NULL, eval_kclust = 2:10, eval_negcon = ruv_negcon, + eval_poscon = NULL, params = NULL, verbose = FALSE, + stratified_pam = FALSE, return_norm = c("no", "in_memory", "hdf5"), + hdf5file) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} diff --git a/man/score_matrix.Rd b/man/score_matrix.Rd index 945bf70..99df0dc 100644 --- a/man/score_matrix.Rd +++ b/man/score_matrix.Rd @@ -61,7 +61,7 @@ This function evaluates an expression matrix using SCONE criteria, producing a n projections of the normalized data, correlations, and RLE metrics. } \details{ -The eval_proj function argument must have 2 inputs: +The eval_proj function argument must have 2 inputs: \itemize{ \item{e}{ matrix. log-transformed expression (genes in rows, cells in columns).} \item{eval_proj_args}{ list. additional function arguments, e.g. prior data weights.} diff --git a/tests/testthat/test_hdf5.R b/tests/testthat/test_hdf5.R index 6822211..2e9811e 100644 --- a/tests/testthat/test_hdf5.R +++ b/tests/testthat/test_hdf5.R @@ -12,7 +12,7 @@ test_that("hd5 checks", { batch <- as.factor(rep(1:2, 5)) # factorial - expect_error(scone(e, imputation=list(none=identity, zinb=impute_zinb), + expect_error(scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, @@ -21,7 +21,7 @@ test_that("hd5 checks", { BiocParallel::register(bpparam("MulticoreParam")) - expect_error(scone(e, imputation=list(none=identity, zinb=impute_zinb), + expect_error(scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, @@ -40,7 +40,7 @@ test_that("return_norm in memory", { batch <- as.factor(rep(1:2, 5)) # factorial - res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, @@ -58,7 +58,7 @@ test_that("do not return_norm", { batch <- as.factor(rep(1:2, 5)) # factorial - res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), + res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R index f5d27a3..5a763f7 100644 --- a/tests/testthat/test_scone.R +++ b/tests/testthat/test_scone.R @@ -278,13 +278,13 @@ test_that("batch and bio can contain NA", { batch <- gl(2, 5) bio <- gl(5, 2) - res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE, + res1 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE, adjust_batch = "no", batch=batch, bio=bio, eval_kclust = 3) batch[1] <- NA bio[2] <- NA - res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE, + res2 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE, adjust_batch = "no", batch=batch, bio=bio, eval_kclust = 3) expect_true(!is.na(res2$metrics[,"BIO_SIL"])) From fe0ffcb3ff1b159809af5fa632a54b16659e7d03 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Fri, 22 Jul 2016 12:01:26 -0700 Subject: [PATCH 26/27] Minor --- R/scone_main.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/scone_main.R b/R/scone_main.R index 9af6823..ccca798 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -121,7 +121,8 @@ #' written to the \code{hdf5file} file. This must be a string specifying (a #' path to) a new file. If the file already exists, it will return error. #' -scone <- function(expr, imputation=list(none=impute_null), impute_args = NULL, rezero = FALSE, scaling, k_ruv=5, k_qc=5, +scone <- function(expr, imputation=list(none=impute_null), impute_args = NULL, + rezero = FALSE, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"), bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL, From 1ca23e3a1f9e000d2ce6ea51bb8923f5d9fc9c07 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Fri, 22 Jul 2016 12:04:27 -0700 Subject: [PATCH 27/27] Version bumped --- DESCRIPTION | 4 ++-- inst/NEWS | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c53d27c..b1acdfe 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.5-9002 +Version: 0.0.6 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", @@ -8,7 +8,7 @@ Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", role = c("aut", "cph"))) Author: Michael Cole [aut, cre, cph], Davide Risso [aut, cph] Maintainer: Michael Cole -Date: 2016-07-14 +Date: 2016-07-22 License: Artistic-2.0 Depends: R (>= 3.3) diff --git a/inst/NEWS b/inst/NEWS index 168c21f..14013d5 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,5 +1,4 @@ - -Changes in version 0.0.6 (2016-07-07) +Changes in version 0.0.6 (2016-07-22) ======================== * Added option for restoring zeroes after scaling step.