From 5201642e8e3dbc8a633a79a2302330691c09c286 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Mon, 2 May 2016 18:16:40 -0700 Subject: [PATCH 01/18] Changed version in develop branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3e74bdb..5fa5ff9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.3 +Version: 0.0.3-9000 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", From 2ea687f2ea90207d243c5ab307bb2e6116675f69 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Tue, 3 May 2016 14:29:45 -0700 Subject: [PATCH 02/18] Fix #32 --- R/biplot.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/biplot.R b/R/biplot.R index 2667854..cc40378 100644 --- a/R/biplot.R +++ b/R/biplot.R @@ -46,4 +46,6 @@ biplot_colored <- function(x, y, choices=1:2, expand=1, ...) { text(yy/ratio, labels=labs, col=2) arrows(0, 0, yy[, 1] * 0.8/ratio, yy[, 2] * 0.8/ratio, col = 2, length = 0.1) + + invisible(xx) } From 6825b06edad41ae00f9fc466cbdb544d4c1d2506 Mon Sep 17 00:00:00 2001 From: mbcole Date: Fri, 6 May 2016 12:35:01 -0700 Subject: [PATCH 03/18] Added preserved variance score --- R/scone_eval.R | 23 +++++++++++++++++++---- R/scone_main.R | 6 +++--- man/scone.Rd | 2 +- man/score_matrix.Rd | 6 +++++- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/R/scone_eval.R b/R/scone_eval.R index 232c549..fe467e7 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -35,7 +35,9 @@ #' @param is_log logical. If TRUE the expr matrix is already logged and log transformation will not be carried out. #' @param conditional_pam logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), #' and a weighted average is returned. -#' +#' @param ref_expr matrix. A reference (log-) expression data matrix for calculating preserved variance (genes in rows, cells in columns). +#' If NULL, preserved variance is returned NA. +#' #' @importFrom scde bwpca #' @importFrom class knn #' @importFrom fpc pamk @@ -53,6 +55,7 @@ #' \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.} #' \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.} #' \item{PAM_COMPACT}{ Compactness measure of sub-sampled (pam) co-clustering matrix's "block-diagonal-ness". Approximate isoperimetric quotient of non-clustering region.} +#' \item{VAR_PRES}{ Variance preserved measure.} #' } #' @@ -62,7 +65,8 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, bio = NULL, batch = NULL, qc_factors = NULL, ruv_factors = NULL, uv_factors = NULL, - wv_factors = NULL, is_log=FALSE, conditional_pam = FALSE ){ + wv_factors = NULL, is_log=FALSE, + conditional_pam = FALSE , cv_genes = NULL, ref_expr = NULL){ if(any(is.na(expr) | is.infinite(expr) | is.nan(expr))){ stop("NA/Inf/NaN Expression Values.") @@ -208,9 +212,20 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, }else{ EXP_WV_COR = NA } + + # VAR_PRES + if(!is.null(ref_expr)){ + z1 = scale(t(ref_expr),scale = FALSE) + z1 = t(t(z1)/sqrt(colSums(z1^2))) + z2 = scale(t(expr),scale = FALSE) + z2 = t(t(z2)/sqrt(colSums(z2^2))) + VAR_PRES = mean(diag(t(z1) %*% (z2))) + }else{ + VAR_PRES = NA + } - scores = c(KNN_BIO, KNN_BATCH, PAM_SIL, EXP_QC_COR, EXP_RUV_COR, EXP_UV_COR, EXP_WV_COR , PAM_COMPACT) - names(scores) = c("KNN_BIO", "KNN_BATCH", "PAM_SIL", "EXP_QC_COR", "EXP_RUV_COR", "EXP_UV_COR", "EXP_WV_COR", "PAM_COMPACT") + scores = c(KNN_BIO, KNN_BATCH, PAM_SIL, EXP_QC_COR, EXP_RUV_COR, EXP_UV_COR, EXP_WV_COR , PAM_COMPACT, VAR_PRES) + names(scores) = c("KNN_BIO", "KNN_BATCH", "PAM_SIL", "EXP_QC_COR", "EXP_RUV_COR", "EXP_UV_COR", "EXP_WV_COR", "PAM_COMPACT","VAR_PRES") return(scores) } diff --git a/R/scone_main.R b/R/scone_main.R index 58f26b6..573acc6 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -66,7 +66,7 @@ #' with each row corresponding to a set of normalization parameters. #' #' @details Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to rank-score: -#' Positive-signature metrics increase with improving performance, including KNN_BIO,PAM_SIL, EXP_WV_COR, PAM_COMPACT. +#' Positive-signature metrics increase with improving performance, including KNN_BIO,PAM_SIL, EXP_WV_COR, PAM_COMPACT, and VAR_PRES. #' Negative-signature metrics decrease with improving performance, including KNN_BATCH, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. #' Rank-scores are computed so that higer-performing methods are assigned a lower-rank. #' @@ -334,7 +334,7 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, eval_kclust = eval_kclust, bio = bio, batch = batch, qc_factors = qc_pcs, ruv_factors = ruv_factors_raw, uv_factors = uv_factors, wv_factors = wv_factors, - is_log = TRUE, conditional_pam = conditional_pam) + is_log = TRUE, conditional_pam = conditional_pam,ref_expr = log1p(expr)) } else { score <- NULL } @@ -349,7 +349,7 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, names(evaluation) <- apply(params, 1, paste, collapse=',') evaluation <- simplify2array(evaluation) - ev_for_ranks <- evaluation * c(-1, 1, -1, 1, 1, 1, -1, -1) + ev_for_ranks <- evaluation * c(-1, 1, -1, 1, 1, 1, -1, -1,-1) ranks <- apply(ev_for_ranks[apply(evaluation, 1, function(x) !all(is.na(x))),, drop=FALSE], 1, rank) if(NCOL(ranks) > 1) { med_rank <- rowMedians(ranks) diff --git a/man/scone.Rd b/man/scone.Rd index 62c447d..c7281da 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -100,7 +100,7 @@ If both \code{run=FALSE} the normalization and evaluation are not run, but the f Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to rank-score: -Positive-signature metrics increase with improving performance, including KNN_BIO,PAM_SIL, EXP_WV_COR, PAM_COMPACT. +Positive-signature metrics increase with improving performance, including KNN_BIO,PAM_SIL, EXP_WV_COR, PAM_COMPACT, and VAR_PRES. Negative-signature metrics decrease with improving performance, including KNN_BATCH, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. Rank-scores are computed so that higer-performing methods are assigned a lower-rank. } diff --git a/man/score_matrix.Rd b/man/score_matrix.Rd index d39e4d8..bd97277 100644 --- a/man/score_matrix.Rd +++ b/man/score_matrix.Rd @@ -8,7 +8,7 @@ score_matrix(expr, eval_pcs = 3, proj = NULL, weights = NULL, seed = 1, em.maxiter = 100, eval_knn = NULL, eval_kclust = NULL, bio = NULL, batch = NULL, qc_factors = NULL, ruv_factors = NULL, uv_factors = NULL, wv_factors = NULL, is_log = FALSE, - conditional_pam = FALSE) + conditional_pam = FALSE, cv_genes = NULL, ref_expr = NULL) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} @@ -56,6 +56,9 @@ If NULL, wv correlations will be returned NA.} \item{conditional_pam}{logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), and a weighted average is returned.} + +\item{ref_expr}{matrix. A reference (log-) expression data matrix for calculating preserved variance (genes in rows, cells in columns). +If NULL, preserved variance is returned NA.} } \value{ A list with the following elements: @@ -68,6 +71,7 @@ A list with the following elements: \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.} \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.} \item{PAM_COMPACT}{ Compactness measure of sub-sampled (pam) co-clustering matrix's "block-diagonal-ness". Approximate isoperimetric quotient of non-clustering region.} +\item{VAR_PRES}{ Variance preserved measure.} } } \description{ From 8c799540b7db3afa41dbb98bbe36e299402cd3f6 Mon Sep 17 00:00:00 2001 From: mbcole Date: Fri, 6 May 2016 12:44:27 -0700 Subject: [PATCH 04/18] Lingering arg in score_matrix. --- R/scone_eval.R | 2 +- man/score_matrix.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/scone_eval.R b/R/scone_eval.R index fe467e7..722e569 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -66,7 +66,7 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, qc_factors = NULL, ruv_factors = NULL, uv_factors = NULL, wv_factors = NULL, is_log=FALSE, - conditional_pam = FALSE , cv_genes = NULL, ref_expr = NULL){ + conditional_pam = FALSE , ref_expr = NULL){ if(any(is.na(expr) | is.infinite(expr) | is.nan(expr))){ stop("NA/Inf/NaN Expression Values.") diff --git a/man/score_matrix.Rd b/man/score_matrix.Rd index bd97277..3812124 100644 --- a/man/score_matrix.Rd +++ b/man/score_matrix.Rd @@ -8,7 +8,7 @@ score_matrix(expr, eval_pcs = 3, proj = NULL, weights = NULL, seed = 1, em.maxiter = 100, eval_knn = NULL, eval_kclust = NULL, bio = NULL, batch = NULL, qc_factors = NULL, ruv_factors = NULL, uv_factors = NULL, wv_factors = NULL, is_log = FALSE, - conditional_pam = FALSE, cv_genes = NULL, ref_expr = NULL) + conditional_pam = FALSE, ref_expr = NULL) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} From 61b47fa65155d59b034938b873420ed0990b3448 Mon Sep 17 00:00:00 2001 From: mbcole Date: Fri, 6 May 2016 15:14:14 -0700 Subject: [PATCH 05/18] Replaced BIO and BATCH KNN with SIL and replaced compactness with PAM SIL score. --- NAMESPACE | 1 + R/scone_eval.R | 68 +++++++++++++++------------------------------ R/scone_main.R | 15 ++++------ man/scone.Rd | 10 +++---- man/score_matrix.Rd | 23 +++++++-------- 5 files changed, 43 insertions(+), 74 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 5e6b813..87b4ef8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,6 +26,7 @@ importFrom(MASS,glm.nb) importFrom(RUVSeq,RUVg) importFrom(aroma.light,normalizeQuantileRank.matrix) importFrom(class,knn) +importFrom(cluster,silhouette) importFrom(clusterCells,subsampleClustering) importFrom(diptest,dip.test) importFrom(edgeR,calcNormFactors) diff --git a/R/scone_eval.R b/R/scone_eval.R index 722e569..2241f3f 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -16,14 +16,12 @@ #' Ignored if is.null(weights) or !is.null(proj). #' @param em.maxiter numeric. Maximum EM iterations, passed to bwpca. #' Ignored if is.null(weights) or !is.null(proj). -#' @param eval_knn numeric. The number of nearest neighbors to use for evaluation. -#' If NULL, all KNN concordances will be returned NA. #' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. -#' If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering compactness (stability) will be reported. If NULL, tightness and stability will be returned NA. +#' If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA. #' @param bio factor. The biological condition (variation to be preserved), NA is allowed. -#' If NULL, condition KNN concordance will be returned NA. +#' If NULL, condition ASW will be returned NA. #' @param batch factor. The known batch variable (variation to be removed), NA is allowed. -#' If NULL, batch KNN concordance will be returned NA. +#' If NULL, batch ASW will be returned NA. #' @param qc_factors Factors of unwanted variation derived from quality metrics. #' If NULL, qc correlations will be returned NA. #' @param ruv_factors Factors of unwanted variation derived from negative control genes (adjustable set). @@ -42,26 +40,27 @@ #' @importFrom class knn #' @importFrom fpc pamk #' @importFrom clusterCells subsampleClustering +#' @importFrom cluster silhouette #' #' @export #' #' @return A list with the following elements: #' \itemize{ -#' \item{KNN_BIO}{ K-NN concordance rate by biological condition.} -#' \item{KNN_BATCH}{ K-NN concordance rate by batch condition.} +#' \item{BIO_SIL}{ Average silhoutte width by biological condition.} +#' \item{BATCH_SIL}{ Average silhoutte width by batch condition.} #' \item{PAM_SIL}{ Maximum average silhoutte width from pam clustering.} #' \item{EXP_QC_COR}{ Maximum squared spearman correlation between pcs and quality factors.} #' \item{EXP_RUV_COR}{ Maximum squared spearman correlation between pcs and active uv factors.} #' \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.} #' \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.} -#' \item{PAM_COMPACT}{ Compactness measure of sub-sampled (pam) co-clustering matrix's "block-diagonal-ness". Approximate isoperimetric quotient of non-clustering region.} +#' \item{PAM_STAB}{ Maximum average silhoutte width from pam clustering of sub-sampled co-clustering (pam) matrix.} #' \item{VAR_PRES}{ Variance preserved measure.} #' } #' score_matrix <- function(expr, eval_pcs = 3, proj = NULL, weights = NULL, seed = 1, em.maxiter = 100, - eval_knn = NULL, eval_kclust = NULL, + eval_kclust = NULL, bio = NULL, batch = NULL, qc_factors = NULL, ruv_factors = NULL, uv_factors = NULL, @@ -86,41 +85,31 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, eval_pcs = dim(proj)[2] } - ## ------ K-nearest neighbors (including self!) ----- - - if( !is.null(eval_knn) ){ + ## ------ Bio and Batch Tightness ----- if( !is.null(bio) ) { if(!all(is.na(bio))) { - KNN_BIO = mean(attributes(knn(train = proj[!is.na(bio),],test = proj[!is.na(bio),],cl = bio[!is.na(bio)], k = eval_knn,prob = TRUE))$prob) + BIO_SIL = summary(cluster::silhouette(as.numeric(na.omit(bio)),dist(proj[!is.na(bio),])))$avg.width } else { - KNN_BIO = NA + BIO_SIL = NA warning("bio is all NA!") } } else { - KNN_BIO = NA + BIO_SIL = NA } - # K-NN Batch if(!is.null(batch)) { if(!all(is.na(batch))) { - KNN_BATCH <- mean(attributes(knn(train = proj[!is.na(batch),],test = proj[!is.na(batch),],cl = batch[!is.na(batch)], k = eval_knn,prob = TRUE))$prob) + BATCH_SIL <- summary(cluster::silhouette(as.numeric(na.omit(batch)),dist(proj[!is.na(batch),])))$avg.width } else{ - KNN_BATCH <- NA + BATCH_SIL <- NA warning("batch is all NA!") } } else { - KNN_BATCH <- NA + BATCH_SIL <- NA } - - } else { - - KNN_BIO = NA - KNN_BATCH = NA - - } - - ## ------ PAM Tightness and Stability ----- + + ## ------ PAM (Conditional) Tightness and Stability ----- if ( !is.null(eval_kclust) ){ @@ -161,26 +150,15 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, } # Stability - - PAM_COMPACT = 0 + PAM_STAB = 0 for(k in eval_kclust){ - submat = subsampleClustering(proj, k=k) # Co-clustering frequency matrix - subhc = hclust(dist(submat)) # Re-order rows and columns using hclust - submat = submat[subhc$order,subhc$order] - - submat_shifted = 2*submat - 1 - field = (submat_shifted[2:(nrow(submat_shifted)-1),3:nrow(submat_shifted)] + submat_shifted[2:(nrow(submat_shifted)-1),1:(nrow(submat_shifted)-2)])/2 - spin = submat_shifted[2:(nrow(submat_shifted)-1),2:(nrow(submat_shifted)-1)] - perim_len = mean(1/2-field*spin/2) # Approximate fraction of elements on the perimeter - isoper = (mean(1-submat)/(perim_len^2))/length(submat) # Approximate isoperimetric quotient of non-clustering (0) region. - PAM_COMPACT = max(PAM_COMPACT,isoper) # Maximum compactness across all choices of resampling scheme. - + PAM_STAB = max(PAM_STAB,fpc::pamk(data = submat,krange = k)$pam$sil$avg.width) } }else{ PAM_SIL = NA - PAM_COMPACT = NA + PAM_STAB = NA } ## ------ Hidden Factors ----- @@ -213,7 +191,7 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, EXP_WV_COR = NA } - # VAR_PRES + ## ----- Variation Preserved if(!is.null(ref_expr)){ z1 = scale(t(ref_expr),scale = FALSE) z1 = t(t(z1)/sqrt(colSums(z1^2))) @@ -224,8 +202,8 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, VAR_PRES = NA } - scores = c(KNN_BIO, KNN_BATCH, PAM_SIL, EXP_QC_COR, EXP_RUV_COR, EXP_UV_COR, EXP_WV_COR , PAM_COMPACT, VAR_PRES) - names(scores) = c("KNN_BIO", "KNN_BATCH", "PAM_SIL", "EXP_QC_COR", "EXP_RUV_COR", "EXP_UV_COR", "EXP_WV_COR", "PAM_COMPACT","VAR_PRES") + scores = c(BIO_SIL, BATCH_SIL, PAM_SIL, EXP_QC_COR, EXP_RUV_COR, EXP_UV_COR, EXP_WV_COR , PAM_STAB, VAR_PRES) + names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL", "EXP_QC_COR", "EXP_RUV_COR", "EXP_UV_COR", "EXP_WV_COR", "PAM_STAB","VAR_PRES") return(scores) } diff --git a/R/scone_main.R b/R/scone_main.R index 573acc6..127b524 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -29,10 +29,9 @@ #' Ignored, if adjust_batch=0. #' @param evaluate logical. If FALSE the normalization methods will not be evaluated (faster). #' @param eval_pcs numeric. The number of principal components to use for evaluation. Ignored if evaluation=FALSE. -#' @param eval_knn numeric. The number of nearest neighbors to use for evaluation. Ignored if evaluation=FALSE. #' @param eval_weights matrix. A numeric data matrix to be used for weighted PCA in evaluation (genes in rows, cells in columns). #' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. -#' If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering compactness (stability) will be reported. If NULL, tightness and stability will be returned NA. +#' If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA. #' @param eval_negcon character. The genes to be used as negative controls for evaluation. These genes should #' be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. #' If NULL, correlations with negative controls will be returned NA. @@ -66,13 +65,13 @@ #' with each row corresponding to a set of normalization parameters. #' #' @details Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to rank-score: -#' Positive-signature metrics increase with improving performance, including KNN_BIO,PAM_SIL, EXP_WV_COR, PAM_COMPACT, and VAR_PRES. -#' Negative-signature metrics decrease with improving performance, including KNN_BATCH, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. +#' Positive-signature metrics increase with improving performance, including BIO_SIL,PAM_SIL, EXP_WV_COR, PAM_STAB, and VAR_PRES. +#' Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. #' Rank-scores are computed so that higer-performing methods are assigned a lower-rank. #' scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"), - bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_knn=10, eval_weights = NULL, + bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_weights = NULL, eval_kclust=2:10, eval_negcon=NULL, eval_poscon=NULL, params=NULL, verbose=FALSE, conditional_pam = FALSE) { @@ -196,10 +195,6 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, stop("'eval_pcs' must be less or equal than the number of samples.") } - if(eval_knn >= ncol(expr)) { - stop("'eval_knn' must be less than the number of samples.") - } - if(any(eval_kclust >= ncol(expr))) { stop("'eval_kclust' must be less than the number of samples.") } @@ -330,7 +325,7 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, sc_name <- paste(params[i,1:2], collapse="_") adjusted <- lm_adjust(log1p(scaled[[sc_name]]), design_mat, batch) if(evaluate) { - score <- score_matrix(expr=adjusted, eval_pcs = eval_pcs, eval_knn = eval_knn, weights = eval_weights, + score <- score_matrix(expr=adjusted, eval_pcs = eval_pcs, weights = eval_weights, eval_kclust = eval_kclust, bio = bio, batch = batch, qc_factors = qc_pcs, ruv_factors = ruv_factors_raw, uv_factors = uv_factors, wv_factors = wv_factors, diff --git a/man/scone.Rd b/man/scone.Rd index c7281da..f4b8c29 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -7,7 +7,7 @@ scone(expr, imputation, scaling, k_ruv = 5, k_qc = 5, ruv_negcon = NULL, qc = NULL, adjust_bio = c("no", "yes", "force"), adjust_batch = c("no", "yes", "force"), bio = NULL, batch = NULL, run = TRUE, - evaluate = TRUE, eval_pcs = 3, eval_knn = 10, eval_weights = NULL, + evaluate = TRUE, eval_pcs = 3, eval_weights = NULL, eval_kclust = 2:10, eval_negcon = NULL, eval_poscon = NULL, params = NULL, verbose = FALSE, conditional_pam = FALSE) } @@ -47,12 +47,10 @@ of parameters that will be run for inspection by the user.} \item{eval_pcs}{numeric. The number of principal components to use for evaluation. Ignored if evaluation=FALSE.} -\item{eval_knn}{numeric. The number of nearest neighbors to use for evaluation. Ignored if evaluation=FALSE.} - \item{eval_weights}{matrix. A numeric data matrix to be used for weighted PCA in evaluation (genes in rows, cells in columns).} \item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. -If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering compactness (stability) will be reported. If NULL, tightness and stability will be returned NA.} +If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA.} \item{eval_negcon}{character. The genes to be used as negative controls for evaluation. These genes should be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. @@ -100,8 +98,8 @@ If both \code{run=FALSE} the normalization and evaluation are not run, but the f Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to rank-score: -Positive-signature metrics increase with improving performance, including KNN_BIO,PAM_SIL, EXP_WV_COR, PAM_COMPACT, and VAR_PRES. -Negative-signature metrics decrease with improving performance, including KNN_BATCH, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. +Positive-signature metrics increase with improving performance, including BIO_SIL,PAM_SIL, EXP_WV_COR, PAM_STAB, and VAR_PRES. +Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. Rank-scores are computed so that higer-performing methods are assigned a lower-rank. } diff --git a/man/score_matrix.Rd b/man/score_matrix.Rd index 3812124..35489ad 100644 --- a/man/score_matrix.Rd +++ b/man/score_matrix.Rd @@ -5,10 +5,10 @@ \title{scone evaluation: function to evaluate one normalization scheme} \usage{ score_matrix(expr, eval_pcs = 3, proj = NULL, weights = NULL, seed = 1, - em.maxiter = 100, eval_knn = NULL, eval_kclust = NULL, bio = NULL, - batch = NULL, qc_factors = NULL, ruv_factors = NULL, - uv_factors = NULL, wv_factors = NULL, is_log = FALSE, - conditional_pam = FALSE, ref_expr = NULL) + em.maxiter = 100, eval_kclust = NULL, bio = NULL, batch = NULL, + qc_factors = NULL, ruv_factors = NULL, uv_factors = NULL, + wv_factors = NULL, is_log = FALSE, conditional_pam = FALSE, + ref_expr = NULL) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} @@ -28,17 +28,14 @@ Ignored if is.null(weights) or !is.null(proj).} \item{em.maxiter}{numeric. Maximum EM iterations, passed to bwpca. Ignored if is.null(weights) or !is.null(proj).} -\item{eval_knn}{numeric. The number of nearest neighbors to use for evaluation. -If NULL, all KNN concordances will be returned NA.} - \item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. -If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering compactness (stability) will be reported. If NULL, tightness and stability will be returned NA.} +If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA.} \item{bio}{factor. The biological condition (variation to be preserved), NA is allowed. -If NULL, condition KNN concordance will be returned NA.} +If NULL, condition ASW will be returned NA.} \item{batch}{factor. The known batch variable (variation to be removed), NA is allowed. -If NULL, batch KNN concordance will be returned NA.} +If NULL, batch ASW will be returned NA.} \item{qc_factors}{Factors of unwanted variation derived from quality metrics. If NULL, qc correlations will be returned NA.} @@ -63,14 +60,14 @@ If NULL, preserved variance is returned NA.} \value{ A list with the following elements: \itemize{ -\item{KNN_BIO}{ K-NN concordance rate by biological condition.} -\item{KNN_BATCH}{ K-NN concordance rate by batch condition.} +\item{BIO_SIL}{ Average silhoutte width by biological condition.} +\item{BATCH_SIL}{ Average silhoutte width by batch condition.} \item{PAM_SIL}{ Maximum average silhoutte width from pam clustering.} \item{EXP_QC_COR}{ Maximum squared spearman correlation between pcs and quality factors.} \item{EXP_RUV_COR}{ Maximum squared spearman correlation between pcs and active uv factors.} \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.} \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.} -\item{PAM_COMPACT}{ Compactness measure of sub-sampled (pam) co-clustering matrix's "block-diagonal-ness". Approximate isoperimetric quotient of non-clustering region.} +\item{PAM_STAB}{ Maximum average silhoutte width from pam clustering of sub-sampled co-clustering (pam) matrix.} \item{VAR_PRES}{ Variance preserved measure.} } } From d7263f296be21c69bfa1a88dace716ce9214f617 Mon Sep 17 00:00:00 2001 From: mbcole Date: Fri, 6 May 2016 16:55:23 -0700 Subject: [PATCH 06/18] Projection functions are now passed as arguments and evaluated per normalized matrix. --- NAMESPACE | 1 - R/scone_eval.R | 28 +++++++++------------------- R/scone_main.R | 10 ++++++---- man/scone.Rd | 14 +++++++++----- man/score_matrix.Rd | 26 +++++++++----------------- 5 files changed, 33 insertions(+), 46 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 87b4ef8..448fc91 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,4 +35,3 @@ importFrom(grDevices,colorRampPalette) importFrom(limma,lmFit) importFrom(matrixStats,rowMedians) importFrom(mixtools,normalmixEM) -importFrom(scde,bwpca) diff --git a/R/scone_eval.R b/R/scone_eval.R index 2241f3f..3d02640 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -1,21 +1,16 @@ #' scone evaluation: function to evaluate one normalization scheme #' #' This function evaluates an expression matrix using SCONE criteria, producing a number of scores based on -#' weighted (or unweighted) projections of the normalized data. +#' projections of the normalized data. #' #' @details None. #' #' @param expr matrix. The data matrix (genes in rows, cells in columns). #' @param eval_pcs numeric. The number of principal components to use for evaluation. -#' Ignored if !is.null(proj). -#' @param proj matrix. A numeric data matrix to be used as projection (cells in rows, coordinates in columns). -#' If NULL, weighted PCA is used for projection -#' @param weights matrix. A numeric data matrix to be used for weighted PCA (genes in rows, cells in columns). -#' If NULL, regular PCA is used for projection -#' @param seed numeric. Random seed, passed to bwpca. -#' Ignored if is.null(weights) or !is.null(proj). -#' @param em.maxiter numeric. Maximum EM iterations, passed to bwpca. -#' Ignored if is.null(weights) or !is.null(proj). +#' Ignored if !is.null(eval_proj). +#' @param eval_proj function. Projection function for evaluation (Inputs: e = genes in rows, cells in columns. eval_proj_args. Output: cells in rows, factors in columns). +#' If NULL, PCA is used for projection +#' @param eval_proj_args list. List of args passed to projection function as eval_proj_args. #' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. #' If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA. #' @param bio factor. The biological condition (variation to be preserved), NA is allowed. @@ -36,7 +31,6 @@ #' @param ref_expr matrix. A reference (log-) expression data matrix for calculating preserved variance (genes in rows, cells in columns). #' If NULL, preserved variance is returned NA. #' -#' @importFrom scde bwpca #' @importFrom class knn #' @importFrom fpc pamk #' @importFrom clusterCells subsampleClustering @@ -58,8 +52,7 @@ #' } #' -score_matrix <- function(expr, eval_pcs = 3, proj = NULL, - weights = NULL, seed = 1, em.maxiter = 100, +score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL,eval_proj_args = NULL, eval_kclust = NULL, bio = NULL, batch = NULL, qc_factors = NULL, @@ -75,14 +68,11 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, expr <- log1p(expr) } - if(is.null(proj)){ - if(is.null(weights)){ + if(is.null(eval_proj)){ proj = prcomp(t(expr),center = TRUE,scale. = TRUE)$x[,1:eval_pcs] - }else{ - proj = bwpca(mat = t(expr),matw = t(weights),npcs = eval_pcs, seed = seed, em.maxiter = em.maxiter)$scores - } }else{ - eval_pcs = dim(proj)[2] + proj = eval_proj(expr,eval_proj_args = eval_proj_args) + eval_pcs = ncol(proj) } ## ------ Bio and Batch Tightness ----- diff --git a/R/scone_main.R b/R/scone_main.R index 127b524..c606cca 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -29,7 +29,9 @@ #' Ignored, if adjust_batch=0. #' @param evaluate logical. If FALSE the normalization methods will not be evaluated (faster). #' @param eval_pcs numeric. The number of principal components to use for evaluation. Ignored if evaluation=FALSE. -#' @param eval_weights matrix. A numeric data matrix to be used for weighted PCA in evaluation (genes in rows, cells in columns). +#' @param eval_proj function. Projection function for evaluation (Inputs: e = genes in rows, cells in columns. eval_proj_args. Output: cells in rows, factors in columns). +#' If NULL, PCA is used for projection +#' @param eval_proj_args list. List of args passed to projection function as eval_proj_args. #' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. #' If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA. #' @param eval_negcon character. The genes to be used as negative controls for evaluation. These genes should @@ -43,7 +45,7 @@ #' right format, and is only intended to be used to feed the results of setting run=FALSE #' back into the algorithm (see example). #' @param verbose logical. If TRUE some messagges are printed. -#' @param conditional_pam logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), and a weighted average is returned. +#' @param conditional_pam logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), and a weighted average is returned as PAM_SIL. #' @param run logical. If FALSE the normalization and evaluation are not run, but the function returns a data.frame #' of parameters that will be run for inspection by the user. #' @@ -71,7 +73,7 @@ #' scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"), - bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_weights = NULL, + bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,eval_proj_args = NULL, eval_kclust=2:10, eval_negcon=NULL, eval_poscon=NULL, params=NULL, verbose=FALSE, conditional_pam = FALSE) { @@ -325,7 +327,7 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, sc_name <- paste(params[i,1:2], collapse="_") adjusted <- lm_adjust(log1p(scaled[[sc_name]]), design_mat, batch) if(evaluate) { - score <- score_matrix(expr=adjusted, eval_pcs = eval_pcs, weights = eval_weights, + score <- score_matrix(expr=adjusted, eval_pcs = eval_pcs, eval_proj = eval_proj, eval_proj_args = eval_proj_args, eval_kclust = eval_kclust, bio = bio, batch = batch, qc_factors = qc_pcs, ruv_factors = ruv_factors_raw, uv_factors = uv_factors, wv_factors = wv_factors, diff --git a/man/scone.Rd b/man/scone.Rd index f4b8c29..a16e440 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -7,9 +7,10 @@ scone(expr, imputation, scaling, k_ruv = 5, k_qc = 5, ruv_negcon = NULL, qc = NULL, adjust_bio = c("no", "yes", "force"), adjust_batch = c("no", "yes", "force"), bio = NULL, batch = NULL, run = TRUE, - evaluate = TRUE, eval_pcs = 3, eval_weights = NULL, - eval_kclust = 2:10, eval_negcon = NULL, eval_poscon = NULL, - params = NULL, verbose = FALSE, conditional_pam = FALSE) + evaluate = TRUE, eval_pcs = 3, eval_proj = NULL, + eval_proj_args = NULL, eval_kclust = 2:10, eval_negcon = NULL, + eval_poscon = NULL, params = NULL, verbose = FALSE, + conditional_pam = FALSE) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} @@ -47,7 +48,10 @@ of parameters that will be run for inspection by the user.} \item{eval_pcs}{numeric. The number of principal components to use for evaluation. Ignored if evaluation=FALSE.} -\item{eval_weights}{matrix. A numeric data matrix to be used for weighted PCA in evaluation (genes in rows, cells in columns).} +\item{eval_proj}{function. Projection function for evaluation (Inputs: e = genes in rows, cells in columns. eval_proj_args. Output: cells in rows, factors in columns). +If NULL, PCA is used for projection} + +\item{eval_proj_args}{list. List of args passed to projection function as eval_proj_args.} \item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA.} @@ -67,7 +71,7 @@ back into the algorithm (see example).} \item{verbose}{logical. If TRUE some messagges are printed.} -\item{conditional_pam}{logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), and a weighted average is returned.} +\item{conditional_pam}{logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), and a weighted average is returned as PAM_SIL.} } \value{ A list with the following elements: diff --git a/man/score_matrix.Rd b/man/score_matrix.Rd index 35489ad..2670d2f 100644 --- a/man/score_matrix.Rd +++ b/man/score_matrix.Rd @@ -4,29 +4,21 @@ \alias{score_matrix} \title{scone evaluation: function to evaluate one normalization scheme} \usage{ -score_matrix(expr, eval_pcs = 3, proj = NULL, weights = NULL, seed = 1, - em.maxiter = 100, eval_kclust = NULL, bio = NULL, batch = NULL, - qc_factors = NULL, ruv_factors = NULL, uv_factors = NULL, - wv_factors = NULL, is_log = FALSE, conditional_pam = FALSE, - ref_expr = NULL) +score_matrix(expr, eval_pcs = 3, eval_proj = NULL, eval_proj_args = NULL, + eval_kclust = NULL, bio = NULL, batch = NULL, qc_factors = NULL, + ruv_factors = NULL, uv_factors = NULL, wv_factors = NULL, + is_log = FALSE, conditional_pam = FALSE, ref_expr = NULL) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} \item{eval_pcs}{numeric. The number of principal components to use for evaluation. -Ignored if !is.null(proj).} +Ignored if !is.null(eval_proj).} -\item{proj}{matrix. A numeric data matrix to be used as projection (cells in rows, coordinates in columns). -If NULL, weighted PCA is used for projection} +\item{eval_proj}{function. Projection function for evaluation (Inputs: e = genes in rows, cells in columns. eval_proj_args. Output: cells in rows, factors in columns). +If NULL, PCA is used for projection} -\item{weights}{matrix. A numeric data matrix to be used for weighted PCA (genes in rows, cells in columns). -If NULL, regular PCA is used for projection} - -\item{seed}{numeric. Random seed, passed to bwpca. -Ignored if is.null(weights) or !is.null(proj).} - -\item{em.maxiter}{numeric. Maximum EM iterations, passed to bwpca. -Ignored if is.null(weights) or !is.null(proj).} +\item{eval_proj_args}{list. List of args passed to projection function as eval_proj_args.} \item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA.} @@ -73,7 +65,7 @@ A list with the following elements: } \description{ This function evaluates an expression matrix using SCONE criteria, producing a number of scores based on -weighted (or unweighted) projections of the normalized data. +projections of the normalized data. } \details{ None. From aad3b60dd24e3f221182638d8bbca9526e80494a Mon Sep 17 00:00:00 2001 From: mbcole Date: Thu, 12 May 2016 08:54:48 -0700 Subject: [PATCH 07/18] Updated clusterExperiment dependency --- DESCRIPTION | 11 +++++------ NAMESPACE | 2 +- R/scone_eval.R | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5fa5ff9..ec920a2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.3-9000 +Version: 0.0.3-9001 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", @@ -8,13 +8,13 @@ Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", role = c("aut"))) Author: Michael Cole [aut, cre, cph] and Davide Risso [aut] Maintainer: Michael Cole -Date: 2016-02-14 +Date: 2016-05-12 License: Artistic-2.0 Depends: - R (>= 3.1) + R (>= 3.3) Imports: BiocParallel, - clusterCells, + clusterExperiment, DESeq, EDASeq, MASS, @@ -27,8 +27,7 @@ Imports: gplots, limma, matrixStats, - mixtools, - scde + mixtools Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 448fc91..78ebc9d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,7 +27,7 @@ importFrom(RUVSeq,RUVg) importFrom(aroma.light,normalizeQuantileRank.matrix) importFrom(class,knn) importFrom(cluster,silhouette) -importFrom(clusterCells,subsampleClustering) +importFrom(clusterExperiment,subsampleClustering) importFrom(diptest,dip.test) importFrom(edgeR,calcNormFactors) importFrom(fpc,pamk) diff --git a/R/scone_eval.R b/R/scone_eval.R index 3d02640..d143481 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -33,7 +33,7 @@ #' #' @importFrom class knn #' @importFrom fpc pamk -#' @importFrom clusterCells subsampleClustering +#' @importFrom clusterExperiment subsampleClustering #' @importFrom cluster silhouette #' #' @export From 2434c074ee4da82ec6bafa2c6767f9ab521e41f2 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 1 Jun 2016 17:43:55 -0700 Subject: [PATCH 08/18] Fixes #37 and #35 --- DESCRIPTION | 2 +- R/scone_eval.R | 21 ++++++--------------- R/scone_main.R | 8 +++----- tests/testthat/test_scone.R | 28 ++++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ec920a2..b964674 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.3-9001 +Version: 0.0.3-9002 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", diff --git a/R/scone_eval.R b/R/scone_eval.R index d143481..58d4b21 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -30,7 +30,7 @@ #' and a weighted average is returned. #' @param ref_expr matrix. A reference (log-) expression data matrix for calculating preserved variance (genes in rows, cells in columns). #' If NULL, preserved variance is returned NA. -#' +#' #' @importFrom class knn #' @importFrom fpc pamk #' @importFrom clusterExperiment subsampleClustering @@ -47,7 +47,6 @@ #' \item{EXP_RUV_COR}{ Maximum squared spearman correlation between pcs and active uv factors.} #' \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.} #' \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.} -#' \item{PAM_STAB}{ Maximum average silhoutte width from pam clustering of sub-sampled co-clustering (pam) matrix.} #' \item{VAR_PRES}{ Variance preserved measure.} #' } #' @@ -57,7 +56,7 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL,eval_proj_args = N bio = NULL, batch = NULL, qc_factors = NULL, ruv_factors = NULL, uv_factors = NULL, - wv_factors = NULL, is_log=FALSE, + wv_factors = NULL, is_log=FALSE, conditional_pam = FALSE , ref_expr = NULL){ if(any(is.na(expr) | is.infinite(expr) | is.nan(expr))){ @@ -98,7 +97,7 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL,eval_proj_args = N } else { BATCH_SIL <- NA } - + ## ------ PAM (Conditional) Tightness and Stability ----- if ( !is.null(eval_kclust) ){ @@ -139,16 +138,8 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL,eval_proj_args = N PAM_SIL = pamk_object$pamobject$silinfo$avg.width } - # Stability - PAM_STAB = 0 - for(k in eval_kclust){ - submat = subsampleClustering(proj, k=k) # Co-clustering frequency matrix - PAM_STAB = max(PAM_STAB,fpc::pamk(data = submat,krange = k)$pam$sil$avg.width) - } - }else{ PAM_SIL = NA - PAM_STAB = NA } ## ------ Hidden Factors ----- @@ -180,7 +171,7 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL,eval_proj_args = N }else{ EXP_WV_COR = NA } - + ## ----- Variation Preserved if(!is.null(ref_expr)){ z1 = scale(t(ref_expr),scale = FALSE) @@ -192,8 +183,8 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL,eval_proj_args = N VAR_PRES = NA } - scores = c(BIO_SIL, BATCH_SIL, PAM_SIL, EXP_QC_COR, EXP_RUV_COR, EXP_UV_COR, EXP_WV_COR , PAM_STAB, VAR_PRES) - names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL", "EXP_QC_COR", "EXP_RUV_COR", "EXP_UV_COR", "EXP_WV_COR", "PAM_STAB","VAR_PRES") + scores = c(BIO_SIL, BATCH_SIL, PAM_SIL, EXP_QC_COR, EXP_RUV_COR, EXP_UV_COR, EXP_WV_COR, VAR_PRES) + names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL", "EXP_QC_COR", "EXP_RUV_COR", "EXP_UV_COR", "EXP_WV_COR","VAR_PRES") return(scores) } diff --git a/R/scone_main.R b/R/scone_main.R index c606cca..bba0579 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -23,10 +23,8 @@ #' if 'force', only models with 'bio' will be run. #' @param adjust_batch character. If 'no' it will not be included in the model; if 'yes', both models with and without 'batch' will be run; #' if 'force', only models with 'batch' will be run. -#' @param bio factor. The biological condition to be included in the adjustment model (variation to be preserved). -#' Ignored, if adjust_bio=0. -#' @param batch factor. The known batch variable to be included in the adjustment model (variation to be removed). -#' Ignored, if adjust_batch=0. +#' @param bio factor. The biological condition to be included in the adjustment model (variation to be preserved). If adjust_bio="no", it will not be used for normalization, but only for evaluation. +#' @param batch factor. The known batch variable to be included in the adjustment model (variation to be removed). If adjust_batch="no", it will not be used for normalization, but only for evaluation. #' @param evaluate logical. If FALSE the normalization methods will not be evaluated (faster). #' @param eval_pcs numeric. The number of principal components to use for evaluation. Ignored if evaluation=FALSE. #' @param eval_proj function. Projection function for evaluation (Inputs: e = genes in rows, cells in columns. eval_proj_args. Output: cells in rows, factors in columns). @@ -346,7 +344,7 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, names(evaluation) <- apply(params, 1, paste, collapse=',') evaluation <- simplify2array(evaluation) - ev_for_ranks <- evaluation * c(-1, 1, -1, 1, 1, 1, -1, -1,-1) + ev_for_ranks <- evaluation * c(-1, 1, -1, 1, 1, 1, -1, -1) ranks <- apply(ev_for_ranks[apply(evaluation, 1, function(x) !all(is.na(x))),, drop=FALSE], 1, rank) if(NCOL(ranks) > 1) { med_rank <- rowMedians(ranks) diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R index 769d521..4ee2d53 100644 --- a/tests/testthat/test_scone.R +++ b/tests/testthat/test_scone.R @@ -227,3 +227,31 @@ test_that("conditional PAM",{ "If `bio` is null, `conditional_pam` cannot be TRUE") }) + + +test_that("if bio=no bio is ignored", { + e <- matrix(rpois(10000, lambda = 5), ncol=10) + rownames(e) <- as.character(1:nrow(e)) + + res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + adjust_bio = "no", bio=gl(2, 5), eval_kclust = 3) + + res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + adjust_bio = "no", eval_kclust = 3) + + expect_equal(res1$normalized_data, res2$normalized_data) +}) + +test_that("if batch=no batch is ignored", { + e <- matrix(rpois(10000, lambda = 5), ncol=10) + rownames(e) <- as.character(1:nrow(e)) + + res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + adjust_batch = "no", batch=gl(2, 5), eval_kclust = 3) + + res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + adjust_batch = "no", eval_kclust = 3) + + expect_equal(res1$normalized_data, res2$normalized_data) +}) + From f891bea5e40513cf201b8060ea19fc151eb9eff9 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 1 Jun 2016 17:54:40 -0700 Subject: [PATCH 09/18] Close #34 --- R/scone_main.R | 6 +++++- tests/testthat/test_scone.R | 26 +++++++++++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/R/scone_main.R b/R/scone_main.R index bba0579..d628c9d 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -215,7 +215,11 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, tab <- table(bio, batch) if(all(colSums(tab>0)==1)){ if(nlevels(bio) == nlevels(batch)) { - stop("Biological conditions and batches are confounded. They cannot both be included in the model, please set at least one of 'adjust_bio' and 'adjust_batch' to 'no.'") + if(adjust_bio != "no" & adjust_batch != "no") { + stop("Biological conditions and batches are confounded. They cannot both be included in the model, please set at least one of 'adjust_bio' and 'adjust_batch' to 'no.'") + } else { + warning("Biological conditions and batches are confounded. Removing batch effects may remove true biological signal and/or inferred differences may be inflated because of batch effects.") + } } else { nested <- TRUE } diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R index 4ee2d53..e14cbd5 100644 --- a/tests/testthat/test_scone.R +++ b/tests/testthat/test_scone.R @@ -118,7 +118,7 @@ test_that("Test with no real method (only identity)", { scaling=list(a=identity, b=identity, c=identity), k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat, adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch, - evaluate=TRUE, run=TRUE, eval_knn=5, eval_kclust=5) + evaluate=TRUE, run=TRUE, eval_kclust=5) }) @@ -159,7 +159,7 @@ test_that("Test imputation and scaling", { adjust_batch="yes", batch=batch, run=TRUE, evaluate=TRUE, eval_negcon=as.character(11:20), eval_poscon=as.character(21:30), - eval_knn=2, eval_kclust = 2, verbose=TRUE)) + eval_kclust = 2, verbose=TRUE)) expect_equal(rownames(res$evaluation), rownames(res$ranks)) expect_equal(rownames(res$evaluation), rownames(res$params)) @@ -172,7 +172,7 @@ test_that("Test imputation and scaling", { adjust_batch="yes", batch=batch, run=TRUE, evaluate=FALSE, eval_negcon=as.character(11:20), eval_poscon=as.character(21:30), - eval_knn=2, eval_kclust = 2, verbose=TRUE) + eval_kclust = 2, verbose=TRUE) norm_ordered <- res2$normalized_data[names(res$normalized_data)] expect_equal(norm_ordered, res$normalized_data) @@ -187,7 +187,7 @@ test_that("scone works with only one normalization",{ res <- scone(e, imputation=list(none=identity), scaling=list(none=identity), k_ruv=0, k_qc=0, run=TRUE, - evaluate=TRUE, eval_knn=2, eval_kclust = 2) + evaluate=TRUE, eval_kclust = 2) expect_equal(res$normalized_data[[1]], log1p(e)) }) @@ -206,7 +206,7 @@ test_that("conditional PAM",{ adjust_batch="yes", batch=batch, run=FALSE, evaluate=TRUE, eval_negcon=as.character(11:20), eval_poscon=as.character(21:30), - eval_knn=2, eval_kclust = 2, conditional_pam = TRUE) + eval_kclust = 2, conditional_pam = TRUE) expect_error(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), @@ -214,7 +214,7 @@ test_that("conditional PAM",{ adjust_batch="yes", batch=batch, run=FALSE, evaluate=TRUE, eval_negcon=as.character(11:20), eval_poscon=as.character(21:30), - eval_knn=2, eval_kclust = 6, conditional_pam = TRUE), + eval_kclust = 6, conditional_pam = TRUE), "For conditional_pam, max 'eval_kclust' must be smaller than min bio class size") expect_error(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb), @@ -223,7 +223,7 @@ test_that("conditional PAM",{ adjust_batch="yes", batch=batch, run=FALSE, evaluate=TRUE, eval_negcon=as.character(11:20), eval_poscon=as.character(21:30), - eval_knn=2, eval_kclust = 6, conditional_pam = TRUE), + eval_kclust = 6, conditional_pam = TRUE), "If `bio` is null, `conditional_pam` cannot be TRUE") }) @@ -255,3 +255,15 @@ test_that("if batch=no batch is ignored", { expect_equal(res1$normalized_data, res2$normalized_data) }) +test_that("batch and bio can be confounded if at least one of adjust_bio or adjust_batch is no", { + e <- matrix(rpois(10000, lambda = 5), ncol=10) + rownames(e) <- as.character(1:nrow(e)) + + expect_warning(scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + adjust_batch = "no", batch=gl(2, 5), bio=gl(2, 5), eval_kclust = 3), + "Biological conditions and batches are confounded.") + + expect_warning(scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, + adjust_bio = "no", batch=gl(2, 5), bio=gl(2, 5), eval_kclust = 3), + "Biological conditions and batches are confounded.") +}) From 92d9925fdba3c465b95dffbae084597b9ab58824 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 1 Jun 2016 18:09:42 -0700 Subject: [PATCH 10/18] Fix R CMD check --- DESCRIPTION | 6 ++++-- NAMESPACE | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b964674..06539a3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -6,7 +6,7 @@ Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", role = c("aut", "cre", "cph")), person("Davide", "Risso", email = "risso.davide@gmail.com", role = c("aut"))) -Author: Michael Cole [aut, cre, cph] and Davide Risso [aut] +Author: Michael Cole [aut, cre, cph], Davide Risso [aut] Maintainer: Michael Cole Date: 2016-05-12 License: Artistic-2.0 @@ -14,6 +14,7 @@ Depends: R (>= 3.3) Imports: BiocParallel, + cluster, clusterExperiment, DESeq, EDASeq, @@ -27,7 +28,8 @@ Imports: gplots, limma, matrixStats, - mixtools + mixtools, + grDevices Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 78ebc9d..5c467a2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,3 +35,4 @@ importFrom(grDevices,colorRampPalette) importFrom(limma,lmFit) importFrom(matrixStats,rowMedians) importFrom(mixtools,normalmixEM) +importFrom(grDevices, dev.off, pdf) From 95a0cbf7e445ac2efe96c2d877f104ad1b19148b Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Mon, 6 Jun 2016 12:18:23 -0700 Subject: [PATCH 11/18] Added RLE measures to scone evaluation --- DESCRIPTION | 2 +- NAMESPACE | 3 ++- R/scone_eval.R | 19 +++++++++++++++---- R/scone_main.R | 5 +++-- inst/NEWS | 10 +++++++++- man/scone.Rd | 6 ++---- man/score_matrix.Rd | 3 ++- 7 files changed, 34 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 06539a3..7ced773 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.3-9002 +Version: 0.0.3-9003 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", diff --git a/NAMESPACE b/NAMESPACE index 5c467a2..d7fd2dc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,7 @@ importFrom(edgeR,calcNormFactors) importFrom(fpc,pamk) importFrom(grDevices,colorRampPalette) importFrom(limma,lmFit) +importFrom(matrixStats,colIQRs) +importFrom(matrixStats,colMedians) importFrom(matrixStats,rowMedians) importFrom(mixtools,normalmixEM) -importFrom(grDevices, dev.off, pdf) diff --git a/R/scone_eval.R b/R/scone_eval.R index 58d4b21..f9c4f39 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -35,6 +35,7 @@ #' @importFrom fpc pamk #' @importFrom clusterExperiment subsampleClustering #' @importFrom cluster silhouette +#' @importFrom matrixStats rowMedians colMedians colIQRs #' #' @export #' @@ -48,10 +49,13 @@ #' \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.} #' \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.} #' \item{VAR_PRES}{ Variance preserved measure.} +#' \item{RLE_MED}{ The mean squared median Relative Log Expression (RLE).} +#' \item{RLE_IQR}{ The mean inter-quartile range (IQR) of the RLE.} #' } #' -score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL,eval_proj_args = NULL, +score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL, + eval_proj_args = NULL, eval_kclust = NULL, bio = NULL, batch = NULL, qc_factors = NULL, @@ -172,7 +176,7 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL,eval_proj_args = N EXP_WV_COR = NA } - ## ----- Variation Preserved + ## ----- Variance Preserved if(!is.null(ref_expr)){ z1 = scale(t(ref_expr),scale = FALSE) z1 = t(t(z1)/sqrt(colSums(z1^2))) @@ -183,8 +187,15 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL,eval_proj_args = N VAR_PRES = NA } - scores = c(BIO_SIL, BATCH_SIL, PAM_SIL, EXP_QC_COR, EXP_RUV_COR, EXP_UV_COR, EXP_WV_COR, VAR_PRES) - names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL", "EXP_QC_COR", "EXP_RUV_COR", "EXP_UV_COR", "EXP_WV_COR","VAR_PRES") + ## ----- RLE + rle <- expr - rowMedians(expr) + RLE_MED <- mean(colMedians(rle)^2) + RLE_IQR <- mean(colIQRs(rle)) + + scores = c(BIO_SIL, BATCH_SIL, PAM_SIL, EXP_QC_COR, EXP_RUV_COR, EXP_UV_COR, + EXP_WV_COR, VAR_PRES, RLE_MED, RLE_IQR) + names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL", "EXP_QC_COR", "EXP_RUV_COR", + "EXP_UV_COR", "EXP_WV_COR","VAR_PRES", "RLE_MED", "RLE_IQR") return(scores) } diff --git a/R/scone_main.R b/R/scone_main.R index d628c9d..c4b5528 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -333,7 +333,8 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, eval_kclust = eval_kclust, bio = bio, batch = batch, qc_factors = qc_pcs, ruv_factors = ruv_factors_raw, uv_factors = uv_factors, wv_factors = wv_factors, - is_log = TRUE, conditional_pam = conditional_pam,ref_expr = log1p(expr)) + is_log = TRUE, conditional_pam = conditional_pam, + ref_expr = log1p(expr)) } else { score <- NULL } @@ -348,7 +349,7 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, names(evaluation) <- apply(params, 1, paste, collapse=',') evaluation <- simplify2array(evaluation) - ev_for_ranks <- evaluation * c(-1, 1, -1, 1, 1, 1, -1, -1) + ev_for_ranks <- evaluation * c(-1, 1, -1, 1, 1, 1, -1, -1, 1, 1) ranks <- apply(ev_for_ranks[apply(evaluation, 1, function(x) !all(is.na(x))),, drop=FALSE], 1, rank) if(NCOL(ranks) > 1) { med_rank <- rowMedians(ranks) diff --git a/inst/NEWS b/inst/NEWS index 53104bb..a6bcbf0 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,11 @@ +Changes in version 0.0.4 +======================== + +* Fixed a few bugs and documentation mismatches +* Removed stability evaluation (redundant with sil width and slow) +* Updated clusterExperiment dependency +* Added RLE measures to scone evaluation + Changes in version 0.0.3 ======================== @@ -8,4 +16,4 @@ Changes in version 0.0.3 * zinb now works for non-integer whole numbers * Updated tests * Added documentation for datasets -* Added biplot_colored function \ No newline at end of file +* Added biplot_colored function diff --git a/man/scone.Rd b/man/scone.Rd index a16e440..2e5b8e9 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -35,11 +35,9 @@ if 'force', only models with 'bio' will be run.} \item{adjust_batch}{character. If 'no' it will not be included in the model; if 'yes', both models with and without 'batch' will be run; if 'force', only models with 'batch' will be run.} -\item{bio}{factor. The biological condition to be included in the adjustment model (variation to be preserved). -Ignored, if adjust_bio=0.} +\item{bio}{factor. The biological condition to be included in the adjustment model (variation to be preserved). If adjust_bio="no", it will not be used for normalization, but only for evaluation.} -\item{batch}{factor. The known batch variable to be included in the adjustment model (variation to be removed). -Ignored, if adjust_batch=0.} +\item{batch}{factor. The known batch variable to be included in the adjustment model (variation to be removed). If adjust_batch="no", it will not be used for normalization, but only for evaluation.} \item{run}{logical. If FALSE the normalization and evaluation are not run, but the function returns a data.frame of parameters that will be run for inspection by the user.} diff --git a/man/score_matrix.Rd b/man/score_matrix.Rd index 2670d2f..37c8a1d 100644 --- a/man/score_matrix.Rd +++ b/man/score_matrix.Rd @@ -59,8 +59,9 @@ A list with the following elements: \item{EXP_RUV_COR}{ Maximum squared spearman correlation between pcs and active uv factors.} \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.} \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.} -\item{PAM_STAB}{ Maximum average silhoutte width from pam clustering of sub-sampled co-clustering (pam) matrix.} \item{VAR_PRES}{ Variance preserved measure.} +\item{RLE_MED}{ The mean squared median Relative Log Expression (RLE).} +\item{RLE_IQR}{ The mean inter-quartile range (IQR) of the RLE.} } } \description{ From 450e1a630c39c685fc22a7bbd5a946c674c2c2d5 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Mon, 6 Jun 2016 15:45:57 -0700 Subject: [PATCH 12/18] Handling of ties by FQ --- NAMESPACE | 1 + R/SCONE_DEFAULTS.R | 20 ++++++++++++++------ inst/NEWS | 1 + man/FQ_FN.Rd | 6 ++++++ 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d7fd2dc..34e18fa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(DESEQ_FN) export(DESEQ_FN_POS) +export(FQT_FN) export(FQ_FN) export(FQ_FN_POS) export(TMM_FN) diff --git a/R/SCONE_DEFAULTS.R b/R/SCONE_DEFAULTS.R index 3ced667..ddedec3 100644 --- a/R/SCONE_DEFAULTS.R +++ b/R/SCONE_DEFAULTS.R @@ -20,7 +20,7 @@ uq_f <- function(which="upper", round = FALSE){ # exprs(x) <- UQ_FN(exprs(x)) # return(x) # }) -# +# # setMethod(f="UQ_FN", # signature="matrix", # definition= function(ei){ @@ -52,15 +52,23 @@ FQ_FN = function(ei){ return(eo) } +#' @rdname FQ_FN +#' @details FQT_FN handles ties carefully (see \code{\link[limma]{normalizeQuantiles}}). +#' @export +FQT_FN = function(ei){ + eo = normalizeQuantileRank.matrix(ei, ties = TRUE) + return(eo) +} + #' Full-Quantile normalization applied to positive data. #' @export #' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. #' @return FQ (positive) normalized matrix. FQ_FN_POS = function(ei){ - + # Vector of integers used for computation base_rank = 1:nrow(ei) - + # Quantile Index Matrix: Values between 0 and 1 corresponding to quantile quant_mat = NULL # Re-ordered Data Matrix @@ -75,7 +83,7 @@ FQ_FN_POS = function(ei){ quant[quant > 1] = NA quant_mat = cbind(quant_mat,quant) } - + # Vector form of quantile index matrix quant_out = as.numeric(as.vector(quant_mat)) # Interpolation Matrix (Values of all quantiles) @@ -95,7 +103,7 @@ FQ_FN_POS = function(ei){ # Average over the interpolated values from all samples inter_mean = inter_mat/ob_counts - + ## Substituting Mean Interpolated Values for Expression Values and Return eo = matrix(inter_mean,ncol = dim(ei)[2]) eo[is.na(eo)] = 0 @@ -136,7 +144,7 @@ DESEQ_FN_POS = function(ei){ } if(!any(geom_mean > 0)){ stop("Geometric mean non-positive for all genes.") - } + } ratios = ei / geom_mean # Divide each Expression Value by Geometric Mean of Corresponding Gene if(any(is.infinite(geom_mean))){ stop("Infinite mean! This should never happen :-<") diff --git a/inst/NEWS b/inst/NEWS index a6bcbf0..957c48a 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -5,6 +5,7 @@ Changes in version 0.0.4 * Removed stability evaluation (redundant with sil width and slow) * Updated clusterExperiment dependency * Added RLE measures to scone evaluation +* Added FQT_FN to implement careful ties handling by FQ Changes in version 0.0.3 ======================== diff --git a/man/FQ_FN.Rd b/man/FQ_FN.Rd index 95edb41..0b2f4d6 100644 --- a/man/FQ_FN.Rd +++ b/man/FQ_FN.Rd @@ -1,10 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/SCONE_DEFAULTS.R \name{FQ_FN} +\alias{FQT_FN} \alias{FQ_FN} \title{Full-Quantile normalization wrapper.} \usage{ FQ_FN(ei) + +FQT_FN(ei) } \arguments{ \item{ei}{= Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.} @@ -15,4 +18,7 @@ FQ normalized matrix. \description{ Full-Quantile normalization wrapper. } +\details{ +FQT_FN handles ties carefully (see \code{\link[limma]{normalizeQuantiles}}). +} From 052c8e2a2aee0f56e457b6b032a3b6ef31480482 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Tue, 7 Jun 2016 10:37:10 -0700 Subject: [PATCH 13/18] Closes #40 --- DESCRIPTION | 2 +- R/sample_filtering.R | 284 ++++++++++++++++-------------------- inst/NEWS | 1 + man/factor_sample_filter.Rd | 10 +- man/metric_sample_filter.Rd | 20 ++- 5 files changed, 145 insertions(+), 172 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7ced773..0eafb03 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.3-9003 +Version: 0.0.3-9004 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", diff --git a/R/sample_filtering.R b/R/sample_filtering.R index 3ee3276..828bc81 100644 --- a/R/sample_filtering.R +++ b/R/sample_filtering.R @@ -1,21 +1,21 @@ #' Fit Logistic Regression Model of FNR against set of positive control (ubiquitously expressed) genes -#' +#' #' @details logit(Probability of False Negative) ~ a + b*(mean log10p1 expression) . -#' +#' #' @param expr matrix The data matrix in transcript-proportional units (genes in rows, cells in columns). #' @param pos_controls A logical vector indexing positive control genes that will be used to compute false-negative rate characteristics. #' @param fn_tresh Inclusive threshold for negative detection. Default 0.01. -#' +#' #' @return A list of logistic regression coefficients corresponding to glm fits in each sample. If a fit did not converge, the result reported is NA. #' simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){ - + # Mean log10p1 expression mu_obs = rowMeans(log10(expr[pos_controls,]+1)) - + # Drop-outs drop_outs = 0 + (expr[pos_controls,] <= fn_tresh) - + # Logistic Regression Model of FNR ref.glms = list() for (si in 1:dim(drop_outs)[2]){ @@ -30,33 +30,32 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){ } #' metric-based sample filtering: function to filter single-cell RNA-Seq libraries. -#' -#' This function returns a sample-filtering report for each cell in the input expression matrix, +#' +#' This function returns a sample-filtering report for each cell in the input expression matrix, #' describing which filtering criteria are satisfied. -#' -#' @details For each primary criterion (metric), a sample is evaluated based on 4 sub-criteria: -#' 1) Hard (encoded) threshold +#' +#' @details For each primary criterion (metric), a sample is evaluated based on 4 sub-criteria: +#' 1) Hard (encoded) threshold #' 2) Adaptive thresholding via sd's from the mean #' 3) Adaptive thresholding via mad's from the median #' 4) Adaptive thresholding via sd's from the mean (after mixture modeling) #' A sample must pass all sub-criteria to pass the primary criterion. -#' +#' #' @param expr matrix The data matrix (genes in rows, cells in columns). -#' @param nreads A numeric vector representing number of reads in each library. -#' If NULL, filtered_nreads will be returned NA. +#' @param nreads A numeric vector representing number of reads in each library. Default to `colSums` of `expr`. #' @param ralign A numeric vector representing the proportion of reads aligned to the reference genome in each library. #' If NULL, filtered_ralign will be returned NA. #' @param gene_filter A logical vector indexing genes that will be used to compute library transcriptome breadth. #' If NULL, filtered_breadth will be returned NA. #' @param pos_controls A logical vector indexing positive control genes that will be used to compute false-negative rate characteristics. #' If NULL, filtered_fnr will be returned NA. -#' @param scale. logical. Will expression be scaled by total expression for FNR computation? Default = FALSE +#' @param scale. logical. Will expression be scaled by total expression for FNR computation? Default = FALSE #' @param glen Gene lengths for gene-length normalization (normalized data used in FNR computation). #' @param AUC_range An array of two values, representing range over which FNR AUC will be computed (log10(expr_units + 1)). Default c(0,6) #' @param zcut A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1. #' If NULL, only hard threshold sub-criteria will be applied. #' @param mixture A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (metric). -#' If true, a dip test will be applied to each metric. If a metric is multimodal, it is fit to a two-component nomal mixture model. +#' If true, a dip test will be applied to each metric. If a metric is multimodal, it is fit to a two-component nomal mixture model. #' Samples deviating zcut sd's from optimal mean (in the inferior direction), have failed this sub-criterion. #' @param dip_thresh A numeric value determining dip test p-value threshold. Default 0.05. #' @param hard_nreads numeric. Hard (lower bound on) nreads threshold. Default 25000. @@ -67,9 +66,9 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){ #' @param suff_ralign numeric. If not null, serves as an upper bound on ralign threshold. Default 65. #' @param suff_breadth numeric. If not null, serves as an upper bound on breadth threshold. Default 0.8. #' @param suff_fnr numeric. If not null, serves as an lower bound on fnr threshold. -#' @param plot_dir If not null, specifies path to plot output -#' @param hist_breaks hist() breaks argument -#' +#' @param plot logical. Should a plot be produced? +#' @param hist_breaks hist() breaks argument. Ignored if `plot=FALSE`. +#' #' @return A list with the following elements: #' \itemize{ #' \item{filtered_nreads}{ Logical. Sample has too few reads.} @@ -83,36 +82,31 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){ #'@export #' #' -metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, +metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL, gene_filter = NULL, pos_controls = NULL,scale. = FALSE,glen = NULL, AUC_range = c(0,6), zcut = 1, - mixture = TRUE, dip_thresh = 0.05, + mixture = TRUE, dip_thresh = 0.05, hard_nreads = 25000, hard_ralign = 15, hard_breadth = 0.2, hard_fnr = 3, suff_nreads = NULL, suff_ralign = 65, suff_breadth = 0.8, suff_fnr = NULL, - plot_dir = NULL, hist_breaks = 10){ - - # Create plot directory, if necessary - if (!is.null(plot_dir) && !file.exists(plot_dir)){ - dir.create(plot_dir) - } - + plot = FALSE, hist_breaks = 10){ + criterion_count = 0 - + mix_plot = list(nreads = NULL,ralign = NULL,breadth = NULL,fnr = NULL) - + ### ----- Primary Criterion 1) Number of Reads. ----- - - if(!is.null(nreads)){ + + if(!is.null(nreads)){ criterion_count = 1 - + logr = log10(nreads + 1) - + LOGR_CUTOFF = log10(hard_nreads + 1) # Hard Sub-Criterion - + if (!is.null(zcut)){ LOGR_CUTOFF = max(mean(logr) - zcut*sd(logr), LOGR_CUTOFF) # SD Sub-Criterion LOGR_CUTOFF = max(median(logr) - zcut*mad(logr), LOGR_CUTOFF ) # MAD Sub-Criterion - + if(mixture){ # Mixture SD Sub-Criterion is.multimodal = dip.test(logr)$p.value < dip_thresh @@ -132,22 +126,22 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, }else{ filtered_nreads = NA } - + ## ----- Primary Criterion 2) Ratio of reads aligned. ----- - + if(!is.null(ralign)){ criterion_count = criterion_count + 1 - + RALIGN_CUTOFF = hard_ralign - + if (!is.null(zcut)){ RALIGN_CUTOFF = max(mean(ralign) - zcut*sd(ralign), RALIGN_CUTOFF) # SD Sub-Criterion RALIGN_CUTOFF = max(median(ralign) - zcut*mad(ralign), RALIGN_CUTOFF) # MAD Sub-Criterion - + if(mixture){ # Mixture SD Sub-Criterion - + is.multimodal = dip.test(ralign)$p.value < dip_thresh - + if(is.multimodal){ capture.output(mixmdl <- normalmixEM(ralign,k=2)) mix_plot$ralign = mixmdl @@ -155,7 +149,7 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, RALIGN_CUTOFF = max(mixmdl$mu[component] - zcut*mixmdl$sigma[component], RALIGN_CUTOFF) } } - + if(!is.null(suff_ralign)){ RALIGN_CUTOFF = min(RALIGN_CUTOFF,suff_ralign) } @@ -164,25 +158,25 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, }else{ filtered_ralign = NA } - + ## ----- Primary Criterion 3) Transcriptome Breadth: Fraction of filtered genes detected. ----- - - if(!is.null(gene_filter)){ + + if(!is.null(gene_filter)){ criterion_count = criterion_count + 1 - + breadth = as.numeric(colMeans(expr[gene_filter,] > 0)) - + BREADTH_CUTOFF = hard_breadth - + if (!is.null(zcut)){ - + BREADTH_CUTOFF = max(mean(breadth) - zcut*sd(breadth), BREADTH_CUTOFF) # SD Sub-Criterion BREADTH_CUTOFF = max(median(breadth) - zcut*mad(breadth), BREADTH_CUTOFF) # MAD Sub-Criterion - + if(mixture){ # Mixture SD Sub-Criterion - + is.multimodal = dip.test(breadth)$p.value < dip_thresh - + if(is.multimodal){ capture.output(mixmdl <- normalmixEM(breadth,k=2)) mix_plot$breadth = mixmdl @@ -190,7 +184,7 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, BREADTH_CUTOFF = max(mixmdl$mu[component] - zcut*mixmdl$sigma[component], BREADTH_CUTOFF) } } - + if(!is.null(suff_breadth)){ BREADTH_CUTOFF = min(BREADTH_CUTOFF,suff_breadth) } @@ -201,10 +195,10 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, } ## ----- Primary Criterion 4) FNR AUC. ----- - - if(!is.null(pos_controls)){ + + if(!is.null(pos_controls)){ criterion_count = criterion_count + 1 - + # Normalize matrix for FNR estimation if(scale.){ nexpr = mean(colSums(expr))*t(t(expr)/colSums(expr)) @@ -214,8 +208,8 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, if(!is.null(glen)){ nexpr = mean(glen)*nexpr/glen } - - # Compute FNR AUC + + # Compute FNR AUC ref.glms = simple_FNR_params(expr = nexpr, pos_controls = pos_controls) AUC = NULL for (si in 1:dim(expr)[2]){ @@ -225,18 +219,18 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, stop("glm fit did not converge") } } - + AUC_CUTOFF = hard_fnr - + if (!is.null(zcut)){ - + AUC_CUTOFF = min(mean(AUC) + zcut*sd(AUC), AUC_CUTOFF) # SD Sub-Criterion AUC_CUTOFF = min(median(AUC) + zcut*mad(AUC), AUC_CUTOFF) # MAD Sub-Criterion - + if(mixture){ # Mixture SD Sub-Criterion - + is.multimodal = dip.test(AUC)$p.value < dip_thresh - + if(is.multimodal){ capture.output(mixmdl <- normalmixEM(AUC,k=2)) mix_plot$fnr = mixmdl @@ -244,7 +238,7 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, AUC_CUTOFF = min(mixmdl$mu[component] + zcut*mixmdl$sigma[component], AUC_CUTOFF) } } - + if(!is.null(suff_fnr)){ AUC_CUTOFF = max(AUC_CUTOFF,suff_fnr) } @@ -253,20 +247,17 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, }else{ filtered_fnr = NA } - + ## ----- Plotting ----- - - if (!is.null(plot_dir)){ - - - pdf(paste0(plot_dir,"/filtering_per_criterion.pdf")) - + + if (plot & criterion_count > 0) { + is_bad = rep(FALSE,dim(expr)[2]) - + par(mfcol = c(criterion_count,2)) - + if(!is.null(nreads)){ - is_bad = filtered_nreads + is_bad = filtered_nreads if(!is.null(mix_plot$nreads)){ nm_obj = mix_plot$nreads plot(nm_obj,2,main2 = paste0("nreads: Thresh = ",signif(LOGR_CUTOFF,3)," , Rm = ",sum(filtered_nreads)," , Tot_Rm = ",sum(is_bad)), @@ -277,7 +268,7 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, abline(v = log10(hard_nreads+1), col = "yellow", lty = 1) abline(v = LOGR_CUTOFF, col = "blue", lty = 2) } - + if(!is.null(ralign)){ is_bad = is_bad | filtered_ralign if(!is.null(mix_plot$ralign)){ @@ -290,7 +281,7 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, abline(v = hard_ralign, col = "yellow", lty = 1) abline(v = RALIGN_CUTOFF, col = "blue", lty = 2) } - + if(!is.null(gene_filter)){ is_bad = is_bad | filtered_breadth if(!is.null(mix_plot$breadth)){ @@ -303,7 +294,7 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, abline(v = hard_breadth, col = "yellow", lty = 1) abline(v = BREADTH_CUTOFF, col = "blue", lty = 2) } - + if(!is.null(pos_controls)){ is_bad = is_bad | filtered_fnr if(!is.null(mix_plot$fnr)){ @@ -316,7 +307,7 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, abline(v = hard_fnr, col = "yellow", lty = 1) abline(v = AUC_CUTOFF, col = "blue", lty = 2) } - + if(!is.null(nreads)){ hist(logr[!is_bad], main = paste0("nreads: Tot = ",sum(!is_bad)), xlab = "log10(NREADS+1)", breaks = hist_breaks) } @@ -329,18 +320,15 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, if(!is.null(pos_controls)){ hist(AUC[!is_bad], main = paste0("auc: Tot = ",sum(!is_bad)), xlab = "FNR AUC", breaks = hist_breaks) } - dev.off() - - pdf(paste0(plot_dir,"/overlap_of_criteria.pdf")) - v = rbind(filtered_nreads,filtered_ralign,filtered_breadth,filtered_fnr) - rownames(v) = c("nreads","ralign","breadth","fnr") - v = na.omit(v) - m = v %*% t(v) - m = t(t(m)/diag(m)) - barplot(m, beside = T,legend.text = T, ylim = c(0,1.5)) - dev.off() + + # v = rbind(filtered_nreads,filtered_ralign,filtered_breadth,filtered_fnr) + # rownames(v) = c("nreads","ralign","breadth","fnr") + # v = na.omit(v) + # m = v %*% t(v) + # + # barplot(m, beside = TRUE, legend.text = TRUE) } - + return(list(filtered_nreads = filtered_nreads, filtered_ralign = filtered_ralign, filtered_breadth = filtered_breadth, @@ -349,12 +337,12 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, #' factor-based sample filtering: function to filter single-cell RNA-Seq libraries. -#' -#' This function returns a sample-filtering report for each cell in the input expression matrix, +#' +#' This function returns a sample-filtering report for each cell in the input expression matrix, #' describing whether it passed filtering by factor-based filtering, using PCA of quality metrics. -#' +#' #' @details None -#' +#' #' @param expr matrix The data matrix (genes in rows, cells in columns). #' @param qual matrix Quality metric data matrix (cells in rows, metrics in columns). #' @param gene_filter A logical vector indexing genes that will be used for PCA. @@ -363,148 +351,136 @@ metric_sample_filter = function(expr, nreads = NULL, ralign = NULL, #' @param qual_select_q_thresh numeric. q-value threshold for quality/expression correlation significance tests. Default 0.01 #' @param force_metrics logical. If not NULL, indexes quality metric to be forcefully included in quality PCA. #' @param good_metrics logical. If not NULL, indexes quality metric that indicate better quality when of higher value. -#' @param min_qual_variance numeric. Minimum proportion of selected quality variance addressed in filtering. Default 0.70 +#' @param min_qual_variance numeric. Minimum proportion of selected quality variance addressed in filtering. Default 0.70 #' @param zcut A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1. #' @param mixture A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (quality score). -#' If true, a dip test will be applied to each quality score. If a metric is multimodal, it is fit to a two-component nomal mixture model. +#' If true, a dip test will be applied to each quality score. If a metric is multimodal, it is fit to a two-component nomal mixture model. #' Samples deviating zcut sd's from optimal mean (in the inferior direction), have failed this sub-criterion. #' @param dip_thresh A numeric value determining dip test p-value threshold. Default 0.05. -#' @param plot_dir If not null, specifies path to plot output -#' @param hist_breaks hist() breaks argument -#' +#' @param plot logical. Should a plot be produced? +#' @param hist_breaks hist() breaks argument. Ignored if `plot=FALSE`. +#' #' @return A logical, representing samples passing factor-based filter. -#' +#' #' @importFrom mixtools normalmixEM #' @importFrom diptest dip.test #' @import gplots #' @export factor_sample_filter = function(expr, qual, gene_filter = NULL, max_exp_pcs = 5, qual_select_q_thresh = 0.01, force_metrics = NULL, good_metrics = NULL, - min_qual_variance = 0.7, zcut = 1, + min_qual_variance = 0.7, zcut = 1, mixture = TRUE, dip_thresh = .01, - plot_dir = NULL, hist_breaks = 20){ - - # Create plot directory, if necessary - if (!is.null(plot_dir) && !file.exists(plot_dir)){ - dir.create( plot_dir) - } + plot = FALSE, hist_breaks = 20){ # Gene filter vector if(is.null(gene_filter)){ gene_filter = rep(TRUE,dim(expr)[1]) } - + ## ----- Expression PCs (based on high-quality genes) ----- pc = prcomp(t(log1p(expr[gene_filter,])), center = TRUE, scale = TRUE) - + ## ----- Quality PCs ----- cors = cor(pc$x[,1: max_exp_pcs],qual,method = "spearman") z = sqrt((dim(pc$x)[1]-3)/1.06)*(1/2)*log((1+cors)/(1-cors)) p = 2*pnorm(-abs(z)) to_keep = p.adjust(p,method = "BH") < qual_select_q_thresh to_keep_vec = colSums(matrix(to_keep,nrow = max_exp_pcs)) > 0 - + if(is.null(good_metrics)){ good_metrics = rep(FALSE,dim(qual)[2]) } - + # Introduce forced metrics if(!is.null(force_metrics)){ to_keep_vec = to_keep_vec | force_metrics } - + if (sum(to_keep_vec) == 0){ warning("No quality metrics were selected. All samples pass filter.") return(rep(TRUE,dim(expr)[2])) } - + keep_quals = qual[,to_keep_vec] - + qpc = prcomp(keep_quals,center = T,scale. = T) - - if(!is.null(plot_dir)){ - pdf(paste0(plot_dir,"/qual_csum_var.pdf")) + + if(plot) { csum = cumsum((qpc$sdev^2)/sum(qpc$sdev^2)) plot(csum, main = "Cumulative Quality PC Variance", ylab = "Fraction of Total Variance") abline(h = min_qual_variance, lty = 2, col = "red") - dev.off() } num_qual_pcs = which(csum > min_qual_variance)[1] - - if(!is.null(plot_dir)){ + + if(plot){ for (i in 1:num_qual_pcs){ - pdf(paste0(plot_dir,paste0("/qc_pc",i,".pdf"))) par(mfrow = c(2,1)) hist(qpc$x[,i],breaks = hist_breaks, main = paste0("Distribution of Quality PC ",i), xlab = paste0("Qual PC",i)) barplot(abs(qpc$rotation[,i]),col = c("red","green")[1 + (qpc$rotation[,i] > 0)], cex.names = .25,horiz = T, las=1, main = "Loadings") - dev.off() } } - - if(!is.null(plot_dir)){ - pdf(paste0(plot_dir,"/qual_corr_heatmap.pdf")) + + if(plot){ heatmap.2(cor(keep_quals),key.title = "",key.xlab = "Spearman Corr.",density.info = 'none',trace = 'none',margins = c(20,20), cexRow = .7, cexCol = .7) - dev.off() } - + # Only perform filtering when zcut is not null if (!is.null(zcut)){ - + # Initializing sample removal vector to_remove = rep(F,dim(expr)[2]) - + # Check if "good" metrics have been selected -> if filtering is signed is_signed = sum(to_keep_vec & good_metrics) > 0 - + if(!is_signed){ warning("No good metrics were selected. Unsigned filtering applied.") } - + # Loop over quality PCs until we've covered min_qual_variance of the quality variance for ( i in 1:num_qual_pcs){ - + # Quality Score - qscore = qpc$x[,i] - + qscore = qpc$x[,i] + # Signed Quality Score if (is_signed){ qscore = qscore * median(sign(qpc$rotation[,i][good_metrics[to_keep_vec]])) } - + # Simple thresholds CUTOFF = median(qscore) - zcut*mad(qscore) CUTOFF = max(mean(qscore) - zcut*sd(qscore), CUTOFF) - + # Reverse thresholds if(!is_signed){ RCUTOFF = median(qscore) + zcut*mad(qscore) RCUTOFF = min(mean(qscore) + zcut*sd(qscore), RCUTOFF) } - + # Mixture model threshold (Signed Only!) - if(mixture && is_signed){ - - is.multimodal = dip.test(qscore)$p.value < dip_thresh - + if(mixture && is_signed){ + + is.multimodal = dip.test(qscore)$p.value < dip_thresh + if(is.multimodal){ mixmdl = normalmixEM(qscore,k=2) component = which(mixmdl$mu %in% max(mixmdl$mu)) CUTOFF = max(mixmdl$mu[component] - zcut*mixmdl$sigma[component], CUTOFF) - + } } - if(!is.null(plot_dir)){ - - pdf(paste0(plot_dir,paste0("/qc_filter_pc",i,".pdf")),width = 20) + if(plot){ + par(mfrow = c(1,2)) hist(qscore,breaks = hist_breaks,main = paste0("Distribution of Quality Score ",i), xlab = paste0("Qual Score ",i)) abline(v = CUTOFF, lty = 2, col = "red") if(!is_signed){ abline(v = RCUTOFF, lty = 2, col = "red") } - + if(i == num_qual_pcs){ # Exceeds minimum quality of variance - last filter - + if(is_signed){ to_remove = to_remove | (qscore < CUTOFF) hist(qpc$x[,i][!(qscore < CUTOFF)],breaks = hist_breaks, main = paste0("Thresh = ",signif(CUTOFF,3),", Rm = ",sum(qscore < CUTOFF),", Tot Rm = ",sum(to_remove) ),xlab = paste0("Qual Score ",i)) @@ -512,10 +488,9 @@ factor_sample_filter = function(expr, qual, gene_filter = NULL, max_exp_pcs = 5, to_remove = to_remove | ((qscore < CUTOFF) | (qscore > RCUTOFF)) hist(qpc$x[,i][!((qscore < CUTOFF) | (qscore > RCUTOFF))],breaks = hist_breaks, main = paste0("Threshs = ",signif(CUTOFF,3),"/",signif(RCUTOFF,3),", Rm = ",sum((qscore < CUTOFF) | (qscore > RCUTOFF)),", Tot Rm = ",sum(to_remove) ),xlab = paste0("Qual Score ",i)) } - - dev.off() + break() - + }else{ if(is_signed){ to_remove = to_remove | (qscore < CUTOFF) @@ -524,12 +499,11 @@ factor_sample_filter = function(expr, qual, gene_filter = NULL, max_exp_pcs = 5, to_remove = to_remove | ((qscore < CUTOFF) | (qscore > RCUTOFF)) hist(qpc$x[,i][!((qscore < CUTOFF) | (qscore > RCUTOFF))],breaks = hist_breaks, main = paste0("Threshs = ",signif(CUTOFF,3),"/",signif(RCUTOFF,3),", Rm = ",sum((qscore < CUTOFF) | (qscore > RCUTOFF)) ),xlab = paste0("Qual Score ",i)) } - dev.off() } } - + } - + return(!to_remove) }else{ warning("No z-cutoff specified, thus no filtering results returned.") diff --git a/inst/NEWS b/inst/NEWS index 957c48a..d0fc6f6 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -6,6 +6,7 @@ Changes in version 0.0.4 * Updated clusterExperiment dependency * Added RLE measures to scone evaluation * Added FQT_FN to implement careful ties handling by FQ +* Better handling of plots in sample filter functions Changes in version 0.0.3 ======================== diff --git a/man/factor_sample_filter.Rd b/man/factor_sample_filter.Rd index ee7a7ca..12d1054 100644 --- a/man/factor_sample_filter.Rd +++ b/man/factor_sample_filter.Rd @@ -7,7 +7,7 @@ factor_sample_filter(expr, qual, gene_filter = NULL, max_exp_pcs = 5, qual_select_q_thresh = 0.01, force_metrics = NULL, good_metrics = NULL, min_qual_variance = 0.7, zcut = 1, mixture = TRUE, dip_thresh = 0.01, - plot_dir = NULL, hist_breaks = 20) + plot = FALSE, hist_breaks = 20) } \arguments{ \item{expr}{matrix The data matrix (genes in rows, cells in columns).} @@ -30,20 +30,20 @@ If NULL, all genes are used.} \item{zcut}{A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1.} \item{mixture}{A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (quality score). -If true, a dip test will be applied to each quality score. If a metric is multimodal, it is fit to a two-component nomal mixture model. +If true, a dip test will be applied to each quality score. If a metric is multimodal, it is fit to a two-component nomal mixture model. Samples deviating zcut sd's from optimal mean (in the inferior direction), have failed this sub-criterion.} \item{dip_thresh}{A numeric value determining dip test p-value threshold. Default 0.05.} -\item{plot_dir}{If not null, specifies path to plot output} +\item{plot}{logical. Should a plot be produced?} -\item{hist_breaks}{hist() breaks argument} +\item{hist_breaks}{hist() breaks argument. Ignored if `plot=FALSE`.} } \value{ A logical, representing samples passing factor-based filter. } \description{ -This function returns a sample-filtering report for each cell in the input expression matrix, +This function returns a sample-filtering report for each cell in the input expression matrix, describing whether it passed filtering by factor-based filtering, using PCA of quality metrics. } \details{ diff --git a/man/metric_sample_filter.Rd b/man/metric_sample_filter.Rd index f1ab936..7dcbc53 100644 --- a/man/metric_sample_filter.Rd +++ b/man/metric_sample_filter.Rd @@ -4,19 +4,17 @@ \alias{metric_sample_filter} \title{metric-based sample filtering: function to filter single-cell RNA-Seq libraries.} \usage{ -metric_sample_filter(expr, nreads = NULL, ralign = NULL, +metric_sample_filter(expr, nreads = colSums(expr), ralign = NULL, gene_filter = NULL, pos_controls = NULL, scale. = FALSE, glen = NULL, AUC_range = c(0, 6), zcut = 1, mixture = TRUE, dip_thresh = 0.05, hard_nreads = 25000, hard_ralign = 15, hard_breadth = 0.2, hard_fnr = 3, suff_nreads = NULL, suff_ralign = 65, - suff_breadth = 0.8, suff_fnr = NULL, plot_dir = NULL, - hist_breaks = 10) + suff_breadth = 0.8, suff_fnr = NULL, plot = FALSE, hist_breaks = 10) } \arguments{ \item{expr}{matrix The data matrix (genes in rows, cells in columns).} -\item{nreads}{A numeric vector representing number of reads in each library. -If NULL, filtered_nreads will be returned NA.} +\item{nreads}{A numeric vector representing number of reads in each library. Default to `colSums` of `expr`.} \item{ralign}{A numeric vector representing the proportion of reads aligned to the reference genome in each library. If NULL, filtered_ralign will be returned NA.} @@ -37,7 +35,7 @@ If NULL, filtered_fnr will be returned NA.} If NULL, only hard threshold sub-criteria will be applied.} \item{mixture}{A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (metric). -If true, a dip test will be applied to each metric. If a metric is multimodal, it is fit to a two-component nomal mixture model. +If true, a dip test will be applied to each metric. If a metric is multimodal, it is fit to a two-component nomal mixture model. Samples deviating zcut sd's from optimal mean (in the inferior direction), have failed this sub-criterion.} \item{dip_thresh}{A numeric value determining dip test p-value threshold. Default 0.05.} @@ -58,9 +56,9 @@ Samples deviating zcut sd's from optimal mean (in the inferior direction), have \item{suff_fnr}{numeric. If not null, serves as an lower bound on fnr threshold.} -\item{plot_dir}{If not null, specifies path to plot output} +\item{plot}{logical. Should a plot be produced?} -\item{hist_breaks}{hist() breaks argument} +\item{hist_breaks}{hist() breaks argument. Ignored if `plot=FALSE`.} } \value{ A list with the following elements: @@ -72,12 +70,12 @@ A list with the following elements: } } \description{ -This function returns a sample-filtering report for each cell in the input expression matrix, +This function returns a sample-filtering report for each cell in the input expression matrix, describing which filtering criteria are satisfied. } \details{ -For each primary criterion (metric), a sample is evaluated based on 4 sub-criteria: -1) Hard (encoded) threshold +For each primary criterion (metric), a sample is evaluated based on 4 sub-criteria: +1) Hard (encoded) threshold 2) Adaptive thresholding via sd's from the mean 3) Adaptive thresholding via mad's from the median 4) Adaptive thresholding via sd's from the mean (after mixture modeling) From 0b4b019ba6da67867500488dba333121627035b7 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 8 Jun 2016 11:14:15 -0700 Subject: [PATCH 14/18] Close #42 --- DESCRIPTION | 2 +- R/scone_main.R | 32 +++++++++++++------------------- inst/NEWS | 1 + man/scone.Rd | 8 ++++---- tests/testthat/test_scone.R | 10 +++++----- 5 files changed, 24 insertions(+), 29 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0eafb03..df5f2cd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.3-9004 +Version: 0.0.3-9005 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", diff --git a/R/scone_main.R b/R/scone_main.R index c4b5528..ff38215 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -57,17 +57,17 @@ #' @return A list with the following elements: #' \itemize{ #' \item{normalized_data}{ A list containing the normalized data matrix, log-scaled. NULL when evaluate = TRUE.} -#' \item{evaluation}{ A matrix containing raw evaluation metrics for each normalization method. Rows are sorted in the same order as in the ranks output matrix. NULL when evaluate = FALSE.} -#' \item{ranks}{ A matrix containing rank-scores for each normalization, including median rank across all scores. Rows are sorted by increasing median rank. NULL when evaluate = FALSE.} +#' \item{metrics}{ A matrix containing raw evaluation metrics for each normalization method (see \code{\link{score_matrix}} for details). Rows are sorted in the same order as in the scores output matrix. NULL when evaluate = FALSE.} +#' \item{scores}{ A matrix containing scores for each normalization, including average score. Rows are sorted by increasing mean score. NULL when evaluate = FALSE.} #' \item{params}{ A data.frame with each row corresponding to a set of normalization parameters.} #' } #' @return If \code{run=FALSE} a \code{data.frame} #' with each row corresponding to a set of normalization parameters. #' -#' @details Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to rank-score: +#' @details Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to scores: #' Positive-signature metrics increase with improving performance, including BIO_SIL,PAM_SIL, EXP_WV_COR, PAM_STAB, and VAR_PRES. #' Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. -#' Rank-scores are computed so that higer-performing methods are assigned a lower-rank. +#' Scores are computed so that higer-performing methods are assigned a higher scores. #' scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"), @@ -349,25 +349,19 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, names(evaluation) <- apply(params, 1, paste, collapse=',') evaluation <- simplify2array(evaluation) - ev_for_ranks <- evaluation * c(-1, 1, -1, 1, 1, 1, -1, -1, 1, 1) - ranks <- apply(ev_for_ranks[apply(evaluation, 1, function(x) !all(is.na(x))),, drop=FALSE], 1, rank) - if(NCOL(ranks) > 1) { - med_rank <- rowMedians(ranks) - ranks <- cbind(ranks, med_rank)[order(med_rank),] - } else { - med_rank <- median(ranks) - ranks <- t(data.frame(c(ranks, med_rank=med_rank))) - rownames(ranks) <- apply(params, 1, paste, collapse=',') - } + scores <- evaluation * c(1, -1, 1, -1, -1, -1, 1, 1, -1, -1) + + mean_score <- colMeans(scores, na.rm=TRUE) + scores <- cbind(t(scores), mean_score)[order(mean_score, decreasing = TRUE),] - evaluation <- t(evaluation[,order(med_rank), drop=FALSE]) - adjusted <- adjusted[order(med_rank)] - params <- params[order(med_rank),] + evaluation <- t(evaluation[,order(mean_score, decreasing = TRUE), drop=FALSE]) + adjusted <- adjusted[order(mean_score, decreasing = TRUE)] + params <- params[order(mean_score, decreasing = TRUE),] } else { - evaluation <- ranks <- NULL + evaluation <- scores <- NULL } if(verbose) message("Done!") - return(list(normalized_data=adjusted, evaluation=evaluation, ranks=ranks, params=params)) + return(list(normalized_data=adjusted, metrics=evaluation, scores=scores, params=params)) } diff --git a/inst/NEWS b/inst/NEWS index d0fc6f6..b870407 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -7,6 +7,7 @@ Changes in version 0.0.4 * Added RLE measures to scone evaluation * Added FQT_FN to implement careful ties handling by FQ * Better handling of plots in sample filter functions +* Mean score rather than median rank to evaluate normalizations Changes in version 0.0.3 ======================== diff --git a/man/scone.Rd b/man/scone.Rd index 2e5b8e9..3f2754b 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -75,8 +75,8 @@ back into the algorithm (see example).} A list with the following elements: \itemize{ \item{normalized_data}{ A list containing the normalized data matrix, log-scaled. NULL when evaluate = TRUE.} -\item{evaluation}{ A matrix containing raw evaluation metrics for each normalization method. Rows are sorted in the same order as in the ranks output matrix. NULL when evaluate = FALSE.} -\item{ranks}{ A matrix containing rank-scores for each normalization, including median rank across all scores. Rows are sorted by increasing median rank. NULL when evaluate = FALSE.} +\item{metrics}{ A matrix containing raw evaluation metrics for each normalization method (see \code{\link{score_matrix}} for details). Rows are sorted in the same order as in the scores output matrix. NULL when evaluate = FALSE.} +\item{scores}{ A matrix containing scores for each normalization, including average score. Rows are sorted by increasing mean score. NULL when evaluate = FALSE.} \item{params}{ A data.frame with each row corresponding to a set of normalization parameters.} } @@ -99,9 +99,9 @@ non-imputed data in the comparison, the identity function must be included. If both \code{run=FALSE} the normalization and evaluation are not run, but the function returns a matrix of parameters that will be run for inspection by the user. -Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to rank-score: +Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to scores: Positive-signature metrics increase with improving performance, including BIO_SIL,PAM_SIL, EXP_WV_COR, PAM_STAB, and VAR_PRES. Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. -Rank-scores are computed so that higer-performing methods are assigned a lower-rank. +Scores are computed so that higer-performing methods are assigned a higher scores. } diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R index e14cbd5..48a69a0 100644 --- a/tests/testthat/test_scone.R +++ b/tests/testthat/test_scone.R @@ -161,9 +161,9 @@ test_that("Test imputation and scaling", { eval_poscon=as.character(21:30), eval_kclust = 2, verbose=TRUE)) - expect_equal(rownames(res$evaluation), rownames(res$ranks)) - expect_equal(rownames(res$evaluation), rownames(res$params)) - expect_equal(rownames(res$evaluation), names(res$normalized_data)) + expect_equal(rownames(res$metrics), rownames(res$scores)) + expect_equal(rownames(res$metrics), rownames(res$params)) + expect_equal(rownames(res$metrics), names(res$normalized_data)) res2 <- scone(e, imputation=list(none=identity, zinb=impute_zinb), scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN), @@ -176,8 +176,8 @@ test_that("Test imputation and scaling", { norm_ordered <- res2$normalized_data[names(res$normalized_data)] expect_equal(norm_ordered, res$normalized_data) - expect_null(res2$evaluation) - expect_null(res2$ranks) + expect_null(res2$metrics) + expect_null(res2$scores) }) test_that("scone works with only one normalization",{ From 9a32a80f2df678837966090b265a69a2db2d2e80 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 8 Jun 2016 11:25:40 -0700 Subject: [PATCH 15/18] Close #29 --- R/scone_main.R | 2 +- inst/NEWS | 1 + man/scone.Rd | 14 +++++++------- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/R/scone_main.R b/R/scone_main.R index ff38215..ff51458 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -69,7 +69,7 @@ #' Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. #' Scores are computed so that higer-performing methods are assigned a higher scores. #' -scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, +scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"), bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,eval_proj_args = NULL, eval_kclust=2:10, eval_negcon=NULL, eval_poscon=NULL, diff --git a/inst/NEWS b/inst/NEWS index b870407..c2ffdfc 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -8,6 +8,7 @@ Changes in version 0.0.4 * Added FQT_FN to implement careful ties handling by FQ * Better handling of plots in sample filter functions * Mean score rather than median rank to evaluate normalizations +* Default value for imputation Changes in version 0.0.3 ======================== diff --git a/man/scone.Rd b/man/scone.Rd index 3f2754b..2338dd5 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -4,13 +4,13 @@ \alias{scone} \title{scone main wrapper: function to apply and evaluate all the normalization schemes} \usage{ -scone(expr, imputation, scaling, k_ruv = 5, k_qc = 5, ruv_negcon = NULL, - qc = NULL, adjust_bio = c("no", "yes", "force"), adjust_batch = c("no", - "yes", "force"), bio = NULL, batch = NULL, run = TRUE, - evaluate = TRUE, eval_pcs = 3, eval_proj = NULL, - eval_proj_args = NULL, eval_kclust = 2:10, eval_negcon = NULL, - eval_poscon = NULL, params = NULL, verbose = FALSE, - conditional_pam = FALSE) +scone(expr, imputation = list(none = identity), scaling, k_ruv = 5, + k_qc = 5, ruv_negcon = NULL, qc = NULL, adjust_bio = c("no", "yes", + "force"), adjust_batch = c("no", "yes", "force"), bio = NULL, + batch = NULL, run = TRUE, evaluate = TRUE, eval_pcs = 3, + eval_proj = NULL, eval_proj_args = NULL, eval_kclust = 2:10, + eval_negcon = NULL, eval_poscon = NULL, params = NULL, + verbose = FALSE, conditional_pam = FALSE) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} From 2af67a43d8de80841ea6a1ffc0412bf8fd6e7d55 Mon Sep 17 00:00:00 2001 From: mbcole Date: Wed, 8 Jun 2016 14:32:39 -0700 Subject: [PATCH 16/18] Multiple changes. See inst/NEWS --- DESCRIPTION | 3 +- NAMESPACE | 1 - R/sample_filtering.R | 4 +- R/scone_eval.R | 114 +++++++++++++++++------------------- R/scone_main.R | 38 ++++++------ inst/NEWS | 4 +- man/factor_sample_filter.Rd | 2 +- man/metric_sample_filter.Rd | 2 +- man/scone.Rd | 18 +++--- man/score_matrix.Rd | 57 +++++++++--------- 10 files changed, 121 insertions(+), 122 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index df5f2cd..5d0ff7e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.3-9005 +Version: 0.0.3-9006 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", @@ -15,7 +15,6 @@ Depends: Imports: BiocParallel, cluster, - clusterExperiment, DESeq, EDASeq, MASS, diff --git a/NAMESPACE b/NAMESPACE index 34e18fa..0496d89 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,7 +28,6 @@ importFrom(RUVSeq,RUVg) importFrom(aroma.light,normalizeQuantileRank.matrix) importFrom(class,knn) importFrom(cluster,silhouette) -importFrom(clusterExperiment,subsampleClustering) importFrom(diptest,dip.test) importFrom(edgeR,calcNormFactors) importFrom(fpc,pamk) diff --git a/R/sample_filtering.R b/R/sample_filtering.R index 828bc81..515ec82 100644 --- a/R/sample_filtering.R +++ b/R/sample_filtering.R @@ -55,7 +55,7 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){ #' @param zcut A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1. #' If NULL, only hard threshold sub-criteria will be applied. #' @param mixture A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (metric). -#' If true, a dip test will be applied to each metric. If a metric is multimodal, it is fit to a two-component nomal mixture model. +#' If true, a dip test will be applied to each metric. If a metric is multimodal, it is fit to a two-component normal mixture model. #' Samples deviating zcut sd's from optimal mean (in the inferior direction), have failed this sub-criterion. #' @param dip_thresh A numeric value determining dip test p-value threshold. Default 0.05. #' @param hard_nreads numeric. Hard (lower bound on) nreads threshold. Default 25000. @@ -354,7 +354,7 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL, #' @param min_qual_variance numeric. Minimum proportion of selected quality variance addressed in filtering. Default 0.70 #' @param zcut A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1. #' @param mixture A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (quality score). -#' If true, a dip test will be applied to each quality score. If a metric is multimodal, it is fit to a two-component nomal mixture model. +#' If true, a dip test will be applied to each quality score. If a metric is multimodal, it is fit to a two-component normal mixture model. #' Samples deviating zcut sd's from optimal mean (in the inferior direction), have failed this sub-criterion. #' @param dip_thresh A numeric value determining dip test p-value threshold. Default 0.05. #' @param plot logical. Should a plot be produced? diff --git a/R/scone_eval.R b/R/scone_eval.R index f9c4f39..90f7225 100644 --- a/R/scone_eval.R +++ b/R/scone_eval.R @@ -1,39 +1,41 @@ #' scone evaluation: function to evaluate one normalization scheme #' #' This function evaluates an expression matrix using SCONE criteria, producing a number of scores based on -#' projections of the normalized data. -#' -#' @details None. +#' projections of the normalized data, correlations, and RLE metrics. #' +#' @details The eval_proj function argument must have 2 inputs: +#' \itemize{ +#' \item{e}{ matrix. log-transformed expression (genes in rows, cells in columns).} +#' \item{eval_proj_args}{ list. additional function arguments, e.g. prior data weights.} +#' } +#' and it must output a matrix representation of the original data (cells in rows, factors in columns). +#' #' @param expr matrix. The data matrix (genes in rows, cells in columns). #' @param eval_pcs numeric. The number of principal components to use for evaluation. #' Ignored if !is.null(eval_proj). -#' @param eval_proj function. Projection function for evaluation (Inputs: e = genes in rows, cells in columns. eval_proj_args. Output: cells in rows, factors in columns). +#' @param eval_proj function. Projection function for evaluation (see details). #' If NULL, PCA is used for projection -#' @param eval_proj_args list. List of args passed to projection function as eval_proj_args. -#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. -#' If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA. -#' @param bio factor. The biological condition (variation to be preserved), NA is allowed. -#' If NULL, condition ASW will be returned NA. -#' @param batch factor. The known batch variable (variation to be removed), NA is allowed. -#' If NULL, batch ASW will be returned NA. +#' @param eval_proj_args list. List of arguments passed to projection function as eval_proj_args (see details). +#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam tightness (PAM_SIL) evaluation. +#' If an array of integers, largest average silhouette width (tightness) will be reported in PAM_SIL. If NULL, PAM_SIL will be returned NA. +#' @param bio factor. A known biological condition (variation to be preserved), NA is allowed. +#' If NULL, condition ASW, BIO_SIL, will be returned NA. +#' @param batch factor. A known batch variable (variation to be removed), NA is allowed. +#' If NULL, batch ASW, BATCH_SIL, will be returned NA. #' @param qc_factors Factors of unwanted variation derived from quality metrics. -#' If NULL, qc correlations will be returned NA. -#' @param ruv_factors Factors of unwanted variation derived from negative control genes (adjustable set). -#' If NULL, ruv correlations will be returned NA. -#' @param uv_factors Factors of unwanted variation derived from negative control genes (un-adjustable set). -#' If NULL, uv correlations will be returned NA. -#' @param wv_factors Factors of wanted variation derived from positive control genes (un-adjustable set). -#' If NULL, wv correlations will be returned NA. -#' @param is_log logical. If TRUE the expr matrix is already logged and log transformation will not be carried out. -#' @param conditional_pam logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), -#' and a weighted average is returned. +#' If NULL, qc correlations, EXP_QC_COR, will be returned NA. +#' @param uv_factors Factors of unwanted variation derived from negative control genes (evaluation set). +#' If NULL, uv correlations, EXP_UV_COR, will be returned NA. +#' @param wv_factors Factors of wanted variation derived from positive control genes (evaluation set). +#' If NULL, wv correlations, EXP_WV_COR, will be returned NA. +#' @param is_log logical. If TRUE the expr matrix is already logged and log transformation will not be carried out prior to projection. +#' @param conditional_pam logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological condition (including NA), +#' and a weighted average silhouette width is returned. #' @param ref_expr matrix. A reference (log-) expression data matrix for calculating preserved variance (genes in rows, cells in columns). #' If NULL, preserved variance is returned NA. #' #' @importFrom class knn #' @importFrom fpc pamk -#' @importFrom clusterExperiment subsampleClustering #' @importFrom cluster silhouette #' @importFrom matrixStats rowMedians colMedians colIQRs #' @@ -41,27 +43,24 @@ #' #' @return A list with the following elements: #' \itemize{ -#' \item{BIO_SIL}{ Average silhoutte width by biological condition.} -#' \item{BATCH_SIL}{ Average silhoutte width by batch condition.} -#' \item{PAM_SIL}{ Maximum average silhoutte width from pam clustering.} +#' \item{BIO_SIL}{ Average silhouette width by biological condition.} +#' \item{BATCH_SIL}{ Average silhouette width by batch condition.} +#' \item{PAM_SIL}{ Maximum average silhouette width from pam clustering (see conditional_pam argument).} #' \item{EXP_QC_COR}{ Maximum squared spearman correlation between pcs and quality factors.} -#' \item{EXP_RUV_COR}{ Maximum squared spearman correlation between pcs and active uv factors.} #' \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.} #' \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.} -#' \item{VAR_PRES}{ Variance preserved measure.} +#' \item{VAR_PRES}{ Mean correlation between matched genes in normalized and reference matrices.} #' \item{RLE_MED}{ The mean squared median Relative Log Expression (RLE).} #' \item{RLE_IQR}{ The mean inter-quartile range (IQR) of the RLE.} #' } #' -score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL, - eval_proj_args = NULL, - eval_kclust = NULL, - bio = NULL, batch = NULL, - qc_factors = NULL, - ruv_factors = NULL, uv_factors = NULL, - wv_factors = NULL, is_log=FALSE, - conditional_pam = FALSE , ref_expr = NULL){ +score_matrix <- function(expr, eval_pcs = 3, + eval_proj = NULL, eval_proj_args = NULL, + eval_kclust = NULL, + bio = NULL, batch = NULL, + qc_factors = NULL,uv_factors = NULL, wv_factors = NULL, + is_log=FALSE, conditional_pam = FALSE , ref_expr = NULL){ if(any(is.na(expr) | is.infinite(expr) | is.nan(expr))){ stop("NA/Inf/NaN Expression Values.") @@ -72,7 +71,7 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL, } if(is.null(eval_proj)){ - proj = prcomp(t(expr),center = TRUE,scale. = TRUE)$x[,1:eval_pcs] + proj = svd(scale(t(expr),center = TRUE,scale = TRUE),eval_pcs,0)$u }else{ proj = eval_proj(expr,eval_proj_args = eval_proj_args) eval_pcs = ncol(proj) @@ -102,12 +101,11 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL, BATCH_SIL <- NA } - ## ------ PAM (Conditional) Tightness and Stability ----- + ## ------ PAM Tightness ----- if ( !is.null(eval_kclust) ){ - - # Tightness - + + # "Conditional" PAM if(conditional_pam){ PAM_SIL = 0 @@ -135,8 +133,10 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL, stop("Number of clusters 'k' must be smaller than unclassified bio set size") } } - + PAM_SIL = PAM_SIL/length(bio) + + # Traditional PAM }else{ pamk_object = pamk(proj,krange = eval_kclust) PAM_SIL = pamk_object$pamobject$silinfo$avg.width @@ -146,7 +146,7 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL, PAM_SIL = NA } - ## ------ Hidden Factors ----- + ## ------ Correlation with Factors ----- # Max cor with quality factors. if(!is.null(qc_factors)){ @@ -155,13 +155,6 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL, EXP_QC_COR = NA } - # Max cor with RUV factors. - if(!is.null(ruv_factors)){ - EXP_RUV_COR = max(cor(proj,ruv_factors,method = "spearman")^2) - }else{ - EXP_RUV_COR = NA - } - # Max cor with UV factors. if(!is.null(uv_factors)){ EXP_UV_COR = max(cor(proj,uv_factors,method = "spearman")^2) @@ -178,24 +171,25 @@ score_matrix <- function(expr, eval_pcs = 3, eval_proj = NULL, ## ----- Variance Preserved if(!is.null(ref_expr)){ - z1 = scale(t(ref_expr),scale = FALSE) - z1 = t(t(z1)/sqrt(colSums(z1^2))) - z2 = scale(t(expr),scale = FALSE) - z2 = t(t(z2)/sqrt(colSums(z2^2))) - VAR_PRES = mean(diag(t(z1) %*% (z2))) + + split_ref_expr = split(t(ref_expr), rep(1:nrow(ref_expr), each = ncol(ref_expr))) + split_expr = split(t(expr), rep(1:nrow(expr), each = ncol(expr))) + VAR_PRES = mean(mapply(function(x,y) cor(x,y),split_ref_expr,split_expr)) + }else{ VAR_PRES = NA } - ## ----- RLE + ## ----- RLE Measures rle <- expr - rowMedians(expr) RLE_MED <- mean(colMedians(rle)^2) RLE_IQR <- mean(colIQRs(rle)) - scores = c(BIO_SIL, BATCH_SIL, PAM_SIL, EXP_QC_COR, EXP_RUV_COR, EXP_UV_COR, - EXP_WV_COR, VAR_PRES, RLE_MED, RLE_IQR) - names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL", "EXP_QC_COR", "EXP_RUV_COR", - "EXP_UV_COR", "EXP_WV_COR","VAR_PRES", "RLE_MED", "RLE_IQR") + scores = c(BIO_SIL, BATCH_SIL, PAM_SIL, + EXP_QC_COR, EXP_UV_COR, EXP_WV_COR, + VAR_PRES, RLE_MED, RLE_IQR) + names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL", + "EXP_QC_COR", "EXP_UV_COR", "EXP_WV_COR", + "VAR_PRES", "RLE_MED", "RLE_IQR") return(scores) - } diff --git a/R/scone_main.R b/R/scone_main.R index ff51458..985f1a5 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -27,14 +27,14 @@ #' @param batch factor. The known batch variable to be included in the adjustment model (variation to be removed). If adjust_batch="no", it will not be used for normalization, but only for evaluation. #' @param evaluate logical. If FALSE the normalization methods will not be evaluated (faster). #' @param eval_pcs numeric. The number of principal components to use for evaluation. Ignored if evaluation=FALSE. -#' @param eval_proj function. Projection function for evaluation (Inputs: e = genes in rows, cells in columns. eval_proj_args. Output: cells in rows, factors in columns). +#' @param eval_proj function. Projection function for evaluation (see \code{\link{score_matrix}} for details). #' If NULL, PCA is used for projection -#' @param eval_proj_args list. List of args passed to projection function as eval_proj_args. -#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. -#' If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA. +#' @param eval_proj_args list. List passed to the eval_proj function as eval_proj_args. +#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam tightness evaluation. +#' If an array of integers, largest average silhouette width (tightness) will be reported. If NULL, tightness will be returned NA. #' @param eval_negcon character. The genes to be used as negative controls for evaluation. These genes should -#' be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. -#' If NULL, correlations with negative controls will be returned NA. +#' be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. +#' Default is ruv_negcon argument. If NULL, correlations with negative controls will be returned NA. #' @param eval_poscon character. The genes to be used as positive controls for evaluation. These genes should #' be expected to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. #' If NULL, correlations with positive controls will be returned NA. @@ -52,7 +52,7 @@ #' @import BiocParallel #' @export #' -#' @details If both \code{run=FALSE} the normalization and evaluation are not run, but the function returns a matrix of parameters that will be run for inspection by the user. +#' @details If \code{run=FALSE} the normalization and evaluation are not run, but the function returns a matrix of parameters that will be run for inspection by the user. #' #' @return A list with the following elements: #' \itemize{ @@ -65,16 +65,21 @@ #' with each row corresponding to a set of normalization parameters. #' #' @details Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to scores: -#' Positive-signature metrics increase with improving performance, including BIO_SIL,PAM_SIL, EXP_WV_COR, PAM_STAB, and VAR_PRES. -#' Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. +#' Positive-signature metrics increase with improving performance, including BIO_SIL, PAM_SIL, EXP_WV_COR, and VAR_PRES. +#' Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_UV_COR, RLE_MED, and RLE_IQR. #' Scores are computed so that higer-performing methods are assigned a higher scores. #' + scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"), bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,eval_proj_args = NULL, eval_kclust=2:10, eval_negcon=NULL, eval_poscon=NULL, params=NULL, verbose=FALSE, conditional_pam = FALSE) { + if(is.null(eval_negcon) && !is.null(ruv_negcon)) { + eval_negcon = ruv_negcon + } + if(!is.matrix(expr)) { stop("'expr' must be a matrix.") } else if(is.null(rownames(expr))) { @@ -302,14 +307,11 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 } if(evaluate) { + if(verbose) message("Computing factors for evaluation...") ## generate factors - uv_factors <- wv_factors <- ruv_factors_raw <- NULL - - if(!is.null(ruv_negcon)) { - ruv_factors_raw <- prcomp(t(log1p(expr[ruv_negcon,])), scale=TRUE, center=TRUE)$x - } + uv_factors <- wv_factors <- NULL if(!is.null(eval_negcon)) { uv_factors <- prcomp(t(log1p(expr[eval_negcon,])), scale=TRUE, center=TRUE)$x @@ -318,6 +320,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 if(!is.null(eval_poscon)) { wv_factors <- prcomp(t(log1p(expr[eval_poscon,])), scale=TRUE, center=TRUE)$x } + } if(verbose) message("Factor adjustment and evaluation...") @@ -331,8 +334,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 if(evaluate) { score <- score_matrix(expr=adjusted, eval_pcs = eval_pcs, eval_proj = eval_proj, eval_proj_args = eval_proj_args, eval_kclust = eval_kclust, bio = bio, batch = batch, - qc_factors = qc_pcs, ruv_factors = ruv_factors_raw, - uv_factors = uv_factors, wv_factors = wv_factors, + qc_factors = qc_pcs, uv_factors = uv_factors, wv_factors = wv_factors, is_log = TRUE, conditional_pam = conditional_pam, ref_expr = log1p(expr)) } else { @@ -349,7 +351,9 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 names(evaluation) <- apply(params, 1, paste, collapse=',') evaluation <- simplify2array(evaluation) - scores <- evaluation * c(1, -1, 1, -1, -1, -1, 1, 1, -1, -1) + scores <- evaluation * c(1, -1, 1, # "BIO_SIL", "BATCH_SIL", "PAM_SIL" + -1, -1, 1, # "EXP_QC_COR", "EXP_UV_COR", "EXP_WV_COR" + 1, -1, -1) # "VAR_PRES", "RLE_MED", "RLE_IQR" mean_score <- colMeans(scores, na.rm=TRUE) scores <- cbind(t(scores), mean_score)[order(mean_score, decreasing = TRUE),] diff --git a/inst/NEWS b/inst/NEWS index c2ffdfc..8a98fb9 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -3,12 +3,14 @@ Changes in version 0.0.4 * Fixed a few bugs and documentation mismatches * Removed stability evaluation (redundant with sil width and slow) -* Updated clusterExperiment dependency +* Removed clusterExperiment dependency +* Removed RUV correlation score. UV correlation now takes ruv control genes as default * Added RLE measures to scone evaluation * Added FQT_FN to implement careful ties handling by FQ * Better handling of plots in sample filter functions * Mean score rather than median rank to evaluate normalizations * Default value for imputation +* Minor optimizations to evaluations Changes in version 0.0.3 ======================== diff --git a/man/factor_sample_filter.Rd b/man/factor_sample_filter.Rd index 12d1054..cbe93b9 100644 --- a/man/factor_sample_filter.Rd +++ b/man/factor_sample_filter.Rd @@ -30,7 +30,7 @@ If NULL, all genes are used.} \item{zcut}{A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1.} \item{mixture}{A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (quality score). -If true, a dip test will be applied to each quality score. If a metric is multimodal, it is fit to a two-component nomal mixture model. +If true, a dip test will be applied to each quality score. If a metric is multimodal, it is fit to a two-component normal mixture model. Samples deviating zcut sd's from optimal mean (in the inferior direction), have failed this sub-criterion.} \item{dip_thresh}{A numeric value determining dip test p-value threshold. Default 0.05.} diff --git a/man/metric_sample_filter.Rd b/man/metric_sample_filter.Rd index 7dcbc53..6a00bc2 100644 --- a/man/metric_sample_filter.Rd +++ b/man/metric_sample_filter.Rd @@ -35,7 +35,7 @@ If NULL, filtered_fnr will be returned NA.} If NULL, only hard threshold sub-criteria will be applied.} \item{mixture}{A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (metric). -If true, a dip test will be applied to each metric. If a metric is multimodal, it is fit to a two-component nomal mixture model. +If true, a dip test will be applied to each metric. If a metric is multimodal, it is fit to a two-component normal mixture model. Samples deviating zcut sd's from optimal mean (in the inferior direction), have failed this sub-criterion.} \item{dip_thresh}{A numeric value determining dip test p-value threshold. Default 0.05.} diff --git a/man/scone.Rd b/man/scone.Rd index 2338dd5..807db9f 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -46,17 +46,17 @@ of parameters that will be run for inspection by the user.} \item{eval_pcs}{numeric. The number of principal components to use for evaluation. Ignored if evaluation=FALSE.} -\item{eval_proj}{function. Projection function for evaluation (Inputs: e = genes in rows, cells in columns. eval_proj_args. Output: cells in rows, factors in columns). +\item{eval_proj}{function. Projection function for evaluation (see \code{\link{score_matrix}} for details). If NULL, PCA is used for projection} -\item{eval_proj_args}{list. List of args passed to projection function as eval_proj_args.} +\item{eval_proj_args}{list. List passed to the eval_proj function as eval_proj_args.} -\item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. -If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA.} +\item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam tightness evaluation. +If an array of integers, largest average silhouette width (tightness) will be reported. If NULL, tightness will be returned NA.} \item{eval_negcon}{character. The genes to be used as negative controls for evaluation. These genes should -be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. -If NULL, correlations with negative controls will be returned NA.} +be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. +Default is ruv_negcon argument. If NULL, correlations with negative controls will be returned NA.} \item{eval_poscon}{character. The genes to be used as positive controls for evaluation. These genes should be expected to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. @@ -96,12 +96,12 @@ must be included in the scaling list. Analogously, if one wants to avoid the imp non-imputed data in the comparison, the identity function must be included. -If both \code{run=FALSE} the normalization and evaluation are not run, but the function returns a matrix of parameters that will be run for inspection by the user. +If \code{run=FALSE} the normalization and evaluation are not run, but the function returns a matrix of parameters that will be run for inspection by the user. Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to scores: -Positive-signature metrics increase with improving performance, including BIO_SIL,PAM_SIL, EXP_WV_COR, PAM_STAB, and VAR_PRES. -Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. +Positive-signature metrics increase with improving performance, including BIO_SIL, PAM_SIL, EXP_WV_COR, and VAR_PRES. +Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_UV_COR, RLE_MED, and RLE_IQR. Scores are computed so that higer-performing methods are assigned a higher scores. } diff --git a/man/score_matrix.Rd b/man/score_matrix.Rd index 37c8a1d..d707920 100644 --- a/man/score_matrix.Rd +++ b/man/score_matrix.Rd @@ -6,8 +6,8 @@ \usage{ score_matrix(expr, eval_pcs = 3, eval_proj = NULL, eval_proj_args = NULL, eval_kclust = NULL, bio = NULL, batch = NULL, qc_factors = NULL, - ruv_factors = NULL, uv_factors = NULL, wv_factors = NULL, - is_log = FALSE, conditional_pam = FALSE, ref_expr = NULL) + uv_factors = NULL, wv_factors = NULL, is_log = FALSE, + conditional_pam = FALSE, ref_expr = NULL) } \arguments{ \item{expr}{matrix. The data matrix (genes in rows, cells in columns).} @@ -15,36 +15,33 @@ score_matrix(expr, eval_pcs = 3, eval_proj = NULL, eval_proj_args = NULL, \item{eval_pcs}{numeric. The number of principal components to use for evaluation. Ignored if !is.null(eval_proj).} -\item{eval_proj}{function. Projection function for evaluation (Inputs: e = genes in rows, cells in columns. eval_proj_args. Output: cells in rows, factors in columns). +\item{eval_proj}{function. Projection function for evaluation (see details). If NULL, PCA is used for projection} -\item{eval_proj_args}{list. List of args passed to projection function as eval_proj_args.} +\item{eval_proj_args}{list. List of arguments passed to projection function as eval_proj_args (see details).} -\item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation. -If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA.} +\item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam tightness (PAM_SIL) evaluation. +If an array of integers, largest average silhouette width (tightness) will be reported in PAM_SIL. If NULL, PAM_SIL will be returned NA.} -\item{bio}{factor. The biological condition (variation to be preserved), NA is allowed. -If NULL, condition ASW will be returned NA.} +\item{bio}{factor. A known biological condition (variation to be preserved), NA is allowed. +If NULL, condition ASW, BIO_SIL, will be returned NA.} -\item{batch}{factor. The known batch variable (variation to be removed), NA is allowed. -If NULL, batch ASW will be returned NA.} +\item{batch}{factor. A known batch variable (variation to be removed), NA is allowed. +If NULL, batch ASW, BATCH_SIL, will be returned NA.} \item{qc_factors}{Factors of unwanted variation derived from quality metrics. -If NULL, qc correlations will be returned NA.} +If NULL, qc correlations, EXP_QC_COR, will be returned NA.} -\item{ruv_factors}{Factors of unwanted variation derived from negative control genes (adjustable set). -If NULL, ruv correlations will be returned NA.} +\item{uv_factors}{Factors of unwanted variation derived from negative control genes (evaluation set). +If NULL, uv correlations, EXP_UV_COR, will be returned NA.} -\item{uv_factors}{Factors of unwanted variation derived from negative control genes (un-adjustable set). -If NULL, uv correlations will be returned NA.} +\item{wv_factors}{Factors of wanted variation derived from positive control genes (evaluation set). +If NULL, wv correlations, EXP_WV_COR, will be returned NA.} -\item{wv_factors}{Factors of wanted variation derived from positive control genes (un-adjustable set). -If NULL, wv correlations will be returned NA.} +\item{is_log}{logical. If TRUE the expr matrix is already logged and log transformation will not be carried out prior to projection.} -\item{is_log}{logical. If TRUE the expr matrix is already logged and log transformation will not be carried out.} - -\item{conditional_pam}{logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), -and a weighted average is returned.} +\item{conditional_pam}{logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological condition (including NA), +and a weighted average silhouette width is returned.} \item{ref_expr}{matrix. A reference (log-) expression data matrix for calculating preserved variance (genes in rows, cells in columns). If NULL, preserved variance is returned NA.} @@ -52,23 +49,27 @@ If NULL, preserved variance is returned NA.} \value{ A list with the following elements: \itemize{ -\item{BIO_SIL}{ Average silhoutte width by biological condition.} -\item{BATCH_SIL}{ Average silhoutte width by batch condition.} -\item{PAM_SIL}{ Maximum average silhoutte width from pam clustering.} +\item{BIO_SIL}{ Average silhouette width by biological condition.} +\item{BATCH_SIL}{ Average silhouette width by batch condition.} +\item{PAM_SIL}{ Maximum average silhouette width from pam clustering (see conditional_pam argument).} \item{EXP_QC_COR}{ Maximum squared spearman correlation between pcs and quality factors.} -\item{EXP_RUV_COR}{ Maximum squared spearman correlation between pcs and active uv factors.} \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.} \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.} -\item{VAR_PRES}{ Variance preserved measure.} +\item{VAR_PRES}{ Mean correlation between matched genes in normalized and reference matrices.} \item{RLE_MED}{ The mean squared median Relative Log Expression (RLE).} \item{RLE_IQR}{ The mean inter-quartile range (IQR) of the RLE.} } } \description{ This function evaluates an expression matrix using SCONE criteria, producing a number of scores based on -projections of the normalized data. +projections of the normalized data, correlations, and RLE metrics. } \details{ -None. +The eval_proj function argument must have 2 inputs: +\itemize{ +\item{e}{ matrix. log-transformed expression (genes in rows, cells in columns).} +\item{eval_proj_args}{ list. additional function arguments, e.g. prior data weights.} +} +and it must output a matrix representation of the original data (cells in rows, factors in columns). } From ebe8f3da39a0a8d652ba4b1e60a8f7fbc791e255 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 8 Jun 2016 16:24:11 -0700 Subject: [PATCH 17/18] Minor change --- R/scone_main.R | 12 ++++-------- man/scone.Rd | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/R/scone_main.R b/R/scone_main.R index 985f1a5..15ae6f5 100644 --- a/R/scone_main.R +++ b/R/scone_main.R @@ -33,7 +33,7 @@ #' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam tightness evaluation. #' If an array of integers, largest average silhouette width (tightness) will be reported. If NULL, tightness will be returned NA. #' @param eval_negcon character. The genes to be used as negative controls for evaluation. These genes should -#' be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. +#' be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. #' Default is ruv_negcon argument. If NULL, correlations with negative controls will be returned NA. #' @param eval_poscon character. The genes to be used as positive controls for evaluation. These genes should #' be expected to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. @@ -73,13 +73,9 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"), bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,eval_proj_args = NULL, - eval_kclust=2:10, eval_negcon=NULL, eval_poscon=NULL, + eval_kclust=2:10, eval_negcon=ruv_negcon, eval_poscon=NULL, params=NULL, verbose=FALSE, conditional_pam = FALSE) { - if(is.null(eval_negcon) && !is.null(ruv_negcon)) { - eval_negcon = ruv_negcon - } - if(!is.matrix(expr)) { stop("'expr' must be a matrix.") } else if(is.null(rownames(expr))) { @@ -307,7 +303,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 } if(evaluate) { - + if(verbose) message("Computing factors for evaluation...") ## generate factors @@ -320,7 +316,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5 if(!is.null(eval_poscon)) { wv_factors <- prcomp(t(log1p(expr[eval_poscon,])), scale=TRUE, center=TRUE)$x } - + } if(verbose) message("Factor adjustment and evaluation...") diff --git a/man/scone.Rd b/man/scone.Rd index 807db9f..a0355ce 100644 --- a/man/scone.Rd +++ b/man/scone.Rd @@ -9,7 +9,7 @@ scone(expr, imputation = list(none = identity), scaling, k_ruv = 5, "force"), adjust_batch = c("no", "yes", "force"), bio = NULL, batch = NULL, run = TRUE, evaluate = TRUE, eval_pcs = 3, eval_proj = NULL, eval_proj_args = NULL, eval_kclust = 2:10, - eval_negcon = NULL, eval_poscon = NULL, params = NULL, + eval_negcon = ruv_negcon, eval_poscon = NULL, params = NULL, verbose = FALSE, conditional_pam = FALSE) } \arguments{ @@ -55,7 +55,7 @@ If NULL, PCA is used for projection} If an array of integers, largest average silhouette width (tightness) will be reported. If NULL, tightness will be returned NA.} \item{eval_negcon}{character. The genes to be used as negative controls for evaluation. These genes should -be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. +be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE. Default is ruv_negcon argument. If NULL, correlations with negative controls will be returned NA.} \item{eval_poscon}{character. The genes to be used as positive controls for evaluation. These genes should From 4ed47737cd519978a9cc8f6db930d37c1172a965 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 8 Jun 2016 16:33:06 -0700 Subject: [PATCH 18/18] Version bumped --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5d0ff7e..0bf2649 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: scone -Version: 0.0.3-9006 +Version: 0.0.4 Title: Single Cell Overview of Normalized Expression data Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets. Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",