From 4f8e761837ee5e98ddd97e11da16fd2a9e23e6eb Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Tue, 7 Jun 2016 11:03:42 -0700
Subject: [PATCH 01/27] Added option to not return normalized data

---
 R/scone_main.R | 181 +++++++++++++++++++++++++++++++------------------
 1 file changed, 116 insertions(+), 65 deletions(-)

diff --git a/R/scone_main.R b/R/scone_main.R
index c4b5528..9aaeaa9 100644
--- a/R/scone_main.R
+++ b/R/scone_main.R
@@ -1,79 +1,116 @@
-#' scone main wrapper: function to apply and evaluate all the normalization schemes
+#' scone main wrapper: function to apply and evaluate all the normalization
+#' schemes
 #'
-#' This function is a high-level wrapper to apply and evaluate a variety of normalization
-#' schemes to a specified expression matrix.
+#' This function is a high-level wrapper to apply and evaluate a variety of
+#' normalization schemes to a specified expression matrix.
 #'
-#' The function consists of four main steps: imputation, scaling normalization, adjusting for batch effects,
-#' and evaluation of the normalization performance.
+#' The function consists of four main steps: imputation, scaling normalization,
+#' adjusting for batch effects, and evaluation of the normalization performance.
 #'
-#' @details Note that if one wants to include the unnormalized data in the comparison, the identity function
-#' must be included in the scaling list. Analogously, if one wants to avoid the imputation and/or include the
-#' non-imputed data in the comparison, the identity function must be included.
+#' @details Note that if one wants to include the unnormalized data in the
+#'   comparison, the identity function must be included in the scaling list.
+#'   Analogously, if one wants to avoid the imputation and/or include the
+#'   non-imputed data in the comparison, the identity function must be included.
 #'
 #' @param expr matrix. The data matrix (genes in rows, cells in columns).
-#' @param imputation list or function. (A list of) function(s) to be used for imputation.
-#' @param scaling list or function. (A list of) function(s) to be used for scaling normalization.
-#' @param k_ruv numeric. The maximum number of factors of unwanted variation (the function will adjust for 1 to k_ruv factors of unwanted variation).
-#' If 0, RUV will not be performed.
-#' @param k_qc numeric. The maximum number of quality metrics PCs (the function will adjust for 1 to k_qc PCs).
-#' If 0, QC adjustment will not be performed.
-#' @param ruv_negcon character. The genes to be used as negative controls for RUV. Ignored if k_ruv=0.
-#' @param qc matrix. The QC metrics to be used for QC adjustment. Ignored if k_qc=0.
-#' @param adjust_bio character. If 'no' it will not be included in the model; if 'yes', both models with and without 'bio' will be run;
-#' if 'force', only models with 'bio' will be run.
-#' @param adjust_batch character. If 'no' it will not be included in the model; if 'yes', both models with and without 'batch' will be run;
-#' if 'force', only models with 'batch' will be run.
-#' @param bio factor. The biological condition to be included in the adjustment model (variation to be preserved). If adjust_bio="no", it will not be used for normalization, but only for evaluation.
-#' @param batch factor. The known batch variable to be included in the adjustment model (variation to be removed). If adjust_batch="no", it will not be used for normalization, but only for evaluation.
-#' @param evaluate logical. If FALSE the normalization methods will not be evaluated (faster).
-#' @param eval_pcs numeric. The number of principal components to use for evaluation. Ignored if evaluation=FALSE.
-#' @param eval_proj function. Projection function for evaluation (Inputs: e = genes in rows, cells in columns. eval_proj_args. Output: cells in rows, factors in columns).
-#' If NULL, PCA is used for projection
-#' @param eval_proj_args list. List of args passed to projection function as eval_proj_args.
-#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam tightness and stability evaluation.
-#' If an array of integers, largest average silhoutte width (tightness) / maximum co-clustering stability will be reported. If NULL, tightness and stability will be returned NA.
-#' @param eval_negcon character. The genes to be used as negative controls for evaluation. These genes should
-#' be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE.
-#' If NULL, correlations with negative controls will be returned NA.
-#' @param eval_poscon character. The genes to be used as positive controls for evaluation. These genes should
-#' be expected to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE.
-#' If NULL, correlations with positive controls will be returned NA.
-#' @param params matrix or data.frame. If given, the algorithm will bypass creating the matrix of possible
-#' parameters, and will use the given matrix. There are basically no checks as to whether this matrix is in the
-#' right format, and is only intended to be used to feed the results of setting run=FALSE
-#' back into the algorithm (see example).
+#' @param imputation list or function. (A list of) function(s) to be used for
+#'   imputation.
+#' @param scaling list or function. (A list of) function(s) to be used for
+#'   scaling normalization.
+#' @param k_ruv numeric. The maximum number of factors of unwanted variation
+#'   (the function will adjust for 1 to k_ruv factors of unwanted variation). If
+#'   0, RUV will not be performed.
+#' @param k_qc numeric. The maximum number of quality metrics PCs (the function
+#'   will adjust for 1 to k_qc PCs). If 0, QC adjustment will not be performed.
+#' @param ruv_negcon character. The genes to be used as negative controls for
+#'   RUV. Ignored if k_ruv=0.
+#' @param qc matrix. The QC metrics to be used for QC adjustment. Ignored if
+#'   k_qc=0.
+#' @param adjust_bio character. If 'no' it will not be included in the model; if
+#'   'yes', both models with and without 'bio' will be run; if 'force', only
+#'   models with 'bio' will be run.
+#' @param adjust_batch character. If 'no' it will not be included in the model;
+#'   if 'yes', both models with and without 'batch' will be run; if 'force',
+#'   only models with 'batch' will be run.
+#' @param bio factor. The biological condition to be included in the adjustment
+#'   model (variation to be preserved). If adjust_bio="no", it will not be used
+#'   for normalization, but only for evaluation.
+#' @param batch factor. The known batch variable to be included in the
+#'   adjustment model (variation to be removed). If adjust_batch="no", it will
+#'   not be used for normalization, but only for evaluation.
+#' @param evaluate logical. If FALSE the normalization methods will not be
+#'   evaluated (faster).
+#' @param eval_pcs numeric. The number of principal components to use for
+#'   evaluation. Ignored if evaluation=FALSE.
+#' @param eval_proj function. Projection function for evaluation (Inputs: e =
+#'   genes in rows, cells in columns. eval_proj_args. Output: cells in rows,
+#'   factors in columns). If NULL, PCA is used for projection
+#' @param eval_proj_args list. List of args passed to projection function as
+#'   eval_proj_args.
+#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam
+#'   tightness and stability evaluation. If an array of integers, largest
+#'   average silhoutte width (tightness) / maximum co-clustering stability will
+#'   be reported. If NULL, tightness and stability will be returned NA.
+#' @param eval_negcon character. The genes to be used as negative controls for
+#'   evaluation. These genes should be expected not to change according to the
+#'   biological phenomenon of interest. Ignored if evaluation=FALSE. If NULL,
+#'   correlations with negative controls will be returned NA.
+#' @param eval_poscon character. The genes to be used as positive controls for
+#'   evaluation. These genes should be expected to change according to the
+#'   biological phenomenon of interest. Ignored if evaluation=FALSE. If NULL,
+#'   correlations with positive controls will be returned NA.
+#' @param params matrix or data.frame. If given, the algorithm will bypass
+#'   creating the matrix of possible parameters, and will use the given matrix.
+#'   There are basically no checks as to whether this matrix is in the right
+#'   format, and is only intended to be used to feed the results of setting
+#'   run=FALSE back into the algorithm (see example).
 #' @param verbose logical. If TRUE some messagges are printed.
-#' @param conditional_pam logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), and a weighted average is returned as PAM_SIL.
-#' @param run logical. If FALSE the normalization and evaluation are not run, but the function returns a data.frame
-#' of parameters that will be run for inspection by the user.
+#' @param conditional_pam logical. If TRUE then maximum ASW is separately
+#'   computed for each biological condition (including NA), and a weighted
+#'   average is returned as PAM_SIL.
+#' @param run logical. If FALSE the normalization and evaluation are not run,
+#'   but the function returns a data.frame of parameters that will be run for
+#'   inspection by the user.
+#' @param return_norm logical. If FALSE the normalization will not be returned,
+#'   but only evaluate. This will create a much smaller object and may be useful
+#'   for large datasets and/or when many combinations are compared.
 #'
 #' @importFrom RUVSeq RUVg
 #' @importFrom matrixStats rowMedians
 #' @import BiocParallel
 #' @export
 #'
-#' @details If both \code{run=FALSE} the normalization and evaluation are not run, but the function returns a matrix of parameters that will be run for inspection by the user.
+#' @details If both \code{run=FALSE} the normalization and evaluation are not
+#'   run, but the function returns a matrix of parameters that will be run for
+#'   inspection by the user.
 #'
-#' @return A list with the following elements:
-#' \itemize{
-#' \item{normalized_data}{ A list containing the normalized data matrix, log-scaled. NULL when evaluate = TRUE.}
-#' \item{evaluation}{ A matrix containing raw evaluation metrics for each normalization method. Rows are sorted in the same order as in the ranks output matrix. NULL when evaluate = FALSE.}
-#' \item{ranks}{ A matrix containing rank-scores for each normalization, including median rank across all scores. Rows are sorted by increasing median rank. NULL when evaluate = FALSE.}
-#' \item{params}{ A data.frame with each row corresponding to a set of normalization parameters.}
-#' }
-#' @return If \code{run=FALSE} a \code{data.frame}
-#' with each row corresponding to a set of normalization parameters.
+#' @return A list with the following elements: \itemize{ \item{normalized_data}{
+#'   A list containing the normalized data matrix, log-scaled. NULL when
+#'   evaluate = TRUE.} \item{evaluation}{ A matrix containing raw evaluation
+#'   metrics for each normalization method. Rows are sorted in the same order as
+#'   in the ranks output matrix. NULL when evaluate = FALSE.} \item{ranks}{ A
+#'   matrix containing rank-scores for each normalization, including median rank
+#'   across all scores. Rows are sorted by increasing median rank. NULL when
+#'   evaluate = FALSE.} \item{params}{ A data.frame with each row corresponding
+#'   to a set of normalization parameters.} }
+#' @return If \code{run=FALSE} a \code{data.frame} with each row corresponding
+#'   to a set of normalization parameters.
 #'
-#' @details Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to rank-score:
-#' Positive-signature metrics increase with improving performance, including BIO_SIL,PAM_SIL, EXP_WV_COR, PAM_STAB, and VAR_PRES.
-#' Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR.
-#' Rank-scores are computed so that higer-performing methods are assigned a lower-rank.
+#' @details Evaluation metrics are defined in \code{\link[scone]{score_matrix}}.
+#'   Each metric is assigned a signature for conversion to rank-score:
+#'   Positive-signature metrics increase with improving performance, including
+#'   BIO_SIL,PAM_SIL, EXP_WV_COR, PAM_STAB, and VAR_PRES. Negative-signature
+#'   metrics decrease with improving performance, including BATCH_SIL,
+#'   EXP_QC_COR, EXP_RUV_COR, and EXP_UV_COR. Rank-scores are computed so that
+#'   higer-performing methods are assigned a lower-rank.
 #'
-scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL,
-                  qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"),
-                  bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,eval_proj_args = NULL,
-                  eval_kclust=2:10, eval_negcon=NULL, eval_poscon=NULL,
-                  params=NULL, verbose=FALSE, conditional_pam = FALSE) {
+scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5,
+                  ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"),
+                  adjust_batch=c("no", "yes", "force"), bio=NULL, batch=NULL,
+                  run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,
+                  eval_proj_args = NULL, eval_kclust=2:10, eval_negcon=NULL,
+                  eval_poscon=NULL, params=NULL, verbose=FALSE,
+                  conditional_pam = FALSE, return_norm = TRUE) {
 
   if(!is.matrix(expr)) {
     stop("'expr' must be a matrix.")
@@ -338,11 +375,20 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL,
     } else {
       score <- NULL
     }
-    return(list(score=score, adjusted=adjusted))
+
+    if(return_norm) {
+      return(list(score=score, adjusted=adjusted))
+    } else {
+      return(list(score=score))
+    }
   })
 
-  adjusted <- lapply(outlist, function(x) x$adjusted)
-  names(adjusted) <- apply(params, 1, paste, collapse=',')
+  if(return_norm) {
+    adjusted <- lapply(outlist, function(x) x$adjusted)
+    names(adjusted) <- apply(params, 1, paste, collapse=',')
+  } else {
+    adjusted <- NULL
+  }
 
   if(evaluate) {
     evaluation <- lapply(outlist, function(x) x$score)
@@ -361,8 +407,13 @@ scone <- function(expr, imputation, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL,
     }
 
     evaluation <- t(evaluation[,order(med_rank), drop=FALSE])
-    adjusted <- adjusted[order(med_rank)]
+
+    if(return_norm) {
+      adjusted <- adjusted[order(med_rank)]
+    }
+
     params <- params[order(med_rank),]
+
   } else {
     evaluation <- ranks <- NULL
   }

From 11f0195d549d448bfc03e21f40297a3dde3ce115 Mon Sep 17 00:00:00 2001
From: mbcole <mbeloc@gmail.com>
Date: Tue, 14 Jun 2016 17:13:19 -0700
Subject: [PATCH 02/27] Develop 0.0.5-9000

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 4338105..f6f8ee5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: scone
-Version: 0.0.5
+Version: 0.0.5-9000
 Title: Single Cell Overview of Normalized Expression data
 Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets.
 Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",

From 5debdfc6927a1178187e6625696512f56133ba57 Mon Sep 17 00:00:00 2001
From: John Blischak <jdblischak@gmail.com>
Date: Sat, 25 Jun 2016 16:57:46 -0700
Subject: [PATCH 03/27] Restore original par settings upon exiting function.

---
 R/sample_filtering.R | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/sample_filtering.R b/R/sample_filtering.R
index 483d597..f19a265 100644
--- a/R/sample_filtering.R
+++ b/R/sample_filtering.R
@@ -254,7 +254,8 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
 
       is_bad = rep(FALSE,dim(expr)[2])
 
-      par(mfcol = c(criterion_count,2))
+      op <- par(mfcol = c(criterion_count,2))
+      on.exit(par(op))
 
       if(!is.null(nreads)){
         is_bad = filtered_nreads
@@ -417,8 +418,9 @@ factor_sample_filter = function(expr, qual, gene_filter = NULL, max_exp_pcs = 5,
   num_qual_pcs = which(csum > min_qual_variance)[1]
 
   if(plot){
+    op <- par(mfrow = c(2,1))
+    on.exit(par(op))
     for (i in 1:num_qual_pcs){
-      par(mfrow = c(2,1))
       hist(qpc$x[,i],breaks = hist_breaks, main = paste0("Distribution of Quality PC ",i), xlab = paste0("Qual PC",i))
       barplot(abs(qpc$rotation[,i]),col = c("red","green")[1 + (qpc$rotation[,i] > 0)], cex.names = .25,horiz = T, las=1, main = "Loadings")
     }

From 6ffeb74bf91948c9825e1d947df9a4f78a6a24a0 Mon Sep 17 00:00:00 2001
From: mbcole <mbeloc@gmail.com>
Date: Thu, 7 Jul 2016 13:27:02 -0700
Subject: [PATCH 04/27] Replaced conditional pam with new stratified pam +
 updated NEWS.

---
 R/scone_eval.R              | 45 +++++++++++++++++------------------
 R/scone_main.R              | 18 +++++++-------
 inst/NEWS                   | 47 +++++++++++++++++++++----------------
 man/scone.Rd                |  4 ++--
 man/score_matrix.Rd         |  6 ++---
 tests/testthat/test_scone.R | 18 +++++++-------
 6 files changed, 72 insertions(+), 66 deletions(-)

diff --git a/R/scone_eval.R b/R/scone_eval.R
index 646a07a..448957f 100644
--- a/R/scone_eval.R
+++ b/R/scone_eval.R
@@ -29,7 +29,7 @@
 #' @param wv_factors Factors of wanted variation derived from positive control genes (evaluation set).
 #' If NULL, wv correlations, EXP_WV_COR, will be returned NA.
 #' @param is_log logical. If TRUE the expr matrix is already logged and log transformation will not be carried out prior to projection.
-#' @param conditional_pam logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological condition (including NA),
+#' @param stratified_pam logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological-cross-batch condition (accepting NAs),
 #' and a weighted average silhouette width is returned.
 #'
 #' @importFrom class knn
@@ -43,7 +43,7 @@
 #' \itemize{
 #' \item{BIO_SIL}{ Average silhouette width by biological condition.}
 #' \item{BATCH_SIL}{ Average silhouette width by batch condition.}
-#' \item{PAM_SIL}{ Maximum average silhouette width from pam clustering (see conditional_pam argument).}
+#' \item{PAM_SIL}{ Maximum average silhouette width from pam clustering (see stratified_pam argument).}
 #' \item{EXP_QC_COR}{ Maximum squared spearman correlation between pcs and quality factors.}
 #' \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.}
 #' \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.}
@@ -57,7 +57,7 @@ score_matrix <- function(expr, eval_pcs = 3,
                          eval_kclust = NULL,
                          bio = NULL, batch = NULL,
                          qc_factors = NULL,uv_factors = NULL, wv_factors = NULL, 
-                         is_log=FALSE, conditional_pam = FALSE){
+                         is_log=FALSE, stratified_pam = FALSE){
 
   if(any(is.na(expr) | is.infinite(expr) | is.nan(expr))){
     stop("NA/Inf/NaN Expression Values.")
@@ -102,36 +102,35 @@ score_matrix <- function(expr, eval_pcs = 3,
 
   if ( !is.null(eval_kclust) ){
     
-    # "Conditional" PAM
-    if(conditional_pam){
+    # "Stratified" PAM
+    if(stratified_pam){
 
+      biobatch = as.factor(paste(bio,batch,sep = "_"))
       PAM_SIL = 0
 
-      # Max Average Sil Width per Biological Condition
-      for (cond in levels(bio)){
-        is_cond = which(bio == cond)
+      # Max Average Sil Width per bio-cross-batch Condition
+      for (cond in levels(biobatch)){
+        is_cond = which(biobatch == cond)
         cond_w = length(is_cond)
         if(cond_w > max(eval_kclust)){
           pamk_object = pamk(proj[is_cond,],krange = eval_kclust)
+          
+          # Despite krange excluding nc = 1, if asw is negative, nc = 1 will be selected
+          if(is.null(pamk_object$pamobject$silinfo$avg.width) ){
+            if (!1 %in% eval_kclust) {
+              pamk_object$pamobject$silinfo$avg.width = max(pamk_object$crit[-1])
+            }else{
+              stop("nc = 1 was selected by Duda-Hart, exclude 1 from eval_kclust.")
+            }
+          }
+  
           PAM_SIL = PAM_SIL + cond_w*pamk_object$pamobject$silinfo$avg.width
         }else{
-          stop("Number of clusters 'k' must be smaller than bio class size")
+          stop(paste("Number of clusters 'k' must be smaller than bio-cross-batch stratum size:",
+                paste(levels(biobatch),table(biobatch), sep = " = ",collapse = ", ")))
         }
       }
-
-      # Max Average Sil Width for Unclassified Condition
-      is_na = is.na(bio)
-      if (any(is_na)){
-        cond_w = sum(is_na)
-        if(cond_w > max(eval_kclust)){
-          pamk_object = pamk(proj[is_na,],krange = eval_kclust)
-          PAM_SIL = PAM_SIL + cond_w*pamk_object$pamobject$silinfo$avg.width
-        }else{
-          stop("Number of clusters 'k' must be smaller than unclassified bio set size")
-        }
-      }
-      
-      PAM_SIL = PAM_SIL/length(bio)
+      PAM_SIL = PAM_SIL/length(biobatch)
       
     # Traditional PAM
     }else{
diff --git a/R/scone_main.R b/R/scone_main.R
index e4f3496..f84f497 100644
--- a/R/scone_main.R
+++ b/R/scone_main.R
@@ -43,7 +43,7 @@
 #' right format, and is only intended to be used to feed the results of setting run=FALSE
 #' back into the algorithm (see example).
 #' @param verbose logical. If TRUE some messagges are printed.
-#' @param conditional_pam logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), and a weighted average is returned as PAM_SIL.
+#' @param stratified_pam logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological-cross-batch condition (accepting NAs), and a weighted average is returned as PAM_SIL.
 #' @param run logical. If FALSE the normalization and evaluation are not run, but the function returns a data.frame
 #' of parameters that will be run for inspection by the user.
 #'
@@ -77,7 +77,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5
                   qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"),
                   bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,eval_proj_args = NULL,
                   eval_kclust=2:10, eval_negcon=ruv_negcon, eval_poscon=NULL,
-                  params=NULL, verbose=FALSE, conditional_pam = FALSE) {
+                  params=NULL, verbose=FALSE, stratified_pam = FALSE) {
 
   if(!is.matrix(expr)) {
     stop("'expr' must be a matrix.")
@@ -203,11 +203,13 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5
       stop("'eval_kclust' must be less than the number of samples.")
     }
 
-    if(conditional_pam) {
-      if(is.null(bio)) {
-        stop("If `bio` is null, `conditional_pam` cannot be TRUE.")
-      } else if(max(eval_kclust) >= min(table(bio))) {
-        stop("For conditional_pam, max 'eval_kclust' must be smaller than min bio class size")
+    if(stratified_pam) {
+      if(is.null(bio) & is.null(batch)){
+        stop("For stratified_pam, bio and/or batch must be specified")
+      }
+      biobatch = as.factor(paste(bio,batch,sep = "_"))
+      if(max(eval_kclust) >= min(table(biobatch))) {
+        stop("For stratified_pam, max 'eval_kclust' must be smaller than bio-cross-batch stratum size")
       }
     }
   }
@@ -334,7 +336,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5
       score <- score_matrix(expr=adjusted, eval_pcs = eval_pcs, eval_proj = eval_proj, eval_proj_args = eval_proj_args,
                             eval_kclust = eval_kclust, bio = bio, batch = batch,
                             qc_factors = qc_pcs, uv_factors = uv_factors, wv_factors = wv_factors,
-                            is_log = TRUE, conditional_pam = conditional_pam)
+                            is_log = TRUE, stratified_pam = stratified_pam)
     } else {
       score <- NULL
     }
diff --git a/inst/NEWS b/inst/NEWS
index 57761c3..109aedf 100644
--- a/inst/NEWS
+++ b/inst/NEWS
@@ -1,33 +1,40 @@
+Changes in version 0.0.6
+========================
+
+* Fixed bug when using plot functionality of filtering functions.
+* "Conditional" pam replaced with "Stratified" pam, clustering each bio-cross-batch condition separately, rather than simply each bio condition.
+
 Changes in version 0.0.5
 ========================
+
 * Modified biplot to handle general coloring schemes.
 * Limit number of WV and UV factors to eval_pcs in computing WV and UV scores.
-* Updated depencencies.
+* Updated dependencies.
 * Added error-handling to sample filter.
-* Removed var preserved measure do to length of running time.
+* Removed “var preserved” measure do to length of running time.
 
 Changes in version 0.0.4
 ========================
 
-* Fixed a few bugs and documentation mismatches
-* Removed stability evaluation (redundant with sil width and slow)
-* Removed clusterExperiment dependency
-* Removed RUV correlation score. UV correlation now takes ruv control genes as default
-* Added RLE measures to scone evaluation
-* Added FQT_FN to implement careful ties handling by FQ
-* Better handling of plots in sample filter functions
-* Mean score rather than median rank to evaluate normalizations
-* Default value for imputation
-* Minor optimizations to evaluations
+* Fixed a few bugs and documentation mismatches.
+* Removed stability evaluation (redundant with sil width and slow).
+* Removed clusterExperiment dependency.
+* Removed RUV correlation score. UV correlation now takes ruv control genes as default.
+* Added RLE measures to scone evaluation.
+* Added FQT_FN to implement careful ties handling by FQ.
+* Better handling of plots in sample filter functions.
+* Mean score rather than median rank to evaluate normalizations.
+* Default value for imputation.
+* Minor optimizations to evaluation functions.
 
 Changes in version 0.0.3
 ========================
 
-* Fixed various bugs
-* Added compactness measure for stability evaluation
-* Compute RUV factors only when needed
-* Fixed Github issues #11, #12, #13, #14, #21, #28
-* zinb now works for non-integer whole numbers
-* Updated tests
-* Added documentation for datasets
-* Added biplot_colored function
+* Fixed various bugs.
+* Added “compactness” measure for stability evaluation.
+* Compute RUV factors only when needed.
+* Fixed Github issues #11, #12, #13, #14, #21, #28.
+* zinb now works for non-integer whole numbers.
+* Updated tests.
+* Added documentation for datasets.
+* Added biplot_colored function.
diff --git a/man/scone.Rd b/man/scone.Rd
index cefb163..20e59fc 100644
--- a/man/scone.Rd
+++ b/man/scone.Rd
@@ -10,7 +10,7 @@ scone(expr, imputation = list(none = identity), scaling, k_ruv = 5,
   batch = NULL, run = TRUE, evaluate = TRUE, eval_pcs = 3,
   eval_proj = NULL, eval_proj_args = NULL, eval_kclust = 2:10,
   eval_negcon = ruv_negcon, eval_poscon = NULL, params = NULL,
-  verbose = FALSE, conditional_pam = FALSE)
+  verbose = FALSE, stratified_pam = FALSE)
 }
 \arguments{
 \item{expr}{matrix. The data matrix (genes in rows, cells in columns).}
@@ -69,7 +69,7 @@ back into the algorithm (see example).}
 
 \item{verbose}{logical. If TRUE some messagges are printed.}
 
-\item{conditional_pam}{logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), and a weighted average is returned as PAM_SIL.}
+\item{stratified_pam}{logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological-cross-batch condition (accepting NAs), and a weighted average is returned as PAM_SIL.}
 }
 \value{
 A list with the following elements:
diff --git a/man/score_matrix.Rd b/man/score_matrix.Rd
index bbac848..945bf70 100644
--- a/man/score_matrix.Rd
+++ b/man/score_matrix.Rd
@@ -7,7 +7,7 @@
 score_matrix(expr, eval_pcs = 3, eval_proj = NULL, eval_proj_args = NULL,
   eval_kclust = NULL, bio = NULL, batch = NULL, qc_factors = NULL,
   uv_factors = NULL, wv_factors = NULL, is_log = FALSE,
-  conditional_pam = FALSE)
+  stratified_pam = FALSE)
 }
 \arguments{
 \item{expr}{matrix. The data matrix (genes in rows, cells in columns).}
@@ -40,7 +40,7 @@ If NULL, wv correlations, EXP_WV_COR, will be returned NA.}
 
 \item{is_log}{logical. If TRUE the expr matrix is already logged and log transformation will not be carried out prior to projection.}
 
-\item{conditional_pam}{logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological condition (including NA),
+\item{stratified_pam}{logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological-cross-batch condition (accepting NAs),
 and a weighted average silhouette width is returned.}
 }
 \value{
@@ -48,7 +48,7 @@ A list with the following elements:
 \itemize{
 \item{BIO_SIL}{ Average silhouette width by biological condition.}
 \item{BATCH_SIL}{ Average silhouette width by batch condition.}
-\item{PAM_SIL}{ Maximum average silhouette width from pam clustering (see conditional_pam argument).}
+\item{PAM_SIL}{ Maximum average silhouette width from pam clustering (see stratified_pam argument).}
 \item{EXP_QC_COR}{ Maximum squared spearman correlation between pcs and quality factors.}
 \item{EXP_UV_COR}{ Maximum squared spearman correlation between pcs and passive uv factors.}
 \item{EXP_WV_COR}{ Maximum squared spearman correlation between pcs and passive wv factors.}
diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R
index 48a69a0..6743a98 100644
--- a/tests/testthat/test_scone.R
+++ b/tests/testthat/test_scone.R
@@ -202,11 +202,10 @@ test_that("conditional PAM",{
 
   res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
                scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
-               k_ruv=0, k_qc=0, adjust_bio="yes", bio=bio,
-               adjust_batch="yes", batch=batch, run=FALSE,
+               k_ruv=0, k_qc=0, adjust_bio="yes", bio=bio, run=FALSE,
                evaluate=TRUE, eval_negcon=as.character(11:20),
                eval_poscon=as.character(21:30),
-               eval_kclust = 2, conditional_pam = TRUE)
+               eval_kclust = 2, stratified_pam = TRUE)
 
   expect_error(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
                scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
@@ -214,17 +213,16 @@ test_that("conditional PAM",{
                adjust_batch="yes", batch=batch, run=FALSE,
                evaluate=TRUE, eval_negcon=as.character(11:20),
                eval_poscon=as.character(21:30),
-               eval_kclust = 6, conditional_pam = TRUE),
-               "For conditional_pam, max 'eval_kclust' must be smaller than min bio class size")
+               eval_kclust = 6, stratified_pam = TRUE),
+               "For stratified_pam, max 'eval_kclust' must be smaller than bio-cross-batch stratum size")
 
   expect_error(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
                             scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
-                            k_ruv=0, k_qc=0,
-                            adjust_batch="yes", batch=batch, run=FALSE,
-                            evaluate=TRUE, eval_negcon=as.character(11:20),
+                            k_ruv=0, k_qc=0, run = FALSE,
+                            evaluate = TRUE, eval_negcon=as.character(11:20),
                             eval_poscon=as.character(21:30),
-                            eval_kclust = 6, conditional_pam = TRUE),
-               "If `bio` is null, `conditional_pam` cannot be TRUE")
+                            eval_kclust = 6, stratified_pam = TRUE),
+               "For stratified_pam, bio and/or batch must be specified")
 
 })
 

From 6c00bff3f64480ef38cac26c5fe72c2ee8f473a8 Mon Sep 17 00:00:00 2001
From: mbcole <mbeloc@gmail.com>
Date: Thu, 7 Jul 2016 13:57:20 -0700
Subject: [PATCH 05/27] Fixed bug in PAM_SIL

---
 R/scone_eval.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/scone_eval.R b/R/scone_eval.R
index 448957f..fd421fa 100644
--- a/R/scone_eval.R
+++ b/R/scone_eval.R
@@ -118,7 +118,7 @@ score_matrix <- function(expr, eval_pcs = 3,
           # Despite krange excluding nc = 1, if asw is negative, nc = 1 will be selected
           if(is.null(pamk_object$pamobject$silinfo$avg.width) ){
             if (!1 %in% eval_kclust) {
-              pamk_object$pamobject$silinfo$avg.width = max(pamk_object$crit[-1])
+              pamk_object$pamobject$silinfo$avg.width = max(pamk_object$crit[1:max(eval_kclust) %in% eval_kclust])
             }else{
               stop("nc = 1 was selected by Duda-Hart, exclude 1 from eval_kclust.")
             }

From 8412f0b6742aee29ff24aa18edf2af1856cba92c Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 7 Jul 2016 14:37:58 -0700
Subject: [PATCH 06/27] Check memory usage and add option to write on file

---
 .Rbuildignore  |  4 +++-
 R/scone_main.R | 27 +++++++++++++++++++--------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/.Rbuildignore b/.Rbuildignore
index 3e3b853..0fdadad 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -33,4 +33,6 @@ README[.]md
 #----------------------------
 # Temp scripts
 #----------------------------
-^old_scripts/*
\ No newline at end of file
+^old_scripts/*
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/R/scone_main.R b/R/scone_main.R
index 9aaeaa9..d8eb5f5 100644
--- a/R/scone_main.R
+++ b/R/scone_main.R
@@ -71,13 +71,18 @@
 #' @param run logical. If FALSE the normalization and evaluation are not run,
 #'   but the function returns a data.frame of parameters that will be run for
 #'   inspection by the user.
-#' @param return_norm logical. If FALSE the normalization will not be returned,
-#'   but only evaluate. This will create a much smaller object and may be useful
-#'   for large datasets and/or when many combinations are compared.
+#' @param return_norm character. If "no" the normalized values will not be
+#'   returned. This will create a much smaller object and may be useful for
+#'   large datasets and/or when many combinations are compared. If "in_memory"
+#'   the normalized values will be returned as part of the output. If "hdf5"
+#'   they will be written on file using the \code{rhdf5} package.
 #'
 #' @importFrom RUVSeq RUVg
 #' @importFrom matrixStats rowMedians
 #' @import BiocParallel
+#' @imporFrom pryr mem_used
+#' @importFrom gdata humanReadable
+#'
 #' @export
 #'
 #' @details If both \code{run=FALSE} the normalization and evaluation are not
@@ -110,7 +115,9 @@ scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5,
                   run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,
                   eval_proj_args = NULL, eval_kclust=2:10, eval_negcon=NULL,
                   eval_poscon=NULL, params=NULL, verbose=FALSE,
-                  conditional_pam = FALSE, return_norm = TRUE) {
+                  conditional_pam = FALSE, return_norm = c("no", "in_memory", "hdf5")) {
+
+  return_norm <- match.arg(return_norm)
 
   if(!is.matrix(expr)) {
     stop("'expr' must be a matrix.")
@@ -313,7 +320,7 @@ scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5,
   if(verbose) message("Imputation step...")
   im_params <- unique(params[,1])
 
-  imputed <- lapply(1:length(im_params), function(i) imputation[[i]](expr))
+  imputed <- lapply(seq_along(im_params), function(i) imputation[[i]](expr))
   names(imputed) <- im_params
   # output: a list of imputed matrices
 
@@ -321,7 +328,7 @@ scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5,
   if(verbose) message("Scaling step...")
   sc_params <- unique(params[,1:2])
 
-  scaled <- bplapply(1:nrow(sc_params), function(i) scaling[[sc_params[i,2]]](imputed[[sc_params[i,1]]]))
+  scaled <- bplapply(seq_len(NROW(sc_params)), function(i) scaling[[sc_params[i,2]]](imputed[[sc_params[i,1]]]))
   names(scaled) <- apply(sc_params, 1, paste, collapse="_")
   # output: a list of normalized expression matrices
 
@@ -359,7 +366,7 @@ scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5,
 
   if(verbose) message("Factor adjustment and evaluation...")
 
-  outlist <- bplapply(1:nrow(params), function(i) {
+  outlist <- bplapply(seq_len(NROW(params)), function(i) {
     parsed <- parse_row(params[i,], bio, batch, ruv_factors, qc_pcs)
     design_mat <- make_design(parsed$bio, parsed$batch, parsed$W,
                               nested=(nested & !is.null(parsed$bio) & !is.null(parsed$batch)))
@@ -376,9 +383,13 @@ scone <- function(expr, imputation=identity, scaling=identity, k_ruv=5, k_qc=5,
       score <- NULL
     }
 
-    if(return_norm) {
+    if(verbose) message(paste0("Processed: ", rownames(params)[i], "\n", "Mem used: ", gdata::humanReadable(pryr::mem_used())))
+    if(return_norm == "in_memory") {
       return(list(score=score, adjusted=adjusted))
     } else {
+      if(return_norm == "hdf5") {
+        ## write on file
+      }
       return(list(score=score))
     }
   })

From 324858ead500ad9a6429a43af118410a3fcfc4e5 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 7 Jul 2016 17:32:47 -0700
Subject: [PATCH 07/27] Support for hdf5

---
 R/scone_main.R | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/R/scone_main.R b/R/scone_main.R
index 476bf0d..59ddc4d 100644
--- a/R/scone_main.R
+++ b/R/scone_main.R
@@ -76,6 +76,8 @@
 #'   large datasets and/or when many combinations are compared. If "in_memory"
 #'   the normalized values will be returned as part of the output. If "hdf5"
 #'   they will be written on file using the \code{rhdf5} package.
+#' @param hdf5file character. If \code{return_norm="hdf5"}, the name of the file
+#'   onto which to save the normalized matrices.
 #'
 #' @importFrom RUVSeq RUVg
 #' @importFrom matrixStats rowMedians
@@ -87,6 +89,7 @@
 #'   dnbinom fitted.values glm mad median model.matrix na.omit p.adjust pnorm
 #'   prcomp quantile quasibinomial sd
 #' @importFrom utils capture.output
+#' @importFrom rhdf5 h5createFile h5write
 #' @export
 #'
 #' @details If \code{run=FALSE} the normalization and evaluation are not run,
@@ -114,16 +117,31 @@
 #'   RLE_MED, and RLE_IQR. Scores are computed so that higer-performing methods
 #'   are assigned a higher scores.
 #'
+#' @details If \code{return_norm="hdf5"}, the normalized matrices will be
+#'   written to the \code{hdf5file} file. This must be a string specifying (a
+#'   path to) a new file. If the file already exists, it will return error.
+#'
 scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5,
                   ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"),
                   adjust_batch=c("no", "yes", "force"), bio=NULL, batch=NULL,
                   run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,
                   eval_proj_args = NULL, eval_kclust=2:10, eval_negcon=ruv_negcon,
                   eval_poscon=NULL, params=NULL, verbose=FALSE,
-                  stratified_pam = FALSE, return_norm = c("no", "in_memory", "hdf5")) {
+                  stratified_pam = FALSE, return_norm = c("no", "in_memory", "hdf5"),
+                  hdf5file) {
 
   return_norm <- match.arg(return_norm)
 
+  if(return_norm == "hdf5") {
+    if(missing(hdf5file)) {
+      stop("If `return_norm='hdf5'`, `hdf5file` must be specified.")
+    } else {
+      stopifnot(h5createFile(hdf5file))
+      h5write(rownames(expr), hdf5file, "genes")
+      h5write(colnames(expr), hdf5file, "samples")
+    }
+  }
+
   if(!is.matrix(expr)) {
     stop("'expr' must be a matrix.")
   } else if(is.null(rownames(expr))) {
@@ -391,7 +409,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5
       return(list(score=score, adjusted=adjusted))
     } else {
       if(return_norm == "hdf5") {
-        ## write on file
+        h5write(adjusted, hdf5file, rownames(params)[i])
       }
       return(list(score=score))
     }

From 8516e799efa016d47534fd20b77228205d5aead9 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 7 Jul 2016 17:55:31 -0700
Subject: [PATCH 08/27] Added wrapper function (issue #9)

---
 R/scone_main.R        | 31 ++++++++++++++++++++++++++++++-
 man/get_normalized.Rd | 27 +++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 man/get_normalized.Rd

diff --git a/R/scone_main.R b/R/scone_main.R
index 59ddc4d..2fb007c 100644
--- a/R/scone_main.R
+++ b/R/scone_main.R
@@ -82,7 +82,7 @@
 #' @importFrom RUVSeq RUVg
 #' @importFrom matrixStats rowMedians
 #' @import BiocParallel
-#' @imporFrom pryr mem_used
+#' @importFrom pryr mem_used
 #' @importFrom gdata humanReadable
 #' @importFrom graphics abline arrows barplot hist par plot text
 #' @importFrom stats approx as.formula binomial coefficients contr.sum cor dist
@@ -449,3 +449,32 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5
 
   return(list(normalized_data=adjusted, metrics=evaluation, scores=scores, params=params))
 }
+
+#' Retrieve normalized matrix
+#'
+#' Given a string representing a normalization scheme and an HDF5 file created by
+#' scone, it will return a matrix of normalized counts (in log scale).
+#'
+#' @details The string must be of of the row.names of the \code{params}
+#'   component of the scone output.
+#'
+#' @param normalization character. A string identifying the normalization scheme
+#'   to be retrieved.
+#' @param hdf5file character. The name of the file in which the data have been
+#'   stored.
+#'
+#' @return A matrix of normalized counts in log-scale.
+#'
+#' @importFrom rhdf5 h5ls h5read
+#' @export
+get_normalized <- function(normalization, hdf5file) {
+
+  if(!normalization %in% h5ls(hdf5file)$name) {
+    stop(paste0(normalization, " does not match any object in ", hdf5file))
+  } else {
+    retval <- h5read(hdf5file, normalization)
+    rownames(retval) <- h5read(hdf5file, "genes")
+    colnames(retval) <- h5read(hdf5file, "samples")
+    return(retval)
+  }
+}
diff --git a/man/get_normalized.Rd b/man/get_normalized.Rd
new file mode 100644
index 0000000..a285b47
--- /dev/null
+++ b/man/get_normalized.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/scone_main.R
+\name{get_normalized}
+\alias{get_normalized}
+\title{Retrieve normalized matrix}
+\usage{
+get_normalized(normalization, hdf5file)
+}
+\arguments{
+\item{normalization}{character. A string identifying the normalization scheme
+to be retrieved.}
+
+\item{hdf5file}{character. The name of the file in which the data have been
+stored.}
+}
+\value{
+A matrix of normalized counts in log-scale.
+}
+\description{
+Given a string representing a normalization scheme and an HDF5 file created by
+scone, it will return a matrix of normalized counts (in log scale).
+}
+\details{
+The string must be of of the row.names of the \code{params}
+  component of the scone output.
+}
+

From fc8c8c373e1ea1bb3dc02164f6d404720b897ec3 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 7 Jul 2016 17:55:46 -0700
Subject: [PATCH 09/27] Updated documentation

---
 NAMESPACE    |   7 +++
 man/scone.Rd | 164 +++++++++++++++++++++++++++++++++------------------
 2 files changed, 113 insertions(+), 58 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 3fc881a..5531cec 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -12,6 +12,7 @@ export(biplot_colored)
 export(estimate_ziber)
 export(estimate_zinb)
 export(factor_sample_filter)
+export(get_normalized)
 export(impute_ziber_simp)
 export(impute_zinb)
 export(lm_adjust)
@@ -31,6 +32,7 @@ importFrom(cluster,silhouette)
 importFrom(diptest,dip.test)
 importFrom(edgeR,calcNormFactors)
 importFrom(fpc,pamk)
+importFrom(gdata,humanReadable)
 importFrom(grDevices,colorRampPalette)
 importFrom(graphics,abline)
 importFrom(graphics,arrows)
@@ -44,6 +46,11 @@ importFrom(matrixStats,colIQRs)
 importFrom(matrixStats,colMedians)
 importFrom(matrixStats,rowMedians)
 importFrom(mixtools,normalmixEM)
+importFrom(pryr,mem_used)
+importFrom(rhdf5,h5createFile)
+importFrom(rhdf5,h5ls)
+importFrom(rhdf5,h5read)
+importFrom(rhdf5,h5write)
 importFrom(stats,approx)
 importFrom(stats,as.formula)
 importFrom(stats,binomial)
diff --git a/man/scone.Rd b/man/scone.Rd
index 20e59fc..df7ad4a 100644
--- a/man/scone.Rd
+++ b/man/scone.Rd
@@ -2,7 +2,8 @@
 % Please edit documentation in R/scone_main.R
 \name{scone}
 \alias{scone}
-\title{scone main wrapper: function to apply and evaluate all the normalization schemes}
+\title{scone main wrapper: function to apply and evaluate all the normalization
+schemes}
 \usage{
 scone(expr, imputation = list(none = identity), scaling, k_ruv = 5,
   k_qc = 5, ruv_negcon = NULL, qc = NULL, adjust_bio = c("no", "yes",
@@ -10,98 +11,145 @@ scone(expr, imputation = list(none = identity), scaling, k_ruv = 5,
   batch = NULL, run = TRUE, evaluate = TRUE, eval_pcs = 3,
   eval_proj = NULL, eval_proj_args = NULL, eval_kclust = 2:10,
   eval_negcon = ruv_negcon, eval_poscon = NULL, params = NULL,
-  verbose = FALSE, stratified_pam = FALSE)
+  verbose = FALSE, stratified_pam = FALSE, return_norm = c("no",
+  "in_memory", "hdf5"), hdf5file)
 }
 \arguments{
 \item{expr}{matrix. The data matrix (genes in rows, cells in columns).}
 
-\item{imputation}{list or function. (A list of) function(s) to be used for imputation.}
+\item{imputation}{list or function. (A list of) function(s) to be used for
+imputation.}
 
-\item{scaling}{list or function. (A list of) function(s) to be used for scaling normalization.}
+\item{scaling}{list or function. (A list of) function(s) to be used for
+scaling normalization.}
 
-\item{k_ruv}{numeric. The maximum number of factors of unwanted variation (the function will adjust for 1 to k_ruv factors of unwanted variation).
-If 0, RUV will not be performed.}
+\item{k_ruv}{numeric. The maximum number of factors of unwanted variation
+(the function will adjust for 1 to k_ruv factors of unwanted variation). If
+0, RUV will not be performed.}
 
-\item{k_qc}{numeric. The maximum number of quality metrics PCs (the function will adjust for 1 to k_qc PCs).
-If 0, QC adjustment will not be performed.}
+\item{k_qc}{numeric. The maximum number of quality metrics PCs (the function
+will adjust for 1 to k_qc PCs). If 0, QC adjustment will not be performed.}
 
-\item{ruv_negcon}{character. The genes to be used as negative controls for RUV. Ignored if k_ruv=0.}
+\item{ruv_negcon}{character. The genes to be used as negative controls for
+RUV. Ignored if k_ruv=0.}
 
-\item{qc}{matrix. The QC metrics to be used for QC adjustment. Ignored if k_qc=0.}
+\item{qc}{matrix. The QC metrics to be used for QC adjustment. Ignored if
+k_qc=0.}
 
-\item{adjust_bio}{character. If 'no' it will not be included in the model; if 'yes', both models with and without 'bio' will be run;
-if 'force', only models with 'bio' will be run.}
+\item{adjust_bio}{character. If 'no' it will not be included in the model; if
+'yes', both models with and without 'bio' will be run; if 'force', only
+models with 'bio' will be run.}
 
-\item{adjust_batch}{character. If 'no' it will not be included in the model; if 'yes', both models with and without 'batch' will be run;
-if 'force', only models with 'batch' will be run.}
+\item{adjust_batch}{character. If 'no' it will not be included in the model;
+if 'yes', both models with and without 'batch' will be run; if 'force',
+only models with 'batch' will be run.}
 
-\item{bio}{factor. The biological condition to be included in the adjustment model (variation to be preserved). If adjust_bio="no", it will not be used for normalization, but only for evaluation.}
+\item{bio}{factor. The biological condition to be included in the adjustment
+model (variation to be preserved). If adjust_bio="no", it will not be used
+for normalization, but only for evaluation.}
 
-\item{batch}{factor. The known batch variable to be included in the adjustment model (variation to be removed). If adjust_batch="no", it will not be used for normalization, but only for evaluation.}
+\item{batch}{factor. The known batch variable to be included in the
+adjustment model (variation to be removed). If adjust_batch="no", it will
+not be used for normalization, but only for evaluation.}
 
-\item{run}{logical. If FALSE the normalization and evaluation are not run, but the function returns a data.frame
-of parameters that will be run for inspection by the user.}
+\item{run}{logical. If FALSE the normalization and evaluation are not run,
+but the function returns a data.frame of parameters that will be run for
+inspection by the user.}
 
-\item{evaluate}{logical. If FALSE the normalization methods will not be evaluated (faster).}
+\item{evaluate}{logical. If FALSE the normalization methods will not be
+evaluated (faster).}
 
-\item{eval_pcs}{numeric. The number of principal components to use for evaluation. Ignored if evaluation=FALSE.}
+\item{eval_pcs}{numeric. The number of principal components to use for
+evaluation. Ignored if evaluation=FALSE.}
 
-\item{eval_proj}{function. Projection function for evaluation (see \code{\link{score_matrix}} for details).
-If NULL, PCA is used for projection}
+\item{eval_proj}{function. Projection function for evaluation  (see
+\code{\link{score_matrix}} for details). If NULL, PCA is used for
+projection.}
 
-\item{eval_proj_args}{list. List passed to the eval_proj function as eval_proj_args.}
+\item{eval_proj_args}{list. List of args passed to projection function as
+eval_proj_args.}
 
-\item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam tightness evaluation.
-If an array of integers, largest average silhouette width (tightness) will be reported. If NULL, tightness will be returned NA.}
+\item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam
+tightness evaluation. If an array of integers, largest average silhouette
+width (tightness) will be reported. If NULL, tightness will be returned NA.}
 
-\item{eval_negcon}{character. The genes to be used as negative controls for evaluation. These genes should
-be expected not to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE.
-Default is ruv_negcon argument. If NULL, correlations with negative controls will be returned NA.}
+\item{eval_negcon}{character. The genes to be used as negative controls for
+evaluation. These genes should be expected not to change according to the
+biological phenomenon of interest. Ignored if evaluation=FALSE. Default is
+ruv_negcon argument. If NULL, correlations with negative controls will be
+returned NA.}
 
-\item{eval_poscon}{character. The genes to be used as positive controls for evaluation. These genes should
-be expected to change according to the biological phenomenon of interest. Ignored if evaluation=FALSE.
-If NULL, correlations with positive controls will be returned NA.}
+\item{eval_poscon}{character. The genes to be used as positive controls for
+evaluation. These genes should be expected to change according to the
+biological phenomenon of interest. Ignored if evaluation=FALSE. If NULL,
+correlations with positive controls will be returned NA.}
 
-\item{params}{matrix or data.frame. If given, the algorithm will bypass creating the matrix of possible
-parameters, and will use the given matrix. There are basically no checks as to whether this matrix is in the
-right format, and is only intended to be used to feed the results of setting run=FALSE
-back into the algorithm (see example).}
+\item{params}{matrix or data.frame. If given, the algorithm will bypass
+creating the matrix of possible parameters, and will use the given matrix.
+There are basically no checks as to whether this matrix is in the right
+format, and is only intended to be used to feed the results of setting
+run=FALSE back into the algorithm (see example).}
 
 \item{verbose}{logical. If TRUE some messagges are printed.}
 
-\item{stratified_pam}{logical. If TRUE then maximum ASW for PAM_SIL is separately computed for each biological-cross-batch condition (accepting NAs), and a weighted average is returned as PAM_SIL.}
+\item{stratified_pam}{logical. If TRUE then maximum ASW for PAM_SIL is
+separately computed for each biological-cross-batch condition (accepting
+NAs), and a weighted average is returned as PAM_SIL.}
+
+\item{return_norm}{character. If "no" the normalized values will not be
+returned. This will create a much smaller object and may be useful for
+large datasets and/or when many combinations are compared. If "in_memory"
+the normalized values will be returned as part of the output. If "hdf5"
+they will be written on file using the \code{rhdf5} package.}
+
+\item{hdf5file}{character. If \code{return_norm="hdf5"}, the name of the file
+onto which to save the normalized matrices.}
 }
 \value{
-A list with the following elements:
-\itemize{
-\item{normalized_data}{ A list containing the normalized data matrix, log-scaled. NULL when evaluate = TRUE.}
-\item{metrics}{ A matrix containing raw evaluation metrics for each normalization method (see \code{\link{score_matrix}} for details). Rows are sorted in the same order as in the scores output matrix. NULL when evaluate = FALSE.}
-\item{scores}{ A matrix containing scores for each normalization, including average score. Rows are sorted by increasing mean score. NULL when evaluate = FALSE.}
-\item{params}{ A data.frame with each row corresponding to a set of normalization parameters.}
-}
-
-If \code{run=FALSE} a \code{data.frame}
-with each row corresponding to a set of normalization parameters.
+A list with the following elements: \itemize{ \item{normalized_data}{
+  A list containing the normalized data matrix, log-scaled. NULL when
+  \code{return_norm} is not "in_memory".} \item{metrics}{ A matrix containing
+  raw evaluation metrics for each normalization method (see
+  \code{\link{score_matrix}} for details). Rows are sorted in the same order
+  as in the scores output matrix. NULL when evaluate = FALSE.} \item{scores}{
+  A matrix containing scores for each normalization, including average score.
+  Rows are sorted by increasing mean score. NULL when evaluate = FALSE.}
+  \item{params}{ A data.frame with each row corresponding to a set of
+  normalization parameters.} }
+
+If \code{run=FALSE} a \code{data.frame} with each row corresponding
+  to a set of normalization parameters.
 }
 \description{
-This function is a high-level wrapper to apply and evaluate a variety of normalization
-schemes to a specified expression matrix.
+This function is a high-level wrapper to apply and evaluate a variety of
+normalization schemes to a specified expression matrix.
 }
 \details{
-The function consists of four main steps: imputation, scaling normalization, adjusting for batch effects,
-and evaluation of the normalization performance.
+The function consists of four main steps: imputation, scaling normalization,
+adjusting for batch effects, and evaluation of the normalization performance.
+
+Note that if one wants to include the unnormalized data in the
+  comparison, the identity function must be included in the scaling list.
+  Analogously, if one wants to avoid the imputation and/or include the
+  non-imputed data in the comparison, the identity function must be included.
+
 
-Note that if one wants to include the unnormalized data in the comparison, the identity function
-must be included in the scaling list. Analogously, if one wants to avoid the imputation and/or include the
-non-imputed data in the comparison, the identity function must be included.
+If \code{run=FALSE} the normalization and evaluation are not run,
+  but the function returns a matrix of parameters that will be run for
+  inspection by the user.
 
 
-If \code{run=FALSE} the normalization and evaluation are not run, but the function returns a matrix of parameters that will be run for inspection by the user.
+Evaluation metrics are defined in \code{\link{score_matrix}}.
+  Each metric is assigned a signature for conversion to scores:
+  Positive-signature metrics increase with improving performance, including
+  BIO_SIL, PAM_SIL, and EXP_WV_COR. Negative-signature metrics decrease with
+  improving performance, including BATCH_SIL, EXP_QC_COR, EXP_UV_COR,
+  RLE_MED, and RLE_IQR. Scores are computed so that higer-performing methods
+  are assigned a higher scores.
 
 
-Evaluation metrics are defined in \code{\link[scone]{score_matrix}}. Each metric is assigned a signature for conversion to scores:
-Positive-signature metrics increase with improving performance, including BIO_SIL, PAM_SIL, and EXP_WV_COR.
-Negative-signature metrics decrease with improving performance, including BATCH_SIL, EXP_QC_COR, EXP_UV_COR, RLE_MED, and RLE_IQR.
-Scores are computed so that higer-performing methods are assigned a higher scores.
+If \code{return_norm="hdf5"}, the normalized matrices will be
+  written to the \code{hdf5file} file. This must be a string specifying (a
+  path to) a new file. If the file already exists, it will return error.
 }
 

From 79738f58b58f4e2f321807a26776164f37f6e955 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 7 Jul 2016 18:05:36 -0700
Subject: [PATCH 10/27] Updated NEWS

---
 .Rbuildignore | 2 ++
 .gitignore    | 2 +-
 inst/NEWS     | 4 +++-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.Rbuildignore b/.Rbuildignore
index 0fdadad..306b778 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -29,6 +29,8 @@ README[.]md
 ^[.]devel
 ^[.]test
 ^[.]check
+.Rhistory
+R/.Rhistory
 
 #----------------------------
 # Temp scripts
diff --git a/.gitignore b/.gitignore
index b8c17c0..c6778e3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,4 @@ vignettes/figure
 *.Rproj
 vignettes/R_cache
 vignettes/R_figure
-old_scripts
\ No newline at end of file
+old_scripts
diff --git a/inst/NEWS b/inst/NEWS
index 109aedf..56a0bcf 100644
--- a/inst/NEWS
+++ b/inst/NEWS
@@ -1,8 +1,10 @@
-Changes in version 0.0.6
+Changes in version 0.0.6 (2016-07-07)
 ========================
 
 * Fixed bug when using plot functionality of filtering functions.
 * "Conditional" pam replaced with "Stratified" pam, clustering each bio-cross-batch condition separately, rather than simply each bio condition.
+* Added option to write normalized matrices to HDF5 file.
+* Added wrapper function get_normalized() to retrieve normalized data.
 
 Changes in version 0.0.5
 ========================

From 1a02565231b6f718663efa34730a8d7569d3e22b Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 7 Jul 2016 18:08:00 -0700
Subject: [PATCH 11/27] Updated DESCRIPTION file

---
 DESCRIPTION | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index f6f8ee5..2f1e14b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -13,22 +13,25 @@ License: Artistic-2.0
 Depends:
   R (>= 3.3)
 Imports:
+  aroma.light,
   BiocParallel,
+  class,
   cluster,
   DESeq,
-  EDASeq,
-  MASS,
-  RUVSeq,
-  aroma.light,
-  class,
   diptest,
+  EDASeq,
   edgeR,
   fpc,
+  gdata,
   gplots,
+  grDevices,
   limma,
+  MASS,
   matrixStats,
   mixtools,
-  grDevices
+  pryr,
+  rhdf5,
+  RUVSeq
 Suggests:
   knitr,
   rmarkdown,

From 8e5d7a78936dfdf4e00e069200965c12e61b3018 Mon Sep 17 00:00:00 2001
From: mbcole <mbeloc@gmail.com>
Date: Thu, 7 Jul 2016 18:24:26 -0700
Subject: [PATCH 12/27] Bug fix to metric sample filter / redefinition of
 simple FNR estimation

---
 DESCRIPTION                 |  5 ++-
 NAMESPACE                   |  2 +
 R/sample_filtering.R        | 87 ++++++++++++++++++++++---------------
 inst/NEWS                   |  2 +
 man/metric_sample_filter.Rd | 18 ++++----
 man/simple_FNR_params.Rd    | 10 +++--
 6 files changed, 73 insertions(+), 51 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index f6f8ee5..b998287 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: scone
-Version: 0.0.5-9000
+Version: 0.0.6-9000
 Title: Single Cell Overview of Normalized Expression data
 Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets.
 Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",
@@ -28,7 +28,8 @@ Imports:
   limma,
   matrixStats,
   mixtools,
-  grDevices
+  grDevices,
+  boot
 Suggests:
   knitr,
   rmarkdown,
diff --git a/NAMESPACE b/NAMESPACE
index 3fc881a..4050ac6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -26,6 +26,8 @@ importFrom(EDASeq,betweenLaneNormalization)
 importFrom(MASS,glm.nb)
 importFrom(RUVSeq,RUVg)
 importFrom(aroma.light,normalizeQuantileRank.matrix)
+importFrom(boot,inv.logit)
+importFrom(boot,logit)
 importFrom(class,knn)
 importFrom(cluster,silhouette)
 importFrom(diptest,dip.test)
diff --git a/R/sample_filtering.R b/R/sample_filtering.R
index f19a265..803e629 100644
--- a/R/sample_filtering.R
+++ b/R/sample_filtering.R
@@ -1,32 +1,53 @@
 #' Fit Logistic Regression Model of FNR against set of positive control (ubiquitously expressed) genes
 #'
-#' @details logit(Probability of False Negative) ~ a + b*(mean log10p1 expression) .
+#' @details logit(Probability of False Negative) ~ a + b*(median log-expression) .
 #'
 #' @param expr matrix The data matrix in transcript-proportional units (genes in rows, cells in columns).
 #' @param pos_controls A logical vector indexing positive control genes that will be used to compute false-negative rate characteristics.
+#' User must provide at least 2 positive control genes.
 #' @param fn_tresh Inclusive threshold for negative detection. Default 0.01.
+#' fn_tresh must be non-negative.
 #'
-#' @return A list of logistic regression coefficients corresponding to glm fits in each sample. If a fit did not converge, the result reported is NA.
+#' @return A matrix of logistic regression coefficients corresponding to glm fits in each sample (a and b in columns 1 and 2 respectively). If the a & b fit does not converge, b is set to zero and only a is estimated.
+#'
+#' @importFrom boot logit
+#' @importFrom matrixStats rowMedians
 #'
 simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){
 
-  # Mean log10p1 expression
-  mu_obs = rowMeans(log10(expr[pos_controls,]+1))
-
-  # Drop-outs
-  drop_outs = 0 + (expr[pos_controls,] <= fn_tresh)
+  stopifnot(!any(is.na(pos_controls)))
+  
+  if (sum(pos_controls) < 2){
+    stop("User must provide at least 2 positive control genes")
+  }
+  
+  if (fn_tresh < 0){
+    stop("fn_tresh must be non-negative")
+  }
+  
+  pos_expr = expr[pos_controls,]  # Selecting positive-control genes
+  is_drop = pos_expr <= fn_tresh  # Identify false negatives
+  pos_expr[is_drop] = NA          # Set false negatives to NA
+  drop_outs = 0 + is_drop         # Numeric drop-out state
+  drop_rate = colMeans(drop_outs) # Total drop-out rate per sample
+    
+  # Median log-expression in positive observations
+  mu_obs = log(rowMedians(pos_expr,na.rm = TRUE))
+  if(any(is.na(mu_obs))){
+    stop("Median log-expression in positive observations NA for some positive control gene/s")
+  }
 
   # Logistic Regression Model of FNR
-  ref.glms = list()
-  for (si in 1:dim(drop_outs)[2]){
+  logistic_coef = matrix(0,ncol(drop_outs),2)
+  for (si in seq_len(ncol(drop_outs))){
     fit = suppressWarnings(glm(cbind(drop_outs[,si],1 - drop_outs[,si]) ~ mu_obs,family=binomial(logit)))
     if(fit$converged){
-      ref.glms[[si]] = fit$coefficients
+      logistic_coef[si,] = fit$coefficients
     } else {
-      ref.glms[[si]] = NA
+      logistic_coef[si,1] = logit(drop_rate[si])
     }
   }
-  return(ref.glms)
+  return(logistic_coef)
 }
 
 #' metric-based sample filtering: function to filter single-cell RNA-Seq libraries.
@@ -51,7 +72,7 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){
 #' If NULL, filtered_fnr will be returned NA.
 #' @param scale. logical. Will expression be scaled by total expression for FNR computation? Default = FALSE
 #' @param glen Gene lengths for gene-length normalization (normalized data used in FNR computation).
-#' @param AUC_range An array of two values, representing range over which FNR AUC will be computed (log10(expr_units + 1)). Default c(0,6)
+#' @param AUC_range An array of two values, representing range over which FNR AUC will be computed (log(expr_units)). Default c(0,15)
 #' @param zcut A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1.
 #' If NULL, only hard threshold sub-criteria will be applied.
 #' @param mixture A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (metric).
@@ -61,11 +82,11 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){
 #' @param hard_nreads numeric. Hard (lower bound on) nreads threshold. Default 25000.
 #' @param hard_ralign numeric. Hard (lower bound on) ralign threshold. Default 15.
 #' @param hard_breadth numeric. Hard (lower bound on) breadth threshold. Default 0.2.
-#' @param hard_fnr numeric. Hard (upper bound on) fnr threshold. Default 3.
-#' @param suff_nreads numeric. If not null, serves as an upper bound on nreads threshold.
-#' @param suff_ralign numeric. If not null, serves as an upper bound on ralign threshold.  Default 65.
-#' @param suff_breadth numeric. If not null, serves as an upper bound on breadth threshold.  Default 0.8.
-#' @param suff_fnr numeric. If not null, serves as an lower bound on fnr threshold.
+#' @param hard_auc numeric. Hard (upper bound on) fnr auc threshold. Default 10.
+#' @param suff_nreads numeric. If not null, serves as an overriding upper bound on nreads threshold.
+#' @param suff_ralign numeric. If not null, serves as an overriding upper bound on ralign threshold.
+#' @param suff_breadth numeric. If not null, serves as an overriding upper bound on breadth threshold.
+#' @param suff_auc numeric. If not null, serves as an overriding lower bound on fnr auc threshold.
 #' @param plot logical. Should a plot be produced?
 #' @param hist_breaks hist() breaks argument. Ignored if `plot=FALSE`.
 #'
@@ -79,15 +100,16 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){
 #'
 #'@importFrom mixtools normalmixEM
 #'@importFrom diptest dip.test
+#'@importFrom boot inv.logit
 #'@export
 #'
 #'
 metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
                                 gene_filter = NULL, pos_controls = NULL,scale. = FALSE,glen = NULL,
-                                AUC_range = c(0,6), zcut = 1,
+                                AUC_range = c(0,15), zcut = 1,
                                 mixture = TRUE, dip_thresh = 0.05,
-                                hard_nreads = 25000, hard_ralign = 15, hard_breadth = 0.2, hard_fnr = 3,
-                                suff_nreads = NULL, suff_ralign = 65, suff_breadth = 0.8, suff_fnr = NULL,
+                                hard_nreads = 25000, hard_ralign = 15, hard_breadth = 0.2, hard_auc = 10,
+                                suff_nreads = NULL, suff_ralign = NULL, suff_breadth = NULL, suff_auc = NULL,
                                 plot = FALSE, hist_breaks = 10){
 
   criterion_count = 0
@@ -210,17 +232,17 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
     }
 
     # Compute FNR AUC
-    ref.glms = simple_FNR_params(expr = nexpr, pos_controls = pos_controls)
+    logistic_coef = simple_FNR_params(expr = nexpr, pos_controls = pos_controls)
     AUC = NULL
     for (si in 1:dim(expr)[2]){
-      if(!any(is.na(ref.glms[[si]]))){
-        AUC[si] = log(exp(ref.glms[[si]][1] + ref.glms[[si]][2] * AUC_range[2]) + 1)/ref.glms[[si]][2] - log(exp(ref.glms[[si]][1] + ref.glms[[si]][2] * AUC_range[1]) + 1)/ref.glms[[si]][2]
+      if(logistic_coef[si,2] != 0){
+        AUC[si] = log(exp(logistic_coef[si,1] + logistic_coef[si,2] * AUC_range[2]) + 1)/logistic_coef[si,2] - log(exp(logistic_coef[si,1] + logistic_coef[si,2] * AUC_range[1]) + 1)/logistic_coef[si,2]
       } else {
-        stop("glm fit did not converge")
+        AUC[si] = inv.logit(logistic_coef[si,1])*(AUC_range[2] - AUC_range[1])
       }
     }
 
-    AUC_CUTOFF = hard_fnr
+    AUC_CUTOFF = hard_auc
 
     if (!is.null(zcut)){
 
@@ -239,8 +261,8 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
         }
       }
 
-      if(!is.null(suff_fnr)){
-        AUC_CUTOFF = max(AUC_CUTOFF,suff_fnr)
+      if(!is.null(suff_auc)){
+        AUC_CUTOFF = max(AUC_CUTOFF,suff_auc)
       }
     }
     filtered_fnr = AUC > AUC_CUTOFF
@@ -305,7 +327,7 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
         }else{
           hist(AUC, main = paste0("auc: Thresh = ",signif(AUC_CUTOFF,3)," , Rm = ",sum(filtered_fnr)," , Tot_Rm = ",sum(is_bad)), xlab = "FNR AUC", breaks = hist_breaks)
         }
-        abline(v = hard_fnr, col = "yellow", lty = 1)
+        abline(v = hard_auc, col = "yellow", lty = 1)
         abline(v = AUC_CUTOFF, col = "blue", lty = 2)
       }
 
@@ -321,13 +343,6 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
       if(!is.null(pos_controls)){
         hist(AUC[!is_bad],  main = paste0("auc: Tot = ",sum(!is_bad)), xlab = "FNR AUC", breaks = hist_breaks)
       }
-
-      # v = rbind(filtered_nreads,filtered_ralign,filtered_breadth,filtered_fnr)
-      # rownames(v) = c("nreads","ralign","breadth","fnr")
-      # v = na.omit(v)
-      # m = v %*% t(v)
-      #
-      # barplot(m, beside = TRUE, legend.text = TRUE)
   }
 
   return(list(filtered_nreads = filtered_nreads,
diff --git a/inst/NEWS b/inst/NEWS
index 109aedf..475b526 100644
--- a/inst/NEWS
+++ b/inst/NEWS
@@ -3,6 +3,8 @@ Changes in version 0.0.6
 
 * Fixed bug when using plot functionality of filtering functions.
 * "Conditional" pam replaced with "Stratified" pam, clustering each bio-cross-batch condition separately, rather than simply each bio condition.
+* Simple FNR for filtering is now based on medians of natural log expression in "expressing" cells / robust to convergence issues.
+* Removed all sufficient thresholds for metric sample filter. 
 
 Changes in version 0.0.5
 ========================
diff --git a/man/metric_sample_filter.Rd b/man/metric_sample_filter.Rd
index 6a00bc2..125aad8 100644
--- a/man/metric_sample_filter.Rd
+++ b/man/metric_sample_filter.Rd
@@ -6,10 +6,10 @@
 \usage{
 metric_sample_filter(expr, nreads = colSums(expr), ralign = NULL,
   gene_filter = NULL, pos_controls = NULL, scale. = FALSE, glen = NULL,
-  AUC_range = c(0, 6), zcut = 1, mixture = TRUE, dip_thresh = 0.05,
+  AUC_range = c(0, 15), zcut = 1, mixture = TRUE, dip_thresh = 0.05,
   hard_nreads = 25000, hard_ralign = 15, hard_breadth = 0.2,
-  hard_fnr = 3, suff_nreads = NULL, suff_ralign = 65,
-  suff_breadth = 0.8, suff_fnr = NULL, plot = FALSE, hist_breaks = 10)
+  hard_auc = 10, suff_nreads = NULL, suff_ralign = NULL,
+  suff_breadth = NULL, suff_auc = NULL, plot = FALSE, hist_breaks = 10)
 }
 \arguments{
 \item{expr}{matrix The data matrix (genes in rows, cells in columns).}
@@ -29,7 +29,7 @@ If NULL, filtered_fnr will be returned NA.}
 
 \item{glen}{Gene lengths for gene-length normalization (normalized data used in FNR computation).}
 
-\item{AUC_range}{An array of two values, representing range over which FNR AUC will be computed (log10(expr_units + 1)). Default c(0,6)}
+\item{AUC_range}{An array of two values, representing range over which FNR AUC will be computed (log(expr_units)). Default c(0,15)}
 
 \item{zcut}{A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1.
 If NULL, only hard threshold sub-criteria will be applied.}
@@ -46,15 +46,15 @@ Samples deviating zcut sd's from optimal mean (in the inferior direction), have
 
 \item{hard_breadth}{numeric. Hard (lower bound on) breadth threshold. Default 0.2.}
 
-\item{hard_fnr}{numeric. Hard (upper bound on) fnr threshold. Default 3.}
+\item{hard_auc}{numeric. Hard (upper bound on) fnr auc threshold. Default 10.}
 
-\item{suff_nreads}{numeric. If not null, serves as an upper bound on nreads threshold.}
+\item{suff_nreads}{numeric. If not null, serves as an overriding upper bound on nreads threshold.}
 
-\item{suff_ralign}{numeric. If not null, serves as an upper bound on ralign threshold.  Default 65.}
+\item{suff_ralign}{numeric. If not null, serves as an overriding upper bound on ralign threshold.}
 
-\item{suff_breadth}{numeric. If not null, serves as an upper bound on breadth threshold.  Default 0.8.}
+\item{suff_breadth}{numeric. If not null, serves as an overriding upper bound on breadth threshold.}
 
-\item{suff_fnr}{numeric. If not null, serves as an lower bound on fnr threshold.}
+\item{suff_auc}{numeric. If not null, serves as an overriding lower bound on fnr auc threshold.}
 
 \item{plot}{logical. Should a plot be produced?}
 
diff --git a/man/simple_FNR_params.Rd b/man/simple_FNR_params.Rd
index 86e4de0..039e668 100644
--- a/man/simple_FNR_params.Rd
+++ b/man/simple_FNR_params.Rd
@@ -9,17 +9,19 @@ simple_FNR_params(expr, pos_controls, fn_tresh = 0.01)
 \arguments{
 \item{expr}{matrix The data matrix in transcript-proportional units (genes in rows, cells in columns).}
 
-\item{pos_controls}{A logical vector indexing positive control genes that will be used to compute false-negative rate characteristics.}
+\item{pos_controls}{A logical vector indexing positive control genes that will be used to compute false-negative rate characteristics.
+User must provide at least 2 positive control genes.}
 
-\item{fn_tresh}{Inclusive threshold for negative detection. Default 0.01.}
+\item{fn_tresh}{Inclusive threshold for negative detection. Default 0.01.
+fn_tresh must be non-negative.}
 }
 \value{
-A list of logistic regression coefficients corresponding to glm fits in each sample. If a fit did not converge, the result reported is NA.
+A matrix of logistic regression coefficients corresponding to glm fits in each sample (a and b in columns 1 and 2 respectively). If the a & b fit does not converge, b is set to zero and only a is estimated.
 }
 \description{
 Fit Logistic Regression Model of FNR against set of positive control (ubiquitously expressed) genes
 }
 \details{
-logit(Probability of False Negative) ~ a + b*(mean log10p1 expression) .
+logit(Probability of False Negative) ~ a + b*(median log-expression) .
 }
 

From a7031885ee75d3c5123ae24e1d6c4d5e1aa0c941 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 7 Jul 2016 18:29:00 -0700
Subject: [PATCH 13/27] Fixed tests

---
 DESCRIPTION                 |  6 +++---
 tests/testthat/test_norm.R  |  5 +++--
 tests/testthat/test_scone.R | 23 ++++++++++++++---------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2f1e14b..b29f22b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -5,10 +5,10 @@ Description: scone is a package to compare and rank the performance of different
 Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",
 	          role = c("aut", "cre", "cph")),
 	     person("Davide", "Risso", email = "risso.davide@gmail.com",
-	          role = c("aut")))
-Author: Michael Cole [aut, cre, cph], Davide Risso [aut]
+	          role = c("aut", "cph")))
+Author: Michael Cole [aut, cre, cph], Davide Risso [aut, cph]
 Maintainer: Michael Cole <mbeloc@gmail.com>
-Date: 2016-05-12
+Date: 2016-07-07
 License: Artistic-2.0
 Depends:
   R (>= 3.3)
diff --git a/tests/testthat/test_norm.R b/tests/testthat/test_norm.R
index 368be51..61dc207 100644
--- a/tests/testthat/test_norm.R
+++ b/tests/testthat/test_norm.R
@@ -8,7 +8,8 @@ test_that("Upper-quartile normalization works the same as in the EDASeq package"
 
   # UQ + RUV
   res <- scone(e, imputation=identity, scaling=UQ_FN, k_ruv=5, k_qc=0,
-               evaluate=FALSE, run=TRUE, ruv_negcon=as.character(1:100))
+               evaluate=FALSE, run=TRUE, ruv_negcon=as.character(1:100),
+               return_norm = "in_memory")
 
   uq <- EDASeq::betweenLaneNormalization(e, which="upper", round=FALSE)
   rs <- lapply(1:5, function(i) RUVSeq::RUVg(log1p(uq), as.character(1:100), k=i, round=FALSE, isLog=TRUE)$norm)
@@ -22,7 +23,7 @@ test_that("Upper-quartile normalization works the same as in the EDASeq package"
   qc_mat <- matrix(rnorm(20), nrow=10)
 
   res <- scone(e, imputation=identity, scaling=UQ_FN, k_ruv=0, k_qc=2,
-               evaluate=FALSE, run=TRUE, qc=qc_mat)
+               evaluate=FALSE, run=TRUE, qc=qc_mat, return_norm = "in_memory")
 
   pca_qc <- prcomp(qc_mat, center=TRUE, scale=TRUE)
   qs <- lapply(1:2, function(i) {
diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R
index 6743a98..aa36cd2 100644
--- a/tests/testthat/test_scone.R
+++ b/tests/testthat/test_scone.R
@@ -8,12 +8,12 @@ test_that("Test with no real method (only identity)", {
 
   # one combination
   res <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
-               evaluate=FALSE, run=TRUE)
+               evaluate=FALSE, run=TRUE, return_norm = "in_memory")
   expect_equal(res$normalized_data[[1]], log1p(e))
 
   # add more imputations
   res <- scone(e, imputation=list(a=identity, b=identity), scaling=identity,
-               k_ruv=0, k_qc=0, evaluate=FALSE, run=TRUE)
+               k_ruv=0, k_qc=0, evaluate=FALSE, run=TRUE, return_norm = "in_memory")
   expect_equal(res$normalized_data[[1]], log1p(e))
   expect_equal(res$normalized_data[[2]], log1p(e))
 
@@ -159,7 +159,8 @@ test_that("Test imputation and scaling", {
                            adjust_batch="yes", batch=batch, run=TRUE,
                            evaluate=TRUE, eval_negcon=as.character(11:20),
                            eval_poscon=as.character(21:30),
-                           eval_kclust = 2, verbose=TRUE))
+                           eval_kclust = 2, verbose=FALSE,
+                           return_norm = "in_memory"))
 
   expect_equal(rownames(res$metrics), rownames(res$scores))
   expect_equal(rownames(res$metrics), rownames(res$params))
@@ -172,7 +173,7 @@ test_that("Test imputation and scaling", {
                adjust_batch="yes", batch=batch, run=TRUE,
                evaluate=FALSE, eval_negcon=as.character(11:20),
                eval_poscon=as.character(21:30),
-               eval_kclust = 2, verbose=TRUE)
+               eval_kclust = 2, verbose=FALSE, return_norm = "in_memory")
 
   norm_ordered <- res2$normalized_data[names(res$normalized_data)]
   expect_equal(norm_ordered, res$normalized_data)
@@ -187,7 +188,7 @@ test_that("scone works with only one normalization",{
   res <- scone(e, imputation=list(none=identity),
                scaling=list(none=identity),
                k_ruv=0, k_qc=0, run=TRUE,
-               evaluate=TRUE, eval_kclust = 2)
+               evaluate=TRUE, eval_kclust = 2, return_norm = "in_memory")
 
   expect_equal(res$normalized_data[[1]], log1p(e))
 })
@@ -232,10 +233,12 @@ test_that("if bio=no bio is ignored", {
   rownames(e) <- as.character(1:nrow(e))
 
   res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
-               adjust_bio = "no", bio=gl(2, 5), eval_kclust = 3)
+               adjust_bio = "no", bio=gl(2, 5), eval_kclust = 3,
+               return_norm = "in_memory")
 
   res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
-                 adjust_bio = "no", eval_kclust = 3)
+                 adjust_bio = "no", eval_kclust = 3,
+                return_norm = "in_memory")
 
   expect_equal(res1$normalized_data, res2$normalized_data)
 })
@@ -245,10 +248,12 @@ test_that("if batch=no batch is ignored", {
   rownames(e) <- as.character(1:nrow(e))
 
   res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
-                adjust_batch = "no", batch=gl(2, 5), eval_kclust = 3)
+                adjust_batch = "no", batch=gl(2, 5), eval_kclust = 3,
+                return_norm = "in_memory")
 
   res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
-                adjust_batch = "no", eval_kclust = 3)
+                adjust_batch = "no", eval_kclust = 3,
+                return_norm = "in_memory")
 
   expect_equal(res1$normalized_data, res2$normalized_data)
 })

From ec8826f70c5965b2a5b359b3164e18c867d31629 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 7 Jul 2016 18:46:01 -0700
Subject: [PATCH 14/27] mem_used() is very costly; removed

---
 DESCRIPTION    | 4 +---
 NAMESPACE      | 2 --
 R/scone_main.R | 4 +---
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index b29f22b..e6b66a2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: scone
-Version: 0.0.5-9000
+Version: 0.0.5-9001
 Title: Single Cell Overview of Normalized Expression data
 Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets.
 Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",
@@ -22,14 +22,12 @@ Imports:
   EDASeq,
   edgeR,
   fpc,
-  gdata,
   gplots,
   grDevices,
   limma,
   MASS,
   matrixStats,
   mixtools,
-  pryr,
   rhdf5,
   RUVSeq
 Suggests:
diff --git a/NAMESPACE b/NAMESPACE
index 5531cec..05269de 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -32,7 +32,6 @@ importFrom(cluster,silhouette)
 importFrom(diptest,dip.test)
 importFrom(edgeR,calcNormFactors)
 importFrom(fpc,pamk)
-importFrom(gdata,humanReadable)
 importFrom(grDevices,colorRampPalette)
 importFrom(graphics,abline)
 importFrom(graphics,arrows)
@@ -46,7 +45,6 @@ importFrom(matrixStats,colIQRs)
 importFrom(matrixStats,colMedians)
 importFrom(matrixStats,rowMedians)
 importFrom(mixtools,normalmixEM)
-importFrom(pryr,mem_used)
 importFrom(rhdf5,h5createFile)
 importFrom(rhdf5,h5ls)
 importFrom(rhdf5,h5read)
diff --git a/R/scone_main.R b/R/scone_main.R
index 2fb007c..6b2f626 100644
--- a/R/scone_main.R
+++ b/R/scone_main.R
@@ -82,8 +82,6 @@
 #' @importFrom RUVSeq RUVg
 #' @importFrom matrixStats rowMedians
 #' @import BiocParallel
-#' @importFrom pryr mem_used
-#' @importFrom gdata humanReadable
 #' @importFrom graphics abline arrows barplot hist par plot text
 #' @importFrom stats approx as.formula binomial coefficients contr.sum cor dist
 #'   dnbinom fitted.values glm mad median model.matrix na.omit p.adjust pnorm
@@ -404,7 +402,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5
       score <- NULL
     }
 
-    if(verbose) message(paste0("Processed: ", rownames(params)[i], "\n", "Mem used: ", gdata::humanReadable(pryr::mem_used())))
+    if(verbose) message(paste0("Processed: ", rownames(params)[i]))
     if(return_norm == "in_memory") {
       return(list(score=score, adjusted=adjusted))
     } else {

From ac6774ec8595af52e37ccc1322fbf01b95ccf71b Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 14 Jul 2016 16:55:41 -0700
Subject: [PATCH 15/27] Import correct hdf5 functions

---
 NAMESPACE      | 1 +
 R/scone_main.R | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/NAMESPACE b/NAMESPACE
index 05269de..1cf4b58 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -49,6 +49,7 @@ importFrom(rhdf5,h5createFile)
 importFrom(rhdf5,h5ls)
 importFrom(rhdf5,h5read)
 importFrom(rhdf5,h5write)
+importFrom(rhdf5,h5write.default)
 importFrom(stats,approx)
 importFrom(stats,as.formula)
 importFrom(stats,binomial)
diff --git a/R/scone_main.R b/R/scone_main.R
index 6b2f626..079deaf 100644
--- a/R/scone_main.R
+++ b/R/scone_main.R
@@ -87,7 +87,7 @@
 #'   dnbinom fitted.values glm mad median model.matrix na.omit p.adjust pnorm
 #'   prcomp quantile quasibinomial sd
 #' @importFrom utils capture.output
-#' @importFrom rhdf5 h5createFile h5write
+#' @importFrom rhdf5 h5createFile h5write.default h5write
 #' @export
 #'
 #' @details If \code{run=FALSE} the normalization and evaluation are not run,

From 05319c075dc18773190c817e7329e4fffdff59e3 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 14 Jul 2016 18:06:05 -0700
Subject: [PATCH 16/27] Interactive biplot

---
 .Rbuildignore             |  4 ++-
 DESCRIPTION               |  8 +++--
 NAMESPACE                 | 10 ++++++
 R/biplot_interactive.R    | 70 +++++++++++++++++++++++++++++++++++++++
 inst/NEWS                 |  7 +++-
 man/biplot_interactive.Rd | 29 ++++++++++++++++
 6 files changed, 123 insertions(+), 5 deletions(-)
 create mode 100644 R/biplot_interactive.R
 create mode 100644 man/biplot_interactive.Rd

diff --git a/.Rbuildignore b/.Rbuildignore
index 3e3b853..0fdadad 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -33,4 +33,6 @@ README[.]md
 #----------------------------
 # Temp scripts
 #----------------------------
-^old_scripts/*
\ No newline at end of file
+^old_scripts/*
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/DESCRIPTION b/DESCRIPTION
index b998287..b69a813 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: scone
-Version: 0.0.6-9000
+Version: 0.0.6-9001
 Title: Single Cell Overview of Normalized Expression data
 Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets.
 Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",
@@ -8,7 +8,7 @@ Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",
 	          role = c("aut")))
 Author: Michael Cole [aut, cre, cph], Davide Risso [aut]
 Maintainer: Michael Cole <mbeloc@gmail.com>
-Date: 2016-05-12
+Date: 2016-07-14
 License: Artistic-2.0
 Depends:
   R (>= 3.3)
@@ -29,7 +29,9 @@ Imports:
   matrixStats,
   mixtools,
   grDevices,
-  boot
+  boot,
+  shiny,
+  miniUI
 Suggests:
   knitr,
   rmarkdown,
diff --git a/NAMESPACE b/NAMESPACE
index 4050ac6..1832316 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -9,6 +9,7 @@ export(TMM_FN)
 export(UQ_FN)
 export(UQ_FN_POS)
 export(biplot_colored)
+export(biplot_interactive)
 export(estimate_ziber)
 export(estimate_zinb)
 export(factor_sample_filter)
@@ -45,7 +46,16 @@ importFrom(limma,lmFit)
 importFrom(matrixStats,colIQRs)
 importFrom(matrixStats,colMedians)
 importFrom(matrixStats,rowMedians)
+importFrom(miniUI,gadgetTitleBar)
+importFrom(miniUI,miniContentPanel)
+importFrom(miniUI,miniPage)
 importFrom(mixtools,normalmixEM)
+importFrom(shiny,brushedPoints)
+importFrom(shiny,observeEvent)
+importFrom(shiny,plotOutput)
+importFrom(shiny,renderPlot)
+importFrom(shiny,runGadget)
+importFrom(shiny,verbatimTextOutput)
 importFrom(stats,approx)
 importFrom(stats,as.formula)
 importFrom(stats,binomial)
diff --git a/R/biplot_interactive.R b/R/biplot_interactive.R
new file mode 100644
index 0000000..c3a79a0
--- /dev/null
+++ b/R/biplot_interactive.R
@@ -0,0 +1,70 @@
+#' Interactive biplot
+#'
+#' This is a wrapper around \code{\link{biplot_colored}}, which creates a shiny
+#' gadget to allow the user to select specific points in the graph.
+#'
+#' @details Since this is based on the shiny gadget feature, it will not work in
+#'   static documents, such as vignettes or markdown / knitr documents.
+#'   See \code{biplot_colored} for more details on the internals.
+#'
+#' @param data a data.frame containing the data to be plotted.
+#' @param scores a numeric vector used to color the points.
+#'
+#' @importFrom miniUI gadgetTitleBar miniContentPanel miniPage gadgetTitleBar
+#' @importFrom shiny plotOutput renderPlot observeEvent brushedPoints runGadget verbatimTextOutput
+#'
+#' @export
+#'
+#' @examples
+#' mat <- matrix(rnorm(1000), ncol=10)
+#' colnames(mat) <- paste("X", 1:ncol(mat), sep="")
+#'
+#' biplot_interactive(mat, mat[,1])
+biplot_interactive <- function(data, scores) {
+
+  data <- as.data.frame(data)
+  scores <- as.numeric(scores)
+
+  ui <- miniPage(
+    gadgetTitleBar("Drag to select points"),
+    miniContentPanel(
+      # The brush="brush" argument means we can listen for
+      # brush events on the plot using input$brush.
+      plotOutput("plot1", height = "80%", brush = "plot_brush"),
+      verbatimTextOutput("info")
+    )
+  )
+
+  server <- function(input, output, session) {
+
+    # Compute PCA
+    pc_obj <- prcomp(data, center = TRUE, scale = FALSE)
+    bp_obj <- biplot_colored(pc_obj, y = scores)
+
+    # Render the plot
+    output$plot1 <- renderPlot({
+      # Biplot
+      biplot_colored(pc_obj, y = scores)
+    })
+
+    data_out <- cbind(data, bp_obj)
+
+    output$info <- renderText({
+      xy_range_str <- function(e) {
+        if(is.null(e)) return("NULL\n")
+        idx <- which(bp_obj[,1] >= e$xmin & bp_obj[,1] <= e$xmax &
+                       bp_obj[,2] >= e$ymin & bp_obj[,2] <= e$ymax)
+        paste0(rownames(data)[idx], collapse = "\n")
+      }
+      xy_range_str(input$plot_brush)
+    })
+
+    # Handle the Done button being pressed.
+    observeEvent(input$done, {
+      # Return the brushed points. See ?shiny::brushedPoints.
+      stopApp(brushedPoints(data_out, input$plot_brush, xvar="PC1", yvar="PC2"))
+    })
+  }
+
+  runGadget(ui, server)
+}
diff --git a/inst/NEWS b/inst/NEWS
index 475b526..1e8d854 100644
--- a/inst/NEWS
+++ b/inst/NEWS
@@ -1,10 +1,15 @@
+Changes in version 0.0.7
+========================
+
+* New biplot_interactive function to explore the results.
+
 Changes in version 0.0.6
 ========================
 
 * Fixed bug when using plot functionality of filtering functions.
 * "Conditional" pam replaced with "Stratified" pam, clustering each bio-cross-batch condition separately, rather than simply each bio condition.
 * Simple FNR for filtering is now based on medians of natural log expression in "expressing" cells / robust to convergence issues.
-* Removed all sufficient thresholds for metric sample filter. 
+* Removed all sufficient thresholds for metric sample filter.
 
 Changes in version 0.0.5
 ========================
diff --git a/man/biplot_interactive.Rd b/man/biplot_interactive.Rd
new file mode 100644
index 0000000..ae974d5
--- /dev/null
+++ b/man/biplot_interactive.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/biplot_interactive.R
+\name{biplot_interactive}
+\alias{biplot_interactive}
+\title{Interactive biplot}
+\usage{
+biplot_interactive(data, scores)
+}
+\arguments{
+\item{data}{a data.frame containing the data to be plotted.}
+
+\item{scores}{a numeric vector used to color the points.}
+}
+\description{
+This is a wrapper around \code{\link{biplot_colored}}, which creates a shiny
+gadget to allow the user to select specific points in the graph.
+}
+\details{
+Since this is based on the shiny gadget feature, it will not work in
+  static documents, such as vignettes or markdown / knitr documents.
+  See \code{biplot_colored} for more details on the internals.
+}
+\examples{
+mat <- matrix(rnorm(1000), ncol=10)
+colnames(mat) <- paste("X", 1:ncol(mat), sep="")
+
+biplot_interactive(mat, mat[,1])
+}
+

From dbc16ec487280486d026bee192c0887b471d2f0a Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 14 Jul 2016 18:14:16 -0700
Subject: [PATCH 17/27] Travis with codecov

---
 .travis.yml | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 .travis.yml

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..3d1b594
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,42 @@
+#----------------------------------------------------------------
+# Travis-CI configuration for Bioconductor packages
+#
+# REFERENCES:
+# * Travis CI: https://travis-ci.org/
+# * tutorial: https://docs.travis-ci.com/user/languages/r
+# * see also: https://blog.rstudio.org/2016/03/09/r-on-travis-ci/
+# * covr: https://github.com/jimhester/covr
+# * Coveralls: https://coveralls.io/
+#
+# Validate your .travis.yml file at http://lint.travis-ci.org/
+#----------------------------------------------------------------
+language: r
+cache: packages
+
+# R versions to be tested on
+r:
+  - bioc-release
+  - bioc-devel
+
+## Turn this to true before submission to CRAN/Bioconductor
+warnings_are_errors: false
+
+r_build_args: "--no-build-vignettes"
+r_check_args: "--no-vignettes"
+
+notifications:
+  email:
+    on_success: change
+    on_failure: change
+
+## Code coverage
+r_packages:
+  - covr
+
+## BiocCheck
+bioc_packages:
+  - BiocCheck
+
+after_success:
+  - bash <(curl -s https://codecov.io/bash)
+  - R CMD BiocCheck .

From 0b5c3507231370e80b32b6acd431fa6785d275a9 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Thu, 14 Jul 2016 20:49:37 -0700
Subject: [PATCH 18/27] Do not run interactive example (duh!)

---
 NAMESPACE                 | 2 ++
 R/biplot_interactive.R    | 4 +++-
 man/biplot_interactive.Rd | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/NAMESPACE b/NAMESPACE
index 1832316..1c5199e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -54,7 +54,9 @@ importFrom(shiny,brushedPoints)
 importFrom(shiny,observeEvent)
 importFrom(shiny,plotOutput)
 importFrom(shiny,renderPlot)
+importFrom(shiny,renderText)
 importFrom(shiny,runGadget)
+importFrom(shiny,stopApp)
 importFrom(shiny,verbatimTextOutput)
 importFrom(stats,approx)
 importFrom(stats,as.formula)
diff --git a/R/biplot_interactive.R b/R/biplot_interactive.R
index c3a79a0..8e6b111 100644
--- a/R/biplot_interactive.R
+++ b/R/biplot_interactive.R
@@ -11,15 +11,17 @@
 #' @param scores a numeric vector used to color the points.
 #'
 #' @importFrom miniUI gadgetTitleBar miniContentPanel miniPage gadgetTitleBar
-#' @importFrom shiny plotOutput renderPlot observeEvent brushedPoints runGadget verbatimTextOutput
+#' @importFrom shiny plotOutput renderPlot observeEvent brushedPoints runGadget verbatimTextOutput stopApp renderText
 #'
 #' @export
 #'
 #' @examples
+#' \dontrun{
 #' mat <- matrix(rnorm(1000), ncol=10)
 #' colnames(mat) <- paste("X", 1:ncol(mat), sep="")
 #'
 #' biplot_interactive(mat, mat[,1])
+#' }
 biplot_interactive <- function(data, scores) {
 
   data <- as.data.frame(data)
diff --git a/man/biplot_interactive.Rd b/man/biplot_interactive.Rd
index ae974d5..32b5f78 100644
--- a/man/biplot_interactive.Rd
+++ b/man/biplot_interactive.Rd
@@ -21,9 +21,11 @@ Since this is based on the shiny gadget feature, it will not work in
   See \code{biplot_colored} for more details on the internals.
 }
 \examples{
+\dontrun{
 mat <- matrix(rnorm(1000), ncol=10)
 colnames(mat) <- paste("X", 1:ncol(mat), sep="")
 
 biplot_interactive(mat, mat[,1])
 }
+}
 

From 17c1042ad54f82fb7324e7bf147a5c6e057ab638 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Fri, 15 Jul 2016 10:37:38 -0700
Subject: [PATCH 19/27] Add par control to interactive biplot

---
 R/biplot_interactive.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/biplot_interactive.R b/R/biplot_interactive.R
index 8e6b111..38c3eda 100644
--- a/R/biplot_interactive.R
+++ b/R/biplot_interactive.R
@@ -22,7 +22,7 @@
 #'
 #' biplot_interactive(mat, mat[,1])
 #' }
-biplot_interactive <- function(data, scores) {
+biplot_interactive <- function(data, scores, ...) {
 
   data <- as.data.frame(data)
   scores <- as.numeric(scores)
@@ -46,7 +46,7 @@ biplot_interactive <- function(data, scores) {
     # Render the plot
     output$plot1 <- renderPlot({
       # Biplot
-      biplot_colored(pc_obj, y = scores)
+      biplot_colored(pc_obj, y = scores, ...)
     })
 
     data_out <- cbind(data, bp_obj)

From bcd05f0b9310b1dd3e0529eeaca85dd50c64c808 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Fri, 15 Jul 2016 13:24:29 -0700
Subject: [PATCH 20/27] Update docs

---
 man/biplot_interactive.Rd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/biplot_interactive.Rd b/man/biplot_interactive.Rd
index 32b5f78..04eb7b8 100644
--- a/man/biplot_interactive.Rd
+++ b/man/biplot_interactive.Rd
@@ -4,7 +4,7 @@
 \alias{biplot_interactive}
 \title{Interactive biplot}
 \usage{
-biplot_interactive(data, scores)
+biplot_interactive(data, scores, ...)
 }
 \arguments{
 \item{data}{a data.frame containing the data to be plotted.}

From 4918f6ae5f1e62b6e736269f9ef1d9b3ed489c92 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Fri, 15 Jul 2016 14:36:11 -0700
Subject: [PATCH 21/27] Added check for hdf5 and parallel

---
 R/scone_main.R             |  3 ++
 tests/testthat/test_hdf5.R | 67 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 tests/testthat/test_hdf5.R

diff --git a/R/scone_main.R b/R/scone_main.R
index 079deaf..1666c50 100644
--- a/R/scone_main.R
+++ b/R/scone_main.R
@@ -134,6 +134,9 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5
     if(missing(hdf5file)) {
       stop("If `return_norm='hdf5'`, `hdf5file` must be specified.")
     } else {
+      if(BiocParallel::bpnworkers(BiocParallel::registered()[[1]]) > 1) {
+        stop("At the moment, `return_norm='hdf5'` does not support multicores.")
+      }
       stopifnot(h5createFile(hdf5file))
       h5write(rownames(expr), hdf5file, "genes")
       h5write(colnames(expr), hdf5file, "samples")
diff --git a/tests/testthat/test_hdf5.R b/tests/testthat/test_hdf5.R
new file mode 100644
index 0000000..6822211
--- /dev/null
+++ b/tests/testthat/test_hdf5.R
@@ -0,0 +1,67 @@
+context("Test the different return_norm options")
+set.seed(13124323)
+BiocParallel::register(bpparam("SerialParam"))
+
+
+test_that("hd5 checks", {
+  e <-  matrix(rpois(1000, lambda = 5), ncol=10)
+  rownames(e) <- as.character(1:nrow(e))
+
+  qc_mat <- matrix(rnorm(20), nrow=10)
+  bio <- gl(2, 5)
+  batch <- as.factor(rep(1:2, 5))
+
+  # factorial
+  expect_error(scone(e, imputation=list(none=identity, zinb=impute_zinb),
+               scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
+               k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat,
+               adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
+               evaluate=FALSE, run=TRUE, return_norm = "hdf5"),
+               "must be specified")
+
+  BiocParallel::register(bpparam("MulticoreParam"))
+
+  expect_error(scone(e, imputation=list(none=identity, zinb=impute_zinb),
+                     scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
+                     k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat,
+                     adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
+                     evaluate=FALSE, run=TRUE, return_norm = "hdf5", hdf5file = "/tmp/tmp"),
+               "does not support multicores")
+
+  BiocParallel::register(bpparam("SerialParam"))
+})
+
+test_that("return_norm in memory", {
+  e <-  matrix(rpois(1000, lambda = 5), ncol=10)
+  rownames(e) <- as.character(1:nrow(e))
+
+  qc_mat <- matrix(rnorm(20), nrow=10)
+  bio <- gl(2, 5)
+  batch <- as.factor(rep(1:2, 5))
+
+  # factorial
+  res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+                     scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
+                     k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat,
+                     adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
+                     evaluate=FALSE, run=TRUE, return_norm = "in_memory")
+  expect_is(res$normalized_data, "list")
+  expect_true(all(sapply(res$normalized_data, is.matrix)))
+})
+
+test_that("do not return_norm", {
+  e <-  matrix(rpois(1000, lambda = 5), ncol=10)
+  rownames(e) <- as.character(1:nrow(e))
+
+  qc_mat <- matrix(rnorm(20), nrow=10)
+  bio <- gl(2, 5)
+  batch <- as.factor(rep(1:2, 5))
+
+  # factorial
+  res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+               scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
+               k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat,
+               adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
+               evaluate=FALSE, run=TRUE, return_norm = "no")
+  expect_null(res$normalized_data)
+})

From 9087b7c57e2b42e3cf2ac38dcac836bf00cbb1a9 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Fri, 15 Jul 2016 14:58:42 -0700
Subject: [PATCH 22/27] Do not compute dist twice

---
 R/scone_eval.R | 53 +++++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/R/scone_eval.R b/R/scone_eval.R
index fd421fa..371704a 100644
--- a/R/scone_eval.R
+++ b/R/scone_eval.R
@@ -3,13 +3,13 @@
 #' This function evaluates an expression matrix using SCONE criteria, producing a number of scores based on
 #' projections of the normalized data, correlations, and RLE metrics.
 #'
-#' @details The eval_proj function argument must have 2 inputs: 
+#' @details The eval_proj function argument must have 2 inputs:
 #' \itemize{
 #' \item{e}{ matrix. log-transformed expression (genes in rows, cells in columns).}
 #' \item{eval_proj_args}{ list. additional function arguments, e.g. prior data weights.}
 #' }
 #' and it must output a matrix representation of the original data (cells in rows, factors in columns).
-#' 
+#'
 #' @param expr matrix. The data matrix (genes in rows, cells in columns).
 #' @param eval_pcs numeric. The number of principal components to use for evaluation.
 #' Ignored if !is.null(eval_proj).
@@ -52,11 +52,11 @@
 #' }
 #'
 
-score_matrix <- function(expr, eval_pcs = 3, 
+score_matrix <- function(expr, eval_pcs = 3,
                          eval_proj = NULL, eval_proj_args = NULL,
                          eval_kclust = NULL,
                          bio = NULL, batch = NULL,
-                         qc_factors = NULL,uv_factors = NULL, wv_factors = NULL, 
+                         qc_factors = NULL,uv_factors = NULL, wv_factors = NULL,
                          is_log=FALSE, stratified_pam = FALSE){
 
   if(any(is.na(expr) | is.infinite(expr) | is.nan(expr))){
@@ -75,33 +75,34 @@ score_matrix <- function(expr, eval_pcs = 3,
   }
 
   ## ------ Bio and Batch Tightness -----
+  dd <- as.matrix(dist(proj))
 
-    if( !is.null(bio) ) {
-      if(!all(is.na(bio))) {
-        BIO_SIL = summary(cluster::silhouette(as.numeric(na.omit(bio)),dist(proj[!is.na(bio),])))$avg.width
-      } else {
-        BIO_SIL = NA
-        warning("bio is all NA!")
-      }
+  if( !is.null(bio) ) {
+    if(!all(is.na(bio))) {
+      BIO_SIL = summary(cluster::silhouette(as.numeric(na.omit(bio)),dd[!is.na(bio),]))$avg.width
     } else {
       BIO_SIL = NA
+      warning("bio is all NA!")
     }
+  } else {
+    BIO_SIL = NA
+  }
 
-    if(!is.null(batch)) {
-      if(!all(is.na(batch))) {
-        BATCH_SIL <- summary(cluster::silhouette(as.numeric(na.omit(batch)),dist(proj[!is.na(batch),])))$avg.width
-      } else{
-        BATCH_SIL <- NA
-        warning("batch is all NA!")
-      }
-    } else {
+  if(!is.null(batch)) {
+    if(!all(is.na(batch))) {
+      BATCH_SIL <- summary(cluster::silhouette(as.numeric(na.omit(batch)),dd[!is.na(batch),]))$avg.width
+    } else{
       BATCH_SIL <- NA
+      warning("batch is all NA!")
     }
+  } else {
+    BATCH_SIL <- NA
+  }
 
   ## ------ PAM Tightness -----
 
   if ( !is.null(eval_kclust) ){
-    
+
     # "Stratified" PAM
     if(stratified_pam){
 
@@ -114,7 +115,7 @@ score_matrix <- function(expr, eval_pcs = 3,
         cond_w = length(is_cond)
         if(cond_w > max(eval_kclust)){
           pamk_object = pamk(proj[is_cond,],krange = eval_kclust)
-          
+
           # Despite krange excluding nc = 1, if asw is negative, nc = 1 will be selected
           if(is.null(pamk_object$pamobject$silinfo$avg.width) ){
             if (!1 %in% eval_kclust) {
@@ -123,7 +124,7 @@ score_matrix <- function(expr, eval_pcs = 3,
               stop("nc = 1 was selected by Duda-Hart, exclude 1 from eval_kclust.")
             }
           }
-  
+
           PAM_SIL = PAM_SIL + cond_w*pamk_object$pamobject$silinfo$avg.width
         }else{
           stop(paste("Number of clusters 'k' must be smaller than bio-cross-batch stratum size:",
@@ -131,7 +132,7 @@ score_matrix <- function(expr, eval_pcs = 3,
         }
       }
       PAM_SIL = PAM_SIL/length(biobatch)
-      
+
     # Traditional PAM
     }else{
       pamk_object = pamk(proj,krange = eval_kclust)
@@ -170,10 +171,10 @@ score_matrix <- function(expr, eval_pcs = 3,
   RLE_MED <- mean(colMedians(rle)^2)
   RLE_IQR <- mean(colIQRs(rle))
 
-  scores = c(BIO_SIL, BATCH_SIL, PAM_SIL, 
-             EXP_QC_COR, EXP_UV_COR, EXP_WV_COR, 
+  scores = c(BIO_SIL, BATCH_SIL, PAM_SIL,
+             EXP_QC_COR, EXP_UV_COR, EXP_WV_COR,
              RLE_MED, RLE_IQR)
-  names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL", 
+  names(scores) = c("BIO_SIL", "BATCH_SIL", "PAM_SIL",
                     "EXP_QC_COR", "EXP_UV_COR", "EXP_WV_COR",
                     "RLE_MED", "RLE_IQR")
   return(scores)

From 80cdc51f173d29d76bf2b19cb40613f63f95d5c9 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Fri, 15 Jul 2016 15:06:53 -0700
Subject: [PATCH 23/27] Fix bug introduced in previous commit

---
 R/scone_eval.R              |  6 ++++--
 tests/testthat/test_scone.R | 22 ++++++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/R/scone_eval.R b/R/scone_eval.R
index 371704a..9f654a1 100644
--- a/R/scone_eval.R
+++ b/R/scone_eval.R
@@ -79,7 +79,8 @@ score_matrix <- function(expr, eval_pcs = 3,
 
   if( !is.null(bio) ) {
     if(!all(is.na(bio))) {
-      BIO_SIL = summary(cluster::silhouette(as.numeric(na.omit(bio)),dd[!is.na(bio),]))$avg.width
+      BIO_SIL = summary(cluster::silhouette(as.numeric(na.omit(bio)),
+                                            dd[!is.na(bio), !is.na(bio)]))$avg.width
     } else {
       BIO_SIL = NA
       warning("bio is all NA!")
@@ -90,7 +91,8 @@ score_matrix <- function(expr, eval_pcs = 3,
 
   if(!is.null(batch)) {
     if(!all(is.na(batch))) {
-      BATCH_SIL <- summary(cluster::silhouette(as.numeric(na.omit(batch)),dd[!is.na(batch),]))$avg.width
+      BATCH_SIL <- summary(cluster::silhouette(as.numeric(na.omit(batch)),
+                                               dd[!is.na(batch),!is.na(batch)]))$avg.width
     } else{
       BATCH_SIL <- NA
       warning("batch is all NA!")
diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R
index aa36cd2..9ce9857 100644
--- a/tests/testthat/test_scone.R
+++ b/tests/testthat/test_scone.R
@@ -270,3 +270,25 @@ test_that("batch and bio can be confounded if at least one of adjust_bio or adju
                        adjust_bio = "no", batch=gl(2, 5), bio=gl(2, 5), eval_kclust = 3),
                  "Biological conditions and batches are confounded.")
 })
+
+test_that("batch and bio can contain NA", {
+  e <-  matrix(rpois(10000, lambda = 5), ncol=10)
+  rownames(e) <- as.character(1:nrow(e))
+
+  batch <- gl(2, 5)
+  bio <- gl(5, 2)
+
+  res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE,
+               adjust_batch = "no", batch=batch, bio=bio, eval_kclust = 3)
+
+  batch[1] <- NA
+  bio[2] <- NA
+
+  res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE,
+        adjust_batch = "no", batch=batch, bio=bio, eval_kclust = 3)
+
+  expect_true(!is.na(res2$metrics[,"BIO_SIL"]))
+  expect_true(!is.na(res2$metrics[,"BATCH_SIL"]))
+  expect_false(all(res1$metrics[,"BIO_SIL"] == res2$metrics[,"BIO_SIL"]))
+  expect_false(all(res1$metrics[,"BATCH_SIL"] == res2$metrics[,"BATCH_SIL"]))
+})

From 6e63d95cbbb603b75ff62317d20034b776c649fa Mon Sep 17 00:00:00 2001
From: mbcole <mbeloc@gmail.com>
Date: Mon, 18 Jul 2016 18:57:36 -0700
Subject: [PATCH 24/27] Major imputation modifications

---
 NAMESPACE                   |   3 +-
 R/scone_main.R              |  11 +++-
 R/ziber.R                   | 101 +++++++++++++++++++-----------------
 R/zinb.R                    |   5 +-
 inst/NEWS                   |   3 ++
 man/estimate_ziber.Rd       |   3 +-
 man/impute_expectation.Rd   |  29 +++++++++++
 man/impute_null.Rd          |  20 +++++++
 man/impute_ziber_simp.Rd    |  24 ---------
 man/impute_zinb.Rd          |   4 +-
 man/scone.Rd                |  19 ++++---
 tests/testthat/test_norm.R  |   4 +-
 tests/testthat/test_scone.R |  66 +++++++++++------------
 13 files changed, 169 insertions(+), 123 deletions(-)
 create mode 100644 man/impute_expectation.Rd
 create mode 100644 man/impute_null.Rd
 delete mode 100644 man/impute_ziber_simp.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 4050ac6..bd1015c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -12,7 +12,8 @@ export(biplot_colored)
 export(estimate_ziber)
 export(estimate_zinb)
 export(factor_sample_filter)
-export(impute_ziber_simp)
+export(impute_expectation)
+export(impute_null)
 export(impute_zinb)
 export(lm_adjust)
 export(make_design)
diff --git a/R/scone_main.R b/R/scone_main.R
index f84f497..1b21bf9 100644
--- a/R/scone_main.R
+++ b/R/scone_main.R
@@ -12,6 +12,8 @@
 #'
 #' @param expr matrix. The data matrix (genes in rows, cells in columns).
 #' @param imputation list or function. (A list of) function(s) to be used for imputation.
+#' @param impute_args arguments for imputation functions.
+#' @param rezero Restore zeroes following scaling step? Default FALSE.
 #' @param scaling list or function. (A list of) function(s) to be used for scaling normalization.
 #' @param k_ruv numeric. The maximum number of factors of unwanted variation (the function will adjust for 1 to k_ruv factors of unwanted variation).
 #' If 0, RUV will not be performed.
@@ -73,7 +75,7 @@
 #' Scores are computed so that higer-performing methods are assigned a higher scores.
 #'
 
-scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL,
+scone <- function(expr, imputation=list(none=impute_null), impute_args = NULL, rezero = FALSE, scaling, k_ruv=5, k_qc=5, ruv_negcon=NULL,
                   qc=NULL, adjust_bio=c("no", "yes", "force"), adjust_batch=c("no", "yes", "force"),
                   bio=NULL, batch=NULL, run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,eval_proj_args = NULL,
                   eval_kclust=2:10, eval_negcon=ruv_negcon, eval_poscon=NULL,
@@ -282,7 +284,7 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5
   if(verbose) message("Imputation step...")
   im_params <- unique(params[,1])
 
-  imputed <- lapply(1:length(im_params), function(i) imputation[[i]](expr))
+  imputed <- lapply(1:length(im_params), function(i) imputation[[i]](expr,impute_args))
   names(imputed) <- im_params
   # output: a list of imputed matrices
 
@@ -291,6 +293,11 @@ scone <- function(expr, imputation=list(none=identity), scaling, k_ruv=5, k_qc=5
   sc_params <- unique(params[,1:2])
 
   scaled <- bplapply(1:nrow(sc_params), function(i) scaling[[sc_params[i,2]]](imputed[[sc_params[i,1]]]))
+  if(rezero){
+    if(verbose) message("Re-zero step...")
+    toz = expr <= 0
+    scaled <- lapply(scaled,function(x) x - x*toz)
+  }
   names(scaled) <- apply(sc_params, 1, paste, collapse="_")
   # output: a list of normalized expression matrices
 
diff --git a/R/ziber.R b/R/ziber.R
index cff93d3..6e7281d 100644
--- a/R/ziber.R
+++ b/R/ziber.R
@@ -15,7 +15,9 @@ likfn = function(Z,X,Beta){
 #' @param Beta gene-level sample coefficients.
 #'
 pzfn = function(Y,W,Alpha,X,Beta){
-  return(likfn(Y,W,Alpha)*likfn(1,X,Beta)/(likfn(Y,W,Alpha)*likfn(1,X,Beta) + (Y == 0)*likfn(0,X,Beta)))
+  matA = likfn(1,X,Beta)
+  matB = matA*likfn(Y,W,Alpha)
+  return(matB/(matB + (Y == 0)*(1-matA)))
 }
 
 #' Parameter estimation of zero-inflated bernoulli model
@@ -34,7 +36,6 @@ pzfn = function(Y,W,Alpha,X,Beta){
 #' @param bulk_model logical. Use median log-expression of gene in detected fraction as sole gene-level feature. Default FALSE.
 #' Ignored if gfeatM is specified.
 #' @param pos_controls logical. TRUE for all genes that are known to be expressed in all cells. 
-#' If specified, then drop-out characteristic is fit only once to control genes.
 #' @param em_tol numeric. Convergence treshold on log-likelihood.
 #' @param maxiter numeric. The maximum number of iterations. Default 100.
 #' @param verbose logical. Whether or not to print the value of the likelihood at each iteration.
@@ -53,10 +54,10 @@ pzfn = function(Y,W,Alpha,X,Beta){
 #' \item{loglik}{ the log-likelihood}
 #' \item{convergence}{ 0 if the algorithm converged and 1 if maxiter was reached}
 #' }
+
 estimate_ziber = function(x, fp_tresh = 0, 
                           gfeatM = NULL, bulk_model = FALSE, 
-                          pos_controls = NULL,  
-                        
+                          pos_controls = NULL,
                           em_tol = 0.01, maxiter = 100,
                           verbose = FALSE){
   
@@ -85,18 +86,22 @@ estimate_ziber = function(x, fp_tresh = 0,
   # Initialize Z
   Z = pzfn(Y,W,Alpha,X,Beta)
   
-  # If control genes are supplied, fit W once using control genes
-  if(!is.null(pos_controls)){
-    cY = Y[,pos_controls]
-    cAlpha = Alpha[,pos_controls]
-    for (i in 1:dim(Z)[1]){
-      W[i,] = glm(cbind(cY[i,],1-cY[i,]) ~ 0 + t(cAlpha),quasibinomial)$coefficients
-    } 
-    Beta[,pos_controls] = NA
-    Z[,pos_controls] = 1
+  if(is.null(pos_controls)){
+    stop("Must supply positive controls genes to fit FNR characteristic")
   }
     
-  EL2 = sum(na.omit(as.vector(Z*log(likfn(Y,W,Alpha))))) + sum(na.omit(as.vector(Z*log(likfn(1,X,Beta))))) + sum(na.omit(as.vector((1-Z)*log(likfn(0,X,Beta)))))
+  # Fit W once using control genes
+  
+  cY = Y[,pos_controls]
+  cAlpha = Alpha[,pos_controls]
+  for (i in 1:dim(Z)[1]){
+    W[i,] = glm(cbind(cY[i,],1-cY[i,]) ~ 0 + t(cAlpha),quasibinomial)$coefficients
+  } 
+  Beta[,pos_controls] = NA
+  Z[,pos_controls] = 1
+  
+  matA = likfn(1,X,Beta)
+  EL2 = sum(Z*log(likfn(Y,W,Alpha)),na.rm = TRUE) + sum(Z*log( matA ),na.rm = TRUE) + sum((1-Z)*log(1- matA ),na.rm = TRUE)
   
   if(verbose){print(EL2)}
   
@@ -107,33 +112,14 @@ estimate_ziber = function(x, fp_tresh = 0,
     
     EL1 = EL2
     
-    if(is.null(pos_controls)){
-      
-      # Update Beta
-      Beta = t(matrix(log(colMeans(Z)/(1-colMeans(Z)))))
-      
-      # Update Z
-      Z = pzfn(Y,W,Alpha,X,Beta)
-      
-      # Update W
-      for (i in 1:dim(Z)[1]){
-        W[i,] = glm(cbind(Y[i,],(1-Y[i,])*Z[i,]) ~ 0 + t(Alpha),quasibinomial)$coefficients
-      }
-    
-      # Update Z
-      Z = pzfn(Y,W,Alpha,X,Beta)
-      
-    }else{
-            
-      # Update Beta
-      Beta[,!pos_controls] = t(matrix(log(colMeans(Z[,!pos_controls])/(1-colMeans(Z[,!pos_controls])))))
-      
-      # Update Z
-      Z[,!pos_controls] = pzfn(Y[,!pos_controls],W,Alpha[,!pos_controls],X,Beta[,!pos_controls])
-      
-    }
+    # Update Beta
+    cmz = colMeans(Z[,!pos_controls])
+    Beta[,!pos_controls] = t(matrix(log(cmz) - log(1-cmz)))
+    # Update Z
+    Z[,!pos_controls] = pzfn(Y[,!pos_controls],W,Alpha[,!pos_controls],X,Beta[,!pos_controls])
     
-    EL2 = sum(na.omit(as.vector(Z*log(likfn(Y,W,Alpha))))) + sum(na.omit(as.vector(Z*log(likfn(1,X,Beta))))) + sum(na.omit(as.vector((1-Z)*log(likfn(0,X,Beta)))))
+    matA = likfn(1,X,Beta)
+    EL2 = sum(Z*log(likfn(Y,W,Alpha)),na.rm = TRUE) + sum(Z*log( matA ),na.rm = TRUE) + sum((1-Z)*log(1- matA ),na.rm = TRUE)
     if(verbose){print(EL2)}
     niter = niter + 1
     if(niter >= maxiter){
@@ -147,23 +133,40 @@ estimate_ziber = function(x, fp_tresh = 0,
               expected_state = t(Z), loglik = EL2, convergence = 0))
 }
 
-#' Imputation of zero abundance based on zero-inflated bernoulli model
+#' Imputation of zero abundance based on general zero-inflated model
 #' 
-#' This function is used to impute the data, by weighting the observed detection failure by their
-#' probability of coming from the zero-inflation part of the distribution. Median positive expression
-#' is used as the sole gene-level determinant of drop-out, and the data is replaced with this expression value.
+#' This function is used to impute the data, weighted by probability of 
+#' data coming from the zero-inflation part of the distribution.
 #'
 #' @details The imputation is carried out with the following formula:
-#' y_{ij}* =  y_{ij} * Pr( No Drop | D_{ij}) + pos_median_{i} * Pr( Drop | D_{ij}).
+#' y_{ij}* =  y_{ij} * Pr( No Drop | y_{ij}) + mu_{i} * Pr( Drop | y_{ij}).
+#' 
+#' impute_args must contain two elements: 
+#' 1) p_nodrop  = posterior probability of data not having resulted from drop-out  (genes in rows, cells in columns)
+#' 2) mu  = expected expression of dropped data (genes in rows, cells in columns)
 #'  
 #' @export
 #' 
 #' @param expression the data matrix (genes in rows, cells in columns)
+#' @param impute_args arguments for imputation (see details)
 #' 
 #' @return the imputed expression matrix.
-impute_ziber_simp <- function(expression) {
-  pars <- estimate_ziber(expression,bulk_model = TRUE)
-  w <- pars$p_nodrop
-  imputed <- expression * w + exp(pars$Alpha[1,]) * (1 - w)
+#' 
+
+impute_expectation <- function(expression,impute_args) {
+  imputed <- expression * impute_args$p_nodrop + impute_args$mu * (1 - impute_args$p_nodrop)
   return(imputed)
 }
+
+#' Null or no-op imputation
+#' @export
+#' 
+#' @param expression the data matrix (genes in rows, cells in columns)
+#' @param impute_args arguments for imputation (not used)
+#' 
+#' @return the imputed expression matrix.
+#' 
+
+impute_null <- function(expression,impute_args) {
+  return(expression)
+}
diff --git a/R/zinb.R b/R/zinb.R
index 1dc7d70..e2953cc 100644
--- a/R/zinb.R
+++ b/R/zinb.R
@@ -158,9 +158,10 @@ loglik_small <- function(parms, Y, Y1, X, W, kx, kw, offsetx, offsetw, linkobj)
 #' @export
 #'
 #' @param expression the data matrix (genes in rows, cells in columns)
-#'
+#' @param impute_args arguments for imputation (not used)
+#' 
 #' @return the imputed expression matrix.
-impute_zinb <- function(expression) {
+impute_zinb <- function(expression, impute_args) {
   pars <- estimate_zinb(expression)
   w <- pars$p_z
   imputed <- expression * w + pars$mu * (1 - w)
diff --git a/inst/NEWS b/inst/NEWS
index 475b526..b4a7a93 100644
--- a/inst/NEWS
+++ b/inst/NEWS
@@ -1,6 +1,9 @@
 Changes in version 0.0.6
 ========================
 
+* Added option for restoring zeroes after scaling step.
+* New argument format for imputation via impute_args.
+* Simplified ziber fnr estimation function - requires control genes.
 * Fixed bug when using plot functionality of filtering functions.
 * "Conditional" pam replaced with "Stratified" pam, clustering each bio-cross-batch condition separately, rather than simply each bio condition.
 * Simple FNR for filtering is now based on medians of natural log expression in "expressing" cells / robust to convergence issues.
diff --git a/man/estimate_ziber.Rd b/man/estimate_ziber.Rd
index a339a25..8eea68d 100644
--- a/man/estimate_ziber.Rd
+++ b/man/estimate_ziber.Rd
@@ -17,8 +17,7 @@ estimate_ziber(x, fp_tresh = 0, gfeatM = NULL, bulk_model = FALSE,
 \item{bulk_model}{logical. Use median log-expression of gene in detected fraction as sole gene-level feature. Default FALSE.
 Ignored if gfeatM is specified.}
 
-\item{pos_controls}{logical. TRUE for all genes that are known to be expressed in all cells. 
-If specified, then drop-out characteristic is fit only once to control genes.}
+\item{pos_controls}{logical. TRUE for all genes that are known to be expressed in all cells.}
 
 \item{em_tol}{numeric. Convergence treshold on log-likelihood.}
 
diff --git a/man/impute_expectation.Rd b/man/impute_expectation.Rd
new file mode 100644
index 0000000..037040f
--- /dev/null
+++ b/man/impute_expectation.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ziber.R
+\name{impute_expectation}
+\alias{impute_expectation}
+\title{Imputation of zero abundance based on general zero-inflated model}
+\usage{
+impute_expectation(expression, impute_args)
+}
+\arguments{
+\item{expression}{the data matrix (genes in rows, cells in columns)}
+
+\item{impute_args}{arguments for imputation (see details)}
+}
+\value{
+the imputed expression matrix.
+}
+\description{
+This function is used to impute the data, weighted by probability of 
+data coming from the zero-inflation part of the distribution.
+}
+\details{
+The imputation is carried out with the following formula:
+y_{ij}* =  y_{ij} * Pr( No Drop | y_{ij}) + mu_{i} * Pr( Drop | y_{ij}).
+
+impute_args must contain two elements: 
+1) p_nodrop  = posterior probability of data not having resulted from drop-out  (genes in rows, cells in columns)
+2) mu  = expected expression of dropped data (genes in rows, cells in columns)
+}
+
diff --git a/man/impute_null.Rd b/man/impute_null.Rd
new file mode 100644
index 0000000..8711d49
--- /dev/null
+++ b/man/impute_null.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ziber.R
+\name{impute_null}
+\alias{impute_null}
+\title{Null or no-op imputation}
+\usage{
+impute_null(expression, impute_args)
+}
+\arguments{
+\item{expression}{the data matrix (genes in rows, cells in columns)}
+
+\item{impute_args}{arguments for imputation (not used)}
+}
+\value{
+the imputed expression matrix.
+}
+\description{
+Null or no-op imputation
+}
+
diff --git a/man/impute_ziber_simp.Rd b/man/impute_ziber_simp.Rd
deleted file mode 100644
index 20ddfe6..0000000
--- a/man/impute_ziber_simp.Rd
+++ /dev/null
@@ -1,24 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/ziber.R
-\name{impute_ziber_simp}
-\alias{impute_ziber_simp}
-\title{Imputation of zero abundance based on zero-inflated bernoulli model}
-\usage{
-impute_ziber_simp(expression)
-}
-\arguments{
-\item{expression}{the data matrix (genes in rows, cells in columns)}
-}
-\value{
-the imputed expression matrix.
-}
-\description{
-This function is used to impute the data, by weighting the observed detection failure by their
-probability of coming from the zero-inflation part of the distribution. Median positive expression
-is used as the sole gene-level determinant of drop-out, and the data is replaced with this expression value.
-}
-\details{
-The imputation is carried out with the following formula:
-y_{ij}* =  y_{ij} * Pr( No Drop | D_{ij}) + pos_median_{i} * Pr( Drop | D_{ij}).
-}
-
diff --git a/man/impute_zinb.Rd b/man/impute_zinb.Rd
index 7fd45c8..64a9b6a 100644
--- a/man/impute_zinb.Rd
+++ b/man/impute_zinb.Rd
@@ -4,10 +4,12 @@
 \alias{impute_zinb}
 \title{Imputation of zero counts based on zero-inflated negative binomial model}
 \usage{
-impute_zinb(expression)
+impute_zinb(expression, impute_args)
 }
 \arguments{
 \item{expression}{the data matrix (genes in rows, cells in columns)}
+
+\item{impute_args}{arguments for imputation (not used)}
 }
 \value{
 the imputed expression matrix.
diff --git a/man/scone.Rd b/man/scone.Rd
index 20e59fc..d922df6 100644
--- a/man/scone.Rd
+++ b/man/scone.Rd
@@ -4,19 +4,24 @@
 \alias{scone}
 \title{scone main wrapper: function to apply and evaluate all the normalization schemes}
 \usage{
-scone(expr, imputation = list(none = identity), scaling, k_ruv = 5,
-  k_qc = 5, ruv_negcon = NULL, qc = NULL, adjust_bio = c("no", "yes",
-  "force"), adjust_batch = c("no", "yes", "force"), bio = NULL,
-  batch = NULL, run = TRUE, evaluate = TRUE, eval_pcs = 3,
-  eval_proj = NULL, eval_proj_args = NULL, eval_kclust = 2:10,
-  eval_negcon = ruv_negcon, eval_poscon = NULL, params = NULL,
-  verbose = FALSE, stratified_pam = FALSE)
+scone(expr, imputation = list(none = impute_null), impute_args = NULL,
+  rezero = FALSE, scaling, k_ruv = 5, k_qc = 5, ruv_negcon = NULL,
+  qc = NULL, adjust_bio = c("no", "yes", "force"), adjust_batch = c("no",
+  "yes", "force"), bio = NULL, batch = NULL, run = TRUE,
+  evaluate = TRUE, eval_pcs = 3, eval_proj = NULL,
+  eval_proj_args = NULL, eval_kclust = 2:10, eval_negcon = ruv_negcon,
+  eval_poscon = NULL, params = NULL, verbose = FALSE,
+  stratified_pam = FALSE)
 }
 \arguments{
 \item{expr}{matrix. The data matrix (genes in rows, cells in columns).}
 
 \item{imputation}{list or function. (A list of) function(s) to be used for imputation.}
 
+\item{impute_args}{arguments for imputation functions.}
+
+\item{rezero}{Restore zeroes following scaling step? Default FALSE.}
+
 \item{scaling}{list or function. (A list of) function(s) to be used for scaling normalization.}
 
 \item{k_ruv}{numeric. The maximum number of factors of unwanted variation (the function will adjust for 1 to k_ruv factors of unwanted variation).
diff --git a/tests/testthat/test_norm.R b/tests/testthat/test_norm.R
index 368be51..2f22bcc 100644
--- a/tests/testthat/test_norm.R
+++ b/tests/testthat/test_norm.R
@@ -7,7 +7,7 @@ test_that("Upper-quartile normalization works the same as in the EDASeq package"
   rownames(e) <- as.character(1:nrow(e))
 
   # UQ + RUV
-  res <- scone(e, imputation=identity, scaling=UQ_FN, k_ruv=5, k_qc=0,
+  res <- scone(e, imputation=impute_null, scaling=UQ_FN, k_ruv=5, k_qc=0,
                evaluate=FALSE, run=TRUE, ruv_negcon=as.character(1:100))
 
   uq <- EDASeq::betweenLaneNormalization(e, which="upper", round=FALSE)
@@ -21,7 +21,7 @@ test_that("Upper-quartile normalization works the same as in the EDASeq package"
   # UQ + QC
   qc_mat <- matrix(rnorm(20), nrow=10)
 
-  res <- scone(e, imputation=identity, scaling=UQ_FN, k_ruv=0, k_qc=2,
+  res <- scone(e, imputation=impute_null, scaling=UQ_FN, k_ruv=0, k_qc=2,
                evaluate=FALSE, run=TRUE, qc=qc_mat)
 
   pca_qc <- prcomp(qc_mat, center=TRUE, scale=TRUE)
diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R
index 6743a98..38a4a0d 100644
--- a/tests/testthat/test_scone.R
+++ b/tests/testthat/test_scone.R
@@ -7,33 +7,33 @@ test_that("Test with no real method (only identity)", {
   rownames(e) <- as.character(1:nrow(e))
 
   # one combination
-  res <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
+  res <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0,
                evaluate=FALSE, run=TRUE)
   expect_equal(res$normalized_data[[1]], log1p(e))
 
   # add more imputations
-  res <- scone(e, imputation=list(a=identity, b=identity), scaling=identity,
+  res <- scone(e, imputation=list(a=impute_null, b=impute_null), scaling=identity,
                k_ruv=0, k_qc=0, evaluate=FALSE, run=TRUE)
   expect_equal(res$normalized_data[[1]], log1p(e))
   expect_equal(res$normalized_data[[2]], log1p(e))
 
   # add more scaling
-  res <- scone(e, imputation=list(a=identity, b=identity),
+  res <- scone(e, imputation=list(a=impute_null, b=impute_null),
                scaling=list(a=identity, b=identity, c=identity), k_ruv=0,
                k_qc=0, evaluate=FALSE, run=TRUE)
 
   # add ruv (the first two should not work)
-  expect_error(scone(e, imputation=list(identity, identity),
+  expect_error(scone(e, imputation=list(impute_null,impute_null),
                      scaling=list(identity, identity, identity),
                      k_ruv=5, k_qc=0, evaluate=FALSE, run=FALSE),
         "ruv_negcon must be specified")
 
-  expect_error(scone(e, imputation=list(identity, identity),
+  expect_error(scone(e, imputation=list(impute_null,impute_null),
                      scaling=list(identity, identity, identity), k_ruv=5,
                      ruv_negcon=1:100, k_qc=0, evaluate=FALSE, run=FALSE),
         "must be a character vector")
 
-  params <- scone(e, imputation=list(identity, identity),
+  params <- scone(e, imputation=list(impute_null,impute_null),
                   scaling=list(identity, identity, identity), k_ruv=5,
                   ruv_negcon=as.character(1:100), k_qc=0, evaluate=FALSE,
                   run=FALSE)
@@ -41,17 +41,17 @@ test_that("Test with no real method (only identity)", {
   # add qc (the first two should not work)
   qc_mat <- matrix(rnorm(20), nrow=10)
 
-  expect_error(scone(e, imputation=list(identity, identity),
+  expect_error(scone(e, imputation=list(impute_null,impute_null),
         scaling=list(identity, identity, identity), k_ruv=5,
         ruv_negcon=as.character(1:100), k_qc=5, evaluate=FALSE, run=FALSE),
         "qc must be specified")
-  expect_error(scone(e, imputation=list(identity, identity),
+  expect_error(scone(e, imputation=list(impute_null,impute_null),
         scaling=list(identity, identity, identity), k_ruv=5,
         ruv_negcon=as.character(1:100), k_qc=5, qc=qc_mat, evaluate=FALSE,
         run=FALSE),
         "k_qc must be less or equal than the number of columns")
 
-  res <- scone(e, imputation=list(identity, identity),
+  res <- scone(e, imputation=list(impute_null,impute_null),
                scaling=list(identity, identity, identity), k_ruv=5,
                ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat,
                evaluate=FALSE, run=TRUE)
@@ -59,24 +59,24 @@ test_that("Test with no real method (only identity)", {
   # add bio (the first two should not work)
   bio <- rep(1:2, each=5)
 
-  expect_error(scone(e, imputation=list(identity, identity),
+  expect_error(scone(e, imputation=list(impute_null,impute_null),
                      scaling=list(identity, identity, identity), k_ruv=5,
                      ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat,
                      adjust_bio="yes", evaluate=FALSE, run=FALSE),
                "if adjust_bio is 'yes' or 'force', 'bio' must be specified")
 
-  expect_error(scone(e, imputation=list(identity, identity),
+  expect_error(scone(e, imputation=list(impute_null,impute_null),
                      scaling=list(identity, identity, identity), k_ruv=5,
                      ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat,
                      adjust_bio="yes", bio=bio, evaluate=FALSE, run=FALSE),
                "'bio' must be a factor")
 
   bio <- as.factor(bio)
-  res <- scone(e, imputation=list(identity, identity),
+  res <- scone(e, imputation=list(impute_null,impute_null),
                scaling=list(identity, identity, identity), k_ruv=5,
                ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat,
                adjust_bio="yes", bio=bio, evaluate=FALSE, run=TRUE)
-  res <- scone(e, imputation=list(identity, identity),
+  res <- scone(e, imputation=list(impute_null,impute_null),
                scaling=list(identity, identity, identity), k_ruv=5,
                ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat,
                adjust_bio="force", bio=bio, evaluate=FALSE, run=TRUE)
@@ -84,14 +84,14 @@ test_that("Test with no real method (only identity)", {
   # add batch (the first three should not work)
   batch <- rep(1:2, each=5)
 
-  expect_error(scone(e, imputation=list(identity, identity),
+  expect_error(scone(e, imputation=list(impute_null,impute_null),
                      scaling=list(identity, identity, identity), k_ruv=5,
                      ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat,
                      adjust_bio="force", bio=bio, adjust_batch="yes",
                      evaluate=FALSE, run=FALSE),
                "if adjust_batch is 'yes' or 'force', 'batch' must be specified")
 
-  expect_error(scone(e, imputation=list(identity, identity),
+  expect_error(scone(e, imputation=list(impute_null,impute_null),
                      scaling=list(identity, identity, identity), k_ruv=5,
                      ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat,
                      adjust_bio="force", bio=bio, adjust_batch="yes",
@@ -99,7 +99,7 @@ test_that("Test with no real method (only identity)", {
                "'batch' must be a factor")
 
   batch <- as.factor(batch)
-  expect_error(scone(e, imputation=list(identity, identity),
+  expect_error(scone(e, imputation=list(impute_null,impute_null),
                      scaling=list(identity, identity, identity), k_ruv=5,
                      ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat,
                      adjust_bio="force", bio=bio, adjust_batch="yes",
@@ -107,14 +107,14 @@ test_that("Test with no real method (only identity)", {
                "Biological conditions and batches are confounded")
 
   batch <- as.factor(rep(1:2, 5))
-  res <- scone(e, imputation=list(a=identity, b=identity),
+  res <- scone(e, imputation=list(a=impute_null, b=impute_null),
                scaling=list(a=identity, b=identity, c=identity),
                k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat,
                adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
                evaluate=FALSE, run=TRUE)
 
   ## add evaluation
-  res <- scone(e, imputation=list(a=identity, b=identity),
+  res <- scone(e, imputation=list(a=impute_null, b=impute_null),
                scaling=list(a=identity, b=identity, c=identity),
                k_ruv=5, ruv_negcon=as.character(1:100), k_qc=2, qc=qc_mat,
                adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
@@ -131,7 +131,7 @@ test_that("Test imputation and scaling", {
   batch <- as.factor(rep(1:2, 5))
 
   # factorial
-  res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat,
                adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
@@ -139,20 +139,20 @@ test_that("Test imputation and scaling", {
 
   # nested
   batch <- as.factor(c(1, 2, 1, 2, 1, 3, 4, 3, 4, 3))
-  params <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  params <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                   scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                   k_ruv=3, k_qc=2, ruv_negcon=as.character(1:10), qc=qc_mat,
                   adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
                   evaluate=FALSE, run=FALSE)
   params <- params[-(1:5),]
-  res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                k_ruv=3, k_qc=2, ruv_negcon=as.character(1:10), qc=qc_mat,
                adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
                evaluate=FALSE, run=TRUE, params=params)
 
   # evaluation
-  system.time(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  system.time(res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                            scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                            k_ruv=5, k_qc=2, ruv_negcon=as.character(1:10),
                            qc=qc_mat, adjust_bio="yes", bio=bio,
@@ -165,7 +165,7 @@ test_that("Test imputation and scaling", {
   expect_equal(rownames(res$metrics), rownames(res$params))
   expect_equal(rownames(res$metrics), names(res$normalized_data))
 
-  res2 <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  res2 <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                k_ruv=5, k_qc=2, ruv_negcon=as.character(1:10),
                qc=qc_mat, adjust_bio="yes", bio=bio,
@@ -184,7 +184,7 @@ test_that("scone works with only one normalization",{
   e <-  matrix(rpois(1000, lambda = 5), ncol=10)
   rownames(e) <- as.character(1:nrow(e))
 
-  res <- scone(e, imputation=list(none=identity),
+  res <- scone(e, imputation=list(none=impute_null),
                scaling=list(none=identity),
                k_ruv=0, k_qc=0, run=TRUE,
                evaluate=TRUE, eval_kclust = 2)
@@ -200,14 +200,14 @@ test_that("conditional PAM",{
   bio <- gl(2, 5)
   batch <- gl(5, 2)
 
-  res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                k_ruv=0, k_qc=0, adjust_bio="yes", bio=bio, run=FALSE,
                evaluate=TRUE, eval_negcon=as.character(11:20),
                eval_poscon=as.character(21:30),
                eval_kclust = 2, stratified_pam = TRUE)
 
-  expect_error(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  expect_error(res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                k_ruv=0, k_qc=0, adjust_bio="yes", bio=bio,
                adjust_batch="yes", batch=batch, run=FALSE,
@@ -216,7 +216,7 @@ test_that("conditional PAM",{
                eval_kclust = 6, stratified_pam = TRUE),
                "For stratified_pam, max 'eval_kclust' must be smaller than bio-cross-batch stratum size")
 
-  expect_error(res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  expect_error(res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                             scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                             k_ruv=0, k_qc=0, run = FALSE,
                             evaluate = TRUE, eval_negcon=as.character(11:20),
@@ -231,10 +231,10 @@ test_that("if bio=no bio is ignored", {
   e <-  matrix(rpois(10000, lambda = 5), ncol=10)
   rownames(e) <- as.character(1:nrow(e))
 
-  res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
+  res1 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0,
                adjust_bio = "no", bio=gl(2, 5), eval_kclust = 3)
 
-  res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
+  res2 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0,
                  adjust_bio = "no", eval_kclust = 3)
 
   expect_equal(res1$normalized_data, res2$normalized_data)
@@ -244,10 +244,10 @@ test_that("if batch=no batch is ignored", {
   e <-  matrix(rpois(10000, lambda = 5), ncol=10)
   rownames(e) <- as.character(1:nrow(e))
 
-  res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
+  res1 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0,
                 adjust_batch = "no", batch=gl(2, 5), eval_kclust = 3)
 
-  res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
+  res2 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0,
                 adjust_batch = "no", eval_kclust = 3)
 
   expect_equal(res1$normalized_data, res2$normalized_data)
@@ -257,11 +257,11 @@ test_that("batch and bio can be confounded if at least one of adjust_bio or adju
   e <-  matrix(rpois(10000, lambda = 5), ncol=10)
   rownames(e) <- as.character(1:nrow(e))
 
-  expect_warning(scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
+  expect_warning(scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0,
                 adjust_batch = "no", batch=gl(2, 5), bio=gl(2, 5), eval_kclust = 3),
                 "Biological conditions and batches are confounded.")
 
-  expect_warning(scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0,
+  expect_warning(scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0,
                        adjust_bio = "no", batch=gl(2, 5), bio=gl(2, 5), eval_kclust = 3),
                  "Biological conditions and batches are confounded.")
 })

From 412cd2d21ae18296907fe91d191240989146f0f9 Mon Sep 17 00:00:00 2001
From: mbcole <mbeloc@gmail.com>
Date: Tue, 19 Jul 2016 11:14:49 -0700
Subject: [PATCH 25/27] Fixed test cases and documentation

---
 NAMESPACE                   |  2 +-
 man/scone.Rd                | 17 +++++++++--------
 man/score_matrix.Rd         |  2 +-
 tests/testthat/test_hdf5.R  |  8 ++++----
 tests/testthat/test_scone.R |  4 ++--
 5 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 0e76c26..edfffde 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -13,9 +13,9 @@ export(biplot_interactive)
 export(estimate_ziber)
 export(estimate_zinb)
 export(factor_sample_filter)
+export(get_normalized)
 export(impute_expectation)
 export(impute_null)
-export(get_normalized)
 export(impute_zinb)
 export(lm_adjust)
 export(make_design)
diff --git a/man/scone.Rd b/man/scone.Rd
index eb42fdb..e891376 100644
--- a/man/scone.Rd
+++ b/man/scone.Rd
@@ -5,14 +5,15 @@
 \title{scone main wrapper: function to apply and evaluate all the normalization
 schemes}
 \usage{
-scone(expr, imputation = list(none = impute_null), impute_args = NULL, rezero = FALSE, scaling, k_ruv = 5,
-  k_qc = 5, ruv_negcon = NULL, qc = NULL, adjust_bio = c("no", "yes",
-  "force"), adjust_batch = c("no", "yes", "force"), bio = NULL,
-  batch = NULL, run = TRUE, evaluate = TRUE, eval_pcs = 3,
-  eval_proj = NULL, eval_proj_args = NULL, eval_kclust = 2:10,
-  eval_negcon = ruv_negcon, eval_poscon = NULL, params = NULL,
-  verbose = FALSE, stratified_pam = FALSE, return_norm = c("no",
-  "in_memory", "hdf5"), hdf5file)
+scone(expr, imputation = list(none = impute_null), impute_args = NULL,
+  rezero = FALSE, scaling, k_ruv = 5, k_qc = 5, ruv_negcon = NULL,
+  qc = NULL, adjust_bio = c("no", "yes", "force"), adjust_batch = c("no",
+  "yes", "force"), bio = NULL, batch = NULL, run = TRUE,
+  evaluate = TRUE, eval_pcs = 3, eval_proj = NULL,
+  eval_proj_args = NULL, eval_kclust = 2:10, eval_negcon = ruv_negcon,
+  eval_poscon = NULL, params = NULL, verbose = FALSE,
+  stratified_pam = FALSE, return_norm = c("no", "in_memory", "hdf5"),
+  hdf5file)
 }
 \arguments{
 \item{expr}{matrix. The data matrix (genes in rows, cells in columns).}
diff --git a/man/score_matrix.Rd b/man/score_matrix.Rd
index 945bf70..99df0dc 100644
--- a/man/score_matrix.Rd
+++ b/man/score_matrix.Rd
@@ -61,7 +61,7 @@ This function evaluates an expression matrix using SCONE criteria, producing a n
 projections of the normalized data, correlations, and RLE metrics.
 }
 \details{
-The eval_proj function argument must have 2 inputs: 
+The eval_proj function argument must have 2 inputs:
 \itemize{
 \item{e}{ matrix. log-transformed expression (genes in rows, cells in columns).}
 \item{eval_proj_args}{ list. additional function arguments, e.g. prior data weights.}
diff --git a/tests/testthat/test_hdf5.R b/tests/testthat/test_hdf5.R
index 6822211..2e9811e 100644
--- a/tests/testthat/test_hdf5.R
+++ b/tests/testthat/test_hdf5.R
@@ -12,7 +12,7 @@ test_that("hd5 checks", {
   batch <- as.factor(rep(1:2, 5))
 
   # factorial
-  expect_error(scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  expect_error(scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat,
                adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
@@ -21,7 +21,7 @@ test_that("hd5 checks", {
 
   BiocParallel::register(bpparam("MulticoreParam"))
 
-  expect_error(scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  expect_error(scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                      scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                      k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat,
                      adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
@@ -40,7 +40,7 @@ test_that("return_norm in memory", {
   batch <- as.factor(rep(1:2, 5))
 
   # factorial
-  res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                      scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                      k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat,
                      adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
@@ -58,7 +58,7 @@ test_that("do not return_norm", {
   batch <- as.factor(rep(1:2, 5))
 
   # factorial
-  res <- scone(e, imputation=list(none=identity, zinb=impute_zinb),
+  res <- scone(e, imputation=list(none=impute_null, zinb=impute_zinb),
                scaling=list(none=identity, uq=UQ_FN, deseq=DESEQ_FN),
                k_ruv=3, k_qc=2, ruv_negcon=as.character(1:100), qc=qc_mat,
                adjust_bio="force", bio=bio, adjust_batch="yes", batch=batch,
diff --git a/tests/testthat/test_scone.R b/tests/testthat/test_scone.R
index f5d27a3..5a763f7 100644
--- a/tests/testthat/test_scone.R
+++ b/tests/testthat/test_scone.R
@@ -278,13 +278,13 @@ test_that("batch and bio can contain NA", {
   batch <- gl(2, 5)
   bio <- gl(5, 2)
 
-  res1 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE,
+  res1 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE,
                adjust_batch = "no", batch=batch, bio=bio, eval_kclust = 3)
 
   batch[1] <- NA
   bio[2] <- NA
 
-  res2 <- scone(e, imputation=identity, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE,
+  res2 <- scone(e, imputation=impute_null, scaling=identity, k_ruv=0, k_qc=0, evaluate = TRUE,
         adjust_batch = "no", batch=batch, bio=bio, eval_kclust = 3)
 
   expect_true(!is.na(res2$metrics[,"BIO_SIL"]))

From fe0ffcb3ff1b159809af5fa632a54b16659e7d03 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Fri, 22 Jul 2016 12:01:26 -0700
Subject: [PATCH 26/27] Minor

---
 R/scone_main.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/scone_main.R b/R/scone_main.R
index 9af6823..ccca798 100644
--- a/R/scone_main.R
+++ b/R/scone_main.R
@@ -121,7 +121,8 @@
 #'   written to the \code{hdf5file} file. This must be a string specifying (a
 #'   path to) a new file. If the file already exists, it will return error.
 #'
-scone <- function(expr, imputation=list(none=impute_null), impute_args = NULL, rezero = FALSE, scaling, k_ruv=5, k_qc=5,
+scone <- function(expr, imputation=list(none=impute_null), impute_args = NULL,
+                  rezero = FALSE, scaling, k_ruv=5, k_qc=5,
                   ruv_negcon=NULL, qc=NULL, adjust_bio=c("no", "yes", "force"),
                   adjust_batch=c("no", "yes", "force"), bio=NULL, batch=NULL,
                   run=TRUE, evaluate=TRUE, eval_pcs=3, eval_proj = NULL,

From 1ca23e3a1f9e000d2ce6ea51bb8923f5d9fc9c07 Mon Sep 17 00:00:00 2001
From: Davide Risso <risso.davide@gmail.com>
Date: Fri, 22 Jul 2016 12:04:27 -0700
Subject: [PATCH 27/27] Version bumped

---
 DESCRIPTION | 4 ++--
 inst/NEWS   | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index c53d27c..b1acdfe 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: scone
-Version: 0.0.5-9002
+Version: 0.0.6
 Title: Single Cell Overview of Normalized Expression data
 Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets.
 Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",
@@ -8,7 +8,7 @@ Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",
 	          role = c("aut", "cph")))
 Author: Michael Cole [aut, cre, cph], Davide Risso [aut, cph]
 Maintainer: Michael Cole <mbeloc@gmail.com>
-Date: 2016-07-14
+Date: 2016-07-22
 License: Artistic-2.0
 Depends:
   R (>= 3.3)
diff --git a/inst/NEWS b/inst/NEWS
index 168c21f..14013d5 100644
--- a/inst/NEWS
+++ b/inst/NEWS
@@ -1,5 +1,4 @@
-
-Changes in version 0.0.6 (2016-07-07)
+Changes in version 0.0.6 (2016-07-22)
 ========================
 
 * Added option for restoring zeroes after scaling step.