Merge branch 'release/0.0.6'

YosefLab · Jul 22, 2016 · c7b195b · c7b195b
2 parents ee405e9 + 1ca23e3
commit c7b195b
Show file tree

Hide file tree

Showing 26 changed files with 901 additions and 408 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -29,8 +29,12 @@ README[.]md
 ^[.]devel
 ^[.]test
 ^[.]check
+.Rhistory
+R/.Rhistory
 
 #----------------------------
 # Temp scripts
 #----------------------------
-^old_scripts/*
+^old_scripts/*
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
@@ -5,4 +5,4 @@ vignettes/figure
 *.Rproj
 vignettes/R_cache
 vignettes/R_figure
-old_scripts
+old_scripts
diff --git a/.travis.yml b/.travis.yml
@@ -14,7 +14,7 @@ language: r
 cache: packages
 
 # R versions to be tested on
-r: 
+r:
   - bioc-release
   - bioc-devel
 

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,39 +1,44 @@
 Package: scone
-Version: 0.0.5
+Version: 0.0.6
 Title: Single Cell Overview of Normalized Expression data
 Description: scone is a package to compare and rank the performance of different normalization schemes in real single-cell RNA-seq datasets.
 Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",
 	          role = c("aut", "cre", "cph")),
 	     person("Davide", "Risso", email = "risso.davide@gmail.com",
-	          role = c("aut")))
-Author: Michael Cole [aut, cre, cph], Davide Risso [aut]
+	          role = c("aut", "cph")))
+Author: Michael Cole [aut, cre, cph], Davide Risso [aut, cph]
 Maintainer: Michael Cole <mbeloc@gmail.com>
-Date: 2016-05-12
+Date: 2016-07-22
 License: Artistic-2.0
 Depends:
   R (>= 3.3)
 Imports:
+  aroma.light,
   BiocParallel,
+  class,
   cluster,
   DESeq,
-  EDASeq,
-  MASS,
-  RUVSeq,
-  aroma.light,
-  class,
   diptest,
+  EDASeq,
   edgeR,
   fpc,
   gplots,
+  grDevices,
   limma,
+  MASS,
   matrixStats,
   mixtools,
-  grDevices
+  grDevices,
+  boot,
+  shiny,
+  miniUI,
+  rhdf5,
+  RUVSeq
 Suggests:
   knitr,
   rmarkdown,
   testthat
 VignetteBuilder: knitr
 LazyLoad: yes
-BugReports: https://github.com/epurdom/clusterExperiment/issues
+BugReports: https://github.com/YosefLab/scone/issues
 RoxygenNote: 5.0.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,10 +9,13 @@ export(TMM_FN)
 export(UQ_FN)
 export(UQ_FN_POS)
 export(biplot_colored)
+export(biplot_interactive)
 export(estimate_ziber)
 export(estimate_zinb)
 export(factor_sample_filter)
-export(impute_ziber_simp)
+export(get_normalized)
+export(impute_expectation)
+export(impute_null)
 export(impute_zinb)
 export(lm_adjust)
 export(make_design)
@@ -26,6 +29,8 @@ importFrom(EDASeq,betweenLaneNormalization)
 importFrom(MASS,glm.nb)
 importFrom(RUVSeq,RUVg)
 importFrom(aroma.light,normalizeQuantileRank.matrix)
+importFrom(boot,inv.logit)
+importFrom(boot,logit)
 importFrom(class,knn)
 importFrom(cluster,silhouette)
 importFrom(diptest,dip.test)
@@ -43,7 +48,23 @@ importFrom(limma,lmFit)
 importFrom(matrixStats,colIQRs)
 importFrom(matrixStats,colMedians)
 importFrom(matrixStats,rowMedians)
+importFrom(miniUI,gadgetTitleBar)
+importFrom(miniUI,miniContentPanel)
+importFrom(miniUI,miniPage)
 importFrom(mixtools,normalmixEM)
+importFrom(rhdf5,h5createFile)
+importFrom(rhdf5,h5ls)
+importFrom(rhdf5,h5read)
+importFrom(rhdf5,h5write)
+importFrom(rhdf5,h5write.default)
+importFrom(shiny,brushedPoints)
+importFrom(shiny,observeEvent)
+importFrom(shiny,plotOutput)
+importFrom(shiny,renderPlot)
+importFrom(shiny,renderText)
+importFrom(shiny,runGadget)
+importFrom(shiny,stopApp)
+importFrom(shiny,verbatimTextOutput)
 importFrom(stats,approx)
 importFrom(stats,as.formula)
 importFrom(stats,binomial)

diff --git a/R/biplot_interactive.R b/R/biplot_interactive.R
@@ -0,0 +1,72 @@
+#' Interactive biplot
+#'
+#' This is a wrapper around \code{\link{biplot_colored}}, which creates a shiny
+#' gadget to allow the user to select specific points in the graph.
+#'
+#' @details Since this is based on the shiny gadget feature, it will not work in
+#'   static documents, such as vignettes or markdown / knitr documents.
+#'   See \code{biplot_colored} for more details on the internals.
+#'
+#' @param data a data.frame containing the data to be plotted.
+#' @param scores a numeric vector used to color the points.
+#'
+#' @importFrom miniUI gadgetTitleBar miniContentPanel miniPage gadgetTitleBar
+#' @importFrom shiny plotOutput renderPlot observeEvent brushedPoints runGadget verbatimTextOutput stopApp renderText
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' mat <- matrix(rnorm(1000), ncol=10)
+#' colnames(mat) <- paste("X", 1:ncol(mat), sep="")
+#'
+#' biplot_interactive(mat, mat[,1])
+#' }
+biplot_interactive <- function(data, scores, ...) {
+
+  data <- as.data.frame(data)
+  scores <- as.numeric(scores)
+
+  ui <- miniPage(
+    gadgetTitleBar("Drag to select points"),
+    miniContentPanel(
+      # The brush="brush" argument means we can listen for
+      # brush events on the plot using input$brush.
+      plotOutput("plot1", height = "80%", brush = "plot_brush"),
+      verbatimTextOutput("info")
+    )
+  )
+
+  server <- function(input, output, session) {
+
+    # Compute PCA
+    pc_obj <- prcomp(data, center = TRUE, scale = FALSE)
+    bp_obj <- biplot_colored(pc_obj, y = scores)
+
+    # Render the plot
+    output$plot1 <- renderPlot({
+      # Biplot
+      biplot_colored(pc_obj, y = scores, ...)
+    })
+
+    data_out <- cbind(data, bp_obj)
+
+    output$info <- renderText({
+      xy_range_str <- function(e) {
+        if(is.null(e)) return("NULL\n")
+        idx <- which(bp_obj[,1] >= e$xmin & bp_obj[,1] <= e$xmax &
+                       bp_obj[,2] >= e$ymin & bp_obj[,2] <= e$ymax)
+        paste0(rownames(data)[idx], collapse = "\n")
+      }
+      xy_range_str(input$plot_brush)
+    })
+
+    # Handle the Done button being pressed.
+    observeEvent(input$done, {
+      # Return the brushed points. See ?shiny::brushedPoints.
+      stopApp(brushedPoints(data_out, input$plot_brush, xvar="PC1", yvar="PC2"))
+    })
+  }
+
+  runGadget(ui, server)
+}
diff --git a/R/sample_filtering.R b/R/sample_filtering.R
@@ -1,32 +1,53 @@
 #' Fit Logistic Regression Model of FNR against set of positive control (ubiquitously expressed) genes
 #'
-#' @details logit(Probability of False Negative) ~ a + b*(mean log10p1 expression) .
+#' @details logit(Probability of False Negative) ~ a + b*(median log-expression) .
 #'
 #' @param expr matrix The data matrix in transcript-proportional units (genes in rows, cells in columns).
 #' @param pos_controls A logical vector indexing positive control genes that will be used to compute false-negative rate characteristics.
+#' User must provide at least 2 positive control genes.
 #' @param fn_tresh Inclusive threshold for negative detection. Default 0.01.
+#' fn_tresh must be non-negative.
 #'
-#' @return A list of logistic regression coefficients corresponding to glm fits in each sample. If a fit did not converge, the result reported is NA.
+#' @return A matrix of logistic regression coefficients corresponding to glm fits in each sample (a and b in columns 1 and 2 respectively). If the a & b fit does not converge, b is set to zero and only a is estimated.
+#'
+#' @importFrom boot logit
+#' @importFrom matrixStats rowMedians
 #'
 simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){
 
-  # Mean log10p1 expression
-  mu_obs = rowMeans(log10(expr[pos_controls,]+1))
-
-  # Drop-outs
-  drop_outs = 0 + (expr[pos_controls,] <= fn_tresh)
+  stopifnot(!any(is.na(pos_controls)))
+
+  if (sum(pos_controls) < 2){
+    stop("User must provide at least 2 positive control genes")
+  }
+
+  if (fn_tresh < 0){
+    stop("fn_tresh must be non-negative")
+  }
+
+  pos_expr = expr[pos_controls,]  # Selecting positive-control genes
+  is_drop = pos_expr <= fn_tresh  # Identify false negatives
+  pos_expr[is_drop] = NA          # Set false negatives to NA
+  drop_outs = 0 + is_drop         # Numeric drop-out state
+  drop_rate = colMeans(drop_outs) # Total drop-out rate per sample
+
+  # Median log-expression in positive observations
+  mu_obs = log(rowMedians(pos_expr,na.rm = TRUE))
+  if(any(is.na(mu_obs))){
+    stop("Median log-expression in positive observations NA for some positive control gene/s")
+  }
 
   # Logistic Regression Model of FNR
-  ref.glms = list()
-  for (si in 1:dim(drop_outs)[2]){
+  logistic_coef = matrix(0,ncol(drop_outs),2)
+  for (si in seq_len(ncol(drop_outs))){
     fit = suppressWarnings(glm(cbind(drop_outs[,si],1 - drop_outs[,si]) ~ mu_obs,family=binomial(logit)))
     if(fit$converged){
-      ref.glms[[si]] = fit$coefficients
+      logistic_coef[si,] = fit$coefficients
     } else {
-      ref.glms[[si]] = NA
+      logistic_coef[si,1] = logit(drop_rate[si])
     }
   }
-  return(ref.glms)
+  return(logistic_coef)
 }
 
 #' metric-based sample filtering: function to filter single-cell RNA-Seq libraries.
@@ -51,7 +72,7 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){
 #' If NULL, filtered_fnr will be returned NA.
 #' @param scale. logical. Will expression be scaled by total expression for FNR computation? Default = FALSE
 #' @param glen Gene lengths for gene-length normalization (normalized data used in FNR computation).
-#' @param AUC_range An array of two values, representing range over which FNR AUC will be computed (log10(expr_units + 1)). Default c(0,6)
+#' @param AUC_range An array of two values, representing range over which FNR AUC will be computed (log(expr_units)). Default c(0,15)
 #' @param zcut A numeric value determining threshold Z-score for sd, mad, and mixture sub-criteria. Default 1.
 #' If NULL, only hard threshold sub-criteria will be applied.
 #' @param mixture A logical value determining whether mixture modeling sub-criterion will be applied per primary criterion (metric).
@@ -61,11 +82,11 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){
 #' @param hard_nreads numeric. Hard (lower bound on) nreads threshold. Default 25000.
 #' @param hard_ralign numeric. Hard (lower bound on) ralign threshold. Default 15.
 #' @param hard_breadth numeric. Hard (lower bound on) breadth threshold. Default 0.2.
-#' @param hard_fnr numeric. Hard (upper bound on) fnr threshold. Default 3.
-#' @param suff_nreads numeric. If not null, serves as an upper bound on nreads threshold.
-#' @param suff_ralign numeric. If not null, serves as an upper bound on ralign threshold.  Default 65.
-#' @param suff_breadth numeric. If not null, serves as an upper bound on breadth threshold.  Default 0.8.
-#' @param suff_fnr numeric. If not null, serves as an lower bound on fnr threshold.
+#' @param hard_auc numeric. Hard (upper bound on) fnr auc threshold. Default 10.
+#' @param suff_nreads numeric. If not null, serves as an overriding upper bound on nreads threshold.
+#' @param suff_ralign numeric. If not null, serves as an overriding upper bound on ralign threshold.
+#' @param suff_breadth numeric. If not null, serves as an overriding upper bound on breadth threshold.
+#' @param suff_auc numeric. If not null, serves as an overriding lower bound on fnr auc threshold.
 #' @param plot logical. Should a plot be produced?
 #' @param hist_breaks hist() breaks argument. Ignored if `plot=FALSE`.
 #'
@@ -79,15 +100,16 @@ simple_FNR_params = function(expr, pos_controls, fn_tresh = 0.01){
 #'
 #'@importFrom mixtools normalmixEM
 #'@importFrom diptest dip.test
+#'@importFrom boot inv.logit
 #'@export
 #'
 #'
 metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
                                 gene_filter = NULL, pos_controls = NULL,scale. = FALSE,glen = NULL,
-                                AUC_range = c(0,6), zcut = 1,
+                                AUC_range = c(0,15), zcut = 1,
                                 mixture = TRUE, dip_thresh = 0.05,
-                                hard_nreads = 25000, hard_ralign = 15, hard_breadth = 0.2, hard_fnr = 3,
-                                suff_nreads = NULL, suff_ralign = 65, suff_breadth = 0.8, suff_fnr = NULL,
+                                hard_nreads = 25000, hard_ralign = 15, hard_breadth = 0.2, hard_auc = 10,
+                                suff_nreads = NULL, suff_ralign = NULL, suff_breadth = NULL, suff_auc = NULL,
                                 plot = FALSE, hist_breaks = 10){
 
   criterion_count = 0
@@ -210,17 +232,17 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
     }
 
     # Compute FNR AUC
-    ref.glms = simple_FNR_params(expr = nexpr, pos_controls = pos_controls)
+    logistic_coef = simple_FNR_params(expr = nexpr, pos_controls = pos_controls)
     AUC = NULL
     for (si in 1:dim(expr)[2]){
-      if(!any(is.na(ref.glms[[si]]))){
-        AUC[si] = log(exp(ref.glms[[si]][1] + ref.glms[[si]][2] * AUC_range[2]) + 1)/ref.glms[[si]][2] - log(exp(ref.glms[[si]][1] + ref.glms[[si]][2] * AUC_range[1]) + 1)/ref.glms[[si]][2]
+      if(logistic_coef[si,2] != 0){
+        AUC[si] = log(exp(logistic_coef[si,1] + logistic_coef[si,2] * AUC_range[2]) + 1)/logistic_coef[si,2] - log(exp(logistic_coef[si,1] + logistic_coef[si,2] * AUC_range[1]) + 1)/logistic_coef[si,2]
       } else {
-        stop("glm fit did not converge")
+        AUC[si] = inv.logit(logistic_coef[si,1])*(AUC_range[2] - AUC_range[1])
       }
     }
 
-    AUC_CUTOFF = hard_fnr
+    AUC_CUTOFF = hard_auc
 
     if (!is.null(zcut)){
 
@@ -239,8 +261,8 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
         }
       }
 
-      if(!is.null(suff_fnr)){
-        AUC_CUTOFF = max(AUC_CUTOFF,suff_fnr)
+      if(!is.null(suff_auc)){
+        AUC_CUTOFF = max(AUC_CUTOFF,suff_auc)
       }
     }
     filtered_fnr = AUC > AUC_CUTOFF
@@ -254,7 +276,8 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
 
       is_bad = rep(FALSE,dim(expr)[2])
 
-      par(mfcol = c(criterion_count,2))
+      op <- par(mfcol = c(criterion_count,2))
+      on.exit(par(op))
 
       if(!is.null(nreads)){
         is_bad = filtered_nreads
@@ -304,7 +327,7 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
         }else{
           hist(AUC, main = paste0("auc: Thresh = ",signif(AUC_CUTOFF,3)," , Rm = ",sum(filtered_fnr)," , Tot_Rm = ",sum(is_bad)), xlab = "FNR AUC", breaks = hist_breaks)
         }
-        abline(v = hard_fnr, col = "yellow", lty = 1)
+        abline(v = hard_auc, col = "yellow", lty = 1)
         abline(v = AUC_CUTOFF, col = "blue", lty = 2)
       }
 
@@ -320,13 +343,6 @@ metric_sample_filter = function(expr, nreads = colSums(expr), ralign = NULL,
       if(!is.null(pos_controls)){
         hist(AUC[!is_bad],  main = paste0("auc: Tot = ",sum(!is_bad)), xlab = "FNR AUC", breaks = hist_breaks)
       }
-
-      # v = rbind(filtered_nreads,filtered_ralign,filtered_breadth,filtered_fnr)
-      # rownames(v) = c("nreads","ralign","breadth","fnr")
-      # v = na.omit(v)
-      # m = v %*% t(v)
-      #
-      # barplot(m, beside = TRUE, legend.text = TRUE)
   }
 
   return(list(filtered_nreads = filtered_nreads,
@@ -417,8 +433,9 @@ factor_sample_filter = function(expr, qual, gene_filter = NULL, max_exp_pcs = 5,
   num_qual_pcs = which(csum > min_qual_variance)[1]
 
   if(plot){
+    op <- par(mfrow = c(2,1))
+    on.exit(par(op))
     for (i in 1:num_qual_pcs){
-      par(mfrow = c(2,1))
       hist(qpc$x[,i],breaks = hist_breaks, main = paste0("Distribution of Quality PC ",i), xlab = paste0("Qual PC",i))
       barplot(abs(qpc$rotation[,i]),col = c("red","green")[1 + (qpc$rotation[,i] > 0)], cex.names = .25,horiz = T, las=1, main = "Loadings")
     }