diff --git a/DESCRIPTION b/DESCRIPTION index 32af9ff..35dbd45 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,37 +1,36 @@ Package: PTXQC Type: Package -Title: Quality Report Generation for MaxQuant Results -Version: 0.92.6 -Date: 2019-04-16 +Title: Quality Report Generation for MaxQuant and mzTab Results +Version: 1.0.0 +Date: 2020-01-08 Author: Chris Bielow Maintainer: Chris Bielow Description: Generates Proteomics (PTX) quality control (QC) reports for shotgun LC-MS data analyzed with the - MaxQuant software suite. + MaxQuant software suite (from .txt files) or mzTab files (ideally from OpenMS 'QualityControl' tool). Reports are customizable (target thresholds, subsetting) and available in HTML or PDF format. - Published in J. Proteome Res., Proteomics Quality Control: Quality Control Software for MaxQuant Results (2015) 'doi:10.1021/acs.jproteome.5b00780'. + Published in J. Proteome Res., Proteomics Quality Control: Quality Control Software for MaxQuant Results (2015) + <doi:10.1021/acs.jproteome.5b00780>. SystemRequirements: pandoc (http://pandoc.org) for building Vignettes and output reports as HTML Depends: - R (>= 3.0.0) + R (>= 3.3.0) Imports: data.table, ggplot2 (>= 2.2), ggdendro, - graphics, grid, - gridExtra, grDevices, gtable, kableExtra, knitr (>= 1.10), methods, plyr, - proto, RColorBrewer, reshape2, rmarkdown, seqinr, stats, utils, + UpSetR, yaml Suggests: testthat diff --git a/NAMESPACE b/NAMESPACE index f0ceaca..c125280 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand S3method(print,PTXQC_table) +export(FilenameMapper) export(LCS) export(LCSn) export(RTalignmentTree) @@ -50,6 +51,7 @@ export(plot_MissedCleavages) export(plot_RTPeakWidth) export(plot_RatiosPG) export(plot_ScanIDRate) +export(plot_TIC) export(plot_TopN) export(plot_TopNoverRT) export(plot_UncalibratedMSErr) @@ -70,49 +72,24 @@ export(shortenStrings) export(simplifyNames) export(supCount) export(theme_blank) +exportClasses(FilenameMapper) exportClasses(YAMLClass) 
exportClasses(qcMetric) +import(RColorBrewer) +import(UpSetR) +import(data.table) import(ggdendro) import(ggplot2) -import(graphics) -import(gridExtra) +import(grDevices) +import(grid) import(gtable) import(kableExtra) +import(knitr) import(methods) -import(proto) +import(plyr) +import(reshape2, except = c(dcast, melt)) +import(rmarkdown) import(stats) import(utils) -importFrom(RColorBrewer,brewer.pal) -importFrom(RColorBrewer,brewer.pal.info) -importFrom(data.table,as.data.table) -importFrom(data.table,setkey) -importFrom(grDevices,boxplot.stats) -importFrom(grDevices,dev.off) -importFrom(grDevices,pdf) -importFrom(grid,gList) -importFrom(grid,gTree) -importFrom(grid,gpar) -importFrom(grid,grid.draw) -importFrom(grid,grid.newpage) -importFrom(grid,grobHeight) -importFrom(grid,grobWidth) -importFrom(grid,rectGrob) -importFrom(grid,textGrob) -importFrom(grid,unit.c) -importFrom(knitr,kable) -importFrom(plyr,adply) -importFrom(plyr,compact) -importFrom(plyr,ddply) -importFrom(plyr,dlply) -importFrom(plyr,empty) -importFrom(plyr,ldply) -importFrom(plyr,llply) -importFrom(plyr,mapvalues) -importFrom(plyr,summarise) -importFrom(plyr,summarize) -importFrom(reshape2,dcast) -importFrom(reshape2,melt) -importFrom(rmarkdown,pandoc_available) -importFrom(rmarkdown,render) +import(yaml) importFrom(seqinr,circle) -importFrom(yaml,as.yaml) diff --git a/NEWS b/NEWS index e5c4989..ec5081c 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,13 @@ Versions uploaded to CRAN are marked with [CRAN]. ######### CHANGELOG ########## ################################# +[CRAN] v1.0.0 -- 2020/01/08 + - [FEATURE] support for mzTab input data (e.g. 
from OpenMS 2.5 'QualityControl' tool) + - [FEATURE] new metric: UpSetR plots (shows common sets of peptides across Raw files) + - [FEATURE] new metric: Total Ion Count (TIC) plots (for OpenMS' mzTab input only; not supported for MaxQuant) + - [FIX] issue #55 (MS2 Calibration metric crash when no evidence is given) + - [FIX] issue #56 package data.table not properly imported + [CRAN] v0.92.6 -- 2019/03/14 - [FIX] issue #51 (crash on MSMSScans) - [FIX] issue #54 (timsTOF data fix) diff --git a/R/FilenameMapper.R b/R/FilenameMapper.R new file mode 100644 index 0000000..1732014 --- /dev/null +++ b/R/FilenameMapper.R @@ -0,0 +1,337 @@ +#' +#' Make sure to call $readMappingFile(some_file) if you want to support a user-defined file mapping. +#' Otherwise, calls to $getShortNames() will create/augment the mapping for filenames. +#' +#' +#' +#' @field raw_file_mapping Data.frame with columns 'from', 'to' and maybe 'best.effort' (if shorting was unsuccessful) +#' @field mapping.creation how the current mapping was obtained (user or auto) +#' @field external.mapping.file Filename of user-defined mapping file; only defined if readMappingFile() was called +#' +#' @import ggplot2 +#' +#' @exportClass FilenameMapper +#' @export FilenameMapper +#' +#' @examples +#' a = FilenameMapper$new() +#' a$readMappingFile('filenamemapping.txt') +#' +FilenameMapper = setRefClass("FilenameMapper", + + fields = list(raw_file_mapping = "data.frame", ## with cols 'from', to' and maybe 'best.effort' (if shorting was unsuccessful) + mapping.creation = "character", ## how the current mapping was obtained (user or auto) + external.mapping.file = "character" ## filename of user-defined mapping file; only defined if readMappingFile() was called + ), + methods = list( + +initialize=function() { + .self$raw_file_mapping = data.frame() + .self$mapping.creation = NA_character_ + .self$external.mapping.file = NA_character_ + return(.self) +}, + +specrefToRawfile = function(.self, specrefs) +{ + "Return a 
DF with 'ms_run', 'raw.file' and 'fc.raw.file' given a vector of spectraReferences, e.g. 'ms_run[1]:...', '...'" + + res = data.frame(ms_run = sub("[.]*:.*", "\\1", specrefs)) + return (cbind(res, .self$msrunToRawfile(res$ms_run))) +}, + +msrunToRawfile = function(.self, ms_runs) +{ + "Given a vector of ms_runs, c('ms_run[1]', ...), return a data.frame of identical length with columns 'raw.file' and 'fc.raw.file'." + + if (!"ms.run" %in% colnames(.self$raw_file_mapping)) stop("Mapping is missing 'ms.run' from mzTab!") + + res = .self$getRawfm()[ match(ms_runs, .self$raw_file_mapping$ms.run), c("from", "to")] + colnames(res) = c("raw.file", "fc.raw.file") + return (res) +}, + +getShortNames = function(.self, raw_filenames, max_length = 10, ms_runs = NULL) +{ + "Uses the internal mapping (or re-creates it if current one is incomplete) and maps the input raw names to shorter output names. + Returns a vector of the same length." + #rf <<- raw_filenames + #raw_filenames = rf + + if (!is.null(ms_runs) && length(ms_runs) != length(raw_filenames)) stop("raw_filenames and ms_runs do not have the same length!") + + cat(paste0("Adding fc.raw.file column ...")) + ## if there is no mapping, or if its incomplete (outdated mapping file) + if (nrow(.self$raw_file_mapping) == 0 || any(is.na(match(raw_filenames, .self$raw_file_mapping$from)))) + { ## --> redo + rfm = .self$getShortNamesStatic(unique(raw_filenames), max_length) + if (!is.null(ms_runs)) { + rfm$ms.run = ms_runs[ match(rfm$from, raw_filenames) ] + } + .self$raw_file_mapping = rfm + ## indicate to outside that a new table is ready + .self$mapping.creation = .self$getMappingCreation()['auto'] + } + ## do the mapping + v.result = as.factor(.self$raw_file_mapping$to[match(raw_filenames, .self$raw_file_mapping$from)]) + + ## check for NA's + if (any(is.na(v.result))) + { ## if mapping is incomplete + missing = unique(raw_filenames[is.na(v.result)]) + stop(paste0("Hithero unknown Raw files: ", paste(missing, collapse=", ", 
sep=""), " encountered in file '", file, "' which were not present in previous data files.\nPlease delete the file or fix it.")) + } + cat(paste0(" done\n")) + return (v.result) +}, + +getShortNamesStatic = function(raw.files, max_len, fallbackStartNr = 1) +{ + "Static method: Shorten a set of Raw file names and return a data frame with the mappings. + Mapping will have: $from, $to and optionally $best.effort (if shorting was unsuccessful and numbers had to be used) + \\itemize{ + \\item{\\verb{raw.files} Vector of Raw files.} + \\item{\\verb{max_len} Maximal length of shortening results, before resorting to canonical names (file 1,...).} + \\item{\\verb{fallbackStartNr} Starting index for canonical names.} + } + \\subsection{Return Value}{ data.frame with mapping.} + " + rf_name = raw.files + ## remove prefix + rf_name_s = delLCP(rf_name, + min_out_length = 8, + add_dots = TRUE) + ## remove infix (2 iterations) + rf_name_s = simplifyNames(rf_name_s, + 2, + min_LCS_length = 7, + min_out_length = 8) + + ## check if shorter filenames are still unique (they should be.. if not we have a problem!!) + if (length(rf_name) != length(unique(rf_name_s))) + { + cat("Original names:\n") + cat(rf_name) + cat("Short names:\n") + cat(rf_name_s) + stop("While loading MQ data: shortened raw filenames are not unique! This should not happen. Please contact the developers and provide the above names!") + } + df.mapping = data.frame(from = rf_name, to = rf_name_s, stringsAsFactors = FALSE) + + ## always include 'best.effort' column + df.mapping[, "best.effort"] = df.mapping$to + + ## check if the minimal length was reached + if (max(nchar(df.mapping$to)) > max_len) + { ## resort to short naming convention + cat("Filenames are longer than the maximal allowed size of '" %+% max_len %+% "'. 
Resorting to short versions 'file X'.\n\n") + maxl = length(raw.files) - 1 + fallbackStartNr + df.mapping$to = paste("file", sprintf(paste0("%0", nchar(maxl), "d"), fallbackStartNr:maxl)) ## with leading 0's if required + } + return(df.mapping) +}, + + + +plotNameMapping = function(.self) +{ + "Plots the current mapping of Raw file names to their shortened version. + + Convenience function to plot the mapping (e.g. to a PDF device for reporting). + The data frame can be accessed directly via \\verb{.self$raw_file_mapping}. + If no mapping exists, the function prints a warning to console and returns NULL (which is safe to use in print(NULL)). + + @return if mapping is available, returns a list of plots 'plots' and a Html table string 'htmlTable' ; 'NULL' otherwise. + + " + if (nrow(.self$raw_file_mapping) == 0) + { + cat("No mapping found. Omitting plot.") + return (NULL); + } + + table_header = c("original", "short\nname") + xpos = c(9, 11) + extra = "" + has_best_effort = FALSE + if ("best.effort" %in% colnames(.self$raw_file_mapping)) + { + has_best_effort = TRUE + table_header = c(table_header, "best\neffort") + xpos = c(9, 11, 13) + if (all(.self$raw_file_mapping$to != .self$raw_file_mapping$best.effort)) { + extra = "\n(automatic shortening of names was not sufficient - see 'best effort')" + } + + } + + #mq_mapping = mq$raw_file_mapping + mq_mapping = .self$raw_file_mapping + pl_title = "Mapping of Raw files to their short names\nMapping source: " %+% .self$mapping.creation %+% extra; + + mappingChunk = function(mq_mapping) + { + mq_mapping$ypos = -(1:nrow(mq_mapping)) + head(mq_mapping) + mq_mapping.long = reshape2::melt(mq_mapping, id.vars = c("ypos")) + head(mq_mapping.long) + mq_mapping.long$variable = as.character(mq_mapping.long$variable) + mq_mapping.long$col = "#000000"; + mq_mapping.long$col[mq_mapping.long$variable=="to"] = "#5F0000" + mq_mapping.long$variable[mq_mapping.long$variable=="from"] = xpos[1] + 
mq_mapping.long$variable[mq_mapping.long$variable=="to"] = xpos[2] + if (nchar(extra)) mq_mapping.long$variable[mq_mapping.long$variable=="best.effort"] = xpos[3] + mq_mapping.long$variable = as.numeric(mq_mapping.long$variable) + mq_mapping.long$size = 2; + + df.header = data.frame(ypos = 1, variable = xpos, value = table_header, col = "#000000", size=3) + mq_mapping.long2 = rbind(mq_mapping.long, df.header) + mq_mapping.long2$hpos = 0 ## left aligned, 1=right aligned + mq_mapping.long2$hpos[mq_mapping.long2$variable==xpos[1]] = 1 + mq_mapping.long2$hpos[mq_mapping.long2$variable==xpos[2]] = 0 + if (nchar(extra)) mq_mapping.long2$hpos[mq_mapping.long2$variable==xpos[3]] = 0 + + mqmap_pl = ggplot(mq_mapping.long2, aes_string(x = "variable", y = "ypos")) + + geom_text(aes_string(label="value"), color = mq_mapping.long2$col, hjust=mq_mapping.long2$hpos, size=mq_mapping.long2$size) + + coord_cartesian(xlim=c(0,20)) + + theme_bw() + + theme(plot.margin = grid::unit(c(1,1,1,1), "cm"), line = element_blank(), + axis.title = element_blank(), panel.border = element_blank(), + axis.text = element_blank(), strip.text = element_blank(), legend.position = "none") + + ggtitle(pl_title) + return(mqmap_pl) + } + l_plots = byXflex(mq_mapping, 1:nrow(mq_mapping), 20, mappingChunk, sort_indices = FALSE); + return (list(plots = l_plots, htmlTable = getHTMLTable(.self$raw_file_mapping, pl_title))) + +}, + +getRawfm = function(.self) +{ + "Wrapper function for member 'raw_file_mapping', ensuring that $to is a factor" + + tmp = .self$raw_file_mapping + tmp$to = factor(tmp$to) + return(tmp) +}, + + +readMappingFile = function(.self, filename) +{ + "Reads a mapping table of full Raw file names to shortened names. + + The internal structure \\verb{raw_file_mapping} is created using this file. + If the file is missing, nothing is done and FALSE is returned. 
+ If the file contains contradictory information (different set of $from files) compared to + the current mapping (if present), the internal mapping wins (filemapping is ignored) and FALSE is returned. + + The file must have two columns named: 'orig.Name' and 'new.Name' and use Tab as separator. + This file can be used to manually substitute Raw file names within the report. + The ordering of Raw files in the report can be changed by re-arranging the rows. + I.e. + \\preformatted{ + orig.Name new.Name + 2011_05_30_ALH_OT_21_VIL_TMT_FR01 myfile A + 2011_05_30_ALH_OT_22_VIL_TMT_FR02 another B + } + + @param filename Source filename to read. + @return Returns \\verb{TRUE} if file was read, \\verb{FALSE} if it does not exist. +" + + if (file.exists(filename)) + { + message(paste0("Reading mapping file '", filename, "'\n")) + dfs = read.delim(filename, comment.char="#", stringsAsFactors = FALSE) + colnames(dfs) = gsub("_", ".", colnames(dfs)) ## legacy support for old "best_effort" column (now "best.effort") + req_cols = c(from = "orig.Name", to = "new.Name") + if (!all(req_cols %in% colnames(dfs))) + { + stop("Input file '", filename, "' does not contain the columns '", paste(req_cols, collapse="' and '"), "'.", + " Please fix and re-run PTXQC!") + } + req_cols = c(req_cols, best.effort = "best.effort", ms.run = "ms.run") ## augment + colnames(dfs) = names(req_cols)[match(colnames(dfs), req_cols)] + + if (any(duplicated(dfs$from)) | any(duplicated(dfs$to))) + { + dups = c(dfs$from[duplicated(dfs$from)], dfs$to[duplicated(dfs$to)]) + stop("Input file '", filename_sorting, "' has duplicate entries ('", paste(dups, collapse=", "), ")'!", + " Please fix and re-run PTXQC!") + } + dfs + dfs$to = factor(dfs$to, levels = unique(dfs$to), ordered = TRUE) ## keep the order + dfs$from = factor(dfs$from, levels = unique(dfs$from), ordered = TRUE) ## keep the order + ## set internal mapping + if (nrow(.self$raw_file_mapping) > 0 & ## was initialized before... 
+ !setequal(.self$raw_file_mapping$from, dfs$from)) ## .. and has different data + { + print(paste0("Raw filename mapping in file '", filename, "' has different set of raw files than current data. Mapping file will be ignored and overwritten!", + "\nold filenames in mapping:\n ", paste(dfs$from, collapse="\n "), + "\nnew filenames from data:\n ", paste(.self$raw_file_mapping$from, collapse="\n "))) + return (FALSE) + } + .self$raw_file_mapping = dfs + ## set who defined it + .self$mapping.creation = 'file (user-defined)' + .self$external.mapping.file = filename; ## remember filename for later error messages + return (TRUE) + } + return (FALSE) +}, + + +writeMappingFile = function(.self, filename) +{ + "Writes a mapping table of full Raw file names to shortened names. + + The internal structure \\verb{raw_file_mapping} is written to the + file specified. + File is only created if mapping exists (in .self$raw_file_mapping). + + @param filename Target filename to create. + @return Returns NULL. + " + if (nrow(.self$raw_file_mapping) == 0) + { + cat("No mapping found. 
Writing not possible!") + return (FALSE) + } + + dfs = data.frame(orig.Name = .self$raw_file_mapping$from, new.Name = .self$raw_file_mapping$to) + if (nrow(dfs) == 0) return(NULL) + + if ("best.effort" %in% colnames(.self$raw_file_mapping)) { + dfs$best.effort = .self$raw_file_mapping[, "best.effort"] + } + + if ("ms.run" %in% colnames(.self$raw_file_mapping)) { + dfs$ms.run = .self$raw_file_mapping[,"ms.run"] + } + + ## use a file handle to avoid warning from write.table() when appending + ## a table with column names 'Warning(): appending column names to file' + FH = file(filename, "w") + cat(file = FH, + "# This file can be used to manually substitute Raw file names within the report.", + "# The ordering of Raw files in the report can be changed by re-arranging the rows.", + sep = "\n") + write.table(x = dfs, file = FH, quote = FALSE, sep="\t", row.names = FALSE) + close(FH) ## flush + return (TRUE) +}, + + + +getMappingCreation = function(.self) +{ + "A static function" + return(c(user = 'file (user-defined)', auto = 'automatic')) +} + + + + + +) ## end methods list +) ## end RefClass diff --git a/R/MQDataReader.R b/R/MQDataReader.R index 4f36258..8427364 100644 --- a/R/MQDataReader.R +++ b/R/MQDataReader.R @@ -1,62 +1,32 @@ - -## A proto class for handling consistent Raw file names while loading -## multiple MQ result files. -## If the names are too long, an alias name (eg 'file 1', 'file 2', ...) is used instead. -## -## -## [Occasional rage: since S4 is so very inadequate for basically everything which is important in OOP and the syntax is -## even more horrible, we use the 'proto' package to at least abstract away much of this S4 non-sense. -## Read http://cran.r-project.org/doc/contrib/Genolini-S4tutorialV0-5en.pdf::10:2 Method to modify a field and you'll see -## what I mean] -## - #' #' Convenience wrapper for MQDataReader when only a single MQ file should be read #' and file mapping need not be stored. 
#' -#' For params, see \code{\link{MQDataReader$readMQ}}. +#' For params, see \code{MQDataReader::readMQ()}. #' -#' @param file see \code{\link{MQDataReader$readMQ}} -#' @param filter see \code{\link{MQDataReader$readMQ}} -#' @param type see \code{\link{MQDataReader$readMQ}} -#' @param col_subset see \code{\link{MQDataReader$readMQ}} -#' @param add_fs_col see \code{\link{MQDataReader$readMQ}} -#' @param LFQ_action see \code{\link{MQDataReader$readMQ}} -#' @param ... see \code{\link{MQDataReader$readMQ}} -#' @return see \code{\link{MQDataReader$readMQ}} +#' @param file see \code{MQDataReader::readMQ()} +#' @param filter see \code{MQDataReader::readMQ()} +#' @param type see \code{MQDataReader::readMQ()} +#' @param col_subset see \code{MQDataReader::readMQ()} +#' @param add_fs_col see \code{MQDataReader::readMQ()} +#' @param LFQ_action see \code{MQDataReader::readMQ()} +#' @param ... see \code{MQDataReader::readMQ()} +#' @return see \code{MQDataReader::readMQ()} #' #' @export #' -read.MQ <- function(file, filter = "", type = "pg", col_subset = NA, add_fs_col = 10, LFQ_action = FALSE, ...) +read.MQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col = 10, LFQ_action = FALSE, ...) { mq = MQDataReader$new() mq$readMQ(file, filter, type, col_subset, add_fs_col, LFQ_action, ...) } - -## CLASS 'MQDataReader' -MQDataReader <- proto() - -#' Constructor for class 'MQDataReader'. #' -#' This class is used to read MQ data tables using readMQ() while holding +#' S5-RefClass to read MaxQuant .txt files +#' +#' This class is used to read MQ data tables using \code{MQDataReader::readMQ()} while holding #' the internal raw file --> short raw file name mapping (stored in a member called -#' 'raw_file_mapping') and updating/using it every time readMQ() is called. -#' -#' @name MQDataReader$new -#' @import proto -#' -MQDataReader$new <- function(.) 
-{ - proto(., raw_file_mapping = NULL, mq.data = NULL, mapping.creation = NULL, external.mapping.file = NULL) -} - - -## -## Functions -## - -#' Wrapper to read a MQ txt file (e.g. proteinGroups.txt). +#' 'fn_map') and updating/using it every time \code{MQDataReader::readMQ()} is called. #' #' Since MaxQuant changes capitalization and sometimes even column names, it seemed convenient #' to have a function which just reads a txt file and returns unified column names, irrespective of the MQ version. @@ -85,45 +55,59 @@ MQDataReader$new <- function(.) #' d_evd = mq$readMQ("evidence.txt", type="ev", filter="R", col_subset=c("proteins", "Retention.Length", "retention.time.calibration")) #' } #' -#' If the file is empty, this function stops with an error. -#' -#' @param . A 'this' pointer. Use it to refer/change internal members. It's implicitly added, thus not required too call the function! -#' @param file (Relative) path to a MQ txt file () -#' @param filter Searched for "C" and "R". If present, [c]ontaminants and [r]everse hits are removed if the respective columns are present. -#' E.g. to filter both, \code{filter = "C+R"} -#' @param type Allowed values are: -#' "pg" (proteinGroups) [default], adds abundance index columns (*AbInd*, replacing 'intensity') -#' "sm" (summary), splits into three row subsets (raw.file, condition, total) -#' Any other value will not add any special columns -#' @param col_subset A vector of column names as read by read.delim(), e.g., spaces are replaced by dot already. -#' If given, only columns with these names (ignoring lower/uppercase) will be returned (regex allowed) -#' E.g. col_subset=c("^lfq.intensity.", "protein.name") -#' @param add_fs_col If TRUE and a column 'raw.file' is present, an additional column 'fc.raw.file' will be added with -#' common prefix AND common substrings removed (\code{\link{simplifyNames}}) -#' E.g. 
two rawfiles named 'OrbiXL_2014_Hek293_Control', 'OrbiXL_2014_Hek293_Treated' will give -#' 'Control', 'Treated' -#' If \code{add_fs_col} is a number AND the longest short-name is still longer, the names are discarded and replaced by -#' a running ID of the form 'file ', where is a number from 1 to N. -#' If the function is called again and a mapping already exists, this mapping is used. -#' Should some raw.files be unknown (ie the mapping from the previous file is incomplete), they will be augmented -#' @param check_invalid_lines After reading the data, check for unusual number of NA's to detect if file was corrupted by Excel or alike -#' @param LFQ_action [For type=='pg' only] An additional custom LFQ column ('cLFQ...') is created where -#' zero values in LFQ columns are replaced by the following method IFF(!) the corresponding raw intensity is >0 (indicating that LFQ is erroneusly 0) -#' "toNA": replace by NA -#' "impute": replace by lowest LFQ value >0 (simulating 'noise') -#' @param ... Additional parameters passed on to read.delim() -#' @return A data.frame of the respective file +#' If the file is empty, this function shows a warning and returns NULL. +#' If the file is present but cannot be read, the program will stop. #' -#' @name MQDataReader$readMQ -#' @import utils -#' @import graphics -#' -#' -# (not exported!) -MQDataReader$readMQ <- function(., file, filter="", type="pg", col_subset=NA, add_fs_col=10, check_invalid_lines = TRUE, LFQ_action=FALSE, ...) +MQDataReader = setRefClass("MQDataReader", + fields = list(mq.data = "data.frame", + other = "list", + fn_map = "FilenameMapper" + ), + methods = list( + initialize = function() { + .self$mq.data = data.frame(); + .self$other = list(); + .self$fn_map = FilenameMapper$new(); + return(.self) + }, + +readMQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col = 10, check_invalid_lines = TRUE, LFQ_action = FALSE, ...) { - # . = MQDataReader$new() ## debug - # ... 
= NULL + #' + #' Wrapper to read a MQ txt file (e.g. proteinGroups.txt). + #' + #' @param file (Relative) path to a MQ txt file. + #' @param filter Searched for "C" and "R". If present, [c]ontaminants and [r]everse hits are removed if the respective columns are present. + #' E.g. to filter both, \code{filter = "C+R"} + #' @param type Allowed values are: + #' "pg" (proteinGroups) [default], adds abundance index columns (*AbInd*, replacing 'intensity') + #' "sm" (summary), splits into three row subsets (raw.file, condition, total) + #' "ev" (evidence), will fix empty modified.sequence cells for older MQ versions (when MBR is active) + #' Any other value will not add any special columns + #' @param col_subset A vector of column names as read by read.delim(), e.g., spaces are replaced by dot already. + #' If given, only columns with these names (ignoring lower/uppercase) will be returned (regex allowed) + #' E.g. col_subset=c("^lfq.intensity.", "protein.name") + #' @param add_fs_col If TRUE and a column 'raw.file' is present, an additional column 'fc.raw.file' will be added with + #' common prefix AND common substrings removed (\code{\link{simplifyNames}}) + #' E.g. two rawfiles named 'OrbiXL_2014_Hek293_Control', 'OrbiXL_2014_Hek293_Treated' will give + #' 'Control', 'Treated' + #' If \code{add_fs_col} is a number AND the longest short-name is still longer, the names are discarded and replaced by + #' a running ID of the form 'file ', where is a number from 1 to N. + #' If the function is called again and a mapping already exists, this mapping is used. 
+ #' Should some raw.files be unknown (ie the mapping from the previous file is incomplete), they will be augmented + #' @param check_invalid_lines After reading the data, check for unusual number of NA's to detect if file was corrupted by Excel or alike + #' @param LFQ_action [For type=='pg' only] An additional custom LFQ column ('cLFQ...') is created where + #' zero values in LFQ columns are replaced by the following method IFF(!) the corresponding raw intensity is >0 (indicating that LFQ is erroneusly 0) + #' "toNA": replace by NA + #' "impute": replace by lowest LFQ value >0 (simulating 'noise') + #' @param ... Additional parameters passed on to read.delim() + #' @return A data.frame of the respective file + #' + + if (!file.exists(file)) { + cat(paste0("MaxQuant file ", file, " was not found. Reading skipped.\n")) + return (NULL); + } cat(paste("Reading file", file,"...\n")) ## error message if failure should occur below msg_parse_error = paste0("\n\nParsing the file '", file, "' failed. See message above why. If the file is not usable but other files are ok, disable the corresponding section in the YAML config. You might also be running a foreign locale (e.g. Chinese) - switch to an English locale and make sure that txt files are encoded in ASCII (Latin-1)!") @@ -159,17 +143,17 @@ MQDataReader$readMQ <- function(., file, filter="", type="pg", col_subset=NA, ad cat(paste("Keeping", sum(colClasses != "NULL", na.rm=TRUE), "of", ncol(data_header), "columns!\n")) if (sum(colClasses != "NULL", na.rm=TRUE) == 0) { ## can happen for very old MQ files without header, or if the user just gave the wrong colClasses - .$mq.data = data.frame() - return (.$mq.data) + .self$mq.data = data.frame() + return (.self$mq.data) } } ## higher memory consumption during load (due to memory mapped files) compared to read.delim... but about 5x faster ## , but also different numerical results when parsing numbers!!! 
- #.$mq.data = try( + #.self$mq.data = try( # fread(file, header = TRUE, sep='\t', na.strings=c("NA", "n. def."), verbose = TRUE, select = idx_keep, data.table = FALSE, ...) #) - #colnames(.$mq.data) = make.names(colnames(.$mq.data), unique = TRUE) + #colnames(.self$mq.data) = make.names(colnames(.self$mq.data), unique = TRUE) ## comment.char should be "", since lines will be TRUNCATED starting at the comment char.. and a protein identifier might contain just anything... ## na.strings: @@ -177,17 +161,17 @@ MQDataReader$readMQ <- function(., file, filter="", type="pg", col_subset=NA, ad ## However, when the colClass is 'numeric', whitespaces are stripped, and only AFTERWARDS the string ## is checked against na.strings ## - the '\u975E\u6570\u5B57' na-string is the chinese UTF-8 representation of "NA" - .$mq.data = try(read.delim(file, na.strings=c("NA", "n. def.", "n.def.", "\u975E\u6570\u5B57"), encoding="UTF-8", comment.char="", stringsAsFactors = FALSE, colClasses = colClasses, ...)) - if (inherits(.$mq.data, 'try-error')) stop(msg_parse_error, call. = FALSE); + .self$mq.data = try(read.delim(file, na.strings=c("NA", "n. def.", "n.def.", "\u975E\u6570\u5B57"), encoding="UTF-8", comment.char="", stringsAsFactors = FALSE, colClasses = colClasses, ...)) + if (inherits(.self$mq.data, 'try-error')) stop(msg_parse_error, call. 
= FALSE); - #colnames(.$mq.data) + #colnames(.self$mq.data) - cat(paste0("Read ", nrow(.$mq.data), " entries from ", file,".\n")) + cat(paste0("Read ", nrow(.self$mq.data), " entries from ", file,".\n")) ### checking for invalid rows if (check_invalid_lines == TRUE & type != "sm") ## summary.txt has irregular structure { - inv_lines = .$getInvalidLines(); + inv_lines = .self$getInvalidLines(); if (length(inv_lines) > 0) { stop(paste0("\n\nError: file '", file, "' seems to have been edited in Microsoft Excel and", @@ -198,7 +182,7 @@ MQDataReader$readMQ <- function(., file, filter="", type="pg", col_subset=NA, ad } cat(paste0("Updating colnames\n")) - cn = colnames(.$mq.data) + cn = colnames(.self$mq.data) ### just make everything lower.case (MQ versions keep changing it and we want it to be reproducible) cn = tolower(cn) ## rename some columns since MQ 1.2 vs. 1.3 differ.... @@ -220,17 +204,17 @@ MQDataReader$readMQ <- function(., file, filter="", type="pg", col_subset=NA, ad cn[idx_c] = gsub(".intensity", ".intensity.corrected", cn[idx_c]); } - colnames(.$mq.data) = cn + colnames(.self$mq.data) = cn ## work in-place on 'contaminant' column cat(paste0("Simplifying contaminants\n")) - .$substitute("contaminant"); + .self$substitute("contaminant"); cat(paste0("Simplifying reverse\n")) - .$substitute("reverse"); - if (grepl("C", filter) & ("contaminant" %in% colnames(.$mq.data))) .$mq.data = .$mq.data[!(.$mq.data$contaminant),] - if (grepl("R", filter) & ("reverse" %in% colnames(.$mq.data))) .$mq.data = .$mq.data[!(.$mq.data$reverse),] + .self$substitute("reverse"); + if (grepl("C", filter) & ("contaminant" %in% colnames(.self$mq.data))) .self$mq.data = .self$mq.data[!(.self$mq.data$contaminant),] + if (grepl("R", filter) & ("reverse" %in% colnames(.self$mq.data))) .self$mq.data = .self$mq.data[!(.self$mq.data$reverse),] ## proteingroups.txt special treatment if (type=="pg") { @@ -238,15 +222,15 @@ MQDataReader$readMQ <- function(., file, filter="", type="pg", 
col_subset=NA, ad if (LFQ_action!=FALSE) { ## replace erroneous zero LFQ values with something else cat("Starting LFQ action.\nReplacing ...\n") - lfq_cols = grepv("^lfq", colnames(.$mq.data)) + lfq_cols = grepv("^lfq", colnames(.self$mq.data)) for (cc in lfq_cols) { ## get corresponding raw intensity column rawint_col = sub("^lfq\\.", "", cc) - if (!(rawint_col %in% colnames(.$mq.data))) {stop(paste0("Could not find column '", rawint_col, "' in dataframe with columns: ", paste(colnames(.$mq.data), collapse=",")), "\n")} - vals = .$mq.data[, cc] + if (!(rawint_col %in% colnames(.self$mq.data))) {stop(paste0("Could not find column '", rawint_col, "' in dataframe with columns: ", paste(colnames(.self$mq.data), collapse=",")), "\n")} + vals = .self$mq.data[, cc] ## affected rows - bad_rows = (.$mq.data[, rawint_col]>0 & .$mq.data[, cc]==0) + bad_rows = (.self$mq.data[, rawint_col]>0 & .self$mq.data[, cc]==0) if (sum(bad_rows, na.rm = TRUE)==0) {next;} ## take action if (LFQ_action=="toNA" | LFQ_action=="impute") { @@ -258,9 +242,9 @@ MQDataReader$readMQ <- function(., file, filter="", type="pg", col_subset=NA, ad ## replace with minimum noise value (>0!) 
vals[bad_rows] = impVal; } - cat(paste0(" '", cc, "' ", sum(bad_rows, na.rm = TRUE), ' entries (', sum(bad_rows, na.rm = TRUE)/nrow(.$mq.data)*100,'%) with ', impVal, '\n')) + cat(paste0(" '", cc, "' ", sum(bad_rows, na.rm = TRUE), ' entries (', sum(bad_rows, na.rm = TRUE)/nrow(.self$mq.data)*100,'%) with ', impVal, '\n')) ## add column - .$mq.data[, paste0("c", cc)] = vals; + .self$mq.data[, paste0("c", cc)] = vals; ## stats = rbind(stats, c(sum(bad_rows, na.rm = TRUE), impVal)) @@ -274,17 +258,17 @@ MQDataReader$readMQ <- function(., file, filter="", type="pg", col_subset=NA, ad } } - int_cols = grepv("intensity", colnames(.$mq.data)) + int_cols = grepv("intensity", colnames(.self$mq.data)) ## ## apply potential fix (MQ 1.5 writes numbers in scientific notation with ',' -- which parses as String :( ) ## - int_cols_nn = (apply(.$mq.data[,int_cols, drop = FALSE], 2, class) != "numeric") + int_cols_nn = (apply(.self$mq.data[,int_cols, drop = FALSE], 2, class) != "numeric") if (any(int_cols_nn)) { - .$mq.data[, int_cols[int_cols_nn]] = sapply(int_cols[int_cols_nn], function(x_name) + .self$mq.data[, int_cols[int_cols_nn]] = sapply(int_cols[int_cols_nn], function(x_name) { - x = .$mq.data[, x_name] + x = .self$mq.data[, x_name] if (class(x) == "integer") { x = as.numeric(x) @@ -307,314 +291,84 @@ MQDataReader$readMQ <- function(., file, filter="", type="pg", col_subset=NA, ad ## ## add Abundance index ## - if ("mol..weight..kda." %in% colnames(.$mq.data)){ + if ("mol..weight..kda." 
%in% colnames(.self$mq.data)){ ### add abundance index columns (for both, intensity and lfq.intensity) - .$mq.data[, sub("intensity", "AbInd", int_cols)] = apply(.$mq.data[,int_cols, drop = FALSE], 2, function(x) + .self$mq.data[, sub("intensity", "AbInd", int_cols)] = apply(.self$mq.data[,int_cols, drop = FALSE], 2, function(x) { - return (x / .$mq.data[,"mol..weight..kda."]) + return (x / .self$mq.data[,"mol..weight..kda."]) }) } else { - stop("MQDataReader$readMQ(): Cannot add abundance index since 'mol..weight..kda.' was not loaded from file. Did you use the correct 'type' or forgot to add the column in 'col_subset'?") + stop("MQDataReader::readMQ(): Cannot add abundance index since 'mol..weight..kda.' was not loaded from file. Did you use the correct 'type' or forgot to add the column in 'col_subset'?") } } else if (type=="sm") { ## summary.txt special treatment ## find the first row, which lists Groups (after Raw files): it has two non-zero entries only ## (or even less if the group name is empty) - ##dx <<- .$mq.data; - idx_group = which(apply(.$mq.data, 1, function(x) sum(x!="", na.rm = TRUE))<=2)[1] + ##dx <<- .self$mq.data; + idx_group = which(apply(.self$mq.data, 1, function(x) sum(x!="", na.rm = TRUE))<=2)[1] ## summary.txt will not contain groups, if none where specified during MQ-configuration if (is.na(idx_group)) { - idx_group = nrow(.$mq.data) + idx_group = nrow(.self$mq.data) groups= NA } else { - groups = .$mq.data[idx_group:(nrow(.$mq.data)-1), ] + groups = .self$mq.data[idx_group:(nrow(.self$mq.data)-1), ] } - raw.files = .$mq.data[1:(idx_group-1), ] - total = .$mq.data - .$mq.data = raw.files ## temporary, until we have assigned the fc.raw.files - } - - - if (add_fs_col & "raw.file" %in% colnames(.$mq.data)) - { - cat(paste0("Adding fc.raw.file column ...")) - ## check if we already have a mapping - if (is.null(.$raw_file_mapping)) + raw.files = .self$mq.data[1:(idx_group-1), ] + total = .self$mq.data + .self$mq.data = raw.files ## temporary, 
until we have assigned the fc.raw.files + } else if (type == "ev") { + ## check if data is missing + if (all(c("type", "modified.sequence") %in% colnames(.self$mq.data)) & + any("MULTI-MATCH" %in% .self$mq.data$type) & + all(.self$mq.data$modified.sequence[.self$mq.data$type=="MULTI-MATCH"]=="")) { - .$raw_file_mapping = .$getShortNames(unique(.$mq.data$raw.file), add_fs_col) - ## indicate to outside that a new table is ready - .$mapping.creation = .$getMappingCreation()['auto'] + warning(immediate. = TRUE, "readMQ(): Input data has empty cells for column 'modified.sequence' of type 'MULTI-MATCH'. Early MaxQuant versions (e.g. 1.2.2) have this problem. We will try to reconstruct the data.") + ## use the preceeding sequence (and hope that there are no missing rows in between) + .self$mq.data = .self$mq.data[order(.self$mq.data$id), ] + ## find blocks of MATCHed rows ... + idx_mm = which(.self$mq.data$type=="MULTI-MATCH") ## row index + head(idx_mm) + idx_block_start = idx_mm[ c(1, which(diff(idx_mm)>1) + 1) ] ## index to block of MATCHES + head(idx_block_start) + idx_block_end = c(idx_mm[match(idx_block_start, idx_mm)[-1]-1], idx_mm[length(idx_mm)]) + head(idx_block_end) + .self$mq.data$modified.sequence[idx_mm] = rep(.self$mq.data$modified.sequence[idx_block_start-1], + idx_block_end-idx_block_start+1) } - ## do the mapping - .$mq.data$fc.raw.file = as.factor(.$raw_file_mapping$to[match(.$mq.data$raw.file, .$raw_file_mapping$from)]) - ## check for NA's - if (any(is.na(.$mq.data$fc.raw.file))) - { ## if mapping is incomplete - missing = unique(.$mq.data$raw.file[is.na(.$mq.data$fc.raw.file)]) - if (.$mapping.creation == .$getMappingCreation()['user']) - { - ## the user has re-run MaxQuant with more Raw files, - ## but the old _filename_sort.txt file was used to read the (now incomplete mapping) - warning("Incomplete mapping file '", .$external.mapping.file, "'.\nAugmenting shortened Raw files:\n " %+% - paste(missing, collapse="\n ", sep="") %+% ".\nEdit the table 
if necessary and re-run PTXQC.") - ## augment - addon = .$getShortNames(missing, add_fs_col, nrow(.$raw_file_mapping) + 1) - .$raw_file_mapping = rbind(.$raw_file_mapping, - addon) - } else { - stop("Hithero unknown Raw files: " %+% paste(missing, collapse=", ", sep="") %+% " occurred in file '" %+% file %+% "' which were not present in previous txt files.") - } - } - cat(paste0(" done\n")) - } - - if (type=="sm") { ## post processing for summary - ## .$mq.data is basically "raw.files#, but with fc.raw.files augmented - .$mq.data = list(raw = .$mq.data, groups = groups, total = total) } - return (.$mq.data); -} ## end readMQ() - - -#' -#' Shorten a set of Raw file names and return a data frame with the mappings. -#' -#' @param raw.files Vector of Raw files -#' @param max_len Maximal length of shortening results, before resorting to canonical names (file 1,...) -#' @param fallbackStartNr Starting index for canonical names -#' @return data.frame with mapping -#' -#' @name MQDataReader$getShortNames -#' -MQDataReader$getShortNames = function(., raw.files, max_len, fallbackStartNr = 1) -{ - ## - ## mapping will have: $from, $to and optionally $best.effort (if shorting was unsuccessful and numbers had to be used) - ## - rf_name = raw.files - ## remove prefix - rf_name_s = delLCP(rf_name, - min_out_length = 8, - add_dots = TRUE) - ## remove infix (2 iterations) - rf_name_s = simplifyNames(rf_name_s, - 2, - min_LCS_length = 7, - min_out_length = 8) - ## check if shorter filenames are still unique (they should be.. if not we have a problem!!) - if (length(rf_name) != length(unique(rf_name_s))) + if (add_fs_col & "raw.file" %in% colnames(.self$mq.data)) { - cat("Original names:\n") - cat(rf_name) - cat("Short names:\n") - cat(rf_name_s) - stop("While loading MQ data: shortened raw filenames are not unique! This should not happen. 
Please contact the developers and provide the above names!") - } - df.mapping = data.frame(from = rf_name, to = rf_name_s, stringsAsFactors = FALSE) - - ## always include 'best.effort' column - df.mapping[, "best.effort"] = df.mapping$to - - ## check if the minimal length was reached - if (max(nchar(df.mapping$to)) > max_len) - { ## resort to short naming convention - cat("Filenames are longer than the maximal allowed size of '" %+% max_len %+% "'. Resorting to short versions 'file X'.\n\n") - maxl = length(raw.files) - 1 + fallbackStartNr - df.mapping$to = paste("file", sprintf(paste0("%0", nchar(maxl), "d"), fallbackStartNr:maxl)) ## with leading 0's if required - } - return(df.mapping) -} - -#' Plots the current mapping of Raw file names to their shortened version. -#' -#' Convenience function to plot the mapping (e.g. to a PDF device for reporting). -#' The data frame can be accessed directly via \code{.$raw_file_mapping}. -#' If no mapping exists, the function prints a warning to console and returns NULL (which is safe to use in print(NULL)). -#' -#' @return if mapping is available, returns a list of plots 'plots' and a Html table string 'htmlTable' ; 'NULL' otherwise. -#' -#' @import ggplot2 -#' -#' @name MQDataReader$plotNameMapping -#' -MQDataReader$plotNameMapping <- function(.) 
-{ - if (!is.null(.$raw_file_mapping)) - { - table_header = c("original", "short\nname") - xpos = c(9, 11) - extra = "" - has_best_effort = FALSE - if ("best.effort" %in% colnames(.$raw_file_mapping)) - { - has_best_effort = TRUE - table_header = c(table_header, "best\neffort") - xpos = c(9, 11, 13) - if (all(.$raw_file_mapping$to != .$raw_file_mapping$best.effort)) { - extra = "\n(automatic shortening of names was not sufficient - see 'best effort')" - } - - } - - #mq_mapping = mq$raw_file_mapping - mq_mapping = .$raw_file_mapping - pl_title = "Mapping of Raw files to their short names\nMapping source: " %+% .$mapping.creation %+% extra; + .self$mq.data$fc.raw.file = .self$fn_map$getShortNames(.self$mq.data$raw.file, add_fs_col) - mappingChunk = function(mq_mapping) - { - mq_mapping$ypos = -(1:nrow(mq_mapping)) - head(mq_mapping) - mq_mapping.long = melt(mq_mapping, id.vars = c("ypos")) - head(mq_mapping.long) - mq_mapping.long$variable = as.character(mq_mapping.long$variable) - mq_mapping.long$col = "#000000"; - mq_mapping.long$col[mq_mapping.long$variable=="to"] = "#5F0000" - mq_mapping.long$variable[mq_mapping.long$variable=="from"] = xpos[1] - mq_mapping.long$variable[mq_mapping.long$variable=="to"] = xpos[2] - if (nchar(extra)) mq_mapping.long$variable[mq_mapping.long$variable=="best.effort"] = xpos[3] - mq_mapping.long$variable = as.numeric(mq_mapping.long$variable) - mq_mapping.long$size = 2; - - df.header = data.frame(ypos = 1, variable = xpos, value = table_header, col = "#000000", size=3) - mq_mapping.long2 = rbind(mq_mapping.long, df.header) - mq_mapping.long2$hpos = 0 ## left aligned, 1=right aligned - mq_mapping.long2$hpos[mq_mapping.long2$variable==xpos[1]] = 1 - mq_mapping.long2$hpos[mq_mapping.long2$variable==xpos[2]] = 0 - if (nchar(extra)) mq_mapping.long2$hpos[mq_mapping.long2$variable==xpos[3]] = 0 - - mqmap_pl = ggplot(mq_mapping.long2, aes_string(x = "variable", y = "ypos")) + - geom_text(aes_string(label="value"), color = 
mq_mapping.long2$col, hjust=mq_mapping.long2$hpos, size=mq_mapping.long2$size) + - coord_cartesian(xlim=c(0,20)) + - theme_bw() + - theme(plot.margin = grid::unit(c(1,1,1,1), "cm"), line = element_blank(), - axis.title = element_blank(), panel.border = element_blank(), - axis.text = element_blank(), strip.text = element_blank(), legend.position = "none") + - ggtitle(pl_title) - return(mqmap_pl) - } - l_plots = byXflex(mq_mapping, 1:nrow(mq_mapping), 20, mappingChunk, sort_indices = FALSE); - return (list(plots = l_plots, htmlTable = getHTMLTable(.$raw_file_mapping, pl_title))) - } else { - cat("No mapping found. Omitting plot.") - return (NULL); } - - return (NULL); -} - -#' -#' Writes a mapping table of full Raw file names to shortened names. -#' -#' The internal structure \code{raw_file_mapping} is written to the -#' file specified. -#' File is only created if mapping exists (in .$raw_file_mapping). -#' -#' @param filename Target filename to create. -#' @return Returns NULL. -#' -#' @name MQDataReader$writeMappingFile -#' -MQDataReader$writeMappingFile = function(., filename) -{ - dfs = data.frame(orig.Name = .$raw_file_mapping$from, new.Name = .$raw_file_mapping$to) - if (nrow(dfs) == 0) return(NULL) - - if ("best.effort" %in% colnames(.$raw_file_mapping)) { - dfs$best.effort = .$raw_file_mapping[, "best.effort"] + if (type=="sm") { ## post processing for summary + ## .self$mq.data is basically "raw.files#, but with fc.raw.files augmented + .self$other = list(groups = groups, total = total) } - ## use a file handle to avoid warning from write.table() when appending - ## a table with column names 'Warning(): appending column names to file' - FH = file(filename, "w") - cat(file = FH, - "# This file can be used to manually substitute Raw file names within the report.", - "# The ordering of Raw files in the report can be changed by re-arranging the rows.", - sep = "\n") - write.table(x = dfs, file = FH, quote = FALSE, sep="\t", row.names = FALSE) - close(FH) ## 
flush - return(NULL) -} - + return (.self$mq.data); +}, ## end readMQ() -#' -#' Reads a mapping table of full Raw file names to shortened names. -#' -#' The internal structure \code{raw_file_mapping} is created using this file. -#' If the file is missing, nothing is done. -#' -#' The file must have two columns named: 'orig.Name' and 'new.Name' and use Tab as separator. -#' I.e. -#' \preformatted{# This file can be used to manually substitute Raw file names within the report. -#' # The ordering of Raw files in the report can be changed by re-arranging the rows. -#' orig.Name new.Name -#' 2011_05_30_ALH_OT_21_VIL_TMT_FR01 myfile A -#' 2011_05_30_ALH_OT_22_VIL_TMT_FR02 another B -#' } -#' -#' @param filename Source filename to read. -#' @return Returns \code{TRUE} if file was read, \code{FALSE} if it does not exist. -#' -#' @name MQDataReader$readMappingFile -#' -MQDataReader$readMappingFile = function(., filename) -{ - if (file.exists(filename)) - { - message(paste0("Reading mapping file '", filename, "'\n")) - dfs = read.delim(filename, comment.char="#", stringsAsFactors = FALSE) - colnames(dfs) = gsub("_", ".", colnames(dfs)) ## legacy support for old "best_effort" column (now "best.effort") - req_cols = c(from = "orig.Name", to = "new.Name") - if (!all(req_cols %in% colnames(dfs))) - { - stop("Input file '", filename, "' does not contain the columns '", paste(req_cols, collapse="' and '"), "'.", - " Please fix and re-run PTXQC!") - } - req_cols = c(req_cols, best.effort = "best.effort") ## augment - colnames(dfs) = names(req_cols)[match(colnames(dfs), req_cols)] - - if (any(duplicated(dfs$from)) | any(duplicated(dfs$to))) - { - dups = c(dfs$from[duplicated(dfs$from)], dfs$to[duplicated(dfs$to)]) - stop("Input file '", filename_sorting, "' has duplicate entries ('", paste(dups, collapse=", "), ")'!", - " Please fix and re-run PTXQC!") - } - dfs - dfs$to = factor(dfs$to, levels = unique(dfs$to), ordered = TRUE) ## keep the order - dfs$from = factor(dfs$from, levels 
= unique(dfs$from), ordered = TRUE) ## keep the order - ## set internal mapping - .$raw_file_mapping = dfs - ## set who defined it - .$mapping.creation = 'file (user-defined)' - .$external.mapping.file = filename; ## remember filename for later error messages - return (TRUE) - } - return (FALSE) -} -MQDataReader$getMappingCreation = function(.) -{ - return(c(user = 'file (user-defined)', auto = 'automatic')) -} -#' Replaces values in the mq.data member with (binary) values. -#' -#' Most MQ tables contain columns like 'contaminants' or 'reverse', whose values are either empty strings -#' or "+", which is inconvenient and can be much better represented as TRUE/FALSE. -#' The params \code{valid_entries} and \code{replacements} contain the matched pairs, which determine what is replaced with what. -#' -#' @param colname Name of the column (e.g. "contaminants") in the mq.data table -#' @param valid_entries Vector of values to be replaced (must contain all values expected in the column -- fails otherwise) -#' @param replacements Vector of values inserted with the same length as \code{valid_entries}. -#' @return Returns \code{TRUE} if successful. -#' -#' @name MQDataReader$substitute -#' -MQDataReader$substitute <- function(., colname, valid_entries = c(NA, "","+"), replacements = c(FALSE, FALSE, TRUE)) +substitute = function(colname, valid_entries = c(NA, "", "+"), replacements = c(FALSE, FALSE, TRUE)) { + #' + #' Replaces values in the mq.data member with (binary) values. + #' Most MQ tables contain columns like 'contaminants' or 'reverse', whose values are either empty strings + #' or "+", which is inconvenient and can be much better represented as TRUE/FALSE. + #' The params \code{valid_entries} and \code{replacements} contain the matched pairs, which determine what is replaced with what. + #' + #' @param colname Name of the column (e.g. 
'contaminants') in the mq.data table + #' @param valid_entries Vector of values to be replaced (must contain all values expected in the column -- fails otherwise) + #' @param replacements Vector of values inserted with the same length as \code{valid_entries}. + #' @return Returns \code{TRUE} if successful. + #' + if (length(valid_entries) == 0) { stop("Entries given to $substitute() must not be empty.") @@ -623,60 +377,59 @@ MQDataReader$substitute <- function(., colname, valid_entries = c(NA, "","+"), r { stop("In function $substitute(): 'valid_entries' and 'replacements' to not have the same length!") } - if (colname %in% colnames(.$mq.data)) + if (colname %in% colnames(.self$mq.data)) { ## verify that there are only known entries (usually c("","+") ) - setD_c = setdiff(.$mq.data[, colname], valid_entries) + setD_c = setdiff(.self$mq.data[, colname], valid_entries) if (length(setD_c) > 0) stop(paste0("'", colname, "' column contains unknown entry (", paste(setD_c, collapse=",", sep="") ,").")) ## replace with TRUE/FALSE - .$mq.data[, colname] = replacements[ match(.$mq.data[, colname], valid_entries) ]; + .self$mq.data[, colname] = replacements[ match(.self$mq.data[, colname], valid_entries) ]; } - return (TRUE); -} + return (TRUE) +}, -#' Detect broken lines (e.g. due to Excel import+export) -#' -#' When editing a MQ txt file in Microsoft Excel, saving the file can cause it to be corrupted, -#' since Excel has a single cell content limit of 32k characters -#' (see http://office.microsoft.com/en-001/excel-help/excel-specifications-and-limits-HP010342495.aspx) -#' while MQ can easily reach 60k (e.g. in oxidation sites column). -#' Thus, affected cells will trigger a line break, effectively splitting one line into two (or more). -#' -#' If the table has an 'id' column, we can simply check the numbers are consecutive. If no 'id' column is available, -#' we detect line-breaks by counting the number of NA's per row and finding outliers. 
-#' The line break then must be in this line (plus the preceeding or following one). Depending on where -#' the break happened we can also detect both lines right away (if both have more NA's than expected). -#' -#' Currently, we have no good strategy to fix the problem since columns are not aligned any longer, which -#' leads to columns not having the class (e.g. numeric) they should have. -#' (thus one would need to un-do the linebreak and read the whole file again) -#' -#' [Solution to the problem: try LibreOffice 4.0.x or above -- seems not to have this limitation] -#' -#' @return Returns a vector of indices of broken (i.e. invalid) lines -#' -#' @name MQDataReader$getInvalidLines -#' -MQDataReader$getInvalidLines <- function(.) +getInvalidLines = function() { - if (!inherits(.$mq.data, 'data.frame')) + "Detect broken lines (e.g. due to Excel import+export) + + When editing a MQ txt file in Microsoft Excel, saving the file can cause it to be corrupted, + since Excel has a single cell content limit of 32k characters + (see http://office.microsoft.com/en-001/excel-help/excel-specifications-and-limits-HP010342495.aspx) + while MQ can easily reach 60k (e.g. in oxidation sites column). + Thus, affected cells will trigger a line break, effectively splitting one line into two (or more). + + If the table has an 'id' column, we can simply check the numbers are consecutive. If no 'id' column is available, + we detect line-breaks by counting the number of NA's per row and finding outliers. + The line break then must be in this line (plus the preceeding or following one). Depending on where + the break happened we can also detect both lines right away (if both have more NA's than expected). + + Currently, we have no good strategy to fix the problem since columns are not aligned any longer, which + leads to columns not having the class (e.g. numeric) they should have. 
+ (thus one would need to un-do the linebreak and read the whole file again) + + [Solution to the problem: try LibreOffice 4.0.x or above -- seems not to have this limitation] + + @return Returns a vector of indices of broken (i.e. invalid) lines + " + + if (!inherits(.self$mq.data, 'data.frame')) { stop("In 'MQDataReader$getInvalidLines': function called before data was loaded. Internal error. Exiting.", call. = FALSE); } broken_rows = c() - if ("id" %in% colnames(.$mq.data)) + if ("id" %in% colnames(.self$mq.data)) { - last_id = as.numeric(as.character(.$mq.data$id[nrow(.$mq.data)])) - if (is.na(last_id) || (last_id+1)!=nrow(.$mq.data)) + last_id = as.numeric(as.character(.self$mq.data$id[nrow(.self$mq.data)])) + if (is.na(last_id) || (last_id+1)!=nrow(.self$mq.data)) { - print(paste0("While checking ID column: last ID was '", last_id, "', while table has '", nrow(.$mq.data), "' rows.")) - broken_rows = which(!is.numeric(as.character(.$mq.data$id))) + print(paste0("While checking ID column: last ID was '", last_id, "', while table has '", nrow(.self$mq.data), "' rows.")) + broken_rows = which(!is.numeric(as.character(.self$mq.data$id))) } } else { - cols = !grepl("ratio", colnames(.$mq.data)) ## exclude ratio columns, since these can have regular NA's in unpredictable frequency - counts = apply(.$mq.data[, cols], 1, function(x) sum(is.na(x))); + cols = !grepl("ratio", colnames(.self$mq.data)) ## exclude ratio columns, since these can have regular NA's in unpredictable frequency + counts = apply(.self$mq.data[, cols], 1, function(x) sum(is.na(x))); ## NA counts should be roughly equal across rows expected_count = quantile(counts, probs = 0.75) broken_rows = which(counts > (expected_count * 3 + 10)) @@ -690,3 +443,7 @@ MQDataReader$getInvalidLines <- function(.) 
return (broken_rows); } +) ## methods +) ## RefClass + + diff --git a/R/MzTabReader.R b/R/MzTabReader.R new file mode 100644 index 0000000..07963bd --- /dev/null +++ b/R/MzTabReader.R @@ -0,0 +1,475 @@ +#' +#' Class to read an mzTab file and store the tables internally. +#' +#' The 'sections' field is initialized after $readMzTab was called. +#' The 'fn_map' fields should be initialized via ...$fn_map$readMappingFile(...) manually if user-defined filename mappings are desired +#' and is automatically updated/queried when $readMzTab is called. +#' +#' @field sections MzTab sections as list. Valid list entries are: "MTD", "PRT", "PEP", "PSM", "SML", "filename" and "comments" +#' @field fn_map FilenameMapper which can translate raw filenames into something shorter +#' +#' +#' +MzTabReader = setRefClass("MzTabReader", + fields = list(sections = "list", + fn_map = "FilenameMapper" + ), + methods = list( + initialize=function() { + .self$sections = list(); + .self$fn_map = FilenameMapper$new(); + + return(.self) + }, + #' +readMzTab = function(.self, file) { + "Read a mzTab file into a list of 5 data.frames (one df per mzTab section). + Data.frames in the resulting list are named as follows: + 'MTD', 'PRT', 'PEP', 'PSM', 'SML',. + Additionally, 'filename' and 'comments' are valid list elements. 
+ " + + cat("Reading mzTab '", file, "' ...", sep = "") + + ## this implementation is derived from with minor modifications + ## https://github.com/lgatto/MSnbase/blob/master/R/MzTab.R + + f_con = file(mztab_file, open = "r") ## for better error messages + lines = readLines(f_con) + close(f_con) + # remove empty lines + lines = lines[ nzchar(lines) ] + + ## Split on the first two characters (so headers stay in + ## the same group as table content rows) + lineType = substring(lines, 1, 2) + + ## Could be stricter in the type checking to check that all + ## three of the first characters match the 10 allowed types + ## but since it doesn't affect parsing, I don't think it's + ## worth bothering. + allowed_types = c("CO", "MT", "PR", "PE", "PS", "SM") + stopifnot(all(lineType %in% allowed_types)) + linesByType = split(lines, lineType) + + ## Comments are easy: just strip the first four characters + ## from each line. + comments = substring(linesByType[["CO"]], 5) + + ## Parse the other five blocks in a loop, then fix up + ## metadata afterwards + res = setNames( + lapply( + linesByType[c("MT", "PR", "PE", "PS", "SM")], + function(x) { + if (length(x) == 0) return(data.frame()) + ## MTD section has no header... 
+ if (startsWith(x[1], "MTD")) { + d = read.delim(text = x, + header = FALSE, + col.names = c("MTD", "key", "value"), + na.strings = c("", "null"), + stringsAsFactors = FALSE, + fill = FALSE) + } else { + d = read.delim(text = x, + header = TRUE, + na.strings = c("", "null", "not mapped"), + stringsAsFactors = FALSE, + fill = FALSE) + colnames(d) = make.names(colnames(d), allow_ = FALSE) + } + return(d[,-1]) + }), + c("MTD", "PRT", "PEP", "PSM", "SML")) + + ## rewrite MetaData as named vector + #res[["MTD"]] = setNames(res[["MTD"]][,2], res[["MTD"]][, 1]) + + ## create Raw filename mapping internally + mtd = res[["MTD"]] + idx_run = grep("^ms_run\\[\\d*\\]-location", mtd$key, value = FALSE) + ms_runs = gsub("[.]*-location", "\\1", mtd$key[idx_run]) + raw_filenames = mtd$value[idx_run] + .self$fn_map$getShortNames(raw_filenames, ms_runs = ms_runs) + + + res[["filename"]] = file + res[["comments"]] = comments + .self$sections = res + return (NULL) +}, + +getParameters = function() +{ + "Converts internal mzTab metadata section to a two column key-value data.frame similar to MaxQuants parameters.txt." + + # copy the whole MTD for now, since R likes shallow copies and we are about to rename columns by reference (see ?setnames) + res = data.table::copy(.self$sections[["MTD"]]) + if (!is.na(unique(.self$sections$PSM$database))) { + res = rbind(res, data.frame(key= "fasta file", value = paste(basename(unique(.self$sections$PSM$database)), collapse=";"))) + } + else { + res = rbind(res, data.frame(key= "fasta file", value = "NULL")) + } + + res = res[grep("^custom", res$key, invert = TRUE),] + + res[is.na(res)] = "NULL" # temp workaround + + ## todo: remove at some point, since it forces us to use `::copy` + renameColumns(res, list(key = "parameter")) + + return (res) +}, + +getSummary = function() +{ + "Converts internal mzTab metadata section to a two data.frame with columns 'fc.raw.file', 'ms.ms.identified....' + similar to MaxQuants summary.txt." 
+ res = .self$fn_map$getRawfm()[ , c("from", "to")] + colnames(res) = c("raw.file", "fc.raw.file") + + ## read all custom entries + mtd_custom_df = .self$sections$MTD[grep("^custom", .self$sections$MTD$key), ] + + if (nrow(mtd_custom_df) == 0) return(NULL) + + ## ... and subselect the ms2-ID-Rate + ms2_df = mtd_custom_df[grep("^\\[MS2 identification rate", mtd_custom_df$value), ] + res$ms.ms.identified.... = unlist(lapply(gsub(".*, ?(\\d*\\.\\d*)\\]", "\\1", ms2_df$value), as.numeric)) + + ## read TIC + tic_df = mtd_custom_df[grep("total ion current", mtd_custom_df$value),] + res$TIC = lapply(strsplit(sub(".* \\[(.*)\\]]", "\\1", tic_df$value), ","), as.numeric) + + return (res) +}, + +## MaxQuant-like representation of PRT table, i.e. augmented this with more columns (or renamed) if a metric requires it +getProteins = function() +{ + "Basically the PRT table ..." + + res = .self$sections$PRT + + return ( res ) +}, + +## MaxQuant-like representation of PEP table, i.e. augmented this with more columns (or renamed) if a metric requires it +getEvidence = function() +{ + "Basically the PSM table and additionally columns named 'raw.file' and 'fc.raw.file'." + + res = data.table::as.data.table(.self$sections$PSM) + + ## remove empty PepIDs + ## ... unidentfied MS2 scans (NA) or with no ConsensusFeature group (-1), i.e. 
unassigned PepIDs + + if (all(c("opt.global.cf.id") %in% colnames(res))) { + res = res[!(is.na(res$opt.global.cf.id) | (res$opt.global.cf.id == -1)),] + stopifnot(min(res$opt.global.cf.id) >= 0) ## would stop on NA as well + } + + ## augment with fc.raw.file + ## The 'spectra_ref' looks like 'ms_run[x]:index=y|ms_run' + res = cbind(res, .self$fn_map$specrefToRawfile(res$spectra.ref)) + stopifnot(all(!is.na(res$fc.raw.file))) # Spectra-Ref in PSM table not set for all entries + + + res$retention.time.calibration = NA + if (all(c("opt.global.rt.align", "opt.global.rt.raw") %in% colnames(res))) + { + renameColumns(res, list( retention.time = "retention.time.pep", + opt.global.rt.raw = "retention.time", + opt.global.rt.align = "calibrated.retention.time" + )) + res$retention.time.calibration = res$calibrated.retention.time - res$retention.time + } + + name = list( opt.global.calibrated.mz.error.ppm = "mass.error..ppm.", + opt.global.uncalibrated.mz.error.ppm = "uncalibrated.mass.error..ppm.", + exp.mass.to.charge = "m.z", + opt.global.mass = "mass", + opt.global.identified = "identified", + opt.global.ScanEventNumber = "scan.event.number", + PSM.ID = "id", + opt.global.modified.sequence = "modified.sequence", + opt.global.is.contaminant = "contaminant", + opt.global.fragment.mass.error.da = "mass.deviations..da.", + opt.global.cv.MS.1002217.decoy.peptide = "reverse" + ) + + + renameColumns(res, name) + + if (!"modified.sequence" %in% colnames(res)){ + res$modified.sequence = res$sequence + warning("modified.sequence is not present in input data, metrics use sequence instead", immediate. 
= TRUE) + } + + if(!"contaminant" %in% colnames(res)){ + res$contaminant = 0 + } + # set contaminant to TRUE/FALSE + res$contaminant = (res$contaminant > 0) + + ## optional in MzTab (depending on which FeatureFinder was used) + if ("opt.global.FWHM" %in% colnames(res)){ + renameColumns(res, list(opt.global.FWHM = "retention.length")) + } + + ## de-duplicate protein-accession entries + accessions = res[, .(l_accession=list(accession)), by=id]$l_accession + res = unique(res, by = "id") + res$proteins = unlist(lapply(accessions, paste, collapse=";")) + + ## todo: these are protein names not IDs, but the code does not care (its not very clean though) + res$protein.group.ids = res$proteins + + ## annotate ms.ms.count for identical sequences per rawfile, but only the first member of the group; + ## all others get NA to prevent double counting + res[, ms.ms.count := c(.N, rep(NA, .N-1)), by = list(raw.file, modified.sequence, charge)] + + ## convert values from seconds to minutes for all RT columns + RTUnitCorrection(res) + + ## + ## intensity from PEP to PSM: only labelfree ('opt.global.cf.id' links all PSMs belonging to a ConsensusFeature) + ## + df_pep = data.frame() + if (all(c("opt.global.feature.id") %in% colnames(res))){ + df_pep = data.table::as.data.table(.self$sections$PEP)[!is.na(sequence), ] + renameColumns(df_pep, list(opt.global.modified.sequence = "modified.sequence")) + ## add raw.file... + df_pep = cbind(df_pep, .self$fn_map$specrefToRawfile(df_pep$spectra.ref)) + ## .. a unique index + df_pep$idx = 1:nrow(df_pep) + ## map from PSM -> PEP row + ## ... 
do NOT use spectra.ref since this is ambiguous (IDMapper duplicates MS2 PepIDs to multiple features) + res$pep_idx = match(res$opt.global.feature.id, df_pep$opt.global.feature.id, nomatch = NA_integer_) + + res$ms_run_number = as.numeric(gsub("^ms_run\\[(\\d*)\\].*", "\\1", res$ms_run)) + col_abd_df_pep = grepv( "^peptide.abundance.study.variable.", names(df_pep)) + col_RT_df_pep = grepv( "^opt.global.retention.time.study.variable", names(df_pep)) + ## transposed matrix for all abundances (rows = study; cols = pep_idx) + ## .. this is a significant speedup compared to indexing into .SD[,] in subqueries, since that requires unlist() + m_pep_abd = t(df_pep[, ..col_abd_df_pep]) + m_pep_rt = t(df_pep[, ..col_RT_df_pep]) + N.studies = length(col_RT_df_pep) + stopifnot(N.studies == length(col_abd_df_pep)) + } + + + NA_duplicates = function(vec_abd, idx) { + ## replaces duplicate indices into the same ms_run intensities with NA + ## (to avoid counting a feature more than once due to oversampled PSMs from one run assigned to a CF) + r = vec_abd[idx] + r[duplicated(idx)] = NA + return(r) + } + + if (all(c("opt.global.cf.id") %in% colnames(res))) { + ## assign intensity to genuine PSMs + res$intensity = NA_real_ ## unassigned PSMs have no MS1 intensity + res[, + intensity := NA_duplicates(m_pep_abd[, .SD$pep_idx[1]], .SD$ms_run_number), + by = "opt.global.cf.id"] + summary(res$intensity) + } + + + res$is.transferred = FALSE + res$type = "MULTI-MSMS" + + #set reverse to needed values + if ("reverse" %in% colnames(res)){ + res$reverse=(res$reverse=="decoy") + } + + ## remove the data.table info, since metrics will break due to different syntax + class(res) = "data.frame" + + + ## + ## Infer MBR + ## --> find all subfeatures in a CF with abundance but missing PSM --> create as MBR-dummy-PSMs + + res_tf = res[NULL,] + stopifnot(ncol(res_tf) > 0) + if (!plyr::empty(df_pep)){ + res_tf_tmp = df_pep[, {#print(idx) + idx_PSM = which(res$pep_idx == idx) + runs_with_MS2 = 
unique(res$ms_run_number[idx_PSM]) ## existing PSMs for this PEP + runs_wo_MS2 = (1:N.studies)[-runs_with_MS2] + df = .SD[rep(1, length(runs_wo_MS2)), c("charge", "modified.sequence", "sequence")] + df$pep_idx = idx + df$ms_run_number = runs_wo_MS2 ## vector + df$calibrated.retention.time = m_pep_rt[runs_wo_MS2, idx] + df$intensity = m_pep_abd[runs_wo_MS2, idx] + df$protein.group.ids = res$protein.group.ids[idx_PSM[1]] ## use PGI from genuine PSMs + df ## return + }, by = "idx"] ## one PEP row at a time + + if(nrow(res_tf_tmp) > 0){ + res_tf = res_tf_tmp + ## convert from "ms_run_number" to fc.raw.file + res_tf = cbind(res_tf, .self$fn_map$msrunToRawfile(paste0("ms_run[", res_tf$ms_run_number, "]"))) + res_tf$is.transferred = TRUE + res_tf$type = "MULTI-MATCH" + + ## check: summed intensities should be equal + if("intensity" %in% colnames(res) && "intensity" %in% colnames(res_tf)){ + stopifnot(sum(df_pep[, ..col_abd_df_pep], na.rm = TRUE) == sum(res$intensity, na.rm = TRUE) + sum(res_tf$intensity, na.rm = TRUE)) + } + } + } + + ## remove the data.table info, since metrics will break due to different syntax + class(res_tf) = "data.frame" + + message("Evidence table generated: ", nrow(res), "x", ncol(res), "(genuine); ", nrow(res_tf), "x", ncol(res_tf), "(transferred)") + + ## must at least have column names, but can have 0 rows + stopifnot(ncol(res_tf) > 0) + + return (list("genuine" = res, "transferred" = res_tf)) +}, + + +getMSMSScans = function(identified_only = FALSE) +{ + "Basically the PSM table (partially renamed columns) and additionally two columns 'raw.file' and 'fc.raw.file'. + If identified_only is TRUE, only MS2 scans which were identified (i.e. a PSM) are returned -- this is equivalent to msms.txt in MaxQuant." 
+ + res = data.table::as.data.table(.self$sections$PSM) + + stopifnot(all((res$opt.global.identified == 1) == (!is.na(res$sequence)))) + if (identified_only) { + res = res[!is.na(res$sequence), ] # == NA sequence + } + + ## de-duplicate PSM.ID column: take first row for each PSM.ID + res_temp = res[!duplicated(res$PSM.ID), ] + ## ... and append accessions of the complete subset (.SD) + res_temp$accessions = res[, paste0(.SD$accession, collapse=";"), by = "PSM.ID"]$V1 + res = res_temp + + ## Augment fc.raw.file column + ## ... the `spectra_ref` looks like "ms_run[12]:controllerType=0 controllerNumber=1 scan=25337" + res = cbind(res, .self$fn_map$specrefToRawfile(res$spectra.ref)) + + ## IDMapper might duplicate PepIDs if two or more features are a good match + ## ... but we only want each MS2 scan represented once here + res = unique(res, by = c("fc.raw.file", "spectra.ref")) + + if (all(c("opt.global.rt.align", "opt.global.rt.raw") %in% colnames(res))) + { + renameColumns(res, list( retention.time = "retention.time.pep", + opt.global.rt.raw = "retention.time", + opt.global.rt.align = "calibrated.retention.time")) + res$retention.time.calibration = res$calibrated.retention.time - res$retention.time + } + else res$retention.time.calibration = NA + + + name = list( opt.global.calibrated.mz.error.ppm = "mass.error..ppm", + opt.global.uncalibrated.mz.error.ppm = "uncalibrated.mass.error..ppm.", + exp.mass.to.charge = "m.z", + opt.global.mass = "mass", + opt.global.fragment.mass.error.da = "mass.deviations..da.", + opt.global.fragment.mass.error.ppm = "mass.deviations..ppm.", + opt.global.identified = "identified", + opt.global.ScanEventNumber = "scan.event.number", + PSM.ID = "id", + opt.global.modified.sequence = "modified.sequence", + opt.global.is.contaminant = "contaminant", + opt.global.missed.cleavages = "missed.cleavages", + opt.global.cv.MS.1002217.decoy.peptide = "reverse", + opt.global.activation.method = "fragmentation", + opt.global.total.ion.count = 
"total.ion.current", + opt.global.base.peak.intensity = "base.peak.intensity", + opt.global.ion.injection.time = "ion.injection.time") + + renameColumns(res, name) + + if ("mass.deviations..ppm." %in% colnames(res)) { + res$mass.deviations..ppm. = substr(res$mass.deviations..ppm., 2, nchar(res$mass.deviations..ppm.) - 1) + res$mass.deviations..ppm. = gsub(",", ";", res$mass.deviations..ppm., fixed = TRUE) + } + if ("mass.deviations..da." %in% colnames(res)) { + res$mass.deviations..da. = substr(res$mass.deviations..da., 2, nchar(res$mass.deviations..da.) - 1) + res$mass.deviations..da. = gsub(",", ";", res$mass.deviations..da., fixed = TRUE) + } + + #set reverse to needed values + if ("reverse" %in% colnames(res)){ + res$reverse=(res$reverse=="decoy") + } + + if (!"contaminant" %in% colnames(res)){ + res$contaminant = 0 + } + #set contaminant to TRUE/FALSE + res$contaminant = (res$contaminant > 0) + + #set identified to needed values + if ("identified" %in% colnames(res)){ + stopifnot(unique(res$identified) %in% c(0,1)) ## make sure the column has the expected values (0/1) + # set $identified to MaxQuant values (+/-) + res$identified = c("-", "+")[(res$identified==1) + 1] + } + + RTUnitCorrection(res) + + + ## remove the data.table info, since metrics will break due to different syntax + class(res) = "data.frame" + + ## order by file and specRef as RT proxy (do NOT use RT directly, since it might be NA or non-linearly transformed) + ## e.g. 
spectra.ref might be 'ms_run[1]:controllerType=0 controllerNumber=1 scan=13999' + ## or 'ms_run[2]:spectrum=33' + ## --> extract scan as numeric, since string compare is insufficient for numbers ("13999" > "140") + res$scan = as.numeric(gsub(".*index=(\\d*)|.*scan=(\\d*)|.*spectrum=(\\d*)", "\\1\\2\\3", res$spectra.ref)) + stopifnot(all(!is.na(res$scan))) + res = res[order(res$fc.raw.file, res$scan), ] + + return ( res ) +}, + +RTUnitCorrection = function(dt) +{ + "Convert all RT columns from seconds (OpenMS default) to minutes (MaxQuant default)" + + # heuristic to detect presence of unit:seconds; if retention.time has it, we assume that all rt-columns are in seconds + # retention.time is mandatory for mzTab + if (max(dt[, "retention.time"], na.rm = TRUE) > 300) + { + cn_rt = grepv("retention.time|retention.length", names(dt)) + dt[, c(cn_rt) := lapply(.SD, function(x) x / 60 ), .SDcols = cn_rt] + } + + #dt[, ..cn_rt] + return(NULL) +}, + +renameColumns = function(dt, namelist) +{ + "Renames all columns and throws a warning if a column does not exist in the data" + + from = names(namelist) + to = unlist(namelist) + data.table::setnames(dt, old = from, new = to, skip_absent = TRUE) + + existName = to %in% colnames(dt) + if (!all(existName)) + { + warning(paste0("Columns\n '", + paste(from[!existName], "' (mzTab name) --> '", to[!existName], collapse="' (internal name),\n '", sep=""), + "'\n are not present in input data!"), + immediate. = TRUE) + } +} + +) # methods +) # class diff --git a/R/PTXQC.R b/R/PTXQC.R new file mode 100644 index 0000000..de10401 --- /dev/null +++ b/R/PTXQC.R @@ -0,0 +1,46 @@ +#' PTXQC: A package for computing Quality Control (QC) metrics for Proteomics (PTX) +#' +#' +#' @section Input: +#' Valid input data are either the files from MaxQuant's .txt folder (all versions from MaxQuant >= 1.0 upwards are supported) +#' or a single mzTab file.
All mzTab files will work, but most metrics can be obtained from OpenMS' mzTab as produced +#' by the QualityControl TOPP tool (from OpenMS 2.5 onwards). +#' +#' @section Important functions: +#' The central function of this package is called \code{\link{createReport}} and it accepts either MaxQuant or mzTab data, along with +#' a configuration (optional). +#' There is a parser for mzTab \code{\link{MzTabReader}} and MaxQuant txt files \code{\link{MQDataReader}}, as well as a plethora of QC metrics +#' derived from a common \code{\link{qcMetric}} class and scoring functions \code{qual...}, e.g. \code{\link{qualGaussDev}}. +#' +#' @section Configuration: +#' The user can modify the behaviour of PTXQC, e.g. to enable/disable certain metrics or change scoring thresholds, via a YAML object/file. +#' By default a Yaml file is written automatically side-by-side to the input files upon running PTXQC for the first time on a particular input. +#' A custom Yaml object can be passed to the main \code{\link{createReport}} function for customization. +#' Use \code{yaml::yaml.load_file(input = 'myYAML.yaml')} to load an existing file and pass the Yaml object along. +#' +#' @section Output: +#' Either a PDF and/or Html report which contains QC plots and a description of the metrics. 
+#' +#' @docType package +#' @name PTXQC +#' +#' @import data.table +#' @import ggplot2 +#' @import ggdendro +#' @import grid +#' @import grDevices +#' @import gtable +#' @import kableExtra +#' @import knitr +#' @import methods +#' @import plyr +#' @import RColorBrewer +#' @rawNamespace import(reshape2, except = c(dcast, melt)) +#' @import rmarkdown +#' @importFrom seqinr circle +#' @import stats +#' @import utils +#' @import UpSetR +#' @import yaml +#' +NULL \ No newline at end of file diff --git a/R/fcn_YAML.R b/R/YAMLClass.R similarity index 90% rename from R/fcn_YAML.R rename to R/YAMLClass.R index 9022626..d73c265 100644 --- a/R/fcn_YAML.R +++ b/R/YAMLClass.R @@ -13,8 +13,6 @@ #' #' @field yamlObj A Yaml object as created by \code{\link[yaml]{yaml.load}} #' -#' @importFrom yaml as.yaml -#' #' @exportClass YAMLClass #' @export YAMLClass #' @@ -42,9 +40,10 @@ YAMLClass = setRefClass( return(.self) }, - getYAML = function(param_name, default) + getYAML = function(param_name, default, min = NA, max = NA) { - "Query this YAML object for a certain parameter and return its value. If it does not exist it is created with a default value." + "Query this YAML object for a certain parameter and return its value. If it does not exist it is created with a default value. + An optional min/max range can be specified and will be enforced if the value is known (default will be used upon violation)." cat(paste0("YAML: ", param_name, " def: ", paste(default, sep="", collapse=","))) pval = eval(parse(text=paste0(".self$yamlObj$", param_name))) if (is.null(pval)) @@ -54,6 +53,14 @@ YAMLClass = setRefClass( cat("\n") return (default) } else { + if (!is.na(min)) + { # check range + if (!is.numeric(pval) || pval < min || max < pval) + { + cat(paste0("YAML value for '", param_name, "' is invalid ('", pval, "'). 
Using default of ", default, ".")) + pval = .self$setYAML(param_name, default) + } + } cat(paste0(" || new val: ", paste(pval, sep="", collapse=","), "\n")) return (pval) } @@ -126,7 +133,7 @@ YAMLClass = setRefClass( # # " - cat(paste0(yaml.user.warning, as.yaml(.self$yamlObj)), file=filename) + cat(paste0(yaml.user.warning, yaml::as.yaml(.self$yamlObj)), file=filename) return (TRUE); } diff --git a/R/fcn_computeQC.R b/R/createReport.R similarity index 51% rename from R/fcn_computeQC.R rename to R/createReport.R index cc87820..f71f9a8 100644 --- a/R/fcn_computeQC.R +++ b/R/createReport.R @@ -3,50 +3,47 @@ #' This is the main function of the package and the only thing you need to call directly if you are #' just interested in getting a QC report. #' -#' You need to provide the folder name of the 'txt' output, as generated by MaxQuant and -#' optionally a YAML configuration object, which allows to (de)activate certain plots and holds other parameters. +#' You need to provide either +#' a) the folder name of the 'txt' output, as generated by MaxQuant +#' or b) an mzTab file as generated by the OpenMS QualityControl TOPP tool (other mzTab files will probably not work) +#' +#' Optionally, provide a YAML configuration object, which allows to (de)activate certain plots and holds other parameters. #' The yaml_obj is complex and best obtained by running this function once using the default (empty list). #' A full YAML configuration object will be written in the 'txt' folder you provide and can be loaded using #' \code{\link[yaml]{yaml.load}}. #' #' The PDF and the config file will be stored in the given txt folder. #' -#' @note You need write access to the txt folder! +#' @note You need write access to the txt/mzTab folder! #' #' For updates, bug fixes and feedback please visit \url{http://github.com/cbielow/PTXQC}. #' #' @param txt_folder Path to txt output folder of MaxQuant (e.g.
"c:/data/Hek293/txt") +#' @param mztab_file Alternative to 'txt_folder', you can provide a single mzTab file which contains PSM, PEP and PRT tables #' @param yaml_obj A nested list object with configuration parameters for the report. #' Useful to switch off certain plots or skip entire sections. #' @param report_filenames Optional list with names (as generated by \code{\link{getReportFilenames}}). #' If not provided, will be created internally by calling \code{\link{getReportFilenames}}. #' @return List with named filename strings, e.g. $yaml_file, $report_file etc.. #' -#' @importFrom plyr ddply dlply ldply llply adply summarise mapvalues -#' @importFrom reshape2 melt -#' @importFrom rmarkdown render pandoc_available -#' @importFrom grDevices dev.off pdf -#' #' @export #' -createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) +createReport = function(txt_folder = NULL, mztab_file = NULL, yaml_obj = list(), report_filenames = NULL) { - DEBUG_PTXQC = FALSE + if (!exists("DEBUG_PTXQC")) DEBUG_PTXQC = FALSE ## debug only when defined externally time_start = Sys.time() + #mztab_file = "c:\\temp\\test.mzTab" + ##mztab_file = NULL - if (!any(file.info(txt_folder)$isdir, na.rm = TRUE)) + in_count = (!is.null(mztab_file)) + (!is.null(txt_folder)) + if ( in_count == 2 ) + { + stop("Please provide EITHER txt_folder or mztab_file, not both") + } + if ( in_count == 0 ) { - stop(paste0("Argument txt_folder with value '", txt_folder, "' is not a valid directory\n")); + stop("Please provide EITHER txt_folder or mztab_file.
Both are currently missing!") } - txt_files = list() - txt_files$param = "parameters.txt" - txt_files$summary = "summary.txt" - txt_files$groups = "proteinGroups.txt" - txt_files$evd = "evidence.txt" - txt_files$msms = "msms.txt" - txt_files$msmsScan = "msmsScans.txt" - txt_files$mqpar = "mqpar.xml" - txt_files = lapply(txt_files, function(file) file.path(txt_folder, file)) ### ### prepare the YAML config @@ -57,79 +54,76 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) } yc = YAMLClass$new(yaml_obj) + + MZTAB_MODE = !is.null(mztab_file) ## will be TRUE if mzTab is detected + + if (MZTAB_MODE) + { + base_folder = dirname(mztab_file) + mzt = MzTabReader$new() + mzt$readMzTab(mztab_file) ## creates an initial fc.raw.file mapping from MTD + expr_fn_map = quote(mzt$fn_map) + + } else + { + if (!any(file.info(txt_folder)$isdir, na.rm = TRUE)) + { + stop(paste0("Argument txt_folder with value '", txt_folder, "' is not a valid directory\n")); + } + base_folder = txt_folder + txt_files = list() + txt_files$param = "parameters.txt" + txt_files$summary = "summary.txt" + txt_files$groups = "proteinGroups.txt" + txt_files$evd = "evidence.txt" + txt_files$msms = "msms.txt" + txt_files$msmsScan = "msmsScans.txt" + txt_files$mqpar = "mqpar.xml" + txt_files = lapply(txt_files, function(file) file.path(txt_folder, file)) + + ## prepare for readMQ() + mq = MQDataReader$new() + expr_fn_map = quote(mq$fn_map) + + } ## create names of output files (report PDF, YAML, stats, etc...)
if (is.null(report_filenames)) { use_extended_reportname = yc$getYAML("PTXQC$ReportFilename$extended", TRUE) - rprt_fns = getReportFilenames(txt_folder, use_extended_reportname) + rprt_fns = getReportFilenames(base_folder, use_extended_reportname, mzTab_filename = mztab_file) } else { rprt_fns = report_filenames } + ## read manual filename shortening & sorting (if available) + eval(expr_fn_map)$readMappingFile(rprt_fns$filename_sorting) - ## stats_file:not used at the moment - #unlink(rprt_fns$stats_file) - #cat("Statistics summary:", file=rprt_fns$stats_file, append = FALSE, sep="\n") - + ## ## YAML default config - { - + ## ## determines if a local mqpar.xml should be used to grep all YAML parameters whose name starts with "MQpar_" from the ## original mqpar.xml instead of the yaml.config. The "MQpar_..." param from the config ## will be ignored and the newly written yaml.config will contain the values from mqpar.xml. - param_name_PTXQC_UseLocalMQPar = "PTXQC$UseLocalMQPar" - param_def_PTXQC_UseLocalMQPar = TRUE - param_useMQPAR = yc$getYAML(param_name_PTXQC_UseLocalMQPar, param_def_PTXQC_UseLocalMQPar) + param_useMQPAR = yc$getYAML("PTXQC$UseLocalMQPar", TRUE) - enabled_parameters = yc$getYAML("File$Parameters$enabled", TRUE) & file.exists(txt_files$param) add_fs_col = yc$getYAML("PTXQC$NameLengthMax_num", 10) - enabled_summary = yc$getYAML("File$Summary$enabled", TRUE) & file.exists(txt_files$summary) - id_rate_bad = yc$getYAML("File$Summary$IDRate$Thresh_bad_num", 20) - id_rate_great = yc$getYAML("File$Summary$IDRate$Thresh_great_num", 35) + id_rate_bad = yc$getYAML("File$Summary$IDRate$Thresh_bad_num", 20, 0, 100) + id_rate_great = yc$getYAML("File$Summary$IDRate$Thresh_great_num", 35, 0, 100) GL_name_min_length = 8 - enabled_proteingroups = yc$getYAML("File$ProteinGroups$enabled", TRUE) & file.exists(txt_files$groups) - enabled_pg_ratioLabIncThresh = yc$getYAML("File$ProteinGroups$RatioPlot$LabelIncThresh_num", 4) - param_name_PG_intThresh = 
"File$ProteinGroups$IntensityThreshLog2_num" - param_def_PG_intThresh = 25 ## default median intensity in log2 scale - param_PG_intThresh = yc$getYAML(param_name_PG_intThresh, param_def_PG_intThresh) - if (!is.numeric(param_PG_intThresh) || !(param_PG_intThresh %in% 1:100)) - { ## reset if value is weird - cat("YAML value for '" %+% param_name_PG_intThresh %+% "' is invalid ('" %+% param_PG_intThresh %+% "'). Using default of " %+% param_def_PG_intThresh %+% ".") - param_PG_intThresh = param_def_PG_intThresh - } - - enabled_evidence = yc$getYAML("File$Evidence$enabled", TRUE) & file.exists(txt_files$evd) - + pg_ratioLabIncThresh = yc$getYAML("File$ProteinGroups$RatioPlot$LabelIncThresh_num", 4) + ## default median intensity in log2 scale + param_PG_intThresh = yc$getYAML("File$ProteinGroups$IntensityThreshLog2_num", 25, 1, 100) + ## get scoring threshold (upper limit) - param_name_EV_protThresh = "File$Evidence$ProteinCountThresh_num" - param_def_EV_protThresh = 3500 - param_EV_protThresh = yc$getYAML(param_name_EV_protThresh, param_def_EV_protThresh) - if (!is.numeric(param_EV_protThresh) || !(param_EV_protThresh %in% 1:1e5)) - { ## reset if value is weird - cat("YAML value for '" %+% param_name_EV_protThresh %+% "' is invalid ('" %+% param_EV_protThresh %+% "'). Using default of " %+% param_def_EV_protThresh %+% ".") - param_EV_protThresh = param_def_EV_protThresh - } - - param_name_EV_intThresh = "File$Evidence$IntensityThreshLog2_num" - param_def_EV_intThresh = 23 ## default median intensity in log2 scale - param_EV_intThresh = yc$getYAML(param_name_EV_intThresh, param_def_EV_intThresh) - if (!is.numeric(param_EV_intThresh) || !(param_EV_intThresh %in% 1:100)) - { ## reset if value is weird - cat("YAML value for '" %+% param_name_EV_intThresh %+% "' is invalid ('" %+% param_EV_intThresh %+% "'). 
Using default of " %+% param_def_EV_intThresh %+% ".") - param_EV_intThresh = param_def_EV_intThresh - } - + param_EV_protThresh = yc$getYAML("File$Evidence$ProteinCountThresh_num", 3500, 1, 1e5) + + ## default median intensity in log2 scale + param_EV_intThresh = yc$getYAML("File$Evidence$IntensityThreshLog2_num", 23, 1, 100) + ## get scoring threshold (upper limit) - param_name_EV_pepThresh = "File$Evidence$PeptideCountThresh_num" - param_def_EV_pepThresh = 15000 - param_EV_pepThresh = yc$getYAML(param_name_EV_pepThresh, param_def_EV_pepThresh) - if (!is.numeric(param_EV_pepThresh) || !(param_EV_pepThresh %in% 1:1e6)) - { ## reset if value is weird - cat("YAML value for '" %+% param_name_EV_pepThresh %+% "' is invalid ('" %+% param_EV_pepThresh %+% "'). Using default of " %+% param_def_EV_pepThresh %+% ".") - param_EV_pepThresh = param_def_EV_pepThresh - } - + param_EV_pepThresh = yc$getYAML("File$Evidence$PeptideCountThresh_num", 15000, 1, 1e6) + ### warn of special contaminants! ## these need to be in FASTA headers (description is not enough)! ## syntax: list( contaminant1 = c(name, threshold), contaminant2 = c(...), ...) @@ -139,46 +133,34 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) ## is set, then 'yaml_contaminants' will be 'FALSE' ## contaminant_default = list("cont_MYCO" = c(name="MYCOPLASMA", threshold=1)) # name (FASTA), threshold for % of unique peptides - ##yaml_obj = list() ##contaminant_default = FALSE ## to switch it off by default yaml_contaminants = yc$getYAML("File$Evidence$SpecialContaminants", contaminant_default) - ## param - param_name_EV_MatchingTolerance = "File$Evidence$MQpar_MatchingTimeWindow_num" - param_def_EV_MatchingTolerance = 1 - param_EV_MatchingTolerance = yc$getYAML(param_name_EV_MatchingTolerance, param_def_EV_MatchingTolerance) - if (param_useMQPAR) { + param_EV_MatchingTolerance = yc$getYAML("File$Evidence$MQpar_MatchingTimeWindow_num", 1) + if (param_useMQPAR &! 
MZTAB_MODE) { v = getMQPARValue(txt_files$mqpar, "matchingTimeWindow") ## will also warn() if file is missing if (!is.null(v)) { - param_EV_MatchingTolerance = yc$setYAML(param_name_EV_MatchingTolerance, as.numeric(v)) + param_EV_MatchingTolerance = yc$setYAML("File$Evidence$MQpar_MatchingTimeWindow_num", as.numeric(v)) } } - param_name_mbr = "File$Evidence$MatchBetweenRuns_wA" - param_evd_mbr = yc$getYAML(param_name_mbr, "auto") - + param_evd_mbr = yc$getYAML("File$Evidence$MatchBetweenRuns_wA", "auto") - param_name_EV_PrecursorTolPPM = "File$Evidence$MQpar_firstSearchTol_num" - param_def_EV_PrecursorTolPPM = 20 - param_EV_PrecursorTolPPM = yc$getYAML(param_name_EV_PrecursorTolPPM, param_def_EV_PrecursorTolPPM) - if (param_useMQPAR) { + param_EV_PrecursorTolPPM = yc$getYAML("File$Evidence$MQpar_firstSearchTol_num", 20) + if (param_useMQPAR & !MZTAB_MODE) { v = getMQPARValue(txt_files$mqpar, "firstSearchTol") ## will also warn() if file is missing if (!is.null(v)) { - param_EV_PrecursorTolPPM = yc$setYAML(param_name_EV_PrecursorTolPPM, as.numeric(v)) + param_EV_PrecursorTolPPM = yc$setYAML("File$Evidence$MQpar_firstSearchTol_num", as.numeric(v)) } } - param_name_EV_PrecursorOutOfCalSD = "File$Evidence$firstSearch_outOfCalWarnSD_num" - param_def_EV_PrecursorOutOfCalSD = 2 - param_EV_PrecursorOutOfCalSD = yc$getYAML(param_name_EV_PrecursorOutOfCalSD, param_def_EV_PrecursorOutOfCalSD) + param_EV_PrecursorOutOfCalSD = yc$getYAML("File$Evidence$firstSearch_outOfCalWarnSD_num", 2) - - param_name_EV_PrecursorTolPPMmainSearch = "File$Evidence$MQpar_mainSearchTol_num" - param_def_EV_PrecursorTolPPMmainSearch = NA ## we do not dare to have a default, since it ranges from 6 - 4.5 ppm across MQ versions - param_EV_PrecursorTolPPMmainSearch = yc$getYAML(param_name_EV_PrecursorTolPPMmainSearch, param_def_EV_PrecursorTolPPMmainSearch) - if (param_useMQPAR) { + ## we do not dare to have a default, since it ranges from 6 - 4.5 ppm across MQ versions + 
param_EV_PrecursorTolPPMmainSearch = yc$getYAML("File$Evidence$MQpar_mainSearchTol_num", NA) + if (param_useMQPAR & !MZTAB_MODE) { v = getMQPARValue(txt_files$mqpar, "mainSearchTol") ## will also warn() if file is missing if (!is.null(v)) { - param_EV_PrecursorTolPPMmainSearch = yc$setYAML(param_name_EV_PrecursorTolPPMmainSearch, as.numeric(v)) + param_EV_PrecursorTolPPMmainSearch = yc$setYAML("File$Evidence$MQpar_mainSearchTol_num", as.numeric(v)) } } if (is.na(param_EV_PrecursorTolPPMmainSearch)) @@ -186,35 +168,13 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) warning("PTXQC: Cannot draw borders for calibrated mass error, since neither 'File$Evidence$MQpar_mainSearchTol_num' is set nor a mqpar.xml file is present!", immediate. = TRUE) } - enabled_msms = yc$getYAML("File$MsMs$enabled", TRUE) & file.exists(txt_files$msms) - enabled_msmsscans = yc$getYAML("File$MsMsScans$enabled", TRUE) & file.exists(txt_files$msmsScan) - - param_name_MSMSScans_ionInjThresh = "File$MsMsScans$IonInjectionThresh_num" - param_def_MSMSScans_ionInjThresh = 10 ## default ion injection threshold in milliseconds - param_MSMSScans_ionInjThresh = yc$getYAML(param_name_MSMSScans_ionInjThresh, param_def_MSMSScans_ionInjThresh) - if (!is.numeric(param_MSMSScans_ionInjThresh)) - { ## reset if value is weird - cat("YAML value for '" %+% param_name_MSMSScans_ionInjThresh %+% "' is invalid ('" %+% param_MSMSScans_ionInjThresh %+% "'). 
Using default of " %+% param_def_MSMSScans_ionInjThresh %+% ".") - param_MSMSScans_ionInjThresh = param_def_MSMSScans_ionInjThresh - } - - - param_name_PTXQC_OutputFormats = "PTXQC$OutputFormats" + param_MSMSScans_ionInjThresh = yc$getYAML("File$MsMsScans$IonInjectionThresh_num", 10, 0, 200) + out_formats_supported = c("html", "plainPDF") - param_def_PTXQC_OutputFormats = out_formats_supported - param_OutputFormats = yc$getYAML(param_name_PTXQC_OutputFormats, param_def_PTXQC_OutputFormats) + param_OutputFormats = yc$getYAML("PTXQC$OutputFormats", out_formats_supported) - param_name_PTXQC_PageNumbers = "PTXQC$PlainPDF$AddPageNumbers" - param_def_PTXQC_PageNumbers = "on" - param_PageNumbers = yc$getYAML(param_name_PTXQC_PageNumbers, param_def_PTXQC_PageNumbers) - - } - - ## prepare for readMQ() - mq = MQDataReader$new() + param_PageNumbers = yc$getYAML("PTXQC$PlainPDF$AddPageNumbers", "on") - ## read manual filename shortening & sorting (if available) - mq$readMappingFile(rprt_fns$filename_sorting) #### #### prepare the metrics @@ -248,40 +208,40 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) ## write out the final YAML file (so users can disable metrics, if they fail) yc$writeYAML(rprt_fns$yaml_file) + ###### ###### parameters.txt ... ###### - if (enabled_parameters) - { - d_parAll = mq$readMQ(txt_files$param, type="par") - lst_qcMetrics[["qcMetric_PAR"]]$setData(d_parAll) - } + if (MZTAB_MODE) d_parAll = mzt$getParameters() + else d_parAll = mq$readMQ(txt_files$param, type="par") + + lst_qcMetrics[["qcMetric_PAR"]]$setData(d_parAll) ###### ###### summary.txt ... 
###### - if (enabled_summary) - { - d_smy = mq$readMQ(txt_files$summary, type="sm", add_fs_col = add_fs_col) - #colnames(d_smy) - #colnames(d_smy[[1]]) - - ### MS/MS identified [%] - lst_qcMetrics[["qcMetric_SM_MSMSIdRate"]]$setData(d_smy$raw, id_rate_bad, id_rate_great) - } + if (MZTAB_MODE) d_smy = mzt$getSummary() + else d_smy = mq$readMQ(txt_files$summary, type="sm", add_fs_col = add_fs_col) + #colnames(d_smy) + #colnames(d_smy[[1]]) + + ### MS/MS identified [%] + lst_qcMetrics[["qcMetric_SM_MSMSIdRate"]]$setData(d_smy, id_rate_bad, id_rate_great) + ### TIC + if (MZTAB_MODE) lst_qcMetrics[["qcMetric_SM_TIC"]]$setData(d_smy) ###### ###### proteinGroups.txt ... ###### - - if (enabled_proteingroups) - { - df_pg = mq$readMQ(txt_files$groups, type="pg", col_subset=NA, filter="R") - + if (MZTAB_MODE) df_pg = mzt$getProteins() + else df_pg = mq$readMQ(txt_files$groups, type="pg", col_subset=NA, filter="R") + + ## just a scope + { ## ## Raw/LFQ/Reporter intensity boxplots ## @@ -310,8 +270,8 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) ## Contaminants plots on Raw intensity ## lst_qcMetrics[["qcMetric_PG_Cont"]]$setData(df_pg, colsW, MAP_pg_groups) - - + + ### ### Raw intensity boxplot ### @@ -319,7 +279,7 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) clusterCols$raw.intensity = colsW ## cluster using intensity lst_qcMetrics[["qcMetric_PG_RawInt"]]$setData(df_pg, colsW, MAP_pg_groups, param_PG_intThresh) - + ## ## LFQ boxplots ## @@ -339,7 +299,7 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) min_out_length = GL_name_min_length, add_dots = TRUE), min_out_length = GL_name_min_length)) - + clusterCols$lfq.intensity = colsW ## cluster using LFQ lst_qcMetrics[["qcMetric_PG_LFQInt"]]$setData(df_pg, colsW, MAP_pg_groups_LFQ, param_PG_intThresh) @@ -359,7 +319,7 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) min_out_length = 
GL_name_min_length, add_dots = TRUE), min_out_length = GL_name_min_length)) - + clusterCols$reporter.intensity = colsITRAQ ## cluster using reporters lst_qcMetrics[["qcMetric_PG_ReporterInt"]]$setData(df_pg, colsITRAQ, MAP_pg_groups_ITRAQ, param_PG_intThresh) @@ -374,7 +334,7 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) MAP_pg_groups_ALL = rbind(MAP_pg_groups, MAP_pg_groups_LFQ, MAP_pg_groups_ITRAQ) lst_qcMetrics[["qcMetric_PG_PCA"]]$setData(df_pg, clusterCols, MAP_pg_groups_ALL) - + ################################## ## ratio plots @@ -394,7 +354,7 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) if (length(ratio_cols) > 0) { - lst_qcMetrics[["qcMetric_PG_Ratio"]]$setData(df_pg = df_pg, ratio_cols = ratio_cols, thresh_LabelIncorp = enabled_pg_ratioLabIncThresh, GL_name_min_length = GL_name_min_length) + lst_qcMetrics[["qcMetric_PG_Ratio"]]$setData(df_pg, ratio_cols = ratio_cols, thresh_LabelIncorp = pg_ratioLabIncThresh, GL_name_min_length = GL_name_min_length) } } @@ -402,16 +362,22 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) ###### evidence.txt ... ###### - if (enabled_evidence) - { - ## protein.names is only available from MQ 1.4 onwards - df_evd = mq$readMQ(txt_files$evd, type="ev", filter="R", + ## protein.names is only available from MQ 1.4 onwards + if (MZTAB_MODE) { + all_evd = mzt$getEvidence() + df_evd = all_evd$genuine + df_evd_tf = all_evd$transferred + + } + else { + all_evd = mq$readMQ(txt_files$evd, type="ev", filter="R", col_subset=c("proteins", numeric = "Retention.Length", numeric = "retention.time.calibration", numeric = "Retention.time$", numeric = "Match.Time.Difference", - numeric = "^intensity$", "^Type$", + numeric = "^intensity$", + "^Type$", numeric = "Mass\\.Error", numeric = "^uncalibrated...calibrated." 
, numeric = "^m.z$", @@ -419,15 +385,34 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) numeric = "^fraction$", ## only available when fractions were given "Raw.file", "^Protein.Group.IDs$", "Contaminant", numeric = "[RK]\\.Count", - numeric = "^Charge$", "modified.sequence", - numeric = "^Mass$", "^protein.names$", + numeric = "^Charge$", + "modified.sequence", + numeric = "^Mass$", + "^protein.names$", numeric = "^ms.ms.count$", numeric = "^reporter.intensity.")) ## we want .corrected and .not.corrected - + ## contains NA if 'genuine' ID + ## ms.ms.count is always 0 when mtd has a number; 'type' is always "MULTI-MATCH" and ms.ms.ids is empty! + #dsub = d_evd[,c("ms.ms.count", "match.time.difference")] + #head(dsub[is.na(dsub[,2]),]) + #sum(0==(dsub[,1]) & is.na(dsub[,2])) + ## + ## MQ1.4 MTD is either: NA or a number + ## + if (!is.null(all_evd)) all_evd$is.transferred = (all_evd$type == "MULTI-MATCH") + + df_evd = all_evd[all_evd$type != "MULTI-MATCH", ] + df_evd_tf = all_evd[all_evd$type == "MULTI-MATCH", , drop=FALSE] ## keep columns, if empty + + } +## just a local scope to fold evidence metrics in the editor... + { + + ### warn of special contaminants! 
if (class(yaml_contaminants) == "list") ## SC are requested { - if (exists("df_pg")) + if (!is.null(df_pg)) { lst_qcMetrics[["qcMetric_EVD_UserContaminant"]]$setData(df_evd, df_pg, yaml_contaminants) } else { @@ -443,23 +428,16 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) ## ## MS2/MS3 labeled (TMT/ITRAQ) only: reporter intensity of peptides ## - if (length(grep("^reporter.intensity.", colnames(df_evd))) > 0) - { - lst_qcMetrics[["qcMetric_EVD_ReporterInt"]]$setData(df_evd) - } + lst_qcMetrics[["qcMetric_EVD_ReporterInt"]]$setData(df_evd) + ## ## peptide & protein counts ## - ## contains NA if 'genuine' ID - df_evd$hasMTD = !is.na(df_evd$match.time.difference) - ## report Match-between-runs data only if it was enabled - reportMTD = any(df_evd$hasMTD) - - lst_qcMetrics[["qcMetric_EVD_ProteinCount"]]$setData(df_evd, param_EV_protThresh) + lst_qcMetrics[["qcMetric_EVD_ProteinCount"]]$setData(df_evd, df_evd_tf, param_EV_protThresh) - lst_qcMetrics[["qcMetric_EVD_PeptideCount"]]$setData(df_evd, param_EV_pepThresh) + lst_qcMetrics[["qcMetric_EVD_PeptideCount"]]$setData(df_evd, df_evd_tf, param_EV_pepThresh) #### #### peak length (not supported in MQ 1.0.13) @@ -479,14 +457,11 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) if (("retention.time.calibration" %in% colnames(df_evd))) { ## this should enable us to decide if MBR was used (we could also look up parameters.txt -- if present) - MBR_HAS_DATA = (sum(df_evd$type == "MULTI-MATCH") > 0) - - if ((param_evd_mbr == FALSE) || (MBR_HAS_DATA == FALSE)) - { - ## MBR is not evaluated - } else + if (!(param_evd_mbr == FALSE) & nrow(df_evd_tf)>0) { - lst_qcMetrics[["qcMetric_EVD_MBRAlign"]]$setData(df_evd, param_EV_MatchingTolerance, mq$raw_file_mapping) + lst_qcMetrics[["qcMetric_EVD_MBRAlign"]]$setData(df_evd, + tolerance_matching = param_EV_MatchingTolerance, + raw_file_mapping = eval(expr_fn_map)$raw_file_mapping) ### ### MBR: ID transfer @@ -494,9 
+469,8 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) #debug (restore data): lst_qcMetrics[["qcMetric_EVD_RTPeakWidth"]]$setData(df_evd) avg_peak_width = lst_qcMetrics[["qcMetric_EVD_RTPeakWidth"]]$outData[["avg_peak_width"]] if (is.null(avg_peak_width)) { - stop("RT peak width module did not run, but is required for MBR metrics. Enable it and try again or switch off MBR metrics!") - } - lst_qcMetrics[["qcMetric_EVD_MBRIdTransfer"]]$setData(df_evd, avg_peak_width) + warning("RT peak width module did not run, but is required for MBR metrics. Enable it and try again or switch off MBR metrics!") + } else lst_qcMetrics[["qcMetric_EVD_MBRIdTransfer"]]$setData(df_evd, df_evd_tf, avg_peak_width) ## @@ -521,7 +495,13 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) ## peptides per RT ## lst_qcMetrics[["qcMetric_EVD_IDoverRT"]]$setData(df_evd) - + + + ## + ## upSet plot + ## + lst_qcMetrics[["qcMetric_EVD_UpSet"]]$setData(df_evd) + ## ## barplots of mass error ## @@ -545,11 +525,7 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) ## MS1-out-of-calibration (i.e. 
the tol-window being too small) ## ## additionally use MS2-ID rate (should be below 1%) - if (enabled_summary) { - df_idrate = d_smy$raw[, c("fc.raw.file", "ms.ms.identified....")] - } else { - df_idrate = NULL - } + df_idrate = d_smy[, c("fc.raw.file", "ms.ms.identified....")] ## returns NULL if d_smy == NULL lst_qcMetrics[["qcMetric_EVD_PreCal"]]$setData(df_evd, df_idrate, param_EV_PrecursorTolPPM, param_EV_PrecursorOutOfCalSD) @@ -557,8 +533,6 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) ## ## MS1 post calibration ## - - lst_qcMetrics[["qcMetric_EVD_PostCal"]]$setData(df_evd, df_idrate, param_EV_PrecursorTolPPM, param_EV_PrecursorOutOfCalSD, param_EV_PrecursorTolPPMmainSearch) @@ -586,215 +560,218 @@ createReport = function(txt_folder, yaml_obj = list(), report_filenames = NULL) ###### msms.txt ... ###### -if (enabled_msms) -{ - ### missed cleavages (again) - ### this is the real missed cleavages estimate ... but slow - #df_msms_s = mq$readMQ(txt_files$msms, type="msms", filter = "", nrows=10) - #colnames(df_msms_s) - #head(df_msms) - df_msms = mq$readMQ(txt_files$msms, type="msms", filter = "", col_subset=c(numeric = "Missed\\.cleavages", - "^Raw.file$", - "^mass.deviations", - "^masses$", "^mass.analyzer$", "fragmentation", "reverse", - numeric = "^evidence.id$" - ), check_invalid_lines = FALSE) + if (MZTAB_MODE) df_msms = mzt$getMSMSScans(identified_only = TRUE) + else df_msms = mq$readMQ(txt_files$msms, type="msms", filter = "", col_subset=c(numeric = "Missed\\.cleavages", + "^Raw.file$", + "^mass.deviations", + "^masses$", + "^mass.analyzer$", + "fragmentation", + "reverse", + numeric = "^evidence.id$" + ), check_invalid_lines = FALSE) + + ## just a scope + { + ### missed cleavages (again) + ### this is the real missed cleavages estimate ... 
but slow + #df_msms_s = mq$readMQ(txt_files$msms, type="msms", filter = "", nrows=10) + #colnames(df_msms_s) + #head(df_msms) + + ## + ## MS2 fragment decalibration + ## + lst_qcMetrics[["qcMetric_MSMS_MSMSDecal"]]$setData(df_msms, eval(expr_fn_map)$raw_file_mapping$to) + + ## + ## missed cleavages per Raw file + ## + if (!is.null(df_evd)) { + lst_qcMetrics[["qcMetric_MSMS_MissedCleavages"]]$setData(df_msms, df_evd) + } else { + lst_qcMetrics[["qcMetric_MSMS_MissedCleavages"]]$setData(df_msms) + } - ## - ## MS2 fragment decalibration - ## - lst_qcMetrics[["qcMetric_MSMS_MSMSDecal"]]$setData(df_msms = df_msms, fc_raw_files = mq$raw_file_mapping$to) - - ## - ## missed cleavages per Raw file - ## - if (exists("df_evd")) { - lst_qcMetrics[["qcMetric_MSMS_MissedCleavages"]]$setData(df_msms, df_evd) - } else { - lst_qcMetrics[["qcMetric_MSMS_MissedCleavages"]]$setData(df_msms) } - ## save RAM: msms.txt is not required any longer - rm(df_msms) + if (!DEBUG_PTXQC) rm(df_msms) if (!DEBUG_PTXQC) rm(df_evd) -} - + ###### ###### msmsScans.txt ... 
###### - -if (enabled_msmsscans) -{ - #df_msmsScan_h = mq$readMQ(txt_files$msmsScan, type="msms", filter = "", nrows=2) - #colnames(df_msmsScan_h) - #head(df_msmsScan_h) - df_msmsScan = mq$readMQ(txt_files$msmsScan, type = "msms", filter = "", - col_subset = c(numeric = "^ion.injection.time", - numeric = "^retention.time$", - "^Identified", - "^Scan.event.number", - "^total.ion.current", - "^base.?peak.intensity", ## basepeak.intensity (MQ1.2) and base.peak.intensity (MQ1.3+) - "^Raw.file", - "^dp.aa$", - "^dp.modification$"), - check_invalid_lines = FALSE) - - ## - ## MQ version 1.0.13 has very rudimentary MSMSscans.txt, with no header, so we need to skip the metrics of this file - ## - if (ncol(df_msmsScan) > 3) + if (MZTAB_MODE) df_msmsScans = mzt$getMSMSScans(identified_only = FALSE) + else df_msmsScans = mq$readMQ(txt_files$msmsScan, type = "msms", filter = "", + col_subset = c(numeric = "^ion.injection.time", + numeric = "^retention.time$", + "^Identified", + "^Scan.event.number", + "^total.ion.current", + "^base.?peak.intensity", ## basepeak.intensity (MQ1.2) and base.peak.intensity (MQ1.3+) + "^Raw.file", + "^dp.aa$", + "^dp.modification$"), + check_invalid_lines = FALSE) + + # just a scope { - # round RT to 2 min intervals - df_msmsScan$rRT = round(df_msmsScan$retention.time/2)*2 - ## - ## TopN over RT + ## MQ version 1.0.13 has very rudimentary MSMSscans.txt, with no header, so we need to skip the metrics of this file ## - lst_qcMetrics[["qcMetric_MSMSScans_TopNoverRT"]]$setData(df_msmsScan) - - ## - ## Injection time over RT - ## - lst_qcMetrics[["qcMetric_MSMSScans_IonInjTime"]]$setData(df_msmsScan, param_MSMSScans_ionInjThresh) - - ## - ## MS/MS intensity (TIC and base peak) - ## - lst_qcMetrics[["qcMetric_MSMSScans_MSMSIntensity"]]$setData(df_msmsScan) + if (!is.null(df_msmsScans) && ncol(df_msmsScans) > 3) + { + # round RT to 2 min intervals + df_msmsScans$rRT = round(df_msmsScans$retention.time/2)*2 + + ## + ## TopN over RT + ## + 
lst_qcMetrics[["qcMetric_MSMSScans_TopNoverRT"]]$setData(df_msmsScans) + + ## + ## Injection time over RT + ## + lst_qcMetrics[["qcMetric_MSMSScans_IonInjTime"]]$setData(df_msmsScans, param_MSMSScans_ionInjThresh) + + ## + ## MS/MS intensity (TIC and base peak) + ## + lst_qcMetrics[["qcMetric_MSMSScans_MSMSIntensity"]]$setData(df_msmsScans) + + ## + ## TopN counts + ## + lst_qcMetrics[["qcMetric_MSMSScans_TopN"]]$setData(df_msmsScans) + + ## + ## Scan event: % identified + ## + lst_qcMetrics[["qcMetric_MSMSScans_TopNID"]]$setData(df_msmsScans) + + ## + ## Dependent peptides (no score) + ## + if ("dp.modification" %in% colnames(df_msmsScans)) { + lst_qcMetrics[["qcMetric_MSMSScans_DepPep"]]$setData(df_msmsScans) + } + + } ## end MSMSscan from MQ > 1.0.13 - ## - ## TopN counts - ## - lst_qcMetrics[["qcMetric_MSMSScans_TopN"]]$setData(df_msmsScan) - - ## - ## Scan event: % identified - ## - lst_qcMetrics[["qcMetric_MSMSScans_TopNID"]]$setData(df_msmsScan) - ## - ## Dependent peptides (no score) - ## - if ("dp.modification" %in% colnames(df_msmsScan)) { - lst_qcMetrics[["qcMetric_MSMSScans_DepPep"]]$setData(df_msmsScan) - } + } + ## save RAM: msmsScans.txt is not required any longer + if (!DEBUG_PTXQC) rm(df_msmsScans) + - } ## end MSMSscan from MQ > 1.0.13 + ##################################################################### + ## list of qcMetric objects + print("#Metrics: ") + print(length(lst_qcMetrics)) - ## save RAM: msmsScans.txt is not required any longer - rm(df_msmsScan) -} - + hm = getQCHeatMap(lst_qcMetrics, raw_file_mapping = eval(expr_fn_map)$raw_file_mapping) + #print(hm[["plot"]]) + write.table(hm[["table"]], file = rprt_fns$heatmap_values_file, quote = TRUE, sep = "\t", row.names = FALSE) + ## get MQ short name mapping plot (might be NULL if no mapping was required) + pl_nameMapping = eval(expr_fn_map)$plotNameMapping() -##################################################################### -## list of qcMetric objects -print("#Metrics: ") 
-print(length(lst_qcMetrics)) - -hm = getQCHeatMap(lst_qcMetrics, raw_file_mapping = mq$raw_file_mapping) -#print(hm[["plot"]]) -write.table(hm[["table"]], file = rprt_fns$heatmap_values_file, quote = TRUE, sep = "\t", row.names = FALSE) - -## get MQ short name mapping plot (might be NULL if no mapping was required) -pl_nameMapping = mq$plotNameMapping() - -## -## plot it!!! -## -cat("Creating Report file ...") - -# -#param_OutputFormats = "html pdf" -# -out_formats = unlist(strsplit(param_OutputFormats, "[ ,]+")) -out_formats -out_format_requested = out_formats_supported[match(out_formats, out_formats_supported)] -if (any(is.na(out_format_requested))) -{ - stop("Output format(s) not supported: '", paste(out_formats[is.na(out_format_requested)], collapse="', '"), "'") -} - - -if ("html" %in% out_format_requested) -{ - if (pandoc_available()) { - ## HTML reports require Pandoc for converting Markdown to Html via the rmarkdown package - if (DEBUG_PTXQC) { - html_template = "Z:/projects/QC/PTXQC/package/inst/reportTemplate/PTXQC_report_template.Rmd" - } else { - html_template = system.file("./reportTemplate/PTXQC_report_template.Rmd", package="PTXQC") - } - cat(paste0("HTML TEMPLATE: ", html_template, "\n")) - out_dir = dirname(rprt_fns$report_file_HTML) - file.copy(html_template, out_dir, overwrite = TRUE) - out_template = file.path(out_dir, basename(html_template)) - ## Rmarkdown: convert to Markdown, and then to HTML (or PDF) ... - ## Intermediates_dir is required if inputdir!=outputdir, since Shiny server might not allow write-access to input file directory - render(out_template, output_file = rprt_fns$report_file_HTML) #, intermediates_dir = dirname(rprt_fns$report_file_HTML)) - } else { - warning("The 'Pandoc' converter is not installed on your system or you do not have read-access to it!\n", - "Pandoc is required for HTML reports.\n", - "Please install Pandoc or make sure you have access to pandoc(.exe).\n", - "Restart your R-session afterwards.", - immediate. 
= TRUE) + ## + ## plot it!!! + ## + cat("Creating Report file ...") + + # + #param_OutputFormats = "html pdf" + # + out_formats = unlist(strsplit(param_OutputFormats, "[ ,]+")) + out_formats + out_format_requested = out_formats_supported[match(out_formats, out_formats_supported)] + if (any(is.na(out_format_requested))) + { + stop("Output format(s) not supported: '", paste(out_formats[is.na(out_format_requested)], collapse="', '"), "'") } -} - -if ("plainPDF" %in% out_format_requested) -{ - report_file_PDF = rprt_fns$report_file_PDF - ## give the user a chance to close open reports which are currently blocked for writing - if (!wait_for_writable(report_file_PDF)) + + + if ("html" %in% out_format_requested) { - stop("Target file not writable") + if (rmarkdown::pandoc_available()) { + ## HTML reports require Pandoc for converting Markdown to Html via the rmarkdown package + if (DEBUG_PTXQC) { + html_template = paste0(getwd(), "/inst/reportTemplate/PTXQC_report_template.Rmd") + if (!file.exists(html_template)) stop("Wrong working directroy. Please set your working directory to the PTXQC main dir such that 'paste0(getwd(), '/inst/reportTemplate/PTXQC_report_template.Rmd')' is a valid file.") + } else { + html_template = system.file("./reportTemplate/PTXQC_report_template.Rmd", package="PTXQC") + } + cat(paste0("HTML TEMPLATE: ", html_template, "\n")) + out_dir = dirname(rprt_fns$report_file_HTML) + file.copy(html_template, out_dir, overwrite = TRUE) + out_template = file.path(out_dir, basename(html_template)) + ## Rmarkdown: convert to Markdown, and then to HTML (or PDF) ... 
+ ## Intermediates_dir is required if inputdir!=outputdir, since Shiny server might not allow write-access to input file directory + rmarkdown::render(out_template, output_file = rprt_fns$report_file_HTML) #, intermediates_dir = dirname(rprt_fns$report_file_HTML)) + } else { + warning("The 'Pandoc' converter is not installed on your system or you do not have read-access to it!\n", + "Pandoc is required for HTML reports.\n", + "Please install Pandoc or make sure you have access to pandoc(.exe).\n", + "Restart your R-session afterwards.", + immediate. = TRUE) + } } - if (param_PageNumbers == "on") + if ("plainPDF" %in% out_format_requested) { - printWithPage = function(gg_obj, page_nr, filename = report_file_PDF) + report_file_PDF = rprt_fns$report_file_PDF + ## give the user a chance to close open reports which are currently blocked for writing + if (!wait_for_writable(report_file_PDF)) { - filename = basename(filename) - printWithFooter(gg_obj, bottom_left = filename, bottom_right = page_nr) + stop("Target file not writable") } - } else { - ## no page number and filename at bottom of each page - printWithPage = function(gg_obj, page_nr, filename = report_file_PDF) + + if (param_PageNumbers == "on") { - print(gg_obj) + printWithPage = function(gg_obj, page_nr, filename = report_file_PDF) + { + filename = basename(filename) + printWithFooter(gg_obj, bottom_left = filename, bottom_right = page_nr) + } + } else { + ## no page number and filename at bottom of each page + printWithPage = function(gg_obj, page_nr, filename = report_file_PDF) + { + print(gg_obj) + } } - } - pdf(report_file_PDF) - printWithPage(hm[["plot"]], "p. 1") # summary heatmap - printWithPage(pl_nameMapping$plots, "p. 2") # short file mapping - pc = 3; ## subsequent pages start at #4 - for (qcm in lst_qcMetrics_ord) - { - for (p in qcm$plots) + grDevices::pdf(report_file_PDF) + printWithPage(hm[["plot"]], "p. 1") # summary heatmap + printWithPage(pl_nameMapping$plots, "p. 
2") # short file mapping + pc = 3; ## subsequent pages start at #4 + for (qcm in lst_qcMetrics_ord) { - printWithPage(p, paste("p.", pc)) - pc = pc + 1 + for (p in qcm$plots) + { + printWithPage(p, paste("p.", pc)) + pc = pc + 1 + } } + grDevices::dev.off(); + cat(" done\n") } - dev.off(); - cat(" done\n") -} - -## save plot object (for easier access, in case someone wants high-res plots) -## (...disabled for now until concrete use case pops up) -#cat("Dumping plot objects as Rdata file ...") -#save(file = rprt_fns$R_plots_file, list = "GPL") -#cat(" done\n") - -## write shortnames and sorting of filenames -mq$writeMappingFile(rprt_fns$filename_sorting) - -cat(paste("Report file created at\n\n ", rprt_fns$report_file_prefix, ".*\n\n", sep="")) -cat(paste0("\n\nTime elapsed: ", round(as.double(Sys.time() - time_start, units="mins"), 1), " min\n\n")) - -## return path to PDF report and YAML config, etc -return(rprt_fns) + + ## save plot object (for easier access, in case someone wants high-res plots) + ## (...disabled for now until concrete use case pops up) + #cat("Dumping plot objects as Rdata file ...") + #save(file = rprt_fns$R_plots_file, list = "GPL") + #cat(" done\n") + + ## write shortnames and sorting of filenames + eval(expr_fn_map)$writeMappingFile(rprt_fns$filename_sorting) + + cat(paste("Report file created at\n\n ", rprt_fns$report_file_prefix, ".*\n\n", sep="")) + cat(paste0("\n\nTime elapsed: ", round(as.double(Sys.time() - time_start, units="mins"), 1), " min\n\n")) + + ## return path to PDF report and YAML config, etc + return(rprt_fns) } diff --git a/R/fcn_MQ.R b/R/fcn_MQ.R index 9a6df1d..5761553 100644 --- a/R/fcn_MQ.R +++ b/R/fcn_MQ.R @@ -24,9 +24,6 @@ #' @return List of ggplot objects #' #' @import ggplot2 -#' @importFrom plyr ddply -#' @importFrom grDevices boxplot.stats -#' #' @export #' boxplotCompare = function(data, @@ -64,7 +61,7 @@ boxplotCompare = function(data, if (!("factor" %in% class(data$group))) data$group = factor(data$group) ## 
actual number of entries in each column (e.g. LFQ often has 0) - ncol.stat = ddply(data, "group", function(x){ + ncol.stat = plyr::ddply(data, "group", function(x){ notNA = sum(!is.infinite(x$value) & !is.na(x$value)); data.frame(n = nrow(x), notNA = notNA, newname = paste0(x$group[1], " (n=", notNA, ")"))}) head(ncol.stat) @@ -86,6 +83,9 @@ boxplotCompare = function(data, "sample (medium)" = "blue", "sample (heavy)" = "green", "contaminant" = "yellow") + dark_cols = darken(cols) + names(dark_cols) = names(cols) + ## assign categories to channels cat_names = names(cols) cat = factor(cat_names, levels=cat_names) data$cat = cat[1] @@ -98,7 +98,7 @@ boxplotCompare = function(data, data$cat[data$contaminant] = cat[5] ## compute global y-limits (so we can fix it across plots) - ylims = boxplot.stats(data$value)$stats[c(1, 5)] + ylims = grDevices::boxplot.stats(data$value)$stats[c(1, 5)] ## make sure to inlude abline (if existing) if (!is.na(abline)) { @@ -107,17 +107,16 @@ boxplotCompare = function(data, fcn_boxplot_internal = function(data, abline = NA) { - #require(ggplot2) - pl = ggplot(data=data, aes_string(x = "group", y = "value", fill = "cat")) + ## do not use col="cat", since this will dodge bars and loose scaling + pl = ggplot(data=data, aes_string(x = "group", y = "value", fill = "cat", col = "cat")) + geom_boxplot(varwidth = TRUE) + xlab("") + ylab(ylab) + ylim(ylims) + scale_alpha(guide = FALSE) + - scale_fill_manual(values=cols, name = "Category") + - scale_color_manual(values=cols, name = "Category") + - theme(axis.text.x = element_text(angle=90, vjust = 0.5)) + - theme(legend.position=ifelse(length(cols)==1, "none", "right")) + + scale_fill_manual(values = cols, name = "Category") + + scale_color_manual(values = dark_cols, name = "Category") + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) + + theme(legend.position = ifelse(length(cols)==1, "none", "right")) + addGGtitle(mainlab, sublab) + scale_x_discrete_reverse(unique(data$group)) @@ 
-156,23 +155,25 @@ boxplotCompare = function(data, #' If ppm mass deviations are not available, errors in Da will be converted to ppm using the corresponding mass values. #' #' @param x Data frame in long format with numerical expression data +#' @param recurse Internal usage only. Leave at 0 when calling. #' @return Data frame with mass errors ('msErr') and their 'unit' (Da or ppm) or NULL (if no fragments were given) #' #' @export #' -getFragmentErrors = function(x) +getFragmentErrors = function(x, recurse = 0) { ## require only one mass analyzer type: stopifnot(length(unique(x$mass.analyzer))==1) stopifnot(all(c("mass.analyzer", "mass.deviations..da.") %in% colnames(x))) convert_Da2PPM = FALSE - if (grepl("ITMS|TOF|CID", x$mass.analyzer[1]) & ("mass.deviations..da." %in% colnames(x))) + ## note: the "^" is important to avoid matching to " ITMS" -- see fallback below + if (grepl("^ITMS|^TOF|^CID", x$mass.analyzer[1]) & ("mass.deviations..da." %in% colnames(x))) { ms2_unit = "[Da]"; ms2_col = "mass.deviations..da." - } else if (grepl("FTMS|HCD", x$mass.analyzer[1]) & ("mass.deviations..ppm." %in% colnames(x))) { + } else if (grepl("^FTMS|^HCD|^HCID", x$mass.analyzer[1]) & ("mass.deviations..ppm." %in% colnames(x))) { ms2_unit = "[ppm]"; ms2_col = "mass.deviations..ppm." - } else if (grepl("FTMS|HCD", x$mass.analyzer[1]) & ("mass.deviations..da." %in% colnames(x))) { + } else if (grepl("^FTMS|^HCD|^HCID", x$mass.analyzer[1]) & ("mass.deviations..da." %in% colnames(x))) { ## we know its high resolution, but this MQ version only gave us Dalton mass deviations ## --> convert back to ppm ms2_unit = "[ppm]"; ms2_col = "mass.deviations..da." @@ -198,13 +199,17 @@ getFragmentErrors = function(x) err = err / as.numeric(mass) * 1e6 } - if ((ms2_unit == "[ppm]") & (median(abs(err)) > 10)) { - cat(paste0("MS/MS fragment error seems rather large ", median(abs(err)), ". Reporting in [Da]...\n")) - # heuristic: ppm errors seem to be way to big. Use 'Da' instead. 
+ abs_error95 = quantile(abs(err), probs = 0.95) + ## if dimension (Da vs ppm) seem weird, try switching to the other -- but avoid infinite recursion + if (recurse == 0 & (ms2_unit == "[ppm]") & (abs_error95 > 200)) { + warning(paste0("MS/MS fragment error seems rather large ", abs_error95, ". Reporting in [Da]...\n")) x$mass.analyzer = "ITMS" - return (getFragmentErrors(x)) + return (getFragmentErrors(x, recurse = 1)) + } else if (recurse == 0 & (ms2_unit == "[Da]") & (abs_error95 < 0.2)) { + warning(paste0("MS/MS fragment error seems rather small ", abs_error95, ". Reporting in [ppm]...\n")) + x$mass.analyzer = paste0(" ", x$mass.analyzer); ## just something which the regex above does not recognize and fallback to ppm + return (getFragmentErrors(x, recurse = 1)) } - return(data.frame(msErr = err, unit = ms2_unit)) } @@ -226,7 +231,7 @@ getFragmentErrors = function(x) fixCalibration = function(df_evd, df_idrate = NULL, tolerance_sd_PCoutOfCal = 2, low_id_rate = 1) { - stopifnot(c("fc.raw.file", "mass", "charge", "m.z", "mass.error..ppm.", "uncalibrated.mass.error..ppm.") %in% colnames(df_evd)) + if (!checkInput(c("fc.raw.file", "mass.error..ppm.", "uncalibrated.mass.error..ppm."), df_evd)) return(NULL) ## heuristic to determine if the instrument is completely out of calibration, ## i.e. 
all ID's are false positives, since the Precursor mass is wrong @@ -234,9 +239,10 @@ fixCalibration = function(df_evd, df_idrate = NULL, tolerance_sd_PCoutOfCal = 2, ## then ID's are supposedly random ## -- alt: we use the 1%-to-99% quantile range: if > 10ppm ## -- uninformative for detection is the distribution (it's still Gaussian for a strange reason) - MS1_decal_smr = ddply(df_evd, "fc.raw.file", function(x) + MS1_decal_smr = plyr::ddply(df_evd, "fc.raw.file", function(x) data.frame(n = nrow(x), - sd = round(sd(x$mass.error..ppm., na.rm = TRUE), 1), + sd = round(sd(x$mass.error..ppm., na.rm = TRUE), 1), + sd_uncal = round(sd(x$uncalibrated.mass.error..ppm., na.rm = TRUE), 1), range = diff(quantile(x$mass.error..ppm., c(0.01, 0.99), na.rm = TRUE)), decal = (median(abs(x$uncalibrated.mass.error..ppm.), na.rm = TRUE) > 1e3), hasMassErrorBug = FALSE, @@ -255,6 +261,7 @@ fixCalibration = function(df_evd, df_idrate = NULL, tolerance_sd_PCoutOfCal = 2, ## check each raw file individually (usually its just a few who are affected) if (any(MS1_decal_smr$decal, na.rm = TRUE)) { + if (!checkInput(c("mass", "charge", "m.z"), df_evd)) return(NULL) recal_message = "MQ bug: data rescued" recal_message_post = 'MQ bug: data cannot be rescued' @@ -266,7 +273,7 @@ fixCalibration = function(df_evd, df_idrate = NULL, tolerance_sd_PCoutOfCal = 2, df_evd$uncalibrated.mass.error..ppm.2 = df_evd$mass.error..ppm.2 + df_evd$uncalibrated...calibrated.m.z..ppm. 
## check if fix worked - de_cal2 = ddply(df_evd, "fc.raw.file", .fun = function(x) data.frame(q = (median(abs(x$uncalibrated.mass.error..ppm.2), na.rm = TRUE) > 1e3))) + de_cal2 = plyr::ddply(df_evd, "fc.raw.file", .fun = function(x) data.frame(q = (median(abs(x$uncalibrated.mass.error..ppm.2), na.rm = TRUE) > 1e3))) if (any(de_cal2$q, na.rm = TRUE)) { ## fix did not work MS1_decal_smr$hasMassErrorBug_unfixable[ MS1_decal_smr$fc.raw.file %in% de_cal2$fc.raw.file[de_cal2$q] ] = TRUE diff --git a/R/fcn_MQalign.R b/R/fcn_MQalign.R index 536bd96..f5a339b 100644 --- a/R/fcn_MQalign.R +++ b/R/fcn_MQalign.R @@ -15,8 +15,6 @@ #' #' @param data The data.frame with columns 'retention.time.calibration' and 'raw.file' #' @return List of reference raw files (usually just one) -#' -#' @importFrom plyr ddply #' findAlignReference = function(data) { @@ -29,7 +27,7 @@ findAlignReference = function(data) { stop("findAlignReference(): Error, could not find column 'raw.file' in data. Aborting!") } - fr = ddply(data, "raw.file", function(x) data.frame(range = diff(range(x$retention.time.calibration, na.rm = TRUE)))) + fr = plyr::ddply(data, "raw.file", function(x) data.frame(range = diff(range(x$retention.time.calibration, na.rm = TRUE)))) ref = as.character(fr$raw.file[fr$range <= min(fr$range)]) return (ref) } @@ -59,8 +57,6 @@ findAlignReference = function(data) #' @param referenceFile A raw file name as occuring in data$raw.file, serving as alignment reference (when no fractions are used). #' @return A data.frame containing the RT diff for each feature found in a Raw file and the reference. 
#' -#' @importFrom plyr ddply empty -#' alignmentCheck = function(data, referenceFile) { colnames(data) = tolower(colnames(data)) @@ -113,7 +109,7 @@ alignmentCheck = function(data, referenceFile) { data_round = data[data$modified.sequence %in% data$modified.sequence[data$raw.file==file_ref] & data$raw.file %in% file_clients,] - alignQ_round = ddply(data_round, c("modified.sequence", "charge"), function(x) + alignQ_round = plyr::ddply(data_round, c("modified.sequence", "charge"), function(x) { ## reference must be present if ((nrow(x)==1) | (sum(x$raw.file == file_ref)==0) | (sum(duplicated(x$raw.file))>0)) { @@ -134,7 +130,7 @@ alignmentCheck = function(data, referenceFile) { }) ## alignQ_round might be an empty data.frame now, if ## there was only one fraction with no neighbours - if (!empty(alignQ_round)) + if (!plyr::empty(alignQ_round)) { alignQ_round$refFile = file_ref ## remove matches to self (i.e. with rtDiff==0) @@ -158,8 +154,6 @@ alignmentCheck = function(data, referenceFile) { #' @param allowed.deltaRT The allowed matching difference (1 minute by default) #' @return A data.frame with one row for each raw.file and columns 'raw.file' and 'withinRT' (0-1) #' -#' @importFrom plyr ddply -#' ScoreInAlignWindow = function(data, allowed.deltaRT = 1) { colnames(data) = tolower(colnames(data)) @@ -168,7 +162,7 @@ ScoreInAlignWindow = function(data, allowed.deltaRT = 1) { stop("alignmentCheck(): columns missing!") } - alignQC = ddply(data, "raw.file", function(x) { + alignQC = plyr::ddply(data, "raw.file", function(x) { withinRT = sum(abs(x$rtdiff) < allowed.deltaRT, na.rm = TRUE) / sum(!is.na(x$rtdiff)) return(data.frame(withinRT = withinRT)) }) @@ -180,14 +174,14 @@ ScoreInAlignWindow = function(data, allowed.deltaRT = 1) #' #' Check how close transferred ID's after alignment are to their genuine IDs within one Raw file. 
#' -#' The input is a data frame containing feature evidence with corrected retention times, +#' The input is a data.frame containing feature evidence with corrected retention times, #' e.g. a 'calibrated.retention.time' column. #' #' Note that this function must be given MS/MS identifications of type "MULTI-MSMS" and "MSMS-MATCH". #' It will stop() otherwise. #' -#' We compare for each peptide sequence (and charge) the RT difference within groups of genuine and mixed pairs. -#' For every comparison made, we report the RT difference. If alignment worked perfectly, the differences are very small (<1 min), +#' We compare for each peptide sequence (and charge) the RT difference within groups of genuine as well as mixed pairs. +#' For every comparison made, we report the RT span. If alignment worked perfectly, the spans are very small (<1 min), #' for the mixed group, i.e. the pairs are accidentally split 3D peaks. Alignment performance has no influence on the #' genuine-only groups. #' @@ -195,51 +189,31 @@ ScoreInAlignWindow = function(data, allowed.deltaRT = 1) #' The sequence which SHOULD be present is equal to the immediate upper row. This is what we use to guess the sequence. #' However, this relies on the data.frame not being subsetted before (we can sort using the 'id' column)! #' -#' @param data A data.frame with columns 'type', 'calibrated.retention.time', 'modified.sequence', 'charge', 'raw.file' +#' @param df_evd_all A data.frame with columns 'type', 'calibrated.retention.time', 'modified.sequence', 'charge', 'raw.file' #' @return A data.frame containing the RT diff for each ID-group found in a Raw file (bg = genuine).
#' -#' @importFrom plyr ddply -#' -idTransferCheck = function(data) { - colnames(data) = tolower(colnames(data)) +idTransferCheck = function(df_evd_all) { + colnames(df_evd_all) = tolower(colnames(df_evd_all)) - if (!all(c('id', 'type', 'calibrated.retention.time', 'modified.sequence', 'charge', 'fc.raw.file') %in% colnames(data))) - { - stop("idTransferCheck(): columns missing!") - } + if (!checkInput(c('id', 'type', 'calibrated.retention.time', 'modified.sequence', 'charge', 'fc.raw.file'), df_evd_all)) return() + - if (!all(c("MULTI-MSMS", "MULTI-MATCH") %in% unique(data$type))) + if (!all(c("MULTI-MSMS", "MULTI-MATCH") %in% unique(df_evd_all$type))) { stop('idTransferCheck(): scan types missing! Required: "MULTI-MSMS" and "MULTI-MATCH".') } - ## check if data is missing - if (unique(data$modified.sequence[data$type=="MULTI-MATCH"])[1]=="") - { - warning(immediate. = TRUE, "idTransferCheck(): Input data has empty cells for column 'modified.sequence' of type 'MULTI-MATCH'. Early MaxQuant versions (e.g. 1.2.2) have this problem. We will try to reconstruct the data.") - ## use the preceeding sequence (and hope that there are no missing rows in between) - data = data[order(data$id), ] - ## find blocks of MATCHed rows ... 
- idx_mm = which(data$type=="MULTI-MATCH") ## row index - head(idx_mm) - idx_block_start = idx_mm[ c(1, which(diff(idx_mm)>1) + 1) ] ## index to block of MATCHES - head(idx_block_start) - idx_block_end = c(idx_mm[match(idx_block_start, idx_mm)[-1]-1], idx_mm[length(idx_mm)]) - head(idx_block_end) - data$modified.sequence[idx_mm] = rep(data$modified.sequence[idx_block_start-1], - idx_block_end-idx_block_start+1) - } - data$seq_charge = paste(factor(data$modified.sequence), data$charge, sep="_") - alignQ = ddply(data[,c("fc.raw.file", "type", "calibrated.retention.time", "seq_charge")], - "fc.raw.file", - function(x) { - # unique(data$fc.raw.file) - # x = data[ data$fc.raw.file == "..3_P..14", ] + df_evd_all$seq_charge = paste(factor(df_evd_all$modified.sequence), df_evd_all$charge, sep="_") + alignQ = plyr::ddply(df_evd_all[,c("fc.raw.file", "type", "calibrated.retention.time", "seq_charge")], + "fc.raw.file", + function(x) { + # unique(df_evd_all$fc.raw.file) + # x = df_evd_all[ df_evd_all$fc.raw.file == "file 01", ] ## genuine groups only (within this Raw file): x_genuine = x[x$type=="MULTI-MSMS",] - rt_diffs_genuine = ddply(x_genuine, "seq_charge", + rt_diffs_genuine = plyr::ddply(x_genuine, "seq_charge", function(x2) { if (nrow(x2)==1) return(NULL) ## we do not want singlets return (data.frame(rtdiff_genuine = diff(range(x2$calibrated.retention.time)))) @@ -251,12 +225,12 @@ idTransferCheck = function(data) { ## retain only IDs which have at least one transferred ID x_mixed = x[x$seq_charge %in% x$seq_charge[x$type=="MULTI-MATCH"], ] if (nrow(x_mixed)>0) { - rt_diffs_mixed = ddply(x_mixed, "seq_charge", + rt_diffs_mixed = plyr::ddply(x_mixed, "seq_charge", function(x2) { if (nrow(x2)==1) return(NULL) ## we do not want singlets return (data.frame(rtdiff_mixed = diff(range(x2$calibrated.retention.time)))) }) - ## rtdiff_mixed might be empty, if only singlets where transferred + ## rtdiff_mixed might be empty, if only singlets were transferred ## only merge if 
non-empty (otherwise the whole merge is empty) if (nrow(rt_diffs_mixed) > 0) { rt_diffs_genuine = merge(rt_diffs_genuine, rt_diffs_mixed, all = TRUE) @@ -281,12 +255,10 @@ idTransferCheck = function(data) { #' #' Returned value is between 0 (bad) and 1 (all within tolerance). #' -#' @param data A data.frame with columns 'fc.raw.file' and !colname (param) +#' @param data A data.frame with columns 'fc.raw.file', 'rtdiff_mixed', 'rtdiff_genuine' #' @param df.allowed.deltaRT The allowed matching difference for each Raw file (as data.frame(fc.rawfile, m)) #' @return A data.frame with one row for each raw.file and columns 'raw.file' and score 'withinRT' (0-1) -#' -#' @importFrom plyr ddply -#' +#' inMatchWindow = function(data, df.allowed.deltaRT) { ## 'data' columns of interest : @@ -299,7 +271,7 @@ inMatchWindow = function(data, df.allowed.deltaRT) return (data.frame(fc.raw.file = NA, withinRT_genuine = NA, withinRT_mixed = NA, withinRT_all = NA)[numeric(0), ]) } - alignQC = ddply(data, "fc.raw.file", function(x) { + alignQC = plyr::ddply(data, "fc.raw.file", function(x) { # x=data[ data$fc.raw.file=="file 01",] allowed.deltaRT = df.allowed.deltaRT$m[match(x$fc.raw.file[1], df.allowed.deltaRT$fc.raw.file)] @@ -320,7 +292,7 @@ inMatchWindow = function(data, df.allowed.deltaRT) #' Determine fraction of evidence which causes segmentation, i.e. sibling peaks at different RTs #' confirmed either by genuine or transferred MS/MS. #' -#' Sometimes, MQ split a feature into 2 or more if the chromatograpic conditions are not optimal and there +#' Sometimes, MQ splits a feature into 2 or more if the chromatographic conditions are not optimal and there #' is a drop in RT intensity. #' If both features contain successful MS/MS scans, we will find the same peptide twice (with slightly different RT) #' in the same charge state. This constitutes a natively split peak and is rare (95% of all genuine peaks are unique).
@@ -332,49 +304,42 @@ inMatchWindow = function(data, df.allowed.deltaRT) #' and thus the intensity is random. #' To find by how much these peak pairs differ in RT, use idTransferCheck() and inMatchWindow(). #' -#' Required columns are 'match.time.difference', 'fc.raw.file', 'modified.sequence', 'charge', 'type'. +#' Required columns are 'is.transferred', 'fc.raw.file', 'modified.sequence', 'charge', 'type'. #' #' Note that this function must be given MS/MS identifications of type "MULTI-MSMS" and "MSMS-MATCH". #' It will stop() otherwise. #' -#' @param d_evd A data.frame of evidences containing the above columns +#' @param df_evd_all A data.frame of evidences containing the above columns #' @return A data.frame with one row per Raw file and #' three columns: #' 1) % of native single peaks (ignoring transferred IDs) -#' 2) % of single peaks (group of size=1) using only groups which have at at one transferred evidence +#' 2) % of single peaks (group of size=1) using only groups which have one transferred evidence #' 3) % of single peaks using all groups #' -#' @importFrom plyr ddply -#' -peakSegmentation = function(d_evd) +peakSegmentation = function(df_evd_all) { - if (!all(c("match.time.difference", "fc.raw.file", "modified.sequence", "charge", 'type') %in% colnames(d_evd))) - { - stop("peakSegmentation(): columns missing!") - } + if (!checkInput(c("is.transferred", "fc.raw.file", "modified.sequence", "charge", 'type'), df_evd_all)) return() - if (!all(c("MULTI-MSMS", "MULTI-MATCH") %in% unique(d_evd$type))) + if (!all(c("MULTI-MSMS", "MULTI-MATCH") %in% unique(df_evd_all$type))) { - stop('idTransferCheck(): scan types missing! Required: "MULTI-MSMS" and "MULTI-MATCH".') + stop('peakSegmentation(): scan types missing! 
Required: "MULTI-MSMS" and "MULTI-MATCH".') } - fc.raw.files = unique(d_evd$fc.raw.file) + fc.raw.files = unique(df_evd_all$fc.raw.file) ## just keep "MULTI-MATCH" and "MULTI-MSMS", to keep results comparable to idTransferCheck() - d_evd = d_evd[d_evd$type %in% c("MULTI-MSMS", "MULTI-MATCH"), ] - - d_evd$hasMTD = !is.na(d_evd$match.time.difference) ## all the MULTI-MATCH hits, i.e. transferred IDs - - cols = c("hasMTD", "fc.raw.file", "modified.sequence", "charge") - countSeqs = ddply(d_evd[d_evd$type!="MSMS", cols], cols[-1], function(x) + df_evd_all = df_evd_all[df_evd_all$type %in% c("MULTI-MSMS", "MULTI-MATCH"), ] + + cols = c("is.transferred", "fc.raw.file", "modified.sequence", "charge") + countSeqs = plyr::ddply(df_evd_all[, cols], cols[-1], function(x) { - return(data.frame(nNative = sum(!x$hasMTD), nMatched = sum(x$hasMTD)))#, ratio = ratio)) + return(data.frame(nNative = sum(!x$is.transferred), nMatched = sum(x$is.transferred)))#, ratio = ratio)) }) - mbr_score = ddply(countSeqs, "fc.raw.file", function(countSeqs_sub) + mbr_score = plyr::ddply(countSeqs, "fc.raw.file", function(countSeqs_sub) { #unique(countSeqs$fc.raw.file) - #countSeqs_sub = countSeqs[countSeqs$fc.raw.file == "..1_P..14", ] + #countSeqs_sub = countSeqs[countSeqs$fc.raw.file == "file 02", ] ddt = table(countSeqs_sub[, c("nMatched", "nNative")]) ### ddt might look like this: @@ -438,8 +403,6 @@ peakSegmentation = function(d_evd) #' @param qMBRSeg_Dist_inGroup A data.frame as computed by inMatchWindow() #' @return A data.frame which details the distribution of singlets and pairs (inRT and outRT) for each Raw file and genuine vs. all #' -#' @importFrom plyr ddply -#' computeMatchRTFractions = function(qMBR, qMBRSeg_Dist_inGroup) { ## data might look like this: @@ -459,7 +422,7 @@ computeMatchRTFractions = function(qMBR, qMBRSeg_Dist_inGroup) ## compute percentage of outside dRT peaks in genuine, matched and combined(=all) ## then calc the drop. 
- f = ddply(qMBR, "fc.raw.file", function(x) { + f = plyr::ddply(qMBR, "fc.raw.file", function(x) { #x = qMBR[3, , drop = FALSE] rr = qMBRSeg_Dist_inGroup$fc.raw.file==x$fc.raw.file @@ -500,29 +463,26 @@ computeMatchRTFractions = function(qMBR, qMBRSeg_Dist_inGroup) #' If not, MaxQuant's fraction settings should be optimized. #' Note that introducing fractions in MaxQuant will naturally lead to a clustering here (it's somewhat circular). #' -#' @param d_evd Evidence table containing calibrated retention times and sequence information. +#' @param df_evd Evidence table containing calibrated retention times and sequence information. #' @param col_fraction Empty vector or 1-values vector giving the name of the fraction column (if existing) #' @return ggplot object containing the correlation tree #' #' @import ggplot2 -#' @import ggdendro -#' @importFrom reshape2 dcast -#' #' @export #' -RTalignmentTree = function(d_evd, col_fraction = c()) +RTalignmentTree = function(df_evd, col_fraction = c()) { - #d_evd$fc.raw.file=d_evd$raw.file - head(d_evd) + #df_evd$fc.raw.file=df_evd$raw.file + head(df_evd) req_cols = c("calibrated.retention.time", "fc.raw.file", col_fraction, "modified.sequence", "charge") - if (!all(req_cols %in% colnames(d_evd))) + if (!all(req_cols %in% colnames(df_evd))) { stop("RTalignmentTree: Missing columns! 
Please fix the code: ", - setdiff(req_cols, colnames(d_evd)), "!") + setdiff(req_cols, colnames(df_evd)), "!") } - d_cast = dcast(d_evd, modified.sequence + charge ~ fc.raw.file, mean, value.var = "calibrated.retention.time") + d_cast = reshape2::dcast(df_evd, modified.sequence + charge ~ fc.raw.file, mean, value.var = "calibrated.retention.time") head(d_cast[,-(1:2)]) d_cast.m = as.matrix(d_cast[,-(1:2)]) @@ -536,23 +496,23 @@ RTalignmentTree = function(d_evd, col_fraction = c()) ## if some samples have no overlap, their cell is NA --> set to 1 (max distance) dissimilarity[is.na(dissimilarity)] = 1 #plot(hclust(as.dist(dissimilarity), method="ward.D")) - ddata = dendro_data(hclust(as.dist(dissimilarity)), type = "rectangle") + ddata = ggdendro::dendro_data(hclust(as.dist(dissimilarity)), type = "rectangle") if (length(col_fraction)) { - idx_raw = match(ddata$labels$label, d_evd$fc.raw.file) - ddata$labels$col = factor(d_evd[idx_raw, col_fraction]) + idx_raw = match(ddata$labels$label, df_evd$fc.raw.file) + ddata$labels$col = factor(df_evd[idx_raw, col_fraction]) } else { ddata$labels$col = "black" } - p = ggplot(segment(ddata)) + - geom_segment(aes_string(x = "x", y = "y", xend = "xend", yend = "yend")) + - scale_x_continuous(breaks=ddata$labels$x, labels=ddata$labels$label) + - theme_blank() + - theme(axis.text.y = element_text(colour=ddata$labels$col), - axis.text.x = element_blank()) + - coord_flip() + - addGGtitle("[experimental] EVD: Clustering Tree of Raw files", "by Correlation of Corrected Retention Times") + p = ggplot(ggdendro::segment(ddata)) + + geom_segment(aes_string(x = "x", y = "y", xend = "xend", yend = "yend")) + + scale_x_continuous(breaks = ddata$labels$x, labels = ddata$labels$label) + + theme_blank() + + theme(axis.text.y = element_text(colour = ddata$labels$col), + axis.text.x = element_blank()) + + coord_flip() + + addGGtitle("[experimental] EVD: Clustering Tree of Raw files", "by Correlation of Corrected Retention Times") #p return(p) } 
diff --git a/R/fcn_PCA.R b/R/fcn_PCA.R index fca4bd4..0468c64 100644 --- a/R/fcn_PCA.R +++ b/R/fcn_PCA.R @@ -12,18 +12,11 @@ #' and "plots": list of plot objects (one or two) #' #' @import ggplot2 -#' @import stats -#' @importFrom seqinr circle -#' #' @export #' getPCA = function(data, do_plot = TRUE, connect_line_order = NA, gg_layer) { - #require(ggplot2) - #require(stats) - #require(seqinr) - - pc = prcomp(data, scale. = TRUE) + pc = stats::prcomp(data, scale. = TRUE) useOrd = !is.na(connect_line_order[1]) # create data frame with scores scores = as.data.frame(pc$x) @@ -58,7 +51,7 @@ getPCA = function(data, do_plot = TRUE, connect_line_order = NA, gg_layer) pl = pl + gg_layer + geom_point(aes_string(colour = "ord"), size = 1) + - geom_text(size = 3, angle=0, aes_string(label = "class", colour = "ord", vjust = "1")) + + geom_text(aes_string(label = "class", colour = "ord", vjust = "1"), size = 3, angle=0) + geom_hline(yintercept=0, colour="gray65") + #theme(panel.background=element_rect("black")) + geom_vline(xintercept=0, colour="gray65") + @@ -80,7 +73,7 @@ getPCA = function(data, do_plot = TRUE, connect_line_order = NA, gg_layer) yy = center[1] + r * sin(tt) return(data.frame(x = xx, y = yy)) } - corcir = circle(c(0,0), npoints = 100) + corcir = seqinr::circle(c(0,0), npoints = 100) # create data frame with correlations between variables and PCs correlations = as.data.frame(cor(data, pc$x)) # data frame with arrows coordinates @@ -90,9 +83,9 @@ getPCA = function(data, do_plot = TRUE, connect_line_order = NA, gg_layer) lpl[[2]] = ggplot() + - geom_path(data=corcir, aes_string(x = "x", y = "y"), colour="gray65") + ## open circles - geom_segment(data=arrows, aes_string(x = "x1", y = "y1", xend = "x2", yend = "y2"), colour="gray65") + - geom_text(data=correlations, aes_string(x = "PC1", y = "PC2", label = "rownames(correlations)")) + + geom_path(data = corcir, aes_string(x = "x", y = "y"), colour="gray65") + ## open circles + geom_segment(data = arrows, 
aes_string(x = "x1", y = "y1", xend = "x2", yend = "y2"), colour="gray65") + + geom_text(data = correlations, aes_string(x = "PC1", y = "PC2", label = "rownames(correlations)")) + geom_hline(yintercept = 0, colour = "gray65") + geom_vline(xintercept = 0, colour = "gray65") + xlim(-1.1,1.1) + ylim(-1.1,1.1) + diff --git a/R/fcn_QCHeat.R b/R/fcn_QCHeat.R index cd7b042..5d451f7 100644 --- a/R/fcn_QCHeat.R +++ b/R/fcn_QCHeat.R @@ -22,17 +22,15 @@ HEATMAP_NA_VALUE = -Inf #' @param lst_qcMetrics List of QCMetric objects #' @param raw_file_mapping Data.frame with 'from' and 'to' columns for name mapping to unify names from list entries #' @return A ggplot object for printing -#' -#' @importFrom plyr compact ldply -#' @importFrom reshape2 dcast #' +#' @import ggplot2 #' getQCHeatMap = function(lst_qcMetrics, raw_file_mapping) { if (length(lst_qcMetrics) == 0) stop("Heatmap: List of Qc metrics is empty!") lst.QCM = lapply(lst_qcMetrics, function(qcm) { qcm_sc = qcm$qcScores - if (empty(qcm_sc)) return(NULL) ## if metric was not computed, default DF is empty + if (plyr::empty(qcm_sc)) return(NULL) ## if metric was not computed, default DF is empty if ("raw.file" %in% colnames(qcm_sc)) { qcm_sc$fc.raw.file = renameFile(qcm_sc$raw.file, raw_file_mapping) ## create short name column qcm_sc = qcm_sc[, !(colnames(qcm_sc) %in% "raw.file")] ## remove raw.file column @@ -47,7 +45,7 @@ getQCHeatMap = function(lst_qcMetrics, raw_file_mapping) } return(qcm_sc) }) - lst.QCM = compact(lst.QCM) ## remove 'NULL' entries + lst.QCM = plyr::compact(lst.QCM) ## remove 'NULL' entries ## final heat map of QC metrics df.QCM = Reduce(function(a,b) merge(a,b,all = TRUE), lst.QCM) @@ -60,7 +58,8 @@ getQCHeatMap = function(lst_qcMetrics, raw_file_mapping) ## create summary column lst_qcMetrics[["qcMetric_AverageQualOverall"]]$setData(df.QCM) ## ... 
add it - df.QCMa = merge(df.QCM, lst_qcMetrics[["qcMetric_AverageQualOverall"]]$qcScores) + df.AverageQual = lst_qcMetrics[["qcMetric_AverageQualOverall"]]$qcScores + if (plyr::empty(df.AverageQual)) df.QCMa = df.QCM else df.QCMa = merge(df.QCM, df.AverageQual) ## get order and names for each metric df.meta = getMetaData(lst_qcMetrics) @@ -79,11 +78,11 @@ getQCHeatMap = function(lst_qcMetrics, raw_file_mapping) df.QCMa = df.QCMa[, c("fc.raw.file", qc_names_all_scores)] ## add column numbering (ignore first column, which is 'fc.raw.file') df.QCMan = df.QCMa - idx = 2:(ncol(df.QCMan)-1) + idx = 2:(ncol(df.QCMan)) colnames(df.QCMan)[idx] = paste0(colnames(df.QCMa)[idx], "~\"[", idx-1, "]\"") colnames_wNum_map = data.frame(name = colnames(df.QCMa), nameWnum = colnames(df.QCMan)) - QCM_final.m = melt(df.QCMan, id.vars="fc.raw.file") + QCM_final.m = reshape2::melt(df.QCMan, id.vars="fc.raw.file") QCM_final.m$variable = factor(QCM_final.m$variable, ordered = TRUE) ## some files might not be in the original list (will receive 'bad' score in table) @@ -144,7 +143,7 @@ getQCHeatMap = function(lst_qcMetrics, raw_file_mapping) xlab("") + ylab("Raw file") #print(p) - return(list(plot = p, table = dcast(QCM_final.m, fc.raw.file ~ variable))) + return(list(plot = p, table = reshape2::dcast(QCM_final.m, fc.raw.file ~ variable))) } #' @@ -156,10 +155,10 @@ getQCHeatMap = function(lst_qcMetrics, raw_file_mapping) #' getMetaData = function(lst_qcMetrics) { - df.meta = ldply(lst_qcMetrics, function(qcm) { + df.meta = plyr::ldply(lst_qcMetrics, function(qcm) { #qq <<- qcm qcm_sc = qcm$qcScores - if (empty(qcm_sc)) { + if (plyr::empty(qcm_sc)) { ## if metric was not computed, default DF is empty name = qcm$qcName } else { @@ -169,6 +168,6 @@ getMetaData = function(lst_qcMetrics) data.frame(name = name, order = qcm$orderNr, cat = qcm$qcCat) }) ## order meta - if (!empty(df.meta)) df.meta = df.meta[order(df.meta$order), ] + if (!plyr::empty(df.meta)) df.meta = 
df.meta[order(df.meta$order), ] return(df.meta) } diff --git a/R/fcn_misc.R b/R/fcn_misc.R index 4beafab..fefebb7 100644 --- a/R/fcn_misc.R +++ b/R/fcn_misc.R @@ -870,12 +870,11 @@ thinOut = function(data, filterColname, binsize) #' @param batchColname Name of the split column as string #' @param binCount Number of bins in the 'filterColname' dimension. #' @return Data.frame with reduced rows, but identical input columns -#' -#' @importFrom plyr ddply +#' thinOutBatch = function(data, filterColname, batchColname, binCount = 1000) { binsize = (max(data[, filterColname], na.rm=TRUE) - min(data[, filterColname], na.rm=TRUE)) / binCount - r = ddply(data, batchColname, thinOut, filterColname, binsize) + r = plyr::ddply(data, batchColname, thinOut, filterColname, binsize) return (r) } @@ -921,30 +920,38 @@ getAbundanceClass = function(x) { #' #' Assembles a list of output file names, which will be created during reporting. #' -#' @param txt_folder Directory where the MaxQuant output resides +#' You can combine @p report_name_has_folder and @p mzTab_filename to obtain filenames which are even more +#' robust to moving around (since they contain infixes of the mzTab filename and the folder), +#' e.g. @em report_HEK293-study_myProjects.html, where the input +#' was mzTab_filename='HEK293-study.mzTab' and folder='c:/somePath/myProjects/'. +#' +#' @param folder Directory where the MaxQuant output (txt folder) or the mzTab file resides #' @param report_name_has_folder Boolean: Should the report files (html, pdf) contain the name #' of the deepest(=last) subdirectory in 'txt_folder' which is not 'txt'? #' Useful for discerning different reports in a PDF viewer. #' E.g. when flag is FALSE: 'report_v0.91.0.html'; and 'report_v0.91.0_bloodStudy.html' when flag is TRUE (and the #' txt folder is '.../bloodStudy/txt/' or '...bloodStudy/', i.e. 
'./txt/' will be skipped over) +#' @param mzTab_filename If input is an mzTab, specify its name, so that the filenames can use its basename as infix +#' E.g. when mzTab_filename = 'HEK293-study.mzTab' then the output will be +#' report_HEK293-study.html. +#' This allows to get reports on multiple mzTabs in the same folder without overwriting report results. +#' #' @return List of output file names (just names, no file is created) #' with list entries: #' yaml_file, heatmap_values_file, R_plots_file, filename_sorting, stats_file, log_file, report_file_prefix, report_file_PDF, report_file_HTML #' -#' @import utils -#' #' @export #' -getReportFilenames = function(txt_folder, report_name_has_folder = TRUE) +getReportFilenames = function(folder, report_name_has_folder = TRUE, mzTab_filename = NULL) { ## package version: added to output filename - pv = try(packageVersion("PTXQC")) + pv = try(utils::packageVersion("PTXQC")) if (inherits(pv, "try-error")) pv = "_unknown" report_version = paste0("v", pv) ## amend report filename with a foldername where it resides, to ease discerning different reports in a PDF viewer extra_folderRef = "" - folders = rev(unlist(strsplit(normalizePath(txt_folder, winslash = .Platform$file.sep), split=.Platform$file.sep))) + folders = rev(unlist(strsplit(normalizePath(folder, winslash = .Platform$file.sep), split=.Platform$file.sep))) while (length(folders)){ if (!grepl(":", folders[1]) & folders[1]!="txt") { extra_folderRef = paste0("_", folders[1]) @@ -952,7 +959,13 @@ getReportFilenames = function(txt_folder, report_name_has_folder = TRUE) } else folders = folders[-1] } - report_file_simple = paste0(txt_folder, .Platform$file.sep, "report_", report_version) + ## complete path + report_vXY + report_file_simple = paste0(folder, .Platform$file.sep, "report_", report_version) + ## .. 
+ myMzTab [optional] + if (!is.null(mzTab_filename) && nchar(mzTab_filename) > 0) { + report_file_simple = paste0(report_file_simple, "_", gsub("\\.mzTab$", "", basename(mzTab_filename), ignore.case = TRUE)) + } + yaml_file = paste0(report_file_simple, ".yaml") heatmap_values_file = paste0(report_file_simple, "_heatmap.txt") R_plots_file = paste0(report_file_simple, "_plots.Rdata") @@ -960,9 +973,11 @@ getReportFilenames = function(txt_folder, report_name_has_folder = TRUE) stats_file = paste0(report_file_simple, "_stats.txt") log_file = paste0(report_file_simple, ".log") - report_file_extended = paste0(report_file_simple, extra_folderRef) - - report_file_prefix = ifelse(report_name_has_folder, report_file_extended, report_file_simple) + ## include folder-name at the end + if (report_name_has_folder) + report_file_prefix = paste0(report_file_simple, extra_folderRef) + else + report_file_prefix = report_file_simple fh = list(yaml_file = yaml_file, heatmap_values_file = heatmap_values_file, @@ -982,40 +997,30 @@ getReportFilenames = function(txt_folder, report_name_has_folder = TRUE) #' Extract the number of protein groups observed per Raw file #' from an evidence table. #' -#' Required columns are "protein.group.ids", "fc.raw.file" and "match.time.difference". +#' Required columns are "protein.group.ids", "fc.raw.file" and "is.transferred". #' #' If match-between-runs was enabled during the MaxQuant run, #' the data.frame returned will contain separate values for 'transferred' evidence #' plus an 'MBRgain' column, which will give the extra MBR evidence in percent. 
#' -#' @param d_evidence Data.frame of evidence.txt as read by MQDataReader +#' @param df_evd Data.frame of evidence.txt as read by MQDataReader #' @return Data.frame with columns 'fc.raw.file', 'counts', 'category', 'MBRgain' #' #' -getProteinCounts = function(d_evidence) { +getProteinCounts = function(df_evd) { - required_cols = c("protein.group.ids", "fc.raw.file", "match.time.difference") - if (!all(required_cols %in% colnames(d_evidence))) { + required_cols = c("protein.group.ids", "fc.raw.file", "is.transferred") + if (!all(required_cols %in% colnames(df_evd))) { stop("getProteinCounts(): Missing columns!") } - - - ## ms.ms.count is always 0 when mtd has a number; 'type' is always "MULTI-MATCH" and ms.ms.ids is empty! - #dsub = d_evd[,c("ms.ms.count", "match.time.difference")] - #head(dsub[is.na(dsub[,2]),]) - #sum(0==(dsub[,1]) & is.na(dsub[,2])) - ## - ## MQ1.4 MTD is either: NA or a number - ## - d_evidence$hasMTD = !is.na(d_evidence$match.time.difference) - + ## report Match-between-runs data only if if it was enabled - reportMTD = any(d_evidence$hasMTD) + reportMTD = any(df_evd$is.transferred) - prot_counts = ddply(d_evidence, "fc.raw.file", .fun = function(x, reportMTD) + prot_counts = plyr::ddply(df_evd, "fc.raw.file", .fun = function(x, reportMTD) { ## proteins - x$group_mtdinfo = paste(x$protein.group.ids, x$hasMTD, sep="_") + x$group_mtdinfo = paste(x$protein.group.ids, x$is.transferred, sep="_") # remove duplicates (since strsplit below is expensive) -- has no effect on double counting of PROTEINS(!), since it honors MTD xpro = x[!duplicated(x$group_mtdinfo),] # split groups @@ -1023,9 +1028,9 @@ getProteinCounts = function(d_evidence) { return (strsplit(x, split=";", fixed=TRUE)) }) # get number of unique proteins - pg_set_genuineUnique = unique(unlist(p_groups[!xpro$hasMTD])) + pg_set_genuineUnique = unique(unlist(p_groups[!xpro$is.transferred])) # MBR will contribute peptides of already known proteins - pg_set_allMBRunique = 
unique(unlist(p_groups[xpro$hasMTD])) + pg_set_allMBRunique = unique(unlist(p_groups[xpro$is.transferred])) pg_count_GenAndMBR = length(intersect(pg_set_allMBRunique, pg_set_genuineUnique)) # ... and peptides of proteins which are new in this Raw file (few) pg_count_newMBR = length(pg_set_allMBRunique) - pg_count_GenAndMBR @@ -1056,41 +1061,32 @@ getProteinCounts = function(d_evidence) { #' Extract the number of peptides observed per Raw file #' from an evidence table. #' -#' Required columns are "fc.raw.file", "modified.sequence" and "match.time.difference". +#' Required columns are "fc.raw.file", "modified.sequence" and "is.transferred". #' #' If match-between-runs was enabled during the MaxQuant run, #' the data.frame returned will contain separate values for 'transferred' evidence #' plus an 'MBRgain' column, which will give the extra MBR evidence in percent. #' -#' @param d_evidence Data.frame of evidence.txt as read by MQDataReader +#' @param df_evd Data.frame of evidence.txt as read by MQDataReader #' @return Data.frame with columns 'fc.raw.file', 'counts', 'category', 'MBRgain' #' #' -getPeptideCounts = function(d_evidence) { +getPeptideCounts = function(df_evd) { - required_cols = c("fc.raw.file", "modified.sequence", "match.time.difference") - if (!all(required_cols %in% colnames(d_evidence))) { + required_cols = c("fc.raw.file", "modified.sequence", "is.transferred") + if (!all(required_cols %in% colnames(df_evd))) { stop("getPeptideCounts(): Missing columns!") } - - ## ms.ms.count is always 0 when mtd has a number; 'type' is always "MULTI-MATCH" and ms.ms.ids is empty! 
- #dsub = d_evd[,c("ms.ms.count", "match.time.difference")] - #head(dsub[is.na(dsub[,2]),]) - #sum(0==(dsub[,1]) & is.na(dsub[,2])) - ## - ## MQ1.4 MTD is either: NA or a number - ## - d_evidence$hasMTD = !is.na(d_evidence$match.time.difference) - ## report Match-between-runs data only if if it was enabled - reportMTD = any(d_evidence$hasMTD) + reportMTD = any(df_evd$is.transferred) - pep_counts = ddply(d_evidence, "fc.raw.file", .fun = function(x, reportMTD) + pep_counts = plyr::ddply(df_evd, "fc.raw.file", .fun = function(x, reportMTD) { - #pep_count_genuineAll = sum(!x$hasMTD) # (we count double sequences... could be charge +2, +3,... or oversampling) - pep_set_genuineUnique = unique(x$modified.sequence[!x$hasMTD]) ## unique sequences (discarding PTM's) - pep_set_allMBRunique = unique(x$modified.sequence[x$hasMTD]) + x <<- x + #pep_count_genuineAll = sum(!x$is.transferred) # (we count double sequences... could be charge +2, +3,... or oversampling) + pep_set_genuineUnique = unique(x$modified.sequence[!x$is.transferred]) ## unique sequences (discarding PTM's) + pep_set_allMBRunique = unique(x$modified.sequence[x$is.transferred]) pep_count_GenAndMBR = length(intersect(pep_set_genuineUnique, pep_set_allMBRunique)) ## redundant, i.e. 
both Genuine and MBR pep_count_newMBR = length(pep_set_allMBRunique) - pep_count_GenAndMBR ## new MBR peptides pep_count_onlyGenuine = length(pep_set_genuineUnique) - pep_count_GenAndMBR ## genuine-only peptides @@ -1204,8 +1200,6 @@ getECDF = function(samples, y_eval = (1:100)/100) #' @param RT_bin_width Bin size in minutes #' @return Data.frame with columns 'bin', 'RT', 'peakWidth' #' -#' @importFrom plyr ddply -#' #' @export #' #' @examples @@ -1221,7 +1215,7 @@ peakWidthOverTime = function(data, RT_bin_width = 2) brs = seq(from = r[1], to = r[2] + RT_bin_width, by = RT_bin_width) data$bin = findInterval(data$retention.time, brs, all.inside = TRUE) ## faster than cut(..., labels = FALSE) #data$bin = cut(data$retention.time, breaks = brs, include.lowest = TRUE, labels = FALSE) - retLStats = ddply(data, "bin", .fun = function(xb) { + retLStats = plyr::ddply(data, "bin", .fun = function(xb) { data.frame(RT = brs[xb$bin[1]], peakWidth = median(xb$retention.length, na.rm = TRUE)) }) return(retLStats) @@ -1251,4 +1245,15 @@ getMetricsObjects = function(DEBUG_PTXQC = FALSE) return(lst_qcMetrics) } +#' +#' Make a color (given as name or in RGB) darker by factor x = [0 = black, 1=unchanged] +#' @param color A color as understood by col2rgb +#' @param factor Between 0 (make black) and 1 (leave color as is) +#' @return darkened color +#' +darken = function(color, factor=0.8){ + dark_col = grDevices::rgb(t(grDevices::col2rgb(color) * factor), maxColorValue=255) + return(dark_col) +} + diff --git a/R/fcn_miscGGplot.R b/R/fcn_miscGGplot.R index 9100c5b..8738a16 100644 --- a/R/fcn_miscGGplot.R +++ b/R/fcn_miscGGplot.R @@ -7,15 +7,14 @@ #' @param col Colour of text (excluding the title) #' @return ggplot object #' -#' @import ggplot2 #' ggText = function(title, text, col = "black") { pl = ggplot(data.frame(text = text, ypos=1, xpos=1), - aes_string(x = "xpos", y = "ypos")) + + aes_string(x = "xpos", y = "ypos")) + geom_text(aes_string(label = "text"), colour = col, 
family="mono") + theme_bw() + theme(plot.margin = grid::unit(c(1,1,1,1), "cm"), line = element_blank(), axis.title = element_blank(), panel.border = element_blank(), - axis.text = element_blank(), strip.text = element_blank(), legend.position="none") + + axis.text = element_blank(), strip.text = element_blank(), legend.position="none") + ggtitle(title) return(pl) } @@ -27,34 +26,31 @@ ggText = function(title, text, col = "black") { #' @param bottom_left Footer text for bottom left side #' @param bottom_right Footer text for bottom right side #' @return - -#' -#' @import ggplot2 -#' @importFrom grid textGrob gpar grid.draw #' printWithFooter = function(gg_obj, bottom_left = NULL, bottom_right = NULL) { print(gg_obj) if (!is.null(bottom_left)) { - label = textGrob(bottom_left, - x = 0.02, # left side - y = 0.0, # bottom - just="left", - hjust = NULL, - vjust = -.5, - gp=gpar(fontsize=7, col="#333333")) - grid.draw(label) + label = grid::textGrob(bottom_left, + x = 0.02, # left side + y = 0.0, # bottom + just = "left", + hjust = NULL, + vjust = -.5, + gp = grid::gpar(fontsize=7, col="#333333")) + grid::grid.draw(label) } if (!is.null(bottom_right)) { - label = textGrob(bottom_right, - x = 0.98, # right side - y = 0.0, # bottom - just="right", - hjust = NULL, - vjust = -.5, - gp=gpar(fontsize=7, col="#333333")) - grid.draw(label) + label = grid::textGrob(bottom_right, + x = 0.98, # right side + y = 0.0, # bottom + just = "right", + hjust = NULL, + vjust = -.5, + gp = grid::gpar(fontsize=7, col="#333333")) + grid::grid.draw(label) } } @@ -65,6 +61,8 @@ printWithFooter = function(gg_obj, bottom_left = NULL, bottom_right = NULL) #' @param ... Other arguments forwarded to 'scale_y_discrete()' #' @return ggplot object, concatenatable with '+' #' +#' @import ggplot2 +#' scale_x_discrete_reverse = function(values, ...) 
{ if (!("factor" %in% class(values))) stop("Cannot use scale_x_discrete_reverse() on non-factor()") @@ -78,6 +76,8 @@ scale_x_discrete_reverse = function(values, ...) #' @param values The vector of values as given to the y aestetic #' @param ... Other arguments forwarded to 'scale_y_discrete()' #' @return ggplot object, concatenatable with '+' +#' +#' @import ggplot2 #' scale_y_discrete_reverse = function(values, ...) { @@ -132,11 +132,9 @@ ggAxisLabels = function(x, n = 20) #' @return A ggplot object #' #' @import ggplot2 -#' #' @export #' addGGtitle = function(main, sub = NULL){ - #require(ggplot2) if (is.null(sub) || sub=="") { pl = ggtitle(main) } else { @@ -160,7 +158,6 @@ addGGtitle = function(main, sub = NULL){ #' @return ggplot object with new geom_point #' #' @import ggplot2 -#' #' @export #' pointsPutX = function(x_range, x_section, y, col = NA){ @@ -182,16 +179,15 @@ pointsPutX = function(x_range, x_section, y, col = NA){ #' @return A ggplot2 object, representing an empty theme #' #' @import ggplot2 -#' #' @export #' theme_blank = function() { - theme_blank <- theme_bw() - theme_blank$line <- element_blank() - theme_blank$rect <- element_blank() - theme_blank$strip.text <- element_blank() - theme_blank$axis.title <- element_blank() + theme_blank = theme_bw() + theme_blank$line = element_blank() + theme_blank$rect = element_blank() + theme_blank$strip.text = element_blank() + theme_blank$axis.title = element_blank() return (theme_blank) } @@ -205,13 +201,11 @@ theme_blank = function() #' @param palette Name of palette (e.g. 
"set1") #' @return character vector of colors #' -#' @importFrom RColorBrewer brewer.pal brewer.pal.info -#' brewer.pal.Safe = function(n = 3, palette = "Set1") { - idx = which(rownames(brewer.pal.info) == palette) + idx = which(rownames(RColorBrewer::brewer.pal.info) == palette) if (length(idx) != 1) stop("Palette ", palette," unknown!") - if (brewer.pal.info$maxcolors[idx] < n) stop("Palette ", palette, " provides ", brewer.pal.info$maxcolors[idx], " colors, but not ", n, " as requested!") + if (RColorBrewer::brewer.pal.info$maxcolors[idx] < n) stop("Palette ", palette, " provides ", RColorBrewer::brewer.pal.info$maxcolors[idx], " colors, but not ", n, " as requested!") - return (brewer.pal(n, palette)) + return (RColorBrewer::brewer.pal(n, palette)) } diff --git a/R/fcn_plots.R b/R/fcn_plots.R index 2198e67..925c516 100644 --- a/R/fcn_plots.R +++ b/R/fcn_plots.R @@ -9,7 +9,6 @@ #' @return GGplot object #' #' @import ggplot2 -#' #' @export #' #' @examples @@ -63,7 +62,8 @@ plot_ContsPG = function(data) #' value = c(10, 20, 15, 9, 21, 14, 0, 1, 1, 0.3, 0.01, 0.04)) #' plot_ContUser(data, "myco", 5, "subtitle") #' -plot_ContUser = function(data, name_contaminant, extra_limit, subtitle = NULL) { +plot_ContUser = function(data, name_contaminant, extra_limit, subtitle = NULL) +{ datav = subset(data, data$variable %in% c('spectralCount', "intensity")) datav$section = assignBlocks(datav$fc.raw.file, set_size = 40, sort_values = TRUE) dataAT = subset(data, data$variable %in% c('above.thresh')) @@ -88,7 +88,7 @@ plot_ContUser = function(data, name_contaminant, extra_limit, subtitle = NULL) { scale_fill_discrete(name = "Method") + geom_hline(yintercept = extra_limit, linetype = 'dashed') ## group(NULL) seems important in geom_text() - if (nrow(dataKS)>0) p = p + geom_text(data = dataKS, aes_string(label = "value", y = maxY * 1.05, group=NULL)) + if (nrow(dataKS)>0) p = p + geom_text(data = dataKS, aes_string(label = "value", y = maxY * 1.05, group = NULL)) p = p + 
facet_wrap(~ section, ncol = 1, scales = "free_x") #print(p) return(p) @@ -123,10 +123,10 @@ plot_ContUser = function(data, name_contaminant, extra_limit, subtitle = NULL) { #' plot_ContUserScore = function(data, raw.file, score) { p = ggplot(data) + - geom_line(aes_string(x = "x", y = "y", col = "condition")) + - ggtitle(paste0("Empirical CDF of '", raw.file, "'\np = ", round(score, 2))) + - ylab("Pr") + - xlab("Andromeda score") + geom_line(aes_string(x = "x", y = "y", col = "condition")) + + ggtitle(paste0("Empirical CDF of '", raw.file, "'\np = ", round(score, 2))) + + ylab("Pr") + + xlab("Andromeda score") return(p) } @@ -139,7 +139,6 @@ plot_ContUserScore = function(data, raw.file, score) { #' @return GGplot object #' #' @import ggplot2 -#' @importFrom plyr summarize #' @export #' #' @examples @@ -160,17 +159,18 @@ plot_ContEVD = function(data, top5) if (length(top5) > 5) stop("Top5 protein list is longer than 5 (which is the maximum allowed).") intensity = NULL ## to make R CHECK happy... 
- data.sub = data[data$contaminant,] + data.sub = data[data$contaminant > 0,] ## rewrite prot names, and subsume 6th and below as 'other' data.sub$pname = as.character(data.sub$pname) data.sub[!(data.sub$pname %in% top5), "pname"] = 'other' ## aggregate identical proteins ## use sum(as.numeric(.)) to prevent overflow - d_sum = ddply(data.sub[, c("intensity", "pname", "fc.raw.file")], c("pname", "fc.raw.file"), - function(x) summarise(x, s.intensity=sum(as.numeric(intensity), na.rm = TRUE))) + d_sum = plyr::ddply(data.sub[, c("intensity", "pname", "fc.raw.file")], c("pname", "fc.raw.file"), + function(x) plyr::summarise(x, s.intensity=sum(as.numeric(intensity), na.rm = TRUE))) ## normalize by total intensity of raw file - d_norm = ddply(data[, c("intensity", "fc.raw.file")], "fc.raw.file", - function(x) summarise(x, total.intensity=sum(as.numeric(intensity), na.rm = TRUE))) + d_norm = plyr::ddply(data[, c("intensity", "fc.raw.file")], "fc.raw.file", + function(x) plyr::summarise(x, total.intensity=sum(as.numeric(intensity), na.rm = TRUE))) + d_sum$total.intensity = d_norm$total.intensity[match(d_sum$fc.raw.file, d_norm$fc.raw.file)] d_sum$Log10Diff = getAbundanceClass(log10(d_sum$total.intensity)) d_sum$s.intensity = d_sum$s.intensity / d_sum$total.intensity * 100 @@ -186,9 +186,9 @@ plot_ContEVD = function(data, top5) ## order of pname determines order of bars d_sum = rbind(d_sum[d_sum$pname!="other",], d_sum[d_sum$pname=="other",]) - ## value of factors determines order in the legend + ## value of factors determines order in the legend ## --> make proteins a factor, with 'other' being the first - d_sum$Protein = factor(d_sum$pname, levels=unique(c("other", d_sum$pname)), ordered = TRUE) + d_sum$Protein = factor(d_sum$pname, levels = unique(c("other", d_sum$pname)), ordered = TRUE) head(d_sum) ## plot @@ -202,8 +202,8 @@ plot_ContEVD = function(data, top5) theme_bw() + ggtitle("EVD: Top5 Contaminants per Raw file") + ylab("contaminant (% intensity)") + - 
scale_fill_manual(values = brewer.pal(6,"Accent")) + - scale_colour_manual(values = brewer.pal(6,"Accent")) + + scale_fill_manual(values = RColorBrewer::brewer.pal(6,"Accent")) + + scale_colour_manual(values = RColorBrewer::brewer.pal(6,"Accent")) + geom_hline(aes_string(yintercept = "5"), linetype='dashed') + #guides(alpha=NULL, fill = guide_legend(nrow = 2, ncol = 3, byrow = TRUE, reverse = TRUE)) + #theme(legend.position="top", legend.title=element_blank()) + @@ -229,7 +229,6 @@ plot_ContEVD = function(data, top5) #' @return GGplot object #' #' @import ggplot2 -#' @importFrom RColorBrewer brewer.pal #' @export #' #' @examples @@ -255,8 +254,8 @@ plot_RatiosPG = function(df_ratios, d_range, main_title, main_col, legend_title) geom_area(aes_string(alpha = "ltype", fill = "col")) + xlab("ratio") + ylab("density") + - scale_fill_manual(values = rep(brewer.pal(6,"Accent"), times=40), guide_legend(legend_title)) + - scale_colour_manual(values = rep(brewer.pal(6,"Accent"), times=40)) + + scale_fill_manual(values = rep(RColorBrewer::brewer.pal(6,"Accent"), times=40), guide_legend(legend_title)) + + scale_colour_manual(values = rep(RColorBrewer::brewer.pal(6,"Accent"), times=40)) + scale_alpha_discrete(range = c(1, 0.2), labels=c("dotted"="unimodal", "solid"="multimodal"), guide_legend("shape") @@ -303,7 +302,7 @@ plot_CountData = function(data, y_max, thresh_line, title) ylab("count") + scale_x_discrete_reverse(data$fc.raw.file) + ylim(0, y_max) + - scale_fill_manual(values = c("green", "#BEAED4", "red")) + + scale_fill_manual(values = c("green", "#BEAED4", "blue")) + addGGtitle(title_main, title_sub) + geom_abline(alpha = 0.5, intercept = thresh_line, slope = 0, colour = "black", linetype = "dashed", size = 1.5) + coord_flip() @@ -445,7 +444,7 @@ plot_MBRAlign = function(data, y_lim, title_sub, match_tol) #' plot_MBRIDtransfer = function(data) { - data.m = melt(data, id.vars=c("fc.raw.file", "sample")) + data.m = reshape2::melt(data, id.vars=c("fc.raw.file", "sample")) 
data.m$value = data.m$value * 100 ## used to be scores in [0-1] if (all(is.na(data.m$value))) {# the slice of Raw file we are looking at could have no MBR data -- and ggplot needs something to plot... @@ -538,8 +537,8 @@ plot_MBRgain = function(data, title_sub = "") #' plot_Charge = function(d_charge) { - p = ggplot(d_charge, aes_string(x = "Var1_center", y = "Var2_height")) + - geom_col(aes_string(width = "Margin_var1", fill = "Var2"), color = "black", position = position_stack(reverse = TRUE)) + + p = ggplot(d_charge, aes_string(x = "Var1_center", y = "Var2_height", width = "Margin_var1")) + + geom_col(aes_string(fill = "Var2"), color = "black", position = position_stack(reverse = TRUE)) + geom_text(aes_string(label = "Var1", x = "Var1_center", y = 1.05)) + xlab("Raw file") + ylab("fraction [%]") + @@ -633,7 +632,7 @@ plot_IDsOverRT = function(data, x_lim = range(data$RT), y_max = max(data$counts) #' plot_IDRate = function(data, id_rate_bad, id_rate_great, label_ID) { - p = ggplot(data, aes_string(y = "fc.raw.file", x = "ms.ms.identified....")) + + p = ggplot(data, aes_string(y = "fc.raw.file", x = "ms.ms.identified....")) + geom_point(aes_string(colour = "cat")) + geom_vline(xintercept = id_rate_bad, color=(label_ID)[1]) + geom_vline(xintercept = id_rate_great, color=(label_ID)[3]) + @@ -661,50 +660,47 @@ plot_IDRate = function(data, id_rate_bad, id_rate_great, label_ID) #' @param just (ignored) #' @return gTable #' -#' @import gtable -#' @importFrom grid rectGrob textGrob unit.c grobHeight grobWidth -#' plotTableRaw = function(data, colours="black", fill=NA, just="centre") { - label_matrix <- as.matrix(data) + label_matrix = as.matrix(data) - nc <- ncol(label_matrix) - nr <- nrow(label_matrix) - n <- nc*nr + nc = ncol(label_matrix) + nr = nrow(label_matrix) + n = nc*nr colours <- rep(colours, length.out = n) - justs <- rep(just, length.out = n) fill <- rep(fill, length.out = n) + justs <- rep(just, length.out = n) ## text for each cell labels <- 
lapply(seq_len(n), function(ii) - textGrob(as.character(label_matrix[ii]), gp=gpar(fontsize=8, col=colours[ii]), just="left", x = grid::unit(0.05, "npc"))) + grid::textGrob(as.character(label_matrix[ii]), gp = grid::gpar(fontsize=8, col=colours[ii]), just="left", x = grid::unit(0.05, "npc"))) label_grobs <- matrix(labels, ncol=nc) ## define the fill background of cells fill <- lapply(seq_len(n), function(ii) - rectGrob(gp=gpar(fill=fill[ii]))) + grid::rectGrob(gp = grid::gpar(fill=fill[ii]))) ## some calculations of cell sizes row_heights <- function(m){ - do.call(unit.c, apply(m, 1, function(l) - max(do.call(unit.c, lapply(l, grobHeight))))) + do.call(grid::unit.c, apply(m, 1, function(l) + max(do.call(grid::unit.c, lapply(l, grid::grobHeight))))) } col_widths <- function(m){ - do.call(unit.c, apply(m, 2, function(l) - max(do.call(unit.c, lapply(l, grobWidth))))) + do.call(grid::unit.c, apply(m, 2, function(l) + max(do.call(grid::unit.c, lapply(l, grid::grobWidth))))) } ## place labels in a gtable - g <- gtable_matrix("table", grobs=label_grobs, - widths=col_widths(label_grobs) + grid::unit(2,"mm"), - heights=row_heights(label_grobs) + grid::unit(2,"mm")) + g <- gtable::gtable_matrix("table", grobs = label_grobs, + widths = col_widths(label_grobs) + grid::unit(2,"mm"), + heights = row_heights(label_grobs) + grid::unit(2,"mm")) ## add the background xt <- rep(seq_len(nr), each=nc) xl <- rep(seq_len(nc), times=nr) - g <- gtable_add_grob(g, fill, t=xt, l=xl, z=0, name="fill") + g <- gtable::gtable_add_grob(g, fill, t=xt, l=xl, z=0, name="fill") return(g) } @@ -718,8 +714,6 @@ plotTableRaw = function(data, colours="black", fill=NA, just="centre") #' @param font_size Html font size #' @return table as character string for cat()'ing into html #' -#' @importFrom knitr kable -#' @import kableExtra #' @export #' #' @examples @@ -731,13 +725,15 @@ plotTableRaw = function(data, colours="black", fill=NA, just="centre") #' getHTMLTable = function(data, header = NA, font_size = 
12) { - tbl = kable(data, row.names = FALSE, format = "html") %>% - kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE, font_size = font_size) - + tbl = kableExtra::kable_styling(kableExtra::kable(data, row.names = FALSE, format = "html"), + bootstrap_options = c("striped", "hover", "condensed"), + full_width = FALSE, + font_size = font_size) + if (!any(is.na(header))) { header__ = header; - tbl = tbl %>% add_header_above(c(header__ = ncol(data))) + tbl = kableExtra::add_header_above(tbl, c(header__ = ncol(data))) tbl = gsub("header__", paste(header__, sep = "", collapse = "
"), as.character(tbl)) } @@ -758,9 +754,6 @@ getHTMLTable = function(data, header = NA, font_size = 12) #' @param just (ignored) #' @return gTree object with class 'PTXQC_table' #' -#' @importFrom grid textGrob gTree gList grobHeight -#' @import gridExtra -#' @import gtable #' @export #' #' @examples @@ -784,34 +777,33 @@ plotTable = function(data, title = "", footer = "", col_names = colnames(data), colours = unlist(lapply(col, function(cc) c("black", rep(cc, nrow(data))))), ## col-wise just = c(rep("centre", ncol(data)), rep(just, each=nrow(data), length.out=nd))) - colhead = lapply(col_names, function(ii) textGrob(ii, gp=gpar(fontsize=12, col="black", fontface="bold", fill="grey"))) + colhead = lapply(col_names, function(ii) grid::textGrob(ii, gp = grid::gpar(fontsize=12, col="black", fontface="bold", fill="grey"))) ## replace column names - table = gtable_add_grob(table, colhead, t = 1, l = 1:ncol(data)) + table = gtable::gtable_add_grob(table, colhead, t = 1, l = 1:ncol(data)) - #table = tableGrob(data, rows = NULL, cols = c("Raw file", "% identified")) if (nchar(title[1]) > 0) { - gtitle = textGrob(title, gp = gpar(fontsize = 14)) + gtitle = grid::textGrob(title, gp = grid::gpar(fontsize = 14)) padding = grid::unit(1.5, "line") ## add heading (white space) - table = gtable_add_rows(table, heights = grobHeight(gtitle) + padding, pos = 0) + table = gtable::gtable_add_rows(table, heights = grid::grobHeight(gtitle) + padding, pos = 0) ## add heading (text as overlay) - table = gtable_add_grob(table, list(gtitle), t = 1, l = 1, r = ncol(table), clip = "off") + table = gtable::gtable_add_grob(table, list(gtitle), t = 1, l = 1, r = ncol(table), clip = "off") } if (nchar(footer[1]) > 0) { - gfooter = textGrob(footer, gp = gpar(fontsize = 10)) + gfooter = grid::textGrob(footer, gp = grid::gpar(fontsize = 10)) padding = grid::unit(1.5, "line") ## add heading (white space) - table = gtable_add_rows(table, heights = grobHeight(gfooter) + padding, pos = -1) ## bottom + 
table = gtable::gtable_add_rows(table, heights = grid::grobHeight(gfooter) + padding, pos = -1) ## bottom ## add heading (text as overlay) - table = gtable_add_grob(table, list(gfooter), t = nrow(table), l = 1, r = ncol(table), clip = "off") + table = gtable::gtable_add_grob(table, list(gfooter), t = nrow(table), l = 1, r = ncol(table), clip = "off") } ## neat trick to enable calling print(g), to mimic ggplot-behaviour on this object ## in combination with print.PTXQC_table() -- see below - p = gTree(children = gList(table), cl = c("PTXQC_table")) + p = grid::gTree(children = grid::gList(table), cl = c("PTXQC_table")) ## hide the table name inside (for qcMetric::getTitles()) p$labels$title = title @@ -821,16 +813,15 @@ plotTable = function(data, title = "", footer = "", col_names = colnames(data), } #' helper S3 class, enabling print(some-plot_Table-object) -#' @importFrom grid grid.newpage grid.draw #' @param x Some Grid object to plot -#' @param ... further arguments (not used, but required for consistency with other print methods) -#' @return A function +#' @param ... Further arguments (not used, but required for consistency with other print methods) +#' @return NULL #' #' @export #' print.PTXQC_table = function(x, ...) { - grid.newpage(); - grid.draw(x) + grid::grid.newpage(); + grid::grid.draw(x) return(NULL) } @@ -849,7 +840,6 @@ print.PTXQC_table = function(x, ...) { #' @return GGplot object #' #' @import ggplot2 -#' @importFrom RColorBrewer brewer.pal #' @export #' #' @examples @@ -860,7 +850,7 @@ print.PTXQC_table = function(x, ...) 
{ #' rnorm(n[3], 3, 0.7), #' rnorm(n[4], 4.5, 0.8))) #' stats = data.frame(fc.raw.file = letters[4:1], -#' sd = c(2.4, 0.5, 0.7, 0.8), +#' sd_uncal = c(2.4, 0.5, 0.7, 0.8), #' outOfCal = c(TRUE, FALSE, FALSE, FALSE)) #' plot_UncalibratedMSErr(data, MQBug_raw_files = letters[1], #' stats, y_lim = c(-20,20), 15, "subtitle") @@ -875,13 +865,13 @@ plot_UncalibratedMSErr = function(data, MQBug_raw_files, stats, y_lim, extra_lim showColLegend = ifelse(length(setdiff(data$col, "default")) > 0, "legend", "none") ## amend SD to fc.raw.file - stats$fcr_new_lvl = paste0(stats$fc.raw.file, " (sd = ", stats$sd, "ppm)") + stats$fcr_new_lvl = paste0(stats$fc.raw.file, " (sd = ", stats$sd_uncal, "ppm)") ## i.e. change name without underlying value levels(data$fc.raw.file) = stats$fcr_new_lvl[ match(levels(data$fc.raw.file), stats$fc.raw.file) ] p = ggplot(data, col=data$col) + geom_boxplot(aes_string(x = "fc.raw.file", y = "uncalibrated.mass.error..ppm.", col="col"), varwidth = TRUE, outlier.shape = NA) + - scale_colour_manual("", values = c("default"="black", "MQ bug"="red", "out-of-search-tol"="red"), guide=showColLegend) + + scale_colour_manual("", values = c("default"="black", "MQ bug"="red", "out-of-search-tol"="red"), guide = showColLegend) + ylab(expression(Delta~"mass [ppm]")) + xlab("") + ylim(y_lim) + @@ -911,7 +901,6 @@ plot_UncalibratedMSErr = function(data, MQBug_raw_files, stats, y_lim, extra_lim #' @return GGplot object #' #' @import ggplot2 -#' @importFrom RColorBrewer brewer.pal #' @export #' #' @examples @@ -941,7 +930,7 @@ plot_CalibratedMSErr = function(data, MQBug_raw_files, stats, y_lim, extra_limit ## plot p = ggplot(data, col=data$col) + - geom_boxplot(aes_string(x = "fc.raw.file", y = "mass.error..ppm.", col="col"), varwidth=TRUE, outlier.shape = NA) + + geom_boxplot(aes_string(x = "fc.raw.file", y = "mass.error..ppm.", col="col"), varwidth = TRUE, outlier.shape = NA) + scale_colour_manual("", values = c("default"="black", "MQ bug"="red", 
"out-of-search-tol"="red"), guide = showColLegend) + ylab(expression(Delta~"mass [ppm]")) + xlab("") + @@ -1025,13 +1014,20 @@ plot_MS2Oversampling = function(data) #' plot_MS2Decal = function(data) { - p = ggplot(data, aes_string(x = "msErr", fill="type")) + - geom_histogram(binwidth = diff(range(data$msErr, na.rm=TRUE))/30) + ## explicit, to get rid of 'stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.' + ## trim down the data to 2-98 percentiles (to avoid outliers far off) + data2 = plyr::ddply(data, "file", function(x) { + qnt = quantile(x$msErr, probs = c(0.02, 0.98), na.rm = TRUE) + return (x[qnt[1] < x$msErr & x$msErr < qnt[2], ]) + }) + + p = ggplot(data2, aes_string(x = "msErr", fill="type")) + + ## individual bin width for each raw file by using a function + geom_histogram(binwidth = function(x) {diff(range(x, na.rm = TRUE))/30}) + xlab("fragment mass delta") + ylab("count") + scale_fill_manual(values = c(forward = "#99d594", decoy = "#ff0000")) + ggtitle("MSMS: Fragment mass errors per Raw file") + - facet_wrap(~file) + facet_wrap(~file, scales = "fixed") #print(p) return(p) @@ -1063,13 +1059,13 @@ plot_MS2Decal = function(data) #' plot_MissedCleavages = function(data, title_sub = "") { - st_bin.m = melt(data, id.vars = c("fc.raw.file")) + st_bin.m = reshape2::melt(data, id.vars = c("fc.raw.file")) p = ggplot(data = st_bin.m, aes_string(x = "factor(fc.raw.file)", y = "value", fill = "variable")) + - geom_col(position=position_stack(reverse = TRUE)) + + geom_col(position = position_stack(reverse = TRUE)) + xlab("Raw file") + ylab("missed cleavages [%]") + - theme(legend.title=element_blank()) + + theme(legend.title = element_blank()) + scale_fill_manual(values = rep(c("#99d594", "#ffffbf", "#fc8d59", "#ff0000", "#800080", "#000000"), 10)) + geom_abline(alpha = 0.5, intercept = 0.75, slope = 0, colour = "black", linetype = "dashed", size = 1.5) + coord_flip() + @@ -1162,7 +1158,7 @@ plot_IonInjectionTimeOverRT = function(data, 
stats, extra_limit) geom_hline(yintercept = extra_limit, linetype = 'dashed') + guides(color=guide_legend(title="Raw file with\naverage inj. time")) + ggtitle("MSMSscans: Ion Injection Time over RT") + - pointsPutX(x_range=range(data$rRT), x_section=c(0.03,0.08), y=stats_sub$mean, col=stats_sub$fc.raw.file[,drop = TRUE]) + pointsPutX(x_range = range(data$rRT), x_section = c(0.03, 0.08), y = stats_sub$mean, col = stats_sub$fc.raw.file[,drop = TRUE]) #print(p) return(p) @@ -1227,4 +1223,40 @@ plot_ScanIDRate = function(data) facet_wrap(~ fc.raw.file) + ggtitle(paste0("MSMSscans: TopN % identified over N")) return (p) +} + + +#' +#' Plot Total Ion Count over time +#' +#' The input is a data.frame with already averaged counts over binned RT-slices. +#' +#' @param data A data.frame with columns 'fc.raw.file', 'RT', 'intensity' +#' @param x_lim Plot range of x-axis +#' @param y_lim Plot range of y-axis +#' @return GGplot object +#' +#' @import ggplot2 +#' @export +#' +#' @examples +#' +#' data = data.frame(fc.raw.file = rep(c("file A", "file B", "file C"), each=81), +#' RT = c(20:100), +#' intensity = c(rnorm(81, mean=20), rnorm(81, mean=10), rnorm(81, mean=30))) +#' plot_TIC(data, c(10, 100), c(0, 40)) +#' +plot_TIC = function(data, x_lim, y_lim) +{ + p = ggplot(data) + + geom_line(aes_string(x = "RT", y = "intensity", colour = "fc.raw.file"), size=1, alpha=0.7) + + scale_color_manual(values = brewer.pal.Safe(length(unique(data$fc.raw.file)), "Set1")) + + guides(color = guide_legend(title = "Raw file\n(avg. 
peak width)")) + + xlab("retention time [min]") + + ylab("intensity") + + coord_cartesian(xlim = x_lim, ylim = y_lim) + ## zoom in y -- do not cut data (preserve lines) + ggtitle("SM: Total Ion Count") + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + #print(p) + return(p) } \ No newline at end of file diff --git a/R/fcn_qualities.R b/R/fcn_qualities.R index b8f7d44..462f656 100644 --- a/R/fcn_qualities.R +++ b/R/fcn_qualities.R @@ -254,8 +254,6 @@ qualHighest = function(x, N) #' @param x List of vectors, where each vector holds a distribution #' @return A data.frame with ks-test values of the "reference" to all other distributions (see Details) #' -#' @import stats -#' #' @export #' qualBestKS = function(x) { @@ -274,7 +272,7 @@ qualBestKS = function(x) { { if (j>length(x)) next; rr[i,j] = 1 - suppressWarnings( ## brags about '-value will be approximate in the presence of ties' - ks.test(x[[i]], x[[j]])$statistic + stats::ks.test(x[[i]], x[[j]])$statistic ) } } diff --git a/R/qcMetric.R b/R/qcMetric.R index 3c9522d..92a8d41 100644 --- a/R/qcMetric.R +++ b/R/qcMetric.R @@ -14,8 +14,6 @@ #' @field qcName [placeholder] Name of the qcScore in the heatmap #' @field orderNr [placeholder] column index during heatmap generation and for the general order of plots #' -#' @import methods -#' #' @exportClass qcMetric #' @export qcMetric #' @@ -56,6 +54,7 @@ qcMetric = setRefClass("qcMetric", workerFcn = "function", ## returns list(plots =, [qcScores=]) plots = "list", htmlTable = "character", + title = "list", ## the following members are related to the heatmap only qcScores = "data.frame", ## with columns "raw.file", "score" qcCat = "character", ## one of "prep", "LC", "MS" or empty (e.g. 
for PG) @@ -75,37 +74,32 @@ qcMetric = setRefClass("qcMetric", .self$plots = list(); ## obtained from worker .self$htmlTable = NA_character_; ## obtained from worker .self$qcScores = data.frame(); ## obtained from worker + .self$title = list(); .self$qcCat = qcCat; .self$qcName = qcName; .self$orderNr = orderNr; .self$outData = list(); return(.self) }, - checkInput = function(required_columns, given_columns) - { - if (!all(required_columns %in% given_columns)) - { - warning(paste0("Input check failed: columns '", - paste(setdiff(required_columns, given_columns), collapse="', '", sep=""), - "' are not present in input data!"), - immediate. = TRUE) - return (FALSE) - } - return (TRUE) - }, - setData = function(...) { ## fill with MQ data and compute results + setData = function(df, ...) { ## fill with MQ data and compute results cat("Starting to work on", gsub("~", " ", .self$qcName), "...\n") if (.self$orderNr < 0) { cat(" Metric disabled. Skipping...\n") - return(NULL) + return (NULL) + } + + if (is.null(df)) + { + cat(" No data available. Skipping...\n") + return (NULL) } ## GC stats mem_before = gc(verbose = FALSE, reset = TRUE) ## resetting Max to see how much this metric uses t_before = proc.time() - r = workerFcn(.self, ...) + r = workerFcn(.self, df, ...) 
## clean memory to get a clean picture of each metrics memory footprint ## to enable the user to skip expensive metrics @@ -132,6 +126,8 @@ qcMetric = setRefClass("qcMetric", if ("htmlTable" %in% names(r)) .self$htmlTable = r[["htmlTable"]]; if ("qcScores" %in% names(r)) .self$qcScores = r[["qcScores"]]; + if ("title" %in% names(r)) .self$title = r[["title"]] + cat("...", gsub("~", " ", .self$qcName), " done\n") return(NULL) @@ -152,18 +148,19 @@ qcMetric = setRefClass("qcMetric", getTitles = function(stopOnMissing = TRUE, subtitle_sep = " - ") { labels = sapply(1:length(.self$plots), function(idx) { - if ("title" %in% names(.self$plots[[idx]]$labels)){ - title = .self$plots[[idx]]$labels$title + if (length(.self$title) != 0){ + return(.self$title[[idx]]) + } + else if ("title" %in% names(.self$plots[[idx]]$labels)){ + titles = .self$plots[[idx]]$labels$title #title = 'atop("PG: PCA of 'reporter intensity'", scriptstyle("(excludes contaminants)"))' - title - regex = "atop\\(\"(.*)\", scriptstyle\\(\"(.*)\"\\)\\)" - m = regexpr(regex, title, perl = TRUE) + regex = 'atop\\("(.*)", scriptstyle\\("(.*)"\\)\\)' + m = regexpr(regex, titles, perl = TRUE) if (m == 1) { ## hit! 
- text = substring(title, attr(m, "capture.start"), attr(m, "capture.start") + attr(m, "capture.length") - 1) - title = paste0(text[1], subtitle_sep, text[2]) - title + text = substring(titles, attr(m, "capture.start"), attr(m, "capture.start") + attr(m, "capture.length") - 1) + return(paste0(text[1], subtitle_sep, text[2])) } - return (title) + return (titles) } else if (stopOnMissing) { stop(c("getTitles() for ", .self$qcName, ": No title found in ggplot object at index ", idx, "!")) } else return("") @@ -189,13 +186,26 @@ qcMetric = setRefClass("qcMetric", flattenList = function(x) { repeat { idx_list = sapply(x, function(arg) {return(all(class(arg) == "list"))}) - if(!any(idx_list)) return(x) + if (!any(idx_list)) return(x) r_list = Reduce(append, x[idx_list]) items = x[!idx_list] x = Reduce(append, list(r_list, items)) } } +checkInput = function(required_columns, given_df) +{ + if (!all(required_columns %in% colnames(given_df))) + { + warning(paste0("Input check failed: columns '", + paste(setdiff(required_columns, colnames(given_df)), collapse="', '", sep=""), + "' are not present in input data!"), + immediate. = TRUE) + return (FALSE) + } + return (TRUE) +} + qcMetric_AverageQualOverall = setRefClass("qcMetric_AverageQualOverall", contains = "qcMetric", @@ -205,9 +215,9 @@ qcMetric_AverageQualOverall = helpTextTemplate = "Internal metric to compute the average quality across all other metrics", workerFcn = function(.self, df.QCM) { - if (empty(df.QCM)) stop("AverageQual_qc::workerFcn(): input empty!") + if (plyr::empty(df.QCM)) stop("AverageQual_qc::workerFcn(): input empty!") lpl = list() ## empty... 
- qcScore = ddply(df.QCM, "fc.raw.file", function(df.row) { + qcScore = plyr::ddply(df.QCM, "fc.raw.file", function(df.row) { df.row.raw = unlist(df.row[,!grepl("fc.raw.file", colnames(df.row))]) df.row.raw[is.infinite(df.row.raw)] = NA ## mask explicitly missing values, since it will bias the mean otherwise return (data.frame(val = mean(df.row.raw, na.rm = TRUE))) diff --git a/R/qcMetric_EVD.R b/R/qcMetric_EVD.R index a16b4be..74762e1 100644 --- a/R/qcMetric_EVD.R +++ b/R/qcMetric_EVD.R @@ -1,4 +1,3 @@ - ##################################################################### qcMetric_EVD_UserContaminant = setRefClass( @@ -43,9 +42,9 @@ Heatmap score [EVD: Contaminant ]: boolean score, i.e. 0% (fail) if the in #lst_contaminants = yaml_contaminants ## completeness check ## PG is either missing, or has the correct data - if (!is.null(df_pg)) stopifnot(c("id", "fasta.headers") %in% colnames(df_pg)) + if (!is.null(df_pg) | !checkInput(c("id", "fasta.headers"), df_pg)) return() ## "score" might not be present (e.g. missing in MQ 1.0.13.13) - stopifnot(c("protein.group.ids", "type", "intensity", "fc.raw.file") %in% colnames(df_evd)) + if (!checkInput(c("protein.group.ids", "type", "intensity", "fc.raw.file"),df_evd)) return() local_qcScores = data.frame() @@ -88,7 +87,7 @@ Heatmap score [EVD: Contaminant ]: boolean score, i.e. 0% (fail) if the in ## do not trust MBR here. We want real evidence! evd_realMS = !grepl("MATCH", df_evd$type) ## for each Raw file: find unique peptides of our contaminant - cont_data.l = dlply(df_evd[evd_uniqueGroup & evd_realMS, ], "fc.raw.file", + cont_data.l = plyr::dlply(df_evd[evd_uniqueGroup & evd_realMS, ], "fc.raw.file", function(x) { if (length(grep(";", x$protein.group.ids))) stop("more than one proteinGroup for supposedly unique peptide...") @@ -100,7 +99,7 @@ Heatmap score [EVD: Contaminant ]: boolean score, i.e. 
0% (fail) if the in above.thresh = (sc > ca_thresh) | (int > ca_thresh) cont_scoreECDF = NULL; if ("score" %in% colnames(x)) { - cont_scoreECDF = ddply(x, "idx_cont", function(xx) { + cont_scoreECDF = plyr::ddply(x, "idx_cont", function(xx) { if (length(unique(xx$score)) < 2) return(NULL) ## not enough data for ECDF r = getECDF(xx$score) r$condition = c("sample", "contaminant")[xx$idx_cont[1]+1] @@ -122,8 +121,8 @@ Heatmap score [EVD: Contaminant ]: boolean score, i.e. 0% (fail) if the in head(cont_data.l) ## melt - cont_data = ldply(cont_data.l, function(l) { l$cont_data }) - cont_data.long = melt(cont_data, id.vars="fc.raw.file") + cont_data = plyr::ldply(cont_data.l, function(l) { l$cont_data }) + cont_data.long = reshape2::melt(cont_data, id.vars="fc.raw.file") # # old: not_found = all(cont_data.long$value[cont_data.long$variable == "above.thresh"] == FALSE) @@ -137,13 +136,13 @@ Heatmap score [EVD: Contaminant ]: boolean score, i.e. 0% (fail) if the in lpl = append(lpl, list(pl_cont)) } else { ## plot User-Contaminants - lpl_i = byXflex(data = cont_data.long, indices = cont_data.long$fc.raw.file, subset_size = 120, - FUN = plot_ContUser, sort_indices = TRUE, + lpl_i = byXflex(data = cont_data.long, indices = cont_data.long$fc.raw.file, subset_size = 120, + FUN = plot_ContUser, sort_indices = TRUE, name_contaminant = ca, extra_limit = ca_thresh, subtitle = paste("search realm:", search_realm)) lpl = append(lpl, lpl_i) ## plot Andromeda score distribution of contaminant vs. sample - pl_andr = llply(cont_data.l, function(l) + pl_andr = plyr::llply(cont_data.l, function(l) { if (l$cont_data$above.thresh == FALSE || is.null(l$cont_scoreECDF)) @@ -154,7 +153,7 @@ Heatmap score [EVD: Contaminant ]: boolean score, i.e. 
0% (fail) if the in #print(p) return(p) }) - pl_andr_nonNull = compact(pl_andr) ## remove 'NULL' entries from plot list + pl_andr_nonNull = plyr::compact(pl_andr) ## remove 'NULL' entries from plot list lpl = append(lpl, pl_andr_nonNull) ## add heatmap column @@ -173,8 +172,8 @@ Heatmap score [EVD: Contaminant ]: boolean score, i.e. 0% (fail) if the in return(list(plots = lpl, qcScores = local_qcScores)) }, - qcCat = "Prep", - qcName = "EVD:Contaminant~(%s)", + qcCat = "Prep", + qcName = "EVD:User~Contaminant~(%s)", orderNr = 0020 ) return(.self) @@ -189,7 +188,7 @@ qcMetric_EVD_PeptideInt = setRefClass( contains = "qcMetric", methods = list(initialize=function() { callSuper( helpTextTemplate = - "Peptide precursor intensity per Raw file from evidence.txt. + "Peptide precursor intensity per Raw file from evidence.txt WITHOUT match-between-runs evidence. Low peptide intensity usually goes hand in hand with low MS/MS identifcation rates and unfavourable signal/noise ratios, which makes signal detection harder. Also instrument acquisition time increases for trapping instruments. 
@@ -207,12 +206,12 @@ Heatmap score [EVD: Pep Intensity (>%1.1f)]: workerFcn = function(.self, df_evd, thresh_intensity) { ## completeness check - stopifnot(c("fc.raw.file", "intensity") %in% colnames(df_evd)) + if (!checkInput(c("fc.raw.file", "intensity", "contaminant"), df_evd)) return() ## update helpText .self$helpText = sprintf(.self$helpTextTemplate, thresh_intensity, thresh_intensity) - medians_pep = ddply(df_evd[ , c("fc.raw.file", "intensity")], "fc.raw.file", + medians_pep = plyr::ddply(df_evd[ , c("fc.raw.file", "intensity")], "fc.raw.file", function(x) data.frame(med = log2(quantile(x$intensity, probs=0.5, na.rm = TRUE)))) int_dev_pep = RSD((medians_pep$med)) @@ -234,7 +233,7 @@ Heatmap score [EVD: Pep Intensity (>%1.1f)]: return(list(plots = lpl, qcScores = qcScore)) }, qcCat = "prep", - qcName = "EVD:~Pep~Intensity~(\">%1.1f\")", + qcName = "EVD:~Peptide~Intensity~(\">%1.1f\")", orderNr = 0030 ) return(.self) @@ -276,11 +275,12 @@ Each Raw file is now scored by the minimum LE of all its 4 channels. workerFcn=function(.self, df_evd) { ## completeness check - stopifnot(c("fc.raw.file") %in% colnames(df_evd)) + if (!checkInput(c("fc.raw.file"), df_evd)) return() ## check if reporter.intensity.0... is present cols_reporter = grepv("^reporter.intensity.corrected.[0-9]", colnames(df_evd)); cols_reporter.nc = grepv("^reporter.intensity.[0-9]", colnames(df_evd)); - stopifnot(length(cols_reporter) > 1 && length(cols_reporter.nc) > 1) + if(length(cols_reporter) <= 1 || length(cols_reporter.nc) <= 1) {warning("Two reporter.intensity and two reporter.intensity.corrected columns are needed for metric ReporterIntensity.") + return()} ## check if correction was done at all if (all(df_evd[1:1000, cols_reporter] == df_evd[1:1000, cols_reporter.nc], na.rm = TRUE)) { @@ -293,12 +293,12 @@ Each Raw file is now scored by the minimum LE of all its 4 channels. 
## use data.table for aggregation, its MUCH faster than ddply() and uses almost no extra memory - df_reps = melt(df_evd[, c("fc.raw.file", cols_reporter)], + df_reps = reshape2::melt(df_evd[, c("fc.raw.file", cols_reporter)], id.vars ="fc.raw.file", value.name = "intensity", variable.name = "channel") head(df_reps) - dt_reps = data.table(df_reps) + dt_reps = data.table::data.table(df_reps) ## do NOT remove -inf and NA's and 0's -- we need them to count labeling-efficiency (#entries with intensity > 0 vs. ALL) @@ -383,22 +383,25 @@ MBR should be switched off for the Raw files which are affected (could be a few Heatmap score [EVD: Prot Count (>%1.0f)]: Linear scoring from zero. Reaching or exceeding the target threshold gives a score of 100%%. ", - workerFcn = function(.self, df_evd, thresh_protCount) + workerFcn = function(.self, df_evd, df_evd_tf, thresh_protCount) { ## completeness check - stopifnot(c("fc.raw.file", "protein.group.ids", "match.time.difference") %in% colnames(df_evd)) + + req_cols = c("fc.raw.file", "protein.group.ids", "is.transferred") + if (!checkInput(req_cols, df_evd)) return() + .self$helpText = sprintf(.self$helpTextTemplate, thresh_protCount) - protC = getProteinCounts(df_evd[, c("fc.raw.file", "protein.group.ids", "match.time.difference")]) + protC = getProteinCounts(rbind(df_evd[,req_cols], df_evd_tf[, req_cols])) protC$block = factor(assignBlocks(protC$fc.raw.file, 30)) - max_prot = max(unlist(dlply(protC, "fc.raw.file", function(x) sum(x$counts)))) + max_prot = max(unlist(plyr::dlply(protC, "fc.raw.file", function(x) sum(x$counts)))) ## average gain in percent - reportMTD = any(!is.na(df_evd$match.time.difference)) + reportMTD = nrow(df_evd_tf) > 0 gain_text = ifelse(reportMTD, sprintf("MBR gain: +%.0f%%", mean(protC$MBRgain, na.rm = TRUE)), "") - lpl = dlply(protC, "block", .fun = function(x) + lpl = plyr::dlply(protC, "block", .fun = function(x) { p = plot_CountData(data = x, y_max = max(thresh_protCount, max_prot)*1.1, @@ -409,7 
+412,7 @@ Heatmap score [EVD: Prot Count (>%1.0f)]: Linear scoring from zero. Reaching or }) ## QC measure for protein ID performance - qc_protc = ddply(protC, "fc.raw.file", function(x){ + qc_protc = plyr::ddply(protC, "fc.raw.file", function(x){ if (nrow(x) == 3 && length(grep("^genuine", x$category))!= 2){ stop("expected two categories to start with 'genuine...'") } @@ -423,7 +426,7 @@ Heatmap score [EVD: Prot Count (>%1.0f)]: Linear scoring from zero. Reaching or return(list(plots = lpl, qcScores = qcScore)) }, qcCat = 'general', - qcName = "EVD:~Prot~Count~(\">%1.0f\")", + qcName = "EVD:~Protein~Count~(\">%1.0f\")", orderNr = 0450 ) return(.self) @@ -453,22 +456,25 @@ MBR should be switched off for the Raw files which are affected (could be a few Heatmap score [EVD: Pep Count (>%1.0f)]: Linear scoring from zero. Reaching or exceeding the target threshold gives a score of 100%%. ", - workerFcn = function(.self, df_evd, thresh_pepCount) + workerFcn = function(.self, df_evd, df_evd_tf, thresh_pepCount) { ## completeness check - stopifnot(c("fc.raw.file", "modified.sequence", "match.time.difference") %in% colnames(df_evd)) - + + req_cols = c("fc.raw.file", "modified.sequence", "is.transferred") + if (!checkInput(req_cols, df_evd)) return() + if (nrow(df_evd_tf)>0 & !checkInput(req_cols, df_evd_tf)) return() + .self$helpText = sprintf(.self$helpTextTemplate, thresh_pepCount) - pepC = getPeptideCounts(df_evd[, c("fc.raw.file", "modified.sequence", "match.time.difference")]) + pepC = getPeptideCounts(rbind(df_evd[, req_cols], df_evd_tf[, req_cols])) pepC$block = factor(assignBlocks(pepC$fc.raw.file, 30)) - max_pep = max(unlist(dlply(pepC, "fc.raw.file", function(x) sum(x$counts)))) + max_pep = max(unlist(plyr::dlply(pepC, "fc.raw.file", function(x) sum(x$counts)))) ## average gain in percent - reportMTD = any(!is.na(df_evd$match.time.difference)) + reportMTD = any(df_evd$is.transferred) gain_text = ifelse(reportMTD, sprintf("MBR gain: +%.0f%%", mean(pepC$MBRgain, 
na.rm = TRUE)), "") - lpl = dlply(pepC, "block", .fun = function(x) + lpl = plyr::dlply(pepC, "block", .fun = function(x) { p = plot_CountData(data = x, y_max = max(thresh_pepCount, max_pep)*1.1, @@ -479,7 +485,7 @@ Heatmap score [EVD: Pep Count (>%1.0f)]: Linear scoring from zero. Reaching or e }) ## QC measure for peptide ID performance - qc_pepc = ddply(pepC, "fc.raw.file", function(x){ + qc_pepc = plyr::ddply(pepC, "fc.raw.file", function(x){ if (nrow(x) == 3 && length(grep("^genuine", x$category))!= 2){ stop("expected two categories to start with 'genuine...'") } @@ -493,7 +499,7 @@ Heatmap score [EVD: Pep Count (>%1.0f)]: Linear scoring from zero. Reaching or e return(list(plots = lpl, qcScores = qcScore)) }, qcCat = 'general', - qcName = "EVD:~Pep~Count~(\">%1.0f\")", + qcName = "EVD:~Peptide~Count~(\">%1.0f\")", orderNr = 0400 ) return(.self) @@ -518,13 +524,13 @@ Heatmap score [EVD: RT Peak Width]: Scored using BestKS function, i.e. the D sta workerFcn = function(.self, df_evd) { ## completeness check - stopifnot(c("retention.time", "retention.length", "fc.raw.file") %in% colnames(df_evd)) + if (!checkInput(c("retention.time", "retention.length", "fc.raw.file"), df_evd)) return() ## compute some summary stats before passing data to ggplot (performance issue for large experiments) - df_evd.m.d = ddply(df_evd[,c("retention.time", "retention.length", "fc.raw.file")], "fc.raw.file", .fun = peakWidthOverTime) + df_evd.m.d = plyr::ddply(df_evd[,c("retention.time", "retention.length", "fc.raw.file")], "fc.raw.file", .fun = peakWidthOverTime) head(df_evd.m.d) ## median peak width - df_evd.m.d_avg = ddply(df_evd[,c("retention.length","fc.raw.file")], "fc.raw.file", .fun = function(x) { + df_evd.m.d_avg = plyr::ddply(df_evd[,c("retention.length","fc.raw.file")], "fc.raw.file", .fun = function(x) { #fcr = as.character(x$fc.raw.file[1]) #cat(fcr) m = median(x$retention.length, na.rm = TRUE); @@ -534,7 +540,7 @@ Heatmap score [EVD: RT Peak Width]: Scored using BestKS 
function, i.e. the D sta .self$outData[["avg_peak_width"]] = df_evd.m.d_avg ## augment Raw filename with avg. RT peak width - df_evd.m.d$fc.raw.file = mapvalues(df_evd.m.d$fc.raw.file, df_evd.m.d_avg$fc.raw.file, df_evd.m.d_avg$fc.raw.file_aug) + df_evd.m.d$fc.raw.file = plyr::mapvalues(df_evd.m.d$fc.raw.file, df_evd.m.d_avg$fc.raw.file, df_evd.m.d_avg$fc.raw.file_aug) df_evd.m.d$block = factor(assignBlocks(df_evd.m.d$fc.raw.file, 6)) ## color set is 9, so do not increase this (6*150%) ## identical limits for all plots df_evd.xlim = range(df_evd.m.d$RT, na.rm = TRUE) @@ -550,7 +556,7 @@ Heatmap score [EVD: RT Peak Width]: Scored using BestKS function, i.e. the D sta ## QC measure for reproducibility of peak shape ##.. create a list of distributions - l_dists = dlply(df_evd[,c("retention.length", "fc.raw.file")], "fc.raw.file", function(x) return(x$retention.length)) + l_dists = plyr::dlply(df_evd[,c("retention.length", "fc.raw.file")], "fc.raw.file", function(x) return(x$retention.length)) qc_evd_PeakShape = qualBestKS(l_dists) colnames(qc_evd_PeakShape) = c("fc.raw.file", .self$qcName) @@ -590,7 +596,7 @@ Heatmap score [EVD: MBR Align]: fraction of 'green' vs. 'green+red' peptides. workerFcn = function(.self, df_evd, tolerance_matching, raw_file_mapping) { ## completeness check - stopifnot(c("type", "calibrated.retention.time", "id", "raw.file", "modified.sequence", "charge") %in% colnames(df_evd)) + if (!checkInput(c("type", "calibrated.retention.time", "retention.time.calibration", "id", "raw.file", "modified.sequence", "charge"), df_evd)) return() ## find reference if (('fraction' %in% colnames(df_evd)) && (length(unique(df_evd$fraction)) > 1)) { @@ -603,7 +609,7 @@ Heatmap score [EVD: MBR Align]: fraction of 'green' vs. 'green+red' peptides. 
} else { refRaw = findAlignReference(df_evd) col_fraction = c() - txt_subtitle = paste("alignment reference:", refRaw) + txt_subtitle = paste("alignment reference:", gsub("\\", "/", refRaw, fixed = TRUE)) ## subtitles in ggplot must not contain '\' evd_has_fractions = FALSE } @@ -626,6 +632,12 @@ Heatmap score [EVD: MBR Align]: fraction of 'green' vs. 'green+red' peptides. ## augment more columns d_alignQ$retention.time.calibration = df_evd$retention.time.calibration[match(d_alignQ$id, df_evd$id)] + if (diff(range(na.omit(d_alignQ$retention.time.calibration))) < 1e-5) + { + txt_subtitle = paste0(txt_subtitle, " || WARNING: MaxQuant did not correct RTs in any way!"); + warning("EVD MBRAlign: MaxQuant did not correct RTs in any way, despite MBR=on") + } + if (nrow(d_alignQ)==0) { ## very unusual case: reference contains no evidence -- e.g. pull-down experiment lpl[[1]] = ggText("EVD: RT Distance of peptides from reference after alignment", "Alignment cannot be verfied -- no data.") @@ -655,12 +667,12 @@ Heatmap score [EVD: MBR Align]: fraction of 'green' vs. 'green+red' peptides. 
} ## amend fc.raw.file with % good ID pairs qcAlign$newlabel = paste0(qcAlign$newlabel, " (sc: ", round(qcAlign$withinRT*100), "%)") - evd_RT_t$fc.raw.file_ext = mapvalues(evd_RT_t$fc.raw.file, qcAlign$fc.raw.file, qcAlign$newlabel) + evd_RT_t$fc.raw.file_ext = plyr::mapvalues(evd_RT_t$fc.raw.file, qcAlign$fc.raw.file, qcAlign$newlabel) evd_RT_t$RTdiff_in = c("green", "red")[(abs(evd_RT_t$rtdiff) > tolerance_matching)+1] ## plot alignment result - y_lim = quantile(c(evd_RT_t$rtdiff, evd_RT_t$retention.time.calibration), probs = c(0.01,0.99), na.rm = TRUE) * 1.1 + y_lim = quantile(c(evd_RT_t$rtdiff, evd_RT_t$retention.time.calibration), probs = c(0.01, 0.99), na.rm = TRUE) * 1.1 lpl = byX(evd_RT_t, evd_RT_t$fc.raw.file, 3*3, plot_MBRAlign, sort_indices = FALSE, y_lim = y_lim, title_sub = txt_subtitle, match_tol = tolerance_matching) @@ -708,20 +720,23 @@ This score is 'pessimistic' because if few ID's were transferred, but all of the the majority of peptides is still ok (because they are genuine). However, in this case MBR provides few (and wrong) additional information, and should be disabled. ", - workerFcn = function(.self, df_evd, avg_peak_width) + workerFcn = function(.self, df_evd, df_evd_tf, avg_peak_width) { ## completeness check #stopifnot(c("...") %in% colnames(df_evd)) + if (!checkInput(c("modified.sequence"), df_evd)) return() + df_evd_all = merge(df_evd, df_evd_tf, all = TRUE) + ## increase of segmentation by MBR: ## three values returned: single peaks(%) in genuine, transferred and all(combined) - qMBR = peakSegmentation(df_evd) + qMBR = peakSegmentation(df_evd_all) head(qMBR) ## for groups: get their RT-spans ## ... genuine ID's only (as 'rtdiff_genuine') ## or genuine+transferred (as 'rtdiff_mixed')) ## Could be empty (i.e. no groups, just singlets) if data is really sparse .. 
- qMBRSeg_Dist = idTransferCheck(df_evd) + qMBRSeg_Dist = idTransferCheck(df_evd_all) #head(qMBRSeg_Dist) #head(qMBRSeg_Dist[qMBRSeg_Dist$fc.raw.file=="file 13",]) @@ -769,14 +784,25 @@ qcMetric_EVD_MBRaux = setRefClass( methods = list(initialize=function() { callSuper( helpTextTemplate = "Auxililiary plots -- experimental -- without scores. + +Return a tree plot with a possible alignment tree. +This allows the user to judge which Raw files have similar corrected RT's (i.e. where aligned successfully). +If there are clear sub-clusters, it might be worth introducing artifical fractions into MaxQuant, +to avoid ID-transfer between these clusters (use the MBR-Align and MBR-ID-Transfer metrics to support the decision). + +If the input contains fractions, leaf nodes will be colored accordingly. +Distinct sub-clusters should have their own color. +If not, MaxQuant's fraction settings should be optimized. +Note that introducing fractions in MaxQuant will naturally lead to a clustering here (it's somewhat circular). Heatmap score: none. ", workerFcn = function(.self, df_evd) { ## completeness check - stopifnot(c("type", "match.time.difference", "calibrated.retention.time", "fc.raw.file", "modified.sequence", "charge") %in% colnames(df_evd)) - + + if (!checkInput(c("type", "is.transferred", "calibrated.retention.time", "fc.raw.file", "modified.sequence", "charge"), df_evd)) return() + if (('fraction' %in% colnames(df_evd)) && (length(unique(df_evd$fraction)) > 1)) { ## fractions: there must be more than one, otherwise MQ will treat the samples as unfractionated col_fraction = "fraction" @@ -791,10 +817,10 @@ Heatmap score: none. 
col_fraction = col_fraction) ## MBR: additional evidence by matching MS1 by AMT across files - if (any(!is.na(df_evd$match.time.difference))) { + if (any(df_evd$is.transferred)) { ## gain for each raw file: absolute gain, and percent gain - mtr.df = ddply(df_evd, "fc.raw.file", function(x) { - match_count_abs = sum(!is.na(x$match.time.difference)) + mtr.df = plyr::ddply(df_evd, "fc.raw.file", function(x) { + match_count_abs = sum(x$is.transferred) ## if only matched IDs are present, this would be 'Inf' -- we limit that to 1e4 match_count_pc = min(1e4, round(100*match_count_abs/(nrow(x)-match_count_abs))) ## newIDs / oldIDs return (data.frame(abs = match_count_abs, pc = match_count_pc)) @@ -831,17 +857,17 @@ Consistent charge distribution is paramount for comparable 3D-peak intensities a Heatmap score [EVD: Charge]: Deviation of the charge 2 proportion from a representative Raw file ('qualMedianDist' function). ", - workerFcn = function(.self, df_evd, int_cols, MAP_pg_groups) + workerFcn = function(.self, df_evd) { ## completeness check - stopifnot(c("hasMTD", "fc.raw.file", "charge") %in% colnames(df_evd)) + if (!checkInput(c("is.transferred", "fc.raw.file", "charge"), df_evd)) return() - d_charge = mosaicize(df_evd[!df_evd$hasMTD, c("fc.raw.file", "charge")]) + d_charge = mosaicize(df_evd[!df_evd$is.transferred, c("fc.raw.file", "charge")]) lpl = byXflex(d_charge, d_charge$Var1, 30, plot_Charge, sort_indices = TRUE) ## QC measure for charge centeredness - qc_charge = ddply(df_evd[!df_evd$hasMTD, c("charge", "fc.raw.file")], "fc.raw.file", function(x) data.frame(c = (sum(x$charge==2)/nrow(x)))) + qc_charge = plyr::ddply(df_evd[!df_evd$is.transferred, c("charge", "fc.raw.file")], "fc.raw.file", function(x) data.frame(c = (sum(x$charge==2)/nrow(x)))) qc_charge[, .self$qcName] = qualMedianDist(qc_charge$c) return(list(plots = lpl, qcScores = qc_charge[, c("fc.raw.file", .self$qcName)])) @@ -873,12 +899,12 @@ Heatmap score [EVD: ID rate over RT]: Scored using 'Uniform' 
scoring function, i workerFcn = function(.self, df_evd) { ## completeness check - stopifnot(c("retention.time", "fc.raw.file") %in% colnames(df_evd)) + if (!checkInput(c("retention.time", "fc.raw.file"), df_evd)) return() raws_perPlot = 6 rt_range = range(df_evd$retention.time, na.rm = TRUE) - df_idRT = ddply(df_evd, "fc.raw.file", function(x) { + df_idRT = plyr::ddply(df_evd, "fc.raw.file", function(x) { h = hist(x$retention.time, breaks=seq(from=rt_range[1]-3, to=rt_range[2]+3, by=3), plot = FALSE) return(data.frame(RT = h$mid, counts = h$counts)) }) @@ -886,7 +912,7 @@ Heatmap score [EVD: ID rate over RT]: Scored using 'Uniform' scoring function, i byXflex(df_idRT, df_idRT$fc.raw.file, raws_perPlot, plot_IDsOverRT, sort_indices = TRUE) ## QC measure for uniform-ness - qcScore = ddply(df_evd[, c("retention.time", "fc.raw.file")], "fc.raw.file", + qcScore = plyr::ddply(df_evd[, c("retention.time", "fc.raw.file")], "fc.raw.file", function(x) data.frame(metric = qualUniform(na.omit(x$retention.time)))) colnames(qcScore)[colnames(qcScore)=="metric"] = .self$qcName @@ -925,8 +951,16 @@ Heatmap score [EVD: MS Cal Pre (%1.1f)]: the centeredness (function CenteredRef) .self$helpText = sprintf(.self$helpTextTemplate, tolerance_pc_ppm, tolerance_pc_ppm) - fix_cal = fixCalibration(df_evd, df_idrate, tolerance_sd_PCoutOfCal) + if (!checkInput(c("fc.raw.file", "uncalibrated.mass.error..ppm."), df_evd)) return() + ## for some mzTab (not recalibrated) 'mass.error..ppm.' is not there... but we only need a dummy + if (!("mass.error..ppm." %in% colnames(df_evd))) df_evd$mass.error..ppm. = 0 + + fix_cal = fixCalibration(df_evd, df_idrate, tolerance_sd_PCoutOfCal) + if (is.null(fix_cal)) { + warning("Internal error. Data missing. Skipping metric!", immediate. 
= TRUE) + return() + } ## some outliers can have ~5000ppm, blowing up the plot margins ## --> remove outliers ylim_g = range(boxplot.stats(fix_cal$df_evd$uncalibrated.mass.error..ppm.)$stats[c(1, 5)], c(-tolerance_pc_ppm, tolerance_pc_ppm) * 1.05) @@ -940,7 +974,7 @@ Heatmap score [EVD: MS Cal Pre (%1.1f)]: the centeredness (function CenteredRef) title_sub = fix_cal$recal_message) ## scores - qc_MS1deCal = ddply(fix_cal$df_evd, "fc.raw.file", + qc_MS1deCal = plyr::ddply(fix_cal$df_evd, "fc.raw.file", function(x) { xd = na.omit(x$uncalibrated.mass.error..ppm.) if (length(xd)==0) { @@ -984,6 +1018,7 @@ Heatmap score [EVD: MS Cal-Post]: The variance and centeredness around zero of t { ## completeness check #stopifnot(c("...") %in% colnames(df_pg)) + if (!checkInput(c("uncalibrated.mass.error..ppm.", "mass", "mass.error..ppm."), df_evd)) return() fix_cal = fixCalibration(df_evd, df_idrate, tolerance_sd_PCoutOfCal) @@ -999,7 +1034,7 @@ Heatmap score [EVD: MS Cal-Post]: The variance and centeredness around zero of t ## QC measure for post-calibration ppm error ## .. 
assume 0 centered and StdDev of observed data - obs_par = ddply(fix_cal$df_evd[, c("mass.error..ppm.", "fc.raw.file")], "fc.raw.file", + obs_par = plyr::ddply(fix_cal$df_evd[, c("mass.error..ppm.", "fc.raw.file")], "fc.raw.file", function(x) data.frame(mu = mean(x$mass.error..ppm., na.rm = TRUE), sd = sd(x$mass.error..ppm., na.rm = TRUE))) qc_MS1Cal = data.frame(fc.raw.file = obs_par$fc.raw.file, @@ -1042,7 +1077,7 @@ Heatmap score [EVD: Contaminants]: as fraction of summed intensity with 0 = samp workerFcn = function(.self, df_evd) { ## completeness check - stopifnot(c("intensity", "contaminant", "fc.raw.file") %in% colnames(df_evd)) + if (!checkInput(c("intensity", "contaminant", "fc.raw.file", "proteins"), df_evd)) return() ## ## elaborate contaminant fraction per Raw.file (this is not possible from PG, since raw files could be merged) @@ -1067,7 +1102,8 @@ Heatmap score [EVD: Contaminants]: as fraction of summed intensity with 0 = samp df_evd$pname[df_evd$pname==""] = df_evd$proteins[df_evd$pname==""] ## a NOP if it already is 'proteins', but ok df_evd.totalInt = sum(as.numeric(df_evd$intensity), na.rm = TRUE) - df_evd.cont.only = df_evd[df_evd$contaminant,] + df_evd.cont.only = df_evd[df_evd$contaminant > 0,] + cont.top = by(df_evd.cont.only, df_evd.cont.only$pname, function(x) sum(as.numeric(x$intensity), na.rm = TRUE) / df_evd.totalInt*100) cont.top.sort = sort(cont.top, decreasing = TRUE) #head(cont.top.sort) @@ -1086,7 +1122,7 @@ Heatmap score [EVD: Contaminants]: as fraction of summed intensity with 0 = samp } ## QC measure for contamination - qc_cont = ddply(df_evd[, c("intensity", "contaminant", "fc.raw.file")], "fc.raw.file", + qc_cont = plyr::ddply(df_evd[, c("intensity", "contaminant", "fc.raw.file")], "fc.raw.file", function(x) { val = ifelse(is.null(cont.top5.names), HEATMAP_NA_VALUE, ## use NA in heatmap if there are no contaminants @@ -1126,9 +1162,9 @@ Heatmap score [EVD: MS2 Oversampling]: The percentage of non-oversamp workerFcn = 
function(.self, df_evd) { ## completeness check - stopifnot(c("fc.raw.file", "ms.ms.count") %in% colnames(df_evd)) + if (!checkInput(c("fc.raw.file", "ms.ms.count"), df_evd)) return() - d_dups = ddply(df_evd, "fc.raw.file", function(x) { + d_dups = plyr::ddply(df_evd, "fc.raw.file", function(x) { tt = as.data.frame(table(x$ms.ms.count), stringsAsFactors = FALSE) tt$Count = as.numeric(tt$Var1) ## remove "0", since this would be MBR-features @@ -1136,7 +1172,7 @@ Heatmap score [EVD: MS2 Oversampling]: The percentage of non-oversamp ## summarize everything above 3 counts if (any(tt$Count >= 3)) { tt$Count[tt$Count >= 3] = "3+" - tt = ddply(tt, "Count", function(x) data.frame(Freq=sum(x$Freq))) + tt = plyr::ddply(tt, "Count", function(x) data.frame(Freq=sum(x$Freq))) } ## make counts relative fraction = tt$Freq / sum(tt$Freq) * 100 @@ -1192,7 +1228,7 @@ Heatmap score [EVD: Pep Missing]: Linear scale of the fraction of missing peptid workerFcn = function(.self, df_evd) { ## completeness check - stopifnot(c("fc.raw.file", "modified.sequence", "intensity") %in% colnames(df_evd)) + if (!checkInput(c("fc.raw.file", "modified.sequence", "intensity"), df_evd)) return() if (('fraction' %in% colnames(df_evd)) && (length(unique(df_evd$fraction)) > 1)) { lpl = list(ggText("Missing Values Skipped", "Missing values calculation skipped. 
Fractionated data detected!")) @@ -1205,7 +1241,7 @@ Heatmap score [EVD: Pep Missing]: Linear scale of the fraction of missing peptid } ## make peptides unique per Raw file - df_u = ddply(df_evd[ , c("fc.raw.file", "modified.sequence")], "fc.raw.file", + df_u = plyr::ddply(df_evd[ , c("fc.raw.file", "modified.sequence")], "fc.raw.file", function(x) { return(x[!duplicated(x$modified.sequence),]) }) @@ -1214,7 +1250,7 @@ Heatmap score [EVD: Pep Missing]: Linear scale of the fraction of missing peptid global_peps_count = length(global_peps) ## percent identified in each Raw file - pep_set = ddply(df_u[ , c("fc.raw.file", "modified.sequence")], "fc.raw.file", + pep_set = plyr::ddply(df_u[ , c("fc.raw.file", "modified.sequence")], "fc.raw.file", function(x) { score = 100*length(intersect(global_peps, x$modified.sequence)) / global_peps_count return(data.frame(idFraction = score)) @@ -1256,7 +1292,7 @@ Heatmap score [EVD: Pep Missing]: Linear scale of the fraction of missing peptid lpl_dens = byXflex(df_evd[, c("modified.sequence", "fc.raw.file", "logInt")], df_evd$fc.raw.file, subset_size = 5, FUN = function(dx) { - d_mat = dcast(dx, modified.sequence ~ fc.raw.file, fun.aggregate = mean, value.var = "logInt") + d_mat = reshape2::dcast(dx, modified.sequence ~ fc.raw.file, fun.aggregate = mean, value.var = "logInt") ## ... 
normalization factors d_mat_mult = sapply(2:ncol(d_mat), function(x) { @@ -1271,7 +1307,7 @@ Heatmap score [EVD: Pep Missing]: Linear scale of the fraction of missing peptid head(d_mat_n) ## find impute value pep_mean = rowMeans(d_mat_n[, -1, drop=FALSE], na.rm = TRUE) - df_missing = ddply(df_mult, "fc.raw.file", function(x) { + df_missing = plyr::ddply(df_mult, "fc.raw.file", function(x) { ## get set of missing values values = pep_mean[is.na(d_mat_n[, as.character(x$fc.raw.file)])] ## de-normalize (back to old intensity range) @@ -1285,8 +1321,8 @@ Heatmap score [EVD: Pep Missing]: Linear scale of the fraction of missing peptid geom_freqpoly(data = dx, aes_string(x = "logInt", col="fc.raw.file"), binwidth = 0.5, size = 1.2) + xlab("Intensity [log2]") + ggtitle(" [experimental] EVD: Imputed Peptide Intensity Distribution of Missing Values") + - scale_fill_manual(values = rep(brewer.pal(6,"Accent"), times=40), guide = guide_legend("")) + - scale_colour_manual(values = rep(brewer.pal(6,"Accent"), times=40), guide = "none") + scale_fill_manual(values = rep(RColorBrewer::brewer.pal(6,"Accent"), times=40), guide = guide_legend("")) + + scale_colour_manual(values = rep(RColorBrewer::brewer.pal(6,"Accent"), times=40), guide = "none") return(pl) }) @@ -1310,4 +1346,76 @@ Heatmap score [EVD: Pep Missing]: Linear scale of the fraction of missing peptid ##################################################################### +qcMetric_EVD_UpSet = setRefClass( + "qcMetric_EVD_UpSet", + contains = "qcMetric", + methods = list(initialize=function() { callSuper( + helpTextTemplate = + "The metric shows an upSet plot based on the number of modified peptide sequences per Raw file, intersected or merged with other Raw files (see below for details).
+ +If the number of Raw files is >=6, only the 'distinct' plot is generated (the other two are skipped for performance reasons). +![](https://raw.githubusercontent.com/cbielow/PTXQC/mzTab_support/inst/reportTemplate/modes_UpSet.png 'Example plot showing how the set size is computed') + +Definition: An 'active set' is the set of black dots in a column of the plot -- as opposed to the grey dots (you'll understand when you see it). + +

+distinct: shows the number of sequences that are present in ALL active sets. For three Raw files and active sets A and B, this would mean all sequences which occur in A and B (intersect), but not in C (setdiff).
+intersection: shows the number of sequences that occurs in all active sets (intersection).
+union: shows the number of sequences that occurs in total. For two files that are all sequences that occurs either in A or in B (union).
+

+Heatmap score [EVD: UpSet]: The proportion of sequences that the file has in common with all other files. +", + workerFcn = function(.self, df_evd) + { + if (!checkInput(c("modified.sequence", "fc.raw.file"), df_evd)) return() + + getOutputWithMod = function(dl, mode){ + unlist(sapply(1:length(dl), function(numElem){ + comb = combn(names(dl),numElem) + sapply(1:ncol(comb), function(x){ + sets = comb[,x] + exp = as.expression(paste(sets, collapse = "&")) + value = length(Reduce(mode, dl[sets])) + names(value) = exp + return(value) + }) + })) + } + + lf = tapply(df_evd$modified.sequence, df_evd$fc.raw.file, function(x){return(list(unique(x)))}) + if (length(lf) <= 1) + { + lpl = list(ggText("UpSetR", "Only single Raw file detected. Cannot compute unions/intersections.")) + return(list(plots = lpl, titles = list("EVD: UpSet"))) + } + + lpl = list(UpSetR::upset(UpSetR::fromList(lf), nsets = min(30, length(lf)), keep.order = TRUE, mainbar.y.label = "distinct size")) + if (length(lf) < 6) + { ## performance for enumerating all supersets forbids doing it on larger sets until we make this code smarter... 
+ lpl[[2]] = UpSetR::upset(UpSetR::fromExpression(getOutputWithMod(lf, intersect)), mainbar.y.label = "intersection size") + lpl[[3]] = UpSetR::upset(UpSetR::fromExpression(getOutputWithMod(lf, union)), mainbar.y.label = "union size") + } + titles = list("EVD: UpSet distinct", + "EVD: UpSet intersect", + "EVD: UpSet union")[1:length(lpl)] + + score = sapply(1:length(names(lf)), function(x){ + union = unique(unlist(lf[-x])) + inters = intersect(lf[[x]], union) + score = length(inters)/length(union) + return(score) + }) + + qcScore = data.frame(fc.raw.file = names(lf), score = score) + colnames(qcScore)[2] = .self$qcName + + return(list(plots = lpl, title = titles, qcScores = qcScore)) + }, + qcCat = "LC", + qcName = "EVD:~UpSet", + orderNr = 0500 # just before peptide count + ) + return(.self) + }) +) diff --git a/R/qcMetric_MSMS.R b/R/qcMetric_MSMS.R index 62461cf..77fdc09 100644 --- a/R/qcMetric_MSMS.R +++ b/R/qcMetric_MSMS.R @@ -19,7 +19,7 @@ Heatmap score [MSMS: MS2 Cal (Analyzer)]: rewards centeredness around workerFcn = function(.self, df_msms, fc_raw_files) { ## completeness check - stopifnot(.self$checkInput(c("fc.raw.file", "fragmentation", "reverse", "mass.deviations..da."), colnames(df_msms))) + if (!checkInput(c("fc.raw.file", "fragmentation", "reverse", "mass.deviations..da."), df_msms)) return () ## older MQ versions do not have 'mass.analyzer' or 'mass.deviations..ppm.' ## , so we use fragmentation instead (this is a little risky, since you could do CID fragmentation and forward to Orbi, but hey...) 
if (!("mass.analyzer" %in% colnames(df_msms))) df_msms$mass.analyzer = df_msms$fragmentation @@ -31,7 +31,7 @@ Heatmap score [MSMS: MS2 Cal (Analyzer)]: rewards centeredness around return (sort(sample.int(x, size = max))) } - ms2_decal = ddply(df_msms, c("fc.raw.file", "mass.analyzer"), .fun = function(x) { + ms2_decal = plyr::ddply(df_msms, c("fc.raw.file", "mass.analyzer"), .fun = function(x) { df.ms = NULL ## ## Forwards @@ -57,7 +57,6 @@ Heatmap score [MSMS: MS2 Cal (Analyzer)]: rewards centeredness around df.ms = rbind(df.ms, df.ms_r) } } - return (df.ms) }) @@ -67,7 +66,7 @@ Heatmap score [MSMS: MS2 Cal (Analyzer)]: rewards centeredness around #ms2_decal$msErr = round(ms2_decal$msErr, digits=ceiling(-log10(ms2_binwidth)+1)) ## separate plots for each mass analyzer, since we want to keep 'fixed' scales for all raw.files (comparability) - lpl = dlply(ms2_decal, "mass.analyzer", function(ms2_decal) { + lpl = plyr::dlply(ms2_decal, "mass.analyzer", function(ms2_decal) { ## create filename inside, since we need to retain the factor levels (i.e. ordering) ## and this only works if raw file + massanalyzer is unique ms2_decal$new_filename = paste(ms2_decal$fc.raw.file, paste(ms2_decal$mass.analyzer, ms2_decal$unit), sep="\n") @@ -86,7 +85,7 @@ Heatmap score [MSMS: MS2 Cal (Analyzer)]: rewards centeredness around qcScore = list() for (analyzer in unique(ms2_decal$mass.analyzer)) { qc_name = sprintf(.self$qcName, analyzer) - qc_MS2_decal = ddply(ms2_decal[ms2_decal$mass.analyzer==analyzer, ], "fc.raw.file", + qc_MS2_decal = plyr::ddply(ms2_decal[ms2_decal$mass.analyzer==analyzer, ], "fc.raw.file", function(x) { xx = na.omit(x$msErr); @@ -124,8 +123,8 @@ general, increased MC counts also increase the number of peptide signals, thus c space and potentially provoking overlapping peptide signals, biasing peptide quantification. Thus, low MC counts should be favored. 
Interestingly, it has been shown recently that incorporation of peptides with missed cleavages does not negatively influence protein quantification (see -[http://pubs.acs.org/doi/abs/10.1021/pr500294d](Chiva, C., Ortega, M., and Sabido, E. Influence of the Digestion Technique, Protease, and Missed -Cleavage Peptides in Protein Quantitation. J. Proteome Res. 2014, 13, 3979-86) ). +[Chiva, C., Ortega, M., and Sabido, E. Influence of the Digestion Technique, Protease, and Missed +Cleavage Peptides in Protein Quantitation. J. Proteome Res. 2014, 13, 3979-86](http://pubs.acs.org/doi/abs/10.1021/pr500294d) ). However this is true only if all samples show the same degree of digestion. High missed cleavage values can indicate for example, either a) failed digestion, b) a high (post-digestion) protein contamination, or c) a sample with high amounts of unspecifically degraded peptides which are not digested by trypsin. @@ -142,28 +141,44 @@ current study. ", workerFcn = function(.self, df_msms, df_evd = NULL) { ## completeness check - stopifnot(.self$checkInput(c("fc.raw.file", "missed.cleavages"), colnames(df_msms))) - if (!is.null(df_evd)) stopifnot(.self$checkInput(c("contaminant", "id"), colnames(df_evd))) + if (!checkInput(c("fc.raw.file"), df_msms)) return() + if (!checkInput(c("missed.cleavages"), df_msms) && !checkInput(c("sequence"), df_msms)) return() + if (!is.null(df_evd) && !checkInput(c("contaminant", "id"), df_evd)) return() + + # if missed.cleavages is not given, it is assumed that trypsin was used for digestion + if (!"missed.cleavages" %in% colnames(df_msms)) { + seqs = gsub('.{1}$', '', df_msms$sequence) + df_msms$missed.cleavages = nchar(seqs) - nchar(gsub("K|R", "", seqs)) + msg_missed_clea = "(MCs computed assuming trypsin)" + } + else { + msg_missed_clea = "" + } max_mc = max(-Inf, df_msms$missed.cleavages, na.rm = TRUE) ## will be -Inf iff enzyme was not specified and columns is 100% NA if (!is.infinite(max_mc)) { ## MC's require an enzyme to be set 
## remove contaminants - msg_cont_removed = "(includes contaminants -- no evidence.txt read)" - if (!is.null(df_evd)) { - msg_cont_removed = "(excludes contaminants)" + msg_cont_removed = "(excludes contaminants)" + if ("contaminant" %in% colnames(df_msms)) { # for MzTab + df_msms = df_msms[!df_msms$contaminant,] + } + else if (!is.null(df_evd)) { + if (!checkInput(c("evidence.id"), df_msms)) return() df_msms = df_msms[!df_evd$contaminant[match(df_msms$evidence.id, df_evd$id)], ] } + else msg_cont_removed = "(includes contaminants -- no evidence.txt read)" - st_bin = ddply(df_msms[, c("missed.cleavages", "fc.raw.file")], "fc.raw.file", .fun = function(x) { + st_bin = plyr::ddply(df_msms[, c("missed.cleavages", "fc.raw.file")], "fc.raw.file", .fun = function(x) { t = table(x$missed.cleavages)/nrow(x) r = rep(0, max_mc + 1) names(r) = as.character(0:max_mc) r[names(t)] = t return (r) }) + lpl = - byXflex(st_bin, st_bin$fc.raw.file, 25, plot_MissedCleavages, title_sub = msg_cont_removed, sort_indices = TRUE) + byXflex(st_bin, st_bin$fc.raw.file, 25, plot_MissedCleavages, sort_indices = TRUE, title_sub = paste(msg_cont_removed, msg_missed_clea)) ## QC measure for missed-cleavages variation qc_score = data.frame(fc.raw.file = st_bin$fc.raw.file, valMC = st_bin[, "0"]) @@ -177,8 +192,8 @@ current study. 
", valMCVar = HEATMAP_NA_VALUE) }## end enyzme check - colnames(qc_score)[colnames(qc_score) == "valMC"] = sprintf(.self$qcName, "MC") - colnames(qc_score)[colnames(qc_score) == "valMCVar"] = sprintf(.self$qcName, "MC~Var") + colnames(qc_score)[colnames(qc_score) == "valMC"] = sprintf(.self$qcName, "Missed~Cleavages") + colnames(qc_score)[colnames(qc_score) == "valMCVar"] = sprintf(.self$qcName, "Missed~Cleavages~Var") return(list(plots = lpl, qcScores = qc_score)) }, diff --git a/R/qcMetric_MSMSScans.R b/R/qcMetric_MSMSScans.R index d94dedb..2968fe3 100644 --- a/R/qcMetric_MSMSScans.R +++ b/R/qcMetric_MSMSScans.R @@ -4,8 +4,6 @@ #' #' Metric for msmsscans.txt, showing TopN over RT. #' -#' @importFrom data.table as.data.table setkey -#' qcMetric_MSMSScans_TopNoverRT = setRefClass( "qcMetric_MSMSScans_TopNoverRT", contains = "qcMetric", @@ -20,10 +18,10 @@ Heatmap score [MS2 Scans: TopN over RT]: Rewards uniform (function Un workerFcn = function(.self, df_msmsScans) { ## completeness check - stopifnot(.self$checkInput(c("fc.raw.file", "retention.time", "scan.event.number", "rRT"), colnames(df_msmsScans))) + if (!checkInput(c("fc.raw.file", "retention.time", "scan.event.number", "rRT"), df_msmsScans)) return() - dd = as.data.table(df_msmsScans[, c("fc.raw.file", "retention.time", "scan.event.number", "rRT")]) - setkey(dd, fc.raw.file, retention.time) ## sort by RT + dd = data.table::as.data.table(df_msmsScans[, c("fc.raw.file", "retention.time", "scan.event.number", "rRT")]) + data.table::setkey(dd, fc.raw.file, retention.time) ## sort by RT ## find the highest scan event (SE) after an MS1 scan DF_max = dd[, { idx = which(getMaxima(scan.event.number, thresh_rel = 0.0)) @@ -40,7 +38,7 @@ Heatmap score [MS2 Scans: TopN over RT]: Rewards uniform (function Un byXflex(DFmse, DFmse$fc.raw.file, 6, plot_TopNoverRT, sort_indices = FALSE) ## QC measure for smoothness of TopN over RT - qc_TopNRT = ddply(DFmse, "fc.raw.file", function(x) data.frame(val = qualUniform(x$topN))) + 
qc_TopNRT = plyr::ddply(DFmse, "fc.raw.file", function(x) data.frame(val = qualUniform(x$topN))) colnames(qc_TopNRT)[colnames(qc_TopNRT) == "val"] = .self$qcName return(list(plots = lpl, qcScores = qc_TopNRT)) @@ -81,10 +79,10 @@ Heatmap score [MS2 Scans: Intensity]: Linear score (0-100%) between 3 workerFcn = function(.self, d_msmsScan, score_min_factor = 3, score_max_factor = 10) { ## completeness check - stopifnot(.self$checkInput(c("fc.raw.file", "total.ion.current", "base.peak.intensity"), colnames(d_msmsScan))) + if (!checkInput(c("fc.raw.file", "total.ion.current", "base.peak.intensity"), d_msmsScan)) return () ## use data.table for aggregation, its MUCH faster than ddply() and uses almost no extra memory - dd = as.data.table(d_msmsScan[, c("fc.raw.file", "total.ion.current", "base.peak.intensity")]) + dd = data.table::as.data.table(d_msmsScan[, c("fc.raw.file", "total.ion.current", "base.peak.intensity")]) dd$log.total.ion.current = log10(dd$total.ion.current) dd$log.base.peak.intensity = log10(dd$base.peak.intensity) log.dd.tic = dd[,list(mean=mean(log.total.ion.current), @@ -112,7 +110,7 @@ Heatmap score [MS2 Scans: Intensity]: Linear score (0-100%) between 3 plot_MSMSintensity = function(dd.all) { pl = ggplot(data = dd.all, aes(x = fc.raw.file)) + geom_boxplot(stat = "identity", aes(col = "TIC", ymin = min, lower = lower, middle = middle, upper = upper, ymax = max)) + - geom_boxplot(stat = "identity", aes(col = "Base\nPeak", ymin = min2, lower = lower2, middle = middle2, upper = upper2, ymax = max2, width = 0.3)) + + geom_boxplot(stat = "identity", aes(col = "Base\nPeak", ymin = min2, lower = lower2, middle = middle2, upper = upper2, ymax = max2), width = 0.3) + scale_color_manual("MS/MS\nintensity", values = c("TIC" = "black", "Base\nPeak" = "blue")) + ylim(0, NA) + scale_x_discrete_reverse(dd.all$fc.raw.file) + @@ -126,7 +124,7 @@ Heatmap score [MS2 Scans: Intensity]: Linear score (0-100%) between 3 ## QC measure for intensity ratio below expected 
threshold (3x-10x by default) - qc_MSMSint = ddply(dd.ratio, "fc.raw.file", + qc_MSMSint = plyr::ddply(dd.ratio, "fc.raw.file", function(x) data.frame(val = qualLinThresh(pmax(0, x$ratio - score_min_factor), t = score_max_factor - score_min_factor))) colnames(qc_MSMSint)[colnames(qc_MSMSint) == "val"] = .self$qcName @@ -154,11 +152,15 @@ Heatmap score [MS2 Scans: Ion Inj Time]: Linear score as fraction of ", workerFcn = function(.self, df_msmsScans, threshold_iit) { - ## completeness check - stopifnot(.self$checkInput(c("fc.raw.file", "ion.injection.time", "rRT"), colnames(df_msmsScans))) + ## completeness check (mzTab might not have IIJ) + if ( any(!("ion.injection.time" %in% colnames(df_msmsScans)), all(is.na(df_msmsScans$ion.injection.time)) ) ) { + return(NULL) + } + + if (!checkInput(c("fc.raw.file", "ion.injection.time", "rRT"), df_msmsScans)) return () ## use data.table for aggregation, its MUCH faster than ddply() and uses almost no extra memory - dd = as.data.table(df_msmsScans[, c("fc.raw.file", "ion.injection.time", "rRT")]) + dd = data.table::as.data.table(df_msmsScans[, c("fc.raw.file", "ion.injection.time", "rRT")]) ## average injection time over RT DFmIIT = dd[, list(medIIT = median(ion.injection.time)), by=c("fc.raw.file", "rRT")] @@ -177,7 +179,7 @@ Heatmap score [MS2 Scans: Ion Inj Time]: Linear score as fraction of list(belowThresh_IIT = sum(ion.injection.time < threshold_iit, na.rm = TRUE) / .N), by = "fc.raw.file"] - qc_IIT = ddply(DFmIIT_belowThresh, "fc.raw.file", + qc_IIT = plyr::ddply(DFmIIT_belowThresh, "fc.raw.file", function(x) data.frame(val = qualLinThresh(x$belowThresh_IIT, t = 1))) colnames(qc_IIT)[colnames(qc_IIT) == "val"] = .self$qcName @@ -206,22 +208,22 @@ Heatmap score [MS2 Scans: TopN high]: rewards if TopN was reached on workerFcn = function(.self, df_msmsScans) { ## completeness check - stopifnot(.self$checkInput(c("fc.raw.file", "scan.event.number"), colnames(df_msmsScans))) + if (!checkInput(c("fc.raw.file", 
"scan.event.number"), df_msmsScans)) return () ## check if scan.event.number requires fixing ## (e.g. when MS3 events are recorded between MS2 events, there are gaps in the numbering) ## we close the gaps by requiring consecutive scan event numbers in MS2 scan.events = df_msmsScans[, c("scan.event.number", "fc.raw.file")] - while (TRUE) { ## should be at most max(scan.even.number) iterations + while (TRUE) { ## should be at most max(scan.event.number) iterations se_pos = 1 + which(diff(scan.events$scan.event.number) > 1) ## position of gaps>1 if (length(se_pos) == 0) break; scan.events$scan.event.number[se_pos] = scan.events$scan.event.number[se_pos] - 1 } ## use data.table for aggregation, its MUCH faster than ddply() and uses almost no extra memory - DFc = as.data.table(scan.events)[, list(n=.N), by=c("scan.event.number", "fc.raw.file")] + DFc = data.table::as.data.table(scan.events)[, list(n=.N), by=c("scan.event.number", "fc.raw.file")] - dfc.ratio = ddply(DFc, "fc.raw.file", function(x, maxn) + dfc.ratio = plyr::ddply(DFc, "fc.raw.file", function(x, maxn) { ## sort x by scan event event_count = x$n @@ -256,7 +258,7 @@ Heatmap score [MS2 Scans: TopN high]: rewards if TopN was reached on ## QC measure for always reaching the maximum TopN maxTopN = max(dfc.ratio$scan.event.number) - qc_TopN = ddply(dfc.ratio, "fc.raw.file", function(x) data.frame(val = qualHighest(x$n, maxTopN))) + qc_TopN = plyr::ddply(dfc.ratio, "fc.raw.file", function(x) data.frame(val = qualHighest(x$n, maxTopN))) colnames(qc_TopN)[colnames(qc_TopN) == "val"] = .self$qcName return(list(plots = lpl, qcScores = qc_TopN)) @@ -288,10 +290,10 @@ Heatmap score [MS2 Scans: TopN ID over N]: Rewards uniform identifica workerFcn = function(.self, df_msmsScans) { ## completeness check - stopifnot(.self$checkInput(c("fc.raw.file", "scan.event.number", "identified"), colnames(df_msmsScans))) + if (!checkInput(c("fc.raw.file", "scan.event.number", "identified"), df_msmsScans)) return() ## use data.table 
for aggregation, its MUCH faster than ddply() and uses almost no extra memory - DF = as.data.table(df_msmsScans[, c("fc.raw.file", "scan.event.number", "identified")])[, list(n=.N), by=c("fc.raw.file", "scan.event.number", "identified")] + DF = data.table::as.data.table(df_msmsScans[, c("fc.raw.file", "scan.event.number", "identified")])[, list(n=.N), by=c("fc.raw.file", "scan.event.number", "identified")] # try KS on underlying data instead of using qualUniform() # DF2= ddply(df_msmsScans, "fc.raw.file", function(rf){ @@ -304,12 +306,13 @@ Heatmap score [MS2 Scans: TopN ID over N]: Rewards uniform identifica # kk$statistic # = 'D' ,, p.value is much smaller (~0) # }) # --> fail, 'D' and p-values are too low - df.ratio = ddply(DF, c("scan.event.number", "fc.raw.file"), function(x) + df.ratio = plyr::ddply(DF, c("scan.event.number", "fc.raw.file"), function(x) { xp = xm = 0 if ("+" %in% x$identified) xp = x$n[x$identified=="+"] if ("-" %in% x$identified) xm = x$n[x$identified=="-"] ratio = xp * 100 / sum(xp, xm) + return (data.frame(ratio = ratio, count = sum(x$n))) }) head(df.ratio) @@ -318,7 +321,7 @@ Heatmap score [MS2 Scans: TopN ID over N]: Rewards uniform identifica ## QC measure for constantly identifiying peptides, irrespective of scan event number ## -- we weight scan events by their number of occurence - qc_TopN_ID = ddply(df.ratio, "fc.raw.file", function(x) data.frame(val = qualUniform(x$ratio, x$count))) + qc_TopN_ID = plyr::ddply(df.ratio, "fc.raw.file", function(x) data.frame(val = qualUniform(x$ratio, x$count))) colnames(qc_TopN_ID)[colnames(qc_TopN_ID) == "val"] = .self$qcName return(list(plots = lpl, qcScores = qc_TopN_ID)) @@ -356,12 +359,12 @@ Heatmap score [MS2 Scans: DepPep]: No score. 
workerFcn = function(.self, d_msmsScan) { ## completeness check - stopifnot(.self$checkInput(c("fc.raw.file", "dp.modification", "dp.aa", "identified"), colnames(d_msmsScan))) + if (!checkInput(c("fc.raw.file", "dp.modification", "dp.aa", "identified"), d_msmsScan)) return() stopifnot(unique(d_msmsScan$identified) %in% c("-","+")) ## modified subset d_msmsScan$hasDP = (d_msmsScan$dp.modification != "") & (tolower(d_msmsScan$dp.modification) != "unmodified") - d_dp = as.data.table(d_msmsScan[d_msmsScan$hasDP,]) + d_dp = data.table::as.data.table(d_msmsScan[d_msmsScan$hasDP,]) ## pick global top-5 modifications d_dp.mods.top = sort(table(d_dp$dp.modification), decreasing = TRUE)[1:5] @@ -385,11 +388,11 @@ Heatmap score [MS2 Scans: DepPep]: No score. d_dp.mods = d_dp[, list(n=.N), by=c("fc.raw.file", "dp.modification")] tail(d_dp.mods) ## sort mods by # of occurences - d_dp.mods.sort = ddply(d_dp.mods, "dp.modification", function(x) sum(x$n)) + d_dp.mods.sort = plyr::ddply(d_dp.mods, "dp.modification", function(x) sum(x$n)) d_dp.mods.sort.name = d_dp.mods.sort$dp.modification[order(d_dp.mods.sort$V1)] d_dp.mods$dp.modification = factor(d_dp.mods$dp.modification, levels = d_dp.mods.sort.name) - d_noDP_id = as.data.table(d_msmsScan[d_msmsScan$identified=="+" & !d_msmsScan$hasDP,])[, list(n=.N), by=c("fc.raw.file")] + d_noDP_id = data.table::as.data.table(d_msmsScan[d_msmsScan$identified=="+" & !d_msmsScan$hasDP,])[, list(n=.N), by=c("fc.raw.file")] d_dp.mods$n_noDP = d_noDP_id$n[match(d_dp.mods$fc.raw.file, d_noDP_id$fc.raw.file)] d_dp.mods$n_percent = d_dp.mods$n / d_dp.mods$n_noDP * 100 diff --git a/R/qcMetric_PG.R b/R/qcMetric_PG.R index fa235b9..35f7994 100644 --- a/R/qcMetric_PG.R +++ b/R/qcMetric_PG.R @@ -19,9 +19,9 @@ Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to R workerFcn=function(.self, df_pg, int_cols, MAP_pg_groups) { ## completeness check - stopifnot(c(int_cols, "contaminant") %in% colnames(df_pg)) + if 
(!checkInput(c(int_cols, "contaminant"),df_pg)) return() - df.con_stats = adply(int_cols, .margins=1, function(group) { + df.con_stats = plyr::adply(int_cols, .margins=1, function(group) { #cat(group) total_int = sum(as.numeric(df_pg[, group]), na.rm = TRUE) return(data.frame(group_long = as.character(group), @@ -68,7 +68,7 @@ Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to R workerFcn=function(.self, df_pg, int_cols, MAP_pg_groups, thresh_intensity) { ## completeness check - stopifnot(c(int_cols, "contaminant") %in% colnames(df_pg)) + if (!checkInput(c(int_cols, "contaminant"),df_pg)) return() ## some stats (for plot title) @@ -78,7 +78,7 @@ Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to R medians = sort(apply(log2(df_pg[, int_cols, drop = FALSE]+1), 2, quantile, na.rm = TRUE, probs=0.5)) # + c(0,0,0,0,0,0)) int_dev = RSD(medians) int_dev.s = pastet("INT RSD [%]", round(int_dev, 3)) - lpl = boxplotCompare(data = melt(df_pg[, c(int_cols, "contaminant"), drop = FALSE], id.vars=c("contaminant"))[,c(2,3,1)], + lpl = boxplotCompare(data = reshape2::melt(df_pg[, c(int_cols, "contaminant"), drop = FALSE], id.vars=c("contaminant"))[,c(2,3,1)], log2 = TRUE, mainlab = "PG: intensity distribution", ylab = expression(log[2]*" intensity"), @@ -90,7 +90,7 @@ Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to R return(list(plots = lpl)) }, qcCat = "prep", - qcName = "PG:~raw~intensity", + qcName = "PG:~Raw~intensity", orderNr = 0032 ) return(.self) @@ -119,7 +119,7 @@ Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to R workerFcn=function(.self, df_pg, int_cols, MAP_pg_groups, thresh_intensity) { ## completeness check - stopifnot(c(int_cols, "contaminant") %in% colnames(df_pg)) + if (!checkInput(c(int_cols, "contaminant"),df_pg)) return() ## some stats (for plot title) @@ -128,7 +128,7 @@ Heatmap score: none (since data source proteinGroups.txt is not related 
1:1 to R ## do not remove zeros (but add +1 since RSD is 'NA' when 'inf' is included in log-data) medians = sort(apply(log2(df_pg[, int_cols, drop = FALSE]+1), 2, quantile, na.rm = TRUE, probs=0.5)) # + c(0,0,0,0,0,0)) int_dev = RSD(medians) - lpl = boxplotCompare(data = melt(df_pg[, c(int_cols, "contaminant"), drop = FALSE], id.vars=c("contaminant"))[,c(2,3,1)], + lpl = boxplotCompare(data = reshape2::melt(df_pg[, c(int_cols, "contaminant"), drop = FALSE], id.vars=c("contaminant"))[,c(2,3,1)], log2 = TRUE, mainlab = "PG: LFQ intensity distribution", ylab = expression(log[2]*" intensity"), @@ -170,7 +170,7 @@ Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to R workerFcn=function(.self, df_pg, int_cols, MAP_pg_groups, thresh_intensity) { ## completeness check - stopifnot(c(int_cols, "contaminant") %in% colnames(df_pg)) + if (!checkInput(c(int_cols, "contaminant"),df_pg)) return() ## some stats (for plot title) @@ -179,7 +179,7 @@ Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to R ## do not remove zeros (but add +1 since RSD is 'NA' when 'inf' is included in log-data) medians = sort(apply(log2(df_pg[, int_cols, drop = FALSE]+1), 2, quantile, probs=0.5, na.rm = TRUE)) # + c(0,0,0,0,0,0)) reprt_dev = RSD(medians) - lpl = boxplotCompare( data = melt(df_pg[, c(int_cols, "contaminant"), drop = FALSE], id.vars=c("contaminant"))[,c(2,3,1)], + lpl = boxplotCompare( data = reshape2::melt(df_pg[, c(int_cols, "contaminant"), drop = FALSE], id.vars=c("contaminant"))[,c(2,3,1)], log2 = TRUE, ylab = expression(log[2]*" reporter intensity"), mainlab = "PG: reporter intensity distribution", @@ -219,7 +219,7 @@ Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to R workerFcn=function(.self, df_pg, lst_cols, MAP_pg_groups) { ## completeness check - stopifnot(c(unlist(lst_cols), "contaminant") %in% colnames(df_pg)) + if (!checkInput(c(unlist(lst_cols), "contaminant"),df_pg)) return() lpl = list() for 
(cond in names(lst_cols)) @@ -268,11 +268,10 @@ automatically assumes a pulsed experiment and reports the label incorporation in Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to Raw files) ", - workerFcn=function(.self, df_pg, ratio_cols, thresh_LabelIncorp, GL_name_min_length) + workerFcn = function(.self, df_pg, ratio_cols, thresh_LabelIncorp, GL_name_min_length) { ## completeness check - stopifnot(c(ratio_cols, "contaminant", "reverse") %in% colnames(df_pg)) - + if (!checkInput(c(ratio_cols, "contaminant", "reverse"),df_pg)) return() ## remove reverse and contaminants (might skew the picture) idx_row = !df_pg$contaminant & !df_pg$reverse @@ -322,7 +321,7 @@ Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to R ## compute label incorporation? - ratio.mode = ddply(ratio.densities, "col", .fun = function(x) { + ratio.mode = plyr::ddply(ratio.densities, "col", .fun = function(x) { mode = x$x[which.max(x$y)] return (data.frame(mode = mode)) }) diff --git a/R/qcMetric_SM.R b/R/qcMetric_SM.R index dd50891..01eced2 100644 --- a/R/qcMetric_SM.R +++ b/R/qcMetric_SM.R @@ -13,10 +13,12 @@ The thresholds for the bins are %s -Heatmap score [SM: MS2 IDrate (>%1.0f)]: reaches 1 (=100%%) if the threshold for 'great' is reached or exceeded. ", +Heatmap score [SM: MS2 IDrate (>%1.0f)]: reaches 1 (=100%%) if the threshold for 'great' is reached or exceeded. +", + workerFcn = function(.self, df_summary, id_rate_bad, id_rate_great) { - stopifnot(.self$checkInput(c("fc.raw.file", "ms.ms.identified...."), colnames(df_summary))) + if (!checkInput(c("fc.raw.file", "ms.ms.identified...."), df_summary)) return() dms = df_summary$"ms.ms.identified...." 
dms[is.na(dms)] = 0 ## ID rate can be NaN for some raw files if NOTHING was acquired @@ -73,3 +75,45 @@ Heatmap score [SM: MS2 IDrate (>%1.0f)]: reaches 1 (=100%%) if the th }) ) + +qcMetric_SM_TIC = setRefClass( + "qcMetric_SM_TIC", + contains = "qcMetric", + methods = list(initialize=function() { callSuper( + helpTextTemplate = + "Total Ion Count: Returns the summed intensity of all MS1 signals (regardless of identification state). + +Heatmap score [SM: TIC]: reaches 1 (=100%%) if the TIC is uniform (i.e. a flat line) +", + workerFcn = function(.self, d_smy) + { + ## completeness check + if (!checkInput(c("fc.raw.file", "TIC"), d_smy)) return() + + df_long = plyr::ddply(d_smy, "fc.raw.file", function(x) { + n = length(x$TIC[[1]]) + df = data.frame(RT = x$TIC[[1]][seq(1,n,2)], intensity = x$TIC[[1]][seq(2,n,2)]) + df$RT = round(df$RT / 60) ## seconds to minutes + df2 = data.frame(RT = df$RT[!duplicated(df$RT)], intensity = tapply(df$intensity, df$RT, FUN = mean)) + return(df2) + }) + + head(df_long) + + lpl = + byXflex(df_long, df_long$fc.raw.file, 6, plot_TIC, x_lim = range(df_long$RT), y_lim = range(df_long$intensity), sort_indices = FALSE) + + ## QC measure for smoothness of TopN over RT + qc_TIC = plyr::ddply(df_long, "fc.raw.file", function(x) data.frame(val = qualUniform(x$intensity))) + colnames(qc_TIC)[colnames(qc_TIC) == "val"] = .self$qcName + + return(list(plots = lpl, qcScores = qc_TIC)) + }, + qcCat = "LC", + qcName = "SM:~TIC", + orderNr = 0025 + ) + return(.self) + }) +) + diff --git a/README.md b/README.md index 281f806..7d7f699 100644 --- a/README.md +++ b/README.md @@ -4,18 +4,15 @@ PTXQC [![Build Status](https://travis-ci.org/cbielow/PTXQC.svg?branch=master)](https://travis-ci.org/cbielow/PTXQC) [![Project Stats](https://www.openhub.net/p/PTXQC/widgets/project_thin_badge.gif)](https://www.openhub.net/p/PTXQC) -**This package allows users of MaxQuant to generate quality control reports in Html/PDF format.** +**This package allows users of 
MaxQuant (from .txt files) and OpenMS (from mzTab files) to generate quality control reports in Html/PDF format.** -### Latest changes / Change log +### Latest changes / ChangeLog + - v1.0.0 - Jan 2020: support for mzTab, more metrics (UpSetR) and fixes - v0.92.06 - Apr 2019: Bug Fixes - v0.92.05 - Mar 2019: Raw name simplification fix - v0.92.04 - Feb 2019: More robust package vignette builds - v0.92.03 - Feb 2018: Full List of Metrics added as vignette - - v0.92.02 - Jan 2018: plots and metrics of reporter intensity (iTRAQ, TMT, ...) for labeled MSn experiments - - v0.92.01 - Oct 2017: fix issue #41 (partial data error) - - v0.92.00 - Oct 2017: cleaner R interface; log file for drag'n'drop; fix boxPlots issue (usually for large experiments only); - - v0.90.00 - Aug 2017: Tables are shown in Html format See [NEWS][News_File] file for a version history. @@ -42,11 +39,12 @@ DOI: [10.1021/acs.jproteome.5b00780][JPR_paper] - Match-between-runs performance - easy usage ([Windows OS only] `drag'n'drop` your `txt output folder` onto a `batch file`) - 10 min [Installation](#installation) - - Html/PDF report will be generated within your MaxQuant-txt folder + - Html/PDF report will be generated within your MaxQuant-txt folder or next to the mzTab file - optional configuration file *in YAML format* for generation of shorter/customized reports ### Target audience - MaxQuant users (no knowledge of R required) + - OpenMS users (or any other software which can write an mzTab) - bioinformaticians (who want to contribute or customize) @@ -60,13 +58,13 @@ you can browse the vignettes using either of these commands within R: browseVignettes(package = 'PTXQC') If you do not want to wait that long, you can look at the -[latest online vignette at CRAN](https://cran.r-project.org/web/packages/PTXQC/vignettes/) +[latest online vignette at CRAN](https://cran.r-project.org/package=PTXQC) You will find documentation on - Full List of Quality Metrics with help text - Input and Output - 
Report customization - - (for MaxQuant users) Usage of Drag'n'drop + - (for MaxQuant/OpenMS users) Usage of Drag'n'drop - (for R users) Code examples in R The 'List of Metrics' vignette contains a full description for each metric (as seen in the Help section of a Html report). @@ -101,8 +99,8 @@ or if (!require(devtools, quietly = TRUE)) install.packages("devtools") library("devtools") ## this might give a warning like 'WARNING: Rtools is required ...'. Ignore it. - ## use build_vignettes = FALSE if you did not install pandoc or if you encounter errors when building vignettes (e.g. PRIDE ftp unavailable)! - install_github("cbielow/PTXQC", build_vignettes = TRUE, dependencies = TRUE) + ## use build_vignettes = FALSE if you did not install pandoc or if you encounter errors when building vignettes (e.g. PRIDE ftp unavailable)! + install_github("cbielow/PTXQC", build_vignettes = TRUE, dependencies = TRUE) To get started, see the help and/or vignettes: diff --git a/inst/dragNdrop/QC-dragdrop/_internal/compute_QC_report.R b/inst/dragNdrop/QC-dragdrop/_internal/compute_QC_report.R index 8ee0a9c..2da4459 100644 --- a/inst/dragNdrop/QC-dragdrop/_internal/compute_QC_report.R +++ b/inst/dragNdrop/QC-dragdrop/_internal/compute_QC_report.R @@ -46,7 +46,7 @@ use_extended_reportname = yc$getYAML("PTXQC$ReportFilename$extended", TRUE) rprt_fns = getReportFilenames(PATH_TO_TXT, use_extended_reportname) sink(rprt_fns$log_file, split = TRUE) ## log output to file -output_files = try(createReport(PATH_TO_TXT, YAML_CONFIG, rprt_fns)) +output_files = try(createReport(PATH_TO_TXT, NULL, YAML_CONFIG, rprt_fns)) sink() ## undo sink() diff --git a/inst/examples/README.md b/inst/examples/README.md index e83860c..105872f 100644 --- a/inst/examples/README.md +++ b/inst/examples/README.md @@ -41,5 +41,5 @@ See the package vignettes for documentation on how to create and customize a rep [4]: https://github.com/cbielow/PTXQC/blob/master/vignettes/PTXQC-Basic_Guide_for_R_users.Rmd [5]: 
https://github.com/cbielow/PTXQC/blob/master/vignettes/PTXQC-DragNDrop.Rmd [JPR_paper]: https://doi.org/10.1021/acs.jproteome.5b00780 - [example_html]: http://htmlpreview.github.io/?https://github.com/cbielow/PTXQC/blob/master/inst/examples/report_v0.92.2__txt_5files_withMatch-100min.html - [example_pdf]: report_v0.92.2__txt_5files_withMatch-100min.pdf \ No newline at end of file + [example_html]: http://htmlpreview.github.io/?https://github.com/cbielow/PTXQC/blob/master/inst/examples/report_v1.0.0__txt_5files_withMatch-100min.html + [example_pdf]: report_v1.0.0__txt_5files_withMatch-100min.pdf \ No newline at end of file diff --git a/inst/examples/report_v0.92.2__txt_5files_withMatch-100min.html b/inst/examples/report_v0.92.2__txt_5files_withMatch-100min.html deleted file mode 100644 index bc970c9..0000000 --- a/inst/examples/report_v0.92.2__txt_5files_withMatch-100min.html +++ /dev/null @@ -1,1242 +0,0 @@ - - - - - - - - - - - - - -PTXQC Quality Report - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

- - - - - - - - - - - - - - - - - -

-
-

1 Overview

-

Quick guide

- -
-

1.1 HeatMap

-

-
-
-

1.2 Name Mapping

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-Mapping of Raw files to their short names Mapping source: automatic (automatic shortening of names was not sufficient - see ‘best effort’) -
-
-from - -to - -best.effort -
-Toni_20140521_GM_QC_01 - -file 1 - -..521_GM_QC_01 -
-Toni_20140521_GM_QC_02 - -file 2 - -..521_GM_QC_02 -
-Toni_20140522_GM_QC_01 - -file 3 - -..522_GM_QC_01 -
-Toni_20140531_FB_QC_02 - -file 4 - -..531_FB_QC_02 -
-Toni_20140608_FB_qc_01 - -file 5 - -..608_FB_qc_01 -
-
-
-

1.3 Metrics

-
-

1.3.1 PAR: parameters

-
-
-↓ Show Help -
-
-MaxQuant parameters, extracted from parameters.txt (abbreviated as ‘PAR’), summarizes the settings used for the MaxQuant analysis. Key parameters are MaxQuant version, Re-quantify, Match-between-runs and mass search tolerances. A list of protein database files is also provided, allowing to track database completeness and database version information (if given in the filename). -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-uniprot_human_canonical_and_isoforms_20130513.fasta -
-
-parameter - -value - -parameter - -value -
-Advanced ratios - -False - -MS/MS deisotoping (ITMS) - -False -
-Alignment time window [min] - -100 - -MS/MS deisotoping (TOF) - -False -
-Cut peaks - -True - -MS/MS deisotoping (Unknown) - -False -
-Decoy mode - -revert - -MS/MS recalibration - -False -
-Discard unmodified counterpa.. - -True - -MS/MS tol. (FTMS) - -20 ppm -
-Find dependent peptides - -False - -MS/MS tol. (ITMS) - -0.5 Da -
-First pass AIF correlation - -0.8 - -MS/MS tol. (TOF) - -0.1 Da -
-Fixed modifications - -Carbamidomethyl (C) - -MS/MS tol. (Unknown) - -0.5 Da -
-iBAQ - -False - -Peptides used for protein qu.. - -Razor -
-iBAQ log fit - -False - -Protein FDR - -0.01 -
-Include contaminants - -True - -PSM FDR - -0.01 -
-Labeled amino acid filtering - -True - -Re-quantify - -True -
-Match between runs - -True - -RT shift - -False -
-Matching time window [min] - -1 - -Site FDR - -0.01 -
-Min. delta score for modifie.. - -17 - -Site quantification - -Use least modified peptide -
-Min. delta score for unmodif.. - -0 - -Site tables - -Oxidation (M)Sites.txt -
-Min. peptide Length - -7 - -Special AAs - -KR -
-Min. peptides - -1 - -Top MS/MS peaks per 100 Da. .. - -12 -
-Min. ratio count - -2 - -Top MS/MS peaks per 100 Da. .. - -8 -
-Min. razor peptides - -1 - -Top MS/MS peaks per 100 Da. .. - -10 -
-Min. score for modified pept.. - -40 - -Top MS/MS peaks per 100 Da. .. - -10 -
-Min. score for unmodified pe.. - -0 - -Use delta score - -False -
-Min. unique peptides - -0 - -Use Normalized Ratios For Oc.. - -True -
-Modifications included in pr.. - -Acetyl (Protein N-term) Oxidation (M) - -Use only unmodified peptides.. - -True -
-MS/MS deisotoping (FTMS) - -True - -Version - -1.4.1.2 -
-

-back to top -

-
-
-

1.3.2 PG: PCA of ‘raw intensity’

-


(excludes contaminants)

-
-
-↓ Show Help -
-
-

Principal components plots of experimental groups (as defined during MaxQuant configuration).

-

This plot is shown only if more than one experimental group was defined. If LFQ was activated in MaxQuant, an additional PCA plot for LFQ intensities is shown. Similarly, if iTRAQ/TMT reporter intensities are detected.

-

Since experimental groups and Raw files do not necessarily correspond 1:1, this plot cannot use the abbreviated Raw file names, but instead must rely on automatic shortening of group names.

-

Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to Raw files)

-
-
-


-
-
-

1.3.3 PG: PCA of ‘lfq intensity’

-


(excludes contaminants)

-


-

-back to top -

-
-
-

1.3.4 EVD: Top5 Contaminants per Raw file

-
-
-↓ Show Help -
-
-

PTXQC will explicitly show the five most abundant external protein contaminants (as detected via MaxQuant’s contaminants FASTA file) by Raw file, and summarize the remaining contaminants as ‘other’. This allows you to track down exactly which proteins contaminate your sample. Low contamination is obviously better. The ‘Abundance class’ models the average peptide intensity in each Raw file and is visualized using varying degrees of transparency. It is not unusual to see samples with low sample content to have higher contamination. If you see only one abundance class (‘mid’), this means all your Raw files have roughly the same peptide intensity distribution.

-

Heatmap score [EVD: Contaminants]: as fraction of summed intensity with 0 = sample full of contaminants; 1 = no contaminants

-
-
-


-

-back to top -

-
-
-

1.3.5 EVD: Contaminants

-
-
-↓ Show Help -
-
-

User defined contaminant plot based on peptide intensities and counts. Usually used for Mycoplasma detection, but can be used for an arbitrary (set of) proteins.

-

All proteins (and their peptides) which contain the search string from the YAML file are considered contaminants. The contaminant’s search string is searched in the full FASTA header in proteinGroups.txt. If proteinGroups.txt is not available/found, only protein identifiers can be considered. The search realm used is given in the plot subtitle. You should choose the contaminant name to be distinctive. Only peptides belonging to a single protein group are considered when computing the fractions (contaminant vs. all), since peptides shared across multiple groups are potentially false positives.

-

Two abundance measures are computed per Raw file:

-
    -
  • fraction of contaminant intensity (used for scoring of the metric)
  • -
  • fraction of contaminant spectral counts (as comparison; both should be similar)
  • -
-

If the intensity fraction exceeds the threshold (indicated by the dashed horizontal line) a contamination is assumed.

-

For each Raw file exceeding the threshold an additional plot giving cumulative Andromeda peptide score distributions is shown. This allows you to decide if the contamination is true. Contaminant scores should be equally high (or higher), i.e. to the right, compared to the sample scores. Each graph’s subtitle is augmented with a p-value of the Kolmogorov-Smirnov test of this data (Andromeda scores of contaminant peptides vs. sample peptides). If the p-value is high, there is no score difference between the two peptide populations. In particular, the contaminant peptides are not bad-scoring, random hits. These p-values are also shown in the first figure for each Raw file. Note that the p-value is purely based on Andromeda scores and is independent of intensity or spectral counts.

-

Heatmap score [EVD: Contaminant ]: boolean score, i.e. 0% (fail) if the intensity threshold was exceeded; otherwise 100% (pass).

-
-
-


-

-back to top -

-
-
-

1.3.6 EVD: peptide intensity distribution

-


RSD 3.2% (expected < 5%)

-
-
-↓ Show Help -
-
-

Peptide precursor intensity per Raw file from evidence.txt. Low peptide intensity usually goes hand in hand with low MS/MS identification rates and unfavourable signal/noise ratios, which makes signal detection harder. Also instrument acquisition time increases for trapping instruments.

-

Failing to reach the intensity threshold is usually due to unfavorable column conditions, inadequate column loading or ionization issues. If the study is not a dilution series or pulsed SILAC experiment, we would expect every condition to have about the same median log-intensity (of 2^23.0). The relative standard deviation (RSD) gives an indication about reproducibility across files and should be below 5%.

-

Depending on your setup, your target thresholds might vary from PTXQC’s defaults. Change the threshold using the YAML configuration file.

-

Heatmap score [EVD: Pep Intensity (>23.0)]: Linear scale of the median intensity reaching the threshold, i.e. reaching 2^21 of 2^23 gives score 0.25.

-
-
-


-

-back to top -

-
-
-

1.3.7 PG: intensity distribution

-


RSD 3% (w/o zero int.; expected < 5%)3.2% [high RSD –> few peptides])

-
-
-↓ Show Help -
-
-

Intensity boxplots by experimental groups. Groups are user-defined during MaxQuant configuration. This plot displays a (customizable) threshold line for the desired mean intensity of proteins. Groups which underperform here, are likely to also suffer from a worse MS/MS id rate and higher contamination due to the lack of total protein loaded/detected. If possible, all groups should show a high and consistent amount of total protein. The height of the bar correlates to the number of proteins with non-zero abundance.

-

Contaminants are shown as overlayed yellow boxes, whose height corresponds to the number of contaminant proteins. The position of the box gives the intensity distribution of the contaminants.

-

Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to Raw files)

-
-
-


-

-back to top -

-
-
-

1.3.8 PG: LFQ intensity distribution

-


RSD 1.7% (w/o zero int.; expected < 5%)0.5% [high RSD –> few peptides])

-
-
-↓ Show Help -
-
-

Label-free quantification (LFQ) intensity boxplots by experimental groups. Groups are user-defined during MaxQuant configuration. This plot displays a (customizable) threshold line for the desired mean of LFQ intensity of proteins. Raw files which underperform in Raw intensity, are likely to show an increased mean here, since only high-abundance proteins are recovered and quantifiable by MaxQuant in this Raw file. The remaining proteins are likely to receive an LFQ value of 0 (i.e. do not contribute to the distribution). The height of the bar correlates to the number of proteins with non-zero abundance.

-

Contaminants are shown as overlayed yellow boxes, whose height corresponds to the number of contaminant proteins. The position of the box gives the intensity distribution of the contaminants.

-

Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to Raw files)

-
-
-


-

-back to top -

-
-
-

1.3.9 MSMS: Missed cleavages per Raw file

-


(excludes contaminants)

-
-
-↓ Show Help -
-
-

Under optimal digestion conditions (high enzyme grade etc.), only few missed cleavages (MC) are expected. In general, increased MC counts also increase the number of peptide signals, thus cluttering the available space and potentially provoking overlapping peptide signals, biasing peptide quantification. Thus, low MC counts should be favored. Interestingly, it has been shown recently that incorporation of peptides with missed cleavages does not negatively influence protein quantification (see http://pubs.acs.org/doi/abs/10.1021/pr500294d ). However this is true only if all samples show the same degree of digestion. High missed cleavage values can indicate for example, either a) failed digestion, b) a high (post-digestion) protein contamination, or c) a sample with high amounts of unspecifically degraded peptides which are not digested by trypsin.

-

In the rare case that ‘no enzyme’ was specified in MaxQuant, neither scores nor plots are shown.

-

Heatmap score [MSMS: MC]: the fraction (0% - 100%) of fully cleaved peptides per Raw file

-Heatmap score [MSMS: MC Var]: each Raw file is scored for its deviation (score: MedianDist) from the ‘average’ digestion state of the current study. -
-
-


-

-back to top -

-
-
-

1.3.10 EVD: charge distribution

-
-
-↓ Show Help -
-
-

Charge distribution per Raw file. For tryptic digests, peptides of charge 2 (one N-terminal and one at tryptic C-terminal R or K residue) should be dominant. Ionization issues (voltage?), in-source fragmentation, missed cleavages and buffer irregularities can cause a shift (see http://onlinelibrary.wiley.com/doi/10.1002/mas.21544/abstract ). The charge distribution should be similar across Raw files. Consistent charge distribution is paramount for comparable 3D-peak intensities across samples.

-

Heatmap score [EVD: Charge]: Deviation of the charge 2 proportion from a representative Raw file (‘qualMedianDist’ function).

-
-
-


-

-back to top -

-
-
-

1.3.11 PG: Contaminant per condition

-
-
-↓ Show Help -
-
-

External protein contamination should be controlled for, therefore MaxQuant ships with a comprehensive, yet customizable protein contamination database, which is searched by MaxQuant by default. PTXQC generates a contamination plot derived from the proteinGroups (PG) table showing the fraction of total protein intensity attributable to contaminants. The plot employs transparency to discern differences in the group-wise summed protein abundance. This allows to delineate a high contamination in high complexity samples from a high contamination in low complexity samples (e.g. from in-gel digestion). If you see only one abundance class (‘mid’), this means all your groups have roughly the same summed protein intensity. Note that this plot is based on experimental groups, and therefore may not correspond 1:1 to Raw files.

-

Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to Raw files)

-
-
-


-

-back to top -

-
-
-

1.3.12 MSMSscans: TopN

-
-
-↓ Show Help -
-
-

Reaching TopN on a regular basis indicates that all sections of the LC gradient deliver a sufficient number of peptides to keep the instrument busy. This metric somewhat summarizes ‘TopN over RT’.

-

Heatmap score [MS2 Scans: TopN high]: rewards if TopN was reached on a regular basis (function qualHighest)

-
-
-


-

-back to top -

-
-
-

1.3.13 MSMSscans: TopN over RT

-
-
-↓ Show Help -
-
-

TopN over retention time. Similar to ID over RT, this metric reflects the complexity of the sample at any point in time. Ideally complexity should be made roughly equal (constant) by choosing a proper (non-linear) LC gradient. See http://www.ncbi.nlm.nih.gov/pubmed/24700534 for details.

-

Heatmap score [MS2 Scans: TopN over RT]: Rewards uniform (function Uniform) TopN events over time.

-
-
-


-

-back to top -

-
-
-

1.3.14 EVD: IDs over RT

-
-
-↓ Show Help -
-
-

Judge column occupancy over retention time. Ideally, the LC gradient is chosen such that the number of identifications (here, after FDR filtering) is uniform over time, to ensure consistent instrument duty cycles. Sharp peaks and uneven distribution of identifications over time indicate potential for LC gradient optimization. See http://www.ncbi.nlm.nih.gov/pubmed/24700534 for details.

-

Heatmap score [EVD: ID rate over RT]: Scored using ‘Uniform’ scoring function, i.e. constant receives good score, extreme shapes are bad.

-
-
-


-

-back to top -

-
-
-

1.3.15 EVD: Peak width over RT

-
-
-↓ Show Help -
-
-

One parameter of optimal and reproducible chromatographic separation is the distribution of widths of peptide elution peaks, derived from the evidence table. Ideally, all Raw files show a similar distribution, e.g. to allow for equal conditions during dynamic precursor exclusion, RT alignment or peptide quantification.

-

Heatmap score [EVD: RT Peak Width]: Scored using BestKS function, i.e. the D statistic of a Kolmogorov-Smirnov test.

-
-
-


-

-back to top -

-
-
-

1.3.16 EVD: MBR - alignment

-


alignment reference: Toni_20140521_GM_QC_01

-
-
-↓ Show Help -
-
-

MBR Alignment: First of two steps (1=align, 2=transfer) during Match-between-runs. This plot is based purely on real MS/MS ids. Ideally, RTs of identical peptides should be equal (i.e. very small residual RT delta) across Raw files after alignment.

-

MaxQuants RT correction is shown in blue – it should be well within the alignment search window (20min by default) set during MaxQuant configuration. The resulting residual RT delta after RT alignment (compared to a reference Raw file), is shown as green/red dots. One dot represents one peptide (incl. charge). Every dot (peptide) outside an allowed residual delta RT (1min by default) is colored red. All others are green.

-

If moving ‘red’ dots to the horizontal zero-line (to make them green) requires large RT shifts, then increasing the alignment search window might help MaxQuant to find a better alignment.

-

Heatmap score [EVD: MBR Align]: fraction of ‘green’ vs. ‘green+red’ peptides.

-
-
-


-

-back to top -

-
-
-

1.3.17 EVD: MBR - ID Transfer

-
-
-↓ Show Help -
-
-

MBR Transfer: Last of two steps (1=align, 2=transfer) during Match-between-runs. If MaxQuant only transfers peptide ID’s which are not present in the target file, then each Raw file should not have any duplicates of identical peptides (incl. charge). Sometimes, a single or split 3D-peak gets annotated multiple times, that’s ok. However, the same peptide should not be annotated twice (or more) at vastly different points in RT.

-

This plot shows three columns: - left: the ‘genuine’ situation (pretending that no MBR was computed) - middle: looking only at transferred IDs - right: combined picture (a mixture of left+middle, usually)

-

Each peptide falls into three categories (the colors): - single (good, because it has either one genuine OR a transferred ID). - in-group (also good, because all ID’s are very close in RT) - out-group (bad, spread across the RT gradient – should not be possible; a false ID)

-

Heatmap score [EVD: MBR ID-Transfer]: The fraction of non-out-group peptides (i.e. good peptides) in the middle column. This score is ‘pessimistic’ because if few ID’s were transferred, but all of them are bad, the score is bad, even though the majority of peptides is still ok (because they are genuine). However, in this case MBR provides few (and wrong) additional information, and should be disabled.

-
-
-


-

-back to top -

-
-
-

1.3.18 [experimental] EVD: Clustering Tree of Raw files

-


by Correlation of Corrected Retention Times

-
-
-↓ Show Help -
-
-

Auxiliary plots – experimental – without scores.

-

Heatmap score: none.

-
-
-


-
-
-

1.3.19 EVD: Peptides inferred by MBR

-


-

-back to top -

-
-
-

1.3.20 MSMSscans: Ion Injection Time over RT

-
-
-↓ Show Help -
-
-

Ion injection time score - should be as low as possible to allow fast cycles. Correlated with peptide intensity. Note that this threshold needs customization depending on the instrument used (e.g., ITMS vs. FTMS).

-

Heatmap score [MS2 Scans: Ion Inj Time]: Linear score as fraction of MS/MS below the threshold.

-
-
-


-

-back to top -

-
-
-

1.3.21 [experimental] MSMSscans: MS/MS intensity

-
-
-↓ Show Help -
-
-

MS/MS identifications can be ‘bad’ for a couple of reasons. It could be computational, i.e. ID rates are low because you specified the wrong protein database or modifications (not our concern here). Another reason is low/missing signals for fragment ions, e.g. due to bad (quadrupole/optics) ion transmission (charging effects), too small isolation windows, etc.

-

Hence, we plot the TIC and base peak intensity of all MS/MS scans (incl. unidentified ones) per Raw file. Depending on the setup, these intensities can vary, but telling apart good from bad samples should never be a problem. If you only have bad samples, you need to know the intensity a good sample would reach.

-

To automatically score this, we found that the TIC should be 10-100x larger than the base peak, i.e. there should be many other ions which are roughly as high (a good fragmentation ladder). If there are only a few spurious peaks (bad MS/MS), the TIC is much lower. Thus, we score the ratio TIC / BP: if TIC > BP * 10, this gives a 100% score. If it’s only TIC < BP * 3, we say this MS/MS failed (0%). Anything between 3x and 10x gets a score in between. The score for the Raw file is computed as the median score across all its MS/MS scans.

-Heatmap score [MS2 Scans: Intensity]: Linear score (0-100%) between 3 < (TIC / BP) < 10. -
-
-


-

-back to top -

-
-
-

1.3.22 EVD: Oversampling (MS/MS counts per 3D-peak)

-
-
-↓ Show Help -
-
-

An oversampled 3D-peak is defined as a peak whose peptide ion (same sequence and same charge state) was identified by at least two distinct MS2 spectra in the same Raw file. For high complexity samples, oversampling of individual 3D-peaks automatically leads to undersampling or even omission of other 3D-peaks, reducing the number of identified peptides. Oversampling occurs in low-complexity samples or long LC gradients, as well as undersized dynamic exclusion windows for data independent acquisitions.

-

Heatmap score [EVD: MS2 Oversampling]: The percentage of non-oversampled 3D-peaks.

-
-
-


-

-back to top -

-
-
-

1.3.23 EVD: Uncalibrated mass error

-
-
-↓ Show Help -
-
-

Mass accuracy before calibration. Outliers are marked as such (‘out-of-search-tol’) using ID rate and standard deviation as additional information (if available). If any Raw file is flagged ‘failed’, increasing MaxQuant’s first-search tolerance (20ppm by default, here: 20.0 ppm) might help to enable successful recalibration. A bug in MaxQuant sometimes leads to excessively high ppm mass errors (>10^4) reported in the output data. However, this can sometimes be corrected for by re-computing the delta mass error from other data. If this is the case, a warning (‘bugfix applied’) will be shown.

-

Heatmap score [EVD: MS Cal Pre (20.0)]: the centeredness (function CenteredRef) of uncalibrated masses in relation to the search window size.

-
-
-


-

-back to top -

-
-
-

1.3.24 EVD: Calibrated mass error

-
-
-↓ Show Help -
-
-

Precursor mass accuracy after calibration. Failed samples from precalibration data are still marked here. Ppm errors should be centered on zero and their spread is expected to be significantly smaller than before calibration.

-

Heatmap score [EVD: MS Cal-Post]: The variance and centeredness around zero of the calibrated distribution (function GaussDev).

-
-
-


-

-back to top -

-
-
-

1.3.25 MSMS: Fragment mass errors per Raw file

-
-
-↓ Show Help -
-
-

MS/MS decalibration metric. If most of the fragments are within tighter bounds, you can reduce the fragment mass tolerance to obtain more identifications under the same FDR. On the other hand, if the fragment mass errors are not centered on zero, a recalibration of the instrument should be performed. If the (Gaussian-like) distribution is cut too severely on either side by the search tolerance window in MaxQuant, you might be able to increase the number of identifications by allowing for a wider MS/MS search window when re-running MaxQuant. However, the number of decoy identifications will increase as well, potentially offsetting any gain when FDR is applied.

-

Heatmap score [MSMS: MS2 Cal (Analyzer)]: rewards centeredness around 0 ppm/Da (function Centered).

-
-
-


-

-back to top -

-
-
-

1.3.26 SM: MS/MS identified per Raw file

-
-
-↓ Show Help -
-
-

MS/MS identification rate per Raw file from summary.txt (SM). Each Raw file is colored according to its ID rate and categorized into performance bins as ‘bad’, ‘ok’ and ‘great’. Raw files below ‘ok’, are listed separately on the next page of the report for convenient follow-up.

-

The thresholds for the bins are

-
    -
  • bad (<20%)
  • -
  • ok (20-35%)
  • -
  • great (>35%)
  • -
-Heatmap score [SM: MS2 IDrate (>35)]: reaches 1 (=100%) if the threshold for ‘great’ is reached or exceeded. -
-
-


-

-back to top -

-
-
-

1.3.27 MSMSscans: TopN % identified over N

-
-
-↓ Show Help -
-
-

Looking at the identification rates per scan event (i.e. the MS/MS scans after a survey scan) can give hints on how well scheduled precursor peaks could be fragmented and identified. If performance drops for the later MS/MS scans, then the LC peaks are probably not wide enough to deliver enough eluent or the intensity threshold to trigger the MS/MS event should be lowered (if LC peak is already over), or increased (if LC peak is still too weak to collect enough ions).

-

Heatmap score [MS2 Scans: TopN ID over N]: Rewards uniform identification performance across all scan events.

-
-
-


-

-back to top -

-
-
-

1.3.28 [experimental] EVD: Non-Missing Peptides

-


compared to all peptides seen in experiment

-
-
-↓ Show Help -
-
-

Missing peptide intensities per Raw file from evidence.txt. This metric shows the fraction of missing peptides compared to all peptides seen in the whole experiment. The more Raw files you have, the higher this fraction is going to be (because there is always going to be some exotic [low intensity?] peptide which gets [falsely] identified in only a single Raw file). A second plot shows how many peptides (Y-axis) are covered by at least X Raw files. A third plot shows the density of the observed (line) and the missing (filled area) data. To reconstruct the distribution of missing values, an imputation strategy is required, so the argument is somewhat circular here. If all Raw files are (technical) replicates, i.e. we can expect that missing peptides are indeed present and have an intensity similar to the peptides we do see, then the median is a good estimator. This method performs a global normalization across Raw files (so their observed intensity distributions have the same mean), before computing the imputed values. Afterwards, the distributions are de-normalized again (shifting them back to their original locations) – but this time with imputed peptides.

-

Peptides obtained via Match-between-run (MBR) are accounted for (i.e. are considered as present = non-missing). Thus, make sure that MBR is working as intended (see MBR metrics).

-

Warning: this metric is meaningless for fractionated data! TODO: compensate for lower scores in large studies (with many Raw files), since peptide FDR is accumulating!?

-

Heatmap score [EVD: Pep Missing]: Linear scale of the fraction of missing peptides.

-
-
-


-
-
-

1.3.29 [experimental] EVD: Non-missing by set

-


-
-
-

1.3.30 [experimental] EVD: Imputed Peptide Intensity Distribution of Missing Values

-


-

-back to top -

-
-
-

1.3.31 EVD: Peptide ID count

-


MBR gain: +33%

-
-
-↓ Show Help -
-
-

Number of unique (i.e. not counted twice) peptide sequences including modifications (after FDR) per Raw file. A configurable target threshold is indicated as dashed line.

-

If MBR was enabled, three categories (‘genuine (exclusive)’, ‘genuine + transferred’, ‘transferred (exclusive)’) are shown, so the user can judge the gain that MBR provides.
-Peptides in the ‘genuine + transferred’ category were identified within the Raw file by MS/MS, but at the same time also transferred to this Raw file using MBR. This ID transfer can be correct (e.g. in case of different charge states), or incorrect – see MBR-related metrics to tell the difference. Ideally, the ‘genuine + transferred’ category should be rather small, the other two should be large.

-

If MBR would be switched off, you can expect to see the number of peptides corresponding to ‘genuine (exclusive)’ + ‘genuine + transferred’. In general, if the MBR gain is low and the MBR scores are bad (see the two MBR-related metrics), MBR should be switched off for the Raw files which are affected (could be a few or all).

-

Heatmap score [EVD: Pep Count (>15000)]: Linear scoring from zero. Reaching or exceeding the target threshold gives a score of 100%.

-
-
-


-

-back to top -

-
-
-

1.3.32 EVD: ProteinGroups count

-


MBR gain: +14%

-
-
-↓ Show Help -
-
-

Number of Protein groups (after FDR) per Raw file. A configurable target threshold is indicated as dashed line.

-

If MBR was enabled, three categories (‘genuine (exclusive)’, ‘genuine + transferred’, ‘transferred (exclusive)’) are shown, so the user can judge the gain that MBR provides. Here, ‘transferred (exclusive)’ means that this protein group has peptide evidence which originates only from transferred peptide IDs. The quantification is (of course) always from the local Raw file. Proteins in the ‘genuine + transferred’ category have peptide evidence from within the Raw file by MS/MS, but at the same time also peptide IDs transferred to this Raw file using MBR were used. It is not unusual to see the ‘genuine + transferred’ category be rather large, since a protein group usually has peptide evidence from both sources. To see if MBR worked, it is better to look at the two MBR-related metrics.

-

If MBR would be switched off, you can expect to see the number of protein groups corresponding to ‘genuine (exclusive)’ + ‘genuine + transferred’. In general, if the MBR gain is low and the MBR scores are bad (see the two MBR-related metrics), MBR should be switched off for the Raw files which are affected (could be a few or all).

-

Heatmap score [EVD: Prot Count (>3500)]: Linear scoring from zero. Reaching or exceeding the target threshold gives a score of 100%.

-
-
-


-

-back to top -

- -
-
-
- - - - -
- - - - - - diff --git a/inst/examples/report_v0.92.2__txt_5files_withMatch-100min.pdf b/inst/examples/report_v0.92.2__txt_5files_withMatch-100min.pdf deleted file mode 100644 index ea37f96..0000000 Binary files a/inst/examples/report_v0.92.2__txt_5files_withMatch-100min.pdf and /dev/null differ diff --git a/inst/examples/report_v1.0.0__txt_5files_withMatch-100min.html b/inst/examples/report_v1.0.0__txt_5files_withMatch-100min.html new file mode 100644 index 0000000..e0c950a --- /dev/null +++ b/inst/examples/report_v1.0.0__txt_5files_withMatch-100min.html @@ -0,0 +1,1326 @@ + + + + + + + + + + + + + + +ProTeomiX (PTX) Quality Control (QC) Report + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + +

+
+

1 Overview

+

Quick guide

+ +
+

1.1 HeatMap

+

+
+
+

1.2 Name Mapping

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+Mapping of Raw files to their short names Mapping source: file (user-defined) (automatic shortening of names was not sufficient - see ‘best effort’) +
+
+from + +to + +best.effort +
+Toni_20140521_GM_QC_01 + +file 1 + +..521_GM_QC_0.._01 +
+Toni_20140521_GM_QC_02 + +file 2 + +..521_GM_QC_0.._02 +
+Toni_20140522_GM_QC_01 + +file 3 + +..522_GM_QC_0.._01 +
+Toni_20140531_FB_QC_02 + +file 4 + +..531_FB_QC_0.._02 +
+Toni_20140608_FB_qc_01 + +file 5 + +..608_FB_qc_0.._01 +
+
+
+

1.3 Metrics

+
+

1.3.1 PAR: parameters

+
+
+↓ Show Help +
+
+MaxQuant parameters, extracted from parameters.txt (abbreviated as ‘PAR’), summarizes the settings used for the MaxQuant analysis. Key parameters are MaxQuant version, Re-quantify, Match-between-runs and mass search tolerances. A list of protein database files is also provided, allowing to track database completeness and database version information (if given in the filename). +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+uniprot_human_canonical_and_isoforms_20130513.fasta +
+
+parameter + +value + +parameter + +value +
+Advanced ratios + +False + +MS/MS deisotoping (ITMS) + +False +
+Alignment time window [min] + +100 + +MS/MS deisotoping (TOF) + +False +
+Cut peaks + +True + +MS/MS deisotoping (Unknown) + +False +
+Decoy mode + +revert + +MS/MS recalibration + +False +
+Discard unmodified counterpa.. + +True + +MS/MS tol. (FTMS) + +20 ppm +
+Find dependent peptides + +False + +MS/MS tol. (ITMS) + +0.5 Da +
+First pass AIF correlation + +0.8 + +MS/MS tol. (TOF) + +0.1 Da +
+Fixed modifications + +Carbamidomethyl (C) + +MS/MS tol. (Unknown) + +0.5 Da +
+iBAQ + +False + +Peptides used for protein qu.. + +Razor +
+iBAQ log fit + +False + +Protein FDR + +0.01 +
+Include contaminants + +True + +PSM FDR + +0.01 +
+Labeled amino acid filtering + +True + +Re-quantify + +True +
+Match between runs + +True + +RT shift + +False +
+Matching time window [min] + +1 + +Site FDR + +0.01 +
+Min. delta score for modifie.. + +17 + +Site quantification + +Use least modified peptide +
+Min. delta score for unmodif.. + +0 + +Site tables + +Oxidation (M)Sites.txt +
+Min. peptide Length + +7 + +Special AAs + +KR +
+Min. peptides + +1 + +Top MS/MS peaks per 100 Da. .. + +12 +
+Min. ratio count + +2 + +Top MS/MS peaks per 100 Da. .. + +8 +
+Min. razor peptides + +1 + +Top MS/MS peaks per 100 Da. .. + +10 +
+Min. score for modified pept.. + +40 + +Top MS/MS peaks per 100 Da. .. + +10 +
+Min. score for unmodified pe.. + +0 + +Use delta score + +False +
+Min. unique peptides + +0 + +Use Normalized Ratios For Oc.. + +True +
+Modifications included in pr.. + +Acetyl (Protein N-term) Oxidation (M) + +Use only unmodified peptides.. + +True +
+MS/MS deisotoping (FTMS) + +True + +Version + +1.4.1.2 +
+

+back to top +

+
+
+

1.3.2 PG: PCA of ‘raw intensity’

+


(excludes contaminants)

+
+
+↓ Show Help +
+
+

Principal components plots of experimental groups (as defined during MaxQuant configuration).

+

This plot is shown only if more than one experimental group was defined. If LFQ was activated in MaxQuant, an additional PCA plot for LFQ intensities is shown. Similarly, if iTRAQ/TMT reporter intensities are detected.

+

Since experimental groups and Raw files do not necessarily correspond 1:1, this plot cannot use the abbreviated Raw file names, but instead must rely on automatic shortening of group names.

+

Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to Raw files)

+
+
+


+
+
+

1.3.3 PG: PCA of ‘lfq intensity’

+


(excludes contaminants)

+


+

+back to top +

+
+
+

1.3.4 EVD: Top5 Contaminants per Raw file

+
+
+↓ Show Help +
+
+

PTXQC will explicitly show the five most abundant external protein contaminants (as detected via MaxQuant’s contaminants FASTA file) by Raw file, and summarize the remaining contaminants as ‘other’. This allows to track down which proteins exactly contaminate your sample. Low contamination is obviously better. The ‘Abundance class’ models the average peptide intensity in each Raw file and is visualized using varying degrees of transparency. It is not unusual to see samples with low sample content to have higher contamination. If you see only one abundance class (‘mid’), this means all your Raw files have roughly the same peptide intensity distribution.

+

Heatmap score [EVD: Contaminants]: as fraction of summed intensity with 0 = sample full of contaminants; 1 = no contaminants

+
+
+


+

+back to top +

+
+
+

1.3.5 EVD: peptide intensity distribution

+


RSD 3% (expected < 5%)

+
+
+↓ Show Help +
+
+

Peptide precursor intensity per Raw file from evidence.txt WITHOUT match-between-runs evidence. Low peptide intensity usually goes hand in hand with low MS/MS identification rates and unfavourable signal/noise ratios, which makes signal detection harder. Also instrument acquisition time increases for trapping instruments.

+

Failing to reach the intensity threshold is usually due to unfavorable column conditions, inadequate column loading or ionization issues. If the study is not a dilution series or pulsed SILAC experiment, we would expect every condition to have about the same median log-intensity (of 2^23.0). The relative standard deviation (RSD) gives an indication about reproducibility across files and should be below 5%.

+

Depending on your setup, your target thresholds might vary from PTXQC’s defaults. Change the threshold using the YAML configuration file.

+

Heatmap score [EVD: Pep Intensity (>23.0)]: Linear scale of the median intensity reaching the threshold, i.e. reaching 2^21 of 2^23 gives score 0.25.

+
+
+


+

+back to top +

+
+
+

1.3.6 PG: intensity distribution

+


RSD 3% (w/o zero int.; expected < 5%)3.2% [high RSD –> few peptides])

+
+
+↓ Show Help +
+
+

Intensity boxplots by experimental groups. Groups are user-defined during MaxQuant configuration. This plot displays a (customizable) threshold line for the desired mean intensity of proteins. Groups which underperform here, are likely to also suffer from a worse MS/MS id rate and higher contamination due to the lack of total protein loaded/detected. If possible, all groups should show a high and consistent amount of total protein. The height of the bar correlates to the number of proteins with non-zero abundance.

+

Contaminants are shown as overlayed yellow boxes, whose height corresponds to the number of contaminant proteins. The position of the box gives the intensity distribution of the contaminants.

+

Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to Raw files)

+
+
+


+

+back to top +

+
+
+

1.3.7 PG: LFQ intensity distribution

+


RSD 1.7% (w/o zero int.; expected < 5%)0.5% [high RSD –> few peptides])

+
+
+↓ Show Help +
+
+

Label-free quantification (LFQ) intensity boxplots by experimental groups. Groups are user-defined during MaxQuant configuration. This plot displays a (customizable) threshold line for the desired mean of LFQ intensity of proteins. Raw files which underperform in Raw intensity, are likely to show an increased mean here, since only high-abundance proteins are recovered and quantifiable by MaxQuant in this Raw file. The remaining proteins are likely to receive an LFQ value of 0 (i.e. do not contribute to the distribution). The height of the bar correlates to the number of proteins with non-zero abundance.

+

Contaminants are shown as overlayed yellow boxes, whose height corresponds to the number of contaminant proteins. The position of the box gives the intensity distribution of the contaminants.

+

Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to Raw files)

+
+
+


+

+back to top +

+
+
+

1.3.8 MSMS: Missed cleavages per Raw file

+


(excludes contaminants)

+
+
+↓ Show Help +
+
+

Under optimal digestion conditions (high enzyme grade etc.), only few missed cleavages (MC) are expected. In general, increased MC counts also increase the number of peptide signals, thus cluttering the available space and potentially provoking overlapping peptide signals, biasing peptide quantification. Thus, low MC counts should be favored. Interestingly, it has been shown recently that incorporation of peptides with missed cleavages does not negatively influence protein quantification (see Chiva, C., Ortega, M., and Sabido, E. Influence of the Digestion Technique, Protease, and Missed Cleavage Peptides in Protein Quantitation. J. Proteome Res. 2014, 13, 3979-86 ). However this is true only if all samples show the same degree of digestion. High missed cleavage values can indicate for example, either a) failed digestion, b) a high (post-digestion) protein contamination, or c) a sample with high amounts of unspecifically degraded peptides which are not digested by trypsin.

+

If MC>=1 is high (>20%) you should increase the missed cleavages settings in MaxQuant and compare the number of peptides. Usually high MC correlates with bad identification rates, since many spectra cannot be matched to the forward database.

+

In the rare case that ‘no enzyme’ was specified in MaxQuant, neither scores nor plots are shown.

+

Heatmap score [MSMS: MC]: the fraction (0% - 100%) of fully cleaved peptides per Raw file

+Heatmap score [MSMS: MC Var]: each Raw file is scored for its deviation (score: MedianDist) from the ‘average’ digestion state of the current study. +
+
+


+

+back to top +

+
+
+

1.3.9 EVD: charge distribution

+
+
+↓ Show Help +
+
+

Charge distribution per Raw file. For tryptic digests, peptides of charge 2 (one N-terminal and one at tryptic C-terminal R or K residue) should be dominant. Ionization issues (voltage?), in-source fragmentation, missed cleavages and buffer irregularities can cause a shift (see Bittremieux 2017, DOI: 10.1002/mas.21544). The charge distribution should be similar across Raw files. Consistent charge distribution is paramount for comparable 3D-peak intensities across samples.

+

Heatmap score [EVD: Charge]: Deviation of the charge 2 proportion from a representative Raw file (‘qualMedianDist’ function).

+
+
+


+

+back to top +

+
+
+

1.3.10 PG: Contaminant per condition

+
+
+↓ Show Help +
+
+

External protein contamination should be controlled for, therefore MaxQuant ships with a comprehensive, yet customizable protein contamination database, which is searched by MaxQuant by default. PTXQC generates a contamination plot derived from the proteinGroups (PG) table showing the fraction of total protein intensity attributable to contaminants. The plot employs transparency to discern differences in the group-wise summed protein abundance. This allows one to delineate a high contamination in high complexity samples from a high contamination in low complexity samples (e.g. from in-gel digestion). If you see only one abundance class (‘mid’), this means all your groups have roughly the same summed protein intensity. Note that this plot is based on experimental groups, and therefore may not correspond 1:1 to Raw files.

+

Heatmap score: none (since data source proteinGroups.txt is not related 1:1 to Raw files)

+
+
+


+

+back to top +

+
+
+

1.3.11 MSMSscans: TopN

+
+
+↓ Show Help +
+
+

Reaching TopN on a regular basis indicates that all sections of the LC gradient deliver a sufficient number of peptides to keep the instrument busy. This metric somewhat summarizes ‘TopN over RT’.

+

Heatmap score [MS2 Scans: TopN high]: rewards if TopN was reached on a regular basis (function qualHighest)

+
+
+


+

+back to top +

+
+
+

1.3.12 MSMSscans: TopN over RT

+
+
+↓ Show Help +
+
+

TopN over retention time. Similar to ID over RT, this metric reflects the complexity of the sample at any point in time. Ideally complexity should be made roughly equal (constant) by choosing a proper (non-linear) LC gradient. See Moruz 2014, DOI: 10.1002/pmic.201400036 for details.

+

Heatmap score [MS2 Scans: TopN over RT]: Rewards uniform (function Uniform) TopN events over time.

+
+
+


+

+back to top +

+
+
+

1.3.13 EVD: IDs over RT

+
+
+↓ Show Help +
+
+

Judge column occupancy over retention time. Ideally, the LC gradient is chosen such that the number of identifications (here, after FDR filtering) is uniform over time, to ensure consistent instrument duty cycles. Sharp peaks and uneven distribution of identifications over time indicate potential for LC gradient optimization. See Moruz 2014, DOI: 10.1002/pmic.201400036 for details.

+

Heatmap score [EVD: ID rate over RT]: Scored using ‘Uniform’ scoring function, i.e. constant receives good score, extreme shapes are bad.

+
+
+


+

+back to top +

+
+
+

1.3.14 EVD: Peak width over RT

+
+
+↓ Show Help +
+
+

One parameter of optimal and reproducible chromatographic separation is the distribution of widths of peptide elution peaks, derived from the evidence table. Ideally, all Raw files show a similar distribution, e.g. to allow for equal conditions during dynamic precursor exclusion, RT alignment or peptide quantification.

+

Heatmap score [EVD: RT Peak Width]: Scored using BestKS function, i.e. the D statistic of a Kolmogorov-Smirnov test.

+
+
+


+

+back to top +

+
+
+

1.3.15 EVD: MBR - alignment

+


alignment reference: Toni_20140521_GM_QC_01

+
+
+↓ Show Help +
+
+

MBR Alignment: First of two steps (1=align, 2=transfer) during Match-between-runs. This plot is based purely on real MS/MS ids. Ideally, RTs of identical peptides should be equal (i.e. very small residual RT delta) across Raw files after alignment.

+

MaxQuant’s RT correction is shown in blue – it should be well within the alignment search window (20min by default) set during MaxQuant configuration. The resulting residual RT delta after RT alignment (compared to a reference Raw file), is shown as green/red dots. One dot represents one peptide (incl. charge). Every dot (peptide) outside an allowed residual delta RT (1min by default) is colored red. All others are green.

+

If moving ‘red’ dots to the horizontal zero-line (to make them green) requires large RT shifts, then increasing the alignment search window might help MaxQuant to find a better alignment.

+

Heatmap score [EVD: MBR Align]: fraction of ‘green’ vs. ‘green+red’ peptides.

+
+
+


+

+back to top +

+
+
+

1.3.16 EVD: MBR - ID Transfer

+
+
+↓ Show Help +
+
+

MBR Transfer: Last of two steps (1=align, 2=transfer) during Match-between-runs. If MaxQuant only transfers peptide ID’s which are not present in the target file, then each Raw file should not have any duplicates of identical peptides (incl. charge). Sometimes, a single or split 3D-peak gets annotated multiple times, that’s ok. However, the same peptide should not be annotated twice (or more) at vastly different points in RT.

+

This plot shows three columns: - left: the ‘genuine’ situation (pretending that no MBR was computed) - middle: looking only at transferred IDs - right: combined picture (a mixture of left+middle, usually)

+

Each peptide falls into three categories (the colors): - single (good, because it has either one genuine OR a transferred ID). - in-group (also good, because all ID’s are very close in RT) - out-group (bad, spread across the RT gradient – should not be possible; a false ID)

+

Heatmap score [EVD: MBR ID-Transfer]: The fraction of non-out-group peptides (i.e. good peptides) in the middle column. This score is ‘pessimistic’ because if few ID’s were transferred, but all of them are bad, the score is bad, even though the majority of peptides is still ok (because they are genuine). However, in this case MBR provides little (and wrong) additional information, and should be disabled.

+
+
+


+

+back to top +

+
+
+

1.3.17 [experimental] EVD: Clustering Tree of Raw files

+


by Correlation of Corrected Retention Times

+
+
+↓ Show Help +
+
+

Auxiliary plots – experimental – without scores.

+

Return a tree plot with a possible alignment tree. This allows the user to judge which Raw files have similar corrected RT’s (i.e. were aligned successfully). If there are clear sub-clusters, it might be worth introducing artificial fractions into MaxQuant, to avoid ID-transfer between these clusters (use the MBR-Align and MBR-ID-Transfer metrics to support the decision).

+

If the input contains fractions, leaf nodes will be colored accordingly. Distinct sub-clusters should have their own color. If not, MaxQuant’s fraction settings should be optimized. Note that introducing fractions in MaxQuant will naturally lead to a clustering here (it’s somewhat circular).

+

Heatmap score: none.

+
+
+


+

+back to top +

+
+
+

1.3.18 MSMSscans: Ion Injection Time over RT

+
+
+↓ Show Help +
+
+

Ion injection time score - should be as low as possible to allow fast cycles. Correlated with peptide intensity. Note that this threshold needs customization depending on the instrument used (e.g., ITMS vs. FTMS).

+

Heatmap score [MS2 Scans: Ion Inj Time]: Linear score as fraction of MS/MS below the threshold.

+
+
+


+

+back to top +

+
+
+

1.3.19 [experimental] MSMSscans: MS/MS intensity

+
+
+↓ Show Help +
+
+

MS/MS identifications can be ‘bad’ for a couple of reasons. It could be computational, i.e. ID rates are low because you specified the wrong protein database or modifications (not our concern here). Another reason is low/missing signals for fragment ions, e.g. due to bad (quadrupole/optics) ion transmission (charging effects), too small isolation windows, etc.

+

Hence, we plot the TIC and base peak intensity of all MS/MS scans (incl. unidentified ones) per Raw file. Depending on the setup, these intensities can vary, but telling apart good from bad samples should never be a problem. If you only have bad samples, you need to know the intensity a good sample would reach.

+

To automatically score this, we found that the TIC should be 10-100x larger than the base peak (BP), i.e. there should be many other ions which are roughly as high (a good fragmentation ladder). If there are only a few spurious peaks (bad MS/MS), the TIC is much lower. Thus, we score the ratio TIC / BP: if TIC >= BP * 10, the MS/MS scan receives a 100% score. If TIC <= BP * 3, we say this MS/MS failed (0%). Anything between 3x and 10x gets a score in between. The score for the Raw file is computed as the median score across all its MS/MS scans.

+Heatmap score [MS2 Scans: Intensity]: Linear score (0-100%) between 3 < (TIC / BP) < 10. +
+
+


+

+back to top +

+
+
+

1.3.20 EVD: Oversampling (MS/MS counts per 3D-peak)

+
+
+↓ Show Help +
+
+

An oversampled 3D-peak is defined as a peak whose peptide ion (same sequence and same charge state) was identified by at least two distinct MS2 spectra in the same Raw file. For high complexity samples, oversampling of individual 3D-peaks automatically leads to undersampling or even omission of other 3D-peaks, reducing the number of identified peptides. Oversampling occurs in low-complexity samples or long LC gradients, as well as undersized dynamic exclusion windows for data independent acquisitions.

+

Heatmap score [EVD: MS2 Oversampling]: The percentage of non-oversampled 3D-peaks.

+
+
+


+

+back to top +

+
+
+

1.3.21 EVD: Uncalibrated mass error

+
+
+↓ Show Help +
+
+

Mass accuracy before calibration. Outliers are marked as such (‘out-of-search-tol’) using ID rate and standard deviation as additional information (if available). If any Raw file is flagged ‘failed’, increasing MaxQuant’s first-search tolerance (20ppm by default, here: 20.0 ppm) might help to enable successful recalibration. A bug in MaxQuant sometimes leads to excessively high ppm mass errors (>10^4) reported in the output data. However, this can sometimes be corrected for by re-computing the delta mass error from other data. If this is the case, a warning (‘bugfix applied’) will be shown.

+

Heatmap score [EVD: MS Cal Pre (20.0)]: the centeredness (function CenteredRef) of uncalibrated masses in relation to the search window size.

+
+
+


+

+back to top +

+
+
+

1.3.22 EVD: Calibrated mass error

+
+
+↓ Show Help +
+
+

Precursor mass accuracy after calibration. Failed samples from precalibration data are still marked here. Ppm errors should be centered on zero and their spread is expected to be significantly smaller than before calibration.

+

Heatmap score [EVD: MS Cal-Post]: The variance and centeredness around zero of the calibrated distribution (function GaussDev).

+
+
+


+

+back to top +

+
+
+

1.3.23 MSMS: Fragment mass errors per Raw file

+
+
+↓ Show Help +
+
+

MS/MS decalibration metric. If most of the fragments are within tighter bounds, you can reduce the fragment mass tolerance to obtain more identifications under the same FDR. On the other hand, if the fragment mass errors are not centered on zero, a recalibration of the instrument should be performed. If the (Gaussian-like) distribution is cut too severely on either side by the search tolerance window in MaxQuant, you might be able to increase the number of identifications by allowing for a wider MS/MS search window when re-running MaxQuant. However, the number of decoy identifications will increase as well, potentially offsetting any gain when FDR is applied.

+

Heatmap score [MSMS: MS2 Cal (Analyzer)]: rewards centeredness around 0 ppm/Da (function Centered).

+
+
+


+

+back to top +

+
+
+

1.3.24 SM: MS/MS identified per Raw file

+
+
+↓ Show Help +
+
+

MS/MS identification rate per Raw file from summary.txt (SM). Each Raw file is colored according to its ID rate and categorized into performance bins as ‘bad’, ‘ok’ and ‘great’. Raw files below ‘ok’, are listed separately on the next page of the report for convenient follow-up.

+

The thresholds for the bins are

+
    +
  • bad (<20%)
  • +
  • ok (20-35%)
  • +
  • great (>35%)
  • +
+

Heatmap score [SM: MS2 IDrate (>35)]: reaches 1 (=100%) if the threshold for ‘great’ is reached or exceeded.

+
+
+


+

+back to top +

+
+
+

1.3.25 MSMSscans: TopN % identified over N

+
+
+↓ Show Help +
+
+

Looking at the identification rates per scan event (i.e. the MS/MS scans after a survey scan) can give hints on how well scheduled precursor peaks could be fragmented and identified. If performance drops for the later MS/MS scans, then the LC peaks are probably not wide enough to deliver enough eluent or the intensity threshold to trigger the MS/MS event should be lowered (if LC peak is already over), or increased (if LC peak is still too weak to collect enough ions).

+

Heatmap score [MS2 Scans: TopN ID over N]: Rewards uniform identification performance across all scan events.

+
+
+


+

+back to top +

+
+
+

1.3.26 [experimental] EVD: Non-Missing Peptides

+


compared to all peptides seen in experiment

+
+
+↓ Show Help +
+
+

Missing peptide intensities per Raw file from evidence.txt. This metric shows the fraction of missing peptides compared to all peptides seen in the whole experiment. The more Raw files you have, the higher this fraction is going to be (because there is always going to be some exotic [low intensity?] peptide which gets [falsely] identified in only a single Raw file). A second plot shows how many peptides (Y-axis) are covered by at least X Raw files. A third plot shows the density of the observed (line) and the missing (filled area) data. To reconstruct the distribution of missing values, an imputation strategy is required, so the argument is somewhat circular here. If all Raw files are (technical) replicates, i.e. we can expect that missing peptides are indeed present and have an intensity similar to the peptides we do see, then the median is a good estimator. This method performs a global normalization across Raw files (so their observed intensity distributions have the same mean), before computing the imputed values. Afterwards, the distributions are de-normalized again (shifting them back to their original locations) – but this time with imputed peptides.

+

Peptides obtained via Match-between-run (MBR) are accounted for (i.e. are considered as present = non-missing). Thus, make sure that MBR is working as intended (see MBR metrics).

+

Warning: this metric is meaningless for fractionated data! TODO: compensate for lower scores in large studies (with many Raw files), since peptide FDR is accumulating!?

+

Heatmap score [EVD: Pep Missing]: Linear scale of the fraction of missing peptides.

+
+
+


+
+
+

1.3.27 [experimental] EVD: Non-missing by set

+


+
+
+

1.3.28 [experimental] EVD: Imputed Peptide Intensity Distribution of Missing Values

+


+

+back to top +

+
+
+

1.3.29 EVD: Peptide ID count

+
+
+↓ Show Help +
+
+

Number of unique (i.e. not counted twice) peptide sequences including modifications (after FDR) per Raw file. A configurable target threshold is indicated as dashed line.

+

If MBR was enabled, three categories (‘genuine (exclusive)’, ‘genuine + transferred’, ‘transferred (exclusive)’) are shown, so the user can judge the gain that MBR provides.
+Peptides in the ‘genuine + transferred’ category were identified within the Raw file by MS/MS, but at the same time also transferred to this Raw file using MBR. This ID transfer can be correct (e.g. in case of different charge states), or incorrect – see MBR-related metrics to tell the difference. Ideally, the ‘genuine + transferred’ category should be rather small, the other two should be large.

+

If MBR would be switched off, you can expect to see the number of peptides corresponding to ‘genuine (exclusive)’ + ‘genuine + transferred’. In general, if the MBR gain is low and the MBR scores are bad (see the two MBR-related metrics), MBR should be switched off for the Raw files which are affected (could be a few or all).

+

Heatmap score [EVD: Pep Count (>15000)]: Linear scoring from zero. Reaching or exceeding the target threshold gives a score of 100%.

+
+
+


+

+back to top +

+
+
+

1.3.30 EVD: ProteinGroups count

+


MBR gain: +14%

+
+
+↓ Show Help +
+
+

Number of Protein groups (after FDR) per Raw file. A configurable target threshold is indicated as dashed line.

+

If MBR was enabled, three categories (‘genuine (exclusive)’, ‘genuine + transferred’, ‘transferred (exclusive)’) are shown, so the user can judge the gain that MBR provides. Here, ‘transferred (exclusive)’ means that this protein group has peptide evidence which originates only from transferred peptide IDs. The quantification is (of course) always from the local Raw file. Proteins in the ‘genuine + transferred’ category have peptide evidence from within the Raw file by MS/MS, but at the same time also peptide IDs transferred to this Raw file using MBR were used. It is not unusual to see the ‘genuine + transferred’ category be rather large, since a protein group usually has peptide evidence from both sources. To see if MBR worked, it is better to look at the two MBR-related metrics.

+

If MBR would be switched off, you can expect to see the number of protein groups corresponding to ‘genuine (exclusive)’ + ‘genuine + transferred’. In general, if the MBR gain is low and the MBR scores are bad (see the two MBR-related metrics), MBR should be switched off for the Raw files which are affected (could be a few or all).

+

Heatmap score [EVD: Prot Count (>3500)]: Linear scoring from zero. Reaching or exceeding the target threshold gives a score of 100%.

+
+
+


+

+back to top +

+
+
+

1.3.31 EVD: UpSet distinct

+
+
+↓ Show Help +
+
+

The metric shows an upSet plot based on the number of modified peptide sequences per Raw file, intersected or merged with other Raw files (see below for details).

+

If the number of Raw files is >=6, only the ‘distinct’ plot is generated (the other two are skipped for performance reasons).

+
+ + +
+

Definition: An ‘active set’ is the set of black dots in a column of the plot – as opposed to the grey dots (you’ll understand when you see it).

+

+distinct: shows the number of sequences that are present in ALL active sets. For three Raw files and active sets A and B, this would mean all sequences which occur in A and B (intersect), but not in C (setdiff).
intersection: shows the number of sequences that occur in all active sets (intersection).
union: shows the number of sequences that occur in total. For two files, these are all sequences that occur either in A or in B (union).
+

+Heatmap score [EVD: UpSet]: The proportion of sequences that the file has in common with all other files. +

+
+


+
+
+

1.3.32 EVD: UpSet intersect

+


+
+
+

1.3.33 EVD: UpSet union

+


+

+back to top +

+ +
+
+
+ + + + +
+ + + + + + + + + + + + + diff --git a/inst/examples/report_v1.0.0__txt_5files_withMatch-100min.pdf b/inst/examples/report_v1.0.0__txt_5files_withMatch-100min.pdf new file mode 100644 index 0000000..030f3d5 Binary files /dev/null and b/inst/examples/report_v1.0.0__txt_5files_withMatch-100min.pdf differ diff --git a/inst/reportTemplate/modes_UpSet.png b/inst/reportTemplate/modes_UpSet.png new file mode 100644 index 0000000..c560677 Binary files /dev/null and b/inst/reportTemplate/modes_UpSet.png differ diff --git a/man/FilenameMapper-class.Rd b/man/FilenameMapper-class.Rd new file mode 100644 index 0000000..251eaf8 --- /dev/null +++ b/man/FilenameMapper-class.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FilenameMapper.R +\docType{class} +\name{FilenameMapper-class} +\alias{FilenameMapper-class} +\alias{FilenameMapper} +\title{Make sure to call $readMappingFile(some_file) if you want to support a user-defined file mapping. + Otherwise, calls to $getShortNames() will create/augment the mapping for filenames.} +\description{ +Make sure to call $readMappingFile(some_file) if you want to support a user-defined file mapping. + Otherwise, calls to $getShortNames() will create/augment the mapping for filenames. +} +\section{Fields}{ + +\describe{ +\item{\code{raw_file_mapping}}{Data.frame with columns 'from', 'to' and maybe 'best.effort' (if shorting was unsuccessful)} + +\item{\code{mapping.creation}}{how the current mapping was obtained (user or auto)} + +\item{\code{external.mapping.file}}{Filename of user-defined mapping file; only defined if readMappingFile() was called} +}} + +\section{Methods}{ + +\describe{ +\item{\code{getShortNamesStatic(raw.files, max_len, fallbackStartNr = 1)}}{Static method: Shorten a set of Raw file names and return a data frame with the mappings. 
+ Mapping will have: $from, $to and optionally $best.effort (if shorting was unsuccessful and numbers had to be used) + \itemize{ + \item{\verb{raw.files} Vector of Raw files.} + \item{\verb{max_len} Maximal length of shortening results, before resorting to canonical names (file 1,...).} + \item{\verb{fallbackStartNr} Starting index for canonical names.} + } + \subsection{Return Value}{ data.frame with mapping.}} +}} + +\examples{ +a = FilenameMapper$new() +a$readMappingFile('filenamemapping.txt') + +} diff --git a/man/MQDataReader-cash-getInvalidLines.Rd b/man/MQDataReader-cash-getInvalidLines.Rd deleted file mode 100644 index 2a0a8fb..0000000 --- a/man/MQDataReader-cash-getInvalidLines.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/MQDataReader.R -\name{MQDataReader$getInvalidLines} -\alias{MQDataReader$getInvalidLines} -\title{Detect broken lines (e.g. due to Excel import+export)} -\value{ -Returns a vector of indices of broken (i.e. invalid) lines -} -\description{ -When editing a MQ txt file in Microsoft Excel, saving the file can cause it to be corrupted, -since Excel has a single cell content limit of 32k characters -(see http://office.microsoft.com/en-001/excel-help/excel-specifications-and-limits-HP010342495.aspx) -while MQ can easily reach 60k (e.g. in oxidation sites column). -Thus, affected cells will trigger a line break, effectively splitting one line into two (or more). -} -\details{ -If the table has an 'id' column, we can simply check the numbers are consecutive. If no 'id' column is available, -we detect line-breaks by counting the number of NA's per row and finding outliers. -The line break then must be in this line (plus the preceeding or following one). Depending on where -the break happened we can also detect both lines right away (if both have more NA's than expected). 
- -Currently, we have no good strategy to fix the problem since columns are not aligned any longer, which -leads to columns not having the class (e.g. numeric) they should have. -(thus one would need to un-do the linebreak and read the whole file again) - -[Solution to the problem: try LibreOffice 4.0.x or above -- seems not to have this limitation] -} diff --git a/man/MQDataReader-cash-getShortNames.Rd b/man/MQDataReader-cash-getShortNames.Rd deleted file mode 100644 index 40f0a2b..0000000 --- a/man/MQDataReader-cash-getShortNames.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/MQDataReader.R -\name{MQDataReader$getShortNames} -\alias{MQDataReader$getShortNames} -\title{Shorten a set of Raw file names and return a data frame with the mappings.} -\arguments{ -\item{raw.files}{Vector of Raw files} - -\item{max_len}{Maximal length of shortening results, before resorting to canonical names (file 1,...)} - -\item{fallbackStartNr}{Starting index for canonical names} -} -\value{ -data.frame with mapping -} -\description{ -Shorten a set of Raw file names and return a data frame with the mappings. -} diff --git a/man/MQDataReader-cash-new.Rd b/man/MQDataReader-cash-new.Rd deleted file mode 100644 index d5e8690..0000000 --- a/man/MQDataReader-cash-new.Rd +++ /dev/null @@ -1,10 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/MQDataReader.R -\name{MQDataReader$new} -\alias{MQDataReader$new} -\title{Constructor for class 'MQDataReader'.} -\description{ -This class is used to read MQ data tables using readMQ() while holding -the internal raw file --> short raw file name mapping (stored in a member called -'raw_file_mapping') and updating/using it every time readMQ() is called. 
-} diff --git a/man/MQDataReader-cash-plotNameMapping.Rd b/man/MQDataReader-cash-plotNameMapping.Rd deleted file mode 100644 index 0fc51b6..0000000 --- a/man/MQDataReader-cash-plotNameMapping.Rd +++ /dev/null @@ -1,13 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/MQDataReader.R -\name{MQDataReader$plotNameMapping} -\alias{MQDataReader$plotNameMapping} -\title{Plots the current mapping of Raw file names to their shortened version.} -\value{ -if mapping is available, returns a list of plots 'plots' and a Html table string 'htmlTable' ; 'NULL' otherwise. -} -\description{ -Convenience function to plot the mapping (e.g. to a PDF device for reporting). -The data frame can be accessed directly via \code{.$raw_file_mapping}. -If no mapping exists, the function prints a warning to console and returns NULL (which is safe to use in print(NULL)). -} diff --git a/man/MQDataReader-cash-readMappingFile.Rd b/man/MQDataReader-cash-readMappingFile.Rd deleted file mode 100644 index 6619282..0000000 --- a/man/MQDataReader-cash-readMappingFile.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/MQDataReader.R -\name{MQDataReader$readMappingFile} -\alias{MQDataReader$readMappingFile} -\title{Reads a mapping table of full Raw file names to shortened names.} -\arguments{ -\item{filename}{Source filename to read.} -} -\value{ -Returns \code{TRUE} if file was read, \code{FALSE} if it does not exist. -} -\description{ -The internal structure \code{raw_file_mapping} is created using this file. -If the file is missing, nothing is done. -} -\details{ -The file must have two columns named: 'orig.Name' and 'new.Name' and use Tab as separator. -I.e. -\preformatted{# This file can be used to manually substitute Raw file names within the report. -# The ordering of Raw files in the report can be changed by re-arranging the rows. 
-orig.Name new.Name -2011_05_30_ALH_OT_21_VIL_TMT_FR01 myfile A -2011_05_30_ALH_OT_22_VIL_TMT_FR02 another B -} -} diff --git a/man/MQDataReader-cash-substitute.Rd b/man/MQDataReader-cash-substitute.Rd deleted file mode 100644 index af8bb3b..0000000 --- a/man/MQDataReader-cash-substitute.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/MQDataReader.R -\name{MQDataReader$substitute} -\alias{MQDataReader$substitute} -\title{Replaces values in the mq.data member with (binary) values.} -\arguments{ -\item{colname}{Name of the column (e.g. "contaminants") in the mq.data table} - -\item{valid_entries}{Vector of values to be replaced (must contain all values expected in the column -- fails otherwise)} - -\item{replacements}{Vector of values inserted with the same length as \code{valid_entries}.} -} -\value{ -Returns \code{TRUE} if successful. -} -\description{ -Most MQ tables contain columns like 'contaminants' or 'reverse', whose values are either empty strings -or "+", which is inconvenient and can be much better represented as TRUE/FALSE. -The params \code{valid_entries} and \code{replacements} contain the matched pairs, which determine what is replaced with what. -} diff --git a/man/MQDataReader-cash-writeMappingFile.Rd b/man/MQDataReader-cash-writeMappingFile.Rd deleted file mode 100644 index 3c3a54f..0000000 --- a/man/MQDataReader-cash-writeMappingFile.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/MQDataReader.R -\name{MQDataReader$writeMappingFile} -\alias{MQDataReader$writeMappingFile} -\title{Writes a mapping table of full Raw file names to shortened names.} -\arguments{ -\item{filename}{Target filename to create.} -} -\value{ -Returns NULL. -} -\description{ -The internal structure \code{raw_file_mapping} is written to the -file specified. -File is only created if mapping exists (in .$raw_file_mapping). 
-} diff --git a/man/MQDataReader-cash-readMQ.Rd b/man/MQDataReader-class.Rd similarity index 54% rename from man/MQDataReader-cash-readMQ.Rd rename to man/MQDataReader-class.Rd index fcbf624..85121b7 100644 --- a/man/MQDataReader-cash-readMQ.Rd +++ b/man/MQDataReader-class.Rd @@ -1,12 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/MQDataReader.R -\name{MQDataReader$readMQ} -\alias{MQDataReader$readMQ} -\title{Wrapper to read a MQ txt file (e.g. proteinGroups.txt).} +\docType{class} +\name{MQDataReader-class} +\alias{MQDataReader-class} +\alias{MQDataReader} +\title{S5-RefClass to read MaxQuant .txt files} \arguments{ -\item{.}{A 'this' pointer. Use it to refer/change internal members. It's implicitly added, thus not required too call the function!} - -\item{file}{(Relative) path to a MQ txt file ()} +\item{file}{(Relative) path to a MQ txt file.} \item{filter}{Searched for "C" and "R". If present, [c]ontaminants and [r]everse hits are removed if the respective columns are present. E.g. to filter both, \code{filter = "C+R"}} @@ -14,6 +14,7 @@ E.g. to filter both, \code{filter = "C+R"}} \item{type}{Allowed values are: "pg" (proteinGroups) [default], adds abundance index columns (*AbInd*, replacing 'intensity') "sm" (summary), splits into three row subsets (raw.file, condition, total) +"ev" (evidence), will fix empty modified.sequence cells for older MQ versions (when MBR is active) Any other value will not add any special columns} \item{col_subset}{A vector of column names as read by read.delim(), e.g., spaces are replaced by dot already. @@ -37,11 +38,30 @@ zero values in LFQ columns are replaced by the following method IFF(!) the corre "impute": replace by lowest LFQ value >0 (simulating 'noise')} \item{...}{Additional parameters passed on to read.delim()} + +\item{colname}{Name of the column (e.g. 
'contaminants') in the mq.data table} + +\item{valid_entries}{Vector of values to be replaced (must contain all values expected in the column -- fails otherwise)} + +\item{replacements}{Vector of values inserted with the same length as \code{valid_entries}.} } \value{ A data.frame of the respective file + + +Replaces values in the mq.data member with (binary) values. +Most MQ tables contain columns like 'contaminants' or 'reverse', whose values are either empty strings +or "+", which is inconvenient and can be much better represented as TRUE/FALSE. +The params \code{valid_entries} and \code{replacements} contain the matched pairs, which determine what is replaced with what. + +Returns \code{TRUE} if successful. } \description{ +This class is used to read MQ data tables using \code{MQDataReader::readMQ()} while holding +the internal raw file --> short raw file name mapping (stored in a member called +'fn_map') and updating/using it every time \code{MQDataReader::readMQ()} is called. +} +\details{ Since MaxQuant changes capitalization and sometimes even column names, it seemed convenient to have a function which just reads a txt file and returns unified column names, irrespective of the MQ version. So, it unifies access to columns (e.g. by using lower case for ALL columns) and ensures columns are @@ -55,8 +75,7 @@ identically named across MQ versions: mass.deviations mass.deviations..da. basepeak.intensity base.peak.intensity } -} -\details{ + We also correct 'reporter.intensity.*' naming issues to MQ 1.6 convention, when 'reporter.intensity.not.corrected' is present. MQ 1.5 uses: reporter.intensity.X and reporter.intensity.not.corrected.X MQ 1.6 uses: reporter.intensity.X and reporter.intensity.corrected.X @@ -70,5 +89,34 @@ Example of usage: d_evd = mq$readMQ("evidence.txt", type="ev", filter="R", col_subset=c("proteins", "Retention.Length", "retention.time.calibration")) } -If the file is empty, this function stops with an error. 
+If the file is empty, this function shows a warning and returns NULL. +If the file is present but cannot be read, the program will stop. + +Wrapper to read a MQ txt file (e.g. proteinGroups.txt). } +\section{Methods}{ + +\describe{ +\item{\code{getInvalidLines()}}{Detect broken lines (e.g. due to Excel import+export) + + When editing a MQ txt file in Microsoft Excel, saving the file can cause it to be corrupted, + since Excel has a single cell content limit of 32k characters + (see http://office.microsoft.com/en-001/excel-help/excel-specifications-and-limits-HP010342495.aspx) + while MQ can easily reach 60k (e.g. in oxidation sites column). + Thus, affected cells will trigger a line break, effectively splitting one line into two (or more). + + If the table has an 'id' column, we can simply check the numbers are consecutive. If no 'id' column is available, + we detect line-breaks by counting the number of NA's per row and finding outliers. + The line break then must be in this line (plus the preceding or following one). Depending on where + the break happened we can also detect both lines right away (if both have more NA's than expected). + + Currently, we have no good strategy to fix the problem since columns are not aligned any longer, which + leads to columns not having the class (e.g. numeric) they should have. + (thus one would need to un-do the linebreak and read the whole file again) + + [Solution to the problem: try LibreOffice 4.0.x or above -- seems not to have this limitation] + + @return Returns a vector of indices of broken (i.e. 
invalid) lines + } +}} + diff --git a/man/MzTabReader-class.Rd b/man/MzTabReader-class.Rd new file mode 100644 index 0000000..00d9f90 --- /dev/null +++ b/man/MzTabReader-class.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MzTabReader.R +\docType{class} +\name{MzTabReader-class} +\alias{MzTabReader-class} +\alias{MzTabReader} +\title{Class to read an mzTab file and store the tables internally.} +\description{ +The 'sections' field is initialized after $readMzTab was called. +The 'fn_map' field should be initialized via ...$fn_map$readMappingFile(...) manually if user-defined filename mappings are desired +and is automatically updated/queried when $readMzTab is called. +} +\section{Fields}{ + +\describe{ +\item{\code{sections}}{MzTab sections as list. Valid list entries are: "MTD", "PRT", "PEP", "PSM", "SML", "filename" and "comments"} + +\item{\code{fn_map}}{FilenameMapper which can translate raw filenames into something shorter} +}} + +\section{Methods}{ + +\describe{ +\item{\code{getEvidence()}}{Basically the PSM table and additionally columns named 'raw.file' and 'fc.raw.file'.} + +\item{\code{getMSMSScans(identified_only = FALSE)}}{Basically the PSM table (partially renamed columns) and additionally two columns 'raw.file' and 'fc.raw.file'. +If identified_only is TRUE, only MS2 scans which were identified (i.e. a PSM) are returned -- this is equivalent to msms.txt in MaxQuant.} + +\item{\code{getParameters()}}{Converts internal mzTab metadata section to a two column key-value data.frame similar to MaxQuants parameters.txt.} + +\item{\code{getProteins()}}{Basically the PRT table ...} + +\item{\code{getSummary()}}{Converts internal mzTab metadata section to a data.frame with columns 'fc.raw.file', 'ms.ms.identified....' 
+similar to MaxQuants summary.txt.} + +\item{\code{renameColumns(dt, namelist)}}{Renames all columns and throws a warning if a column does not exist in the data} + +\item{\code{RTUnitCorrection(dt)}}{Convert all RT columns from seconds (OpenMS default) to minutes (MaxQuant default)} +}} + diff --git a/man/PTXQC.Rd b/man/PTXQC.Rd new file mode 100644 index 0000000..b03ee19 --- /dev/null +++ b/man/PTXQC.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PTXQC.R +\docType{package} +\name{PTXQC} +\alias{PTXQC} +\alias{PTXQC-package} +\title{PTXQC: A package for computing Quality Control (QC) metrics for Proteomics (PTX)} +\description{ +PTXQC: A package for computing Quality Control (QC) metrics for Proteomics (PTX) +} +\section{Input}{ + +Valid input data are either the files from MaxQuant's .txt folder (all versions from MaxQuant >= 1.0 upwards are supported) +or a single mzTab file. All mzTab files will work, but most metrics can be obtained from OpenMS' mzTab as produced +by the QualityControl TOPP tool (from OpenMS 2.5 onwards). +} + +\section{Important functions}{ + +The central function of this package is called \code{\link{createReport}} and it accepts either MaxQuant or mzTab data, along with +a configuration (optional). +There is a parser for mzTab \code{\link{MzTabReader}} and MaxQuant txt files \code{\link{MQDataReader}}, as well as a plethora of QC metrics +derived from a common \code{\link{qcMetric}} class and scoring functions \code{qual...}, e.g. \code{\link{qualGaussDev}}. +} + +\section{Configuration}{ + +The user can modify the behaviour of PTXQC, e.g. to enable/disable certain metrics or change scoring thresholds, via a YAML object/file. +By default a Yaml file is written automatically side-by-side to the input files upon running PTXQC for the first time on a particular input. +A custom Yaml object can be passed to the main \code{\link{createReport}} function for customization. 
+Use \code{yaml::yaml.load_file(input = 'myYAML.yaml')} to load an existing file and pass the Yaml object along. +} + +\section{Output}{ + +Either a PDF and/or Html report which contains QC plots and a description of the metrics. +} + diff --git a/man/RTalignmentTree.Rd b/man/RTalignmentTree.Rd index 813559f..6d7bbfd 100644 --- a/man/RTalignmentTree.Rd +++ b/man/RTalignmentTree.Rd @@ -4,10 +4,10 @@ \alias{RTalignmentTree} \title{Return a tree plot with a possible alignment tree.} \usage{ -RTalignmentTree(d_evd, col_fraction = c()) +RTalignmentTree(df_evd, col_fraction = c()) } \arguments{ -\item{d_evd}{Evidence table containing calibrated retention times and sequence information.} +\item{df_evd}{Evidence table containing calibrated retention times and sequence information.} \item{col_fraction}{Empty vector or 1-values vector giving the name of the fraction column (if existing)} } diff --git a/man/YAMLClass-class.Rd b/man/YAMLClass-class.Rd index 069518e..b03eeb0 100644 --- a/man/YAMLClass-class.Rd +++ b/man/YAMLClass-class.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fcn_YAML.R +% Please edit documentation in R/YAMLClass.R \docType{class} \name{YAMLClass-class} \alias{YAMLClass-class} @@ -18,7 +18,8 @@ If the param is unknown, create it with the given default value and return the d \section{Methods}{ \describe{ -\item{\code{getYAML(param_name, default)}}{Query this YAML object for a certain parameter and return its value. If it does not exist it is created with a default value.} +\item{\code{getYAML(param_name, default, min = NA, max = NA)}}{Query this YAML object for a certain parameter and return its value. If it does not exist it is created with a default value. +An optional min/max range can be specified and will be enforced if the value is known (default will be used upon violation).} \item{\code{setYAML(param_name, value)}}{Set a YAML parameter to a certain value. 
Overwrites the old value or creates a new entry if hithero unknown.} diff --git a/man/createReport.Rd b/man/createReport.Rd index adfdf30..33f803e 100644 --- a/man/createReport.Rd +++ b/man/createReport.Rd @@ -1,14 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fcn_computeQC.R +% Please edit documentation in R/createReport.R \name{createReport} \alias{createReport} \title{Create a quality control report (in PDF format).} \usage{ -createReport(txt_folder, yaml_obj = list(), report_filenames = NULL) +createReport(txt_folder = NULL, mztab_file = NULL, yaml_obj = list(), + report_filenames = NULL) } \arguments{ \item{txt_folder}{Path to txt output folder of MaxQuant (e.g. "c:/data/Hek293/txt")} +\item{mztab_file}{Alternative to 'txt_folder', you can provide a single mzTab file which contains PSM, PEP and PRT tables} + \item{yaml_obj}{A nested list object with configuration parameters for the report. Useful to switch off certain plots or skip entire sections.} @@ -23,8 +26,11 @@ This is the main function of the package and the only thing you need to call dir just interested in getting a QC report. } \details{ -You need to provide the folder name of the 'txt' output, as generated by MaxQuant and -optionally a YAML configuration object, which allows to (de)activate certain plots and holds other parameters. +You need to provide either +a) the folder name of the 'txt' output, as generated by MaxQuant +or b) an mzTab file as generated by the OpenMS QualityControl TOPP tool (other mzTab files will probably not work) + +Optionally, provide a YAML configuration object, which allows to (de)activate certain plots and holds other parameters. The yaml_obj is complex and best obtained by running this function once using the default (empty list). A full YAML configuration object will be written in the 'txt' folder you provide and can be loaded using \code{\link[yaml]{yaml.load}}. 
@@ -32,7 +38,7 @@ A full YAML configuration object will be written in the 'txt' folder you provide The PDF and the config file will be stored in the given txt folder. } \note{ -You need write access to the txt folder! +You need write access to the txt/mzTab folder! For updates, bug fixes and feedback please visit \url{http://github.com/cbielow/PTXQC}. } diff --git a/man/darken.Rd b/man/darken.Rd new file mode 100644 index 0000000..bbc4b58 --- /dev/null +++ b/man/darken.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fcn_misc.R +\name{darken} +\alias{darken} +\title{Make a color (given as name or in RGB) darker by factor x = [0 = black, 1=unchanged]} +\usage{ +darken(color, factor = 0.8) +} +\arguments{ +\item{color}{A color as understood by col2rgb} + +\item{factor}{Between 0 (make black) and 1 (leave color as is)} +} +\value{ +darkened color +} +\description{ +Make a color (given as name or in RGB) darker by factor x = [0 = black, 1=unchanged] +} diff --git a/man/getFragmentErrors.Rd b/man/getFragmentErrors.Rd index 6e51e6d..e14f193 100644 --- a/man/getFragmentErrors.Rd +++ b/man/getFragmentErrors.Rd @@ -4,10 +4,12 @@ \alias{getFragmentErrors} \title{Extract fragment mass deviation errors from a data.frame from msms.txt} \usage{ -getFragmentErrors(x) +getFragmentErrors(x, recurse = 0) } \arguments{ \item{x}{Data frame in long format with numerical expression data} + +\item{recurse}{Internal usage only. 
Leave at 0 when calling.} } \value{ Data frame with mass errors ('msErr') and their 'unit' (Da or ppm) or NULL (if no fragments were given) diff --git a/man/getPeptideCounts.Rd b/man/getPeptideCounts.Rd index afa1636..fb8e38e 100644 --- a/man/getPeptideCounts.Rd +++ b/man/getPeptideCounts.Rd @@ -5,16 +5,16 @@ \title{Extract the number of peptides observed per Raw file from an evidence table.} \usage{ -getPeptideCounts(d_evidence) +getPeptideCounts(df_evd) } \arguments{ -\item{d_evidence}{Data.frame of evidence.txt as read by MQDataReader} +\item{df_evd}{Data.frame of evidence.txt as read by MQDataReader} } \value{ Data.frame with columns 'fc.raw.file', 'counts', 'category', 'MBRgain' } \description{ -Required columns are "fc.raw.file", "modified.sequence" and "match.time.difference". +Required columns are "fc.raw.file", "modified.sequence" and "is.transferred". } \details{ If match-between-runs was enabled during the MaxQuant run, diff --git a/man/getProteinCounts.Rd b/man/getProteinCounts.Rd index cd8d141..71b704b 100644 --- a/man/getProteinCounts.Rd +++ b/man/getProteinCounts.Rd @@ -5,16 +5,16 @@ \title{Extract the number of protein groups observed per Raw file from an evidence table.} \usage{ -getProteinCounts(d_evidence) +getProteinCounts(df_evd) } \arguments{ -\item{d_evidence}{Data.frame of evidence.txt as read by MQDataReader} +\item{df_evd}{Data.frame of evidence.txt as read by MQDataReader} } \value{ Data.frame with columns 'fc.raw.file', 'counts', 'category', 'MBRgain' } \description{ -Required columns are "protein.group.ids", "fc.raw.file" and "match.time.difference". +Required columns are "protein.group.ids", "fc.raw.file" and "is.transferred". 
} \details{ If match-between-runs was enabled during the MaxQuant run, diff --git a/man/getReportFilenames.Rd b/man/getReportFilenames.Rd index 33bbe93..027fc29 100644 --- a/man/getReportFilenames.Rd +++ b/man/getReportFilenames.Rd @@ -4,16 +4,22 @@ \alias{getReportFilenames} \title{Assembles a list of output file names, which will be created during reporting.} \usage{ -getReportFilenames(txt_folder, report_name_has_folder = TRUE) +getReportFilenames(folder, report_name_has_folder = TRUE, + mzTab_filename = NULL) } \arguments{ -\item{txt_folder}{Directory where the MaxQuant output resides} +\item{folder}{Directory where the MaxQuant output (txt folder) or the mzTab file resides} \item{report_name_has_folder}{Boolean: Should the report files (html, pdf) contain the name of the deepest(=last) subdirectory in 'txt_folder' which is not 'txt'? Useful for discerning different reports in a PDF viewer. E.g. when flag is FALSE: 'report_v0.91.0.html'; and 'report_v0.91.0_bloodStudy.html' when flag is TRUE (and the txt folder is '.../bloodStudy/txt/' or '...bloodStudy/', i.e. './txt/' will be skipped over)} + +\item{mzTab_filename}{If input is an mzTab, specify its name, so that the filenames can use its basename as infix +E.g. when mzTab_filename = 'HEK293-study.mzTab' then the output will be + report_HEK293-study.html. +This allows to get reports on multiple mzTabs in the same folder without overwriting report results.} } \value{ List of output file names (just names, no file is created) @@ -21,5 +27,8 @@ List of output file names (just names, no file is created) yaml_file, heatmap_values_file, R_plots_file, filename_sorting, stats_file, log_file, report_file_prefix, report_file_PDF, report_file_HTML } \description{ -Assembles a list of output file names, which will be created during reporting. 
+You can combine @p report_name_has_folder and @p mzTab_filename to obtain filenames which are even more +robust to moving around (since they contain infixes of the mzTab filename and the folder), +e.g. @em report_HEK293-study_myProjects.html, where the input + was mzTab_filename='HEK293-study.mzTab' and folder='c:/somePath/myProjects/'. } diff --git a/man/idTransferCheck.Rd b/man/idTransferCheck.Rd index 0ba3dfe..ab4803d 100644 --- a/man/idTransferCheck.Rd +++ b/man/idTransferCheck.Rd @@ -4,24 +4,24 @@ \alias{idTransferCheck} \title{Check how close transferred ID's after alignment are to their genuine IDs within one Raw file.} \usage{ -idTransferCheck(data) +idTransferCheck(df_evd_all) } \arguments{ -\item{data}{A data.frame with columns 'type', 'calibrated.retention.time', 'modified.sequence', 'charge', 'raw.file'} +\item{df_evd_all}{A data.frame with columns 'type', 'calibrated.retention.time', 'modified.sequence', 'charge', 'raw.file'} } \value{ A data.frame containing the RT diff for each ID-group found in a Raw file (bg = genuine). } \description{ -The input is a data frame containing feature evidence with corrected retention times, +The input is a data.frame containing feature evidence with corrected retention times, e.g. a 'calibrated.retention.time' column. } \details{ Note that this function must be given MS/MS identifications of type "MULTI-MSMS" and "MSMS-MATCH". It will stop() otherwise. -We compare for each peptide sequence (and charge) the RT difference within groups of genuine and mixed pairs. -For every comparison made, we report the RT difference. If alignment worked perfectly, the differences are very small (<1 min), +We compare for each peptide sequence (and charge) the RT difference within groups of genuine as well as mixed pairs. +For every comparison made, we report the RT span. If alignment worked perfectly, the spans are very small (<1 min), for the mixed group, i.e. the pairs are accidentally split 3D peaks. 
Alignment performance has no influence on the genuine-only groups. diff --git a/man/inMatchWindow.Rd b/man/inMatchWindow.Rd index de196e3..1a167c4 100644 --- a/man/inMatchWindow.Rd +++ b/man/inMatchWindow.Rd @@ -7,7 +7,7 @@ inMatchWindow(data, df.allowed.deltaRT) } \arguments{ -\item{data}{A data.frame with columns 'fc.raw.file' and !colname (param)} +\item{data}{A data.frame with columns 'fc.raw.file', 'rtdiff_mixed', 'rtdiff_genuine'} \item{df.allowed.deltaRT}{The allowed matching difference for each Raw file (as data.frame(fc.rawfile, m))} } diff --git a/man/peakSegmentation.Rd b/man/peakSegmentation.Rd index 4d3aded..e599b7b 100644 --- a/man/peakSegmentation.Rd +++ b/man/peakSegmentation.Rd @@ -5,20 +5,20 @@ \title{Determine fraction of evidence which causes segmentation, i.e. sibling peaks at different RTs confirmed either by genuine or transferred MS/MS.} \usage{ -peakSegmentation(d_evd) +peakSegmentation(df_evd_all) } \arguments{ -\item{d_evd}{A data.frame of evidences containing the above columns} +\item{df_evd_all}{A data.frame of evidences containing the above columns} } \value{ A data.frame with one row per Raw file and three columns: 1) % of native single peaks (ignoring transferred IDs) - 2) % of single peaks (group of size=1) using only groups which have at at one transferred evidence + 2) % of single peaks (group of size=1) using only groups which have one transferred evidence 3) % of single peaks using all groups } \description{ -Sometimes, MQ split a feature into 2 or more if the chromatograpic conditions are not optimal and there +Sometimes, MQ splits a feature into 2 or more if the chromatographic conditions are not optimal and there is a drop in RT intensity. If both features contain successful MS/MS scans, we will find the same peptide twice (with slightly different RT) in the same charge state. This constitutes a natively split peak and is rare (95% of all genuine peaks are unique). 
@@ -31,7 +31,7 @@ In most cases, the RT is too far off to be a split peak. It's rather a lucky hit and thus the intensity is random. To find by how much these peak pairs differ in RT, use idTransferCheck() and inMatchWindow(). -Required columns are 'match.time.difference', 'fc.raw.file', 'modified.sequence', 'charge', 'type'. +Required columns are 'is.transferred', 'fc.raw.file', 'modified.sequence', 'charge', 'type'. Note that this function must be given MS/MS identifications of type "MULTI-MSMS" and "MSMS-MATCH". It will stop() otherwise. diff --git a/man/plot_TIC.Rd b/man/plot_TIC.Rd new file mode 100644 index 0000000..37c2a05 --- /dev/null +++ b/man/plot_TIC.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fcn_plots.R +\name{plot_TIC} +\alias{plot_TIC} +\title{Plot Total Ion Count over time} +\usage{ +plot_TIC(data, x_lim, y_lim) +} +\arguments{ +\item{data}{A data.frame with columns 'fc.raw.file', 'RT', 'intensity'} + +\item{x_lim}{Plot range of x-axis} + +\item{y_lim}{Plot range of y-axis} +} +\value{ +GGplot object +} +\description{ +The input is a data.frame with already averaged counts over binned RT-slices. +} +\examples{ + + data = data.frame(fc.raw.file = rep(c("file A", "file B", "file C"), each=81), + RT = c(20:100), + intensity = c(rnorm(81, mean=20), rnorm(81, mean=10), rnorm(81, mean=30))) + plot_TIC(data, c(10, 100), c(0, 40)) + +} diff --git a/man/plot_UncalibratedMSErr.Rd b/man/plot_UncalibratedMSErr.Rd index e122487..f114e99 100644 --- a/man/plot_UncalibratedMSErr.Rd +++ b/man/plot_UncalibratedMSErr.Rd @@ -35,7 +35,7 @@ if PTXQC detected a too narrow search window. 
rnorm(n[3], 3, 0.7), rnorm(n[4], 4.5, 0.8))) stats = data.frame(fc.raw.file = letters[4:1], - sd = c(2.4, 0.5, 0.7, 0.8), + sd_uncal = c(2.4, 0.5, 0.7, 0.8), outOfCal = c(TRUE, FALSE, FALSE, FALSE)) plot_UncalibratedMSErr(data, MQBug_raw_files = letters[1], stats, y_lim = c(-20,20), 15, "subtitle") diff --git a/man/print.PTXQC_table.Rd b/man/print.PTXQC_table.Rd index 1f490b6..c5e4699 100644 --- a/man/print.PTXQC_table.Rd +++ b/man/print.PTXQC_table.Rd @@ -9,10 +9,7 @@ \arguments{ \item{x}{Some Grid object to plot} -\item{...}{further arguments (not used, but required for consistency with other print methods)} -} -\value{ -A function +\item{...}{Further arguments (not used, but required for consistency with other print methods)} } \description{ helper S3 class, enabling print(some-plot_Table-object) diff --git a/man/read.MQ.Rd b/man/read.MQ.Rd index 0dc1bcd..a6cd8a8 100644 --- a/man/read.MQ.Rd +++ b/man/read.MQ.Rd @@ -9,23 +9,23 @@ read.MQ(file, filter = "", type = "pg", col_subset = NA, add_fs_col = 10, LFQ_action = FALSE, ...) } \arguments{ -\item{file}{see \code{\link{MQDataReader$readMQ}}} +\item{file}{see \code{MQDataReader::readMQ()}} -\item{filter}{see \code{\link{MQDataReader$readMQ}}} +\item{filter}{see \code{MQDataReader::readMQ()}} -\item{type}{see \code{\link{MQDataReader$readMQ}}} +\item{type}{see \code{MQDataReader::readMQ()}} -\item{col_subset}{see \code{\link{MQDataReader$readMQ}}} +\item{col_subset}{see \code{MQDataReader::readMQ()}} -\item{add_fs_col}{see \code{\link{MQDataReader$readMQ}}} +\item{add_fs_col}{see \code{MQDataReader::readMQ()}} -\item{LFQ_action}{see \code{\link{MQDataReader$readMQ}}} +\item{LFQ_action}{see \code{MQDataReader::readMQ()}} -\item{...}{see \code{\link{MQDataReader$readMQ}}} +\item{...}{see \code{MQDataReader::readMQ()}} } \value{ -see \code{\link{MQDataReader$readMQ}} +see \code{MQDataReader::readMQ()} } \description{ -For params, see \code{\link{MQDataReader$readMQ}}. +For params, see \code{MQDataReader::readMQ()}. 
} diff --git a/tests/testthat/test_computeQC.R b/tests/testthat/test_computeQC.R index 2fee66d..f8e85c1 100644 --- a/tests/testthat/test_computeQC.R +++ b/tests/testthat/test_computeQC.R @@ -5,7 +5,7 @@ library(PTXQC) ## Error in Scales$new : could not find function "loadMethod" require(methods) -context("fcn_computeQC.R") +context("createReport.R") test_that("createReport", { ## this is a rather lengthy function, and its hard to test in all its granularity (hence we test @@ -38,7 +38,7 @@ test_that("createReport", { txt_folder = file.path(tempdir(), "txt") yaml_obj = list() ## no special config... - r = createReport(txt_folder, yaml_obj) + r = createReport(txt_folder, NULL, yaml_obj) expect_equal(c("yaml_file", "heatmap_values_file", "R_plots_file", "filename_sorting", "stats_file", "log_file", "report_file_prefix", "report_file_PDF", "report_file_HTML"), names(r)) rep_files = c(r[["report_file_PDF"]], r[["report_file_HTML"]]) @@ -61,4 +61,4 @@ test_that("createReport", { unlink(local_zip) ## delete zip unlink(txt_folder, recursive = TRUE) ## delete txt-folder -}) \ No newline at end of file +}) diff --git a/tests/testthat/test_qualities.R b/tests/testthat/test_qualities.R index fa65f80..75ad7fa 100644 --- a/tests/testthat/test_qualities.R +++ b/tests/testthat/test_qualities.R @@ -58,13 +58,13 @@ test_that("qualUniform", { expect_equal(qualUniform(c(4,0,0), c(1,0,0)), 1) expect_equal(qualUniform(c(4,0,0), c(0,1,0)), 1) expect_equal(qualUniform(c(0,4,0)), 0) - expect_less_than(abs(qualUniform(c(3,2,1))-0.58578), 0.0001) - expect_less_than(abs(qualUniform(c(1,2,3))-0.58578), 0.0001) + expect_lt(abs(qualUniform(c(3,2,1))-0.58578), 0.0001) + expect_lt(abs(qualUniform(c(1,2,3))-0.58578), 0.0001) expect_equal(qualUniform(c(1,2,3), c(0,1,0)), 1) - expect_less_than(abs(qualUniform(c(1,2,3))-0.58578), 0.0001) - expect_less_than(abs(qualUniform(c(1,2,3), c(0,1,1))- 0.590316), 0.0001) - expect_less_than(abs(qualUniform(c(2,3), c(1,1))-0.552786), 0.0001) - 
expect_less_than(abs(qualUniform(1:120)-0.38661), 0.0001) + expect_lt(abs(qualUniform(c(1,2,3))-0.58578), 0.0001) + expect_lt(abs(qualUniform(c(1,2,3), c(0,1,1))- 0.590316), 0.0001) + expect_lt(abs(qualUniform(c(2,3), c(1,1))-0.552786), 0.0001) + expect_lt(abs(qualUniform(1:120)-0.38661), 0.0001) }) @@ -103,7 +103,7 @@ test_that("qualBestKS", { expect_equal(max(r$ks_best[1:3]), 1) expect_true(all(r$ks_best[1:3] > 0.7)) ## the three similar ones, should score good (one of them the the reference) - expect_less_than(r$ks_best[4], 0.2) ## outlier should score badly + expect_lt(r$ks_best[4], 0.2) ## outlier should score badly }) diff --git a/vignettes/PTXQC-Basic_Guide_for_R_users.Rmd b/vignettes/PTXQC-Basic_Guide_for_R_users.Rmd index 64cbbbf..45b7cc3 100644 --- a/vignettes/PTXQC-Basic_Guide_for_R_users.Rmd +++ b/vignettes/PTXQC-Basic_Guide_for_R_users.Rmd @@ -106,7 +106,7 @@ if (file.exists(fh_out$yaml_file)) yaml_config = list() } -r = createReport(txt_folder, yaml_config) +r = createReport(txt_folder, mztab_file = NULL, yaml_obj = yaml_config) cat(paste0("\nReport generated as '", r$report_file, "'\n\n")) diff --git a/vignettes/PTXQC-Input_And_Output_Data.Rmd b/vignettes/PTXQC-Input_And_Output_Data.Rmd index 9bf8494..76a2a00 100644 --- a/vignettes/PTXQC-Input_And_Output_Data.Rmd +++ b/vignettes/PTXQC-Input_And_Output_Data.Rmd @@ -13,27 +13,36 @@ vignette: > # Input/Output data for PTXQC -PTXQC generates quality control reports for [MaxQuant] results. +PTXQC generates quality control reports for [MaxQuant] txt data, and [OpenMS] mzTab data. ## Input -When a full report is generated, PTXQC requires the following **input files**: +When a full report is generated, PTXQC requires the following **input files**. +MaxQuant: + - parameters.txt - summary.txt - proteinGroups.txt - evidence.txt - msms.txt - msmsScans.txt - + These files can be found in the `/combined/txt` folder after MaxQuant analysis successfully finished. 
Here, `` is your main MaxQuant project folder containing your Raw files. + +OpenMS: + + - an .mzTab file from the QualityControl TOPP tool (see [OpenMSdoc] --> TOPP documentation --> QualityControl) + + Note: PTX-QC will compute all metrics for which input data is present. MzTab files from other software packages (even mzTab from MaxQuant) will be missing some data and thus only produce a small subset of metrics. + If you are not interested in the full report, PTXQC can be configured to look only at some of these files by editing the YAML file. See the [PTX-CustomizeReport][Ref_VignCust] vignette ```{r, eval=FALSE} vignette("PTXQC-CustomizeReport", package = "PTXQC") ``` -for details. Alternatively, you can rename any .txt file if you want PTXQC to ignore it. +for details. Alternatively (for .txt input), you can rename any .txt file you want PTXQC to ignore. ## Output @@ -47,11 +56,13 @@ Further output files (with the same prefix name as the Html/PDF report), include - `report_vXXX_filename_sort.txt` (customize the order and naming of Raw files in the report) - `report_v0XXX.yaml` (the configuration file) -The YAML configuration file is created the first time you run PTXQC on a txt-folder. Customize it as you see fit. +The YAML configuration file is created the first time you run PTXQC on a txt-folder or mzTab file. Customize it as you see fit. Read more about `report_vXXX_filename_sort.txt` and `report_vXXX.yaml` in the [PTX-CustomizeReport][Ref_VignCust] vignette. [MaxQuant]: https://maxquant.org + [OpenMS]: https://www.openms.de + [OpenMSdoc]: https://abibuilder.informatik.uni-tuebingen.de/archive/openms/Documentation/nightly/html/index.html [Ref_VignCust]: https://github.com/cbielow/PTXQC/blob/master/vignettes/PTXQC-CustomizeReport.Rmd