Merge pull request #131 from cbielow/mzqc_refact

fixes #130 (UTF-8 BOM)
cbielow · May 17, 2023 · f4dc462 · f4dc462
2 parents 83c0c3a + 053c3e1
commit f4dc462
Show file tree

Hide file tree

Showing 16 changed files with 172 additions and 100 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: PTXQC
 Type: Package
 Title: Quality Report Generation for MaxQuant and mzTab Results
-Version: 1.0.15
-Date: 2022-09-21
+Version: 1.0.16
+Date: 2023-05-17
 Author: Chris Bielow [aut, cre],
   Juliane Schmachtenberg [ctb],
   Swenja Wagner [ctb],
@@ -36,7 +36,7 @@ Imports:
     RColorBrewer,
     reshape2,
     rmarkdown,
-    rmzqc (>= 0.3.0),
+    rmzqc (>= 0.4.2),
     seqinr,
     stats,
     utils,
@@ -49,6 +49,6 @@ VignetteBuilder: knitr
 License: BSD_3_clause + file LICENSE
 Encoding: UTF-8
 Roxygen: list()
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.3
 URL: https://github.com/cbielow/PTXQC
 BugReports: https://github.com/cbielow/PTXQC/issues
diff --git a/NEWS b/NEWS
@@ -14,8 +14,12 @@ Glossary:
 #########   CHANGELOG  ##########
 #################################
 
-PRE!!! v1.00.15 -- 2022/??/??
+[CRAN] v1.00.16 -- 2023/05/17
+  - add MaxQuant 2.4 support (#130)
+
+[CRAN] v1.00.15 -- 2023/04/23
   - better reporter ion minimum range in violin plot (pr #124)
+  - workaround for MQ bug causing negative scan index in msms_scan.txt (issue #128)
   - use 'rmzqc' package to handle mzQC output
 
 [CRAN] v1.00.14 -- 2022/09/21

diff --git a/PTXQC.Rproj b/PTXQC.Rproj
@@ -15,3 +15,5 @@ LaTeX: pdfLaTeX
 BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
+PackageCheckArgs: manual = TRUE,   cran = TRUE,   remote = TRUE,   incoming = TRUE,
+PackageRoxygenize: rd,collate,namespace
diff --git a/R/MQDataReader.R b/R/MQDataReader.R
@@ -21,6 +21,26 @@ read.MQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col =
   mq$readMQ(file, filter, type, col_subset, add_fs_col, LFQ_action, ...)
 }
 
+#' Determine if a file is 'UTF-8' or 'UTF-8-BOM' (as of MQ2.4) or 'UTF-16BE' or 'UTF-16LE'
+#'
+#' Only the first bytes of the file are inspected for a byte-order mark (BOM).
+#' A file without one of the recognized BOMs (including an empty file) is reported as 'UTF-8'.
+#'
+#' @param filename Relative or absolute path to a file
+#' @return One of 'UTF-8-BOM', 'UTF-16BE', 'UTF-16LE' or 'UTF-8';
+#'         '' if the file does not exist or is not readable
+getFileEncoding = function(filename)
+{
+  file_handle = try(file(filename, "rb"))
+  if (inherits(file_handle, 'try-error')) return("")
+  ## release the connection on every exit path (it was leaked before)
+  on.exit(close(file_handle), add = TRUE)
+
+  ## subsetting a raw vector beyond its length yields 00, so files shorter
+  ## than a BOM simply fail the comparisons below
+  data = readBin(file_handle, "raw", n = 4)
+
+  ## scalar conditions: use short-circuit '&&' instead of elementwise '&'
+  if (data[1]==as.raw(0xef) && data[2]==as.raw(0xbb) && data[3]==as.raw(0xbf))
+    return("UTF-8-BOM")
+  if (data[1]==as.raw(0xfe) && data[2]==as.raw(0xff))
+    return("UTF-16BE")  ##UTF16 big endian
+  if (data[1]==as.raw(0xff) && data[2]==as.raw(0xfe))
+    return("UTF-16LE")  ##UTF16 little endian
+  return("UTF-8")
+}
+
+
 #'
 #' S5-RefClass to read MaxQuant .txt files
 #'
@@ -49,6 +69,11 @@ read.MQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col =
 #' Note: you must find a regex which matches both versions, or explicitly add both terms if you are requesting only a subset
 #'       of columns!
 #' 
+#' Fixes for msmsScans.txt:
+#'  negative Scan Event Numbers in msmsScans.txt are reconstructed by using other columns
+#'
+#' Automatically detects UTF8-BOM encoding and deals with it (since MQ2.4).
+#'
 #' Example of usage:
 #' \preformatted{
 #'   mq = MQDataReader$new()
@@ -83,7 +108,8 @@ readMQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col =
   #'               "pg" (proteinGroups) [default], adds abundance index columns (*AbInd*, replacing 'intensity')
   #'               "sm" (summary), splits into three row subsets (raw.file, condition, total)
   #'               "ev" (evidence), will fix empty modified.sequence cells for older MQ versions (when MBR is active)
-  #'               Any other value will not add any special columns
+  #'               "msms_scans", will fix invalid (negative) scan event numbers
+  #'               Any other value will not add/modify any columns
   #' @param col_subset A vector of column names as read by read.delim(), e.g., spaces are replaced by dot already.
   #'                   If given, only columns with these names (ignoring lower/uppercase) will be returned (regex allowed)
   #'                   E.g. col_subset=c("^lfq.intensity.", "protein.name")
@@ -112,13 +138,17 @@ readMQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col =
   ## error message if failure should occur below
   msg_parse_error = paste0("\n\nParsing the file '", file, "' failed. See message above why. If the file is not usable but other files are ok, disable the corresponding section in the YAML config. You might also be running a foreign locale (e.g. Chinese) - switch to an English locale and make sure that txt files are encoded in ASCII (Latin-1)!")
 
+  ## get encoding, to pass on to read.delim
+  file_encoding = getFileEncoding(file)
+
+
   ## resolve set of columns which we want to keep
   #example: col_subset = c("Multi.*", "^Peaks$")
   colClasses = NA ## read.table: keep all columns by default
   if (sum(!is.na(col_subset)) > 0)
   { ## just read a tiny bit to get column names
     ## do not use data.table::fread for this, since it will read the WHOLE file and takes ages...
-    data_header = try(read.delim(file, comment.char="", nrows=2))
+    data_header = try(read.delim(file, comment.char="", nrows=2, fileEncoding = file_encoding))
     if (inherits(data_header, 'try-error')) stop(msg_parse_error, call. = FALSE);
 
     colnames(data_header) = tolower(colnames(data_header))
@@ -161,7 +191,9 @@ readMQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col =
   ##    However, when the colClass is 'numeric', whitespaces are stripped, and only AFTERWARDS the string
   ##    is checked against na.strings
   ##  - the '\u975E\u6570\u5B57' na-string is the chinese UTF-8 representation of "NA"
-  .self$mq.data = try(read.delim(file, na.strings=c("NA", "n. def.", "n.def.", "\u975E\u6570\u5B57"), encoding="UTF-8", comment.char="", stringsAsFactors = FALSE, colClasses = colClasses, ...))
+  .self$mq.data = try(read.delim(file, na.strings=c("NA", "n. def.", "n.def.", "\u975E\u6570\u5B57"), 
+                                 encoding="UTF-8", comment.char="", stringsAsFactors = FALSE, colClasses = colClasses, 
+                                 fileEncoding = file_encoding, ...))
   if (inherits(.self$mq.data, 'try-error')) stop(msg_parse_error, call. = FALSE);
 
   #colnames(.self$mq.data)
@@ -336,9 +368,24 @@ readMQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col =
       .self$mq.data$modified.sequence[idx_mm] = rep(.self$mq.data$modified.sequence[idx_block_start-1],
                                                     idx_block_end-idx_block_start+1)
     }
+  } else if (type == "msms_scans") {
+
+    ## fix scan.event.number (some MQ 1.6.17.0 results have negative values...)
+    if (min(.self$mq.data$scan.event.number, na.rm = TRUE) < 1)
+    { ## fix by manually computing it from 'Scan index' and 'MS scan index' (the precursor MS1)
+      warning("Found MaxQuant bug in msmsScans.txt (Scan Event Numbers are negative)")
+      req_cols = c("raw.file", "ms.scan.index", "scan.index")
+      if (!checkInput(req_cols, .self$mq.data)) stop("Could not find all of '", paste0(req_cols, sep="', '"), "' in msmsScans.txt load() request. Please request loading these columns in order to fix the scan.event.number values.")
+      dtemp = as.data.table(.self$mq.data)
+      ## sort by precursor index + MS2 index, such that subsetting later already has the right order
+      setorder(dtemp, raw.file, ms.scan.index, scan.index) 
+      dtemp[, scan.event.number := 1:.N, by = .(raw.file, ms.scan.index)]
+      .self$mq.data = as.data.frame(dtemp)
+    }
   }
 
 
+
   if (add_fs_col & "raw.file" %in% colnames(.self$mq.data))
   {
     .self$mq.data$fc.raw.file = .self$fn_map$getShortNames(.self$mq.data$raw.file, add_fs_col)

diff --git a/R/createReport.R b/R/createReport.R
@@ -571,12 +571,14 @@ createReport = function(txt_folder = NULL,
   ######
   ######  msmsScans.txt ...
   ######
-  if (MZTAB_MODE) df_msmsScans = mzt$getMSMSScans(identified_only = FALSE)
-  else df_msmsScans = mq$readMQ(txt_files$msmsScan, type = "msms", filter = "", 
+  if (MZTAB_MODE) df_msmsScans = mzt$getMSMSScans(identified_only = FALSE) else
+     df_msmsScans = mq$readMQ(txt_files$msmsScan, type = "msms_scans", filter = "", 
                                 col_subset = c(numeric = "^ion.injection.time", 
                                                numeric = "^retention.time$", 
                                                "^Identified", 
-                                               "^Scan.event.number", 
+                                               numeric = "^Scan.event.number", 
+                                               numeric = 'Scan.index',    ## required for fixing scan.event.number, in case its broken
+                                               numeric = 'MS.scan.index', ## required for fixing scan.event.number, in case its broken
                                                "^total.ion.current",
                                                "^base.?peak.intensity", ## basepeak.intensity (MQ1.2) and base.peak.intensity (MQ1.3+)
                                                "^Raw.file",

diff --git a/R/fcn_plots.R b/R/fcn_plots.R
@@ -753,11 +753,11 @@ plotTableRaw = function(data, colours="black", fill=NA, just="centre")
 getHTMLTable = function(data, caption = NA)
 {
 
-  tbl = data %>% 
-          htmlTable::addHtmlTableStyle(align = 'l',  ## align columns left
-                                       col.rgroup = c("none", "#F7F7F7")) %>%
-          htmlTable::htmlTable(rnames = FALSE,    ## no row names
-                               caption = caption) 
+  tbl = htmlTable::addHtmlTableStyle(data,
+                                     align = 'l',  ## align columns left
+                                     col.rgroup = c("none", "#F7F7F7"))
+  tbl = htmlTable::htmlTable(tbl, rnames = FALSE,    ## no row names
+                             caption = caption) 
 
   return(tbl)
 }

diff --git a/R/mzQC.R b/R/mzQC.R
@@ -56,20 +56,22 @@ getRunQualityTemplate = function(fc.raw.file, raw_file_mapping)
     ## we're just guessing here...
     warning("Cannot properly fill metadata of mzQC file, since full filenames are unknown. Using placeholders.")
     filename = paste0(raw_file, ".raw"); 
-    fullpath = paste0("???/", filename);
+    location = paste0("???/", filename);
     accession = rmzqc::filenameToCV(filename)
   } else {
     idx_meta = which(meta$file_no_suffix == raw_file)
     filename = as.character(meta$file[idx_meta])
-    fullpath = as.character(meta$path[idx_meta])
+    location = as.character(meta$path[idx_meta])
     accession = as.character(meta$CV[idx_meta])
   }
-
+  ## make location a proper URI
+  if (!startsWith(location, "file:///")) location = paste0("file:///", location);
+  location = gsub("\\", "/", location, fixed = TRUE)
   file_format = rmzqc::getCVTemplate(accession = accession)
   ptxqc_software = rmzqc::toAnalysisSoftware(id = "MS:1003162", version = as.character(utils::packageVersion("PTXQC")))
 
   out = rmzqc::MzQCrunQuality$new(rmzqc::MzQCmetadata$new(raw_file,  ## label
-                                                          list(rmzqc::MzQCinputFile$new(filename, fullpath, file_format)),
+                                                          list(rmzqc::MzQCinputFile$new(filename, location, file_format)),
                                                           list(ptxqc_software)),
                                   list())
 

diff --git a/R/qcMetric_MSMS.R b/R/qcMetric_MSMS.R
@@ -124,7 +124,7 @@ space and potentially provoking overlapping peptide signals, biasing peptide qua
 Thus, low MC counts should be favored. Interestingly, it has been shown recently that 
 incorporation of peptides with missed cleavages does not negatively influence protein quantification (see 
 [Chiva, C., Ortega, M., and Sabido, E. Influence of the Digestion Technique, Protease, and Missed 
-Cleavage Peptides in Protein Quantitation. J. Proteome Res. 2014, 13, 3979-86](https://pubs.acs.org/doi/abs/10.1021/pr500294d) ). 
+Cleavage Peptides in Protein Quantitation. J. Proteome Res. 2014, 13, 3979-86](https://doi.org/10.1021/pr500294d) ). 
 However this is true only if all samples show the same degree of digestion. High missed cleavage values 
 can indicate for example, either a) failed digestion, b) a high (post-digestion) protein contamination, or 
 c) a sample with high amounts of unspecifically degraded peptides which are not digested by trypsin. 

diff --git a/R/qcMetric_MSMSScans.R b/R/qcMetric_MSMSScans.R
@@ -21,6 +21,7 @@ Heatmap score [MS<sup>2</sup> Scans: TopN over RT]: Rewards uniform (function Un
       if (!checkInput(c("fc.raw.file", "retention.time", "scan.event.number", "rRT"), df_msmsScans)) return()
       dd = data.table::as.data.table(df_msmsScans[, c("fc.raw.file", "retention.time", "scan.event.number", "rRT")])
       data.table::setkey(dd, fc.raw.file, retention.time) ## sort by RT
+
       ## find the highest scan event (SE) after an MS1 scan
       DF_max = dd[, {
           idx = which(getMaxima(scan.event.number, thresh_rel = 0.0))

diff --git a/R/qcMetric_PAR.R b/R/qcMetric_PAR.R
@@ -7,13 +7,14 @@ qcMetric_PAR =  setRefClass(
 Key parameters are MaxQuant version, Re-quantify, Match-between-runs and mass search tolerances. 
 A list of protein database files is also provided, allowing to 
 track database completeness and database version information (if given in the filename).", 
-    workerFcn=function(.self, df_mqpar)
+    workerFcn=function(.self, d_parAll)
     {
       ##todo: read in mqpar.xml to get group information and ppm tolerances for all groups (parameters.txt just gives Group1)
 
       line_break = "\n"; ## use space to make it work with table
       ## remove AIF stuff
-      df_mqpar = df_mqpar[!grepl("^AIF ", df_mqpar$parameter),]
+
+      df_mqpar = d_parAll[!grepl("^AIF ", d_parAll$parameter),]
       df_mqpar$value = gsub(";", line_break, df_mqpar$value)
       ## separate FASTA files (usually they destroy the layout)
       idx_fastafile = grepl("fasta file", df_mqpar$parameter, ignore.case = TRUE)
@@ -48,7 +49,7 @@ track database completeness and database version information (if given in the fi
       d_par2 = cbind(d_par[d_par$page==0, parC], d_par[d_par$page==1, parC])
 
       ## HTML: alternative table
-      ## (do this before line breaks, since Html can handle larger strings)      
+      ## (do this before line breaks, since Html can handle larger strings)
       tbl_f = getHTMLTable(d_par2, caption = fasta_files)
 
       ## break long values into multiple lines (to preserve PDF table width)

diff --git a/README.md b/README.md
@@ -1,16 +1,15 @@
 PTXQC
 ---------------
 
-[![License (3-Clause BSD)](https://img.shields.io/badge/license-BSD%203--Clause-blue.svg?style=flat-square)](https://opensource.org/licenses/BSD-3-Clause)
-[![Build Status](https://travis-ci.org/cbielow/PTXQC.svg?branch=master)](https://travis-ci.org/cbielow/PTXQC) 
+[![License (3-Clause BSD)](https://img.shields.io/badge/license-BSD%203--Clause-blue.svg?style=flat-square)](https://opensource.org/license/bsd-3-clause/)
 [![Project Stats](https://www.openhub.net/p/PTXQC/widgets/project_thin_badge.gif)](https://www.openhub.net/p/PTXQC)
 
 **This package allows users of MaxQuant (from .txt files) and OpenMS (from mzTab files) to generate quality control reports in Html/PDF format.**
 
 ### Latest changes / ChangeLog
 
-latest Release: v1.0.14 - September 2022
-latest Release on CRAN: v1.0.14 - September 2022
+latest Release: v1.0.16 - May 2023
+latest Release on CRAN: v1.0.16 - May 2023
 
 See [NEWS][News_File] file for a version history.