this is PTXQC 1.0.16

cbielow · May 17, 2023 · 053c3e1 · 053c3e1
1 parent 23b2116
commit 053c3e1
Show file tree

Hide file tree

Showing 7 changed files with 60 additions and 9 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: PTXQC
 Type: Package
 Title: Quality Report Generation for MaxQuant and mzTab Results
-Version: 1.0.15
-Date: 2023-04-23
+Version: 1.0.16
+Date: 2023-05-17
 Author: Chris Bielow [aut, cre],
   Juliane Schmachtenberg [ctb],
   Swenja Wagner [ctb],

diff --git a/NEWS b/NEWS
@@ -14,6 +14,9 @@ Glossary:
 #########   CHANGELOG  ##########
 #################################
 
+[CRAN] v1.00.16 -- 2023/05/17
+  - add MaxQuant 2.4 support (#130)
+
 [CRAN] v1.00.15 -- 2023/04/23
   - better reporter ion minimum range in violin plot (pr #124)
   - workaround for MQ bug causing negative scan index in msms_scan.txt (issue #128)

diff --git a/R/MQDataReader.R b/R/MQDataReader.R
@@ -21,6 +21,26 @@ read.MQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col =
   mq$readMQ(file, filter, type, col_subset, add_fs_col, LFQ_action, ...)
 }
 
+#' Determine if a file is 'UTF-8' or 'UTF-8-BOM' (as of MQ2.4) or 'UTF-16BE' or 'UTF-16LE'
+#' @param filename Relative or absolute path to a file
+#' @return 'UTF-8-BOM', 'UTF-16BE' or 'UTF-16LE' if the file starts with that byte-order mark, 'UTF-8' otherwise;
+#'         '' if the file does not exist or is not readable
+getFileEncoding = function(filename)
+{
+  file_handle = try(file(filename, "rb"))
+  if (inherits(file_handle, 'try-error')) return("")
+  on.exit(close(file_handle), add = TRUE)  ## always release the connection again (it was leaked before)
+  data = try(readBin(file_handle, "raw", n = 4))
+  if (inherits(data, 'try-error')) return("")
+  ## TRUE if the file starts with the given byte sequence (files shorter than the BOM never match)
+  hasBOM = function(bom) length(data) >= length(bom) && all(data[seq_along(bom)] == as.raw(bom))
+  if (hasBOM(c(0xef, 0xbb, 0xbf))) return("UTF-8-BOM")
+  if (hasBOM(c(0xfe, 0xff))) return("UTF-16BE")  ##UTF16 big endian
+  if (hasBOM(c(0xff, 0xfe))) return("UTF-16LE")  ##UTF16 little endian
+  return("UTF-8")
+}
+
+
 #'
 #' S5-RefClass to read MaxQuant .txt files
 #'
@@ -51,7 +71,9 @@ read.MQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col =
 #' 
 #' Fixes for msmsScans.txt:
 #'  negative Scan Event Numbers in msmsScans.txt are reconstructed by using other columns
-#' 
+#'
+#' Automatically detects a UTF-8 byte-order mark (written by MaxQuant since v2.4) or a UTF-16 encoding and reads the file accordingly.
+#'
 #' Example of usage:
 #' \preformatted{
 #'   mq = MQDataReader$new()
@@ -116,13 +138,17 @@ readMQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col =
   ## error message if failure should occur below
   msg_parse_error = paste0("\n\nParsing the file '", file, "' failed. See message above why. If the file is not usable but other files are ok, disable the corresponding section in the YAML config. You might also be running a foreign locale (e.g. Chinese) - switch to an English locale and make sure that txt files are encoded in ASCII (Latin-1)!")
 
+  ## get encoding, to pass on to read.delim
+  file_encoding = getFileEncoding(file)
+
+
   ## resolve set of columns which we want to keep
   #example: col_subset = c("Multi.*", "^Peaks$")
   colClasses = NA ## read.table: keep all columns by default
   if (sum(!is.na(col_subset)) > 0)
   { ## just read a tiny bit to get column names
     ## do not use data.table::fread for this, since it will read the WHOLE file and takes ages...
-    data_header = try(read.delim(file, comment.char="", nrows=2))
+    data_header = try(read.delim(file, comment.char="", nrows=2, fileEncoding = file_encoding))
     if (inherits(data_header, 'try-error')) stop(msg_parse_error, call. = FALSE);
 
     colnames(data_header) = tolower(colnames(data_header))
@@ -165,7 +191,9 @@ readMQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col =
   ##    However, when the colClass is 'numeric', whitespaces are stripped, and only AFTERWARDS the string
   ##    is checked against na.strings
   ##  - the '\u975E\u6570\u5B57' na-string is the chinese UTF-8 representation of "NA"
-  .self$mq.data = try(read.delim(file, na.strings=c("NA", "n. def.", "n.def.", "\u975E\u6570\u5B57"), encoding="UTF-8", comment.char="", stringsAsFactors = FALSE, colClasses = colClasses, ...))
+  .self$mq.data = try(read.delim(file, na.strings=c("NA", "n. def.", "n.def.", "\u975E\u6570\u5B57"), 
+                                 encoding="UTF-8", comment.char="", stringsAsFactors = FALSE, colClasses = colClasses, 
+                                 fileEncoding = file_encoding, ...))
   if (inherits(.self$mq.data, 'try-error')) stop(msg_parse_error, call. = FALSE);
 
   #colnames(.self$mq.data)

diff --git a/R/qcMetric_PAR.R b/R/qcMetric_PAR.R
@@ -7,13 +7,14 @@ qcMetric_PAR =  setRefClass(
 Key parameters are MaxQuant version, Re-quantify, Match-between-runs and mass search tolerances. 
 A list of protein database files is also provided, allowing to 
 track database completeness and database version information (if given in the filename).", 
-    workerFcn=function(.self, df_mqpar)
+    workerFcn=function(.self, d_parAll)
     {
       ##todo: read in mqpar.xml to get group information and ppm tolerances for all groups (parameters.txt just gives Group1)
 
       line_break = "\n"; ## use space to make it work with table
       ## remove AIF stuff
-      df_mqpar = df_mqpar[!grepl("^AIF ", df_mqpar$parameter),]
+
+      df_mqpar = d_parAll[!grepl("^AIF ", d_parAll$parameter),]
       df_mqpar$value = gsub(";", line_break, df_mqpar$value)
       ## seperate FASTA files (usually they destroy the layout)
       idx_fastafile = grepl("fasta file", df_mqpar$parameter, ignore.case = TRUE)

diff --git a/README.md b/README.md
@@ -8,8 +8,8 @@ PTXQC
 
 ### Latest changes / ChangeLog
 
-latest Release: v1.0.15 - April 2023
-latest Release on CRAN: v1.0.15 - April 2023
+latest Release: v1.0.16 - May 2023
+latest Release on CRAN: v1.0.16 - May 2023
 
 See [NEWS][News_File] file for a version history.
 

diff --git a/man/MQDataReader-class.Rd b/man/MQDataReader-class.Rd
diff --git a/man/getFileEncoding.Rd b/man/getFileEncoding.Rd