diff --git a/DESCRIPTION b/DESCRIPTION index 5cc2c52..79bff30 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: PTXQC Type: Package Title: Quality Report Generation for MaxQuant and mzTab Results -Version: 1.0.15 -Date: 2023-04-23 +Version: 1.0.16 +Date: 2023-05-17 Author: Chris Bielow [aut, cre], Juliane Schmachtenberg [ctb], Swenja Wagner [ctb], diff --git a/NEWS b/NEWS index f2f2f91..d886983 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,9 @@ Glossary: ######### CHANGELOG ########## ################################# +[CRAN] v1.00.16 -- 2023/05/17 + - add MaxQuant 2.4 support (#130) + [CRAN] v1.00.15 -- 2023/04/23 - better reporter ion minimum range in violin plot (pr #124) - workaround for MQ bug causing negative scan index in msms_scan.txt (issue #128) diff --git a/R/MQDataReader.R b/R/MQDataReader.R index d0d058a..4e0f38d 100644 --- a/R/MQDataReader.R +++ b/R/MQDataReader.R @@ -21,6 +21,26 @@ read.MQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col = mq$readMQ(file, filter, type, col_subset, add_fs_col, LFQ_action, ...) 
#' Determine if a file is 'UTF-8' or 'UTF-8-BOM' (as of MQ2.4) or 'UTF-16BE' or 'UTF-16LE'
#'
#' The decision is based solely on a byte-order mark (BOM) at the very start of the file.
#' Files without one of the recognized BOMs (including empty files) are reported as 'UTF-8'.
#'
#' @param filename Relative or absolute path to a file
#' @return One of 'UTF-8-BOM', 'UTF-16BE', 'UTF-16LE' or 'UTF-8';
#'         '' if the file does not exist or is not readable
getFileEncoding = function(filename)
{
  file_handle = try(file(filename, "rb"))
  if (inherits(file_handle, 'try-error')) return("")
  ## make sure the connection is released on every exit path
  ## (otherwise it leaks until garbage collection and R warns about 'closing unused connection')
  on.exit(close(file_handle), add = TRUE)

  ## the longest BOM we test for has 3 bytes
  data = readBin(file_handle, "raw", n = 3)

  ## TRUE if the file starts with the given bytes (FALSE if the file is shorter than the BOM)
  startsWithBytes = function(bom)
  {
    length(data) >= length(bom) && all(data[seq_along(bom)] == bom)
  }

  if (startsWithBytes(as.raw(c(0xef, 0xbb, 0xbf)))) return("UTF-8-BOM")
  if (startsWithBytes(as.raw(c(0xfe, 0xff)))) return("UTF-16BE") ## UTF16 big endian
  if (startsWithBytes(as.raw(c(0xff, 0xfe)))) return("UTF-16LE") ## UTF16 little endian
  ## no (known) BOM: assume plain UTF-8
  return("UTF-8")
}
Chinese) - switch to an English locale and make sure that txt files are encoded in ASCII (Latin-1)!") + ## get encoding, to pass on to read.delim + file_encoding = getFileEncoding(file) + + ## resolve set of columns which we want to keep #example: col_subset = c("Multi.*", "^Peaks$") colClasses = NA ## read.table: keep all columns by default if (sum(!is.na(col_subset)) > 0) { ## just read a tiny bit to get column names ## do not use data.table::fread for this, since it will read the WHOLE file and takes ages... - data_header = try(read.delim(file, comment.char="", nrows=2)) + data_header = try(read.delim(file, comment.char="", nrows=2, fileEncoding = file_encoding)) if (inherits(data_header, 'try-error')) stop(msg_parse_error, call. = FALSE); colnames(data_header) = tolower(colnames(data_header)) @@ -165,7 +191,9 @@ readMQ = function(file, filter = "", type = "pg", col_subset = NA, add_fs_col = ## However, when the colClass is 'numeric', whitespaces are stripped, and only AFTERWARDS the string ## is checked against na.strings ## - the '\u975E\u6570\u5B57' na-string is the chinese UTF-8 representation of "NA" - .self$mq.data = try(read.delim(file, na.strings=c("NA", "n. def.", "n.def.", "\u975E\u6570\u5B57"), encoding="UTF-8", comment.char="", stringsAsFactors = FALSE, colClasses = colClasses, ...)) + .self$mq.data = try(read.delim(file, na.strings=c("NA", "n. def.", "n.def.", "\u975E\u6570\u5B57"), + encoding="UTF-8", comment.char="", stringsAsFactors = FALSE, colClasses = colClasses, + fileEncoding = file_encoding, ...)) if (inherits(.self$mq.data, 'try-error')) stop(msg_parse_error, call. = FALSE); #colnames(.self$mq.data) diff --git a/R/qcMetric_PAR.R b/R/qcMetric_PAR.R index 64d9885..7e5603a 100644 --- a/R/qcMetric_PAR.R +++ b/R/qcMetric_PAR.R @@ -7,13 +7,14 @@ qcMetric_PAR = setRefClass( Key parameters are MaxQuant version, Re-quantify, Match-between-runs and mass search tolerances. 
A list of protein database files is also provided, allowing to track database completeness and database version information (if given in the filename).", - workerFcn=function(.self, df_mqpar) + workerFcn=function(.self, d_parAll) { ##todo: read in mqpar.xml to get group information and ppm tolerances for all groups (parameters.txt just gives Group1) line_break = "\n"; ## use space to make it work with table ## remove AIF stuff - df_mqpar = df_mqpar[!grepl("^AIF ", df_mqpar$parameter),] + + df_mqpar = d_parAll[!grepl("^AIF ", d_parAll$parameter),] df_mqpar$value = gsub(";", line_break, df_mqpar$value) ## seperate FASTA files (usually they destroy the layout) idx_fastafile = grepl("fasta file", df_mqpar$parameter, ignore.case = TRUE) diff --git a/README.md b/README.md index ecd1a56..64aa1af 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ PTXQC ### Latest changes / ChangeLog -latest Release: v1.0.15 - April 2023 -latest Release on CRAN: v1.0.15 - April 2023 +latest Release: v1.0.16 - May 2023 +latest Release on CRAN: v1.0.16 - May 2023 See [NEWS][News_File] file for a version history. diff --git a/man/MQDataReader-class.Rd b/man/MQDataReader-class.Rd index 6067bc0..a050add 100644 --- a/man/MQDataReader-class.Rd +++ b/man/MQDataReader-class.Rd @@ -87,6 +87,8 @@ Note: you must find a regex which matches both versions, or explicitly add both Fixes for msmsScans.txt: negative Scan Event Numbers in msmsScans.txt are reconstructed by using other columns +Automatically detects UTF8-BOM encoding and deals with it (since MQ2.4). 
+ Example of usage: \preformatted{ mq = MQDataReader$new() diff --git a/man/getFileEncoding.Rd b/man/getFileEncoding.Rd new file mode 100644 index 0000000..ab83269 --- /dev/null +++ b/man/getFileEncoding.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MQDataReader.R +\name{getFileEncoding} +\alias{getFileEncoding} +\title{Determine if a file is 'UTF-8' or 'UTF-8-BOM' (as of MQ2.4) or 'UTF-16BE' or 'UTF-16LE'} +\usage{ +getFileEncoding(filename) +} +\arguments{ +\item{filename}{Relative or absolute path to a file} +} +\value{ +'' if the file does not exist or is not readable +} +\description{ +Determine if a file is 'UTF-8' or 'UTF-8-BOM' (as of MQ2.4) or 'UTF-16BE' or 'UTF-16LE' +}