Merge pull request #46 from margotbligh/dev

Dev
margotbligh · Feb 29, 2024 · e5f1e67 · e5f1e67
2 parents c95d13f + b0ba692
commit e5f1e67
Show file tree

Hide file tree

Showing 8 changed files with 1,082 additions and 142 deletions.
diff --git a/R/glycoAnnotate.R b/R/glycoAnnotate.R
@@ -166,42 +166,52 @@ glycoAnnotate <- function(data,
       return(ppm)
     }
     pred_table <- pred_table %>%
-      dplyr::mutate(mzmin = mz - ppm_to_mz(mz, error),
-                    mzmax = mz + ppm_to_mz(mz, error))
+      dplyr::mutate(mzmin_match = mz - ppm_to_mz(mz, error),
+                    mzmax_match = mz + ppm_to_mz(mz, error))
   }
   if(error_units == 'Da'){
     pred_table <- pred_table %>%
-      dplyr::mutate(mzmin = mz - error,
-                    mzmax = mz + error)
+      dplyr::mutate(mzmin_match = mz - error,
+                    mzmax_match = mz + error)
   }
 
+  #rename mz column in pred_table
+  names(pred_table)[names(pred_table) == 'mz'] <- 'mz_pred'
+
   #run annotation
   message("Starting annotation with predictions against data")
   if(!is.null(mzmin_column) & !is.null(mzmax_column)){
-    if (mzmin_column != "mzmin"){
-      names(data)[names(data) == mzmin_column] <- "mzmin"
-    }
-    if (mzmax_column != "mzmax"){
-      names(data)[names(data) == mzmax_column] <- "mzmax"
-    }
+    data$mzmin_match <- data[, mzmin_column]
+    data$mzmax_match <- data[, mzmax_column]
+
     data.table::setDT(data)
     data.table::setDT(pred_table)
-    data.table::setkey(pred_table, mzmin, mzmax)
+    data.table::setkey(pred_table, mzmin_match, mzmax_match)
 
     data_annot <- data.table::foverlaps(data, pred_table)
   }
   if(is.null(mzmin_column) & is.null(mzmax_column)){
     data <- data %>%
-      dplyr::mutate(mzmin = get(mz_column),
-                    mzmax = get(mz_column))
+      dplyr::mutate(mzmin_match = get(mz_column),
+                    mzmax_match = get(mz_column))
 
     data.table::setDT(data)
     data.table::setDT(pred_table)
-    data.table::setkey(pred_table, mzmin, mzmax)
+    data.table::setkey(pred_table, mzmin_match, mzmax_match)
 
     data_annot <- data.table::foverlaps(data, pred_table)
   }
 
+  #calculate mass error
+  if(mz_column == 'mz'){
+    data_annot <- data_annot %>%
+      dplyr::mutate(mass_error = abs(mz - mz_pred))
+  }
+  if(mz_column != 'mz'){
+    data_annot <- data_annot %>%
+      dplyr::mutate(mass_error = abs(get(mz_column) - mz_pred))
+  }
+
   #collapse annotations
   data.table::setDF(data_annot)
   if(isTRUE(collapse) & nrow(data_annot) > nrow(data)){
@@ -232,40 +242,10 @@ glycoAnnotate <- function(data,
   }
 
   #format final df
-  if(isFALSE(collapse)){
-    data_annot <- data_annot %>%
-      dplyr::select(!c('mzmin', 'mzmax'))
-  }
-  if('mz' %in% names(pred_table) & 'mz' %in% names(data)){
-    if(isFALSE(collapse)){
-      data_annot <-  data_annot %>%
-        dplyr::rename(mz_pred = mz)
-    }
-  }
-  if(!is.null(mzmin_column)){
-    if ("i.mzmin" %in% names(data_annot)){
-      names(data_annot)[names(data_annot) == "i.mzmin"] <-  mzmin_column
-    }
-  }
-  if(!is.null(mzmax_column)){
-    if("i.mzmax" %in% names(data_annot)){
-      names(data_annot)[names(data_annot) == "i.mzmax"] <-  mzmax_column
+  data_annot <- data_annot %>%
+    dplyr::select(!any_of(c('mzmin_match', 'mzmax_match',
+                            'i.mzmin_match', 'i.mzmax_match')))
 
-    }
-  }
-
-  if(is.null(mzmin_column)){
-    if ("i.mzmin" %in% names(data_annot)){
-      data_annot <- data_annot %>%
-        dplyr::select(!'i.mzmin')
-    }
-  }
-  if(is.null(mzmax_column)){
-    if("i.mzmax" %in% names(data_annot)){
-      data_annot <- data_annot %>%
-        dplyr::select(!'i.mzmax')
-    }
-  }
 
   return(data_annot)
 }

diff --git a/R/glycoPredict.R b/R/glycoPredict.R
@@ -2,12 +2,12 @@
 #'
 #' @include setClass.R
 #'
-#' @description \code{glycoPredict()} predicts all possible glycan within the 
+#' @description \code{glycoPredict()} predicts all possible glycans within the
 #' constraints set by the \code{glycoPredictParam} object.
 #' @param param A \code{glycoPredictParam} object. See \link[GlycoAnnotateR]{glycoPredictParam}
 #'
-#' @export 
-#' 
+#' @export
+#'
 #' @examples
 #' gpp <- glycoPredictParam()
 #' gpp@@dp <- c(1,7)
@@ -16,19 +16,19 @@
 #' gpp@@modifications <- c('sulphate', 'carboxylicacid')
 #' gpp@@double_sulphate <- TRUE
 #' predicted.df <- glycoPredict(param = gpp)
-#' 
-#' @details 
-#' \code{glycoPredict()} is used to predict masses and mass to charge ratios of all theoretically 
-#' possible glycans within a set of constraining parameters (defined in the 
-#' \code{glycoPredictParam} object). This package was written 
-#' for annotation of mass spec data (especially LC-MS) but if used for 
-#' other purposes either ionisation mode and very wide scan ranges can be given. 
-#' The function works by sourcing a python file and then using the function 
+#'
+#' @details
+#' \code{glycoPredict()} is used to predict masses and mass to charge ratios of all theoretically
+#' possible glycans within a set of constraining parameters (defined in the
+#' \code{glycoPredictParam} object). This package was written
+#' for annotation of mass spec data (especially LC-MS), but if used for
+#' other purposes, either ionisation mode and a very wide scan range can be given.
+#' The function works by sourcing a python file and then using the function
 #' encoded in the python script.
-#' 
-#' @seealso 
+#'
+#' @seealso
 #' GlycoAnnotateR::glycoPredictParam()
-#' 
+#'
 
 glycoPredict <- function(param){
   path <- paste(system.file(package="GlycoAnnotateR"), "sugarMassesPredict.py", sep="/")
@@ -56,9 +56,9 @@ glycoPredict <- function(param){
   naming = as.list(param@naming)
   glycan_linkage = as.list(param@glycan_linkage)
   modification_limits = param@modification_limits
-  
+
   message(paste("Glycans will be predicted according to the following glycoPredictParam() object:\n", str(param)))
-  
+
   df <- predict_sugars(dp = dp, polarity = polarity,
                        scan_range = scan_range,
                        pent_option = pent_option, modifications = modifications,
@@ -77,89 +77,104 @@ glycoPredict <- function(param){
       x[na] = NA_real_
       x
     }
-    df.l <- df %>% 
+    df.l <- df %>%
       #make long
       tidyr::pivot_longer(cols = starts_with("[M"),
                           names_to = "ion",
-                          values_to = "mz") %>% 
+                          values_to = "mz") %>%
       #remove ions outside scan range
-      tidyr::drop_na(mz) %>% 
+      tidyr::drop_na(mz) %>%
       #calculate ion formula
-      dplyr::mutate(C = stringr::str_split_i(formula, "C", 2) %>% 
-                      sub("\\D.*", "", .) %>% 
+      dplyr::mutate(C = stringr::str_split_i(formula, "C", 2) %>%
+                      sub("\\D.*", "", .) %>%
                       as.num(),
-                    H = stringr::str_split_i(formula, "H", 2) %>% 
-                      sub("\\D.*", "", .) %>% 
+                    H = stringr::str_split_i(formula, "H", 2) %>%
+                      sub("\\D.*", "", .) %>%
                       as.num(),
-                    N = stringr::str_split_i(formula, "N", 2) %>% 
-                      sub("\\D.*", "", .) %>% 
+                    N = stringr::str_split_i(formula, "N", 2) %>%
+                      sub("\\D.*", "", .) %>%
                       as.num(),
                     N = dplyr::case_when(grepl("N", formula) & is.na(N) ~ 1,
                                          TRUE ~ N),
-                    O = stringr::str_split_i(formula, "O", 2) %>% 
-                      sub("\\D.*", "", .) %>% 
+                    O = stringr::str_split_i(formula, "O", 2) %>%
+                      sub("\\D.*", "", .) %>%
                       as.num(),
-                    P = stringr::str_split_i(formula, "P", 2) %>% 
-                      sub("\\D.*", "", .) %>% 
+                    P = stringr::str_split_i(formula, "P", 2) %>%
+                      sub("\\D.*", "", .) %>%
                       as.num(),
                     P = dplyr::case_when(grepl("P", formula) & is.na(P) ~ 1,
                                          TRUE ~ P),
-                    S = stringr::str_split_i(formula, "S", 2) %>% 
-                      sub("\\D.*", "", .) %>% 
+                    S = stringr::str_split_i(formula, "S", 2) %>%
+                      sub("\\D.*", "", .) %>%
                       as.num(),
                     S = dplyr::case_when(grepl("S", formula) & is.na(S) ~ 1,
                                          TRUE ~ S),
                     ion_effect = gsub("\\[M|\\].*", "", ion),
-                    delta_H = sub(".*([+-]\\d*H).*", "\\1", ion_effect) %>% 
-                      sub("[-+]\\d[^H].*|[-+][A-G, I-Z].*", "", .) %>% 
-                      sub("H", "", .) %>% 
-                      sub("^-$", -1, .) %>% 
-                      sub("^\\+$", 1, .) %>% 
-                      as.num())
-    df.l$delta_H[df.l$ion_effect == "+NH4"] <- 4
-    df.l <- df.l %>% 
-      dplyr::mutate(delta_N = sub(".*([+-]\\d*N[^a]).*", "\\1", ion_effect) %>% 
-                      sub("[+-]Na", "", .) %>% 
-                      sub("[-+]\\d[^N].*|[-+][A-M, O-Z].*|[A-M, O-Z]", "", .) %>% 
-                      sub("N", "", .) %>% 
-                      sub("^-$", -1, .) %>% 
-                      sub("^\\+$", 1, .) %>% 
+                    delta_H = sub(".*([+-]\\d*H).*", "\\1", ion_effect) %>%
+                      sub("[-+]\\d[^H].*|[-+][A-G, I-Z].*", "", .) %>%
+                      sub("H", "", .) %>%
+                      sub("^-$", -1, .) %>%
+                      sub("^\\+$", 1, .) %>%
+                      as.num(),
+                    multiple_ammonium = dplyr::case_when(grepl('\\dNH4', ion) ~
+                                                           stringr::str_split_i(ion_effect,
+                                                                                '\\+|N', 2) %>%
+                                                           as.numeric(),
+                                                         TRUE ~ NA),
+                    delta_H = dplyr::case_when(grepl('\\dNH4', ion) ~
+                                                 delta_H + (multiple_ammonium*4),
+                                               grepl('\\+NH4', ion) ~ 4,
+                                               grepl('\\+CHOO', ion) ~ 1,
+                                               TRUE ~ delta_H),
+                    delta_N = sub(".*([+-]\\d*N[^a]).*", "\\1", ion_effect) %>%
+                      sub("[+-]Na", "", .) %>%
+                      sub("[-+]\\d[^N].*|[-+][A-M, O-Z].*|[A-M, O-Z]", "", .) %>%
+                      sub("N", "", .) %>%
+                      sub("^-$", -1, .) %>%
+                      sub("^\\+$", 1, .) %>%
                       as.num(),
-                    delta_Cl = sub(".*([+-]\\d*Cl).*", "\\1", ion_effect) %>%  
-                      sub("[-+]\\d[^Cl].*|[-+][A-B, D-Z].*", "", .) %>% 
-                      sub("Cl", "", .) %>% 
-                      sub("^-$", "-1", .) %>% 
-                      sub("^\\+$", "1", .) %>% 
+                    delta_Cl = sub(".*([+-]\\d*Cl).*", "\\1", ion_effect) %>%
+                      sub("[-+]\\d[^Cl].*|[-+][A-B, D-Z].*", "", .) %>%
+                      sub("Cl", "", .) %>%
+                      sub("^-$", "-1", .) %>%
+                      sub("^\\+$", "1", .) %>%
                       as.num(na.strings = "+CHOO"),
-                    delta_Na = sub(".*([+-]\\d*Na).*", "\\1", ion_effect) %>% 
-                      sub("[-+]\\d[^Na].*|[-+][A-M, O-Z].*", "", .) %>% 
-                      sub("Na", "", .) %>% 
-                      sub("^-$", -1, .) %>% 
-                      sub("^\\+$", 1, .) %>% 
+                    delta_Na = sub(".*([+-]\\d*Na).*", "\\1", ion_effect) %>%
+                      sub("[-+]\\d[^Na].*|[-+][A-M, O-Z].*", "", .) %>%
+                      sub("Na", "", .) %>%
+                      sub("^-$", -1, .) %>%
+                      sub("^\\+$", 1, .) %>%
                       as.num(na.strings = "+NH4"),
-                    delta_K = sub(".*([+-]\\d*K).*", "\\1", ion_effect) %>% 
-                      sub("[-+]\\d[^K].*|[-+][A-J, L-Z].*", "", .) %>% 
-                      sub("K", "", .) %>% 
-                      sub("^-$", -1, .) %>% 
-                      sub("^\\+$", 1, .) %>% 
-                      as.num())
+                    delta_K = sub(".*([+-]\\d*K).*", "\\1", ion_effect) %>%
+                      sub("[-+]\\d[^K].*|[-+][A-J, L-Z].*", "", .) %>%
+                      sub("K", "", .) %>%
+                      sub("^-$", -1, .) %>%
+                      sub("^\\+$", 1, .) %>%
+                      as.num(),
+                    delta_C = dplyr::case_when(grepl('\\+CHOO', ion) ~ 1,
+                                               TRUE ~ 0),
+                    delta_O = dplyr::case_when(grepl('\\+CHOO', ion) ~ 2,
+                                               TRUE ~ 0))
     df.l[is.na(df.l)] <- 0
-    df.l <- df.l %>% 
-      dplyr::mutate(ion_formula = paste0("C", C, 
+    df.l <- df.l %>%
+      dplyr::mutate(ion_formula = paste0("C", C + delta_C,
                                          "Cl", delta_Cl,
                                          "H", H + delta_H,
                                          "K", delta_K,
                                          "N", N + delta_N,
                                          "Na", delta_Na,
-                                         "O", O,
-                                         "S", S, "P", P) %>% 
-                      gsub("[A-Z]0|Na0|Cl0", "", .) %>% 
+                                         "O", O + delta_O,
+                                         "S", S, "P", P) %>%
+                      gsub("[A-Z]0|Na0|Cl0", "", .) %>%
                       gsub("(\\b|\\D)1(\\b|\\D)", "\\1\\2", .))
-    df <- df.l %>% 
-      dplyr::select(!matches("delta_|^[[:upper:]][a,c]?$|_effect"))
-
+    df <- df.l %>%
+      dplyr::select(!matches("delta_|^[[:upper:]][a,c]?$|_effect|multiple_")) %>%
+      dplyr::mutate(charge = stringr::str_split_i(ion, '\\]', 2) %>%
+                      sub('\\+$', '+1', .) %>%
+                      sub('\\-$', '-1', .))
+
   }
-  
+
   if (nrow(df) == 0){
     warning('Output has zero rows! Check your scan range, adducts/polarity and DP range are sensible')
   }

diff --git a/inst/example_data/M31_20230717_stds_DDA_neg_06.mzML.zip b/inst/example_data/M31_20230717_stds_DDA_neg_06.mzML.zip
diff --git a/inst/example_data/M31_20230717_stds_MS1_neg_05.mzML.zip b/inst/example_data/M31_20230717_stds_MS1_neg_05.mzML.zip
diff --git a/inst/example_data/M31_20230718_stds_DDA_neg_04.mzML.zip b/inst/example_data/M31_20230718_stds_DDA_neg_04.mzML.zip