diff --git a/R/glycoAnnotate.R b/R/glycoAnnotate.R index 654b1ee..7a317d8 100644 --- a/R/glycoAnnotate.R +++ b/R/glycoAnnotate.R @@ -166,42 +166,52 @@ glycoAnnotate <- function(data, return(ppm) } pred_table <- pred_table %>% - dplyr::mutate(mzmin = mz - ppm_to_mz(mz, error), - mzmax = mz + ppm_to_mz(mz, error)) + dplyr::mutate(mzmin_match = mz - ppm_to_mz(mz, error), + mzmax_match = mz + ppm_to_mz(mz, error)) } if(error_units == 'Da'){ pred_table <- pred_table %>% - dplyr::mutate(mzmin = mz - error, - mzmax = mz + error) + dplyr::mutate(mzmin_match = mz - error, + mzmax_match = mz + error) } + #rename mz column in pred_table + names(pred_table)[names(pred_table) == 'mz'] <- 'mz_pred' + #run annotation message("Starting annotation with predictions against data") if(!is.null(mzmin_column) & !is.null(mzmax_column)){ - if (mzmin_column != "mzmin"){ - names(data)[names(data) == mzmin_column] <- "mzmin" - } - if (mzmax_column != "mzmax"){ - names(data)[names(data) == mzmax_column] <- "mzmax" - } + data$mzmin_match <- data[, mzmin_column] + data$mzmax_match <- data[, mzmax_column] + data.table::setDT(data) data.table::setDT(pred_table) - data.table::setkey(pred_table, mzmin, mzmax) + data.table::setkey(pred_table, mzmin_match, mzmax_match) data_annot <- data.table::foverlaps(data, pred_table) } if(is.null(mzmin_column) & is.null(mzmax_column)){ data <- data %>% - dplyr::mutate(mzmin = get(mz_column), - mzmax = get(mz_column)) + dplyr::mutate(mzmin_match = get(mz_column), + mzmax_match = get(mz_column)) data.table::setDT(data) data.table::setDT(pred_table) - data.table::setkey(pred_table, mzmin, mzmax) + data.table::setkey(pred_table, mzmin_match, mzmax_match) data_annot <- data.table::foverlaps(data, pred_table) } + #calculate mass error + if(mz_column == 'mz'){ + data_annot <- data_annot %>% + dplyr::mutate(mass_error = abs(mz - mz_pred)) + } + if(mz_column != 'mz'){ + data_annot <- data_annot %>% + dplyr::mutate(mass_error = abs(get(mz_column) - mz_pred)) + } + #collapse annotations data.table::setDF(data_annot) if(isTRUE(collapse) & nrow(data_annot) > nrow(data)){ @@ -232,40 +242,10 @@ glycoAnnotate <- function(data, } #format final df - if(isFALSE(collapse)){ - data_annot <- data_annot %>% - dplyr::select(!c('mzmin', 'mzmax')) - } - if('mz' %in% names(pred_table) & 'mz' %in% names(data)){ - if(isFALSE(collapse)){ - data_annot <- data_annot %>% - dplyr::rename(mz_pred = mz) - } - } - if(!is.null(mzmin_column)){ - if ("i.mzmin" %in% names(data_annot)){ - names(data_annot)[names(data_annot) == "i.mzmin"] <- mzmin_column - } - } - if(!is.null(mzmax_column)){ - if("i.mzmax" %in% names(data_annot)){ - names(data_annot)[names(data_annot) == "i.mzmax"] <- mzmax_column + data_annot <- data_annot %>% + dplyr::select(!any_of(c('mzmin_match', 'mzmax_match', + 'i.mzmin_match', 'i.mzmax_match'))) - } - } - - if(is.null(mzmin_column)){ - if ("i.mzmin" %in% names(data_annot)){ - data_annot <- data_annot %>% - dplyr::select(!'i.mzmin') - } - } - if(is.null(mzmax_column)){ - if("i.mzmax" %in% names(data_annot)){ - data_annot <- data_annot %>% - dplyr::select(!'i.mzmax') - } - } return(data_annot) } diff --git a/R/glycoPredict.R b/R/glycoPredict.R index bf1c40c..6f7c0db 100644 --- a/R/glycoPredict.R +++ b/R/glycoPredict.R @@ -2,12 +2,12 @@ #' #' @include setClass.R #' -#' @description \code{glycoPredict()} predicts all possible glycan within the +#' @description \code{glycoPredict()} predicts all possible glycan within the #' constraints set by the \code{glycoPredictParam} object. #' @param param A \code{glycoPredictParam} object. See \link[GlycoAnnotateR]{glycoPredictParam} #' -#' @export -#' +#' @export +#' #' @examples #' gpp <- glycoPredictParam() #' gpp@@dp <- c(1,7) @@ -16,19 +16,19 @@ #' gpp@@modifications <- c('sulphate', 'carboxylicacid') #' gpp@@double_sulphate <- TRUE #' predicted.df <- glycoPredict(param = gpp) -#' -#' @details -#' \code{glycoPredict()} is used to predict masses and mass to charge ratios of all theoretically -#' possible glycans within a set of constraining parameters (defined in the -#' \code{glycoPredictParam} object). This package was written -#' for annotation of mass spec data (especially LC-MS) but if used for -#' other purposes either ionisation mode and very wide scan ranges can be given. -#' The function works by sourcing a python file and then using the function +#' +#' @details +#' \code{glycoPredict()} is used to predict masses and mass to charge ratios of all theoretically +#' possible glycans within a set of constraining parameters (defined in the +#' \code{glycoPredictParam} object). This package was written +#' for annotation of mass spec data (especially LC-MS) but if used for +#' other purposes either ionisation mode and very wide scan ranges can be given. +#' The function works by sourcing a python file and then using the function #' encoded in the python script. -#' -#' @seealso +#' +#' @seealso #' glycoAnnotateR::glycoPredictParam() -#' +#' glycoPredict <- function(param){ path <- paste(system.file(package="GlycoAnnotateR"), "sugarMassesPredict.py", sep="/") @@ -56,9 +56,9 @@ glycoPredict <- function(param){ naming = as.list(param@naming) glycan_linkage = as.list(param@glycan_linkage) modification_limits = param@modification_limits - + message(paste("Glycans will be predicted according to the following glycoPredictParam() object:\n", str(param))) - + df <- predict_sugars(dp = dp, polarity = polarity, scan_range = scan_range, pent_option = pent_option, modifications = modifications, @@ -77,89 +77,104 @@ glycoPredict <- function(param){ x[na] = NA_real_ x } - df.l <- df %>% + df.l <- df %>% #make long tidyr::pivot_longer(cols = starts_with("[M"), names_to = "ion", - values_to = "mz") %>% + values_to = "mz") %>% #remove ions outside scan range - tidyr::drop_na(mz) %>% + tidyr::drop_na(mz) %>% #calculate ion formula - dplyr::mutate(C = stringr::str_split_i(formula, "C", 2) %>% - sub("\\D.*", "", .) %>% + dplyr::mutate(C = stringr::str_split_i(formula, "C", 2) %>% + sub("\\D.*", "", .) %>% as.num(), - H = stringr::str_split_i(formula, "H", 2) %>% - sub("\\D.*", "", .) %>% + H = stringr::str_split_i(formula, "H", 2) %>% + sub("\\D.*", "", .) %>% as.num(), - N = stringr::str_split_i(formula, "N", 2) %>% - sub("\\D.*", "", .) %>% + N = stringr::str_split_i(formula, "N", 2) %>% + sub("\\D.*", "", .) %>% as.num(), N = dplyr::case_when(grepl("N", formula) & is.na(N) ~ 1, TRUE ~ N), - O = stringr::str_split_i(formula, "O", 2) %>% - sub("\\D.*", "", .) %>% + O = stringr::str_split_i(formula, "O", 2) %>% + sub("\\D.*", "", .) %>% as.num(), - P = stringr::str_split_i(formula, "P", 2) %>% - sub("\\D.*", "", .) %>% + P = stringr::str_split_i(formula, "P", 2) %>% + sub("\\D.*", "", .) %>% as.num(), P = dplyr::case_when(grepl("P", formula) & is.na(P) ~ 1, TRUE ~ P), - S = stringr::str_split_i(formula, "S", 2) %>% - sub("\\D.*", "", .) %>% + S = stringr::str_split_i(formula, "S", 2) %>% + sub("\\D.*", "", .) %>% as.num(), S = dplyr::case_when(grepl("S", formula) & is.na(S) ~ 1, TRUE ~ S), ion_effect = gsub("\\[M|\\].*", "", ion), - delta_H = sub(".*([+-]\\d*H).*", "\\1", ion_effect) %>% - sub("[-+]\\d[^H].*|[-+][A-G, I-Z].*", "", .) %>% - sub("H", "", .) %>% - sub("^-$", -1, .) %>% - sub("^\\+$", 1, .) %>% - as.num()) - df.l$delta_H[df.l$ion_effect == "+NH4"] <- 4 - df.l <- df.l %>% - dplyr::mutate(delta_N = sub(".*([+-]\\d*N[^a]).*", "\\1", ion_effect) %>% - sub("[+-]Na", "", .) %>% - sub("[-+]\\d[^N].*|[-+][A-M, O-Z].*|[A-M, O-Z]", "", .) %>% - sub("N", "", .) %>% - sub("^-$", -1, .) %>% - sub("^\\+$", 1, .) %>% + delta_H = sub(".*([+-]\\d*H).*", "\\1", ion_effect) %>% + sub("[-+]\\d[^H].*|[-+][A-G, I-Z].*", "", .) %>% + sub("H", "", .) %>% + sub("^-$", -1, .) %>% + sub("^\\+$", 1, .) %>% + as.num(), + multiple_ammonium = dplyr::case_when(grepl('\\dNH4', ion) ~ + stringr::str_split_i(ion_effect, + '\\+|N', 2) %>% + as.numeric(), + TRUE ~ NA), + delta_H = dplyr::case_when(grepl('\\dNH4', ion) ~ + delta_H + (multiple_ammonium*4), + grepl('\\+NH4', ion) ~ 4, + grepl('\\+CHOO', ion) ~ 1, + TRUE ~ delta_H), + delta_N = sub(".*([+-]\\d*N[^a]).*", "\\1", ion_effect) %>% + sub("[+-]Na", "", .) %>% + sub("[-+]\\d[^N].*|[-+][A-M, O-Z].*|[A-M, O-Z]", "", .) %>% + sub("N", "", .) %>% + sub("^-$", -1, .) %>% + sub("^\\+$", 1, .) %>% as.num(), - delta_Cl = sub(".*([+-]\\d*Cl).*", "\\1", ion_effect) %>% - sub("[-+]\\d[^Cl].*|[-+][A-B, D-Z].*", "", .) %>% - sub("Cl", "", .) %>% - sub("^-$", "-1", .) %>% - sub("^\\+$", "1", .) %>% + delta_Cl = sub(".*([+-]\\d*Cl).*", "\\1", ion_effect) %>% + sub("[-+]\\d[^Cl].*|[-+][A-B, D-Z].*", "", .) %>% + sub("Cl", "", .) %>% + sub("^-$", "-1", .) %>% + sub("^\\+$", "1", .) %>% as.num(na.strings = "+CHOO"), - delta_Na = sub(".*([+-]\\d*Na).*", "\\1", ion_effect) %>% - sub("[-+]\\d[^Na].*|[-+][A-M, O-Z].*", "", .) %>% - sub("Na", "", .) %>% - sub("^-$", -1, .) %>% - sub("^\\+$", 1, .) %>% + delta_Na = sub(".*([+-]\\d*Na).*", "\\1", ion_effect) %>% + sub("[-+]\\d[^Na].*|[-+][A-M, O-Z].*", "", .) %>% + sub("Na", "", .) %>% + sub("^-$", -1, .) %>% + sub("^\\+$", 1, .) %>% as.num(na.strings = "+NH4"), - delta_K = sub(".*([+-]\\d*K).*", "\\1", ion_effect) %>% - sub("[-+]\\d[^K].*|[-+][A-J, L-Z].*", "", .) %>% - sub("K", "", .) %>% - sub("^-$", -1, .) %>% - sub("^\\+$", 1, .) %>% - as.num()) + delta_K = sub(".*([+-]\\d*K).*", "\\1", ion_effect) %>% + sub("[-+]\\d[^K].*|[-+][A-J, L-Z].*", "", .) %>% + sub("K", "", .) %>% + sub("^-$", -1, .) %>% + sub("^\\+$", 1, .) %>% + as.num(), + delta_C = dplyr::case_when(grepl('\\+CHOO', ion) ~ 1, + TRUE ~ 0), + delta_O = dplyr::case_when(grepl('\\+CHOO', ion) ~ 2, + TRUE ~ 0)) df.l[is.na(df.l)] <- 0 - df.l <- df.l %>% - dplyr::mutate(ion_formula = paste0("C", C, + df.l <- df.l %>% + dplyr::mutate(ion_formula = paste0("C", C + delta_C, "Cl", delta_Cl, "H", H + delta_H, "K", delta_K, "N", N + delta_N, "Na", delta_Na, - "O", O, - "S", S, "P", P) %>% - gsub("[A-Z]0|Na0|Cl0", "", .) %>% + "O", O + delta_O, + "S", S, "P", P) %>% + gsub("[A-Z]0|Na0|Cl0", "", .) %>% gsub("(\\b|\\D)1(\\b|\\D)", "\\1\\2", .)) - df <- df.l %>% - dplyr::select(!matches("delta_|^[[:upper:]][a,c]?$|_effect")) - + df <- df.l %>% + dplyr::select(!matches("delta_|^[[:upper:]][a,c]?$|_effect|multiple_")) %>% + dplyr::mutate(charge = stringr::str_split_i(ion, '\\]', 2) %>% + sub('\\+$', '+1', .) %>% + sub('\\-$', '-1', .)) + } - + if (nrow(df) == 0){ warning('Output has zero rows! Check your scan range, adducts/polarity and DP range are sensible') } diff --git a/inst/example_data/M31_20230717_stds_DDA_neg_06.mzML.zip b/inst/example_data/M31_20230717_stds_DDA_neg_06.mzML.zip new file mode 100644 index 0000000..ef573d1 Binary files /dev/null and b/inst/example_data/M31_20230717_stds_DDA_neg_06.mzML.zip differ diff --git a/inst/example_data/M31_20230717_stds_MS1_neg_05.mzML.zip b/inst/example_data/M31_20230717_stds_MS1_neg_05.mzML.zip new file mode 100644 index 0000000..5c37d7b Binary files /dev/null and b/inst/example_data/M31_20230717_stds_MS1_neg_05.mzML.zip differ diff --git a/inst/example_data/M31_20230718_stds_DDA_neg_04.mzML.zip b/inst/example_data/M31_20230718_stds_DDA_neg_04.mzML.zip new file mode 100644 index 0000000..a8287ee Binary files /dev/null and b/inst/example_data/M31_20230718_stds_DDA_neg_04.mzML.zip differ diff --git a/inst/sugarMassesPredict.py b/inst/sugarMassesPredict.py index bf9c186..d7f9c83 100644 --- a/inst/sugarMassesPredict.py +++ b/inst/sugarMassesPredict.py @@ -26,25 +26,25 @@ 'sulphate', 'aminopentyllinker'] # hexose and water masses to build molecule base -hex_mass = 180.06339 -water_mass = 18.010565 +hex_mass = 180.0633881 +water_mass = 18.01056468 # mass differences for modifications -pent_mdiff = -30.010566 +pent_mdiff = -30.01056468 modifications_mdiff = { - "sulphate": 79.956817, + "sulphate": 79.95681486, "anhydrobridge": -water_mass, - "omethyl": 14.01565, - "carboxylicacid": 13.979265, - "sialicacid" : 129.042594, - "nacetyl": 41.026549, - "oacetyl": 42.010565, - "phosphate": 79.966333, - "deoxy": -15.994915, + "omethyl": 14.01565006, + "carboxylicacid": 13.97926456, + "sialicacid" : 129.0425931, + "nacetyl": 41.0265491, + "oacetyl": 42.01056468 , + "phosphate": 79.96633052 , + "deoxy": -15.99491462, "unsaturated": -water_mass, - "alditol": 2.015650, - "amino": -0.984016, + "alditol": 2.015650064, + "amino": -0.984015588, "dehydrated": -water_mass, - "aminopentyllinker": 85.089148 + "aminopentyllinker": 85.08914935 } # mass differences for labels @@ -64,15 +64,16 @@ # mass differences for ions ion_mdiff = { - "H": 1.00782500000003, - "Na": 22.98977, - "Cl": 34.968853, - "CHOO": 44.997655, - "NH4": 18.034374, - "K": 38.963708, - "Ca": 39.962591 + "H": 1.007825032, + "Na": 22.98976928, + "Cl": 34.96885268, + "CHOO": 44.997654272, + "NH4": 18.034374128, + "K": 38.96370668, + "Ca": 39.96259098 } -e_mdiff = 0.000548579909 + +e_mdiff = 0.00054857990924 # formulas @@ -839,7 +840,7 @@ def getModificationNumbers(dp_range_list, m, pent_option, modifications): Me_ions = [x + 1 for x in H_ions] ions = list("[M-" + pd.Series(H_ions).astype(str) + "H+" + pd.Series(Me_ions).astype(str) + "Na]+") for i in range(len(ions)): - masses_anionic[ions[i]] = masses_anionic.mass - (ion_mdiff['H'] * i) + (ion_mdiff['Na'] * (i + 1)) + e_mdiff + masses_anionic[ions[i]] = masses_anionic.mass - (ion_mdiff['H'] * i) + (ion_mdiff['Na'] * (i + 1)) - e_mdiff masses_anionic[ions[i]] = masses_anionic[ions[i]].where(masses_anionic['nmod_anionic'] >= (i)) masses_anionic = masses_anionic.rename({'[M-0H+1Na]+': '[M+Na]+'}, axis=1) masses_anionic = masses_anionic.rename({'[M-1H+2Na]+': '[M-H+2Na]+'}, axis=1) @@ -849,7 +850,7 @@ def getModificationNumbers(dp_range_list, m, pent_option, modifications): ions = list("[M-" + pd.Series(H_ions).astype(str) + "H+" + pd.Series(Me_ions).astype(str) + "K]+") for i in range(len(ions)): masses_anionic[ions[i]] = masses_anionic.mass - (ion_mdiff['H'] * i) + ( - ion_mdiff['K'] * (i + 1)) + e_mdiff + ion_mdiff['K'] * (i + 1)) - e_mdiff masses_anionic[ions[i]] = masses_anionic[ions[i]].where(masses_anionic['nmod_anionic'] >= (i)) masses_anionic = masses_anionic.rename({'[M-0H+1K]+': '[M+K]+'}, axis=1) masses_anionic = masses_anionic.rename({'[M-1H+2K]+': '[M-H+2K]+'}, axis=1) @@ -859,7 +860,7 @@ def getModificationNumbers(dp_range_list, m, pent_option, modifications): ions = list("[M-" + pd.Series(H_ions).astype(str) + "H+" + pd.Series(Me_ions).astype(str) + "NH4]+") for i in range(len(ions)): masses_anionic[ions[i]] = masses_anionic.mass - (ion_mdiff['H'] * i) + ( - ion_mdiff['NH4'] * (i + 1)) + e_mdiff + ion_mdiff['NH4'] * (i + 1)) - e_mdiff masses_anionic[ions[i]] = masses_anionic[ions[i]].where(masses_anionic['nmod_anionic'] >= (i)) masses_anionic = masses_anionic.rename({'[M-0H+1NH4]+': '[M+NH4]+'}, axis=1) masses_anionic = masses_anionic.rename({'[M-1H+2NH4]+': '[M-H+2NH4]+'}, axis=1) diff --git a/vignettes/lcms_annotation_tutorial.Rmd b/vignettes/lcms_annotation_tutorial.Rmd new file mode 100644 index 0000000..8d4cb41 --- /dev/null +++ b/vignettes/lcms_annotation_tutorial.Rmd @@ -0,0 +1,944 @@ +--- +title: "Glycan annotation with GlycoAnnotateR" +author: "Margot Bligh" +date: "2024-01-31" +output: + html_document: + toc: true + number_sections: true +vignette: > + %\VignetteIndexEntry{Glycan annotation with GlycoAnnotateR} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +# Introduction + +This is a tutorial for the annotation of glycans in LC-MS data in R using XCMS, CAMERA and GlycoAnnotateR. Other data processing pipelines (e.g. mzMine) can be used but are not covered in this tutorial. For the annotation step is all that is needed is a dataframe with one row per *m/z*. This tutorial assumes a basic understanding of the XMCS processing pipeline - for a great explanation of LC-MS data processing with XCMS, check out this [tutorial](https://bioconductor.org/packages/release/bioc/vignettes/xcms/inst/doc/xcms.html). + + +This tutorial will use example data, which you can find here on Github under 'inst/example_data'. The .mzML files should be downloaded and unzipped. The data consists of three mzML files. All contain HILIC-ESI-Orbitrap MS data. The same mixture of commercial oligosaccharide standards was run three times in negative mode: once with full MS only and twice with DDA-MS/MS. + +We will first set up the environment by loading the required packages. xcms is used for the data processing (peak picking and grouping, chromatogram extraction etc) and CAMERA is used for isotope detection. xcms can be installed from BioConductor using BiocManager. + +# Load required packages + +```{r import packages, message=FALSE, warning=FALSE} +library(GlycoAnnotateR) +library(tidyverse) +library(xcms) +library(CAMERA) +library(ggplot2) +library(ggrepel) +library(scales) +``` + +# Pre-processing the data +## Import and prepare the data + +### Import data +Now we import the data as an 'OnDiskMSnExp' object. A metadata or phenodata frame is created here using the filepath names. You may need to change the path to the directory containing the example data. + + +```{r import data} +#get filepaths of mzML files +fp <- dir(path = '../inst/example_data/', pattern = '.mzML$', full.names = T) + +#make a metadata df +pd <- data.frame(sample_name = gsub(".mzML|.*1[78]_", "", basename(fp)), + frag = str_split_i(basename(fp), "_", 4), + sample_type = str_split_i(basename(fp), "_", 3)) + +#print the metadata df +pd + +#read in data with metadata df as phenodata +all_data <- readMSData(files = fp, + pdata = new("NAnnotatedDataFrame", pd), + mode = "onDisk") +``` + +### Filter by retention time + +We will now filter the data to only retain scans between 1 minute (sample injection) and 30 minutes (solvent B ramped up to re-condition the column). This is not strictly necessary but increases the speed of processing by dropping unneeded data. + +```{r filter retention time} +#filter retention time +all_data <- filterRt(all_data, rt = c(1*60, 30*60)) +``` + + +### Split data + +We will now split the data in two. One object has only MS1 scan events (data_ms1) and one object has all scan events (MS1 and MS2). CAMERA peak grouping by retention time for isotope picking requires objects with only MS1 scans. + +```{r split data} +#make MS1 only data object +data_ms1 <- all_data[all_data@featureData@data$msLevel == 1] +data_ms2 <- all_data +``` + +## Peak picking, refining and grouping + +### Peak picking + +Peak picking is done with the CentWave algorithm, for details see [Tautenhahn et al., 2008](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-504). CentWave parameters should be optimised for each dataset - this can be done with the [IPO package](https://bioconductor.org/packages/release/bioc/html/IPO.html). + +We will only perform peak picking on the MS1 object. Later, MS2 spectra associated with MS1 features annotated as glycans will be extracted, using the MS1 peaks. + +```{r pick peaks} +#create parameter object +cwp <- CentWaveParam(ppm = 4, peakwidth = c(10, 60)) +#pick peaks +data_ms1_pks <- findChromPeaks(data_ms1, cwp) +``` + + +Peak picking should always be optimised and checked extensively, as this will have a massive impact on the downstream data interpretation. Here, we will check an example chromatogram for a dihexose [M+Cl]- adduct. We will quickly use glycoPredict from GlycoAnnotateR to get the correct *m/z* value. +```{r extract chr, warning=FALSE, message=FALSE} +#calculate m/z value for dihexose Cl adduct +df_dihexose <- glycoPredict(param = glycoPredictParam(dp = c(2,2), + polarity = 'neg', + adducts = 'Cl')) + +#define function for conversion between ppm and mz +ppm_to_mz = function(mz, noise){ + ppm = mz / 1000000 * noise + return(ppm) +} + +#extract chromatogram +- 4 ppm +chr_dihexose_pk <- chromatogram(data_ms1_pks, + mz = c(df_dihexose$mz-ppm_to_mz(df_dihexose$mz, 4), + df_dihexose$mz+ppm_to_mz(df_dihexose$mz, 4))) +``` + +Picked peaks are indicated by the grey shading in the chromatograms. We can see that the peak was picked relatively nicely for the first two samples, but the peak is split in the third sample. + +```{r plot chr, fig.show='hold', fig.width=6, fig.height=3} +#plot chromatogram for each sample +plot(chr_dihexose_pk[[1]]) +plot(chr_dihexose_pk[[2]]) +plot(chr_dihexose_pk[[3]]) +``` + +In the XIC plots, we can see the defined peaks in the red boxes. 4 ppm seems to be appropriate, as the majority of *m/z* values within the peak fall within the red boxes. +```{r XIC check peaks, message=FALSE, warning=FALSE, error=FALSE, fig.width=6, fig.height=6} +#check peak definition by looking at XIC plots +#extract data for larger m/z slice than 4 ppm to be able to see data around the peak +data_ms1_pks %>% + filterRt(rt = c(350, 550)) %>% + filterMz(mz = c(df_dihexose$mz-ppm_to_mz(df_dihexose$mz, 6), + df_dihexose$mz+ppm_to_mz(df_dihexose$mz, 6))) %>% + plot(type = "XIC") +``` + + +### Peak merging + +We will merge the split peaks using the 'refineChromPeaks' function of xcms. Again, the parameters used will vary for each dataset. + +```{r merge neighbouring peaks} +#create parameter object +mpp <- MergeNeighboringPeaksParam(expandRt = 10, minProp = 0.5) +#merge peaks +data_ms1_pks_mg <- refineChromPeaks(data_ms1_pks, mpp) +``` + +We can see in the chromatograms after peak merging that there is a single dihexose peak picked for each sample. The width of the peak captured in each sample is slightly different. We will not worry about that too much now, as the peaks will be grouped into features in the next step. + +```{r check merged chromatograms, message=FALSE, warning=FALSE, error=FALSE, fig.width=6, fig.height=6, fig.show='hold'} +#extract dihexose chromatogram +- 4 ppm +chr_dihexose_pk_mg <- chromatogram(data_ms1_pks_mg, + mz = c(df_dihexose$mz-ppm_to_mz(df_dihexose$mz, 4), + df_dihexose$mz+ppm_to_mz(df_dihexose$mz, 4))) +#plot chromatogram for each sample +plot(chr_dihexose_pk_mg[[1]]) +plot(chr_dihexose_pk_mg[[2]]) +plot(chr_dihexose_pk_mg[[3]]) + +#check peak definition by looking at XIC plots +#extract data for larger m/z slice than 3 ppm to be able to see data around the peak +data_ms1_pks_mg %>% + filterRt(rt = c(400, 500)) %>% + filterMz(mz = c(df_dihexose$mz-ppm_to_mz(df_dihexose$mz, 6), + df_dihexose$mz+ppm_to_mz(df_dihexose$mz, 6))) %>% + plot(type = "XIC") + +``` + +### Peak grouping (correspondence) + +We will now group the peaks across samples by density into features. In this example there will only be one sample group, as the data contains the same sample run three times. We will double check the parameters first and then group the peaks. Only peaks in 2 of the three runs will be grouped into features. + +```{r peak grouping, message=FALSE, warning=FALSE, error=FALSE, fig.width=6, fig.height=4, fig.show='hold'} +#set the parameters for peak grouping +pdp <- PeakDensityParam(sampleGroups = data_ms1_pks_mg$sample_type, + minFraction = 2/3, bw = 15, binSize = 0.01) +#check the parameters using the dehexose chromatogram +plotChromPeakDensity(chr_dihexose_pk_mg, + param = pdp, + peakPch = 16) + +#group peaks +data_ms1_pks_mg_gp <- groupChromPeaks(data_ms1_pks_mg, pdp) +``` + + + +### Gap filling +The final step in our pre-processing workflow is gap filling. This aims to 'rescue' peaks that were not identified in some samples, but which were identified in other samples and have been grouped into a feature. So, for example, there are tetrahexose [M-H]- peaks in samples 1 and 2, but there is not peak in the third sample. + +```{r gap filling prefilled chromatograms, message=FALSE, warning=FALSE, fig.show='hold', fig.width=6, fig.height=4} +#calculate m/z value for deprotonated tetrahexose +df_tetrahexose <- glycoPredict(param = glycoPredictParam(dp = c(4,4), + polarity = 'neg', + adducts = 'H')) + +#extract chromatogram +- 4 ppm +chr_tetrahexose_pk <- chromatogram(data_ms1_pks_mg_gp, + mz = c(df_tetrahexose$mz-ppm_to_mz(df_tetrahexose$mz, 4), + df_tetrahexose$mz+ppm_to_mz(df_tetrahexose$mz, 4))) +#plot chromatogram for each sample +plot(chr_tetrahexose_pk[[1]]) +plot(chr_tetrahexose_pk[[2]]) +plot(chr_tetrahexose_pk[[3]]) + + +``` + + +We apply gap filling using the default parameters, and then re-extract the chromatograms, making sure that filled peaks are included. We can now see that there is a peak in all three samples. + +```{r gap filling post filling chromatograms, warning=FALSE, fig.show='hold', fig.width=6, fig.height=4} +#gap filling +data_ms1_pks_mg_gp_fill <- fillChromPeaks(data_ms1_pks_mg_gp) + +#extract chromatogram +- 4 ppm +chr_tetrahexose_pkfill <- chromatogram(data_ms1_pks_mg_gp_fill, + mz = c(df_tetrahexose$mz-ppm_to_mz(df_tetrahexose$mz, 4), + df_tetrahexose$mz+ppm_to_mz(df_tetrahexose$mz, 4)), + filled = T) #make sure filled is TRUE to include gap filling + +#plot chromatograms +plot(chr_tetrahexose_pkfill[[1]]) +plot(chr_tetrahexose_pkfill[[2]]) +plot(chr_tetrahexose_pkfill[[3]]) +``` + + + +# Isotope detection + +We will now use CAMERA for detection of isotopes. We first need to convert our `XCMSnExp` object to an `xcmsSet` object, which can the be used to construct an `xsAnnotate` object. + +```{r change to xcmsSet, message=FALSE} +#creat xcmsSet object +xset <- as(data_ms1_pks_mg_gp_fill, "xcmsSet") +#set sample names +sampnames(xset) <- pData(data_ms1_pks_mg_gp_fill)$sample_name +#set sample classes/groups +sampclass(xset) <- pData(data_ms1_pks_mg_gp_fill)$sample_type +``` + +Now we create the `xsAnnotate` object, group peaks by retention time, find isotopes, and generate a feature table containing the isotope information. Features are filtered so that at least 1 sample has an integrated peak area >5e5. + +```{r isotope detection} +#build xsAnnotate object +an <- xsAnnotate(xset) +#group peaks by retention time (full width half maximum) +an <- groupFWHM(an, perfwhm = 0.8) +#find isotopes +an <- findIsotopes(an, ppm = 3) +#get feature list and filter features by intensity values to remove those with only values below 5e5 +pl <-getPeaklist(an) %>% + filter(if_any(contains("stds"), ~ .x > 5e5)) +``` + +# Annotating the data + +## Making the parameter object + +We will now move forward with annotating our feature table to identify putative oligosaccharide ions. Here, we will first create the `glycoPredictParam` object, and then annotate the data. These two steps can be done in one, but we will first look at and understand the `glycoPredictParam` object. + +```{r creating the parameter object} +#create the glycoPredictParam object +gpp <- glycoPredictParam( + #degree of polymerisation range, here monomer to tetramer + dp = c(1,4), + #ionisation polarity + polarity = "neg", + #ionisation type (affects adduct types) + ion_type = "ESI", + #scan range during acquisition (here for MS1 scans) + scan_range = c(175,1400), + #expected modifications + modifications = c("alditol", + "sulphate", + "omethyl", + "anhydrobridge"), + #maximum average number of modifications per monomer + nmod_max = 1, + #adducts + adducts = c("H", "Cl")) +gpp +``` + +## Calculating compositions + +We can now use parameters defined in the `glycoPredictParam` object to 'predict' (or rather calculate) all possible glycans. You can directly run `glycoAnnotate` to annotate your peak list using the parameters, which will call `glycoPredict` in the background, but here we will demonstrate the output of `glycoPredict` by running it separately. + +```{r build calculated df, warning=FALSE} +#create a table of theoretical glycans and their ions +pred_df <- glycoPredict(param = gpp) +``` + +By default we have selected IUPAC naming and long output format, so glycan compositions are named with IUPAC abbreviations and there is one row per adduct/ion. 309 adducts from 134 possible compositions are calculated based on the constraints of the input parameters. + +```{r print number adducts comps} +#print the number of adducts and compositions +print(paste('There are', nrow(pred_df), + 'adducts/ions from', + length(unique(pred_df$`IUPAC name`)), + 'different compositions')) +``` + +There are 8 columns in the output: + +* dp = the degree of polymerisation (oligomer length) +* mass = the monisotopic exact mass of the neutral oligosaccharide +* formula = the sum formula of the neutral oligosaccharide +* IUPAC name = the name of the oligosaacharide, using common abbreviations from IUPAC (see [here](https://margotbligh.github.io/GlycoAnnotateR/)) for a summary of the naming +* ion = the adduct of the ion +* mz = the mass-to-charge ratio of the ion (*m/z*) +* ion_formula = the sum formula of the ion formed by the oligosaccharide +* charge = the charge state of the adduct + +```{r inspect comp table} +#inspect the output +head(pred_df) +``` + +## Annotate features + +We will now annotate the feature list generated after isotope detection using the parameters we defined. Because our feature list has minimum and maximum *m/z* values, we can use those values for annotation against theoretical values (the range for each theoretical values is created as the value +- the error in ppm or Da). This is done by checking for overlap of two ranges. For peak/feature lists with only an *m/z* value (mean or otherwise defined), or if you would like to only use this value, we can supply just the *m/z* column name. This will annotate by checking if the value is within the range of the theoretical values. The below figure summarises the two types of annotation possible: + +```{r image matching types, out.width = "400px", echo=FALSE} +knitr::include_graphics("matching_types.png") +``` + +We will try both types of annotation here to see how they work and how the output differs. We will choose NOT to collapse the annotations (i.e. if a feature receives multiple annotations, there will be one row per annotation) at this point so that we can look at all of the output information. Collapsing of annotations (so that there is one row per feature, and multiple annotations for the same feature are comma separated) can be done during this step or later. + +### Overlap of two ranges + + +Note that here we are supplying again the `glycoPredictParam` object, to demonstrate that annotation can be a single step process. However, given that we have already run `glycoPredict`, we could alternatively have supplied `pred_table = pred_df` and not provide a value for `param`. + +```{r overlap two ranges, message=FALSE, warning=FALSE} +#annotation is performed +pl_annot_overlap2ranges <- glycoAnnotate( + #feature or peak list to be annotated + data = pl, + #glycoPredictParam object + param = gpp, + #error unit and value + error = 2, error_units = 'ppm', + #should annotations be collapse + collapse = F, + #column containing m/z values + mz_column = 'mz', + #columns containing minimum and maximimum mz values + mzmin_column = 'mzmin', mzmax_column = 'mzmax') +``` + + +We can see that the output contains the same columns as the input (feature table), but now also with 7 additional columns - these come from the output of `glycoPredict`. The column containing the theoretical *m/z* values is now named 'mz_pred'. The 'mz' column contains the *m/z* values of the data. The output contains 380 rows - 1 more than the input (381). This means that some features received multiple annotations. 12 features in total received at least one annotation. + + +```{r output overlap two ranges, collapse=TRUE} +#see output +head(pl_annot_overlap2ranges) + +#print dimensions of input +dim(pl) + +#print dimensions of output +dim(pl_annot_overlap2ranges) + +#print number of features annotated +pl_annot_overlap2ranges %>% + drop_na(dp) %>% + distinct(mz) %>% + nrow() +``` + +### Value within a range + +For comparison, we will repeat the annotation but now only providing the *m/z* column, and therefore matching by the value within a range method. + +```{r value within range, message=F, warning=F} +#annotation is performed +pl_annot_valueInRange <- glycoAnnotate( + #feature or peak list to be annotated + data = pl, + #glycoPredictParam object + param = gpp, + #error unit and value + error = 3, error_units = 'ppm', + #should annotations be collapse + collapse = F, + #column containing m/z values + mz_column = 'mz') +``` + +Now 15 features have been annotated, and there is again one feature with multiple annotations. + +```{r output value within range, collapse=TRUE} +#see output +head(pl_annot_valueInRange) + +#print dimensions of input +dim(pl) + +#print dimensions of output +dim(pl_annot_valueInRange) + +#print number of features annotated +pl_annot_valueInRange %>% + drop_na(dp) %>% + distinct(mz) %>% + nrow() +``` + +In this tutorial we will proceed with the annotations from within a range matching, but it is open to the user to decide which is more appropriate to their data type. + +## Collapse annotations + +We will now collapse the annotations, so that there is one row per feature. We will collapse the columns with the IUPAC name and ion. + +```{r collapse annotations} +#define columns to collapse +collapse = c('IUPAC name', 'ion') +#define columns to keep, uncollapsed +noncollapse = names(pl)[names(pl) %in% names(pl_annot_valueInRange)] + +#perform collapsing +pl_annot_c <- glycoAnnotationsCollapse(pl_annot_valueInRange, + collapse_columns = collapse, + noncollapse_columns = noncollapse) +``` + + +The output now contains the same number of rows as the feature table, with one additional column called 'annotations' that contains the IUPAC name and ion separated by a colon. Where multiple annotations were assigned they are comma separated. + +```{r inspect collapsed annotations} +#inspect output +head(pl_annot_c) + +#print output dimensions +dim(pl_annot_c) +``` + +We will now filter the data to only retain annotated features. We can see that there is one feature (*m/z* = 403.0546, rt = 137 s) that has two annotations: 'Hex4 AnhydroBridge1 Sulfate2:[M-2H]-2' and 'Hex2 AnhydroBridge1 Sulfate1:[M-H]-'. + +```{r filter for annotated features} +pl_annotonly <- pl_annot_c %>% filter(annotations != 'NA:NA') +pl_annotonly[, c('mz', 'rt', 'annotations')] +``` + +## Inspect isotopes for ambiguous annotations + +We will see if we can use the detected isotopes to choose one of the two annotations as the more likely one. The annotated feature table is filtered to only the pcgroup of the feature with two annotations, and within that group only features detected as isotopes by CAMERA. The 'pcgroup' represents features grouped together based on retention time. + +```{r filter for pc group} +#get the pc group of the feature with two annotstions +pc = pl_annotonly$pcgroup[grepl(',', pl_annotonly$annotations)] + +#filter the annotated feature table to only that pcgroup +#and peaks with detected isotopes +iso_check <- pl_annot_valueInRange %>% + filter(pcgroup %in% pc & isotopes != '') +``` + + +There are a couple of things we can notice in the filtered df: + +* The mass error differences of the two annotations are the same to 10 decimal places (meaning this cannot differentiate the annotations) + +* There are M+1 (*m/z* 404.0576) and M+2 (*m/z* 405.0532) isotopes detected for molecule 42 (and the annotated feature was detected as the M isotope for molecule 42) + +* Each isotope has a mass increased of ~1 Da, indicating that the ion has a single charge, supporting the annotation of Hex2 AnhydroBridge1 Sulfate1 [M-H]- + + +```{r look at isotopes of multiply annotated features} +#inspect the dataframe +iso_check +``` + + +We can double check the isotope assignment by plotting the mass spectrum for one sample. + + +```{r spectrum isotopes, fig.height=5, fig.width=6} +#filter data +m1 <- all_data %>% + filterFile(which(pd$frag == 'MS1')) %>% #filter for sample 2 + filterRt(rt = c(min(iso_check$rtmin[1:4]), + max(iso_check$rtmax[1:4]))) %>% #filter by rt + filterMz(mz = c(min(iso_check$mzmin[1:4]-0.5), + max(iso_check$mzmax[1:4]+0.5))) #filter by mz + +#average spectra +m1_avg <- combineSpectra(m1, + fcol = 'fileIdx', + intensityFun = mean, + ppm = 3) + +#normalise with respect to ion with highest intensity +m1_avg_norm <- normalise(m1_avg, method = 'max') + +#make dataframe from spectrum +#add M, M1 and M2 annotations +m1_avg_norm_df <- data.frame(mz = m1_avg_norm[[1]]@mz, + int = m1_avg_norm[[1]]@intensity) %>% + mutate(isotope = case_when(mz > 403.054 & mz < 403.058 ~ 'M', + mz > 404.054 & mz < 404.059 ~ 'M+1', + mz > 405.053 & mz < 405.06 ~ 'M+2',)) + +#plot spectrum with annotated isotopes +ggplot(m1_avg_norm_df)+ + geom_segment(aes(x = mz, xend = mz, y = 0, yend= int)) + + geom_text(aes(x = mz, y = int+0.1, label = isotope)) + + labs(x = expression(italic('m/z')), y = "Normalised intensity")+ + scale_x_continuous(breaks = seq(402.2, 405.2, 0.2)) + + theme_light()+ + theme(axis.title = element_text(colour = 'black', size = 10, face = 'bold'), + axis.text = element_text(colour = 'black', size = 10), + axis.text.x = element_text(angle = 45, hjust = 1)) +``` + + +We can be relatively confident in the annotation of Hex2 AnhydroBridge1 Sulfate1 [M-H]- over Hex4 AnhydroBridge1 Sulfate2:[M-2H]-2, so we will assign the annotation as such. + + +```{r assign annotation} +#assign the singly charged annotation +pl_annotonly$annotations[pl_annotonly$pcgroup == pc] <- 'Hex2 Anhydro1 Sulfate1:[M-H]-' +``` + + +## Extract and plot chromatograms of annotated features + +It is often interesting to look at the chromatograms of the annotated features, to verify the peak processing and see how the retention times differ across annotations (and if that makes sense, e.g. are higher DP oligosaccharides eluting later than shorter DP. oligosaccharides), and if we have multiple peaks (i.e. isomers) for some annotations. We can use the `featureChromatograms` function from xcms to extract the chromatograms, and then extract the information from those objects. + + + +```{r extract feature chromatograms} +#get the definitions of all featuzres +featDef <- featureDefinitions(data_ms1_pks_mg_gp_fill) +#get in the indices of the annotated features +featIndices = which(featDef$mzmed %in% pl_annotonly$mz) +#extract the chromatograms of the annotated features, expand rt by 2 min +featChr <- featureChromatograms(data_ms1_pks_mg_gp_fill, + expandRt = 120, + features = featIndices) + +#reorder annotation table to be the same order as the chromatograms +pl_annotonly_reordered <- pl_annotonly[match(featDef$mzmed[featIndices], + pl_annotonly$mz),] + +#make empty df +featChr_df <- data.frame(sample_name = as.character(), + frag = as.character(), + annotations = as.character(), + mz = as.numeric(), + rt = as.numeric(), + rt_med = as.numeric(), + intensity = as.numeric()) + +#fill in df with chromatogram info +for(i in 1:length(featIndices)){ + mz = pl_annotonly_reordered$mz[i] + rt_med = pl_annotonly_reordered$rt[i] + annotations = pl_annotonly_reordered$annotations[i] + for(j in 1:nrow(pd)){ + chr <- featChr[i,j] + rt <- chr@rtime + intensity <- chr@intensity + tmp <- data.frame(sample_name = pd$sample_name[j], + frag = pd$frag[j], + annotations = annotations, + mz = mz, + rt = rt, + rt_med = rt_med, + intensity = intensity) + featChr_df <- rbind(featChr_df, tmp) + } +} + +#make annotations into a factor, ordered by rt +featChr_df$annotations <- factor(featChr_df$annotations, + levels = pl_annotonly_reordered %>% + arrange(rt) %>% + select(annotations) %>% + unlist() %>% as.vector() %>% + unique()) +``` + + +Chromatograms can then be plotted with ggplot.We can see that some features likely derive from in-source fragmentation e.g. some of the monosulphated hexose peaks could be from the sulphated kappa carrageenan oligosaccharides (e.g. 'Hex2 AnhydroBridge1 Sulfate1'). + +```{r plot feature chromatograms, fig.width=6, fig.height=10} +#get the definitions of all featuzres +featDef <- featureDefinitions(data_ms1_pks_mg_gp_fill) +#get in the indices of the annotated features +featIndices = which(featDef$mzmed %in% pl_annotonly$mz) +#extract the chromatograms of the annotated features, expand rt by 2 min +featChr <- featureChromatograms(data_ms1_pks_mg_gp_fill, + expandRt = 120, + features = featIndices) + +#reorder annotation table to be the same order as the chromatograms +pl_annotonly_reordered <- pl_annotonly[match(featDef$mzmed[featIndices], + pl_annotonly$mz),] + +#make empty df +featChr_df <- data.frame(sample_name = as.character(), + frag = as.character(), + annotations = as.character(), + mz = as.numeric(), + rt = as.numeric(), + rt_med = as.numeric(), + intensity = as.numeric()) + +#fill in df with chromatogram info +for(i in 1:length(featIndices)){ + mz = pl_annotonly_reordered$mz[i] + rt_med = pl_annotonly_reordered$rt[i] + annotations = pl_annotonly_reordered$annotations[i] + for(j in 1:nrow(pd)){ + chr <- featChr[i,j] + rt <- chr@rtime + intensity <- chr@intensity + tmp <- data.frame(sample_name = pd$sample_name[j], + frag = pd$frag[j], + annotations = annotations, + mz = mz, + rt = rt, + rt_med = rt_med, + intensity = intensity) + featChr_df <- rbind(featChr_df, tmp) + } +} + +#make annotations into a factor, ordered by rt +featChr_df$annotations <- factor(featChr_df$annotations, + levels = pl_annotonly_reordered %>% + arrange(rt) %>% + select(annotations) %>% + unlist() %>% as.vector() %>% + unique()) + +ggplot(featChr_df, + aes(x = rt, y = intensity, group = rt_med)) + + geom_line() + + facet_grid(cols = vars(sample_name), + rows = vars(annotations), scales = "free") + + labs(y = "Intensity (a.u.)", x = "Retention time (s)") + + scale_y_continuous(labels = scientific) + + theme_light()+ + theme(axis.title = element_text(colour = 'black', size = 10, + face = 'bold'), + axis.text = element_text(colour = 'black', size = 5), + strip.background = element_blank(), + strip.text.y = element_text(colour = 'black', size = 5, + angle = 0, hjust = 0), + strip.text.x = element_text(colour = 'black', size = 5)) + + +``` + + + + + +# Extract and annotate MS2 spectra associated with annotations +## Extract MS2 spectra + +We will use the `glycoMS2Extract` function from GlycoAnnotateR to extract the spectra. This function is basically a wrapper for XCMS functions for MS2 spectra extraction, with a layer to only extract those spectra associated with annotated features. + +The function *expects* that you have split your MS1 and MS2 data, like we did at the start. However, it should still work even if you didn't split your MS1 and MS2 data - in this case, you can simply provide the same data object for `data_ms2` and `data_features`. + +```{r extract ms2 spectra} +ms2 <- glycoMS2Extract( + #object with MS1 and MS2 data + data_ms2 = data_ms2, + #processed MS1 data object (must have features) + data_features = data_ms1_pks_mg_gp_fill, + #annotated feature table + #must contain an 'mz' and 'rt' column for the features! + annotations = pl_annotonly) +``` + + +We can see that the output is an `MSpectra` object which contains 248 MS2 spectra. + +```{r inspect ms2 spectra} +class(ms2) +length(ms2@listData) +``` + + +## Process MS2 spectra + +We will use the functions from `MSnbase` to clean, average and normalise the MS2 spectra. + +### Clean MS2 spectra + +By 'cleaning', we remove zero intensity masses. + +```{r clean MS2} +#remove zero intensity masses +ms2@listData <- lapply(ms2, clean, all = TRUE) + +``` + + +### Average MS2 spectra + +We average all MS2 spectra that correspond to the same (chromatographic) peak in the same sample (i.e. file). There are a number of different functions which can be used to combine the spectra + +```{r average MS2} +#combine spectra by peak within each file +ms2_avg <- MSnbase::combineSpectra(ms2, + fcol = 'peak_id', + method = meanMzInts, + mzd = 0.001) + +``` + +The number of spectra per file is reduced to 14-15, from 119-129. + +```{r number spectra after avg, collapse=TRUE} +#print the number of spectra per file before and after averaging +table(fromFile(ms2)) +table(fromFile(ms2_avg)) + +``` + + +### Normalise intensities + +Intensities are normalised with respect to the maximum intensity. + +```{r normalise MS2} +#normalise with respect to ion with highest intensity +ms2_avg@listData <- lapply(ms2_avg, + normalise, + method = "max") + +``` + +### Convert to dataframe + +Now we will convert the data into a dataframe format so its easier to work with. + + +```{r convert spectra to df} +#build empty df +ms2_df <- data.frame(sample.name = as.character(), + sample.number = as.numeric(), + precursorMz = as.numeric(), + rt = as.numeric(), + mz = as.numeric(), + intensity = as.numeric()) + +for (i in 1:length(ms2_avg)){ + mz = sprintf("%.4f",ms2_avg[[i]]@mz) + intensity = ms2_avg[[i]]@intensity * 100 + rt = rep(sprintf("%.4f", ms2_avg[[i]]@rt / 60), + length(mz)) + precursorMz = rep(sprintf("%.4f",ms2_avg[[i]]@precursorMz), + length(mz)) + sample.number = rep(fromFile(ms2_avg[[i]]), + length(mz)) + sample.name = rep(data_ms1_pks_mg_gp_fill$sample_name[fromFile(ms2_avg[[i]])], + length(mz)) + temp <- data.frame(sample.number = sample.number, + sample.name = sample.name, + precursorMz = precursorMz, + rt = rt, + mz = mz, + intensity = intensity) + ms2_df <- rbind(temp, ms2_df) +} + +#make sure mz columns are numeric +ms2_df$mz <- as.numeric(ms2_df$mz) +ms2_df$precursorMz <- as.numeric(ms2_df$precursorMz) + +``` + + + +## Annotate MS2 spectra + +### Annotate precursors + +We will begin by annotating the precursors. + +```{r annotate precursors, warning=FALSE, message=F} +ms2_df_annotP <- glycoAnnotate(ms2_df, + error = 4, + param = gpp, #param object + mz_column = 'precursorMz', #name of mz column + collapse = T, #we would like to collapse annotations... + collapse_columns = c('IUPAC name', 'ion', 'dp')) #...on these columns + +``` + +All precursors should be assigned an annotation (as we only extracted MS2 spectra for annotated precursors). + +```{r check precursor annotations} +#check all precursors annotated +any(is.na(ms2_df_annotP$annotations)) +``` + + +### Annotate fragments +Now we can annotate the MS2 spectra. We change the `glycoPredictParam` parameters depending on the precursor annotation. Only fragments resulting from glycosidic bond breakage, dehydration, or loss of modified groups (e.g. desulphation) can be annotated by GlycoAnnotateR. + +```{r annotate fragment ions, message=FALSE, warning=F} +#make a vector of the precursor annotations +annotP <- ms2_df_annotP$annotations %>% unique() + + +#make an empty list +ms2_annot_list <- list() + +#fill in list with dfs for annotated spectra, with one entry per annotation +for(i in 1:length(annotP)){ + #get annotation + annot = annotP[i] + + #get dp from annotation - we collapsed on name, ion and dp, so dp is at the + #end of the annotation string + if (grepl(',', annot)){ + #if multiple annotations, get maximum dp + annots <- str_split_1(annot, ',') + dps <- sub('.*:(\\d)', '\\1', annots) %>% as.numeric() + max_dp = max(dps) + } else { + #otherwise just get the dp + max_dp = sub('.*:(\\d)', '\\1', annot) %>% as.numeric() + } + + #get modifications from annotations + #split by colon or space + annot_split <- str_split_1(annot, pattern = ' |:') + #remove elements with the ion + annot_split <- annot_split[!grepl('\\[', annot_split)] %>% + #number elements + gsub('\\d', '', .) %>% + #make unique + unique() + #remove hexose (always included) + #only keep elements with word characters + annot_split <- annot_split[annot_split != 'Hex' & + grepl('\\w',annot_split)] + #if no modifications, set to 'none' + if(length(annot_split) == 0){ + annot_split <- 'none' + } + + #make sure everything is lower case or formatted correctly + annot_split_lc <- str_to_lower(annot_split) %>% + sub('-', '', .) %>% + sub('fate', 'phate', .) + + #get pentose option + if(any(annot_split == 'pen')){ + pent_option = T + } else { + pent_option = F + } + + #create parameter object + gpp <- glycoPredictParam(dp = c(1, max_dp), + pent_option = pent_option, + modifications = annot_split_lc, + polarity = 'neg', + scan_range = c(50, 1000), #set the scan range wide + ion_type = 'ESI') + + #make df for annotation + df <- ms2_df_annotP %>% + #filter for precursor annotation + filter(annotations == !!annot) %>% + #rename precursor annotation column + dplyr::rename(precursorAnnotations = annotations) + + + #annotate fragments + df_annot <- glycoAnnotate(df, param = gpp, collapse = T, error = 4) + + #add to list + ms2_annot_list[[i]] <- df_annot + +} + +#bind dfs in list into one df +ms2_df_annotF <- ms2_annot_list %>% + bind_rows() + +``` + + +Annotated spectra can be plotted with ggplot. We can see that the fragmentation energy was too low for many of the ions to generate a good fragmentation pattern, except for the kappa-carrageenan oligosaccharides (e.g. Hex4 AnhydroBridge2 Sulfate2 [M-2H]-2, precursor *m/z* 394.0496. + +```{r plot annotated ms2, fig.width=6, fig.height=5, fig.show='hold', warning=FALSE} +#plot spectra with annotated fragments +#you can play around with how you plot this, many ways +precursorMzs <- ms2_df_annotF$precursorMz %>% unique() +for(i in 1:length(precursorMzs)){ + pmz <- precursorMzs[i] + df <- ms2_df_annotF %>% + #filter for precursor Mz + filter(precursorMz == !!pmz) %>% + #filter for 1% relative abundance + filter(intensity > 1) + + + + p <- ggplot(df) + + #plot lines + geom_segment(aes(x = mz, xend = mz, y = 0, yend= intensity)) + + #add mz values for annotated ions + geom_text(data = df %>% drop_na(annotations), + mapping = aes(x = mz, y = intensity + 0.5, + label = round(mz, 4))) + + #add annotations + geom_text_repel(aes(label = annotations, x = mz, y = intensity), + size = 2, hjust = 0.5, vjust = 0.5, + colour = '#f49e4c', segment.linetype = 'dashed') + + #set labels + labs(x = expression(italic("m/z")), y = "Normalised intensity (a.u.)", + title = df$precursorAnnotations %>% unique())+ + theme_light()+ + theme(axis.title = element_text(colour = 'black', size = 8, family = 'Arial', + face = 'bold'), + axis.text = element_text(colour = 'black', size = 6, family = 'Arial'), + plot.title =element_text(colour = 'black', size = 8, family = 'Arial', + face = 'bold')) + print(p) +} + +``` + + + + diff --git a/vignettes/matching_types.png b/vignettes/matching_types.png new file mode 100644 index 0000000..f54ec54 Binary files /dev/null and b/vignettes/matching_types.png differ