Skip to content

Commit

Permalink
try to fix error with collapsing and duplicate names
Browse files Browse the repository at this point in the history
  • Loading branch information
margotbligh committed Feb 1, 2024
1 parent a8a0283 commit 0953022
Showing 1 changed file with 59 additions and 61 deletions.
120 changes: 59 additions & 61 deletions R/glycoAnnotate.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
#'
#' @description \code{glycoAnnotate()} annotates peaks or features in MS data,
#' using either a pre-generated table by \link[GlycoAnnotateR]{glycoPredict} or
#' by generating a new table.
#' by generating a new table.
#'
#' @export
#'
#'
#' @slot data Dataframe containing data to be annotated. For example,
#' feature dataframe from XCMS pre-processing (LC-MS or direct inject)
#' feature dataframe from XCMS pre-processing (LC-MS or direct inject)
#' or features from Cardinal (MALDI).
#' @slot mz_column Name of column containing m/z values.
#' @slot mzmin_column OPTIONAL: Name of column containing minimum m/z data values.
Expand All @@ -21,38 +21,38 @@
#' If supplied, will do overlap-overlap matching. Generally only supplied if mzmin and mzmax
#' values were generated during peak picking. If not provided, the mz value will be annotated
#' if it is within range of theoretical mz +- error.
#' @slot pred_table Table generated previously by \link[GlycoAnnotateR]{glycoPredict}.
#' @slot pred_table Table generated previously by \link[GlycoAnnotateR]{glycoPredict}.
#' MUST BE LONG FORMAT - select \code{format='long'} when running prediction.
#' @slot param \link[GlycoAnnotateR]{glycoPredictParam} object for generation of table
#' of theoretical mz values for annotation.
#' @slot collapse Logical. If \code{TRUE}, annotations will be 'collapsed' so that multiple
#' annotations for one mz will be in the same row, comma separated (nrow of output is in
#' this case equal to nrow of input data). If \code{FALSE} (default), it is possible
#' that rows in the input dataframe are repeated with different annotations. The
#' information on annotations is more detailed in this case. Collapsing can also be done
#' information on annotations is more detailed in this case. Collapsing can also be done
#' afterwards on the output using \link[GlycoAnnotateR]{glycoAnnotationsCollapse}.
#' @slot collapse_columns Columns to be pasted together before collapsing.
#' Only needed if \code{collapse=TRUE} and non-default columns wanted - default is
#' molecule name and ion. If prediction table provided to \code{pred_table} instead of
#' \code{param}, column names are required.
#' molecule name and ion. If prediction table provided to \code{pred_table} instead of
#' \code{param}, column names are required.
#' @slot error Numeric value - error used to create window for matching. mz values
#' will be matched against theoretical mzs +- error.
#' @slot error_units Units for error - can be 'ppm' or 'Da'
#'
#'
#' @examples
#'
#'
#' #with prediction parameters
#' gpp <- glycoPredictParam(dp = c(1, 8), modifications = "deoxy", polarity = "pos", naming = "IUPAC")
#' annotated_data <- glycoAnnotate(data = data, param = gpp, error = 1.5, error_units = 'ppm', collapse = TRUE)
#'
#'
#' #with prediction table
#' gpp <- glycoPredictParam(dp = c(1, 8), modifications = "deoxy", polarity = "pos", naming = "IUPAC")
#' pred_table <- glycoPredict(param = gpp)
#' annotated_data <- glycoAnnotate(data = data, pred_table = pred_table, error = 1.5, error_units = 'ppm', collapse = TRUE, collapse_columns = c("IUPAC name", "ion"))
#'
#'
#' @seealso GlycoAnnotateR::glycoPredict()
#' @seealso GlycoAnnotateR::glycoPredictParam()
#'
#'

glycoAnnotate <- function(data,
mz_column = 'mz',
Expand Down Expand Up @@ -85,11 +85,11 @@ glycoAnnotate <- function(data,
}
}
if (!is.null(pred_table) & !is.null(param)){
stop("pred_table and param supplied.",
stop("pred_table and param supplied.",
" please provide ONE type of input for annotation")
}
if (is.null(pred_table) & is.null(param)){
stop("no glycoPredictParam supplied to 'param' AND no prediction table",
stop("no glycoPredictParam supplied to 'param' AND no prediction table",
" supplied to 'pred_table'. please provide one type of input for annotation")
}
if (!is.null(pred_table)){
Expand Down Expand Up @@ -121,7 +121,7 @@ glycoAnnotate <- function(data,
if(!all(collapse_columns %in% names(pred_table))){
stop("collapse_columns are not column names in pred_table!")
}
}
}
if (!is.null(collapse_columns) & is.null(pred_table)){
message("warning: collapse_columns provided but no pred_table...",
"these must correspond to columns in the table newly generated",
Expand All @@ -133,48 +133,48 @@ glycoAnnotate <- function(data,
" before collapsing (and collapsed) - for example the annotation name",
" and ion column names")
}

if(!is.null(collapse_columns) & isFALSE(collapse)){
message('collapse_columns provided but collapse is FALSE, no collapse',
'will be performed')
}

if(!is.null(param)){
if(param@format != "long"){
message('change "format" to long in param!')}
}

#run glycoPredict
if (!is.null(param)){
message("Starting glycoPredict to generate possible annotations")
pred_table <- GlycoAnnotateR::glycoPredict(param = param)

if(isTRUE(collapse)){
if(!is.null(collapse_columns)){
if(!all(collapse_columns %in% names(pred_table))){
stop("collapse_columns are not columns in the generated prediction table.",
"either remove collapse_columns or ensure they match columns!")}

}
}
}

#generate mzmin and mzmax columns in pred_table
if(error_units == 'ppm'){
ppm_to_mz = function(mz, noise){
ppm = mz / 1000000 * noise
return(ppm)
}
pred_table <- pred_table %>%
pred_table <- pred_table %>%
dplyr::mutate(mzmin = mz - ppm_to_mz(mz, error),
mzmax = mz + ppm_to_mz(mz, error))
}
if(error_units == 'Da'){
pred_table <- pred_table %>%
pred_table <- pred_table %>%
dplyr::mutate(mzmin = mz - error,
mzmax = mz + error)
}

#run annotation
message("Starting annotation with predictions against data")
if(!is.null(mzmin_column) & !is.null(mzmax_column)){
Expand All @@ -187,26 +187,26 @@ glycoAnnotate <- function(data,
data.table::setDT(data)
data.table::setDT(pred_table)
data.table::setkey(pred_table, mzmin, mzmax)

data_annot <- data.table::foverlaps(data, pred_table)
}
if(is.null(mzmin_column) & is.null(mzmax_column)){
data <- data %>%
data <- data %>%
dplyr::mutate(mzmin = get(mz_column),
mzmax = get(mz_column))

data.table::setDT(data)
data.table::setDT(pred_table)
data.table::setkey(pred_table, mzmin, mzmax)

data_annot <- data.table::foverlaps(data, pred_table)
}

#collapse annotations
data.table::setDF(data_annot)
if(isTRUE(collapse) & nrow(data_annot) > nrow(data)){
message("Collapsing annotations")

#add annotation column that is pasted together for collapsing
if (is.null(collapse_columns)){
if(length(param@naming) == 1){
Expand All @@ -216,59 +216,57 @@ glycoAnnotate <- function(data,
collapse_columns = c(paste(param@naming[1], "name"), "ion")
}
}
data_annot <- data_annot %>%
dplyr::mutate(annotations = paste0(apply(data_annot[collapse_columns], 1,

data_annot <- data_annot %>%
dplyr::mutate(annotations = paste0(apply(data_annot[collapse_columns], 1,
paste, collapse=':')))
group_column_names <- setdiff(names(data_annot), names(pred_table))
group_column_names <- group_column_names[group_column_names != "annotations"]
data_annot <- data_annot %>%
dplyr::group_by(across(all_of(group_column_names))) %>%
dplyr::summarise(annotations = toString(annotations)) %>%
dplyr::ungroup() %>%
dplyr::distinct(across(all_of(c(group_column_names, "annotations")))) %>%
dplyr::distinct(across(all_of(c(group_column_names, "annotations")))) %>%
dplyr::mutate(annotations = sub('NA:NA', NA, annotations))

}

#format final df
if(isFALSE(collapse)){
data_annot <- data_annot %>%
data_annot <- data_annot %>%
dplyr::select(!c('mzmin', 'mzmax'))
}
if('mz' %in% names(pred_table) & 'mz' %in% names(data)){
if(isFALSE(collapse)){
data_annot <- data_annot %>%
data_annot <- data_annot %>%
dplyr::rename(mz_pred = mz)
}
data_annot <- data_annot %>%
dplyr::rename(mz = `i.mz`)
}
if(!is.null(mzmin_column)){
if ("i.mzmin" %in% names(data_annot)){
names(data_annot)[names(data_annot) == "i.mzmin"] <- mzmin_column
}
}
}
if(!is.null(mzmax_column)){
if("i.mzmax" %in% names(data_annot)){
names(data_annot)[names(data_annot) == "i.mzmax"] <- mzmax_column

}
}

if(is.null(mzmin_column)){
if ("i.mzmin" %in% names(data_annot)){
data_annot <- data_annot %>%
data_annot <- data_annot %>%
dplyr::select(!'i.mzmin')
}
}
}
if(is.null(mzmax_column)){
if("i.mzmax" %in% names(data_annot)){
data_annot <- data_annot %>%
data_annot <- data_annot %>%
dplyr::select(!'i.mzmax')
}
}

return(data_annot)
}

Expand All @@ -280,26 +278,26 @@ glycoAnnotate <- function(data,
#' @description \code{glycoAnnotationsCollapse()} collapses the output of
#' \link[GlycoAnnotateR]{glycoAnnotate} in the case of multiple annotations
#' per peak or feature so that there is one row per peak/feature with
#' multiple annotations comma-separated.
#' multiple annotations comma-separated.
#'
#' @export
#'
#' @slot annotated_data Dataframe annotated by \link[GlycoAnnotateR]{glycoAnnotate}
#'
#' @slot annotated_data Dataframe annotated by \link[GlycoAnnotateR]{glycoAnnotate}
#' that has NOT been collapsed and has multiple annotations per peak/feature.
#' @slot collapse_columns Names of columns to be pasted together before collapsing.
#' Suggested is molecule name and ion.
#' Suggested is molecule name and ion.
#' @slot noncollapse_columns Names of columns that uniquely identify peaks and
#' that should be retained after collapsing - these are generally the column
#' names of your input dataframe before annotation.
#'
#'
#' @examples
#' #annotate dataframe
#' gpp <- glycoPredictParam(dp = c(1, 8), modifications = "deoxy", polarity = "pos", naming = "IUPAC")
#' annotated_data <- glycoAnnotate(data = data, param = gpp, error = 1.5, error_units = 'ppm', collapse = FALSE)
#'
#'
#' #collapse multiple annotations
#' annotated_data_collapsed <- glycoAnnotationsCollapse(annotated_data = annotated_data, collapse_columns = c('IUPAC name', 'ion'), noncollapse_columns = c('mz', 'rt', 'sampleA', 'sampleB'))
#'
#'
#' @seealso GlycoAnnotateR::glycoPredict()
#' @seealso GlycoAnnotateR::glycoPredictParam()
#' @seealso GlycoAnnotateR::glycoAnnotate()
Expand All @@ -315,8 +313,8 @@ glycoAnnotationsCollapse <- function(annotated_data,
if(!all(collapse_columns %in% names(annotated_data))){
stop("collapse_columns are not column names in annotated_data!")
}
nrow_distinct = dplyr::distinct(annotated_data,
dplyr::across(dplyr::all_of(noncollapse_columns))) %>%
nrow_distinct = dplyr::distinct(annotated_data,
dplyr::across(dplyr::all_of(noncollapse_columns))) %>%
nrow()
nrow = nrow(annotated_data)
if(nrow_distinct == nrow){
Expand All @@ -326,19 +324,19 @@ glycoAnnotationsCollapse <- function(annotated_data,
#collapse annotations
data.table::setDF(annotated_data)
message("Collapsing annotations")
annotated_data_collapsed <- annotated_data %>%
dplyr::mutate(annotations = paste0(apply(annotated_data[collapse_columns], 1,
paste, collapse=':'))) %>%

annotated_data_collapsed <- annotated_data %>%
dplyr::mutate(annotations = paste0(apply(annotated_data[collapse_columns], 1,
paste, collapse=':'))) %>%
dplyr::group_by(across(all_of(noncollapse_columns))) %>%
dplyr::summarise(annotations = toString(annotations)) %>%
dplyr::ungroup() %>%
dplyr::distinct(across(all_of(c(noncollapse_columns, "annotations"))))

return(annotated_data_collapsed)

}




Expand Down

0 comments on commit 0953022

Please sign in to comment.