From 1d2ef60c84f6ef6275296e19dca2238d44de1ecb Mon Sep 17 00:00:00 2001 From: Christopher Mohr Date: Tue, 23 Jul 2024 16:32:29 +0200 Subject: [PATCH] Fix VCF processing (#8) * fix VCF processing * Roxygenize * trigger ci --------- Co-authored-by: christopher-mohr Co-authored-by: Gregor Sturm --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/personalis.R | 33 ++++++++++++++++++++------------- R/util.R | 18 ++++++++++++------ 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index cedacbb..e482659 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -6,7 +6,7 @@ Authors@R: Description: This package provides convenience functions for reading real-world evidence data provided by Personalis into Bioconductor MultiAssayExperiment objects. Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Imports: dplyr, SummarizedExperiment, diff --git a/NAMESPACE b/NAMESPACE index 72157b1..a74587d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,6 +31,7 @@ importFrom(rvest,html_nodes) importFrom(rvest,html_table) importFrom(rvest,html_text) importFrom(rvest,read_html) +importFrom(stringr,str_replace) importFrom(stringr,str_split_i) importFrom(stringr,str_to_title) importFrom(tibble,as_tibble) diff --git a/R/personalis.R b/R/personalis.R index eb6780b..ace379e 100644 --- a/R/personalis.R +++ b/R/personalis.R @@ -331,11 +331,15 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo html_section <- if_else(sample_type == "somatic", "#concordance", sprintf("#%s_%s", str_to_title(sample_type), modality)) table_number <- if_else(sample_type == "somatic", 1, 2) columns_to_fix <- if (sample_type == "somatic") c() else c("SNVs", "Indels", "Total") - + tables <- read_html(html_file) |> html_elements(html_section) |> html_elements("table") |> html_table(na.strings = "N/A") + + if (!length(tables)) { + return(tibble()) + } else { tes <- tables[table_number] |> lapply(function(df) { colnames(df) <- make.names(colnames(df)) @@ -350,6 +354,7 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo select(sample, var_name, value) |> pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |> mutate(across(contains("Number"), fix_thousands_separator)) + } } #' Read in (unfiltered) small variant data from VCF files from personalis folders @@ -379,15 +384,16 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) { modality = modality, sample_type = sample_type ) - if (!length(variant_list)) { return(NULL) } - col_data <- map(variant_list, "summary_stats") |> - bind_rows() |> - tibble::column_to_rownames("sample") - + col_data <- bind_rows(map(variant_list, "summary_stats")) + if (nrow(col_data)) { + col_data <- col_data |> + tibble::column_to_rownames("sample") + } + all_variants <- map(variant_list, "vcf_data") |> bind_rows() row_data <- all_variants |> select( @@ -440,19 +446,20 @@ read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_typ # even though they are all in the K13T folder. tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name) - # tumor DNA VCF files are gzipped, not totally clear if this rule applies to all cases - file_ending <- if_else(sample_type == "tumor" & modality == "DNA", "vcf.gz", "vcf") - variant_table <- parse_vcf_to_df( file.path( sample_folder, sprintf("%s_Pipeline", modality), "Variants", - sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), file_ending) + sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), "vcf.gz") ) - ) |> - mutate(sample = sample_name) |> - mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT)) + ) + + if (nrow(variant_table)) { + variant_table <- variant_table |> + mutate(sample = sample_name) |> + mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT)) + } variant_table } diff --git a/R/util.R b/R/util.R index 292492e..662e1f6 100644 --- a/R/util.R +++ b/R/util.R @@ -27,7 +27,8 @@ catch_file_not_found <- function(path, callback, ...) { if ( # List of erorr messages to catch grepl("`path` does not exist: .*", conditionMessage(e)) || # from read_excel - grepl("'.*?' does not exist.", conditionMessage(e)) # from read_html + grepl("'.*?' does not exist.", conditionMessage(e)) || # from read_html + grepl("cannot open the connection", conditionMessage(e)) # from read.vcfR ) { return(NULL) } else { @@ -122,22 +123,27 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") { #' @return {tibble} new data frame with all variants (fixed field and genotype information) #' @importFrom dplyr mutate left_join #' @importFrom vcfR read.vcfR vcfR2tidy -#' @importFrom stringr str_split_i +#' @importFrom stringr str_split_i str_replace #' @importFrom tibble as_tibble parse_vcf_to_df <- function(path) { # parse VCF file - vcf_content <- read.vcfR(path) - + vcf_content <- tryCatch({ + read.vcfR(path) + }, error = function(e) { + read.vcfR(str_replace(path, "vcf.gz", "vcf")) + } + ) + # fixed field content to data frame fixed_df <- vcfR2tidy(vcf_content)$fix # GT content to data frame gt_df <- vcfR2tidy(vcf_content)$gt - + # create addition column with observed nucleotides in order to avoid collisions when we do the left_join gt_df <- gt_df |> dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2)) - + # next use ChromKey, POS and ALT for joining vcf content data frames joined_vcf_df <- fixed_df |> dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT"))