From 1d2ef60c84f6ef6275296e19dca2238d44de1ecb Mon Sep 17 00:00:00 2001
From: Christopher Mohr <christopher.mohr@boehringer-ingelheim.com>
Date: Tue, 23 Jul 2024 16:32:29 +0200
Subject: [PATCH] Fix VCF processing (#8)

* fix VCF processing

* Roxygenize

* trigger ci

---------

Co-authored-by: christopher-mohr <christopher-mohr@users.noreply.github.com>
Co-authored-by: Gregor Sturm <gregor.sturm@boehringer-ingelheim.com>
---
 DESCRIPTION    |  2 +-
 NAMESPACE      |  1 +
 R/personalis.R | 33 ++++++++++++++++++++-------------
 R/util.R       | 18 ++++++++++++------
 4 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index cedacbb..e482659 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -6,7 +6,7 @@ Authors@R:
 Description: This package provides convenience functions for reading real-world evidence data provided by Personalis into Bioconductor MultiAssayExperiment objects.
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Imports:
     dplyr,
     SummarizedExperiment,
diff --git a/NAMESPACE b/NAMESPACE
index 72157b1..a74587d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -31,6 +31,7 @@ importFrom(rvest,html_nodes)
 importFrom(rvest,html_table)
 importFrom(rvest,html_text)
 importFrom(rvest,read_html)
+importFrom(stringr,str_replace)
 importFrom(stringr,str_split_i)
 importFrom(stringr,str_to_title)
 importFrom(tibble,as_tibble)
diff --git a/R/personalis.R b/R/personalis.R
index eb6780b..ace379e 100644
--- a/R/personalis.R
+++ b/R/personalis.R
@@ -331,11 +331,15 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo
   html_section <- if_else(sample_type == "somatic", "#concordance", sprintf("#%s_%s", str_to_title(sample_type), modality))
   table_number <- if_else(sample_type == "somatic", 1, 2)
   columns_to_fix <- if (sample_type == "somatic") c() else c("SNVs", "Indels", "Total")
-
+  
   tables <- read_html(html_file) |>
     html_elements(html_section) |>
     html_elements("table") |>
     html_table(na.strings = "N/A")
+  
+  if (!length(tables)) {
+    return(tibble())
+  } else {
   tes <- tables[table_number] |>
     lapply(function(df) {
       colnames(df) <- make.names(colnames(df))
@@ -350,6 +354,7 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo
     select(sample, var_name, value) |>
     pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |>
     mutate(across(contains("Number"), fix_thousands_separator))
+  }
 }
 
 #' Read in (unfiltered) small variant data from VCF files from personalis folders
@@ -379,15 +384,16 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
     modality = modality,
     sample_type = sample_type
   )
-
   if (!length(variant_list)) {
     return(NULL)
   }
 
-  col_data <- map(variant_list, "summary_stats") |>
-    bind_rows() |>
-    tibble::column_to_rownames("sample")
-
+  col_data <- bind_rows(map(variant_list, "summary_stats"))
+  if (nrow(col_data)) {
+    col_data <- col_data |>
+      tibble::column_to_rownames("sample")
+  }
+  
   all_variants <- map(variant_list, "vcf_data") |> bind_rows()
   row_data <- all_variants |>
     select(
@@ -440,19 +446,20 @@ read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_typ
   # even though they are all in the K13T folder.
   tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name)
 
-  # tumor DNA VCF files are gzipped, not totally clear if this rule applies to all cases
-  file_ending <- if_else(sample_type == "tumor" & modality == "DNA", "vcf.gz", "vcf")
-
   variant_table <- parse_vcf_to_df(
     file.path(
       sample_folder,
       sprintf("%s_Pipeline", modality),
       "Variants",
-      sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), file_ending)
+      sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), "vcf.gz")
     )
-  ) |>
-    mutate(sample = sample_name) |>
-    mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT))
+  )
+  
+  if (nrow(variant_table)) {
+    variant_table <- variant_table |> 
+      mutate(sample = sample_name) |>
+      mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT))
+  }
 
   variant_table
 }
diff --git a/R/util.R b/R/util.R
index 292492e..662e1f6 100644
--- a/R/util.R
+++ b/R/util.R
@@ -27,7 +27,8 @@ catch_file_not_found <- function(path, callback, ...) {
       if (
         # List of erorr messages to catch
         grepl("`path` does not exist: .*", conditionMessage(e)) || # from read_excel
-          grepl("'.*?' does not exist.", conditionMessage(e)) # from read_html
+          grepl("'.*?' does not exist.", conditionMessage(e)) || # from read_html
+          grepl("cannot open the connection", conditionMessage(e)) # from read.vcfR
       ) {
         return(NULL)
       } else {
@@ -122,22 +123,27 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") {
 #' @return {tibble} new data frame with all variants (fixed field and genotype information)
 #' @importFrom dplyr mutate left_join
 #' @importFrom vcfR read.vcfR vcfR2tidy
-#' @importFrom stringr str_split_i
+#' @importFrom stringr str_split_i str_replace
 #' @importFrom tibble as_tibble
 parse_vcf_to_df <- function(path) {
   # parse VCF file
-  vcf_content <- read.vcfR(path)
-
+  vcf_content <- tryCatch({
+    read.vcfR(path) 
+  }, error = function(e) {
+      read.vcfR(str_replace(path, "vcf.gz", "vcf"))
+    }
+  )
+  
   # fixed field content to data frame
   fixed_df <- vcfR2tidy(vcf_content)$fix
 
   # GT content to data frame
   gt_df <- vcfR2tidy(vcf_content)$gt
-
+  
   # create addition column with observed nucleotides in order to avoid collisions when we do the left_join
   gt_df <- gt_df |>
     dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2))
-
+  
   # next use ChromKey, POS and ALT for joining vcf content data frames
   joined_vcf_df <- fixed_df |>
     dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT"))