Skip to content

Commit

Permalink
Fix VCF processing (#8)
Browse files Browse the repository at this point in the history
* fix VCF processing

* Roxygenize

* trigger ci

---------

Co-authored-by: christopher-mohr <christopher-mohr@users.noreply.github.com>
Co-authored-by: Gregor Sturm <gregor.sturm@boehringer-ingelheim.com>
  • Loading branch information
3 people authored Jul 23, 2024
1 parent 516135f commit 1d2ef60
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 20 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Authors@R:
Description: This package provides convenience functions for reading real-world evidence data provided by Personalis into Bioconductor MultiAssayExperiment objects.
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Imports:
dplyr,
SummarizedExperiment,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ importFrom(rvest,html_nodes)
importFrom(rvest,html_table)
importFrom(rvest,html_text)
importFrom(rvest,read_html)
importFrom(stringr,str_replace)
importFrom(stringr,str_split_i)
importFrom(stringr,str_to_title)
importFrom(tibble,as_tibble)
Expand Down
33 changes: 20 additions & 13 deletions R/personalis.R
Original file line number Diff line number Diff line change
Expand Up @@ -331,11 +331,15 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo
html_section <- if_else(sample_type == "somatic", "#concordance", sprintf("#%s_%s", str_to_title(sample_type), modality))
table_number <- if_else(sample_type == "somatic", 1, 2)
columns_to_fix <- if (sample_type == "somatic") c() else c("SNVs", "Indels", "Total")

tables <- read_html(html_file) |>
html_elements(html_section) |>
html_elements("table") |>
html_table(na.strings = "N/A")

if (!length(tables)) {
return(tibble())
} else {
tes <- tables[table_number] |>
lapply(function(df) {
colnames(df) <- make.names(colnames(df))
Expand All @@ -350,6 +354,7 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo
select(sample, var_name, value) |>
pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |>
mutate(across(contains("Number"), fix_thousands_separator))
}
}

#' Read in (unfiltered) small variant data from VCF files from personalis folders
Expand Down Expand Up @@ -379,15 +384,16 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
modality = modality,
sample_type = sample_type
)

if (!length(variant_list)) {
return(NULL)
}

col_data <- map(variant_list, "summary_stats") |>
bind_rows() |>
tibble::column_to_rownames("sample")

col_data <- bind_rows(map(variant_list, "summary_stats"))
if (nrow(col_data)) {
col_data <- col_data |>
tibble::column_to_rownames("sample")
}

all_variants <- map(variant_list, "vcf_data") |> bind_rows()
row_data <- all_variants |>
select(
Expand Down Expand Up @@ -440,19 +446,20 @@ read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_typ
# even though they are all in the K13T folder.
tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name)

# tumor DNA VCF files are gzipped, not totally clear if this rule applies to all cases
file_ending <- if_else(sample_type == "tumor" & modality == "DNA", "vcf.gz", "vcf")

variant_table <- parse_vcf_to_df(
file.path(
sample_folder,
sprintf("%s_Pipeline", modality),
"Variants",
sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), file_ending)
sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), "vcf.gz")
)
) |>
mutate(sample = sample_name) |>
mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT))
)

if (nrow(variant_table)) {
variant_table <- variant_table |>
mutate(sample = sample_name) |>
mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT))
}

variant_table
}
Expand Down
18 changes: 12 additions & 6 deletions R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ catch_file_not_found <- function(path, callback, ...) {
if (
# List of erorr messages to catch
grepl("`path` does not exist: .*", conditionMessage(e)) || # from read_excel
grepl("'.*?' does not exist.", conditionMessage(e)) # from read_html
grepl("'.*?' does not exist.", conditionMessage(e)) || # from read_html
grepl("cannot open the connection", conditionMessage(e)) # from read.vcfR
) {
return(NULL)
} else {
Expand Down Expand Up @@ -122,22 +123,27 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") {
#' @return {tibble} new data frame with all variants (fixed field and genotype information)
#' @importFrom dplyr mutate left_join
#' @importFrom vcfR read.vcfR vcfR2tidy
#' @importFrom stringr str_split_i
#' @importFrom stringr str_split_i str_replace
#' @importFrom tibble as_tibble
parse_vcf_to_df <- function(path) {
# parse VCF file
vcf_content <- read.vcfR(path)

vcf_content <- tryCatch({
read.vcfR(path)
}, error = function(e) {
read.vcfR(str_replace(path, "vcf.gz", "vcf"))
}
)

# fixed field content to data frame
fixed_df <- vcfR2tidy(vcf_content)$fix

# GT content to data frame
gt_df <- vcfR2tidy(vcf_content)$gt

# create addition column with observed nucleotides in order to avoid collisions when we do the left_join
gt_df <- gt_df |>
dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2))

# next use ChromKey, POS and ALT for joining vcf content data frames
joined_vcf_df <- fixed_df |>
dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT"))
Expand Down

0 comments on commit 1d2ef60

Please sign in to comment.