Skip to content

Commit

Permalink
Add functionality to parse variants from VCF files (#6)
Browse files Browse the repository at this point in the history
Co-authored-by: Christopher Mohr <contact.cmohr@gmail.com>
  • Loading branch information
grst and christopher-mohr authored Mar 15, 2024
1 parent 980e27b commit 8f73ffa
Show file tree
Hide file tree
Showing 11 changed files with 371 additions and 30 deletions.
10 changes: 6 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
Package: PersonalisIO
Title: Read Personalis data into MultiAssayExperiment objects
Version: 0.2.0.9000
Version: 0.3.0.9000
Authors@R:
person("Gregor", "Sturm", , "gregor.sturm@boehringer-ingelheim.com", role = c("aut", "cre"))
Description: This package provides convenience functions for reading real-world evidence data provided by Personalis into Bioconductor MultiAssayExperiment objects.
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
Depends:
Imports:
dplyr,
SummarizedExperiment,
readxl,
MultiAssayExperiment,
tibble,
pbapply,
tidyr,
purrr,
dplyr,
BumpyMatrix,
rvest
rvest,
stringr,
vcfR
Suggests:
knitr,
rmarkdown,
Expand Down
11 changes: 11 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,21 @@ export(read_personalis_hla_reports)
export(read_personalis_msi_reports)
export(read_personalis_small_variant_reports)
export(read_personalis_tcr_reports)
export(read_personalis_vcf_files)
importFrom(BumpyMatrix,splitAsBumpyMatrix)
importFrom(MultiAssayExperiment,MultiAssayExperiment)
importFrom(SummarizedExperiment,SummarizedExperiment)
importFrom(dplyr,across)
importFrom(dplyr,all_of)
importFrom(dplyr,any_of)
importFrom(dplyr,bind_rows)
importFrom(dplyr,contains)
importFrom(dplyr,cur_column)
importFrom(dplyr,distinct)
importFrom(dplyr,if_else)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,rename_with)
importFrom(dplyr,select)
importFrom(purrr,keep)
importFrom(purrr,map)
Expand All @@ -25,6 +31,11 @@ importFrom(rvest,html_nodes)
importFrom(rvest,html_table)
importFrom(rvest,html_text)
importFrom(rvest,read_html)
importFrom(stringr,str_split_i)
importFrom(stringr,str_to_title)
importFrom(tibble,as_tibble)
importFrom(tibble,tibble)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,pivot_wider)
importFrom(vcfR,read.vcfR)
importFrom(vcfR,vcfR2tidy)
225 changes: 206 additions & 19 deletions R/personalis.R

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ read_samples <- function(sample_paths, io_func, description, ...) {
#' @param col_data {data.frame} data frame that is used as colData (must have rownames that are sample identifiers!)
#' @param sample_col {character} column in `df` that contains the sample identifier
#' @return {tibble} new data frame with dummy entries added
#' @importFrom tibble as_tibble
#' @keywords internal
add_dummy_entry <- function(df, col_data, sample_col = "sample") {
missing_samples <- setdiff(rownames(col_data), unique(df[[sample_col]]))
Expand All @@ -114,3 +115,32 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") {
dummy_entries
)
}

#' Parse a VCF file for a provided path and construct a data frame.
#'
#' Reads the file with `read.vcfR()`, converts it to tidy form with
#' `vcfR2tidy()`, and left-joins the genotype (`gt`) table onto the
#' fixed-field (`fix`) table using `ChromKey`, `POS` and `ALT` as keys.
#'
#' @param path path to VCF file in `*.vcf` or `*.vcf.gz` format
#' @return {tibble} new data frame with all variants (fixed field and genotype information)
#' @importFrom dplyr mutate left_join
#' @importFrom vcfR read.vcfR vcfR2tidy
#' @importFrom stringr str_split_i
#' @importFrom tibble as_tibble
#' @keywords internal
parse_vcf_to_df <- function(path) {
  # parse VCF file
  vcf_content <- read.vcfR(path)

  # Convert to tidy form once and reuse the result for both tables; the
  # conversion walks the whole VCF, so calling it per table doubles the work.
  tidy_vcf <- vcfR2tidy(vcf_content)

  # fixed field content
  fixed_df <- tidy_vcf$fix

  # GT content
  gt_df <- tidy_vcf$gt

  # Create an additional `ALT` column holding the second allele of the
  # genotype, so that the join below does not collide on rows that share
  # ChromKey and POS. The VCF format separates alleles with "/" (unphased)
  # or "|" (phased); both are accepted here.
  # NOTE(review): assumes `gt_GT_alleles` keeps the "|" separator for phased
  # calls -- confirm against vcfR output.
  gt_df <- gt_df |>
    dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "[/|]", 2))

  # next use ChromKey, POS and ALT for joining vcf content data frames
  joined_vcf_df <- fixed_df |>
    dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT"))

  as_tibble(joined_vcf_df)
}
17 changes: 17 additions & 0 deletions man/parse_vcf_to_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion man/read_personalis.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 7 additions & 5 deletions man/read_personalis_small_variant_report_sample.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 8 additions & 1 deletion man/read_personalis_small_variant_reports.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 33 additions & 0 deletions man/read_personalis_variant_calling_summary_statistics.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions man/read_personalis_vcf_files.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions man/read_personalis_vcf_files_sample.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 8f73ffa

Please sign in to comment.