diff --git a/frequently_mutated_genes/frequently_mutated_genes.R b/frequently_mutated_genes/frequently_mutated_genes.R new file mode 100644 index 0000000..1dc8d58 --- /dev/null +++ b/frequently_mutated_genes/frequently_mutated_genes.R @@ -0,0 +1,273 @@ +#Script for computing frequenly mutated genes for the manuscript + +#In summary, dNdScv is run using discovery and validation sets +#separately. + +#Variants are filtered to have AF > 0 in selected sample for each +#patient. + +#For dNdScv, phased mutations (gap <= 10bp) are combined according to +#the input tables to avoid the same mutation being repeated as +#multiple separate ones. + +library(tidyverse) +library(parallel) + +options(readr.show_col_types = FALSE) +options(mc.cores = max(detectCores(), 1)) + +# -------------------------------------------------- +# Inputs +# -------------------------------------------------- + +setwd("/path/to/frequently_mutated_genes") + +#Define inputs +input_sets = list( + "discovery" = list( + "samples" = "phased_mutations/vcf_files.tsv", + "vars_dir" = "data/variants_csv/", + "phased_vars_split" = "phased_mutations/phased_mutations.split.tsv", + "phased_vars_merged" = "phased_mutations/phased_mutations.merged.tsv" + ), + "validation" = list( + "samples" = "phased_mutations/validation.vcf_files.tsv", + "vars_dir" = "data/variants_csv.validation/", + "phased_vars_split" = "phased_mutations/validation.phased_mutations.split.tsv", + "phased_vars_merged" = "phased_mutations/validation.phased_mutations.merged.tsv" + ) +) + +# -------------------------------------------------- +# Read and pre-process data +# -------------------------------------------------- + +#Functions +#--------- + +#Function for reading in phased and split mutation tables with removal of chr prefix from contig names +readPhasedVars = function(file_path) { + read_tsv(file_path) %>% mutate(chr = sub("^chr", "", chr)) +} + +#Function to read and process a single variants table +processVarsInput = function(tbl_path, patient, sample) { + #Keep only specific annotations and AF column for selected tumour sample. Filter AF > 0 + read_tsv(tbl_path) %>% + mutate(FILTER = {if ("FILTER" %in% colnames(.)) FILTER else "PASS"}) %>% #Add FILTER column as PASS only if missing + select( + CHROM, POS, REF, ALT, FILTER, + ends_with(".refGene"), starts_with("CLN"), starts_with("CADD_"), + all_of(paste0(sample, ".AF")) + ) %>% + rename_with( + ~sub(paste0(sample, "\\."), "", .), + starts_with(sample) + ) %>% + filter(AF > 0) %>% + mutate( + patient = patient, + sample = sample + ) %>% + select(patient, sample, everything()) +} + +#Process variants +#---------------- + +#Read and process mutation tables, extracting chosen sample per each patient +#Also prepares the tables for dNdScv with phased variants merged +mutation_sets = mclapply( + input_sets, + function(inputs) { + #Read sample table + tbl_samples = read_tsv(inputs$samples) + + #Read and process mutations, including filtering + tbl_mutations = mclapply( + tbl_samples %>% group_split(patient), + function (x) { + processVarsInput(paste0(inputs$vars_dir, x$patient, ".csv"), x$patient, x$sample) + }, + mc.cores = round(getOption("mc.cores") / min(getOption("mc.cores"), length(input_sets))) %>% + max(1) + ) %>% + do.call(rbind, .) + + #Read phased variant tables + tbl_phased_vars_split = readPhasedVars(inputs$phased_vars_split) + tbl_phased_vars_merged = readPhasedVars(inputs$phased_vars_merged) + + #Output + list( + "mutations" = tbl_mutations, + "phased_vars_split" = tbl_phased_vars_split, + "phased_vars_merged" = tbl_phased_vars_merged + ) + }, + mc.cores = min(getOption("mc.cores"), length(input_sets)) +) + +# -------------------------------------------------- +# Run dNdScv +# -------------------------------------------------- + +#Function for converting the mutation table to format desired by dndscv +#Merges nearby phased mutations according to table input +#Adds distance to nearest variant for investigating complex events +mutsToDndscv = function(muts_tbl, phased_vars_split=NULL, phased_vars_merged=NULL, merge_adjacent=T, compute_dist=T) { + out = muts_tbl %>% + select(sampleID=sample, chr=CHROM, pos=POS, ref=REF, mut=ALT) %>% + mutate(chr = sub("^chr", "", chr)) %>% + filter(chr != "M") + + if (merge_adjacent) { + phased_muts = merge( + out %>% mutate(patient = sub("_.*$", "", sampleID)), + phased_vars_split + ) %>% + group_by(patient, index) %>% + filter(n() > 1) %>% + ungroup() %>% + select(sampleID, chr, pos, ref, mut, patient, index) + + out = out %>% + setdiff(phased_muts %>% select(-patient, -index)) %>% + bind_rows( + phased_vars_merged %>% + merge( + phased_muts %>% select(sampleID, patient, index) %>% distinct(), + by = c("patient", "index") + ) %>% + select(-patient, -index) + ) %>% + arrange(sampleID, chr, pos) + } + + if (compute_dist) { + out = out %>% + group_by(sampleID, chr) %>% + mutate(dist = pmin(c(diff(pos), NA), c(NA, diff(pos)), na.rm=T)) %>% + ungroup() + } + + out +} + +#Prepare dNdScv +library(dndscv) + +#Load hg38 covariates object +load("data/covariates_hg19_hg38_epigenome_pcawg.rda") + +#Prepare gene list without chrY genes +load("data/RefCDS_human_GRCh38_GencodeV18_recommended.rda") + +chr_genes = sapply( + RefCDS, + function(x) c(x$chr, x$gene_name) +) %>% + t() %>% + as.data.frame() %>% + select(chr=V1, gene_name=V2) %>% + filter(chr != "Y") %>% + pull(gene_name) + +#Run dNdScv for each set +dndscv_sets = mclapply( + mutation_sets, + function(mutation_set) { + #Convert mutation table to desired dNdScv format with phased variants merged + tbl_dndscv_mutations = mutsToDndscv( + mutation_set$mutations, + mutation_set$phased_vars_split, + mutation_set$phased_vars_merged + ) + + #Run dNdScv + dndscv( + tbl_dndscv_mutations, gene_list=chr_genes, + refdb="data/RefCDS_human_GRCh38_GencodeV18_recommended.rda", cv=covs + ) + } +) + +# -------------------------------------------------- +# Outputting +# -------------------------------------------------- + +#Functions +#--------- + +#Compute frequency of mutated patients per gene (stop loss is not considered by dndscv) +dndscvMutFreq = function(dndscv_result, sort=F) { + out = dndscv_result$annotmuts %>% + filter(! impact %in% c("Stop_loss", "Synonymous")) %>% + select(sampleID, gene) %>% + distinct() %>% + group_by(gene) %>% + summarise( + nPatients = n(), + freqPatients = nPatients / n_distinct(dndscv_result$annotmuts$sampleID) + ) %>% + merge(dndscv_result$sel_cv %>% select(gene=gene_name), by="gene", all.y=T) %>% + mutate_if(is.numeric, ~ifelse(is.na(.), 0, .)) + + if (sort) { + out %>% arrange(match(gene, dndscv_result$sel_cv$gene_name)) + } else { + out + } +} + +#Write results +#------------- + +#Make sure output directory exists +dir.create("output", showWarnings=F) + +#Write all sets, including combined and filtered list +tbls_sel_cv = lapply( + names(mutation_sets), + function(set_name) { + #Retrieve repeated variables + dndscv_set = dndscv_sets[[set_name]] + + #Merge the sel_cv table with mutation frequencies + tbl_sel_cv = dndscv_set$sel_cv %>% + merge(dndscvMutFreq(dndscv_set), by.x="gene_name", by.y="gene", sort=F) %>% + arrange(match(gene_name, dndscv_set$sel_cv$gene_name)) + + #Write the sel_cv table as well as the annotated mutations of dndscv as is + write_tsv(tbl_sel_cv, paste0("output/", set_name, ".dndscv.sel_cv.tsv")) + write_tsv(dndscv_set$annotmuts, paste0("output/", set_name, ".dndscv.annotmuts.tsv")) + + #Output for merging + tbl_sel_cv %>% + select( + Gene=gene_name, Syn=n_syn, Mis=n_mis, Non=n_non, Spl=n_spl, Ind=n_ind, + `P-value`=pglobal_cv, `Q-value`=qglobal_cv, + Patients=nPatients, `Fraction of Patients`=freqPatients + ) + } +) %>% + setNames(names(mutation_sets)) + +#Side-by-side table of discovery and validation runs with NAs as empty fields +lapply( + c("discovery", "validation"), + function(set_name) { + rbind( + colnames(tbls_sel_cv[[set_name]]), + tbls_sel_cv[[set_name]] %>% + filter(`P-value` < 0.05) %>% + setNames(paste0(set_name, colnames(tbls_sel_cv[[set_name]]))) + ) %>% + mutate(i = row_number()) + } +) %>% + reduce(full_join, by="i") %>% + select(-i) %>% + setNames(NULL) %>% + write_tsv("output/S2.merged.dndscv.sel_cv.tsv", na="", col_names=F) + diff --git a/frequently_mutated_genes/phased_mutations.sh b/frequently_mutated_genes/phased_mutations.sh new file mode 100755 index 0000000..76547b3 --- /dev/null +++ b/frequently_mutated_genes/phased_mutations.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +#Script to produce two tables for matching original nearby (gap < 10bp) +#phased mutations with their merged counterparts for dNdScv + +REFERENCE="/path/to/GRCh38.d1.vd1.fa" + +#Work directory +OUT_DIR="phased_mutations" +mkdir -p $OUT_DIR + +#Function to process one patient's variants to merge and gaps between +function process_phased_mutations_and_gaps() { + local PATIENT=$1 + local VCF=$2 + local PREFIX="$3" + + #Look only at variants with phasing information (PGT:PID) + bcftools view -H $VCF | + cut -f-2,4,5,9,10 | + grep -P ":PGT:PID:" | + perl -e ' + use warnings; + use strict; + + sub which_index { + my ($col, $field_name) = @_; + + my @col_split = split /:/, $col; + + for my $i (0 .. $#col_split) { + return ($i) if ($col_split[$i] eq $field_name); + } + + return -1; + } + + sub get_index { + my ($col, $index) = @_; + + return (split /:/, $col)[$index]; + } + + print "chr\tpos1\tpos2\tref1\tref2\tmut1\tmut2\tindex\tgap\tgapChr\tgapStart\tgapEnd\n"; + + my $index = 1; + my ($prev_chr, $prev_pos, $prev_ref, $prev_mut, $prev_pgt, $prev_pid) = ("", "-inf", "", "", "", ""); + my @prev_line; + + while () { + chomp; + my @line = split /\t/; + + my ($pgt_ind, $pid_ind) = (which_index($line[4], "PGT"), which_index($line[4], "PID")); + my ($chr, $pos, $ref, $mut, $pgt, $pid) = (@line[0 .. 3], get_index($line[5], $pgt_ind), get_index($line[5], $pid_ind)); + + #Same chromosome, PGT, PID and distance is within 10 + my $gap = $pos - $prev_pos - length($prev_ref); + + #gap < 10 to better consider indels; gap >= 0 to avoid first deletion covering second variant + if ( + $prev_chr eq $chr and + $gap < 10 and + $gap >= 0 and + $prev_pgt eq $pgt and + $prev_pid eq $pid + ) { + print join("\t", ($prev_chr, $prev_pos, $pos, $prev_ref, $ref, $prev_mut, $mut, $index++)) . + "\t$gap" . "\t$chr\t" . ($pos - 1 - $gap) . "\t" . ($pos - 1) . "\n"; + } + + ($prev_chr, $prev_pos, $prev_ref, $prev_mut, $prev_pgt, $prev_pid) = ($chr, $pos, $ref, $mut, $pgt, $pid); + @prev_line = @line; + } + ' \ + > $OUT_DIR/${PREFIX}vars_to_merge/$PATIENT.tsv + + awk '{if (NR > 1 && $9 > 0) {print $10, $11, $12}}' OFS="\t" $OUT_DIR/${PREFIX}vars_to_merge/$PATIENT.tsv | + bedtools getfasta -fi $REFERENCE -bed stdin -s -tab -fo stdout | + sed 's/()//; s/[:\-]/\t/g' | + cat <(echo -e "gapChr\tgapStart\tgapEnd\tgapSeq") - \ + > $OUT_DIR/${PREFIX}vars_to_merge.gaps/$PATIENT.tsv +} + +#Main processing function +function generate_phased_mutation_tables() { + local VCF_FILES_TBL=$1 + local PREFIX="$2" + + mkdir -p $OUT_DIR/${PREFIX}vars_to_merge $OUT_DIR/${PREFIX}vars_to_merge.gaps + + #Determine variants (always at most two) to merge + while read PATIENT VCF; do + process_phased_mutations_and_gaps $PATIENT $VCF "$PREFIX" & + done < <(cut -f1,3 $VCF_FILES_TBL | tail -n +2) + + wait + + #Merge results + Rscript -e ' + suppressMessages(library(tidyverse)) + + options(readr.show_col_types=FALSE) + + args = commandArgs(trailingOnly=TRUE) + + in_tbl = args[1] + prefix = paste0("phased_mutations/", args[2]) + + samples = read_tsv(in_tbl) + + mut_tables = lapply( + samples$patient, + function(patient) { + vars_to_merge = read_tsv(paste0(prefix, "vars_to_merge/", patient, ".tsv")) + vars_to_merge_gaps = read_tsv(paste0(prefix, "vars_to_merge.gaps/", patient, ".tsv")) + + vars_processed = vars_to_merge %>% + merge(vars_to_merge_gaps, by = c("gapChr", "gapStart", "gapEnd"), all.x=T) %>% + mutate(gapSeq = ifelse(gap == 0, "", gapSeq)) + + list( + "split" = cbind( + "patient" = rep(patient, nrow(vars_processed)), + rbind( + vars_processed %>% select(chr, pos=pos1, ref=ref1, mut=mut1, index), + vars_processed %>% select(chr, pos=pos2, ref=ref2, mut=mut2, index) + ) %>% + arrange(chr, pos) + ), + "merged" = vars_processed %>% + mutate( + patient = rep(patient, nrow(vars_processed)), + ref = paste0(ref1, gapSeq, ref2), + mut = paste0(mut1, gapSeq, mut2) + ) %>% + select(patient, chr, pos=pos1, mut, ref, index) + ) + } + ) + + lapply(mut_tables, function(x) x$split) %>% + do.call(rbind, .) %>% + arrange(patient, index) %>% + write_tsv(paste0(prefix, "phased_mutations.split.tsv")) + lapply(mut_tables, function(x) x$merged) %>% + do.call(rbind, .) %>% + arrange(patient, index) %>% + write_tsv(paste0(prefix, "phased_mutations.merged.tsv")) + ' $VCF_FILES_TBL "$PREFIX" +} + +#R scripts to produce a table of VCF files to process +#Discovery +Rscript -e ' + suppressMessages(library(tidyverse)) + + suppressMessages(read_tsv("data/samples.tsv")) %>% + filter(gistic) %>% + select(patient, sample) %>% + mutate(vcf = paste0("data/variants_vcf/", patient, ".vcf.gz")) %>% + write_tsv("phased_mutations/vcf_files.tsv") +' + +#Validation +Rscript -e ' + suppressMessages(library(tidyverse)) + + suppressMessages(read_tsv("data/samples.validation.tsv")) %>% + #Purity >= 10% non-duplicate tumour + filter(purity >= 0.1, isMain, !normal) %>% + #Choose sample with the same GISTIC prioritisation + mutate( + psNonAsc = !(tissueType %in% c("ascites", "pleural fluid")), + psNonAdn = !(tissueType %in% c("adnex", "fallopian tube", "ovary")), + psNaive = sampleTime == "primary", + priorityScore = 4 * psNonAsc + 2 * psNonAdn + 1 * psNaive + ) %>% + group_by(patient) %>% + arrange(priorityScore, purity) %>% + slice_tail(n = 1) %>% + ungroup() %>% + #Get VCF path + select(patient, sample) %>% + mutate(vcf = paste0("data/variants_vcf.validation/", patient, ".vcf.gz")) %>% + write_tsv("phased_mutations/validation.vcf_files.tsv") +' + +#Generate the tables +generate_phased_mutation_tables "phased_mutations/vcf_files.tsv" "" +generate_phased_mutation_tables "phased_mutations/validation.vcf_files.tsv" "validation."