diff --git a/frequently_mutated_genes/frequently_mutated_genes.R b/frequently_mutated_genes/frequently_mutated_genes.R
new file mode 100644
index 0000000..1dc8d58
--- /dev/null
+++ b/frequently_mutated_genes/frequently_mutated_genes.R
@@ -0,0 +1,273 @@
+#Script for computing frequenly mutated genes for the manuscript
+
+#In summary, dNdScv is run using discovery and validation sets
+#separately.
+
+#Variants are filtered to have AF > 0 in selected sample for each
+#patient.
+
+#For dNdScv, phased mutations (gap <= 10bp) are combined according to
+#the input tables to avoid the same mutation being repeated as
+#multiple separate ones.
+
+library(tidyverse)
+library(parallel)
+
+options(readr.show_col_types = FALSE)
+options(mc.cores = max(detectCores(), 1))
+
+# --------------------------------------------------
+# Inputs
+# --------------------------------------------------
+
+setwd("/path/to/frequently_mutated_genes")
+
+#Define inputs
+input_sets = list(
+    "discovery" = list(
+        "samples" = "phased_mutations/vcf_files.tsv",
+        "vars_dir" = "data/variants_csv/",
+        "phased_vars_split" = "phased_mutations/phased_mutations.split.tsv",
+        "phased_vars_merged" = "phased_mutations/phased_mutations.merged.tsv"
+    ),
+    "validation" = list(
+        "samples" = "phased_mutations/validation.vcf_files.tsv",
+        "vars_dir" = "data/variants_csv.validation/",
+        "phased_vars_split" = "phased_mutations/validation.phased_mutations.split.tsv",
+        "phased_vars_merged" = "phased_mutations/validation.phased_mutations.merged.tsv"
+    )
+)
+
+# --------------------------------------------------
+# Read and pre-process data
+# --------------------------------------------------
+
+#Functions
+#---------
+
+#Function for reading in phased and split mutation tables with removal of chr prefix from contig names
+readPhasedVars = function(file_path) {
+    read_tsv(file_path) %>% mutate(chr = sub("^chr", "", chr))
+}
+
+#Function to read and process a single variants table
+processVarsInput = function(tbl_path, patient, sample) {
+    #Keep only specific annotations and AF column for selected tumour sample. Filter AF > 0
+    read_tsv(tbl_path) %>%
+        mutate(FILTER = {if ("FILTER" %in% colnames(.)) FILTER else "PASS"}) %>% #Add FILTER column as PASS only if missing
+        select(
+            CHROM, POS, REF, ALT, FILTER,
+            ends_with(".refGene"), starts_with("CLN"), starts_with("CADD_"),
+            all_of(paste0(sample, ".AF"))
+        ) %>%
+        rename_with(
+            ~sub(paste0(sample, "\\."), "", .),
+            starts_with(sample)
+        ) %>%
+        filter(AF > 0) %>%
+        mutate(
+            patient = patient,
+            sample = sample
+        ) %>%
+        select(patient, sample, everything())
+}
+
+#Process variants
+#----------------
+
+#Read and process mutation tables, extracting chosen sample per each patient
+#Also prepares the tables for dNdScv with phased variants merged
+mutation_sets = mclapply(
+    input_sets,
+    function(inputs) {
+        #Read sample table
+        tbl_samples = read_tsv(inputs$samples)
+
+        #Read and process mutations, including filtering
+        tbl_mutations = mclapply(
+            tbl_samples %>% group_split(patient),
+            function (x) {
+                processVarsInput(paste0(inputs$vars_dir, x$patient, ".csv"), x$patient, x$sample)
+            },
+            mc.cores = round(getOption("mc.cores") / min(getOption("mc.cores"), length(input_sets))) %>%
+                max(1)
+        ) %>%
+            do.call(rbind, .)
+
+        #Read phased variant tables
+        tbl_phased_vars_split  = readPhasedVars(inputs$phased_vars_split)
+        tbl_phased_vars_merged = readPhasedVars(inputs$phased_vars_merged)
+
+        #Output
+        list(
+            "mutations" = tbl_mutations,
+            "phased_vars_split"  = tbl_phased_vars_split,
+            "phased_vars_merged" = tbl_phased_vars_merged
+        )
+    },
+    mc.cores = min(getOption("mc.cores"), length(input_sets))
+)
+
+# --------------------------------------------------
+# Run dNdScv
+# --------------------------------------------------
+
+#Function for converting the mutation table to format desired by dndscv
+#Merges nearby phased mutations according to table input
+#Adds distance to nearest variant for investigating complex events
+mutsToDndscv = function(muts_tbl, phased_vars_split=NULL, phased_vars_merged=NULL, merge_adjacent=T, compute_dist=T) {
+    out = muts_tbl %>%
+        select(sampleID=sample, chr=CHROM, pos=POS, ref=REF, mut=ALT) %>%
+        mutate(chr = sub("^chr", "", chr)) %>%
+        filter(chr != "M")
+
+    if (merge_adjacent) {
+        phased_muts = merge(
+            out %>% mutate(patient = sub("_.*$", "", sampleID)),
+            phased_vars_split
+        ) %>%
+            group_by(patient, index) %>%
+            filter(n() > 1) %>%
+            ungroup() %>%
+            select(sampleID, chr, pos, ref, mut, patient, index)
+
+        out = out %>%
+            setdiff(phased_muts %>% select(-patient, -index)) %>%
+            bind_rows(
+                phased_vars_merged %>%
+                    merge(
+                        phased_muts %>% select(sampleID, patient, index) %>% distinct(),
+                        by = c("patient", "index")
+                    ) %>%
+                    select(-patient, -index)
+            ) %>%
+            arrange(sampleID, chr, pos)
+    }
+
+    if (compute_dist) {
+        out = out %>%
+            group_by(sampleID, chr) %>%
+            mutate(dist = pmin(c(diff(pos), NA), c(NA, diff(pos)), na.rm=T)) %>%
+            ungroup()
+    }
+
+    out
+}
+
+#Prepare dNdScv
+library(dndscv)
+
+#Load hg38 covariates object
+load("data/covariates_hg19_hg38_epigenome_pcawg.rda")
+
+#Prepare gene list without chrY genes
+load("data/RefCDS_human_GRCh38_GencodeV18_recommended.rda")
+
+chr_genes = sapply(
+    RefCDS,
+    function(x) c(x$chr, x$gene_name)
+) %>%
+    t() %>%
+    as.data.frame() %>%
+    select(chr=V1, gene_name=V2) %>%
+    filter(chr != "Y") %>%
+    pull(gene_name)
+
+#Run dNdScv for each set
+dndscv_sets = mclapply(
+    mutation_sets,
+    function(mutation_set) {
+        #Convert mutation table to desired dNdScv format with phased variants merged
+        tbl_dndscv_mutations = mutsToDndscv(
+            mutation_set$mutations,
+            mutation_set$phased_vars_split,
+            mutation_set$phased_vars_merged
+        )
+
+        #Run dNdScv
+        dndscv(
+            tbl_dndscv_mutations, gene_list=chr_genes,
+            refdb="data/RefCDS_human_GRCh38_GencodeV18_recommended.rda", cv=covs
+        )
+    }
+)
+
+# --------------------------------------------------
+# Outputting
+# --------------------------------------------------
+
+#Functions
+#---------
+
+#Compute frequency of mutated patients per gene (stop loss is not considered by dndscv)
+dndscvMutFreq = function(dndscv_result, sort=F) {
+    out = dndscv_result$annotmuts %>%
+        filter(! impact %in% c("Stop_loss", "Synonymous")) %>%
+        select(sampleID, gene) %>%
+        distinct() %>%
+        group_by(gene) %>%
+        summarise(
+            nPatients = n(),
+            freqPatients = nPatients / n_distinct(dndscv_result$annotmuts$sampleID)
+        ) %>%
+        merge(dndscv_result$sel_cv %>% select(gene=gene_name), by="gene", all.y=T) %>%
+        mutate_if(is.numeric, ~ifelse(is.na(.), 0, .))
+
+    if (sort) {
+        out %>% arrange(match(gene, dndscv_result$sel_cv$gene_name))
+    } else {
+        out
+    }
+}
+
+#Write results
+#-------------
+
+#Make sure output directory exists
+dir.create("output", showWarnings=F)
+
+#Write all sets, including combined and filtered list
+tbls_sel_cv = lapply(
+    names(mutation_sets),
+    function(set_name) {
+        #Retrieve repeated variables
+        dndscv_set = dndscv_sets[[set_name]]
+
+        #Merge the sel_cv table with mutation frequencies
+        tbl_sel_cv = dndscv_set$sel_cv %>%
+            merge(dndscvMutFreq(dndscv_set), by.x="gene_name", by.y="gene", sort=F) %>%
+            arrange(match(gene_name, dndscv_set$sel_cv$gene_name))
+
+        #Write the sel_cv table as well as the annotated mutations of dndscv as is
+        write_tsv(tbl_sel_cv, paste0("output/", set_name, ".dndscv.sel_cv.tsv"))
+        write_tsv(dndscv_set$annotmuts, paste0("output/", set_name, ".dndscv.annotmuts.tsv"))
+
+        #Output for merging
+        tbl_sel_cv %>%
+            select(
+                Gene=gene_name, Syn=n_syn, Mis=n_mis, Non=n_non, Spl=n_spl, Ind=n_ind,
+                `P-value`=pglobal_cv, `Q-value`=qglobal_cv,
+                Patients=nPatients, `Fraction of Patients`=freqPatients
+            )
+    }
+) %>%
+    setNames(names(mutation_sets))
+
+#Side-by-side table of discovery and validation runs with NAs as empty fields
+lapply(
+    c("discovery", "validation"),
+    function(set_name) {
+        rbind(
+            colnames(tbls_sel_cv[[set_name]]),
+            tbls_sel_cv[[set_name]] %>%
+                filter(`P-value` < 0.05) %>%
+                setNames(paste0(set_name, colnames(tbls_sel_cv[[set_name]])))
+        ) %>%
+        mutate(i = row_number())
+    }
+) %>%
+    reduce(full_join, by="i") %>%
+    select(-i) %>%
+    setNames(NULL) %>%
+    write_tsv("output/S2.merged.dndscv.sel_cv.tsv", na="", col_names=F)
+
diff --git a/frequently_mutated_genes/phased_mutations.sh b/frequently_mutated_genes/phased_mutations.sh
new file mode 100755
index 0000000..76547b3
--- /dev/null
+++ b/frequently_mutated_genes/phased_mutations.sh
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+#Script to produce two tables for matching original nearby (gap < 10bp)
+#phased mutations with their merged counterparts for dNdScv
+
+REFERENCE="/path/to/GRCh38.d1.vd1.fa"
+
+#Work directory
+OUT_DIR="phased_mutations"
+mkdir -p $OUT_DIR
+
+#Function to process one patient's variants to merge and gaps between
+function process_phased_mutations_and_gaps() {
+    local PATIENT=$1
+    local VCF=$2
+    local PREFIX="$3"
+
+    #Look only at variants with phasing information (PGT:PID)
+    bcftools view -H $VCF |
+        cut -f-2,4,5,9,10 |
+        grep -P ":PGT:PID:" |
+        perl -e '
+            use warnings;
+            use strict;
+
+            sub which_index {
+                my ($col, $field_name) = @_;
+
+                my @col_split = split /:/, $col;
+
+                for my $i (0 .. $#col_split) {
+                    return ($i) if ($col_split[$i] eq $field_name);
+                }
+
+                return -1;
+            }
+
+            sub get_index {
+                my ($col, $index) = @_;
+
+                return (split /:/, $col)[$index];
+            }
+
+            print "chr\tpos1\tpos2\tref1\tref2\tmut1\tmut2\tindex\tgap\tgapChr\tgapStart\tgapEnd\n";
+
+            my $index = 1;
+            my ($prev_chr, $prev_pos, $prev_ref, $prev_mut, $prev_pgt, $prev_pid) = ("", "-inf", "", "", "", "");
+            my @prev_line;
+
+            while (<STDIN>) {
+                chomp;
+                my @line = split /\t/;
+
+                my ($pgt_ind, $pid_ind) = (which_index($line[4], "PGT"), which_index($line[4], "PID"));
+                my ($chr, $pos, $ref, $mut, $pgt, $pid) = (@line[0 .. 3], get_index($line[5], $pgt_ind), get_index($line[5], $pid_ind));
+
+                #Same chromosome, PGT, PID and distance is within 10
+                my $gap = $pos - $prev_pos - length($prev_ref);
+
+                #gap < 10 to better consider indels; gap >= 0 to avoid first deletion covering second variant
+                if (
+                    $prev_chr eq $chr and
+                    $gap < 10 and
+                    $gap >= 0 and
+                    $prev_pgt eq $pgt and
+                    $prev_pid eq $pid
+                ) {
+                    print join("\t", ($prev_chr, $prev_pos, $pos, $prev_ref, $ref, $prev_mut, $mut, $index++)) .
+                        "\t$gap" . "\t$chr\t" . ($pos - 1 - $gap) . "\t" . ($pos - 1) . "\n";
+                }
+
+                ($prev_chr, $prev_pos, $prev_ref, $prev_mut, $prev_pgt, $prev_pid) = ($chr, $pos, $ref, $mut, $pgt, $pid);
+                @prev_line = @line;
+            }
+        ' \
+            > $OUT_DIR/${PREFIX}vars_to_merge/$PATIENT.tsv
+
+    awk '{if (NR > 1 && $9 > 0) {print $10, $11, $12}}' OFS="\t" $OUT_DIR/${PREFIX}vars_to_merge/$PATIENT.tsv |
+        bedtools getfasta -fi $REFERENCE -bed stdin -s -tab -fo stdout |
+        sed 's/()//; s/[:\-]/\t/g' |
+        cat <(echo -e "gapChr\tgapStart\tgapEnd\tgapSeq") - \
+            > $OUT_DIR/${PREFIX}vars_to_merge.gaps/$PATIENT.tsv
+}
+
+#Main processing function
+function generate_phased_mutation_tables() {
+    local VCF_FILES_TBL=$1
+    local PREFIX="$2"
+
+    mkdir -p $OUT_DIR/${PREFIX}vars_to_merge $OUT_DIR/${PREFIX}vars_to_merge.gaps
+
+    #Determine variants (always at most two) to merge
+    while read PATIENT VCF; do
+        process_phased_mutations_and_gaps $PATIENT $VCF "$PREFIX" &
+    done < <(cut -f1,3 $VCF_FILES_TBL | tail -n +2)
+
+    wait
+
+    #Merge results
+    Rscript -e '
+        suppressMessages(library(tidyverse))
+
+        options(readr.show_col_types=FALSE)
+
+        args = commandArgs(trailingOnly=TRUE)
+
+        in_tbl = args[1]
+        prefix = paste0("phased_mutations/", args[2])
+
+        samples = read_tsv(in_tbl)
+
+        mut_tables = lapply(
+            samples$patient,
+            function(patient) {
+                vars_to_merge = read_tsv(paste0(prefix, "vars_to_merge/", patient, ".tsv"))
+                vars_to_merge_gaps = read_tsv(paste0(prefix, "vars_to_merge.gaps/", patient, ".tsv"))
+
+                vars_processed = vars_to_merge %>%
+                    merge(vars_to_merge_gaps, by = c("gapChr", "gapStart", "gapEnd"), all.x=T) %>%
+                    mutate(gapSeq = ifelse(gap == 0, "", gapSeq))
+
+                list(
+                    "split" = cbind(
+                        "patient" = rep(patient, nrow(vars_processed)),
+                        rbind(
+                            vars_processed %>% select(chr, pos=pos1, ref=ref1, mut=mut1, index),
+                            vars_processed %>% select(chr, pos=pos2, ref=ref2, mut=mut2, index)
+                        ) %>%
+                            arrange(chr, pos)
+                    ),
+                    "merged" = vars_processed %>%
+                        mutate(
+                            patient = rep(patient, nrow(vars_processed)),
+                            ref = paste0(ref1, gapSeq, ref2),
+                            mut = paste0(mut1, gapSeq, mut2)
+                        ) %>%
+                        select(patient, chr, pos=pos1, mut, ref, index)
+                )
+            }
+        )
+
+        lapply(mut_tables, function(x) x$split) %>%
+            do.call(rbind, .) %>%
+            arrange(patient, index) %>%
+            write_tsv(paste0(prefix, "phased_mutations.split.tsv"))
+        lapply(mut_tables, function(x) x$merged) %>%
+            do.call(rbind, .) %>%
+            arrange(patient, index) %>%
+            write_tsv(paste0(prefix, "phased_mutations.merged.tsv"))
+    ' $VCF_FILES_TBL "$PREFIX"
+}
+
+#R scripts to produce a table of VCF files to process
+#Discovery
+Rscript -e '
+    suppressMessages(library(tidyverse))
+
+    suppressMessages(read_tsv("data/samples.tsv")) %>%
+        filter(gistic) %>%
+        select(patient, sample) %>%
+        mutate(vcf = paste0("data/variants_vcf/", patient, ".vcf.gz")) %>%
+        write_tsv("phased_mutations/vcf_files.tsv")
+'
+
+#Validation
+Rscript -e '
+    suppressMessages(library(tidyverse))
+
+    suppressMessages(read_tsv("data/samples.validation.tsv")) %>%
+        #Purity >= 10% non-duplicate tumour
+        filter(purity >= 0.1, isMain, !normal) %>%
+        #Choose sample with the same GISTIC prioritisation
+        mutate(
+            psNonAsc = !(tissueType %in% c("ascites", "pleural fluid")),
+            psNonAdn = !(tissueType %in% c("adnex", "fallopian tube", "ovary")),
+            psNaive = sampleTime == "primary",
+            priorityScore = 4 * psNonAsc + 2 * psNonAdn + 1 * psNaive
+        ) %>%
+        group_by(patient) %>%
+        arrange(priorityScore, purity) %>%
+        slice_tail(n = 1) %>%
+        ungroup() %>%
+        #Get VCF path
+        select(patient, sample) %>%
+        mutate(vcf = paste0("data/variants_vcf.validation/", patient, ".vcf.gz")) %>%
+        write_tsv("phased_mutations/validation.vcf_files.tsv")
+'
+
+#Generate the tables
+generate_phased_mutation_tables "phased_mutations/vcf_files.tsv" ""
+generate_phased_mutation_tables "phased_mutations/validation.vcf_files.tsv" "validation."