Skip to content

WES Benchmark Validation

Mike Lloyd edited this page Feb 17, 2023 · 3 revisions

Library Load

library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)
library(data.table)
Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last

Mouse WES Comparison

Import Data

Truth Set

Mm_WES_truth <- as.data.frame(fread('/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/WES/gold_truth_vcf/Mus_musculus.GRCm38_simVar_60x_FINAL_ALLchr_golden.vcf'))
colnames(Mm_WES_truth)[1] <- 'CHROM'
head(Mm_WES_truth)
A data.frame: 6 × 8
CHROM POS ID REF ALT QUAL FILTER INFO
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 10 3267307 . G T . PASS WP=1
2 10 3267328 . G T . PASS WP=1
3 10 3463324 . C T . PASS WP=1
4 10 3903531 . A G . PASS WP=1
5 10 3940612 . C T . PASS WP=1
6 10 3945884 . G A . PASS WP=1
nrow(Mm_WES_truth)

48366

head(Mm_WES_truth)
A data.frame: 6 × 8
CHROM POS ID REF ALT QUAL FILTER INFO
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 10 3267307 . G T . PASS WP=1
2 10 3267328 . G T . PASS WP=1
3 10 3463324 . C T . PASS WP=1
4 10 3903531 . A G . PASS WP=1
5 10 3940612 . C T . PASS WP=1
6 10 3945884 . G A . PASS WP=1
ind <- duplicated(Mm_WES_truth[,1:8])
table(ind)
ind
FALSE 
48366 
Mm_WES_truth[ind,]
A data.frame: 0 × 8
CHROM POS ID REF ALT QUAL FILTER INFO
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>

Test Set

Mm_WES_test <- read.delim('/projects/omics_share/meta/benchmarking/WES_mouse_nxf_bench/Mus_musculus.GRCm38_simVar_60x_allChr/Mus_musculus.GRCm38_simVar_60x_allChr_snpsift_finalTable.txt', stringsAsFactors = F)
head(Mm_WES_test)
A data.frame: 6 × 14
CHROM POS REF ALT ID FILTER QUAL FILTER.1 AF SNPEFF_FUNCTIONAL_CLASS SNPEFF_GENE_NAME SNPEFF_AMINO_ACID_CHANGE SNPEFF_EFFECT SNPEFF_TRANSCRIPT_ID
<chr> <int> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
1 1 3670606 C A PASS 849.64 PASS 0.5 MISSENSE Xkr4 C248F NON_SYNONYMOUS_CODING ENSMUST00000070533
2 1 4292973 C T PASS 848.64 PASS 0.5 MISSENSE Rp1 V344I NON_SYNONYMOUS_CODING ENSMUST00000208660
3 1 4346268 T C PASS 1647.64 PASS 0.5 SILENT Rp1 E1540 SYNONYMOUS_CODING ENSMUST00000027032
4 1 4346499 T A PASS 1595.64 PASS 0.5 MISSENSE Rp1 E1463D NON_SYNONYMOUS_CODING ENSMUST00000027032
5 1 4346590 A G PASS 1677.64 PASS 0.5 MISSENSE Rp1 I1433T NON_SYNONYMOUS_CODING ENSMUST00000027032
6 1 4348204 C T PASS 2083.64 PASS 0.5 MISSENSE Rp1 G895D NON_SYNONYMOUS_CODING ENSMUST00000027032

Compare Truth to Test

joined_data <- dplyr::left_join(Mm_WES_truth, Mm_WES_test, by = c('CHROM' = 'CHROM', 'POS' = 'POS'))
TP_hits <- joined_data %>% filter(complete.cases(.)) %>% filter(ALT.x == ALT.y) 

# filter cases where the position is called, but the 'ALT' call is wrong. These calls are false positive, as a call was made, but it was incorrect. 
TP <- nrow(TP_hits)
FN_hits <- joined_data %>% filter(!complete.cases(.))
FN <- nrow(FN_hits)
FP_hits <- dplyr::left_join(Mm_WES_test,Mm_WES_truth, by = c('CHROM' = 'CHROM', 'POS' = 'POS')) %>% filter(!complete.cases(.))
FP <- nrow(FP_hits) + nrow(joined_data %>% filter(complete.cases(.)) %>% filter(ALT.x != ALT.y))
table(TP_hits$FILTER.y)
LowCoverage       LowQD        PASS  SnpCluster  StrandBias 
         53           1       47172          12           2 
table(FP_hits$FILTER.x)
PASS 
 583 
#Precision = TruePositives / (TruePositives + FalsePositives)

precision = TP / (TP + FP)
#Recall = TruePositives / (TruePositives + FalseNegatives)

recall = TP / (TP + FN)
round(precision, digits = 4)

0.9878

round(recall, digits = 4)

0.9767

Human WES Comparison

Import Data

Truth Set

Hs_WES_truth <- as.data.frame(fread('/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/WES/gold_truth_vcf/Homo_sapiens.GRCh38_simVar_60x_ALLchr_golden_sorted.vcf'))
colnames(Hs_WES_truth)[1] <- 'CHROM'
nrow(Hs_WES_truth)

51468

head(Hs_WES_truth)
A data.frame: 6 × 8
CHROM POS ID REF ALT QUAL FILTER INFO
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 chr10 221321 . G A . PASS WP=1
2 chr10 236865 . C A . PASS WP=1
3 chr10 252317 . T C . PASS WP=1
4 chr10 286349 . T C . PASS WP=1
5 chr10 324915 . T C . PASS WP=1
6 chr10 348785 . T A . PASS WP=1
ind <- duplicated(Hs_WES_truth[,1:8])
table(ind)
ind
FALSE 
51468 
Hs_WES_truth[ind,]
A data.frame: 0 × 8
CHROM POS ID REF ALT QUAL FILTER INFO
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>

Test Set

Hs_WES_test <- read.delim('/projects/omics_share/meta/benchmarking/WES_human_nxf_bench/Homo_sapiens.GRCh38_simVar_60x_allChr/Homo_sapiens.GRCh38_simVar_60x_allChr_snpsift_finalTable.txt', stringsAsFactors = F)
head(Hs_WES_test)
A data.frame: 6 × 45
CHROM POS ID REF ALT QUAL FILTER ANN....ALLELE ANN....EFFECT ANN....IMPACT dbNSFP_Polyphen2_HDIV_score dbNSFP_MutationAssessor_score dbNSFP_phyloP100way_vertebrate dbNSFP_1000Gp3_AF dbNSFP_1000Gp3_AFR_AF dbNSFP_1000Gp3_EUR_AF dbNSFP_1000Gp3_AMR_AF dbNSFP_1000Gp3_EAS_AF dbNSFP_ESP6500_AA_AF dbNSFP_ESP6500_EA_AF
<chr> <int> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 chr1 942193 G A 998.64 PASS A synonymous_variant LOW NA NA NA NA NA NA NA NA NA NA
2 chr1 942193 G A 998.64 PASS A downstream_gene_variant MODIFIER NA NA NA NA NA NA NA NA NA NA
3 chr1 943029 rs181205550;COSV58985687 C T 1205.64 PASS T missense_variant MODERATE NA NA NA NA NA NA NA NA NA NA
4 chr1 943029 rs181205550;COSV58985687 C T 1205.64 PASS T downstream_gene_variant MODIFIER NA NA NA NA NA NA NA NA NA NA
5 chr1 943344 rs1198509075 G A 632.64 PASS A synonymous_variant LOW NA NA NA NA NA NA NA NA NA NA
6 chr1 943344 rs1198509075 G A 632.64 PASS A downstream_gene_variant MODIFIER NA NA NA NA NA NA NA NA NA NA
Hs_WES_test_short <- Hs_WES_test %>% dplyr::select(CHROM, POS, ID, REF, ALT, QUAL, FILTER) %>% distinct(CHROM, POS, ID, REF, ALT, QUAL, FILTER, .keep_all = TRUE)
head(Hs_WES_test_short)
A data.frame: 6 × 7
CHROM POS ID REF ALT QUAL FILTER
<chr> <int> <chr> <chr> <chr> <dbl> <chr>
1 chr1 942193 G A 998.64 PASS
2 chr1 943029 rs181205550;COSV58985687 C T 1205.64 PASS
3 chr1 943344 rs1198509075 G A 632.64 PASS
4 chr1 943950 rs768785250 C G 1091.64 PASS
5 chr1 945215 G T 916.64 PASS
6 chr1 945255 G A 691.64 PASS
nrow(Hs_WES_test_short)

51390

Compare Truth to Test

joined_data <- dplyr::left_join(Hs_WES_truth, Hs_WES_test_short, by = c('CHROM' = 'CHROM', 'POS' = 'POS')) %>% dplyr::select(CHROM, POS, REF.x, ALT.x, REF.y, ALT.y, FILTER.x, FILTER.y)
head(joined_data)
A data.frame: 6 × 8
CHROM POS REF.x ALT.x REF.y ALT.y FILTER.x FILTER.y
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 chr10 221321 G A G A PASS PASS
2 chr10 236865 C A C A PASS PASS
3 chr10 252317 T C T C PASS PASS
4 chr10 286349 T C T C PASS PASS
5 chr10 324915 T C T C PASS PASS
6 chr10 348785 T A T A PASS PASS
TP_hits <- joined_data %>% filter(ALT.x == ALT.y) 

# filter cases where the position is called, but the 'ALT' call is wrong. These calls are false positive, as a call was made, but it was incorrect. 
head(TP_hits)
A data.frame: 6 × 8
CHROM POS REF.x ALT.x REF.y ALT.y FILTER.x FILTER.y
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 chr10 221321 G A G A PASS PASS
2 chr10 236865 C A C A PASS PASS
3 chr10 252317 T C T C PASS PASS
4 chr10 286349 T C T C PASS PASS
5 chr10 324915 T C T C PASS PASS
6 chr10 348785 T A T A PASS PASS
TP <- nrow(TP_hits)
TP

50722

FN_hits <- joined_data %>% filter(!complete.cases(.))
FN <- nrow(FN_hits)
FP_hits <- dplyr::left_join(Hs_WES_test, Hs_WES_truth, by = c('CHROM' = 'CHROM', 'POS' = 'POS')) %>% dplyr::select(CHROM, POS, REF.x, ALT.x, REF.y, ALT.y, FILTER.x, FILTER.y) %>% filter(!complete.cases(.))
head(FP_hits)
A data.frame: 6 × 8
CHROM POS REF.x ALT.x REF.y ALT.y FILTER.x FILTER.y
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 chr1 966556 C CATGCTAGA NA NA PASS NA
2 chr1 966556 C CATGCTAGA NA NA PASS NA
3 chr1 966556 C CATGCTAGA NA NA PASS NA
4 chr1 966556 C CATGCTAGA NA NA PASS NA
5 chr1 1307751 C CG NA NA PASS NA
6 chr1 1307751 C CG NA NA PASS NA
FP <- nrow(FP_hits) + nrow(joined_data %>% filter(complete.cases(.)) %>% filter(ALT.x != ALT.y))
table(TP_hits$FILTER.y)
LowCoverage        PASS  SnpCluster  StrandBias 
         35       50680           3           4 
table(FP_hits$FILTER.x)
LowCoverage        PASS 
          5        2915 
#Precision = TruePositives / (TruePositives + FalsePositives)

precision = TP / (TP + FP)
#Recall = TruePositives / (TruePositives + FalseNegatives)

recall = TP / (TP + FN)
round(precision, digits = 4)

0.9455

round(recall, digits = 4)

0.9855

Clone this wiki locally