-
Notifications
You must be signed in to change notification settings - Fork 10
WES Benchmark Validation
Mike Lloyd edited this page Feb 17, 2023
·
3 revisions
library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)
library(data.table)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
Attaching package: ‘data.table’
The following objects are masked from ‘package:dplyr’:
between, first, last
Mm_WES_truth <- as.data.frame(fread('/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/WES/gold_truth_vcf/Mus_musculus.GRCm38_simVar_60x_FINAL_ALLchr_golden.vcf'))
colnames(Mm_WES_truth)[1] <- 'CHROM'
head(Mm_WES_truth)
CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | |
---|---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | |
1 | 10 | 3267307 | . | G | T | . | PASS | WP=1 |
2 | 10 | 3267328 | . | G | T | . | PASS | WP=1 |
3 | 10 | 3463324 | . | C | T | . | PASS | WP=1 |
4 | 10 | 3903531 | . | A | G | . | PASS | WP=1 |
5 | 10 | 3940612 | . | C | T | . | PASS | WP=1 |
6 | 10 | 3945884 | . | G | A | . | PASS | WP=1 |
nrow(Mm_WES_truth)
48366
head(Mm_WES_truth)
CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | |
---|---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | |
1 | 10 | 3267307 | . | G | T | . | PASS | WP=1 |
2 | 10 | 3267328 | . | G | T | . | PASS | WP=1 |
3 | 10 | 3463324 | . | C | T | . | PASS | WP=1 |
4 | 10 | 3903531 | . | A | G | . | PASS | WP=1 |
5 | 10 | 3940612 | . | C | T | . | PASS | WP=1 |
6 | 10 | 3945884 | . | G | A | . | PASS | WP=1 |
ind <- duplicated(Mm_WES_truth[,1:8])
table(ind)
ind
FALSE
48366
Mm_WES_truth[ind,]
CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO |
---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> |
Mm_WES_test <- read.delim('/projects/omics_share/meta/benchmarking/WES_mouse_nxf_bench/Mus_musculus.GRCm38_simVar_60x_allChr/Mus_musculus.GRCm38_simVar_60x_allChr_snpsift_finalTable.txt', stringsAsFactors = F)
head(Mm_WES_test)
CHROM | POS | REF | ALT | ID | FILTER | QUAL | FILTER.1 | AF | SNPEFF_FUNCTIONAL_CLASS | SNPEFF_GENE_NAME | SNPEFF_AMINO_ACID_CHANGE | SNPEFF_EFFECT | SNPEFF_TRANSCRIPT_ID | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <chr> | <dbl> | <chr> | <dbl> | <chr> | <chr> | <chr> | <chr> | <chr> | |
1 | 1 | 3670606 | C | A | PASS | 849.64 | PASS | 0.5 | MISSENSE | Xkr4 | C248F | NON_SYNONYMOUS_CODING | ENSMUST00000070533 | |
2 | 1 | 4292973 | C | T | PASS | 848.64 | PASS | 0.5 | MISSENSE | Rp1 | V344I | NON_SYNONYMOUS_CODING | ENSMUST00000208660 | |
3 | 1 | 4346268 | T | C | PASS | 1647.64 | PASS | 0.5 | SILENT | Rp1 | E1540 | SYNONYMOUS_CODING | ENSMUST00000027032 | |
4 | 1 | 4346499 | T | A | PASS | 1595.64 | PASS | 0.5 | MISSENSE | Rp1 | E1463D | NON_SYNONYMOUS_CODING | ENSMUST00000027032 | |
5 | 1 | 4346590 | A | G | PASS | 1677.64 | PASS | 0.5 | MISSENSE | Rp1 | I1433T | NON_SYNONYMOUS_CODING | ENSMUST00000027032 | |
6 | 1 | 4348204 | C | T | PASS | 2083.64 | PASS | 0.5 | MISSENSE | Rp1 | G895D | NON_SYNONYMOUS_CODING | ENSMUST00000027032 |
joined_data <- dplyr::left_join(Mm_WES_truth, Mm_WES_test, by = c('CHROM' = 'CHROM', 'POS' = 'POS'))
TP_hits <- joined_data %>% filter(complete.cases(.)) %>% filter(ALT.x == ALT.y)
# filter cases where the position is called, but the 'ALT' call is wrong. These calls are false positive, as a call was made, but it was incorrect.
TP <- nrow(TP_hits)
FN_hits <- joined_data %>% filter(!complete.cases(.))
FN <- nrow(FN_hits)
FP_hits <- dplyr::left_join(Mm_WES_test,Mm_WES_truth, by = c('CHROM' = 'CHROM', 'POS' = 'POS')) %>% filter(!complete.cases(.))
FP <- nrow(FP_hits) + nrow(joined_data %>% filter(complete.cases(.)) %>% filter(ALT.x != ALT.y))
table(TP_hits$FILTER.y)
LowCoverage LowQD PASS SnpCluster StrandBias
53 1 47172 12 2
table(FP_hits$FILTER.x)
PASS
583
#Precision = TruePositives / (TruePositives + FalsePositives)
precision = TP / (TP + FP)
#Recall = TruePositives / (TruePositives + FalseNegatives)
recall = TP / (TP + FN)
round(precision, digits = 4)
0.9878
round(recall, digits = 4)
0.9767
Hs_WES_truth <- as.data.frame(fread('/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/WES/gold_truth_vcf/Homo_sapiens.GRCh38_simVar_60x_ALLchr_golden_sorted.vcf'))
colnames(Hs_WES_truth)[1] <- 'CHROM'
nrow(Hs_WES_truth)
51468
head(Hs_WES_truth)
CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | |
---|---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | |
1 | chr10 | 221321 | . | G | A | . | PASS | WP=1 |
2 | chr10 | 236865 | . | C | A | . | PASS | WP=1 |
3 | chr10 | 252317 | . | T | C | . | PASS | WP=1 |
4 | chr10 | 286349 | . | T | C | . | PASS | WP=1 |
5 | chr10 | 324915 | . | T | C | . | PASS | WP=1 |
6 | chr10 | 348785 | . | T | A | . | PASS | WP=1 |
ind <- duplicated(Hs_WES_truth[,1:8])
table(ind)
ind
FALSE
51468
Hs_WES_truth[ind,]
CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO |
---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> |
Hs_WES_test <- read.delim('/projects/omics_share/meta/benchmarking/WES_human_nxf_bench/Homo_sapiens.GRCh38_simVar_60x_allChr/Homo_sapiens.GRCh38_simVar_60x_allChr_snpsift_finalTable.txt', stringsAsFactors = F)
head(Hs_WES_test)
CHROM | POS | ID | REF | ALT | QUAL | FILTER | ANN....ALLELE | ANN....EFFECT | ANN....IMPACT | ⋯ | dbNSFP_Polyphen2_HDIV_score | dbNSFP_MutationAssessor_score | dbNSFP_phyloP100way_vertebrate | dbNSFP_1000Gp3_AF | dbNSFP_1000Gp3_AFR_AF | dbNSFP_1000Gp3_EUR_AF | dbNSFP_1000Gp3_AMR_AF | dbNSFP_1000Gp3_EAS_AF | dbNSFP_ESP6500_AA_AF | dbNSFP_ESP6500_EA_AF | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <dbl> | <chr> | <chr> | <chr> | <chr> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | chr1 | 942193 | G | A | 998.64 | PASS | A | synonymous_variant | LOW | ⋯ | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | |
2 | chr1 | 942193 | G | A | 998.64 | PASS | A | downstream_gene_variant | MODIFIER | ⋯ | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | |
3 | chr1 | 943029 | rs181205550;COSV58985687 | C | T | 1205.64 | PASS | T | missense_variant | MODERATE | ⋯ | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
4 | chr1 | 943029 | rs181205550;COSV58985687 | C | T | 1205.64 | PASS | T | downstream_gene_variant | MODIFIER | ⋯ | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
5 | chr1 | 943344 | rs1198509075 | G | A | 632.64 | PASS | A | synonymous_variant | LOW | ⋯ | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
6 | chr1 | 943344 | rs1198509075 | G | A | 632.64 | PASS | A | downstream_gene_variant | MODIFIER | ⋯ | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
Hs_WES_test_short <- Hs_WES_test %>% dplyr::select(CHROM, POS, ID, REF, ALT, QUAL, FILTER) %>% distinct(CHROM, POS, ID, REF, ALT, QUAL, FILTER, .keep_all = TRUE)
head(Hs_WES_test_short)
CHROM | POS | ID | REF | ALT | QUAL | FILTER | |
---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <dbl> | <chr> | |
1 | chr1 | 942193 | G | A | 998.64 | PASS | |
2 | chr1 | 943029 | rs181205550;COSV58985687 | C | T | 1205.64 | PASS |
3 | chr1 | 943344 | rs1198509075 | G | A | 632.64 | PASS |
4 | chr1 | 943950 | rs768785250 | C | G | 1091.64 | PASS |
5 | chr1 | 945215 | G | T | 916.64 | PASS | |
6 | chr1 | 945255 | G | A | 691.64 | PASS |
nrow(Hs_WES_test_short)
51390
joined_data <- dplyr::left_join(Hs_WES_truth, Hs_WES_test_short, by = c('CHROM' = 'CHROM', 'POS' = 'POS')) %>% dplyr::select(CHROM, POS, REF.x, ALT.x, REF.y, ALT.y, FILTER.x, FILTER.y)
head(joined_data)
CHROM | POS | REF.x | ALT.x | REF.y | ALT.y | FILTER.x | FILTER.y | |
---|---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | |
1 | chr10 | 221321 | G | A | G | A | PASS | PASS |
2 | chr10 | 236865 | C | A | C | A | PASS | PASS |
3 | chr10 | 252317 | T | C | T | C | PASS | PASS |
4 | chr10 | 286349 | T | C | T | C | PASS | PASS |
5 | chr10 | 324915 | T | C | T | C | PASS | PASS |
6 | chr10 | 348785 | T | A | T | A | PASS | PASS |
TP_hits <- joined_data %>% filter(ALT.x == ALT.y)
# filter cases where the position is called, but the 'ALT' call is wrong. These calls are false positive, as a call was made, but it was incorrect.
head(TP_hits)
CHROM | POS | REF.x | ALT.x | REF.y | ALT.y | FILTER.x | FILTER.y | |
---|---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | |
1 | chr10 | 221321 | G | A | G | A | PASS | PASS |
2 | chr10 | 236865 | C | A | C | A | PASS | PASS |
3 | chr10 | 252317 | T | C | T | C | PASS | PASS |
4 | chr10 | 286349 | T | C | T | C | PASS | PASS |
5 | chr10 | 324915 | T | C | T | C | PASS | PASS |
6 | chr10 | 348785 | T | A | T | A | PASS | PASS |
TP <- nrow(TP_hits)
TP
50722
FN_hits <- joined_data %>% filter(!complete.cases(.))
FN <- nrow(FN_hits)
FP_hits <- dplyr::left_join(Hs_WES_test, Hs_WES_truth, by = c('CHROM' = 'CHROM', 'POS' = 'POS')) %>% dplyr::select(CHROM, POS, REF.x, ALT.x, REF.y, ALT.y, FILTER.x, FILTER.y) %>% filter(!complete.cases(.))
head(FP_hits)
CHROM | POS | REF.x | ALT.x | REF.y | ALT.y | FILTER.x | FILTER.y | |
---|---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | |
1 | chr1 | 966556 | C | CATGCTAGA | NA | NA | PASS | NA |
2 | chr1 | 966556 | C | CATGCTAGA | NA | NA | PASS | NA |
3 | chr1 | 966556 | C | CATGCTAGA | NA | NA | PASS | NA |
4 | chr1 | 966556 | C | CATGCTAGA | NA | NA | PASS | NA |
5 | chr1 | 1307751 | C | CG | NA | NA | PASS | NA |
6 | chr1 | 1307751 | C | CG | NA | NA | PASS | NA |
FP <- nrow(FP_hits) + nrow(joined_data %>% filter(complete.cases(.)) %>% filter(ALT.x != ALT.y))
table(TP_hits$FILTER.y)
LowCoverage PASS SnpCluster StrandBias
35 50680 3 4
table(FP_hits$FILTER.x)
LowCoverage PASS
5 2915
#Precision = TruePositives / (TruePositives + FalsePositives)
precision = TP / (TP + FP)
#Recall = TruePositives / (TruePositives + FalseNegatives)
recall = TP / (TP + FN)
round(precision, digits = 4)
0.9455
round(recall, digits = 4)
0.9855