-
Notifications
You must be signed in to change notification settings - Fork 1
/
concordance.pm
113 lines (100 loc) · 2.36 KB
/
concordance.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
## functions for concordance analysis
package concordance;
use Exporter 'import';
@EXPORT = qw(getGenoCoord convertVCF readChrom concordanceSite alignAlleles);
sub getGenoCoord {
my $line = shift;
my @entry = split / /, $line;
my ($chr, $pos);
if ($entry[0] =~ /(.+):(\d+)/) {
$chr = $1;
$pos = $2;
}
else {
$chr = $entry[0];
$pos = $entry[2];
}
return ($chr, $pos);
}
sub convertVCF {
my $gt = shift;
my $hom = shift;
my $ans;
my @entry = split /[:\/]/;
if ($entry[0] ne '.') {
$ans = $entry[0] + $entry[1];
}
else {
if ($hom) {
$ans = 0;
}
else {
$ans = 'NA';
}
}
return $ans;
}
sub readChrom {
my $file = shift;
my $chrom = shift;
my @lines = `tabix -p vcf $file $chrom`;
chomp @lines;
my %vcf =
map { my @entry = split /\t/; "$entry[0]:$entry[1]" => $_ } @lines;
\%vcf;
}
## computes concordance at a single site for several samples
## returns reference to array indicating for each sample whether the calls matched
sub concordanceSite {
my $nargs = @_;
my $geno1 = shift;
my $geno2 = shift;
my $idx1 = shift;
my $idx2 = shift;
if ($nargs < 4) {
die "Expected 4 parameters, got only $nargs.";
}
if ($nargs > 4) {
warn
"Ignoring additional parameters. Expected 4 parameters, got $nargs.";
}
if (@$idx1 != @$idx2) {
warn
"Index vectors are of different length. The longer one will be truncated.";
}
my @conc = ();
my $value;
for (my $i = 0; $i < @$idx1 and $i < @$idx2; $i++) {
$value = 0;
if ($$geno1[ $$idx1[$i] ] eq 'NA' or $$geno2[ $$idx2[$i] ] eq 'NA') {
$value = 'NA';
}
elsif ($$geno1[ $$idx1[$i] ] == $$geno2[ $$idx2[$i] ]) {
$value = 1;
}
push @conc, $value;
}
return \@conc;
}
## Ensure that the alles for a given site are lined up properly in two files.
## Matches the alleles and modifies the alleles of the second argument to line up with the first
## where possible.
## The input is expected to be two array references with the alleles in columns 3 and 4. The
## second array should have genotypes starting in column 5.
sub alignAlleles {
my $geno1 = shift;
my @geno2 = @{shift()};
if ($$geno1[3] ne $geno2[3] or $$geno1[4] ne $geno2[4]) {
if ($$geno1[3] eq $geno2[4] and $$geno1[4] eq $geno2[3]) {
$geno2[3] = $$geno1[3];
$geno2[4] = $$geno1[4];
@geno2[ 5 .. $#geno2 ] =
map { 2 - $_ } @geno2[ 5 .. $#geno2 ];
}
else {
undef @geno2;
}
}
return \@geno2;
}
1;