-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathFreqOfSite.pl
executable file
·87 lines (76 loc) · 2.86 KB
/
FreqOfSite.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/perl
# load module
use strict;
use Getopt::Long;
use Bio::SeqIO;
# Use this script to get the frequency of a particular nucleotide at a position
# from an epidemiological set for example.
# Users provides an alignment that contains a reference
# The alignment may be gapped, so you provide the name of your reference you are comparing to.
# The user needs to know how many gaps are in the reference
#global vars
my ( $help, $ref, $set, $site);
GetOptions( "ref=s" => \$ref, # name of the reference to use
"set=s" => \$set, #the name of the aligned fasta file which contains the alignment
"site=i" => \$site, #
"help" => \$help, ); # to get help
if(!($help)&&!($ref))
{
print "Usage : FreqofSite.pl <list of arguments>, all arguments are necessary\n";
print "Part of Sequence-manipulation by J. Hughes (joseph(dot)hughes(at)glasgow(dot)ac(dot)uk\n\n";
print "The columns/sites with gaps in the reference are removed and the position provided by the user\n";
print "is from the ungapped reference\n";
print " -ref <txt> - name of the reference sequence to use\n";
print " -set <txt> an alignment of fasta sequences\n";
print " -site <txt> the site wehere there is a mutation of interest\n";
print " -help - Get more detailed help\n";
exit();
}
if($help)
{
print "To help with getting the frequency of a mutations at a site\n\n";
print "---------------------------------------------------------------------\n";
print " -ref <txt> - name of the reference sequence to use\n";
print " -set <txt> an alignment of fasta sequences\n";
print " -site <txt> the site wehere there is a mutation of interest\n";
print " -help - Gets this help information\n";
exit();
}
# matrix used to store the proportion of ATCG at a site
my (%matrix,%refungapped,@ref);
my $inseq = Bio::SeqIO->new(-file => "$set" ,
-format => 'fasta');
print "$ref\n";
my $gapcnt=0;
while ( my $seq_obj = $inseq->next_seq() ) {
my $id=$seq_obj->display_id;
#print "$id\n";
if ($id=~/$ref/){
#print "$id $ref\n";
@ref=split(//,uc($seq_obj->seq));
for (my $i=0;$i<scalar(@ref); $i++){
#print "$ref[$i]\n";
if ($ref[$i]=~/\-/){
#print "gap\n";
$gapcnt++;
my $ungappedsite=$i+1-$gapcnt;
my $alignsite=$i+1;
#print "$ungappedsite $alignsite\n";
$refungapped{$ungappedsite}=$i+1;
}else{
my $ungappedsite=$i+1-$gapcnt;
$refungapped{$ungappedsite}=$i+1;
}
}
}
my @nucs=split(//,uc($seq_obj->seq));
for (my $j=0;$j<scalar(@nucs); $j++){
$matrix{$j+1}{$nucs[$j]}++;
}
}
print "Site $site Reference $ref[$refungapped{$site}]\n";
print "Site with gaps $refungapped{$site}\n";
print "As: $matrix{$refungapped{$site}}{A}\n";
print "Cs: $matrix{$refungapped{$site}}{C}\n";
print "Ts: $matrix{$refungapped{$site}}{T}\n";
print "Gs: $matrix{$refungapped{$site}}{G}\n";