forked from DamienFr/GC_content_in_sliding_window
-
Notifications
You must be signed in to change notification settings - Fork 0
/
GC_content.pl
178 lines (140 loc) · 6 KB
/
GC_content.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;
use Getopt::Long;
=head1 Description
This script will first calculate, for each nucleotide position, the %GC of the surrounding nucleotid sequence according to the window size.
For the first nucleotid, half of the window size will be taken from the end of the sequence, because this script has been developped for cicular genoms.
Then, i will print a value of % each "step" nucleotides
=head1 Usage
perl GC_content.pl -fasta genome.fasta [-window 1000] [-step 100] [-log] [-help]
Arguments details :
-fasta Fasta file, (REQUIRED) Can contain multiple sequences: script will produce as many output files as input sequences. Each sequence can be multiline.
-window Window size, (optional, default 1000) Even number ONLY. Sets the number of nucleotides used to calculate the %GC value of each position.
-step Step size, (optional, default 100) The output will contain a sliding GC% value every "step" nucleotides. The numbers of values you get is therefore (length genome)/(step).
-log (optional) For debugging purposes only
-help (optional) Shows this help
=head1 Output files
[input_name].GC_content
column 1: "chr1" for convenience when the program is used to create an input for Circos (www.circos.ca/)
column 2: start position of interval
column 3: end position of interval
column 4: GC% of the interval
example:
chr1 1 100 0.499
chr1 101 200 0.5
chr1 201 300 0.5
chr1 301 400 0.5
chr1 401 500 0.5
chr1 501 600 0.5
chr1 601 700 0.499
chr1 701 800 0.5
[input_name].GC_deviation
column 1: "chr1" for convenience when the program is used to create an input for Circos (www.circos.ca/)
column 2: start position of interval
column 3: end position of interval
column 4: GC deviation of the interval
example:
chr1 1 100 0.162
chr1 101 200 0.16
chr1 201 300 0.16
chr1 301 400 0.16
chr1 401 500 0.16
chr1 501 600 0.16
chr1 601 700 0.162
=head1 AUTHOR
Damien Richard, 2019
=cut
my ($fasta,$help,$log,%h_pcent, $size_fasta, $sequence, $not_sequence,%h_gcdev,%hhh,$title_provisoire,$last_end,$last_end2);
GetOptions(
"fasta=s" => \$fasta,
"window=s" => \(my $window = 1000),
"step=s" => \(my $step = 100),
"log" => \$log,
"help" => \$help
);
die `pod2text $0` unless $fasta;
die `pod2text $0` if $help;
$fasta =~ s/\r?\n//g;
$window =~ s/\r?\n//g;
$step =~ s/\r?\n//g;
if ($window % 2 != 0){print "\n!!! Warning : window must be an even number... !!!\n\n"; die `pod2text $0`}
if($log){ open (LOG, ">", $fasta . ".log") or die "can't open $!";}
# reading input fasta file WITHOUT using perl module Bio::SeqIO;
#open (IN, "<", $fasta) or die "can't open $!";
#$not_sequence = 0;
#while (my $line = <IN>) {
# $line =~ s/\r?\n//g;
# $line =~ s/\s//g;
# if($line =~ /^>/){ $line =~ s/^>//; if(exists($hhh{$line})){ print "\nTwo sequence titles have the exact same name : $line, i'll die now ...\n"; die } ;
# $title_provisoire = $line}else{ $hhh{$title_provisoire} .= $line }
#}
#close IN;
# reading input fasta file WITH Bio::SeqIO;
my $fasta_in = Bio::SeqIO->new( -file => $fasta, -format => 'Fasta' );
while ( my $seq = $fasta_in->next_seq() ) {
$title_provisoire = $seq->id;
if ( exists( $hhh{$seq->id} ) ) { print "\nTwo sequence titles have the exact same name : $title_provisoire, i'll die now ...\n"; die ;}
$hhh{$seq->id} = $seq->seq();
}
#### loop on all sequences of the input fasta file ####
foreach my $kkk ( keys %hhh ){
my $kkk2 = $kkk;
print "\n\tWorking on sequence $kkk\n\n";
open (GCPC, ">", $fasta . "_" . $kkk2 . ".GC_content" ) or die "can't open $!";
open (GCDEV, ">", $fasta . "_" . $kkk2 . ".GC_deviation") or die "can't open $!";
my $sequence = $hhh{$kkk};
$size_fasta = length($sequence);
if ($sequence =~ m/[^ATGC]/){print "Your fasta sequence contains other caracters than [ATGC], i'll keep working normally, it's just so you know ...\n ";}
#### Calculating mean GC content ####
my $count1 = 0;
while ($sequence =~ /[GC]/g) { $count1++ }
my $pcent = $count1 / $size_fasta * 100 ;
print "\tSequence\tLength\tMean GC content\n\t$kkk\t$size_fasta\t$pcent % \n";
my $line_mod = (substr $sequence, -$window/2 ) . $sequence . (substr $sequence, 0, ($window / 2));
print LOG $line_mod . "\n" if $log;
my $start = 0;
until ($start > (length($sequence)))
{
my $count = 0;
my $countG = 0; my $countC = 0; #GC_deviation
my $tmp = substr $line_mod, $start, $window;
my $position = $start +1 ;
while ($tmp =~ /[GC]/g) {$count++}
while ($tmp =~ /[C]/g) {$countC++} #GC_deviation
while ($tmp =~ /[G]/g) {$countG++} #GC_deviation
my $gcdev = ($countG-$countC)/($countG+$countC) ; #GC_deviation
#print LOG $count . " " . $tmp . "\n";
my $pcent = $count / $window ;
$h_pcent{$position} = $pcent;
$h_gcdev{$position} = $gcdev; #GC_deviation
print LOG $position . " " . $tmp . " $pcent\n" if $log;
$start ++; # $start += $window; if i choose to calculate the mean on the windw instead of making it for each nucleotid
}
my $k = 0;
my $acc = 0;
foreach my $key2 ( sort {$a<=>$b} keys %h_pcent)
{
if ($key2 % $step == 0 || $key2 == ($size_fasta + 1)) {
my $start_window2 = $key2 - ($step - 1);
my $end_window2 = $key2; if($end_window2 >= $size_fasta){$start_window2 = $last_end + 1; $end_window2 = $size_fasta }
print GCPC "chr1 " . $start_window2 . " " . $end_window2 . " " . $h_pcent{$key2} . "\n";
$last_end = $end_window2;
}
}
foreach my $key3 ( sort {$a<=>$b} keys %h_gcdev)
{
if ($key3 % $step == 0 || $key3 == ($size_fasta + 1)) {
my $start_window2 = $key3 - ($step - 1);
my $end_window2 = $key3; if($end_window2 >= $size_fasta){$start_window2 = $last_end2 + 1; $end_window2 = $size_fasta }
print GCDEV "chr1 " . $start_window2 . " " . $end_window2 . " " . $h_gcdev{$key3} . "\n";
$last_end2 = $end_window2;
}
}
close GCDEV;
close GCPC;
print "\n\tOutput files are :\n\t" . $fasta . "_" . $kkk2 . ".GC_content\n\t" . $fasta . "_" . $kkk2 . ".GC_deviation\n";
}
if($log){close LOG;}
unlink $fasta . ".log" if !$log;