-
Notifications
You must be signed in to change notification settings - Fork 1
/
gff_utils.pm
executable file
·190 lines (169 loc) · 8.13 KB
/
gff_utils.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#-------------------------------------------------------------------------------
#---- MODULE NAME ----
#-------------------------------------------------------------------------------
package gff_utils;
use strict;
use vars qw(@ISA @EXPORT $VERSION);
use Exporter;
use IO::File;
use IO::Zlib; #qw(:gzip_external 1);
use Carp;
use Bio::SeqIO;
use Data::Dumper;
use Cwd;
use File::Basename;
# Custom modules
use Fasta_utils;
our @ISA = qw(Exporter);
our @EXPORT = qw(
parse_gff
get_gff_attribute
);
# Andrew J. Oler, PhD
# Created at
# Computational Biology Section
# Bioinformatics and Computational Biosciences Branch (BCBB)
# OCICB/OSMO/OD/NIAID/NIH
# Bethesda, MD 20892
#
# andrew.oler@gmail.com
#
#This package is free software; you can redistribute it and/or modify
#it under the terms of the GPL (either version 1, or at your option,
#any later version) or the Artistic License 2.0.
#-------------------------------------------------------------------------------
#----------------------------------- CHANGELOG ---------------------------------
#-------------------------------------------------------------------------------
# 2017-02-05
# Created module. Moved some subroutines from convert_reads_to_amino_acid.pl to
# this module since they are deprecated in that script but they might be useful
# for another script.
#-------------------------------------------------------------------------------
#----------------------------------- FUNCTIONS ---------------------------------
#-------------------------------------------------------------------------------
#-----------------------------------------------------------------------------
sub parse_gff {
# Takes a gff file and a genome multi-fasta file of chromosomes and outputs a hashref containing the CDS starts, stops, strand, nucleotide sequence and protein sequence for each gene. Also, nucleotide, amino acid and codons tables for lookup by position for each gene. (Is it possible to have multiple CDS per gene in a gff file? How would that affect this?)
# See primer-id-progs v0.1.1 for deprecated usage.
my ($gff, $ref_fasta) = @_;
my $genes; # hashref to store the coordinates for the CDS of genes on the chromosomes. first keys, chr; second keys, gene id; third keys start, stop, starts, stops, strand, nucleotide, protein, nuc->pos, aa->pos, codon->pos.
my $readfh = open_to_read($gff);
while(<$readfh>){
next if (m/^#/);
my @line = split(/\t/);
next unless (scalar(@line) > 8);
next unless ($line[2] eq "CDS");
my ($chr, $start, $stop, $strand) = ($line[0], $line[3], $line[4], $line[6]);
my $gene_id = get_gff_attribute($line[8], "ID");
my $codon_start = get_gff_attribute($line[8], "codon_start");
if ($debug){ print "id: $gene_id\tcodon_start: $codon_start\n"; }
if ($codon_start && ($codon_start > 1)){
unless ($genes->{$chr}->{$gene_id}->{starts}){ # This only applies to the first exon. For some reason, the codon_start attribute occurs on all CDS exon features of a CDS, all referring to the start codon of the first CDS exon.
# Then this is the first CDS exon.
$start += $codon_start - 1;
}
}
# next unless ($strand eq "+"); # Assuming + strand for now because my annotation is all on + strand.
$genes->{$chr}->{$gene_id}->{strand} = $strand;
if (exists($genes->{$chr}->{$gene_id}->{start})){
# One portion of the CDS is already present. For the purpose of the TSS and TTS, get the earliest start and latest stop
$genes->{$chr}->{$gene_id}->{start} = ($start < $genes->{$chr}->{$gene_id}->{start}) ? $start : $genes->{$chr}->{$gene_id}->{start}; # If the value present is larger, then reassign to the smaller one.
$genes->{$chr}->{$gene_id}->{stop} = ($stop > $genes->{$chr}->{$gene_id}->{stop}) ? $stop : $genes->{$chr}->{$gene_id}->{stop}; # If the value present is smaller, then reassign to the larger one.
}
else {
$genes->{$chr}->{$gene_id}->{start} = $start;
$genes->{$chr}->{$gene_id}->{stop} = $stop;
}
push @{ $genes->{$chr}->{$gene_id}->{starts} }, $start; # Some influenza proteins are spliced. In that case, multiple CDS lines from GFF file need to be concatenated...
push @{ $genes->{$chr}->{$gene_id}->{stops} }, $stop;
}
close ($readfh);
# return $genes;
# Now take CDS entries and get sequence for them from --fasta input file, then translate that sequence
my $in = Bio::SeqIO->new(-file => "$ref_fasta" ,
-format => 'Fasta');
while ( my $seq = $in->next_seq() ) {
my $chr = $seq->display_id(); # chr name #print "id: $chr\n";
next unless exists ($genes->{$chr}); # Skip this fasta record if there are no annotated genes on this chromosome.
foreach my $gene (keys %{$genes->{$chr}}){ # Look at each gene on this chromosome. Assuming + strand since my GFF file only has + strand genes. Actually, I can just reverse complement the final concatenated sequence if it is the - strand.
# Get the first CDS segment
my $seqstr_nuc = $seq->subseq( $genes->{$chr}->{$gene}->{starts}->[0], $genes->{$chr}->{$gene}->{stops}->[0] ); # print "$chr $gene: ".length($seqstr_nuc)."\n";
if (scalar(@{$genes->{$chr}->{$gene}->{starts}})>1){ # If there are additional CDS segments
# Then get the rest of the sequences. I can concatenate the rest to the first segment $seqstr_nuc because I've assumed above that they are all on the + strand.
for (my $i = 1; $i < @{$genes->{$chr}->{$gene}->{starts}}; $i++){
$seqstr_nuc .= $seq->subseq( $genes->{$chr}->{$gene}->{starts}->[$i], $genes->{$chr}->{$gene}->{stops}->[$i] ); # print "$chr $gene: ".length($seqstr_nuc)."\n";
}
}
if ($genes->{$chr}->{$gene}->{strand} eq '-'){
my $seqstr_nuc = Fasta_utils::rev_comp($seqstr_nuc); # Not tested.
}
my $prot = translate_seq($seqstr_nuc);
$genes->{$chr}->{$gene}->{nucleotide} = $seqstr_nuc;
$genes->{$chr}->{$gene}->{protein} = $prot;
# Now save the nucleotides, amino acids and codons by position.
my @nuc = split(/|/, $seqstr_nuc);
my @aa = split(/|/, $prot);
my @codon = unpack("(a3)*", $seqstr_nuc); # http://stackoverflow.com/questions/372370/how-can-i-split-a-string-into-chunks-of-two-characters-each-in-perl
# Nucleotides first
for (my $i = 0; $i < @nuc; $i++){
my $nt = uc($nuc[$i]);
my $pos = $i + 1;
if (exists($genes->{$chr}->{$gene}->{'nuc'}->{$pos})){
warn "This gene + nucleotide position already present: $gene, $pos, $nt\n";
}
else {
$genes->{$chr}->{$gene}->{'nuc'}->{$pos} = $nt;
}
}
# Now save amino acids and codons
for (my $i = 0; $i < @aa; $i++){
my $aa = uc($aa[$i]);
my $codon = uc($codon[$i]);
my $pos = $i + 1;
# unless ($aa =~ m/X/i){ # Commented this out because if we skip these amino acids, it will cause a frameshift.
# Amino acid
if (exists($genes->{$chr}->{$gene}->{'aa'}->{$pos})){
warn "This gene + amino acid position already present: $gene, $pos, $aa\n";
}
else {
$genes->{$chr}->{$gene}->{'aa'}->{$pos} = $aa;
}
# Codon
if (exists($genes->{$chr}->{$gene}->{'codon'}->{$pos})){
warn "This gene + codon position already present: $gene, $pos, $codon\n";
}
else {
$genes->{$chr}->{$gene}->{'codon'}->{$pos} = $codon;
}
# }
}
}
}
return $genes;
}
#--------------------------------------------------------------------------------
sub get_gff_attribute {
# gets the gene ID. "IDs must be unique within the scope of the GFF file" http://gmod.org/wiki/GFF
my $attributes = shift;
my $tag = shift;
my @attributes = split(/;/, $attributes);
my $value;
foreach my $pair (@attributes){
my @pair = split(/=/, $pair);
if ($pair[0] eq "$tag"){
$value = $pair[1];
}
else {
# print "Not ID: $pair[0]\n";
}
}
unless ($value){
die "No $tag found in the GFF attributes: \n$attributes\n";
}
return $value;
}
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
1;