-
Notifications
You must be signed in to change notification settings - Fork 26
/
blast2gff.py
executable file
·184 lines (164 loc) · 7.47 KB
/
blast2gff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env python
# v1.0 2014-10-22
# v1.1 added option to remove weak blast hits, fixed integer bug 2014-10-28
# v1.2 reverted to previous version after name change 2015-05-13
# v1.3 parsing for IDs with comma separated items
#
# blast2gff.py convert blast output to gff format for genome annotation
# translated from blast2gff.pl and parseblast.pl
#
# and using information from:
# https://www.sanger.ac.uk/resources/software/gff/spec.html
# http://www.sequenceontology.org/gff3.shtml
'''blast2gff.py last modified 2022-07-08
blast2gff.py -b tblastn_output.tab > output.gff3
change the second and third fields in the gff output with -p and -t
blast2gff.py -b blastn_output.tab -p BLASTN -t EST_match > output.gff3
tabular blast output should be made from blast programs with -outfmt 6
tblastn -query refprots.fa -db target_genome.fa -outfmt 6 > tblastn_output.tab
evalue cutoff between 1 and 1e-3 is appropriate to filter bad hits
though this depends on the bitscore and so the relatedness of the species
to generate hints for AUGUSTUS, use -A, also change type as -t CDSpart
'''
#
import sys
import os
import argparse
import time
from collections import defaultdict
#
### BLAST OUTPUTS
# default blastn or tblastn output for -outfmt 6 is printed tabular of:
#
# qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
#
# referring to: query ID, subject ID, percent identity, length of alignment,
# mismatch count, gap openings, query start and end, subject start and end,
# evalue, and the bitscore
#
# for example tblastn output may look:
# prot12345 Contig248 86.89 122 0 1 213 318 16668 17033 1e-63 214
#
# where protein 12345 hit Contig248 with evalue of 1e-63
#
### GFF FORMAT
# GFF3 format is a tabular of:
#
# seqid source type start end score strand phase attributes
#
# in the context of blast results, this is converted as:
# seqid is the subject ID, so sseqid
# source is BLAST
# type could be mRNA, exon, CDS, but in this case HSP as the BLAST result
# start must always be less or equal to end, so if BLAST hits the reverse
# complement, these must be switched
# score is the bitscore
# strand is either + or - for forward and reverse hits
# phase is intron phase used only for "CDS" type, otherwise is "."
# attributes is a list of tag=value; pairs usually including query ID
#
#
#
# general notes:
#
# to calculate query coverage as on ncbi, appears to be query length / subject length
# to calculate identity % as on ncbi, subject length might have to be absolute value, and 1 must be added as the first base is position 1
# however on ncbi it is calculated as identities / subject length
def write_line(outlist, wayout):
outline = "{}\n".format( "\t".join(outlist) )
wayout.write(outline)
def main(argv, wayout):
if not len(argv):
argv.append("-h")
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-b','--blast', help="blast results file")
parser.add_argument('-p','--program', help="blast program for 2nd column in output [TBLASTN]", default="TBLASTN")
parser.add_argument('-t','--type', help="gff type or method [match_part]", default="match_part")
parser.add_argument('-e','--evalue-cutoff', type=float, help="evalue cutoff [1]", default=1.0)
parser.add_argument('-s','--score-cutoff', type=float, help="bitscore/length cutoff for filtering [0.1], use with -f", default=0.1)
parser.add_argument('-l','--length-cutoff', type=int, help="minimum match length cutoff for filtering [1]", default=1)
parser.add_argument('-A','--augustus', action="store_true", help="print source information for AUGUSTUS hints")
parser.add_argument('-F','--filter', action="store_true", help="filter low quality matches, set value with -s")
parser.add_argument('-S','--swissprot', action="store_true", help="query sequences have swissprot headers")
parser.add_argument('-L','--locus', action="store_true", help="IDs will be named by the locus, as scaffold_start_end")
parser.add_argument('-U','--parse-properties', action="store_true", help="parse additional attributes from the seq ID separated by ,")
parser.add_argument('-v','--verbose', action="store_true", help="extra output")
args = parser.parse_args(argv)
# counter for number of lines, and strand flips
linecounter, writecounter = 0,0
plusstrand, minusstrand = 0,0
# counter for number of hits that are filtered
badhits = 0
shorthits = 0
hitDictCounter = defaultdict(int)
sys.stderr.write("# Starting BLAST parsing on {} {}\n".format(args.blast, time.asctime() ) )
for line in open(args.blast, 'r'):
linecounter += 1
extra_attributes = []
line = line.strip()
lsplits = line.split("\t")
# 0 1 2 3 4 5 6 7 8 9 10 11
qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore = line.rstrip().split("\t")
# remove stray characters
if args.swissprot:
# blast outputs swissprot proteins as: sp|P0DI82|TPC2B_HUMAN
qseqid = qseqid.split("|")[2]
elif args.parse_properties:
extra_attributes = [ "{}={}".format( *attr.split(":") ) for attr in qseqid.split(",")[1:] ]
qseqid = qseqid.split(",")[0]
else:
qseqid = qseqid.replace("|","")
hitDictCounter[qseqid] += 1
# currently 'attributes' is only query id
# ID only appears to not work for visualization, as the gene should be the blast query
#attributes = "ID={}".format(qseqid)
if args.augustus:
attributes = "source=P;ID={0}.{1}-{2}".format(qseqid, qstart, qend)
elif args.locus:
attributes = "ID={0}_{1}_{2};Target={3} {4} {5}".format(sseqid, sstart, send, qseqid, qstart, qend)
elif args.parse_properties:
attributes = ";".join( ["ID={0}.{1}{2}".format(qseqid, args.type, hitDictCounter[qseqid]) , "Target={0} {1} {2}".format(qseqid, qstart, qend)] + extra_attributes )
else:
attributes = "ID={0}.{1}{2};Target={0} {3} {4}".format(qseqid, args.type, hitDictCounter[qseqid], qstart, qend)
# if verbose, display the current attributes format for debugging
if args.verbose and linecounter == 1:
sys.stderr.write( "{}\n".format(attributes) )
# convert strings of start and end to integers for calculations
isend = int(send)
isstart = int(sstart)
# as start must always be less or equal to end, reverse them for opposite strand hits
if isstart <= isend:
strand = "+"
outlist = [sseqid, args.program, args.type, sstart, send, bitscore, strand, ".", attributes]
plusstrand += 1
else:
strand = "-"
outlist = [sseqid, args.program, args.type, send, sstart, bitscore, strand, ".", attributes]
minusstrand += 1
# if filtering, check if bits/length is above threshold
if args.filter:
fl = abs(isend-isstart)
nbs = float(bitscore)/fl
if nbs < args.score_cutoff:
badhits += 1
continue
# check here for low evalues
if float(evalue) > args.evalue_cutoff:
badhits += 1
continue
# check for short matches
matchlength = int(length)
if matchlength < args.length_cutoff:
shorthits += 1
continue
writecounter += 1
write_line(outlist, wayout)
sys.stderr.write("# Parsed {} lines {}\n".format( linecounter, time.asctime() ) )
sys.stderr.write("# Found {} forward and {} reverse hits\n".format(plusstrand, minusstrand) )
if shorthits:
sys.stderr.write("# Removed {} short matches\n".format(shorthits) )
if badhits:
sys.stderr.write("# Removed {} weak matches\n".format(badhits) )
sys.stderr.write("# Wrote {} matches {}\n".format(writecounter, time.asctime() ) )
if __name__ == "__main__":
main(sys.argv[1:],sys.stdout)