-
Notifications
You must be signed in to change notification settings - Fork 0
/
gbk_extractGeneInfo2.py
executable file
·40 lines (33 loc) · 1.33 KB
/
gbk_extractGeneInfo2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/python
import sys, os, Bio
"""gbk_extractGeneInfo2.py The program takes a genbank file and then pulls out the
contig, locus tag, start, stop, nucleotide sequence, and protein sequence for each gene"""
__author__ = "Sarah Stevens"
__email__ = "sstevens2@wisc.edu"
def usage():
    """Print a one-line command-line usage message to standard output."""
    # A parenthesised single-argument print behaves identically as the
    # Python 2 print statement and the Python 3 print function.
    print("Usage: gbk_extractGeneInfo2.py inputfile")
# Exactly one command-line argument (the GenBank input file) is required;
# anything else prints the usage line and exits with status 2.
if len(sys.argv) != 2:
    usage()
    sys.exit(2)

from Bio import SeqIO

inputfile = sys.argv[1]
# Output paths are built by cutting the input path at its first ".gb" and
# appending the new extension (e.g. genome.gbk -> genome.tsv / genome.faa).
prefix = inputfile.split(".gb")[0]

# The context managers guarantee both outputs are flushed and closed even if
# parsing fails part-way through the input.
with open(prefix + ".tsv", "w") as output, \
        open(prefix + ".faa", "w") as outputfaa:
    output.write("contig\tlocus_tag\tstart\tstop\tstrand\tprot_seq\tprot_len\n")
    # Handing SeqIO.parse the path lets Biopython open and close the input
    # itself, avoiding the "rU" mode that Python 3.11+ rejects.
    for record in SeqIO.parse(inputfile, "genbank"):
        # Contig label: the text after the last ": " in the record
        # description with its final character dropped (presumably a
        # trailing "." -- TODO confirm against real input files).
        contig = record.description.split(": ")[-1][:-1]
        for feature in record.features:
            if feature.type != "CDS":
                continue
            qualifiers = feature.qualifiers
            if "locus_tag" not in qualifiers or "translation" not in qualifiers:
                # Some CDS features carry no translation or locus tag
                # (pseudogenes are the usual case -- confirm for your data);
                # warn and skip them instead of aborting with a KeyError.
                sys.stderr.write(
                    "Warning: skipping CDS at %s on %s: missing locus_tag "
                    "or translation qualifier\n" % (feature.location, contig)
                )
                continue
            locus_tag = qualifiers["locus_tag"][0]
            protein = qualifiers["translation"][0]
            location = feature.location
            # int(location.start/end) and location.strand are the current
            # spellings; nofuzzy_start/nofuzzy_end and feature.strand are
            # deprecated aliases in newer Biopython releases.
            row = [
                contig,
                locus_tag,
                str(int(location.start)),
                str(int(location.end)),
                str(location.strand),
                protein,
                str(len(protein)),
            ]
            output.write("\t".join(row) + "\n")
            outputfaa.write(">" + locus_tag + "\n")
            outputfaa.write(protein + "\n")