forked from zeyaxue/ARG_metagenome
-
Notifications
You must be signed in to change notification settings - Fork 1
/
DNAtoAA_transcription_translation.py
executable file
·90 lines (64 loc) · 2.5 KB
/
DNAtoAA_transcription_translation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/python
# change path to the python folder I can access
#export PATH="/home/xzyao/.local/bin:$PATH"
if __name__ == '__main__':
# import libraries
try: import sys
except: sys.exit("Could not import module 'sys'")
try: import os
except: sys.exit("Could not import module 'os'")
# parse arguments
try: import argparse
except: sys.exit("Could not import module 'argparse'")
parser = argparse.ArgumentParser(description='Transcribe and translate DNA sequences to amino acid sequences.')
parser.add_argument('in_dna', type=str)
parser.add_argument('out_aa', type=str)
args = vars(parser.parse_args())
args['in_dna'] = args['in_dna'].split(',')
try: from Bio.Seq import Seq
except: sys.exit("Could not import module 'Bio.Seq'")
try:from Bio.Alphabet import IUPAC
except: sys.exit("Could not import module 'Bio.Alphabet'")
for infile in args['in_dna']:
if not os.path.isfile(infile):
sys.exit("Input file %s not found" % infile)
# create a list with all the lines in the input file
with open(infile) as f:
template_dna = f.readlines()
# remove whitespace characters like `\n` at the end of each line
template_dna = [x.strip() for x in template_dna]
# creast empty string variable for taking DNA and aa sequences
dna_seq=""
ind=-1
# write to output file
with open(args['out_aa'], 'w') as outfile:
# loop through the entire list
for line in template_dna:
ind=ind+1
# starts with ">", indicate the sequence header
if line.startswith(">") and ind==0:
# directly write name to output file
outfile.write(line+'\n')
elif line.startswith(">"):
# dna sequences with the previous header
dna_seq=Seq(dna_seq, IUPAC.ambiguous_dna)
aa=dna_seq.translate(table=11, to_stop=True)
outfile.write(str(aa)+'\n')
# clear contents of the dna_seq
dna_seq=str("")
# write the current header
outfile.write(line+'\n')
else:
dna_seq=dna_seq+line
# the last loop
dna_seq=Seq(dna_seq, IUPAC.ambiguous_dna)
aa=dna_seq.translate(table=11, to_stop=True)
outfile.write(str(aa)+'\n')
#### another method I tried with list.....
#dna.append(Seq(line, IUPAC.ambiguous_dna))
#ind=ind+1
# NCBI genetic code table 11 for bacterial DNA: https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
# Becasue we start with full length database sequence files, each sequence should be a complete coding sequence
#aa.append(dna[ind].translate(table=11, to_stop=True))
# write the aa sequence to file
#outfile.write(str(aa[ind])+'\n')