-
Notifications
You must be signed in to change notification settings - Fork 0
/
pickle_bacmet.py
67 lines (46 loc) · 1.82 KB
/
pickle_bacmet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pickle
# Standard genetic code, amino acid (one-letter, uppercase) -> codons.
# Each amino acid appears exactly once: a dict literal keeps only the last
# value for a repeated key, so splitting Leu/Ser/Arg over two entries would
# silently drop one of the codon families.
# The FIRST codon of each entry is the one used for reverse translation.
CODON_TABLE = {
    'F': ('TTT', 'TTC'),
    'L': ('CTA', 'CTT', 'CTG', 'CTC', 'TTA', 'TTG'),
    'S': ('AGT', 'AGC', 'TCT', 'TCC', 'TCA', 'TCG'),
    'Y': ('TAT', 'TAC'),
    'C': ('TGC', 'TGT'),
    'W': ('TGG',),
    'P': ('CCA', 'CCT', 'CCG', 'CCC'),
    'H': ('CAC', 'CAT'),
    'Q': ('CAA', 'CAG'),
    'R': ('AGA', 'AGG', 'CGA', 'CGT', 'CGG', 'CGC'),
    'I': ('ATA', 'ATT', 'ATC'),
    'M': ('ATG',),
    'T': ('ACA', 'ACT', 'ACG', 'ACC'),
    'N': ('AAT', 'AAC'),
    'K': ('AAA', 'AAG'),
    'V': ('GTA', 'GTT', 'GTG', 'GTC'),
    'A': ('GCA', 'GCT', 'GCG', 'GCC'),
    'D': ('GAT', 'GAC'),
    'E': ('GAA', 'GAG'),
    'G': ('GGA', 'GGT', 'GGG', 'GGC'),
}


def reverse_translate_protein(protein_seq):
    """Return a DNA string encoding *protein_seq*.

    Every character that is a key of ``CODON_TABLE`` is replaced by the
    first codon listed for it. Any other character (lowercase letters,
    ``X``, ``*``, gaps, ...) is copied to the output unchanged.

    Args:
        protein_seq: Iterable of one-character strings, typically a ``str``
            of one-letter amino-acid codes.

    Returns:
        The concatenated codons and passed-through characters as a ``str``;
        an empty input yields ``''``.
    """
    # Explicit membership test instead of a bare ``except`` so that only
    # "unknown residue" is treated as pass-through and real errors surface.
    return ''.join(
        CODON_TABLE[residue][0] if residue in CODON_TABLE else residue
        for residue in protein_seq
    )
# Convert the BacMet protein FASTA into a pickled list of DNA strings:
# one reverse-translated sequence per FASTA record, in file order.
bacmet_filename = 'data/BacMet2_predicted_database.fasta'

# One list of DNA chunks per record; joined once at the end so a multi-line
# record is not re-copied for every line that is appended to it.
record_chunks = []
with open(bacmet_filename) as f:
    for raw_line in f:
        current_line = raw_line.rstrip()
        if not current_line:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # data; indexing their first character would raise IndexError.
            continue
        if current_line.startswith(">"):
            # Header line: start a new, initially empty, record.
            record_chunks.append([])
        else:
            if not record_chunks:
                raise ValueError(
                    'sequence data found before the first ">" header in '
                    + bacmet_filename
                )
            record_chunks[-1].append(reverse_translate_protein(current_line))

bacmet_sequences = [''.join(chunks) for chunks in record_chunks]

with open('bacmet_seqs.p', 'wb') as h:
    pickle.dump(bacmet_sequences, h)