-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathglob.py
111 lines (97 loc) · 3.8 KB
/
glob.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import string
VALID_CHARS = set([c for c in string.ascii_letters] + [c for c in string.digits])
class DNA():
def __init__(self, name='', info='', content=None):
self.name = name
self.info = info
self.content = content
def __repr__(self):
return "name: {}\ninfo: {}\ncontent: {}".format(self.name, self.info, self.content)
def add_new_DNA(dna_list, line):
assert line[0] == '>'
first_space_idx = line.find(' ')
if first_space_idx != -1:
dna_name = line[1:first_space_idx]
dna_info = line[first_space_idx:].strip()
else:
dna_name = line[1:]
dna_info = ''
dna_list.append(DNA(name=dna_name, info=dna_info, content=[]))
def add_line_to_DNA(cur_DNA, line):
for x in line:
if x in VALID_CHARS:
cur_DNA.content.append(x)
elif x == ' ':
continue
else:
raise Exception()
def parse_FASTA(file):
"""
Basic state machine for parsing.
0 = Expecting '>' or empty line.
1 = Expecting valid string for current DNA or empty line.
2 = Got at least one string for current DNA. Ready for new DNA
or continue current DNA.
"""
state = 0
dna_list = []
for line in file:
line = line.strip()
if state == 0:
if line[0] == '>':
add_new_DNA(dna_list, line)
state = 1
elif line == '':
continue
else:
raise Exception()
elif state == 1:
add_line_to_DNA(dna_list[-1], line)
state = 2
elif state == 2:
if line[0] == '>':
add_new_DNA(dna_list, line)
state = 1
else:
add_line_to_DNA(dna_list[-1], line)
else:
raise Exception()
file.seek(0)
return dna_list
with open("rosalind_glob.txt") as file:
dna_list = parse_FASTA(file)
s, t = [x.content for x in dna_list]
score_mat = [ # substitution penalty
[4, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, -2],
[0, 9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2],
[-2, -3, 6, 2, -3, -1, -1, -3, -1, -4, -3, 1, -1, 0, -2, 0, -1, -3, -4, -3],
[-1, -4, 2, 5, -3, -2, 0, -3, 1, -3, -2, 0, -1, 2, 0, 0, -1, -2, -3, -2],
[-2, -2, -3, -3, 6, -3, -1, 0, -3, 0, 0, -3, -4, -3, -3, -2, -2, -1, 1, 3],
[0, -3, -1, -2, -3, 6, -2, -4, -2, -4, -3, 0, -2, -2, -2, 0, -2, -3, -2, -3],
[-2, -3, -1, 0, -1, -2, 8, -3, -1, -3, -2, 1, -2, 0, 0, -1, -2, -3, -2, 2],
[-1, -1, -3, -3, 0, -4, -3, 4, -3, 2, 1, -3, -3, -3, -3, -2, -1, 3, -3, -1],
[-1, -3, -1, 1, -3, -2, -1, -3, 5, -2, -1, 0, -1, 1, 2, 0, -1, -2, -3, -2],
[-1, -1, -4, -3, 0, -4, -3, 2, -2, 4, 2, -3, -3, -2, -2, -2, -1, 1, -2, -1],
[-1, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5, -2, -2, 0, -1, -1, -1, 1, -1, -1],
[-2, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -2],
[-1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7, -1, -2, -1, -1, -2, -4, -3],
[-1, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5, 1, 0, -1, -2, -2, -1],
[-1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5, -1, -1, -3, -3, -2],
[1, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4, 1, -2, -3, -2],
[0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5, 0, -2, -2],
[0, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4, -3, -1],
[-3, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, 2],
[-2, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, 7]
]
score_mat_map = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9,
'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19}
gamma = -5 # gap penalty
dists = [j * gamma for j in range(len(s) + 1)]
for i in range(len(t)):
old_dist = dists[0]
dists[0] = (i + 1) * gamma
for j in range(len(s)):
tmp = max(old_dist + score_mat[score_mat_map[s[j]]][score_mat_map[t[i]]], dists[j] + gamma, dists[j + 1] + gamma)
old_dist = dists[j + 1]
dists[j + 1] = tmp
print(dists[-1])