-
Notifications
You must be signed in to change notification settings - Fork 18
/
MergeAlign.py
executable file
·329 lines (252 loc) · 10.5 KB
/
MergeAlign.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#! /usr/bin/python
"""
MergeAlign algorithm for create consensus multiple sequence alignments
For a lsit of options, run:
>> python mergeAlign.py -h
"""
import os
import sys
import getopt
class Node:
""" A position in alignment space with edges to the previous nodes in all alignments """
def __init__(self):
self.previous_nodes = {}
self.path_score = 0
self.path_length = 0
self.path_average = 0
self.best_previous_node = None
def combineMultipleAlignments(alignment_names):
""" Combine all alignments in a folder into a single alignment.
Takes a list filepaths to FASTA alignments.
Returns a consensus alignment as a dictionary and a list of column scores. """
# Read alignments
alignments = [parseFASTA(alignment) for alignment in alignment_names]
# Open the first file and remove gaps to get original sequences in a dictionary
original_sequences = dict([(name, seq.replace('-', '')) for name, seq in alignments[0].iteritems()])
sequence_names = original_sequences.keys()
# Convert dictionaries of sequences to lists of indices
indices = [[convertSequenceToIndices(alignment[seq]) for seq in sequence_names] for alignment in alignments]
# Convert alignments into list of coordinates
coordinates = [zip(*alignment) for alignment in indices]
# Move through coordinates, generating paths of nodes through the alignments and finding best alignment
nodes = createNodes(coordinates)
final_coordinates, scores = scoreNodes(nodes, len(coordinates))
final_alignment = convertListOfCoordinatesToSequences(final_coordinates, original_sequences)
return final_alignment, scores
def parseFASTA(filename):
""" Read file in FASTA format.
Return a dictionary with key=sequence name, value=sequence. """
try:
f = open(filename, 'r')
except IOError:
print "Unable to open file %s" % filename
sys.exit(2)
sequences = {}
seq_id = None
for line in f.readlines():
if line.startswith('>'):
seq_id = line.rstrip()[1:]
sequences[seq_id] = ''
elif seq_id != None:
sequences[seq_id] += line.rstrip()
if len(sequences) == 0:
print "%s contains no sequences" % filename
sys.exit(2)
return sequences
def convertSequenceToIndices(sequence):
""" Convert a sequence to a list of amino acid indices.
Gap are converted to the index of the preceeding amino acid. """
indices = []
i = 0
for aa in sequence:
if aa != '-':
i += 1
indices.append(i)
return indices
def convertIndicesToSequence(indices, original_sequence):
""" Convert list of indices back to original sequence plus gaps """
sequence = ''
previous_i = 0
for i in indices:
if i != previous_i:
sequence += original_sequence[i-1]
else:
sequence += '-'
previous_i = i
return sequence
def convertListOfCoordinatesToSequences(final_coordinates, original_sequences):
""" Convert a list of coordinates in alignment space into an alignment using a passed dictionary of sequences. """
seq_names = original_sequences.keys()
alignment = {}
for i, sequence in enumerate(zip(*final_coordinates)):
seq = seq_names[i]
alignment[seq] = convertIndicesToSequence(sequence, original_sequences[seq])
return alignment
def createNodes(alignments):
""" Traverse each alignment creating nodes at each point found in alignment space.
For each node record which nodes preceded it and how often. """
dimensions = len(alignments[0][0])
first_node = tuple([0]*dimensions)
nodes = {first_node: Node()}
for alignment in alignments:
previous_node = first_node
for point in alignment:
if not point in nodes:
nodes[(point)] = Node()
node_dict = nodes[(point)].previous_nodes
# Add previous node to current nodes count of previous nodes
node_dict[previous_node] = node_dict.get(previous_node, 0) + 1
previous_node = point
return nodes
def scoreNodes(nodes, num_paths=100):
""" Travserse nodes, finding the best route to each one with a dynamic programming approach.
Return the best path to the final node. """
dimensions = len(nodes.keys()[0])
first_node = tuple([0]*dimensions)
for coord, current_node in sorted(nodes.iteritems())[1:]:
previous_nodes = current_node.previous_nodes.items()
best_node = max(previous_nodes, key=lambda n: n[1] + nodes[n[0]].path_average)
# Update this node with the best path so far
current_node.path_length = nodes[best_node[0]].path_length + 1
current_node.path_score = best_node[1] + nodes[best_node[0]].path_score
current_node.path_average = current_node.path_score* 1.0 / current_node.path_length
current_node.best_previous_node = best_node[0]
#print "final score: %.2f; length: %d" % (current_node.path_average/num_paths, current_node.path_length)
# Start at the final node and work backwards, moving from each node to its best previous node
path = []
scores = []
while coord != first_node:
path.append(coord)
scores.append(nodes[coord].previous_nodes[nodes[coord].best_previous_node]*1.0/num_paths)
coord = nodes[coord].best_previous_node
path.reverse()
scores.reverse()
return path, scores
def outputAlignmentAsFASTA(filename, final_alignment, threshold=None, scores=None):
""" Output alignment in form of a dictionary as a FASTA file. """
output = file(filename, 'w')
for name, sequence in final_alignment.items():
output.write(">%s\n" % name)
if threshold:
sequence = ''.join([aa for (aa, score) in zip (sequence, scores) if score>threshold])
output.write("%s\n" % sequence)
def outputAlignmentAsCLUSTAL(filename, final_alignment, scores):
""" Output dictionary of alignments as a FASTA file. """
output = file(filename, 'w')
output.write("CLUSTAL format alignment by MultiMatrixMafft\n\n")
trunc_names = [len(key)>19 and "%s..." % key[:16] or key for key in final_alignment.keys()]
start = 0
while start < len(scores):
stop = start+60
for i, sequence in enumerate(final_alignment.values()):
output.write(">%s\t" % trunc_names[i])
output.write("%s\n" % sequence[start:stop])
output.write(" " * 24)
for score in scores[start:stop]:
if score == 1:
output.write("*")
elif score > 0.9:
output.write(":")
elif score > 0.75:
output.write(".")
else:
output.write(" ")
output.write('\n\n')
start = stop
def outputScore(filename, scores):
""" Output score as a list of numbers. """
output = file(filename, 'w')
for score in scores:
output.write('%.3f\n' % score)
def outputAlignment(filename, final_alignment, scores):
output = file(filename, 'w')
for score in scores:
if score == 1:
output.write('*')
else:
output.write('%d' % (10*score))
output.write('\n')
for name, sequence in final_alignment.items():
output.write(">%s\n" % name)
output.write("%s\n" % sequence)
def outputAlignmentVertical(filename, final_alignment, original_sequences):
output = file(filename, 'w')
for i, sequence in enumerate(zip(*[residue for (residue, score) in final_alignment])):
print convertIndicesToSequence(sequence, original_sequences[i])
def usage():
print """
To combine multiple alignments:
% python multimatrixmafft.py -a path/to/folder
-a --alignments
ARG: path to a folder
Path to a folder containing all the FASTA alignments you want to combine.
All the alignments must be of the same set of sequences.
OPTIONAL
-f --fasta
ARG: filename
Final alignment is saved as a FASTA file with this name.
-s --score
ARG: filename
Final scores are saved as a list in a text file with this name.
-t --score
ARG: number >0 and <= 1
Only columns with a score > threshold are outputted.
Only works in conjection with -f.
Additional arguments:
-h, --help
ARG: none
Get command line arguments.
"""
if __name__ == '__main__':
# Options
matrices = None
sequence_file = None
alignment_folder = None
fasta_output = None
clustal_output = None
score_output = None
threshold = None
# Get commandline arguments
try:
opts, args = getopt.getopt(sys.argv[1:],
"ha:f:s:t:",
["help", "alignments=", "fasta=", "score=", "threshold="])
except getopt.GetoptError:
print "Error: command line argument not recognised"
usage()
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-a", "--alignments"):
alignment_folder = arg
elif opt in ("-f", "--fasta"):
fasta_output = arg
elif opt in ("-s", "--score"):
score_output = arg
elif opt in ("-t", "--threshold"):
try:
threshold = float(arg)
except ValueError:
print 'Error: threshold must be a number'
usage()
sys.exit(2)
if not 0 < threshold <= 1:
print 'Error: threshold must be >0 and <= 1'
usage()
sys.exit(2)
# Ensure there is at least one argument
if not alignment_folder:
print "Error: alignment folder not defined"
usage()
sys.exit(2)
# Get list of dictionaries of aligned sequences; indices correspond to matrices
alignment_names = [os.path.join(alignment_folder, alignment) for alignment in os.listdir(alignment_folder)]
# Combine alignments
final_alignment, scores = combineMultipleAlignments(alignment_names)
# Output alignment
if fasta_output:
outputAlignmentAsFASTA(fasta_output, final_alignment, threshold, scores)
if score_output:
outputScore(score_output, scores)