-
Notifications
You must be signed in to change notification settings - Fork 4
/
ncrf_resolve_overlaps.py
executable file
·350 lines (277 loc) · 10.5 KB
/
ncrf_resolve_overlaps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
#!/usr/bin/env python
"""
Resolve overlapping alignments of different motifs.
"""
from sys import argv,stdin,stdout,stderr,exit
from gzip import open as gzip_open
from ncrf_parse import int_with_unit
def usage(s=None):
message = """
usage: ncrf_resolve_overlaps <alignment_summary..> [options]
<alignment_summary> (cumulative) file(s) containing aligment summaries
for which overlaps are to be resolved
--head=<number> limit the number of input aligment summaries
--out=<name_template> file to write overlap groups to; see discussion of
name template below; if this option is absent, all
output is written to the console
The name template either names a single file or a collection of files. See
below for some examples.
The input alignment summaries are usually the output from ncrf_summary. Any
input file may contain alignments for more than one motif.
A typical input file is shown below. However, we do not interpret any columns
other than motif, seq, start, and end. This allows, for example, the output
from ncrf_summary_with_consensus.
#line motif seq start end strand seqLen querybp mRatio m mm i d
1 TGTA JZL5129_2 552 582 + 15216 30 100.0% 30 0 0 0
9 GGTA JZL5129_2 579 2262 + 15216 1681 92.9% 1583 77 23 21
17 GGGA JZL5129_2 782 876 + 15216 92 92.6% 87 5 2 0
...
If the output name template includes the substring "{motif}", this substring is
replaced by a motif name and any un-overlapped alignments to that motif are
written to that file. If the template name doesn't include "{motif}", all
un-overlapped alignments and overlapping groups are written to one file. Note
that "{motif}" is the word "motif" surrounded by two curly brackets.
Overlapping groups are either written to the console (if no name template is
given), to the same file with un-overlapped alignments (if the name template
doesn't contain "{motif}"), or to a file separate from the un-overlapped
alignments (with "{motif}" replaced by "overlaps").
This is summarized in the table below. We assume for this that the input only
contains two motifs, GGAAT and CATATA.
name_template | output
-----------------+----------------------------------------------------------
(none) | un-overlapped and overlap groups written to the console
-----------------+----------------------------------------------------------
filename | un-overlapped and overlap groups written to filename
-----------------+----------------------------------------------------------
filename.{motif} | un-overlapped GGAAT written to filename.GGAAT
| un-overlapped CATATA written to filename.CATATA
| overlap groups written to filename.overlaps
-----------------+----------------------------------------------------------
Overlap groups are separated by a single blank line, as shown below. When
un-overlapped alignments and overlapped ones are in the same file, the
un-overlapped ones come first, with a blank line separating each alignment,
followed by the overlap groups.
#line motif seq start end strand seqLen querybp mRatio m mm i d
497 GGTA JZL5129_2 11178 11604 + 15216 428 93.1% 404 16 6 8
505 GGGA JZL5129_2 11423 11455 + 15216 31 93.8% 30 1 1 0
(blank line)
521 GGTA JZL5129_2 11733 13179 + 15216 1435 89.6% 1325 77 44 33
529 GGGA JZL5129_2 12038 12061 + 15216 23 95.7% 22 1 0 0
537 GGGA JZL5129_2 12291 12402 + 15216 111 92.0% 103 7 1 1
(blank line)
... (more groups)"""
if (s == None): exit (message)
else: exit ("%s\n%s" % (s,message))
def main():
global summaryHeaderLine
global debug
summaryHeaderLine = None
# parse the command line
inputFilenames = []
outTemplate = None
headLimit = None
debug = []
for arg in argv[1:]:
if ("=" in arg):
argVal = arg.split("=",1)[1]
if (arg.startswith("--out=")):
outTemplate = argVal
elif (arg.startswith("--head=")):
headLimit = int_with_unit(argVal)
elif (arg == "--debug"):
debug += ["debug"]
elif (arg.startswith("--debug=")):
debug += argVal.split(",")
elif (arg.startswith("--")):
usage("unrecognized option: %s" % arg)
else:
if (arg not in inputFilenames): inputFilenames += [arg]
if (inputFilenames == []):
usage("you have to give me at least one summary file")
writeSingletonsSeparately = (outTemplate != None) and ("{motif}" in outTemplate)
# collect the alignments
seqToSummaries = {}
seqOrder = []
summaryNum = 0
for filename in inputFilenames:
if (filename.endswith(".gz")) or (filename.endswith(".gzip")):
f = gzip_open(filename,"rt")
else:
f = file(filename,"rt")
for summary in read_summary(f,filename):
summaryNum += 1
if (headLimit != None) and (summaryNum > headLimit):
print >>stderr, "limit of %d summaries reached" % headLimit
break
if (summary.seq not in seqToSummaries):
seqOrder += [summary.seq]
seqToSummaries[summary.seq] = []
seqToSummaries[summary.seq] += [summary]
f.close()
# partition the alignments into overlapping groups
seqToGroups = {}
for seq in seqOrder:
seqToGroups[seq] = overlapping_groups(seqToSummaries[seq])
if ("groups" in debug):
for seq in seqOrder:
for group in seqToGroups[seq]:
print >>stderr, "==="
for summary in group:
print >>stderr,summary.line
# collect groups by motif subset
subsetToGroups = {}
singletons = set()
for seq in seqOrder:
for group in seqToGroups[seq]:
subset = set([summary.motif for summary in group])
subset = list(subset)
subset.sort()
subset = tuple(subset)
if (subset not in subsetToGroups):
subsetToGroups[subset] = [group]
else:
subsetToGroups[subset] += [group]
if (len(subset) == 1):
singletons.add(subset[0])
# if we're to report un-overlapped alignments separately, do so now (and
# remove them from the groups)
singletons = list(singletons)
singletons.sort()
if (writeSingletonsSeparately):
for motif in singletons:
subset = (motif,)
motifFilename = outTemplate.replace("{motif}",motif)
motifF = file(motifFilename,"wt")
print >>stderr, "writing to \"%s\"" % motifFilename
if (summaryHeaderLine != None):
print >>motifF,summaryHeaderLine
for group in subsetToGroups[subset]:
for summary in group:
print >>motifF,summary.line
del subsetToGroups[subset]
# report overlapping alignment groups (and un-overlapped groups if we
# didn't report them already)
if (outTemplate == None):
outF = stdout
if (list(subsetToGroups) == []):
print >>stderr, "no alignments to write to console"
elif ("{motif}" not in outTemplate):
outF = file(outTemplate,"wt")
if (list(subsetToGroups) == []):
print >>stderr, "no alignments to write to \"%s\"" % outTemplate
else:
print >>stderr,"writing to \"%s\"" % outTemplate
else:
outFilename = outTemplate.replace("{motif}","overlaps")
outF = file(outFilename,"wt")
if (list(subsetToGroups) == []):
print >>stderr, "no alignments to write to \"%s\"" % outFilename
else:
print >>stderr,"writing to \"%s\"" % outFilename
motifCountToSubsets = {}
for subset in subsetToGroups:
motifCount = len(subset)
if (motifCount not in motifCountToSubsets):
motifCountToSubsets[motifCount] = [subset]
else:
motifCountToSubsets[motifCount] += [subset]
motifCounts = list(motifCountToSubsets)
motifCounts.sort()
isFirstGroup = True
for motifCount in motifCounts:
subsets = motifCountToSubsets[motifCount]
subsets.sort()
for subset in subsets:
for group in subsetToGroups[subset]:
if (isFirstGroup):
if (summaryHeaderLine != None):
print >>outF,summaryHeaderLine
isFirstGroup = False
else:
print >>outF # (line to separate groups)
for summary in group:
print >>outF,summary.line
if (outF != stdout):
outF.close()
# read_summary--
# Read alignments from an NCRF alignment summary file, one per line
class Summary: pass
def read_summary(f,filename=None):
global summaryHeaderLine
if (filename == None): filename = "input"
lineNumber = 0
for line in f:
lineNumber += 1
line = line.strip()
if (line == ""): continue
if (line.startswith("#")):
if (summaryHeaderLine == None): summaryHeaderLine = line
continue
fields = line.split()
if (len(fields) < 5):
exit("%s: not enough columns at line %d (%d, expected at least %d)\n%s"
% (os_path.basename(argv[0]),lineNumber,len(fields),5,line))
s = Summary()
s.filename = filename
s.lineNumber = lineNumber
try:
s.line = line
# (fields[0] intentionally ignored)
s.motif = fields[1]
s.seq = fields[2]
s.start = int(fields[3])
s.end = int(fields[4])
#s.strand = fields[5]
#s.seqLen = int(fields[6])
#s.querybp = int(fields[7])
#s.mRatio = fields[8]
#s.m = int(fields[9])
#s.mm = int(fields[10])
#s.i = int(fields[11])
#s.d = int(fields[12])
except ValueError:
assert (False), "bad alignment summary line (%d in %s): %s" \
% (lineNumber,filename,line)
try:
if (s.end < s.start): raise ValueError
except ValueError:
assert (False), "bad alignment summary line (bad interval at %d in %s): %s" \
% (lineNumber,filename,line)
#try:
# if (s.strand not in ["+","-"]): raise ValueError
#except ValueError:
# assert (False), "bad alignment summary line (bad strand at %d in %s): %s" \
# % (lineNumber,filename,line)
#try:
# if (not s.mRatio.endswith("%")): raise ValueError
# s.mRatio = float(s.mRatio[:-1]) / 100
# if (not 0 <= s.mRatio <= 1): raise ValueError
#except ValueError:
# assert (False), "bad alignment summary line (bad mRatio at %d in %s): %s" \
# % (lineNumber,filename,line)
yield s
# overlapping_groups--
# Combine alignment summaries into overlapping groups
def overlapping_groups(summaries):
summaries = [(summary.start,summary.end,summary) for summary in summaries]
summaries.sort()
groups = []
currentGroup = None
start = end = None
for (s,e,summary) in summaries:
if (start == None):
currentGroup = [summary]
(start,end) = (s,e)
elif (s >= end):
groups += [currentGroup]
currentGroup = [summary]
(start,end) = (s,e)
elif (e <= end):
currentGroup += [summary]
else: # if (e > end):
currentGroup += [summary]
end = e
if (currentGroup != None):
groups += [currentGroup]
return groups
if __name__ == "__main__": main()