-
Notifications
You must be signed in to change notification settings - Fork 21
/
LazyNCList.pm
314 lines (251 loc) · 10.2 KB
/
LazyNCList.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
package LazyNCList;
use strict;
use warnings;
use Carp;
use List::Util qw(max);
use NCList;
=head2 new
Title : new
Usage : LazyNCList->new($attrs, $lazyClass, $makeLazy,
$measure, $output, $sizeThresh
Function: create an LazyNCList
Returns : an LazyNCList object
Args : $attrs is a reference to an ArrayRepr instance
$lazyClass is the class number to be used for 'lazy'
NCLists, which are references to sub-lists,
$makeLazy is a reference to a sub taking the arguments
(start, end, ID), which returns a "lazy feature" with the
given attributes
$loadChunk is a subroutine that takes a chunk ID number and returns the contents of that chunk (
$measure is a reference to a sub that takes a feature to be
output, and returns the number of bytes that feature will
take up in the output
$output is a reference to a sub that, given a chunk ID and some data,
will output that data under that chunk ID
$sizeThresh is the target chunk size
=cut
sub new {
my ($class, $attrs, $lazyClass, $makeLazy, $loadChunk,
$measure, $output, $sizeThresh) = @_;
my $self = { attrs => $attrs,
start => $attrs->makeFastGetter("Start"),
end => $attrs->makeFastGetter("End"),
setSublist => $attrs->makeSetter("Sublist"),
lazyClass => $lazyClass,
makeLazy => $makeLazy,
loadChunk => $loadChunk,
measure => $measure,
output => $output,
sizeThresh => $sizeThresh,
count => 0,
minStart => undef,
maxEnd => undef,
chunkNum => 1,
chunkSizes => [],
partialStack => [] };
bless $self, $class;
$self->addNewLevel();
return $self;
}
sub importExisting {
my ($class, $attrs, $lazyClass, $count, $minStart,
$maxEnd, $loadChunk, $topLevelList) = @_;
my $self = { attrs => $attrs,
lazyClass => $lazyClass,
start => $attrs->makeFastGetter("Start"),
end => $attrs->makeFastGetter("End"),
count => $count,
minStart => $minStart,
maxEnd => $maxEnd,
loadChunk => $loadChunk,
topLevelList => $topLevelList };
bless $self, $class;
$self->addNewLevel();
return $self;
}
=head2 addSorted
Title : addSorted
Usage : $ncl->addSorted($feat)
Function: Adds a single feature to the set of features in this LazyNCList;
features passed to this method are accumulated into "chunks";
once a chunk grows to sizeThresh, the chunk is output.
The features given to addSorted must be sorted by the NCList sort.
Returns : nothing meaningful
Args : $feat is the feature to be added;
=cut
sub addSorted {
my ($self, $feat) = @_;
$self->{count} += 1;
my $lastAdded = $self->{lastAdded};
my $start = $self->{start}->( $feat );
my $end = $self->{end}->( $feat );
if (defined($lastAdded)) {
my $lastStart = $self->{start}->($lastAdded);
my $lastEnd = $self->{end}->($lastAdded);
# check that the input is sorted
$lastStart <= $start
or die "input not sorted: got start $lastStart before $start";
die "input not sorted: got $lastStart..$lastEnd before $start..$end"
if $lastStart == $start && $lastEnd < $end;
} else {
# LazyNCList requires sorted input, so the start of the first feat
# is the minStart
$self->{minStart} = $start;
}
$self->{lastAdded} = $feat;
my $chunkSizes = $self->{chunkSizes};
my $partialStack = $self->{partialStack};
for (my $level = 0; $level <= $#$partialStack; $level++) {
# due to NCList nesting, among other things, it's hard to be exactly
# precise about the size of the JSON serialization, but this will get
# us pretty close.
my $featSize = $self->{measure}->($feat);
my $proposedChunkSize = $chunkSizes->[$level] + $featSize;
#print STDERR "chunksize at $level is now " . $chunkSizes->[$level] . "; (next chunk is " . $self->{chunkNum} . ")\n";
# If this partial chunk is full,
if ( $proposedChunkSize > $self->{sizeThresh} && @{$partialStack->[$level]} ){
# then we're finished with the current "partial" chunk (i.e.,
# it's now a "complete" chunk rather than a partial one), so
# create a new NCList to hold all the features in this chunk.
my $lazyFeat = $self->finishChunk( $partialStack->[$level] );
# start a new partial chunk with the current feature
$partialStack->[$level] = [$feat];
$chunkSizes->[$level] = $featSize;
# and propagate $lazyFeat up to the next level
$feat = $lazyFeat;
# if we're already at the highest level,
if ($level == $#{$self->{partialStack}}) {
# then we need to make a new level to have somewhere to put
# the new lazy feat
$self->addNewLevel();
}
} else {
# add the current feature the partial chunk at this level
push @{$partialStack->[$level]}, $feat;
$chunkSizes->[$level] = $proposedChunkSize;
last;
}
}
}
sub addNewLevel {
my ($self) = @_;
push @{$self->{partialStack}}, [];
push @{$self->{chunkSizes}}, 0;
}
sub finishChunk {
my ($self, $featList) = @_;
my $newNcl = NCList->new($self->{start},
$self->{end},
$self->{setSublist},
$featList);
my $chunkId = $self->{chunkNum};
$self->{chunkNum} += 1;
$self->{output}->($newNcl->nestedList, $chunkId);
$self->{maxEnd} = $newNcl->maxEnd unless defined($self->{maxEnd});
$self->{maxEnd} = max($self->{maxEnd}, $newNcl->maxEnd);
# return the lazy ("fake") feature representing this chunk
return $self->{makeLazy}->($newNcl->minStart, $newNcl->maxEnd, $chunkId);
}
=head2 finish
Title : finish
Usage : $ncl->finish()
Function: Once all features have been added (through addSorted),
call "finish" to flush all of the partial chunks.
After calling finish, you can access the "topLevelList" property.
Returns : nothing
=cut
sub finish {
my ($self) = @_;
my $level;
for ($level = 0; $level < $#{$self->{partialStack}}; $level++) {
my $lazyFeat = $self->finishChunk($self->{partialStack}->[$level]);
# pass $lazyFeat up to the next higher level.
# (the loop ends one level before the highest level, so there
# will always be at least one higher level)
push @{$self->{partialStack}->[$level + 1]}, $lazyFeat;
}
# make sure there's a top-level NCL
$level = $#{$self->{partialStack}};
my $newNcl = NCList->new($self->{start},
$self->{end},
$self->{setSublist},
$self->{partialStack}->[$level]);
$self->{maxEnd} = max( grep defined, $self->{maxEnd}, $newNcl->maxEnd );
#print STDERR "top level NCL has " . scalar(@{$self->{partialStack}->[$level]}) . " features\n";
$self->{topLevelList} = $newNcl->nestedList;
}
sub binarySearch {
my ($self, $arr, $item, $getter) = @_;
my $low = -1;
my $high = $#{$arr} + 1;
my $mid;
while ($high - $low > 1) {
$mid = int(($low + $high) / 2);
if ($getter->($arr->[$mid]) > $item) {
$high = $mid;
} else {
$low = $mid;
}
}
# if we're iterating rightward, return the high index;
# if leftward, the low index
if ($getter == $self->{end}) { return $high } else { return $low };
};
sub iterHelper {
my ($self, $arr, $from, $to, $fun, $inc,
$searchGet, $testGet, $path) = @_;
my $len = $#{$arr} + 1;
my $i = $self->binarySearch($arr, $from, $searchGet);
my $getChunk = $self->{attrs}->makeGetter("Chunk");
my $getSublist = $self->{attrs}->makeGetter("Sublist");
while (($i < $len)
&& ($i >= 0)
&& (($inc * $testGet->($arr->[$i])) < ($inc * $to)) ) {
if ($arr->[$i][0] == $self->{lazyClass}) {
my $chunkNum = $getChunk->($arr->[$i]);
my $chunk = $self->{loadChunk}->($chunkNum);
$self->iterHelper($chunk, $from, $to, $fun, $inc,
$searchGet, $testGet, [$chunkNum]);
} else {
$fun->($arr->[$i], [@$path, $i]);
}
my $sublist = $getSublist->($arr->[$i]);
if (defined($sublist)) {
$self->iterHelper($sublist, $from, $to, $fun, $inc,
$searchGet, $testGet, [@$path, $i]);
}
$i += $inc;
}
}
=head2 overlapCallback( $from, $to, \&func )
Calls the given function once for each of the intervals that overlap
the given interval if C<<$from <= $to>>, iterates left-to-right, otherwise
iterates right-to-left.
=cut
sub overlapCallback {
my ($self, $from, $to, $fun) = @_;
croak "LazyNCList not loaded" unless defined($self->{topLevelList});
return unless $self->count;
# inc: iterate leftward or rightward
my $inc = ($from > $to) ? -1 : 1;
# searchGet: search on start or end
my $searchGet = ($from > $to) ? $self->{start} : $self->{end};
# testGet: test on start or end
my $testGet = ($from > $to) ? $self->{end} : $self->{start};
# treats the root chunk as number 0
$self->iterHelper($self->{topLevelList}, $from, $to, $fun,
$inc, $searchGet, $testGet, [0]);
}
sub count { return shift->{count}; }
sub maxEnd { return shift->{maxEnd}; }
sub minStart { return shift->{minStart}; }
sub topLevelList { return shift->{topLevelList}; }
1;
=head1 AUTHOR
Mitchell Skinner E<lt>jbrowse@arctur.usE<gt>
Copyright (c) 2007-2011 The Evolutionary Software Foundation
This package and its accompanying libraries are free software; you can
redistribute it and/or modify it under the terms of the LGPL (either
version 2.1, or at your option, any later version) or the Artistic
License 2.0. Refer to LICENSE for the full license text.
=cut