-
Notifications
You must be signed in to change notification settings - Fork 22
/
session2b-alt.qmd
148 lines (125 loc) · 3.9 KB
/
session2b-alt.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
---
title: "2b: Determining Interval Overlap with TypedTables"
author: "Douglas Bates and Claudia Solis-Lemus"
jupyter: julia-1.8
---
This is a re-formulation of the functions in "2b: Determining Interval Overlap" using [TypedTables.jl](https://github.com/JuliaData/TypedTables.jl) in place of [DataFrames.jl](https://github.com/JuliaData/DataFrames.jl) and [Tables.jl](https://github.com/JuliaData/Tables.jl)
## Load packages to be used
```{julia}
#| code-fold: show
using Arrow # Arrow storage and file format
using BenchmarkTools # tools for benchmarking code
using RangeTrees # a bespoke implementation of interval trees
using TypedTables # row- or column-oriented tabular data
using Base: intersect! # not exported from Base
datadir = joinpath(@__DIR__, "biofast-data-v1");
```
```{julia}
asrange(start, stop) = (start+one(start)):stop
```
```{julia}
function chromodict(tbl::Table)
r1 = first(tbl)
itype = promote_type(typeof(r1.start), typeof(r1.stop))
vtype = Vector{UnitRange{itype}}
dict = Dict{Symbol,vtype}()
for (; chromo, start, stop) in tbl
push!(get!(dict, Symbol(chromo), vtype()), asrange(start, stop))
end
return dict
end
function chromodict(fnm::AbstractString)
return chromodict(Table(Arrow.Table(joinpath(datadir, fnm))))
end
tarrngvecs = chromodict("ex-rna.arrow")
refrngvecs = chromodict("ex-anno.arrow")
```
```{julia}
@code_warntype chromodict(Table(Arrow.Table("biofast-data-v1/ex-anno.arrow")))
```
```{julia}
let
function lt(x::UnitRange{T}, y::UnitRange{T}) where {T}
fx, fy = first(x), first(y)
return fx == fy ? last(x) < last(y) : fx < fy
end
for v in values(refrngvecs)
sort!(v; lt)
end
end
refrngvecs # note changes in refrngvecs[:chr01]
```
```{julia}
refrngvec01 = refrngvecs[:chr01]
tarrngvec01 = tarrngvecs[:chr01]
target = last(tarrngvec01)
```
## RangeTrees
```{julia}
refrngtrees = Dict(k => RangeNode(v) for (k, v) in refrngvecs)
rangetree01 = refrngtrees[:chr01]
treesize(rangetree01), treeheight(rangetree01), treebreadth(rangetree01)
```
```{julia}
result = similar(tarrngvec01, 0)
savedresult = intersect!(result, target, rangetree01)
@benchmark intersect!(res, tar, ref) setup =
(res = result; tar = target; ref = rangetree01)
```
```{julia}
function coveragecount(
target::AbstractUnitRange,
isects::Vector{UnitRange{T}},
) where {T}
leftpost, rightpost = T(first(target)), T(last(target))
coverage = 0
for isect in isects
coverage += length(intersect(isect, leftpost:rightpost))
leftpost = max(leftpost, last(isect) + one(T))
end
return coverage
end
coveragecount(target, savedresult)
```
# Iterating over a collection of targets
```{julia}
function overlaps(targets::Vector{UnitRange{T}}, refs::RangeNode{T}) where {T}
nover = similar(targets, Int)
covcount = similar(targets, T)
result = empty!(similar(targets, ceil(Int, sqrt(treesize(refs)))))
@inbounds for (i, tar) in enumerate(targets)
nover[i] = length(intersect!(result, tar, refs))
covcount[i] = coveragecount(tar, result)
end
Table(targets = targets, nover = nover, covcount = covcount)
end
overlaps(tarrngvec01, rangetree01)
```
```{julia}
@benchmark overlaps(targets, refs) setup =
(targets = tarrngvec01; refs = rangetree01)
```
# The whole shootin' match
- The computations on different chromosome are independent of each other and can be assigned to different threads when Julia is started with multiple threads.
```{julia}
function overlaps(
tardict::Dict{Symbol,Vector{UnitRange{T}}},
refdict::Dict{Symbol,RangeNode{T,R}},
) where {T,R}
result = Dict{Symbol,Table}()
@sync for k in intersect(keys(tardict), keys(refdict))
Threads.@spawn result[k] = overlaps(tardict[k], refdict[k])
end
return result
end
bigresult = overlaps(tarrngvecs, refrngtrees);
bigresult[:chrX]
```
```{julia}
@benchmark overlaps(targets, refs) setup =
(targets = tarrngvecs; refs = refrngtrees)
```
# Version information
```{julia}
versioninfo()
```