-
Notifications
You must be signed in to change notification settings - Fork 1.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
perf(query): Update CompressedBin IntersectionAlgo #9000
Changes from 7 commits
20a7a77
4e458f2
a0b1e23
0945878
b8b96d6
7bf9605
d350c12
36b2722
170f37c
f1e65ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,7 +60,7 @@ func IntersectCompressedWith(pack *pb.UidPack, afterUID uint64, v, o *pb.List) { | |
|
||
// Select appropriate function based on heuristics. | ||
ratio := float64(m) / float64(n) | ||
if ratio < 500 { | ||
if ratio < 10 { | ||
IntersectCompressedWithLinJump(&dec, v.Uids, &dst) | ||
} else { | ||
IntersectCompressedWithBin(&dec, v.Uids, &dst) | ||
|
@@ -94,7 +94,7 @@ func IntersectCompressedWithLinJump(dec *codec.Decoder, v []uint64, o *[]uint64) | |
// https://link.springer.com/chapter/10.1007/978-3-642-12476-1_3 | ||
// Call seek on dec before calling this function | ||
func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) { | ||
ld := dec.ApproxLen() | ||
ld := codec.ExactLen(dec.Pack) | ||
lq := len(q) | ||
|
||
if lq == 0 { | ||
|
@@ -105,46 +105,50 @@ func IntersectCompressedWithBin(dec *codec.Decoder, q []uint64, o *[]uint64) { | |
} | ||
|
||
// Pick the shorter list and do binary search | ||
if ld < lq { | ||
if ld <= lq { | ||
for { | ||
blockUids := dec.Uids() | ||
if len(blockUids) == 0 { | ||
break | ||
} | ||
IntersectWithBin(blockUids, q, o) | ||
lastUid := blockUids[len(blockUids)-1] | ||
qidx := sort.Search(len(q), func(idx int) bool { | ||
return q[idx] >= lastUid | ||
}) | ||
if qidx >= len(q) { | ||
if ld*10 < len(q) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why 10? If it's based on a heuristic, then you can just use the const. |
||
q = q[IntersectWithBin(blockUids, q, o):] | ||
} else { | ||
// For small enough difference between two arrays, we should just | ||
// do lin intersect | ||
_, off := IntersectWithLin(blockUids, q, o) | ||
q = q[off:] | ||
} | ||
if len(q) == 0 { | ||
return | ||
} | ||
q = q[qidx:] | ||
dec.Next() | ||
} | ||
return | ||
} | ||
|
||
var uids []uint64 | ||
for _, u := range q { | ||
uids := dec.Uids() | ||
qidx := 0 | ||
for { | ||
if qidx >= len(q) { | ||
return | ||
} | ||
u := q[qidx] | ||
if len(uids) == 0 || u > uids[len(uids)-1] { | ||
uids = dec.Seek(u, codec.SeekStart) | ||
if lq*10 < ld { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See the comment above about the hardcoded 10. |
||
uids = dec.LinearSeek(u) | ||
} else { | ||
uids = dec.SeekToBlock(u, codec.SeekCurrent) | ||
harshil-goel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
if len(uids) == 0 { | ||
return | ||
} | ||
} | ||
uidIdx := sort.Search(len(uids), func(idx int) bool { | ||
return uids[idx] >= u | ||
}) | ||
if uidIdx >= len(uids) { | ||
// We know that u < max(uids). If we didn't find it here, it's not here. | ||
continue | ||
} | ||
if uids[uidIdx] == u { | ||
*o = append(*o, u) | ||
uidIdx++ | ||
_, off := IntersectWithJump(uids, q[qidx:], o) | ||
if off == 0 { | ||
off = 1 // if v[k] isn't in u, move forward | ||
} | ||
uids = uids[uidIdx:] | ||
qidx += off | ||
} | ||
} | ||
|
||
|
@@ -233,7 +237,8 @@ func IntersectWithJump(u, v []uint64, o *[]uint64) (int, int) { | |
// IntersectWithBin is based on the paper | ||
// "Fast Intersection Algorithms for Sorted Sequences" | ||
// https://link.springer.com/chapter/10.1007/978-3-642-12476-1_3 | ||
func IntersectWithBin(d, q []uint64, o *[]uint64) { | ||
// Returns the index to move the second array (q) to. 0 means not found. | ||
func IntersectWithBin(d, q []uint64, o *[]uint64) int { | ||
harshil-goel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ld := len(d) | ||
lq := len(q) | ||
|
||
|
@@ -242,7 +247,7 @@ func IntersectWithBin(d, q []uint64, o *[]uint64) { | |
d, q = q, d | ||
} | ||
if ld == 0 || lq == 0 || d[ld-1] < q[0] || q[lq-1] < d[0] { | ||
return | ||
return 0 | ||
} | ||
|
||
val := d[0] | ||
|
@@ -256,6 +261,7 @@ func IntersectWithBin(d, q []uint64, o *[]uint64) { | |
}) | ||
|
||
binIntersect(d, q[minq:maxq], o) | ||
return maxq | ||
} | ||
|
||
// binIntersect is the recursive function used. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -223,6 +223,64 @@ func (d *Decoder) ApproxLen() int { | |
|
||
type searchFunc func(int) bool | ||
|
||
// SeekToBlock will find the nearest block, and unpack it. Unlike Seek, it doesn't | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We can improve this explanation. |
||
// apply search in the resulting uid list and then move the pointer forward. When we are going | ||
// to intersect the list later, this function is useful. | ||
func (d *Decoder) SeekToBlock(uid uint64, whence seekPos) []uint64 { | ||
if d.Pack == nil { | ||
return []uint64{} | ||
} | ||
prevBlockIdx := d.blockIdx | ||
d.blockIdx = 0 | ||
if uid == 0 { | ||
return d.UnpackBlock() | ||
} | ||
|
||
// If for some reason we are searching an older uid, we need to search the entire pack | ||
if prevBlockIdx > 0 && uid < d.Pack.Blocks[prevBlockIdx].Base { | ||
prevBlockIdx = 0 | ||
} | ||
|
||
pack := d.Pack | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove this assignment. |
||
blocksFunc := func() searchFunc { | ||
var f searchFunc | ||
switch whence { | ||
case SeekStart: | ||
f = func(i int) bool { return pack.Blocks[i+prevBlockIdx].Base >= uid } | ||
case SeekCurrent: | ||
f = func(i int) bool { return pack.Blocks[i+prevBlockIdx].Base > uid } | ||
} | ||
return f | ||
} | ||
|
||
idx := sort.Search(len(pack.Blocks[prevBlockIdx:]), blocksFunc()) + prevBlockIdx | ||
// The first block.Base >= uid. | ||
if idx == 0 { | ||
return d.UnpackBlock() | ||
} | ||
// The uid is the first entry in the block. | ||
if idx < len(pack.Blocks) && pack.Blocks[idx].Base == uid { | ||
d.blockIdx = idx | ||
return d.UnpackBlock() | ||
} | ||
|
||
// Either idx == len(pack.Blocks), which means uid wasn't found in any block's base, or | ||
// we found the first block index whose base is greater than uid. In both cases, go to the | ||
// previous block and search there. | ||
d.blockIdx = idx - 1 // Move to the previous block. If blockIdx<0, unpack will deal with it. | ||
if d.blockIdx != prevBlockIdx { | ||
d.UnpackBlock() // And get all their uids. | ||
} | ||
|
||
if uid <= d.uids[len(d.uids)-1] { | ||
return d.uids | ||
} | ||
|
||
// Could not find any uid in the block, which is >= uid. The next block might still have valid | ||
// entries > uid. | ||
return d.Next() | ||
} | ||
|
||
// Seek will search for uid in a packed block using the specified whence position. | ||
// The value of whence must be one of the predefined values SeekStart or SeekCurrent. | ||
// SeekStart searches uid and includes it as part of the results. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Set the hardcoded values to a const.