Skip to content

Commit

Permalink
ORC-1610: Reduce the number of hash computation in CuckooSetBytes
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Add boundary conditions on "length" with the min/max length stored in the hashes.

### Why are the changes needed?
https://issues.apache.org/jira/browse/HIVE-24205

> This would significantly reduce the number of hash computation that needs to happen.

```
main insert:00:00:00.689
main lookup:00:00:01.124
PR insert:00:00:00.628
PR lookup:00:00:01.055
```

```java
  Test
  public void testLen() {
    int maxSize = 200000;
    Random gen = new Random();
    String[] strings = new String[maxSize];
    for (int i = 0; i < maxSize; i++) {
      strings[i] = RandomStringUtils.random(Math.abs(gen.nextInt(1000)));
    }
    byte[][] values = getByteArrays(strings);

    StopWatch mainSW = new StopWatch();

    // load set
    mainSW.start();
    CuckooSetBytes main = new CuckooSetBytes(strings.length);
    main.fastLookup = false;
    for (byte[] v : values) {
      main.insert(v);
    }
    mainSW.split();
    System.out.println("main insert:" + mainSW);
    // test that the values we added are there
    for (byte[] v : values) {
      assertTrue(main.lookup(v, 0, v.length));
    }
    mainSW.stop();
    System.out.println("main lookup:" + mainSW);

    StopWatch prSW = new StopWatch();
    prSW.start();
    CuckooSetBytes pr = new CuckooSetBytes(strings.length);
    pr.fastLookup = true;
    for (byte[] v : values) {
      pr.insert(v);
    }
    prSW.split();
    System.out.println("PR insert:" + prSW);
    for (byte[] v : values) {
      assertTrue(pr.lookup(v, 0, v.length));
    }
    prSW.stop();
    System.out.println("PR lookup:" + prSW);
  }
```
### How was this patch tested?
GA

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #1785 from cxzl25/ORC-1610.

Authored-by: sychen <sychen@ctrip.com>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
(cherry picked from commit 4eff23a)
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
  • Loading branch information
cxzl25 authored and dongjoon-hyun committed Feb 7, 2024
1 parent 6058aea commit 8ca152a
Showing 1 changed file with 7 additions and 0 deletions.
7 changes: 7 additions & 0 deletions java/core/src/java/org/apache/orc/util/CuckooSetBytes.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ public class CuckooSetBytes {
private int rehashCount = 0;
private static final long INT_MASK = 0x00000000ffffffffL;
private static final long BYTE_MASK = 0x00000000000000ffL;
private int maxLen;
private int minLen = Integer.MAX_VALUE;
// some prime numbers spaced about at powers of 2 in magnitude
static final int[] primes = {7, 13, 17, 23, 31, 53, 67, 89, 127, 269, 571, 1019, 2089,
4507, 8263, 16361, 32327, 65437, 131111, 258887, 525961, 999983, 2158909, 4074073,
Expand Down Expand Up @@ -84,6 +86,9 @@ public CuckooSetBytes(int expectedSize) {
* and ending at start+len is present in the set.
*/
public boolean lookup(byte[] b, int start, int len) {
if (len < minLen || len > maxLen) {
return false;
}

return entryEqual(t1, h1(b, start, len), b, start, len) ||
entryEqual(t2, h2(b, start, len), b, start, len);
Expand All @@ -98,6 +103,8 @@ public void insert(byte[] x) {
if (lookup(x, 0, x.length)) {
return;
}
minLen = Math.min(minLen, x.length);
maxLen = Math.max(maxLen, x.length);

// Try to insert up to n times. Rehash if that fails.
for(int i = 0; i != n; i++) {
Expand Down

0 comments on commit 8ca152a

Please sign in to comment.