Skip to content

Commit

Permalink
feat: support estimate count for element in counting bloom filter. See (
Browse files Browse the repository at this point in the history
  • Loading branch information
yankun1992 committed Feb 13, 2023
1 parent e5cc128 commit a13add4
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 3 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fastbloom_rs"
version = "0.5.0"
version = "0.5.1"
edition = "2021"
authors = ["Yan Kun <yan_kun_1992@foxmail.com>"]
description = "Some fast bloom filter implemented by Rust for Python and Rust! 10x faster than pybloom!"
Expand Down
2 changes: 1 addition & 1 deletion fastbloom-rs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fastbloom-rs"
version = "0.5.0"
version = "0.5.1"
edition = "2021"
authors = ["Yan Kun <yan_kun_1992@foxmail.com>"]
description = "Some fast bloom filter implemented by Rust for Python and Rust!"
Expand Down
50 changes: 49 additions & 1 deletion fastbloom-rs/src/bloom.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::clone;
use std::cmp::min;
use std::ptr::slice_from_raw_parts;

use fastmurmur3::murmur3_x64_128;
Expand Down Expand Up @@ -459,6 +460,31 @@ from_array!(from_u16_array, u16, 4);
from_array!(from_u32_array, u32, 8);
from_array!(from_u64_array, u64, 16);

impl CountingBloomFilter {
    /// Estimates how many times `element` has been added to this counting bloom filter.
    ///
    /// Two xxh3 hashes of `element` (seeds 0 and 32) are reduced modulo the configured
    /// size; probe `i` then inspects slot `(h1 + i * h2) % size`. The estimate is the
    /// smallest counter seen across the probes, and it is `0` as soon as any probed
    /// counter is zero.
    ///
    /// See: https://github.com/yankun1992/fastbloom/issues/3
    pub fn estimate_count(&self, element: &[u8]) -> usize {
        let slots = self.config.size;
        let base = xxh3_64_with_seed(element, 0) % slots;
        let step = xxh3_64_with_seed(element, 32) % slots;

        // The first probe is the `base` slot itself.
        let mut smallest = self.counting_vec.get(base as usize);

        for round in 1..self.config.hashes as u64 {
            // A zero counter already settles the answer, so stop probing.
            if smallest == 0 {
                break;
            }
            let slot = ((base + round * step) % slots) as usize;
            smallest = min(smallest, self.counting_vec.get(slot));
        }

        smallest
    }

    /// Returns the raw counter value stored at slot `index`.
    pub fn counter_at(&self, index: u64) -> usize {
        let slot = index as usize;
        self.counting_vec.get(slot)
    }
}

impl Membership for CountingBloomFilter {
fn add(&mut self, element: &[u8]) {
Expand Down Expand Up @@ -785,4 +811,26 @@ fn counting_bloom_hash_indices_test() {
bloom.remove(b"hello");
assert_eq!(bloom.contains(b"hello"), false);
assert_eq!(bloom.contains_hash_indices(&bloom.get_hash_indices(b"hello")), false);
}
}

/// Checks `estimate_count` and `counter_at` after inserting two distinct elements once each.
#[test]
fn counting_bloom_estimate_count() {
    let mut config = FilterBuilder::new(10_000, 0.01);
    let mut filter = config.build_counting_bloom_filter();

    filter.add(b"hello");
    filter.add(b"world");

    // "hello" was added once: its estimate and each of its counters are expected to be 1.
    assert_eq!(filter.estimate_count(b"hello"), 1);
    let hello_slots = filter.get_hash_indices(b"hello");
    for slot in hello_slots {
        assert_eq!(filter.counter_at(slot), 1)
    }

    // "world" was added once as well; its counters are allowed to read up to 2.
    assert_eq!(filter.estimate_count(b"world"), 1);
    let world_slots = filter.get_hash_indices(b"world");
    for slot in world_slots {
        assert!(filter.counter_at(slot) <= 2);
    }
}
12 changes: 12 additions & 0 deletions fastbloom_rs/fastbloom_rs.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,18 @@ class PyCountingBloomFilter(object):
# Stub: hash indices for a `str` element, returned as a sequence of ints.
def get_hash_indices_str(self, element: str) -> Sequence[int]:
...

# Stub: estimated count for a `bytes` element in the counting bloom filter.
def estimate_count(self, element: bytes) -> int:
...

# Stub: estimated count for an `int` element in the counting bloom filter.
def estimate_count_int(self, element: int) -> int:
...

# Stub: estimated count for a `str` element in the counting bloom filter.
def estimate_count_str(self, element: str) -> int:
...

# Stub: value of the underlying counter stored at slot `index`.
def counter_at(self, index: int) -> int:
...

# Stub: static constructor taking a bytes array, the number of hashes and the
# repeat-insert flag, and returning a PyCountingBloomFilter.
@staticmethod
def from_bytes(array: bytes, hashes: int, enable_repeat_insert: bool) -> PyCountingBloomFilter:
...
Expand Down
27 changes: 27 additions & 0 deletions fastbloom_rs/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,33 @@ def get_hash_indices(self, element: Union[str, int, bytes]) -> Sequence[int]:
else:
return self._py_counting_bloom.get_hash_indices_str(str(element))

def estimate_count(self, element: Union[str, int, bytes]) -> int:
    """
    Estimate how many times ``element`` has been added to this counting bloom filter.

    The call is dispatched on the runtime type of ``element``: ``int``, ``str`` and
    ``bytes`` values go to the matching backend method, and any other object is
    converted with ``str()`` first.
    See: https://github.com/yankun1992/fastbloom/issues/3

    :param element: the value to look up.
    :return: the estimated count reported by the underlying filter.
    """
    backend = self._py_counting_bloom
    if isinstance(element, int):
        return backend.estimate_count_int(element)
    if isinstance(element, str):
        return backend.estimate_count_str(element)
    if isinstance(element, bytes):
        return backend.estimate_count(element)
    # Anything else is looked up through its string form.
    return backend.estimate_count_str(str(element))

def counter_at(self, index: int) -> int:
    """
    Get the underlying counter at index.

    :param index: index of counter slot; slot 0 is valid, negative values are rejected.
    :return: the value of the counter stored in that slot.
    :raises AssertionError: if ``index`` is negative.
    """
    # Slot 0 is a legitimate slot (hash indices are taken modulo the filter size),
    # so only negative indices are refused here.
    assert index >= 0, "index must be non-negative"
    return self._py_counting_bloom.counter_at(index)

def config(self) -> FilterBuilder:
"""
Returns the configuration/builder of the Bloom filter.
Expand Down
21 changes: 21 additions & 0 deletions py_tests/test_counting_bloom_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,24 @@ def test_hash_indices():
assert not cbf2.contains_hash_indices(cbf2.get_hash_indices(31))
assert not cbf2.contains_hash_indices(cbf2.get_hash_indices('world'))
assert cbf2.contains_hash_indices(cbf2.get_hash_indices('Yan Kun'))


def test_estimate_count():
    """Check estimate_count and counter_at on a filter that allows repeated inserts."""
    config = FilterBuilder(100_000, 0.01)
    # enable repeat insert
    config.enable_repeat_insert(True)
    bloom = config.build_counting_bloom_filter()  # type: CountingBloomFilter

    # One insert of b'hello': estimate and each of its counters should be 1.
    bloom.add(b'hello')
    assert bloom.estimate_count(b'hello') == 1
    for slot in bloom.get_hash_indices(b'hello'):
        assert bloom.counter_at(slot) == 1

    # After adding b'world', its counters are allowed to read up to 2.
    bloom.add(b'world')
    for slot in bloom.get_hash_indices(b'world'):
        assert bloom.counter_at(slot) <= 2

    # A second insert of b'hello' should raise its estimate to 2.
    bloom.add(b'hello')
    assert bloom.estimate_count(b'hello') == 2
16 changes: 16 additions & 0 deletions src/pybloom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,22 @@ impl PyCountingBloomFilter {
Ok(self.counting_bloom_filter.get_hash_indices(bts.as_bytes()))
}

/// Estimates the count of an integer element, looked up by its 8 little-endian bytes.
pub fn estimate_count_int(&self, element: i64) -> PyResult<u32> {
    let bytes = element.to_le_bytes();
    let count = self.counting_bloom_filter.estimate_count(&bytes);
    Ok(count as u32)
}

/// Estimates the count of a string element, looked up by its UTF-8 bytes.
pub fn estimate_count_str(&self, element: &str) -> PyResult<u32> {
    let bytes = element.as_bytes();
    let count = self.counting_bloom_filter.estimate_count(bytes);
    Ok(count as u32)
}

/// Estimates the count of a Python `bytes` element, looked up by its raw bytes.
pub fn estimate_count(&self, element: &PyBytes) -> PyResult<u32> {
    let bytes = element.as_bytes();
    let count = self.counting_bloom_filter.estimate_count(bytes);
    Ok(count as u32)
}

pub fn counter_at(&self, index: i64) -> PyResult<u64> {
Ok(self.counting_bloom_filter.counter_at(index as u64) as u64)
}

#[staticmethod]
pub fn from_bytes(array: &[u8], hashes: u32, enable_repeat_insert: bool) -> PyResult<Self> {
Ok(PyCountingBloomFilter {
Expand Down

0 comments on commit a13add4

Please sign in to comment.