diff --git a/csf/src/fp/cmap/mod.rs b/csf/src/fp/cmap/mod.rs index 8142247..7300237 100644 --- a/csf/src/fp/cmap/mod.rs +++ b/csf/src/fp/cmap/mod.rs @@ -106,8 +106,9 @@ impl CMap { let mut level_nr = 0u32; while input_size != 0 { let level_size_segments = conf.level_size_chooser.size_segments( - &value_coding, - &values[0..input_size], &value_rev_indices[0..input_size]); + || values[0..input_size].iter().zip(value_rev_indices[0..input_size].iter()).map(|(c, ri)| value_coding.rev_fragment_of(*c, *ri) as u64), + input_size, + value_coding.bits_per_fragment()); let level_size = level_size_segments * 64; stats.level(input_size, level_size); let mut collision_solver = conf.collision_solver.new(level_size_segments, value_coding.bits_per_fragment()); diff --git a/csf/src/fp/gocmap/mod.rs b/csf/src/fp/gocmap/mod.rs index ebf7e77..a4b2a85 100644 --- a/csf/src/fp/gocmap/mod.rs +++ b/csf/src/fp/gocmap/mod.rs @@ -135,8 +135,11 @@ impl GOCMap { @@ -11,6 +13,9 @@ pub trait KVSet { /// If `self` doesn't remember which keys are retained it uses `retained_hint` to check this. fn for_each_key_value(&self, f: F, retained_hint: P) where F: FnMut(&K, u8), P: FnMut(&K) -> bool; + /// Returns minimal number of bits that can store any value. + fn bits_per_value(&self) -> u8; + /// Calls `map` for each key-value pair in the set, and returns outputs of these calls. Uses single thread. /// /// If `self` doesn't remember which keys are retained it uses `retained_hint` to check this. @@ -63,6 +68,10 @@ impl KVSet for HashMap { for (k, v) in self { f(k, *v) } } + fn bits_per_value(&self) -> u8 { + bits_to_store_any_of_ref(self.values()) + } + fn retain_keys(&mut self, mut filter: F, _retained_earlier: P, _remove_count: R) where F: FnMut(&K) -> bool, P: FnMut(&K) -> bool, R: FnMut() -> usize { @@ -77,6 +86,10 @@ impl KVSet for BTreeMap { for (k, v) in self { f(k, *v) } } + fn bits_per_value(&self) -> u8 { + bits_to_store_any_of_ref(self.values()) + } + fn retain_keys(&mut self, mut filter: F, _retained_earlier: P, _remove_count: R) where F: FnMut(&K) -> bool, P: FnMut(&K) -> bool, R: FnMut() -> usize { @@ -114,6 +127,10 @@ impl<'k, K: Sync> KVSet for SlicesMutSource<'k, K> { } } + fn bits_per_value(&self) -> u8 { + bits_to_store_any_of_ref(self.values.iter()) + } + #[inline(always)] fn map_each_key_value(&self, mut map: M, _retained_hint: P) -> Vec where M: FnMut(&K, u8) -> R, P: FnMut(&K) -> bool { diff --git a/csf/src/fp/level_size_chooser.rs b/csf/src/fp/level_size_chooser.rs index 32a0373..6774467 100644 --- a/csf/src/fp/level_size_chooser.rs +++ b/csf/src/fp/level_size_chooser.rs @@ -3,25 +3,15 @@ use std::mem::MaybeUninit; use fsum::FSum; use std::fmt; use std::fmt::Formatter; -use crate::coding::Coding; -/// Chooses the size of level for the given level input. +/// Chooses the size of level for the given sequence of retained values. pub trait LevelSizeChooser { - /// Returns number of 64-bit segments to use for given level input. - fn size_segments(&self, _coding: &C, values: &[C::Codeword], _value_rev_indices: &[u8]) -> usize { - self.max_size_segments(values.len()) - } - - /// Returns maximal number of segment that can be returned by `size_segments` for level of size `max_level_size` or less. - fn max_size_segments(&self, max_level_size: usize) -> usize; -} - -pub trait SimpleLevelSizeChooser { - - /// Returns number of 64-bit segments to use for given level input. - fn size_segments(&self, values: &[u8], _bits_per_value: u8) -> usize { - self.max_size_segments(values.len()) + /// Returns number of 64-bit segments to use for given sequence of retained `values`. + fn size_segments(&self, _values: F, values_len: usize, _bits_per_value: u8) -> usize + where VIt: IntoIterator, F: FnMut() -> VIt + { + self.max_size_segments(values_len) } /// Returns maximal number of segment that can be returned by `size_segments` for level of size `max_level_size` or less. @@ -48,12 +38,6 @@ impl LevelSizeChooser for ProportionalLevelSize { } } -impl SimpleLevelSizeChooser for ProportionalLevelSize { - fn max_size_segments(&self, max_level_size: usize) -> usize { - ceiling_div(max_level_size*self.percent as usize, 64*100) - } -} - impl fmt::Display for ProportionalLevelSize { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "{}percent", self.percent) @@ -141,7 +125,7 @@ impl OptimalLevelSize { // poisson(licza fragmentów do zapisania, wielkość wejścia / wielkość tablicy, liczba wpisów) } -impl LevelSizeChooser for OptimalLevelSize { +/*impl LevelSizeChooser for OptimalLevelSize { fn size_segments(&self, coding: &C, values: &[C::Codeword], value_rev_indices: &[u8]) -> usize { let mut counts = [0u32; 256]; for (c, ri) in values.iter().zip(value_rev_indices.iter()) { @@ -157,15 +141,17 @@ impl LevelSizeChooser for OptimalLevelSize { fn max_size_segments(&self, max_level_size: usize) -> usize { ceiling_div(max_level_size, 64) } -} +}*/ -impl SimpleLevelSizeChooser for OptimalLevelSize { - fn size_segments(&self, values: &[u8], bits_per_value: u8) -> usize { - let mut counts = [0u32; 256]; - for v in values { counts[*v as usize] += 1; } +impl LevelSizeChooser for OptimalLevelSize { + fn size_segments(&self, mut values: F, values_len: usize, bits_per_value: u8) -> usize + where VIt: IntoIterator, F: FnMut() -> VIt + { + let mut counts = [0u32; 256]; // TODO support bits_per_value > 8 + for v in values() { counts[v as usize] += 1; } Self::size_segments_for_dist( &mut counts[0..(1usize< usize { +impl LevelSizeChooser for OptimalGroupedLevelSize { + fn size_segments(&self, mut values: F, values_len: usize, bits_per_value: u8) -> usize + where VIt: IntoIterator, F: FnMut() -> VIt + { let divider = self.divider as usize; let max_value = (1usize< 8 + for v in values() { counts[(v as usize + delta) / divider] += 1; } OptimalLevelSize::size_segments_for_dist( &mut counts[0 ..= (max_value + delta) / divider], - values.len(), + values_len, bits_per_value // this must be unchanged as it is used to calculate memory used by a value ) }).min().unwrap() @@ -243,21 +231,13 @@ impl ResizedLevel { } impl LevelSizeChooser for ResizedLevel { - fn size_segments(&self, coding: &C, values: &[C::Codeword], value_rev_indices: &[u8]) -> usize { - self.resized(self.level_size_chooser.size_segments(coding, values, value_rev_indices)) - } - - fn max_size_segments(&self, max_level_size: usize) -> usize { - self.resized(self.level_size_chooser.max_size_segments(max_level_size)) - } -} - -impl SimpleLevelSizeChooser for ResizedLevel { - #[inline(always)] fn size_segments(&self, values: &[u8], bits_per_value: u8) -> usize { - self.resized(self.level_size_chooser.size_segments(values, bits_per_value)) + #[inline] fn size_segments(&self, values: F, values_len: usize, bits_per_value: u8) -> usize + where VIt: IntoIterator, F: FnMut() -> VIt + { + self.resized(self.level_size_chooser.size_segments(values, values_len, bits_per_value)) } - #[inline(always)] fn max_size_segments(&self, max_level_size: usize) -> usize { + #[inline] fn max_size_segments(&self, max_level_size: usize) -> usize { self.resized(max_level_size) } } \ No newline at end of file diff --git a/csf/src/fp/map/mod.rs b/csf/src/fp/map/mod.rs index 61bfbee..f3e50dd 100644 --- a/csf/src/fp/map/mod.rs +++ b/csf/src/fp/map/mod.rs @@ -5,7 +5,8 @@ pub use conf::MapConf; use std::hash::Hash; use bitm::{BitAccess, Rank}; -pub use super::level_size_chooser::SimpleLevelSizeChooser; +use super::kvset::KVSet; +pub use super::level_size_chooser::LevelSizeChooser; use ph::{BuildDefaultSeededHasher, BuildSeededHasher, utils, stats, utils::{ArrayWithRank, read_bits}}; use std::collections::HashMap; use std::io; @@ -63,6 +64,89 @@ impl Map { self.get_stats(k, &mut ()) } + /// Constructs [`Map`] for given key-value pairs `kv`, using the build configuration `conf` and reporting statistics with `stats`. + /// + /// TODO Panics if the construction fails. + /// Then it is almost certain that the input contains either duplicate keys + /// or keys indistinguishable by any hash function from the family used. + /*fn with_conf_stats( + kv: impl KVSet, + mut conf: MapConf, + stats: &mut BS + ) -> Self + where K: Hash, + LSC: SimpleLevelSizeChooser, + CSB: CollisionSolverBuilder, + BS: stats::BuildStatsCollector + { + if conf.bits_per_value == 0 { + conf.bits_per_value = kv.bits_per_value(); + } + let mut level_sizes = Vec::::new(); + let mut arrays = Vec::>::new(); + let mut values = Vec::>::new(); + let mut input_size = kv.kv_len(); + let mut level_nr = 0u32; + while input_size != 0 { + let level_size_segments = conf.level_size_chooser.size_segments( + &values[0..input_size], conf.bits_per_value); + let level_size = level_size_segments * 64; + stats.level(input_size, level_size); + let mut collision_solver = conf.collision_solver.new(level_size_segments, conf.bits_per_value); + for i in 0..input_size { + let a_index = utils::map64_to_64(conf.hash.hash_one(&keys[i], level_nr), level_size as u64) as usize; + if collision_solver.is_under_collision(a_index) { continue } + collision_solver.process_fragment(a_index, values[i], conf.bits_per_value); + } + + let current_array = collision_solver.to_collision_array(); + let mut i = 0usize; + while i < input_size { + let a_index = utils::map64_to_64(conf.hash.hash_one(&keys[i], level_nr), level_size as u64) as usize; + if current_array.get_bit(a_index) { // no collision + // remove i-th element by replacing it with the last one + input_size -= 1; + keys.swap(i, input_size); + //values.swap_fragments(i, input_size, bits_per_value); + values.swap(i, input_size); + } else { // collision, has to be processed again, at the next level + i += 1; + } + } + arrays.push(current_array); + level_sizes.push(level_size_segments as u64); + level_nr += 1; + } + + let (array, out_fragments_num) = ArrayWithRank::build(arrays.concat().into_boxed_slice()); + let mut output_value_fragments = CSB::CollisionSolver::construct_value_array(out_fragments_num as usize, conf.bits_per_value); + for input_index in 0..keys.len() { + //let mut result_decoder = self.value_coding.decoder(); + let mut array_begin_index = 0usize; + let mut level = 0u32; + loop { + let level_size = (level_sizes[level as usize] as usize) << 6usize; + let i = array_begin_index + utils::map64_to_64(conf.hash.hash_one(&keys[input_index], level), level_size as u64) as usize; + if array.content.get_bit(i) { + CSB::CollisionSolver::set_value(&mut output_value_fragments, array.rank(i), values[input_index], conf.bits_per_value); + // stats.value_on_level(level); // TODO do we need this? we can get average levels from lookups + break; + } + array_begin_index += level_size; + level += 1; + } + } + stats.end(0); + Self { + array, + values: output_value_fragments, + bits_per_value: conf.bits_per_value, + level_sizes: level_sizes.into_boxed_slice(), + hash_builder: conf.hash + } + }*/ + + /// Build `Map` for given keys -> values map, where: /// - keys are given directly, /// - TODO values are given as bit vector with bit_per_value. @@ -73,7 +157,7 @@ impl Map { stats: &mut BS ) -> Self where K: Hash, - LSC: SimpleLevelSizeChooser, + LSC: LevelSizeChooser, CSB: CollisionSolverBuilder, BS: stats::BuildStatsCollector @@ -87,7 +171,7 @@ impl Map { let mut level_nr = 0u32; while input_size != 0 { let level_size_segments = conf.level_size_chooser.size_segments( - &values[0..input_size], conf.bits_per_value); + || values[0..input_size].iter().map(|v| *v as u64), input_size, conf.bits_per_value); let level_size = level_size_segments * 64; stats.level(input_size, level_size); let mut collision_solver = conf.collision_solver.new(level_size_segments, conf.bits_per_value); @@ -145,7 +229,7 @@ impl Map { } #[inline] - pub fn with_slices_conf( + pub fn with_slices_conf( keys: &mut [K], values: &mut [u8], /*&mut [u64],*/ conf: MapConf) -> Self { Self::with_slices_conf_stats(keys, values, conf, &mut ()) @@ -195,7 +279,7 @@ impl Map { impl Map { - pub fn with_map_conf( + pub fn with_map_conf( map: &HashMap, conf: MapConf, stats: &mut BS @@ -268,7 +352,7 @@ mod tests { test_4pairs(MapConf::default()); } - fn test_8pairs(conf: MapConf) { + fn test_8pairs(conf: MapConf) { let fpmap = Map::with_map_conf(&hashmap!( 'a' => 1, 'b' => 2, 'c' => 1, 'd' => 3, 'e' => 4, 'f' => 1, 'g' => 5, 'h' => 6), conf, &mut ()); diff --git a/csf/src/fp/mod.rs b/csf/src/fp/mod.rs index 200d9df..ba3beac 100644 --- a/csf/src/fp/mod.rs +++ b/csf/src/fp/mod.rs @@ -16,7 +16,7 @@ pub use gocmap::{GOCMap, GOCMapConf}; pub use ph::fmph::{GroupSize, SeedSize, TwoToPowerBits, TwoToPowerBitsStatic, Bits, Bits8, GOConf}; pub mod level_size_chooser; -pub use level_size_chooser::{LevelSizeChooser, SimpleLevelSizeChooser, ProportionalLevelSize, OptimalLevelSize, ResizedLevel}; +pub use level_size_chooser::{LevelSizeChooser, ProportionalLevelSize, OptimalLevelSize, ResizedLevel}; pub mod collision_solver; pub use collision_solver::{CollisionSolver, CollisionSolverBuilder, IsLossless, LoMemAcceptEquals}; diff --git a/csf_benchmark/src/function.rs b/csf_benchmark/src/function.rs index 998b9f5..65db026 100644 --- a/csf_benchmark/src/function.rs +++ b/csf_benchmark/src/function.rs @@ -45,7 +45,7 @@ impl PrintParams for ProportionalLevelSize { } impl CSFBuilder for fp::MapConf -where LSC: fp::LevelSizeChooser+fp::SimpleLevelSizeChooser, CSB: fp::CollisionSolverBuilder, S: BuildSeededHasher +where LSC: fp::LevelSizeChooser, CSB: fp::CollisionSolverBuilder, S: BuildSeededHasher { type CSF = fp::Map;