From 37028ca9fdabdcb564786500214a2df561b1bc54 Mon Sep 17 00:00:00 2001 From: Victor Ung <161058892+vpung@users.noreply.github.com> Date: Thu, 29 Aug 2024 08:17:12 -0700 Subject: [PATCH] add fixed length str2int / asin str2int memory-mappable hashmap (#295) add fixed length str2int / asin str2int memory-mappable hashmap --- .github/style_type_check_cfg/.flake8 | 2 +- pecos/core/base.py | 18 +- pecos/core/libpecos.cpp | 24 +- pecos/core/utils/mmap_hashmap.hpp | 412 ++++++++++++++++++++- pecos/utils/mmap_hashmap_util.py | 12 +- test/pecos/utils/test_mmap_hashmap_util.py | 122 ++++++ 6 files changed, 584 insertions(+), 6 deletions(-) diff --git a/.github/style_type_check_cfg/.flake8 b/.github/style_type_check_cfg/.flake8 index dbe66680..5326990d 100644 --- a/.github/style_type_check_cfg/.flake8 +++ b/.github/style_type_check_cfg/.flake8 @@ -1,3 +1,3 @@ [flake8] -ignore = E203,E501,W605,F541 +extend-ignore = E203,E501,W605,F541 max_line_length = 100 diff --git a/pecos/core/base.py b/pecos/core/base.py index 2a348bc8..8cbe6660 100644 --- a/pecos/core/base.py +++ b/pecos/core/base.py @@ -2070,12 +2070,20 @@ def link_mmap_hashmap_methods(self): Specify C-lib's Memory-mappable Hashmap methods arguments and return types. """ fn_prefix = "mmap_hashmap" - map_type_list = ["str2int", "int2int"] + map_type_list = ["str2int", "fixed_len_str2int", "fixed_len_10_str2int", "int2int"] key_args_dict = { "str2int": [ c_char_p, # pointer of key string c_uint32, # length of key string ], + "fixed_len_str2int": [ + c_char_p, # pointer of key string + c_uint32, # length of key string + ], + "fixed_len_10_str2int": [ + c_char_p, # pointer of key string + c_uint32, # length of key string + ], "int2int": [ c_uint64, # key int64 ], @@ -2085,6 +2093,14 @@ def link_mmap_hashmap_methods(self): c_void_p, # List of pointer of key string POINTER(c_uint32), # List of length of key string ], + "fixed_len_str2int": [ + c_void_p, # List of pointer of key string + POINTER(c_uint32), # List of length of key string + ], + "fixed_len_10_str2int": [ + c_void_p, # List of pointer of key string + POINTER(c_uint32), # List of length of key string + ], "int2int": [ POINTER(c_uint64), # List of key int64 ], diff --git a/pecos/core/libpecos.cpp b/pecos/core/libpecos.cpp index 7f10d6ee..7e3b2636 100644 --- a/pecos/core/libpecos.cpp +++ b/pecos/core/libpecos.cpp @@ -661,7 +661,9 @@ extern "C" { // ==== C Interface of Memory-mappable Hashmap ==== - typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_str2int; + typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_str2int; + typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_fixed_len_str2int; + typedef pecos::mmap_hashmap::Str2IntMap mmap_hashmap_fixed_len_10_str2int; typedef pecos::mmap_hashmap::Int2IntMap mmap_hashmap_int2int; // New @@ -669,6 +671,8 @@ extern "C" { void* mmap_hashmap_new_ ## SUFFIX () { \ return static_cast(new mmap_hashmap_ ## SUFFIX()); } MMAP_MAP_NEW(str2int) + MMAP_MAP_NEW(fixed_len_str2int) + MMAP_MAP_NEW(fixed_len_10_str2int) MMAP_MAP_NEW(int2int) // Destruct @@ -676,6 +680,8 @@ extern "C" { void mmap_hashmap_destruct_ ## SUFFIX (void* map_ptr) { \ delete static_cast(map_ptr); } MMAP_MAP_DESTRUCT(str2int) + MMAP_MAP_DESTRUCT(fixed_len_str2int) + MMAP_MAP_DESTRUCT(fixed_len_10_str2int) MMAP_MAP_DESTRUCT(int2int) // Save @@ -683,6 +689,8 @@ extern "C" { void mmap_hashmap_save_ ## SUFFIX (void* map_ptr, const char* map_dir) { \ static_cast(map_ptr)->save(map_dir); } MMAP_MAP_SAVE(str2int) + MMAP_MAP_SAVE(fixed_len_str2int) + MMAP_MAP_SAVE(fixed_len_10_str2int) MMAP_MAP_SAVE(int2int) // Load @@ -692,6 +700,8 @@ extern "C" { map_ptr->load(map_dir, lazy_load); \ return static_cast(map_ptr); } MMAP_MAP_LOAD(str2int) + MMAP_MAP_LOAD(fixed_len_str2int) + MMAP_MAP_LOAD(fixed_len_10_str2int) MMAP_MAP_LOAD(int2int) // Size @@ -699,6 +709,8 @@ extern "C" { size_t mmap_hashmap_size_ ## SUFFIX (void* map_ptr) { \ return static_cast(map_ptr)->size(); } MMAP_MAP_SIZE(str2int) + MMAP_MAP_SIZE(fixed_len_str2int) + MMAP_MAP_SIZE(fixed_len_10_str2int) MMAP_MAP_SIZE(int2int) // Insert @@ -707,6 +719,8 @@ extern "C" { void mmap_hashmap_insert_ ## SUFFIX (void* map_ptr, KEY, uint64_t val) { \ static_cast(map_ptr)->insert(FUNC_CALL_KEY, val); } MMAP_MAP_INSERT(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_INSERT(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_INSERT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_INSERT(int2int, uint64_t key, key) // Get @@ -714,18 +728,24 @@ extern "C" { uint64_t mmap_hashmap_get_ ## SUFFIX (void* map_ptr, KEY) { \ return static_cast(map_ptr)->get(FUNC_CALL_KEY); } MMAP_MAP_GET(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_GET(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_GET(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_GET(int2int, uint64_t key, key) #define MMAP_MAP_GET_W_DEFAULT(SUFFIX, KEY, FUNC_CALL_KEY) \ uint64_t mmap_hashmap_get_w_default_ ## SUFFIX (void* map_ptr, KEY, uint64_t def_val) { \ return static_cast(map_ptr)->get_w_default(FUNC_CALL_KEY, def_val); } MMAP_MAP_GET_W_DEFAULT(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_GET_W_DEFAULT(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_GET_W_DEFAULT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_GET_W_DEFAULT(int2int, uint64_t key, key) #define MMAP_MAP_BATCH_GET_W_DEFAULT(SUFFIX, KEY, FUNC_CALL_KEY) \ void mmap_hashmap_batch_get_w_default_ ## SUFFIX (void* map_ptr, const uint32_t n_key, KEY, uint64_t def_val, uint64_t* vals, const int threads) { \ static_cast(map_ptr)->batch_get_w_default(n_key, FUNC_CALL_KEY, def_val, vals, threads); } MMAP_MAP_BATCH_GET_W_DEFAULT(str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens)) + MMAP_MAP_BATCH_GET_W_DEFAULT(fixed_len_str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens)) + MMAP_MAP_BATCH_GET_W_DEFAULT(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* const* keys, const uint32_t* keys_lens), KEY_SINGLE_ARG(keys, keys_lens)) MMAP_MAP_BATCH_GET_W_DEFAULT(int2int, const uint64_t* key, key) // Contains @@ -733,6 +753,8 @@ extern "C" { bool mmap_hashmap_contains_ ## SUFFIX (void* map_ptr, KEY) { \ return static_cast(map_ptr)->contains(FUNC_CALL_KEY); } MMAP_MAP_CONTAINS(str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_CONTAINS(fixed_len_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) + MMAP_MAP_CONTAINS(fixed_len_10_str2int, KEY_SINGLE_ARG(const char* key, uint32_t key_len), KEY_SINGLE_ARG(key, key_len)) MMAP_MAP_CONTAINS(int2int, uint64_t key, key) diff --git a/pecos/core/utils/mmap_hashmap.hpp b/pecos/core/utils/mmap_hashmap.hpp index 5d41e3f7..da34db9b 100644 --- a/pecos/core/utils/mmap_hashmap.hpp +++ b/pecos/core/utils/mmap_hashmap.hpp @@ -14,6 +14,7 @@ #ifndef __MMAP_ANKERL_HASHMAP_H__ #define __MMAP_ANKERL_HASHMAP_H__ +#include #include #include "third_party/ankerl/unordered_dense.h" #include "mmap_util.hpp" @@ -228,6 +229,413 @@ class AnkerlStr2IntMmapableVector { }; +// Memory-mappable vector of std::pair for Ankerl +// This vector takes/gets std::string_view as the key and the key should be of fixed length +class AnkerlFixedLenStr2IntMmapableVector { + template + class iter_t; + + public: + using key_type = std::string_view; + using value_type = std::pair; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using allocator_type = std::allocator; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = allocator_type::pointer; + using const_pointer = allocator_type::const_pointer; + // Custom iterator + using iterator = iter_t; + using const_iterator = iter_t; + + AnkerlFixedLenStr2IntMmapableVector() = default; + AnkerlFixedLenStr2IntMmapableVector(allocator_type alloc) + : store_(alloc) {} + + value_type* data() { return data_; } + const value_type* data() const { return data_; } + + value_type& operator[](uint64_t idx) { return data_[idx]; } + const value_type& operator[](uint64_t idx) const { return data_[idx]; } + + /* Functions to match std::vector interface */ + auto get_allocator() { return store_.get_allocator(); } + + constexpr auto back() -> reference { return data_[size_ - 1]; } + constexpr auto begin() -> iterator { return {data_}; } + constexpr auto cbegin() -> const_iterator { return {data_}; } + constexpr auto end() -> iterator { return {data_ + size_}; } + constexpr auto end() const -> const_iterator { return {data_ + size_}; } + constexpr auto cend() const -> const_iterator{ return {data_ + size_}; } + + // ----- Write funcs start ----- + void shrink_to_fit() { store_.shrink_to_fit(); } + void reserve(size_t new_capacity) { store_.reserve(new_capacity); } + + /* Emplace string-like key and int value as std::pair*/ + template + auto emplace_back(std::piecewise_construct_t, std::tuple key, std::tuple args) { + // Extract key + key_type k = std::get<0>(key); + + // Emplace back std::pair + auto eb_val = store_.emplace_back( + std::piecewise_construct, + std::forward_as_tuple(size_), + std::forward< std::tuple >(args)); + + + size_type key_length = k.size(); + + // Length of new key should be the same as previous keys + if (key_length == 0) { + throw std::runtime_error("Key length should be greater than 0."); + } else if (fixed_str_len_ != 0 && fixed_str_len_ != key_length) { + throw std::runtime_error("Key length differs from previous keys."); + } else { + fixed_str_len_ = key_length; + } + + // Append key string + str_store_.insert(str_store_.end(), k.data(), k.data() + key_length); + + // Update pointers + size_ = store_.size(); + data_ = store_.data(); + str_size_ = str_store_.size(); + str_data_ = str_store_.data(); + + return eb_val; + } + + void pop_back() { + throw std::runtime_error("Not implemented for deletion"); + } + // ----- Write funcs end ----- + + size_type size() const { return size_; } + + bool empty() const { return static_cast (size_ == 0); } + + /* Get key for given member */ + key_type get_key(value_type const& vt) const { + auto index = vt.first; + return key_type(str_data_ + (index * fixed_str_len_), fixed_str_len_); + } + + /* Mmap save/load with MmapStore */ + void save_to_mmap_store(pecos::mmap_util::MmapStore& mmap_s) const { + mmap_s.fput_one(size_); + mmap_s.fput_one(str_size_); + mmap_s.fput_one(fixed_str_len_); + mmap_s.fput_multiple(data_, size_); + mmap_s.fput_multiple(str_data_, str_size_); + } + + void load_from_mmap_store(pecos::mmap_util::MmapStore& mmap_s) { + if (is_self_allocated_()) { // raises error for non-empty self-allocated vector + throw std::runtime_error("Cannot load for non-empty vector case."); + } + size_ = mmap_s.fget_one(); + str_size_ = mmap_s.fget_one(); + fixed_str_len_ = mmap_s.fget_one(); + data_ = mmap_s.fget_multiple(size_); + str_data_ = mmap_s.fget_multiple(str_size_); + } + + + private: + // Number of elements of the data + size_type size_ = 0; + size_type str_size_ = 0; + + // Pointer to data + value_type* data_ = nullptr; + char* str_data_ = nullptr; + + size_type fixed_str_len_ = 0; + + // Actual data storage for in-memory case + std::vector store_; + std::vector str_store_; + + /* Whether data storage is non-empty self-allocated vector. + * True indicates non-empty vector case; False indicates either empty or mmap view. */ + bool is_self_allocated_() const { + return static_cast (store_.size() > 0); + } + + /** + * Iterator class doubles as const_iterator and iterator + */ + template + class iter_t { + using ptr_t = typename std::conditional_t; + ptr_t iter_data_{}; + + template + friend class iter_t; + + public: + using iterator_category = std::forward_iterator_tag; + using difference_type = AnkerlFixedLenStr2IntMmapableVector::difference_type; + using value_type = AnkerlFixedLenStr2IntMmapableVector::value_type; + using reference = typename std::conditional_t; + using pointer = typename std::conditional_t; + + iter_t() noexcept = default; + + template ::type> + constexpr iter_t(iter_t const& other) noexcept + : iter_data_(other.iter_data_) {} + + constexpr iter_t(ptr_t data) noexcept + : iter_data_(data) {} + + template ::type> + constexpr auto operator=(iter_t const& other) noexcept -> iter_t& { + iter_data_ = other.iter_data_; + return *this; + } + + constexpr auto operator++() noexcept -> iter_t& { + ++iter_data_; + return *this; + } + + constexpr auto operator+(difference_type diff) noexcept -> iter_t { + return {iter_data_ + diff}; + } + + template + constexpr auto operator-(iter_t const& other) noexcept -> difference_type { + return static_cast(iter_data_ - other.iter_data_); + } + + constexpr auto operator*() const noexcept -> reference { + return *iter_data_; + } + + constexpr auto operator->() const noexcept -> pointer { + return iter_data_; + } + + template + constexpr auto operator==(iter_t const& o) const noexcept -> bool { + return iter_data_ == o.iter_data_; + } + + template + constexpr auto operator!=(iter_t const& o) const noexcept -> bool { + return !(*this == o); + } + }; +}; + + +// Memory-mappable vector of std::pair for Ankerl +// This vector takes/gets std::string_view as the key, but emplace back as the special mmap format FixedLen10Str +// The key must be of length 10 +class AnkerlFixedLen10Str2IntMmapableVector { + template + class iter_t; + + // Fixed Length of 10 for keys + static constexpr std::size_t fixed_str_len = 10; + + struct FixedLen10Str { + char str[fixed_str_len]; + + FixedLen10Str(const char* input_str = nullptr) { + if (input_str) { + std::memcpy(str, input_str, fixed_str_len); + } else { + throw std::runtime_error("Illegal initialization of FixLen10Str with nullptr."); + } + } + }; + + public: + using key_type = std::string_view; + using value_type = std::pair; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using allocator_type = std::allocator; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = allocator_type::pointer; + using const_pointer = allocator_type::const_pointer; + // Custom iterator + using iterator = iter_t; + using const_iterator = iter_t; + + AnkerlFixedLen10Str2IntMmapableVector() = default; + AnkerlFixedLen10Str2IntMmapableVector(allocator_type alloc) + : store_(alloc) {} + + value_type* data() { return data_; } + const value_type* data() const { return data_; } + + value_type& operator[](uint64_t idx) { return data_[idx]; } + const value_type& operator[](uint64_t idx) const { return data_[idx]; } + + /* Functions to match std::vector interface */ + auto get_allocator() { return store_.get_allocator(); } + + constexpr auto back() -> reference { return data_[size_ - 1]; } + constexpr auto begin() -> iterator { return {data_}; } + constexpr auto cbegin() -> const_iterator { return {data_}; } + constexpr auto end() -> iterator { return {data_ + size_}; } + constexpr auto end() const -> const_iterator { return {data_ + size_}; } + constexpr auto cend() const -> const_iterator{ return {data_ + size_}; } + + // ----- Write funcs start ----- + void shrink_to_fit() { store_.shrink_to_fit(); } + void reserve(size_t new_capacity) { store_.reserve(new_capacity); } + + /* Emplace string-like key and int value as std::pair*/ + template + auto emplace_back(std::piecewise_construct_t, std::tuple key, std::tuple args) { + // Extract key + key_type key_string = std::get<0>(key); + + if (key_string.size() != fixed_str_len) { + throw std::runtime_error("ASIN string length is not 10."); + } + + // Emplace back std::pair + auto eb_val = store_.emplace_back( + std::piecewise_construct, + std::forward_as_tuple(key_string.data()), + std::forward< std::tuple >(args)); + + // Update pointers + size_ = store_.size(); + data_ = store_.data(); + + return eb_val; + } + + void pop_back() { + throw std::runtime_error("Not implemented for deletion"); + } + // ----- Write funcs end ----- + + size_type size() const { return size_; } + + bool empty() const { return static_cast (size_ == 0); } + + /* Get key for given member */ + key_type get_key(value_type const& vt) const { + return key_type(vt.first.str, fixed_str_len); + } + + /* Mmap save/load with MmapStore */ + void save_to_mmap_store(pecos::mmap_util::MmapStore& mmap_s) const { + mmap_s.fput_one(size_); + mmap_s.fput_multiple(data_, size_); + } + + void load_from_mmap_store(pecos::mmap_util::MmapStore& mmap_s) { + if (is_self_allocated_()) { // raises error for non-empty self-allocated vector + throw std::runtime_error("Cannot load for non-empty vector case."); + } + size_ = mmap_s.fget_one(); + data_ = mmap_s.fget_multiple(size_); + } + + + private: + // Number of elements of the data + size_type size_ = 0; + + // Pointer to data + value_type* data_ = nullptr; + + // Actual data storage for in-memory case + std::vector store_; + + /* Whether data storage is non-empty self-allocated vector. + * True indicates non-empty vector case; False indicates either empty or mmap view. */ + bool is_self_allocated_() const { + return static_cast (store_.size() > 0); + } + + /** + * Iterator class doubles as const_iterator and iterator + */ + template + class iter_t { + using ptr_t = typename std::conditional_t; + ptr_t iter_data_{}; + + template + friend class iter_t; + + public: + using iterator_category = std::forward_iterator_tag; + using difference_type = AnkerlFixedLen10Str2IntMmapableVector::difference_type; + using value_type = AnkerlFixedLen10Str2IntMmapableVector::value_type; + using reference = typename std::conditional_t; + using pointer = typename std::conditional_t; + + iter_t() noexcept = default; + + template ::type> + constexpr iter_t(iter_t const& other) noexcept + : iter_data_(other.iter_data_) {} + + constexpr iter_t(ptr_t data) noexcept + : iter_data_(data) {} + + template ::type> + constexpr auto operator=(iter_t const& other) noexcept -> iter_t& { + iter_data_ = other.iter_data_; + return *this; + } + + constexpr auto operator++() noexcept -> iter_t& { + ++iter_data_; + return *this; + } + + constexpr auto operator+(difference_type diff) noexcept -> iter_t { + return {iter_data_ + diff}; + } + + template + constexpr auto operator-(iter_t const& other) noexcept -> difference_type { + return static_cast(iter_data_ - other.iter_data_); + } + + constexpr auto operator*() const noexcept -> reference { + return *iter_data_; + } + + constexpr auto operator->() const noexcept -> pointer { + return iter_data_; + } + + template + constexpr auto operator==(iter_t const& o) const noexcept -> bool { + return iter_data_ == o.iter_data_; + } + + template + constexpr auto operator!=(iter_t const& o) const noexcept -> bool { + return !(*this == o); + } + }; +}; + + // Memory-mappable vector of std::pair for Ankerl class AnkerlInt2IntMmapableVector : public pecos::mmap_util::MmapableVector> { template @@ -360,6 +768,7 @@ class AnkerlInt2IntMmapableVector : public pecos::mmap_util::MmapableVector class Str2IntMap { public: void insert(const char* key, uint32_t key_len, uint64_t val) { @@ -397,10 +806,11 @@ class Str2IntMap { std::string_view, uint64_t, ankerl::unordered_dense::v4_0_0::hash, std::equal_to, - details_::AnkerlStr2IntMmapableVector + AllocatorOrContainer > map; }; + class Int2IntMap { public: void insert(uint64_t key, uint64_t val) { map[key] = val; } diff --git a/pecos/utils/mmap_hashmap_util.py b/pecos/utils/mmap_hashmap_util.py index 0f73c144..b9d1cc0e 100644 --- a/pecos/utils/mmap_hashmap_util.py +++ b/pecos/utils/mmap_hashmap_util.py @@ -187,7 +187,11 @@ def init(cls, map_type, map_dir, lazy_load): fn_dict = clib.mmap_hashmap_init(map_type) map_ptr = fn_dict["load"](map_dir.encode("utf-8"), lazy_load) - if map_type == "str2int": + if ( + map_type == "str2int" + or map_type == "fixed_len_str2int" + or map_type == "fixed_len_10_str2int" + ): return _MmapHashmapStr2IntReadOnly(map_ptr, fn_dict) elif map_type == "int2int": return _MmapHashmapInt2IntReadOnly(map_ptr, fn_dict) @@ -340,7 +344,11 @@ def init(cls, map_type, map_dir): fn_dict = clib.mmap_hashmap_init(map_type) map_ptr = fn_dict["new"]() - if map_type == "str2int": + if ( + map_type == "str2int" + or map_type == "fixed_len_str2int" + or map_type == "fixed_len_10_str2int" + ): return _MmapHashmapStr2IntWrite(map_ptr, fn_dict, map_dir) elif map_type == "int2int": return _MmapHashmapInt2IntWrite(map_ptr, fn_dict, map_dir) diff --git a/test/pecos/utils/test_mmap_hashmap_util.py b/test/pecos/utils/test_mmap_hashmap_util.py index d0c43fa6..fb5db919 100644 --- a/test/pecos/utils/test_mmap_hashmap_util.py +++ b/test/pecos/utils/test_mmap_hashmap_util.py @@ -64,6 +64,126 @@ def test_str2int_mmap_hashmap(tmpdir): ) # Non-exist key vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict)) assert r_map_batch_getter.get(ks, 10).tolist() == vs + # check max batch size increased + assert r_map_batch_getter.max_batch_size == 15 + + +def test_fixed_len_str2int_mmap_hashmap(tmpdir): + from pecos.utils.mmap_hashmap_util import MmapHashmap, MmapHashmapBatchGetter + + map_dir = tmpdir.join("fixed_len_str2int").realpath().strpath + kv_dict = {"aaa".encode("utf-8"): 2, "bbb".encode("utf-8"): 3} + + # Write-only Mode + w_map = MmapHashmap("fixed_len_str2int") + w_map.open("w", map_dir) + # Insert + w_map.map.insert("aaa".encode("utf-8"), 1) # Test for overwrite later + for k, v in kv_dict.items(): + w_map.map.insert(k, v) + # Size + assert w_map.map.size() == len(kv_dict) + w_map.close() + + # Read-only Mode + r_map = MmapHashmap("fixed_len_str2int") + r_map.open("r", map_dir) + # Get + for k, v in kv_dict.items(): + assert r_map.map[k] == v + # Get with default + for k, v in kv_dict.items(): + assert r_map.map.get(k, 10) == v + assert r_map.map.get("ccc".encode("utf-8"), 10) == 10 + # Contains + for k, _ in kv_dict.items(): + assert k in r_map.map + assert not ("ccc".encode("utf-8") in r_map.map) + # Size + assert r_map.map.size() == len(kv_dict) + + # Batch get with default + max_batch_size = 5 + # max_batch_size > num of key + r_map_batch_getter = MmapHashmapBatchGetter(r_map.map, max_batch_size) + ks = list(kv_dict.keys()) + ["ccc".encode("utf-8")] # Non-exist key + vs = list(kv_dict.values()) + [10] + assert r_map_batch_getter.get(ks, 10).tolist() == vs + # max_batch_size = num of key + ks = list(kv_dict.keys()) + ["ccc".encode("utf-8")] * ( + max_batch_size - len(kv_dict) + ) # Non-exist key + vs = list(kv_dict.values()) + [10] * (max_batch_size - len(kv_dict)) + assert r_map_batch_getter.get(ks, 10).tolist() == vs + # max_batch_size = num of key * 3 + ks = list(kv_dict.keys()) + ["ccc".encode("utf-8")] * ( + 3 * max_batch_size - len(kv_dict) + ) # Non-exist key + vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict)) + assert r_map_batch_getter.get(ks, 10).tolist() == vs + # check max batch size increased + assert r_map_batch_getter.max_batch_size == 15 + + +def test_fixed_len_10_str2int_mmap_hashmap(tmpdir): + from pecos.utils.mmap_hashmap_util import MmapHashmap, MmapHashmapBatchGetter + + len_10_a_string = "a" * 10 + len_10_b_string = "b" * 10 + len_10_c_string = "c" * 10 + + map_dir = tmpdir.join("fixed_len_10_str2int").realpath().strpath + kv_dict = {len_10_a_string.encode("utf-8"): 2, len_10_b_string.encode("utf-8"): 3} + + # Write-only Mode + w_map = MmapHashmap("fixed_len_10_str2int") + w_map.open("w", map_dir) + # Insert + w_map.map.insert(len_10_a_string.encode("utf-8"), 1) # Test for overwrite later + for k, v in kv_dict.items(): + w_map.map.insert(k, v) + # Size + assert w_map.map.size() == len(kv_dict) + w_map.close() + + # Read-only Mode + r_map = MmapHashmap("fixed_len_10_str2int") + r_map.open("r", map_dir) + # Get + for k, v in kv_dict.items(): + assert r_map.map[k] == v + # Get with default + for k, v in kv_dict.items(): + assert r_map.map.get(k, 10) == v + assert r_map.map.get(len_10_c_string.encode("utf-8"), 10) == 10 + # Contains + for k, _ in kv_dict.items(): + assert k in r_map.map + assert not (len_10_c_string.encode("utf-8") in r_map.map) + # Size + assert r_map.map.size() == len(kv_dict) + + # Batch get with default + max_batch_size = 5 + # max_batch_size > num of key + r_map_batch_getter = MmapHashmapBatchGetter(r_map.map, max_batch_size) + ks = list(kv_dict.keys()) + [len_10_c_string.encode("utf-8")] # Non-exist key + vs = list(kv_dict.values()) + [10] + assert r_map_batch_getter.get(ks, 10).tolist() == vs + # max_batch_size = num of key + ks = list(kv_dict.keys()) + [len_10_c_string.encode("utf-8")] * ( + max_batch_size - len(kv_dict) + ) # Non-exist key + vs = list(kv_dict.values()) + [10] * (max_batch_size - len(kv_dict)) + assert r_map_batch_getter.get(ks, 10).tolist() == vs + # max_batch_size = num of key * 3 + ks = list(kv_dict.keys()) + [len_10_c_string.encode("utf-8")] * ( + 3 * max_batch_size - len(kv_dict) + ) # Non-exist key + vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict)) + assert r_map_batch_getter.get(ks, 10).tolist() == vs + # check max batch size increased + assert r_map_batch_getter.max_batch_size == 15 def test_int2int_mmap_hashmap(tmpdir): @@ -116,3 +236,5 @@ def test_int2int_mmap_hashmap(tmpdir): ks = list(kv_dict.keys()) + [1000] * (3 * max_batch_size - len(kv_dict)) # Non-exist key vs = list(kv_dict.values()) + [10] * (3 * max_batch_size - len(kv_dict)) assert r_map_batch_getter.get(np.array(ks, dtype=np.int64), 10).tolist() == vs + # check max batch size increased + assert r_map_batch_getter.max_batch_size == 15