diff --git a/CMakeLists.txt b/CMakeLists.txt index 79c97a35c..ec0a5e958 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,9 @@ endif() set(SNMALLOC_MIN_ALLOC_SIZE "" CACHE STRING "Minimum allocation bytes (power of 2)") set(SNMALLOC_MIN_ALLOC_STEP_SIZE "" CACHE STRING "Minimum allocation step (power of 2)") +set(SNMALLOC_DEALLOC_BATCH_RING_ASSOC "" CACHE STRING "Associativity of deallocation batch cache; 0 to disable") +set(SNMALLOC_DEALLOC_BATCH_RING_SET_BITS "" CACHE STRING "Logarithm of number of deallocation batch cache associativity sets") + if(MSVC AND SNMALLOC_STATIC_LIBRARY AND (SNMALLOC_STATIC_LIBRARY_PREFIX STREQUAL "")) message(FATAL_ERROR "Empty static library prefix not supported on MSVC") endif() @@ -251,6 +254,8 @@ if (SNMALLOC_NO_REALLOCARR) endif() add_as_define_value(SNMALLOC_MIN_ALLOC_SIZE) add_as_define_value(SNMALLOC_MIN_ALLOC_STEP_SIZE) +add_as_define_value(SNMALLOC_DEALLOC_BATCH_RING_ASSOC) +add_as_define_value(SNMALLOC_DEALLOC_BATCH_RING_SET_BITS) target_compile_definitions(snmalloc INTERFACE $<$:MALLOC_USABLE_SIZE_QUALIFIER=const>) diff --git a/src/snmalloc/ds/allocconfig.h b/src/snmalloc/ds/allocconfig.h index b4af45a68..1a3c88702 100644 --- a/src/snmalloc/ds/allocconfig.h +++ b/src/snmalloc/ds/allocconfig.h @@ -120,6 +120,45 @@ namespace snmalloc static constexpr size_t REMOTE_SLOTS = 1 << REMOTE_SLOT_BITS; static constexpr size_t REMOTE_MASK = REMOTE_SLOTS - 1; +#if defined(SNMALLOC_DEALLOC_BATCH_RING_ASSOC) + /* Build-time override: associativity of the deallocation batch cache, where 0 disables batching. */ static constexpr size_t DEALLOC_BATCH_RING_ASSOC = + SNMALLOC_DEALLOC_BATCH_RING_ASSOC; +#else +# if defined(__has_cpp_attribute) +# if ( \ + __has_cpp_attribute(msvc::no_unique_address) && \ + (__cplusplus >= 201803L || _MSVC_LANG >= 201803L)) || \ + __has_cpp_attribute(no_unique_address) + // For C++20 or later, we do have [[no_unique_address]] and so can also do + // batching if we aren't turning on the backward-pointer mitigations + static constexpr size_t DEALLOC_BATCH_MIN_ALLOC_WORDS = 
mitigations(freelist_backward_edge) ? 4 : 2; +# else + // For C++17, we don't have [[no_unique_address]] and so we always end up + // needing all four pointers' worth of space (because BatchedRemoteMessage has + // two freelist::Object::T<> links within, each of which will have two fields + // and will be padded to two pointers). + static constexpr size_t DEALLOC_BATCH_MIN_ALLOC_WORDS = 4; +# endif +# else + // If we don't even have the feature test macro, we're C++17 or earlier. + static constexpr size_t DEALLOC_BATCH_MIN_ALLOC_WORDS = 4; +# endif + + /* Default when no override is given: two ways if the smallest allocation spans at least DEALLOC_BATCH_MIN_ALLOC_WORDS pointers, otherwise 0 which turns batching off. */ static constexpr size_t DEALLOC_BATCH_RING_ASSOC = + (MIN_ALLOC_SIZE >= (DEALLOC_BATCH_MIN_ALLOC_WORDS * sizeof(void*))) ? 2 : 0; +#endif + +#if defined(SNMALLOC_DEALLOC_BATCH_RING_SET_BITS) + static constexpr size_t DEALLOC_BATCH_RING_SET_BITS = + SNMALLOC_DEALLOC_BATCH_RING_SET_BITS; +#else + static constexpr size_t DEALLOC_BATCH_RING_SET_BITS = 3; +#endif + + /* Total number of rings: ways per set times the number of sets, hence zero when batching is disabled. */ static constexpr size_t DEALLOC_BATCH_RINGS = + DEALLOC_BATCH_RING_ASSOC * bits::one_at_bit(DEALLOC_BATCH_RING_SET_BITS); + static_assert( INTERMEDIATE_BITS < MIN_ALLOC_STEP_BITS, "INTERMEDIATE_BITS must be less than MIN_ALLOC_BITS"); diff --git a/src/snmalloc/mem/freelist.h b/src/snmalloc/mem/freelist.h index adf72fbc2..7b0e8e3b3 100644 --- a/src/snmalloc/mem/freelist.h +++ b/src/snmalloc/mem/freelist.h @@ -40,7 +40,7 @@ namespace snmalloc { - class RemoteMessage; + class BatchedRemoteMessage; static constexpr address_t NO_KEY_TWEAK = 0; @@ -134,7 +134,7 @@ namespace snmalloc friend class Object; - friend class ::snmalloc::RemoteMessage; + friend class ::snmalloc::BatchedRemoteMessage; class Empty { diff --git a/src/snmalloc/mem/remoteallocator.h b/src/snmalloc/mem/remoteallocator.h index 4b9c5d4b2..ee31eda08 100644 --- a/src/snmalloc/mem/remoteallocator.h +++ b/src/snmalloc/mem/remoteallocator.h @@ -26,22 +26,22 @@ namespace snmalloc * address space). This gives us enough room to pack in the length of the * ring, without needing to grow the structure. 
*/ - class RemoteMessage + class BatchedRemoteMessage { - friend class RemoteMessageAssertions; + friend class BatchedRemoteMessageAssertions; freelist::Object::T<> free_ring; freelist::Object::T<> message_link; static_assert( sizeof(free_ring.next_object) >= sizeof(void*), - "RemoteMessage bitpacking needs sizeof(void*) in next_object"); + "BatchedRemoteMessage bitpacking needs sizeof(void*) in next_object"); public: static auto emplace_in_alloc(capptr::Alloc alloc) { - return CapPtr::unsafe_from( - new (alloc.unsafe_ptr()) RemoteMessage()); + return CapPtr::unsafe_from( + new (alloc.unsafe_ptr()) BatchedRemoteMessage()); } static auto mk_from_freelist_builder( @@ -57,8 +57,9 @@ namespace snmalloc // Preserve the last node's backpointer and change its type. auto last_prev = last->prev; - auto self = CapPtr::unsafe_from( - new (last.unsafe_ptr()) RemoteMessage()); + auto self = + CapPtr::unsafe_from( + new (last.unsafe_ptr()) BatchedRemoteMessage()); self->free_ring.prev = last_prev; // XXX On CHERI, we could do a fair bit better if we had a primitive for @@ -78,25 +79,27 @@ namespace snmalloc return self; } - static freelist::HeadPtr to_message_link(capptr::Alloc m) + static freelist::HeadPtr + to_message_link(capptr::Alloc m) { - return pointer_offset(m, offsetof(RemoteMessage, message_link)) + return pointer_offset(m, offsetof(BatchedRemoteMessage, message_link)) .as_reinterpret>(); } - static capptr::Alloc + static capptr::Alloc from_message_link(freelist::HeadPtr chainPtr) { return pointer_offset_signed( chainPtr, - -static_cast(offsetof(RemoteMessage, message_link))) - .as_reinterpret(); + -static_cast( + offsetof(BatchedRemoteMessage, message_link))) + .as_reinterpret(); } template SNMALLOC_FAST_PATH static std::pair open_free_ring( - capptr::Alloc m, + capptr::Alloc m, size_t objsize, const FreeListKey& key, address_t key_tweak, @@ -142,7 +145,7 @@ namespace snmalloc template static uint16_t ring_size( - capptr::Alloc m, + capptr::Alloc m, const 
FreeListKey& key, address_t key_tweak, Domesticator_queue domesticate) @@ -183,16 +186,88 @@ namespace snmalloc } }; - class RemoteMessageAssertions + class BatchedRemoteMessageAssertions { - static_assert(sizeof(RemoteMessage) <= MIN_ALLOC_SIZE); - static_assert(offsetof(RemoteMessage, free_ring) == 0); + /* The size bound only has to hold when batching is enabled, so it is waived when DEALLOC_BATCH_RINGS is zero. */ static_assert( + (DEALLOC_BATCH_RINGS == 0) || + (sizeof(BatchedRemoteMessage) <= MIN_ALLOC_SIZE)); + static_assert(offsetof(BatchedRemoteMessage, free_ring) == 0); static_assert( - MAX_SLAB_SPAN_BITS + MAX_CAPACITY_BITS < 8 * sizeof(void*), + (DEALLOC_BATCH_RINGS == 0) || + (MAX_SLAB_SPAN_BITS + MAX_CAPACITY_BITS < 8 * sizeof(void*)), "Ring bit-stuffing trick can't reach far enough to enclose a slab"); }; + /* Message layout for the non-batching case: a single freelist link, so each message carries exactly one freed object. */ class SingletonRemoteMessage + { + friend class SingletonRemoteMessageAssertions; + + freelist::Object::T<> message_link; + + public: + static auto emplace_in_alloc(capptr::Alloc alloc) + { + return CapPtr::unsafe_from( + new (alloc.unsafe_ptr()) SingletonRemoteMessage()); + } + + static freelist::HeadPtr + to_message_link(capptr::Alloc m) + { + return pointer_offset(m, offsetof(SingletonRemoteMessage, message_link)) + .as_reinterpret>(); + } + + static capptr::Alloc + from_message_link(freelist::HeadPtr chainPtr) + { + return pointer_offset_signed( + chainPtr, + -static_cast( + offsetof(SingletonRemoteMessage, message_link))) + .as_reinterpret(); + } + + /* Same parameter list as BatchedRemoteMessage::open_free_ring; the message itself is the only object, so the count is always 1 and the size, key, tweak and domesticator arguments are ignored. */ template + SNMALLOC_FAST_PATH static std::pair + open_free_ring( + capptr::Alloc m, + size_t, + const FreeListKey&, + address_t, + Domesticator_queue) + { + return { + m.as_reinterpret>(), static_cast(1)}; + } + + /* A singleton message always stands for one object; all arguments are ignored. */ template + static uint16_t ring_size( + capptr::Alloc, + const FreeListKey&, + address_t, + Domesticator_queue) + { + return 1; + } + }; + + /* Checks that the singleton message fits in the smallest allocation and is exactly one freelist object with its link at offset zero; presumably this is what lets open_free_ring reinterpret the message as a freelist object (TODO confirm). */ class SingletonRemoteMessageAssertions + { + static_assert(sizeof(SingletonRemoteMessage) <= MIN_ALLOC_SIZE); + static_assert( + sizeof(SingletonRemoteMessage) == sizeof(freelist::Object::T<>)); + static_assert(offsetof(SingletonRemoteMessage, message_link) == 
0); + }; + + /* Pick the message layout at compile time: batched when DEALLOC_BATCH_RINGS is nonzero, singleton otherwise. */ using RemoteMessage = std::conditional_t< + (DEALLOC_BATCH_RINGS > 0), + BatchedRemoteMessage, + SingletonRemoteMessage>; + + static_assert(sizeof(RemoteMessage) <= MIN_ALLOC_SIZE); + /** * A RemoteAllocator is the message queue of freed objects. It builds on the * FreeListMPSCQ but encapsulates knowledge that the objects are actually diff --git a/src/snmalloc/mem/remotecache.h b/src/snmalloc/mem/remotecache.h index 7150b1eb5..72ba92dac 100644 --- a/src/snmalloc/mem/remotecache.h +++ b/src/snmalloc/mem/remotecache.h @@ -12,14 +12,13 @@ namespace snmalloc { - template + /** + * Stores the remote deallocation to batch them before sending + */ + template class RemoteDeallocCacheBatching { - static constexpr size_t RING_ASSOC = 2; - static constexpr size_t RING_SET_BITS = 3; - - static constexpr size_t RINGS = - RING_ASSOC * bits::one_at_bit(RING_SET_BITS); + /* NOTE(review): RINGS sizes the arrays, but ring_set and the way loops use the global DEALLOC_BATCH_RING_SET_BITS and DEALLOC_BATCH_RING_ASSOC, so this template is only consistent when RINGS equals DEALLOC_BATCH_RINGS. Consider asserting that. */ static_assert(RINGS > 0); std::array, RINGS> open_builder; std::array open_meta = @@ -30,13 +29,13 @@ namespace snmalloc { // See https://github.com/skeeto/hash-prospector for choice of constant return ((meta->as_key_tweak() * 0x7EFB352D) >> 16) & - bits::mask_bits(RING_SET_BITS); + bits::mask_bits(DEALLOC_BATCH_RING_SET_BITS); } template SNMALLOC_FAST_PATH void close_one_pending(Forward forward, size_t ix) { - auto rmsg = RemoteMessage::mk_from_freelist_builder( + auto rmsg = BatchedRemoteMessage::mk_from_freelist_builder( open_builder[ix], freelist::Object::key_root, open_meta[ix]->as_key_tweak()); @@ -65,7 +64,7 @@ namespace snmalloc { size_t ix_set = ring_set(meta); - for (size_t ix_way = 0; ix_way < RING_ASSOC; ix_way++) + /* NOTE(review): the slot index below is ix_set + ix_way with no scaling by the associativity, so adjacent sets share a slot and the higher slots of the arrays are never selected by this lookup. Confirm this is intended. */ for (size_t ix_way = 0; ix_way < DEALLOC_BATCH_RING_ASSOC; ix_way++) { size_t ix = ix_set + ix_way; if (meta == open_meta[ix]) @@ -78,7 +77,7 @@ namespace snmalloc size_t victim_ix = ix_set; size_t victim_size = 0; - for (size_t ix_way = 0; ix_way < RING_ASSOC; ix_way++) + for (size_t ix_way = 0; ix_way < DEALLOC_BATCH_RING_ASSOC; ix_way++) { size_t ix = ix_set + ix_way; if 
(open_meta[ix] == nullptr) @@ -124,6 +123,34 @@ namespace snmalloc } }; + /* Stand-in used when batching is disabled: init and close_all do nothing, and dealloc forwards every object immediately as a SingletonRemoteMessage, addressed by the truncated id of the remote found in the metaentry. The slab metadata argument is unused. */ template + struct RemoteDeallocCacheNoBatching + { + void init() {} + + template + void close_all(Forward) + {} + + template + SNMALLOC_FAST_PATH void dealloc( + typename Config::PagemapEntry::SlabMetadata*, + freelist::HeadPtr r, + Forward forward) + { + auto& entry = Config::Backend::get_metaentry(address_cast(r)); + forward( + entry.get_remote()->trunc_id(), + SingletonRemoteMessage::emplace_in_alloc(r.as_void())); + } + }; + + /* Selects the batching cache when DEALLOC_BATCH_RINGS is nonzero and the pass-through implementation otherwise. */ template + using RemoteDeallocCacheBatchingImpl = std::conditional_t< + (DEALLOC_BATCH_RINGS > 0), + RemoteDeallocCacheBatching, + RemoteDeallocCacheNoBatching>; + /** * Stores the remote deallocation to batch them before sending */ @@ -132,7 +159,7 @@ namespace snmalloc { std::array, REMOTE_SLOTS> list; - RemoteDeallocCacheBatching batching; + RemoteDeallocCacheBatchingImpl batching; /** * The total amount of memory we are waiting for before we will dispatch