Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: Remove USE_ALIGNED_ACCESS and enhance BYTE_ORDER handling #2456

Open
wants to merge 9 commits into
base: unstable
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions src/common/port.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@

#pragma once

#if defined(__sparc__) || defined(__arm__)
#define USE_ALIGNED_ACCESS
#endif

#if defined(__s390__)
#if defined(__GNUC__) && __GNUC__ < 7
constexpr size_t CACHE_LINE_SIZE = 64U;
Expand Down
4 changes: 0 additions & 4 deletions src/storage/storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@
#include "observer_or_unique.h"
#include "status.h"

#if defined(__sparc__) || defined(__arm__)
#define USE_ALIGNED_ACCESS
#endif

enum class StorageEngineType : uint16_t {
RocksDB,
Speedb,
Expand Down
50 changes: 29 additions & 21 deletions src/types/redis_bitmap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -549,33 +549,40 @@ rocksdb::Status Bitmap::BitOp(BitOpFlags op_flag, const std::string &op_name, co
} else {
memset(frag_res.get(), 0, frag_maxlen);
}

/* Fast path: as far as we have data for all the input bitmaps we
* can take a fast path that performs much better than the
* vanilla algorithm. On ARM we skip the fast path since it will
* result in GCC compiling the code using multiple-words load/store
* operations that are not supported even in ARM >= v6. */
#ifndef USE_ALIGNED_ACCESS
* vanilla algorithm.
* We hope the compiler will generate a better code for memcpy
* rather than keep this fast path only in ARM machine.
*/
if (frag_minlen >= sizeof(uint64_t) * 4 && frag_numkeys <= 16) {
auto *lres = reinterpret_cast<uint64_t *>(frag_res.get());
const uint64_t *lp[16];
uint8_t *lres = frag_res.get();
// lp points to the start of each fragment and would be advanced
// to frag_minlen.
const uint8_t *lp[16];
for (uint64_t i = 0; i < frag_numkeys; i++) {
lp[i] = reinterpret_cast<const uint64_t *>(fragments[i].data());
lp[i] = reinterpret_cast<const uint8_t *>(fragments[i].data());
}
memcpy(frag_res.get(), fragments[0].data(), frag_minlen);
auto apply_fast_path_op = [&](auto op) {
// Note: kBitOpNot cannot use this op, it only applying
// to kBitOpAnd, kBitOpOr, kBitOpXor.
DCHECK(op_flag != kBitOpNot);
while (frag_minlen >= sizeof(uint64_t) * 4) {
uint64_t lres_u64[4];
memcpy(lres_u64, lres, sizeof(lres_u64));
for (uint64_t i = 1; i < frag_numkeys; i++) {
op(lres[0], lp[i][0]);
op(lres[1], lp[i][1]);
op(lres[2], lp[i][2]);
op(lres[3], lp[i][3]);
lp[i] += 4;
uint64_t lp_data[4];
memcpy(lp_data, lp[i], sizeof(lp_data));
op(lres_u64[0], lp_data[0]);
op(lres_u64[1], lp_data[1]);
op(lres_u64[2], lp_data[2]);
op(lres_u64[3], lp_data[3]);
lp[i] += 4 * sizeof(uint64_t);
}
lres += 4;
// memcpy back to lres
memcpy(lres, &lres_u64, sizeof(lres_u64));
lres += 4 * sizeof(uint64_t);
j += sizeof(uint64_t) * 4;
frag_minlen -= sizeof(uint64_t) * 4;
}
Expand All @@ -589,18 +596,19 @@ rocksdb::Status Bitmap::BitOp(BitOpFlags op_flag, const std::string &op_name, co
apply_fast_path_op([](uint64_t &a, uint64_t b) { a ^= b; });
} else if (op_flag == kBitOpNot) {
while (frag_minlen >= sizeof(uint64_t) * 4) {
lres[0] = ~lres[0];
lres[1] = ~lres[1];
lres[2] = ~lres[2];
lres[3] = ~lres[3];
lres += 4;
uint64_t lres_u64[4];
memcpy(lres_u64, lres, sizeof(lres_u64));
lres_u64[0] = ~lres_u64[0];
lres_u64[1] = ~lres_u64[1];
lres_u64[2] = ~lres_u64[2];
lres_u64[3] = ~lres_u64[3];
memcpy(lres, &lres_u64, sizeof(lres_u64));
lres += 4 * sizeof(uint64_t);
j += sizeof(uint64_t) * 4;
frag_minlen -= sizeof(uint64_t) * 4;
}
}
}
#endif
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if the performance difference between this fast path and vanilla algorithm is large.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally I think https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/bitmap_ops.h#L160-L242 would be better, I'll do a benchmark there

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've checked the current impl, and I guess the fast path would be faster😅

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use https://quick-bench.com/ which can generate a chart.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I revert the part because the underlying code is too slow 😅

Besides, I change to use memcpy to generalize it. I'll try a benchmark later

Copy link
Member Author

@mapleFU mapleFU Aug 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

apache/arrow@76cebfa
Lucky, arrow has similiar testing here. On My MacOS M1Pro with Release -O2:

BenchmarkBitmapVisitBitsetAnd/32768/0      753392 ns       749634 ns          937 bytes_per_second=41.687M/s
BenchmarkBitmapVisitBitsetAnd/131072/0    2986097 ns      2985449 ns          234 bytes_per_second=41.8698M/s
BenchmarkBitmapVisitBitsetAnd/32768/1      746267 ns       746040 ns          939 bytes_per_second=41.8878M/s
BenchmarkBitmapVisitBitsetAnd/131072/1    2991597 ns      2990679 ns          234 bytes_per_second=41.7965M/s
BenchmarkBitmapVisitBitsetAnd/32768/2      747519 ns       747314 ns          940 bytes_per_second=41.8164M/s
BenchmarkBitmapVisitBitsetAnd/131072/2    2985102 ns      2984500 ns          234 bytes_per_second=41.8831M/s

The code has no different from bit-hacking and


uint8_t output = 0, byte = 0;
for (; j < frag_maxlen; j++) {
output = (fragments[0].size() <= j) ? 0 : fragments[0][j];
Expand Down
4 changes: 2 additions & 2 deletions src/vendor/endianconv.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ uint64_t intrev64(uint64_t v);

/* variants of the function doing the actual conversion only if the target
* host is big endian */
#if (BYTE_ORDER == LITTLE_ENDIAN)
#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define memrev16ifbe(p) ((void)(0))
#define memrev32ifbe(p) ((void)(0))
#define memrev64ifbe(p) ((void)(0))
Expand All @@ -61,7 +61,7 @@ uint64_t intrev64(uint64_t v);

/* The functions htonu64() and ntohu64() convert the specified value to
* network byte ordering and back. In big endian systems they are no-ops. */
#if (BYTE_ORDER == BIG_ENDIAN)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define htonu64(v) (v)
#define ntohu64(v) (v)
#else
Expand Down
10 changes: 0 additions & 10 deletions src/vendor/murmurhash2.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,6 @@

#include <stdint.h>

#ifndef USE_ALIGNED_ACCESS
#if defined(__sparc__) || defined(__arm__)
#define USE_ALIGNED_ACCESS
#endif
#endif

// NOLINTBEGIN

/* MurmurHash2, 64 bit version.
Expand All @@ -55,11 +49,7 @@ inline uint64_t HllMurMurHash64A(const void *key, int len, uint32_t seed) {
uint64_t k = 0;

#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#ifdef USE_ALIGNED_ACCESS
git-hulk marked this conversation as resolved.
Show resolved Hide resolved
memcpy(&k, data, sizeof(uint64_t));
#else
k = *((uint64_t *)data);
#endif
#else
k = (uint64_t)data[0];
k |= (uint64_t)data[1] << 8;
Expand Down
4 changes: 2 additions & 2 deletions src/vendor/sha1.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ A million repetitions of "a"

/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
#if BYTE_ORDER == LITTLE_ENDIAN
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
// NOLINTNEXTLINE
#define blk0(i) (block->l[i] = (rol(block->l[i], 24) & 0xFF00FF00) | (rol(block->l[i], 8) & 0x00FF00FF))
#elif BYTE_ORDER == BIG_ENDIAN
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
// NOLINTNEXTLINE
#define blk0(i) block->l[i]
#else
Expand Down
Loading