diff --git a/be/src/gutil/CMakeLists.txt b/be/src/gutil/CMakeLists.txt index bac9aec3037020..0ed2dd1de3d351 100644 --- a/be/src/gutil/CMakeLists.txt +++ b/be/src/gutil/CMakeLists.txt @@ -27,9 +27,7 @@ SET(SOURCE_FILES bits.cc dynamic_annotations.c hash/city.cc - hash/hash.cc hash/jenkins.cc - int128.cc ref_counted.cc stringprintf.cc strings/ascii_ctype.cc diff --git a/be/src/gutil/endian.h b/be/src/gutil/endian.h index f1a9cf2a1a2da1..4bc04e1e303eb5 100644 --- a/be/src/gutil/endian.h +++ b/be/src/gutil/endian.h @@ -32,7 +32,6 @@ #include -#include "gutil/int128.h" #include "gutil/integral_types.h" #include "gutil/port.h" #include "vec/core/wide_integer.h" @@ -197,29 +196,6 @@ class LittleEndian { static void Store64(void* p, uint64 v) { UNALIGNED_STORE64(p, FromHost64(v)); } - static uint128 Load128(const void* p) { - return uint128(ToHost64(UNALIGNED_LOAD64(reinterpret_cast(p) + 1)), - ToHost64(UNALIGNED_LOAD64(p))); - } - - static void Store128(void* p, const uint128 v) { - UNALIGNED_STORE64(p, FromHost64(Uint128Low64(v))); - UNALIGNED_STORE64(reinterpret_cast(p) + 1, FromHost64(Uint128High64(v))); - } - - // Build a uint128 from 1-16 bytes. - // 8 * len least significant bits are loaded from the memory with - // LittleEndian order. The 128 - 8 * len most significant bits are - // set all to 0. - static uint128 Load128VariableLength(const void* p, int len) { - if (len <= 8) { - return uint128(Load64VariableLength(p, len)); - } else { - return uint128(Load64VariableLength(static_cast(p) + 8, len - 8), - Load64(p)); - } - } - // Load & Store in machine's word size. static uword_t LoadUnsignedWord(const void* p) { if (sizeof(uword_t) == 8) @@ -278,9 +254,6 @@ class BigEndian { static uint64 FromHost64(uint64 x) { return x; } static uint64 ToHost64(uint64 x) { return x; } - static uint128 FromHost128(uint128 x) { return x; } - static uint128 ToHost128(uint128 x) { return x; } - static wide::UInt256 FromHost256(wide::UInt256 x) { return x; } static wide::UInt256 ToHost256(wide::UInt256 x) { return x; } @@ -328,29 +301,6 @@ class BigEndian { static void Store64(void* p, uint64 v) { UNALIGNED_STORE64(p, FromHost64(v)); } - static uint128 Load128(const void* p) { - return uint128(ToHost64(UNALIGNED_LOAD64(p)), - ToHost64(UNALIGNED_LOAD64(reinterpret_cast(p) + 1))); - } - - static void Store128(void* p, const uint128 v) { - UNALIGNED_STORE64(p, FromHost64(Uint128High64(v))); - UNALIGNED_STORE64(reinterpret_cast(p) + 1, FromHost64(Uint128Low64(v))); - } - - // Build a uint128 from 1-16 bytes. - // 8 * len least significant bits are loaded from the memory with - // BigEndian order. The 128 - 8 * len most significant bits are - // set all to 0. - static uint128 Load128VariableLength(const void* p, int len) { - if (len <= 8) { - return uint128(Load64VariableLength(static_cast(p) + 8, len)); - } else { - return uint128(Load64VariableLength(p, len - 8), - Load64(static_cast(p) + 8)); - } - } - // Load & Store in machine's word size. static uword_t LoadUnsignedWord(const void* p) { if (sizeof(uword_t) == 8) diff --git a/be/src/gutil/hash/city.cc b/be/src/gutil/hash/city.cc index a8163a0e68e068..d4d53ab2f6bf31 100644 --- a/be/src/gutil/hash/city.cc +++ b/be/src/gutil/hash/city.cc @@ -19,6 +19,7 @@ // IWYU pragma: no_include #include + #include #include @@ -34,10 +35,7 @@ using std::make_pair; using std::pair; #include "common/logging.h" - #include "gutil/endian.h" -#include "gutil/hash/hash128to64.h" -#include "gutil/int128.h" #include "gutil/integral_types.h" #include "gutil/port.h" @@ -71,8 +69,14 @@ static uint64 ShiftMix(uint64 val) { return val ^ (val >> 47); } -static uint64 HashLen16(uint64 u, uint64 v) { - return Hash128to64(uint128(u, v)); +uint64 HashLen16(uint64 u, uint64 v) { + const uint64 kMul = 0xc6a4a7935bd1e995ULL; + uint64 a = (u ^ v) * kMul; + a ^= (a >> 47); + uint64 b = (v ^ a) * kMul; + b ^= (b >> 47); + b *= kMul; + return b; } static uint64 HashLen0to16(const char* s, size_t len) { @@ -202,109 +206,4 @@ uint64 CityHash64WithSeed(const char* s, size_t len, uint64 seed) { uint64 CityHash64WithSeeds(const char* s, size_t len, uint64 seed0, uint64 seed1) { return HashLen16(CityHash64(s, len) - seed0, seed1); } - -// A subroutine for CityHash128(). Returns a decent 128-bit hash for strings -// of any length representable in ssize_t. Based on City and Murmur128. -static uint128 CityMurmur(const char* s, size_t len, uint128 seed) { - uint64 a = Uint128Low64(seed); - uint64 b = Uint128High64(seed); - uint64 c = 0; - uint64 d = 0; - ssize_t l = len - 16; - if (l <= 0) { // len <= 16 - c = b * k1 + HashLen0to16(s, len); - d = Rotate(a + (len >= 8 ? LittleEndian::Load64(s) : c), 32); - } else { // len > 16 - c = HashLen16(LittleEndian::Load64(s + len - 8) + k1, a); - d = HashLen16(b + len, c + LittleEndian::Load64(s + len - 16)); - a += d; - do { - a ^= ShiftMix(LittleEndian::Load64(s) * k1) * k1; - a *= k1; - b ^= a; - c ^= ShiftMix(LittleEndian::Load64(s + 8) * k1) * k1; - c *= k1; - d ^= c; - s += 16; - l -= 16; - } while (l > 0); - } - a = HashLen16(a, c); - b = HashLen16(d, b); - return uint128(a ^ b, HashLen16(b, a)); -} - -uint128 CityHash128WithSeed(const char* s, size_t len, uint128 seed) { - // TODO(user): As of February 2011, there's a beta of Murmur3 that would - // most likely be useful here. E.g., if (len < 900) return Murmur3(...) - if (len < 128) { - return CityMurmur(s, len, seed); - } - - // We expect len >= 128 to be the common case. Keep 56 bytes of state: - // v, w, x, y, and z. - pair v, w; - uint64 x = Uint128Low64(seed); - uint64 y = Uint128High64(seed); - uint64 z = len * k1; - v.first = Rotate(y ^ k1, 49) * k1 + LittleEndian::Load64(s); - v.second = Rotate(v.first, 42) * k1 + LittleEndian::Load64(s + 8); - w.first = Rotate(y + z, 35) * k1 + x; - w.second = Rotate(x + LittleEndian::Load64(s + 88), 53) * k1; - - // This is similar to the inner loop of CityHash64(), manually unrolled. - do { - x = Rotate(x + y + v.first + LittleEndian::Load64(s + 16), 37) * k1; - y = Rotate(y + v.second + LittleEndian::Load64(s + 48), 42) * k1; - x ^= w.second; - y ^= v.first; - z = Rotate(z ^ w.first, 33); - v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); - w = WeakHashLen32WithSeeds(s + 32, z + w.second, y); - std::swap(z, x); - s += 64; - x = Rotate(x + y + v.first + LittleEndian::Load64(s + 16), 37) * k1; - y = Rotate(y + v.second + LittleEndian::Load64(s + 48), 42) * k1; - x ^= w.second; - y ^= v.first; - z = Rotate(z ^ w.first, 33); - v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); - w = WeakHashLen32WithSeeds(s + 32, z + w.second, y); - std::swap(z, x); - s += 64; - len -= 128; - } while (PREDICT_TRUE(len >= 128)); - y += Rotate(w.first, 37) * k0 + z; - x += Rotate(v.first + z, 49) * k0; - // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. - for (size_t tail_done = 0; tail_done < len;) { - tail_done += 32; - y = Rotate(y - x, 42) * k0 + v.second; - w.first += LittleEndian::Load64(s + len - tail_done + 16); - x = Rotate(x, 49) * k0 + w.first; - w.first += v.first; - v = WeakHashLen32WithSeeds(s + len - tail_done, v.first, v.second); - } - // At this point our 48 bytes of state should contain more than - // enough information for a strong 128-bit hash. We use two - // different 48-byte-to-8-byte hashes to get a 16-byte final result. - x = HashLen16(x, v.first); - y = HashLen16(y, w.first); - return uint128(HashLen16(x + v.second, w.second) + y, HashLen16(x + w.second, y + v.second)); -} - -uint128 CityHash128(const char* s, size_t len) { - if (len >= 16) { - return CityHash128WithSeed( - s + 16, len - 16, - uint128(LittleEndian::Load64(s) ^ k3, LittleEndian::Load64(s + 8))); - } else if (len >= 8) { - return CityHash128WithSeed(nullptr, 0, - uint128(LittleEndian::Load64(s) ^ (len * k0), - LittleEndian::Load64(s + len - 8) ^ k1)); - } else { - return CityHash128WithSeed(s, len, uint128(k0, k1)); - } -} - } // namespace util_hash diff --git a/be/src/gutil/hash/city.h b/be/src/gutil/hash/city.h index bcacaa9147485b..8e6042b61a8e3c 100644 --- a/be/src/gutil/hash/city.h +++ b/be/src/gutil/hash/city.h @@ -22,11 +22,12 @@ #include // for size_t. -#include "gutil/int128.h" #include "gutil/integral_types.h" namespace util_hash { +uint64 HashLen16(uint64 u, uint64 v); + // Hash function for a byte array. // The mapping may change from time to time. uint64 CityHash64(const char* buf, size_t len); @@ -39,11 +40,4 @@ uint64 CityHash64WithSeed(const char* buf, size_t len, uint64 seed); // hashed into the result. The mapping may change from time to time. uint64 CityHash64WithSeeds(const char* buf, size_t len, uint64 seed0, uint64 seed1); -// Hash function for a byte array. The mapping will never change. -uint128 CityHash128(const char* s, size_t len); - -// Hash function for a byte array. For convenience, a 128-bit seed is also -// hashed into the result. The mapping will never change. -uint128 CityHash128WithSeed(const char* s, size_t len, uint128 seed); - } // namespace util_hash diff --git a/be/src/gutil/hash/hash.cc b/be/src/gutil/hash/hash.cc deleted file mode 100644 index b16bea5dfeaf0a..00000000000000 --- a/be/src/gutil/hash/hash.cc +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// This is the legacy unified hash library implementation. Its components are -// being split up into smaller, dedicated libraries. What remains here are -// things still being migrated. -// -// To find the implementation of the core Bob Jenkins lookup2 hash, look in -// jenkins.cc. - -#include "gutil/hash/hash.h" - -#include "common/logging.h" - -#include "gutil/hash/jenkins.h" -#include "gutil/hash/jenkins_lookup2.h" -#include "gutil/integral_types.h" - -// For components that ship code externally (notably the Google Search -// Appliance) we want to change the fingerprint function so that -// attackers cannot mount offline attacks to find collisions with -// google.com internal fingerprints (most importantly, for URL -// fingerprints). -#ifdef GOOGLECLIENT -#error Do not compile this into binaries that we deliver to users! -#error Instead, use -#endif -#ifdef EXTERNAL_FP -static const uint32 kFingerprintSeed0 = 0xabc; -static const uint32 kFingerprintSeed1 = 0xdef; -#else -static const uint32 kFingerprintSeed0 = 0; -static const uint32 kFingerprintSeed1 = 102072; -#endif - -static inline uint32 char2unsigned(char c) { - return static_cast(static_cast(c)); -} - -uint64 FingerprintReferenceImplementation(const char* s, uint32 len) { - uint32 hi = Hash32StringWithSeed(s, len, kFingerprintSeed0); - uint32 lo = Hash32StringWithSeed(s, len, kFingerprintSeed1); - return CombineFingerprintHalves(hi, lo); -} - -// This is a faster version of FingerprintReferenceImplementation(), -// making use of the fact that we're hashing the same string twice. -// The code is tedious to read, but it's just two interleaved copies of -// Hash32StringWithSeed(). -uint64 FingerprintInterleavedImplementation(const char* s, uint32 len) { - uint32 a, b, c = kFingerprintSeed0, d, e, f = kFingerprintSeed1; - uint32 keylen; - - a = b = d = e = 0x9e3779b9UL; // the golden ratio; an arbitrary value - - keylen = len; - if (keylen >= 4 * sizeof(a)) { - uint32 word32AtOffset0 = Google1At(s); - do { - a += word32AtOffset0; - d += word32AtOffset0; - b += Google1At(s + sizeof(a)); - e += Google1At(s + sizeof(a)); - c += Google1At(s + sizeof(a) * 2); - f += Google1At(s + sizeof(a) * 2); - s += 3 * sizeof(a); - word32AtOffset0 = Google1At(s); - mix(a, b, c); - mix(d, e, f); - keylen -= 3 * static_cast(sizeof(a)); - } while (keylen >= 4 * sizeof(a)); - if (keylen >= 3 * sizeof(a)) { - a += word32AtOffset0; - d += word32AtOffset0; - b += Google1At(s + sizeof(a)); - e += Google1At(s + sizeof(a)); - c += Google1At(s + sizeof(a) * 2); - f += Google1At(s + sizeof(a) * 2); - s += 3 * sizeof(a); - mix(a, b, c); - mix(d, e, f); - keylen -= 3 * static_cast(sizeof(a)); - DCHECK_LT(keylen, sizeof(a)); - c += len; - f += len; - switch (keylen) { // deal with rest. Cases fall through - case 3: - a += char2unsigned(s[2]) << 16; - d += char2unsigned(s[2]) << 16; - case 2: - a += char2unsigned(s[1]) << 8; - d += char2unsigned(s[1]) << 8; - case 1: - a += char2unsigned(s[0]); - d += char2unsigned(s[0]); - } - } else { - DCHECK(sizeof(a) <= keylen && keylen < 3 * sizeof(a)); - c += len; - f += len; - switch (keylen) { // deal with rest. Cases fall through - case 11: - c += char2unsigned(s[10]) << 24; - f += char2unsigned(s[10]) << 24; - case 10: - c += char2unsigned(s[9]) << 16; - f += char2unsigned(s[9]) << 16; - case 9: - c += char2unsigned(s[8]) << 8; - f += char2unsigned(s[8]) << 8; - case 8: - b += Google1At(s + 4); - a += word32AtOffset0; - e += Google1At(s + 4); - d += word32AtOffset0; - break; - case 7: - b += char2unsigned(s[6]) << 16; - e += char2unsigned(s[6]) << 16; - case 6: - b += char2unsigned(s[5]) << 8; - e += char2unsigned(s[5]) << 8; - case 5: - b += char2unsigned(s[4]); - e += char2unsigned(s[4]); - case 4: - a += word32AtOffset0; - d += word32AtOffset0; - } - } - } else { - if (keylen >= 3 * sizeof(a)) { - a += Google1At(s); - d += Google1At(s); - b += Google1At(s + sizeof(a)); - e += Google1At(s + sizeof(a)); - c += Google1At(s + sizeof(a) * 2); - f += Google1At(s + sizeof(a) * 2); - s += 3 * sizeof(a); - mix(a, b, c); - mix(d, e, f); - keylen -= 3 * static_cast(sizeof(a)); - } - c += len; - f += len; - switch (keylen) { // deal with rest. Cases fall through - case 11: - c += char2unsigned(s[10]) << 24; - f += char2unsigned(s[10]) << 24; - case 10: - c += char2unsigned(s[9]) << 16; - f += char2unsigned(s[9]) << 16; - case 9: - c += char2unsigned(s[8]) << 8; - f += char2unsigned(s[8]) << 8; - case 8: - b += Google1At(s + 4); - a += Google1At(s); - e += Google1At(s + 4); - d += Google1At(s); - break; - case 7: - b += char2unsigned(s[6]) << 16; - e += char2unsigned(s[6]) << 16; - case 6: - b += char2unsigned(s[5]) << 8; - e += char2unsigned(s[5]) << 8; - case 5: - b += char2unsigned(s[4]); - e += char2unsigned(s[4]); - case 4: - a += Google1At(s); - d += Google1At(s); - break; - case 3: - a += char2unsigned(s[2]) << 16; - d += char2unsigned(s[2]) << 16; - case 2: - a += char2unsigned(s[1]) << 8; - d += char2unsigned(s[1]) << 8; - case 1: - a += char2unsigned(s[0]); - d += char2unsigned(s[0]); - } - } - mix(a, b, c); - mix(d, e, f); - return CombineFingerprintHalves(c, f); -} diff --git a/be/src/gutil/hash/hash.h b/be/src/gutil/hash/hash.h deleted file mode 100644 index 18eedceeb94402..00000000000000 --- a/be/src/gutil/hash/hash.h +++ /dev/null @@ -1,276 +0,0 @@ -// -// Copyright (C) 1999 and onwards Google, Inc. -// -// -// This file contains routines for hashing and fingerprinting. -// -// A hash function takes an arbitrary input bitstring (string, char*, -// number) and turns it into a hash value (a fixed-size number) such -// that unequal input values have a high likelihood of generating -// unequal hash values. A fingerprint is a hash whose design is -// biased towards avoiding hash collisions, possibly at the expense of -// other characteristics such as execution speed. -// -// In general, if you are only using the hash values inside a single -// executable -- you're not writing the values to disk, and you don't -// depend on another instance of your program, running on another -// machine, generating the same hash values as you -- you want to use -// a HASH. Otherwise, you want to use a FINGERPRINT. -// -// RECOMMENDED HASH FOR STRINGS: GoodFastHash -// -// It is a functor, so you can use it like this: -// hash_map > -// hash_set > -// -// RECOMMENDED HASH FOR NUMBERS: hash<> -// -// Note that this is likely the identity hash, so if your -// numbers are "non-random" (especially in the low bits), another -// choice is better. You can use it like this: -// hash_map -// hash_set -// -// RECOMMENDED HASH FOR POINTERS: hash<> -// -// This is also likely the identity hash. -// -// RECOMMENDED HASH FOR STRUCTS: hash -// -// Take a fingerprint of the struct, and use that as the key. -// For instance: const uint64 hash_data[] = { s.foo, bit_cast(s.bar) }; -// uint64 fprint = (reinterpret_cast(hash_data), -// sizeof(hash_data)); -// hash_map[fprint] = whatever; -// -// RECOMMENDED FINGERPRINT: Fingerprint2011 -// -// (In util/hash/fingerprint2011.h) -// In particular, do *not* use Fingerprint in new code; it has -// problems with excess collisions. -// -// OTHER HASHES AND FINGERPRINTS: -// -// -// The wiki page also has good advice for when to use a fingerprint vs -// a hash. -// -// -// Note: if your file declares hash_map or -// hash_set, it will use the default hash function, -// hash. This is not a great choice. Always provide an -// explicit functor, such as GoodFastHash, as a template argument. -// (Either way, you will need to #include this file to get the -// necessary definition.) -// -// Some of the hash functions below are documented to be fixed -// forever; the rest (whether they're documented as so or not) may -// change over time. If you require a hash function that does not -// change over time, you should have unittests enforcing this -// property. We already have several such functions; see -// hash_unittest.cc for the details and unittests. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "gutil/hash/hash128to64.h" -#include "gutil/hash/jenkins.h" -#include "gutil/hash/jenkins_lookup2.h" -#include "gutil/hash/legacy_hash.h" -#include "gutil/hash/string_hash.h" -#include "gutil/int128.h" -#include "gutil/integral_types.h" -#include "gutil/hash/builtin_type_hash.h" - -// ---------------------------------------------------------------------- -// Fingerprint() -// Not recommended for new code. Instead, use Fingerprint2011(), -// a higher-quality and faster hash function. See fingerprint2011.h. -// -// Fingerprinting a string (or char*) will never return 0 or 1, -// in case you want a couple of special values. However, -// fingerprinting a numeric type may produce 0 or 1. -// -// The hash mapping of Fingerprint() will never change. -// -// Note: AVOID USING FINGERPRINT if at all possible. Use -// Fingerprint2011 (in fingerprint2011.h) instead. -// Fingerprint() is susceptible to collisions for even short -// strings with low edit distance; see -// Example collisions: -// "01056/02" vs. "11057/02" -// "LTA 02" vs. "MTA 12" -// The same study found only one collision each for CityHash64() and -// MurmurHash64(), from more than 2^32 inputs, and on medium-length -// strings with large edit distances.These issues, among others, -// led to the recommendation that new code should avoid Fingerprint(). -// ---------------------------------------------------------------------- -extern uint64 FingerprintReferenceImplementation(const char* s, uint32 len); -extern uint64 FingerprintInterleavedImplementation(const char* s, uint32 len); -inline uint64 Fingerprint(const char* s, uint32 len) { - if constexpr (sizeof(s) == 8) { // 64-bit systems have 8-byte pointers. - // The better choice when we have a decent number of registers. - return FingerprintInterleavedImplementation(s, len); - } else { - return FingerprintReferenceImplementation(s, len); - } -} - -// Routine that combines the hi/lo part of a fingerprint -// and changes the result appropriately to avoid returning 0/1. -inline uint64 CombineFingerprintHalves(uint32 hi, uint32 lo) { - uint64 result = (static_cast(hi) << 32) | static_cast(lo); - if ((hi == 0) && (lo < 2)) { - result ^= GG_ULONGLONG(0x130f9bef94a0a928); - } - return result; -} - -inline uint64 Fingerprint(const std::string& s) { - return Fingerprint(s.data(), static_cast(s.size())); -} -inline uint64 Hash64StringWithSeed(const std::string& s, uint64 c) { - return Hash64StringWithSeed(s.data(), static_cast(s.size()), c); -} -inline uint64 Fingerprint(schar c) { - return Hash64NumWithSeed(static_cast(c), MIX64); -} -inline uint64 Fingerprint(char c) { - return Hash64NumWithSeed(static_cast(c), MIX64); -} -inline uint64 Fingerprint(uint16 c) { - return Hash64NumWithSeed(static_cast(c), MIX64); -} -inline uint64 Fingerprint(int16 c) { - return Hash64NumWithSeed(static_cast(c), MIX64); -} -inline uint64 Fingerprint(uint32 c) { - return Hash64NumWithSeed(static_cast(c), MIX64); -} -inline uint64 Fingerprint(int32 c) { - return Hash64NumWithSeed(static_cast(c), MIX64); -} -inline uint64 Fingerprint(uint64 c) { - return Hash64NumWithSeed(static_cast(c), MIX64); -} -inline uint64 Fingerprint(int64 c) { - return Hash64NumWithSeed(static_cast(c), MIX64); -} - -// This concatenates two 64-bit fingerprints. It is a convenience function to -// get a fingerprint for a combination of already fingerprinted components. -// It assumes that each input is already a good fingerprint itself. -// Note that this is legacy code and new code should use its replacement -// FingerprintCat2011(). -// -// Note that in general it's impossible to construct Fingerprint(str) -// from the fingerprints of substrings of str. One shouldn't expect -// FingerprintCat(Fingerprint(x), Fingerprint(y)) to indicate -// anything about Fingerprint(StrCat(x, y)). -inline uint64 FingerprintCat(uint64 fp1, uint64 fp2) { - return Hash64NumWithSeed(fp1, fp2); -} - -// This intended to be a "good" hash function. It may change from time to time. -template <> -struct std::hash { - size_t operator()(const uint128& x) const { - if (sizeof(&x) == 8) { // 64-bit systems have 8-byte pointers. - return Hash128to64(x); - } else { - uint32 a = static_cast(Uint128Low64(x)) + static_cast(0x9e3779b9UL); - uint32 b = - static_cast(Uint128Low64(x) >> 32) + static_cast(0x9e3779b9UL); - uint32 c = static_cast(Uint128High64(x)) + MIX32; - mix(a, b, c); - a += static_cast(Uint128High64(x) >> 32); - mix(a, b, c); - return c; - } - } - static const size_t min_buckets = 8; // 4 and 8 are defaults. -}; - -// Hasher for STL pairs. Requires hashers for both members to be defined -template -struct std::hash > { - size_t operator()(const pair& p) const { - size_t h1 = std::hash()(p.first); - size_t h2 = std::hash()(p.second); - // The decision below is at compile time - return (sizeof(h1) <= sizeof(uint32)) ? Hash32NumWithSeed(h1, h2) - : Hash64NumWithSeed(h1, h2); - } - static const size_t min_buckets = 8; // 4 and 8 are defaults. -}; - -// If you want an excellent string hash function, and you don't mind if it -// might change when you sync and recompile, please use GoodFastHash<>. -// For most applications, GoodFastHash<> is a good choice, better than -// hash or hash or similar. GoodFastHash<> can change -// from time to time and may differ across platforms, and we'll strive -// to keep improving it. -// -// By the way, when deleting the contents of a hash_set of pointers, it is -// unsafe to delete *iterator because the hash function may be called on -// the next iterator advance. Use STLDeleteContainerPointers(). - -template -struct GoodFastHash; - -// This intended to be a "good" hash function. It may change from time to time. -template <> -struct GoodFastHash { - size_t operator()(const char* s) const { return HashStringThoroughly(s, strlen(s)); } - // Less than operator for MSVC. - bool operator()(const char* a, const char* b) const { return strcmp(a, b) < 0; } - static const size_t bucket_size = 4; // These are required by MSVC - static const size_t min_buckets = 8; // 4 and 8 are defaults. -}; - -// This intended to be a "good" hash function. It may change from time to time. -template <> -struct GoodFastHash { - size_t operator()(const char* s) const { return HashStringThoroughly(s, strlen(s)); } - // Less than operator for MSVC. - bool operator()(const char* a, const char* b) const { return strcmp(a, b) < 0; } - static const size_t bucket_size = 4; // These are required by MSVC - static const size_t min_buckets = 8; // 4 and 8 are defaults. -}; - -// This intended to be a "good" hash function. It may change from time to time. -template -struct GoodFastHash > { - size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const { - return HashStringThoroughly(k.data(), k.length() * sizeof(k[0])); - } - // Less than operator for MSVC. - bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a, - const std::basic_string<_CharT, _Traits, _Alloc>& b) const { - return a < b; - } - static const size_t bucket_size = 4; // These are required by MSVC - static const size_t min_buckets = 8; // 4 and 8 are defaults. -}; - -// This intended to be a "good" hash function. It may change from time to time. -template -struct GoodFastHash > { - size_t operator()(const std::basic_string<_CharT, _Traits, _Alloc>& k) const { - return HashStringThoroughly(k.data(), k.length() * sizeof(k[0])); - } - // Less than operator for MSVC. - bool operator()(const std::basic_string<_CharT, _Traits, _Alloc>& a, - const std::basic_string<_CharT, _Traits, _Alloc>& b) const { - return a < b; - } - static const size_t bucket_size = 4; // These are required by MSVC - static const size_t min_buckets = 8; // 4 and 8 are defaults. -}; diff --git a/be/src/gutil/hash/hash128to64.h b/be/src/gutil/hash/hash128to64.h deleted file mode 100644 index 1bf870a6d691de..00000000000000 --- a/be/src/gutil/hash/hash128to64.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// Authors: jyrki@google.com (Jyrki Alakuijala), gpike@google.com (Geoff Pike) - -#pragma once - -#include "gutil/int128.h" -#include "gutil/integral_types.h" - -// Hash 128 input bits down to 64 bits of output. -// This is intended to be a reasonably good hash function. -// It may change from time to time. -inline uint64 Hash128to64(const uint128& x) { - // Murmur-inspired hashing. - const uint64 kMul = 0xc6a4a7935bd1e995ULL; - uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul; - a ^= (a >> 47); - uint64 b = (Uint128High64(x) ^ a) * kMul; - b ^= (b >> 47); - b *= kMul; - return b; -} diff --git a/be/src/gutil/hash/legacy_hash.h b/be/src/gutil/hash/legacy_hash.h deleted file mode 100644 index 9dff097031acc7..00000000000000 --- a/be/src/gutil/hash/legacy_hash.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// This is a library of legacy hashing routines. These routines are still in -// use, but are not encouraged for any new code, and may be removed at some -// point in the future. -// -// New code should use one of the targeted libraries that provide hash -// interfaces for the types needed. See //util/hash/README for details. - -#pragma once - -#include "gutil/hash/builtin_type_hash.h" -#include "gutil/hash/string_hash.h" -#include "gutil/integral_types.h" - -// Hash8, Hash16 and Hash32 are for legacy use only. -typedef uint32 Hash32; -typedef uint16 Hash16; -typedef uint8 Hash8; - -const Hash32 kIllegalHash32 = static_cast(0xffffffffUL); -const Hash16 kIllegalHash16 = static_cast(0xffff); - -static const uint32 MIX32 = 0x12b9b0a1UL; // pi; an arbitrary number -static const uint64 MIX64 = GG_ULONGLONG(0x2b992ddfa23249d6); // more of pi - -// ---------------------------------------------------------------------- -// HashTo32() -// HashTo16() -// These functions take various types of input (through operator -// overloading) and return 32 or 16 bit quantities, respectively. -// The basic rule of our hashing is: always mix(). Thus, even for -// char outputs we cast to a uint32 and mix with two arbitrary numbers. -// HashTo32 never returns kIllegalHash32, and similarity, -// HashTo16 never returns kIllegalHash16. -// -// Note that these methods avoid returning certain reserved values, while -// the corresponding HashXXStringWithSeed() methods may return any value. -// ---------------------------------------------------------------------- - -// This macro defines the HashTo32 and HashTo16 versions all in one go. -// It takes the argument list and a command that hashes your number. -// (For 16 we just mod retval before returning it.) Example: -// HASH_TO((char c), Hash32NumWithSeed(c, MIX32_1)) -// evaluates to -// uint32 retval; -// retval = Hash32NumWithSeed(c, MIX32_1); -// return retval == kIllegalHash32 ? retval-1 : retval; -// - -#define HASH_TO(arglist, command) \ - inline uint32 HashTo32 arglist { \ - uint32 retval = command; \ - return retval == kIllegalHash32 ? retval - 1 : retval; \ - } - -// This defines: -// HashToXX(char *s, int slen); -// HashToXX(char c); -// etc - -HASH_TO((const char* s, uint32 slen), Hash32StringWithSeed(s, slen, MIX32)) -HASH_TO((const wchar_t* s, uint32 slen), - Hash32StringWithSeed(reinterpret_cast(s), - static_cast(sizeof(wchar_t) * slen), MIX32)) -HASH_TO((char c), Hash32NumWithSeed(static_cast(c), MIX32)) -HASH_TO((schar c), Hash32NumWithSeed(static_cast(c), MIX32)) -HASH_TO((uint16 c), Hash32NumWithSeed(static_cast(c), MIX32)) -HASH_TO((int16 c), Hash32NumWithSeed(static_cast(c), MIX32)) -HASH_TO((uint32 c), Hash32NumWithSeed(static_cast(c), MIX32)) -HASH_TO((int32 c), Hash32NumWithSeed(static_cast(c), MIX32)) -HASH_TO((uint64 c), static_cast(Hash64NumWithSeed(c, MIX64) >> 32)) -HASH_TO((int64 c), static_cast(Hash64NumWithSeed(c, MIX64) >> 32)) - -#undef HASH_TO // clean up the macro space - -inline uint16 HashTo16(const char* s, uint32 slen) { - uint16 retval = Hash32StringWithSeed(s, slen, MIX32) >> 16; - return retval == kIllegalHash16 ? static_cast(retval - 1) : retval; -} diff --git a/be/src/gutil/hash/string_hash.h b/be/src/gutil/hash/string_hash.h deleted file mode 100644 index 70f78d061728ce..00000000000000 --- a/be/src/gutil/hash/string_hash.h +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// -// These are the core hashing routines which operate on strings. We define -// strings loosely as a sequence of bytes, and these routines are designed to -// work with the most fundamental representations of a string of bytes. -// -// These routines provide "good" hash functions in terms of both quality and -// speed. Their values can and will change as their implementations change and -// evolve. - -#pragma once - -#include - -#include "gutil/hash/city.h" -#include "gutil/hash/jenkins.h" -#include "gutil/hash/jenkins_lookup2.h" -#include "gutil/integral_types.h" -#include "gutil/port.h" - -namespace hash_internal { - -// We have some special cases for 64-bit hardware and x86-64 in particular. -// Instead of sprinkling ifdefs through the file, we have one ugly ifdef here. -// Later code can then use "if" instead of "ifdef". -#if defined(__x86_64__) -enum { x86_64 = true, sixty_four_bit = true }; -#elif defined(_LP64) -enum { x86_64 = false, sixty_four_bit = true }; -#else -enum { x86_64 = false, sixty_four_bit = false }; -#endif - -// Arbitrary mix constants (pi). -static const uint32 kMix32 = 0x12b9b0a1UL; -static const uint64 kMix64 = GG_ULONGLONG(0x2b992ddfa23249d6); - -} // namespace hash_internal - -inline size_t HashStringThoroughlyWithSeed(const char* s, size_t len, size_t seed) { - if (hash_internal::x86_64) - return static_cast(util_hash::CityHash64WithSeed(s, len, seed)); - - if (hash_internal::sixty_four_bit) - return Hash64StringWithSeed(s, static_cast(len), seed); - - return static_cast( - Hash32StringWithSeed(s, static_cast(len), static_cast(seed))); -} - -inline size_t HashStringThoroughly(const char* s, size_t len) { - if (hash_internal::x86_64) return static_cast(util_hash::CityHash64(s, len)); - - if (hash_internal::sixty_four_bit) - return Hash64StringWithSeed(s, static_cast(len), hash_internal::kMix64); - - return static_cast( - Hash32StringWithSeed(s, static_cast(len), hash_internal::kMix32)); -} - -inline size_t HashStringThoroughlyWithSeeds(const char* s, size_t len, size_t seed0, size_t seed1) { - if (hash_internal::x86_64) return util_hash::CityHash64WithSeeds(s, len, seed0, seed1); - - if (hash_internal::sixty_four_bit) { - uint64 a = seed0; - uint64 b = seed1; - uint64 c = HashStringThoroughly(s, len); - mix(a, b, c); - return c; - } - - uint32 a = static_cast(seed0); - uint32 b = static_cast(seed1); - uint32 c = static_cast(HashStringThoroughly(s, len)); - mix(a, b, c); - return c; -} diff --git a/be/src/gutil/int128.cc b/be/src/gutil/int128.cc deleted file mode 100644 index cd2964a08cab4e..00000000000000 --- a/be/src/gutil/int128.cc +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2004 Google Inc. -// All Rights Reserved. -// -// - -#include -using std::cout; -using std::endl; -#include "gutil/int128.h" -#include "gutil/integral_types.h" - -const uint128_pod kuint128max = {static_cast(GG_LONGLONG(0xFFFFFFFFFFFFFFFF)), - static_cast(GG_LONGLONG(0xFFFFFFFFFFFFFFFF))}; - -std::ostream& operator<<(std::ostream& o, const uint128& b) { - return (o << b.hi_ << "::" << b.lo_); -} diff --git a/be/src/gutil/int128.h b/be/src/gutil/int128.h deleted file mode 100644 index 62f15813255f34..00000000000000 --- a/be/src/gutil/int128.h +++ /dev/null @@ -1,328 +0,0 @@ -// Copyright 2004 Google Inc. -// All Rights Reserved. -// - -#pragma once - -#include -using std::ostream; -#include "gutil/integral_types.h" - -struct uint128_pod; - -// An unsigned 128-bit integer type. Thread-compatible. -class uint128 { -public: - uint128(); // Sets to 0, but don't trust on this behavior. - uint128(uint64 top, uint64 bottom); -#ifndef SWIG - uint128(int bottom); - uint128(uint32 bottom); // Top 96 bits = 0 -#endif - uint128(uint64 bottom); // hi_ = 0 - uint128(const uint128& val); - uint128(const uint128_pod& val); - - void Initialize(uint64 top, uint64 bottom); - - uint128& operator=(const uint128& b); - - // Arithmetic operators. - // TODO: division, etc. - uint128& operator+=(const uint128& b); - uint128& operator-=(const uint128& b); - uint128& operator*=(const uint128& b); - uint128 operator++(int); - uint128 operator--(int); - uint128& operator<<=(int); - uint128& operator>>=(int); - uint128& operator&=(const uint128& b); - uint128& operator|=(const uint128& b); - uint128& operator^=(const uint128& b); - uint128& operator++(); - uint128& operator--(); - - friend uint64 Uint128Low64(const uint128& v); - friend uint64 Uint128High64(const uint128& v); - - // We add "std::" to avoid including all of port.h. - friend std::ostream& operator<<(std::ostream& o, const uint128& b); - -private: - // Little-endian memory order optimizations can benefit from - // having lo_ first, hi_ last. - // See util/endian/endian.h and Load128/Store128 for storing a uint128. - uint64 lo_; - uint64 hi_; - - // Not implemented, just declared for catching automatic type conversions. - uint128(uint8); - uint128(uint16); - uint128(float v); - uint128(double v); -}; - -// This is a POD form of uint128 which can be used for static variables which -// need to be operated on as uint128. -struct uint128_pod { - // Note: The ordering of fields is different than 'class uint128' but the - // same as its 2-arg constructor. This enables more obvious initialization - // of static instances, which is the primary reason for this struct in the - // first place. This does not seem to defeat any optimizations wrt - // operations involving this struct. - uint64 hi; - uint64 lo; -}; - -extern const uint128_pod kuint128max; - -// allow uint128 to be logged -extern std::ostream& operator<<(std::ostream& o, const uint128& b); - -// Methods to access low and high pieces of 128-bit value. -// Defined externally from uint128 to facilitate conversion -// to native 128-bit types when compilers support them. -inline uint64 Uint128Low64(const uint128& v) { - return v.lo_; -} -inline uint64 Uint128High64(const uint128& v) { - return v.hi_; -} - -// TODO: perhaps it would be nice to have int128, a signed 128-bit type? - -// -------------------------------------------------------------------------- -// Implementation details follow -// -------------------------------------------------------------------------- -inline bool operator==(const uint128& lhs, const uint128& rhs) { - return (Uint128Low64(lhs) == Uint128Low64(rhs) && Uint128High64(lhs) == Uint128High64(rhs)); -} -inline bool operator!=(const uint128& lhs, const uint128& rhs) { - return !(lhs == rhs); -} -inline uint128& uint128::operator=(const uint128& b) { - lo_ = b.lo_; - hi_ = b.hi_; - return *this; -} - -inline uint128::uint128() : lo_(0), hi_(0) {} -inline uint128::uint128(uint64 top, uint64 bottom) : lo_(bottom), hi_(top) {} -inline uint128::uint128(const uint128& v) : lo_(v.lo_), hi_(v.hi_) {} -inline uint128::uint128(const uint128_pod& v) : lo_(v.lo), hi_(v.hi) {} -inline uint128::uint128(uint64 bottom) : lo_(bottom), hi_(0) {} -#ifndef SWIG -inline uint128::uint128(uint32 bottom) : lo_(bottom), hi_(0) {} -inline uint128::uint128(int bottom) : lo_(bottom), hi_(0) { - if (bottom < 0) { - --hi_; - } -} -#endif -inline void uint128::Initialize(uint64 top, uint64 bottom) { - hi_ = top; - lo_ = bottom; -} - -// Comparison operators. - -#define CMP128(op) \ - inline bool operator op(const uint128& lhs, const uint128& rhs) { \ - return (Uint128High64(lhs) == Uint128High64(rhs)) \ - ? (Uint128Low64(lhs) op Uint128Low64(rhs)) \ - : (Uint128High64(lhs) op Uint128High64(rhs)); \ - } - -CMP128(<) -CMP128(>) -CMP128(>=) -CMP128(<=) - -#undef CMP128 - -// Unary operators - -inline uint128 operator-(const uint128& val) { - const uint64 hi_flip = ~Uint128High64(val); - const uint64 lo_flip = ~Uint128Low64(val); - const uint64 lo_add = lo_flip + 1; - if (lo_add < lo_flip) { - return uint128(hi_flip + 1, lo_add); - } - return uint128(hi_flip, lo_add); -} - -inline bool operator!(const uint128& val) { - return !Uint128High64(val) && !Uint128Low64(val); -} - -// Logical operators. - -inline uint128 operator~(const uint128& val) { - return uint128(~Uint128High64(val), ~Uint128Low64(val)); -} - -#define LOGIC128(op) \ - inline uint128 operator op(const uint128& lhs, const uint128& rhs) { \ - return uint128(Uint128High64(lhs) op Uint128High64(rhs), \ - Uint128Low64(lhs) op Uint128Low64(rhs)); \ - } - -LOGIC128(|) -LOGIC128(&) -LOGIC128(^) - -#undef LOGIC128 - -#define LOGICASSIGN128(op) \ - inline uint128& uint128::operator op(const uint128& other) { \ - hi_ op other.hi_; \ - lo_ op other.lo_; \ - return *this; \ - } - -LOGICASSIGN128(|=) -LOGICASSIGN128(&=) -LOGICASSIGN128(^=) - -#undef LOGICASSIGN128 - -// Shift operators. - -inline uint128 operator<<(const uint128& val, int amount) { - // uint64 shifts of >= 64 are undefined, so we will need some special-casing. - if (amount < 64) { - if (amount == 0) { - return val; - } - uint64 new_hi = (Uint128High64(val) << amount) | (Uint128Low64(val) >> (64 - amount)); - uint64 new_lo = Uint128Low64(val) << amount; - return uint128(new_hi, new_lo); - } else if (amount < 128) { - return uint128(Uint128Low64(val) << (amount - 64), 0); - } else { - return uint128(0, 0); - } -} - -inline uint128 operator>>(const uint128& val, int amount) { - // uint64 shifts of >= 64 are undefined, so we will need some special-casing. - if (amount < 64) { - if (amount == 0) { - return val; - } - uint64 new_hi = Uint128High64(val) >> amount; - uint64 new_lo = (Uint128Low64(val) >> amount) | (Uint128High64(val) << (64 - amount)); - return uint128(new_hi, new_lo); - } else if (amount < 128) { - return uint128(0, Uint128High64(val) >> (amount - 64)); - } else { - return uint128(0, 0); - } -} - -inline uint128& uint128::operator<<=(int amount) { - // uint64 shifts of >= 64 are undefined, so we will need some special-casing. - if (amount < 64) { - if (amount != 0) { - hi_ = (hi_ << amount) | (lo_ >> (64 - amount)); - lo_ = lo_ << amount; - } - } else if (amount < 128) { - hi_ = lo_ << (amount - 64); - lo_ = 0; - } else { - hi_ = 0; - lo_ = 0; - } - return *this; -} - -inline uint128& uint128::operator>>=(int amount) { - // uint64 shifts of >= 64 are undefined, so we will need some special-casing. - if (amount < 64) { - if (amount != 0) { - lo_ = (lo_ >> amount) | (hi_ << (64 - amount)); - hi_ = hi_ >> amount; - } - } else if (amount < 128) { - hi_ = 0; - lo_ = hi_ >> (amount - 64); - } else { - hi_ = 0; - lo_ = 0; - } - return *this; -} - -inline uint128 operator+(const uint128& lhs, const uint128& rhs) { - return uint128(lhs) += rhs; -} - -inline uint128 operator-(const uint128& lhs, const uint128& rhs) { - return uint128(lhs) -= rhs; -} - -inline uint128 operator*(const uint128& lhs, const uint128& rhs) { - return uint128(lhs) *= rhs; -} - -inline uint128& uint128::operator+=(const uint128& b) { - hi_ += b.hi_; - uint64 lolo = lo_ + b.lo_; - if (lolo < lo_) ++hi_; - lo_ = lolo; - return *this; -} - -inline uint128& uint128::operator-=(const uint128& b) { - hi_ -= b.hi_; - if (b.lo_ > lo_) --hi_; - lo_ -= b.lo_; - return *this; -} - -inline uint128& uint128::operator*=(const uint128& b) { - uint64 a96 = hi_ >> 32; - uint64 a64 = hi_ & 0xffffffffu; - uint64 a32 = lo_ >> 32; - uint64 a00 = lo_ & 0xffffffffu; - uint64 b96 = b.hi_ >> 32; - uint64 b64 = b.hi_ & 0xffffffffu; - uint64 b32 = b.lo_ >> 32; - uint64 b00 = b.lo_ & 0xffffffffu; - // multiply [a96 .. a00] x [b96 .. b00] - // terms higher than c96 disappear off the high side - // terms c96 and c64 are safe to ignore carry bit - uint64 c96 = a96 * b00 + a64 * b32 + a32 * b64 + a00 * b96; - uint64 c64 = a64 * b00 + a32 * b32 + a00 * b64; - this->hi_ = (c96 << 32) + c64; - this->lo_ = 0; - // add terms after this one at a time to capture carry - *this += uint128(a32 * b00) << 32; - *this += uint128(a00 * b32) << 32; - *this += a00 * b00; - return *this; -} - -inline uint128 uint128::operator++(int) { - uint128 tmp(*this); - *this += 1; - return tmp; -} - -inline uint128 uint128::operator--(int) { - uint128 tmp(*this); - *this -= 1; - return tmp; -} - -inline uint128& uint128::operator++() { - *this += 1; - return *this; -} - -inline uint128& uint128::operator--() { - *this -= 1; - return *this; -} diff --git a/be/src/gutil/strings/numbers.cc b/be/src/gutil/strings/numbers.cc index d6b7295f413d89..f471bf31bd08bb 100644 --- a/be/src/gutil/strings/numbers.cc +++ b/be/src/gutil/strings/numbers.cc @@ -30,7 +30,6 @@ using std::string; #include "common/logging.h" #include "gutil/gscoped_ptr.h" -#include "gutil/int128.h" #include "gutil/integral_types.h" #include "gutil/stringprintf.h" #include "gutil/strings/ascii_ctype.h" @@ -468,15 +467,6 @@ string Uint64ToString(uint64 fp) { snprintf(buf, sizeof(buf), "%016" PRIx64, fp); return string(buf); } - -// Default arguments -string Uint128ToHexString(uint128 ui128) { - char buf[33]; - snprintf(buf, sizeof(buf), "%016" PRIx64, Uint128High64(ui128)); - snprintf(buf + 16, sizeof(buf) - 16, "%016" PRIx64, Uint128Low64(ui128)); - return string(buf); -} - namespace { // Represents integer values of digits. diff --git a/be/src/gutil/strings/numbers.h b/be/src/gutil/strings/numbers.h index 291634133c4e0e..64f3d86daaa343 100644 --- a/be/src/gutil/strings/numbers.h +++ b/be/src/gutil/strings/numbers.h @@ -21,7 +21,6 @@ using std::string; using std::vector; -#include "gutil/int128.h" #include "gutil/integral_types.h" // IWYU pragma: no_include #include "gutil/macros.h" // IWYU pragma: keep @@ -35,9 +34,6 @@ using std::vector; // Convert a fingerprint to 16 hex digits. string Uint64ToString(uint64 fp); -// Formats a uint128 as a 32-digit hex string. -string Uint128ToHexString(uint128 ui128); - // Convert strings to numeric values, with strict error checking. // Leading and trailing spaces are allowed. // Negative inputs are not allowed for unsigned ints (unlike strtoul). diff --git a/be/src/gutil/strings/stringpiece.cc b/be/src/gutil/strings/stringpiece.cc index d4f2b554f01381..c3d59fb994a330 100644 --- a/be/src/gutil/strings/stringpiece.cc +++ b/be/src/gutil/strings/stringpiece.cc @@ -16,7 +16,6 @@ #include "gutil/stl_util.h" #include "gutil/strings/memutil.h" -#include "gutil/hash/legacy_hash.h" using std::copy; using std::max; @@ -26,10 +25,6 @@ using std::sort; using std::swap; using std::string; -size_t std::hash::operator()(StringPiece s) const { - return HashTo32(s.data(), s.size()); -} - std::ostream& operator<<(std::ostream& o, StringPiece piece) { o.write(piece.data(), piece.size()); return o; diff --git a/be/src/gutil/strings/stringpiece.h b/be/src/gutil/strings/stringpiece.h index 0ee2bcc47eb2ed..38e36a27099279 100644 --- a/be/src/gutil/strings/stringpiece.h +++ b/be/src/gutil/strings/stringpiece.h @@ -122,9 +122,6 @@ #include #include -#include "gutil/hash/string_hash.h" -#include "gutil/int128.h" - class StringPiece { private: const char* ptr_ = nullptr; @@ -320,35 +317,6 @@ inline bool operator<=(StringPiece x, StringPiece y) { inline bool operator>=(StringPiece x, StringPiece y) { return !(x < y); } -template -struct GoodFastHash; - -// ------------------------------------------------------------------ -// Functions used to create STL containers that use StringPiece -// Remember that a StringPiece's lifetime had better be less than -// that of the underlying string or char*. If it is not, then you -// cannot safely store a StringPiece into an STL container -// ------------------------------------------------------------------ - -// SWIG doesn't know how to parse this stuff properly. Omit it. -#ifndef SWIG - -template <> -struct std::hash { - size_t operator()(StringPiece s) const; -}; - -// An implementation of GoodFastHash for StringPiece. See -// GoodFastHash values. -template <> -struct GoodFastHash { - size_t operator()(StringPiece s) const { return HashStringThoroughly(s.data(), s.size()); } - // Less than operator, for MSVC. - bool operator()(const StringPiece& s1, const StringPiece& s2) const { return s1 < s2; } - static const size_t bucket_size = 4; // These are required by MSVC - static const size_t min_buckets = 8; // 4 and 8 are defaults. -}; -#endif // allow StringPiece to be logged -extern ostream& operator<<(ostream& o, StringPiece piece); +extern std::ostream& operator<<(std::ostream& o, StringPiece piece); diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index 21ee4dfc8b0583..e660a38e03f20d 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -31,7 +31,6 @@ inline static constexpr size_t DEFAULT_DISPOSABLE_PERCENT = 10; inline static constexpr size_t DEFAULT_INDEX_PERCENT = 5; using uint128_t = vectorized::UInt128; -using UInt128Hash = vectorized::UInt128Hash; enum class FileCacheType { INDEX, @@ -63,13 +62,15 @@ struct FileCacheAllocatorBuilder { }; struct KeyHash { - std::size_t operator()(const UInt128Wrapper& w) const { return UInt128Hash()(w.value_); } + std::size_t operator()(const UInt128Wrapper& w) const { + return util_hash::HashLen16(w.value_.low(), w.value_.high()); + } }; using AccessKeyAndOffset = std::pair; struct KeyAndOffsetHash { std::size_t operator()(const AccessKeyAndOffset& key) const { - return UInt128Hash()(key.first.value_) ^ std::hash()(key.second); + return KeyHash()(key.first) ^ std::hash()(key.second); } }; diff --git a/be/src/io/fs/hdfs_file_system.cpp b/be/src/io/fs/hdfs_file_system.cpp index 24976f11941e3a..0d7cea5091ae17 100644 --- a/be/src/io/fs/hdfs_file_system.cpp +++ b/be/src/io/fs/hdfs_file_system.cpp @@ -31,7 +31,6 @@ #include "common/config.h" #include "common/status.h" -#include "gutil/hash/hash.h" #include "gutil/integral_types.h" #include "io/fs/err_utils.h" #include "io/fs/hdfs_file_reader.h" diff --git a/be/src/io/hdfs_util.cpp b/be/src/io/hdfs_util.cpp index 5e5998e9ea77b1..6c1bbf80a1526f 100644 --- a/be/src/io/hdfs_util.cpp +++ b/be/src/io/hdfs_util.cpp @@ -23,9 +23,9 @@ #include #include "common/logging.h" -#include "gutil/hash/hash.h" #include "io/fs/err_utils.h" #include "io/hdfs_builder.h" +#include "vec/common/string_ref.h" namespace doris::io { namespace { @@ -41,24 +41,24 @@ Status create_hdfs_fs(const THdfsParams& hdfs_params, const std::string& fs_name return Status::OK(); } -uint64 hdfs_hash_code(const THdfsParams& hdfs_params, const std::string& fs_name) { - uint64 hash_code = 0; +uint64_t hdfs_hash_code(const THdfsParams& hdfs_params, const std::string& fs_name) { + uint64_t hash_code = 0; // The specified fsname is used first. // If there is no specified fsname, the default fsname is used if (!fs_name.empty()) { - hash_code ^= Fingerprint(fs_name); + hash_code ^= crc32_hash(fs_name); } else if (hdfs_params.__isset.fs_name) { - hash_code ^= Fingerprint(hdfs_params.fs_name); + hash_code ^= crc32_hash(hdfs_params.fs_name); } if (hdfs_params.__isset.user) { - hash_code ^= Fingerprint(hdfs_params.user); + hash_code ^= crc32_hash(hdfs_params.user); } if (hdfs_params.__isset.hdfs_kerberos_principal) { - hash_code ^= Fingerprint(hdfs_params.hdfs_kerberos_principal); + hash_code ^= crc32_hash(hdfs_params.hdfs_kerberos_principal); } if (hdfs_params.__isset.hdfs_kerberos_keytab) { - hash_code ^= Fingerprint(hdfs_params.hdfs_kerberos_keytab); + hash_code ^= crc32_hash(hdfs_params.hdfs_kerberos_keytab); } if (hdfs_params.__isset.hdfs_conf) { std::map conf_map; @@ -66,8 +66,8 @@ uint64 hdfs_hash_code(const THdfsParams& hdfs_params, const std::string& fs_name conf_map[conf.key] = conf.value; } for (auto& conf : conf_map) { - hash_code ^= Fingerprint(conf.first); - hash_code ^= Fingerprint(conf.second); + hash_code ^= crc32_hash(conf.first); + hash_code ^= crc32_hash(conf.second); } } return hash_code; @@ -87,7 +87,7 @@ bvar::LatencyRecorder hdfs_hsync_latency("hdfs_hsync"); }; // namespace hdfs_bvar void HdfsHandlerCache::_clean_invalid() { - std::vector removed_handle; + std::vector removed_handle; for (auto& item : _cache) { if (item.second.use_count() == 1 && item.second->invalid()) { removed_handle.emplace_back(item.first); @@ -100,7 +100,7 @@ void HdfsHandlerCache::_clean_invalid() { void HdfsHandlerCache::_clean_oldest() { uint64_t oldest_time = ULONG_MAX; - uint64 oldest = 0; + uint64_t oldest = 0; for (auto& item : _cache) { if (item.second.use_count() == 1 && item.second->last_access_time() < oldest_time) { oldest_time = item.second->last_access_time(); @@ -112,7 +112,7 @@ void HdfsHandlerCache::_clean_oldest() { Status HdfsHandlerCache::get_connection(const THdfsParams& hdfs_params, const std::string& fs_name, std::shared_ptr* fs_handle) { - uint64 hash_code = hdfs_hash_code(hdfs_params, fs_name); + uint64_t hash_code = hdfs_hash_code(hdfs_params, fs_name); { std::lock_guard l(_lock); auto it = _cache.find(hash_code); diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h index 037b5504709bb4..b098d76b0c0396 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.h +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h @@ -25,7 +25,6 @@ #include #include "common/status.h" -#include "gutil/hash/string_hash.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/binary_plain_page.h" #include "olap/rowset/segment_v2/common.h" @@ -35,6 +34,7 @@ #include "util/faststring.h" #include "util/slice.h" #include "vec/common/arena.h" +#include "vec/common/string_ref.h" #include "vec/data_types/data_type.h" namespace doris { @@ -95,9 +95,7 @@ class BinaryDictPageBuilder : public PageBuilderHelper { EncodingTypePB _encoding_type; struct HashOfSlice { - size_t operator()(const Slice& slice) const { - return HashStringThoroughly(slice.data, slice.size); - } + size_t operator()(const Slice& slice) const { return crc32_hash(slice.data, slice.size); } }; // query for dict item -> dict id phmap::flat_hash_map _dictionary; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 74882a0e1a6a0d..3e80d2c6e618c2 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -49,13 +49,9 @@ struct BloomFilterTraits { using ValueDict = std::set; }; -struct Int128Comparator { - bool operator()(const int128_t& a, const int128_t& b) const { return a < b; } -}; - template <> struct BloomFilterTraits { - using ValueDict = std::set; + using ValueDict = std::set; }; // Builder for bloom filter. In doris, bloom filter index is used in diff --git a/be/src/olap/rowset/segment_v2/page_io.h b/be/src/olap/rowset/segment_v2/page_io.h index 31c81880dac650..736b3e521f6800 100644 --- a/be/src/olap/rowset/segment_v2/page_io.h +++ b/be/src/olap/rowset/segment_v2/page_io.h @@ -74,7 +74,7 @@ struct PageReadOptions { } }; -inline ostream& operator<<(ostream& os, const PageReadOptions& opt) { +inline std::ostream& operator<<(std::ostream& os, const PageReadOptions& opt) { return os << "PageReadOptions { verify_checksum=" << opt.verify_checksum << " use_page_cache=" << opt.use_page_cache << " kept_in_memory=" << opt.kept_in_memory << " pre_decode=" << opt.pre_decode diff --git a/be/src/olap/rowset/segment_v2/page_pointer.h b/be/src/olap/rowset/segment_v2/page_pointer.h index 32ef318aeba25b..1a5448d667e85a 100644 --- a/be/src/olap/rowset/segment_v2/page_pointer.h +++ b/be/src/olap/rowset/segment_v2/page_pointer.h @@ -73,7 +73,7 @@ struct PagePointer { bool operator!=(const PagePointer& other) const { return !(*this == other); } }; -inline ostream& operator<<(ostream& os, const PagePointer& pp) { +inline std::ostream& operator<<(std::ostream& os, const PagePointer& pp) { os << "PagePointer { offset=" << pp.offset << " size=" << pp.size << " }"; return os; } diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index 0ce2048e3be0a7..08bc150067eacd 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -32,7 +32,6 @@ #include "common/signal_handler.h" #include "common/status.h" #include "exec/schema_scanner/schema_metadata_name_ids_scanner.h" -#include "gutil/hash/hash.h" #include "gutil/integral_types.h" #include "gutil/strings/numbers.h" #include "io/fs/file_system.h" diff --git a/be/src/olap/single_replica_compaction.cpp b/be/src/olap/single_replica_compaction.cpp index 536761f33516a7..2b1726f1def33f 100644 --- a/be/src/olap/single_replica_compaction.cpp +++ b/be/src/olap/single_replica_compaction.cpp @@ -21,7 +21,6 @@ #include "gen_cpp/Types_constants.h" #include "gen_cpp/internal_service.pb.h" #include "gutil/strings/split.h" -#include "gutil/strings/stringpiece.h" #include "http/http_client.h" #include "io/fs/file_system.h" #include "io/fs/local_file_system.h" @@ -387,8 +386,7 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, // Avoid of data is not complete, we copy the header file at last. // The header file's name is end of .hdr. for (int i = 0; i < file_name_list.size() - 1; ++i) { - StringPiece sp(file_name_list[i]); - if (sp.ends_with(".hdr")) { + if (file_name_list[i].ends_with(".hdr")) { std::swap(file_name_list[i], file_name_list[file_name_list.size() - 1]); break; } diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index edfa55a89bea83..08ff77fdfa3e28 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -61,7 +61,6 @@ #include "common/signal_handler.h" #include "common/status.h" #include "gutil/ref_counted.h" -#include "gutil/strings/stringpiece.h" #include "gutil/strings/substitute.h" #include "io/fs/file_reader.h" #include "io/fs/file_reader_writer_fwd.h" diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index 30bff9cbf07adf..eabb1e162cd6d6 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -42,7 +42,6 @@ #include "common/config.h" #include "common/logging.h" #include "gutil/strings/split.h" -#include "gutil/strings/stringpiece.h" #include "gutil/strings/strip.h" #include "http/http_client.h" #include "io/fs/file_system.h" @@ -512,8 +511,7 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re // Avoid of data is not complete, we copy the header file at last. // The header file's name is end of .hdr. for (int i = 0; i < file_name_list.size() - 1; ++i) { - StringPiece sp(file_name_list[i]); - if (sp.ends_with(".hdr")) { + if (file_name_list[i].ends_with(".hdr")) { std::swap(file_name_list[i], file_name_list[file_name_list.size() - 1]); break; } diff --git a/be/src/service/point_query_executor.h b/be/src/service/point_query_executor.h index 37345d80f68f62..61b597c5da8159 100644 --- a/be/src/service/point_query_executor.h +++ b/be/src/service/point_query_executor.h @@ -39,7 +39,6 @@ #include "common/config.h" #include "common/logging.h" #include "common/status.h" -#include "gutil/int128.h" #include "olap/lru_cache.h" #include "olap/olap_common.h" #include "olap/rowset/rowset.h" diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index 8cb1bc2754a51c..dc70b1c9f9c40b 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -27,7 +27,7 @@ #include #include "common/compiler_util.h" // IWYU pragma: keep -#include "gutil/hash/hash.h" // IWYU pragma: keep +#include "gutil/hash/city.h" #include "runtime/define_primitive_type.h" #include "util/cpu_info.h" #include "util/murmur_hash3.h" @@ -382,16 +382,6 @@ struct std::hash { } }; -#if __GNUC__ < 6 && !defined(__clang__) -// Cause this is builtin function -template <> -struct std::hash<__int128> { - std::size_t operator()(const __int128& val) const { - return doris::HashUtil::hash(&val, sizeof(val), 0); - } -}; -#endif - template <> struct std::hash> { size_t operator()(const std::pair& pair) const { @@ -402,3 +392,12 @@ struct std::hash> { return seed; } }; + +template +struct std::hash> { + size_t operator()(const pair& p) const { + size_t h1 = std::hash()(p.first); + size_t h2 = std::hash()(p.second); + return util_hash::HashLen16(h1, h2); + } +}; \ No newline at end of file diff --git a/be/src/util/lru_multi_cache.h b/be/src/util/lru_multi_cache.h index e1ff32fbe56ec1..8c810a06ee266e 100644 --- a/be/src/util/lru_multi_cache.h +++ b/be/src/util/lru_multi_cache.h @@ -29,7 +29,6 @@ #include #include -#include "gutil/hash/hash.h" #include "gutil/macros.h" #include "util/spinlock.h" diff --git a/be/src/util/lru_multi_cache.inline.h b/be/src/util/lru_multi_cache.inline.h index 8af288bff6f691..87d098913421fa 100644 --- a/be/src/util/lru_multi_cache.inline.h +++ b/be/src/util/lru_multi_cache.inline.h @@ -23,6 +23,7 @@ #include +#include "util/hash_util.hpp" #include "util/lru_multi_cache.h" #include "util/time.h" diff --git a/be/src/util/path_util.cpp b/be/src/util/path_util.cpp index f1c96a0fc00382..e05b9371e49a51 100644 --- a/be/src/util/path_util.cpp +++ b/be/src/util/path_util.cpp @@ -21,7 +21,6 @@ #include #include "gutil/strings/split.h" -#include "gutil/strings/stringpiece.h" #include "gutil/strings/strip.h" using std::string; @@ -32,8 +31,6 @@ using strings::Split; namespace doris { namespace path_util { -const string kTmpInfix = ".doristmp"; - std::string join_path_segments(const string& a, const string& b) { if (a.empty()) { return b; @@ -44,29 +41,6 @@ std::string join_path_segments(const string& a, const string& b) { } } -std::vector join_path_segments_v(const std::vector& v, const string& s) { - std::vector out; - for (const string& path : v) { - out.emplace_back(join_path_segments(path, s)); - } - return out; -} - -std::vector split_path(const string& path) { - if (path.empty()) { - return {}; - } - std::vector segments; - if (path[0] == '/') { - segments.emplace_back("/"); - } - std::vector pieces = Split(path, "/", SkipEmpty()); - for (const StringPiece& piece : pieces) { - segments.emplace_back(piece.data(), piece.size()); - } - return segments; -} - // strdup use malloc to obtain memory for the new string, it should be freed with free. // but std::unique_ptr use delete to free memory by default, so it should specify free memory using free diff --git a/be/src/util/path_util.h b/be/src/util/path_util.h index 1376d2c32ff99d..ee68122f4faaf3 100644 --- a/be/src/util/path_util.h +++ b/be/src/util/path_util.h @@ -27,18 +27,9 @@ namespace path_util { // NOTE: The methods here are only related to path processing, do not involve // any file and IO operations. -extern const std::string kTmpInfix; - // Join two path segments with the appropriate path separator, if necessary. std::string join_path_segments(const std::string& a, const std::string& b); -// Join each path segment in a list with a common suffix segment. -std::vector join_path_segments_v(const std::vector& v, - const std::string& s); - -// Split a path into segments with the appropriate path separator. -std::vector split_path(const std::string& path); - // Return the enclosing directory of path. // This is like dirname(3) but for C++ strings. // The following list of examples shows the strings returned by dirname() and basename(): diff --git a/be/src/util/s3_util.h b/be/src/util/s3_util.h index 46226b793591a4..5dd68069759a42 100644 --- a/be/src/util/s3_util.h +++ b/be/src/util/s3_util.h @@ -32,8 +32,8 @@ #include #include "common/status.h" -#include "gutil/hash/hash.h" #include "util/s3_rate_limiter.h" +#include "vec/common/string_ref.h" namespace Aws { namespace S3 { @@ -102,15 +102,15 @@ struct S3ClientConf { uint64_t get_hash() const { uint64_t hash_code = 0; - hash_code ^= Fingerprint(ak); - hash_code ^= Fingerprint(sk); - hash_code ^= Fingerprint(token); - hash_code ^= Fingerprint(endpoint); - hash_code ^= Fingerprint(region); - hash_code ^= Fingerprint(max_connections); - hash_code ^= Fingerprint(request_timeout_ms); - hash_code ^= Fingerprint(connect_timeout_ms); - hash_code ^= Fingerprint(use_virtual_addressing); + hash_code ^= crc32_hash(ak); + hash_code ^= crc32_hash(sk); + hash_code ^= crc32_hash(token); + hash_code ^= crc32_hash(endpoint); + hash_code ^= crc32_hash(region); + hash_code ^= max_connections; + hash_code ^= request_timeout_ms; + hash_code ^= connect_timeout_ms; + hash_code ^= use_virtual_addressing; return hash_code; } diff --git a/be/src/util/trace.h b/be/src/util/trace.h index 30da779624a3da..7a876fd3c3c09c 100644 --- a/be/src/util/trace.h +++ b/be/src/util/trace.h @@ -19,7 +19,6 @@ #include #include "gutil/ref_counted.h" -#include "gutil/strings/stringpiece.h" #include "gutil/strings/substitute.h" #include "gutil/threading/thread_collision_warner.h" #include "util/scoped_cleanup.h" diff --git a/be/src/vec/common/hash_table/hash.h b/be/src/vec/common/hash_table/hash.h index 9556bf87a0718b..ddc8b2a75471ce 100644 --- a/be/src/vec/common/hash_table/hash.h +++ b/be/src/vec/common/hash_table/hash.h @@ -119,8 +119,7 @@ inline size_t hash_crc32(doris::vectorized::UInt128 u) { template <> inline size_t hash_crc32(doris::vectorized::Int128 u) { - return doris::vectorized::UInt128HashCRC32()( - doris::vectorized::UInt128((u >> 64) & int64_t(-1), u & int64_t(-1))); + return doris::vectorized::UInt128HashCRC32()({(u >> 64) & int64_t(-1), u & int64_t(-1)}); } #define DEFINE_HASH(T) \ @@ -156,10 +155,10 @@ struct HashCRC32 { size_t operator()(const doris::vectorized::UInt256& x) const { #if defined(__SSE4_2__) || defined(__aarch64__) doris::vectorized::UInt64 crc = -1ULL; - crc = _mm_crc32_u64(crc, x.a); - crc = _mm_crc32_u64(crc, x.b); - crc = _mm_crc32_u64(crc, x.c); - crc = _mm_crc32_u64(crc, x.d); + crc = _mm_crc32_u64(crc, x.items[0]); + crc = _mm_crc32_u64(crc, x.items[1]); + crc = _mm_crc32_u64(crc, x.items[2]); + crc = _mm_crc32_u64(crc, x.items[3]); return crc; #else return Hash128to64({Hash128to64({x.a, x.b}), Hash128to64({x.c, x.d})}); diff --git a/be/src/vec/common/hash_table/string_hash_map.h b/be/src/vec/common/hash_table/string_hash_map.h index 61d304cf7d8b63..6c7a9e74dca749 100644 --- a/be/src/vec/common/hash_table/string_hash_map.h +++ b/be/src/vec/common/hash_table/string_hash_map.h @@ -47,8 +47,10 @@ struct StringHashMapCell // Zero means unoccupied cells in hash table. Use key with last word = 0 as // zero keys, because such keys are unrepresentable (no way to encode length). - static bool is_zero(const StringKey16& key, const HashTableNoState&) { return key.high == 0; } - void set_zero() { this->value.first.high = 0; } + static bool is_zero(const StringKey16& key, const HashTableNoState&) { + return key.items[1] == 0; + } + void set_zero() { this->value.first.items[1] = 0; } // external const doris::StringRef get_key() const { return to_string_ref(this->value.first); } /// NOLINT diff --git a/be/src/vec/common/hash_table/string_hash_table.h b/be/src/vec/common/hash_table/string_hash_table.h index 5d98ff0b720c4b..74be1e85e1efe8 100644 --- a/be/src/vec/common/hash_table/string_hash_table.h +++ b/be/src/vec/common/hash_table/string_hash_table.h @@ -53,8 +53,8 @@ inline doris::StringRef ALWAYS_INLINE to_string_ref(const T& n) { return {reinterpret_cast(&n), sizeof(T) - (__builtin_clzll(n) >> 3)}; } inline doris::StringRef ALWAYS_INLINE to_string_ref(const StringKey16& n) { - assert(n.high != 0); - return {reinterpret_cast(&n), 16ul - (__builtin_clzll(n.high) >> 3)}; + assert(n.items[1] != 0); + return {reinterpret_cast(&n), 16UL - (__builtin_clzll(n.items[1]) >> 3)}; } struct StringHashTableHash { @@ -67,8 +67,8 @@ struct StringHashTableHash { } size_t ALWAYS_INLINE operator()(StringKey16 key) const { size_t res = -1ULL; - res = _mm_crc32_u64(res, key.low); - res = _mm_crc32_u64(res, key.high); + res = _mm_crc32_u64(res, key.low()); + res = _mm_crc32_u64(res, key.high()); return res; } #else diff --git a/be/src/vec/common/string_ref.h b/be/src/vec/common/string_ref.h index 9851bf8f3bf135..e2094ca8f39709 100644 --- a/be/src/vec/common/string_ref.h +++ b/be/src/vec/common/string_ref.h @@ -33,8 +33,6 @@ #include #include "gutil/hash/city.h" -#include "gutil/hash/hash128to64.h" -#include "gutil/int128.h" #include "util/hash_util.hpp" #include "util/slice.h" #include "util/sse_util.hpp" @@ -307,25 +305,13 @@ inline std::size_t hash_value(const StringRef& v) { using StringRefs = std::vector; -/** Hash functions. - * You can use either CityHash64, - * or a function based on the crc32 statement, - * which is obviously less qualitative, but on real data sets, - * when used in a hash table, works much faster. - * For more information, see hash_map_string_3.cpp - */ - -struct StringRefHash64 { - size_t operator()(StringRef x) const { return util_hash::CityHash64(x.data, x.size); } -}; - #if defined(__SSE4_2__) || defined(__aarch64__) /// Parts are taken from CityHash. inline doris::vectorized::UInt64 hash_len16(doris::vectorized::UInt64 u, doris::vectorized::UInt64 v) { - return Hash128to64(uint128(u, v)); + return util_hash::HashLen16(u, v); } inline doris::vectorized::UInt64 shift_mix(doris::vectorized::UInt64 val) { @@ -367,35 +353,38 @@ inline size_t hash_less_than16(const char* data, size_t size) { return hash_less_than8(data, size); } -struct CRC32Hash { - size_t operator()(const StringRef& x) const { - const char* pos = x.data; - size_t size = x.size; +inline size_t crc32_hash(const char* pos, size_t size) { + if (size == 0) { + return 0; + } - if (size == 0) { - return 0; - } + if (size < 8) { + return hash_less_than8(pos, size); + } - if (size < 8) { - return hash_less_than8(x.data, x.size); - } + const char* end = pos + size; + size_t res = -1ULL; - const char* end = pos + size; - size_t res = -1ULL; + do { + auto word = unaligned_load(pos); + res = _mm_crc32_u64(res, word); - do { - auto word = unaligned_load(pos); - res = _mm_crc32_u64(res, word); + pos += 8; + } while (pos + 8 < end); - pos += 8; - } while (pos + 8 < end); + auto word = + unaligned_load(end - 8); /// I'm not sure if this is normal. + res = _mm_crc32_u64(res, word); - auto word = unaligned_load( - end - 8); /// I'm not sure if this is normal. - res = _mm_crc32_u64(res, word); + return res; +} - return res; - } +inline size_t crc32_hash(const std::string str) { + return crc32_hash(str.data(), str.size()); +} + +struct CRC32Hash { + size_t operator()(const StringRef& x) const { return crc32_hash(x.data, x.size); } }; struct StringRefHash : CRC32Hash {}; diff --git a/be/src/vec/common/uint128.h b/be/src/vec/common/uint128.h index 28b1f62dc72a7d..61e7410a236bb0 100644 --- a/be/src/vec/common/uint128.h +++ b/be/src/vec/common/uint128.h @@ -26,97 +26,13 @@ #include #include "gutil/hash/city.h" -#include "gutil/hash/hash128to64.h" #include "util/sse_util.hpp" #include "vec/core/types.h" +#include "vec/core/wide_integer.h" namespace doris::vectorized { -/// For aggregation by SipHash, UUID type or concatenation of several fields. -struct UInt128 { - /// This naming assumes little endian. - UInt64 low; - UInt64 high; - - UInt128() = default; - explicit UInt128(const UInt64 low_, const UInt64 high_) : low(low_), high(high_) {} - explicit UInt128(const Int128 rhs) : low(rhs % UINT64_MAX), high(rhs / UINT64_MAX) {} - explicit UInt128(const UInt64 rhs) : low(rhs), high() {} - explicit UInt128(const int rhs) : low(rhs), high() {} - explicit UInt128(const int64_t rhs) : low(rhs), high() {} - explicit UInt128(const uint32_t rhs) : low(rhs), high() {} - explicit UInt128(const double rhs) - : low((UInt64)rhs % UINT64_MAX), high(UInt64(rhs / double(INT64_MAX))) {} - - auto tuple() const { return std::tie(high, low); } - - String to_hex_string() const { - std::ostringstream os; - os << std::setw(16) << std::setfill('0') << std::hex << high << low; - return String(os.str()); - } - - bool operator==(const UInt128 rhs) const { return tuple() == rhs.tuple(); } - auto operator<=>(const UInt128 rhs) const { return tuple() <=> rhs.tuple(); } - - UInt128 operator<<(const UInt128& rhs) const { - const uint64_t shift = rhs.low; - if (((bool)rhs.high) || (shift >= 128)) { - return UInt128(0); - } else if (shift == 64) { - return UInt128(0, low); - } else if (shift == 0) { - return *this; - } else if (shift < 64) { - return UInt128(low << shift, (high << shift) + (low >> (64 - shift))); - } else if ((128 > shift) && (shift > 64)) { - return UInt128(0, low << (shift - 64)); - } else { - return UInt128(0); - } - } - - UInt128& operator<<=(const UInt128& rhs) { - *this = *this << rhs; - return *this; - } - - UInt128 operator+(const UInt128& rhs) const { - return UInt128(low + rhs.low, high + rhs.high + ((low + rhs.low) < low)); - } - - UInt128& operator+=(const UInt128& rhs) { - high += rhs.high + ((low + rhs.low) < low); - low += rhs.low; - return *this; - } - - template - bool operator==(const T rhs) const { - return *this == UInt128(rhs); - } - template - auto operator<=>(const T rhs) const { - return *this <=> UInt128(rhs); - } - - template - explicit operator T() const { - return static_cast(low); - } - - UInt128& operator=(const UInt64 rhs) { - low = rhs; - high = 0; - return *this; - } - - operator uint128_t() const { - uint128_t value = static_cast(high) << 64; - value |= low; - return value; - } -}; +using UInt128 = wide::UInt128; template <> inline constexpr bool IsNumber = true; @@ -129,17 +45,13 @@ struct TypeId { static constexpr const TypeIndex value = TypeIndex::UInt128; }; -struct UInt128Hash { - size_t operator()(UInt128 x) const { return Hash128to64({x.low, x.high}); } -}; - #if defined(__SSE4_2__) || defined(__aarch64__) struct UInt128HashCRC32 { size_t operator()(const UInt128& x) const { UInt64 crc = -1ULL; - crc = _mm_crc32_u64(crc, x.low); - crc = _mm_crc32_u64(crc, x.high); + crc = _mm_crc32_u64(crc, x.low()); + crc = _mm_crc32_u64(crc, x.high()); return crc; } }; @@ -152,31 +64,10 @@ struct UInt128HashCRC32 : public UInt128Hash {}; #endif struct UInt128TrivialHash { - size_t operator()(UInt128 x) const { return x.low; } + size_t operator()(UInt128 x) const { return x.low(); } }; -/** Used for aggregation, for putting a large number of constant-length keys in a hash table. - */ -struct UInt256 { - UInt64 a; - UInt64 b; - UInt64 c; - UInt64 d; - - bool operator==(const UInt256 rhs) const { - return a == rhs.a && b == rhs.b && c == rhs.c && d == rhs.d; - } - - bool operator==(const UInt64 rhs) const { return a == rhs && b == 0 && c == 0 && d == 0; } - - UInt256& operator=(const UInt64 rhs) { - a = rhs; - b = 0; - c = 0; - d = 0; - return *this; - } -}; +using UInt256 = wide::UInt256; #pragma pack(1) struct UInt136 { @@ -194,7 +85,7 @@ struct UInt136 { template <> struct std::hash { size_t operator()(const doris::vectorized::UInt128& u) const { - return Hash128to64({u.low, u.high}); + return util_hash::HashLen16(u.low(), u.high()); } }; diff --git a/be/src/vec/core/wide_integer.h b/be/src/vec/core/wide_integer.h index e7902e414a854f..d74aef27a49efd 100644 --- a/be/src/vec/core/wide_integer.h +++ b/be/src/vec/core/wide_integer.h @@ -42,7 +42,10 @@ #include #include +#include +#include #include +#include #include // NOLINTBEGIN(*) @@ -134,10 +137,28 @@ class integer { constexpr operator double() const noexcept; constexpr operator float() const noexcept; + std::string to_hex_string() const { + std::ostringstream os; + os << std::setw(16) << std::setfill('0') << std::hex; + for (size_t i = 0; i < _impl::item_count; i++) { + os << items[i]; + } + return os.str(); + } + struct _impl; base_type items[_impl::item_count]; + uint64_t low() const { + static_assert(_impl::item_count == 2); + return items[0]; + } + uint64_t high() const { + static_assert(_impl::item_count == 2); + return items[1]; + } + private: template friend class integer; @@ -146,6 +167,7 @@ class integer { friend class std::numeric_limits>; }; +using UInt128 = integer<128, unsigned>; using Int256 = integer<256, signed>; using UInt256 = integer<256, unsigned>; diff --git a/be/src/vec/data_types/convert_field_to_type.cpp b/be/src/vec/data_types/convert_field_to_type.cpp index 3717abc74e137b..f69e6a9dc81cba 100644 --- a/be/src/vec/data_types/convert_field_to_type.cpp +++ b/be/src/vec/data_types/convert_field_to_type.cpp @@ -43,13 +43,6 @@ #include "vec/data_types/data_type_array.h" #include "vec/data_types/data_type_nullable.h" -namespace doris { -namespace vectorized { -struct UInt128; -} // namespace vectorized -} // namespace doris - -// #include "vec/data_types/data_type_tuple.h" namespace doris::vectorized { /** Checking for a `Field from` of `From` type falls to a range of values of type `To`. * `From` and `To` - numeric types. They can be floating-point types. diff --git a/be/src/vec/json/path_in_data.cpp b/be/src/vec/json/path_in_data.cpp index d5128d2b32e2fc..74e5dade0fc684 100644 --- a/be/src/vec/json/path_in_data.cpp +++ b/be/src/vec/json/path_in_data.cpp @@ -163,7 +163,7 @@ void PathInData::to_protobuf(segment_v2::ColumnPathInfo* pb, int32_t parent_col_ size_t PathInData::Hash::operator()(const PathInData& value) const { auto hash = get_parts_hash(value.parts); - return hash.low ^ hash.high; + return hash.low() ^ hash.high(); } PathInData PathInData::copy_pop_front() const { diff --git a/be/test/util/path_util_test.cpp b/be/test/util/path_util_test.cpp index 63149d9ddaa08d..8806febb133dd0 100644 --- a/be/test/util/path_util_test.cpp +++ b/be/test/util/path_util_test.cpp @@ -73,17 +73,6 @@ TEST(TestPathUtil, DirNameTest) { EXPECT_EQ("/ab", path_util::dir_name("/ab/cd")); } -TEST(TestPathUtil, SplitPathTest) { - using Vec = std::vector; - EXPECT_EQ(Vec({"/"}), path_util::split_path("/")); - EXPECT_EQ(Vec({"/", "a", "b"}), path_util::split_path("/a/b")); - EXPECT_EQ(Vec({"/", "a", "b"}), path_util::split_path("/a/b/")); - EXPECT_EQ(Vec({"/", "a", "b"}), path_util::split_path("/a//b/")); - EXPECT_EQ(Vec({"a", "b"}), path_util::split_path("a/b")); - EXPECT_EQ(Vec({"."}), path_util::split_path(".")); - EXPECT_EQ(Vec(), path_util::split_path("")); -} - TEST(TestPathUtil, file_extension_test) { EXPECT_EQ("", path_util::file_extension("")); EXPECT_EQ("", path_util::file_extension(".")); diff --git a/regression-test/data/nereids_function_p0/scalar_function/Array.out b/regression-test/data/nereids_function_p0/scalar_function/Array.out index c6c32bf9b15af3..bf55a289cedd83 100644 --- a/regression-test/data/nereids_function_p0/scalar_function/Array.out +++ b/regression-test/data/nereids_function_p0/scalar_function/Array.out @@ -2524,90 +2524,90 @@ -- !sql_array_intersect_Char -- \N -["char12", "char32", "char22"] -["char12", "char32", "char22"] -["char12", "char32", "char22"] -["char12", "char32", "char22"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -- !sql_array_intersect_Char_notnull -- -["char12", "char32", "char22"] -["char12", "char32", "char22"] -["char12", "char32", "char22"] -["char12", "char32", "char22"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -- !sql_array_intersect_Varchar -- \N -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] -- !sql_array_intersect_Varchar_notnull -- -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] -- !sql_array_intersect_String -- \N -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] -- !sql_array_intersect_String_notnull -- -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] -- !sql_array_intersect_DatetimeV2 -- \N @@ -9803,90 +9803,90 @@ true -- !sql_array_union_Char -- \N -["char12", "char32", "char22"] -["char12", "char32", "char22"] -["char12", "char32", "char22"] -["char12", "char32", "char22"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -- !sql_array_union_Char_notnull -- -["char12", "char32", "char22"] -["char12", "char32", "char22"] -["char12", "char32", "char22"] -["char12", "char32", "char22"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char11", "char21", "char31"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] +["char12", "char22", "char32"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] ["char13", "char23", "char33"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -["char21", "char11", "char31"] -- !sql_array_union_Varchar -- \N -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] -- !sql_array_union_Varchar_notnull -- -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar11", "varchar31", "char21", "char11", "char31", "varchar21"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "char12", "char32", "varchar12", "char22", "varchar32"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] -["varchar23", "char13", "char23", "char33", "varchar13", "varchar33"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char11", "char21", "char31", "varchar11", "varchar21", "varchar31"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char12", "char22", "char32", "varchar12", "varchar22", "varchar32"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] +["char13", "char23", "char33", "varchar13", "varchar23", "varchar33"] -- !sql_array_union_String -- \N -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] -- !sql_array_union_String_notnull -- -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar11", "varchar31", "char21", "char31", "string1", "varchar21"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar22", "string2", "char32", "varchar12", "char22", "varchar32"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] -["varchar23", "char23", "string3", "char33", "varchar13", "varchar33"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char21", "char31", "string1", "varchar11", "varchar21", "varchar31"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char22", "char32", "string2", "varchar12", "varchar22", "varchar32"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] +["char23", "char33", "string3", "varchar13", "varchar23", "varchar33"] -- !sql_array_union_DatetimeV2 -- \N @@ -14443,4 +14443,5 @@ true ["2012-03-09 09:08:09", "2012-03-09 09:08:17"] ["2012-03-10 10:09:10", "2012-03-10 10:09:19"] ["2012-03-11 11:10:11", "2012-03-11 11:10:21"] -["2012-03-12 12:11:12", "2012-03-12 12:11:23"] \ No newline at end of file +["2012-03-12 12:11:12", "2012-03-12 12:11:23"] + diff --git a/regression-test/suites/nereids_function_p0/scalar_function/Array.groovy b/regression-test/suites/nereids_function_p0/scalar_function/Array.groovy index 5b13ed4590bbfa..a786b9304388f4 100644 --- a/regression-test/suites/nereids_function_p0/scalar_function/Array.groovy +++ b/regression-test/suites/nereids_function_p0/scalar_function/Array.groovy @@ -204,37 +204,37 @@ suite("nereids_scalar_fn_Array") { order_qt_sql_array_except_DateV2_notnull "select array_except(kadtv2, kadtv2) from fn_test_not_nullable" // array_intersect - order_qt_sql_array_intersect_Double "select array_intersect(kadbl, kadbl) from fn_test" - order_qt_sql_array_intersect_Double_notnull "select array_intersect(kadbl, kadbl) from fn_test_not_nullable" - order_qt_sql_array_intersect_Float "select array_intersect(kafloat, kafloat) from fn_test" - order_qt_sql_array_intersect_Float_notnull "select array_intersect(kafloat, kafloat) from fn_test_not_nullable" - order_qt_sql_array_intersect_LargeInt "select array_intersect(kalint, kalint) from fn_test" - order_qt_sql_array_intersect_LargeInt_notnull "select array_intersect(kalint, kalint) from fn_test_not_nullable" - order_qt_sql_array_intersect_BigInt "select array_intersect(kabint, kabint) from fn_test" - order_qt_sql_array_intersect_BigInt_notnull "select array_intersect(kabint, kabint) from fn_test_not_nullable" - order_qt_sql_array_intersect_SmallInt "select array_intersect(kasint, kasint) from fn_test" - order_qt_sql_array_intersect_SmallInt_notnull "select array_intersect(kasint, kasint) from fn_test_not_nullable" - order_qt_sql_array_intersect_Integer "select array_intersect(kaint, kaint) from fn_test" - order_qt_sql_array_intersect_Integer_notnull "select array_intersect(kaint, kaint) from fn_test_not_nullable" - order_qt_sql_array_intersect_TinyInt "select array_intersect(katint, katint) from fn_test" - order_qt_sql_array_intersect_TinyInt_notnull "select array_intersect(katint, katint) from fn_test_not_nullable" - order_qt_sql_array_intersect_DecimalV3 "select array_intersect(kadcml, kadcml) from fn_test" - order_qt_sql_array_intersect_DecimalV3_notnull "select array_intersect(kadcml, kadcml) from fn_test_not_nullable" - - order_qt_sql_array_intersect_Boolean "select array_intersect(kabool, kabool) from fn_test" - order_qt_sql_array_intersect_Boolean_notnull "select array_intersect(kabool, kabool) from fn_test_not_nullable" - - order_qt_sql_array_intersect_Char "select array_intersect(kachr, kachr) from fn_test" - order_qt_sql_array_intersect_Char_notnull "select array_intersect(kachr, kachr) from fn_test_not_nullable" - order_qt_sql_array_intersect_Varchar "select array_intersect(kavchr, kavchr) from fn_test" - order_qt_sql_array_intersect_Varchar_notnull "select array_intersect(kavchr, kavchr) from fn_test_not_nullable" - order_qt_sql_array_intersect_String "select array_intersect(kastr, kastr) from fn_test" - order_qt_sql_array_intersect_String_notnull "select array_intersect(kastr, kastr) from fn_test_not_nullable" - - order_qt_sql_array_intersect_DatetimeV2 "select array_intersect(kadtmv2, kadtmv2) from fn_test" - order_qt_sql_array_intersect_DatetimeV2_notnull "select array_intersect(kadtmv2, kadtmv2) from fn_test_not_nullable" - order_qt_sql_array_intersect_DateV2 "select array_intersect(kadtv2, kadtv2) from fn_test" - order_qt_sql_array_intersect_DateV2_notnull "select array_intersect(kadtv2, kadtv2) from fn_test_not_nullable" + order_qt_sql_array_intersect_Double "select array_sort(array_intersect(kadbl, kadbl)) from fn_test" + order_qt_sql_array_intersect_Double_notnull "select array_sort(array_intersect(kadbl, kadbl)) from fn_test_not_nullable" + order_qt_sql_array_intersect_Float "select array_sort(array_intersect(kafloat, kafloat)) from fn_test" + order_qt_sql_array_intersect_Float_notnull "select array_sort(array_intersect(kafloat, kafloat)) from fn_test_not_nullable" + order_qt_sql_array_intersect_LargeInt "select array_sort(array_intersect(kalint, kalint)) from fn_test" + order_qt_sql_array_intersect_LargeInt_notnull "select array_sort(array_intersect(kalint, kalint)) from fn_test_not_nullable" + order_qt_sql_array_intersect_BigInt "select array_sort(array_intersect(kabint, kabint)) from fn_test" + order_qt_sql_array_intersect_BigInt_notnull "select array_sort(array_intersect(kabint, kabint)) from fn_test_not_nullable" + order_qt_sql_array_intersect_SmallInt "select array_sort(array_intersect(kasint, kasint)) from fn_test" + order_qt_sql_array_intersect_SmallInt_notnull "select array_sort(array_intersect(kasint, kasint)) from fn_test_not_nullable" + order_qt_sql_array_intersect_Integer "select array_sort(array_intersect(kaint, kaint)) from fn_test" + order_qt_sql_array_intersect_Integer_notnull "select array_sort(array_intersect(kaint, kaint)) from fn_test_not_nullable" + order_qt_sql_array_intersect_TinyInt "select array_sort(array_intersect(katint, katint)) from fn_test" + order_qt_sql_array_intersect_TinyInt_notnull "select array_sort(array_intersect(katint, katint)) from fn_test_not_nullable" + order_qt_sql_array_intersect_DecimalV3 "select array_sort(array_intersect(kadcml, kadcml)) from fn_test" + order_qt_sql_array_intersect_DecimalV3_notnull "select array_sort(array_intersect(kadcml, kadcml)) from fn_test_not_nullable" + + order_qt_sql_array_intersect_Boolean "select array_sort(array_intersect(kabool, kabool)) from fn_test" + order_qt_sql_array_intersect_Boolean_notnull "select array_sort(array_intersect(kabool, kabool)) from fn_test_not_nullable" + + order_qt_sql_array_intersect_Char "select array_sort(array_intersect(kachr, kachr)) from fn_test" + order_qt_sql_array_intersect_Char_notnull "select array_sort(array_intersect(kachr, kachr)) from fn_test_not_nullable" + order_qt_sql_array_intersect_Varchar "select array_sort(array_intersect(kavchr, kavchr)) from fn_test" + order_qt_sql_array_intersect_Varchar_notnull "select array_sort(array_intersect(kavchr, kavchr)) from fn_test_not_nullable" + order_qt_sql_array_intersect_String "select array_sort(array_intersect(kastr, kastr)) from fn_test" + order_qt_sql_array_intersect_String_notnull "select array_sort(array_intersect(kastr, kastr)) from fn_test_not_nullable" + + order_qt_sql_array_intersect_DatetimeV2 "select array_sort(array_intersect(kadtmv2, kadtmv2)) from fn_test" + order_qt_sql_array_intersect_DatetimeV2_notnull "select array_sort(array_intersect(kadtmv2, kadtmv2)) from fn_test_not_nullable" + order_qt_sql_array_intersect_DateV2 "select array_sort(array_intersect(kadtv2, kadtv2)) from fn_test" + order_qt_sql_array_intersect_DateV2_notnull "select array_sort(array_intersect(kadtv2, kadtv2)) from fn_test_not_nullable" // array_join order_qt_sql_array_join_Double "select array_join(kadbl, ',', 'null') from fn_test" @@ -808,37 +808,37 @@ suite("nereids_scalar_fn_Array") { order_qt_sql_array_sum_DecimalV3_notnull "select array_sum(kadcml) from fn_test_not_nullable" // array_union - order_qt_sql_array_union_Double "select array_union(kadbl, kadbl) from fn_test" - order_qt_sql_array_union_Double_notnull "select array_union(kadbl, kadbl) from fn_test_not_nullable" - order_qt_sql_array_union_Float "select array_union(kafloat, kafloat) from fn_test" - order_qt_sql_array_union_Float_notnull "select array_union(kafloat, kafloat) from fn_test_not_nullable" - order_qt_sql_array_union_LargeInt "select array_union(kalint, kalint) from fn_test" - order_qt_sql_array_union_LargeInt_notnull "select array_union(kalint, kalint) from fn_test_not_nullable" - order_qt_sql_array_union_BigInt "select array_union(kabint, kabint) from fn_test" - order_qt_sql_array_union_BigInt_notnull "select array_union(kabint, kabint) from fn_test_not_nullable" - order_qt_sql_array_union_SmallInt "select array_union(kasint, kasint) from fn_test" - order_qt_sql_array_union_SmallInt_notnull "select array_union(kasint, kasint) from fn_test_not_nullable" - order_qt_sql_array_union_Integer "select array_union(kaint, kaint) from fn_test" - order_qt_sql_array_union_Integer_notnull "select array_union(kaint, kaint) from fn_test_not_nullable" - order_qt_sql_array_union_TinyInt "select array_union(katint, katint) from fn_test" - order_qt_sql_array_union_TinyInt_notnull "select array_union(katint, katint) from fn_test_not_nullable" - order_qt_sql_array_union_DecimalV3 "select array_union(kadcml, kadcml) from fn_test" - order_qt_sql_array_union_DecimalV3_notnull "select array_union(kadcml, kadcml) from fn_test_not_nullable" - - order_qt_sql_array_union_Boolean "select array_union(kabool, kabool) from fn_test" - order_qt_sql_array_union_Boolean_notnull "select array_union(kabool, kabool) from fn_test_not_nullable" - - order_qt_sql_array_union_Char "select array_union(kachr, kachr) from fn_test" - order_qt_sql_array_union_Char_notnull "select array_union(kachr, kachr) from fn_test_not_nullable" - order_qt_sql_array_union_Varchar "select array_union(kavchr, kavchr) from fn_test" - order_qt_sql_array_union_Varchar_notnull "select array_union(kavchr, kavchr) from fn_test_not_nullable" - order_qt_sql_array_union_String "select array_union(kastr, kastr) from fn_test" - order_qt_sql_array_union_String_notnull "select array_union(kastr, kastr) from fn_test_not_nullable" - - order_qt_sql_array_union_DatetimeV2 "select array_union(kadtmv2, kadtmv2) from fn_test" - order_qt_sql_array_union_DatetimeV2_notnull "select array_union(kadtmv2, kadtmv2) from fn_test_not_nullable" - order_qt_sql_array_union_DateV2 "select array_union(kadtv2, kadtv2) from fn_test" - order_qt_sql_array_union_DateV2_notnull "select array_union(kadtv2, kadtv2) from fn_test_not_nullable" + order_qt_sql_array_union_Double "select array_sort(array_union(kadbl, kadbl)) from fn_test" + order_qt_sql_array_union_Double_notnull "select array_sort(array_union(kadbl, kadbl)) from fn_test_not_nullable" + order_qt_sql_array_union_Float "select array_sort(array_union(kafloat, kafloat)) from fn_test" + order_qt_sql_array_union_Float_notnull "select array_sort(array_union(kafloat, kafloat)) from fn_test_not_nullable" + order_qt_sql_array_union_LargeInt "select array_sort(array_union(kalint, kalint)) from fn_test" + order_qt_sql_array_union_LargeInt_notnull "select array_sort(array_union(kalint, kalint)) from fn_test_not_nullable" + order_qt_sql_array_union_BigInt "select array_sort(array_union(kabint, kabint)) from fn_test" + order_qt_sql_array_union_BigInt_notnull "select array_sort(array_union(kabint, kabint)) from fn_test_not_nullable" + order_qt_sql_array_union_SmallInt "select array_sort(array_union(kasint, kasint)) from fn_test" + order_qt_sql_array_union_SmallInt_notnull "select array_sort(array_union(kasint, kasint)) from fn_test_not_nullable" + order_qt_sql_array_union_Integer "select array_sort(array_union(kaint, kaint)) from fn_test" + order_qt_sql_array_union_Integer_notnull "select array_sort(array_union(kaint, kaint)) from fn_test_not_nullable" + order_qt_sql_array_union_TinyInt "select array_sort(array_union(katint, katint)) from fn_test" + order_qt_sql_array_union_TinyInt_notnull "select array_sort(array_union(katint, katint)) from fn_test_not_nullable" + order_qt_sql_array_union_DecimalV3 "select array_sort(array_union(kadcml, kadcml)) from fn_test" + order_qt_sql_array_union_DecimalV3_notnull "select array_sort(array_union(kadcml, kadcml)) from fn_test_not_nullable" + + order_qt_sql_array_union_Boolean "select array_sort(array_union(kabool, kabool)) from fn_test" + order_qt_sql_array_union_Boolean_notnull "select array_sort(array_union(kabool, kabool)) from fn_test_not_nullable" + + order_qt_sql_array_union_Char "select array_sort(array_union(kachr, kachr)) from fn_test" + order_qt_sql_array_union_Char_notnull "select array_sort(array_union(kachr, kachr)) from fn_test_not_nullable" + order_qt_sql_array_union_Varchar "select array_sort(array_union(kavchr, kavchr)) from fn_test" + order_qt_sql_array_union_Varchar_notnull "select array_sort(array_union(kavchr, kavchr)) from fn_test_not_nullable" + order_qt_sql_array_union_String "select array_sort(array_union(kastr, kastr)) from fn_test" + order_qt_sql_array_union_String_notnull "select array_sort(array_union(kastr, kastr)) from fn_test_not_nullable" + + order_qt_sql_array_union_DatetimeV2 "select array_sort(array_union(kadtmv2, kadtmv2)) from fn_test" + order_qt_sql_array_union_DatetimeV2_notnull "select array_sort(array_union(kadtmv2, kadtmv2)) from fn_test_not_nullable" + order_qt_sql_array_union_DateV2 "select array_sort(array_union(kadtv2, kadtv2)) from fn_test" + order_qt_sql_array_union_DateV2_notnull "select array_sort(array_union(kadtv2, kadtv2)) from fn_test_not_nullable" // array_with_constant order_qt_sql_array_with_constant_Double "select array_with_constant(kint, kdbl) from fn_test"