Skip to content

Commit

Permalink
xxhash64 for Date and String types
Browse files Browse the repository at this point in the history
Summary:
- Adds xxhash64 support for DATE and VARCHAR types.
- Adds test coverage

Differential Revision: D67885296
  • Loading branch information
pradeepvaka authored and facebook-github-bot committed Jan 25, 2025
1 parent dea4758 commit fb38cda
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 2 deletions.
4 changes: 2 additions & 2 deletions velox/functions/prestosql/BinaryFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ struct XxHash64Function {
FOLLY_ALWAYS_INLINE
void call(out_type<Varbinary>& result, const arg_type<Varbinary>& input) {
// Seed is set to 0.
int64_t hash = folly::Endian::swap64(XXH64(input.data(), input.size(), 0));
static const auto kLen = sizeof(int64_t);
uint64_t hash = folly::Endian::big(XXH64(input.data(), input.size(), 0));
static constexpr auto kLen = sizeof(uint64_t);

// Resizing output and copy
result.resize(kLen);
Expand Down
17 changes: 17 additions & 0 deletions velox/functions/prestosql/DateTimeFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
*/
#pragma once

#define XXH_INLINE_ALL
#include <xxhash.h>

#include <string_view>
#include "velox/expression/ComplexViewTypes.h"
#include "velox/functions/lib/DateTimeFormatter.h"
Expand Down Expand Up @@ -1880,4 +1883,18 @@ struct ToMillisecondFunction {
}
};

/// xxhash64(date) → bigint
/// Computes the 64-bit xxHash, with seed 0, of a DATE value. The date is first
/// widened to a 64-bit integer and the bytes of that integer are hashed.
///
/// NOTE(review): BinaryFunctions.h and StringFunctions.h also declare a
/// template named XxHash64Function. If those declarations share this namespace
/// and are included in the same translation unit they would collide — confirm.
template <typename T>
struct XxHash64Function {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE
  void call(out_type<int64_t>& result, const arg_type<Date>& input) {
    // XXH64 consumes raw bytes, so hash the 8-byte widened representation.
    const auto widenedDate = static_cast<int64_t>(input);
    const uint64_t hash =
        XXH64(&widenedDate, sizeof(widenedDate), /*seed=*/0);
    // The unsigned hash is stored into the signed 64-bit result.
    result = static_cast<int64_t>(hash);
  }
};

} // namespace facebook::velox::functions
15 changes: 15 additions & 0 deletions velox/functions/prestosql/StringFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
*/
#pragma once

#define XXH_INLINE_ALL
#include <xxhash.h>

#include "velox/functions/Udf.h"
#include "velox/functions/lib/string/StringCore.h"
#include "velox/functions/lib/string/StringImpl.h"
Expand Down Expand Up @@ -650,4 +653,16 @@ struct NormalizeFunction {
}
};

/// xxhash64(varchar) → bigint
/// Computes the 64-bit xxHash, with seed 0, over the bytes of the input
/// string.
///
/// NOTE(review): BinaryFunctions.h and DateTimeFunctions.h also declare a
/// template named XxHash64Function. If those declarations share this namespace
/// and are included in the same translation unit they would collide — confirm.
template <typename T>
struct XxHash64Function {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE
  void call(out_type<int64_t>& result, const arg_type<Varchar>& input) {
    const uint64_t hash = XXH64(input.data(), input.size(), /*seed=*/0);
    // The unsigned hash is stored into the signed 64-bit result.
    result = static_cast<int64_t>(hash);
  }
};

} // namespace facebook::velox::functions
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ void registerSimpleFunctions(const std::string& prefix) {

registerFunction<ToMillisecondFunction, int64_t, IntervalDayTime>(
{prefix + "to_milliseconds"});

registerFunction<XxHash64Function, int64_t, Date>({prefix + "xxhash64"});
}
} // namespace

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ void registerSimpleFunctions(const std::string& prefix) {
registerFunction<LevenshteinDistanceFunction, int64_t, Varchar, Varchar>(
{prefix + "levenshtein_distance"});
registerFunction<LengthFunction, int64_t, Varchar>({prefix + "length"});
registerFunction<XxHash64Function, int64_t, Varchar>({prefix + "xxhash64"});

// Length for varbinary have different semantics.
registerFunction<LengthVarbinaryFunction, int64_t, Varbinary>(
Expand Down
50 changes: 50 additions & 0 deletions velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
* limitations under the License.
*/

#define XXH_INLINE_ALL
#include <xxhash.h>

#include "velox/common/base/tests/GTestUtils.h"
#include "velox/external/date/tz.h"
#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"
Expand All @@ -23,6 +26,33 @@
using namespace facebook::velox;
using namespace facebook::velox::test;

// Formats a 64-bit value as exactly 16 lowercase hexadecimal digits,
// left-padded with zeros.
std::string toHex(uint64_t val) {
  static constexpr char kDigits[] = "0123456789abcdef";
  // Start from an all-zero string and fill in nibbles from the least
  // significant end; untouched leading positions remain '0'.
  std::string hex(16, '0');
  for (auto it = hex.rbegin(); it != hex.rend() && val != 0; ++it) {
    *it = kDigits[val & 0xF];
    val >>= 4;
  }
  return hex;
}

// Helper to convert a Varbinary (StringView) into a human-readable hex string.
// Every input byte becomes two lowercase hex digits, high nibble first.
std::string varbinaryToHex(const StringView& varbinary) {
  static constexpr char kHexDigits[] = "0123456789abcdef";
  const char* bytes = varbinary.data();
  // Use size_t so the loop index and the length share one unsigned type and
  // the doubled capacity is computed in size_t.
  const size_t numBytes = varbinary.size();
  std::string out;
  out.reserve(numBytes * 2);
  for (size_t i = 0; i < numBytes; ++i) {
    // Go through unsigned char so bytes >= 0x80 index the table correctly.
    const auto byte = static_cast<unsigned char>(bytes[i]);
    out.push_back(kHexDigits[byte >> 4]);
    out.push_back(kHexDigits[byte & 0x0F]);
  }
  return out;
}

// Reference hash for local verification in tests. Mirrors what the
// xxhash64(DATE) function does: the date is widened to int64_t and those
// 8 bytes are hashed with seed 0. Hashing only the 4 bytes of the int32_t
// would produce a different value than the function under test.
uint64_t computeXxhash64(const int32_t& dateVal) {
  const auto widenedDate = static_cast<int64_t>(dateVal);
  return XXH64(&widenedDate, sizeof(widenedDate), 0);
}

class DateTimeFunctionsTest : public functions::test::FunctionBaseTest {
protected:
std::string daysShort[7] = {"Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"};
Expand Down Expand Up @@ -5272,3 +5302,23 @@ TEST_F(DateTimeFunctionsTest, toMilliseconds) {
INTERVAL_DAY_TIME(),
std::optional<int64_t>(123)));
}

// Checks xxhash64 over DATE inputs against precomputed hash values, and that
// a NULL input yields a NULL result.
TEST_F(DateTimeFunctionsTest, xxHash64FunctionDate) {
  const auto hashOf = [&](std::optional<int32_t> value) {
    return evaluateOnce<int64_t>("xxhash64(c0)", DATE(), value);
  };

  // NULL in, NULL out.
  EXPECT_EQ(std::nullopt, hashOf(std::nullopt));

  struct TestCase {
    const char* date;
    int64_t expectedHash;
  };
  const TestCase kTestCases[] = {
      // Epoch.
      {"1970-01-01", 3803688792395291579},
      {"2024-10-07", 3734916545851684445},
      {"2025-01-10", 1385444150471264300},
      // Day after the epoch.
      {"1970-01-02", -6977822845260490347},
      // Leap date.
      {"2020-02-29", -5306598937769828126},
      // Far-future date (last day of year 9999).
      {"9999-12-31", 3856043376106280085},
      // Y2K.
      {"2000-01-01", -7612541860844473816},
  };

  for (const auto& testCase : kTestCases) {
    EXPECT_EQ(testCase.expectedHash, hashOf(parseDate(testCase.date)))
        << "date: " << testCase.date;
  }
}
50 changes: 50 additions & 0 deletions velox/functions/prestosql/tests/StringFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,35 @@ int expectedStart(int i) {
int expectedLength(int i) {
return i % 3;
}

// Decodes a single hexadecimal digit (upper or lower case) into its numeric
// value in [0, 15]. Any other character fails via VELOX_USER_FAIL.
FOLLY_ALWAYS_INLINE static uint8_t fromHex(char c) {
  const bool isDecimal = c >= '0' && c <= '9';
  const bool isUpper = c >= 'A' && c <= 'F';
  const bool isLower = c >= 'a' && c <= 'f';

  if (!(isDecimal || isUpper || isLower)) {
    VELOX_USER_FAIL("Invalid hex character: {}", c);
  }

  if (isDecimal) {
    return static_cast<uint8_t>(c - '0');
  }

  // Letters A-F / a-f map to 10-15.
  const char base = isUpper ? 'A' : 'a';
  return static_cast<uint8_t>(10 + c - base);
}

// Decodes a hex string into raw bytes: each pair of hex digits (high nibble
// first) becomes one output character. Despite the name, no decimal
// conversion is involved. The input length must be even, which is enforced
// with VELOX_CHECK_EQ; invalid digits are rejected by fromHex.
std::string hexToDec(const std::string& str) {
  VELOX_CHECK_EQ(str.size() % 2, 0);
  std::string out;
  out.reserve(str.size() / 2);
  // size_t index avoids comparing a signed counter against an unsigned size.
  for (size_t i = 0; i < str.size(); i += 2) {
    const uint8_t high = fromHex(str[i]);
    const uint8_t low = fromHex(str[i + 1]);
    // fromHex returns values in [0, 15], so no masking of `low` is needed.
    out.push_back(static_cast<char>((high << 4) | low));
  }
  return out;
}

} // namespace

class StringFunctionsTest : public FunctionBaseTest {
Expand Down Expand Up @@ -2271,3 +2300,24 @@ TEST_F(StringFunctionsTest, trail) {
// Test empty
EXPECT_EQ("", trail("", 3));
}

// Checks xxhash64 over VARCHAR inputs against precomputed hash values, and
// that a NULL input yields a NULL result.
TEST_F(StringFunctionsTest, xxHash64FunctionVarchar) {
  const auto hashOf = [&](std::optional<std::string> text) {
    return evaluateOnce<int64_t>("xxhash64(c0)", VARCHAR(), text);
  };

  // NULL in, NULL out.
  EXPECT_EQ(std::nullopt, hashOf(std::nullopt));

  struct TestCase {
    const char* input;
    int64_t expectedHash;
  };
  const TestCase kTestCases[] = {
      // Empty string.
      {"", -1205034819632174695},
      // Hashing is case-sensitive.
      {"abc", 4952883123889572249},
      {"ABC", -1843406881296486760},
      {"string to xxhash64 as param", 9087872763436141786},
      {"special characters %_@", 6332497344822543626},
      {" leading space", -3364246049109667261},
      // Non-ASCII character mixed with ASCII.
      {"café", -7331673579364787606},
      // Literal backslash followed by "x00" (the escaped backslash means this
      // is NOT an embedded NUL byte).
      {"abc\\x00def", 160339756714205673},
      // Entirely non-ASCII string.
      {"日本語", 8176744303664166369},
  };

  for (const auto& testCase : kTestCases) {
    EXPECT_EQ(testCase.expectedHash, hashOf(testCase.input))
        << "input: " << testCase.input;
  }
}

0 comments on commit fb38cda

Please sign in to comment.