Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xxhash64 for Date and String types #12170

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion velox/functions/prestosql/BinaryFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ struct XxHash64Function {
void call(out_type<Varbinary>& result, const arg_type<Varbinary>& input) {
// Seed is set to 0.
int64_t hash = folly::Endian::swap64(XXH64(input.data(), input.size(), 0));
static const auto kLen = sizeof(int64_t);
static constexpr auto kLen = sizeof(int64_t);

// Resizing output and copy
result.resize(kLen);
Expand Down
17 changes: 17 additions & 0 deletions velox/functions/prestosql/DateTimeFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
*/
#pragma once

#define XXH_INLINE_ALL
#include <xxhash.h>

#include <string_view>
#include "velox/expression/ComplexViewTypes.h"
#include "velox/functions/lib/DateTimeFormatter.h"
Expand Down Expand Up @@ -1880,4 +1883,18 @@ struct ToMillisecondFunction {
}
};

/// xxhash64(date) → bigint
///
/// Computes the 64-bit xxHash (seed 0) of a Date value. The input is widened
/// to int64_t and the sizeof(int64_t) bytes of that widened value are hashed;
/// the digest is then stored in the signed bigint result.
///
/// NOTE(review): the bytes are hashed as they sit in host memory, so the
/// result presumably differs between little- and big-endian hosts — confirm
/// whether that matters for the supported platforms.
/// NOTE(review): BinaryFunctions.h also declares a struct named
/// XxHash64Function; confirm the two headers never end up in the same
/// translation unit, otherwise the definitions would clash.
template <typename T>
struct XxHash64Function {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE
  void call(out_type<int64_t>& result, const arg_type<Date>& input) {
    // XXH64 consumes raw bytes, so materialize the widened date in a local
    // whose address can be handed over.
    const auto widenedDate = static_cast<int64_t>(input);
    const auto digest = XXH64(&widenedDate, sizeof(widenedDate), /*seed=*/0);
    // Make the conversion from XXH64's digest to the signed result explicit.
    result = static_cast<int64_t>(digest);
  }
};

} // namespace facebook::velox::functions
15 changes: 15 additions & 0 deletions velox/functions/prestosql/StringFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
*/
#pragma once

#define XXH_INLINE_ALL
#include <xxhash.h>

#include "velox/functions/Udf.h"
#include "velox/functions/lib/string/StringCore.h"
#include "velox/functions/lib/string/StringImpl.h"
Expand Down Expand Up @@ -650,4 +653,16 @@ struct NormalizeFunction {
}
};

/// xxhash64(varchar) → bigint
///
/// Computes the 64-bit xxHash (seed 0) over the input.size() bytes starting
/// at input.data() and stores the digest in the signed bigint result.
///
/// NOTE(review): BinaryFunctions.h also declares a struct named
/// XxHash64Function; confirm the two headers never end up in the same
/// translation unit, otherwise the definitions would clash.
template <typename T>
struct XxHash64Function {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE
  void call(out_type<int64_t>& result, const arg_type<Varchar>& input) {
    const auto digest = XXH64(input.data(), input.size(), /*seed=*/0);
    // Make the conversion from XXH64's digest to the signed result explicit.
    result = static_cast<int64_t>(digest);
  }
};

} // namespace facebook::velox::functions
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ void registerSimpleFunctions(const std::string& prefix) {

registerFunction<ToMillisecondFunction, int64_t, IntervalDayTime>(
{prefix + "to_milliseconds"});

registerFunction<XxHash64Function, int64_t, Date>({prefix + "xxhash64"});
}
} // namespace

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ void registerSimpleFunctions(const std::string& prefix) {
registerFunction<LevenshteinDistanceFunction, int64_t, Varchar, Varchar>(
{prefix + "levenshtein_distance"});
registerFunction<LengthFunction, int64_t, Varchar>({prefix + "length"});
registerFunction<XxHash64Function, int64_t, Varchar>({prefix + "xxhash64"});

// Length for varbinary have different semantics.
registerFunction<LengthVarbinaryFunction, int64_t, Varbinary>(
Expand Down
23 changes: 23 additions & 0 deletions velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
* limitations under the License.
*/

#define XXH_INLINE_ALL
#include <xxhash.h>

#include "velox/common/base/tests/GTestUtils.h"
#include "velox/external/date/tz.h"
#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"
Expand Down Expand Up @@ -5272,3 +5275,23 @@ TEST_F(DateTimeFunctionsTest, toMilliseconds) {
INTERVAL_DAY_TIME(),
std::optional<int64_t>(123)));
}

// Checks xxhash64 over DATE inputs against fixed expected digests.
TEST_F(DateTimeFunctionsTest, xxHash64FunctionDate) {
  // Evaluates "xxhash64(c0)" with a single optional DATE argument.
  const auto hashOfDate = [&](std::optional<int32_t> value) {
    return evaluateOnce<int64_t>("xxhash64(c0)", DATE(), value);
  };

  // Null in, null out.
  EXPECT_EQ(std::nullopt, hashOfDate(std::nullopt));

  // The epoch itself, two recent dates, and the day right after the epoch.
  EXPECT_EQ(3803688792395291579, hashOfDate(parseDate("1970-01-01")));
  EXPECT_EQ(3734916545851684445, hashOfDate(parseDate("2024-10-07")));
  EXPECT_EQ(1385444150471264300, hashOfDate(parseDate("2025-01-10")));
  EXPECT_EQ(-6977822845260490347, hashOfDate(parseDate("1970-01-02")));

  // February 29th of a leap year.
  EXPECT_EQ(-5306598937769828126, hashOfDate(parseDate("2020-02-29")));

  // Last day of year 9999 (presumably the upper end of the supported range).
  EXPECT_EQ(3856043376106280085, hashOfDate(parseDate("9999-12-31")));

  // First day of year 2000.
  EXPECT_EQ(-7612541860844473816, hashOfDate(parseDate("2000-01-01")));
}
21 changes: 21 additions & 0 deletions velox/functions/prestosql/tests/StringFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2271,3 +2271,24 @@ TEST_F(StringFunctionsTest, trail) {
// Test empty
EXPECT_EQ("", trail("", 3));
}

// Checks xxhash64 over VARCHAR inputs against fixed expected digests.
TEST_F(StringFunctionsTest, xxHash64FunctionVarchar) {
  // Evaluates "xxhash64(c0)" with a single optional VARCHAR argument.
  const auto hashOfString = [&](std::optional<std::string> text) {
    return evaluateOnce<int64_t>("xxhash64(c0)", VARCHAR(), text);
  };

  // Null in, null out.
  EXPECT_EQ(std::nullopt, hashOfString(std::nullopt));

  // Empty string, case sensitivity, and assorted ASCII content.
  EXPECT_EQ(-1205034819632174695, hashOfString(""));
  EXPECT_EQ(4952883123889572249, hashOfString("abc"));
  EXPECT_EQ(-1843406881296486760, hashOfString("ABC"));
  EXPECT_EQ(9087872763436141786, hashOfString("string to xxhash64 as param"));
  EXPECT_EQ(6332497344822543626, hashOfString("special characters %_@"));
  EXPECT_EQ(-3364246049109667261, hashOfString(" leading space"));

  // Accented character.
  EXPECT_EQ(-7331673579364787606, hashOfString("café"));

  // The backslash is escaped, so this input is the literal text `\x00`
  // between "abc" and "def"; it does not contain an embedded NUL byte.
  EXPECT_EQ(160339756714205673, hashOfString("abc\\x00def"));

  // Non-ASCII (Japanese) text.
  EXPECT_EQ(8176744303664166369, hashOfString("日本語"));
}