Skip to content

Commit

Permalink
xxhash64 for Date and String types (#12170)
Browse files Browse the repository at this point in the history
Summary:

- Adds xxhash64 support for DATE and VARCHAR types.
- Adds test coverage

Differential Revision: D67885296
  • Loading branch information
pradeepvaka authored and facebook-github-bot committed Jan 25, 2025
1 parent dea4758 commit a6ffade
Show file tree
Hide file tree
Showing 7 changed files with 80 additions and 1 deletion.
2 changes: 1 addition & 1 deletion velox/functions/prestosql/BinaryFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ struct XxHash64Function {
void call(out_type<Varbinary>& result, const arg_type<Varbinary>& input) {
// Seed is set to 0.
int64_t hash = folly::Endian::swap64(XXH64(input.data(), input.size(), 0));
static const auto kLen = sizeof(int64_t);
static constexpr auto kLen = sizeof(int64_t);

// Resizing output and copy
result.resize(kLen);
Expand Down
17 changes: 17 additions & 0 deletions velox/functions/prestosql/DateTimeFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
*/
#pragma once

#define XXH_INLINE_ALL
#include <xxhash.h>

#include <string_view>
#include "velox/expression/ComplexViewTypes.h"
#include "velox/functions/lib/DateTimeFormatter.h"
Expand Down Expand Up @@ -1880,4 +1883,18 @@ struct ToMillisecondFunction {
}
};

/// xxhash64(date) → bigint
///
/// Produces the 64-bit xxHash of a DATE argument using seed 0. The argument
/// is first converted to int64_t, and the hash is taken over the
/// sizeof(int64_t) bytes of that converted value.
///
/// NOTE(review): BinaryFunctions.h also declares a struct template named
/// XxHash64Function — confirm the two definitions never end up in the same
/// translation unit.
template <typename T>
struct XxHash64Function {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE
  void call(out_type<int64_t>& result, const arg_type<Date>& input) {
    // XXH64 hashes a raw byte range, so keep the converted value in a local
    // and hand over its address and size.
    const int64_t widened = static_cast<int64_t>(input);
    result = XXH64(&widened, sizeof(widened), /*seed=*/0);
  }
};

} // namespace facebook::velox::functions
15 changes: 15 additions & 0 deletions velox/functions/prestosql/StringFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
*/
#pragma once

#define XXH_INLINE_ALL
#include <xxhash.h>

#include "velox/functions/Udf.h"
#include "velox/functions/lib/string/StringCore.h"
#include "velox/functions/lib/string/StringImpl.h"
Expand Down Expand Up @@ -650,4 +653,16 @@ struct NormalizeFunction {
}
};

/// xxhash64(varchar) → bigint
///
/// Produces the 64-bit xxHash of a VARCHAR argument using seed 0. The hash is
/// taken over the input.size() bytes starting at input.data().
///
/// NOTE(review): BinaryFunctions.h also declares a struct template named
/// XxHash64Function — confirm the two definitions never end up in the same
/// translation unit.
template <typename T>
struct XxHash64Function {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE
  void call(out_type<int64_t>& result, const arg_type<Varchar>& input) {
    const auto* bytes = input.data();
    const auto length = input.size();
    result = XXH64(bytes, length, /*seed=*/0);
  }
};

} // namespace facebook::velox::functions
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ void registerSimpleFunctions(const std::string& prefix) {

registerFunction<ToMillisecondFunction, int64_t, IntervalDayTime>(
{prefix + "to_milliseconds"});

registerFunction<XxHash64Function, int64_t, Date>({prefix + "xxhash64"});
}
} // namespace

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ void registerSimpleFunctions(const std::string& prefix) {
registerFunction<LevenshteinDistanceFunction, int64_t, Varchar, Varchar>(
{prefix + "levenshtein_distance"});
registerFunction<LengthFunction, int64_t, Varchar>({prefix + "length"});
registerFunction<XxHash64Function, int64_t, Varchar>({prefix + "xxhash64"});

// Length for varbinary has different semantics.
registerFunction<LengthVarbinaryFunction, int64_t, Varbinary>(
Expand Down
23 changes: 23 additions & 0 deletions velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
* limitations under the License.
*/

#define XXH_INLINE_ALL
#include <xxhash.h>

#include "velox/common/base/tests/GTestUtils.h"
#include "velox/external/date/tz.h"
#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"
Expand Down Expand Up @@ -5272,3 +5275,23 @@ TEST_F(DateTimeFunctionsTest, toMilliseconds) {
INTERVAL_DAY_TIME(),
std::optional<int64_t>(123)));
}

// Verifies xxhash64 over DATE inputs: a null input yields null, and a set of
// dates produces the pinned hash values below.
TEST_F(DateTimeFunctionsTest, xxHash64FunctionDate) {
  const auto hashOf = [&](std::optional<int32_t> value) {
    return evaluateOnce<int64_t>("xxhash64(c0)", DATE(), value);
  };

  // Null in, null out.
  EXPECT_EQ(std::nullopt, hashOf(std::nullopt));

  struct Sample {
    const char* date;
    int64_t expected;
  };
  const Sample kSamples[] = {
      {"1970-01-01", 3803688792395291579}, // Epoch
      {"2024-10-07", 3734916545851684445},
      {"2025-01-10", 1385444150471264300},
      {"1970-01-02", -6977822845260490347},
      {"2020-02-29", -5306598937769828126}, // Leap day
      {"9999-12-31", 3856043376106280085}, // Last day of year 9999
      {"2000-01-01", -7612541860844473816}, // Y2K
  };

  for (const auto& sample : kSamples) {
    // Identify the failing row, since every row shares one EXPECT_EQ line.
    SCOPED_TRACE(sample.date);
    EXPECT_EQ(sample.expected, hashOf(parseDate(sample.date)));
  }
}
21 changes: 21 additions & 0 deletions velox/functions/prestosql/tests/StringFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2271,3 +2271,24 @@ TEST_F(StringFunctionsTest, trail) {
// Test empty
EXPECT_EQ("", trail("", 3));
}

// Verifies xxhash64 over VARCHAR inputs: a null input yields null, and a set
// of strings produces the pinned hash values below.
TEST_F(StringFunctionsTest, xxHash64FunctionVarchar) {
  const auto hashOf = [&](std::optional<std::string> text) {
    return evaluateOnce<int64_t>("xxhash64(c0)", VARCHAR(), text);
  };

  // Null in, null out.
  EXPECT_EQ(std::nullopt, hashOf(std::nullopt));

  struct Sample {
    const char* input;
    int64_t expected;
  };
  const Sample kSamples[] = {
      {"", -1205034819632174695}, // Empty string
      {"abc", 4952883123889572249},
      {"ABC", -1843406881296486760}, // Differs from "abc" only by case
      {"string to xxhash64 as param", 9087872763436141786},
      {"special characters %_@", 6332497344822543626},
      {" leading space", -3364246049109667261},
      // Non-ASCII characters
      {"café", -7331673579364787606},
      // NOTE(review): this literal holds an escaped backslash followed by
      // "x00" (the characters '\', 'x', '0', '0'), not an embedded NUL byte,
      // so NUL handling is not actually exercised here.
      {"abc\\x00def", 160339756714205673},
      // Non-ASCII characters
      {"日本語", 8176744303664166369},
  };

  for (const auto& sample : kSamples) {
    // Identify the failing row, since every row shares one EXPECT_EQ line.
    SCOPED_TRACE(sample.input);
    EXPECT_EQ(sample.expected, hashOf(sample.input));
  }
}

0 comments on commit a6ffade

Please sign in to comment.