Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement inverse_f_cdf() Presto function #11281

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions velox/docs/functions/presto/math.rst
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,12 @@ Probability Functions: inverse_cdf
probability (p): P(N < n). The a, b parameters must be positive real values (all of type DOUBLE).
The probability p must lie on the interval [0, 1].

.. function:: inverse_f_cdf(df1, df2, p) -> double

Compute the inverse of the F cdf with the given ``df1`` (numerator degrees of freedom) and ``df2`` (denominator degrees of freedom) parameters
for the cumulative probability ``p``: P(N < n). Both ``df1`` and ``df2`` must be positive real numbers.
The probability ``p`` must lie on the interval [0, 1].

.. function:: inverse_weibull_cdf(a, b, p) -> double

Compute the inverse of the Weibull cdf with given parameters ``a``, ``b`` for the probability ``p``.
Expand Down
15 changes: 15 additions & 0 deletions velox/functions/prestosql/Probability.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,21 @@ struct InverseBetaCDFFunction {
}
};

/// Presto `inverse_f_cdf(df1, df2, p)`: the quantile (inverse CDF) of the F
/// distribution.
///
/// @param result Receives the value x such that the F cdf with the given
///        degrees of freedom equals `p`.
/// @param df1 Numerator degrees of freedom; must be positive and finite.
/// @param df2 Denominator degrees of freedom; must be positive and finite.
/// @param p Cumulative probability; must lie in [0, 1].
///
/// Invalid arguments are reported through VELOX_USER_CHECK* so that they are
/// raised as user errors with the messages spelled out below.
template <typename T>
struct InverseFCDFFunction {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE void
  call(double& result, double df1, double df2, double p) {
    // A NaN `p` compares false against both bounds, so the condition below is
    // false for NaN as well as for out-of-range values.
    VELOX_USER_CHECK((p >= 0) && (p <= 1), "p must be in the interval [0, 1]");
    VELOX_USER_CHECK_GT(df1, 0, "numerator df must be greater than 0");
    VELOX_USER_CHECK_GT(df2, 0, "denominator df must be greater than 0");

    // +infinity satisfies the `> 0` checks above. Boost's parameter validation
    // for fisher_f_distribution treats a non-finite df as a domain error and
    // (under the default policy) throws its own exception with a
    // Boost-internal message, so reject it here as a user error instead.
    // These checks come last so that the error precedence of the checks above
    // is unchanged.
    VELOX_USER_CHECK(
        (boost::math::isfinite)(df1), "numerator df must be finite");
    VELOX_USER_CHECK(
        (boost::math::isfinite)(df2), "denominator df must be finite");

    boost::math::fisher_f_distribution<> dist(df1, df2);
    result = boost::math::quantile(dist, p);
  }
};

template <typename T>
struct ChiSquaredCDFFunction {
VELOX_DEFINE_FUNCTION_TYPES(T);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ void registerProbTrigFunctions(const std::string& prefix) {
{prefix + "f_cdf"});
registerFunction<InverseBetaCDFFunction, double, double, double, double>(
{prefix + "inverse_beta_cdf"});
registerFunction<InverseFCDFFunction, double, double, double, double>(
{prefix + "inverse_f_cdf"});
registerFunction<InverseNormalCDFFunction, double, double, double, double>(
{prefix + "inverse_normal_cdf"});
registerFunction<PoissonCDFFunction, double, double, int32_t>(
Expand Down
67 changes: 67 additions & 0 deletions velox/functions/prestosql/tests/ProbabilityTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,73 @@ TEST_F(ProbabilityTest, invBetaCDF) {
VELOX_ASSERT_THROW(invBetaCDF(3, 5, 1.1), "p must be in the interval [0, 1]");
}

// Covers inverse_f_cdf(df1, df2, p): known quantiles, null propagation,
// extreme-but-valid degrees of freedom, and argument validation.
TEST_F(ProbabilityTest, inverseFCDF) {
  const auto inverseFCDF = [&](std::optional<double> df1,
                               std::optional<double> df2,
                               std::optional<double> p) {
    return evaluateOnce<double>("inverse_f_cdf(c0, c1, c2)", df1, df2, p);
  };

  // Known quantiles.
  EXPECT_EQ(inverseFCDF(2.0, 5.0, 0.0), 0.0);
  EXPECT_EQ(inverseFCDF(2.0, 5.0, 0.5), 0.79876977693223561);
  EXPECT_EQ(inverseFCDF(2.0, 5.0, 0.9), 3.779716078773951);

  // A null in any position yields null. The non-null arguments are kept valid
  // (p within [0, 1]) so these cases exercise null propagation only and do not
  // depend on nulls being handled before argument validation.
  EXPECT_EQ(inverseFCDF(2.0, 5.0, std::nullopt), std::nullopt);
  EXPECT_EQ(inverseFCDF(2.0, std::nullopt, 0.9), std::nullopt);
  EXPECT_EQ(inverseFCDF(std::nullopt, 5.0, 0.9), std::nullopt);

  // Extreme but valid degrees of freedom.
  EXPECT_EQ(inverseFCDF(kDoubleMax, 5.0, 1), kInf);
  EXPECT_EQ(inverseFCDF(1, kDoubleMax, 1), kInf);
  EXPECT_EQ(inverseFCDF(82.6, 901.10, 1), kInf);
  EXPECT_EQ(inverseFCDF(kDoubleMin, 50.620, 1), kInf);
  EXPECT_EQ(
      inverseFCDF(kBigIntMax, 5.0, 0.93256230095450132), 3.7797000000000009);
  EXPECT_EQ(inverseFCDF(76.901, kBigIntMax, 1), kInf);
  EXPECT_EQ(inverseFCDF(2.0, 5.0, 1), kInf);

  // Test invalid inputs for df1.
  VELOX_ASSERT_THROW(
      inverseFCDF(0, 3, 0.5), "numerator df must be greater than 0");
  VELOX_ASSERT_THROW(
      inverseFCDF(kBigIntMin, 5.0, 0.999),
      "numerator df must be greater than 0");

  // Test invalid inputs for df2.
  VELOX_ASSERT_THROW(
      inverseFCDF(3, 0, 0.5), "denominator df must be greater than 0");
  VELOX_ASSERT_THROW(
      inverseFCDF(2.0, kBigIntMin, 0.0001),
      "denominator df must be greater than 0");

  // Test invalid inputs for p, on both sides of the [0, 1] interval.
  VELOX_ASSERT_THROW(
      inverseFCDF(3, 5, -0.1), "p must be in the interval [0, 1]");
  VELOX_ASSERT_THROW(
      inverseFCDF(3, 5, 1.1), "p must be in the interval [0, 1]");
  VELOX_ASSERT_THROW(
      inverseFCDF(2.0, 5.0, kBigIntMin), "p must be in the interval [0, 1]");

  // Test a combination of invalid inputs: the p check is reported first.
  VELOX_ASSERT_THROW(
      inverseFCDF(-1.2, 0, -0.1), "p must be in the interval [0, 1]");
  VELOX_ASSERT_THROW(
      inverseFCDF(1, -kInf, -0.1), "p must be in the interval [0, 1]");
}


// TODO: remove this test after completion of fuzzer debug
//
// Reproduces a nested expression reported by the expression fuzzer:
//   inverse_f_cdf(
//       0.36946232430636883,
//       inverse_f_cdf(0.5703326677903533, 0.7623271467164159,
//                     0.13314712885767221),
//       0.9104626753833145)
// The expected numeric result is not known here, so the test only asserts that
// both evaluations complete without throwing.
TEST_F(ProbabilityTest, inverseFCDF_FUZZER) {
  const auto inverseFCDF = [&](std::optional<double> df1,
                               std::optional<double> df2,
                               std::optional<double> p) {
    return evaluateOnce<double>("inverse_f_cdf(c0, c1, c2)", df1, df2, p);
  };

  // Inner call: its result becomes the denominator df of the outer call.
  std::optional<double> innerResult;
  ASSERT_NO_THROW(
      innerResult = inverseFCDF(
          0.5703326677903533, 0.7623271467164159, 0.13314712885767221));

  // Outer call, fed with the inner result.
  EXPECT_NO_THROW(
      inverseFCDF(0.36946232430636883, innerResult, 0.9104626753833145));
}

TEST_F(ProbabilityTest, chiSquaredCDF) {
const auto chiSquaredCDF = [&](std::optional<double> df,
std::optional<double> value) {
Expand Down
Loading