Skip to content

Commit

Permalink
feat(mapping-optimizer): Support in operator for mapping optimizer (#…
Browse files Browse the repository at this point in the history
…5691) (#5692)

Re-apply #5685

This was a TODO item. But on the spans dataset, one easy to encounter situation is a condition like
```
sentry_tags[key] IN (value1, value2)
```
This results in a sql like
```
in((arrayElement(sentry_tags.value, indexOf(sentry_tags.key, 'key')) AS `_snuba_sentry_tags[key]`), ['value1', 'value1'])
```
which scans the entire `sentry_tags.key` and `sentry_tags.value` columns. The optimization here is to use the tags hash map which gives us a condition like
```
hasAny(_sentry_tags_hash_map, array(cityHash64('key=value1'), cityHash64('key=value1')))
```
  • Loading branch information
Zylphrex authored Mar 27, 2024
1 parent 1406965 commit 4788281
Show file tree
Hide file tree
Showing 3 changed files with 228 additions and 35 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ jobs:
tests/sentry/search/events \
tests/sentry/event_manager \
tests/sentry/api/endpoints/test_organization_profiling_functions.py \
tests/snuba/api/endpoints/test_organization_events_stats_mep.py \
tests/snuba/test_snql_snuba.py \
tests/snuba/test_metrics_layer.py \
-vv --cov . --cov-report="xml:.artifacts/snuba.coverage.xml"
Expand Down
160 changes: 125 additions & 35 deletions snuba/query/processors/physical/mapping_optimizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from enum import Enum
from typing import Optional, Tuple
from typing import Optional, Tuple, cast

from snuba import environment
from snuba.clickhouse.query import Query
Expand All @@ -13,6 +13,7 @@
)
from snuba.query.conditions import (
BooleanFunctions,
ConditionFunctions,
combine_and_conditions,
combine_or_conditions,
get_first_level_and_conditions,
Expand Down Expand Up @@ -90,9 +91,8 @@ def __init__(
self.__killswitch = killswitch
self.__value_subcolumn_name = value_subcolumn_name

# TODO: Add the support for IN conditions.
self.__optimizable_pattern = FunctionCall(
function_name=String("equals"),
self.__equals_condition_pattern = FunctionCall(
function_name=String(ConditionFunctions.EQ),
parameters=(
Or(
[
Expand All @@ -106,6 +106,19 @@ def __init__(
Param("right_hand_side", Literal(Any(str))),
),
)
self.__in_condition_pattern = FunctionCall(
function_name=String(ConditionFunctions.IN),
parameters=(
mapping_pattern,
Param(
"right_hand_side",
FunctionCall(
Or([String("array"), String("tuple")]),
all_parameters=Literal(),
),
),
),
)
self.__tag_exists_patterns = [
FunctionCall(
function_name=String("notEquals"),
Expand Down Expand Up @@ -155,20 +168,43 @@ def __classify_combined_conditions(self, condition: Expression) -> ConditionClas

def __classify_condition(self, condition: Expression) -> ConditionClass:
# Expects this to be an individual condition
match = self.__optimizable_pattern.match(condition)
equals_condition_match = self.__equals_condition_pattern.match(condition)
in_condition_match = (
self.__in_condition_pattern.match(condition)
if equals_condition_match is None
else None
)
if (
match is not None
and match.string(KEY_COL_MAPPING_PARAM) == f"{self.__column_name}.key"
equals_condition_match is not None
and equals_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
rhs = match.expression("right_hand_side")
rhs = equals_condition_match.expression("right_hand_side")
assert isinstance(rhs, LiteralExpr)
return (
ConditionClass.NOT_OPTIMIZABLE
# ifNull(tags[asd], '') = '' is not optimizable.
if rhs.value == ""
else ConditionClass.OPTIMIZABLE
)
elif match is None:
if (
in_condition_match is not None
and in_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
rhs = in_condition_match.expression("right_hand_side")
assert isinstance(rhs, FunctionExpr)

params = rhs.parameters
for param in params:
assert isinstance(param, LiteralExpr)

# tags[foo] IN array('') is not optimizable
if param.value == "":
return ConditionClass.NOT_OPTIMIZABLE

return ConditionClass.OPTIMIZABLE
elif equals_condition_match is None and in_condition_match is None:
# If this condition is not matching an optimizable condition,
# check that it does not reference the optimizable column.
# If it does, it means we should not optimize this query.
Expand All @@ -183,35 +219,89 @@ def __classify_condition(self, condition: Expression) -> ConditionClass:
return ConditionClass.IRRELEVANT

def __replace_with_hash(self, condition: Expression) -> Expression:
match = self.__optimizable_pattern.match(condition)
equals_condition_match = self.__equals_condition_pattern.match(condition)
in_condition_match = (
self.__in_condition_pattern.match(condition)
if equals_condition_match is None
else None
)
if (
match is None
or match.string(KEY_COL_MAPPING_PARAM) != f"{self.__column_name}.key"
equals_condition_match is not None
and equals_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
return condition
rhs = match.expression("right_hand_side")
assert isinstance(rhs, LiteralExpr)
key = match.scalar(KEY_MAPPING_PARAM)
assert isinstance(key, (str, int))
if isinstance(key, str):
key = key.translate(ESCAPE_TRANSLATION)

return FunctionExpr(
alias=condition.alias,
function_name="has",
parameters=(
Column(
alias=None,
table_name=match.optional_string(TABLE_MAPPING_PARAM),
column_name=self.__hash_map_name,
rhs = equals_condition_match.expression("right_hand_side")
assert isinstance(rhs, LiteralExpr)
key = equals_condition_match.scalar(KEY_MAPPING_PARAM)
assert isinstance(key, (str, int))
if isinstance(key, str):
key = key.translate(ESCAPE_TRANSLATION)

return FunctionExpr(
alias=condition.alias,
function_name="has",
parameters=(
Column(
alias=None,
table_name=equals_condition_match.optional_string(
TABLE_MAPPING_PARAM
),
column_name=self.__hash_map_name,
),
FunctionExpr(
alias=None,
function_name="cityHash64",
parameters=(LiteralExpr(None, f"{key}={rhs.value}"),),
),
),
FunctionExpr(
alias=None,
function_name="cityHash64",
parameters=(LiteralExpr(None, f"{key}={rhs.value}"),),
)
elif (
in_condition_match is not None
and in_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
key = in_condition_match.scalar(KEY_MAPPING_PARAM)
assert isinstance(key, (str, int))
if isinstance(key, str):
key = key.translate(ESCAPE_TRANSLATION)

rhs = in_condition_match.expression("right_hand_side")
assert isinstance(rhs, FunctionExpr)
for param in rhs.parameters:
assert isinstance(param, LiteralExpr)
params = cast(list[LiteralExpr], rhs.parameters)

return FunctionExpr(
alias=condition.alias,
function_name="hasAny",
parameters=(
Column(
alias=None,
table_name=in_condition_match.optional_string(
TABLE_MAPPING_PARAM
),
column_name=self.__hash_map_name,
),
FunctionExpr(
alias=rhs.alias,
function_name="array",
parameters=tuple(
[
FunctionExpr(
alias=None,
function_name="cityHash64",
parameters=(
LiteralExpr(None, f"{key}={lit.value}"),
),
)
for lit in params
]
),
),
),
),
)
)

return condition

def _get_condition_without_redundant_checks(
self, condition: Expression, query: Query
Expand Down Expand Up @@ -265,7 +355,7 @@ def _get_condition_without_redundant_checks(
if tag_exist_match:
matched_tag_exists_conditions[condition_id] = tag_exist_match
if not tag_exist_match:
eq_match = self.__optimizable_pattern.match(cond)
eq_match = self.__equals_condition_pattern.match(cond)
if eq_match:
tag_eq_match_keys.add(eq_match.scalar(KEY_MAPPING_PARAM))
useful_conditions = []
Expand Down
102 changes: 102 additions & 0 deletions tests/query/processors/test_mapping_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,108 @@
nested_condition("tags", "my_tag", ConditionFunctions.EQ, "a"),
id="Non optimizable having",
),
pytest.param(
build_query(
selected_columns=[column("event_id")],
condition=binary_condition(
ConditionFunctions.IN,
nested_expression("tags", "my_tag"),
FunctionCall(
None,
"tuple",
(
Literal(None, "a"),
Literal(None, "b"),
Literal(None, "c"),
),
),
),
),
FunctionCall(
None,
"hasAny",
(
column("_tags_hash_map", True),
FunctionCall(
None,
"array",
(
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=a"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=b"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=c"),)),
),
),
),
),
id="Optimizable IN condition using tuple",
),
pytest.param(
build_query(
selected_columns=[column("event_id")],
condition=binary_condition(
ConditionFunctions.IN,
nested_expression("tags", "my_tag"),
FunctionCall(
None,
"array",
(
Literal(None, "a"),
Literal(None, "b"),
Literal(None, "c"),
),
),
),
),
FunctionCall(
None,
"hasAny",
(
column("_tags_hash_map", True),
FunctionCall(
None,
"array",
(
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=a"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=b"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=c"),)),
),
),
),
),
id="Optimizable IN condition using array",
),
pytest.param(
build_query(
selected_columns=[column("event_id")],
condition=binary_condition(
ConditionFunctions.IN,
nested_expression("tags", "my_tag"),
FunctionCall(
None,
"array",
(
Literal(None, "a"),
Literal(None, "b"),
Literal(None, ""),
),
),
),
),
binary_condition(
ConditionFunctions.IN,
nested_expression("tags", "my_tag"),
FunctionCall(
None,
"array",
(
Literal(None, "a"),
Literal(None, "b"),
Literal(None, ""),
),
),
),
id="Non optimizable IN condition with empty value",
),
]


Expand Down

0 comments on commit 4788281

Please sign in to comment.