From cfe6365dc630cc9c90d24cdc721c224257610375 Mon Sep 17 00:00:00 2001 From: Tony Xiao Date: Mon, 25 Mar 2024 13:23:31 -0400 Subject: [PATCH] feat(mapping-optimizer): Support in operator for mapping optimizer This was a TODO item. But on the spans dataset, one easy to encounter situation is a condition like `sentry_tags[key] IN (value1, value2)`. This results in a sql like `in((arrayElement(sentry_tags.value, indexOf(sentry_tags.key, 'key')) AS `_snuba_sentry_tags[key]`), ['value1', 'value1'])` which scans the entire `sentry_tags.key` and `sentry_tags.value` columns. The optimization here is to use the tags hash map which gives us a condition like `hasAny(_sentry_tags_hash_map, array(cityHash64('environment=prod'), cityHash64('environment=production')))`. --- .../processors/physical/mapping_optimizer.py | 156 ++++++++++++++---- .../processors/test_mapping_optimizer.py | 70 ++++++++ 2 files changed, 191 insertions(+), 35 deletions(-) diff --git a/snuba/query/processors/physical/mapping_optimizer.py b/snuba/query/processors/physical/mapping_optimizer.py index 13f5f73960..0e1650a5b5 100644 --- a/snuba/query/processors/physical/mapping_optimizer.py +++ b/snuba/query/processors/physical/mapping_optimizer.py @@ -1,6 +1,6 @@ import logging from enum import Enum -from typing import Optional, Tuple +from typing import Optional, Tuple, cast from snuba import environment from snuba.clickhouse.query import Query @@ -13,6 +13,7 @@ ) from snuba.query.conditions import ( BooleanFunctions, + ConditionFunctions, combine_and_conditions, combine_or_conditions, get_first_level_and_conditions, @@ -90,9 +91,8 @@ def __init__( self.__killswitch = killswitch self.__value_subcolumn_name = value_subcolumn_name - # TODO: Add the support for IN conditions. - self.__optimizable_pattern = FunctionCall( - function_name=String("equals"), + self.__equals_condition_pattern = FunctionCall( + function_name=String(ConditionFunctions.EQ), parameters=( Or( [ @@ -106,6 +106,19 @@ def __init__( Param("right_hand_side", Literal(Any(str))), ), ) + self.__in_condition_pattern = FunctionCall( + function_name=String(ConditionFunctions.IN), + parameters=( + mapping_pattern, + Param( + "right_hand_side", + FunctionCall( + Or([String("array"), String("tuple")]), + all_parameters=Literal(), + ), + ), + ), + ) self.__tag_exists_patterns = [ FunctionCall( function_name=String("notEquals"), @@ -155,12 +168,18 @@ def __classify_combined_conditions(self, condition: Expression) -> ConditionClas def __classify_condition(self, condition: Expression) -> ConditionClass: # Expects this to be an individual condition - match = self.__optimizable_pattern.match(condition) + equals_condition_match = self.__equals_condition_pattern.match(condition) + in_condition_match = ( + self.__in_condition_pattern.match(condition) + if equals_condition_match is None + else None + ) if ( - match is not None - and match.string(KEY_COL_MAPPING_PARAM) == f"{self.__column_name}.key" + equals_condition_match is not None + and equals_condition_match.string(KEY_COL_MAPPING_PARAM) + == f"{self.__column_name}.key" ): - rhs = match.expression("right_hand_side") + rhs = equals_condition_match.expression("right_hand_side") assert isinstance(rhs, LiteralExpr) return ( ConditionClass.NOT_OPTIMIZABLE @@ -168,7 +187,20 @@ def __classify_condition(self, condition: Expression) -> ConditionClass: if rhs.value == "" else ConditionClass.OPTIMIZABLE ) - elif match is None: + if ( + in_condition_match is not None + and in_condition_match.string(KEY_COL_MAPPING_PARAM) + == f"{self.__column_name}.key" + ): + rhs = in_condition_match.expression("right_hand_side") + assert isinstance(rhs, FunctionExpr) + + params = rhs.parameters + for param in params: + assert isinstance(param, LiteralExpr) + + return ConditionClass.OPTIMIZABLE + elif equals_condition_match is None and in_condition_match is None: # If this condition is not matching an optimizable condition, # check that it does not reference the optimizable column. # If it does, it means we should not optimize this query. @@ -183,35 +215,89 @@ def __classify_condition(self, condition: Expression) -> ConditionClass: return ConditionClass.IRRELEVANT def __replace_with_hash(self, condition: Expression) -> Expression: - match = self.__optimizable_pattern.match(condition) + equals_condition_match = self.__equals_condition_pattern.match(condition) + in_condition_match = ( + self.__in_condition_pattern.match(condition) + if equals_condition_match is None + else None + ) if ( - match is None - or match.string(KEY_COL_MAPPING_PARAM) != f"{self.__column_name}.key" + equals_condition_match is not None + and equals_condition_match.string(KEY_COL_MAPPING_PARAM) + == f"{self.__column_name}.key" ): - return condition - rhs = match.expression("right_hand_side") - assert isinstance(rhs, LiteralExpr) - key = match.scalar(KEY_MAPPING_PARAM) - assert isinstance(key, (str, int)) - if isinstance(key, str): - key = key.translate(ESCAPE_TRANSLATION) - - return FunctionExpr( - alias=condition.alias, - function_name="has", - parameters=( - Column( - alias=None, - table_name=match.optional_string(TABLE_MAPPING_PARAM), - column_name=self.__hash_map_name, + rhs = equals_condition_match.expression("right_hand_side") + assert isinstance(rhs, LiteralExpr) + key = equals_condition_match.scalar(KEY_MAPPING_PARAM) + assert isinstance(key, (str, int)) + if isinstance(key, str): + key = key.translate(ESCAPE_TRANSLATION) + + return FunctionExpr( + alias=condition.alias, + function_name="has", + parameters=( + Column( + alias=None, + table_name=equals_condition_match.optional_string( + TABLE_MAPPING_PARAM + ), + column_name=self.__hash_map_name, + ), + FunctionExpr( + alias=None, + function_name="cityHash64", + parameters=(LiteralExpr(None, f"{key}={rhs.value}"),), + ), ), - FunctionExpr( - alias=None, - function_name="cityHash64", - parameters=(LiteralExpr(None, f"{key}={rhs.value}"),), + ) + elif ( + in_condition_match is not None + and in_condition_match.string(KEY_COL_MAPPING_PARAM) + == f"{self.__column_name}.key" + ): + key = in_condition_match.scalar(KEY_MAPPING_PARAM) + assert isinstance(key, (str, int)) + if isinstance(key, str): + key = key.translate(ESCAPE_TRANSLATION) + + rhs = in_condition_match.expression("right_hand_side") + assert isinstance(rhs, FunctionExpr) + for param in rhs.parameters: + assert isinstance(param, LiteralExpr) + params = cast(list[LiteralExpr], rhs.parameters) + + return FunctionExpr( + alias=condition.alias, + function_name="hasAny", + parameters=( + Column( + alias=None, + table_name=in_condition_match.optional_string( + TABLE_MAPPING_PARAM + ), + column_name=self.__hash_map_name, + ), + FunctionExpr( + alias=rhs.alias, + function_name="array", + parameters=tuple( + [ + FunctionExpr( + alias=None, + function_name="cityHash64", + parameters=( + LiteralExpr(None, f"{key}={lit.value}"), + ), + ) + for lit in params + ] + ), + ), ), - ), - ) + ) + + return condition def _get_condition_without_redundant_checks( self, condition: Expression, query: Query @@ -265,7 +351,7 @@ def _get_condition_without_redundant_checks( if tag_exist_match: matched_tag_exists_conditions[condition_id] = tag_exist_match if not tag_exist_match: - eq_match = self.__optimizable_pattern.match(cond) + eq_match = self.__equals_condition_pattern.match(cond) if eq_match: tag_eq_match_keys.add(eq_match.scalar(KEY_MAPPING_PARAM)) useful_conditions = [] diff --git a/tests/query/processors/test_mapping_optimizer.py b/tests/query/processors/test_mapping_optimizer.py index 03888c4ec9..4aa9cd93a4 100644 --- a/tests/query/processors/test_mapping_optimizer.py +++ b/tests/query/processors/test_mapping_optimizer.py @@ -278,6 +278,76 @@ nested_condition("tags", "my_tag", ConditionFunctions.EQ, "a"), id="Non optimizable having", ), + pytest.param( + build_query( + selected_columns=[column("event_id")], + condition=binary_condition( + ConditionFunctions.IN, + nested_expression("tags", "my_tag"), + FunctionCall( + None, + "tuple", + ( + Literal(None, "a"), + Literal(None, "b"), + Literal(None, "c"), + ), + ), + ), + ), + FunctionCall( + None, + "hasAny", + ( + column("_tags_hash_map", True), + FunctionCall( + None, + "array", + ( + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=a"),)), + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=b"),)), + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=c"),)), + ), + ), + ), + ), + id="Optimizable IN condition using tuple", + ), + pytest.param( + build_query( + selected_columns=[column("event_id")], + condition=binary_condition( + ConditionFunctions.IN, + nested_expression("tags", "my_tag"), + FunctionCall( + None, + "array", + ( + Literal(None, "a"), + Literal(None, "b"), + Literal(None, "c"), + ), + ), + ), + ), + FunctionCall( + None, + "hasAny", + ( + column("_tags_hash_map", True), + FunctionCall( + None, + "array", + ( + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=a"),)), + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=b"),)), + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=c"),)), + ), + ), + ), + ), + id="Optimizable IN condition using array", + ), ]