diff --git a/snuba/querylog/__init__.py b/snuba/querylog/__init__.py index ca88a94547..980ec82658 100644 --- a/snuba/querylog/__init__.py +++ b/snuba/querylog/__init__.py @@ -11,7 +11,7 @@ ) from snuba.datasets.storage import StorageNotAvailable from snuba.query.exceptions import QueryPlanException -from snuba.querylog.query_metadata import SnubaQueryMetadata, Status +from snuba.querylog.query_metadata import QueryStatus, SnubaQueryMetadata, Status from snuba.request import Request from snuba.utils.metrics.timer import Timer from snuba.utils.metrics.wrapper import MetricsWrapper @@ -32,6 +32,7 @@ def _record_timer_metrics( app_id = request.attribution_info.app_id.key or "none" parent_api = request.attribution_info.parent_api or "none" tags = { + "status": query_metadata.status.value, "request_status": query_metadata.request_status.value, "slo": query_metadata.slo.value, "referrer": referrer, @@ -51,6 +52,7 @@ def _record_timer_metrics( # As a result, its status and SLO values are not based on its query_list status = get_request_status(result) tags = { + "status": QueryStatus.ERROR.value, "request_status": status.status.value, "slo": status.slo.value, "referrer": referrer, @@ -153,7 +155,9 @@ def record_invalid_request( it records failures during parsing/validation. This is for client errors. """ - _record_failure_building_request(request_status, timer, referrer) + _record_failure_building_request( + QueryStatus.INVALID_REQUEST, request_status, timer, referrer + ) def record_error_building_request( @@ -164,10 +168,11 @@ def record_error_building_request( it records failures during parsing/validation. This is for system errors during parsing/validation. """ - _record_failure_building_request(request_status, timer, referrer) + _record_failure_building_request(QueryStatus.ERROR, request_status, timer, referrer) def _record_failure_building_request( + status: QueryStatus, request_status: Status, timer: Timer, referrer: Optional[str], @@ -178,6 +183,7 @@ def _record_failure_building_request( timer.send_metrics_to( metrics, tags={ + "status": status.value, "referrer": referrer or "none", "request_status": request_status.status.value, "slo": request_status.slo.value, diff --git a/snuba/querylog/query_metadata.py b/snuba/querylog/query_metadata.py index 6a4a1b8847..543bf34bf1 100644 --- a/snuba/querylog/query_metadata.py +++ b/snuba/querylog/query_metadata.py @@ -16,9 +16,31 @@ from snuba.utils.metrics.timer import Timer +class QueryStatus(Enum): + SUCCESS = "success" + ERROR = "error" # A system error + RATE_LIMITED = "rate-limited" + INVALID_REQUEST = "invalid-request" + TIMEOUT = "timeout" + + +CLICKHOUSE_ERROR_TO_SNUBA_ERROR_MAPPINGS = { + ErrorCodes.TOO_SLOW: QueryStatus.TIMEOUT, + ErrorCodes.TIMEOUT_EXCEEDED: QueryStatus.TIMEOUT, + ErrorCodes.SOCKET_TIMEOUT: QueryStatus.TIMEOUT, + ErrorCodes.NETWORK_ERROR: QueryStatus.TIMEOUT, +} + + +def get_query_status_from_error_codes(code: ErrorCodes) -> QueryStatus | None: + return CLICKHOUSE_ERROR_TO_SNUBA_ERROR_MAPPINGS.get(code) + + class RequestStatus(Enum): """ The different statuses we return for a request. + + TODO: This will replace QueryStatus, but both exist as we cut over. """ # Successfully returned a response @@ -165,6 +187,7 @@ class ClickhouseQueryMetadata: start_timestamp: Optional[datetime] end_timestamp: Optional[datetime] stats: Dict[str, Any] + status: QueryStatus request_status: Status profile: ClickhouseQueryProfile trace_id: Optional[str] = None @@ -179,7 +202,7 @@ def to_dict(self) -> snuba_queries_v1.QueryMetadata: "start_timestamp": start, "end_timestamp": end, "stats": self.stats, - "status": self.request_status.status.value, # TODO: Remove this from querylog schema + "status": self.status.value, "request_status": self.request_status.status.value, "slo": self.request_status.slo.value, "trace_id": self.trace_id, @@ -221,7 +244,7 @@ def to_dict(self) -> snuba_queries_v1.Querylog: "start_timestamp": start, "end_timestamp": end, "query_list": [q.to_dict() for q in self.query_list], - "status": self.request_status.value, # TODO: Remove this from querylog schema + "status": self.status.value, "request_status": self.request_status.value, "slo": self.slo.value, "timing": self.timer.for_json(), @@ -234,6 +257,12 @@ def to_dict(self) -> snuba_queries_v1.Querylog: request_dict["organization"] = org_id return request_dict + @property + def status(self) -> QueryStatus: + # If we do not have any recorded query and we did not specifically log + # invalid_query, we assume there was an error somewhere. + return self.query_list[-1].status if self.query_list else QueryStatus.ERROR + @property def request_status(self) -> RequestStatus: # If we do not have any recorded query and we did not specifically log diff --git a/snuba/web/db_query.py b/snuba/web/db_query.py index be0e253857..530a3464a6 100644 --- a/snuba/web/db_query.py +++ b/snuba/web/db_query.py @@ -48,12 +48,14 @@ from snuba.querylog.query_metadata import ( SLO, ClickhouseQueryMetadata, + QueryStatus, Status, + get_query_status_from_error_codes, get_request_status, ) from snuba.reader import Reader, Result from snuba.redis import RedisClientKey, get_redis_client -from snuba.state.cache.abstract import Cache +from snuba.state.cache.abstract import Cache, ExecutionTimeoutError from snuba.state.cache.redis.backend import RESULT_VALUE, RESULT_WAIT, RedisCache from snuba.state.quota import ResourceQuota from snuba.state.rate_limit import ( @@ -125,6 +127,7 @@ def update_query_metadata_and_stats( query_metadata_list: MutableSequence[ClickhouseQueryMetadata], query_settings: Mapping[str, Any], trace_id: Optional[str], + status: QueryStatus, request_status: Status, profile_data: Optional[Dict[str, Any]] = None, error_code: Optional[int] = None, @@ -150,6 +153,7 @@ def update_query_metadata_and_stats( start_timestamp=start, end_timestamp=end, stats=dict(stats), + status=status, request_status=request_status, profile=generate_profile(query), trace_id=trace_id, @@ -594,16 +598,24 @@ def _raw_query( except Exception as cause: error_code = None trigger_rate_limiter = None + status = None request_status = get_request_status(cause) if isinstance(cause, RateLimitExceeded): + status = QueryStatus.RATE_LIMITED trigger_rate_limiter = cause.extra_data.get("scope", "") elif isinstance(cause, ClickhouseError): error_code = cause.code + status = get_query_status_from_error_codes(error_code) + with configure_scope() as scope: fingerprint = ["{{default}}", str(cause.code), dataset_name] if error_code not in constants.CLICKHOUSE_SYSTEMATIC_FAILURES: fingerprint.append(attribution_info.referrer) scope.fingerprint = fingerprint + elif isinstance(cause, TimeoutError): + status = QueryStatus.TIMEOUT + elif isinstance(cause, ExecutionTimeoutError): + status = QueryStatus.TIMEOUT if request_status.slo == SLO.AGAINST: logger.exception("Error running query: %s\n%s", sql, cause) @@ -613,6 +625,7 @@ def _raw_query( sentry_sdk.set_tag("slo_status", request_status.status.value) stats = update_with_status( + status=status or QueryStatus.ERROR, request_status=request_status, error_code=error_code, triggered_rate_limiter=str(trigger_rate_limiter), @@ -630,6 +643,7 @@ def _raw_query( ) from cause else: stats = update_with_status( + status=QueryStatus.SUCCESS, request_status=get_request_status(), profile_data=result["profile"], ) diff --git a/tests/datasets/test_querylog_processor.py b/tests/datasets/test_querylog_processor.py index 822628276c..3ef0125e5e 100644 --- a/tests/datasets/test_querylog_processor.py +++ b/tests/datasets/test_querylog_processor.py @@ -17,6 +17,7 @@ ClickhouseQueryMetadata, ClickhouseQueryProfile, FilterProfile, + QueryStatus, RequestStatus, SnubaQueryMetadata, Status, @@ -78,6 +79,7 @@ def test_simple() -> None: "error_code": 386, "triggered_rate_limiter": "test_rate_limiter", }, + status=QueryStatus.SUCCESS, request_status=Status(RequestStatus.SUCCESS), profile=ClickhouseQueryProfile( time_range=10, @@ -200,6 +202,7 @@ def test_missing_fields() -> None: start_timestamp=None, end_timestamp=None, stats={"sample": 10}, + status=QueryStatus.SUCCESS, request_status=Status(RequestStatus.SUCCESS), profile=ClickhouseQueryProfile( time_range=10, @@ -222,15 +225,14 @@ def test_missing_fields() -> None: ).to_dict() messages = [] - first = dict(deepcopy(orig_message)) + first = deepcopy(orig_message) del first["timing"] del first["status"] messages.append(first) - second = dict(deepcopy(orig_message)) + second = deepcopy(orig_message) second["timing"] = None second["status"] = None - messages.append(second) for message in messages: processor = ( @@ -328,6 +330,7 @@ def test_negative_project_id_fields() -> None: start_timestamp=None, end_timestamp=None, stats={"sample": 10}, + status=QueryStatus.SUCCESS, request_status=Status(RequestStatus.SUCCESS), profile=ClickhouseQueryProfile( time_range=10, diff --git a/tests/test_snql_api.py b/tests/test_snql_api.py index 8036588fa7..057a10dfd9 100644 --- a/tests/test_snql_api.py +++ b/tests/test_snql_api.py @@ -861,7 +861,7 @@ def test_timing_metrics_tags(self) -> None: metric_calls = get_recorded_metric_calls("timing", "api.query") assert metric_calls is not None assert len(metric_calls) == 1 - assert metric_calls[0].tags["request_status"] == "success" + assert metric_calls[0].tags["status"] == "success" assert metric_calls[0].tags["referrer"] == "test" assert metric_calls[0].tags["parent_api"] == "some/endpoint" assert metric_calls[0].tags["final"] == "False" diff --git a/tests/web/test_db_query.py b/tests/web/test_db_query.py index fbb7b00f4d..043ebbb15f 100644 --- a/tests/web/test_db_query.py +++ b/tests/web/test_db_query.py @@ -265,7 +265,7 @@ def test_db_query_fail() -> None: ) assert len(query_metadata_list) == 1 - assert query_metadata_list[0].request_status.status.value == "error" + assert query_metadata_list[0].status.value == "error" assert excinfo.value.extra["stats"] == stats assert excinfo.value.extra["sql"] is not None