Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix snuba admin's query tracing to connect to right storage and query nodes #6268

Merged
merged 11 commits into from
Oct 24, 2024
24 changes: 23 additions & 1 deletion docker-compose.gcb.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ x-test-common: &test-common
- ".artifacts:/.artifacts"
environment:
SNUBA_SETTINGS: "$SNUBA_SETTINGS"
CLICKHOUSE_HOST: clickhouse
CLICKHOUSE_HOST: clickhouse.local
USE_REDIS_CLUSTER: "1"
REDIS_HOST: "redis-cluster"
REDIS_PORT: 7000
Expand Down Expand Up @@ -71,6 +71,13 @@ services:
KAFKA_TOOLS_LOG4J_LOGLEVEL: "WARN"
clickhouse:
image: "${CLICKHOUSE_IMAGE:-ghcr.io/getsentry/image-mirror-altinity-clickhouse-server:23.3.19.33.altinitystable}"
hostname: clickhouse.local
extra_hosts:
- "clickhouse.local:127.0.0.1" # Add entry to /etc/hosts file
ports:
- "8123:8123"
- "9000:9000"
- "9009:9009"
volumes:
- ./config/clickhouse/macros.xml:/etc/clickhouse-server/config.d/macros.xml
- ./config/clickhouse/zookeeper.xml:/etc/clickhouse-server/config.d/zookeeper.xml
Expand All @@ -85,6 +92,9 @@ services:
depends_on:
- zookeeper
image: "${CLICKHOUSE_IMAGE:-ghcr.io/getsentry/image-mirror-altinity-clickhouse-server:23.3.19.33.altinitystable}"
hostname: clickhouse-query.local
extra_hosts:
- "clickhouse-query.local:127.0.0.1" # Add entry to /etc/hosts file
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest rewording the comment to explain why this is desirable, rather than restating what the code does.

profiles: ["multi_node"]
volumes:
- ./test_distributed_migrations/config/clickhouse/zookeeper.xml:/etc/clickhouse-server/config.d/zookeeper.xml
Expand All @@ -97,6 +107,9 @@ services:
depends_on:
- zookeeper
image: "${CLICKHOUSE_IMAGE:-altinity/clickhouse-server:23.3.19.33.altinitystable}"
hostname: clickhouse-01.local
extra_hosts:
- "clickhouse-01.local:127.0.0.1" # Add entry to /etc/hosts file
profiles: ["multi_node"]
volumes:
- ./test_distributed_migrations/config/clickhouse/macros-01.xml:/etc/clickhouse-server/config.d/macros.xml
Expand All @@ -110,6 +123,9 @@ services:
depends_on:
- zookeeper
image: "${CLICKHOUSE_IMAGE:-altinity/clickhouse-server:23.3.19.33.altinitystable}"
hostname: clickhouse-02.local
extra_hosts:
- "clickhouse-02.local:127.0.0.1" # Add entry to /etc/hosts file
profiles: ["multi_node"]
volumes:
- ./test_distributed_migrations/config/clickhouse/macros-02.xml:/etc/clickhouse-server/config.d/macros.xml
Expand All @@ -123,6 +139,9 @@ services:
depends_on:
- zookeeper
image: "${CLICKHOUSE_IMAGE:-altinity/clickhouse-server:23.3.19.33.altinitystable}"
hostname: clickhouse-03.local
extra_hosts:
- "clickhouse-03.local:127.0.0.1" # Add entry to /etc/hosts file
profiles: ["multi_node"]
volumes:
- ./test_distributed_migrations/config/clickhouse/macros-03.xml:/etc/clickhouse-server/config.d/macros.xml
Expand All @@ -136,6 +155,9 @@ services:
depends_on:
- zookeeper
image: "${CLICKHOUSE_IMAGE:-ghcr.io/getsentry/image-mirror-altinity-clickhouse-server:23.3.19.33.altinitystable}"
hostname: clickhouse-04.local
extra_hosts:
- "clickhouse-04.local:127.0.0.1" # Add entry to /etc/hosts file
profiles: ["multi_node"]
volumes:
- ./test_distributed_migrations/config/clickhouse/macros-04.xml:/etc/clickhouse-server/config.d/macros.xml
Expand Down
89 changes: 64 additions & 25 deletions snuba/admin/views.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from __future__ import annotations

import io
import socket
import sys
import time
from contextlib import redirect_stdout
from dataclasses import asdict
from datetime import datetime
from typing import Any, List, Mapping, Optional, Sequence, Tuple, cast
from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, cast

import sentry_sdk
import simplejson as json
Expand Down Expand Up @@ -498,14 +500,46 @@ def clickhouse_trace_query() -> Response:

profile_events_raw_sql = "SELECT ProfileEvents FROM system.query_log WHERE query_id = '{}' AND type = 'QueryFinish'"

for query_trace_data in parse_trace_for_query_ids(query_trace, storage):
for query_trace_data in parse_trace_for_query_ids(query_trace):
sql = profile_events_raw_sql.format(query_trace_data.query_id)
logger.info(
"Profile event gathering host: {}, port = {}, storage = {}, sql = {}, g.user = {}".format(
"Gathering profile event using host: {}, port = {}, storage = {}, sql = {}, g.user = {}".format(
query_trace_data.host, query_trace_data.port, storage, sql, g.user
)
)
# TODO: Onkar to add the profile event logic later.
system_query_result, counter = None, 0
while counter < 30:
# There is a race between the trace query and the 'SELECT ProfileEvents...' query. ClickHouse does not immediately
# return the rows for 'SELECT ProfileEvents...' query. To make it return rows, sleep between the query executions.
system_query_result = run_system_query_on_host_with_sql(
query_trace_data.host,
int(query_trace_data.port),
storage,
sql,
False,
g.user,
)
if not system_query_result.results:
time.sleep(1)
counter += 1
else:
break

if system_query_result is not None and len(system_query_result.results) > 0:
query_trace.profile_events_meta.append(system_query_result.meta)
query_trace.profile_events_profile = cast(
Dict[str, int], system_query_result.profile
)
columns = system_query_result.meta
if columns:
res = {}
res["column_names"] = [name for name, _ in columns]
res["rows"] = []
for query_result in system_query_result.results:
if query_result[0]:
res["rows"].append(json.dumps(query_result[0]))
query_trace.profile_events_results[query_trace_data.node_name] = res

return make_response(jsonify(asdict(query_trace)), 200)
except InvalidCustomQuery as err:
return make_response(
Expand All @@ -520,41 +554,46 @@ def clickhouse_trace_query() -> Response:
400,
)
except ClickhouseError as err:
logger.error(err, exc_info=True)
onkar marked this conversation as resolved.
Show resolved Hide resolved
details = {
"type": "clickhouse",
"message": str(err),
"code": err.code,
}
return make_response(jsonify({"error": details}), 400)
except Exception as err:
logger.error(err, exc_info=True)
return make_response(
jsonify({"error": {"type": "unknown", "message": str(err)}}),
500,
)


def parse_trace_for_query_ids(
trace_output: TraceOutput, storage_key: str
) -> List[QueryTraceData]:
result = []
def hostname_resolves(hostname: str) -> bool:
try:
socket.gethostbyname(hostname)
except socket.error:
return False
else:
return True


def parse_trace_for_query_ids(trace_output: TraceOutput) -> List[QueryTraceData]:
summarized_trace_output = trace_output.summarized_trace_output
storage_info = get_storage_info()
matched = next(
(info for info in storage_info if info["storage_name"] == storage_key), None
)
if matched is not None:
local_nodes = matched.get("local_nodes", [])
query_node = matched.get("query_node", None)
result = [
QueryTraceData(
host=local_nodes[0].get("host") if local_nodes else query_node.get("host"), # type: ignore
port=local_nodes[0].get("port") if local_nodes else query_node.get("port"), # type: ignore
query_id=query_summary.query_id,
node_name=node_name,
)
for node_name, query_summary in summarized_trace_output.query_summaries.items()
]
return result
node_name_to_query_id = {
node_name: query_summary.query_id
for node_name, query_summary in summarized_trace_output.query_summaries.items()
}
logger.info("node to query id mapping: {}".format(node_name_to_query_id))
return [
QueryTraceData(
host=node_name if hostname_resolves(node_name) else "127.0.0.1",
port=9000,
query_id=query_id,
node_name=node_name,
)
for node_name, query_id in node_name_to_query_id.items()
]


@application.route("/clickhouse_querylog_query", methods=["POST"])
Expand Down
1 change: 1 addition & 0 deletions tests/admin/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ def test_query_trace(admin_api: FlaskClient) -> None:
data = json.loads(response.data)
assert "<Debug> executeQuery" in data["trace_output"]
assert "summarized_trace_output" in data
assert "profile_events_results" in data


@pytest.mark.redis_db
Expand Down
Loading