From c956a6f8a840205144d2fa86fdb775f1e79b7f7a Mon Sep 17 00:00:00 2001 From: Iker Barriocanal <32816711+iker-barriocanal@users.noreply.github.com> Date: Mon, 26 Jun 2023 16:21:02 +0200 Subject: [PATCH] fix(spans): Truncate span descriptions for span metrics (#2246) --- Cargo.lock | 1 + relay-server/Cargo.toml | 1 + .../src/metrics_extraction/spans/mod.rs | 62 +++++++++++++- tests/integration/test_metrics.py | 82 +++++++++++++++++++ 4 files changed, 143 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8eb48fbeb0..793ae4519e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3308,6 +3308,7 @@ dependencies = [ "axum-server", "backoff", "brotli", + "bytecount", "bytes", "chrono", "data-encoding", diff --git a/relay-server/Cargo.toml b/relay-server/Cargo.toml index 87f8ac0321..58a5541418 100644 --- a/relay-server/Cargo.toml +++ b/relay-server/Cargo.toml @@ -30,6 +30,7 @@ axum = { version = "0.6.15", features = ["headers", "macros", "matched-path", "m axum-server = "0.4.7" backoff = "0.4.0" brotli = "3.3.4" +bytecount = "0.6.0" bytes = { version = "1.4.0" } chrono = { version = "0.4.24", default-features = false, features = ["clock", "std", "serde"] } data-encoding = "2.3.3" diff --git a/relay-server/src/metrics_extraction/spans/mod.rs b/relay-server/src/metrics_extraction/spans/mod.rs index 4ef25e0bfb..41c87b1746 100644 --- a/relay-server/src/metrics_extraction/spans/mod.rs +++ b/relay-server/src/metrics_extraction/spans/mod.rs @@ -186,11 +186,18 @@ pub(crate) fn extract_span_metrics( ); if let Some(scrubbed_desc) = sanitized_description { - span_tags.insert(SpanTagKey::Description, scrubbed_desc.to_owned()); + // Truncating the span description's tag value is, for now, + // a temporary solution to not get large descriptions dropped. The + // group tag mustn't be affected by this, and still be + // computed from the full, untruncated description. - let mut span_group = format!("{:?}", md5::compute(scrubbed_desc)); + let mut span_group = format!("{:?}", md5::compute(&scrubbed_desc)); span_group.truncate(16); span_tags.insert(SpanTagKey::Group, span_group); + + let truncated = + truncate_string(scrubbed_desc, aggregator_config.max_tag_value_length); + span_tags.insert(SpanTagKey::Description, truncated); } } @@ -303,6 +310,32 @@ fn sanitized_span_description( Some(sanitized) } +/// Trims the given string with the given maximum bytes. Splitting only happens +/// on char boundaries. +/// +/// If the string is short, it remains unchanged. If it's long, this method +/// truncates it to the maximum allowed size and sets the last character to +/// `*`. +fn truncate_string(mut string: String, max_bytes: usize) -> String { + if string.len() <= max_bytes { + return string; + } + + if max_bytes == 0 { + return String::new(); + } + + let mut cutoff = max_bytes - 1; // Leave space for `*` + + while cutoff > 0 && !string.is_char_boundary(cutoff) { + cutoff -= 1; + } + + string.truncate(cutoff); + string.push('*'); + string +} + /// Regex with a capture group to extract the database action from a query. /// /// Currently, we're only interested in either `SELECT` or `INSERT` statements. @@ -539,7 +572,30 @@ mod tests { use relay_metrics::AggregatorConfig; - use crate::metrics_extraction::spans::extract_span_metrics; + use crate::metrics_extraction::spans::{extract_span_metrics, truncate_string}; + + #[test] + fn test_truncate_string_no_panic() { + let string = "ÆÆ".to_owned(); + + let truncated = truncate_string(string.clone(), 0); + assert_eq!(truncated, ""); + + let truncated = truncate_string(string.clone(), 1); + assert_eq!(truncated, "*"); + + let truncated = truncate_string(string.clone(), 2); + assert_eq!(truncated, "*"); + + let truncated = truncate_string(string.clone(), 3); + assert_eq!(truncated, "Æ*"); + + let truncated = truncate_string(string.clone(), 4); + assert_eq!(truncated, "ÆÆ"); + + let truncated = truncate_string(string, 5); + assert_eq!(truncated, "ÆÆ"); + } macro_rules! span_transaction_method_test { // Tests transaction.method is picked from the right place. diff --git a/tests/integration/test_metrics.py b/tests/integration/test_metrics.py index afd662ec83..4497697413 100644 --- a/tests/integration/test_metrics.py +++ b/tests/integration/test_metrics.py @@ -1,3 +1,4 @@ +import hashlib import time from collections import defaultdict from datetime import datetime, timedelta, timezone @@ -1066,3 +1067,84 @@ def test_limit_custom_measurements( "d:transactions/measurements.foo@none", "d:transactions/measurements.bar@none", } + + +@pytest.mark.parametrize( + "sent_description, expected_description", + [ + ( + "SELECT column FROM table WHERE another_col = %s", + "SELECT column FROM table WHERE another_col = %s", + ), + ( + "SELECT column FROM table WHERE another_col = %s AND yet_another_col = something_very_longgggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg", + "SELECT column FROM table WHERE another_col = %s AND yet_another_col = something_very_longgggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg*", + ), + ], + ids=["Must not truncate short descriptions", "Must truncate long descriptions"], +) +def test_span_metrics( + transactions_consumer, + metrics_consumer, + mini_sentry, + relay_with_processing, + sent_description, + expected_description, +): + project_id = 42 + mini_sentry.add_full_project_config(project_id) + config = mini_sentry.project_configs[project_id]["config"] + config["transactionMetrics"] = { + "version": 1, + } + config.setdefault("features", []).append("projects:span-metrics-extraction") + + transaction = { + "event_id": "d2132d31b39445f1938d7e21b6bf0ec4", + "type": "transaction", + "transaction": "/organizations/:orgId/performance/:eventSlug/", + "transaction_info": {"source": "route"}, + "start_timestamp": 1597976392.6542819, + "timestamp": 1597976400.6189718, + "user": {"id": "user123", "geo": {"country_code": "ES"}}, + "contexts": { + "trace": { + "trace_id": "4C79F60C11214EB38604F4AE0781BFB2", + "span_id": "FA90FDEAD5F74052", + "type": "trace", + } + }, + "spans": [ + { + "description": sent_description, + "op": "db", + "parent_span_id": "8f5a2b8768cafb4e", + "span_id": "bd429c44b67a3eb4", + "start_timestamp": 1597976393.4619668, + "timestamp": 1597976393.4718769, + "trace_id": "ff62a8b040f340bda5d830223def1d81", + } + ], + } + # Default timestamp is so old that relay drops metrics, setting a more recent one avoids the drop. + timestamp = datetime.now(tz=timezone.utc) + transaction["timestamp"] = timestamp.isoformat() + + metrics_consumer = metrics_consumer() + tx_consumer = transactions_consumer() + processing = relay_with_processing(options=TEST_CONFIG) + processing.send_transaction(project_id, transaction) + + transaction, _ = tx_consumer.get_event() + assert transaction["spans"][0]["description"] == sent_description + + expected_group = hashlib.md5(sent_description.encode("utf-8")).hexdigest()[:16] + + metrics = metrics_consumer.get_metrics() + span_metrics = [ + metric for metric in metrics if metric["name"].startswith("spans", 2) + ] + assert len(span_metrics) == 3 + for metric in span_metrics: + assert metric["tags"]["span.description"] == expected_description + assert metric["tags"]["span.group"] == expected_group