Merge branch 'master' into release-library/0.8.57
Dav1dde authored Apr 15, 2024
2 parents 21b5457 + 60fa63b commit 0dd02e7
Showing 4 changed files with 156 additions and 64 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -26,6 +26,7 @@
- Extract `cache.item_size` and `cache.hit` metrics. ([#3371](https://github.com/getsentry/relay/pull/3371))
- Optionally convert segment spans to transactions for compatibility. ([#3375](https://github.com/getsentry/relay/pull/3375))
- Add feature flag for replay video event types. ([#3402](https://github.com/getsentry/relay/pull/3402))
- Extract scrubbed IP addresses into the `span.domain` tag. ([#3383](https://github.com/getsentry/relay/pull/3383))

**Internal**:

13 changes: 0 additions & 13 deletions py/CHANGELOG.md
@@ -2,19 +2,6 @@

## 0.8.57

### Various fixes & improvements

- release: 0.5.87 (841aeb56) by @getsentry-bot
- Adding DataCategory::MetricHour (#3384) by @corps
- release: 0.8.56 (d1ba4507) by @getsentry-bot
- feat(feedback): Emit outcomes for user feedback events (#3026) by @cmanallen
- release: 0.8.55 (109ac110) by @getsentry-bot
- feat(profiles): Add a new category to count profile chunks (#3303) by @phacops
- release: 0.8.52 (65defad1) by @getsentry-bot
- feat(profiles): Add a data category for continuous profiling (#3284) by @phacops

## 0.5.87

- Add a data category for metric hours. [#3384](https://github.com/getsentry/relay/pull/3384)

## 0.8.56
168 changes: 130 additions & 38 deletions relay-event-normalization/src/normalize/span/description/mod.rs
@@ -6,11 +6,12 @@ use once_cell::sync::Lazy;
pub use sql::{scrub_queries, Mode};

use std::borrow::Cow;
use std::net::{Ipv4Addr, Ipv6Addr};
use std::path::Path;

use itertools::Itertools;
use relay_event_schema::protocol::Span;
use url::Url;
use url::{Host, Url};

use crate::regexes::{
DB_SQL_TRANSACTION_CORE_DATA_REGEX, DB_SUPABASE_REGEX, REDIS_COMMAND_REGEX,
@@ -30,6 +31,9 @@ const MAX_SEGMENT_LENGTH: usize = 25;
/// Some bundlers attach characters to the end of a filename, try to catch those.
const MAX_EXTENSION_LENGTH: usize = 10;

/// Domain names that are preserved during scrubbing.
const DOMAIN_ALLOW_LIST: &[&str] = &["localhost"];

/// Attempts to replace identifiers in the span description with placeholders.
///
/// Returns `None` if no scrubbing can be performed.
@@ -165,9 +169,9 @@ fn scrub_http(string: &str) -> Option<String> {

let scrubbed = match Url::parse(url) {
Ok(url) => {
let host = url.host().map(|h| h.to_string())?;
let domain = normalize_domain(host.as_str(), url.port())?;
let scheme = url.scheme();
let scrubbed_host = url.host().map(scrub_host);
let domain = concatenate_host_and_port(scrubbed_host.as_deref(), url.port());

format!("{method} {scheme}://{domain}")
}
@@ -197,51 +201,119 @@ fn scrub_file(description: &str) -> Option<String> {
}
}

/// Normalize the domain and port of a URL.
/// Scrub a [`Host`] object.
///
/// Domain names are run through a scrubber. All IP addresses except well known ones are replaced with a scrubbed variant.
/// Returns the scrubbed value as a string.
///
/// # Examples
///
/// ```
/// use url::{Host, Url};
/// use std::net::{Ipv4Addr, Ipv6Addr};
/// use relay_event_normalization::span::description::scrub_host;
///
/// assert_eq!(scrub_host(Host::Domain("foo.bar.baz")), "*.bar.baz");
/// assert_eq!(scrub_host(Host::Ipv4(Ipv4Addr::LOCALHOST)), "127.0.0.1");
/// ```
pub fn scrub_host(host: Host<&str>) -> Cow<'_, str> {
match host {
Host::Ipv4(ip) => Cow::Borrowed(scrub_ipv4(ip)),
Host::Ipv6(ip) => Cow::Borrowed(scrub_ipv6(ip)),
Host::Domain(domain) => scrub_domain_name(domain),
}
}

/// Scrub an IPv4 address.
///
/// Allow well-known IPs like loopback, and fully scrub out all other IPs.
/// Returns the scrubbed value as a string.
///
/// # Examples
///
/// ```
/// use std::net::Ipv4Addr;
/// use relay_event_normalization::span::description::{scrub_ipv4};
///
/// assert_eq!(scrub_ipv4(Ipv4Addr::LOCALHOST), "127.0.0.1");
/// assert_eq!(scrub_ipv4(Ipv4Addr::new(8, 8, 8, 8)), "*.*.*.*");
/// ```
pub fn scrub_ipv4(ip: Ipv4Addr) -> &'static str {
match ip {
Ipv4Addr::LOCALHOST => "127.0.0.1",
_ => "*.*.*.*",
}
}

/// Scrub an IPv6 address.
///
/// # Arguments
/// # Examples
///
/// * `domain` - The domain of the URL.
/// * `port` - The port of the URL.
/// ```
/// use std::net::Ipv6Addr;
/// use relay_event_normalization::span::description::{scrub_ipv6};
///
/// # Returns
/// assert_eq!(scrub_ipv6(Ipv6Addr::LOCALHOST), "::1");
/// assert_eq!(scrub_ipv6(Ipv6Addr::new(8, 8, 8, 8, 8, 8, 8, 8)), "*:*:*:*:*:*:*:*");
/// ```
pub fn scrub_ipv6(ip: Ipv6Addr) -> &'static str {
match ip {
Ipv6Addr::LOCALHOST => "::1",
_ => "*:*:*:*:*:*:*:*",
}
}

/// Sanitize a qualified domain string.
///
/// Replace all but the last two segments with asterisks.
/// Returns a string. In cases where the string is not domain-like, returns the original string.
///
/// # Examples
///
/// The normalized domain and port as a `String`, or `None` if normalization fails.
pub fn normalize_domain(domain: &str, port: Option<u16>) -> Option<String> {
if let Some(allow_listed) = normalized_domain_from_allowlist(domain, port) {
return Some(allow_listed);
/// ```
/// use relay_event_normalization::span::description::scrub_domain_name;
///
/// assert_eq!(scrub_domain_name("my.domain.com"), "*.domain.com");
/// assert_eq!(scrub_domain_name("hello world"), "hello world");
/// ```
pub fn scrub_domain_name(domain: &str) -> Cow<'_, str> {
if DOMAIN_ALLOW_LIST.contains(&domain) {
return Cow::Borrowed(domain);
}

let mut tokens = domain.rsplitn(3, '.');
let tld = tokens.next();
let domain = tokens.next();
let prefix = tokens.next().map(|_| "*");

let mut replaced = prefix
.iter()
.chain(domain.iter())
.chain(tld.iter())
.join(".");

if let Some(port) = port {
replaced = format!("{replaced}:{port}");
}

if replaced.is_empty() {
return None;
}
Some(replaced)
Cow::Owned(
prefix
.iter()
.chain(domain.iter())
.chain(tld.iter())
.join("."),
)
}

/// Allow list of domains to not get subdomains scrubbed.
const DOMAIN_ALLOW_LIST: &[&str] = &["127.0.0.1", "localhost"];

fn normalized_domain_from_allowlist(domain: &str, port: Option<u16>) -> Option<String> {
if let Some(domain) = DOMAIN_ALLOW_LIST.iter().find(|allowed| **allowed == domain) {
let with_port = port.map_or_else(|| (*domain).to_owned(), |p| format!("{}:{}", domain, p));
return Some(with_port);
/// Concatenate an optional host and an optional port.
///
/// Returns either a host + port combination, or the host. Never returns just the port.
///
/// # Examples
///
/// ```
/// use relay_event_normalization::span::description::concatenate_host_and_port;
///
/// assert_eq!(concatenate_host_and_port(None, None), "");
/// assert_eq!(concatenate_host_and_port(Some("my.domain.com"), None), "my.domain.com");
/// assert_eq!(concatenate_host_and_port(Some("my.domain.com"), Some(1919)), "my.domain.com:1919");
/// ```
pub fn concatenate_host_and_port(host: Option<&str>, port: Option<u16>) -> Cow<str> {
match (host, port) {
(None, _) => Cow::Borrowed(""),
(Some(host), None) => Cow::Borrowed(host),
(Some(host), Some(port)) => Cow::Owned(format!("{host}:{port}")),
}
None
}

fn scrub_redis_keys(string: &str) -> Option<String> {
@@ -297,10 +369,9 @@ fn scrub_resource(resource_type: &str, string: &str) -> Option<String> {
return Some("browser-extension://*".to_owned());
}
scheme => {
let domain = url
.domain()
.and_then(|d| normalize_domain(d, url.port()))
.unwrap_or("".into());
let scrubbed_host = url.host().map(scrub_host);
let domain = concatenate_host_and_port(scrubbed_host.as_deref(), url.port());

let segment_count = url.path_segments().map(|s| s.count()).unwrap_or_default();
let mut output_segments = vec![];
for (i, segment) in url.path_segments().into_iter().flatten().enumerate() {
@@ -510,6 +581,27 @@ mod tests {
"GET https://*.service.io"
);

span_description_test!(
localhost,
"GET https://localhost/data",
"http.client",
"GET https://localhost"
);

span_description_test!(
loopback,
"GET https://127.0.0.1/data",
"http.client",
"GET https://127.0.0.1"
);

span_description_test!(
ip_address,
"GET https://8.8.8.8/data",
"http.client",
"GET https://*.*.*.*"
);

span_description_test!(
path_md5_hashes,
"GET /clients/563712f9722fb0996ac8f3905b40786f/project/01234",
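The helpers added in this file (`scrub_host`, `scrub_ipv4`, `scrub_ipv6`, `scrub_domain_name`, `concatenate_host_and_port`) compose into the URL scrubbing that `scrub_http` performs above. The sketch below is illustrative only and not part of the commit; it assumes `relay-event-normalization` and `url` as dependencies, uses the public paths shown in the doc examples, and `scrub_url_for_span` is a made-up name.

```rust
use relay_event_normalization::span::description::{concatenate_host_and_port, scrub_host};
use url::Url;

// Illustrative only: reduce an HTTP span description to "METHOD scheme://scrubbed-host[:port]",
// mirroring the logic inside `scrub_http`.
fn scrub_url_for_span(method: &str, raw_url: &str) -> Option<String> {
    let url = Url::parse(raw_url).ok()?;
    let scheme = url.scheme();
    // IPs collapse to "*.*.*.*" / "*:*:*:*:*:*:*:*" unless they are loopback addresses;
    // domain names keep only their last two labels.
    let scrubbed_host = url.host().map(scrub_host);
    let domain = concatenate_host_and_port(scrubbed_host.as_deref(), url.port());
    Some(format!("{method} {scheme}://{domain}"))
}

fn main() {
    assert_eq!(
        scrub_url_for_span("GET", "https://8.8.8.8/data").as_deref(),
        Some("GET https://*.*.*.*")
    );
    assert_eq!(
        scrub_url_for_span("GET", "https://sub.service.io:8080/data").as_deref(),
        Some("GET https://*.service.io:8080")
    );
}
```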
38 changes: 25 additions & 13 deletions relay-event-normalization/src/normalize/span/tag_extraction.rs
@@ -1,7 +1,9 @@
//! Logic for persisting items into `span.sentry_tags` and `span.measurements` fields.
//! These are then used for metrics extraction.
use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Write;
use std::net::IpAddr;
use std::ops::ControlFlow;

use once_cell::sync::Lazy;
@@ -15,7 +17,9 @@ use sqlparser::ast::Visit;
use sqlparser::ast::{ObjectName, Visitor};
use url::Url;

use crate::span::description::{normalize_domain, scrub_span_description};
use crate::span::description::{
concatenate_host_and_port, scrub_domain_name, scrub_span_description,
};
use crate::utils::{
extract_transaction_op, http_status_code_from_span, MAIN_THREAD_NAME, MOBILE_SDKS,
};
@@ -358,16 +362,25 @@ pub fn extract_tags(
})
}) {
Some(domain)
} else if let Some(server_host) = span
} else if let Some(server_address) = span
.data
.value()
.and_then(|data| data.server_address.value())
.and_then(|value| value.as_str())
{
let lowercase_host = server_host.to_lowercase();
let (domain, port) = match lowercase_host.split_once(':') {
let lowercase_address = server_address.to_lowercase();

// According to OTel semantic conventions the server port should be in a separate property, called `server.port`, but incoming data sometimes disagrees
let (domain, port) = match lowercase_address.split_once(':') {
Some((domain, port)) => (domain, port.parse::<u16>().ok()),
None => (server_host, None),
None => (server_address, None),
};

// Leave IP addresses alone. Scrub qualified domain names
let domain = if domain.parse::<IpAddr>().is_ok() {
Cow::Borrowed(domain)
} else {
scrub_domain_name(domain)
};

if let Some(url_scheme) = span
@@ -378,10 +391,11 @@
{
span_tags.insert(
SpanTagKey::RawDomain,
format!("{url_scheme}://{lowercase_host}"),
format!("{url_scheme}://{lowercase_address}"),
);
}
normalize_domain(domain, port)

Some(concatenate_host_and_port(Some(domain.as_ref()), port).into_owned())
} else {
None
}
@@ -1452,24 +1466,22 @@ LIMIT 1
let tags_1 = get_value!(span_1.sentry_tags).unwrap();
let tags_2 = get_value!(span_2.sentry_tags).unwrap();

// Descriptions with loopback IPs preserve the IP and port but strip the URL path
// Allow loopback IPs
assert_eq!(
tags_1.get("description").unwrap().as_str(),
Some("POST http://127.0.0.1:10007")
);
// Domains of loopback IP descriptions preserve the IP and port
assert_eq!(
tags_1.get("domain").unwrap().as_str(),
Some("127.0.0.1:10007")
);

// Descriptions with non-loopback IPs scrub the IP naively
// Scrub other IPs
assert_eq!(
tags_2.get("description").unwrap().as_str(),
Some("GET http://*.8.8")
Some("GET http://*.*.*.*")
);
// Domains of non-loopback IP descriptions are omitted
assert_eq!(tags_2.get("domain"), None);
assert_eq!(tags_2.get("domain").unwrap().as_str(), Some("*.*.*.*"));
}

#[test]
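On the tag-extraction side, a combined `server.address` value is split into host and port, literal IPs are left untouched, and qualified domain names are scrubbed. Below is a minimal sketch of that flow, again assuming `relay-event-normalization` as a dependency; `domain_tag_from_server_address` is a hypothetical helper, not part of the diff.

```rust
use std::borrow::Cow;
use std::net::IpAddr;

use relay_event_normalization::span::description::{concatenate_host_and_port, scrub_domain_name};

// Illustrative only: derive the `span.domain` tag value from a `server.address` string
// that may carry a trailing ":port", mirroring the logic added to `extract_tags`.
fn domain_tag_from_server_address(server_address: &str) -> String {
    // OTel semantic conventions put the port in `server.port`, but incoming data
    // sometimes appends it to the address instead.
    let (domain, port) = match server_address.split_once(':') {
        Some((domain, port)) => (domain, port.parse::<u16>().ok()),
        None => (server_address, None),
    };

    // Leave IP addresses alone; scrub qualified domain names.
    let domain = if domain.parse::<IpAddr>().is_ok() {
        Cow::Borrowed(domain)
    } else {
        scrub_domain_name(domain)
    };

    concatenate_host_and_port(Some(domain.as_ref()), port).into_owned()
}

fn main() {
    assert_eq!(domain_tag_from_server_address("127.0.0.1:10007"), "127.0.0.1:10007");
    assert_eq!(domain_tag_from_server_address("my.domain.com"), "*.domain.com");
}
```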
