Merge branch 'master' into release-library/0.8.57
Dav1dde authored Apr 15, 2024
2 parents 21b5457 + 60fa63b commit 0dd02e7
Showing 4 changed files with 156 additions and 64 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -26,6 +26,7 @@
- Extract `cache.item_size` and `cache.hit` metrics. ([#3371](https://github.com/getsentry/relay/pull/3371))
- Optionally convert segment spans to transactions for compatibility. ([#3375](https://github.com/getsentry/relay/pull/3375))
- Add feature flag for replay video event types. ([#3402](https://github.com/getsentry/relay/pull/3402))
- Extract scrubbed IP addresses into the `span.domain` tag. ([#3383](https://github.com/getsentry/relay/pull/3383))

**Internal**:

13 changes: 0 additions & 13 deletions py/CHANGELOG.md
@@ -2,19 +2,6 @@

## 0.8.57

### Various fixes & improvements

- release: 0.5.87 (841aeb56) by @getsentry-bot
- Adding DataCategory::MetricHour (#3384) by @corps
- release: 0.8.56 (d1ba4507) by @getsentry-bot
- feat(feedback): Emit outcomes for user feedback events (#3026) by @cmanallen
- release: 0.8.55 (109ac110) by @getsentry-bot
- feat(profiles): Add a new category to count profile chunks (#3303) by @phacops
- release: 0.8.52 (65defad1) by @getsentry-bot
- feat(profiles): Add a data category for continuous profiling (#3284) by @phacops

## 0.5.87

- Add a data category for metric hours. [#3384](https://github.com/getsentry/relay/pull/3384)

## 0.8.56
168 changes: 130 additions & 38 deletions relay-event-normalization/src/normalize/span/description/mod.rs
@@ -6,11 +6,12 @@ use once_cell::sync::Lazy;
pub use sql::{scrub_queries, Mode};

use std::borrow::Cow;
use std::net::{Ipv4Addr, Ipv6Addr};
use std::path::Path;

use itertools::Itertools;
use relay_event_schema::protocol::Span;
use url::Url;
use url::{Host, Url};

use crate::regexes::{
DB_SQL_TRANSACTION_CORE_DATA_REGEX, DB_SUPABASE_REGEX, REDIS_COMMAND_REGEX,
@@ -30,6 +31,9 @@ const MAX_SEGMENT_LENGTH: usize = 25;
/// Some bundlers attach characters to the end of a filename, try to catch those.
const MAX_EXTENSION_LENGTH: usize = 10;

/// Domain names that are preserved during scrubbing.
const DOMAIN_ALLOW_LIST: &[&str] = &["localhost"];

/// Attempts to replace identifiers in the span description with placeholders.
///
/// Returns `None` if no scrubbing can be performed.
@@ -165,9 +169,9 @@ fn scrub_http(string: &str) -> Option<String> {

let scrubbed = match Url::parse(url) {
Ok(url) => {
let host = url.host().map(|h| h.to_string())?;
let domain = normalize_domain(host.as_str(), url.port())?;
let scheme = url.scheme();
let scrubbed_host = url.host().map(scrub_host);
let domain = concatenate_host_and_port(scrubbed_host.as_deref(), url.port());

format!("{method} {scheme}://{domain}")
}
@@ -197,51 +201,119 @@ fn scrub_file(description: &str) -> Option<String> {
}
}

/// Normalize the domain and port of a URL.
/// Scrub a [`Host`] object.
///
/// Domain names are run through a scrubber. All IP addresses except well known ones are replaced with a scrubbed variant.
/// Returns the scrubbed value as a string.
///
/// # Examples
///
/// ```
/// use url::{Host, Url};
/// use std::net::{Ipv4Addr, Ipv6Addr};
/// use relay_event_normalization::span::description::scrub_host;
///
/// assert_eq!(scrub_host(Host::Domain("foo.bar.baz")), "*.bar.baz");
/// assert_eq!(scrub_host(Host::Ipv4(Ipv4Addr::LOCALHOST)), "127.0.0.1");
/// ```
pub fn scrub_host(host: Host<&str>) -> Cow<'_, str> {
match host {
Host::Ipv4(ip) => Cow::Borrowed(scrub_ipv4(ip)),
Host::Ipv6(ip) => Cow::Borrowed(scrub_ipv6(ip)),
Host::Domain(domain) => scrub_domain_name(domain),
}
}

/// Scrub an IPv4 address.
///
/// Allow well-known IPs like loopback, and fully scrub out all other IPs.
/// Returns the scrubbed value as a string.
///
/// # Examples
///
/// ```
/// use std::net::Ipv4Addr;
/// use relay_event_normalization::span::description::{scrub_ipv4};
///
/// assert_eq!(scrub_ipv4(Ipv4Addr::LOCALHOST), "127.0.0.1");
/// assert_eq!(scrub_ipv4(Ipv4Addr::new(8, 8, 8, 8)), "*.*.*.*");
/// ```
pub fn scrub_ipv4(ip: Ipv4Addr) -> &'static str {
match ip {
Ipv4Addr::LOCALHOST => "127.0.0.1",
_ => "*.*.*.*",
}
}

/// Scrub an IPv6 address.
///
/// # Arguments
/// # Examples
///
/// * `domain` - The domain of the URL.
/// * `port` - The port of the URL.
/// ```
/// use std::net::Ipv6Addr;
/// use relay_event_normalization::span::description::{scrub_ipv6};
///
/// # Returns
/// assert_eq!(scrub_ipv6(Ipv6Addr::LOCALHOST), "::1");
/// assert_eq!(scrub_ipv6(Ipv6Addr::new(8, 8, 8, 8, 8, 8, 8, 8)), "*:*:*:*:*:*:*:*");
/// ```
pub fn scrub_ipv6(ip: Ipv6Addr) -> &'static str {
match ip {
Ipv6Addr::LOCALHOST => "::1",
_ => "*:*:*:*:*:*:*:*",
}
}

/// Sanitize a qualified domain string.
///
/// Replace all but the last two segments with asterisks.
/// Returns a string. In cases where the string is not domain-like, returns the original string.
///
/// # Examples
///
/// The normalized domain and port as a `String`, or `None` if normalization fails.
pub fn normalize_domain(domain: &str, port: Option<u16>) -> Option<String> {
if let Some(allow_listed) = normalized_domain_from_allowlist(domain, port) {
return Some(allow_listed);
/// ```
/// use relay_event_normalization::span::description::scrub_domain_name;
///
/// assert_eq!(scrub_domain_name("my.domain.com"), "*.domain.com");
/// assert_eq!(scrub_domain_name("hello world"), "hello world");
/// ```
pub fn scrub_domain_name(domain: &str) -> Cow<'_, str> {
if DOMAIN_ALLOW_LIST.contains(&domain) {
return Cow::Borrowed(domain);
}

let mut tokens = domain.rsplitn(3, '.');
let tld = tokens.next();
let domain = tokens.next();
let prefix = tokens.next().map(|_| "*");

let mut replaced = prefix
.iter()
.chain(domain.iter())
.chain(tld.iter())
.join(".");

if let Some(port) = port {
replaced = format!("{replaced}:{port}");
}

if replaced.is_empty() {
return None;
}
Some(replaced)
Cow::Owned(
prefix
.iter()
.chain(domain.iter())
.chain(tld.iter())
.join("."),
)
}

/// Allow list of domains to not get subdomains scrubbed.
const DOMAIN_ALLOW_LIST: &[&str] = &["127.0.0.1", "localhost"];

fn normalized_domain_from_allowlist(domain: &str, port: Option<u16>) -> Option<String> {
if let Some(domain) = DOMAIN_ALLOW_LIST.iter().find(|allowed| **allowed == domain) {
let with_port = port.map_or_else(|| (*domain).to_owned(), |p| format!("{}:{}", domain, p));
return Some(with_port);
/// Concatenate an optional host and an optional port.
///
/// Returns either a host + port combination, or the host. Never returns just the port.
///
/// # Examples
///
/// ```
/// use relay_event_normalization::span::description::concatenate_host_and_port;
///
/// assert_eq!(concatenate_host_and_port(None, None), "");
/// assert_eq!(concatenate_host_and_port(Some("my.domain.com"), None), "my.domain.com");
/// assert_eq!(concatenate_host_and_port(Some("my.domain.com"), Some(1919)), "my.domain.com:1919");
/// ```
pub fn concatenate_host_and_port(host: Option<&str>, port: Option<u16>) -> Cow<str> {
match (host, port) {
(None, _) => Cow::Borrowed(""),
(Some(host), None) => Cow::Borrowed(host),
(Some(host), Some(port)) => Cow::Owned(format!("{host}:{port}")),
}
None
}

fn scrub_redis_keys(string: &str) -> Option<String> {
@@ -297,10 +369,9 @@ fn scrub_resource(resource_type: &str, string: &str) -> Option<String> {
return Some("browser-extension://*".to_owned());
}
scheme => {
let domain = url
.domain()
.and_then(|d| normalize_domain(d, url.port()))
.unwrap_or("".into());
let scrubbed_host = url.host().map(scrub_host);
let domain = concatenate_host_and_port(scrubbed_host.as_deref(), url.port());

let segment_count = url.path_segments().map(|s| s.count()).unwrap_or_default();
let mut output_segments = vec![];
for (i, segment) in url.path_segments().into_iter().flatten().enumerate() {
@@ -510,6 +581,27 @@ mod tests {
"GET https://*.service.io"
);

span_description_test!(
localhost,
"GET https://localhost/data",
"http.client",
"GET https://localhost"
);

span_description_test!(
loopback,
"GET https://127.0.0.1/data",
"http.client",
"GET https://127.0.0.1"
);

span_description_test!(
ip_address,
"GET https://8.8.8.8/data",
"http.client",
"GET https://*.*.*.*"
);

span_description_test!(
path_md5_hashes,
"GET /clients/563712f9722fb0996ac8f3905b40786f/project/01234",
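The helpers added in this file (`scrub_host`, `scrub_ipv4`, `scrub_ipv6`, `scrub_domain_name`, `concatenate_host_and_port`) compose into the URL scrubbing that `scrub_http` performs above. The sketch below is illustrative only and not part of the commit; it assumes `relay-event-normalization` and `url` as dependencies, uses the public paths shown in the doc examples, and `scrub_url_for_span` is a made-up name.

```rust
use relay_event_normalization::span::description::{concatenate_host_and_port, scrub_host};
use url::Url;

// Illustrative only: reduce an HTTP span description to "METHOD scheme://scrubbed-host[:port]",
// mirroring the logic inside `scrub_http`.
fn scrub_url_for_span(method: &str, raw_url: &str) -> Option<String> {
    let url = Url::parse(raw_url).ok()?;
    let scheme = url.scheme();
    // IPs collapse to "*.*.*.*" / "*:*:*:*:*:*:*:*" unless they are loopback addresses;
    // domain names keep only their last two labels.
    let scrubbed_host = url.host().map(scrub_host);
    let domain = concatenate_host_and_port(scrubbed_host.as_deref(), url.port());
    Some(format!("{method} {scheme}://{domain}"))
}

fn main() {
    assert_eq!(
        scrub_url_for_span("GET", "https://8.8.8.8/data").as_deref(),
        Some("GET https://*.*.*.*")
    );
    assert_eq!(
        scrub_url_for_span("GET", "https://sub.service.io:8080/data").as_deref(),
        Some("GET https://*.service.io:8080")
    );
}
```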
38 changes: 25 additions & 13 deletions relay-event-normalization/src/normalize/span/tag_extraction.rs
@@ -1,7 +1,9 @@
//! Logic for persisting items into `span.sentry_tags` and `span.measurements` fields.
//! These are then used for metrics extraction.
use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Write;
use std::net::IpAddr;
use std::ops::ControlFlow;

use once_cell::sync::Lazy;
@@ -15,7 +17,9 @@ use sqlparser::ast::Visit;
use sqlparser::ast::{ObjectName, Visitor};
use url::Url;

use crate::span::description::{normalize_domain, scrub_span_description};
use crate::span::description::{
concatenate_host_and_port, scrub_domain_name, scrub_span_description,
};
use crate::utils::{
extract_transaction_op, http_status_code_from_span, MAIN_THREAD_NAME, MOBILE_SDKS,
};
@@ -358,16 +362,25 @@ pub fn extract_tags(
})
}) {
Some(domain)
} else if let Some(server_host) = span
} else if let Some(server_address) = span
.data
.value()
.and_then(|data| data.server_address.value())
.and_then(|value| value.as_str())
{
let lowercase_host = server_host.to_lowercase();
let (domain, port) = match lowercase_host.split_once(':') {
let lowercase_address = server_address.to_lowercase();

// According to OTel semantic conventions the server port should be in a separate property, called `server.port`, but incoming data sometimes disagrees
let (domain, port) = match lowercase_address.split_once(':') {
Some((domain, port)) => (domain, port.parse::<u16>().ok()),
None => (server_host, None),
None => (server_address, None),
};

// Leave IP addresses alone. Scrub qualified domain names
let domain = if domain.parse::<IpAddr>().is_ok() {
Cow::Borrowed(domain)
} else {
scrub_domain_name(domain)
};

if let Some(url_scheme) = span
@@ -378,10 +391,11 @@
{
span_tags.insert(
SpanTagKey::RawDomain,
format!("{url_scheme}://{lowercase_host}"),
format!("{url_scheme}://{lowercase_address}"),
);
}
normalize_domain(domain, port)

Some(concatenate_host_and_port(Some(domain.as_ref()), port).into_owned())
} else {
None
}
@@ -1452,24 +1466,22 @@ LIMIT 1
let tags_1 = get_value!(span_1.sentry_tags).unwrap();
let tags_2 = get_value!(span_2.sentry_tags).unwrap();

// Descriptions with loopback IPs preserve the IP and port but strip the URL path
// Allow loopback IPs
assert_eq!(
tags_1.get("description").unwrap().as_str(),
Some("POST http://127.0.0.1:10007")
);
// Domains of loopback IP descriptions preserve the IP and port
assert_eq!(
tags_1.get("domain").unwrap().as_str(),
Some("127.0.0.1:10007")
);

// Descriptions with non-loopback IPs scrub the IP naively
// Scrub other IPs
assert_eq!(
tags_2.get("description").unwrap().as_str(),
Some("GET http://*.8.8")
Some("GET http://*.*.*.*")
);
// Domains of non-loopback IP descriptions are omitted
assert_eq!(tags_2.get("domain"), None);
assert_eq!(tags_2.get("domain").unwrap().as_str(), Some("*.*.*.*"));
}

#[test]
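On the tag-extraction side, a combined `server.address` value is split into host and port, literal IPs are left untouched, and qualified domain names are scrubbed. Below is a minimal sketch of that flow, again assuming `relay-event-normalization` as a dependency; `domain_tag_from_server_address` is a hypothetical helper, not part of the diff.

```rust
use std::borrow::Cow;
use std::net::IpAddr;

use relay_event_normalization::span::description::{concatenate_host_and_port, scrub_domain_name};

// Illustrative only: derive the `span.domain` tag value from a `server.address` string
// that may carry a trailing ":port", mirroring the logic added to `extract_tags`.
fn domain_tag_from_server_address(server_address: &str) -> String {
    // OTel semantic conventions put the port in `server.port`, but incoming data
    // sometimes appends it to the address instead.
    let (domain, port) = match server_address.split_once(':') {
        Some((domain, port)) => (domain, port.parse::<u16>().ok()),
        None => (server_address, None),
    };

    // Leave IP addresses alone; scrub qualified domain names.
    let domain = if domain.parse::<IpAddr>().is_ok() {
        Cow::Borrowed(domain)
    } else {
        scrub_domain_name(domain)
    };

    concatenate_host_and_port(Some(domain.as_ref()), port).into_owned()
}

fn main() {
    assert_eq!(domain_tag_from_server_address("127.0.0.1:10007"), "127.0.0.1:10007");
    assert_eq!(domain_tag_from_server_address("my.domain.com"), "*.domain.com");
}
```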
