Skip to content

Commit

Permalink
ref(relay): Optimize normalization regexes (#3921)
Browse files Browse the repository at this point in the history
Optimizes memory footprint of regexes by:
- Avoiding (Unicode-aware) case-insensitive matching
- Avoiding the use of Unicode-aware character classes and assertions, e.g. `\d` -> `[0-9]`, `\b`
-> `(?-u:\b)`, `\s` -> `(?-u:\s)`
  • Loading branch information
Dav1dde authored Aug 13, 2024
1 parent d5c5ad3 commit 6d01e72
Showing 1 changed file with 35 additions and 34 deletions.
69 changes: 35 additions & 34 deletions relay-event-normalization/src/regexes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,51 +9,52 @@ pub static TRANSACTION_NAME_NORMALIZER_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r"(?x)
(?P<uuid>[^/\\]*
\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b
(?-u:\b)[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}(?-u:\b)
[^/\\]*) |
(?P<sha1>[^/\\]*
\b[0-9a-fA-F]{40}\b
(?-u:\b)[0-9a-fA-F]{40}(?-u:\b)
[^/\\]*) |
(?P<md5>[^/\\]*
\b[0-9a-fA-F]{32}\b
(?-u:\b)[0-9a-fA-F]{32}(?-u:\b)
[^/\\]*) |
(?P<date>[^/\\]*
(?:
(?:\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z))|
(?:\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))|
(?:\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))
(?:[0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9]\.[0-9]+([+-][0-2][0-9]:[0-5][0-9]|Z))|
(?:[0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9]([+-][0-2][0-9]:[0-5][0-9]|Z))|
(?:[0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]([+-][0-2][0-9]:[0-5][0-9]|Z))
) |
(?:
\b(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat)\s+)?
(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+
(?:[\d]{1,2})\s+
(?:[\d]{2}:[\d]{2}:[\d]{2})\s+
[\d]{4}
(?-u:\b)(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat)(?-u:\s)+)?
(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?-u:\s)+
(?:[0-9]{1,2})(?-u:\s)+
(?:[0-9]{2}:[0-9]{2}:[0-9]{2})(?-u:\s)+
[0-9]{4}
) |
(?:
\b(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat),\s+)?
(?:0[1-9]|[1-2]?[\d]|3[01])\s+
(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+
(?:19[\d]{2}|[2-9][\d]{3})\s+
(?:2[0-3]|[0-1][\d]):([0-5][\d])
(?::(60|[0-5][\d]))?\s+
(?:[-\+][\d]{2}[0-5][\d]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))
(?-u:\b)(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat),(?-u:\s)+)?
(?:0[1-9]|[1-2]?[0-9]|3[01])(?-u:\s)+
(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?-u:\s)+
(?:19[0-9]{2}|[2-9][0-9]{3})(?-u:\s)+
(?:2[0-3]|[0-1][0-9]):([0-5][0-9])
(?::(60|[0-5][0-9]))?(?-u:\s)+
(?:[-\+][0-9]{2}[0-5][0-9]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))
)
[^/\\]*) |
(?P<hex>[^/\\]*
\b0[xX][0-9a-fA-F]+\b
(?-u:\b)0[xX][0-9a-fA-F]+(?-u:\b)
[^/\\]*) |
(?:^|[/\\])
(?P<int>
(:?[^%/\\]|%[0-9a-fA-F]{2})*\d{2,}
(:?[^%/\\]|%[0-9a-fA-F]{2})*[0-9]{2,}
[^/\\]*)",
)
.unwrap()
});

/// Captures initial all-caps words as redis command, the rest as arguments.
pub static REDIS_COMMAND_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\s*(?P<command>[A-Z]+(\s+[A-Z]+)*\b)(?P<args>.+)?").unwrap());
pub static REDIS_COMMAND_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"(?-u:\s)*(?P<command>[A-Z]+((?-u:\s)+[A-Z]+)*(?-u:\b))(?P<args>.+)?").unwrap()
});

/// Regex with multiple capture groups for resource tokens we should scrub.
///
Expand All @@ -65,44 +66,44 @@ pub static REDIS_COMMAND_REGEX: Lazy<Regex> =
/// <https://github.com/getsentry/sentry/blob/de5949a9a313d7ef0bf0685f84fe6e981ac38558/src/sentry/utils/performance_issues/base.py#L292-L306>
pub static RESOURCE_NORMALIZER_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r"(?xi)
r"(?x)
# UUIDs.
(?P<uuid>[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}) |
(?P<uuid>[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}) |
# Version strings.
(?P<version>(v[0-9]+(?:\.[0-9]+)*)) |
# Hexadecimal strings with more than 5 digits.
(?P<hex>[a-f0-9]{5}[a-f0-9]+) |
(?P<hex>[a-fA-F0-9]{5}[a-fA-F0-9]+) |
# Integer IDs with more than one digit.
(?P<int>\d\d+)
(?P<int>[0-9][0-9]+)
",
)
.unwrap()
});

pub static DB_SQL_TRANSACTION_CORE_DATA_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?P<int>\d+)").unwrap());
Lazy::new(|| Regex::new(r"(?P<int>[0-9]+)").unwrap());

pub static DB_SUPABASE_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r"(?xi)
r"(?x)
# UUIDs.
(?P<uuid>[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}) |
(?P<uuid>[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}) |
# Hexadecimal strings with more than 5 digits.
(?P<hex>[a-f0-9]{5}[a-f0-9]+) |
(?P<hex>[a-fA-F0-9]{5}[a-fA-F0-9]+) |
# Integer IDs with more than one digit.
(?P<int>\d\d+)
(?P<int>[0-9][0-9]+)
",
)
.unwrap()
});

pub static FUNCTION_NORMALIZER_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r"(?xi)
r"(?x)
# UUIDs.
(?P<uuid>[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}) |
(?P<uuid>[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}) |
# Hexadecimal strings with more than 5 digits.
(?P<hex>[a-f0-9]{5}[a-f0-9]+)
(?P<hex>[a-fA-F0-9]{5}[a-fA-F0-9]+)
",
)
.unwrap()
Expand Down

0 comments on commit 6d01e72

Please sign in to comment.