Commit

add normalization tests
Dav1dde committed Aug 7, 2024
1 parent 37e53c5 commit a6b1002
Showing 2 changed files with 108 additions and 52 deletions.
49 changes: 2 additions & 47 deletions relay-metrics/src/aggregator.rs
@@ -881,7 +881,7 @@ mod tests {
Bucket {
timestamp,
width: 0,
name: "c:transactions/foo".into(),
name: "c:transactions/foo@none".into(),
value: BucketValue::counter(42.into()),
tags: BTreeMap::new(),
metadata: BucketMetadata::new(timestamp),
@@ -1387,52 +1387,7 @@ mod tests {
}

#[test]
fn test_validate_bucket_key_chars() {
let project_key = ProjectKey::parse("a94ae32be2584e0bbd7a4cbb95971fee").unwrap();

let bucket_key = BucketKey {
project_key,
timestamp: UnixTimestamp::now(),
metric_name: "c:transactions/hergus.bergus".into(),
tags: {
let mut tags = BTreeMap::new();
// There are some SDKs which mess up content encodings and interpret the raw bytes
// of a UTF-16 string as UTF-8, leading to ASCII
// strings getting null bytes interleaved.
//
// Somehow those values end up as the release tag in sessions, while in error events we
// haven't observed this malformed encoding. We believe it's slightly better to
// strip out NUL bytes instead of dropping the tag, so that those values line up
// again across sessions and events. Should that cause too high cardinality, we'll
// have to drop tags.
//
// Note that releases are validated separately against a much stricter character set,
// but the above idea should still apply to other tags.
tags.insert(
"is_it_garbage".to_owned(),
"a\0b\0s\0o\0l\0u\0t\0e\0l\0y".to_owned(),
);
tags.insert("another\0garbage".to_owned(), "bye".to_owned());
tags
},
extracted_from_indexed: false,
};

let mut bucket_key = validate_bucket_key(bucket_key, &test_config()).unwrap();

assert_eq!(bucket_key.tags.len(), 1);
assert_eq!(
bucket_key.tags.get("is_it_garbage"),
Some(&"absolutely".to_owned())
);
assert_eq!(bucket_key.tags.get("another\0garbage"), None);

bucket_key.metric_name = "hergus\0bergus".into();
validate_bucket_key(bucket_key, &test_config()).unwrap_err();
}

#[test]
fn test_validate_bucket_key_str_lens() {
fn test_validate_bucket_key_str_length() {
relay_test::setup();
let project_key = ProjectKey::parse("a94ae32be2584e0bbd7a4cbb95971fee").unwrap();

111 changes: 106 additions & 5 deletions relay-metrics/src/protocol.rs
@@ -56,7 +56,7 @@ pub fn normalize_bucket(bucket: &mut Bucket) -> Result<(), NormalizationError> {
/// Normalization includes expanding valid metric names without a namespace to the default
/// namespace.
///
/// Invalid metric names are rejected with [`Error`].
/// Invalid metric names are rejected with [`NormalizationError`].
fn normalize_metric_name(name: &mut MetricName) -> Result<(), NormalizationError> {
*name = match MetricResourceIdentifier::parse(name) {
Ok(mri) => {
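
An editorial aside on the doc comment above: the namespace and unit expansion can be sketched as a small standalone function. This is an illustration only, not Relay's implementation, and it skips name sanitization entirely. The defaults of `custom` for the namespace and `none` for the unit are taken from the new tests further down; the name `endpoint.hits` is hypothetical.

/// Illustration only, not Relay's implementation: expand a metric name with a
/// type prefix into a fully qualified MRI of the form
/// `<type>:<namespace>/<name>@<unit>`, filling in default namespace and unit.
fn expand_to_default_mri(name_with_type: &str) -> Option<String> {
    let (ty, rest) = name_with_type.split_once(':')?;
    // Split off an explicit unit, defaulting to `none`.
    let (path, unit) = match rest.split_once('@') {
        Some((path, unit)) => (path, unit),
        None => (rest, "none"),
    };
    // Default to the `custom` namespace when none is given.
    let (namespace, name) = match path.split_once('/') {
        Some((ns, name)) => (ns, name),
        None => ("custom", path),
    };
    Some(format!("{ty}:{namespace}/{name}@{unit}"))
}

#[test]
fn expands_defaults() {
    assert_eq!(
        expand_to_default_mri("c:endpoint.hits").as_deref(),
        Some("c:custom/endpoint.hits@none")
    );
    assert_eq!(
        expand_to_default_mri("c:transactions/foo").as_deref(),
        Some("c:transactions/foo@none")
    );
}
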
@@ -86,7 +86,7 @@ fn normalize_metric_tags(tags: &mut MetricTags) {
return false;
}

validate_tag_value(tag_value);
normalize_tag_value(tag_value);

true
});
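
The retain closure above is only partially visible in this hunk. A minimal standalone sketch of the pattern, not Relay's implementation, assuming (as `test_normalize_tag_key_chars` below suggests) that a key containing control characters drops the whole entry while values merely have control characters stripped:

use std::collections::BTreeMap;

/// Sketch only, not Relay's implementation: normalize a tag map in place by
/// dropping entries with invalid keys and stripping control characters from values.
fn normalize_tags_sketch(tags: &mut BTreeMap<String, String>) {
    tags.retain(|key, value| {
        if key.chars().any(|c| c.is_control()) {
            // Drop the whole entry when the key is invalid.
            return false;
        }
        // Keep the entry, but strip control characters from the value.
        value.retain(|c| !c.is_control());
        true
    });
}

With the inputs from that test, the "another\0garbage" key is dropped entirely and the value of "is_it_garbage" normalizes to "absolutely".
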
@@ -143,15 +143,15 @@ pub(crate) fn escape_tag_value(raw: &str) -> String {
/// [`validate_tag_value`].
pub(crate) fn unescape_tag_value(escaped: &str) -> Result<String, UnescapeError> {
let mut unescaped = unescaper::unescape(escaped)?;
validate_tag_value(&mut unescaped);
normalize_tag_value(&mut unescaped);
Ok(unescaped)
}

/// Validates a tag value.
/// Normalizes a tag value.
///
/// Tag values are never entirely rejected, but invalid characters (control characters) are stripped
/// out.
pub(crate) fn validate_tag_value(tag_value: &mut String) {
pub(crate) fn normalize_tag_value(tag_value: &mut String) {
tag_value.retain(|c| !c.is_control());
}
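
A quick illustration of the retain-based stripping above, using the same tag value as the new test below; this is a standalone sketch rather than code from this commit:

#[test]
fn strips_control_characters() {
    // Same stripping as `normalize_tag_value`: NUL bytes and other control
    // characters are removed, all other characters are kept.
    let mut tag_value = String::from("a\0b\0s\0o\0l\0u\0t\0e\0l\0y");
    tag_value.retain(|c| !c.is_control());
    assert_eq!(tag_value, "absolutely");
}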

@@ -167,6 +167,10 @@ pub(crate) fn hash_set_value(string: &str) -> u32 {

#[cfg(test)]
mod tests {
use insta::assert_json_snapshot;

use crate::BucketValue;

use super::*;

#[test]
@@ -226,4 +230,101 @@ mod tests {
assert_eq!(escape_tag_value("plain\u{07}text"), "plaintext");
assert_eq!(escape_tag_value("plain\u{9c}text"), "plaintext");
}

#[test]
fn test_normalize_invalid_name() {
let mut bucket = Bucket {
timestamp: UnixTimestamp::from_secs(5000),
width: 0,
name: "c:transactions/\0hergus.bergus@none".into(),
value: BucketValue::Counter(0.into()),
tags: Default::default(),
metadata: Default::default(),
};

assert!(matches!(
normalize_bucket(&mut bucket),
Err(NormalizationError::InvalidMetricName(_))
));
}

#[test]
fn test_normalize_invalid_namespace() {
let mut bucket = Bucket {
timestamp: UnixTimestamp::from_secs(5000),
width: 0,
name: "c:lol/hergus.bergus@none".into(),
value: BucketValue::Counter(0.into()),
tags: Default::default(),
metadata: Default::default(),
};

assert!(matches!(
normalize_bucket(&mut bucket),
Err(NormalizationError::UnsupportedNamespace)
));
}

#[test]
fn test_normalize_name() {
let mut bucket = Bucket {
timestamp: UnixTimestamp::from_secs(5000),
width: 0,
name: "c:hergus\0\0bergus".into(),
value: BucketValue::Counter(0.into()),
tags: Default::default(),
metadata: Default::default(),
};

normalize_bucket(&mut bucket).unwrap();

assert_eq!(&bucket.name, "c:custom/hergus_bergus@none");
}

#[test]
fn test_normalize_tag_key_chars() {
let mut bucket = Bucket {
timestamp: UnixTimestamp::from_secs(5000),
width: 0,
name: "c:transactions/hergus.bergus".into(),
value: BucketValue::Counter(0.into()),
tags: {
let mut tags = MetricTags::new();
// There are some SDKs which mess up content encodings and interpret the raw bytes
// of a UTF-16 string as UTF-8, leading to ASCII
// strings getting null bytes interleaved.
//
// Somehow those values end up as the release tag in sessions, while in error events we
// haven't observed this malformed encoding. We believe it's slightly better to
// strip out NUL bytes instead of dropping the tag, so that those values line up
// again across sessions and events. Should that cause too high cardinality, we'll
// have to drop tags.
//
// Note that releases are validated separately against a much stricter character set,
// but the above idea should still apply to other tags.
tags.insert(
"is_it_garbage".to_owned(),
"a\0b\0s\0o\0l\0u\0t\0e\0l\0y".to_owned(),
);
tags.insert("another\0garbage".to_owned(), "bye".to_owned());
tags
},
metadata: Default::default(),
};

normalize_bucket(&mut bucket).unwrap();

assert_json_snapshot!(bucket, @r###"
{
"timestamp": 5000,
"width": 0,
"name": "c:transactions/hergus.bergus@none",
"type": "c",
"value": 0.0,
"tags": {
"is_it_garbage": "absolutely"
}
}
"###);
}
}
