feat(seer grouping): Store Seer metadata on grouphashes during ingest #77956

Draft · wants to merge 8 commits into master
10 changes: 10 additions & 0 deletions fixtures/backup/model_dependencies/detailed.json
@@ -2319,6 +2319,16 @@
"kind": "DefaultOneToOneField",
"model": "sentry.grouphash",
"nullable": false
},
"seer_grouphash_sent": {
"kind": "FlexibleForeignKey",
"model": "sentry.grouphash",
"nullable": true
},
"seer_matched_grouphash": {
"kind": "FlexibleForeignKey",
"model": "sentry.grouphash",
"nullable": true
}
},
"model": "sentry.grouphashmetadata",
2 changes: 1 addition & 1 deletion migrations_lockfile.txt
@@ -10,7 +10,7 @@ hybridcloud: 0016_add_control_cacheversion
nodestore: 0002_nodestore_no_dictfield
remote_subscriptions: 0003_drop_remote_subscription
replays: 0004_index_together
-sentry: 0764_migrate_bad_status_substatus_rows
+sentry: 0765_add_seer_fields_to_grouphash_metadata
social_auth: 0002_default_auto_field
uptime: 0013_uptime_subscription_new_unique
workflow_engine: 0005_data_source_detector
26 changes: 2 additions & 24 deletions src/sentry/event_manager.py
@@ -1445,7 +1445,7 @@ def _save_aggregate(
# If we still haven't found a matching grouphash, we're now safe to go ahead and talk to
# seer and/or create the group.
if existing_grouphash is None:
-seer_matched_grouphash = maybe_check_seer_for_matching_grouphash(event)
+seer_matched_grouphash = maybe_check_seer_for_matching_grouphash(event, grouphashes)
seer_matched_group = (
Group.objects.filter(id=seer_matched_grouphash.group_id).first()
if seer_matched_grouphash
@@ -1614,7 +1614,7 @@ def _save_aggregate_new(
result = "found_secondary"
# If we still haven't found a group, ask Seer for a match (if enabled for the project)
else:
-seer_matched_grouphash = maybe_check_seer_for_matching_grouphash(event)
+seer_matched_grouphash = maybe_check_seer_for_matching_grouphash(event, all_grouphashes)

if seer_matched_grouphash:
group_info = handle_existing_grouphash(job, seer_matched_grouphash, all_grouphashes)
@@ -1806,16 +1806,6 @@ def _create_group(
first_release: Release | None = None,
**group_creation_kwargs: Any,
) -> Group:
-    # Temporary log to debug events seeming to disappear after being sent to Seer
-    if event.data.get("seer_similarity"):
-        logger.info(
-            "seer.similarity.pre_create_group",
-            extra={
-                "event_id": event.event_id,
-                "hash": event.get_primary_hash(),
-                "project": project.id,
-            },
-        )

short_id = _get_next_short_id(project)

@@ -1891,18 +1881,6 @@ def _create_group(
logger.exception("Error after unsticking project counter")
raise

-    # Temporary log to debug events seeming to disappear after being sent to Seer
-    if event.data.get("seer_similarity"):
-        logger.info(
-            "seer.similarity.post_create_group",
-            extra={
-                "event_id": event.event_id,
-                "hash": event.get_primary_hash(),
-                "project": project.id,
-                "group_id": group.id,
-            },
-        )

return group


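One contract the diff above creates but doesn't state: the grouphash list passed to `maybe_check_seer_for_matching_grouphash` must contain the `GroupHash` whose `hash` equals `event.get_primary_hash()`, because the function (see the next file) looks up the sent hash by that value and will raise if it's missing. A minimal sketch of the call shape; the helper and the way the grouphashes are built here are illustrative assumptions, not code from this PR:

from sentry.grouping.ingest.seer import maybe_check_seer_for_matching_grouphash
from sentry.models.grouphash import GroupHash

def check_seer(event, project, hash_values):
    # hash_values is assumed to include event.get_primary_hash(); the real
    # callers in event_manager.py pass every GroupHash computed for the event.
    grouphashes = [
        GroupHash.objects.get_or_create(project=project, hash=value)[0]
        for value in hash_values
    ]
    return maybe_check_seer_for_matching_grouphash(event, grouphashes)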
44 changes: 39 additions & 5 deletions src/sentry/grouping/ingest/seer.py
@@ -232,7 +232,9 @@ def get_seer_similar_issues(
return (similar_issues_metadata, parent_grouphash)


-def maybe_check_seer_for_matching_grouphash(event: Event) -> GroupHash | None:
+def maybe_check_seer_for_matching_grouphash(
+    event: Event, all_grouphashes: list[GroupHash]
+) -> GroupHash | None:
seer_matched_grouphash = None

if should_call_seer_for_grouping(event):
@@ -241,16 +243,48 @@ def maybe_check_seer_for_matching_grouphash(event: Event) -> GroupHash | None:
sample_rate=options.get("seer.similarity.metrics_sample_rate"),
tags={"call_made": True, "blocker": "none"},
)

        try:
            # If no matching group is found in Seer, we'll still get back result
            # metadata, but `seer_matched_grouphash` will be None
            seer_response_data, seer_matched_grouphash = get_seer_similar_issues(event)
            event.data["seer_similarity"] = seer_response_data

-        # Insurance - in theory we shouldn't ever land here
-        except Exception as e:
+        except Exception as e:  # Insurance - in theory we shouldn't ever land here
            sentry_sdk.capture_exception(
                e, tags={"event": event.event_id, "project": event.project.id}
            )
            return None

        # Find the GroupHash for the hash value sent to Seer
        primary_hash = event.get_primary_hash()
        grouphash_sent = [
            grouphash for grouphash in all_grouphashes if grouphash.hash == primary_hash
        ][0]

        # Update GroupHashes with Seer results
        for grouphash in all_grouphashes:
            metadata = grouphash.metadata

            if metadata:
                # Mark all the GroupHashes as having been represented by the one we sent
                metadata.seer_grouphash_sent = grouphash_sent

                # Store the Seer results only on the GroupHash which was actually sent
                if grouphash is grouphash_sent:
                    # Technically the time of the metadata record creation and the time of
                    # the Seer request will be some milliseconds apart, but the difference
                    # isn't meaningful, and forcing them to be the same (rather than just
                    # close) lets us use their equality as a signal that the Seer call
                    # happened during ingest rather than during a backfill, without having
                    # to store that information separately.
                    metadata.seer_date_sent = metadata.date_added
                    metadata.seer_event_sent = event.event_id
                    metadata.seer_model = seer_response_data["similarity_model_version"]
                    metadata.seer_matched_grouphash = seer_matched_grouphash
                    metadata.seer_match_distance = (
                        seer_response_data["results"][0]["stacktrace_distance"]
                        if seer_matched_grouphash
                        else None
                    )

                metadata.save()

    return seer_matched_grouphash
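As the inline comment above notes, `seer_date_sent == date_added` doubles as a marker that the Seer call happened during ingest rather than in a backfill. A hedged sketch of how a consumer could read that signal later; the helper name is ours, not part of this PR:

from sentry.models.grouphashmetadata import GroupHashMetadata

def seer_call_was_during_ingest(metadata: GroupHashMetadata) -> bool:
    # During ingest the two timestamps are forced to be identical; a backfill
    # would stamp `seer_date_sent` later than `date_added`.
    return (
        metadata.seer_date_sent is not None
        and metadata.seer_date_sent == metadata.date_added
    )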
70 changes: 70 additions & 0 deletions src/sentry/migrations/0765_add_seer_fields_to_grouphash_metadata.py
@@ -0,0 +1,70 @@
# Generated by Django 5.1.1 on 2024-09-23 15:40

import django.db.models.deletion
from django.db import migrations, models

import sentry.db.models.fields.foreignkey
from sentry.new_migrations.migrations import CheckedMigration


class Migration(CheckedMigration):
    # This flag is used to mark that a migration shouldn't be automatically run in production.
    # This should only be used for operations where it's safe to run the migration after your
    # code has deployed. So this should not be used for most operations that alter the schema
    # of a table.
    # Here are some things that make sense to mark as post deployment:
    # - Large data migrations. Typically we want these to be run manually so that they can be
    #   monitored and not block the deploy for a long period of time while they run.
    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
    #   run this outside deployments so that we don't block them. Note that while adding an index
    #   is a schema change, it's completely safe to run the operation after the code has deployed.
    # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment

    is_post_deployment = False

    dependencies = [
        ("sentry", "0764_migrate_bad_status_substatus_rows"),
    ]

    operations = [
        migrations.AddField(
            model_name="grouphashmetadata",
            name="seer_date_sent",
            field=models.DateTimeField(null=True),
        ),
        migrations.AddField(
            model_name="grouphashmetadata",
            name="seer_event_sent",
            field=models.CharField(max_length=32, null=True),
        ),
        migrations.AddField(
            model_name="grouphashmetadata",
            name="seer_grouphash_sent",
            field=sentry.db.models.fields.foreignkey.FlexibleForeignKey(
                null=True,
                on_delete=django.db.models.deletion.DO_NOTHING,
                related_name="+",
                to="sentry.grouphash",
            ),
        ),
        migrations.AddField(
            model_name="grouphashmetadata",
            name="seer_match_distance",
            field=models.FloatField(null=True),
        ),
        migrations.AddField(
            model_name="grouphashmetadata",
            name="seer_matched_grouphash",
            field=sentry.db.models.fields.foreignkey.FlexibleForeignKey(
                null=True,
                on_delete=django.db.models.deletion.DO_NOTHING,
                related_name="seer_matchees",
                to="sentry.grouphash",
            ),
        ),
        migrations.AddField(
            model_name="grouphashmetadata",
            name="seer_model",
            field=models.CharField(null=True),
        ),
    ]
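Every column added here is nullable, which is what makes this a metadata-only schema change that is safe to run during deploy (hence `is_post_deployment = False`). A quick sanity check that could be run in a Django shell after migrating; illustrative only, not part of the PR:

from sentry.models.grouphashmetadata import GroupHashMetadata

for name in [
    "seer_date_sent",
    "seer_event_sent",
    "seer_grouphash_sent",
    "seer_match_distance",
    "seer_matched_grouphash",
    "seer_model",
]:
    # All six new Seer fields must exist and allow NULL, so existing
    # GroupHashMetadata rows remain valid without a data backfill.
    assert GroupHashMetadata._meta.get_field(name).null, name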
36 changes: 36 additions & 0 deletions src/sentry/models/grouphashmetadata.py
@@ -4,18 +4,54 @@
from sentry.backup.scopes import RelocationScope
from sentry.db.models import Model, region_silo_model
from sentry.db.models.base import sane_repr
from sentry.db.models.fields.foreignkey import FlexibleForeignKey


@region_silo_model
class GroupHashMetadata(Model):
    __relocation_scope__ = RelocationScope.Excluded

    # GENERAL

    grouphash = models.OneToOneField(
        "sentry.GroupHash", related_name="_metadata", on_delete=models.CASCADE
    )
    date_added = models.DateTimeField(default=timezone.now)

    # SEER

    # Only one hash representing each group is sent to Seer. For the grouphash actually sent, this
    # field and the `grouphash` field will be identical. For the grouphashes assigned to the same
    # group but which aren't sent, this will point to the GroupHash record for the sent hash. Note
    # that because of merging/unmerging, the sent GroupHash and this metadata's GroupHash (if not
    # one and the same) aren't guaranteed to forever point to the same group (though they will when
    # this field is written).
    seer_grouphash_sent = FlexibleForeignKey(
        "sentry.GroupHash",
        # If we end up needing to reference in this direction, we can handle it with a property on
        # GroupHash
        related_name="+",
        on_delete=models.DO_NOTHING,
        null=True,
    )

[Review comment on `seer_grouphash_sent` (Member): i would prefer not to keep track of this on GroupHashMetadata -- we can simply infer it from the relations between the grouphash and group.]
[Review comment (Member, follow-up): i see the note above on merging/unmerging, i'll think further on this]

    # NOTE: The rest of the Seer-related fields are only stored on the metadata of the GroupHash
    # actually sent to Seer.

    # When this hash was sent to Seer. This will be different from `date_added` if we send it to
    # Seer as part of a backfill rather than during ingest.
    seer_date_sent = models.DateTimeField(null=True)
    # Id of the event whose stacktrace was sent to Seer
    seer_event_sent = models.CharField(max_length=32, null=True)
    # The version of the Seer model used to process this hash value
    seer_model = models.CharField(null=True)
    # The `GroupHash` record for the match Seer sent back (if any)
    seer_matched_grouphash = FlexibleForeignKey(
        "sentry.GroupHash", related_name="seer_matchees", on_delete=models.DO_NOTHING, null=True
    )
    # The similarity between this hash's stacktrace and the parent (matched) hash's stacktrace
    seer_match_distance = models.FloatField(null=True)

    class Meta:
        app_label = "sentry"
        db_table = "sentry_grouphashmetadata"
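Given the semantics above (`seer_grouphash_sent` is set on every metadata row in the group, while the result fields live only on the row for the hash actually sent), the sent hash's metadata can be found by looking for the row whose FK points back at its own grouphash. A hedged sketch; the helper is ours, not part of this PR:

from django.db.models import F

from sentry.models.grouphashmetadata import GroupHashMetadata

def sent_hash_metadata(group_id):
    # The metadata row for the hash actually sent to Seer is the one whose
    # `seer_grouphash_sent` FK points back at its own `grouphash`.
    return GroupHashMetadata.objects.filter(
        grouphash__group_id=group_id,
        grouphash=F("seer_grouphash_sent"),
    ).first()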
@@ -1,5 +1,5 @@
---
-created: '2024-09-17T12:19:51.014511+00:00'
+created: '2024-09-23T17:48:55.681597+00:00'
creator: sentry
source: tests/sentry/backup/test_comparators.py
---
@@ -579,6 +579,8 @@ source: tests/sentry/backup/test_comparators.py
- class: ForeignKeyComparator
fields:
- grouphash
+- seer_grouphash_sent
+- seer_matched_grouphash
model_name: sentry.grouphashmetadata
- comparators:
- class: ForeignKeyComparator