-
-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Process delayed alert conditions in batches of 10,000 #75302
Changes from all commits
79ba7d0
1ed44ad
70cbbb4
5225441
8b20402
7aafed8
cdf288f
d39202e
9af164b
c94c18e
41fbcb7
c1a7982
ba6b28d
b767878
267d71d
c29c5e3
babca17
1cd92fe
834445d
4bac969
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,13 +4,15 @@ | |
from collections import defaultdict | ||
from collections.abc import Sequence | ||
from datetime import datetime, timedelta, timezone | ||
from itertools import islice | ||
from typing import Any, DefaultDict, NamedTuple | ||
|
||
import sentry_sdk | ||
from django.db.models import OuterRef, Subquery | ||
|
||
from sentry import buffer, nodestore | ||
from sentry import buffer, nodestore, options | ||
from sentry.buffer.redis import BufferHookEvent, redis_buffer_registry | ||
from sentry.db import models | ||
from sentry.eventstore.models import Event, GroupEvent | ||
from sentry.issues.issue_occurrence import IssueOccurrence | ||
from sentry.models.group import Group | ||
|
@@ -85,8 +87,15 @@ def fetch_project(project_id: int) -> Project | None: | |
return None | ||
|
||
|
||
def fetch_rulegroup_to_event_data(project_id: int) -> dict[str, str]: | ||
return buffer.backend.get_hash(model=Project, field={"project_id": project_id}) | ||
def fetch_rulegroup_to_event_data(project_id: int, batch_key: str | None = None) -> dict[str, str]: | ||
field: dict[str, models.Model | int | str] = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where does this models.Model typing come from on the key 🤔 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's like a weird nested definition on the function call to ideally i could not type this at all, and mypy would evaluate the field data type as any recommendations on cleanup or better ways to appease the mypy? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that makes sense, I would think the implicit typing would work but I guess not lol |
||
"project_id": project_id, | ||
} | ||
|
||
if batch_key: | ||
field["batch_key"] = batch_key | ||
|
||
return buffer.backend.get_hash(model=Project, field=field) | ||
|
||
|
||
def get_rules_to_groups(rulegroup_to_event_data: dict[str, str]) -> DefaultDict[int, set[int]]: | ||
|
@@ -447,13 +456,17 @@ def fire_rules( | |
safe_execute(callback, groupevent, futures) | ||
|
||
|
||
def cleanup_redis_buffer(project_id: int, rules_to_groups: DefaultDict[int, set[int]]) -> None: | ||
def cleanup_redis_buffer( | ||
project_id: int, rules_to_groups: DefaultDict[int, set[int]], batch_key: str | None | ||
) -> None: | ||
hashes_to_delete = [ | ||
f"{rule}:{group}" for rule, groups in rules_to_groups.items() for group in groups | ||
] | ||
buffer.backend.delete_hash( | ||
model=Project, filters={"project_id": project_id}, fields=hashes_to_delete | ||
) | ||
filters: dict[str, models.Model | str | int] = {"project_id": project_id} | ||
if batch_key: | ||
filters["batch_key"] = batch_key | ||
|
||
buffer.backend.delete_hash(model=Project, filters=filters, fields=hashes_to_delete) | ||
|
||
|
||
def bucket_num_groups(num_groups: int) -> str: | ||
|
@@ -463,6 +476,55 @@ def bucket_num_groups(num_groups: int) -> str: | |
return "1" | ||
|
||
|
||
def process_rulegroups_in_batches(project_id: int): | ||
""" | ||
This will check the number of rulegroup_to_event_data items in the Redis buffer for a project. | ||
|
||
If the number is larger than the batch size, it will chunk the items and process them in batches. | ||
|
||
The batches are replicated into a new redis hash with a unique filter (a uuid) to identify the batch. | ||
We need to use a UUID because these batches can be created in multiple processes and we need to ensure | ||
uniqueness across all of them for the centralized redis buffer. The batches are stored in redis because | ||
we shouldn't pass objects that need to be pickled and 10k items could be problematic in the celery tasks | ||
as arguments could be problematic. Finally, we can't use a pagination system on the data because | ||
redis doesn't maintain the sort order of the hash keys. | ||
|
||
`apply_delayed` will fetch the batch from redis and process the rules. | ||
""" | ||
batch_size = options.get("delayed_processing.batch_size") | ||
event_count = buffer.backend.get_hash_length(Project, {"project_id": project_id}) | ||
|
||
if event_count < batch_size: | ||
return apply_delayed.delayed(project_id) | ||
|
||
logger.info( | ||
"delayed_processing.process_large_batch", | ||
extra={"project_id": project_id, "count": event_count}, | ||
) | ||
|
||
# if the dictionary is large, get the items and chunk them. | ||
rulegroup_to_event_data = fetch_rulegroup_to_event_data(project_id) | ||
|
||
with metrics.timer("delayed_processing.process_batch.duration"): | ||
items = iter(rulegroup_to_event_data.items()) | ||
|
||
while batch := dict(islice(items, batch_size)): | ||
batch_key = str(uuid.uuid4()) | ||
|
||
buffer.backend.push_to_hash_bulk( | ||
model=Project, | ||
filters={"project_id": project_id, "batch_key": batch_key}, | ||
data=batch, | ||
) | ||
|
||
# remove the batched items from the project rulegroup_to_event_data | ||
buffer.backend.delete_hash( | ||
model=Project, filters={"project_id": project_id}, fields=list(batch.keys()) | ||
) | ||
|
||
apply_delayed.delayed(project_id, batch_key) | ||
|
||
|
||
def process_delayed_alert_conditions() -> None: | ||
with metrics.timer("delayed_processing.process_all_conditions.duration"): | ||
fetch_time = datetime.now(tz=timezone.utc) | ||
|
@@ -473,7 +535,7 @@ def process_delayed_alert_conditions() -> None: | |
logger.info("delayed_processing.project_id_list", extra={"project_ids": log_str}) | ||
|
||
for project_id, _ in project_ids: | ||
apply_delayed.delay(project_id) | ||
process_rulegroups_in_batches(project_id) | ||
|
||
buffer.backend.delete_key(PROJECT_ID_BUFFER_LIST_KEY, min=0, max=fetch_time.timestamp()) | ||
|
||
|
@@ -487,32 +549,15 @@ def process_delayed_alert_conditions() -> None: | |
time_limit=60, | ||
silo_mode=SiloMode.REGION, | ||
) | ||
def apply_delayed(project_id: int, *args: Any, **kwargs: Any) -> None: | ||
def apply_delayed(project_id: int, batch_key: str | None = None, *args: Any, **kwargs: Any) -> None: | ||
""" | ||
Grab rules, groups, and events from the Redis buffer, evaluate the "slow" conditions in a bulk snuba query, and fire them if they pass | ||
""" | ||
project = fetch_project(project_id) | ||
if not project: | ||
# Should we remove the project_id from the redis queue? | ||
return | ||
|
||
rulegroup_to_event_data = fetch_rulegroup_to_event_data(project_id) | ||
num_groups = len(rulegroup_to_event_data) | ||
num_groups_bucketed = bucket_num_groups(num_groups) | ||
metrics.incr("delayed_processing.num_groups", tags={"num_groups": num_groups_bucketed}) | ||
|
||
if num_groups >= 10000: | ||
logger.error( | ||
"delayed_processing.too_many_groups", | ||
extra={ | ||
"project_id": project_id, | ||
"num_groups": num_groups, | ||
"organization_id": project.organization_id, | ||
}, | ||
) | ||
# TODO @saponifi3d - Split the processing from here into smaller groups | ||
return | ||
|
||
rulegroup_to_event_data = fetch_rulegroup_to_event_data(project_id, batch_key) | ||
rules_to_groups = get_rules_to_groups(rulegroup_to_event_data) | ||
alert_rules = fetch_alert_rules(list(rules_to_groups.keys())) | ||
condition_groups = get_condition_query_groups(alert_rules, rules_to_groups) | ||
|
@@ -542,7 +587,7 @@ def apply_delayed(project_id: int, *args: Any, **kwargs: Any) -> None: | |
with metrics.timer("delayed_processing.fire_rules.duration"): | ||
fire_rules(rules_to_fire, parsed_rulegroup_to_event_data, alert_rules, project) | ||
|
||
cleanup_redis_buffer(project_id, rules_to_groups) | ||
cleanup_redis_buffer(project_id, rules_to_groups, batch_key) | ||
|
||
|
||
if not redis_buffer_registry.has(BufferHookEvent.FLUSH): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
According to https://redis.io/docs/latest/commands/hmset/, this command is deprecated after 4.0 and can be replaced with
hset
which can take multiple key value pairs.I'm not what version of redis we use tho
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah, i had initially tried the
hset
withmapping
syntax and it threw errors, so we'll need to use hmset until then.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah ok, could we add a comment somewhere then referencing the future deprecation?