Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(crons): Fan out check_missing task to each monitor_environment #55924

Merged
merged 8 commits into from
Sep 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 37 additions & 31 deletions src/sentry/monitors/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,39 +205,45 @@ def check_missing(current_datetime: datetime):
)
metrics.gauge("sentry.monitors.tasks.check_missing.count", qs.count(), sample_rate=1.0)
for monitor_environment in qs:
try:
logger.info(
"monitor.missed-checkin", extra={"monitor_environment_id": monitor_environment.id}
)
check_missing_environment.delay(monitor_environment.id)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not a celery expert but what's the overhead of a task here? would it make sense at all to batch them?

is the overall goal of this PR simply to increase parallelism?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now we're only at a few hundred but could always switch to chunks later
https://docs.celeryq.dev/en/latest/userguide/canvas.html#chunks


monitor = monitor_environment.monitor
expected_time = monitor_environment.next_checkin

# add missed checkin.
#
# XXX(epurkhiser): The date_added is backdated so that this missed
# check-in correctly reflects the time of when the checkin SHOULD
# have happened. It is the same as the expected_time.
MonitorCheckIn.objects.create(
project_id=monitor_environment.monitor.project_id,
monitor=monitor_environment.monitor,
monitor_environment=monitor_environment,
status=CheckInStatus.MISSED,
date_added=expected_time,
expected_time=expected_time,
monitor_config=monitor.get_validated_config(),
)
mark_failed(
monitor_environment,
reason=MonitorFailure.MISSED_CHECKIN,
occurrence_context={
"expected_time": expected_time.strftime(SUBTITLE_DATETIME_FORMAT)
if expected_time
else expected_time
},
)
except Exception:
logger.exception("Exception in check_monitors - mark missed", exc_info=True)
@instrumented_task(
name="sentry.monitors.tasks.check_missing_environment",
max_retries=0,
)
def check_missing_environment(monitor_environment_id: int):
logger.info("monitor.missed-checkin", extra={"monitor_environment_id": monitor_environment_id})

monitor_environment = MonitorEnvironment.objects.select_related("monitor").get(
id=monitor_environment_id
)
monitor = monitor_environment.monitor
expected_time = monitor_environment.next_checkin

# add missed checkin.
#
# XXX(epurkhiser): The date_added is backdated so that this missed
# check-in correctly reflects the time of when the checkin SHOULD
# have happened. It is the same as the expected_time.
MonitorCheckIn.objects.create(
project_id=monitor_environment.monitor.project_id,
monitor=monitor_environment.monitor,
monitor_environment=monitor_environment,
status=CheckInStatus.MISSED,
date_added=expected_time,
expected_time=expected_time,
monitor_config=monitor.get_validated_config(),
)
mark_failed(
monitor_environment,
reason=MonitorFailure.MISSED_CHECKIN,
occurrence_context={
"expected_time": expected_time.strftime(SUBTITLE_DATETIME_FORMAT)
if expected_time
else expected_time
},
)


@instrumented_task(
Expand Down
52 changes: 41 additions & 11 deletions tests/sentry/monitors/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from unittest import mock

import msgpack
import pytest
from arroyo.backends.kafka import KafkaPayload
from django.test import override_settings
from django.utils import timezone
Expand All @@ -18,6 +19,7 @@
)
from sentry.monitors.tasks import (
check_missing,
check_missing_environment,
check_timeout,
clock_pulse,
try_monitor_tasks_trigger,
Expand Down Expand Up @@ -48,7 +50,8 @@ def make_ref_time():


class MonitorTaskCheckMissingTest(TestCase):
def test_missing_checkin(self):
@mock.patch("sentry.monitors.tasks.check_missing_environment")
def test_missing_checkin(self, check_missing_environment_mock):
org = self.create_organization()
project = self.create_project(organization=org)

Expand Down Expand Up @@ -78,6 +81,14 @@ def test_missing_checkin(self):

check_missing(task_run_ts)

# assert that task is called for the specific environment
assert check_missing_environment_mock.delay.call_count == 1
assert check_missing_environment_mock.delay.mock_calls[0] == mock.call(
monitor_environment.id
)

check_missing_environment(monitor_environment.id)

# Monitor status is updated
monitor_environment = MonitorEnvironment.objects.get(
id=monitor_environment.id, status=MonitorStatus.MISSED_CHECKIN
Expand All @@ -98,7 +109,8 @@ def test_missing_checkin(self):
).replace(second=0, microsecond=0)
assert missed_checkin.monitor_config == monitor.config

def test_missing_checkin_with_margin(self):
@mock.patch("sentry.monitors.tasks.check_missing_environment")
def test_missing_checkin_with_margin(self, check_missing_environment_mock):
org = self.create_organization()
project = self.create_project(organization=org)

Expand Down Expand Up @@ -132,6 +144,9 @@ def test_missing_checkin_with_margin(self):
# No missed check-in generated as we're still within the check-in margin
check_missing(task_run_ts)

# assert that task is not called for the specific environment
assert check_missing_environment_mock.delay.call_count == 0

assert not MonitorEnvironment.objects.filter(
id=monitor_environment.id, status=MonitorStatus.MISSED_CHECKIN
).exists()
Expand All @@ -142,6 +157,14 @@ def test_missing_checkin_with_margin(self):
# Missed check-in generated as clock now exceeds expected time plus margin
check_missing(task_run_ts + timedelta(minutes=4))

# assert that task is called for the specific environment
assert check_missing_environment_mock.delay.call_count == 1
assert check_missing_environment_mock.delay.mock_calls[0] == mock.call(
monitor_environment.id
)

check_missing_environment(monitor_environment.id)

monitor_environment = MonitorEnvironment.objects.get(
id=monitor_environment.id, status=MonitorStatus.MISSED_CHECKIN
)
Expand All @@ -157,7 +180,7 @@ def test_missing_checkin_with_margin(self):
)

# Missed checkins are back-dated to when the checkin was expected to
# happpen. In this case the expected_time is equal to the date_added.
# happen. In this case the expected_time is equal to the date_added.
assert missed_check.date_added == (
monitor_environment.last_checkin + timedelta(minutes=10)
).replace(second=0, microsecond=0)
Expand Down Expand Up @@ -269,8 +292,8 @@ def test_not_missing_checkin(self):
monitor_environment=monitor_environment.id, status=CheckInStatus.MISSED
).exists()

@mock.patch("sentry.monitors.tasks.logger")
def test_missed_exception_handling(self, logger):
@mock.patch("sentry.monitors.tasks.check_missing_environment")
def test_missed_exception_handling(self, check_missing_environment_mock):
org = self.create_organization()
project = self.create_project(organization=org)

Expand All @@ -287,7 +310,7 @@ def test_missed_exception_handling(self, logger):
"schedule": [-2, "minute"],
},
)
MonitorEnvironment.objects.create(
failing_monitor_environment = MonitorEnvironment.objects.create(
monitor=exception_monitor,
environment=self.environment,
next_checkin=ts - timedelta(minutes=1),
Expand All @@ -301,7 +324,7 @@ def test_missed_exception_handling(self, logger):
type=MonitorType.CRON_JOB,
config={"schedule": "* * * * *"},
)
monitor_environment = MonitorEnvironment.objects.create(
successful_monitor_environment = MonitorEnvironment.objects.create(
monitor=monitor,
environment=self.environment,
next_checkin=ts - timedelta(minutes=1),
Expand All @@ -311,15 +334,22 @@ def test_missed_exception_handling(self, logger):

check_missing(task_run_ts)

# Logged the exception
assert logger.exception.call_count == 1
# assert that task is called for the specific environments
assert check_missing_environment_mock.delay.call_count == 2

# assert failing monitor raises an error
with pytest.raises(ValueError):
check_missing_environment(failing_monitor_environment.id)

# assert regular monitor works
check_missing_environment(successful_monitor_environment.id)

# We still marked a monitor as missed
assert MonitorEnvironment.objects.filter(
id=monitor_environment.id, status=MonitorStatus.MISSED_CHECKIN
id=successful_monitor_environment.id, status=MonitorStatus.MISSED_CHECKIN
).exists()
assert MonitorCheckIn.objects.filter(
monitor_environment=monitor_environment.id, status=CheckInStatus.MISSED
monitor_environment=successful_monitor_environment.id, status=CheckInStatus.MISSED
).exists()


Expand Down
Loading