-
-
Notifications
You must be signed in to change notification settings - Fork 4.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ref(crons): Correct logic for mark_ok #57730
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,18 +13,6 @@ | |
def mark_ok(checkin: MonitorCheckIn, ts: datetime): | ||
monitor_env = checkin.monitor_environment | ||
|
||
recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0) | ||
if recovery_threshold: | ||
previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by( | ||
"-date_added" | ||
)[:recovery_threshold] | ||
# check for successive OK previous check-ins | ||
if not all( | ||
previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins | ||
): | ||
# don't send occurrence for active issue on an OK check-in | ||
return | ||
|
||
next_checkin = monitor_env.monitor.get_next_expected_checkin(ts) | ||
next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(ts) | ||
|
||
|
@@ -33,17 +21,49 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime): | |
"next_checkin": next_checkin, | ||
"next_checkin_latest": next_checkin_latest, | ||
} | ||
if checkin.status == CheckInStatus.OK: | ||
|
||
# If the monitor is already in OK status there's no incident update or | ||
# status change necessary | ||
if monitor_env.status == CheckInStatus.OK: | ||
MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update( | ||
**params | ||
) | ||
return | ||
|
||
recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0) | ||
using_incidents = bool(recovery_threshold) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe just change this to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah I basically just duplicated what the logic was before |
||
|
||
# If we're not using incidents we can just immediately update the status | ||
if not using_incidents and monitor_env.monitor.status != ObjectStatus.DISABLED: | ||
MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update( | ||
status=MonitorStatus.OK, | ||
**params, | ||
) | ||
return | ||
|
||
# Check if our incident is recovering | ||
previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by( | ||
"-date_added" | ||
)[:recovery_threshold] | ||
|
||
# Incident recovers when we have successive threshold check-ins | ||
incident_recovering = all( | ||
previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins | ||
) | ||
|
||
# Resolve the incident | ||
if incident_recovering and monitor_env.status != MonitorStatus.OK: | ||
MonitorIncident.objects.filter( | ||
monitor_environment=monitor_env, | ||
grouphash=monitor_env.incident_grouphash, | ||
).update( | ||
resolving_checkin=checkin, | ||
resolving_timestamp=checkin.date_added, | ||
) | ||
|
||
if monitor_env.monitor.status != ObjectStatus.DISABLED: | ||
params["status"] = MonitorStatus.OK | ||
# in the future this will auto-resolve associated issues | ||
if monitor_env.status != MonitorStatus.OK: | ||
params["last_state_change"] = ts | ||
# resolve any associated incidents | ||
incidents = MonitorIncident.objects.filter( | ||
monitor_environment=monitor_env, grouphash=monitor_env.incident_grouphash | ||
) | ||
incidents.update(resolving_checkin=checkin, resolving_timestamp=checkin.date_added) | ||
params["status"] = MonitorStatus.OK | ||
|
||
MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update( | ||
**params | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
from unittest.mock import patch | ||
from datetime import timedelta | ||
|
||
from django.utils import timezone | ||
|
||
from sentry.monitors.logic.mark_failed import mark_failed | ||
from sentry.monitors.logic.mark_ok import mark_ok | ||
from sentry.monitors.models import ( | ||
CheckInStatus, | ||
|
@@ -13,96 +14,154 @@ | |
ScheduleType, | ||
) | ||
from sentry.testutils.cases import TestCase | ||
from sentry.testutils.helpers import with_feature | ||
from sentry.testutils.silo import region_silo_test | ||
|
||
|
||
@region_silo_test(stable=True) | ||
class MarkOkTestCase(TestCase): | ||
@with_feature("organizations:issue-platform") | ||
@patch("sentry.issues.producer.produce_occurrence_to_kafka") | ||
def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka): | ||
Comment on lines
-22
to
-23
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've made this more unit-test like by removing the dependency on |
||
def test_mark_ok_simple(self): | ||
now = timezone.now().replace(second=0, microsecond=0) | ||
|
||
monitor = Monitor.objects.create( | ||
name="test monitor", | ||
organization_id=self.organization.id, | ||
project_id=self.project.id, | ||
type=MonitorType.CRON_JOB, | ||
config={ | ||
"schedule": "* * * * *", | ||
"schedule_type": ScheduleType.CRONTAB, | ||
"max_runtime": None, | ||
"checkin_margin": None, | ||
}, | ||
) | ||
|
||
# Start with monitor in an ERROR state | ||
monitor_environment = MonitorEnvironment.objects.create( | ||
monitor=monitor, | ||
environment=self.environment, | ||
status=MonitorStatus.ERROR, | ||
last_checkin=now - timedelta(minutes=1), | ||
next_checkin=now, | ||
) | ||
|
||
# OK checkin comes in | ||
success_checkin = MonitorCheckIn.objects.create( | ||
monitor=monitor, | ||
monitor_environment=monitor_environment, | ||
project_id=self.project.id, | ||
status=CheckInStatus.OK, | ||
date_added=now, | ||
) | ||
mark_ok(success_checkin, ts=now) | ||
|
||
# Monitor has recovered to OK with updated upcoming timestamps | ||
monitor_environment.refresh_from_db() | ||
assert monitor_environment.status == MonitorStatus.OK | ||
assert monitor_environment.next_checkin == now + timedelta(minutes=1) | ||
assert monitor_environment.next_checkin_latest == now + timedelta(minutes=2) | ||
assert monitor_environment.last_checkin == now | ||
|
||
def test_mark_ok_recovery_threshold(self): | ||
now = timezone.now().replace(second=0, microsecond=0) | ||
|
||
recovery_threshold = 8 | ||
monitor = Monitor.objects.create( | ||
name="test monitor", | ||
organization_id=self.organization.id, | ||
project_id=self.project.id, | ||
type=MonitorType.CRON_JOB, | ||
config={ | ||
"schedule": [1, "month"], | ||
"schedule_type": ScheduleType.INTERVAL, | ||
"schedule": "* * * * *", | ||
"schedule_type": ScheduleType.CRONTAB, | ||
"recovery_threshold": recovery_threshold, | ||
"max_runtime": None, | ||
"checkin_margin": None, | ||
}, | ||
) | ||
|
||
# Start with monitor in an ERROR state with an active incident | ||
monitor_environment = MonitorEnvironment.objects.create( | ||
monitor=monitor, | ||
environment=self.environment, | ||
status=MonitorStatus.ERROR, | ||
last_state_change=None, | ||
) | ||
|
||
checkin = MonitorCheckIn.objects.create( | ||
first_checkin = MonitorCheckIn.objects.create( | ||
monitor=monitor, | ||
monitor_environment=monitor_environment, | ||
project_id=self.project.id, | ||
status=CheckInStatus.ERROR, | ||
date_added=now, | ||
) | ||
|
||
incident = MonitorIncident.objects.create( | ||
monitor=monitor, | ||
monitor_environment=monitor_environment, | ||
starting_checkin=checkin, | ||
starting_timestamp=checkin.date_added, | ||
starting_checkin=first_checkin, | ||
starting_timestamp=first_checkin.date_added, | ||
grouphash=monitor_environment.incident_grouphash, | ||
) | ||
|
||
# Create OK check-ins | ||
for i in range(0, recovery_threshold - 1): | ||
now = now + timedelta(minutes=1) | ||
checkin = MonitorCheckIn.objects.create( | ||
monitor=monitor, | ||
monitor_environment=monitor_environment, | ||
project_id=self.project.id, | ||
status=CheckInStatus.OK, | ||
date_added=now, | ||
) | ||
mark_ok(checkin, checkin.date_added) | ||
|
||
# failure has not hit threshold, monitor should be in an OK status | ||
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id) | ||
incident.refresh_from_db() | ||
monitor_environment.refresh_from_db() | ||
|
||
assert monitor_environment.status != MonitorStatus.OK | ||
assert monitor_environment.next_checkin == now + timedelta(minutes=1) | ||
|
||
# check that timestamp has not updated | ||
assert monitor_environment.last_state_change is None | ||
# Incident has not resolved | ||
assert incident.resolving_checkin is None | ||
assert incident.resolving_timestamp is None | ||
|
||
# create another failed check-in to break the chain | ||
failed_checkin = MonitorCheckIn.objects.create( | ||
last_checkin = MonitorCheckIn.objects.create( | ||
monitor=monitor, | ||
monitor_environment=monitor_environment, | ||
project_id=self.project.id, | ||
status=CheckInStatus.ERROR, | ||
) | ||
mark_failed(failed_checkin, ts=failed_checkin.date_added) | ||
# assert occurrence was sent | ||
assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1 | ||
|
||
last_checkin = None | ||
# Still not resolved | ||
incident.refresh_from_db() | ||
assert incident.resolving_checkin is None | ||
assert incident.resolving_timestamp is None | ||
|
||
# Create enough check-ins to resolve the incident | ||
for i in range(0, recovery_threshold): | ||
now = now + timedelta(minutes=1) | ||
checkin = MonitorCheckIn.objects.create( | ||
monitor=monitor, | ||
monitor_environment=monitor_environment, | ||
project_id=self.project.id, | ||
status=CheckInStatus.OK, | ||
date_added=now, | ||
) | ||
if i == (recovery_threshold - 1): | ||
last_checkin = checkin | ||
mark_ok(checkin, checkin.date_added) | ||
|
||
# recovery has hit threshold, monitor should be in an ok state | ||
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id) | ||
incident.refresh_from_db() | ||
monitor_environment.refresh_from_db() | ||
|
||
assert monitor_environment.status == MonitorStatus.OK | ||
assert monitor_environment.next_checkin == last_checkin.date_added + timedelta(minutes=1) | ||
|
||
# check that monitor environment has updated timestamp used for fingerprinting | ||
assert monitor_environment.last_state_change == monitor_environment.last_checkin | ||
|
||
# check that resolving check-in is set on the incident | ||
incident = MonitorIncident.objects.get(id=incident.id) | ||
# Incident resolved | ||
assert incident.resolving_checkin == last_checkin | ||
assert incident.resolving_timestamp == last_checkin.date_added |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this should be
1
as the default? or idk maybe a comment. the min value is 1There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you have the default here as
0
since if it's not set we don't want to enable incidents yet