diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index 4dea9bb6fb9e0c..582f2a5cd76597 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -13,20 +13,6 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
 
     monitor_env = checkin.monitor_environment
 
-    recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
-    if recovery_threshold:
-        previous_checkins = (
-            MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
-            .order_by("-date_added")
-            .values("id", "date_added", "status")[:recovery_threshold]
-        )
-        # check for successive OK previous check-ins
-        if not all(
-            previous_checkin["status"] == CheckInStatus.OK for previous_checkin in previous_checkins
-        ):
-            # don't send occurrence for active issue on an OK check-in
-            return
-
     next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
     next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(ts)
 
@@ -35,17 +21,43 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
         "next_checkin": next_checkin,
         "next_checkin_latest": next_checkin_latest,
     }
-    if checkin.status == CheckInStatus.OK:
-        if monitor_env.monitor.status != ObjectStatus.DISABLED:
-            params["status"] = MonitorStatus.OK
-        # in the future this will auto-resolve associated issues
-        if monitor_env.status != MonitorStatus.OK:
-            params["last_state_change"] = ts
-            # resolve any associated incidents
-            incidents = MonitorIncident.objects.filter(
-                monitor_environment=monitor_env, grouphash=monitor_env.incident_grouphash
+
+    if (
+        monitor_env.monitor.status != ObjectStatus.DISABLED
+        and monitor_env.status != MonitorStatus.OK
+    ):
+        params["status"] = MonitorStatus.OK
+        recovery_threshold = monitor_env.monitor.config.get("recovery_threshold")
+
+        # Run incident logic if recovery threshold is set
+        if recovery_threshold:
+            # Check if our incident is recovering
+            previous_checkins = (
+                MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
+                .values("id", "date_added", "status")
+                .order_by("-date_added")[:recovery_threshold]
             )
-            incidents.update(resolving_checkin=checkin, resolving_timestamp=checkin.date_added)
+
+            # Incident recovers when we have successive threshold check-ins
+            incident_recovering = all(
+                previous_checkin["status"] == CheckInStatus.OK
+                for previous_checkin in previous_checkins
+            )
+
+            # Resolve the incident
+            if incident_recovering:
+                MonitorIncident.objects.filter(
+                    monitor_environment=monitor_env,
+                    grouphash=monitor_env.incident_grouphash,
+                ).update(
+                    resolving_checkin=checkin,
+                    resolving_timestamp=checkin.date_added,
+                )
+
+                params["last_state_change"] = ts
+            else:
+                # Don't update status if incident isn't recovered
+                params.pop("status", None)
 
     MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
         **params
diff --git a/tests/sentry/monitors/logic/test_mark_ok.py b/tests/sentry/monitors/logic/test_mark_ok.py
index bb12924aa55b5e..8ceb4e04652376 100644
--- a/tests/sentry/monitors/logic/test_mark_ok.py
+++ b/tests/sentry/monitors/logic/test_mark_ok.py
@@ -1,6 +1,7 @@
-from unittest.mock import patch
+from datetime import timedelta
+
+from django.utils import timezone
 
-from sentry.monitors.logic.mark_failed import mark_failed
 from sentry.monitors.logic.mark_ok import mark_ok
 from sentry.monitors.models import (
     CheckInStatus,
@@ -13,15 +14,57 @@
     ScheduleType,
 )
 from sentry.testutils.cases import TestCase
-from sentry.testutils.helpers import with_feature
 from sentry.testutils.silo import region_silo_test
 
 
 @region_silo_test(stable=True)
 class MarkOkTestCase(TestCase):
-    @with_feature("organizations:issue-platform")
-    @patch("sentry.issues.producer.produce_occurrence_to_kafka")
-    def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
+    def test_mark_ok_simple(self):
+        now = timezone.now().replace(second=0, microsecond=0)
+
+        monitor = Monitor.objects.create(
+            name="test monitor",
+            organization_id=self.organization.id,
+            project_id=self.project.id,
+            type=MonitorType.CRON_JOB,
+            config={
+                "schedule": "* * * * *",
+                "schedule_type": ScheduleType.CRONTAB,
+                "max_runtime": None,
+                "checkin_margin": None,
+                "recovery_threshold": None,
+            },
+        )
+
+        # Start with monitor in an ERROR state
+        monitor_environment = MonitorEnvironment.objects.create(
+            monitor=monitor,
+            environment=self.environment,
+            status=MonitorStatus.ERROR,
+            last_checkin=now - timedelta(minutes=1),
+            next_checkin=now,
+        )
+
+        # OK checkin comes in
+        success_checkin = MonitorCheckIn.objects.create(
+            monitor=monitor,
+            monitor_environment=monitor_environment,
+            project_id=self.project.id,
+            status=CheckInStatus.OK,
+            date_added=now,
+        )
+        mark_ok(success_checkin, ts=now)
+
+        # Monitor has recovered to OK with updated upcoming timestamps
+        monitor_environment.refresh_from_db()
+        assert monitor_environment.status == MonitorStatus.OK
+        assert monitor_environment.next_checkin == now + timedelta(minutes=1)
+        assert monitor_environment.next_checkin_latest == now + timedelta(minutes=2)
+        assert monitor_environment.last_checkin == now
+
+    def test_mark_ok_recovery_threshold(self):
+        now = timezone.now().replace(second=0, microsecond=0)
+
         recovery_threshold = 8
         monitor = Monitor.objects.create(
             name="test monitor",
@@ -29,80 +72,97 @@ def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
             project_id=self.project.id,
             type=MonitorType.CRON_JOB,
             config={
-                "schedule": [1, "month"],
-                "schedule_type": ScheduleType.INTERVAL,
+                "schedule": "* * * * *",
+                "schedule_type": ScheduleType.CRONTAB,
                 "recovery_threshold": recovery_threshold,
                 "max_runtime": None,
                 "checkin_margin": None,
             },
         )
+
+        # Start with monitor in an ERROR state with an active incident
         monitor_environment = MonitorEnvironment.objects.create(
             monitor=monitor,
             environment=self.environment,
             status=MonitorStatus.ERROR,
             last_state_change=None,
         )
-
-        checkin = MonitorCheckIn.objects.create(
+        first_checkin = MonitorCheckIn.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
             project_id=self.project.id,
             status=CheckInStatus.ERROR,
+            date_added=now,
         )
-
         incident = MonitorIncident.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
-            starting_checkin=checkin,
-            starting_timestamp=checkin.date_added,
+            starting_checkin=first_checkin,
+            starting_timestamp=first_checkin.date_added,
             grouphash=monitor_environment.incident_grouphash,
         )
 
+        # Create OK check-ins
         for i in range(0, recovery_threshold - 1):
+            now = now + timedelta(minutes=1)
             checkin = MonitorCheckIn.objects.create(
                 monitor=monitor,
                 monitor_environment=monitor_environment,
                 project_id=self.project.id,
                 status=CheckInStatus.OK,
+                date_added=now,
             )
             mark_ok(checkin, checkin.date_added)
 
         # failure has not hit threshold, monitor should be in an OK status
-        monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
+        incident.refresh_from_db()
+        monitor_environment.refresh_from_db()
+        assert monitor_environment.status != MonitorStatus.OK
+        assert monitor_environment.next_checkin == now + timedelta(minutes=1)
+
+        # check that timestamp has not updated
         assert monitor_environment.last_state_change is None
+        # Incident has not resolved
+        assert incident.resolving_checkin is None
+        assert incident.resolving_timestamp is None
 
         # create another failed check-in to break the chain
-        failed_checkin = MonitorCheckIn.objects.create(
+        last_checkin = MonitorCheckIn.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
             project_id=self.project.id,
             status=CheckInStatus.ERROR,
         )
-        mark_failed(failed_checkin, ts=failed_checkin.date_added)
-        # assert occurrence was sent
-        assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1
 
-        last_checkin = None
+        # Still not resolved
+        incident.refresh_from_db()
+        assert incident.resolving_checkin is None
+        assert incident.resolving_timestamp is None
+
+        # Create enough check-ins to resolve the incident
         for i in range(0, recovery_threshold):
+            now = now + timedelta(minutes=1)
             checkin = MonitorCheckIn.objects.create(
                 monitor=monitor,
                 monitor_environment=monitor_environment,
                 project_id=self.project.id,
                 status=CheckInStatus.OK,
+                date_added=now,
             )
             if i == (recovery_threshold - 1):
                 last_checkin = checkin
             mark_ok(checkin, checkin.date_added)
 
         # recovery has hit threshold, monitor should be in an ok state
-        monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
+        incident.refresh_from_db()
+        monitor_environment.refresh_from_db()
+
+        assert monitor_environment.status == MonitorStatus.OK
+        assert monitor_environment.next_checkin == last_checkin.date_added + timedelta(minutes=1)
+
+        # check that monitor environment has updated timestamp used for fingerprinting
         assert monitor_environment.last_state_change == monitor_environment.last_checkin
-
-        # check that resolving check-in is set on the incident
-        incident = MonitorIncident.objects.get(id=incident.id)
+        # Incident resolved
         assert incident.resolving_checkin == last_checkin
        assert incident.resolving_timestamp == last_checkin.date_added