ref(crons): Correct logic for mark_ok (#57730)

Previously when incidents were enabled (a recovery threshold was set) a monitor would not have it's `next_checkin` or `next_checkin_latest` updated until it recovered.
getsentry · Oct 10, 2023 · dcd6da4 · dcd6da4
1 parent 23cef81
commit dcd6da4
Show file tree

Hide file tree

Showing 2 changed files with 123 additions and 44 deletions.
diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
@@ -13,18 +13,6 @@
 def mark_ok(checkin: MonitorCheckIn, ts: datetime):
     monitor_env = checkin.monitor_environment
 
-    recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
-    if recovery_threshold:
-        previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
-            "-date_added"
-        )[:recovery_threshold]
-        # check for successive OK previous check-ins
-        if not all(
-            previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
-        ):
-            # don't send occurrence for active issue on an OK check-in
-            return
-
     next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
     next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(ts)
 
@@ -33,17 +21,49 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
         "next_checkin": next_checkin,
         "next_checkin_latest": next_checkin_latest,
     }
-    if checkin.status == CheckInStatus.OK:
+
+    # If the monitor is already in OK status there's not incident update or
+    # status change necessary
+    if monitor_env.status == CheckInStatus.OK:
+        MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
+            **params
+        )
+        return
+
+    recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
+    using_incidents = bool(recovery_threshold)
+
+    # If we're not using incidents we can just immediately update the status
+    if not using_incidents and monitor_env.monitor.status != ObjectStatus.DISABLED:
+        MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
+            status=MonitorStatus.OK,
+            **params,
+        )
+        return
+
+    # Check if our incident is recovering
+    previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
+        "-date_added"
+    )[:recovery_threshold]
+
+    # Incident recovers when we have successive threshold check-ins
+    incident_recovering = all(
+        previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
+    )
+
+    # Resolve the incident
+    if incident_recovering and monitor_env.status != MonitorStatus.OK:
+        MonitorIncident.objects.filter(
+            monitor_environment=monitor_env,
+            grouphash=monitor_env.incident_grouphash,
+        ).update(
+            resolving_checkin=checkin,
+            resolving_timestamp=checkin.date_added,
+        )
+
         if monitor_env.monitor.status != ObjectStatus.DISABLED:
-            params["status"] = MonitorStatus.OK
-        # in the future this will auto-resolve associated issues
-        if monitor_env.status != MonitorStatus.OK:
             params["last_state_change"] = ts
-            # resolve any associated incidents
-            incidents = MonitorIncident.objects.filter(
-                monitor_environment=monitor_env, grouphash=monitor_env.incident_grouphash
-            )
-            incidents.update(resolving_checkin=checkin, resolving_timestamp=checkin.date_added)
+            params["status"] = MonitorStatus.OK
 
     MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
         **params

diff --git a/tests/sentry/monitors/logic/test_mark_ok.py b/tests/sentry/monitors/logic/test_mark_ok.py
@@ -1,6 +1,7 @@
-from unittest.mock import patch
+from datetime import timedelta
+
+from django.utils import timezone
 
-from sentry.monitors.logic.mark_failed import mark_failed
 from sentry.monitors.logic.mark_ok import mark_ok
 from sentry.monitors.models import (
     CheckInStatus,
@@ -13,96 +14,154 @@
     ScheduleType,
 )
 from sentry.testutils.cases import TestCase
-from sentry.testutils.helpers import with_feature
 from sentry.testutils.silo import region_silo_test
 
 
 @region_silo_test(stable=True)
 class MarkOkTestCase(TestCase):
-    @with_feature("organizations:issue-platform")
-    @patch("sentry.issues.producer.produce_occurrence_to_kafka")
-    def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
+    def test_mark_ok_simple(self):
+        now = timezone.now().replace(second=0, microsecond=0)
+
+        monitor = Monitor.objects.create(
+            name="test monitor",
+            organization_id=self.organization.id,
+            project_id=self.project.id,
+            type=MonitorType.CRON_JOB,
+            config={
+                "schedule": "* * * * *",
+                "schedule_type": ScheduleType.CRONTAB,
+                "max_runtime": None,
+                "checkin_margin": None,
+            },
+        )
+
+        # Start with monitor in an ERROR state
+        monitor_environment = MonitorEnvironment.objects.create(
+            monitor=monitor,
+            environment=self.environment,
+            status=MonitorStatus.ERROR,
+            last_checkin=now - timedelta(minutes=1),
+            next_checkin=now,
+        )
+
+        # OK checkin comes in
+        success_checkin = MonitorCheckIn.objects.create(
+            monitor=monitor,
+            monitor_environment=monitor_environment,
+            project_id=self.project.id,
+            status=CheckInStatus.OK,
+            date_added=now,
+        )
+        mark_ok(success_checkin, ts=now)
+
+        # Monitor has recovered to OK with updated upcoming timestamps
+        monitor_environment.refresh_from_db()
+        assert monitor_environment.status == MonitorStatus.OK
+        assert monitor_environment.next_checkin == now + timedelta(minutes=1)
+        assert monitor_environment.next_checkin_latest == now + timedelta(minutes=2)
+        assert monitor_environment.last_checkin == now
+
+    def test_mark_ok_recovery_threshold(self):
+        now = timezone.now().replace(second=0, microsecond=0)
+
         recovery_threshold = 8
         monitor = Monitor.objects.create(
             name="test monitor",
             organization_id=self.organization.id,
             project_id=self.project.id,
             type=MonitorType.CRON_JOB,
             config={
-                "schedule": [1, "month"],
-                "schedule_type": ScheduleType.INTERVAL,
+                "schedule": "* * * * *",
+                "schedule_type": ScheduleType.CRONTAB,
                 "recovery_threshold": recovery_threshold,
                 "max_runtime": None,
                 "checkin_margin": None,
             },
         )
+
+        # Start with monitor in an ERROR state with an active incident
         monitor_environment = MonitorEnvironment.objects.create(
             monitor=monitor,
             environment=self.environment,
             status=MonitorStatus.ERROR,
             last_state_change=None,
         )
-
-        checkin = MonitorCheckIn.objects.create(
+        first_checkin = MonitorCheckIn.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
             project_id=self.project.id,
             status=CheckInStatus.ERROR,
+            date_added=now,
         )
-
         incident = MonitorIncident.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
-            starting_checkin=checkin,
-            starting_timestamp=checkin.date_added,
+            starting_checkin=first_checkin,
+            starting_timestamp=first_checkin.date_added,
             grouphash=monitor_environment.incident_grouphash,
         )
 
+        # Create OK check-ins
         for i in range(0, recovery_threshold - 1):
+            now = now + timedelta(minutes=1)
             checkin = MonitorCheckIn.objects.create(
                 monitor=monitor,
                 monitor_environment=monitor_environment,
                 project_id=self.project.id,
                 status=CheckInStatus.OK,
+                date_added=now,
             )
             mark_ok(checkin, checkin.date_added)
 
         # failure has not hit threshold, monitor should be in an OK status
-        monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
+        incident.refresh_from_db()
+        monitor_environment.refresh_from_db()
+
         assert monitor_environment.status != MonitorStatus.OK
+        assert monitor_environment.next_checkin == now + timedelta(minutes=1)
+
         # check that timestamp has not updated
         assert monitor_environment.last_state_change is None
+        # Incidnet has not resolved
+        assert incident.resolving_checkin is None
+        assert incident.resolving_timestamp is None
 
         # create another failed check-in to break the chain
-        failed_checkin = MonitorCheckIn.objects.create(
+        last_checkin = MonitorCheckIn.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
             project_id=self.project.id,
             status=CheckInStatus.ERROR,
         )
-        mark_failed(failed_checkin, ts=failed_checkin.date_added)
-        # assert occurrence was sent
-        assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1
 
-        last_checkin = None
+        # Still not resolved
+        incident.refresh_from_db()
+        assert incident.resolving_checkin is None
+        assert incident.resolving_timestamp is None
+
+        # Create enough check-ins to resolve the incident
         for i in range(0, recovery_threshold):
+            now = now + timedelta(minutes=1)
             checkin = MonitorCheckIn.objects.create(
                 monitor=monitor,
                 monitor_environment=monitor_environment,
                 project_id=self.project.id,
                 status=CheckInStatus.OK,
+                date_added=now,
             )
             if i == (recovery_threshold - 1):
                 last_checkin = checkin
             mark_ok(checkin, checkin.date_added)
 
         # recovery has hit threshold, monitor should be in an ok state
-        monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
+        incident.refresh_from_db()
+        monitor_environment.refresh_from_db()
+
         assert monitor_environment.status == MonitorStatus.OK
+        assert monitor_environment.next_checkin == last_checkin.date_added + timedelta(minutes=1)
+
         # check that monitor environment has updated timestamp used for fingerprinting
         assert monitor_environment.last_state_change == monitor_environment.last_checkin
-
-        # check that resolving check-in is set on the incident
-        incident = MonitorIncident.objects.get(id=incident.id)
+        # Incident reoslved
         assert incident.resolving_checkin == last_checkin
         assert incident.resolving_timestamp == last_checkin.date_added