From a588ab33d87244c7b2f666e13b5ce67a4714649d Mon Sep 17 00:00:00 2001
From: Evan Purkhiser
Date: Fri, 6 Oct 2023 12:30:05 -0700
Subject: [PATCH 1/7] ref(crons): Correct logic for mark_ok

Previously, when incidents were enabled (a recovery threshold was set),
a monitor would not have its `next_checkin` or `next_checkin_latest`
updated until it recovered.
---
 src/sentry/monitors/logic/mark_ok.py        |  62 ++++++++----
 tests/sentry/monitors/logic/test_mark_ok.py | 105 +++++++++++++++-----
 2 files changed, 123 insertions(+), 44 deletions(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index 63a387c8da9d33..af3bd168c9daf0 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -13,18 +13,6 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
     monitor_env = checkin.monitor_environment
 
-    recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
-    if recovery_threshold:
-        previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
-            "-date_added"
-        )[:recovery_threshold]
-        # check for successive OK previous check-ins
-        if not all(
-            previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
-        ):
-            # don't send occurrence for active issue on an OK check-in
-            return
-
     next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
     next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(ts)
 
@@ -33,17 +21,49 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
         "next_checkin": next_checkin,
         "next_checkin_latest": next_checkin_latest,
     }
-    if checkin.status == CheckInStatus.OK:
+
+    # If the monitor is already in OK status there's not incident update or
+    # status change necessary
+    if monitor_env.status == CheckInStatus.OK:
+        MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
+            **params
+        )
+        return
+
+    recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
+    using_incidents = bool(recovery_threshold)
+
+    # If we're not using incidents we can just immediately update the status
+    if not using_incidents and monitor_env.monitor.status != ObjectStatus.DISABLED:
+        MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
+            status=MonitorStatus.OK,
+            **params,
+        )
+        return
+
+    # Check if our incident is recovering
+    previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
+        "-date_added"
+    )[:recovery_threshold]
+
+    # Incident recovers when we have successive threshold check-ins
+    incident_recovering = all(
+        previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
+    )
+
+    # Resolve the incident
+    if incident_recovering and monitor_env.status != MonitorStatus.OK:
+        MonitorIncident.objects.filter(
+            monitor_environment=monitor_env,
+            grouphash=monitor_env.incident_grouphash,
+        ).update(
+            resolving_checkin=checkin,
+            resolving_timestamp=checkin.date_added,
+        )
+
+    if monitor_env.monitor.status != ObjectStatus.DISABLED:
-        params["status"] = MonitorStatus.OK
-        # in the future this will auto-resolve associated issues
-        if monitor_env.status != MonitorStatus.OK:
+        params["last_state_change"] = ts
-            # resolve any associated incidents
-            incidents = MonitorIncident.objects.filter(
-                monitor_environment=monitor_env, grouphash=monitor_env.incident_grouphash
-            )
-            incidents.update(resolving_checkin=checkin, resolving_timestamp=checkin.date_added)
+        params["status"] = MonitorStatus.OK
 
     MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
         **params

diff --git a/tests/sentry/monitors/logic/test_mark_ok.py b/tests/sentry/monitors/logic/test_mark_ok.py
index bb12924aa55b5e..201a76951faf34 100644
--- a/tests/sentry/monitors/logic/test_mark_ok.py
+++ b/tests/sentry/monitors/logic/test_mark_ok.py
@@ -1,6 +1,7 @@
-from unittest.mock import patch
+from datetime import timedelta
+
+from django.utils import timezone
 
-from sentry.monitors.logic.mark_failed import mark_failed
 from sentry.monitors.logic.mark_ok import mark_ok
 from sentry.monitors.models import (
     CheckInStatus,
@@ -13,15 +14,56 @@
     ScheduleType,
 )
 from sentry.testutils.cases import TestCase
-from sentry.testutils.helpers import with_feature
 from sentry.testutils.silo import region_silo_test
 
 
 @region_silo_test(stable=True)
 class MarkOkTestCase(TestCase):
-    @with_feature("organizations:issue-platform")
-    @patch("sentry.issues.producer.produce_occurrence_to_kafka")
-    def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
+    def test_mark_ok_simple(self):
+        now = timezone.now().replace(second=0, microsecond=0)
+
+        monitor = Monitor.objects.create(
+            name="test monitor",
+            organization_id=self.organization.id,
+            project_id=self.project.id,
+            type=MonitorType.CRON_JOB,
+            config={
+                "schedule": "* * * * *",
+                "schedule_type": ScheduleType.CRONTAB,
+                "max_runtime": None,
+                "checkin_margin": None,
+            },
+        )
+
+        # Start with monitor in an ERROR state
+        monitor_environment = MonitorEnvironment.objects.create(
+            monitor=monitor,
+            environment=self.environment,
+            status=MonitorStatus.ERROR,
+            last_checkin=now - timedelta(minutes=1),
+            next_checkin=now,
+        )
+
+        # OK checkin comes in
+        success_checkin = MonitorCheckIn.objects.create(
+            monitor=monitor,
+            monitor_environment=monitor_environment,
+            project_id=self.project.id,
+            status=CheckInStatus.OK,
+            date_added=now,
+        )
+        mark_ok(success_checkin, ts=now)
+
+        # Monitor has recovered to OK with updated upcoming timestamps
+        monitor_environment.refresh_from_db()
+        assert monitor_environment.status == MonitorStatus.OK
+        assert monitor_environment.next_checkin == now + timedelta(minutes=1)
+        assert monitor_environment.next_checkin_latest == now + timedelta(minutes=2)
+        assert monitor_environment.last_checkin == now
+
+    def test_mark_ok_recovery_threshold(self):
+        now = timezone.now().replace(second=0, microsecond=0)
+
         recovery_threshold = 8
         monitor = Monitor.objects.create(
             name="test monitor",
@@ -29,80 +71,97 @@ def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
             project_id=self.project.id,
             type=MonitorType.CRON_JOB,
             config={
-                "schedule": [1, "month"],
-                "schedule_type": ScheduleType.INTERVAL,
+                "schedule": "* * * * *",
+                "schedule_type": ScheduleType.CRONTAB,
                 "recovery_threshold": recovery_threshold,
                 "max_runtime": None,
                 "checkin_margin": None,
             },
         )
+
+        # Start with monitor in an ERROR state with an active incident
         monitor_environment = MonitorEnvironment.objects.create(
            monitor=monitor,
            environment=self.environment,
            status=MonitorStatus.ERROR,
            last_state_change=None,
        )
-
-        checkin = MonitorCheckIn.objects.create(
+        first_checkin = MonitorCheckIn.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
             project_id=self.project.id,
             status=CheckInStatus.ERROR,
+            date_added=now,
         )
-
         incident = MonitorIncident.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
-            starting_checkin=checkin,
-            starting_timestamp=checkin.date_added,
+            starting_checkin=first_checkin,
+            starting_timestamp=first_checkin.date_added,
             grouphash=monitor_environment.incident_grouphash,
         )
 
+        # Create OK check-ins
         for i in range(0, recovery_threshold - 1):
+            now = now + timedelta(minutes=1)
             checkin = MonitorCheckIn.objects.create(
                 monitor=monitor,
                 monitor_environment=monitor_environment,
                 project_id=self.project.id,
                 status=CheckInStatus.OK,
+                date_added=now,
             )
             mark_ok(checkin, checkin.date_added)
 
         # failure has not hit threshold, monitor should be in an OK status
-        monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
+        incident.refresh_from_db()
+        monitor_environment.refresh_from_db()
+
         assert monitor_environment.status != MonitorStatus.OK
+        assert monitor_environment.next_checkin == now + timedelta(minutes=1)
+
         # check that timestamp has not updated
         assert monitor_environment.last_state_change is None
+        # Incidnet has not resolved
+        assert incident.resolving_checkin is None
+        assert incident.resolving_timestamp is None
 
         # create another failed check-in to break the chain
-        failed_checkin = MonitorCheckIn.objects.create(
+        last_checkin = MonitorCheckIn.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
             project_id=self.project.id,
             status=CheckInStatus.ERROR,
         )
-        mark_failed(failed_checkin, ts=failed_checkin.date_added)
-        # assert occurrence was sent
-        assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1
 
-        last_checkin = None
+        # Still not resolved
+        incident.refresh_from_db()
+        assert incident.resolving_checkin is None
+        assert incident.resolving_timestamp is None
+
+        # Create enough check-ins to resolve the incident
         for i in range(0, recovery_threshold):
+            now = now + timedelta(minutes=1)
             checkin = MonitorCheckIn.objects.create(
                 monitor=monitor,
                 monitor_environment=monitor_environment,
                 project_id=self.project.id,
                 status=CheckInStatus.OK,
+                date_added=now,
             )
             if i == (recovery_threshold - 1):
                 last_checkin = checkin
             mark_ok(checkin, checkin.date_added)
 
         # recovery has hit threshold, monitor should be in an ok state
-        monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
+        incident.refresh_from_db()
+        monitor_environment.refresh_from_db()
+
         assert monitor_environment.status == MonitorStatus.OK
+        assert monitor_environment.next_checkin == last_checkin.date_added + timedelta(minutes=1)
+
         # check that monitor environment has updated timestamp used for fingerprinting
         assert monitor_environment.last_state_change == monitor_environment.last_checkin
-
-        # check that resolving check-in is set on the incident
-        incident = MonitorIncident.objects.get(id=incident.id)
+        # Incident reoslved
         assert incident.resolving_checkin == last_checkin
         assert incident.resolving_timestamp == last_checkin.date_added

From e2cc208cd5d03b96c1e4d9cb1d093e34c2ca78ab Mon Sep 17 00:00:00 2001
From: Richard Ortenberg
Date: Wed, 11 Oct 2023 16:48:47 -0700
Subject: [PATCH 2/7] fix(crons): Properly update monitorenvs in mark_ok with
 thresholds

---
 src/sentry/monitors/logic/mark_ok.py        | 70 ++++++++-----------
 tests/sentry/monitors/logic/test_mark_ok.py |  1 +
 2 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index af3bd168c9daf0..ce5c72f7fdf054 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -22,48 +22,44 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
         "next_checkin_latest": next_checkin_latest,
     }
 
-    # If the monitor is already in OK status there's not incident update or
-    # status change necessary
-    if monitor_env.status == CheckInStatus.OK:
-        MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
-            **params
-        )
-        return
+    # Don't update incidents + certain params if monitor is disabled or already in an OK state
+    if (
+        monitor_env.monitor.status != ObjectStatus.DISABLED
+        and monitor_env.status != MonitorStatus.OK
+    ):
+        params["status"] = MonitorStatus.OK
+        recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
 
-    recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
-    using_incidents = bool(recovery_threshold)
+        # Run incident logic if recovery threshold is set
+        if recovery_threshold:
+            # Safety check to prevent db lockup on check-in lookup
+            assert recovery_threshold is not None
 
-    # If we're not using incidents we can just immediately update the status
-    if not using_incidents and monitor_env.monitor.status != ObjectStatus.DISABLED:
-        MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
-            status=MonitorStatus.OK,
-            **params,
-        )
-        return
+            # Check if our incident is recovering
+            previous_checkins = MonitorCheckIn.objects.filter(
+                monitor_environment=monitor_env
+            ).order_by("-date_added")[:recovery_threshold]
 
-    # Check if our incident is recovering
-    previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
-        "-date_added"
-    )[:recovery_threshold]
+            # Incident recovers when we have successive threshold check-ins
+            incident_recovering = all(
+                previous_checkin.status == CheckInStatus.OK
+                for previous_checkin in previous_checkins
+            )
 
-    # Incident recovers when we have successive threshold check-ins
-    incident_recovering = all(
-        previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
-    )
-
-    # Resolve the incident
-    if incident_recovering and monitor_env.status != MonitorStatus.OK:
-        MonitorIncident.objects.filter(
-            monitor_environment=monitor_env,
-            grouphash=monitor_env.incident_grouphash,
-        ).update(
-            resolving_checkin=checkin,
-            resolving_timestamp=checkin.date_added,
-        )
+            # Resolve the incident
+            if incident_recovering:
+                MonitorIncident.objects.filter(
+                    monitor_environment=monitor_env,
+                    grouphash=monitor_env.incident_grouphash,
+                ).update(
+                    resolving_checkin=checkin,
+                    resolving_timestamp=checkin.date_added,
+                )
 
-    if monitor_env.monitor.status != ObjectStatus.DISABLED:
-        params["last_state_change"] = ts
-        params["status"] = MonitorStatus.OK
+                params["last_state_change"] = ts
+            else:
+                # Don't update status if incident isn't recovered
+                params.pop("status", None)
 
     MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
         **params

diff --git a/tests/sentry/monitors/logic/test_mark_ok.py b/tests/sentry/monitors/logic/test_mark_ok.py
index 201a76951faf34..6d9ad1c9444fab 100644
--- a/tests/sentry/monitors/logic/test_mark_ok.py
+++ b/tests/sentry/monitors/logic/test_mark_ok.py
@@ -32,6 +32,7 @@ def test_mark_ok_simple(self):
                 "schedule_type": ScheduleType.CRONTAB,
                 "max_runtime": None,
                 "checkin_margin": None,
+                "recovery_threshold": None,
             },
         )
 

From cd3fd116bff4e63302b20dca032c493f485c4b14 Mon Sep 17 00:00:00 2001
From: Richard Ortenberg
Date: Wed, 11 Oct 2023 16:53:40 -0700
Subject: [PATCH 3/7] drop comment

---
 src/sentry/monitors/logic/mark_ok.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index ce5c72f7fdf054..b7ed3713168c8a 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -22,7 +22,6 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
         "next_checkin_latest": next_checkin_latest,
     }
 
-    # Don't update incidents + certain params if monitor is disabled or already in an OK state
     if (
         monitor_env.monitor.status != ObjectStatus.DISABLED
         and monitor_env.status != MonitorStatus.OK

From 91bd064f340e3f4d0163e4e2541619f298623de4 Mon Sep 17 00:00:00 2001
From: Richard Ortenberg
Date: Wed, 11 Oct 2023 16:54:45 -0700
Subject: [PATCH 4/7] remove default

---
 src/sentry/monitors/logic/mark_ok.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index b7ed3713168c8a..d1e72e6ecfd7f6 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -27,7 +27,7 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
         and monitor_env.status != MonitorStatus.OK
     ):
         params["status"] = MonitorStatus.OK
-        recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
+        recovery_threshold = monitor_env.monitor.config.get("recovery_threshold")
 
         # Run incident logic if recovery threshold is set
         if recovery_threshold:

From c16c70a4382dd1d88cff3824ccf3a3c231d125ab Mon Sep 17 00:00:00 2001
From: Richard Ortenberg
Date: Wed, 11 Oct 2023 16:55:45 -0700
Subject: [PATCH 5/7] remove safety check

---
 src/sentry/monitors/logic/mark_ok.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index d1e72e6ecfd7f6..c3f0d47c2a8d04 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -31,9 +31,6 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
 
         # Run incident logic if recovery threshold is set
         if recovery_threshold:
-            # Safety check to prevent db lockup on check-in lookup
-            assert recovery_threshold is not None
-
             # Check if our incident is recovering
             previous_checkins = MonitorCheckIn.objects.filter(
                 monitor_environment=monitor_env

From 4eb51399331a91ad0b30f40d9f96ba70cb5e929a Mon Sep 17 00:00:00 2001
From: Richard Ortenberg
Date: Tue, 17 Oct 2023 13:35:53 -0700
Subject: [PATCH 6/7] fixed typos

---
 tests/sentry/monitors/logic/test_mark_ok.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/sentry/monitors/logic/test_mark_ok.py b/tests/sentry/monitors/logic/test_mark_ok.py
index 6d9ad1c9444fab..8ceb4e04652376 100644
--- a/tests/sentry/monitors/logic/test_mark_ok.py
+++ b/tests/sentry/monitors/logic/test_mark_ok.py
@@ -123,7 +123,7 @@ def test_mark_ok_recovery_threshold(self):
 
         # check that timestamp has not updated
         assert monitor_environment.last_state_change is None
-        # Incidnet has not resolved
+        # Incident has not resolved
         assert incident.resolving_checkin is None
         assert incident.resolving_timestamp is None
 
@@ -163,6 +163,6 @@ def test_mark_ok_recovery_threshold(self):
 
         # check that monitor environment has updated timestamp used for fingerprinting
         assert monitor_environment.last_state_change == monitor_environment.last_checkin
-        # Incident reoslved
+        # Incident resolved
         assert incident.resolving_checkin == last_checkin
         assert incident.resolving_timestamp == last_checkin.date_added

From 6a0568edcb862f384a8466f4daaa0525a33640b5 Mon Sep 17 00:00:00 2001
From: "getsantry[bot]" <66042841+getsantry[bot]@users.noreply.github.com>
Date: Tue, 17 Oct 2023 20:44:42 +0000
Subject: [PATCH 7/7] :hammer_and_wrench: apply pre-commit fixes

---
 src/sentry/monitors/logic/mark_ok.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index a3825e765951c3..582f2a5cd76597 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -32,9 +32,11 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
     # Run incident logic if recovery threshold is set
     if recovery_threshold:
         # Check if our incident is recovering
-        previous_checkins = MonitorCheckIn.objects.filter(
-            monitor_environment=monitor_env
-        ).values("id", "date_added", "status").order_by("-date_added")[:recovery_threshold]
+        previous_checkins = (
+            MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
+            .values("id", "date_added", "status")
+            .order_by("-date_added")[:recovery_threshold]
+        )
 
         # Incident recovers when we have successive threshold check-ins
         incident_recovering = all(
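
The end state the series converges on is easier to see outside of Django. Below is a minimal, self-contained sketch of the decision flow in `mark_ok` after these patches: timestamps always advance on an OK check-in, even mid-incident (the behavior PATCH 1/7 fixes), while the status only flips to OK once `recovery_threshold` consecutive OK check-ins arrive. The `Env` dataclass, the `OK`/`ERROR` constants, and the fixed one-minute interval are illustrative stand-ins, not Sentry's actual models or API:

from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import List, Optional

OK, ERROR = "ok", "error"


@dataclass
class Env:
    # Hypothetical stand-in for MonitorEnvironment, for illustration only
    status: str = ERROR
    disabled: bool = False
    recovery_threshold: Optional[int] = None
    recent_statuses: List[str] = field(default_factory=list)  # newest first
    last_checkin: Optional[datetime] = None
    next_checkin: Optional[datetime] = None
    last_state_change: Optional[datetime] = None


def mark_ok(env: Env, ts: datetime, interval: timedelta = timedelta(minutes=1)) -> None:
    # Timestamps always advance on an OK check-in, even during an incident.
    env.recent_statuses.insert(0, OK)
    env.last_checkin = ts
    env.next_checkin = ts + interval

    # Status only changes for enabled monitors that are not already OK.
    if env.disabled or env.status == OK:
        return

    if env.recovery_threshold:
        # With a threshold set, the incident resolves only after
        # `recovery_threshold` consecutive OK check-ins.
        recent = env.recent_statuses[: env.recovery_threshold]
        if len(recent) < env.recovery_threshold or any(s != OK for s in recent):
            return  # still inside the incident; keep the ERROR status
        env.last_state_change = ts

    env.status = OK


env = Env(recovery_threshold=3, recent_statuses=[ERROR])
start = datetime(2023, 10, 17, 12, 0)
for minute in range(4):
    mark_ok(env, start + timedelta(minutes=minute))
    print(env.status)  # "error", "error", "ok", "ok"

Note that the real queries additionally guard the write with `.exclude(last_checkin__gt=ts)`, so an out-of-order check-in cannot rewind timestamps that a newer check-in has already advanced; the sketch above omits that concurrency guard.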