From a588ab33d87244c7b2f666e13b5ce67a4714649d Mon Sep 17 00:00:00 2001
From: Evan Purkhiser
Date: Fri, 6 Oct 2023 12:30:05 -0700
Subject: [PATCH 1/7] ref(crons): Correct logic for mark_ok

Previously, when incidents were enabled (a recovery threshold was set),
a monitor would not have its `next_checkin` or `next_checkin_latest`
updated until it recovered.
---
 src/sentry/monitors/logic/mark_ok.py        |  62 ++++++++----
 tests/sentry/monitors/logic/test_mark_ok.py | 105 +++++++++++++++-----
 2 files changed, 123 insertions(+), 44 deletions(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index 63a387c8da9d33..af3bd168c9daf0 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -13,18 +13,6 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
     monitor_env = checkin.monitor_environment
 
-    recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
-    if recovery_threshold:
-        previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
-            "-date_added"
-        )[:recovery_threshold]
-        # check for successive OK previous check-ins
-        if not all(
-            previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
-        ):
-            # don't send occurrence for active issue on an OK check-in
-            return
-
     next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
     next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(ts)
 
@@ -33,17 +21,49 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
         "next_checkin": next_checkin,
         "next_checkin_latest": next_checkin_latest,
     }
-    if checkin.status == CheckInStatus.OK:
+
+    # If the monitor is already in OK status there's not incident update or
+    # status change necessary
+    if monitor_env.status == CheckInStatus.OK:
+        MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
+            **params
+        )
+        return
+
+    recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
+    using_incidents = bool(recovery_threshold)
+
+    # If we're not using incidents we can just immediately update the status
+    if not using_incidents and monitor_env.monitor.status != ObjectStatus.DISABLED:
+        MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
+            status=MonitorStatus.OK,
+            **params,
+        )
+        return
+
+    # Check if our incident is recovering
+    previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
+        "-date_added"
+    )[:recovery_threshold]
+
+    # Incident recovers when we have successive threshold check-ins
+    incident_recovering = all(
+        previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
+    )
+
+    # Resolve the incident
+    if incident_recovering and monitor_env.status != MonitorStatus.OK:
+        MonitorIncident.objects.filter(
+            monitor_environment=monitor_env,
+            grouphash=monitor_env.incident_grouphash,
+        ).update(
+            resolving_checkin=checkin,
+            resolving_timestamp=checkin.date_added,
+        )
+
+    if monitor_env.monitor.status != ObjectStatus.DISABLED:
-        params["status"] = MonitorStatus.OK
-        # in the future this will auto-resolve associated issues
-        if monitor_env.status != MonitorStatus.OK:
+        params["last_state_change"] = ts
-            # resolve any associated incidents
-            incidents = MonitorIncident.objects.filter(
-                monitor_environment=monitor_env, grouphash=monitor_env.incident_grouphash
-            )
-            incidents.update(resolving_checkin=checkin, resolving_timestamp=checkin.date_added)
+        params["status"] = MonitorStatus.OK
 
     MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
         **params

diff --git a/tests/sentry/monitors/logic/test_mark_ok.py b/tests/sentry/monitors/logic/test_mark_ok.py
index bb12924aa55b5e..201a76951faf34 100644
--- a/tests/sentry/monitors/logic/test_mark_ok.py
+++ b/tests/sentry/monitors/logic/test_mark_ok.py
@@ -1,6 +1,7 @@
-from unittest.mock import patch
+from datetime import timedelta
+
+from django.utils import timezone
 
-from sentry.monitors.logic.mark_failed import mark_failed
 from sentry.monitors.logic.mark_ok import mark_ok
 from sentry.monitors.models import (
     CheckInStatus,
@@ -13,15 +14,56 @@
     ScheduleType,
 )
 from sentry.testutils.cases import TestCase
-from sentry.testutils.helpers import with_feature
 from sentry.testutils.silo import region_silo_test
 
 
 @region_silo_test(stable=True)
 class MarkOkTestCase(TestCase):
-    @with_feature("organizations:issue-platform")
-    @patch("sentry.issues.producer.produce_occurrence_to_kafka")
-    def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
+    def test_mark_ok_simple(self):
+        now = timezone.now().replace(second=0, microsecond=0)
+
+        monitor = Monitor.objects.create(
+            name="test monitor",
+            organization_id=self.organization.id,
+            project_id=self.project.id,
+            type=MonitorType.CRON_JOB,
+            config={
+                "schedule": "* * * * *",
+                "schedule_type": ScheduleType.CRONTAB,
+                "max_runtime": None,
+                "checkin_margin": None,
+            },
+        )
+
+        # Start with monitor in an ERROR state
+        monitor_environment = MonitorEnvironment.objects.create(
+            monitor=monitor,
+            environment=self.environment,
+            status=MonitorStatus.ERROR,
+            last_checkin=now - timedelta(minutes=1),
+            next_checkin=now,
+        )
+
+        # OK checkin comes in
+        success_checkin = MonitorCheckIn.objects.create(
+            monitor=monitor,
+            monitor_environment=monitor_environment,
+            project_id=self.project.id,
+            status=CheckInStatus.OK,
+            date_added=now,
+        )
+        mark_ok(success_checkin, ts=now)
+
+        # Monitor has recovered to OK with updated upcoming timestamps
+        monitor_environment.refresh_from_db()
+        assert monitor_environment.status == MonitorStatus.OK
+        assert monitor_environment.next_checkin == now + timedelta(minutes=1)
+        assert monitor_environment.next_checkin_latest == now + timedelta(minutes=2)
+        assert monitor_environment.last_checkin == now
+
+    def test_mark_ok_recovery_threshold(self):
+        now = timezone.now().replace(second=0, microsecond=0)
+
         recovery_threshold = 8
         monitor = Monitor.objects.create(
             name="test monitor",
@@ -29,80 +71,97 @@ def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
             project_id=self.project.id,
             type=MonitorType.CRON_JOB,
             config={
-                "schedule": [1, "month"],
-                "schedule_type": ScheduleType.INTERVAL,
+                "schedule": "* * * * *",
+                "schedule_type": ScheduleType.CRONTAB,
                 "recovery_threshold": recovery_threshold,
                 "max_runtime": None,
                 "checkin_margin": None,
             },
         )
+
+        # Start with monitor in an ERROR state with an active incident
         monitor_environment = MonitorEnvironment.objects.create(
            monitor=monitor,
            environment=self.environment,
            status=MonitorStatus.ERROR,
            last_state_change=None,
        )
-
-        checkin = MonitorCheckIn.objects.create(
+        first_checkin = MonitorCheckIn.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
             project_id=self.project.id,
             status=CheckInStatus.ERROR,
+            date_added=now,
         )
-
         incident = MonitorIncident.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
-            starting_checkin=checkin,
-            starting_timestamp=checkin.date_added,
+            starting_checkin=first_checkin,
+            starting_timestamp=first_checkin.date_added,
             grouphash=monitor_environment.incident_grouphash,
         )
 
+        # Create OK check-ins
         for i in range(0, recovery_threshold - 1):
+            now = now + timedelta(minutes=1)
             checkin = MonitorCheckIn.objects.create(
                 monitor=monitor,
                 monitor_environment=monitor_environment,
                 project_id=self.project.id,
                 status=CheckInStatus.OK,
+                date_added=now,
             )
             mark_ok(checkin, checkin.date_added)
 
         # failure has not hit threshold, monitor should be in an OK status
-        monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
+        incident.refresh_from_db()
+        monitor_environment.refresh_from_db()
+
         assert monitor_environment.status != MonitorStatus.OK
+        assert monitor_environment.next_checkin == now + timedelta(minutes=1)
+
         # check that timestamp has not updated
         assert monitor_environment.last_state_change is None
+        # Incidnet has not resolved
+        assert incident.resolving_checkin is None
+        assert incident.resolving_timestamp is None
 
         # create another failed check-in to break the chain
-        failed_checkin = MonitorCheckIn.objects.create(
+        last_checkin = MonitorCheckIn.objects.create(
             monitor=monitor,
             monitor_environment=monitor_environment,
             project_id=self.project.id,
             status=CheckInStatus.ERROR,
         )
-        mark_failed(failed_checkin, ts=failed_checkin.date_added)
-        # assert occurrence was sent
-        assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1
 
-        last_checkin = None
+        # Still not resolved
+        incident.refresh_from_db()
+        assert incident.resolving_checkin is None
+        assert incident.resolving_timestamp is None
+
+        # Create enough check-ins to resolve the incident
         for i in range(0, recovery_threshold):
+            now = now + timedelta(minutes=1)
             checkin = MonitorCheckIn.objects.create(
                 monitor=monitor,
                 monitor_environment=monitor_environment,
                 project_id=self.project.id,
                 status=CheckInStatus.OK,
+                date_added=now,
             )
             if i == (recovery_threshold - 1):
                 last_checkin = checkin
             mark_ok(checkin, checkin.date_added)
 
         # recovery has hit threshold, monitor should be in an ok state
-        monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
+        incident.refresh_from_db()
+        monitor_environment.refresh_from_db()
+
         assert monitor_environment.status == MonitorStatus.OK
+        assert monitor_environment.next_checkin == last_checkin.date_added + timedelta(minutes=1)
+
         # check that monitor environment has updated timestamp used for fingerprinting
         assert monitor_environment.last_state_change == monitor_environment.last_checkin
-
-        # check that resolving check-in is set on the incident
-        incident = MonitorIncident.objects.get(id=incident.id)
+        # Incident reoslved
         assert incident.resolving_checkin == last_checkin
         assert incident.resolving_timestamp == last_checkin.date_added

From e2cc208cd5d03b96c1e4d9cb1d093e34c2ca78ab Mon Sep 17 00:00:00 2001
From: Richard Ortenberg
Date: Wed, 11 Oct 2023 16:48:47 -0700
Subject: [PATCH 2/7] fix(crons): Properly update monitorenvs in mark_ok with
 thresholds

---
 src/sentry/monitors/logic/mark_ok.py        | 70 ++++++++-----------
 tests/sentry/monitors/logic/test_mark_ok.py |  1 +
 2 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index af3bd168c9daf0..ce5c72f7fdf054 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -22,48 +22,44 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
         "next_checkin_latest": next_checkin_latest,
     }
 
-    # If the monitor is already in OK status there's not incident update or
-    # status change necessary
-    if monitor_env.status == CheckInStatus.OK:
-        MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
-            **params
-        )
-        return
+    # Don't update incidents + certain params if monitor is disabled or already in an OK state
+    if (
+        monitor_env.monitor.status != ObjectStatus.DISABLED
+        and monitor_env.status != MonitorStatus.OK
+    ):
+        params["status"] = MonitorStatus.OK
+        recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
 
-    recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
-    using_incidents = bool(recovery_threshold)
+        # Run incident logic if recovery threshold is set
+        if recovery_threshold:
+            # Safety check to prevent db lockup on check-in lookup
+            assert recovery_threshold is not None
 
-    # If we're not using incidents we can just immediately update the status
-    if not using_incidents and monitor_env.monitor.status != ObjectStatus.DISABLED:
-        MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
-            status=MonitorStatus.OK,
-            **params,
-        )
-        return
+            # Check if our incident is recovering
+            previous_checkins = MonitorCheckIn.objects.filter(
+                monitor_environment=monitor_env
+            ).order_by("-date_added")[:recovery_threshold]
 
-    # Check if our incident is recovering
-    previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
-        "-date_added"
-    )[:recovery_threshold]
+            # Incident recovers when we have successive threshold check-ins
+            incident_recovering = all(
+                previous_checkin.status == CheckInStatus.OK
+                for previous_checkin in previous_checkins
+            )
 
-    # Incident recovers when we have successive threshold check-ins
-    incident_recovering = all(
-        previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
-    )
-
-    # Resolve the incident
-    if incident_recovering and monitor_env.status != MonitorStatus.OK:
-        MonitorIncident.objects.filter(
-            monitor_environment=monitor_env,
-            grouphash=monitor_env.incident_grouphash,
-        ).update(
-            resolving_checkin=checkin,
-            resolving_timestamp=checkin.date_added,
-        )
+            # Resolve the incident
+            if incident_recovering:
+                MonitorIncident.objects.filter(
+                    monitor_environment=monitor_env,
+                    grouphash=monitor_env.incident_grouphash,
+                ).update(
+                    resolving_checkin=checkin,
+                    resolving_timestamp=checkin.date_added,
+                )
 
-    if monitor_env.monitor.status != ObjectStatus.DISABLED:
-        params["last_state_change"] = ts
-        params["status"] = MonitorStatus.OK
+                params["last_state_change"] = ts
+            else:
+                # Don't update status if incident isn't recovered
+                params.pop("status", None)
 
     MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
         **params

diff --git a/tests/sentry/monitors/logic/test_mark_ok.py b/tests/sentry/monitors/logic/test_mark_ok.py
index 201a76951faf34..6d9ad1c9444fab 100644
--- a/tests/sentry/monitors/logic/test_mark_ok.py
+++ b/tests/sentry/monitors/logic/test_mark_ok.py
@@ -32,6 +32,7 @@ def test_mark_ok_simple(self):
                 "schedule_type": ScheduleType.CRONTAB,
                 "max_runtime": None,
                 "checkin_margin": None,
+                "recovery_threshold": None,
             },
         )
 

From cd3fd116bff4e63302b20dca032c493f485c4b14 Mon Sep 17 00:00:00 2001
From: Richard Ortenberg
Date: Wed, 11 Oct 2023 16:53:40 -0700
Subject: [PATCH 3/7] drop comment

---
 src/sentry/monitors/logic/mark_ok.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index ce5c72f7fdf054..b7ed3713168c8a 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -22,7 +22,6 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
         "next_checkin_latest": next_checkin_latest,
     }
 
-    # Don't update incidents + certain params if monitor is disabled or already in an OK state
     if (
         monitor_env.monitor.status != ObjectStatus.DISABLED
         and monitor_env.status != MonitorStatus.OK

From 91bd064f340e3f4d0163e4e2541619f298623de4 Mon Sep 17 00:00:00 2001
From: Richard Ortenberg
Date: Wed, 11 Oct 2023 16:54:45 -0700
Subject: [PATCH 4/7] remove default

---
 src/sentry/monitors/logic/mark_ok.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index b7ed3713168c8a..d1e72e6ecfd7f6 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -27,7 +27,7 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
         and monitor_env.status != MonitorStatus.OK
     ):
         params["status"] = MonitorStatus.OK
-        recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
+        recovery_threshold = monitor_env.monitor.config.get("recovery_threshold")
 
         # Run incident logic if recovery threshold is set
         if recovery_threshold:

From c16c70a4382dd1d88cff3824ccf3a3c231d125ab Mon Sep 17 00:00:00 2001
From: Richard Ortenberg
Date: Wed, 11 Oct 2023 16:55:45 -0700
Subject: [PATCH 5/7] remove safety check

---
 src/sentry/monitors/logic/mark_ok.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index d1e72e6ecfd7f6..c3f0d47c2a8d04 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -31,9 +31,6 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
 
         # Run incident logic if recovery threshold is set
         if recovery_threshold:
-            # Safety check to prevent db lockup on check-in lookup
-            assert recovery_threshold is not None
-
             # Check if our incident is recovering
             previous_checkins = MonitorCheckIn.objects.filter(
                 monitor_environment=monitor_env

From 4eb51399331a91ad0b30f40d9f96ba70cb5e929a Mon Sep 17 00:00:00 2001
From: Richard Ortenberg
Date: Tue, 17 Oct 2023 13:35:53 -0700
Subject: [PATCH 6/7] fixed typos

---
 tests/sentry/monitors/logic/test_mark_ok.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/sentry/monitors/logic/test_mark_ok.py b/tests/sentry/monitors/logic/test_mark_ok.py
index 6d9ad1c9444fab..8ceb4e04652376 100644
--- a/tests/sentry/monitors/logic/test_mark_ok.py
+++ b/tests/sentry/monitors/logic/test_mark_ok.py
@@ -123,7 +123,7 @@ def test_mark_ok_recovery_threshold(self):
 
         # check that timestamp has not updated
         assert monitor_environment.last_state_change is None
-        # Incidnet has not resolved
+        # Incident has not resolved
         assert incident.resolving_checkin is None
         assert incident.resolving_timestamp is None
 
@@ -163,6 +163,6 @@ def test_mark_ok_recovery_threshold(self):
 
         # check that monitor environment has updated timestamp used for fingerprinting
         assert monitor_environment.last_state_change == monitor_environment.last_checkin
-        # Incident reoslved
+        # Incident resolved
         assert incident.resolving_checkin == last_checkin
         assert incident.resolving_timestamp == last_checkin.date_added

From 6a0568edcb862f384a8466f4daaa0525a33640b5 Mon Sep 17 00:00:00 2001
From: "getsantry[bot]" <66042841+getsantry[bot]@users.noreply.github.com>
Date: Tue, 17 Oct 2023 20:44:42 +0000
Subject: [PATCH 7/7] :hammer_and_wrench: apply pre-commit fixes

---
 src/sentry/monitors/logic/mark_ok.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
index a3825e765951c3..582f2a5cd76597 100644
--- a/src/sentry/monitors/logic/mark_ok.py
+++ b/src/sentry/monitors/logic/mark_ok.py
@@ -32,9 +32,11 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
     # Run incident logic if recovery threshold is set
     if recovery_threshold:
         # Check if our incident is recovering
-        previous_checkins = MonitorCheckIn.objects.filter(
-            monitor_environment=monitor_env
-        ).values("id", "date_added", "status").order_by("-date_added")[:recovery_threshold]
+        previous_checkins = (
+            MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
+            .values("id", "date_added", "status")
+            .order_by("-date_added")[:recovery_threshold]
+        )
 
         # Incident recovers when we have successive threshold check-ins
         incident_recovering = all(
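
The end state the series converges on is easier to see outside of Django. Below is a minimal, self-contained sketch of the decision flow in `mark_ok` after these patches: timestamps always advance on an OK check-in, even mid-incident (the behavior PATCH 1/7 fixes), while the status only flips to OK once `recovery_threshold` consecutive OK check-ins arrive. The `Env` dataclass, the `OK`/`ERROR` constants, and the fixed one-minute interval are illustrative stand-ins, not Sentry's actual models or API:

from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import List, Optional

OK, ERROR = "ok", "error"


@dataclass
class Env:
    # Hypothetical stand-in for MonitorEnvironment, for illustration only
    status: str = ERROR
    disabled: bool = False
    recovery_threshold: Optional[int] = None
    recent_statuses: List[str] = field(default_factory=list)  # newest first
    last_checkin: Optional[datetime] = None
    next_checkin: Optional[datetime] = None
    last_state_change: Optional[datetime] = None


def mark_ok(env: Env, ts: datetime, interval: timedelta = timedelta(minutes=1)) -> None:
    # Timestamps always advance on an OK check-in, even during an incident.
    env.recent_statuses.insert(0, OK)
    env.last_checkin = ts
    env.next_checkin = ts + interval

    # Status only changes for enabled monitors that are not already OK.
    if env.disabled or env.status == OK:
        return

    if env.recovery_threshold:
        # With a threshold set, the incident resolves only after
        # `recovery_threshold` consecutive OK check-ins.
        recent = env.recent_statuses[: env.recovery_threshold]
        if len(recent) < env.recovery_threshold or any(s != OK for s in recent):
            return  # still inside the incident; keep the ERROR status
        env.last_state_change = ts

    env.status = OK


env = Env(recovery_threshold=3, recent_statuses=[ERROR])
start = datetime(2023, 10, 17, 12, 0)
for minute in range(4):
    mark_ok(env, start + timedelta(minutes=minute))
    print(env.status)  # "error", "error", "ok", "ok"

Note that the real queries additionally guard the write with `.exclude(last_checkin__gt=ts)`, so an out-of-order check-in cannot rewind timestamps that a newer check-in has already advanced; the sketch above omits that concurrency guard.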