ref(crons): Correct logic for mark_ok

Previously when incidents were enabled (a recovery threshold was set) a monitor would not have it's `next_checkin` or `next_checkin_latest` updated until it recovered.
getsentry · Oct 10, 2023 · 5893654 · 5893654
1 parent c54b191
commit 5893654
Show file tree

Hide file tree

Showing 2 changed files with 123 additions and 44 deletions.
diff --git a/src/sentry/monitors/logic/mark_ok.py b/src/sentry/monitors/logic/mark_ok.py
@@ -13,18 +13,6 @@
 def mark_ok(checkin: MonitorCheckIn, ts: datetime):
  monitor_env = checkin.monitor_environment
 
- recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
- if recovery_threshold:
- previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
- "-date_added"
- )[:recovery_threshold]
- # check for successive OK previous check-ins
- if not all(
- previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
- ):
- # don't send occurrence for active issue on an OK check-in
- return
-
  next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
  next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(ts)
 
@@ -33,17 +21,49 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
  "next_checkin": next_checkin,
  "next_checkin_latest": next_checkin_latest,
  }
- if checkin.status == CheckInStatus.OK:
+
+ # If the monitor is already in OK status there's not incident update or
+ # status change necessary
+ if monitor_env.status == CheckInStatus.OK:
+ MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
+ **params
+ )
+ return
+
+ recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
+ using_incidents = bool(recovery_threshold)
+
+ # If we're not using incidents we can just immediately update the status
+ if not using_incidents and monitor_env.monitor.status != ObjectStatus.DISABLED:
+ MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
+ status=MonitorStatus.OK,
+ **params,
+ )
+ return
+
+ # Check if our incident is recovering
+ previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
+ "-date_added"
+ )[:recovery_threshold]
+
+ # Incident recovers when we have successive threshold check-ins
+ incident_recovering = all(
+ previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
+ )
+
+ # Resolve the incident
+ if incident_recovering and monitor_env.status != MonitorStatus.OK:
+ MonitorIncident.objects.filter(
+ monitor_environment=monitor_env,
+ grouphash=monitor_env.incident_grouphash,
+ ).update(
+ resolving_checkin=checkin,
+ resolving_timestamp=checkin.date_added,
+ )
+
  if monitor_env.monitor.status != ObjectStatus.DISABLED:
- params["status"] = MonitorStatus.OK
- # in the future this will auto-resolve associated issues
- if monitor_env.status != MonitorStatus.OK:
  params["last_state_change"] = ts
- # resolve any associated incidents
- incidents = MonitorIncident.objects.filter(
- monitor_environment=monitor_env, grouphash=monitor_env.incident_grouphash
- )
- incidents.update(resolving_checkin=checkin, resolving_timestamp=checkin.date_added)
+ params["status"] = MonitorStatus.OK
 
  MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
  **params

diff --git a/tests/sentry/monitors/logic/test_mark_ok.py b/tests/sentry/monitors/logic/test_mark_ok.py
@@ -1,6 +1,7 @@
-from unittest.mock import patch
+from datetime import timedelta
+
+from django.utils import timezone
 
-from sentry.monitors.logic.mark_failed import mark_failed
 from sentry.monitors.logic.mark_ok import mark_ok
 from sentry.monitors.models import (
  CheckInStatus,
@@ -13,96 +14,154 @@
  ScheduleType,
 )
 from sentry.testutils.cases import TestCase
-from sentry.testutils.helpers import with_feature
 from sentry.testutils.silo import region_silo_test
 
 
 @region_silo_test(stable=True)
 class MarkOkTestCase(TestCase):
- @with_feature("organizations:issue-platform")
- @patch("sentry.issues.producer.produce_occurrence_to_kafka")
- def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
+ def test_mark_ok_simple(self):
+ now = timezone.now().replace(second=0, microsecond=0)
+
+ monitor = Monitor.objects.create(
+ name="test monitor",
+ organization_id=self.organization.id,
+ project_id=self.project.id,
+ type=MonitorType.CRON_JOB,
+ config={
+ "schedule": "* * * * *",
+ "schedule_type": ScheduleType.CRONTAB,
+ "max_runtime": None,
+ "checkin_margin": None,
+ },
+ )
+
+ # Start with monitor in an ERROR state
+ monitor_environment = MonitorEnvironment.objects.create(
+ monitor=monitor,
+ environment=self.environment,
+ status=MonitorStatus.ERROR,
+ last_checkin=now - timedelta(minutes=1),
+ next_checkin=now,
+ )
+
+ # OK checkin comes in
+ success_checkin = MonitorCheckIn.objects.create(
+ monitor=monitor,
+ monitor_environment=monitor_environment,
+ project_id=self.project.id,
+ status=CheckInStatus.OK,
+ date_added=now,
+ )
+ mark_ok(success_checkin, ts=now)
+
+ # Monitor has recovered to OK with updated upcoming timestamps
+ monitor_environment.refresh_from_db()
+ assert monitor_environment.status == MonitorStatus.OK
+ assert monitor_environment.next_checkin == now + timedelta(minutes=1)
+ assert monitor_environment.next_checkin_latest == now + timedelta(minutes=2)
+ assert monitor_environment.last_checkin == now
+
+ def test_mark_ok_recovery_threshold(self):
+ now = timezone.now().replace(second=0, microsecond=0)
+
  recovery_threshold = 8
  monitor = Monitor.objects.create(
  name="test monitor",
  organization_id=self.organization.id,
  project_id=self.project.id,
  type=MonitorType.CRON_JOB,
  config={
- "schedule": [1, "month"],
- "schedule_type": ScheduleType.INTERVAL,
+ "schedule": "* * * * *",
+ "schedule_type": ScheduleType.CRONTAB,
  "recovery_threshold": recovery_threshold,
  "max_runtime": None,
  "checkin_margin": None,
  },
  )
+
+ # Start with monitor in an ERROR state with an active incident
  monitor_environment = MonitorEnvironment.objects.create(
  monitor=monitor,
  environment=self.environment,
  status=MonitorStatus.ERROR,
  last_state_change=None,
  )
-
- checkin = MonitorCheckIn.objects.create(
+ first_checkin = MonitorCheckIn.objects.create(
  monitor=monitor,
  monitor_environment=monitor_environment,
  project_id=self.project.id,
  status=CheckInStatus.ERROR,
+ date_added=now,
  )
-
  incident = MonitorIncident.objects.create(
  monitor=monitor,
  monitor_environment=monitor_environment,
- starting_checkin=checkin,
- starting_timestamp=checkin.date_added,
+ starting_checkin=first_checkin,
+ starting_timestamp=first_checkin.date_added,
  grouphash=monitor_environment.incident_grouphash,
  )
 
+ # Create OK check-ins
  for i in range(0, recovery_threshold - 1):
+ now = now + timedelta(minutes=1)
  checkin = MonitorCheckIn.objects.create(
  monitor=monitor,
  monitor_environment=monitor_environment,
  project_id=self.project.id,
  status=CheckInStatus.OK,
+ date_added=now,
  )
  mark_ok(checkin, checkin.date_added)
 
  # failure has not hit threshold, monitor should be in an OK status
- monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
+ incident.refresh_from_db()
+ monitor_environment.refresh_from_db()
+
  assert monitor_environment.status != MonitorStatus.OK
+ assert monitor_environment.next_checkin == now + timedelta(minutes=1)
+
  # check that timestamp has not updated
  assert monitor_environment.last_state_change is None
+ # Incidnet has not resolved
+ assert incident.resolving_checkin is None
+ assert incident.resolving_timestamp is None
 
  # create another failed check-in to break the chain
- failed_checkin = MonitorCheckIn.objects.create(
+ last_checkin = MonitorCheckIn.objects.create(
  monitor=monitor,
  monitor_environment=monitor_environment,
  project_id=self.project.id,
  status=CheckInStatus.ERROR,
  )
- mark_failed(failed_checkin, ts=failed_checkin.date_added)
- # assert occurrence was sent
- assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1
 
- last_checkin = None
+ # Still not resolved
+ incident.refresh_from_db()
+ assert incident.resolving_checkin is None
+ assert incident.resolving_timestamp is None
+
+ # Create enough check-ins to resolve the incident
  for i in range(0, recovery_threshold):
+ now = now + timedelta(minutes=1)
  checkin = MonitorCheckIn.objects.create(
  monitor=monitor,
  monitor_environment=monitor_environment,
  project_id=self.project.id,
  status=CheckInStatus.OK,
+ date_added=now,
  )
  if i == (recovery_threshold - 1):
  last_checkin = checkin
  mark_ok(checkin, checkin.date_added)
 
  # recovery has hit threshold, monitor should be in an ok state
- monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
+ incident.refresh_from_db()
+ monitor_environment.refresh_from_db()
+
  assert monitor_environment.status == MonitorStatus.OK
+ assert monitor_environment.next_checkin == last_checkin.date_added + timedelta(minutes=1)
+
  # check that monitor environment has updated timestamp used for fingerprinting
  assert monitor_environment.last_state_change == monitor_environment.last_checkin
-
- # check that resolving check-in is set on the incident
- incident = MonitorIncident.objects.get(id=incident.id)
+ # Incident reoslved
  assert incident.resolving_checkin == last_checkin
  assert incident.resolving_timestamp == last_checkin.date_added