Skip to content

Commit

Permalink
ref(crons): Correct logic for mark_ok (#57730)
Browse files Browse the repository at this point in the history
Previously when incidents were enabled (a recovery threshold was set) a
monitor would not have it's `next_checkin` or `next_checkin_latest`
updated until it recovered.
  • Loading branch information
evanpurkhiser authored Oct 10, 2023
1 parent 23cef81 commit dcd6da4
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 44 deletions.
62 changes: 41 additions & 21 deletions src/sentry/monitors/logic/mark_ok.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,6 @@
def mark_ok(checkin: MonitorCheckIn, ts: datetime):
monitor_env = checkin.monitor_environment

recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
if recovery_threshold:
previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
"-date_added"
)[:recovery_threshold]
# check for successive OK previous check-ins
if not all(
previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
):
# don't send occurrence for active issue on an OK check-in
return

next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(ts)

Expand All @@ -33,17 +21,49 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
"next_checkin": next_checkin,
"next_checkin_latest": next_checkin_latest,
}
if checkin.status == CheckInStatus.OK:

# If the monitor is already in OK status there's not incident update or
# status change necessary
if monitor_env.status == CheckInStatus.OK:
MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
**params
)
return

recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
using_incidents = bool(recovery_threshold)

# If we're not using incidents we can just immediately update the status
if not using_incidents and monitor_env.monitor.status != ObjectStatus.DISABLED:
MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
status=MonitorStatus.OK,
**params,
)
return

# Check if our incident is recovering
previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
"-date_added"
)[:recovery_threshold]

# Incident recovers when we have successive threshold check-ins
incident_recovering = all(
previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
)

# Resolve the incident
if incident_recovering and monitor_env.status != MonitorStatus.OK:
MonitorIncident.objects.filter(
monitor_environment=monitor_env,
grouphash=monitor_env.incident_grouphash,
).update(
resolving_checkin=checkin,
resolving_timestamp=checkin.date_added,
)

if monitor_env.monitor.status != ObjectStatus.DISABLED:
params["status"] = MonitorStatus.OK
# in the future this will auto-resolve associated issues
if monitor_env.status != MonitorStatus.OK:
params["last_state_change"] = ts
# resolve any associated incidents
incidents = MonitorIncident.objects.filter(
monitor_environment=monitor_env, grouphash=monitor_env.incident_grouphash
)
incidents.update(resolving_checkin=checkin, resolving_timestamp=checkin.date_added)
params["status"] = MonitorStatus.OK

MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
**params
Expand Down
105 changes: 82 additions & 23 deletions tests/sentry/monitors/logic/test_mark_ok.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from unittest.mock import patch
from datetime import timedelta

from django.utils import timezone

from sentry.monitors.logic.mark_failed import mark_failed
from sentry.monitors.logic.mark_ok import mark_ok
from sentry.monitors.models import (
CheckInStatus,
Expand All @@ -13,96 +14,154 @@
ScheduleType,
)
from sentry.testutils.cases import TestCase
from sentry.testutils.helpers import with_feature
from sentry.testutils.silo import region_silo_test


@region_silo_test(stable=True)
class MarkOkTestCase(TestCase):
@with_feature("organizations:issue-platform")
@patch("sentry.issues.producer.produce_occurrence_to_kafka")
def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
def test_mark_ok_simple(self):
now = timezone.now().replace(second=0, microsecond=0)

monitor = Monitor.objects.create(
name="test monitor",
organization_id=self.organization.id,
project_id=self.project.id,
type=MonitorType.CRON_JOB,
config={
"schedule": "* * * * *",
"schedule_type": ScheduleType.CRONTAB,
"max_runtime": None,
"checkin_margin": None,
},
)

# Start with monitor in an ERROR state
monitor_environment = MonitorEnvironment.objects.create(
monitor=monitor,
environment=self.environment,
status=MonitorStatus.ERROR,
last_checkin=now - timedelta(minutes=1),
next_checkin=now,
)

# OK checkin comes in
success_checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
date_added=now,
)
mark_ok(success_checkin, ts=now)

# Monitor has recovered to OK with updated upcoming timestamps
monitor_environment.refresh_from_db()
assert monitor_environment.status == MonitorStatus.OK
assert monitor_environment.next_checkin == now + timedelta(minutes=1)
assert monitor_environment.next_checkin_latest == now + timedelta(minutes=2)
assert monitor_environment.last_checkin == now

def test_mark_ok_recovery_threshold(self):
now = timezone.now().replace(second=0, microsecond=0)

recovery_threshold = 8
monitor = Monitor.objects.create(
name="test monitor",
organization_id=self.organization.id,
project_id=self.project.id,
type=MonitorType.CRON_JOB,
config={
"schedule": [1, "month"],
"schedule_type": ScheduleType.INTERVAL,
"schedule": "* * * * *",
"schedule_type": ScheduleType.CRONTAB,
"recovery_threshold": recovery_threshold,
"max_runtime": None,
"checkin_margin": None,
},
)

# Start with monitor in an ERROR state with an active incident
monitor_environment = MonitorEnvironment.objects.create(
monitor=monitor,
environment=self.environment,
status=MonitorStatus.ERROR,
last_state_change=None,
)

checkin = MonitorCheckIn.objects.create(
first_checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.ERROR,
date_added=now,
)

incident = MonitorIncident.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
starting_checkin=checkin,
starting_timestamp=checkin.date_added,
starting_checkin=first_checkin,
starting_timestamp=first_checkin.date_added,
grouphash=monitor_environment.incident_grouphash,
)

# Create OK check-ins
for i in range(0, recovery_threshold - 1):
now = now + timedelta(minutes=1)
checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
date_added=now,
)
mark_ok(checkin, checkin.date_added)

# failure has not hit threshold, monitor should be in an OK status
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
incident.refresh_from_db()
monitor_environment.refresh_from_db()

assert monitor_environment.status != MonitorStatus.OK
assert monitor_environment.next_checkin == now + timedelta(minutes=1)

# check that timestamp has not updated
assert monitor_environment.last_state_change is None
# Incidnet has not resolved
assert incident.resolving_checkin is None
assert incident.resolving_timestamp is None

# create another failed check-in to break the chain
failed_checkin = MonitorCheckIn.objects.create(
last_checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.ERROR,
)
mark_failed(failed_checkin, ts=failed_checkin.date_added)
# assert occurrence was sent
assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1

last_checkin = None
# Still not resolved
incident.refresh_from_db()
assert incident.resolving_checkin is None
assert incident.resolving_timestamp is None

# Create enough check-ins to resolve the incident
for i in range(0, recovery_threshold):
now = now + timedelta(minutes=1)
checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
date_added=now,
)
if i == (recovery_threshold - 1):
last_checkin = checkin
mark_ok(checkin, checkin.date_added)

# recovery has hit threshold, monitor should be in an ok state
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
incident.refresh_from_db()
monitor_environment.refresh_from_db()

assert monitor_environment.status == MonitorStatus.OK
assert monitor_environment.next_checkin == last_checkin.date_added + timedelta(minutes=1)

# check that monitor environment has updated timestamp used for fingerprinting
assert monitor_environment.last_state_change == monitor_environment.last_checkin

# check that resolving check-in is set on the incident
incident = MonitorIncident.objects.get(id=incident.id)
# Incident reoslved
assert incident.resolving_checkin == last_checkin
assert incident.resolving_timestamp == last_checkin.date_added

0 comments on commit dcd6da4

Please sign in to comment.