Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(crons): Properly update monitorenvs in mark_ok with thresholds #57955

Merged
merged 9 commits into from
Oct 19, 2023
60 changes: 36 additions & 24 deletions src/sentry/monitors/logic/mark_ok.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,6 @@
def mark_ok(checkin: MonitorCheckIn, ts: datetime):
monitor_env = checkin.monitor_environment

recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
if recovery_threshold:
previous_checkins = (
MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
.order_by("-date_added")
.values("id", "date_added", "status")[:recovery_threshold]
)
# check for successive OK previous check-ins
if not all(
previous_checkin["status"] == CheckInStatus.OK for previous_checkin in previous_checkins
):
# don't send occurrence for active issue on an OK check-in
return

next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(ts)

Expand All @@ -35,17 +21,43 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
"next_checkin": next_checkin,
"next_checkin_latest": next_checkin_latest,
}
if checkin.status == CheckInStatus.OK:
if monitor_env.monitor.status != ObjectStatus.DISABLED:
params["status"] = MonitorStatus.OK
# in the future this will auto-resolve associated issues
if monitor_env.status != MonitorStatus.OK:
params["last_state_change"] = ts
# resolve any associated incidents
incidents = MonitorIncident.objects.filter(
monitor_environment=monitor_env, grouphash=monitor_env.incident_grouphash

if (
monitor_env.monitor.status != ObjectStatus.DISABLED
and monitor_env.status != MonitorStatus.OK
):
params["status"] = MonitorStatus.OK
recovery_threshold = monitor_env.monitor.config.get("recovery_threshold")

# Run incident logic if recovery threshold is set
if recovery_threshold:
# Check if our incident is recovering
previous_checkins = (
MonitorCheckIn.objects.filter(monitor_environment=monitor_env)
.values("id", "date_added", "status")
.order_by("-date_added")[:recovery_threshold]
)
incidents.update(resolving_checkin=checkin, resolving_timestamp=checkin.date_added)

# Incident recovers when we have successive threshold check-ins
incident_recovering = all(
previous_checkin["status"] == CheckInStatus.OK
for previous_checkin in previous_checkins
)

# Resolve the incident
if incident_recovering:
MonitorIncident.objects.filter(
monitor_environment=monitor_env,
grouphash=monitor_env.incident_grouphash,
).update(
resolving_checkin=checkin,
resolving_timestamp=checkin.date_added,
)

params["last_state_change"] = ts
else:
# Don't update status if incident isn't recovered
params.pop("status", None)

MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
**params
Expand Down
106 changes: 83 additions & 23 deletions tests/sentry/monitors/logic/test_mark_ok.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from unittest.mock import patch
from datetime import timedelta

from django.utils import timezone

from sentry.monitors.logic.mark_failed import mark_failed
from sentry.monitors.logic.mark_ok import mark_ok
from sentry.monitors.models import (
CheckInStatus,
Expand All @@ -13,96 +14,155 @@
ScheduleType,
)
from sentry.testutils.cases import TestCase
from sentry.testutils.helpers import with_feature
from sentry.testutils.silo import region_silo_test


@region_silo_test(stable=True)
class MarkOkTestCase(TestCase):
@with_feature("organizations:issue-platform")
@patch("sentry.issues.producer.produce_occurrence_to_kafka")
def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
def test_mark_ok_simple(self):
now = timezone.now().replace(second=0, microsecond=0)

monitor = Monitor.objects.create(
name="test monitor",
organization_id=self.organization.id,
project_id=self.project.id,
type=MonitorType.CRON_JOB,
config={
"schedule": "* * * * *",
"schedule_type": ScheduleType.CRONTAB,
"max_runtime": None,
"checkin_margin": None,
"recovery_threshold": None,
},
)

# Start with monitor in an ERROR state
monitor_environment = MonitorEnvironment.objects.create(
monitor=monitor,
environment=self.environment,
status=MonitorStatus.ERROR,
last_checkin=now - timedelta(minutes=1),
next_checkin=now,
)

# OK checkin comes in
success_checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
date_added=now,
)
mark_ok(success_checkin, ts=now)

# Monitor has recovered to OK with updated upcoming timestamps
monitor_environment.refresh_from_db()
assert monitor_environment.status == MonitorStatus.OK
assert monitor_environment.next_checkin == now + timedelta(minutes=1)
assert monitor_environment.next_checkin_latest == now + timedelta(minutes=2)
assert monitor_environment.last_checkin == now

def test_mark_ok_recovery_threshold(self):
now = timezone.now().replace(second=0, microsecond=0)

recovery_threshold = 8
monitor = Monitor.objects.create(
name="test monitor",
organization_id=self.organization.id,
project_id=self.project.id,
type=MonitorType.CRON_JOB,
config={
"schedule": [1, "month"],
"schedule_type": ScheduleType.INTERVAL,
"schedule": "* * * * *",
"schedule_type": ScheduleType.CRONTAB,
"recovery_threshold": recovery_threshold,
"max_runtime": None,
"checkin_margin": None,
},
)

# Start with monitor in an ERROR state with an active incident
monitor_environment = MonitorEnvironment.objects.create(
monitor=monitor,
environment=self.environment,
status=MonitorStatus.ERROR,
last_state_change=None,
)

checkin = MonitorCheckIn.objects.create(
first_checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.ERROR,
date_added=now,
)

incident = MonitorIncident.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
starting_checkin=checkin,
starting_timestamp=checkin.date_added,
starting_checkin=first_checkin,
starting_timestamp=first_checkin.date_added,
grouphash=monitor_environment.incident_grouphash,
)

# Create OK check-ins
for i in range(0, recovery_threshold - 1):
now = now + timedelta(minutes=1)
checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
date_added=now,
)
mark_ok(checkin, checkin.date_added)

# failure has not hit threshold, monitor should be in an OK status
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
incident.refresh_from_db()
monitor_environment.refresh_from_db()

assert monitor_environment.status != MonitorStatus.OK
assert monitor_environment.next_checkin == now + timedelta(minutes=1)

# check that timestamp has not updated
assert monitor_environment.last_state_change is None
# Incident has not resolved
assert incident.resolving_checkin is None
assert incident.resolving_timestamp is None

# create another failed check-in to break the chain
failed_checkin = MonitorCheckIn.objects.create(
last_checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.ERROR,
)
mark_failed(failed_checkin, ts=failed_checkin.date_added)
# assert occurrence was sent
assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1

last_checkin = None
# Still not resolved
incident.refresh_from_db()
assert incident.resolving_checkin is None
assert incident.resolving_timestamp is None

# Create enough check-ins to resolve the incident
for i in range(0, recovery_threshold):
now = now + timedelta(minutes=1)
checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
date_added=now,
)
if i == (recovery_threshold - 1):
last_checkin = checkin
mark_ok(checkin, checkin.date_added)

# recovery has hit threshold, monitor should be in an ok state
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
incident.refresh_from_db()
monitor_environment.refresh_from_db()

assert monitor_environment.status == MonitorStatus.OK
assert monitor_environment.next_checkin == last_checkin.date_added + timedelta(minutes=1)

# check that monitor environment has updated timestamp used for fingerprinting
assert monitor_environment.last_state_change == monitor_environment.last_checkin

# check that resolving check-in is set on the incident
incident = MonitorIncident.objects.get(id=incident.id)
# Incident resolved
assert incident.resolving_checkin == last_checkin
assert incident.resolving_timestamp == last_checkin.date_added
Loading