ref(crons): Correct logic for mark_ok #57730

Merged
62 changes: 41 additions & 21 deletions src/sentry/monitors/logic/mark_ok.py
@@ -13,18 +13,6 @@
def mark_ok(checkin: MonitorCheckIn, ts: datetime):
monitor_env = checkin.monitor_environment

recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
if recovery_threshold:
previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
"-date_added"
)[:recovery_threshold]
# check for successive OK previous check-ins
if not all(
previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
):
# don't send occurrence for active issue on an OK check-in
return

next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(ts)

@@ -33,17 +21,49 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime):
"next_checkin": next_checkin,
"next_checkin_latest": next_checkin_latest,
}
if checkin.status == CheckInStatus.OK:

# If the monitor is already in OK status there's no incident update or
# status change necessary
if monitor_env.status == MonitorStatus.OK:
MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
**params
)
return

recovery_threshold = monitor_env.monitor.config.get("recovery_threshold", 0)
Contributor:
Should this default to 1? Or maybe add a comment; the minimum configurable value is 1.

Member Author:
I think the default here is 0 since, if it's not set, we don't want to enable incidents yet.
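
For context, here is a tiny standalone sketch (not part of the diff; the config dict is hypothetical) of how the falsy default behaves when the key is absent:

config = {"schedule": "* * * * *"}   # hypothetical monitor config without the key
recovery_threshold = config.get("recovery_threshold", 0)
assert recovery_threshold == 0       # missing key falls back to 0
assert not bool(recovery_threshold)  # read below as "incidents not enabled"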

using_incidents = bool(recovery_threshold)
Contributor:
Maybe just change this to recovery_threshold > 1, which is functionally the same thing for now.

Member Author:
Yeah, I basically just duplicated what the logic was before: if recovery_threshold:
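
As an illustration (not in the PR), the two proposed gates only diverge when recovery_threshold is exactly 1, which matters because 1 is the minimum configurable value:

for recovery_threshold in (0, 1, 2, 8):
    print(
        recovery_threshold,
        bool(recovery_threshold),   # gate used in this change
        recovery_threshold > 1,     # gate suggested in review
    )
# 0 False False
# 1 True  False   <- the only case where the two differ
# 2 True  True
# 8 True  True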


# If we're not using incidents we can just immediately update the status
if not using_incidents and monitor_env.monitor.status != ObjectStatus.DISABLED:
MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
status=MonitorStatus.OK,
**params,
)
return

# Check if our incident is recovering
previous_checkins = MonitorCheckIn.objects.filter(monitor_environment=monitor_env).order_by(
"-date_added"
)[:recovery_threshold]

# Incident recovers when we have recovery_threshold successive OK check-ins
incident_recovering = all(
previous_checkin.status == CheckInStatus.OK for previous_checkin in previous_checkins
)

# Resolve the incident
if incident_recovering and monitor_env.status != MonitorStatus.OK:
MonitorIncident.objects.filter(
monitor_environment=monitor_env,
grouphash=monitor_env.incident_grouphash,
).update(
resolving_checkin=checkin,
resolving_timestamp=checkin.date_added,
)

if monitor_env.monitor.status != ObjectStatus.DISABLED:
params["status"] = MonitorStatus.OK
# in the future this will auto-resolve associated issues
if monitor_env.status != MonitorStatus.OK:
params["last_state_change"] = ts
# resolve any associated incidents
incidents = MonitorIncident.objects.filter(
monitor_environment=monitor_env, grouphash=monitor_env.incident_grouphash
)
incidents.update(resolving_checkin=checkin, resolving_timestamp=checkin.date_added)
params["status"] = MonitorStatus.OK

MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
**params
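
As a side note, here is a minimal, ORM-free sketch of the recovery check implemented above; the helper name and status strings are illustrative, not from the codebase:

OK, ERROR = "ok", "error"

def incident_recovering(statuses, recovery_threshold):
    # statuses is newest-first, mirroring order_by("-date_added")[:recovery_threshold]
    return all(status == OK for status in statuses[:recovery_threshold])

assert incident_recovering([OK, OK, OK], recovery_threshold=3)
assert not incident_recovering([OK, ERROR, OK], recovery_threshold=3)
# With fewer check-ins than the threshold, all() over the shorter slice is
# still True; the queryset version above behaves the same way.
assert incident_recovering([OK, OK], recovery_threshold=3)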
105 changes: 82 additions & 23 deletions tests/sentry/monitors/logic/test_mark_ok.py
@@ -1,6 +1,7 @@
from unittest.mock import patch
from datetime import timedelta

from django.utils import timezone

from sentry.monitors.logic.mark_failed import mark_failed
from sentry.monitors.logic.mark_ok import mark_ok
from sentry.monitors.models import (
CheckInStatus,
@@ -13,96 +14,154 @@
ScheduleType,
)
from sentry.testutils.cases import TestCase
from sentry.testutils.helpers import with_feature
from sentry.testutils.silo import region_silo_test


@region_silo_test(stable=True)
class MarkOkTestCase(TestCase):
@with_feature("organizations:issue-platform")
@patch("sentry.issues.producer.produce_occurrence_to_kafka")
def test_mark_ok_recovery_threshold(self, mock_produce_occurrence_to_kafka):
Member Author (on lines -22 to -23):
I've made this more unit-test-like by removing the dependency on mark_failed from this test.

def test_mark_ok_simple(self):
now = timezone.now().replace(second=0, microsecond=0)

monitor = Monitor.objects.create(
name="test monitor",
organization_id=self.organization.id,
project_id=self.project.id,
type=MonitorType.CRON_JOB,
config={
"schedule": "* * * * *",
"schedule_type": ScheduleType.CRONTAB,
"max_runtime": None,
"checkin_margin": None,
},
)

# Start with monitor in an ERROR state
monitor_environment = MonitorEnvironment.objects.create(
monitor=monitor,
environment=self.environment,
status=MonitorStatus.ERROR,
last_checkin=now - timedelta(minutes=1),
next_checkin=now,
)

# OK checkin comes in
success_checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
date_added=now,
)
mark_ok(success_checkin, ts=now)

# Monitor has recovered to OK with updated upcoming timestamps
monitor_environment.refresh_from_db()
assert monitor_environment.status == MonitorStatus.OK
assert monitor_environment.next_checkin == now + timedelta(minutes=1)
assert monitor_environment.next_checkin_latest == now + timedelta(minutes=2)
assert monitor_environment.last_checkin == now

def test_mark_ok_recovery_threshold(self):
now = timezone.now().replace(second=0, microsecond=0)

recovery_threshold = 8
monitor = Monitor.objects.create(
name="test monitor",
organization_id=self.organization.id,
project_id=self.project.id,
type=MonitorType.CRON_JOB,
config={
"schedule": [1, "month"],
"schedule_type": ScheduleType.INTERVAL,
"schedule": "* * * * *",
"schedule_type": ScheduleType.CRONTAB,
"recovery_threshold": recovery_threshold,
"max_runtime": None,
"checkin_margin": None,
},
)

# Start with monitor in an ERROR state with an active incident
monitor_environment = MonitorEnvironment.objects.create(
monitor=monitor,
environment=self.environment,
status=MonitorStatus.ERROR,
last_state_change=None,
)

checkin = MonitorCheckIn.objects.create(
first_checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.ERROR,
date_added=now,
)

incident = MonitorIncident.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
starting_checkin=checkin,
starting_timestamp=checkin.date_added,
starting_checkin=first_checkin,
starting_timestamp=first_checkin.date_added,
grouphash=monitor_environment.incident_grouphash,
)

# Create OK check-ins
for i in range(0, recovery_threshold - 1):
now = now + timedelta(minutes=1)
checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
date_added=now,
)
mark_ok(checkin, checkin.date_added)

# Recovery has not hit the threshold yet, monitor should not be in an OK status
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
incident.refresh_from_db()
monitor_environment.refresh_from_db()

assert monitor_environment.status != MonitorStatus.OK
assert monitor_environment.next_checkin == now + timedelta(minutes=1)

# check that timestamp has not updated
assert monitor_environment.last_state_change is None
# Incident has not been resolved
assert incident.resolving_checkin is None
assert incident.resolving_timestamp is None

# create another failed check-in to break the chain
failed_checkin = MonitorCheckIn.objects.create(
last_checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.ERROR,
)
mark_failed(failed_checkin, ts=failed_checkin.date_added)
# assert occurrence was sent
assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1

last_checkin = None
# Still not resolved
incident.refresh_from_db()
assert incident.resolving_checkin is None
assert incident.resolving_timestamp is None

# Create enough check-ins to resolve the incident
for i in range(0, recovery_threshold):
now = now + timedelta(minutes=1)
checkin = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
date_added=now,
)
if i == (recovery_threshold - 1):
last_checkin = checkin
mark_ok(checkin, checkin.date_added)

# Recovery has hit the threshold, monitor should be in an OK state
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
incident.refresh_from_db()
monitor_environment.refresh_from_db()

assert monitor_environment.status == MonitorStatus.OK
assert monitor_environment.next_checkin == last_checkin.date_added + timedelta(minutes=1)

# check that monitor environment has updated timestamp used for fingerprinting
assert monitor_environment.last_state_change == monitor_environment.last_checkin

# check that resolving check-in is set on the incident
incident = MonitorIncident.objects.get(id=incident.id)
# Incident resolved
assert incident.resolving_checkin == last_checkin
assert incident.resolving_timestamp == last_checkin.date_added
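
For readers double-checking the expected timestamps in these tests, a small standalone sketch of the arithmetic: a "* * * * *" crontab expects the next check-in one minute after the last one, and next_checkin_latest adds the check-in margin (assumed here to default to one minute when checkin_margin is None):

from datetime import datetime, timedelta

now = datetime(2023, 10, 9, 12, 0)              # hypothetical, seconds already zeroed
assumed_default_margin = timedelta(minutes=1)   # assumption: margin defaults to 1 minute

next_checkin = now + timedelta(minutes=1)       # next "* * * * *" tick
next_checkin_latest = next_checkin + assumed_default_margin

assert next_checkin == now + timedelta(minutes=1)
assert next_checkin_latest == now + timedelta(minutes=2)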