Skip to content

Commit

Permalink
feat(crons): Add helpful subtitle to crons issue platform + update ev…
Browse files Browse the repository at this point in the history
…idence (#51877)

Adds helpful human-readable subtitle to issue platform crons issues +
updates `last check-in` evidence to `last successful check-in`
  • Loading branch information
rjo100 authored Jul 13, 2023
1 parent ee8ef70 commit b466fbc
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 21 deletions.
53 changes: 45 additions & 8 deletions src/sentry/monitors/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,16 @@ class Meta:
def get_audit_log_data(self):
return {"name": self.environment.name, "status": self.status, "monitor": self.monitor.name}

def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN):
def get_last_successful_checkin(self):
return (
MonitorCheckIn.objects.filter(monitor_environment=self, status=CheckInStatus.OK)
.order_by("-date_added")
.first()
)

def mark_failed(
self, last_checkin=None, reason=MonitorFailure.UNKNOWN, occurrence_context=None
):
from sentry.signals import monitor_environment_failed

if last_checkin is None:
Expand Down Expand Up @@ -529,7 +538,16 @@ def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN):
from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
from sentry.issues.producer import produce_occurrence_to_kafka

occurrence_data = get_occurrence_data(reason)
if not occurrence_context:
occurrence_context = {}

occurrence_data = get_occurrence_data(reason, **occurrence_context)

# Get last successful check-in to show in evidence display
last_successful_checkin_timestamp = "None"
last_successful_checkin = self.get_last_successful_checkin()
if last_successful_checkin:
last_successful_checkin_timestamp = last_successful_checkin.date_added.isoformat()

occurrence = IssueOccurrence(
id=uuid.uuid4().hex,
Expand All @@ -541,14 +559,16 @@ def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN):
],
type=occurrence_data["group_type"],
issue_title=f"Monitor failure: {self.monitor.name}",
subtitle="",
subtitle=occurrence_data["subtitle"],
evidence_display=[
IssueEvidence(
name="Failure reason", value=occurrence_data["reason"], important=True
),
IssueEvidence(name="Environment", value=self.environment.name, important=False),
IssueEvidence(
name="Last check-in", value=last_checkin.isoformat(), important=False
name="Last successful check-in",
value=last_successful_checkin_timestamp,
important=False,
),
],
evidence_data={},
Expand Down Expand Up @@ -613,13 +633,30 @@ def mark_ok(self, checkin: MonitorCheckIn, ts: datetime):
MonitorEnvironment.objects.filter(id=self.id).exclude(last_checkin__gt=ts).update(**params)


def get_occurrence_data(reason: str):
def get_occurrence_data(reason: str, **kwargs):
if reason == MonitorFailure.MISSED_CHECKIN:
return {"group_type": MonitorCheckInMissed, "level": "warning", "reason": "missed_checkin"}
expected_time = kwargs.get("expected_time", "the expected time")
return {
"group_type": MonitorCheckInMissed,
"level": "warning",
"reason": "missed_checkin",
"subtitle": f"No check-in reported at {expected_time}.",
}
elif reason == MonitorFailure.DURATION:
return {"group_type": MonitorCheckInTimeout, "level": "error", "reason": "duration"}
timeout = kwargs.get("timeout", 30)
return {
"group_type": MonitorCheckInTimeout,
"level": "error",
"reason": "duration",
"subtitle": f"Check-in exceeded maximum duration of {timeout} minutes.",
}

return {"group_type": MonitorCheckInFailure, "level": "error", "reason": "error"}
return {
"group_type": MonitorCheckInFailure,
"level": "error",
"reason": "error",
"subtitle": "An error occurred during the last check-in.",
}


@receiver(pre_save, sender=MonitorEnvironment)
Expand Down
10 changes: 8 additions & 2 deletions src/sentry/monitors/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@ def check_monitors(current_datetime=None):
expected_time=expected_time,
monitor_config=monitor.get_validated_config(),
)
monitor_environment.mark_failed(reason=MonitorFailure.MISSED_CHECKIN)
monitor_environment.mark_failed(
reason=MonitorFailure.MISSED_CHECKIN,
occurrence_context={"expected_time": expected_time},
)
except Exception:
logger.exception("Exception in check_monitors - mark missed")

Expand Down Expand Up @@ -124,6 +127,9 @@ def check_monitors(current_datetime=None):
status__in=[CheckInStatus.OK, CheckInStatus.ERROR],
).exists()
if not has_newer_result:
monitor_environment.mark_failed(reason=MonitorFailure.DURATION)
monitor_environment.mark_failed(
reason=MonitorFailure.DURATION,
occurrence_context={"duration": (timeout.seconds // 60) % 60},
)
except Exception:
logger.exception("Exception in check_monitors - mark timeout")
42 changes: 31 additions & 11 deletions tests/sentry/monitors/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
MonitorCheckInTimeout,
)
from sentry.monitors.models import (
CheckInStatus,
Monitor,
MonitorCheckIn,
MonitorEnvironment,
MonitorEnvironmentLimitsExceeded,
MonitorFailure,
Expand Down Expand Up @@ -314,6 +316,13 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
status=monitor.status,
)

successful_check_in = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
)

last_checkin = timezone.now()
assert monitor_environment.mark_failed(last_checkin=last_checkin)

Expand All @@ -328,7 +337,7 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "error"])],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": "",
"subtitle": "An error occurred during the last check-in.",
"resource_id": None,
"evidence_data": {},
"evidence_display": [
Expand All @@ -339,8 +348,8 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
"important": False,
},
{
"name": "Last check-in",
"value": last_checkin.isoformat(),
"name": "Last successful check-in",
"value": successful_check_in.date_added.isoformat(),
"important": False,
},
],
Expand Down Expand Up @@ -391,9 +400,17 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to
environment=self.environment,
status=monitor.status,
)
successful_check_in = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
)
last_checkin = timezone.now()
assert monitor_environment.mark_failed(
last_checkin=last_checkin, reason=MonitorFailure.DURATION
last_checkin=last_checkin,
reason=MonitorFailure.DURATION,
occurrence_context={"duration": 30},
)

assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1
Expand All @@ -407,7 +424,7 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "duration"])],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": "",
"subtitle": "Check-in exceeded maximum duration of 30 minutes.",
"resource_id": None,
"evidence_data": {},
"evidence_display": [
Expand All @@ -418,8 +435,8 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to
"important": False,
},
{
"name": "Last check-in",
"value": last_checkin.isoformat(),
"name": "Last successful check-in",
"value": successful_check_in.date_added.isoformat(),
"important": False,
},
],
Expand Down Expand Up @@ -471,8 +488,11 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
status=monitor.status,
)
last_checkin = timezone.now()
expected_time = monitor.get_next_scheduled_checkin_without_margin(last_checkin)
assert monitor_environment.mark_failed(
last_checkin=last_checkin, reason=MonitorFailure.MISSED_CHECKIN
last_checkin=last_checkin,
reason=MonitorFailure.MISSED_CHECKIN,
occurrence_context={"expected_time": expected_time},
)

monitor.refresh_from_db()
Expand All @@ -490,7 +510,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "missed_checkin"])],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": "",
"subtitle": f"No check-in reported at {expected_time}.",
"resource_id": None,
"evidence_data": {},
"evidence_display": [
Expand All @@ -501,8 +521,8 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
"important": False,
},
{
"name": "Last check-in",
"value": last_checkin.isoformat(),
"name": "Last successful check-in",
"value": "None",
"important": False,
},
],
Expand Down

0 comments on commit b466fbc

Please sign in to comment.