From 9d812043d244f43e8e54c8422ede94544c87238f Mon Sep 17 00:00:00 2001 From: Richard Ortenberg Date: Thu, 13 Jul 2023 10:31:28 -0700 Subject: [PATCH] feat(crons): Add helpful subtitle to crons issue platform + update evidence (#51877) Adds helpful human-readable subtitle to issue platform crons issues + updates `last check-in` evidence to `last successful check-in` --- src/sentry/monitors/models.py | 53 +++++++++++++++++++++++----- src/sentry/monitors/tasks.py | 10 ++++-- tests/sentry/monitors/test_models.py | 42 ++++++++++++++++------ 3 files changed, 84 insertions(+), 21 deletions(-) diff --git a/src/sentry/monitors/models.py b/src/sentry/monitors/models.py index cb8842fc9d1eaa..a89c20389f32ef 100644 --- a/src/sentry/monitors/models.py +++ b/src/sentry/monitors/models.py @@ -480,7 +480,16 @@ class Meta: def get_audit_log_data(self): return {"name": self.environment.name, "status": self.status, "monitor": self.monitor.name} - def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN): + def get_last_successful_checkin(self): + return ( + MonitorCheckIn.objects.filter(monitor_environment=self, status=CheckInStatus.OK) + .order_by("-date_added") + .first() + ) + + def mark_failed( + self, last_checkin=None, reason=MonitorFailure.UNKNOWN, occurrence_context=None + ): from sentry.signals import monitor_environment_failed if last_checkin is None: @@ -529,7 +538,16 @@ def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN): from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence from sentry.issues.producer import produce_occurrence_to_kafka - occurrence_data = get_occurrence_data(reason) + if not occurrence_context: + occurrence_context = {} + + occurrence_data = get_occurrence_data(reason, **occurrence_context) + + # Get last successful check-in to show in evidence display + last_successful_checkin_timestamp = "None" + last_successful_checkin = self.get_last_successful_checkin() + if last_successful_checkin: + last_successful_checkin_timestamp = last_successful_checkin.date_added.isoformat() occurrence = IssueOccurrence( id=uuid.uuid4().hex, @@ -541,14 +559,16 @@ def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN): ], type=occurrence_data["group_type"], issue_title=f"Monitor failure: {self.monitor.name}", - subtitle="", + subtitle=occurrence_data["subtitle"], evidence_display=[ IssueEvidence( name="Failure reason", value=occurrence_data["reason"], important=True ), IssueEvidence(name="Environment", value=self.environment.name, important=False), IssueEvidence( - name="Last check-in", value=last_checkin.isoformat(), important=False + name="Last successful check-in", + value=last_successful_checkin_timestamp, + important=False, ), ], evidence_data={}, @@ -613,13 +633,30 @@ def mark_ok(self, checkin: MonitorCheckIn, ts: datetime): MonitorEnvironment.objects.filter(id=self.id).exclude(last_checkin__gt=ts).update(**params) -def get_occurrence_data(reason: str): +def get_occurrence_data(reason: str, **kwargs): if reason == MonitorFailure.MISSED_CHECKIN: - return {"group_type": MonitorCheckInMissed, "level": "warning", "reason": "missed_checkin"} + expected_time = kwargs.get("expected_time", "the expected time") + return { + "group_type": MonitorCheckInMissed, + "level": "warning", + "reason": "missed_checkin", + "subtitle": f"No check-in reported at {expected_time}.", + } elif reason == MonitorFailure.DURATION: - return {"group_type": MonitorCheckInTimeout, "level": "error", "reason": "duration"} + timeout = kwargs.get("timeout", 30) + return { + "group_type": MonitorCheckInTimeout, + "level": "error", + "reason": "duration", + "subtitle": f"Check-in exceeded maximum duration of {timeout} minutes.", + } - return {"group_type": MonitorCheckInFailure, "level": "error", "reason": "error"} + return { + "group_type": MonitorCheckInFailure, + "level": "error", + "reason": "error", + "subtitle": "An error occurred during the last check-in.", + } @receiver(pre_save, sender=MonitorEnvironment) diff --git a/src/sentry/monitors/tasks.py b/src/sentry/monitors/tasks.py index 43bfee9f956d57..ca8f3ecf114837 100644 --- a/src/sentry/monitors/tasks.py +++ b/src/sentry/monitors/tasks.py @@ -88,7 +88,10 @@ def check_monitors(current_datetime=None): expected_time=expected_time, monitor_config=monitor.get_validated_config(), ) - monitor_environment.mark_failed(reason=MonitorFailure.MISSED_CHECKIN) + monitor_environment.mark_failed( + reason=MonitorFailure.MISSED_CHECKIN, + occurrence_context={"expected_time": expected_time}, + ) except Exception: logger.exception("Exception in check_monitors - mark missed") @@ -124,6 +127,9 @@ def check_monitors(current_datetime=None): status__in=[CheckInStatus.OK, CheckInStatus.ERROR], ).exists() if not has_newer_result: - monitor_environment.mark_failed(reason=MonitorFailure.DURATION) + monitor_environment.mark_failed( + reason=MonitorFailure.DURATION, + occurrence_context={"duration": (timeout.seconds // 60) % 60}, + ) except Exception: logger.exception("Exception in check_monitors - mark timeout") diff --git a/tests/sentry/monitors/test_models.py b/tests/sentry/monitors/test_models.py index f8d0d5a9bc55d9..af6b748cfba87c 100644 --- a/tests/sentry/monitors/test_models.py +++ b/tests/sentry/monitors/test_models.py @@ -13,7 +13,9 @@ MonitorCheckInTimeout, ) from sentry.monitors.models import ( + CheckInStatus, Monitor, + MonitorCheckIn, MonitorEnvironment, MonitorEnvironmentLimitsExceeded, MonitorFailure, @@ -314,6 +316,13 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence status=monitor.status, ) + successful_check_in = MonitorCheckIn.objects.create( + monitor=monitor, + monitor_environment=monitor_environment, + project_id=self.project.id, + status=CheckInStatus.OK, + ) + last_checkin = timezone.now() assert monitor_environment.mark_failed(last_checkin=last_checkin) @@ -328,7 +337,7 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence "project_id": self.project.id, "fingerprint": [hash_from_values(["monitor", str(monitor.guid), "error"])], "issue_title": f"Monitor failure: {monitor.name}", - "subtitle": "", + "subtitle": "An error occurred during the last check-in.", "resource_id": None, "evidence_data": {}, "evidence_display": [ @@ -339,8 +348,8 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence "important": False, }, { - "name": "Last check-in", - "value": last_checkin.isoformat(), + "name": "Last successful check-in", + "value": successful_check_in.date_added.isoformat(), "important": False, }, ], @@ -391,9 +400,17 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to environment=self.environment, status=monitor.status, ) + successful_check_in = MonitorCheckIn.objects.create( + monitor=monitor, + monitor_environment=monitor_environment, + project_id=self.project.id, + status=CheckInStatus.OK, + ) last_checkin = timezone.now() assert monitor_environment.mark_failed( - last_checkin=last_checkin, reason=MonitorFailure.DURATION + last_checkin=last_checkin, + reason=MonitorFailure.DURATION, + occurrence_context={"duration": 30}, ) assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1 @@ -407,7 +424,7 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to "project_id": self.project.id, "fingerprint": [hash_from_values(["monitor", str(monitor.guid), "duration"])], "issue_title": f"Monitor failure: {monitor.name}", - "subtitle": "", + "subtitle": "Check-in exceeded maximum duration of 30 minutes.", "resource_id": None, "evidence_data": {}, "evidence_display": [ @@ -418,8 +435,8 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to "important": False, }, { - "name": "Last check-in", - "value": last_checkin.isoformat(), + "name": "Last successful check-in", + "value": successful_check_in.date_added.isoformat(), "important": False, }, ], @@ -471,8 +488,11 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr status=monitor.status, ) last_checkin = timezone.now() + expected_time = monitor.get_next_scheduled_checkin_without_margin(last_checkin) assert monitor_environment.mark_failed( - last_checkin=last_checkin, reason=MonitorFailure.MISSED_CHECKIN + last_checkin=last_checkin, + reason=MonitorFailure.MISSED_CHECKIN, + occurrence_context={"expected_time": expected_time}, ) monitor.refresh_from_db() @@ -490,7 +510,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr "project_id": self.project.id, "fingerprint": [hash_from_values(["monitor", str(monitor.guid), "missed_checkin"])], "issue_title": f"Monitor failure: {monitor.name}", - "subtitle": "", + "subtitle": f"No check-in reported at {expected_time}.", "resource_id": None, "evidence_data": {}, "evidence_display": [ @@ -501,8 +521,8 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr "important": False, }, { - "name": "Last check-in", - "value": last_checkin.isoformat(), + "name": "Last successful check-in", + "value": "None", "important": False, }, ],