feat(crons): Add helpful subtitle to crons issue platform + update ev…

…idence (#51877) Adds helpful human-readable subtitle to issue platform crons issues + updates `last check-in` evidence to `last successful check-in`
getsentry · Jul 13, 2023 · 87e5656 · 87e5656
1 parent 4d405d4
commit 87e5656
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 21 deletions.
diff --git a/src/sentry/monitors/models.py b/src/sentry/monitors/models.py
@@ -480,7 +480,16 @@ class Meta:
  def get_audit_log_data(self):
  return {"name": self.environment.name, "status": self.status, "monitor": self.monitor.name}
 
- def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN):
+ def get_last_successful_checkin(self):
+ return (
+ MonitorCheckIn.objects.filter(monitor_environment=self, status=CheckInStatus.OK)
+ .order_by("-date_added")
+ .first()
+ )
+
+ def mark_failed(
+ self, last_checkin=None, reason=MonitorFailure.UNKNOWN, occurrence_context=None
+ ):
  from sentry.signals import monitor_environment_failed
 
  if last_checkin is None:
@@ -529,7 +538,16 @@ def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN):
  from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
  from sentry.issues.producer import produce_occurrence_to_kafka
 
- occurrence_data = get_occurrence_data(reason)
+ if not occurrence_context:
+ occurrence_context = {}
+
+ occurrence_data = get_occurrence_data(reason, **occurrence_context)
+
+ # Get last successful check-in to show in evidence display
+ last_successful_checkin_timestamp = "None"
+ last_successful_checkin = self.get_last_successful_checkin()
+ if last_successful_checkin:
+ last_successful_checkin_timestamp = last_successful_checkin.date_added.isoformat()
 
  occurrence = IssueOccurrence(
  id=uuid.uuid4().hex,
@@ -541,14 +559,16 @@ def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN):
  ],
  type=occurrence_data["group_type"],
  issue_title=f"Monitor failure: {self.monitor.name}",
- subtitle="",
+ subtitle=occurrence_data["subtitle"],
  evidence_display=[
  IssueEvidence(
  name="Failure reason", value=occurrence_data["reason"], important=True
  ),
  IssueEvidence(name="Environment", value=self.environment.name, important=False),
  IssueEvidence(
- name="Last check-in", value=last_checkin.isoformat(), important=False
+ name="Last successful check-in",
+ value=last_successful_checkin_timestamp,
+ important=False,
  ),
  ],
  evidence_data={},
@@ -613,13 +633,30 @@ def mark_ok(self, checkin: MonitorCheckIn, ts: datetime):
  MonitorEnvironment.objects.filter(id=self.id).exclude(last_checkin__gt=ts).update(**params)
 
 
-def get_occurrence_data(reason: str):
+def get_occurrence_data(reason: str, **kwargs):
  if reason == MonitorFailure.MISSED_CHECKIN:
- return {"group_type": MonitorCheckInMissed, "level": "warning", "reason": "missed_checkin"}
+ expected_time = kwargs.get("expected_time", "the expected time")
+ return {
+ "group_type": MonitorCheckInMissed,
+ "level": "warning",
+ "reason": "missed_checkin",
+ "subtitle": f"No check-in reported at {expected_time}.",
+ }
  elif reason == MonitorFailure.DURATION:
- return {"group_type": MonitorCheckInTimeout, "level": "error", "reason": "duration"}
+ timeout = kwargs.get("timeout", 30)
+ return {
+ "group_type": MonitorCheckInTimeout,
+ "level": "error",
+ "reason": "duration",
+ "subtitle": f"Check-in exceeded maximum duration of {timeout} minutes.",
+ }
 
- return {"group_type": MonitorCheckInFailure, "level": "error", "reason": "error"}
+ return {
+ "group_type": MonitorCheckInFailure,
+ "level": "error",
+ "reason": "error",
+ "subtitle": "An error occurred during the last check-in.",
+ }
 
 
 @receiver(pre_save, sender=MonitorEnvironment)

diff --git a/src/sentry/monitors/tasks.py b/src/sentry/monitors/tasks.py
@@ -88,7 +88,10 @@ def check_monitors(current_datetime=None):
  expected_time=expected_time,
  monitor_config=monitor.get_validated_config(),
  )
- monitor_environment.mark_failed(reason=MonitorFailure.MISSED_CHECKIN)
+ monitor_environment.mark_failed(
+ reason=MonitorFailure.MISSED_CHECKIN,
+ occurrence_context={"expected_time": expected_time},
+ )
  except Exception:
  logger.exception("Exception in check_monitors - mark missed")
 
@@ -124,6 +127,9 @@ def check_monitors(current_datetime=None):
  status__in=[CheckInStatus.OK, CheckInStatus.ERROR],
  ).exists()
  if not has_newer_result:
- monitor_environment.mark_failed(reason=MonitorFailure.DURATION)
+ monitor_environment.mark_failed(
+ reason=MonitorFailure.DURATION,
+ occurrence_context={"duration": (timeout.seconds // 60) % 60},
+ )
  except Exception:
  logger.exception("Exception in check_monitors - mark timeout")
diff --git a/tests/sentry/monitors/test_models.py b/tests/sentry/monitors/test_models.py
@@ -13,7 +13,9 @@
  MonitorCheckInTimeout,
 )
 from sentry.monitors.models import (
+ CheckInStatus,
  Monitor,
+ MonitorCheckIn,
  MonitorEnvironment,
  MonitorEnvironmentLimitsExceeded,
  MonitorFailure,
@@ -314,6 +316,13 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
  status=monitor.status,
  )
 
+ successful_check_in = MonitorCheckIn.objects.create(
+ monitor=monitor,
+ monitor_environment=monitor_environment,
+ project_id=self.project.id,
+ status=CheckInStatus.OK,
+ )
+
  last_checkin = timezone.now()
  assert monitor_environment.mark_failed(last_checkin=last_checkin)
 
@@ -328,7 +337,7 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
  "project_id": self.project.id,
  "fingerprint": [hash_from_values(["monitor", str(monitor.guid), "error"])],
  "issue_title": f"Monitor failure: {monitor.name}",
- "subtitle": "",
+ "subtitle": "An error occurred during the last check-in.",
  "resource_id": None,
  "evidence_data": {},
  "evidence_display": [
@@ -339,8 +348,8 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
  "important": False,
  },
  {
- "name": "Last check-in",
- "value": last_checkin.isoformat(),
+ "name": "Last successful check-in",
+ "value": successful_check_in.date_added.isoformat(),
  "important": False,
  },
  ],
@@ -391,9 +400,17 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to
  environment=self.environment,
  status=monitor.status,
  )
+ successful_check_in = MonitorCheckIn.objects.create(
+ monitor=monitor,
+ monitor_environment=monitor_environment,
+ project_id=self.project.id,
+ status=CheckInStatus.OK,
+ )
  last_checkin = timezone.now()
  assert monitor_environment.mark_failed(
- last_checkin=last_checkin, reason=MonitorFailure.DURATION
+ last_checkin=last_checkin,
+ reason=MonitorFailure.DURATION,
+ occurrence_context={"duration": 30},
  )
 
  assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1
@@ -407,7 +424,7 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to
  "project_id": self.project.id,
  "fingerprint": [hash_from_values(["monitor", str(monitor.guid), "duration"])],
  "issue_title": f"Monitor failure: {monitor.name}",
- "subtitle": "",
+ "subtitle": "Check-in exceeded maximum duration of 30 minutes.",
  "resource_id": None,
  "evidence_data": {},
  "evidence_display": [
@@ -418,8 +435,8 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to
  "important": False,
  },
  {
- "name": "Last check-in",
- "value": last_checkin.isoformat(),
+ "name": "Last successful check-in",
+ "value": successful_check_in.date_added.isoformat(),
  "important": False,
  },
  ],
@@ -471,8 +488,11 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
  status=monitor.status,
  )
  last_checkin = timezone.now()
+ expected_time = monitor.get_next_scheduled_checkin_without_margin(last_checkin)
  assert monitor_environment.mark_failed(
- last_checkin=last_checkin, reason=MonitorFailure.MISSED_CHECKIN
+ last_checkin=last_checkin,
+ reason=MonitorFailure.MISSED_CHECKIN,
+ occurrence_context={"expected_time": expected_time},
  )
 
  monitor.refresh_from_db()
@@ -490,7 +510,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
  "project_id": self.project.id,
  "fingerprint": [hash_from_values(["monitor", str(monitor.guid), "missed_checkin"])],
  "issue_title": f"Monitor failure: {monitor.name}",
- "subtitle": "",
+ "subtitle": f"No check-in reported at {expected_time}.",
  "resource_id": None,
  "evidence_data": {},
  "evidence_display": [
@@ -501,8 +521,8 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
  "important": False,
  },
  {
- "name": "Last check-in",
- "value": last_checkin.isoformat(),
+ "name": "Last successful check-in",
+ "value": "None",
  "important": False,
  },
  ],