Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(crons): Add helpful subtitle to crons issue platform + update evidence #51877

Merged
merged 2 commits into from
Jul 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 45 additions & 8 deletions src/sentry/monitors/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,16 @@ class Meta:
def get_audit_log_data(self):
    """Return the audit-log payload for this monitor environment.

    The payload holds the environment's name, this object's status, and
    the name of the monitor it belongs to.
    """
    environment_name = self.environment.name
    current_status = self.status
    monitor_name = self.monitor.name
    return {
        "name": environment_name,
        "status": current_status,
        "monitor": monitor_name,
    }

def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN):
def get_last_successful_checkin(self):
return (
MonitorCheckIn.objects.filter(monitor_environment=self, status=CheckInStatus.OK)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need an index on monitor_env, status, date_added?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Couldn't hurt, as long as the DB doesn't go crazy on us again. I'd like to add that index, and I'll also want one on trace_id. Would it make sense to bundle them together or add them separately?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might make sense to split the trace_id one out since we're not querying on it here? You could still bundle them together in the same ops ticket though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This indexing is complete.

.order_by("-date_added")
.first()
)

def mark_failed(
self, last_checkin=None, reason=MonitorFailure.UNKNOWN, occurrence_context=None
):
from sentry.signals import monitor_environment_failed

if last_checkin is None:
Expand Down Expand Up @@ -527,7 +536,16 @@ def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN):
from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
from sentry.issues.producer import produce_occurrence_to_kafka

occurrence_data = get_occurrence_data(reason)
if not occurrence_context:
occurrence_context = {}

occurrence_data = get_occurrence_data(reason, **occurrence_context)

# Get last successful check-in to show in evidence display
last_successful_checkin_timestamp = "None"
last_successful_checkin = self.get_last_successful_checkin()
if last_successful_checkin:
last_successful_checkin_timestamp = last_successful_checkin.date_added.isoformat()

occurrence = IssueOccurrence(
id=uuid.uuid4().hex,
Expand All @@ -539,14 +557,16 @@ def mark_failed(self, last_checkin=None, reason=MonitorFailure.UNKNOWN):
],
type=occurrence_data["group_type"],
issue_title=f"Monitor failure: {self.monitor.name}",
subtitle="",
subtitle=occurrence_data["subtitle"],
evidence_display=[
IssueEvidence(
name="Failure reason", value=occurrence_data["reason"], important=True
),
IssueEvidence(name="Environment", value=self.environment.name, important=False),
IssueEvidence(
name="Last check-in", value=last_checkin.isoformat(), important=False
name="Last successful check-in",
value=last_successful_checkin_timestamp,
important=False,
),
],
evidence_data={},
Expand Down Expand Up @@ -611,13 +631,30 @@ def mark_ok(self, checkin: MonitorCheckIn, ts: datetime):
MonitorEnvironment.objects.filter(id=self.id).exclude(last_checkin__gt=ts).update(**params)


def get_occurrence_data(reason: str, **kwargs):
    """Build the issue-occurrence fields that describe a monitor failure.

    Args:
        reason: The failure reason; compared against
            ``MonitorFailure.MISSED_CHECKIN`` and ``MonitorFailure.DURATION``.
            Any other value is reported as a generic check-in error.
        **kwargs: Optional context interpolated into the subtitle:
            ``expected_time`` -- when the missed check-in was expected;
                defaults to the phrase "the expected time".
            ``duration`` -- the maximum check-in duration in minutes.
                ``timeout`` is accepted as an alias; defaults to 30.

    Returns:
        A dict with the keys ``group_type``, ``level``, ``reason`` and
        ``subtitle``.
    """
    if reason == MonitorFailure.MISSED_CHECKIN:
        expected_time = kwargs.get("expected_time", "the expected time")
        return {
            "group_type": MonitorCheckInMissed,
            "level": "warning",
            "reason": "missed_checkin",
            "subtitle": f"No check-in reported at {expected_time}.",
        }

    if reason == MonitorFailure.DURATION:
        # Callers supply the limit under "duration". Reading only "timeout"
        # ignored that value and always reported the 30-minute default.
        # "timeout" is still honoured so existing keyword usage keeps working.
        timeout = kwargs.get("duration", kwargs.get("timeout", 30))
        return {
            "group_type": MonitorCheckInTimeout,
            "level": "error",
            "reason": "duration",
            "subtitle": f"Check-in exceeded maximum duration of {timeout} minutes.",
        }

    return {
        "group_type": MonitorCheckInFailure,
        "level": "error",
        "reason": "error",
        "subtitle": "An error occurred during the last check-in.",
    }


@receiver(pre_save, sender=MonitorEnvironment)
Expand Down
10 changes: 8 additions & 2 deletions src/sentry/monitors/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,10 @@ def check_monitors(current_datetime=None):
expected_time=expected_time,
monitor_config=monitor.get_validated_config(),
)
monitor_environment.mark_failed(reason=MonitorFailure.MISSED_CHECKIN)
monitor_environment.mark_failed(
reason=MonitorFailure.MISSED_CHECKIN,
occurrence_context={"expected_time": expected_time},
)
except Exception:
logger.exception("Exception in check_monitors - mark missed")

Expand Down Expand Up @@ -128,6 +131,9 @@ def check_monitors(current_datetime=None):
status__in=[CheckInStatus.OK, CheckInStatus.ERROR],
).exists()
if not has_newer_result:
monitor_environment.mark_failed(reason=MonitorFailure.DURATION)
monitor_environment.mark_failed(
reason=MonitorFailure.DURATION,
occurrence_context={"duration": (timeout.seconds // 60) % 60},
)
except Exception:
logger.exception("Exception in check_monitors - mark timeout")
42 changes: 31 additions & 11 deletions tests/sentry/monitors/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
MonitorCheckInTimeout,
)
from sentry.monitors.models import (
CheckInStatus,
Monitor,
MonitorCheckIn,
MonitorEnvironment,
MonitorEnvironmentLimitsExceeded,
MonitorFailure,
Expand Down Expand Up @@ -314,6 +316,13 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
status=monitor.status,
)

successful_check_in = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
)

last_checkin = timezone.now()
assert monitor_environment.mark_failed(last_checkin=last_checkin)

Expand All @@ -328,7 +337,7 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "error"])],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": "",
"subtitle": "An error occurred during the last check-in.",
"resource_id": None,
"evidence_data": {},
"evidence_display": [
Expand All @@ -339,8 +348,8 @@ def test_mark_failed_default_params_issue_platform(self, mock_produce_occurrence
"important": False,
},
{
"name": "Last check-in",
"value": last_checkin.isoformat(),
"name": "Last successful check-in",
"value": successful_check_in.date_added.isoformat(),
"important": False,
},
],
Expand Down Expand Up @@ -391,9 +400,17 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to
environment=self.environment,
status=monitor.status,
)
successful_check_in = MonitorCheckIn.objects.create(
monitor=monitor,
monitor_environment=monitor_environment,
project_id=self.project.id,
status=CheckInStatus.OK,
)
last_checkin = timezone.now()
assert monitor_environment.mark_failed(
last_checkin=last_checkin, reason=MonitorFailure.DURATION
last_checkin=last_checkin,
reason=MonitorFailure.DURATION,
occurrence_context={"duration": 30},
)

assert len(mock_produce_occurrence_to_kafka.mock_calls) == 1
Expand All @@ -407,7 +424,7 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "duration"])],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": "",
"subtitle": "Check-in exceeded maximum duration of 30 minutes.",
"resource_id": None,
"evidence_data": {},
"evidence_display": [
Expand All @@ -418,8 +435,8 @@ def test_mark_failed_with_reason_issue_platform(self, mock_produce_occurrence_to
"important": False,
},
{
"name": "Last check-in",
"value": last_checkin.isoformat(),
"name": "Last successful check-in",
"value": successful_check_in.date_added.isoformat(),
"important": False,
},
],
Expand Down Expand Up @@ -471,8 +488,11 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
status=monitor.status,
)
last_checkin = timezone.now()
expected_time = monitor.get_next_scheduled_checkin_without_margin(last_checkin)
assert monitor_environment.mark_failed(
last_checkin=last_checkin, reason=MonitorFailure.MISSED_CHECKIN
last_checkin=last_checkin,
reason=MonitorFailure.MISSED_CHECKIN,
occurrence_context={"expected_time": expected_time},
)

monitor.refresh_from_db()
Expand All @@ -490,7 +510,7 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
"project_id": self.project.id,
"fingerprint": [hash_from_values(["monitor", str(monitor.guid), "missed_checkin"])],
"issue_title": f"Monitor failure: {monitor.name}",
"subtitle": "",
"subtitle": f"No check-in reported at {expected_time}.",
"resource_id": None,
"evidence_data": {},
"evidence_display": [
Expand All @@ -501,8 +521,8 @@ def test_mark_failed_with_missed_reason_issue_platform(self, mock_produce_occurr
"important": False,
},
{
"name": "Last check-in",
"value": last_checkin.isoformat(),
"name": "Last successful check-in",
"value": "None",
"important": False,
},
],
Expand Down