Skip to content

Commit

Permalink
ref(crons): Clarify ts argument in mark_{ok,failed} (#80560)
Browse files Browse the repository at this point in the history
Since I'll be adding a `clock_tick` argument to `mark_failed` it was
going to start becoming confusing what the `ts` argument in
`mark_failed` means.

This updates mark_{ok,failed} to use more appropriate names for what the
timestamp represents

Refactoring as part of GH-79328
  • Loading branch information
evanpurkhiser authored Nov 12, 2024
1 parent f9a9a86 commit a35efff
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 31 deletions.
2 changes: 1 addition & 1 deletion src/sentry/monitors/clock_tasks/check_missed.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,4 +167,4 @@ def mark_environment_missing(monitor_environment_id: int, ts: datetime):
monitor.schedule,
)

mark_failed(checkin, ts=most_recent_expected_ts)
mark_failed(checkin, failed_at=most_recent_expected_ts)
2 changes: 1 addition & 1 deletion src/sentry/monitors/clock_tasks/check_timeout.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,4 @@ def mark_checkin_timeout(checkin_id: int, ts: datetime) -> None:
monitor.schedule,
)

mark_failed(checkin, ts=most_recent_expected_ts)
mark_failed(checkin, failed_at=most_recent_expected_ts)
4 changes: 2 additions & 2 deletions src/sentry/monitors/consumers/monitor_consumer.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,9 +877,9 @@ def _process_checkin(item: CheckinItem, txn: Transaction | Span):
# Note: We use `start_time` for received here since it's the time that this
# checkin was received by relay. Potentially, `ts` should be the client
# timestamp. If we change that, leave `received` the same.
mark_failed(check_in, ts=start_time, received=start_time)
mark_failed(check_in, failed_at=start_time, received=start_time)
else:
mark_ok(check_in, ts=start_time)
mark_ok(check_in, succeeded_at=start_time)

# track how much time it took for the message to make it through
# relay into kafka. This should help us understand when missed
Expand Down
14 changes: 7 additions & 7 deletions src/sentry/monitors/logic/mark_failed.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@

def mark_failed(
failed_checkin: MonitorCheckIn,
ts: datetime,
failed_at: datetime,
received: datetime | None = None,
) -> bool:
"""
Given a failing check-in, mark the monitor environment as failed and trigger
side effects for creating monitor incidents and issues.
The provided `ts` is the reference time for when the next check-in time is
calculated from. This typically would be the failed check-in's `date_added`
or completion time. Though for the missed and timedout tasks this may be
computed based on the tasks reference time.
The provided `failed_at` is the reference time for when the next check-in
time is calculated from. This typically would be the failed check-in's
`date_added` or completion time. Though for the missed and time-out tasks
this may be computed based on the tasks reference time.
"""
monitor_env = failed_checkin.monitor_environment

Expand All @@ -35,8 +35,8 @@ def mark_failed(
failure_issue_threshold = 1

# Compute the next check-in time from our reference time
next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(ts)
next_checkin = monitor_env.monitor.get_next_expected_checkin(failed_at)
next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(failed_at)

# When the failed check-in is a synthetic missed check-in we do not move
# the `last_checkin` timestamp forward.
Expand Down
20 changes: 14 additions & 6 deletions src/sentry/monitors/logic/mark_ok.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,22 @@ class _Params(TypedDict):
status: NotRequired[int]


def mark_ok(checkin: MonitorCheckIn, ts: datetime) -> None:
def mark_ok(checkin: MonitorCheckIn, succeeded_at: datetime) -> None:
"""
Given a successful check-in, attempt to resolve the active incident and
mark the monitor as OK.
The provided `succeeded_at` is the reference time for when the next check-in
time is calculated from. This typically would be when the successful
check-in was received.
"""
monitor_env = checkin.monitor_environment

if monitor_env is None:
return None

next_checkin = monitor_env.monitor.get_next_expected_checkin(ts)
next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(ts)
next_checkin = monitor_env.monitor.get_next_expected_checkin(succeeded_at)
next_checkin_latest = monitor_env.monitor.get_next_expected_checkin_latest(succeeded_at)

params: _Params = {
"last_checkin": checkin.date_added,
Expand Down Expand Up @@ -91,9 +99,9 @@ def mark_ok(checkin: MonitorCheckIn, ts: datetime) -> None:
monitor_env_id=monitor_env.id,
)

MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(last_checkin__gt=ts).update(
**params
)
MonitorEnvironment.objects.filter(id=monitor_env.id).exclude(
last_checkin__gt=succeeded_at
).update(**params)


def resolve_incident_group(
Expand Down
22 changes: 11 additions & 11 deletions tests/sentry/monitors/logic/test_mark_failed.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_mark_failed_default_params(self, mock_produce_occurrence_to_kafka):
trace_id=trace_id,
date_added=last_checkin,
)
assert mark_failed(checkin, ts=checkin.date_added)
assert mark_failed(checkin, failed_at=checkin.date_added)

monitor_environment.refresh_from_db()
assert monitor_environment.status == MonitorStatus.ERROR
Expand Down Expand Up @@ -171,7 +171,7 @@ def test_mark_failed_muted(self, mock_produce_occurrence_to_kafka):
project_id=self.project.id,
status=CheckInStatus.UNKNOWN,
)
assert mark_failed(checkin, ts=checkin.date_added)
assert mark_failed(checkin, failed_at=checkin.date_added)

monitor.refresh_from_db()
monitor_environment.refresh_from_db()
Expand Down Expand Up @@ -210,7 +210,7 @@ def test_mark_failed_env_muted(self, mock_produce_occurrence_to_kafka):
project_id=self.project.id,
status=CheckInStatus.UNKNOWN,
)
assert mark_failed(checkin, ts=checkin.date_added)
assert mark_failed(checkin, failed_at=checkin.date_added)

monitor.refresh_from_db()
monitor_environment.refresh_from_db()
Expand Down Expand Up @@ -259,7 +259,7 @@ def test_mark_failed_issue_threshold(self, mock_produce_occurrence_to_kafka):
project_id=self.project.id,
status=status,
)
mark_failed(checkin, ts=checkin.date_added)
mark_failed(checkin, failed_at=checkin.date_added)

# failure has not hit threshold, monitor should be in an OK status
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
Expand All @@ -283,7 +283,7 @@ def test_mark_failed_issue_threshold(self, mock_produce_occurrence_to_kafka):
)
if _ == 0:
first_checkin = checkin
mark_failed(checkin, ts=checkin.date_added)
mark_failed(checkin, failed_at=checkin.date_added)

# failure has hit threshold, monitor should be in a failed state
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
Expand Down Expand Up @@ -318,7 +318,7 @@ def test_mark_failed_issue_threshold(self, mock_produce_occurrence_to_kafka):
project_id=self.project.id,
status=status,
)
mark_failed(checkin, ts=checkin.date_added)
mark_failed(checkin, failed_at=checkin.date_added)
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
assert monitor_environment.status == MonitorStatus.ERROR

Expand Down Expand Up @@ -357,7 +357,7 @@ def test_mark_failed_issue_threshold(self, mock_produce_occurrence_to_kafka):
project_id=self.project.id,
status=status,
)
mark_failed(checkin, ts=checkin.date_added)
mark_failed(checkin, failed_at=checkin.date_added)

monitor_incidents = MonitorIncident.objects.filter(monitor_environment=monitor_environment)
assert len(monitor_incidents) == 2
Expand Down Expand Up @@ -410,15 +410,15 @@ def test_mark_failed_issue_threshold_timeout(self, mock_produce_occurrence_to_ka
for _ in range(0, failure_issue_threshold - 1):
checkin = checkins.pop(0)
checkin.update(status=CheckInStatus.TIMEOUT)
mark_failed(checkin, ts=checkin.date_added)
mark_failed(checkin, failed_at=checkin.date_added)

# failure has not hit threshold, monitor should be in an OK status
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
assert monitor_environment.status == MonitorStatus.OK

checkin = checkins.pop(0)
checkin.update(status=CheckInStatus.TIMEOUT)
mark_failed(checkin, ts=checkin.date_added)
mark_failed(checkin, failed_at=checkin.date_added)

# failure has hit threshold, monitor should be in a failed state
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
Expand Down Expand Up @@ -473,7 +473,7 @@ def test_mark_failed_issue_threshold_disabled(self, mock_produce_occurrence_to_k
project_id=self.project.id,
status=CheckInStatus.UNKNOWN,
)
mark_failed(checkin, ts=checkin.date_added)
mark_failed(checkin, failed_at=checkin.date_added)

monitor.refresh_from_db()
monitor_environment.refresh_from_db()
Expand Down Expand Up @@ -516,7 +516,7 @@ def test_mark_failed_issue_assignment(self):
project_id=self.project.id,
status=CheckInStatus.IN_PROGRESS,
)
mark_failed(checkin, ts=checkin.date_added)
mark_failed(checkin, failed_at=checkin.date_added)

# failure has hit threshold, monitor should be in a failed state
monitor_environment = MonitorEnvironment.objects.get(id=monitor_environment.id)
Expand Down
6 changes: 3 additions & 3 deletions tests/sentry/monitors/logic/test_mark_ok.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def test_mark_ok_simple(self):
status=CheckInStatus.OK,
date_added=now,
)
mark_ok(success_checkin, ts=now)
mark_ok(success_checkin, now)

# Monitor has recovered to OK with updated upcoming timestamps
monitor_environment.refresh_from_db()
Expand Down Expand Up @@ -101,7 +101,7 @@ def test_muted_ok(self):
status=CheckInStatus.OK,
date_added=now,
)
mark_ok(success_checkin, ts=now)
mark_ok(success_checkin, now)

# Monitor has recovered to OK with updated upcoming timestamps
monitor_environment.refresh_from_db()
Expand Down Expand Up @@ -294,7 +294,7 @@ def test_mark_ok_broken_recovery(self, mock_record):
status=CheckInStatus.OK,
date_added=now,
)
mark_ok(success_checkin, ts=now)
mark_ok(success_checkin, now)

# Monitor has recovered to OK with updated upcoming timestamps
monitor_environment.refresh_from_db()
Expand Down

0 comments on commit a35efff

Please sign in to comment.