
Commit v1.8.1

joeyorlando authored Jul 9, 2024
2 parents 2b20c69 + 34a9013 commit 165f7d7
Showing 20 changed files with 344 additions and 78 deletions.
36 changes: 35 additions & 1 deletion Tiltfile
@@ -83,7 +83,7 @@ if not is_ci:

local_resource(
"e2e-tests",
labels=["E2eTests"],
labels=["allTests"],
cmd=e2e_tests_cmd,
trigger_mode=TRIGGER_MODE_MANUAL,
auto_init=is_ci,
@@ -127,6 +127,34 @@ cmd_button(
icon_name="dangerous",
)

# Inspired by https://github.com/grafana/slo/blob/main/Tiltfile#L72
pod_engine_pytest_script = '''
set -eu
# get engine k8s pod name from tilt resource name
POD_NAME="$(tilt get kubernetesdiscovery "engine" -ojsonpath='{.status.pods[0].name}')"
kubectl exec "$POD_NAME" -- pytest . $STOP_ON_FIRST_FAILURE $TESTS_FILTER
'''
local_resource(
"pytest-tests",
labels=["allTests"],
cmd=['sh', '-c', pod_engine_pytest_script],
trigger_mode=TRIGGER_MODE_MANUAL,
auto_init=False,
resource_deps=["engine"]
)

cmd_button(
name="pytest Tests - headless run",
argv=['sh', '-c', pod_engine_pytest_script],
text="Run pytest",
resource="pytest-tests",
icon_name="replay",
inputs=[
text_input("TESTS_FILTER", "pytest optional arguments (e.g. \"apps/webhooks/tests/test_webhook.py::test_build_url_private_raises\")", "", "Test file names to run"),
bool_input("STOP_ON_FIRST_FAILURE", "Stop on first failure", True, "-x", ""),
]
)
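Side note: `cmd_button` inputs surface as environment variables to the command, so pressing "Run pytest" with the checkbox on and a filter typed in effectively expands the script above to something like this inside the engine pod (a sketch; the filter value is just the placeholder example from the text input):

# Sketch of the expanded button command: STOP_ON_FIRST_FAILURE resolves to "-x" when
# checked, TESTS_FILTER to whatever was typed into the text input.
POD_NAME="$(tilt get kubernetesdiscovery "engine" -ojsonpath='{.status.pods[0].name}')"
kubectl exec "$POD_NAME" -- pytest . -x apps/webhooks/tests/test_webhook.py::test_build_url_private_raises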

helm_oncall_values = ["./dev/helm-local.yml", "./dev/helm-local.dev.yml"]
if is_ci:
helm_oncall_values = helm_oncall_values + ["./.github/helm-ci.yml"]
@@ -174,7 +202,10 @@ k8s_resource(
resource_deps=["mariadb", "redis-master"],
labels=["OnCallBackend"],
)
k8s_resource(workload="engine-migrate", labels=["OnCallBackend"])

k8s_resource(workload="redis-master", labels=["OnCallDeps"])
k8s_resource(workload="prometheus-server", labels=["OnCallDeps"])
k8s_resource(
workload="mariadb",
port_forwards='3307:3306', # <host_port>:<container_port>
@@ -184,6 +215,9 @@ k8s_resource(

# name all tilt resources after the k8s object namespace + name
def resource_name(id):
# Remove variable date from job name
if id.name.startswith(HELM_PREFIX + "-engine-migrate"):
return "engine-migrate"
return id.name.replace(HELM_PREFIX + "-", "")

workload_to_resource_function(resource_name)
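For illustration, `resource_name` collapses the date-suffixed migrate job into a stable Tilt resource name (the prefix value below is an assumption, not shown in this commit):

# Hypothetical mappings, assuming HELM_PREFIX = "oncall":
#   "oncall-engine-migrate-20240709"  ->  "engine-migrate"   (variable date dropped)
#   "oncall-redis-master"             ->  "redis-master"     (prefix stripped)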
16 changes: 16 additions & 0 deletions dev/helm-local.yml
@@ -5,6 +5,14 @@ env:
value: "False"
- name: FEATURE_PROMETHEUS_EXPORTER_ENABLED
value: "True"
- name: DJANGO_SETTINGS_MODULE
value: "settings.dev"
- name: FEATURE_TELEGRAM_INTEGRATION_ENABLED
value: "True"
- name: FEATURE_SLACK_INTEGRATION_ENABLED
value: "True"
- name: SLACK_SLASH_COMMAND_NAME
value: "/oncall"
# enabled to be able to test docker.host.internal in the webhook e2e tests
- name: DANGEROUS_WEBHOOKS_ENABLED
value: "True"
@@ -131,6 +139,14 @@ service:
nodePort: 30001
prometheus:
enabled: true
alertmanager:
enabled: false
kube-state-metrics:
enabled: false
prometheus-node-exporter:
enabled: false
prometheus-pushgateway:
enabled: false
server:
global:
scrape_interval: 10s
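For context: the four `enabled: false` keys above turn off the Prometheus chart's bundled subcharts (Alertmanager, kube-state-metrics, node-exporter, and Pushgateway), leaving only the Prometheus server itself in the local dev stack.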
@@ -135,15 +135,15 @@ def render_alert_group_attachments(self):
if self.alert_group.resolved:
resolve_attachment = {
"fallback": "Resolved...",
"text": self.alert_group.get_resolve_text(mention_user=True),
"text": self.alert_group.get_resolve_text(mention_user=False),
"callback_id": "alert",
}
attachments.append(resolve_attachment)
else:
if self.alert_group.acknowledged:
ack_attachment = {
"fallback": "Acknowledged...",
"text": self.alert_group.get_acknowledge_text(mention_user=True),
"text": self.alert_group.get_acknowledge_text(mention_user=False),
"callback_id": "alert",
}
attachments.append(ack_attachment)
@@ -670,7 +670,7 @@ def _get_notification_plan_for_user(
# last passed step order + 1
notification_policy_order = last_user_log.notification_policy.order + 1

notification_policies = user_to_notify.get_notification_policies_or_use_default_fallback(important=important)
_, notification_policies = user_to_notify.get_notification_policies_or_use_default_fallback(important=important)

for notification_policy in notification_policies:
future_notification = notification_policy.order >= notification_policy_order
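Both call sites above now unpack a 2-tuple, implying `get_notification_policies_or_use_default_fallback` also reports whether the default fallback was used. A minimal sketch of the presumed shape (an assumption; the model method itself is not part of this diff):

# Sketch only: presumed return shape is (using_fallback, policies).
def get_notification_policies_or_use_default_fallback(self, important=False):
    policies = self.notification_policies.filter(important=important)
    if policies.exists():
        return False, list(policies)  # user-defined policies, no fallback needed
    # no policies configured: fall back to the default step
    return True, [UserNotificationPolicy.get_default_fallback_policy(self)]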
2 changes: 1 addition & 1 deletion engine/apps/alerts/tasks/notify_group.py
@@ -83,7 +83,7 @@ def notify_group_task(alert_group_pk, escalation_policy_snapshot_order=None):
continue

important = escalation_policy_step == EscalationPolicy.STEP_NOTIFY_GROUP_IMPORTANT
notification_policies = user.get_notification_policies_or_use_default_fallback(important=important)
_, notification_policies = user.get_notification_policies_or_use_default_fallback(important=important)

if notification_policies:
usergroup_notification_plan += "\n_{} (".format(
93 changes: 66 additions & 27 deletions engine/apps/alerts/tasks/notify_user.py
@@ -1,4 +1,5 @@
import time
import typing
from functools import partial

from celery.exceptions import Retry
@@ -42,7 +43,6 @@ def notify_user_task(

countdown = 0
stop_escalation = False
log_message = ""
log_record = None

with transaction.atomic():
@@ -70,9 +70,13 @@
)

user_has_notification = UserHasNotification.objects.filter(pk=user_has_notification.pk).select_for_update()[0]
using_fallback_default_notification_policy_step = False

if previous_notification_policy_pk is None:
notification_policies = user.get_notification_policies_or_use_default_fallback(important=important)
(
using_fallback_default_notification_policy_step,
notification_policies,
) = user.get_notification_policies_or_use_default_fallback(important=important)
if not notification_policies:
task_logger.info(
f"notify_user_task: Failed to notify. No notification policies. user_id={user_pk} alert_group_id={alert_group_pk} important={important}"
@@ -115,16 +119,25 @@
)
return
reason = None
if notification_policy is None:
stop_escalation = True
log_record = UserNotificationPolicyLogRecord(

def _create_user_notification_policy_log_record(**kwargs):
return UserNotificationPolicyLogRecord(
**kwargs,
using_fallback_default_notification_policy_step=using_fallback_default_notification_policy_step,
)

def _create_notification_finished_user_notification_policy_log_record():
return _create_user_notification_policy_log_record(
author=user,
type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FINISHED,
notification_policy=notification_policy,
alert_group=alert_group,
slack_prevent_posting=prevent_posting_to_thread,
)
log_message += "Personal escalation exceeded"

if notification_policy is None:
stop_escalation = True
log_record = _create_notification_finished_user_notification_policy_log_record()
else:
if (
(alert_group.acknowledged and not notify_even_acknowledged)
@@ -146,7 +159,7 @@ def notify_user_task(
else:
delay_in_seconds = 0
countdown = delay_in_seconds
log_record = UserNotificationPolicyLogRecord(
log_record = _create_user_notification_policy_log_record(
author=user,
type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED,
notification_policy=notification_policy,
@@ -160,7 +173,7 @@ def notify_user_task(
notification_policy.notify_by == UserNotificationPolicy.NotificationChannel.SLACK
)
if user_to_be_notified_in_slack and alert_group.notify_in_slack_enabled is False:
log_record = UserNotificationPolicyLogRecord(
log_record = _create_user_notification_policy_log_record(
author=user,
type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED,
notification_policy=notification_policy,
@@ -172,7 +185,7 @@ def notify_user_task(
notification_error_code=UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_POSTING_TO_SLACK_IS_DISABLED,
)
else:
log_record = UserNotificationPolicyLogRecord(
log_record = _create_user_notification_policy_log_record(
author=user,
type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED,
notification_policy=notification_policy,
@@ -182,6 +195,7 @@ def notify_user_task(
notification_step=notification_policy.step,
notification_channel=notification_policy.notify_by,
)

if log_record: # log_record is None if user notification policy step is unspecified
# if this is the first notification step, and user hasn't been notified for this alert group - update metric
if (
@@ -198,25 +212,43 @@ def notify_user_task(
if notify_user_task.request.retries == 0:
transaction.on_commit(partial(send_user_notification_signal.apply_async, (log_record.pk,)))

if not stop_escalation:
if notification_policy.step != UserNotificationPolicy.Step.WAIT:
def _create_perform_notification_task(log_record_pk, alert_group_pk, use_default_notification_policy_fallback):
task = perform_notification.apply_async((log_record_pk, use_default_notification_policy_fallback))
task_logger.info(
f"Created perform_notification task {task} log_record={log_record_pk} " f"alert_group={alert_group_pk}"
)

def _create_perform_notification_task(log_record_pk, alert_group_pk):
task = perform_notification.apply_async((log_record_pk,))
task_logger.info(
f"Created perform_notification task {task} log_record={log_record_pk} "
f"alert_group={alert_group_pk}"
)
def _update_user_has_notification_active_notification_policy_id(active_policy_id: typing.Optional[str]) -> None:
user_has_notification.active_notification_policy_id = active_policy_id
user_has_notification.save(update_fields=["active_notification_policy_id"])

def _reset_user_has_notification_active_notification_policy_id() -> None:
_update_user_has_notification_active_notification_policy_id(None)

create_perform_notification_task = partial(
_create_perform_notification_task,
log_record.pk,
alert_group_pk,
using_fallback_default_notification_policy_step,
)

transaction.on_commit(partial(_create_perform_notification_task, log_record.pk, alert_group_pk))
if using_fallback_default_notification_policy_step:
# if we are using the default notification policy, we're done escalating; there are no further
# notification policy steps in this case. Kick off the perform_notification task, create the
# TYPE_PERSONAL_NOTIFICATION_FINISHED log record, and reset the active_notification_policy_id
transaction.on_commit(create_perform_notification_task)
_create_notification_finished_user_notification_policy_log_record()
_reset_user_has_notification_active_notification_policy_id()
elif not stop_escalation:
if notification_policy.step != UserNotificationPolicy.Step.WAIT:
transaction.on_commit(create_perform_notification_task)

delay = NEXT_ESCALATION_DELAY
if countdown is not None:
delay += countdown
task_id = celery_uuid()

user_has_notification.active_notification_policy_id = task_id
user_has_notification.save(update_fields=["active_notification_policy_id"])
_update_user_has_notification_active_notification_policy_id(task_id)

transaction.on_commit(
partial(
@@ -231,10 +263,8 @@ def _create_perform_notification_task(log_record_pk, alert_group_pk):
task_id=task_id,
)
)

else:
user_has_notification.active_notification_policy_id = None
user_has_notification.save(update_fields=["active_notification_policy_id"])
_reset_user_has_notification_active_notification_policy_id()


@shared_dedicated_queue_retry_task(
@@ -243,7 +273,7 @@ def _create_perform_notification_task(log_record_pk, alert_group_pk):
dont_autoretry_for=(Retry,),
max_retries=1 if settings.DEBUG else None,
)
def perform_notification(log_record_pk):
def perform_notification(log_record_pk, use_default_notification_policy_fallback):
from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord
from apps.telegram.models import TelegramToUserConnector

@@ -262,8 +292,13 @@ def perform_notification(log_record_pk):

user = log_record.author
alert_group = log_record.alert_group
notification_policy = log_record.notification_policy
notification_policy = (
UserNotificationPolicy.get_default_fallback_policy(user)
if use_default_notification_policy_fallback
else log_record.notification_policy
)
notification_channel = notification_policy.notify_by if notification_policy else None

if user is None or notification_policy is None:
UserNotificationPolicyLogRecord(
author=user,
@@ -300,7 +335,9 @@ def perform_notification(log_record_pk):
TelegramToUserConnector.notify_user(user, alert_group, notification_policy)
except RetryAfter as e:
countdown = getattr(e, "retry_after", 3)
raise perform_notification.retry((log_record_pk,), countdown=countdown, exc=e)
raise perform_notification.retry(
(log_record_pk, use_default_notification_policy_fallback), countdown=countdown, exc=e
)

elif notification_channel == UserNotificationPolicy.NotificationChannel.SLACK:
# TODO: refactor checking the possibility of sending a notification in slack
@@ -380,7 +417,9 @@ def perform_notification(log_record_pk):
f"does not exist. Restarting perform_notification."
)
restart_delay_seconds = 60
perform_notification.apply_async((log_record_pk,), countdown=restart_delay_seconds)
perform_notification.apply_async(
(log_record_pk, use_default_notification_policy_fallback), countdown=restart_delay_seconds
)
else:
task_logger.debug(
f"send_slack_notification for alert_group {alert_group.pk} failed because slack message "
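Two consequences of the new `perform_notification` signature are worth noting: every enqueue must now forward the fallback flag (the parameter has no default, so omitting it raises a TypeError on the worker), and when the flag is set the policy is re-derived from the user rather than read off the log record. A fresh enqueue thus looks like:

# Sketch: both positional arguments are now required.
perform_notification.apply_async(
    (log_record.pk, using_fallback_default_notification_policy_step)
)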
10 changes: 10 additions & 0 deletions engine/apps/alerts/tests/test_alert_receiver_channel.py
@@ -1,7 +1,9 @@
import os
from unittest import mock
from unittest.mock import patch

import pytest
from django.conf import settings
from django.db import IntegrityError
from django.urls import reverse
from django.utils import timezone
@@ -10,6 +12,7 @@
from common.api_helpers.utils import create_engine_url
from common.exceptions import UnableToSendDemoAlert
from engine.management.commands import alertmanager_v2_migrate
from settings.base import DatabaseTypes


@pytest.mark.django_db
@@ -272,6 +275,13 @@ def test_create_missing_direct_paging_integrations(
def test_create_duplicate_direct_paging_integrations(make_organization, make_team, make_alert_receive_channel):
"""Check that it's not possible to have more than one active direct paging integration per team."""

# MariaDB is not supported for this test
# See comment: https://github.com/grafana/oncall/commit/381a9ecf54bf0dd076f233b207c13d72ed792181#diff-9d96504027309f2bd1e95352bac1433b09b60eb4fafb611b52a6c15ed16cbc48R219-R223
is_local_dev_env = os.environ.get("DJANGO_SETTINGS_MODULE") == "settings.dev"
is_db_type_mysql = settings.DATABASE_TYPE == DatabaseTypes.MYSQL
if is_local_dev_env and is_db_type_mysql:
pytest.skip("This test is not supported by Mariadb (used by settings.dev)")

organization = make_organization()
team = make_team(organization)
make_alert_receive_channel(organization, team=team, integration=AlertReceiveChannel.INTEGRATION_DIRECT_PAGING)
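As a design note, the same guard could be written as a collection-time marker instead of an in-test skip, reusing the imports added above; a sketch, assuming Django settings are already configured when the module is imported:

# Sketch: equivalent declarative form of the skip above.
@pytest.mark.skipif(
    os.environ.get("DJANGO_SETTINGS_MODULE") == "settings.dev"
    and settings.DATABASE_TYPE == DatabaseTypes.MYSQL,
    reason="This test is not supported by MariaDB (used by settings.dev)",
)
@pytest.mark.django_db
def test_create_duplicate_direct_paging_integrations(make_organization, make_team, make_alert_receive_channel):
    ...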