
Commit v1.8.1

joeyorlando authored Jul 9, 2024
2 parents 2b20c69 + 34a9013 commit 165f7d7
Showing 20 changed files with 344 additions and 78 deletions.
36 changes: 35 additions & 1 deletion Tiltfile
@@ -83,7 +83,7 @@ if not is_ci:

local_resource(
"e2e-tests",
labels=["E2eTests"],
labels=["allTests"],
cmd=e2e_tests_cmd,
trigger_mode=TRIGGER_MODE_MANUAL,
auto_init=is_ci,
@@ -127,6 +127,34 @@ cmd_button(
icon_name="dangerous",
)

# Inspired by https://github.com/grafana/slo/blob/main/Tiltfile#L72
pod_engine_pytest_script = '''
set -eu
# get engine k8s pod name from tilt resource name
POD_NAME="$(tilt get kubernetesdiscovery "engine" -ojsonpath='{.status.pods[0].name}')"
kubectl exec "$POD_NAME" -- pytest . $STOP_ON_FIRST_FAILURE $TESTS_FILTER
'''
local_resource(
"pytest-tests",
labels=["allTests"],
cmd=['sh', '-c', pod_engine_pytest_script],
trigger_mode=TRIGGER_MODE_MANUAL,
auto_init=False,
resource_deps=["engine"]
)

cmd_button(
name="pytest Tests - headless run",
argv=['sh', '-c', pod_engine_pytest_script],
text="Run pytest",
resource="pytest-tests",
icon_name="replay",
inputs=[
text_input("TESTS_FILTER", "pytest optional arguments (e.g. \"apps/webhooks/tests/test_webhook.py::test_build_url_private_raises\")", "", "Test file names to run"),
bool_input("STOP_ON_FIRST_FAILURE", "Stop on first failure", True, "-x", ""),
]
)
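Side note: `cmd_button` inputs surface as environment variables to the command, so pressing "Run pytest" with the checkbox on and a filter typed in effectively expands the script above to something like this inside the engine pod (a sketch; the filter value is just the placeholder example from the text input):

# Sketch of the expanded button command: STOP_ON_FIRST_FAILURE resolves to "-x" when
# checked, TESTS_FILTER to whatever was typed into the text input.
POD_NAME="$(tilt get kubernetesdiscovery "engine" -ojsonpath='{.status.pods[0].name}')"
kubectl exec "$POD_NAME" -- pytest . -x apps/webhooks/tests/test_webhook.py::test_build_url_private_raises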

helm_oncall_values = ["./dev/helm-local.yml", "./dev/helm-local.dev.yml"]
if is_ci:
helm_oncall_values = helm_oncall_values + ["./.github/helm-ci.yml"]
@@ -174,7 +202,10 @@ k8s_resource(
resource_deps=["mariadb", "redis-master"],
labels=["OnCallBackend"],
)
k8s_resource(workload="engine-migrate", labels=["OnCallBackend"])

k8s_resource(workload="redis-master", labels=["OnCallDeps"])
k8s_resource(workload="prometheus-server", labels=["OnCallDeps"])
k8s_resource(
workload="mariadb",
port_forwards='3307:3306', # <host_port>:<container_port>
@@ -184,6 +215,9 @@ k8s_resource(

# name all tilt resources after the k8s object namespace + name
def resource_name(id):
# Remove variable date from job name
if id.name.startswith(HELM_PREFIX + "-engine-migrate"):
return "engine-migrate"
return id.name.replace(HELM_PREFIX + "-", "")

workload_to_resource_function(resource_name)
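For illustration, `resource_name` collapses the date-suffixed migrate job into a stable Tilt resource name (the prefix value below is an assumption, not shown in this commit):

# Hypothetical mappings, assuming HELM_PREFIX = "oncall":
#   "oncall-engine-migrate-20240709"  ->  "engine-migrate"   (variable date dropped)
#   "oncall-redis-master"             ->  "redis-master"     (prefix stripped)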
16 changes: 16 additions & 0 deletions dev/helm-local.yml
@@ -5,6 +5,14 @@ env:
value: "False"
- name: FEATURE_PROMETHEUS_EXPORTER_ENABLED
value: "True"
- name: DJANGO_SETTINGS_MODULE
value: "settings.dev"
- name: FEATURE_TELEGRAM_INTEGRATION_ENABLED
value: "True"
- name: FEATURE_SLACK_INTEGRATION_ENABLED
value: "True"
- name: SLACK_SLASH_COMMAND_NAME
value: "/oncall"
# enabled to be able to test docker.host.internal in the webhook e2e tests
- name: DANGEROUS_WEBHOOKS_ENABLED
value: "True"
@@ -131,6 +139,14 @@ service:
nodePort: 30001
prometheus:
enabled: true
alertmanager:
enabled: false
kube-state-metrics:
enabled: false
prometheus-node-exporter:
enabled: false
prometheus-pushgateway:
enabled: false
server:
global:
scrape_interval: 10s
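For context: the four `enabled: false` keys above turn off the Prometheus chart's bundled subcharts (Alertmanager, kube-state-metrics, node-exporter, and Pushgateway), leaving only the Prometheus server itself in the local dev stack.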
@@ -135,15 +135,15 @@ def render_alert_group_attachments(self):
if self.alert_group.resolved:
resolve_attachment = {
"fallback": "Resolved...",
"text": self.alert_group.get_resolve_text(mention_user=True),
"text": self.alert_group.get_resolve_text(mention_user=False),
"callback_id": "alert",
}
attachments.append(resolve_attachment)
else:
if self.alert_group.acknowledged:
ack_attachment = {
"fallback": "Acknowledged...",
"text": self.alert_group.get_acknowledge_text(mention_user=True),
"text": self.alert_group.get_acknowledge_text(mention_user=False),
"callback_id": "alert",
}
attachments.append(ack_attachment)
@@ -670,7 +670,7 @@ def _get_notification_plan_for_user(
# last passed step order + 1
notification_policy_order = last_user_log.notification_policy.order + 1

notification_policies = user_to_notify.get_notification_policies_or_use_default_fallback(important=important)
_, notification_policies = user_to_notify.get_notification_policies_or_use_default_fallback(important=important)

for notification_policy in notification_policies:
future_notification = notification_policy.order >= notification_policy_order
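Both call sites above now unpack a 2-tuple, implying `get_notification_policies_or_use_default_fallback` also reports whether the default fallback was used. A minimal sketch of the presumed shape (an assumption; the model method itself is not part of this diff):

# Sketch only: presumed return shape is (using_fallback, policies).
def get_notification_policies_or_use_default_fallback(self, important=False):
    policies = self.notification_policies.filter(important=important)
    if policies.exists():
        return False, list(policies)  # user-defined policies, no fallback needed
    # no policies configured: fall back to the default step
    return True, [UserNotificationPolicy.get_default_fallback_policy(self)]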
2 changes: 1 addition & 1 deletion engine/apps/alerts/tasks/notify_group.py
@@ -83,7 +83,7 @@ def notify_group_task(alert_group_pk, escalation_policy_snapshot_order=None):
continue

important = escalation_policy_step == EscalationPolicy.STEP_NOTIFY_GROUP_IMPORTANT
notification_policies = user.get_notification_policies_or_use_default_fallback(important=important)
_, notification_policies = user.get_notification_policies_or_use_default_fallback(important=important)

if notification_policies:
usergroup_notification_plan += "\n_{} (".format(
93 changes: 66 additions & 27 deletions engine/apps/alerts/tasks/notify_user.py
@@ -1,4 +1,5 @@
import time
import typing
from functools import partial

from celery.exceptions import Retry
@@ -42,7 +43,6 @@ def notify_user_task(

countdown = 0
stop_escalation = False
log_message = ""
log_record = None

with transaction.atomic():
@@ -70,9 +70,13 @@
)

user_has_notification = UserHasNotification.objects.filter(pk=user_has_notification.pk).select_for_update()[0]
using_fallback_default_notification_policy_step = False

if previous_notification_policy_pk is None:
notification_policies = user.get_notification_policies_or_use_default_fallback(important=important)
(
using_fallback_default_notification_policy_step,
notification_policies,
) = user.get_notification_policies_or_use_default_fallback(important=important)
if not notification_policies:
task_logger.info(
f"notify_user_task: Failed to notify. No notification policies. user_id={user_pk} alert_group_id={alert_group_pk} important={important}"
@@ -115,16 +119,25 @@
)
return
reason = None
if notification_policy is None:
stop_escalation = True
log_record = UserNotificationPolicyLogRecord(

def _create_user_notification_policy_log_record(**kwargs):
return UserNotificationPolicyLogRecord(
**kwargs,
using_fallback_default_notification_policy_step=using_fallback_default_notification_policy_step,
)

def _create_notification_finished_user_notification_policy_log_record():
return _create_user_notification_policy_log_record(
author=user,
type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FINISHED,
notification_policy=notification_policy,
alert_group=alert_group,
slack_prevent_posting=prevent_posting_to_thread,
)
log_message += "Personal escalation exceeded"

if notification_policy is None:
stop_escalation = True
log_record = _create_notification_finished_user_notification_policy_log_record()
else:
if (
(alert_group.acknowledged and not notify_even_acknowledged)
@@ -146,7 +159,7 @@ def notify_user_task(
else:
delay_in_seconds = 0
countdown = delay_in_seconds
log_record = UserNotificationPolicyLogRecord(
log_record = _create_user_notification_policy_log_record(
author=user,
type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED,
notification_policy=notification_policy,
@@ -160,7 +173,7 @@ def notify_user_task(
notification_policy.notify_by == UserNotificationPolicy.NotificationChannel.SLACK
)
if user_to_be_notified_in_slack and alert_group.notify_in_slack_enabled is False:
log_record = UserNotificationPolicyLogRecord(
log_record = _create_user_notification_policy_log_record(
author=user,
type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED,
notification_policy=notification_policy,
@@ -172,7 +185,7 @@ def notify_user_task(
notification_error_code=UserNotificationPolicyLogRecord.ERROR_NOTIFICATION_POSTING_TO_SLACK_IS_DISABLED,
)
else:
log_record = UserNotificationPolicyLogRecord(
log_record = _create_user_notification_policy_log_record(
author=user,
type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED,
notification_policy=notification_policy,
@@ -182,6 +195,7 @@ def notify_user_task(
notification_step=notification_policy.step,
notification_channel=notification_policy.notify_by,
)

if log_record: # log_record is None if user notification policy step is unspecified
# if this is the first notification step, and user hasn't been notified for this alert group - update metric
if (
@@ -198,25 +212,43 @@ def notify_user_task(
if notify_user_task.request.retries == 0:
transaction.on_commit(partial(send_user_notification_signal.apply_async, (log_record.pk,)))

if not stop_escalation:
if notification_policy.step != UserNotificationPolicy.Step.WAIT:
def _create_perform_notification_task(log_record_pk, alert_group_pk, use_default_notification_policy_fallback):
task = perform_notification.apply_async((log_record_pk, use_default_notification_policy_fallback))
task_logger.info(
f"Created perform_notification task {task} log_record={log_record_pk} " f"alert_group={alert_group_pk}"
)

def _create_perform_notification_task(log_record_pk, alert_group_pk):
task = perform_notification.apply_async((log_record_pk,))
task_logger.info(
f"Created perform_notification task {task} log_record={log_record_pk} "
f"alert_group={alert_group_pk}"
)
def _update_user_has_notification_active_notification_policy_id(active_policy_id: typing.Optional[str]) -> None:
user_has_notification.active_notification_policy_id = active_policy_id
user_has_notification.save(update_fields=["active_notification_policy_id"])

def _reset_user_has_notification_active_notification_policy_id() -> None:
_update_user_has_notification_active_notification_policy_id(None)

create_perform_notification_task = partial(
_create_perform_notification_task,
log_record.pk,
alert_group_pk,
using_fallback_default_notification_policy_step,
)

transaction.on_commit(partial(_create_perform_notification_task, log_record.pk, alert_group_pk))
if using_fallback_default_notification_policy_step:
# if we are using the default notification policy, we're done escalating; there are no further
# notification policy steps in this case. Kick off the perform_notification task, create the
# TYPE_PERSONAL_NOTIFICATION_FINISHED log record, and reset the active_notification_policy_id
transaction.on_commit(create_perform_notification_task)
_create_notification_finished_user_notification_policy_log_record()
_reset_user_has_notification_active_notification_policy_id()
elif not stop_escalation:
if notification_policy.step != UserNotificationPolicy.Step.WAIT:
transaction.on_commit(create_perform_notification_task)

delay = NEXT_ESCALATION_DELAY
if countdown is not None:
delay += countdown
task_id = celery_uuid()

user_has_notification.active_notification_policy_id = task_id
user_has_notification.save(update_fields=["active_notification_policy_id"])
_update_user_has_notification_active_notification_policy_id(task_id)

transaction.on_commit(
partial(
@@ -231,10 +263,8 @@ def _create_perform_notification_task(log_record_pk, alert_group_pk):
task_id=task_id,
)
)

else:
user_has_notification.active_notification_policy_id = None
user_has_notification.save(update_fields=["active_notification_policy_id"])
_reset_user_has_notification_active_notification_policy_id()


@shared_dedicated_queue_retry_task(
@@ -243,7 +273,7 @@ def _create_perform_notification_task(log_record_pk, alert_group_pk):
dont_autoretry_for=(Retry,),
max_retries=1 if settings.DEBUG else None,
)
def perform_notification(log_record_pk):
def perform_notification(log_record_pk, use_default_notification_policy_fallback):
from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord
from apps.telegram.models import TelegramToUserConnector

@@ -262,8 +292,13 @@ def perform_notification(log_record_pk):

user = log_record.author
alert_group = log_record.alert_group
notification_policy = log_record.notification_policy
notification_policy = (
UserNotificationPolicy.get_default_fallback_policy(user)
if use_default_notification_policy_fallback
else log_record.notification_policy
)
notification_channel = notification_policy.notify_by if notification_policy else None

if user is None or notification_policy is None:
UserNotificationPolicyLogRecord(
author=user,
@@ -300,7 +335,9 @@ def perform_notification(log_record_pk):
TelegramToUserConnector.notify_user(user, alert_group, notification_policy)
except RetryAfter as e:
countdown = getattr(e, "retry_after", 3)
raise perform_notification.retry((log_record_pk,), countdown=countdown, exc=e)
raise perform_notification.retry(
(log_record_pk, use_default_notification_policy_fallback), countdown=countdown, exc=e
)

elif notification_channel == UserNotificationPolicy.NotificationChannel.SLACK:
# TODO: refactor checking the possibility of sending a notification in slack
@@ -380,7 +417,9 @@ def perform_notification(log_record_pk):
f"does not exist. Restarting perform_notification."
)
restart_delay_seconds = 60
perform_notification.apply_async((log_record_pk,), countdown=restart_delay_seconds)
perform_notification.apply_async(
(log_record_pk, use_default_notification_policy_fallback), countdown=restart_delay_seconds
)
else:
task_logger.debug(
f"send_slack_notification for alert_group {alert_group.pk} failed because slack message "
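Two consequences of the new `perform_notification` signature are worth noting: every enqueue must now forward the fallback flag (the parameter has no default, so omitting it raises a TypeError on the worker), and when the flag is set the policy is re-derived from the user rather than read off the log record. A fresh enqueue thus looks like:

# Sketch: both positional arguments are now required.
perform_notification.apply_async(
    (log_record.pk, using_fallback_default_notification_policy_step)
)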
10 changes: 10 additions & 0 deletions engine/apps/alerts/tests/test_alert_receiver_channel.py
@@ -1,7 +1,9 @@
import os
from unittest import mock
from unittest.mock import patch

import pytest
from django.conf import settings
from django.db import IntegrityError
from django.urls import reverse
from django.utils import timezone
@@ -10,6 +12,7 @@
from common.api_helpers.utils import create_engine_url
from common.exceptions import UnableToSendDemoAlert
from engine.management.commands import alertmanager_v2_migrate
from settings.base import DatabaseTypes


@pytest.mark.django_db
@@ -272,6 +275,13 @@ def test_create_missing_direct_paging_integrations(
def test_create_duplicate_direct_paging_integrations(make_organization, make_team, make_alert_receive_channel):
"""Check that it's not possible to have more than one active direct paging integration per team."""

# MariaDB is not supported for this test
# See comment: https://github.com/grafana/oncall/commit/381a9ecf54bf0dd076f233b207c13d72ed792181#diff-9d96504027309f2bd1e95352bac1433b09b60eb4fafb611b52a6c15ed16cbc48R219-R223
is_local_dev_env = os.environ.get("DJANGO_SETTINGS_MODULE") == "settings.dev"
is_db_type_mysql = settings.DATABASE_TYPE == DatabaseTypes.MYSQL
if is_local_dev_env and is_db_type_mysql:
pytest.skip("This test is not supported by Mariadb (used by settings.dev)")

organization = make_organization()
team = make_team(organization)
make_alert_receive_channel(organization, team=team, integration=AlertReceiveChannel.INTEGRATION_DIRECT_PAGING)
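As a design note, the same guard could be written as a collection-time marker instead of an in-test skip, reusing the imports added above; a sketch, assuming Django settings are already configured when the module is imported:

# Sketch: equivalent declarative form of the skip above.
@pytest.mark.skipif(
    os.environ.get("DJANGO_SETTINGS_MODULE") == "settings.dev"
    and settings.DATABASE_TYPE == DatabaseTypes.MYSQL,
    reason="This test is not supported by MariaDB (used by settings.dev)",
)
@pytest.mark.django_db
def test_create_duplicate_direct_paging_integrations(make_organization, make_team, make_alert_receive_channel):
    ...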