-
Notifications
You must be signed in to change notification settings - Fork 2
/
alerts.tf
171 lines (148 loc) · 8.17 KB
/
alerts.tf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# Dead-letter-queue alert for the wa-<env> Application Insights instance.
# The query matches traces/exceptions whose "Logger Message" custom dimension
# contains "dead lettered".
module "wa-exception-alert" {
  source = "git@github.com:hmcts/cnp-module-metric-alert"

  # Identity and placement of the alert rule.
  enabled            = true
  alert_name         = "wa-dlq-alert"
  alert_desc         = "Triggers when a message falls into the Dead Letter Queue, works with 5 minute poll in wa-${var.env}."
  location           = var.location
  resourcegroup_name = azurerm_resource_group.rg.name
  common_tags        = var.common_tags

  # Log query and the Application Insights instance it is run against.
  app_insights_name  = "wa-${var.env}"
  app_insights_query = "union traces, exceptions | where customDimensions[\"Logger Message\"] contains \"dead lettered\" | sort by timestamp desc"

  # Schedule: frequency and time window are both 5 minutes.
  frequency_in_minutes   = "5"
  time_window_in_minutes = "5"

  # Threshold settings: GreaterThan 0, severity 2.
  trigger_threshold_operator = "GreaterThan"
  trigger_threshold          = "0"
  severity_level             = "2"

  # Notification target.
  # NOTE(review): the action group name is a literal here, whereas some other
  # alerts in this file read it from module.wa-action-group — confirm both
  # resolve to the same group.
  action_group_name    = "wa-support"
  custom_email_subject = "Alert: Message was dead-lettered in wa-${var.env}"
}
# Alert on uninitiated Camunda tasks in the wa-<env> Application Insights
# instance. The query matches traces/exceptions whose message contains the
# "TASK_INITIATION_FAILURES There are some uninitiated tasks" marker.
module "wa-camunda-task-uninitiated-exception-alert" {
  source   = "git@github.com:hmcts/cnp-module-metric-alert"
  location = var.location

  app_insights_name = "wa-${var.env}"

  alert_name = "wa-camunda-task-uninitiated-alert"
  # The description quotes the poll interval; it previously said 120 minutes
  # while frequency_in_minutes below is 60. Keep the two in sync.
  alert_desc = "Triggers when a task could not be initiated and it is saved with an unconfigured task state, works with 60 minute poll in wa-${var.env}."

  app_insights_query = "union traces, exceptions | where message contains \"TASK_INITIATION_FAILURES There are some uninitiated tasks\" | sort by timestamp desc"

  custom_email_subject = "Alert: A task could not be initiated in wa-${var.env}"

  # Schedule: frequency and time window are both 60 minutes.
  frequency_in_minutes   = "60"
  time_window_in_minutes = "60"

  severity_level    = "2"
  action_group_name = "wa-support"

  # Threshold settings: GreaterThan 0.
  trigger_threshold_operator = "GreaterThan"
  trigger_threshold          = "0"

  resourcegroup_name = azurerm_resource_group.rg.name
  common_tags        = var.common_tags
  enabled            = true
}
# Alert on unterminated Camunda tasks in the wa-<env> Application Insights
# instance. The query matches traces/exceptions whose message contains the
# "TASK_TERMINATION_FAILURES There are some unterminated tasks" marker.
module "wa-camunda-task-unterminated-exception-alert" {
  source   = "git@github.com:hmcts/cnp-module-metric-alert"
  location = var.location

  app_insights_name = "wa-${var.env}"

  alert_name = "wa-camunda-task-unterminated-alert"
  # The description quotes the poll interval; it previously said 120 minutes
  # while frequency_in_minutes below is 60. Keep the two in sync.
  alert_desc = "Triggers when a task could not be terminated, works with 60 minute poll in wa-${var.env}."

  app_insights_query = "union traces, exceptions | where message contains \"TASK_TERMINATION_FAILURES There are some unterminated tasks\" | sort by timestamp desc"

  custom_email_subject = "Alert: A task could not be terminated in wa-${var.env}"

  # Schedule: frequency and time window are both 60 minutes.
  frequency_in_minutes   = "60"
  time_window_in_minutes = "60"

  severity_level    = "2"
  action_group_name = "wa-support"

  # Threshold settings: GreaterThan 0.
  trigger_threshold_operator = "GreaterThan"
  trigger_threshold          = "0"

  resourcegroup_name = azurerm_resource_group.rg.name
  common_tags        = var.common_tags
  enabled            = true
}
# Alert on problem CCD messages. The query matches traces whose message
# contains "FIND_PROBLEM_MESSAGES Retrieved problem messages" together with
# either "UNPROCESSABLE" or "READY".
module "wa-messages-find-problem-messages-alert" {
  source = "git@github.com:hmcts/cnp-module-metric-alert"

  # Identity and placement of the alert rule.
  enabled    = true
  alert_name = "wa-case-event-handler-find-problem-messages-alert"
  # NOTE(review): the description names case-event-handler-appinsights-<env>,
  # but app_insights_name below is wa-<env> — confirm which is intended.
  alert_desc         = "Triggers when a ccd message is unprocessable state or remains in ready state for more than 1 hour, works with 60 minute poll in case-event-handler-appinsights-${var.env}."
  location           = var.location
  resourcegroup_name = azurerm_resource_group.rg.name
  common_tags        = var.common_tags

  # Log query and the Application Insights instance it is run against.
  app_insights_name  = "wa-${var.env}"
  app_insights_query = "union traces | where message contains \"FIND_PROBLEM_MESSAGES Retrieved problem messages\" and ( message contains \"UNPROCESSABLE\" or message contains \"READY\") | sort by timestamp desc"

  # Schedule: frequency and time window are both 60 minutes.
  frequency_in_minutes   = "60"
  time_window_in_minutes = "60"

  # Threshold settings: GreaterThan 0, severity 2.
  trigger_threshold_operator = "GreaterThan"
  trigger_threshold          = "0"
  severity_level             = "2"

  # Notification target.
  action_group_name    = "wa-support"
  custom_email_subject = "Alert: some CCD messages could not be processed in wa-${var.env}"
}
# Alert on task reconfiguration failures. The query matches traces whose
# message contains "Task Execute Reconfiguration Failed".
module "wa-cft-task-reconfiguration-exception-alert" {
  source = "git@github.com:hmcts/cnp-module-metric-alert"

  # Identity and placement of the alert rule.
  enabled    = true
  alert_name = "wa-task-management-api-reconfiguration-exception-alert"
  # NOTE(review): the description names task-management-api-appinsights-<env>,
  # but app_insights_name below is wa-<env> — confirm which is intended.
  alert_desc         = "Triggers when a task could not be reconfigured for a defined time in task-management-api-appinsights-${var.env}."
  location           = var.location
  resourcegroup_name = azurerm_resource_group.rg.name
  common_tags        = var.common_tags

  # Log query and the Application Insights instance it is run against.
  app_insights_name  = "wa-${var.env}"
  app_insights_query = "union traces | where message contains \"Task Execute Reconfiguration Failed\" | sort by timestamp desc"

  # Schedule: frequency and time window are both 60 minutes.
  frequency_in_minutes   = "60"
  time_window_in_minutes = "60"

  # Threshold settings: GreaterThan 0, severity 2.
  trigger_threshold_operator = "GreaterThan"
  trigger_threshold          = "0"
  severity_level             = "2"

  # Notification target.
  action_group_name    = "wa-support"
  custom_email_subject = "Alert: some tasks could not be reconfigured in wa-${var.env}"
}
# Alert on task replication errors. The query matches traces whose message
# contains "TASK_REPLICATION_ERROR: ".
module "wa-task-replication-problem-alert" {
  source = "git@github.com:hmcts/cnp-module-metric-alert"

  # Identity and placement of the alert rule.
  enabled    = true
  alert_name = "wa-task-management-api-replication-problem-alert"
  # NOTE(review): the description names task-management-api-appinsights-<env>,
  # but app_insights_name below is wa-<env> — confirm which is intended.
  alert_desc         = "Triggers when a task could not be replicated in task-management-api-appinsights-${var.env}."
  location           = var.location
  resourcegroup_name = azurerm_resource_group.rg.name
  common_tags        = var.common_tags

  # Log query and the Application Insights instance it is run against.
  app_insights_name  = "wa-${var.env}"
  app_insights_query = "union traces | where message contains \"TASK_REPLICATION_ERROR: \" | sort by timestamp desc"

  # Schedule: frequency and time window are both 60 minutes.
  frequency_in_minutes   = "60"
  time_window_in_minutes = "60"

  # Threshold settings: GreaterThan 0, severity 2.
  trigger_threshold_operator = "GreaterThan"
  trigger_threshold          = "0"
  severity_level             = "2"

  # Notification target.
  action_group_name    = "wa-support"
  custom_email_subject = "Alert: some tasks could not be replicated in wa-${var.env}"
}
# Alert on task deletion failures. The query matches traces whose message
# contains 'Unable to delete all tasks for case id:' or
# 'Deleted some UNTERMINATED tasks:'.
module "wa-task-deletion-failure-alert" {
  source = "git@github.com:hmcts/cnp-module-metric-alert"

  # Identity and placement of the alert rule. Whether the alert is enabled is
  # controlled by a variable rather than hard-coded.
  enabled            = var.enable-wa-task-management-api-task-deletion-failure-alert
  alert_name         = "wa-task-management-api-task-deletion-failure-alert"
  alert_desc         = "Alert when task fail to delete for case id"
  location           = var.location
  resourcegroup_name = azurerm_resource_group.rg.name
  common_tags        = var.common_tags

  # Log query and the Application Insights instance it is run against.
  app_insights_name  = "wa-${var.env}"
  app_insights_query = "traces | where message contains 'Unable to delete all tasks for case id:' or message contains 'Deleted some UNTERMINATED tasks:'"

  # Evaluated every 6 hours (360 minutes) so failures are reported early.
  frequency_in_minutes = "360"
  # One-day window (1440 minutes) because the data extract needs to run daily.
  time_window_in_minutes = "1440"

  # Threshold settings: GreaterThan 0, severity 2.
  trigger_threshold_operator = "GreaterThan"
  trigger_threshold          = "0"
  severity_level             = "2"

  # Notification target, taken from the wa-action-group module output.
  action_group_name    = module.wa-action-group.action_group_name
  custom_email_subject = "Alert: Task deletion failure in wa-${var.env}"
}
# Alert on failed health checks. The query matches traces whose message
# contains 'Liveness check failed' or 'Readiness check failed'.
module "wa-message-readiness-check-failure-alert" {
  source = "git@github.com:hmcts/cnp-module-metric-alert"

  # Identity and placement of the alert rule.
  enabled            = true
  alert_name         = "wa-case-event-handler-message-readiness-check-failure-alert"
  alert_desc         = "Alert when case event handler message readiness check fails and auto restart of pod happens"
  location           = var.location
  resourcegroup_name = azurerm_resource_group.rg.name
  common_tags        = var.common_tags

  # Log query and the Application Insights instance it is run against.
  app_insights_name  = "wa-${var.env}"
  app_insights_query = "traces | where message contains 'Liveness check failed' or message contains 'Readiness check failed'"

  # Evaluated every 3 hours (180 minutes) so failures are reported early.
  frequency_in_minutes = "180"
  # One-day window (1440 minutes); the original rationale given is that the
  # data extract needs to run daily.
  time_window_in_minutes = "1440"

  # Threshold settings: GreaterThan 0, severity 2.
  trigger_threshold_operator = "GreaterThan"
  trigger_threshold          = "0"
  severity_level             = "2"

  # Notification target, taken from the wa-action-group module output.
  action_group_name    = module.wa-action-group.action_group_name
  custom_email_subject = "Alert: Task readiness message checks failed wa-${var.env}"
}