Skip to content

Commit

Permalink
add env to queries, improve titles, fix queries
Browse files: browse the repository at this point in the history
  • Loading branch information
kmackowick committed Sep 24, 2024
1 parent aec6215 commit 950e2f6
Show file tree
Hide file tree
Showing 10 changed files with 103 additions and 103 deletions.
26 changes: 13 additions & 13 deletions aws/alb/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ locals {
resource "datadog_monitor" "http_5xx_responses" {
count = var.http_5xx_responses_enabled ? 1 : 0

name = join("", [local.title_prefix, "ALB 5xx Responses - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ALB 5xx Responses - {{loadbalancer.name}}", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -27,8 +27,8 @@ resource "datadog_monitor" "http_5xx_responses" {

query = <<END
min(${var.http_5xx_responses_evaluation_window}):
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account}.as_rate(), 1)
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {aws_account,env,loadbalancer,region}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${local.query_filter} by {aws_account,env,loadbalancer,region}.as_rate(), 1)
) * 100 > ${var.http_5xx_responses_threshold_critical}
END

Expand All @@ -41,7 +41,7 @@ END
resource "datadog_monitor" "http_5xx_tg_responses" {
count = var.http_5xx_tg_responses_enabled ? 1 : 0

name = join("", [local.title_prefix, "ALB Target Group 5xx Responses - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ALB Target Group 5xx Responses - {{loadbalancer.name}}", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -57,8 +57,8 @@ resource "datadog_monitor" "http_5xx_tg_responses" {

query = <<END
min(${var.http_5xx_tg_responses_evaluation_window}):
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account,targetgroup}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account,targetgroup}.as_rate(), 1)
default(avg:aws.applicationelb.httpcode_elb_5xx${local.query_filter} by {loadbalancer,region,aws_account,targetgroup,env}.as_rate(), 0) / (
default(avg:aws.applicationelb.request_count${local.query_filter} by {loadbalancer,region,aws_account,targetgroup,env}.as_rate(), 1)
) * 100 > ${var.http_5xx_tg_responses_threshold_critical}
END

Expand All @@ -72,7 +72,7 @@ END
resource "datadog_monitor" "latency" {
count = var.latency_enabled ? 1 : 0

name = join("", [local.title_prefix, "ALB latency - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "{{loadbalancer.name}} ALB latency - {{value}}s ", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -88,7 +88,7 @@ resource "datadog_monitor" "latency" {

query = <<END
avg(${var.latency_evaluation_window}):
default(avg:aws.applicationelb.target_response_time.average${local.query_filter} by {loadbalancer,region,aws_account}, 0
default(avg:aws.applicationelb.target_response_time.average${local.query_filter} by {aws_account,env,loadbalancer,region}, 0
) > ${var.latency_threshold_critical}
END

Expand All @@ -101,7 +101,7 @@ END
resource "datadog_monitor" "no_healthy_instances" {
count = var.no_healthy_instances_enabled ? 1 : 0

name = join("", [local.title_prefix, "ALB healthy instances - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "{{loadbalancer.name}} ALB healthy instances is at {{value}}%", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -117,10 +117,10 @@ resource "datadog_monitor" "no_healthy_instances" {

query = <<END
min(${var.no_healthy_instances_evaluation_window}): (
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {loadbalancer,region,aws_account} / (
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {loadbalancer,region,aws_account} +
sum:aws.applicationelb.un_healthy_host_count.maximum${local.query_filter} by {loadbalancer,region,aws_account} )
) <= ${var.no_healthy_instances_threshold_critical}
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {aws_account,env,region,loadbalancer} / (
sum:aws.applicationelb.healthy_host_count.minimum${local.query_filter} by {aws_account,env,region,loadbalancer} +
sum:aws.applicationelb.un_healthy_host_count.maximum${local.query_filter} by {aws_account,env,region,loadbalancer} )
) * 100 <= ${var.no_healthy_instances_threshold_critical}
END

monitor_thresholds {
Expand Down
16 changes: 8 additions & 8 deletions aws/ec2/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ locals {
resource "datadog_monitor" "status_failed_check" {
count = var.status_failed_check_enabled ? 1 : 0

name = join("", [local.title_prefix, "EC2 instance status - status check failure - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "EC2 instance status - status check failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -26,7 +26,7 @@ resource "datadog_monitor" "status_failed_check" {

query = <<END
max(${var.status_failed_check_evaluation_window}):
max:aws.ec2.status_check_failed${local.query_filter} by {instance_id,region,aws_account}
max:aws.ec2.status_check_failed${local.query_filter} by {aws_account,env,instance_id,name,region}
>= 1
END

Expand All @@ -38,7 +38,7 @@ END
resource "datadog_monitor" "status_failed_instance" {
count = var.status_failed_instance_enabled ? 1 : 0

name = join("", [local.title_prefix, "EC2 instance status - instance failure - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "EC2 instance status - instance failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -53,7 +53,7 @@ resource "datadog_monitor" "status_failed_instance" {

query = <<END
max(${var.status_failed_instance_evaluation_window}):
max:aws.ec2.status_check_failed_instance${local.query_filter} by {instance_id,region,aws_account}
max:aws.ec2.status_check_failed_instance${local.query_filter} by {aws_account,env,instance_id,name,region}
>= 1
END

Expand All @@ -65,7 +65,7 @@ END
resource "datadog_monitor" "status_failed_system" {
count = var.status_failed_system_enabled ? 1 : 0

name = join("", [local.title_prefix, "EC2 instance status - host failure - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "EC2 instance status - host failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -80,7 +80,7 @@ resource "datadog_monitor" "status_failed_system" {

query = <<END
max(${var.status_failed_system_evaluation_window}):
max:aws.ec2.status_check_failed_system${local.query_filter} by {instance_id,region,aws_account}
max:aws.ec2.status_check_failed_system${local.query_filter} by {aws_account,env,instance_id,name,region}
>= 1
END

Expand All @@ -92,7 +92,7 @@ END
resource "datadog_monitor" "status_failed_volume" {
count = var.status_failed_volume_enabled ? 1 : 0

name = join("", [local.title_prefix, "EC2 instance status - volume failure - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "EC2 instance status - volume failure - {{name.name}}({{instance_id.name}})", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -107,7 +107,7 @@ resource "datadog_monitor" "status_failed_volume" {

query = <<END
max(${var.status_failed_volume_evaluation_window}):
max:aws.ec2.status_check_failed_attached_ebs${local.query_filter} by {instance_id,region,aws_account}
max:aws.ec2.status_check_failed_attached_ebs${local.query_filter} by {aws_account,env,instance_id,name,region}
>= 1
END

Expand Down
24 changes: 11 additions & 13 deletions aws/ecs-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ locals {
resource "datadog_monitor" "agent_status" {
count = var.agent_status_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Cluster Agent Status - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS Agent disconnected - {{clustername.name}}", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
type = "query alert"
type = "service check"

evaluation_delay = var.evaluation_delay
new_group_delay = var.new_group_delay
Expand All @@ -26,11 +26,9 @@ resource "datadog_monitor" "agent_status" {
require_full_window = true
timeout_h = var.timeout_h

query = <<END
min(${var.agent_status_evaluation_window}):
aws.ecs.agent_connected${local.service_filter}.by("cluster", "instance_id").last(6).count_by_status()
>= ${var.agent_status_threshold_critical}
END
query = <<EOQ
"aws.ecs.agent_connected"${local.service_filter}.by("clustername","instance_id").last(6).count_by_status()
EOQ

monitor_thresholds {
critical = var.agent_status_threshold_critical
Expand All @@ -41,7 +39,7 @@ END
resource "datadog_monitor" "cpu_utilization" {
count = var.cpu_utilization_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Cluster CPU Utilization - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS Cluster CPU Utilization - {{clustername.name}} - {{value}}%", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -57,7 +55,7 @@ resource "datadog_monitor" "cpu_utilization" {

query = <<END
min(${var.cpu_utilization_evaluation_window}):
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account}
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account,env}
> ${var.cpu_utilization_threshold_critical}
END

Expand All @@ -70,7 +68,7 @@ END
resource "datadog_monitor" "cpu_utilization_anomaly" {
count = var.cpu_utilization_anomaly_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS cluster CPU utilization anomalous activity - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS cluster CPU utilization anomalous activity - {{clustername.name}}", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -86,7 +84,7 @@ resource "datadog_monitor" "cpu_utilization_anomaly" {

query = <<END
avg(${var.cpu_utilization_anomaly_evaluation_window}):anomalies(
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account}, 'agile', ${var.cpu_utilization_anomaly_deviations},
avg:aws.ecs.cluster.cpuutilization${local.query_filter} by {clustername,region,aws_account,env}, 'agile', ${var.cpu_utilization_anomaly_deviations},
direction='above', count_default_zero='true', interval=${var.cpu_utilization_anomaly_rollup},
seasonality='${var.cpu_utilization_anomaly_seasonality}'
) >= ${var.cpu_utilization_anomaly_threshold_critical}
Expand All @@ -106,7 +104,7 @@ END
resource "datadog_monitor" "memory_reservation" {
count = var.memory_reservation_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Cluster CPU Reservation - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS Cluster Memory Reservation High - {{clustername.name}} - {{value}}%", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -122,7 +120,7 @@ resource "datadog_monitor" "memory_reservation" {

query = <<END
min(${var.memory_reservation_evaluation_window}):
avg:aws.ecs.cluster.memory_reservation${local.query_filter} by {clustername,region,aws_account}
avg:aws.ecs.cluster.memory_reservation${local.query_filter} by {clustername,region,aws_account,env}
> ${var.memory_reservation_threshold_critical}
END

Expand Down
16 changes: 8 additions & 8 deletions aws/ecs-fargate/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ locals {
resource "datadog_monitor" "fargate_check" {
count = var.fargate_check_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Fargate task status check - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "Fargate service not responding", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand Down
Expand Up @@ -40,7 +40,7 @@ END
resource "datadog_monitor" "cpu_utilization" {
count = var.cpu_utilization_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Fargate task CPU utilization - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS Fargate task CPU utilization", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -56,7 +56,7 @@ resource "datadog_monitor" "cpu_utilization" {

query = <<END
avg(${var.cpu_utilization_evaluation_window}):
avg:ecs.fargate.cpu.percent${local.query_filter} by {ecs_container_name,task_family,region,aws_account}
avg:ecs.fargate.cpu.percent${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env}
> ${var.cpu_utilization_threshold_critical}
END

Expand All @@ -69,7 +69,7 @@ END
resource "datadog_monitor" "cpu_utilization_anomaly" {
count = var.cpu_utilization_anomaly_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS service CPU utilization anomalous activity - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS service CPU utilization anomalous activity", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -85,7 +85,7 @@ resource "datadog_monitor" "cpu_utilization_anomaly" {

query = <<END
avg(${var.cpu_utilization_anomaly_evaluation_window}):anomalies(
avg:ecs.fargate.cpu.percent${local.query_filter} by {servicename,region,aws_account}, 'agile', ${var.cpu_utilization_anomaly_deviations},
avg:ecs.fargate.cpu.percent${local.query_filter} by {servicename,region,aws_account,env}, 'agile', ${var.cpu_utilization_anomaly_deviations},
direction='above', count_default_zero='true', interval=${var.cpu_utilization_anomaly_rollup},
seasonality='${var.cpu_utilization_anomaly_seasonality}'
) >= ${var.cpu_utilization_anomaly_threshold_critical}
Expand All @@ -105,7 +105,7 @@ END
resource "datadog_monitor" "memory_utilization" {
count = var.memory_utilization_enabled ? 1 : 0

name = join("", [local.title_prefix, "ECS Fargate task memory utilization - {{host.name}}", local.title_suffix])
name = join("", [local.title_prefix, "ECS Fargate task memory utilization", local.title_suffix])
include_tags = true
message = local.query_alert_base_message
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
Expand All @@ -121,8 +121,8 @@ resource "datadog_monitor" "memory_utilization" {

query = <<END
avg(${var.memory_utilization_evaluation_window}):(
avg:ecs.fargate.mem.usage${local.query_filter} by {ecs_container_name,task_family,region,aws_account} /
avg:ecs.fargate.mem.limit${local.query_filter} by {ecs_container_name,task_family,region,aws_account}
avg:ecs.fargate.mem.usage${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env} /
avg:ecs.fargate.mem.limit${local.query_filter} by {ecs_container_name,task_family,region,aws_account,env}
) >= ${var.memory_utilization_threshold_critical}
END

Expand Down
Loading

0 comments on commit 950e2f6

Please sign in to comment.