-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d156f10
commit c04b4b6
Showing
36 changed files
with
1,557 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,27 @@ | ||
# terraform-datadog-kafka | ||
|
||
[//]: # (This file is generated. Do not edit) | ||
|
||
# kafka | ||
|
||
TOC: | ||
<!--ts--> | ||
* [kafka](#kafka) | ||
* [Module Variables](#module-variables) | ||
|
||
<!-- Added by: sjuuljanssen, at: za 13 mrt 2021 15:51:21 CET --> | ||
|
||
<!--te--> | ||
|
||
## Module Variables | ||
|
||
| variable | default | required | description | | ||
|----------------------|----------|----------|--------------| | ||
| env | | Yes | | | ||
| alert_env | | Yes | | | ||
| service | Kafka | No | | | ||
| notification_channel | | Yes | | | ||
| additional_tags | [] | No | | | ||
| filter_str | | Yes | | | ||
| is_hosted_service | False | No | | | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# Inputs for the "BytesIn unusually high" monitor. Each value below is consumed by the bytesin_high locals/module. | ||
# Passed to the generic monitor module as `enabled`. | ||
variable "bytesin_high_enabled" { | ||
type = bool | ||
default = true | ||
} | ||
|
||
# Passed to the generic monitor module as `warning_threshold`. | ||
variable "bytesin_high_warning" { | ||
type = number | ||
default = 2500000 | ||
} | ||
|
||
# Passed as `critical_threshold` and also interpolated into the monitor query as the comparison value. | ||
variable "bytesin_high_critical" { | ||
type = number | ||
default = 5000000 | ||
} | ||
|
||
# Interpolated into the query as the evaluation window, i.e. avg(<evaluation_period>). | ||
variable "bytesin_high_evaluation_period" { | ||
type = string | ||
default = "last_30m" | ||
} | ||
|
||
# Passed to the generic monitor module as `severity`. | ||
variable "bytesin_high_severity" { | ||
type = string | ||
default = "minor" | ||
} | ||
|
||
# Passed to the generic monitor module as `note`. | ||
variable "bytesin_high_note" { | ||
type = string | ||
default = "" | ||
} | ||
|
||
# Passed to the generic monitor module as `docs`; the default carries background text and a reference link. | ||
variable "bytesin_high_docs" { | ||
type = string | ||
default = <<EOFF | ||
NOTE: This is based on a baseline and might need adjusting further down the road. | ||
Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Network throughput can affect Kafka’s performance if you are sending messages across data centers, if your topics have a large number of consumers, or if your replicas are catching up to their leaders. Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages. | ||
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/ | ||
EOFF | ||
} | ||
|
||
# When non-empty, replaces var.filter_str as the tag filter used in this monitor's query. | ||
variable "bytesin_high_filter_override" { | ||
type = string | ||
default = "" | ||
} | ||
|
||
# Passed to the generic monitor module as `alerting_enabled`. | ||
variable "bytesin_high_alerting_enabled" { | ||
type = bool | ||
default = true | ||
} | ||
|
||
# Passed to the generic monitor module as `priority`; null leaves it unset. | ||
variable "bytesin_high_priority" { | ||
description = "Number from 1 (high) to 5 (low)." | ||
|
||
type = number | ||
default = null | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Locals for the BytesIn monitor. | ||
# bytesin_high_filter: a non-empty per-monitor override wins, otherwise the module-wide filter_str is used. | ||
# NOTE(review): bytesin_high_notification_channel is not referenced in this file; the module below | ||
# receives var.notification_channel directly. Confirm that alerting_enabled alone is meant to mute notifications. | ||
locals { | ||
bytesin_high_filter = var.bytesin_high_filter_override != "" ? var.bytesin_high_filter_override : var.filter_str | ||
bytesin_high_notification_channel = var.bytesin_high_alerting_enabled ? var.notification_channel : "" | ||
} | ||
|
||
# Monitor on kafka.net.bytes_in.rate: averaged over the evaluation period, grouped by host, | ||
# and compared against the critical threshold. | ||
module "bytesin_high" { | ||
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2" | ||
|
||
name = "BytesIn unusually high" | ||
query = "avg(${var.bytesin_high_evaluation_period}):avg:kafka.net.bytes_in.rate{${local.bytesin_high_filter}} by {host} > ${var.bytesin_high_critical}" | ||
# Messages describe Kafka inbound traffic (previously copied from an unrelated APM spans monitor). | ||
alert_message = "Kafka is receiving an unusually high amount of network traffic ({{value}}), above the threshold of {{threshold}}" | ||
recovery_message = "Kafka incoming network traffic has recovered" | ||
|
||
# monitor level vars | ||
enabled = var.bytesin_high_enabled | ||
alerting_enabled = var.bytesin_high_alerting_enabled | ||
critical_threshold = var.bytesin_high_critical | ||
warning_threshold = var.bytesin_high_warning | ||
priority = var.bytesin_high_priority | ||
severity = var.bytesin_high_severity | ||
docs = var.bytesin_high_docs | ||
note = var.bytesin_high_note | ||
|
||
# module level vars | ||
env = var.alert_env | ||
service = var.service | ||
notification_channel = var.notification_channel | ||
additional_tags = var.additional_tags | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# Inputs for the "BytesOut unusually high" monitor. Each value below is consumed by the bytesout_high locals/module. | ||
# Passed to the generic monitor module as `enabled`. | ||
variable "bytesout_high_enabled" { | ||
type = bool | ||
default = true | ||
} | ||
|
||
# Passed to the generic monitor module as `warning_threshold`. | ||
variable "bytesout_high_warning" { | ||
type = number | ||
default = 2500000 | ||
} | ||
|
||
# Passed as `critical_threshold` and also interpolated into the monitor query as the comparison value. | ||
variable "bytesout_high_critical" { | ||
type = number | ||
default = 5000000 | ||
} | ||
|
||
# Interpolated into the query as the evaluation window, i.e. avg(<evaluation_period>). | ||
variable "bytesout_high_evaluation_period" { | ||
type = string | ||
default = "last_30m" | ||
} | ||
|
||
# Passed to the generic monitor module as `severity`. | ||
variable "bytesout_high_severity" { | ||
type = string | ||
default = "minor" | ||
} | ||
|
||
# Passed to the generic monitor module as `note`. | ||
variable "bytesout_high_note" { | ||
type = string | ||
default = "" | ||
} | ||
|
||
# Passed to the generic monitor module as `docs`; the default carries background text and a reference link. | ||
variable "bytesout_high_docs" { | ||
type = string | ||
default = <<EOFF | ||
NOTE: This is based on a baseline and might need adjusting further down the road. | ||
Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Network throughput can affect Kafka’s performance if you are sending messages across data centers, if your topics have a large number of consumers, or if your replicas are catching up to their leaders. Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages. | ||
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/ | ||
EOFF | ||
} | ||
|
||
# When non-empty, replaces var.filter_str as the tag filter used in this monitor's query. | ||
variable "bytesout_high_filter_override" { | ||
type = string | ||
default = "" | ||
} | ||
|
||
# Passed to the generic monitor module as `alerting_enabled`. | ||
variable "bytesout_high_alerting_enabled" { | ||
type = bool | ||
default = true | ||
} | ||
|
||
# Passed to the generic monitor module as `priority`; null leaves it unset. | ||
variable "bytesout_high_priority" { | ||
description = "Number from 1 (high) to 5 (low)." | ||
|
||
type = number | ||
default = null | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Locals for the BytesOut monitor. | ||
# bytesout_high_filter: a non-empty per-monitor override wins, otherwise the module-wide filter_str is used. | ||
# NOTE(review): bytesout_high_notification_channel is not referenced in this file; the module below | ||
# receives var.notification_channel directly. Confirm that alerting_enabled alone is meant to mute notifications. | ||
locals { | ||
bytesout_high_filter = var.bytesout_high_filter_override != "" ? var.bytesout_high_filter_override : var.filter_str | ||
bytesout_high_notification_channel = var.bytesout_high_alerting_enabled ? var.notification_channel : "" | ||
} | ||
|
||
# Monitor on kafka.net.bytes_out.rate: averaged over the evaluation period, grouped by host, | ||
# and compared against the critical threshold. | ||
module "bytesout_high" { | ||
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2" | ||
|
||
name = "BytesOut unusually high" | ||
query = "avg(${var.bytesout_high_evaluation_period}):avg:kafka.net.bytes_out.rate{${local.bytesout_high_filter}} by {host} > ${var.bytesout_high_critical}" | ||
# Messages describe Kafka outbound traffic (the recovery text was previously copied from an APM spans monitor). | ||
alert_message = "Kafka is sending an unusually high amount of network traffic" | ||
recovery_message = "Kafka outgoing network traffic has recovered" | ||
|
||
# monitor level vars | ||
enabled = var.bytesout_high_enabled | ||
alerting_enabled = var.bytesout_high_alerting_enabled | ||
critical_threshold = var.bytesout_high_critical | ||
warning_threshold = var.bytesout_high_warning | ||
priority = var.bytesout_high_priority | ||
severity = var.bytesout_high_severity | ||
docs = var.bytesout_high_docs | ||
note = var.bytesout_high_note | ||
|
||
# module level vars | ||
env = var.alert_env | ||
service = var.service | ||
notification_channel = var.notification_channel | ||
additional_tags = var.additional_tags | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# Inputs for the "Fetch Purgatory Size" monitor. Each value below is consumed by the fetch_purgatory_size locals/module. | ||
# Passed to the generic monitor module as `enabled`. | ||
variable "fetch_purgatory_size_enabled" { | ||
type = bool | ||
default = true | ||
} | ||
|
||
# Passed to the generic monitor module as `warning_threshold`. | ||
variable "fetch_purgatory_size_warning" { | ||
type = number | ||
default = 70000 | ||
} | ||
|
||
# Passed as `critical_threshold` and also interpolated into the monitor query as the comparison value. | ||
variable "fetch_purgatory_size_critical" { | ||
type = number | ||
default = 100000 | ||
} | ||
|
||
# Interpolated into the query as the evaluation window, i.e. avg(<evaluation_period>). | ||
variable "fetch_purgatory_size_evaluation_period" { | ||
type = string | ||
default = "last_30m" | ||
} | ||
|
||
# Passed to the generic monitor module as `severity`. | ||
variable "fetch_purgatory_size_severity" { | ||
type = string | ||
default = "minor" | ||
} | ||
|
||
# Passed to the generic monitor module as `note`. | ||
variable "fetch_purgatory_size_note" { | ||
type = string | ||
default = "" | ||
} | ||
|
||
# Passed to the generic monitor module as `docs`; the default carries background text and a reference link. | ||
variable "fetch_purgatory_size_docs" { | ||
type = string | ||
default = <<EOFF | ||
The request purgatory serves as a temporary holding pen for produce and fetch requests waiting to be satisfied. | ||
Fetch requests are added to purgatory if there is not enough data to fulfill the request (fetch.min.bytes on consumers) until the time specified by fetch.wait.max.ms is reached or enough data becomes available | ||
Keeping an eye on the size of purgatory is useful to determine the underlying causes of latency. Increases in consumer fetch times, for example, can be easily explained if there is a corresponding increase in the number of fetch requests in purgatory. | ||
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/ | ||
EOFF | ||
} | ||
|
||
# When non-empty, replaces var.filter_str as the tag filter used in this monitor's query. | ||
variable "fetch_purgatory_size_filter_override" { | ||
type = string | ||
default = "" | ||
} | ||
|
||
# Passed to the generic monitor module as `alerting_enabled`. | ||
variable "fetch_purgatory_size_alerting_enabled" { | ||
type = bool | ||
default = true | ||
} | ||
|
||
# Passed to the generic monitor module as `priority`; null leaves it unset. | ||
variable "fetch_purgatory_size_priority" { | ||
description = "Number from 1 (high) to 5 (low)." | ||
|
||
type = number | ||
default = null | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Locals for the Fetch Purgatory Size monitor. | ||
# fetch_purgatory_size_filter: a non-empty per-monitor override wins, otherwise the module-wide filter_str is used. | ||
# NOTE(review): fetch_purgatory_size_notification_channel is not referenced in this file; the module below | ||
# receives var.notification_channel directly. Confirm that alerting_enabled alone is meant to mute notifications. | ||
locals { | ||
fetch_purgatory_size_filter = var.fetch_purgatory_size_filter_override != "" ? var.fetch_purgatory_size_filter_override : var.filter_str | ||
fetch_purgatory_size_notification_channel = var.fetch_purgatory_size_alerting_enabled ? var.notification_channel : "" | ||
} | ||
|
||
# Monitor on kafka.request.fetch_request_purgatory.size: max per host, averaged over the | ||
# evaluation period, and compared against the critical threshold. | ||
module "fetch_purgatory_size" { | ||
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2" | ||
|
||
name = "Fetch Purgatory Size" | ||
query = "avg(${var.fetch_purgatory_size_evaluation_period}):max:kafka.request.fetch_request_purgatory.size{${local.fetch_purgatory_size_filter}} by {host} > ${var.fetch_purgatory_size_critical}" | ||
alert_message = "Purgatory size ({{value}}) larger than usual." | ||
# Uses the {{value}} template variable with a balanced parenthesis (was "{{values}}" with no closing paren). | ||
recovery_message = "Purgatory size ({{value}}) recovered" | ||
|
||
# monitor level vars | ||
enabled = var.fetch_purgatory_size_enabled | ||
alerting_enabled = var.fetch_purgatory_size_alerting_enabled | ||
critical_threshold = var.fetch_purgatory_size_critical | ||
warning_threshold = var.fetch_purgatory_size_warning | ||
priority = var.fetch_purgatory_size_priority | ||
severity = var.fetch_purgatory_size_severity | ||
docs = var.fetch_purgatory_size_docs | ||
note = var.fetch_purgatory_size_note | ||
|
||
# module level vars | ||
env = var.alert_env | ||
service = var.service | ||
notification_channel = var.notification_channel | ||
additional_tags = var.additional_tags | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# Inputs for the "In Sync Nodes dropped" monitor. Each value below is consumed by the in_sync_nodes_dropped locals/module. | ||
# Passed to the generic monitor module as `enabled`. | ||
variable "in_sync_nodes_dropped_enabled" { | ||
type = bool | ||
default = true | ||
} | ||
|
||
# NOTE(review): the module block has its warning_threshold assignment commented out, | ||
# so this value appears to be unused at the moment — confirm before relying on it. | ||
variable "in_sync_nodes_dropped_warning" { | ||
type = number | ||
default = 0 | ||
} | ||
|
||
# Passed as `critical_threshold` and also interpolated into the monitor query as the comparison value. | ||
variable "in_sync_nodes_dropped_critical" { | ||
type = number | ||
default = 0 | ||
} | ||
|
||
# Interpolated into the query as the evaluation window, i.e. avg(<evaluation_period>). | ||
variable "in_sync_nodes_dropped_evaluation_period" { | ||
type = string | ||
default = "last_5m" | ||
} | ||
|
||
# Passed to the generic monitor module as `severity`. | ||
variable "in_sync_nodes_dropped_severity" { | ||
type = string | ||
default = "major" | ||
} | ||
|
||
# Passed to the generic monitor module as `note`. | ||
variable "in_sync_nodes_dropped_note" { | ||
type = string | ||
default = "" | ||
} | ||
|
||
# Passed to the generic monitor module as `docs`; the default carries background text and a reference link. | ||
variable "in_sync_nodes_dropped_docs" { | ||
type = string | ||
default = <<EOFF | ||
The number of in-sync replicas (ISRs) for a particular partition should remain fairly static, except when you are expanding your broker cluster or removing partitions. In order to maintain high availability, a healthy Kafka cluster requires a minimum number of ISRs for failover. A replica could be removed from the ISR pool if it has not contacted the leader for some time (configurable with the replica.socket.timeout.ms parameter). You should investigate any flapping in the values of these metrics, and any increase in IsrShrinksPerSec without a corresponding increase in IsrExpandsPerSec shortly thereafter. | ||
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/ | ||
EOFF | ||
} | ||
|
||
# When non-empty, replaces var.filter_str as the tag filter used in this monitor's query. | ||
variable "in_sync_nodes_dropped_filter_override" { | ||
type = string | ||
default = "" | ||
} | ||
|
||
# Passed to the generic monitor module as `alerting_enabled`. | ||
variable "in_sync_nodes_dropped_alerting_enabled" { | ||
type = bool | ||
default = true | ||
} | ||
|
||
# Passed to the generic monitor module as `priority`; null leaves it unset. | ||
variable "in_sync_nodes_dropped_priority" { | ||
description = "Number from 1 (high) to 5 (low)." | ||
|
||
type = number | ||
default = null | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Locals for the In Sync Nodes dropped monitor. | ||
# in_sync_nodes_dropped_filter: a non-empty per-monitor override wins, otherwise the module-wide filter_str is used. | ||
# NOTE(review): in_sync_nodes_dropped_notification_channel is not referenced in this file; the module below | ||
# receives var.notification_channel directly. Confirm that alerting_enabled alone is meant to mute notifications. | ||
locals { | ||
in_sync_nodes_dropped_filter = var.in_sync_nodes_dropped_filter_override != "" ? var.in_sync_nodes_dropped_filter_override : var.filter_str | ||
in_sync_nodes_dropped_notification_channel = var.in_sync_nodes_dropped_alerting_enabled ? var.notification_channel : "" | ||
} | ||
|
||
# Monitor on the difference between ISR shrinks and ISR expands: the query subtracts | ||
# kafka.replication.isr_expands.rate from kafka.replication.isr_shrinks.rate (max of each), | ||
# averaged over the evaluation period, and compares the result against the critical threshold. | ||
# NOTE(review): both series are grouped by the hard-coded `aiven-service` tag, unlike the | ||
# per-host grouping used by the other monitors — presumably specific to Aiven-hosted Kafka; confirm. | ||
module "in_sync_nodes_dropped" { | ||
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2" | ||
|
||
name = "In Sync Nodes dropped" | ||
query = "avg(${var.in_sync_nodes_dropped_evaluation_period}):max:kafka.replication.isr_shrinks.rate{${local.in_sync_nodes_dropped_filter}} by {aiven-service} - max:kafka.replication.isr_expands.rate{${local.in_sync_nodes_dropped_filter}} by {aiven-service} > ${var.in_sync_nodes_dropped_critical}" | ||
alert_message = "The number of in Sync Nodes dropped compared to the number of In Sync Nodes that were added" | ||
# No recovery text is configured for this monitor. | ||
recovery_message = "" | ||
|
||
# monitor level vars | ||
enabled = var.in_sync_nodes_dropped_enabled | ||
alerting_enabled = var.in_sync_nodes_dropped_alerting_enabled | ||
critical_threshold = var.in_sync_nodes_dropped_critical | ||
# Warning threshold is intentionally left unset here; presumably because the warning and | ||
# critical defaults are both 0 — TODO confirm. | ||
# warning_threshold = var.in_sync_nodes_dropped_warning | ||
priority = var.in_sync_nodes_dropped_priority | ||
severity = var.in_sync_nodes_dropped_severity | ||
docs = var.in_sync_nodes_dropped_docs | ||
note = var.in_sync_nodes_dropped_note | ||
|
||
# module level vars | ||
env = var.alert_env | ||
service = var.service | ||
notification_channel = var.notification_channel | ||
additional_tags = var.additional_tags | ||
} |
Oops, something went wrong.