Commit

Initial commit
echo-devnull committed May 5, 2021
1 parent d156f10 commit c04b4b6
Showing 36 changed files with 1,557 additions and 1 deletion.
28 changes: 27 additions & 1 deletion README.md
@@ -1 +1,27 @@
# terraform-datadog-kafka

[//]: # (This file is generated. Do not edit)

# kafka

TOC:
<!--ts-->
* [kafka](#kafka)
* [Module Variables](#module-variables)

<!-- Added by: sjuuljanssen, at: Sat 13 Mar 2021 15:51:21 CET -->

<!--te-->

## Module Variables

| variable | default | required | description |
|----------------------|----------|----------|--------------|
| env | | Yes | |
| alert_env | | Yes | |
| service | Kafka | No | |
| notification_channel | | Yes | |
| additional_tags | [] | No | |
| filter_str | | Yes | |
| is_hosted_service | False | No | |
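
For orientation, a minimal usage sketch follows. The module source ref, notification handle, tag values, and filter string below are illustrative placeholders, not values defined by this repository:

```hcl
module "kafka_monitors" {
  # Illustrative source; pin to a released tag of this repository.
  source = "git@github.com:kabisa/terraform-datadog-kafka.git?ref=x.y.z"

  env                  = "prd"                   # placeholder environment name
  alert_env            = "prd"                   # environment passed to the generated monitors
  service              = "Kafka"                 # module default, shown for completeness
  notification_channel = "@slack-kafka-alerts"   # placeholder Datadog notification handle
  additional_tags      = ["team:platform"]       # placeholder extra tags
  filter_str           = "env:prd,service:kafka" # default metric filter used by every monitor
  is_hosted_service    = false
}
```

Per-monitor behaviour can be tuned through the `*_enabled`, `*_warning`, `*_critical`, `*_evaluation_period`, `*_filter_override`, and `*_alerting_enabled` variables defined in the individual `*-variables.tf` files.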


57 changes: 57 additions & 0 deletions bytesin-high-variables.tf
@@ -0,0 +1,57 @@
variable "bytesin_high_enabled" {
type = bool
default = true
}

variable "bytesin_high_warning" {
type = number
default = 2500000
}

variable "bytesin_high_critical" {
type = number
default = 5000000
}

variable "bytesin_high_evaluation_period" {
type = string
default = "last_30m"
}

variable "bytesin_high_severity" {
type = string
default = "minor"
}

variable "bytesin_high_note" {
type = string
default = ""
}

variable "bytesin_high_docs" {
type = string
default = <<EOFF
NOTE: This is based on a baseline and might need adjusting further down the road.
Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Network throughput can affect Kafka’s performance if you are sending messages across data centers, if your topics have a large number of consumers, or if your replicas are catching up to their leaders. Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/
EOFF
}

variable "bytesin_high_filter_override" {
type = string
default = ""
}

variable "bytesin_high_alerting_enabled" {
type = bool
default = true
}

variable "bytesin_high_priority" {
description = "Number from 1 (high) to 5 (low)."

type = number
default = null
}
29 changes: 29 additions & 0 deletions bytesin-high.tf
@@ -0,0 +1,29 @@
locals {
bytesin_high_filter = var.bytesin_high_filter_override != "" ? var.bytesin_high_filter_override : var.filter_str
bytesin_high_notification_channel = var.bytesin_high_alerting_enabled ? var.notification_channel : ""
}

module "bytesin_high" {
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2"

name = "BytesIn unusually high"
query = "avg(${var.bytesin_high_evaluation_period}):avg:kafka.net.bytes_in.rate{${local.bytesin_high_filter}} by {host} > ${var.bytesin_high_critical}"
alert_message = "Kafka inbound network traffic ({{value}} bytes/s) is unusually high (threshold: {{threshold}})"
recovery_message = "Kafka inbound network traffic has returned to normal levels"

# monitor level vars
enabled = var.bytesin_high_enabled
alerting_enabled = var.bytesin_high_alerting_enabled
critical_threshold = var.bytesin_high_critical
warning_threshold = var.bytesin_high_warning
priority = var.bytesin_high_priority
severity = var.bytesin_high_severity
docs = var.bytesin_high_docs
note = var.bytesin_high_note

# module level vars
env = var.alert_env
service = var.service
notification_channel = var.notification_channel
additional_tags = var.additional_tags
}
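
As a sketch of how the per-monitor variables above compose (all values are placeholders): `bytesin_high_filter_override` replaces `filter_str` for this monitor only, the evaluation period and critical threshold are interpolated into the query above, and `bytesin_high_alerting_enabled` is passed through as `alerting_enabled` on the generic monitor. A caller might tune just this monitor like so:

```hcl
module "kafka_monitors" {
  source = "git@github.com:kabisa/terraform-datadog-kafka.git?ref=x.y.z" # illustrative ref

  # Required module variables (placeholder values, as in the README sketch).
  env                  = "prd"
  alert_env            = "prd"
  notification_channel = "@slack-kafka-alerts"
  filter_str           = "env:prd,service:kafka"

  # Narrow the BytesIn monitor to one cluster; the other monitors keep filter_str.
  bytesin_high_filter_override = "env:prd,service:kafka,cluster:ingest"

  # Raise the thresholds (bytes/s) for a cluster with a higher inbound baseline.
  bytesin_high_warning  = 10000000
  bytesin_high_critical = 20000000

  # Smooth out short bursts with a longer evaluation window.
  bytesin_high_evaluation_period = "last_1h"

  # Keep the monitor defined in Datadog but detach its notifications for now.
  bytesin_high_alerting_enabled = false
}
```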
57 changes: 57 additions & 0 deletions bytesout_high-variables.tf
@@ -0,0 +1,57 @@
variable "bytesout_high_enabled" {
type = bool
default = true
}

variable "bytesout_high_warning" {
type = number
default = 2500000
}

variable "bytesout_high_critical" {
type = number
default = 5000000
}

variable "bytesout_high_evaluation_period" {
type = string
default = "last_30m"
}

variable "bytesout_high_severity" {
type = string
default = "minor"
}

variable "bytesout_high_note" {
type = string
default = ""
}

variable "bytesout_high_docs" {
type = string
default = <<EOFF
NOTE: This is based on a baseline and might need adjusting further down the road.
Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Network throughput can affect Kafka’s performance if you are sending messages across data centers, if your topics have a large number of consumers, or if your replicas are catching up to their leaders. Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/
EOFF
}

variable "bytesout_high_filter_override" {
type = string
default = ""
}

variable "bytesout_high_alerting_enabled" {
type = bool
default = true
}

variable "bytesout_high_priority" {
description = "Number from 1 (high) to 5 (low)."

type = number
default = null
}
29 changes: 29 additions & 0 deletions bytesout_high.tf
@@ -0,0 +1,29 @@
locals {
bytesout_high_filter = var.bytesout_high_filter_override != "" ? var.bytesout_high_filter_override : var.filter_str
bytesout_high_notification_channel = var.bytesout_high_alerting_enabled ? var.notification_channel : ""
}

module "bytesout_high" {
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2"

name = "BytesOut unusually high"
query = "avg(${var.bytesout_high_evaluation_period}):avg:kafka.net.bytes_out.rate{${local.bytesout_high_filter}} by {host} > ${var.bytesout_high_critical}"
alert_message = "Kafka outbound network traffic ({{value}} bytes/s) is unusually high (threshold: {{threshold}})"
recovery_message = "Kafka outbound network traffic has returned to normal levels"

# monitor level vars
enabled = var.bytesout_high_enabled
alerting_enabled = var.bytesout_high_alerting_enabled
critical_threshold = var.bytesout_high_critical
warning_threshold = var.bytesout_high_warning
priority = var.bytesout_high_priority
severity = var.bytesout_high_severity
docs = var.bytesout_high_docs
note = var.bytesout_high_note

# module level vars
env = var.alert_env
service = var.service
notification_channel = var.notification_channel
additional_tags = var.additional_tags
}
58 changes: 58 additions & 0 deletions fetch_purgatory_size-variables.tf
@@ -0,0 +1,58 @@
variable "fetch_purgatory_size_enabled" {
type = bool
default = true
}

variable "fetch_purgatory_size_warning" {
type = number
default = 70000
}

variable "fetch_purgatory_size_critical" {
type = number
default = 100000
}

variable "fetch_purgatory_size_evaluation_period" {
type = string
default = "last_30m"
}

variable "fetch_purgatory_size_severity" {
type = string
default = "minor"
}

variable "fetch_purgatory_size_note" {
type = string
default = ""
}

variable "fetch_purgatory_size_docs" {
type = string
default = <<EOFF
The request purgatory serves as a temporary holding pen for produce and fetch requests waiting to be satisfied.
Fetch requests are added to purgatory if there is not enough data to fulfill the request (fetch.min.bytes on consumers), and they remain there until the time specified by fetch.wait.max.ms is reached or enough data becomes available.
Keeping an eye on the size of purgatory is useful to determine the underlying causes of latency. Increases in consumer fetch times, for example, can be easily explained if there is a corresponding increase in the number of fetch requests in purgatory.
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/
EOFF
}

variable "fetch_purgatory_size_filter_override" {
type = string
default = ""
}

variable "fetch_purgatory_size_alerting_enabled" {
type = bool
default = true
}

variable "fetch_purgatory_size_priority" {
description = "Number from 1 (high) to 5 (low)."

type = number
default = null
}
29 changes: 29 additions & 0 deletions fetch_purgatory_size.tf
@@ -0,0 +1,29 @@
locals {
fetch_purgatory_size_filter = var.fetch_purgatory_size_filter_override != "" ? var.fetch_purgatory_size_filter_override : var.filter_str
fetch_purgatory_size_notification_channel = var.fetch_purgatory_size_alerting_enabled ? var.notification_channel : ""
}

module "fetch_purgatory_size" {
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2"

name = "Fetch Purgatory Size"
query = "avg(${var.fetch_purgatory_size_evaluation_period}):max:kafka.request.fetch_request_purgatory.size{${local.fetch_purgatory_size_filter}} by {host} > ${var.fetch_purgatory_size_critical}"
alert_message = "Fetch purgatory size ({{value}}) is larger than usual (threshold: {{threshold}})"
recovery_message = "Fetch purgatory size ({{value}}) has recovered"

# monitor level vars
enabled = var.fetch_purgatory_size_enabled
alerting_enabled = var.fetch_purgatory_size_alerting_enabled
critical_threshold = var.fetch_purgatory_size_critical
warning_threshold = var.fetch_purgatory_size_warning
priority = var.fetch_purgatory_size_priority
severity = var.fetch_purgatory_size_severity
docs = var.fetch_purgatory_size_docs
note = var.fetch_purgatory_size_note

# module level vars
env = var.alert_env
service = var.service
notification_channel = var.notification_channel
additional_tags = var.additional_tags
}
55 changes: 55 additions & 0 deletions in_sync_nodes_dropped-variables.tf
@@ -0,0 +1,55 @@
variable "in_sync_nodes_dropped_enabled" {
type = bool
default = true
}

variable "in_sync_nodes_dropped_warning" {
type = number
default = 0
}

variable "in_sync_nodes_dropped_critical" {
type = number
default = 0
}

variable "in_sync_nodes_dropped_evaluation_period" {
type = string
default = "last_5m"
}

variable "in_sync_nodes_dropped_severity" {
type = string
default = "major"
}

variable "in_sync_nodes_dropped_note" {
type = string
default = ""
}

variable "in_sync_nodes_dropped_docs" {
type = string
default = <<EOFF
The number of in-sync replicas (ISRs) for a particular partition should remain fairly static, except when you are expanding your broker cluster or removing partitions. In order to maintain high availability, a healthy Kafka cluster requires a minimum number of ISRs for failover. A replica could be removed from the ISR pool if it has not contacted the leader for some time (configurable with the replica.socket.timeout.ms parameter). You should investigate any flapping in the values of these metrics, and any increase in IsrShrinksPerSec without a corresponding increase in IsrExpandsPerSec shortly thereafter.
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/
EOFF
}

variable "in_sync_nodes_dropped_filter_override" {
type = string
default = ""
}

variable "in_sync_nodes_dropped_alerting_enabled" {
type = bool
default = true
}

variable "in_sync_nodes_dropped_priority" {
description = "Number from 1 (high) to 5 (low)."

type = number
default = null
}
29 changes: 29 additions & 0 deletions in_sync_nodes_dropped.tf
@@ -0,0 +1,29 @@
locals {
in_sync_nodes_dropped_filter = var.in_sync_nodes_dropped_filter_override != "" ? var.in_sync_nodes_dropped_filter_override : var.filter_str
in_sync_nodes_dropped_notification_channel = var.in_sync_nodes_dropped_alerting_enabled ? var.notification_channel : ""
}

module "in_sync_nodes_dropped" {
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2"

name = "In Sync Nodes dropped"
query = "avg(${var.in_sync_nodes_dropped_evaluation_period}):max:kafka.replication.isr_shrinks.rate{${local.in_sync_nodes_dropped_filter}} by {aiven-service} - max:kafka.replication.isr_expands.rate{${local.in_sync_nodes_dropped_filter}} by {aiven-service} > ${var.in_sync_nodes_dropped_critical}"
alert_message = "More in-sync replicas were dropped (ISR shrinks) than added (ISR expands) over the evaluation period"
recovery_message = ""

# monitor level vars
enabled = var.in_sync_nodes_dropped_enabled
alerting_enabled = var.in_sync_nodes_dropped_alerting_enabled
critical_threshold = var.in_sync_nodes_dropped_critical
# warning_threshold = var.in_sync_nodes_dropped_warning
priority = var.in_sync_nodes_dropped_priority
severity = var.in_sync_nodes_dropped_severity
docs = var.in_sync_nodes_dropped_docs
note = var.in_sync_nodes_dropped_note

# module level vars
env = var.alert_env
service = var.service
notification_channel = var.notification_channel
additional_tags = var.additional_tags
}