Commit

Initial commit
echo-devnull committed May 5, 2021
1 parent d156f10 commit c04b4b6
Showing 36 changed files with 1,557 additions and 1 deletion.
28 changes: 27 additions & 1 deletion README.md
@@ -1 +1,27 @@
# terraform-datadog-kafka

[//]: # (This file is generated. Do not edit)

# kafka

TOC:
<!--ts-->
* [kafka](#kafka)
* [Module Variables](#module-variables)

<!-- Added by: sjuuljanssen, at: Sat 13 Mar 2021 15:51:21 CET -->

<!--te-->

## Module Variables

| variable | default | required | description |
|----------------------|----------|----------|--------------|
| env | | Yes | |
| alert_env | | Yes | |
| service | Kafka | No | |
| notification_channel | | Yes | |
| additional_tags | [] | No | |
| filter_str | | Yes | |
| is_hosted_service | False | No | |
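
For orientation, a minimal usage sketch follows. The module source ref, notification handle, tag values, and filter string below are illustrative placeholders, not values defined by this repository:

```hcl
module "kafka_monitors" {
  # Illustrative source; pin to a released tag of this repository.
  source = "git@github.com:kabisa/terraform-datadog-kafka.git?ref=x.y.z"

  env                  = "prd"                   # placeholder environment name
  alert_env            = "prd"                   # environment passed to the generated monitors
  service              = "Kafka"                 # module default, shown for completeness
  notification_channel = "@slack-kafka-alerts"   # placeholder Datadog notification handle
  additional_tags      = ["team:platform"]       # placeholder extra tags
  filter_str           = "env:prd,service:kafka" # default metric filter used by every monitor
  is_hosted_service    = false
}
```

Per-monitor behaviour can be tuned through the `*_enabled`, `*_warning`, `*_critical`, `*_evaluation_period`, `*_filter_override`, and `*_alerting_enabled` variables defined in the individual `*-variables.tf` files.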


57 changes: 57 additions & 0 deletions bytesin-high-variables.tf
@@ -0,0 +1,57 @@
variable "bytesin_high_enabled" {
type = bool
default = true
}

variable "bytesin_high_warning" {
type = number
default = 2500000
}

variable "bytesin_high_critical" {
type = number
default = 5000000
}

variable "bytesin_high_evaluation_period" {
type = string
default = "last_30m"
}

variable "bytesin_high_severity" {
type = string
default = "minor"
}

variable "bytesin_high_note" {
type = string
default = ""
}

variable "bytesin_high_docs" {
type = string
default = <<EOFF
NOTE: This is based on a baseline and might need adjusting further down the road.
Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Network throughput can affect Kafka’s performance if you are sending messages across data centers, if your topics have a large number of consumers, or if your replicas are catching up to their leaders. Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/
EOFF
}

variable "bytesin_high_filter_override" {
type = string
default = ""
}

variable "bytesin_high_alerting_enabled" {
type = bool
default = true
}

variable "bytesin_high_priority" {
description = "Number from 1 (high) to 5 (low)."

type = number
default = null
}
29 changes: 29 additions & 0 deletions bytesin-high.tf
@@ -0,0 +1,29 @@
locals {
bytesin_high_filter = var.bytesin_high_filter_override != "" ? var.bytesin_high_filter_override : var.filter_str
bytesin_high_notification_channel = var.bytesin_high_alerting_enabled ? var.notification_channel : ""
}

module "bytesin_high" {
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2"

name = "BytesIn unusually high"
query = "avg(${var.bytesin_high_evaluation_period}):avg:kafka.net.bytes_in.rate{${local.bytesin_high_filter}} by {host} > ${var.bytesin_high_critical}"
alert_message = "Kafka inbound network traffic ({{value}} bytes/s) is unusually high (threshold: {{threshold}})"
recovery_message = "Kafka inbound network traffic has returned to normal levels"

# monitor level vars
enabled = var.bytesin_high_enabled
alerting_enabled = var.bytesin_high_alerting_enabled
critical_threshold = var.bytesin_high_critical
warning_threshold = var.bytesin_high_warning
priority = var.bytesin_high_priority
severity = var.bytesin_high_severity
docs = var.bytesin_high_docs
note = var.bytesin_high_note

# module level vars
env = var.alert_env
service = var.service
notification_channel = var.notification_channel
additional_tags = var.additional_tags
}
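
As a sketch of how the per-monitor variables above compose (all values are placeholders): `bytesin_high_filter_override` replaces `filter_str` for this monitor only, the evaluation period and critical threshold are interpolated into the query above, and `bytesin_high_alerting_enabled` is passed through as `alerting_enabled` on the generic monitor. A caller might tune just this monitor like so:

```hcl
module "kafka_monitors" {
  source = "git@github.com:kabisa/terraform-datadog-kafka.git?ref=x.y.z" # illustrative ref

  # Required module variables (placeholder values, as in the README sketch).
  env                  = "prd"
  alert_env            = "prd"
  notification_channel = "@slack-kafka-alerts"
  filter_str           = "env:prd,service:kafka"

  # Narrow the BytesIn monitor to one cluster; the other monitors keep filter_str.
  bytesin_high_filter_override = "env:prd,service:kafka,cluster:ingest"

  # Raise the thresholds (bytes/s) for a cluster with a higher inbound baseline.
  bytesin_high_warning  = 10000000
  bytesin_high_critical = 20000000

  # Smooth out short bursts with a longer evaluation window.
  bytesin_high_evaluation_period = "last_1h"

  # Keep the monitor defined in Datadog but detach its notifications for now.
  bytesin_high_alerting_enabled = false
}
```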
57 changes: 57 additions & 0 deletions bytesout_high-variables.tf
@@ -0,0 +1,57 @@
variable "bytesout_high_enabled" {
type = bool
default = true
}

variable "bytesout_high_warning" {
type = number
default = 2500000
}

variable "bytesout_high_critical" {
type = number
default = 5000000
}

variable "bytesout_high_evaluation_period" {
type = string
default = "last_30m"
}

variable "bytesout_high_severity" {
type = string
default = "minor"
}

variable "bytesout_high_note" {
type = string
default = ""
}

variable "bytesout_high_docs" {
type = string
default = <<EOFF
NOTE: This is based on a baseline and might need adjusting further down the road.
Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Network throughput can affect Kafka’s performance if you are sending messages across data centers, if your topics have a large number of consumers, or if your replicas are catching up to their leaders. Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/
EOFF
}

variable "bytesout_high_filter_override" {
type = string
default = ""
}

variable "bytesout_high_alerting_enabled" {
type = bool
default = true
}

variable "bytesout_high_priority" {
description = "Number from 1 (high) to 5 (low)."

type = number
default = null
}
29 changes: 29 additions & 0 deletions bytesout_high.tf
@@ -0,0 +1,29 @@
locals {
bytesout_high_filter = var.bytesout_high_filter_override != "" ? var.bytesout_high_filter_override : var.filter_str
bytesout_high_notification_channel = var.bytesout_high_alerting_enabled ? var.notification_channel : ""
}

module "bytesout_high" {
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2"

name = "BytesOut unusually high"
query = "avg(${var.bytesout_high_evaluation_period}):avg:kafka.net.bytes_out.rate{${local.bytesout_high_filter}} by {host} > ${var.bytesout_high_critical}"
alert_message = "Kafka outbound network traffic ({{value}} bytes/s) is unusually high (threshold: {{threshold}})"
recovery_message = "Kafka outbound network traffic has returned to normal levels"

# monitor level vars
enabled = var.bytesout_high_enabled
alerting_enabled = var.bytesout_high_alerting_enabled
critical_threshold = var.bytesout_high_critical
warning_threshold = var.bytesout_high_warning
priority = var.bytesout_high_priority
severity = var.bytesout_high_severity
docs = var.bytesout_high_docs
note = var.bytesout_high_note

# module level vars
env = var.alert_env
service = var.service
notification_channel = var.notification_channel
additional_tags = var.additional_tags
}
58 changes: 58 additions & 0 deletions fetch_purgatory_size-variables.tf
@@ -0,0 +1,58 @@
variable "fetch_purgatory_size_enabled" {
type = bool
default = true
}

variable "fetch_purgatory_size_warning" {
type = number
default = 70000
}

variable "fetch_purgatory_size_critical" {
type = number
default = 100000
}

variable "fetch_purgatory_size_evaluation_period" {
type = string
default = "last_30m"
}

variable "fetch_purgatory_size_severity" {
type = string
default = "minor"
}

variable "fetch_purgatory_size_note" {
type = string
default = ""
}

variable "fetch_purgatory_size_docs" {
type = string
default = <<EOFF
The request purgatory serves as a temporary holding pen for produce and fetch requests waiting to be satisfied.
Fetch requests are added to purgatory if there is not enough data to fulfill the request (fetch.min.bytes on consumers), and they remain there until the time specified by fetch.wait.max.ms is reached or enough data becomes available.
Keeping an eye on the size of purgatory is useful to determine the underlying causes of latency. Increases in consumer fetch times, for example, can be easily explained if there is a corresponding increase in the number of fetch requests in purgatory.
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/
EOFF
}

variable "fetch_purgatory_size_filter_override" {
type = string
default = ""
}

variable "fetch_purgatory_size_alerting_enabled" {
type = bool
default = true
}

variable "fetch_purgatory_size_priority" {
description = "Number from 1 (high) to 5 (low)."

type = number
default = null
}
29 changes: 29 additions & 0 deletions fetch_purgatory_size.tf
@@ -0,0 +1,29 @@
locals {
fetch_purgatory_size_filter = var.fetch_purgatory_size_filter_override != "" ? var.fetch_purgatory_size_filter_override : var.filter_str
fetch_purgatory_size_notification_channel = var.fetch_purgatory_size_alerting_enabled ? var.notification_channel : ""
}

module "fetch_purgatory_size" {
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2"

name = "Fetch Purgatory Size"
query = "avg(${var.fetch_purgatory_size_evaluation_period}):max:kafka.request.fetch_request_purgatory.size{${local.fetch_purgatory_size_filter}} by {host} > ${var.fetch_purgatory_size_critical}"
alert_message = "Fetch purgatory size ({{value}}) is larger than usual (threshold: {{threshold}})"
recovery_message = "Fetch purgatory size ({{value}}) has recovered"

# monitor level vars
enabled = var.fetch_purgatory_size_enabled
alerting_enabled = var.fetch_purgatory_size_alerting_enabled
critical_threshold = var.fetch_purgatory_size_critical
warning_threshold = var.fetch_purgatory_size_warning
priority = var.fetch_purgatory_size_priority
severity = var.fetch_purgatory_size_severity
docs = var.fetch_purgatory_size_docs
note = var.fetch_purgatory_size_note

# module level vars
env = var.alert_env
service = var.service
notification_channel = var.notification_channel
additional_tags = var.additional_tags
}
55 changes: 55 additions & 0 deletions in_sync_nodes_dropped-variables.tf
@@ -0,0 +1,55 @@
variable "in_sync_nodes_dropped_enabled" {
type = bool
default = true
}

variable "in_sync_nodes_dropped_warning" {
type = number
default = 0
}

variable "in_sync_nodes_dropped_critical" {
type = number
default = 0
}

variable "in_sync_nodes_dropped_evaluation_period" {
type = string
default = "last_5m"
}

variable "in_sync_nodes_dropped_severity" {
type = string
default = "major"
}

variable "in_sync_nodes_dropped_note" {
type = string
default = ""
}

variable "in_sync_nodes_dropped_docs" {
type = string
default = <<EOFF
The number of in-sync replicas (ISRs) for a particular partition should remain fairly static, except when you are expanding your broker cluster or removing partitions. In order to maintain high availability, a healthy Kafka cluster requires a minimum number of ISRs for failover. A replica could be removed from the ISR pool if it has not contacted the leader for some time (configurable with the replica.socket.timeout.ms parameter). You should investigate any flapping in the values of these metrics, and any increase in IsrShrinksPerSec without a corresponding increase in IsrExpandsPerSec shortly thereafter.
https://www.datadoghq.com/blog/monitoring-kafka-performance-metrics/
EOFF
}

variable "in_sync_nodes_dropped_filter_override" {
type = string
default = ""
}

variable "in_sync_nodes_dropped_alerting_enabled" {
type = bool
default = true
}

variable "in_sync_nodes_dropped_priority" {
description = "Number from 1 (high) to 5 (low)."

type = number
default = null
}
29 changes: 29 additions & 0 deletions in_sync_nodes_dropped.tf
@@ -0,0 +1,29 @@
locals {
in_sync_nodes_dropped_filter = var.in_sync_nodes_dropped_filter_override != "" ? var.in_sync_nodes_dropped_filter_override : var.filter_str
in_sync_nodes_dropped_notification_channel = var.in_sync_nodes_dropped_alerting_enabled ? var.notification_channel : ""
}

module "in_sync_nodes_dropped" {
source = "git@github.com:kabisa/terraform-datadog-generic-monitor.git?ref=0.5.2"

name = "In Sync Nodes dropped"
query = "avg(${var.in_sync_nodes_dropped_evaluation_period}):max:kafka.replication.isr_shrinks.rate{${local.in_sync_nodes_dropped_filter}} by {aiven-service} - max:kafka.replication.isr_expands.rate{${local.in_sync_nodes_dropped_filter}} by {aiven-service} > ${var.in_sync_nodes_dropped_critical}"
alert_message = "More in-sync replicas were dropped (ISR shrinks) than added (ISR expands) over the evaluation period"
recovery_message = ""

# monitor level vars
enabled = var.in_sync_nodes_dropped_enabled
alerting_enabled = var.in_sync_nodes_dropped_alerting_enabled
critical_threshold = var.in_sync_nodes_dropped_critical
# warning_threshold = var.in_sync_nodes_dropped_warning
priority = var.in_sync_nodes_dropped_priority
severity = var.in_sync_nodes_dropped_severity
docs = var.in_sync_nodes_dropped_docs
note = var.in_sync_nodes_dropped_note

# module level vars
env = var.alert_env
service = var.service
notification_channel = var.notification_channel
additional_tags = var.additional_tags
}