From 732c43b13a24eea91e4b91eb26b56c78750ede5d Mon Sep 17 00:00:00 2001 From: Andrew Ng Date: Wed, 17 Apr 2024 09:08:03 +0200 Subject: [PATCH 01/12] Introduced custom actions.py setup --- README.md | 12 +++++++++--- lambda_cw_alarm_creator.tf | 18 +++++++++++++++++- locals.tf | 6 +++--- variables.tf | 6 ++++++ zip.tf | 10 ++++++++++ 5 files changed, 45 insertions(+), 7 deletions(-) create mode 100644 zip.tf diff --git a/README.md b/README.md index f1b0095..fa67c07 100644 --- a/README.md +++ b/README.md @@ -15,10 +15,10 @@ The file contains the alarms per service. In the example below you see the EC2 service that contains the CPU Utilization alarm. This will create the CPU Utilization alarm for every EC2 instance. ``` "EC2" : { <- Service - "CPUUtilization": { <- Alarmname - "AlarmThresholds" : { + "CPUUtilization": { <- Alarmname + "AlarmThresholds" : { "priority": ["P1", "P2", "P3"], <- for every priority there needs to be a threshold and vice versa - "alarm_threshold": ["90", "80", "75"] + "alarm_threshold": ["90", "80", "75"] }, "ComparisonOperator" : "GreaterThanThreshold", "Description" : { <- Description is used for naming the alarm in cloudwatch @@ -76,6 +76,7 @@ module "observability_sender" { | Name | Version | |------|---------| +| [archive](#provider\_archive) | n/a | | [aws](#provider\_aws) | > 4.3.0 | ## Modules @@ -98,20 +99,24 @@ module "observability_sender" { | [aws_cloudwatch_event_target.lambda_target](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_event_target.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_kms_grant.give_lambda_role_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/kms_grant) | resource | +| [aws_lambda_layer_version.custom_actions](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_layer_version) | 
resource | | [aws_lambda_permission.allow_eventbridge](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_lambda_permission.allow_eventbridge_instance_terminate_rule](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_lambda_permission.payload_forwarder](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_sns_topic.notification_receiver](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic) | resource | | [aws_sns_topic_policy.allow_lambda_sns_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic_policy) | resource | | [aws_sns_topic_subscription.lambda_eventbridge_forwarder](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sns_topic_subscription) | resource | +| [archive_file.custom_action](https://registry.terraform.io/providers/hashicorp/archive/latest/docs/data-sources/file) | data source | | [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_iam_policy_document.cloudwatch_alarms](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.eventbus](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.kms](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.lambda_cw_alarm_creator_dlq_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | 
[aws_iam_policy_document.lambda_ec2_read_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.lambda_ecs_read_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.lambda_monitoring_account_sqs_access_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.lambda_payload_forwarder_dlq_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.lambda_rds_read_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.sns_topic_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source | | [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | @@ -123,6 +128,7 @@ module "observability_sender" { | [eventbridge\_rules](#input\_eventbridge\_rules) | EventBridge rule settings. |
map(object({
description : string
state : string
event_pattern : string
})
)
| `{}` | no | | [kms\_key\_arn](#input\_kms\_key\_arn) | ARN of the KMS key. | `string` | n/a | yes | | [monitoring\_account\_configuration](#input\_monitoring\_account\_configuration) | Configuration settings of the monitoring account. |
object({
sqs_name = string
sqs_region = string
sqs_account = number
})
| n/a | yes | +| [source\_directory\_location](#input\_source\_directory\_location) | Source Directory location for the custom alarm creator actions.py. | `string` | `null` | no | | [sqs\_dlq\_arn](#input\_sqs\_dlq\_arn) | ARN of the Dead Letter Queue. | `string` | n/a | yes | ## Outputs diff --git a/lambda_cw_alarm_creator.tf b/lambda_cw_alarm_creator.tf index 7dbe04e..916aa9d 100644 --- a/lambda_cw_alarm_creator.tf +++ b/lambda_cw_alarm_creator.tf @@ -18,12 +18,28 @@ module "lambda_cw_alarm_creator" { source_file_name = null environment_variables = { - SNS_ARN = "${aws_sns_topic.notification_receiver.arn}" + SNS_ARN = "${aws_sns_topic.notification_receiver.arn}" + CUSTOM_ALERT_ACTION = var.source_directory_location != null ? true : false } sqs_dlq_arn = var.sqs_dlq_arn } +# Create Lambda layer to host custom actions.py + +resource "aws_lambda_layer_version" "custom_actions" { + count = var.source_directory_location != null ? 1 : 0 + + layer_name = "alarm_creator_custom_alert_actions" + descriptions = "Contains a customer specific actions.py used for the alarm_creator" + + filename = data.archive_file.custom_action[0].output_path + + source_code_hash = data.archive_file.custom_action[0].output_base64sha256 + + compatible_runtimes = ["python3.9"] +} + # Cron job event rule directly tied to lambda function. 
resource "aws_cloudwatch_event_rule" "refresh_alarms" { name = "refresh-cloudwatch-alarms-rule" diff --git a/locals.tf b/locals.tf index 5e3b1bb..5434b25 100644 --- a/locals.tf +++ b/locals.tf @@ -12,9 +12,9 @@ locals { "event_pattern" : jsonencode({ "source" : ["aws.cloudwatch"], "detail-type" : ["CloudWatch Alarm State Change"], - "detail": { - "configuration": { - "description": [ { "anything-but": "Autoscaling_alarm" } ] + "detail" : { + "configuration" : { + "description" : [{ "anything-but" : "Autoscaling_alarm" }] } } }) diff --git a/variables.tf b/variables.tf index 1b3d0c8..5f5bd19 100644 --- a/variables.tf +++ b/variables.tf @@ -29,3 +29,9 @@ variable "monitoring_account_configuration" { sqs_account = number }) } + +variable "source_directory_location" { + description = "Source Directory location for the custom alarm creator actions.py." + type = string + default = null +} diff --git a/zip.tf b/zip.tf new file mode 100644 index 0000000..c4b0094 --- /dev/null +++ b/zip.tf @@ -0,0 +1,10 @@ +# stolen from https://github.com/hashicorp/terraform/issues/8344 + +data "archive_file" "custom_action" { + count = var.source_directory_location != null ? 
1 : 0 + + type = "zip" + source_dir = var.source_directory_location + output_path = "${path.module}/lambda_function_custom_actions.zip" # include name to prevent overwrite when module is reused + output_file_mode = "0666" # cross platform consistent output +} From c77093e0a630c10fa0055bb16fdf1a9446fb99a3 Mon Sep 17 00:00:00 2001 From: Andrew Ng Date: Wed, 17 Apr 2024 09:12:11 +0200 Subject: [PATCH 02/12] Added logic to actions.py to check if custom file is needed or not --- alarm_creator/actions.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/alarm_creator/actions.py b/alarm_creator/actions.py index d5d211b..bd7a394 100644 --- a/alarm_creator/actions.py +++ b/alarm_creator/actions.py @@ -1,7 +1,10 @@ -import boto3, json +import boto3, json, subprocess from pip import main +# environment_variables +custom_alert_action = os.environ['CUSTOM_ALERT_ACTION'] + # Create boto3 clients CWclient = boto3.client("cloudwatch") ec2 = boto3.resource("ec2") @@ -9,9 +12,16 @@ ec2client = boto3.client("ec2") ecsclient = boto3.client("ecs") -# Load json file containing the alarms -with open('./alarms.json') as alarms_file: - alarms = json.load(alarms_file) +# Create Lambda layer create if statement to choose which one depending on which variable is enabled. + + +# Load json file containing the alarms, checks if it needs to use a custom alarms json or defaul json. 
+if custom_alert_action == True: + with open('./custom_alarms.json') as alarms_file: + alarms = json.load(alarms_file) +else: + with open('./alarms.json') as alarms_file: + alarms = json.load(alarms_file) # Alarm creator def AWS_Alarms(): @@ -30,7 +40,7 @@ def AWS_Alarms(): response = CWclient.list_metrics(Namespace=f"{alarms[service][alarm]['Namespace']}", RecentlyActive='PT3H',) for metrics in response["Metrics"]: - # Check if any of the found metricnames are equal to metric names in alarms file + # Check if any of the found metricnames are equal to metric names in alarms file if metrics["MetricName"] == alarms[service][alarm]['MetricName']: for dimensions in metrics["Dimensions"]: if dimensions["Name"] == alarms[service][alarm]['Dimensions']: @@ -43,21 +53,21 @@ def AWS_Alarms(): cw_threshold = int(threshold) * 1000000 else: cw_threshold = int(threshold) - + # Handling dimensions instanceDimensions = { - "Name": f"{dimensions['Name']}", + "Name": f"{dimensions['Name']}", "Value": f"{dimensions['Value']}" } dimensionlist = [] - # For disk alarms there are more dimensions than other alarms - try: + # For disk alarms there are more dimensions than other alarms + try: for item in alarms[service][alarm]['DiskDimensions']: dimensionlist.append(item) except KeyError: # dimensionlist = [] dimensionlist.insert(0, instanceDimensions) - + for instance in instances: # Create alarms @@ -114,13 +124,13 @@ def DeleteAlarms(): RunningInstances = GetRunningInstances() RunningRDSInstances = GetRunningDBInstances() RunningClusters = GetRunningClusters() - + # collect alarm metrics and compare alarm metric instanceId with instance id's in array. if the state reason is breaching and instance does not exist delete alarm. 
for metricalarm in get_alarm_info["MetricAlarms"]: instance_id = list(filter(lambda x: x["Name"] == "InstanceId", metricalarm["Dimensions"])) rds_instance_name = list(filter(lambda x: x["Name"] == "DBInstanceIdentifier", metricalarm["Dimensions"])) cluster_name = list(filter(lambda x: x["Name"] == "ClusterName", metricalarm["Dimensions"])) - + if len(instance_id) == 1: if instance_id[0]["Value"] not in RunningInstances: CWclient.delete_alarms(AlarmNames=[metricalarm["AlarmName"]]) From ef63638548e7a4b3ba3c8613750cd3c34b71cc5d Mon Sep 17 00:00:00 2001 From: Andrew Ng Date: Wed, 17 Apr 2024 09:39:14 +0200 Subject: [PATCH 03/12] Added Lambda layer to alarm creator lambda function --- lambda_cw_alarm_creator.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lambda_cw_alarm_creator.tf b/lambda_cw_alarm_creator.tf index 916aa9d..e4fe588 100644 --- a/lambda_cw_alarm_creator.tf +++ b/lambda_cw_alarm_creator.tf @@ -17,6 +17,8 @@ module "lambda_cw_alarm_creator" { source_directory_location = "${path.module}/alarm_creator/" source_file_name = null + layers = var.source_directory_location != null ? [aws_lambda_layer_version.arn] : null + environment_variables = { SNS_ARN = "${aws_sns_topic.notification_receiver.arn}" CUSTOM_ALERT_ACTION = var.source_directory_location != null ? 
true : false From dbd746fba645dcbec2beef63bb92fdddfaf89993 Mon Sep 17 00:00:00 2001 From: Andrew Ng Date: Wed, 17 Apr 2024 10:20:56 +0200 Subject: [PATCH 04/12] Updated lambda cw alarm creator source --- README.md | 2 +- lambda_cw_alarm_creator.tf | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fa67c07..3cd5045 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ module "observability_sender" { |------|--------|---------| | [iam\_role\_lambda\_cw\_alarm\_creator](#module\_iam\_role\_lambda\_cw\_alarm\_creator) | git@github.com:TechNative-B-V/modules-aws.git//identity_and_access_management/iam_role | v1.1.7 | | [iam\_role\_lambda\_payload\_forwarder](#module\_iam\_role\_lambda\_payload\_forwarder) | git@github.com:TechNative-B-V/modules-aws.git//identity_and_access_management/iam_role | v1.1.7 | -| [lambda\_cw\_alarm\_creator](#module\_lambda\_cw\_alarm\_creator) | git@github.com:TechNative-B-V/modules-aws.git//lambda | v1.1.7 | +| [lambda\_cw\_alarm\_creator](#module\_lambda\_cw\_alarm\_creator) | git@github.com:wearetechnative/terraform-aws-lambda.git | 13eda5f9e8ae40e51f66a45837cd41a6b35af988 | | [lambda\_payload\_forwarder](#module\_lambda\_payload\_forwarder) | git@github.com:TechNative-B-V/modules-aws.git//lambda | v1.1.7 | ## Resources diff --git a/lambda_cw_alarm_creator.tf b/lambda_cw_alarm_creator.tf index e4fe588..224ca8c 100644 --- a/lambda_cw_alarm_creator.tf +++ b/lambda_cw_alarm_creator.tf @@ -1,6 +1,7 @@ module "lambda_cw_alarm_creator" { # Pinned to a tag but needs to be updated once we add an official release tag. 
- source = "git@github.com:TechNative-B-V/modules-aws.git//lambda?ref=v1.1.7" + #source = "git@github.com:TechNative-B-V/modules-aws.git//lambda?ref=v1.1.7" + source = "git@github.com:wearetechnative/terraform-aws-lambda.git?ref=13eda5f9e8ae40e51f66a45837cd41a6b35af988" name = local.lambda_cw_alarm_name From 5cd1da8fc723f531e96294fcc5fe4b88ff76c6f8 Mon Sep 17 00:00:00 2001 From: Andrew Ng Date: Wed, 17 Apr 2024 10:23:20 +0200 Subject: [PATCH 05/12] Fixed incorrect attributes --- lambda_cw_alarm_creator.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lambda_cw_alarm_creator.tf b/lambda_cw_alarm_creator.tf index 224ca8c..10334d7 100644 --- a/lambda_cw_alarm_creator.tf +++ b/lambda_cw_alarm_creator.tf @@ -18,7 +18,7 @@ module "lambda_cw_alarm_creator" { source_directory_location = "${path.module}/alarm_creator/" source_file_name = null - layers = var.source_directory_location != null ? [aws_lambda_layer_version.arn] : null + layers = var.source_directory_location != null ? [aws_lambda_layer_version.custom_actions.arn] : null environment_variables = { SNS_ARN = "${aws_sns_topic.notification_receiver.arn}" @@ -33,8 +33,8 @@ module "lambda_cw_alarm_creator" { resource "aws_lambda_layer_version" "custom_actions" { count = var.source_directory_location != null ? 
1 : 0 - layer_name = "alarm_creator_custom_alert_actions" - descriptions = "Contains a customer specific actions.py used for the alarm_creator" + layer_name = "alarm_creator_custom_alert_actions" + description = "Contains a customer specific actions.py used for the alarm_creator" filename = data.archive_file.custom_action[0].output_path From 20e185ddefc49bfb674b86a390a7061464c45938 Mon Sep 17 00:00:00 2001 From: Andrew Ng Date: Wed, 17 Apr 2024 10:24:48 +0200 Subject: [PATCH 06/12] fixed layer reference --- lambda_cw_alarm_creator.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambda_cw_alarm_creator.tf b/lambda_cw_alarm_creator.tf index 10334d7..b9a0822 100644 --- a/lambda_cw_alarm_creator.tf +++ b/lambda_cw_alarm_creator.tf @@ -18,7 +18,7 @@ module "lambda_cw_alarm_creator" { source_directory_location = "${path.module}/alarm_creator/" source_file_name = null - layers = var.source_directory_location != null ? [aws_lambda_layer_version.custom_actions.arn] : null + layers = var.source_directory_location != null ? [aws_lambda_layer_version.custom_actions[0].arn] : null environment_variables = { SNS_ARN = "${aws_sns_topic.notification_receiver.arn}" From 20e283d21253a1391c8d55fd4a4572e075f10300 Mon Sep 17 00:00:00 2001 From: Andrew Ng Date: Wed, 17 Apr 2024 10:28:51 +0200 Subject: [PATCH 07/12] fixed alarm creator function --- alarm_creator/actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alarm_creator/actions.py b/alarm_creator/actions.py index bd7a394..2271d89 100644 --- a/alarm_creator/actions.py +++ b/alarm_creator/actions.py @@ -16,7 +16,7 @@ # Load json file containing the alarms, checks if it needs to use a custom alarms json or defaul json. 
-if custom_alert_action == True: +if custom_alert_action == "true": with open('./custom_alarms.json') as alarms_file: alarms = json.load(alarms_file) else: From abe4a1e6e5fc496b0215001f13d0880fb992581b Mon Sep 17 00:00:00 2001 From: Andrew Ng Date: Wed, 17 Apr 2024 11:32:46 +0200 Subject: [PATCH 08/12] updated alarms json logic --- alarm_creator/actions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/alarm_creator/actions.py b/alarm_creator/actions.py index 2271d89..34dedb6 100644 --- a/alarm_creator/actions.py +++ b/alarm_creator/actions.py @@ -15,9 +15,9 @@ # Create Lambda layer create if statement to choose which one depending on which variable is enabled. -# Load json file containing the alarms, checks if it needs to use a custom alarms json or defaul json. +# Load json file containing the alarms, checks if it needs to use a custom alarms json or default json. if custom_alert_action == "true": - with open('./custom_alarms.json') as alarms_file: + with open('/opt/custom_alarms.json') as alarms_file: alarms = json.load(alarms_file) else: with open('./alarms.json') as alarms_file: From 55a964fa01e7e2282325ff5ee6bcef8e0055f253 Mon Sep 17 00:00:00 2001 From: Andrew Ng Date: Wed, 17 Apr 2024 11:41:43 +0200 Subject: [PATCH 09/12] added OS library to actions.py --- alarm_creator/actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alarm_creator/actions.py b/alarm_creator/actions.py index 34dedb6..88ff1b2 100644 --- a/alarm_creator/actions.py +++ b/alarm_creator/actions.py @@ -1,4 +1,4 @@ -import boto3, json, subprocess +import boto3, json, subprocess, os from pip import main From 602140f4087773ed57ed06b70dbd6fe2ae528662 Mon Sep 17 00:00:00 2001 From: "andrew@technative.eu" Date: Thu, 15 Aug 2024 14:59:16 +0200 Subject: [PATCH 10/12] exclude pre-commit json check --- .pre-commit-config.yaml | 2 +- README.md | 1 + alarm_creator/actions.py | 104 +++++++++++++++++--------------- alarm_creator/alarms.json | 44 +++++++------- 
lambda_cw_alarm_creator.tf | 4 +- lambda_cw_alarm_creator_role.tf | 12 ++++ 6 files changed, 94 insertions(+), 73 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5277321..712572f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,4 +16,4 @@ repos: - id: trailing-whitespace - id: detect-aws-credentials - id: check-json - - id: pretty-format-json \ No newline at end of file + # - id: pretty-format-json diff --git a/README.md b/README.md index 3cd5045..bee6280 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,7 @@ module "observability_sender" { | [aws_iam_policy_document.lambda_cw_alarm_creator_dlq_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.lambda_ec2_read_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.lambda_ecs_read_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.lambda_elasticache_read_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.lambda_monitoring_account_sqs_access_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.lambda_payload_forwarder_dlq_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.lambda_rds_read_access](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | diff --git a/alarm_creator/actions.py b/alarm_creator/actions.py index 88ff1b2..8e7e522 100644 --- a/alarm_creator/actions.py +++ b/alarm_creator/actions.py @@ -11,6 
+11,7 @@ rds = boto3.client("rds") ec2client = boto3.client("ec2") ecsclient = boto3.client("ecs") +elasticlient = boto3.client("elasticache") # Create Lambda layer create if statement to choose which one depending on which variable is enabled. @@ -34,58 +35,57 @@ def AWS_Alarms(): instances = GetRunningDBInstances() elif service == "ECS": instances = GetRunningClusters() - for alarm in alarms[service]: + elif service == "ElastiCache": + instances = GetRunningCacheClusters() + elif service == "CWAgent": + instances = GetRunningInstances() + for alarm in alarms[service]: # Query the namespaces in CloudWatch Metrics - response = CWclient.list_metrics(Namespace=f"{alarms[service][alarm]['Namespace']}", RecentlyActive='PT3H',) + response = CWclient.list_metrics(Namespace=f"{alarms[service][alarm]['Namespace']}", RecentlyActive='PT3H') for metrics in response["Metrics"]: - - # Check if any of the found metricnames are equal to metric names in alarms file + # Check if any of the found metric names are equal to metric names in alarms file if metrics["MetricName"] == alarms[service][alarm]['MetricName']: - for dimensions in metrics["Dimensions"]: - if dimensions["Name"] == alarms[service][alarm]['Dimensions']: - for priority, threshold in zip(alarms[service][alarm]['AlarmThresholds']["priority"], alarms[service][alarm]['AlarmThresholds']["alarm_threshold"]): - - # To make alarmnames pretty, 'MB/GB' is used instead of 1000000/1000000000 bytes, needs to be in bytes for actual threshold - if alarms[service][alarm]['Description']['ThresholdUnit'] == "GB": - cw_threshold = int(threshold) * 1000000000 - elif alarms[service][alarm]['Description']['ThresholdUnit'] == "MB": - cw_threshold = int(threshold) * 1000000 - else: - cw_threshold = int(threshold) - - # Handling dimensions - instanceDimensions = { - "Name": f"{dimensions['Name']}", - "Value": f"{dimensions['Value']}" - } - dimensionlist = [] - # For disk alarms there are more dimensions than other alarms - try: - for item in 
alarms[service][alarm]['DiskDimensions']: - dimensionlist.append(item) - except KeyError: # - dimensionlist = [] - dimensionlist.insert(0, instanceDimensions) - - for instance in instances: - - # Create alarms - CWclient.put_metric_alarm( - AlarmName=f"{instance}-{alarm} {alarms[service][alarm]['Description']['Operatorsymbol']} {threshold} {alarms[service][alarm]['Description']['ThresholdUnit']}", - ComparisonOperator=alarms[service][alarm]['ComparisonOperator'], - EvaluationPeriods=alarms[service][alarm]['EvaluationPeriods'], - MetricName=alarms[service][alarm]['MetricName'], - Namespace=alarms[service][alarm]['Namespace'], - Period=alarms[service][alarm]['Period'], - Statistic=alarms[service][alarm]['Statistic'], - Threshold=cw_threshold, - ActionsEnabled=True, - TreatMissingData=alarms[service][alarm]['TreatMissingData'], - AlarmDescription=f"{priority}", - Dimensions=dimensionlist, - Tags=[{"Key": "CreatedbyLambda", "Value": "True"}], - ) + for priority, threshold in zip(alarms[service][alarm]['AlarmThresholds']["priority"], alarms[service][alarm]['AlarmThresholds']["alarm_threshold"]): + # Convert thresholds to bytes if needed + if alarms[service][alarm]['Description']['ThresholdUnit'] == "GB": + cw_threshold = int(threshold) * 1000000000 + elif alarms[service][alarm]['Description']['ThresholdUnit'] == "MB": + cw_threshold = int(threshold) * 1000000 + else: + cw_threshold = int(threshold) + + # Handling dimensions + for instance in instances: + + instanceDimensions = { + "Name": f"{alarms[service][alarm]['Dimensions']}", + "Value": instance + } + + # Initialize the dimension list + dimensionlist = [instanceDimensions] + + # Add any additional disk-related dimensions if present + if 'ExtraDimensions' in alarms[service][alarm]: + dimensionlist.extend(alarms[service][alarm]['ExtraDimensions']) + + # Create the alarms + CWclient.put_metric_alarm( + AlarmName=f"{instance}-{alarm} {alarms[service][alarm]['Description']['Operatorsymbol']} {threshold} 
{alarms[service][alarm]['Description']['ThresholdUnit']}", + ComparisonOperator=alarms[service][alarm]['ComparisonOperator'], + EvaluationPeriods=alarms[service][alarm]['EvaluationPeriods'], + MetricName=alarms[service][alarm]['MetricName'], + Namespace=alarms[service][alarm]['Namespace'], + Period=alarms[service][alarm]['Period'], + Statistic=alarms[service][alarm]['Statistic'], + Threshold=cw_threshold, + ActionsEnabled=True, + TreatMissingData=alarms[service][alarm]['TreatMissingData'], + AlarmDescription=f"{priority}", + Dimensions=dimensionlist, + Tags=[{"Key": "CreatedbyLambda", "Value": "True"}], + ) def GetRunningInstances(): get_running_instances = ec2client.describe_instances( @@ -119,6 +119,14 @@ def GetRunningClusters(): return RunningClusterNames +def GetRunningCacheClusters(): + get_running_cacheclusters = elasticlient.describe_cache_clusters() + RunningCacheClusters = [] + for cachecluster in get_running_cacheclusters["CacheClusters"]: + RunningCacheClusters.append(cachecluster['CacheClusterId']) + + return RunningCacheClusters + def DeleteAlarms(): get_alarm_info = CWclient.describe_alarms() RunningInstances = GetRunningInstances() diff --git a/alarm_creator/alarms.json b/alarm_creator/alarms.json index f5108ec..2df8ee9 100644 --- a/alarm_creator/alarms.json +++ b/alarm_creator/alarms.json @@ -3,7 +3,7 @@ "CPUUtilization": { "AlarmThresholds" : { "priority": ["P1", "P2", "P3"], - "alarm_threshold": ["90", "80", "75"] + "alarm_threshold": ["90", "80", "75"] }, "ComparisonOperator" : "GreaterThanThreshold", "Description" : { @@ -20,16 +20,16 @@ } }, - "CWAgent" : { + "CWAgent" : { "mem_used_percent": { "AlarmThresholds" : { "priority": ["P1", "P2", "P3"], - "alarm_threshold": ["90", "80", "75"] + "alarm_threshold": ["90", "80", "75"] }, "ComparisonOperator" : "GreaterThanThreshold", "Description" : { "Operatorsymbol" : ">", - "ThresholdUnit" : "%" + "ThresholdUnit" : "%" }, "EvaluationPeriods" : 2, "MetricName" : "mem_used_percent", @@ -43,12 +43,12 
@@ "disk_used_percent_root": { "AlarmThresholds" : { "priority": ["P1", "P2", "P3"], - "alarm_threshold": ["90", "80", "75"] + "alarm_threshold": ["90", "80", "75"] }, "ComparisonOperator" : "GreaterThanThreshold", "Description" : { "Operatorsymbol" : ">", - "ThresholdUnit" : "%" + "ThresholdUnit" : "%" }, "EvaluationPeriods" : 2, "MetricName" : "disk_used_percent", @@ -57,7 +57,7 @@ "Statistic" : "Average", "TreatMissingData" : "breaching", "Dimensions" : "InstanceId", - "DiskDimensions": [ + "ExtraDimensions": [ { "Name": "path", "Value": "/" @@ -66,7 +66,7 @@ "Name": "device", "Value": "nvme0n1p1" }, - { + { "Name": "fstype", "Value": "ext4" } @@ -76,12 +76,12 @@ "disk_used_percent_sys_fs": { "AlarmThresholds" : { "priority": ["P1", "P2", "P3"], - "alarm_threshold": ["90", "80", "75"] + "alarm_threshold": ["90", "80", "75"] }, "ComparisonOperator" : "GreaterThanThreshold", "Description" : { "Operatorsymbol" : ">", - "ThresholdUnit" : "%" + "ThresholdUnit" : "%" }, "EvaluationPeriods" : 2, "MetricName" : "disk_used_percent", @@ -90,7 +90,7 @@ "Statistic" : "Average", "TreatMissingData" : "breaching", "Dimensions" : "InstanceId", - "DiskDimensions": [ + "ExtraDimensions": [ { "Name": "path", "Value": "/sys/fs/cgroup" @@ -109,12 +109,12 @@ "disk_used_percent_dev": { "AlarmThresholds" : { "priority": ["P1", "P2", "P3"], - "alarm_threshold": ["90", "80", "75"] + "alarm_threshold": ["90", "80", "75"] }, "ComparisonOperator" : "GreaterThanThreshold", "Description" : { "Operatorsymbol" : ">", - "ThresholdUnit" : "%" + "ThresholdUnit" : "%" }, "EvaluationPeriods" : 2, "MetricName" : "disk_used_percent", @@ -123,7 +123,7 @@ "Statistic" : "Average", "TreatMissingData" : "breaching", "Dimensions" : "InstanceId", - "DiskDimensions": [ + "ExtraDimensions": [ { "Name": "path", "Value": "/dev" @@ -139,13 +139,13 @@ ] } }, - + "RDS" : { "FreeStorageSpace": { "AlarmThresholds" : { "priority": ["P1", "P2", "P3"], - "alarm_threshold": ["2", "3", "4"] + "alarm_threshold": ["2", "3", 
"4"] }, "ComparisonOperator" : "LessThanOrEqualToThreshold", "Description" : { @@ -158,13 +158,13 @@ "Period" : 300, "Statistic" : "Minimum", "TreatMissingData" : "breaching", - "Dimensions" : "DBInstanceIdentifier" + "Dimensions" : "DBInstanceIdentifier" }, - + "SwapUsage" : { "AlarmThresholds" : { "priority": ["P1", "P2", "P3"], - "alarm_threshold": ["512", "400", "300"] + "alarm_threshold": ["512", "400", "300"] }, "ComparisonOperator" : "GreaterThanThreshold", "Description" : { @@ -183,7 +183,7 @@ "FreeableMemory" : { "AlarmThresholds" : { "priority": ["P1", "P2", "P3"], - "alarm_threshold": ["20", "50", "100"] + "alarm_threshold": ["20", "50", "100"] }, "ComparisonOperator" : "LessThanThreshold", "Description" : { @@ -204,7 +204,7 @@ "TaskCount" : { "AlarmThresholds" : { "priority": ["P1"], - "alarm_threshold": ["1"] + "alarm_threshold": ["1"] }, "ComparisonOperator" : "LessThanThreshold", "Description" : { @@ -220,5 +220,5 @@ "Dimensions" : "ClusterName" } } - + } diff --git a/lambda_cw_alarm_creator.tf b/lambda_cw_alarm_creator.tf index b9a0822..11079b0 100644 --- a/lambda_cw_alarm_creator.tf +++ b/lambda_cw_alarm_creator.tf @@ -28,13 +28,13 @@ module "lambda_cw_alarm_creator" { sqs_dlq_arn = var.sqs_dlq_arn } -# Create Lambda layer to host custom actions.py +# Create Lambda layer to host custom_alarms.json resource "aws_lambda_layer_version" "custom_actions" { count = var.source_directory_location != null ? 
1 : 0 layer_name = "alarm_creator_custom_alert_actions" - description = "Contains a customer specific actions.py used for the alarm_creator" + description = "Contains a customer specific custom_alarms.json used for the alarm_creator" filename = data.archive_file.custom_action[0].output_path diff --git a/lambda_cw_alarm_creator_role.tf b/lambda_cw_alarm_creator_role.tf index 180b51a..b59f284 100644 --- a/lambda_cw_alarm_creator_role.tf +++ b/lambda_cw_alarm_creator_role.tf @@ -14,6 +14,7 @@ module "iam_role_lambda_cw_alarm_creator" { "lambda_ec2_read_access" : jsondecode(data.aws_iam_policy_document.lambda_ec2_read_access.json) "lambda_rds_read_access" : jsondecode(data.aws_iam_policy_document.lambda_rds_read_access.json) "lambda_ecs_read_access" : jsondecode(data.aws_iam_policy_document.lambda_ecs_read_access.json) + "lambda_elasticache_read_access" : jsondecode(data.aws_iam_policy_document.lambda_elasticache_read_access.json) } trust_relationship = { @@ -93,6 +94,17 @@ data "aws_iam_policy_document" "lambda_ecs_read_access" { } } +data "aws_iam_policy_document" "lambda_elasticache_read_access" { + statement { + sid = "AllowLambdaElasticacheAccess" + + actions = ["elasticache:Describe*"] + + resources = ["*"] + } +} + + # The Lambda role needs to access KMS key in order to access SNS topic. 
resource "aws_kms_grant" "give_lambda_role_access" { name = "lambda-role-kms-grant-access" From c4a2cb53f8eb7d6435efff02042aa5906a9185e0 Mon Sep 17 00:00:00 2001 From: "andrew@technative.eu" Date: Mon, 19 Aug 2024 10:24:54 +0200 Subject: [PATCH 11/12] Updated actions.py to be more dynamic when looking at device root --- alarm_creator/actions.py | 101 +++++++++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 35 deletions(-) diff --git a/alarm_creator/actions.py b/alarm_creator/actions.py index 8e7e522..c5e873b 100644 --- a/alarm_creator/actions.py +++ b/alarm_creator/actions.py @@ -28,22 +28,26 @@ def AWS_Alarms(): for service in alarms: - # Fill instances variable with Running instances per service + dimensionlist = [] + # instances = None + #Fill instances variable with Running instances per service if service == "EC2": instances = GetRunningInstances() elif service == "RDS": instances = GetRunningDBInstances() - elif service == "ECS": - instances = GetRunningClusters() - elif service == "ElastiCache": - instances = GetRunningCacheClusters() elif service == "CWAgent": instances = GetRunningInstances() + # elif service == "ECS": + # instances = GetRunningClusters() + # elif service == "ElastiCache": + # instances = GetRunningCacheClusters() for alarm in alarms[service]: # Query the namespaces in CloudWatch Metrics response = CWclient.list_metrics(Namespace=f"{alarms[service][alarm]['Namespace']}", RecentlyActive='PT3H') + for metrics in response["Metrics"]: + # Check if any of the found metric names are equal to metric names in alarms file if metrics["MetricName"] == alarms[service][alarm]['MetricName']: for priority, threshold in zip(alarms[service][alarm]['AlarmThresholds']["priority"], alarms[service][alarm]['AlarmThresholds']["alarm_threshold"]): @@ -55,38 +59,65 @@ def AWS_Alarms(): else: cw_threshold = int(threshold) - # Handling dimensions - for instance in instances: - - instanceDimensions = { - "Name": 
f"{alarms[service][alarm]['Dimensions']}", - "Value": instance - } - - # Initialize the dimension list - dimensionlist = [instanceDimensions] - - # Add any additional disk-related dimensions if present - if 'ExtraDimensions' in alarms[service][alarm]: - dimensionlist.extend(alarms[service][alarm]['ExtraDimensions']) - - # Create the alarms - CWclient.put_metric_alarm( - AlarmName=f"{instance}-{alarm} {alarms[service][alarm]['Description']['Operatorsymbol']} {threshold} {alarms[service][alarm]['Description']['ThresholdUnit']}", - ComparisonOperator=alarms[service][alarm]['ComparisonOperator'], - EvaluationPeriods=alarms[service][alarm]['EvaluationPeriods'], - MetricName=alarms[service][alarm]['MetricName'], - Namespace=alarms[service][alarm]['Namespace'], - Period=alarms[service][alarm]['Period'], - Statistic=alarms[service][alarm]['Statistic'], - Threshold=cw_threshold, - ActionsEnabled=True, - TreatMissingData=alarms[service][alarm]['TreatMissingData'], - AlarmDescription=f"{priority}", - Dimensions=dimensionlist, - Tags=[{"Key": "CreatedbyLambda", "Value": "True"}], + # Handling dimensions + for instance in instances: + + instanceDimensions = { + "Name": f"{alarms[service][alarm]['Dimensions']}", + "Value": instance + } + + #Add any additional disk-related dimensions if present + if 'ExtraDimensions' in alarms[service][alarm]: + dimensionlist.extend(alarms[service][alarm]['ExtraDimensions']) + + for dimension in dimensionlist: + if dimension["Name"] == "path" and dimension["Value"] == "/": + # Query the namespaces in CloudWatch Metrics + # Find the correct device dimension for the root volume + response_2 = CWclient.list_metrics(Namespace=f"{alarms[service][alarm]['Namespace']}", RecentlyActive='PT3H', + Dimensions=[instanceDimensions, {'Name': 'path', 'Value': '/'}] ) + for metrics in response_2["Metrics"]: + for dimension in metrics["Dimensions"]: + if dimension['Name'] == "device": + + dimensionlist = [ + instanceDimensions, + { + "Name": "device", + "Value": 
f"{dimension['Value']}" + } + ] + dimensionlist.extend(alarms[service][alarm]['ExtraDimensions']) + else: + continue + else: + #Clean up dimensionlist if not extra dimensions are present and only add the instance dimension + dimensionlist = [] + dimensionlist = [instanceDimensions] + + + # Create the alarms + CWclient.put_metric_alarm( + AlarmName=f"{instance}-{alarm} {alarms[service][alarm]['Description']['Operatorsymbol']} {threshold} {alarms[service][alarm]['Description']['ThresholdUnit']}", + ComparisonOperator=alarms[service][alarm]['ComparisonOperator'], + EvaluationPeriods=alarms[service][alarm]['EvaluationPeriods'], + MetricName=alarms[service][alarm]['MetricName'], + Namespace=alarms[service][alarm]['Namespace'], + Period=alarms[service][alarm]['Period'], + Statistic=alarms[service][alarm]['Statistic'], + Threshold=cw_threshold, + ActionsEnabled=True, + TreatMissingData=alarms[service][alarm]['TreatMissingData'], + AlarmDescription=f"{priority}", + Dimensions=dimensionlist, + Tags=[{"Key": "CreatedbyLambda", "Value": "True"}], + ) + + + def GetRunningInstances(): get_running_instances = ec2client.describe_instances( Filters=[{"Name": "instance-state-name", "Values": ["running"]}] From 874f1c6daaa459f6951b715fbd639bdf9c1c9c7d Mon Sep 17 00:00:00 2001 From: "andrew@technative.eu" Date: Mon, 19 Aug 2024 14:07:43 +0200 Subject: [PATCH 12/12] Updated default alarms.json --- alarm_creator/alarms.json | 4 ---- 1 file changed, 4 deletions(-) diff --git a/alarm_creator/alarms.json b/alarm_creator/alarms.json index 2df8ee9..9b1047d 100644 --- a/alarm_creator/alarms.json +++ b/alarm_creator/alarms.json @@ -62,10 +62,6 @@ "Name": "path", "Value": "/" }, - { - "Name": "device", - "Value": "nvme0n1p1" - }, { "Name": "fstype", "Value": "ext4"