From 05fe4f39e2f8b6f0e8133858ae663b25a1891949 Mon Sep 17 00:00:00 2001 From: gustavderdrache Date: Tue, 5 Mar 2024 09:59:44 -0500 Subject: [PATCH] EPAD8-2370: Remove metrics sidecar --- .buildkite/feature.yml | 13 --- .buildkite/webcms.yml | 12 --- services/metrics/Dockerfile | 27 ------- services/metrics/README.md | 87 --------------------- services/metrics/entrypoint.sh | 36 --------- services/metrics/transform.jq | 33 -------- terraform/infrastructure/drupal_iam_task.tf | 25 ------ terraform/infrastructure/ecr.tf | 10 --- terraform/infrastructure/logging.tf | 9 --- terraform/infrastructure/parameters.tf | 20 ----- terraform/webcms/README.md | 11 +-- terraform/webcms/drupal.tf | 22 ------ terraform/webcms/shared.tf | 8 -- 13 files changed, 3 insertions(+), 310 deletions(-) delete mode 100644 services/metrics/Dockerfile delete mode 100644 services/metrics/README.md delete mode 100644 services/metrics/entrypoint.sh delete mode 100644 services/metrics/transform.jq diff --git a/.buildkite/feature.yml b/.buildkite/feature.yml index 28b9b442b5..b80059846f 100644 --- a/.buildkite/feature.yml +++ b/.buildkite/feature.yml @@ -110,19 +110,6 @@ steps: --target="$$TARGET" \ --no-push - # The FPM metrics sidecar is a much smaller image, so we can just let - # Kaniko build it without any caching logic. The build is otherwise - # identical to the one above. - - label: "Build fpm-metrics" - concurrency_group: $BUILDKITE_PIPELINE_SLUG/build-$BUILDKITE_BRANCH - concurrency: 4 - - commands: docker build services/metrics - - plugins: - - cultureamp/aws-assume-role#v0.1.0: - role: arn:aws:iam::316981092358:role/BuildkiteRoleForImageBuilds - # Perform a Terraform formatting check. See the terraform-fmt.sh script for more details # on what is executed in this step. 
- label: ":terraform: Formatting" diff --git a/.buildkite/webcms.yml b/.buildkite/webcms.yml index e1cd08d807..4222263643 100644 --- a/.buildkite/webcms.yml +++ b/.buildkite/webcms.yml @@ -76,18 +76,6 @@ steps: --target="$$TARGET" \ --destination="${WEBCMS_REPO_URL}/webcms-${WEBCMS_ENVIRONMENT}-${WEBCMS_SITE}-$$TARGET:${WEBCMS_IMAGE_TAG}" - - label: "Build fpm-metrics" - concurrency_group: $BUILDKITE_PIPELINE_SLUG/build-$BUILDKITE_BRANCH - concurrency: 4 - - commands: - - docker build services/metrics --tag "${WEBCMS_REPO_URL}/webcms-${WEBCMS_ENVIRONMENT}-${WEBCMS_SITE}-fpm-metrics:${WEBCMS_IMAGE_TAG}" - - docker push "${WEBCMS_REPO_URL}/webcms-${WEBCMS_ENVIRONMENT}-${WEBCMS_SITE}-fpm-metrics:${WEBCMS_IMAGE_TAG}" - - plugins: - - cultureamp/aws-assume-role#v0.1.0: - role: arn:aws:iam::316981092358:role/BuildkiteRoleForImageBuilds - - wait: ~ - label: ":terraform: WebCMS (${WEBCMS_SITE}-en)" diff --git a/services/metrics/Dockerfile b/services/metrics/Dockerfile deleted file mode 100644 index e7336e1d59..0000000000 --- a/services/metrics/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM debian:stable-slim - -RUN set -ex \ - # 1. Install packages needed for this container - && apt-get update \ - && apt-get install --yes --no-install-recommends \ - ca-certificates \ - curl \ - jq \ - unzip \ - # 2. Download and install the latest AWS CLI - && cd /tmp \ - && curl -fsSL https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -o awscliv2.zip \ - && unzip awscliv2.zip \ - && ./aws/install \ - # 3. Clean up the CLI installer files - && rm -rf ./aws awscliv2.zip \ - # 4. 
Remove unneeded packages by marking them as automatic and letting the - # 'autoremove' command do its thing - && apt-mark auto unzip \ - && apt-get autoremove --purge --yes \ - && rm -rf /var/apt/lists/* - -COPY transform.jq /etc/transform.jq -COPY entrypoint.sh /bin/entrypoint.sh - -ENTRYPOINT [ "/bin/sh", "/bin/entrypoint.sh" ] diff --git a/services/metrics/README.md b/services/metrics/README.md deleted file mode 100644 index 4eaeb75f01..0000000000 --- a/services/metrics/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# PHP-FPM Metrics Sidecar - -## Table of Contents - -## About - -This directory contains the build for a sidecar container that exports PHP-FPM metrics. PHP-FPM's statistics are enabled via [`pm.status_path`](https://www.php.net/manual/en/install.fpm.configuration.php#pm.status-path) PHP-FPM option. The best reference for what stats are exported is the example [`www.conf`](https://github.com/php/php-src/blob/php-8.0.0/sapi/fpm/www.conf.in#L142-L161) in the PHP source tree; the link here points to PHP 8.0 but generally the exposed stats are stable between PHP releases. - -This container is run as a [sidecar](https://docs.microsoft.com/en-us/azure/architecture/patterns/sidecar) container alongside the WebCMS' Drupal and nginx containers. Every 60 seconds, it runs these steps: - -1. Query PHP-FPM's status endpoint. -2. Transform the JSON metrics into CloudWatch metrics (see the [Metrics Structure](#metrics-structure) section). -3. Publish metrics to CloudWatch with the [`aws cloudwatch put-metric-data`](https://awscli.amazonaws.com/v2/documentation/api/latest/reference/cloudwatch/put-metric-data.html) command. - -Note that if PHP-FPM is overloaded, it's possible that the `curl` command will timeout or otherwise error. In that case, this script publishes nothing and sleeps for another minute. - -## Files - -- `transform.jq`: This is the `jq` transformation. 
The script iterates over an array of PHP-FPM metrics and transforms them into CloudWatch's expected metric data structure. -- `entrypoint.sh`: As the name suggests, this is the primary script that the container executes. - -## Running - -The entrypoint script requires two environment variables. It will deliberately crash on startup if they are not present: - -1. `$AWS_REGION`: Used to tell the AWS CLI which region the metrics are being published in. -2. `$WEBCMS_SITE`: The name of this deployment, such as `dev` - -## Metrics Structure - -In order to publish metrics to CloudWatch, we need a few key pieces of information: - -- `MetricName`, the name of the metric. While AWS is relatively permissive with these, we use the PascalCase convention established by AWS' own built-in metrics. (For example, the number of idle processes is reported as `ProcessesIdle`). -- `Unit`, the unit of the metric. This can be a count, a unit of time (such as seconds), or a quantity such as bytes or gigabytes. -- `Value`, the numeric value of the metric. -- `Dimensions`, an optional array of `{ Name, Value }` pairs to scope metrics. We primarily use this to scope FPM metrics to the specific WebCMS deployment (e.g., "English dev" or "Spanish production") using the environment name exposed via Terraform. - -As an example, here is a sample of PHP-FPM's JSON statistics. 
- -```json -{ - "pool": "www", - "process manager": "dynamic", - "start time": 1616779665, - "start since": 1854, - "accepted conn": 10, - "listen queue": 0, - "max listen queue": 0, - "listen queue len": 511, - "idle processes": 1, - "active processes": 1, - "total processes": 2, - "max active processes": 1, - "max children reached": 0, - "slow requests": 0 -} -``` - -The `jq` script will create a metric array with this structure (some elements omitted for brevity): - -```json -[ - { - "MetricName": "Age", - "Unit": "Seconds", - "Value": 1854, - "Timestamp": 1616781829, - "Dimensions": [{ "Name": "Environment", "Value": "example" }] - }, - { - "MetricName": "RequestsAccepted", - "Unit": "Count", - "Value": 10, - "Timestamp": 1616781829, - "Dimensions": [{ "Name": "Environment", "Value": "example" }] - }, - { - "MetricName": "RequestsPending", - "Unit": "Count", - "Value": 0, - "Timestamp": 1616781829, - "Dimensions": [{ "Name": "Environment", "Value": "example" }] - } -] -``` - -Note that the `jq` script does not capture all metrics; some of them are either redundant or not useful. For example, we don't report the "start time" metric since CloudWatch doesn't use timestamps, and it's possible to use metric math to compute "total processes" instead of reporting it. This reduces the size of the payload we send to AWS (there is a hard limit in the API) and also cuts down on the amount of data stored in CloudWatch. CloudWatch is priced per metric, which creates an incentive to avoid redundancy. diff --git a/services/metrics/entrypoint.sh b/services/metrics/entrypoint.sh deleted file mode 100644 index e740713066..0000000000 --- a/services/metrics/entrypoint.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh - -# Preflight check: ensure required environment variables have been set - -if test -z "$AWS_REGION"; then - echo "Required environment variable AWS_REGION is not set." 
>&2 - exit 1 -fi - -if test -z "$WEBCMS_SITE"; then - echo "Required environment variable WEBCMS_SITE is not set." >&2 - exit 1 -fi - -while true; do - # Sleep at the start of the loop to allow this task to fully warm up - sleep 60 - - # If curl returned an error, skip reporting and - if ! input="$(curl -sS "http://localhost:8080/status?json")"; then - echo "Curl failed to load metrics; skipping" >&2 - continue - fi - - if test -z "$input"; then - echo "Received empty metrics; skipping" >&2 - continue - fi - - metrics="$(echo "$input" | jq -c -f /etc/transform.jq)" - - echo "Input metrics: $input" - echo "Output metrics: $metrics" - - aws cloudwatch put-metric-data --namespace WebCMS/FPM --metric-data "$metrics" -done diff --git a/services/metrics/transform.jq b/services/metrics/transform.jq deleted file mode 100644 index 318b310a20..0000000000 --- a/services/metrics/transform.jq +++ /dev/null @@ -1,33 +0,0 @@ -# This is a jq script (see https://stedolan.github.io/jq/) to transform PHP-FPM's JSON -# stats into a more CloudWatch-friendly format. -# -# See the the README for a discussion of the transformation logic. - -# Capture the input as $input -. as $input - -# Next, iterate over each of the PHP-FPM metrics we're interested in. 
Each object has this -# structure: -# * key: PHP-FPM metric name -# * name: CloudWatch metric name -# * unit: CloudWatch metric unit -| [ - { key: "start since", name: "Age", unit: "Seconds" }, - { key: "accepted conn", name: "RequestsAccepted", unit: "Count" }, - { key: "listen queue", name: "RequestsPending", unit: "Count" }, - { key: "listen queue len", name: "ListenQueueLength", unit: "Count" }, - { key: "idle processes", name: "ProcessesIdle", unit: "Count" }, - { key: "active processes", name: "ProcessesActive", unit: "Count" }, - { key: "max children reached", name: "MaxChildrenReached", unit: "Count" } -] - -# For each PHP-FPM metric, construct a CloudWatch metric object -| map({ - MetricName: .name, - Unit: .unit, - Value: $input[.key], - Timestamp: now | floor, - Dimensions: [ - { Name: "Environment", Value: $ENV.WEBCMS_SITE } - ], -}) diff --git a/terraform/infrastructure/drupal_iam_task.tf b/terraform/infrastructure/drupal_iam_task.tf index d3f7309d8b..83e43a9a1f 100644 --- a/terraform/infrastructure/drupal_iam_task.tf +++ b/terraform/infrastructure/drupal_iam_task.tf @@ -92,31 +92,6 @@ resource "aws_iam_role_policy_attachment" "drupal_es_access" { policy_arn = aws_iam_policy.drupal_es_access.arn } -data "aws_iam_policy_document" "drupal_publish_metrics" { - version = "2012-10-17" - - statement { - sid = "putMetrics" - effect = "Allow" - actions = ["cloudwatch:PutMetricData"] - resources = ["*"] - } -} - -resource "aws_iam_policy" "drupal_publish_metrics" { - name = "${var.iam_prefix}-${var.aws_region}-${var.environment}-PublishMetrics" - description = "Permits publishing CloudWatch metrics" - - policy = data.aws_iam_policy_document.drupal_publish_metrics.json -} - -resource "aws_iam_role_policy_attachment" "drupal_publish_metrics" { - for_each = local.sites - - role = aws_iam_role.drupal_task[each.key].name - policy_arn = aws_iam_policy.drupal_publish_metrics.arn -} - # Grant the Drupal container permissions to Cloudwatch to create a log stream # and 
publish log events. data "aws_iam_policy_document" "drupal_put_logs" { diff --git a/terraform/infrastructure/ecr.tf b/terraform/infrastructure/ecr.tf index 29254d202c..4f4d9668df 100644 --- a/terraform/infrastructure/ecr.tf +++ b/terraform/infrastructure/ecr.tf @@ -30,16 +30,6 @@ resource "aws_ecr_repository" "drush" { tags = var.tags } -# Create a custom repo for the Alpine-based metrics sidecar. See services/metrics for more -# information. -resource "aws_ecr_repository" "metrics" { - for_each = toset(var.sites) - - name = "webcms-${var.environment}-${each.key}-fpm-metrics" - - tags = var.tags -} - # Finally, we create a cache repository for Kaniko-based builds. This repository has some # lifecycle policies that aggressively expire images in order to avoid an arbitrarily large # cache from building up (see below). diff --git a/terraform/infrastructure/logging.tf b/terraform/infrastructure/logging.tf index d21600899e..33ac553736 100644 --- a/terraform/infrastructure/logging.tf +++ b/terraform/infrastructure/logging.tf @@ -34,15 +34,6 @@ resource "aws_cloudwatch_log_group" "drupal" { tags = var.tags } -# Log group for the FPM metrics helper -resource "aws_cloudwatch_log_group" "fpm_metrics" { - for_each = local.sites - - name = "/webcms/${var.environment}/${each.value.site}/${each.value.lang}/fpm-metrics" - - tags = var.tags -} - # Log group for any Terraform runs performed inside the ECS cluster resource "aws_cloudwatch_log_group" "terraform" { name = "/webcms/${var.environment}/terraform" diff --git a/terraform/infrastructure/parameters.tf b/terraform/infrastructure/parameters.tf index 169a77bbb4..2d2f050b2f 100644 --- a/terraform/infrastructure/parameters.tf +++ b/terraform/infrastructure/parameters.tf @@ -151,16 +151,6 @@ resource "aws_ssm_parameter" "ecr_drush" { tags = var.tags } -resource "aws_ssm_parameter" "ecr_metrics" { - for_each = local.sites - - name = "/webcms/${var.environment}/${each.value.site}/${each.value.lang}/ecr/metrics" - type = "String" - 
value = aws_ecr_repository.metrics[each.value.site].repository_url - - tags = var.tags -} - #endregion #region Log groups @@ -195,16 +185,6 @@ resource "aws_ssm_parameter" "drush_log_group" { tags = var.tags } -resource "aws_ssm_parameter" "fpm_metrics_log_group" { - for_each = local.sites - - name = "/webcms/${var.environment}/${each.value.site}/${each.value.lang}/log-groups/fpm-metrics" - type = "String" - value = aws_cloudwatch_log_group.fpm_metrics[each.key].name - - tags = var.tags -} - resource "aws_ssm_parameter" "drupal_log_group" { for_each = local.sites diff --git a/terraform/webcms/README.md b/terraform/webcms/README.md index d71f6fc378..449f09682b 100644 --- a/terraform/webcms/README.md +++ b/terraform/webcms/README.md @@ -60,7 +60,7 @@ See the [parent directory's README](../) for instructions on using a backend for ### Built Images -The images for Drupal, nginx, Drush, and the metrics sidecar must have been built before this module is deployed. See [How to Run](#how-to-run) for more information. +The images for Drupal, nginx, and Drush must have been built before this module is deployed. See [How to Run](#how-to-run) for more information. ## Module Inputs @@ -173,7 +173,6 @@ As with the infrastructure and database modules, this module assumes that certai - Log group identifiers are also read from Parameter Store: - `/webcms/${var.environment}/${var.site}/${var.lang}log-groups/php-fpm`: The name of the log group for Drupal's PHP-FPM container. - `/webcms/${var.environment}/${var.site}/${var.lang}log-groups/nginx`: The name of the log group for Drupal's nginx container. - - `/webcms/${var.environment}/${var.site}/${var.lang}log-groups/fpm-metrics`: The name of the log group for for Drupal's FPM metrics container. - `/webcms/${var.environment}/${var.site}/${var.lang}log-groups/drush`: The name of the log group for Drush runs. - `/webcms/${var.environment}/${var.site}/${var.lang}log-groups/drupal`: The name of the log group for Drupal application logs. 
- Finally, Secrets Manager ARNs are read from Parameter Store. More information on how these are used can be read @@ -187,7 +186,7 @@ As with the infrastructure and database modules, this module assumes that certai ### Drupal -This module creates an ECS task definition and service for running the WebCMS. This task includes a pair of containers, nginx and PHP-FPM, that handle incoming web traffic. In addition, a third container runs a basic Alpine image that gathers PHP-FPM metrics every 60 seconds and publishes them to CloudWatch. +This module creates an ECS task definition and service for running the WebCMS. This task includes a pair of containers, nginx and PHP-FPM, that handle incoming web traffic. An autoscaling policy is attached to the Drupal service that tracks 60% CPU utilization. Scale-out is more aggressive than scale-in by a factor of five. We enforce slow scale in due to the relatively slow warm-up time of the Drupal containers; a long cooldown smooths out spiky traffic patterns and keeps containers from exhibiting thrashing-like behavior as opcache warms up. @@ -209,7 +208,7 @@ Deployments can be broken down into three steps: build images, apply Terraform, ### Build Images -There are four custom Docker images: Drupal, nginx, Drush, and the metrics sidecar. While it is possible to build these in parallel, it is probably best to build them in serial and push after deployments. A sample shell script is below: +There are three custom Docker images: Drupal, nginx, and Drush. While it is possible to build these in parallel, it is probably best to build them in serial and push after deployments. A sample shell script is below: ```sh #!/bin/bash @@ -231,10 +230,6 @@ docker build services/drupal --tag ":$BUILD_TAG" --target drus docker push ":$BUILD_TAG" docker push ":$BUILD_TAG" docker push ":$BUILD_TAG" - -# Now, build the metrics sidecar. 
-docker build services/metrics --tag ":$BUILD_TAG" -docker push ":$BUILD_TAG" ``` Note that this script does not cover authenticating with ECR or other topics; see the AWS CLI's documentation on authenticating with ECR: diff --git a/terraform/webcms/drupal.tf b/terraform/webcms/drupal.tf index b80c7969d2..4cc7184d52 100644 --- a/terraform/webcms/drupal.tf +++ b/terraform/webcms/drupal.tf @@ -120,28 +120,6 @@ resource "aws_ecs_task_definition" "drupal_task" { } } }, - - # Report FPM metrics to CloudWatch using the custom metrics container. See the - # services/metrics directory for more. - { - name = "metrics" - image = "${data.aws_ssm_parameter.ecr_metrics.value}:${var.image_tag}" - - environment = [ - { name = "AWS_REGION", value = var.aws_region }, - { name = "WEBCMS_SITE", value = "${var.site}-${var.lang}" }, - ] - - logConfiguration = { - logDriver = "awslogs" - - options = { - awslogs-group = data.aws_ssm_parameter.fpm_metrics_log_group.value - awslogs-region = var.aws_region - awslogs-stream-prefix = "fpm-metrics" - } - } - }, ]) tags = var.tags diff --git a/terraform/webcms/shared.tf b/terraform/webcms/shared.tf index 1cade356ec..bf82f48a9b 100644 --- a/terraform/webcms/shared.tf +++ b/terraform/webcms/shared.tf @@ -90,10 +90,6 @@ data "aws_ssm_parameter" "ecr_drush" { name = "/webcms/${var.environment}/${var.site}/${var.lang}/ecr/drush" } -data "aws_ssm_parameter" "ecr_metrics" { - name = "/webcms/${var.environment}/${var.site}/${var.lang}/ecr/metrics" -} - #endregion #region Log groups @@ -110,10 +106,6 @@ data "aws_ssm_parameter" "drush_log_group" { name = "/webcms/${var.environment}/${var.site}/${var.lang}/log-groups/drush" } -data "aws_ssm_parameter" "fpm_metrics_log_group" { - name = "/webcms/${var.environment}/${var.site}/${var.lang}/log-groups/fpm-metrics" -} - data "aws_ssm_parameter" "drupal_log_group" { name = "/webcms/${var.environment}/${var.site}/${var.lang}/log-groups/drupal" }