From 6624fdd3a84985ffafed3026ec57ba9361a22207 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Thu, 18 Apr 2024 07:15:52 +0200 Subject: [PATCH] Add more manual cleanup (#58) * parameterize aws-nuke retries and add filter * Filter AWSBackupVaultAccessPolicy * Filter rejected EC2 VPC Endpoint connections * Remove backup recovery points * Delete all servicecatalog registry applications * Manually Delete Cognito User pools * Restore async poll on aws-nuke --- conan/conan.sh | 10 +- conan/readme.adoc | 7 +- conan/wipe_sandbox.sh | 2 + .../roles/infra-aws-sandbox/defaults/main.yml | 6 + .../infra-aws-sandbox/files/manual_cleanup.py | 230 ++++++++++++++++-- .../roles/infra-aws-sandbox/tasks/reset.yml | 45 +++- 6 files changed, 263 insertions(+), 37 deletions(-) diff --git a/conan/conan.sh b/conan/conan.sh index 299dcdd9..da22c64e 100755 --- a/conan/conan.sh +++ b/conan/conan.sh @@ -10,6 +10,7 @@ set -u -o pipefail threads="${threads:-12}" # Number of attempts to run cleanup on a sandbox max_retries="${max_retries:-2}" +aws_nuke_retries=${aws_nuke_retries:-0} # AWS profile aws_profile="${aws_profile:-pool-manager}" @@ -49,6 +50,9 @@ kerberos_keytab=${kerberos_keytab:-~/secrets/hostadmin.keytab} kerberos_user=${kerberos_user:-hostadmin} kerberos_password=${kerberos_password:-} +# Pattern to filter the sandboxes to cleanup +sandbox_filter=${sandbox_filter:-} + if [ -n "${kerberos_password}" ]; then unset kerberos_keytab fi @@ -76,11 +80,13 @@ export kerberos_password export kerberos_user export lock_timeout export max_retries +export aws_nuke_retries export noop export poll_interval export threads export vault_file export workdir +export sandbox_filter ORIG="$(cd "$(dirname "$0")" || exit; pwd)" @@ -144,7 +150,9 @@ while true; do export AWS_REGION=${dynamodb_region} export dynamodb_table=${dynamodb_table} sandbox-list --to-cleanup --no-headers - ) | rush --immediate-output -j "${threads}" './wipe_sandbox.sh {1}' + ) \ + | grep -E "${sandbox_filter}" \ + 
| rush --immediate-output -j "${threads}" './wipe_sandbox.sh {1}' sleep "${poll_interval}" done diff --git a/conan/readme.adoc b/conan/readme.adoc index c81d6c2a..72a14223 100644 --- a/conan/readme.adoc +++ b/conan/readme.adoc @@ -83,11 +83,12 @@ $ podman run \ -e AWSCLI=aws \ -e threads=1 \ -e NOVENV=true \ - sandbox-conan:latest + -v $PWD:/home/opentlc-mgr/pool_management/sandbox \ + quay.io/rhpds/sandbox-conan:latest -# For fast iterations, you can pass a volume of your current version of the repo +# For fast iterations on a specific sandbox, you can pass a pattern -podman run -v $PWD:/home/opentlc-mgr/pool_management/sandbox ... +podman run -e sandbox_filter="^sandbox2345 " ... # Delete the secrets when done diff --git a/conan/wipe_sandbox.sh b/conan/wipe_sandbox.sh index b4c6eb38..49205196 100755 --- a/conan/wipe_sandbox.sh +++ b/conan/wipe_sandbox.sh @@ -4,6 +4,7 @@ ORIG="$(cd "$(dirname "$0")" || exit; pwd)" # Stop after max_retries max_retries=${max_retries:-2} +aws_nuke_retries=${aws_nuke_retries:-0} # retry after 48h TTL_EVENTLOG=$((3600*24)) @@ -224,6 +225,7 @@ sandbox_reset() { -e dynamodb_table="${dynamodb_table}" \ -e dynamodb_region="${dynamodb_region}" \ -e aws_nuke_binary_path="${aws_nuke_binary_path}" \ + -e aws_nuke_retries="${aws_nuke_retries}" \ -e output_dir="${workdir}/output_dir_sandbox" \ -e vault_file="${vault_file}" \ -e aws_cli="${AWSCLI}" \ diff --git a/playbooks/roles/infra-aws-sandbox/defaults/main.yml b/playbooks/roles/infra-aws-sandbox/defaults/main.yml index 8119bee8..044d0f32 100644 --- a/playbooks/roles/infra-aws-sandbox/defaults/main.yml +++ b/playbooks/roles/infra-aws-sandbox/defaults/main.yml @@ -135,7 +135,13 @@ aws_nuke_filters_default: AWSBackupVault: - property: Name value: aws/efs/automatic-backup-vault + AWSBackupVaultAccessPolicy: + - aws/efs/automatic-backup-vault + # Rejected VPC Endpoints cannot be deleted + EC2VPCEndpointConnection: + - property: State + value: rejected ############################## # POOL 
management diff --git a/playbooks/roles/infra-aws-sandbox/files/manual_cleanup.py b/playbooks/roles/infra-aws-sandbox/files/manual_cleanup.py index d4a62b1c..b3d9154d 100644 --- a/playbooks/roles/infra-aws-sandbox/files/manual_cleanup.py +++ b/playbooks/roles/infra-aws-sandbox/files/manual_cleanup.py @@ -1,47 +1,216 @@ #!/usr/bin/env python3 -import sys +import json +import os import time import boto3 import botocore changed = False -# Cleanup Public ECR -client = boto3.client('ecr-public') +aws_nuke_filter = {} + +# if /tmp/aws_nuke_filters.json exists, load it +if os.path.exists('/tmp/aws_nuke_filters.json'): + with open('/tmp/aws_nuke_filters.json', 'r') as f: + aws_nuke_filter.update(json.load(f)) + +# Delete all Cognito User Pools + +client = boto3.client('cognito-idp') + +try: + response = client.list_user_pools( + MaxResults=60 + ) + + for user_pool in response['UserPools']: + # Delete all users + response2 = client.list_users( + UserPoolId=user_pool['Id'] + ) + + for user in response2['Users']: + client.admin_delete_user( + UserPoolId=user_pool['Id'], + Username=user['Username'] + ) + print("Deleted user: " + user['Username']) + changed = True + + # Disable deletion protection + client.update_user_pool( + UserPoolId=user_pool['Id'], + DeletionProtection='INACTIVE', + AutoVerifiedAttributes=[ + 'email' + ] + ) + # Delete user pool + client.delete_user_pool( + UserPoolId=user_pool['Id'] + ) + print("Deleted user pool: " + user_pool['Id']) + changed = True + +except botocore.exceptions.ClientError as e: + print(e) + +# Delete all app registry applications + +client = boto3.client('servicecatalog-appregistry') + +try: + response = client.list_applications() + + for application in response['applications']: + # Delete all resources + response2 = client.list_associated_resources( + application=application['id'] + ) + + for resource in response2['resources']: + client.disassociate_resource( + application=application['id'], + resource=resource['arn'], + 
resourceType=resource['resourceType'] + ) + print("Disassociated resource: " + resource['resourceType']) + changed = True + + # Delete application + client.delete_application( + application=application['id'] + ) + print("Deleted application: " + application['id']) + changed = True + + +except botocore.exceptions.ClientError as e: + print(e) + +# Cleanup AWSBackupRecoveryPoint +client = boto3.client('backup') try: - response = client.describe_repositories() - - for repo in response['repositories']: - # Delete all images inside the repository - # Get all images - response2 = client.describe_images(repositoryName=repo['repositoryName']) - - # Delete all images - for image in response2['imageDetails']: - client.batch_delete_image( - repositoryName=repo['repositoryName'], - imageIds=[ - { - 'imageDigest': image['imageDigest'] - } - ] + # Get all vaults + response = client.list_backup_vaults() + + for vault in response['BackupVaultList']: + # Change access policy so we can delete recovery points later + + response2 = client.get_backup_vault_access_policy( + BackupVaultName=vault['BackupVaultName'] + ) + + if response2['Policy'] != '{}': + # Set to empty policy + client.put_backup_vault_access_policy( + BackupVaultName=vault['BackupVaultName'], + Policy='''{"Version": "2012-10-17", "Statement": [ + { + "Effect": "Deny", + "Principal": { + "AWS": "*" + }, + "Resource": "*", + "Action": ["backup:StartCopyJob"]}]}''') + + # Get all recovery points + response2 = client.list_recovery_points_by_backup_vault( + BackupVaultName=vault['BackupVaultName'] + ) + + for recovery_point in response2['RecoveryPoints']: + # Delete recovery point + client.delete_recovery_point( + BackupVaultName=vault['BackupVaultName'], + RecoveryPointArn=recovery_point['RecoveryPointArn'] ) + print(recovery_point['RecoveryPointArn']) + print("Deleted recovery point: " + recovery_point['RecoveryPointArn']) changed = True - print("Deleted image: " + image['imageDigest']) + # Delete vault + # If vault is 
aws/efs/automatic-backup-vault ignore + + if vault['BackupVaultName'] == 'aws/efs/automatic-backup-vault': + print("Skipping vault: " + vault['BackupVaultName']) + continue + + client.delete_backup_vault( + BackupVaultName=vault['BackupVaultName'] + ) + print("Deleted vault: " + vault['BackupVaultName']) + changed = True + +except botocore.exceptions.ClientError as e: + print(e) + + +# Cleanup VPC Endpoints EC2VPCEndpointConnection +client = boto3.client('ec2') + +try: + response = client.describe_vpc_endpoint_connections() - for connection in response['VpcEndpointConnections']: + # Reject connection + if connection['VpcEndpointState'] == "rejected": + print("VPC Endpoint Connection is already rejected: " + connection['VpcEndpointId']) + # ignore this connection + aws_nuke_filter['EC2VPCEndpointConnection'] = aws_nuke_filter.get('EC2VPCEndpointConnection', []) + aws_nuke_filter['EC2VPCEndpointConnection'].append(connection['ServiceId']) + continue + client.reject_vpc_endpoint_connections( + ServiceId=connection['ServiceId'], + VpcEndpointIds=[connection['VpcEndpointId']] ) - print("Deleted repository: " + repo['repositoryName']) + print("Rejected VPC Endpoint Connection: " + connection['VpcEndpointId']) changed = True -except botocore.exceptions.EndpointConnectionError: - print("ECR Public is not supported in this region") +except botocore.exceptions.EndpointConnectionError: + print("EC2VPCEndpointConnection is not supported in this region") +except botocore.exceptions.ClientError as e: + print(e) + +# Cleanup Public ECR +client = boto3.client('ecr-public') + +if os.environ.get('AWS_REGION') == 'us-east-1': + try: + response = client.describe_repositories() + + for repo in response['repositories']: + # Delete all images inside the repository + # Get all images + response2 = client.describe_images(repositoryName=repo['repositoryName']) + + # Delete all images + for image in 
response2['imageDetails']: + client.batch_delete_image( + repositoryName=repo['repositoryName'], + imageIds=[ + { + 'imageDigest': image['imageDigest'] + } + ] + ) + changed = True + + print("Deleted image: " + image['imageDigest']) + + # Delete repository + + client.delete_repository( + repositoryName=repo['repositoryName'] + ) + print("Deleted repository: " + repo['repositoryName']) + changed = True + + except botocore.exceptions.EndpointConnectionError: + print("ECR Public is not supported in this region") # Cleanup MGNSourceServer client = boto3.client('mgn') @@ -85,10 +254,17 @@ ) print("Deleted source server: " + server['sourceServerID']) changed = True -except botocore.exceptions.EndpointConnectionError: +# UninitializedAccountException +except client.exceptions.UninitializedAccountException: print("MGNSourceServer is not supported in this region") + + # Display Change if changed: print("Changes were made") + +# write to /tmp/aws_nuke_filters.json +with open('/tmp/aws_nuke_filters.json', 'w') as f: + json.dump(aws_nuke_filter, f) diff --git a/playbooks/roles/infra-aws-sandbox/tasks/reset.yml b/playbooks/roles/infra-aws-sandbox/tasks/reset.yml index 4f53ec09..e5374585 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/reset.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/reset.yml @@ -17,6 +17,27 @@ _hostedzoneid: "{{ _route53zone.zone_id }}" aws_public_zone: "{{ account_name }}{{subdomain_base}}." 
+# Get a new token as the current one may have timed out (1h) +- include_tasks: assume.yml + +- loop: "{{ all_regions }}" + loop_control: + loop_var: _region + environment: + AWS_REGION: "{{ _region }}" + AWS_DEFAULT_REGION: "{{ _region }}" + AWS_ACCESS_KEY_ID: "{{ assumed_role.sts_creds.access_key }}" + AWS_SECRET_ACCESS_KEY: "{{ assumed_role.sts_creds.secret_key }}" + AWS_SESSION_TOKEN: "{{ assumed_role.sts_creds.session_token }}" + ignore_errors: true + name: Run files/manual_cleanup.py script + script: files/manual_cleanup.py + register: r_manual_cleanup + # timeout after 2 minutes + timeout: 120 + changed_when: >- + 'Changes were made' in r_manual_cleanup.stdout + - tags: nuke when: nuke_sandbox | bool block: @@ -27,6 +48,18 @@ - "{{ account_name }}{{ subdomain_base }}." - /hostedzone/{{ _route53zone.zone_id }} ({{ account_name }}{{ subdomain_base }}.) + - name: check if /tmp/aws_nuke_filters.json exists + stat: + path: /tmp/aws_nuke_filters.json + register: _stat + + - name: Load /tmp/aws_nuke_filters.json and merge it into aws_nuke_filters + when: _stat.stat.exists + set_fact: + aws_nuke_filters: "{{ aws_nuke_filters | combine(_aws_nuke_filters) }}" + vars: + _aws_nuke_filters: "{{ lookup('file', '/tmp/aws_nuke_filters.json') | from_json | default({}) }}" + - name: Generate config file for aws-nuke template: src: "{{ role_path }}/templates/nuke-config.yml.j2" @@ -41,13 +74,13 @@ args: stdin: "{{ account_name }}{{ alias_suffix }}" register: _awsnuke - async: 1800 - poll: 30 ignore_errors: true retries: "{{ aws_nuke_retries }}" - delay: 30 until: _awsnuke is succeeded no_log: true + async: 1800 + poll: 30 + delay: 30 - debug: var: _awsnuke @@ -74,13 +107,13 @@ args: stdin: "{{ account_name }}{{ alias_suffix }}" register: _awsnuke2 - async: 1800 - poll: 30 ignore_errors: true retries: 0 - delay: 30 until: _awsnuke2 is succeeded no_log: true + async: 1800 + poll: 30 + delay: 30 - debug: var: _awsnuke2