diff --git a/README.md b/README.md index 3edea25f..3d456eff 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,9 @@ This repo is used to deploy an EKS cluster to AWS. CI/CD is managed through Spac │ └── policies: Rego policies that can be attached to 0..* spacelift stacks ├── dev: Development/sandbox environment │ ├── spacelift: Terraform scripts to manage spacelift resources -│ │ └── dpe-sandbox: Spacelift specific resources to manage the CI/CD pipeline +│ │ └── dpe-k8s/dpe-sandbox: Spacelift specific resources to manage the CI/CD pipeline │ └── stacks: The deployable cloud resources +│ ├── dpe-auth0: Stack used to provision and set up Auth0 IDP (Identity Provider) settings │ ├── dpe-sandbox-k8s: K8s + supporting AWS resources │ └── dpe-sandbox-k8s-deployments: Resources deployed inside of a K8s cluster └── modules: Templatized collections of terraform resources that are used in a stack @@ -19,15 +20,22 @@ This repo is used to deploy an EKS cluster to AWS. CI/CD is managed through Spac │ └── templates: Resources used during deployment of airflow ├── argo-cd: K8s deployment for Argo CD, a declarative, GitOps continuous delivery tool for Kubernetes. │ └── templates: Resources used during deployment of this helm chart - ├── trivy-operator: K8s deployment for trivy, along with a few supporting charts for security scanning - │ └── templates: Resources used during deployment of these helm charts - ├── victoria-metrics: K8s deployment for victoria metrics, a promethus like tool for cluster metric collection - │ └── templates: Resources used during deployment of these helm charts + ├── cert-manager: Handles provisioning TLS certificates for the cluster + ├── envoy-gateway: API gateway that secures and routes traffic into the cluster + ├── postgres-cloud-native: Used to provision a postgres instance + ├── postgres-cloud-native-operator: Operator that manages the lifecycle of postgres instances on the cluster ├── demo-network-policies: K8s deployment for a demo showcasing how to use network policies ├── demo-pod-level-security-groups-strict: K8s deployment for a demo showcasing how to use pod level security groups in strict mode ├── sage-aws-eks: Sage specific EKS cluster for AWS + ├── sage-aws-eks-addons: Sets up additional resources that need to be installed post creation of the EKS cluster ├── sage-aws-k8s-node-autoscaler: K8s node autoscaler using spotinst ocean - └── sage-aws-vpc: Sage specific VPC for AWS + ├── sage-aws-ses: AWS SES (Simple Email Service) setup + ├── sage-aws-vpc: Sage specific VPC for AWS + ├── signoz: SigNoz provides APM, logs, traces, metrics, exceptions, & alerts in a single tool + ├── trivy-operator: K8s deployment for trivy, along with a few supporting charts for security scanning + │ └── templates: Resources used during deployment of these helm charts + └── victoria-metrics: K8s deployment for victoria metrics, a Prometheus-like tool for cluster metric collection + └── templates: Resources used during deployment of these helm charts ``` This root `main.tf` contains all the "Things" that are going to be deployed.
@@ -283,10 +291,27 @@ This document describes the abbreviated process below: "iam:*PolicyVersion", "iam:*OpenIDConnectProvider", "iam:*InstanceProfile", - "iam:ListPolicyVersions" + "iam:ListPolicyVersions", + "iam:ListGroupsForUser", + "iam:ListAttachedUserPolicies" ], "Resource": "*" - } + }, + { + "Effect": "Allow", + "Action": [ + "iam:CreateUser", + "iam:AttachUserPolicy", + "iam:ListPolicies", + "iam:TagUser", + "iam:GetUser", + "iam:DeleteUser", + "iam:CreateAccessKey", + "iam:ListAccessKeys", + "iam:DeleteAccessKey" + ], + "Resource": "arn:aws:iam::{{AWS ACCOUNT ID}}:user/smtp_user" + } ] } ``` diff --git a/deployments/main.tf b/deployments/main.tf index f3ffaca6..f75f13b1 100644 --- a/deployments/main.tf +++ b/deployments/main.tf @@ -69,8 +69,11 @@ module "dpe-sandbox-spacelift-development" { enable_cluster_ingress = true enable_otel_ingress = true - ssl_hostname = "a09a38cc5a8d6497ea69c6bf6318701b-1974793757.us-east-1.elb.amazonaws.com" + ssl_hostname = "dev.sagedpe.org" auth0_jwks_uri = "https://dev-sage-dpe.us.auth0.com/.well-known/jwks.json" + ses_email_identities = ["aws-dpe-dev@sagebase.org"] + # Defines the email address that will be used as the sender of the email alerts + smtp_from = "aws-dpe-dev@sagebase.org" } module "dpe-sandbox-spacelift-production" { @@ -115,4 +118,6 @@ module "dpe-sandbox-spacelift-production" { enable_otel_ingress = false ssl_hostname = "" auth0_jwks_uri = "" + + ses_email_identities = [] } diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index 38674a99..c29b3ce8 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -11,6 +11,7 @@ locals { private_subnet_cidrs_eks_worker_nodes = var.private_subnet_cidrs_eks_worker_nodes azs_eks_control_plane = var.azs_eks_control_plane azs_eks_worker_nodes = var.azs_eks_worker_nodes + ses_email_identities = var.ses_email_identities } k8s_stack_deployments_variables = { @@ -25,12 +26,13 @@ locals { enable_otel_ingress = var.enable_otel_ingress ssl_hostname = var.ssl_hostname auth0_jwks_uri = var.auth0_jwks_uri + smtp_from = var.smtp_from } auth0_stack_variables = { - cluster_name = var.cluster_name - auth0_domain = var.auth0_domain - auth0_clients = var.auth0_clients + cluster_name = var.cluster_name + auth0_domain = var.auth0_domain + auth0_clients = var.auth0_clients } # Variables to be passed from the k8s stack to the deployments stack @@ -39,6 +41,8 @@ locals { private_subnet_ids_eks_worker_nodes = "TF_VAR_private_subnet_ids_eks_worker_nodes" node_security_group_id = "TF_VAR_node_security_group_id" pod_to_node_dns_sg_id = "TF_VAR_pod_to_node_dns_sg_id" + smtp_user = "TF_VAR_smtp_user" + smtp_password = "TF_VAR_smtp_password" } } @@ -250,4 +254,4 @@ resource "spacelift_environment_variable" "auth0-stack-environment-variables" { name = "TF_VAR_${each.key}" value = try(tostring(each.value), jsonencode(each.value)) write_only = false -} \ No newline at end of file +} diff --git a/deployments/spacelift/dpe-k8s/variables.tf b/deployments/spacelift/dpe-k8s/variables.tf index ec63b620..f2fa71c8 100644 --- a/deployments/spacelift/dpe-k8s/variables.tf +++ b/deployments/spacelift/dpe-k8s/variables.tf @@ -180,4 +180,27 @@ variable "auth0_clients" { description = string app_type = string })) -} \ No newline at end of file +} + +variable "ses_email_identities" { + type = list(string) + description = "List of email identities to be added to SES" +} + +variable "smtp_user" { + description = "The SMTP user.
Required if smtp_password and smtp_from are set" + type = string + default = "" +} + +variable "smtp_password" { + description = "The SMTP password. Required if smtp_user and smtp_from are set" + type = string + default = "" +} + +variable "smtp_from" { + description = "The SMTP from address. Required if smtp_user and smtp_password are set" + type = string + default = "" +} diff --git a/deployments/stacks/dpe-auth0/main.tf b/deployments/stacks/dpe-auth0/main.tf index 0cb83e05..780ea89d 100644 --- a/deployments/stacks/dpe-auth0/main.tf +++ b/deployments/stacks/dpe-auth0/main.tf @@ -4,8 +4,11 @@ resource "auth0_resource_server" "k8s-cluster-telemetry" { identifier = "${var.cluster_name}-telemetry" signing_alg = "RS256" - allow_offline_access = false - token_lifetime = 86400 + allow_offline_access = false + # 108000 seconds = 1.25 days + # An offset of 1.25 days allows a daily token refresh to occur by a simple cron job + # for the services that use the token + token_lifetime = 108000 skip_consent_for_verifiable_first_party_clients = true # https://registry.terraform.io/providers/auth0/auth0/latest/docs/resources/resource_server_scopes # Says to use the following, however it errors out: diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 1cbe3d4b..5db44e36 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -89,6 +89,9 @@ module "signoz" { gateway_namespace = "envoy-gateway" cluster_name = var.cluster_name auth0_jwks_uri = var.auth0_jwks_uri + smtp_password = var.smtp_password + smtp_user = var.smtp_user + smtp_from = var.smtp_from } module "envoy-gateway" { @@ -102,11 +105,8 @@ module "envoy-gateway" { git_revision = var.git_revision namespace = "envoy-gateway" argo_deployment_name = "envoy-gateway" - cluster_issuer_name = "selfsigned" - # To determine more elegant ways to fill in these values, for example, if we have - # a pre-defined DNS name for the cluster (https://sagebionetworks.jira.com/browse/IT-3931) - ssl_hostname = var.ssl_hostname - auth0_jwks_uri = var.auth0_jwks_uri + cluster_issuer_name = "lets-encrypt-prod" + ssl_hostname = var.ssl_hostname } module "cert-manager" { diff --git a/deployments/stacks/dpe-k8s-deployments/variables.tf b/deployments/stacks/dpe-k8s-deployments/variables.tf index e12008c0..2b9be26a 100644 --- a/deployments/stacks/dpe-k8s-deployments/variables.tf +++ b/deployments/stacks/dpe-k8s-deployments/variables.tf @@ -85,3 +85,21 @@ variable "auth0_jwks_uri" { description = "The JWKS URI for Auth0" type = string } + +variable "smtp_user" { + description = "The SMTP user. Required if smtp_password and smtp_from are set" + type = string + default = "" +} + +variable "smtp_password" { + description = "The SMTP password. Required if smtp_user and smtp_from are set" + type = string + default = "" +} + +variable "smtp_from" { + description = "The SMTP from address.
Required if smtp_user and smtp_password are set" + type = string + default = "" +} diff --git a/deployments/stacks/dpe-k8s/main.tf b/deployments/stacks/dpe-k8s/main.tf index d9f9cf69..17c12f0d 100644 --- a/deployments/stacks/dpe-k8s/main.tf +++ b/deployments/stacks/dpe-k8s/main.tf @@ -36,3 +36,9 @@ module "sage-aws-eks" { private_subnet_ids_eks_control_plane = module.sage-aws-vpc.private_subnet_ids_eks_control_plane private_subnet_ids_eks_worker_nodes = module.sage-aws-vpc.private_subnet_ids_eks_worker_nodes } + +module "sage-aws-ses" { + source = "../../../modules/sage-aws-ses" + + email_identities = var.ses_email_identities +} diff --git a/deployments/stacks/dpe-k8s/outputs.tf b/deployments/stacks/dpe-k8s/outputs.tf index 19418d1c..4a062261 100644 --- a/deployments/stacks/dpe-k8s/outputs.tf +++ b/deployments/stacks/dpe-k8s/outputs.tf @@ -37,3 +37,12 @@ output "region" { output "cluster_name" { value = module.sage-aws-eks.cluster_name } + +output "smtp_user" { + value = module.sage-aws-ses.smtp_user +} + +output "smtp_password" { + sensitive = true + value = module.sage-aws-ses.smtp_password +} \ No newline at end of file diff --git a/deployments/stacks/dpe-k8s/variables.tf b/deployments/stacks/dpe-k8s/variables.tf index 6edc1cc1..9054a549 100644 --- a/deployments/stacks/dpe-k8s/variables.tf +++ b/deployments/stacks/dpe-k8s/variables.tf @@ -54,3 +54,8 @@ variable "azs_eks_worker_nodes" { type = list(string) description = "Availability Zones for the EKS worker nodes" } + +variable "ses_email_identities" { + type = list(string) + description = "List of email identities to be added to SES" +} diff --git a/modules/apache-airflow/README.md b/modules/apache-airflow/README.md index 736303cf..96e4d7df 100644 --- a/modules/apache-airflow/README.md +++ b/modules/apache-airflow/README.md @@ -65,4 +65,20 @@ YAML ## Accessing the web UI An `admin` user is created for airflow via the `airflow-admin-user-secret` secret that is added to the namespace. Decode the base64 encoded password/username and use it for -the UI. \ No newline at end of file +the UI. + +## Building a new image for airflow +The deployment of our airflow instance depends on a custom Apache Airflow image being +built and pushed to a publicly available GHCR URL. The image is created from the +`orca-recipes` git repo (see the consolidated command sketch at the end of this section): + +1. Update the dockerfile within the orca-recipes repo +2. Build the new image `docker build .` +3. Tag the built image with the tag you want to use `docker tag sha256:... ghcr.io/sage-bionetworks-workflows/orca-recipes:0.0.1` +4. Push to GHCR `docker push ghcr.io/sage-bionetworks-workflows/orca-recipes:0.0.1` (May require an admin of the repo to push this) +5. Update the `values.yaml` file in this `modules/apache-airflow/templates` directory. + +Transitive dependencies may also need to be updated when building a new image for +airflow, for example `py-orca` was updated in this example PR: . +Additionally, this PR covers what was completed in order to update the +requirements/dockerfile: .
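+
+As a consolidated sketch, the flow above looks like the following (the
+`sha256:<image-id>` placeholder and the `0.0.1` tag are examples; substitute the
+image ID reported by your build and the tag you intend to release):
+
+```
+# Run from the root of the orca-recipes repo after updating the Dockerfile
+docker build .
+# Tag the image ID that the build reports
+docker tag sha256:<image-id> ghcr.io/sage-bionetworks-workflows/orca-recipes:0.0.1
+# Push to GHCR (may require an admin of the repo)
+docker push ghcr.io/sage-bionetworks-workflows/orca-recipes:0.0.1
+```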
diff --git a/modules/apache-airflow/main.tf b/modules/apache-airflow/main.tf index 3ed255a5..81c219ec 100644 --- a/modules/apache-airflow/main.tf +++ b/modules/apache-airflow/main.tf @@ -66,7 +66,7 @@ spec: sources: - repoURL: 'https://airflow.apache.org' chart: airflow - targetRevision: 1.11.0 + targetRevision: 1.15.0 helm: releaseName: airflow valueFiles: diff --git a/modules/apache-airflow/templates/values.yaml b/modules/apache-airflow/templates/values.yaml index da1c2df8..f4e3eaa9 100644 --- a/modules/apache-airflow/templates/values.yaml +++ b/modules/apache-airflow/templates/values.yaml @@ -25,6 +25,20 @@ fullnameOverride: "" # Provide a name to substitute for the name of the chart nameOverride: "" +# Use standard naming for all resources using airflow.fullname template +# Consider removing this later and default it to true +# to make this chart follow standard naming conventions using the fullname template. +# For now this is an opt-in switch for backwards compatibility to leverage the standard naming convention +# and being able to use fully fullnameOverride and nameOverride in all resources +# For new installations - it is recommended to set it to True to follow standard naming conventions +# For existing installations, this will rename and redeploy your resources with the new names. Be aware that +# this will recreate your deployment/statefulsets along with their persistent volume claims and data storage +# migration may be needed to keep your old data +# +# Note:fernet-key,redis-password and broker-url secrets don't use this logic yet, +# as this may break existing installations due to how they get installed via pre-install hook. +useStandardNaming: false + # Max number of old replicasets to retain. Can be overridden by each deployment's revisionHistoryLimit revisionHistoryLimit: ~ @@ -43,21 +57,24 @@ securityContexts: pod: {} containers: {} +# Global container lifecycle hooks for airflow containers +containerLifecycleHooks: {} + # Airflow home directory # Used for mount paths airflowHome: /opt/airflow # Default airflow repository -- overridden by all the specific images below -defaultAirflowRepository: bfaublesage/airflow +defaultAirflowRepository: ghcr.io/sage-bionetworks-workflows/orca-recipes # Default airflow tag to deploy -defaultAirflowTag: "2.7.1-python-3.10" +defaultAirflowTag: "0.1.0" # Default airflow digest. If specified, it takes precedence over tag defaultAirflowDigest: ~ # Airflow version (Used to make some decisions based on Airflow Version being deployed) -airflowVersion: "2.7.1" +airflowVersion: "2.9.3" # Images images: @@ -88,23 +105,25 @@ images: pullPolicy: IfNotPresent statsd: repository: quay.io/prometheus/statsd-exporter - tag: v0.22.8 + tag: v0.26.1 pullPolicy: IfNotPresent redis: repository: redis - tag: 7-bullseye + # Redis is limited to 7.2-bookworm due to licencing change + # https://redis.io/blog/redis-adopts-dual-source-available-licensing/ + tag: 7.2-bookworm pullPolicy: IfNotPresent pgbouncer: repository: apache/airflow - tag: airflow-pgbouncer-2023.02.24-1.16.1 + tag: airflow-pgbouncer-2024.01.19-1.21.0 pullPolicy: IfNotPresent pgbouncerExporter: repository: apache/airflow - tag: airflow-pgbouncer-exporter-2023.02.21-0.14.0 + tag: airflow-pgbouncer-exporter-2024.06.18-0.17.0 pullPolicy: IfNotPresent gitSync: repository: registry.k8s.io/git-sync/git-sync - tag: v3.6.3 + tag: v4.1.0 pullPolicy: IfNotPresent # Select certain nodes for airflow pods. 
@@ -114,6 +133,7 @@ nodeSelector: { affinity: {} tolerations: [] topologySpreadConstraints: [] +schedulerName: ~ # Add common labels to all objects and pods defined in this chart. labels: {} @@ -142,6 +162,7 @@ ingress: # The hostnames or hosts configuration for the web Ingress hosts: [] + # # The hostname for the web Ingress (can be templated) # - name: "" # # configs for web Ingress TLS # tls: @@ -185,6 +206,7 @@ ingress: # The hostnames or hosts configuration for the flower Ingress hosts: [] + # # The hostname for the flower Ingress (can be templated) # - name: "" # tls: # # Enable TLS termination for the flower Ingress @@ -225,7 +247,8 @@ airflowLocalSettings: |- UIAlert( 'Usage of a dynamic webserver secret key detected. We recommend a static webserver secret key instead.' ' See the ' + '"https://airflow.apache.org/docs/helm-chart/stable/production-guide.html#webserver-secret-key" ' + 'target="_blank" rel="noopener noreferrer">' 'Helm Chart Production Guide for more details.', category="warning", roles=["Admin"], @@ -253,6 +276,8 @@ allowPodLaunching: true # Environment variables for all airflow containers env: [] +# - name: "" +# value: "" # Volumes for all airflow containers volumes: [] @@ -319,6 +344,11 @@ extraSecrets: {} # '{{ .Release.Name }}-other-secret-name-suffix': # data: | # ... +# 'proxy-config': +# stringData: | +# HTTP_PROXY: http://proxy_user:proxy_password@192.168.0.10:2080 +# HTTPS_PROXY: http://proxy_user:proxy_password@192.168.0.10:2080 +# NO_PROXY: "localhost,127.0.0.1,.svc.cluster.local,kubernetes.default.svc" # Extra ConfigMaps that will be managed by the chart # (You can use them with extraEnv or extraEnvFrom or some of the extraVolumes values). @@ -513,6 +543,9 @@ workers: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -569,9 +602,36 @@ workers: # This configuration will be ignored if PGBouncer is not enabled usePgbouncer: true + # Allow HPA (KEDA must be disabled). + hpa: + enabled: false + + # Minimum number of workers created by HPA + minReplicaCount: 0 + + # Maximum number of workers created by HPA + maxReplicaCount: 5 + + # Specifications for which to use to calculate the desired replica count + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + + # Scaling behavior of the target in both Up and Down directions + behavior: {} + persistence: # Enable persistent volumes enabled: true + # This policy determines whether PVCs should be deleted when StatefulSet is scaled down or removed. 
+ persistentVolumeClaimRetentionPolicy: ~ + # persistentVolumeClaimRetentionPolicy: + # whenDeleted: Delete + # whenScaled: Delete # Volume size for worker StatefulSet size: 30Gi # If using a custom storageClass, pass name ref to all statefulSets here @@ -585,6 +645,8 @@ workers: # Detailed default security context for persistence for container level securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} kerberosSidecar: # Enable kerberos sidecar @@ -599,6 +661,20 @@ workers: # Detailed default security context for kerberosSidecar for container level securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + + kerberosInitContainer: + # Enable kerberos init container + enabled: false + resources: {} + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + resources: {} # limits: @@ -613,13 +689,13 @@ workers: # This setting tells kubernetes that its ok to evict # when it wants to scale a node down. - safeToEvict: true + safeToEvict: false - # Launch additional containers into worker. + # Launch additional containers into worker (templated). # Note: If used with KubernetesExecutor, you are responsible for signaling sidecars to exit when the main # container finishes so Airflow can continue the worker shutdown process! extraContainers: [] - # Add additional init containers into workers. + # Add additional init containers into workers (templated). extraInitContainers: [] # Mount additional volumes into worker. It can be templated like in the following example: @@ -639,7 +715,8 @@ workers: # Select certain nodes for airflow worker pods. nodeSelector: { - spotinst.io/node-lifecycle: "od" + spotinst.io/node-lifecycle: "od", + topology.kubernetes.io/zone: "us-east-1a" } runtimeClassName: ~ priorityClassName: ~ @@ -704,8 +781,34 @@ workers: env: [] + volumeClaimTemplates: [] + # Additional volumeClaimTemplates needed. + # Comment out the above and uncomment the section below to enable it. + # Add more as needed + # Make sure to mount it under extraVolumeMounts. + # volumeClaimTemplates: + # - metadata: + # name: data-volume-1 + # spec: + # storageClassName: "storage-class-1" + # accessModes: + # - "ReadWriteOnce" + # resources: + # requests: + # storage: "10Gi" + # - metadata: + # name: data-volume-2 + # spec: + # storageClassName: "storage-class-2" + # accessModes: + # - "ReadWriteOnce" + # resources: + # requests: + # storage: "20Gi" + # Airflow scheduler settings scheduler: + enabled: true # hostAliases for the scheduler pod hostAliases: [] # - ip: "127.0.0.1" @@ -723,6 +826,15 @@ scheduler: failureThreshold: 5 periodSeconds: 60 command: ~ + + # Wait for at most 1 minute (6*10s) for the scheduler container to startup. + # livenessProbe kicks in after the first successful startupProbe + startupProbe: + failureThreshold: 6 + periodSeconds: 10 + timeoutSeconds: 20 + command: ~ + # Airflow 2.0 allows users to run multiple schedulers, # However this feature is only recommended for MySQL 8+ and Postgres replicas: 1 @@ -753,6 +865,9 @@ scheduler: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -789,9 +904,9 @@ scheduler: # when it wants to scale a node down. safeToEvict: true - # Launch additional containers into scheduler. + # Launch additional containers into scheduler (templated). extraContainers: [] - # Add additional init containers into scheduler. 
+ # Add additional init containers into scheduler (templated). extraInitContainers: [] # Mount additional volumes into scheduler. It can be templated like in the following example: @@ -811,7 +926,8 @@ scheduler: # Select certain nodes for airflow scheduler pods. nodeSelector: { - spotinst.io/node-lifecycle: "od" + spotinst.io/node-lifecycle: "od", + topology.kubernetes.io/zone: "us-east-1a" } affinity: # default scheduler affinity is: @@ -855,6 +971,8 @@ scheduler: # Detailed default security context for logGroomerSidecar for container level securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} waitForMigrations: # Whether to create init container to wait for db migrations @@ -915,6 +1033,9 @@ createUserJob: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -932,6 +1053,9 @@ createUserJob: # Launch additional containers into user creation job extraContainers: [] + # Add additional init containers into user creation job (templated). + extraInitContainers: [] + # Mount additional volumes into user creation job. It can be templated like in the following example: # extraVolumes: # - name: my-templated-extra-volume @@ -951,6 +1075,7 @@ createUserJob: affinity: {} tolerations: [] topologySpreadConstraints: [] + priorityClassName: ~ # In case you need to disable the helm hooks that create the jobs after install. # Disable this if you are using ArgoCD for example useHelmHooks: false @@ -977,10 +1102,12 @@ migrateDatabaseJob: args: - "bash" - "-c" - # The format below is necessary to get `helm lint` happy - - |- + - >- exec \ - airflow {{ semverCompare ">=2.0.0" .Values.airflowVersion | ternary "db upgrade" "upgradedb" }} + + airflow {{ semverCompare ">=2.7.0" .Values.airflowVersion + | ternary "db migrate" (semverCompare ">=2.0.0" .Values.airflowVersion + | ternary "db upgrade" "upgradedb") }} # Annotations on the database migration pod annotations: {} @@ -989,6 +1116,9 @@ migrateDatabaseJob: argocd.argoproj.io/hook: Sync argocd.argoproj.io/hook-delete-policy: HookSucceeded + # Labels specific to migrate database job objects and pods + labels: {} + # When not set, the values defined in the global securityContext will be used securityContext: {} # runAsUser: 50000 @@ -1000,6 +1130,9 @@ migrateDatabaseJob: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -1025,6 +1158,9 @@ migrateDatabaseJob: # Launch additional containers into database migration job extraContainers: [] + # Add additional init containers into migrate database job (templated). + extraInitContainers: [] + # Mount additional volumes into database migration job. It can be templated like in the following example: # extraVolumes: # - name: my-templated-extra-volume @@ -1044,13 +1180,121 @@ migrateDatabaseJob: affinity: {} tolerations: [] topologySpreadConstraints: [] + priorityClassName: ~ # In case you need to disable the helm hooks that create the jobs after install. # Disable this if you are using ArgoCD for example useHelmHooks: false applyCustomEnv: true +# rpcServer support is experimental / dev purpose only and will later be renamed +_rpcServer: + enabled: false + + # Labels specific to workers objects and pods + labels: {} + + # Command to use when running the Airflow rpc server (templated). + command: + - "bash" + # Args to use when running the Airflow rpc server (templated). 
+ args: ["-c", "exec airflow internal-api"] + env: [] + serviceAccount: + # default value is true + # ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ + automountServiceAccountToken: true + # Specifies whether a ServiceAccount should be created + create: true + # The name of the ServiceAccount to use. + # If not set and create is true, a name is generated using the release name + name: ~ + + # Annotations to add to webserver kubernetes service account. + annotations: {} + service: + type: ClusterIP + ## service annotations + annotations: {} + ports: + - name: rpc-server + port: "{{ .Values.ports._rpcServer }}" + + loadBalancerIP: ~ + ## Limit load balancer source ips to list of CIDRs + # loadBalancerSourceRanges: + # - "10.123.0.0/16" + loadBalancerSourceRanges: [] + + podDisruptionBudget: + enabled: false + + # PDB configuration + config: + # minAvailable and maxUnavailable are mutually exclusive + maxUnavailable: 1 + # minAvailable: 1 + + # Detailed default security contexts for webserver deployments for container and pod level + securityContexts: + pod: {} + container: {} + + waitForMigrations: + # Whether to create init container to wait for db migrations + enabled: true + env: [] + # Detailed default security context for waitForMigrations for container level + securityContexts: + container: {} + + # Launch additional containers into the flower pods. + extraContainers: [] + + # Additional network policies as needed (Deprecated - renamed to `webserver.networkPolicy.ingress.from`) + extraNetworkPolicies: [] + networkPolicy: + ingress: + # Peers for webserver NetworkPolicy ingress + from: [] + # Ports for webserver NetworkPolicy ingress (if `from` is set) + ports: + - port: "{{ .Values.ports._rpcServer }}" + + resources: {} + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + + livenessProbe: + initialDelaySeconds: 15 + timeoutSeconds: 5 + failureThreshold: 5 + periodSeconds: 10 + scheme: HTTP + + readinessProbe: + initialDelaySeconds: 15 + timeoutSeconds: 5 + failureThreshold: 5 + periodSeconds: 10 + scheme: HTTP + + # Wait for at most 1 minute (6*10s) for the RPC server container to startup. + # livenessProbe kicks in after the first successful startupProbe + startupProbe: + timeoutSeconds: 20 + failureThreshold: 6 + periodSeconds: 10 + scheme: HTTP + # Airflow webserver settings webserver: + enabled: true + # Add custom annotations to the webserver configmap + configMapAnnotations: {} # hostAliases for the webserver pod hostAliases: [] # - ip: "127.0.0.1" @@ -1074,6 +1318,14 @@ webserver: periodSeconds: 10 scheme: HTTP + # Wait for at most 1 minute (6*10s) for the webserver container to startup. + # livenessProbe kicks in after the first successful startupProbe + startupProbe: + timeoutSeconds: 20 + failureThreshold: 6 + periodSeconds: 10 + scheme: HTTP + # Number of webservers replicas: 1 # Max number of old replicasets to retain @@ -1123,6 +1375,9 @@ webserver: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Additional network policies as needed (Deprecated - renamed to `webserver.networkPolicy.ingress.from`) extraNetworkPolicies: [] networkPolicy: @@ -1151,9 +1406,9 @@ webserver: lastName: user password: admin # This is randomized during install - # Launch additional containers into webserver. + # Launch additional containers into webserver (templated). extraContainers: [] - # Add additional init containers into webserver. 
+ # Add additional init containers into webserver (templated). extraInitContainers: [] # Mount additional volumes into webserver. It can be templated like in the following example: @@ -1304,9 +1559,15 @@ triggerer: securityContexts: pod: {} container: {} + + # container level lifecycle hooks + containerLifecycleHooks: {} + persistence: # Enable persistent volumes enabled: true + # This policy determines whether PVCs should be deleted when StatefulSet is scaled down or removed. + persistentVolumeClaimRetentionPolicy: ~ # Volume size for triggerer StatefulSet size: 30Gi # If using a custom storageClass, pass name ref to all statefulSets here @@ -1333,9 +1594,9 @@ triggerer: # when it wants to scale a node down. safeToEvict: true - # Launch additional containers into triggerer. + # Launch additional containers into triggerer (templated). extraContainers: [] - # Add additional init containers into triggerers. + # Add additional init containers into triggerers (templated). extraInitContainers: [] # Mount additional volumes into triggerer. It can be templated like in the following example: @@ -1355,7 +1616,8 @@ triggerer: # Select certain nodes for airflow triggerer pods. nodeSelector: { - spotinst.io/node-lifecycle: "od" + spotinst.io/node-lifecycle: "od", + topology.kubernetes.io/zone: "us-east-1a" } affinity: # default triggerer affinity is: @@ -1400,6 +1662,9 @@ triggerer: securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + waitForMigrations: # Whether to create init container to wait for db migrations enabled: true @@ -1410,6 +1675,44 @@ triggerer: env: [] + # Allow KEDA autoscaling. + keda: + enabled: false + namespaceLabels: {} + + # How often KEDA polls the airflow DB to report new scale requests to the HPA + pollingInterval: 5 + + # How many seconds KEDA will wait before scaling to zero. + # Note that HPA has a separate cooldown period for scale-downs + cooldownPeriod: 30 + + # Minimum number of triggerers created by keda + minReplicaCount: 0 + + # Maximum number of triggerers created by keda + maxReplicaCount: 10 + + # Specify HPA related options + advanced: {} + # horizontalPodAutoscalerConfig: + # behavior: + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Percent + # value: 100 + # periodSeconds: 15 + + # Query to use for KEDA autoscaling. Must return a single integer. + query: >- + SELECT ceil(COUNT(*)::decimal / {{ .Values.config.triggerer.default_capacity }}) + FROM trigger + + # Whether to use PGBouncer to connect to the database or not when it is enabled + # This configuration will be ignored if PGBouncer is not enabled + usePgbouncer: false + # Airflow Dag Processor Config dagProcessor: enabled: false @@ -1463,6 +1766,9 @@ dagProcessor: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + resources: {} # limits: # cpu: 100m @@ -1478,9 +1784,9 @@ dagProcessor: # when it wants to scale a node down. safeToEvict: true - # Launch additional containers into dag processor. + # Launch additional containers into dag processor (templated). extraContainers: [] - # Add additional init containers into dag processors. + # Add additional init containers into dag processors (templated). extraInitContainers: [] # Mount additional volumes into dag processor. 
It can be templated like in the following example: @@ -1536,11 +1842,16 @@ dagProcessor: # requests: # cpu: 100m # memory: 128Mi + securityContexts: + container: {} waitForMigrations: # Whether to create init container to wait for db migrations enabled: true env: [] + # Detailed default security context for waitForMigrations for container level + securityContexts: + container: {} env: [] @@ -1549,6 +1860,19 @@ flower: # Enable flower. # If True, and using CeleryExecutor/CeleryKubernetesExecutor, will deploy flower app. enabled: false + + livenessProbe: + initialDelaySeconds: 10 + timeoutSeconds: 5 + failureThreshold: 10 + periodSeconds: 5 + + readinessProbe: + initialDelaySeconds: 10 + timeoutSeconds: 5 + failureThreshold: 10 + periodSeconds: 5 + # Max number of old replicasets to retain revisionHistoryLimit: ~ @@ -1592,6 +1916,9 @@ flower: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -1667,6 +1994,9 @@ flower: # StatsD settings statsd: + # Add custom annotations to the statsd configmap + configMapAnnotations: {} + enabled: false # Max number of old replicasets to retain revisionHistoryLimit: ~ @@ -1705,6 +2035,9 @@ statsd: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Additional network policies as needed extraNetworkPolicies: [] resources: {} @@ -1737,6 +2070,7 @@ statsd: overrideMappings: [] podAnnotations: {} + env: [] # PgBouncer settings pgbouncer: @@ -1750,7 +2084,7 @@ pgbouncer: command: ["pgbouncer", "-u", "nobody", "/etc/pgbouncer/pgbouncer.ini"] # Args to use for PgBouncer(templated). args: ~ - auth_type: md5 + auth_type: scram-sha-256 auth_file: /etc/pgbouncer/users.txt # annotations to be added to the PgBouncer deployment @@ -1861,6 +2195,9 @@ pgbouncer: extraVolumes: [] extraVolumeMounts: [] + # Launch additional containers into pgbouncer. + extraContainers: [] + # Select certain nodes for PgBouncer pods. nodeSelector: {} affinity: {} @@ -1876,6 +2213,13 @@ pgbouncer: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: + preStop: + exec: + # Allow existing queries clients to complete within 120 seconds + command: ["/bin/sh", "-c", "killall -INT pgbouncer && sleep 120"] + metricsExporterSidecar: resources: {} # limits: @@ -1886,10 +2230,31 @@ pgbouncer: # memory: 128Mi sslmode: "disable" + # supply the name of existing secret with PGBouncer connection URI containing + # stats user and password. + # you can load them to a k8s secret like the one below + # apiVersion: v1 + # kind: Secret + # metadata: + # name: pgbouncer-stats-secret + # data: + # connection: postgresql://:@127.0.0.1:6543/pgbouncer? 
+ # type: Opaque + # + # statsSecretName: pgbouncer-stats-secret + # + statsSecretName: ~ + + # Key containing the PGBouncer connection URI, defaults to `connection` if not defined + statsSecretKey: ~ + # Detailed default security context for metricsExporterSidecar for container level securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + livenessProbe: initialDelaySeconds: 10 periodSeconds: 10 @@ -1900,11 +2265,17 @@ pgbouncer: periodSeconds: 10 timeoutSeconds: 1 + # Environment variables to add to pgbouncer container + env: [] + # Configuration for the redis provisioned by the chart redis: enabled: true terminationGracePeriodSeconds: 600 + # Annotations for Redis Statefulset + annotations: {} + # Create ServiceAccount serviceAccount: # default value is true @@ -1929,6 +2300,11 @@ redis: # Annotations to add to redis volumes annotations: {} + # Configuration for empty dir volume (if redis.persistence.enabled == false) + # emptyDirConfig: + # sizeLimit: 1Gi + # medium: Memory + resources: {} # limits: # cpu: 100m @@ -1951,11 +2327,13 @@ redis: # Select certain nodes for redis pods. nodeSelector: { - spotinst.io/node-lifecycle: "od" + spotinst.io/node-lifecycle: "od", + topology.kubernetes.io/zone: "us-east-1a" } affinity: {} tolerations: [] topologySpreadConstraints: [] + priorityClassName: ~ # Set to 0 for backwards-compatiblity uid: 0 @@ -1969,6 +2347,9 @@ redis: pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + podAnnotations: {} # Auth secret for a private registry # This is used if pulling airflow images from a private registry @@ -1992,6 +2373,7 @@ elasticsearch: # Or an object representing the connection # Example: # connection: + # scheme: ~ # user: ~ # pass: ~ # host: ~ @@ -2009,6 +2391,8 @@ ports: statsdScrape: 9102 pgbouncer: 6543 pgbouncerScrape: 9127 + # rpcServer support is experimental / dev purpose only and will later be renamed + _rpcServer: 9080 # Define any ResourceQuotas for namespace quotas: {} @@ -2019,7 +2403,7 @@ limits: [] # This runs as a CronJob to cleanup old pods. cleanup: enabled: false - # Run every 15 minutes (templated). + # Run every 60 minutes (templated). schedule: "*/60 * * * *" # To select a random-ish, deterministic starting minute between 3 and 12 inclusive for each release: # '{{- add 3 (regexFind ".$" (adler32sum .Release.Name)) -}}-59/15 * * * *' @@ -2039,6 +2423,7 @@ cleanup: affinity: {} tolerations: [] topologySpreadConstraints: [] + priorityClassName: ~ podAnnotations: {} @@ -2075,8 +2460,12 @@ cleanup: # Detailed default security context for cleanup for container level securityContexts: + pod: {} container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Specify history limit # When set, overwrite the default k8s number of successful and failed CronJob executions that are saved. 
failedJobsHistoryLimit: ~ @@ -2086,8 +2475,6 @@ cleanup: # Not recommended for production postgresql: enabled: false - image: - tag: "11" auth: enablePostgresUser: true postgresPassword: postgres @@ -2113,7 +2500,7 @@ config: # For Airflow 1.10, backward compatibility; moved to [logging] in 2.0 colored_console_log: 'False' remote_logging: '{{- ternary "True" "False" .Values.elasticsearch.enabled }}' - allowed_deserialization_classes: ".*" + allowed_deserialization_classes_regexp: ".*" logging: remote_logging: '{{- ternary "True" "False" .Values.elasticsearch.enabled }}' colored_console_log: 'False' @@ -2121,13 +2508,13 @@ config: statsd_on: '{{ ternary "True" "False" .Values.statsd.enabled }}' statsd_port: 9125 statsd_prefix: airflow - statsd_host: '{{ printf "%s-statsd" .Release.Name }}' + statsd_host: '{{ printf "%s-statsd" (include "airflow.fullname" .) }}' webserver: enable_proxy_fix: 'True' # For Airflow 1.10 rbac: 'True' celery: - flower_url_prefix: '{{ .Values.ingress.flower.path }}' + flower_url_prefix: '{{ ternary "" .Values.ingress.flower.path (eq .Values.ingress.flower.path "/") }}' worker_concurrency: 16 scheduler: standalone_dag_processor: '{{ ternary "True" "False" .Values.dagProcessor.enabled }}' @@ -2135,7 +2522,7 @@ config: statsd_on: '{{ ternary "True" "False" .Values.statsd.enabled }}' statsd_port: 9125 statsd_prefix: airflow - statsd_host: '{{ printf "%s-statsd" .Release.Name }}' + statsd_host: '{{ printf "%s-statsd" (include "airflow.fullname" .) }}' # `run_duration` included for Airflow 1.10 backward compatibility; removed in 2.0. run_duration: 41460 elasticsearch: @@ -2173,6 +2560,8 @@ config: secrets: backend: airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend backend_kwargs: '{"connections_prefix": "airflow/connections", "variables_prefix": "airflow/variables", "region_name": "us-east-1"}' + triggerer: + default_capacity: 1000 # yamllint enable rule:line-length # Whether Airflow can launch workers and/or pods in multiple namespaces @@ -2206,6 +2595,9 @@ podTemplate: ~ # Git sync dags: + # Where dags volume will be mounted. Works for both persistence and gitSync. + # If not specified, dags mount path will be set to $AIRFLOW_HOME/dags + mountPath: ~ persistence: # Annotations for dags PVC annotations: {} @@ -2230,6 +2622,8 @@ dags: repo: https://github.com/Sage-Bionetworks-Workflows/orca-recipes branch: main rev: HEAD + # The git revision (branch, tag, or hash) to check out, v4 only + ref: v2-2-stable depth: 1 # the number of consecutive failures allowed before aborting maxFailures: 0 @@ -2244,8 +2638,12 @@ dags: # metadata: # name: git-credentials # data: + # # For git-sync v3 # GIT_SYNC_USERNAME: # GIT_SYNC_PASSWORD: + # # For git-sync v4 + # GITSYNC_USERNAME: + # GITSYNC_PASSWORD: # and specify the name of the secret below # # credentialsSecret: git-credentials @@ -2264,6 +2662,12 @@ dags: # and specify the name of the secret below # sshKeySecret: airflow-ssh-secret # + # Or set sshKeySecret with your key + # sshKey: |- + # -----BEGIN {OPENSSH PRIVATE KEY}----- + # ... + # -----END {OPENSSH PRIVATE KEY}----- + # # If you are using an ssh private key, you can additionally # specify the content of your known_hosts file, example: # @@ -2274,7 +2678,16 @@ dags: # interval between git sync attempts in seconds # high values are more likely to cause DAGs to become out of sync between different components # low values cause more traffic to the remote git repository + # Go-style duration string (e.g. "100ms" or "0.1s" = 100ms). 
+ # For backwards compatibility, wait will be used if it is specified. + period: 5s wait: 600 + # add variables from secret into gitSync containers, such proxy-config + envFrom: ~ + # envFrom: | + # - secretRef: + # name: 'proxy-config' + containerName: git-sync uid: 65533 @@ -2286,6 +2699,9 @@ dags: securityContexts: container: {} + # container level lifecycle hooks + containerLifecycleHooks: {} + # Mount additional volumes into git-sync. It can be templated like in the following example: # extraVolumeMounts: # - name: my-templated-extra-volume @@ -2297,6 +2713,11 @@ dags: # - name: "" # value: "" + # Configuration for empty dir volume + # emptyDirConfig: + # sizeLimit: 1Gi + # medium: Memory + resources: {} # limits: # cpu: 100m @@ -2306,6 +2727,11 @@ dags: # memory: 128Mi logs: + # Configuration for empty dir volume (if logs.persistence.enabled == false) + # emptyDirConfig: + # sizeLimit: 1Gi + # medium: Memory + persistence: # Enable persistent volume for storing logs enabled: false @@ -2317,3 +2743,4 @@ logs: storageClassName: gp3 ## the name of an existing PVC to use existingClaim: + diff --git a/modules/envoy-gateway/main.tf b/modules/envoy-gateway/main.tf index e35d05e2..25b2c7ff 100644 --- a/modules/envoy-gateway/main.tf +++ b/modules/envoy-gateway/main.tf @@ -63,15 +63,6 @@ spec: - op: replace path: /metadata/name value: ${var.cluster_issuer_name} - - target: - kind: SecurityPolicy - patch: |- - - op: replace - path: /spec/jwt/providers - value: - - name: auth0 - remoteJWKS: - uri: ${var.auth0_jwks_uri} destination: server: 'https://kubernetes.default.svc' namespace: ${var.namespace} diff --git a/modules/envoy-gateway/resources/cert-issuer.yaml b/modules/envoy-gateway/resources/cert-issuer.yaml index 1d4ebc9a..4608c97a 100644 --- a/modules/envoy-gateway/resources/cert-issuer.yaml +++ b/modules/envoy-gateway/resources/cert-issuer.yaml @@ -1,25 +1,18 @@ # To implement using something like letsencrypt -# apiVersion: cert-manager.io/v1 -# kind: ClusterIssuer -# metadata: -# name: letsencrypt-staging -# spec: -# acme: -# server: https://acme-staging-v02.api.letsencrypt.org/directory -# email: "bryan.fauble@sagebase.org" -# privateKeySecretRef: -# name: letsencrypt-staging-account-key -# solvers: -# - http01: -# gatewayHTTPRoute: -# parentRefs: -# - kind: Gateway -# name: eg -# namespace: envoy-gateway ---- apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: - name: selfsigned + name: lets-encrypt-prod spec: - selfSigned: {} + acme: + server: https://acme-v02.api.letsencrypt.org/directory + email: "dpe@sagebase.org" + privateKeySecretRef: + name: letsencrypt-prod-account-key + solvers: + - http01: + gatewayHTTPRoute: + parentRefs: + - kind: Gateway + name: eg + namespace: envoy-gateway diff --git a/modules/envoy-gateway/resources/http-to-https-redirect.yaml b/modules/envoy-gateway/resources/http-to-https-redirect.yaml new file mode 100644 index 00000000..73dc9836 --- /dev/null +++ b/modules/envoy-gateway/resources/http-to-https-redirect.yaml @@ -0,0 +1,16 @@ +# Upgrades HTTP requests to HTTPS +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: tls-redirect +spec: + parentRefs: + - name: eg + sectionName: http + hostnames: + - "*.sagedpe.org" + rules: + - filters: + - type: RequestRedirect + requestRedirect: + scheme: https \ No newline at end of file diff --git a/modules/envoy-gateway/resources/kustomization.yaml b/modules/envoy-gateway/resources/kustomization.yaml index e72e3f85..cea76074 100644 --- 
a/modules/envoy-gateway/resources/kustomization.yaml +++ b/modules/envoy-gateway/resources/kustomization.yaml @@ -6,4 +6,4 @@ resources: - envoy-proxy.yaml - gateway.yaml - traffic-policy.yaml -- security-policy.yaml +- http-to-https-redirect.yaml diff --git a/modules/envoy-gateway/resources/security-policy.yaml b/modules/envoy-gateway/resources/security-policy.yaml deleted file mode 100644 index 40f6d384..00000000 --- a/modules/envoy-gateway/resources/security-policy.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: SecurityPolicy -metadata: - name: require-jwt-at-gateway -spec: - targetRef: - group: gateway.networking.k8s.io - kind: Gateway - name: eg - jwt: - providers: - # - name: auth0 - # remoteJWKS: - # uri: \ No newline at end of file diff --git a/modules/envoy-gateway/variables.tf b/modules/envoy-gateway/variables.tf index 9a006ee8..03f078cd 100644 --- a/modules/envoy-gateway/variables.tf +++ b/modules/envoy-gateway/variables.tf @@ -35,8 +35,3 @@ variable "ssl_hostname" { description = "The hostname to use for the SSL certificate" type = string } - -variable "auth0_jwks_uri" { - description = "The JWKS URI for Auth0" - type = string -} \ No newline at end of file diff --git a/modules/sage-aws-ses/README.md b/modules/sage-aws-ses/README.md new file mode 100644 index 00000000..47561e34 --- /dev/null +++ b/modules/sage-aws-ses/README.md @@ -0,0 +1,35 @@ +# Purpose +This module is used to set up SES (Simple Email Service) in AWS. + +By setting a few variables we are able to add a number of email identities +to AWS SES. The variables to be set are: + +- `email_identities`, example: `["example@sagebase.org"]` + +# Manual steps required +After running this module, a number of manual steps are required, as they are external +processes that Terraform cannot complete on its own: + +## Verify Email address +1) Navigate to Amazon SES in the web console +2) Navigate to `identities` +3) Choose the identity to verify +4) Click the verification link in the email that is sent to that address + +Optional: Send a test email after verifying to confirm that the address can receive emails + +# Request production access +After creating the AWS SES settings for the first time you will be in "Sandbox" mode. In order +to request production access, follow this document: +under the section "To request that your account be removed from the Amazon SES sandbox using the AWS CLI".
+ +The command will look something like: + +``` +aws sesv2 put-account-details \ +--production-access-enabled \ +--mail-type TRANSACTIONAL \ +--website-url https://www.synapse.org/ \ +--additional-contact-email-addresses dpe@sagebase.org \ +--contact-language EN +``` diff --git a/modules/sage-aws-ses/data.tf b/modules/sage-aws-ses/data.tf new file mode 100644 index 00000000..3fe7d177 --- /dev/null +++ b/modules/sage-aws-ses/data.tf @@ -0,0 +1,6 @@ +data "aws_iam_policy_document" "ses_sender" { + statement { + actions = ["ses:SendRawEmail"] + resources = ["*"] + } +} \ No newline at end of file diff --git a/modules/sage-aws-ses/main.tf b/modules/sage-aws-ses/main.tf new file mode 100644 index 00000000..f733d8d2 --- /dev/null +++ b/modules/sage-aws-ses/main.tf @@ -0,0 +1,23 @@ +resource "aws_ses_email_identity" "identities" { + for_each = { for identity in var.email_identities : identity => identity } + email = each.value +} + +resource "aws_iam_user" "smtp_user" { + name = "smtp_user" +} + +resource "aws_iam_access_key" "smtp_user" { + user = aws_iam_user.smtp_user.name +} + +resource "aws_iam_policy" "ses_sender" { + name = "ses_sender" + description = "Allows sending of e-mails via Simple Email Service" + policy = data.aws_iam_policy_document.ses_sender.json +} + +resource "aws_iam_user_policy_attachment" "test-attach" { + user = aws_iam_user.smtp_user.name + policy_arn = aws_iam_policy.ses_sender.arn +} diff --git a/modules/sage-aws-ses/ouputs.tf b/modules/sage-aws-ses/ouputs.tf new file mode 100644 index 00000000..6a43bb52 --- /dev/null +++ b/modules/sage-aws-ses/ouputs.tf @@ -0,0 +1,9 @@ + +output "smtp_user" { + value = aws_iam_access_key.smtp_user.id +} + +output "smtp_password" { + sensitive = true + value = aws_iam_access_key.smtp_user.ses_smtp_password_v4 +} \ No newline at end of file diff --git a/modules/sage-aws-ses/variables.tf b/modules/sage-aws-ses/variables.tf new file mode 100644 index 00000000..d923b190 --- /dev/null +++ b/modules/sage-aws-ses/variables.tf @@ -0,0 +1,12 @@ +variable "email_identities" { + type = list(string) + description = "List of email identities to be added to SES" +} + +variable "tags" { + description = "AWS Resource Tags" + type = map(string) + default = { + "CostCenter" = "No Program / 000000" + } +} diff --git a/modules/sage-aws-ses/versions.tf b/modules/sage-aws-ses/versions.tf new file mode 100644 index 00000000..cba4c144 --- /dev/null +++ b/modules/sage-aws-ses/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} diff --git a/modules/signoz/README.md b/modules/signoz/README.md index 742f5ca4..efdf9ffb 100644 --- a/modules/signoz/README.md +++ b/modules/signoz/README.md @@ -13,6 +13,14 @@ A number of items are needed: - Set up ingress to the cluster/collector to send data to: https://sagebionetworks.jira.com/browse/IBCDPE-1095 - Set up accounts and access to the service declaratively + +## Setting up SMTP for alertmanager +Alertmanager is an additional tool deployed to the Kubernetes cluster that handles +forwarding alerts out to one or more streams that receive them. +Alertmanager is set up to send emails through AWS SES (Simple Email Service), provisioned +by the `modules/sage-aws-ses` terraform scripts. See that module for more information +about the setup of AWS SES.
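+
+For context, the `ALERTMANAGER_SMTP_*` environment variables set in this module's
+`values.yaml` carry the same settings a plain Prometheus Alertmanager email receiver
+would use. A rough, hypothetical sketch of the equivalent receiver configuration
+(the receiver name and `to` address are placeholders; the credentials come from the
+`sage-aws-ses` module outputs):
+
+```yaml
+receivers:
+  - name: email-alerts
+    email_configs:
+      - to: dpe@sagebase.org
+        from: aws-dpe-dev@sagebase.org
+        smarthost: email-smtp.us-east-1.amazonaws.com:587
+        auth_username: <smtp_user output>
+        auth_password: <smtp_password output>
+        # SES requires STARTTLS on port 587
+        require_tls: true
+```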
+ ## Accessing signoz (Internet) #### Sending data into signoz (From internet) diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index 54ca947e..896f7c00 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -1,3 +1,6 @@ +locals { + alertmanager_enabled = var.smtp_from != "" && var.smtp_user != "" && var.smtp_password != "" +} resource "kubernetes_namespace" "signoz" { metadata { @@ -7,7 +10,7 @@ resource "kubernetes_namespace" "signoz" { resource "kubectl_manifest" "signoz-deployment" { depends_on = [kubernetes_namespace.signoz] - + yaml_body = < name: "alertmanager" replicaCount: 1 @@ -1036,8 +1036,14 @@ alertmanager: nodePort: null # -- Additional environments to set for Alertmanager - additionalEnvs: {} - # env_key: env_value + additionalEnvs: + ALERTMANAGER_SMTP_FROM: + ALERTMANAGER_SMTP_HOST: email-smtp.us-east-1.amazonaws.com + # 587 is the STARTTLS port for SMTP + # https://docs.aws.amazon.com/ses/latest/dg/smtp-connect.html#smtp-connect-starttls + ALERTMANAGER_SMTP_PORT: "587" + ALERTMANAGER_SMTP_AUTH_USERNAME: + ALERTMANAGER_SMTP_AUTH_PASSWORD: initContainers: init: diff --git a/modules/signoz/variables.tf b/modules/signoz/variables.tf index 249c0fa2..2a917ff1 100644 --- a/modules/signoz/variables.tf +++ b/modules/signoz/variables.tf @@ -26,6 +26,7 @@ variable "namespace" { type = string } + variable "enable_otel_ingress" { description = "Enable OpenTelemetry ingress" type = bool @@ -45,4 +46,22 @@ variable "cluster_name" { variable "auth0_jwks_uri" { description = "The JWKS URI for Auth0" type = string -} \ No newline at end of file +} + +variable "smtp_user" { + description = "The SMTP user. Required if smtp_password and smtp_from are set" + type = string + default = "" +} + +variable "smtp_password" { + description = "The SMTP password. Required if smtp_user and smtp_from are set" + type = string + default = "" +} + +variable "smtp_from" { + description = "The SMTP from address. Required if smtp_user and smtp_password are set" + type = string + default = "" +}