diff --git a/README.md b/README.md index 3d456eff..cbedb844 100644 --- a/README.md +++ b/README.md @@ -172,7 +172,7 @@ allow us to review for any security advisories. ### Deploying an application to the kubernetes cluster Deployment of applications to the kubernetes cluster is handled through the combination -of terraform (.tf) scripts, spacelift (CICD tool), and ArgoCd (Declarative definitions +of terraform (.tf) scripts, spacelift (CICD tool), and ArgoCd or Flux CD (Declarative definitions for applications). To start of the deployment journey the first step is to create a new terraform module diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index cd03d65a..c6ad7e5d 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -45,6 +45,7 @@ locals { pod_to_node_dns_sg_id = "TF_VAR_pod_to_node_dns_sg_id" smtp_user = "TF_VAR_smtp_user" smtp_password = "TF_VAR_smtp_password" + cluster_oidc_provider_arn = "TF_VAR_cluster_oidc_provider_arn" } } @@ -178,31 +179,6 @@ resource "spacelift_stack_dependency_reference" "cluster-name" { # stack_id = spacelift_stack.k8s-stack.id # } -resource "spacelift_stack_destructor" "k8s-stack-deployments-destructor" { - depends_on = [ - spacelift_stack.k8s-stack, - spacelift_aws_integration_attachment.k8s-deployments-aws-integration-attachment, - spacelift_context_attachment.k8s-kubeconfig-hooks, - spacelift_stack_dependency_reference.cluster-name, - spacelift_stack_dependency_reference.region-name, - spacelift_environment_variable.k8s-stack-deployments-environment-variables - ] - - stack_id = spacelift_stack.k8s-stack-deployments.id -} - -resource "spacelift_stack_destructor" "k8s-stack-destructor" { - depends_on = [ - spacelift_aws_integration_attachment.k8s-aws-integration-attachment, - spacelift_context_attachment.k8s-kubeconfig-hooks, - spacelift_stack_dependency_reference.cluster-name, - spacelift_stack_dependency_reference.region-name, - spacelift_environment_variable.k8s-stack-environment-variables - ] - - stack_id = spacelift_stack.k8s-stack.id -} - resource "spacelift_aws_integration_attachment" "k8s-aws-integration-attachment" { integration_id = var.aws_integration_id stack_id = spacelift_stack.k8s-stack.id @@ -244,12 +220,6 @@ resource "spacelift_stack" "auth0" { ] } -resource "spacelift_stack_destructor" "auth0-stack-destructor" { - count = var.deploy_auth0 ? 
1 : 0 - stack_id = spacelift_stack.auth0[0].id -} - - resource "spacelift_environment_variable" "auth0-stack-environment-variables" { depends_on = [ spacelift_stack.auth0 diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 30c32b9f..1b78a28b 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -1,3 +1,6 @@ +locals { + git_revision = var.git_revision +} module "sage-aws-eks-autoscaler" { source = "spacelift.io/sagebionetworks/sage-aws-eks-autoscaler/aws" version = "0.9.0" @@ -26,13 +29,19 @@ module "argo-cd" { source = "../../../modules/argo-cd" } +module "flux-cd" { + depends_on = [module.sage-aws-eks-autoscaler] + source = "../../../modules/flux-cd" +} + module "victoria-metrics" { - depends_on = [module.argo-cd] - source = "spacelift.io/sagebionetworks/victoria-metrics/aws" - version = "0.4.8" + depends_on = [module.argo-cd] + # source = "spacelift.io/sagebionetworks/victoria-metrics/aws" + # version = "0.4.8" + source = "../../../modules/victoria-metrics" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision } module "trivy-operator" { @@ -41,7 +50,7 @@ module "trivy-operator" { version = "0.3.2" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision } module "airflow" { @@ -50,7 +59,7 @@ module "airflow" { version = "0.4.0" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision namespace = "airflow" } @@ -60,7 +69,7 @@ module "postgres-cloud-native-operator" { version = "0.4.0" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision } module "postgres-cloud-native-database" { @@ -69,30 +78,40 @@ module "postgres-cloud-native-database" { version = "0.5.0" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision namespace = "airflow" argo_deployment_name = "airflow-postgres-cloud-native" } +module "clickhouse-backup-bucket" { + source = "../../../modules/s3-bucket" + bucket_name = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + enable_versioning = false + aws_account_id = var.aws_account_id + cluster_name = var.cluster_name + cluster_oidc_provider_arn = var.cluster_oidc_provider_arn +} module "signoz" { depends_on = [module.argo-cd] # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" # version = "0.5.0" - source = "../../../modules/signoz" - auto_deploy = var.auto_deploy - auto_prune = var.auto_prune - git_revision = var.git_revision - namespace = "signoz" - argo_deployment_name = "signoz" - enable_otel_ingress = var.enable_otel_ingress && var.enable_cluster_ingress - gateway_namespace = "envoy-gateway" - cluster_name = var.cluster_name - auth0_jwks_uri = var.auth0_jwks_uri - smtp_password = var.smtp_password - smtp_user = var.smtp_user - smtp_from = var.smtp_from - auth0_identifier = var.auth0_identifier + source = "../../../modules/signoz" + auto_deploy = var.auto_deploy + auto_prune = var.auto_prune + git_revision = local.git_revision + namespace = "signoz" + argo_deployment_name = "signoz" + enable_otel_ingress = var.enable_otel_ingress && var.enable_cluster_ingress + gateway_namespace = "envoy-gateway" + cluster_name = var.cluster_name + auth0_jwks_uri = 
var.auth0_jwks_uri
+  smtp_password         = var.smtp_password
+  smtp_user             = var.smtp_user
+  smtp_from             = var.smtp_from
+  auth0_identifier      = var.auth0_identifier
+  s3_backup_bucket_name = module.clickhouse-backup-bucket.bucket_name
+  s3_access_role_arn    = module.clickhouse-backup-bucket.access_role_arn
 }
 
 module "envoy-gateway" {
@@ -103,7 +122,7 @@ module "envoy-gateway" {
   source       = "../../../modules/envoy-gateway"
   auto_deploy  = var.auto_deploy
   auto_prune   = var.auto_prune
-  git_revision = var.git_revision
+  git_revision = local.git_revision
   namespace    = "envoy-gateway"
   argo_deployment_name = "envoy-gateway"
   cluster_issuer_name  = "lets-encrypt-prod"
@@ -118,7 +137,7 @@ module "cert-manager" {
   source       = "../../../modules/cert-manager"
   auto_deploy  = var.auto_deploy
   auto_prune   = var.auto_prune
-  git_revision = var.git_revision
+  git_revision = local.git_revision
   namespace    = "cert-manager"
   argo_deployment_name = "cert-manager"
 }
diff --git a/deployments/stacks/dpe-k8s-deployments/variables.tf b/deployments/stacks/dpe-k8s-deployments/variables.tf
index 8f62670b..21b40836 100644
--- a/deployments/stacks/dpe-k8s-deployments/variables.tf
+++ b/deployments/stacks/dpe-k8s-deployments/variables.tf
@@ -40,6 +40,11 @@ variable "cluster_name" {
   type        = string
 }
 
+variable "cluster_oidc_provider_arn" {
+  description = "ARN of the OIDC provider for the EKS cluster"
+  type        = string
+}
+
 variable "spotinst_account" {
   description = "Spot.io account"
   type        = string
diff --git a/deployments/stacks/dpe-k8s/outputs.tf b/deployments/stacks/dpe-k8s/outputs.tf
index 6851f6a7..1a920dad 100644
--- a/deployments/stacks/dpe-k8s/outputs.tf
+++ b/deployments/stacks/dpe-k8s/outputs.tf
@@ -38,11 +38,15 @@ output "cluster_name" {
   value = module.sage-aws-eks.cluster_name
 }
 
+output "cluster_oidc_provider_arn" {
+  value = module.sage-aws-eks.cluster_oidc_provider_arn
+}
+
 output "smtp_user" {
-  value = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_user : null
+  value = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_user : ""
 }
 
 output "smtp_password" {
   sensitive = true
-  value     = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_password : null
+  value     = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_password : ""
 }
diff --git a/modules/flux-cd/README.md b/modules/flux-cd/README.md
new file mode 100644
index 00000000..df006cd6
--- /dev/null
+++ b/modules/flux-cd/README.md
@@ -0,0 +1,56 @@
+# Purpose
+This module is used to deploy the `Flux CD` [helm chart](https://fluxcd-community.github.io/helm-charts) to the cluster. [`Flux CD`](https://fluxcd.io/) is a GitOps tool used to manage the application lifecycle on a Kubernetes cluster. It was originally deployed because, unlike `Argo CD`, it supports `postRenderers`, which apply additional changes to an application's rendered manifests and were needed to add the `clickhouse-backup` sidecar container to the `signoz` helm release. We do not plan to move all existing applications to `Flux CD` at this time, but it is available and is the preferred tool for any new applications added to the cluster.
+
+## What resources are being deployed through this module
+In addition to a `helm_release` which deploys the `Flux CD` helm chart, this module also creates a `capacitor` resource which is used as the frontend for `Flux CD`.
+
+## Accessing the Flux CD UI
+To access the `Flux CD` UI, you only need to port-forward the `capacitor` pod and open it in your browser.
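+For example, a minimal sketch, assuming `capacitor` runs in the `flux-system` namespace and serves its UI on port 9000 (the pod name below is a placeholder; verify the namespace, pod, and port in your cluster):
+```
+# Find the capacitor pod, forward a local port to it, then open http://localhost:9000
+kubectl -n flux-system get pods
+kubectl -n flux-system port-forward pod/<capacitor-pod-name> 9000:9000
+```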
+ +# Deploying an application with Flux CD +To deploy an application with `Flux CD`, you will need to create a `HelmRepository` resource which points to the helm chart you want to deploy. In that resource definition, you will set the `apiVersion` to `source.toolkit.fluxcd.io/v1` and the `kind` to `HelmRepository`. For example (code from the `signoz` module): + +``` +resource "kubectl_manifest" "signoz-helm-repo" { + depends_on = [kubernetes_namespace.signoz] + + yaml_body = <=0.1.0" +YAML +} + +resource "kubectl_manifest" "capacitor-kustomization" { + depends_on = [helm_release.fluxcd] + + yaml_body = < +# secretkey: + +# Enables podMonitor creation for the Prometheus Operator +prometheus: + podMonitor: + # -- Enables podMonitor endpoint + create: false + podMetricsEndpoints: + - port: http-prom + relabelings: + # https://github.com/prometheus-operator/prometheus-operator/issues/4816 + - sourceLabels: [__meta_kubernetes_pod_phase] + action: keep + regex: Running diff --git a/modules/flux-cd/versions.tf b/modules/flux-cd/versions.tf new file mode 100644 index 00000000..31cbf926 --- /dev/null +++ b/modules/flux-cd/versions.tf @@ -0,0 +1,20 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.0" + } + helm = { + source = "hashicorp/helm" + version = "~> 2.0" + } + kubectl = { + source = "gavinbunney/kubectl" + version = "1.14.0" + } + } +} diff --git a/modules/s3-bucket/README.md b/modules/s3-bucket/README.md new file mode 100644 index 00000000..44aa0594 --- /dev/null +++ b/modules/s3-bucket/README.md @@ -0,0 +1,37 @@ +# Purpose +This is a simple module that can be used within applications to create an S3 bucket. + +## WARNING +If you are tearing down a stack with a deployed S3 Bucket, you will likely encounter an error similar to the following: +``` +deleting S3 Bucket (my-beautiful-bucket): operation error S3: DeleteBucket, https response error StatusCode: 409, RequestID: 123, HostID: 123456789+g=, api error BucketNotEmpty: The bucket you tried to delete is not empty. You must delete all versions in the bucket. +``` +We have intentionally not handled this behavior as a safeguard against accidental deletion of a bucket that contains important data. If you need to delete the bucket, you will need to manually delete all objects within it. If versioning is enabled for the bucket, you will also need to delete all versions of the objects. + +# Usage +Using this module only requires calling it in your terraform code: +``` +module "my_beautiful_bucket" { + source = "../../../modules/s3-bucket" + bucket_name = "my-beautiful-bucket" + enable_versioning = false + aws_account_id = var.aws_account_id + cluster_name = var.cluster_name + cluster_oidc_provider_arn = var.cluster_oidc_provider_arn +} +``` + +The module handles creating the necessary IAM policy, role, and role policy attachment for accessing the bucket and provides the role ARN as an output. + +After confirming that the policy and role are configured correctly, you can either use the ARN directly in your application code or configure a kubernetes service account bound to the IAM role. 
The latter can be done like so: +``` +resource "kubernetes_service_account" "my_beautiful_bucket_service_account" { + metadata { + name = "my-beautiful-bucket-service-account" + namespace = var.namespace + annotations = { + "eks.amazonaws.com/role-arn" = "${module.my_beautiful_bucket.iam_role_arn}" + } + } +} +``` diff --git a/modules/s3-bucket/main.tf b/modules/s3-bucket/main.tf new file mode 100644 index 00000000..a9d13f22 --- /dev/null +++ b/modules/s3-bucket/main.tf @@ -0,0 +1,64 @@ +resource "aws_s3_bucket" "bucket" { + bucket = var.bucket_name + tags = merge( + var.tags, + { + Name = var.bucket_name + } + ) +} + +resource "aws_s3_bucket_versioning" "versioning" { + bucket = aws_s3_bucket.bucket.id + versioning_configuration { + status = var.enable_versioning ? "Enabled" : "Disabled" + } +} + + +resource "aws_iam_policy" "s3-access-policy" { + name = "access-policy-${var.aws_account_id}-${var.cluster_name}-${var.bucket_name}" + description = "Policy to access the s3 bucket" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "s3:ListBucket", + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + ] + Resource = [ + aws_s3_bucket.bucket.arn, + "${aws_s3_bucket.bucket.arn}/*" + ] + } + ] + }) +} + +resource "aws_iam_role" "s3-access-iam-role" { + name = "s3-${var.cluster_name}-${var.bucket_name}" + description = "Assumed role to access the s3 bucket with the given permissions." + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRoleWithWebIdentity" + Effect = "Allow" + Principal = { + Federated = "${var.cluster_oidc_provider_arn}", + } + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "s3-access-policy-attachment" { + role = aws_iam_role.s3-access-iam-role.name + policy_arn = aws_iam_policy.s3-access-policy.arn +} diff --git a/modules/s3-bucket/outputs.tf b/modules/s3-bucket/outputs.tf new file mode 100644 index 00000000..25983295 --- /dev/null +++ b/modules/s3-bucket/outputs.tf @@ -0,0 +1,14 @@ +output "bucket_name" { + description = "Name of the created S3 bucket" + value = aws_s3_bucket.bucket.id +} + +output "bucket_arn" { + description = "ARN of the created S3 bucket" + value = aws_s3_bucket.bucket.arn +} + +output "access_role_arn" { + description = "ARN of the role to access the S3 bucket" + value = aws_iam_role.s3-access-iam-role.arn +} \ No newline at end of file diff --git a/modules/s3-bucket/variables.tf b/modules/s3-bucket/variables.tf new file mode 100644 index 00000000..25877694 --- /dev/null +++ b/modules/s3-bucket/variables.tf @@ -0,0 +1,33 @@ +variable "bucket_name" { + description = "Name of the S3 bucket to create" + type = string +} + +variable "tags" { + description = "Tags to apply to the S3 bucket" + type = map(string) + default = { + "CostCenter" = "No Program / 000000" + } +} + +variable "enable_versioning" { + description = "Enable versioning on the bucket" + type = bool + default = true +} + +variable "aws_account_id" { + description = "AWS account ID" + type = string +} + +variable "cluster_name" { + description = "EKS cluster name" + type = string +} + +variable "cluster_oidc_provider_arn" { + description = "EKS cluster ARN for the oidc provider" + type = string +} diff --git a/modules/s3-bucket/versions.tf b/modules/s3-bucket/versions.tf new file mode 100644 index 00000000..cba4c144 --- /dev/null +++ b/modules/s3-bucket/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 
5.0"
+    }
+  }
+}
diff --git a/modules/sage-aws-eks/ouputs.tf b/modules/sage-aws-eks/ouputs.tf
index 59692964..8114420b 100644
--- a/modules/sage-aws-eks/ouputs.tf
+++ b/modules/sage-aws-eks/ouputs.tf
@@ -13,3 +13,7 @@ output "node_security_group_id" {
 output "pod_to_node_dns_sg_id" {
   value = aws_security_group.pod-dns-egress.id
 }
+
+output "cluster_oidc_provider_arn" {
+  value = module.eks.oidc_provider_arn
+}
diff --git a/modules/signoz/README.md b/modules/signoz/README.md
index 4cdcfa66..82f5cb67 100644
--- a/modules/signoz/README.md
+++ b/modules/signoz/README.md
@@ -5,13 +5,12 @@ SigNoz is an open-source APM. It helps developers monitor their applications &
 troubleshoot problems, an open-source alternative to DataDog, NewRelic, etc.
 
 Open source Application Performance Monitoring (APM) & Observability tool.
+## Initial setup
 
-## This module is a work in progress (To be completed before production, or determine if not needed)
-A number of items are needed:
-
-- Setting up backups and data retention: https://sagebionetworks.jira.com/browse/IBCDPE-1094
-- Set up accounts and access to the service declaratively
-
+- Accounts in SigNoz need to be manually set up (SSO is only available in the enterprise version)
+- The "Total Retention Period" setting should be set to 120 months and "Move to S3" to 1 month
+- Any dashboards need to be copied or set up
+- Alert channels (Email/Slack) need to be set up
 
 ## Setting up SMTP for alertmanager
 Alertmanager is an additional tool that is deployed to the kubernetes cluster that
@@ -107,3 +106,12 @@ Once you're connected via a port-forward session the next item is to make sure t
 application you're sending data from is instrumented with open-telemetry. This is
 going to be application specific so instructions will need to live within the
 application you are using.
+
+### Clickhouse Backups and Restores
+This module uses the `clickhouse-backup` tool to automatically back up the clickhouse database and store the data in an S3 bucket to ensure continuity of the data regardless of the state of the cluster. `clickhouse-backup` is deployed as a sidecar container to the `signoz` helm release. It will perform incremental backups of the database every 8 hours and full backups every 24 hours.
+
+To restore the database from an S3 backup, you can use the following steps (a command sketch follows the list):
+1. Scale the replica cluster (`chi-signoz-clickhouse-cluster-0-1`) `StatefulSet` to 0 replicas.
+1. Identify the backup that you would like to restore from. You can get the full list of backups by shelling into the `clickhouse-backup-sidecar` container within the `chi-signoz-clickhouse-cluster-0-0-0` pod and running `clickhouse-backup list`.
+1. Restore the database from your backup by running `clickhouse-backup restore_remote --rm --schema <backup_name>`, where `<backup_name>` is the backup identified in the previous step (this assumes the backup is being restored from remote storage).
+1. Scale the replica cluster `StatefulSet` back to 1 replica. Once the `chi-signoz-clickhouse-cluster-0-1-0` pod has fully come back up, you should see the restored data in the `signoz` UI.
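+
+A minimal sketch of those steps as shell commands. The `StatefulSet`, pod, and container names are taken from the list above; the `signoz` namespace and the `<backup_name>` placeholder are assumptions to verify against your cluster before running anything:
+```
+# 1. Stop the replica during the restore
+kubectl -n signoz scale statefulset chi-signoz-clickhouse-cluster-0-1 --replicas=0
+
+# 2. List the available backups from the clickhouse-backup sidecar
+kubectl -n signoz exec chi-signoz-clickhouse-cluster-0-0-0 -c clickhouse-backup-sidecar -- clickhouse-backup list
+
+# 3. Restore from the chosen remote backup
+kubectl -n signoz exec chi-signoz-clickhouse-cluster-0-0-0 -c clickhouse-backup-sidecar -- clickhouse-backup restore_remote --rm --schema <backup_name>
+
+# 4. Bring the replica back up once the restore completes
+kubectl -n signoz scale statefulset chi-signoz-clickhouse-cluster-0-1 --replicas=1
+```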
diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index 4a366f9f..ed1f8f2d 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -8,104 +8,273 @@ resource "kubernetes_namespace" "signoz" { } } -resource "kubectl_manifest" "signoz-deployment" { +resource "kubectl_manifest" "signoz-helm-repo" { depends_on = [kubernetes_namespace.signoz] yaml_body = < + + + + 10485760 + + + s3 + https://${var.s3_backup_bucket_name}.s3.amazonaws.com/coldstorage/ + true + us-east-1 + + + + + + + default + + + s3 + 0 + 1 + + + 0 + + + + +YAML +} + +resource "kubectl_manifest" "signoz-git-repo" { + depends_on = [kubectl_manifest.signoz-helm-release] + + yaml_body = </data/ endpoint: https://.s3-.amazonaws.com/data/ # -- Access Key for S3 or GCS - accessKey: + # accessKey: # -- Secret Access Key for S3 or GCS - secretAccess: + # secretAccess: # AWS role configuration - to use environment variables instead of passing access and secret keys role: # -- Whether to enable AWS IAM ARN role. @@ -1286,7 +1291,7 @@ schemaMigrator: annotations: {} # In Helm, this is needed to apply helm hooks for pre-upgrade, delete policy and hook weight. # For ArgoCD, this is needed to apply the sync wave - ArgoCD equivalent of hook weight. - upgradeHelmHooks: true + upgradeHelmHooks: false # -- Whether to enable replication for schemaMigrator enableReplication: true diff --git a/modules/signoz/variables.tf b/modules/signoz/variables.tf index 344c8f60..2370bdc6 100644 --- a/modules/signoz/variables.tf +++ b/modules/signoz/variables.tf @@ -70,3 +70,13 @@ variable "smtp_from" { type = string default = "" } + +variable "s3_backup_bucket_name" { + description = "The name of the S3 bucket to use for backups" + type = string +} + +variable "s3_access_role_arn" { + description = "The ARN of the role to use for accessing the S3 bucket" + type = string +} diff --git a/modules/victoria-metrics/templates/values.yaml b/modules/victoria-metrics/templates/values.yaml index c4e84892..1cf22307 100644 --- a/modules/victoria-metrics/templates/values.yaml +++ b/modules/victoria-metrics/templates/values.yaml @@ -808,6 +808,10 @@ grafana: gnetId: 20417 revision: 3 datasource: VictoriaMetrics + altinity-clickhouse-operator-dashboard: + gnetId: 12163 + revision: 2 + datasource: VictoriaMetrics defaultDashboardsTimezone: utc