From e000497aa2c2e68cc7c8af7b8a40ee21db432d68 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:08:55 -0700 Subject: [PATCH 001/135] Reset back to working config --- deployments/brad-sandbox/main.tf | 83 ++++++++++++++++++++++++++++ deployments/brad-sandbox/provider.tf | 6 ++ deployments/brad-sandbox/versions.tf | 8 +++ 3 files changed, 97 insertions(+) create mode 100644 deployments/brad-sandbox/main.tf create mode 100644 deployments/brad-sandbox/provider.tf create mode 100644 deployments/brad-sandbox/versions.tf diff --git a/deployments/brad-sandbox/main.tf b/deployments/brad-sandbox/main.tf new file mode 100644 index 00000000..544f2228 --- /dev/null +++ b/deployments/brad-sandbox/main.tf @@ -0,0 +1,83 @@ + +locals { + git_branch = "schematic-138-cold-storage-and-backups" +} + +import { + # The initial administrative stack is created manually in the Spacelift UI, and imported + # See https://docs.spacelift.io/vendors/terraform/terraform-provider.html#proposed-workflow + # "We suggest to first manually create a single administrative stack, and then use it + # to programmatically define other stacks as necessary." + to = spacelift_stack.brad-sandbox + id = "brad-sandbox-administrative-stack" +} + +resource "spacelift_stack" "brad-sandbox" { + github_enterprise { + namespace = "Sage-Bionetworks-Workflows" + id = "sage-bionetworks-workflows-gh" + } + + administrative = true + autodeploy = false + branch = local.git_branch + description = "Manages other spacelift resources" + name = "Brad Sandbox Administrative Stack" + project_root = "deployments/brad-sandbox" + terraform_version = "1.8.5" + terraform_workflow_tool = "OPEN_TOFU" + repository = "eks-stack" + space_id = "root" + additional_project_globs = [ + "modules/*", + "modules/**/*", + ] +} + +module "brad-sandbox-spacelift" { + source = "../spacelift/dpe-k8s" + parent_space_id = "development-01J49XEN88DQ8K9MCPPTTEXSKE" + admin_stack_id = spacelift_stack.brad-sandbox.id + spotinst_account = "act-45de6f47" + + aws_integration_id = "01J3R9GX6DC09QV7NV872DDYR3" + auto_deploy = false + auto_prune = true + git_branch = "schematic-138-cold-storage-and-backups" + + space_name = "brad-sandbox" + + k8s_stack_name = "Brad sandbox Kubernetes Infrastructure" + k8s_stack_project_root = "deployments/stacks/dpe-k8s" + + k8s_stack_deployments_name = "Brad sandbox Kubernetes Deployments" + k8s_stack_deployments_project_root = "deployments/stacks/dpe-k8s-deployments" + + auth0_stack_name = "" + auth0_stack_project_root = "" + auth0_domain = "" + auth0_clients = [] + + aws_account_id = "631692904429" + region = "us-east-1" + + cluster_name = "brad-k8-sandbox" + vpc_name = "brad-sandbox" + + vpc_cidr_block = "10.52.32.0/20" + # A public subnet is required for each AZ in which the worker nodes are deployed + public_subnet_cidrs = ["10.52.32.0/24", "10.52.33.0/24", "10.52.35.0/24"] + private_subnet_cidrs_eks_control_plane = ["10.52.34.0/28", "10.52.34.16/28"] + azs_eks_control_plane = ["us-east-1a", "us-east-1b"] + + private_subnet_cidrs_eks_worker_nodes = ["10.52.44.0/22", "10.52.40.0/22", "10.52.36.0/22"] + azs_eks_worker_nodes = ["us-east-1c", "us-east-1b", "us-east-1a"] + + enable_cluster_ingress = false + enable_otel_ingress = false + ssl_hostname = "" + auth0_jwks_uri = "" + ses_email_identities = [] + # Defines the email address that will be used as the sender of the email alerts + smtp_from = "" +} diff --git a/deployments/brad-sandbox/provider.tf b/deployments/brad-sandbox/provider.tf 
new file mode 100644 index 00000000..83f5c0c4 --- /dev/null +++ b/deployments/brad-sandbox/provider.tf @@ -0,0 +1,6 @@ +provider "spacelift" { + # Running from within spacelift does not require these to be set + # api_key_endpoint = "https://sagebionetworks.app.spacelift.io" + # api_key_id = "" + # api_key_secret = "" +} diff --git a/deployments/brad-sandbox/versions.tf b/deployments/brad-sandbox/versions.tf new file mode 100644 index 00000000..aed5ef97 --- /dev/null +++ b/deployments/brad-sandbox/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + spacelift = { + source = "spacelift-io/spacelift" + version = "1.13.0" + } + } +} From 53732c46236eea4e8b4c56751e7f3c268ea5458f Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 08:57:22 -0700 Subject: [PATCH 002/135] Move stack to development space --- deployments/brad-sandbox/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/brad-sandbox/main.tf b/deployments/brad-sandbox/main.tf index 544f2228..524542e5 100644 --- a/deployments/brad-sandbox/main.tf +++ b/deployments/brad-sandbox/main.tf @@ -27,7 +27,7 @@ resource "spacelift_stack" "brad-sandbox" { terraform_version = "1.8.5" terraform_workflow_tool = "OPEN_TOFU" repository = "eks-stack" - space_id = "root" + space_id = "development-01J49XEN88DQ8K9MCPPTTEXSKE" additional_project_globs = [ "modules/*", "modules/**/*", From 7cfa68a2224867e78a74156967ab95fcdfb39476 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 09:04:32 -0700 Subject: [PATCH 003/135] Try with import after manually creating spacelift space --- deployments/brad-sandbox/main.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/deployments/brad-sandbox/main.tf b/deployments/brad-sandbox/main.tf index 524542e5..20e5603d 100644 --- a/deployments/brad-sandbox/main.tf +++ b/deployments/brad-sandbox/main.tf @@ -34,6 +34,11 @@ resource "spacelift_stack" "brad-sandbox" { ] } +import { + to = module.brad-sandbox-spacelift.spacelift_space.dpe-space + id = "brad-sandbox-01JC3NVVPWXMP68X90QYCMH7A3" +} + module "brad-sandbox-spacelift" { source = "../spacelift/dpe-k8s" parent_space_id = "development-01J49XEN88DQ8K9MCPPTTEXSKE" From 21f865171ab49d6ecafd63112c0647bdd62a9a2d Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 09:09:08 -0700 Subject: [PATCH 004/135] Move admin stack to root --- deployments/brad-sandbox/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/brad-sandbox/main.tf b/deployments/brad-sandbox/main.tf index 20e5603d..6d2f75a6 100644 --- a/deployments/brad-sandbox/main.tf +++ b/deployments/brad-sandbox/main.tf @@ -27,7 +27,7 @@ resource "spacelift_stack" "brad-sandbox" { terraform_version = "1.8.5" terraform_workflow_tool = "OPEN_TOFU" repository = "eks-stack" - space_id = "development-01J49XEN88DQ8K9MCPPTTEXSKE" + space_id = "root" additional_project_globs = [ "modules/*", "modules/**/*", From cd45c112bea576e67d55bdd5fae263d942a47227 Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 7 Nov 2024 12:02:55 -0500 Subject: [PATCH 005/135] change smtp outputs to empty strings --- deployments/stacks/dpe-k8s/outputs.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deployments/stacks/dpe-k8s/outputs.tf b/deployments/stacks/dpe-k8s/outputs.tf index 6851f6a7..7a0e2ceb 100644 --- a/deployments/stacks/dpe-k8s/outputs.tf +++ 
b/deployments/stacks/dpe-k8s/outputs.tf @@ -39,10 +39,10 @@ output "cluster_name" { } output "smtp_user" { - value = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_user : null + value = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_user : "" } output "smtp_password" { sensitive = true - value = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_password : null + value = length(module.sage-aws-ses) > 0 ? module.sage-aws-ses[0].smtp_password : "" } From 1c4c0c397d1b28565843adc74e39c5403da969e6 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:10:38 -0700 Subject: [PATCH 006/135] Deploy fluxcd as an alternative to ArgoCD for kubernetes deployments --- modules/flux2/main.tf | 50 +++++ modules/flux2/templates/values.yaml | 327 ++++++++++++++++++++++++++++ modules/flux2/versions.tf | 16 ++ 3 files changed, 393 insertions(+) create mode 100644 modules/flux2/main.tf create mode 100644 modules/flux2/templates/values.yaml create mode 100644 modules/flux2/versions.tf diff --git a/modules/flux2/main.tf b/modules/flux2/main.tf new file mode 100644 index 00000000..224d3c75 --- /dev/null +++ b/modules/flux2/main.tf @@ -0,0 +1,50 @@ +resource "kubernetes_namespace" "flux-system" { + metadata { + name = "flux-system" + } +} + +resource "helm_release" "fluxcd" { + name = "argo-cd" + repository = "oci://ghcr.io/fluxcd-community/charts/flux2" + chart = "flux2" + namespace = "flux-system" + version = "2.4.0" + depends_on = [kubernetes_namespace.flux-system] + + values = [templatefile("${path.module}/templates/values.yaml", {})] +} + +resource "kubectl_manifest" "capacitor" { + depends_on = [helm_release.fluxcd] + + yaml_body = <=0.1.0" +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: capacitor + namespace: flux-system +spec: + targetNamespace: flux-system + interval: 1h + retryInterval: 2m + timeout: 5m + wait: true + prune: true + path: "./" + sourceRef: + kind: OCIRepository + name: capacitor +YAML +} diff --git a/modules/flux2/templates/values.yaml b/modules/flux2/templates/values.yaml new file mode 100644 index 00000000..c42ea8d2 --- /dev/null +++ b/modules/flux2/templates/values.yaml @@ -0,0 +1,327 @@ +# global + +installCRDs: true +crds: + # -- Add annotations to all CRD resources, e.g. "helm.sh/resource-policy": keep + annotations: {} + +multitenancy: + # -- Implement the patches for Multi-tenancy lockdown. + # See https://fluxcd.io/docs/installation/#multi-tenancy-lockdown + enabled: false + # -- All Kustomizations and HelmReleases which don’t have spec.serviceAccountName + # specified, will use the default account from the tenant’s namespace. + # Tenants have to specify a service account in their Flux resources to be able + # to deploy workloads in their namespaces as the default account has no permissions. + defaultServiceAccount: "default" + # -- Both kustomize-controller and helm-controller service accounts run privileged + # with cluster-admin ClusterRoleBinding. Disable if you want to run them with a + # minimum set of permissions. 
+ privileged: true + +clusterDomain: cluster.local + +cli: + image: ghcr.io/fluxcd/flux-cli + tag: v2.4.0 + nodeSelector: {} + affinity: {} + tolerations: [] + annotations: {} + serviceAccount: + automount: true + +# controllers + +helmController: + create: true + image: ghcr.io/fluxcd/helm-controller + tag: v1.1.0 + resources: + limits: {} + # cpu: 1000m + # memory: 1Gi + requests: + cpu: 100m + memory: 64Mi + priorityClassName: "" + annotations: + prometheus.io/port: "8080" + prometheus.io/scrape: "true" + labels: {} + container: + additionalArgs: [] + extraEnv: [] + serviceAccount: + create: true + automount: true + annotations: {} + imagePullPolicy: "" + nodeSelector: {} + # expects input structure as per specification https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#affinity-v1-core + # for example: + # affinity: + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: foo.bar.com/role + # operator: In + # values: + # - master + + affinity: {} + # expects input structure as per specification https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.11/#toleration-v1-core + # for example: + # tolerations: + # - key: foo.bar.com/role + # operator: Equal + # value: master + # effect: NoSchedule + + tolerations: [] + +imageAutomationController: + create: true + image: ghcr.io/fluxcd/image-automation-controller + tag: v0.39.0 + resources: + limits: {} + # cpu: 1000m + # memory: 1Gi + requests: + cpu: 100m + memory: 64Mi + priorityClassName: "" + annotations: + prometheus.io/port: "8080" + prometheus.io/scrape: "true" + labels: {} + container: + additionalArgs: [] + extraEnv: [] + serviceAccount: + create: true + automount: true + annotations: {} + imagePullPolicy: "" + nodeSelector: {} + affinity: {} + tolerations: [] + +imageReflectionController: + create: true + image: ghcr.io/fluxcd/image-reflector-controller + tag: v0.33.0 + resources: + limits: {} + # cpu: 1000m + # memory: 1Gi + requests: + cpu: 100m + memory: 64Mi + priorityClassName: "" + annotations: + prometheus.io/port: "8080" + prometheus.io/scrape: "true" + labels: {} + container: + additionalArgs: [] + extraEnv: [] + serviceAccount: + create: true + automount: true + annotations: {} + imagePullPolicy: "" + nodeSelector: {} + affinity: {} + tolerations: [] + +kustomizeController: + create: true + image: ghcr.io/fluxcd/kustomize-controller + tag: v1.4.0 + resources: + limits: {} + # cpu: 1000m + # memory: 1Gi + requests: + cpu: 100m + memory: 64Mi + priorityClassName: "" + annotations: + prometheus.io/port: "8080" + prometheus.io/scrape: "true" + labels: {} + container: + additionalArgs: [] + extraEnv: [] + serviceAccount: + create: true + automount: true + annotations: {} + imagePullPolicy: "" + secret: + # -- Create a secret to use it with extraSecretMounts. Defaults to false. + create: false + name: "" + data: {} + # -- Defines envFrom using a configmap and/or secret. + envFrom: + map: + name: "" + secret: + name: "" + # -- Defines additional mounts with secrets. 
+ # Secrets must be manually created in the namespace or with kustomizeController.secret + extraSecretMounts: [] + # - name: secret-files + # mountPath: /etc/secrets + # subPath: "" + # secretName: secret-files + # readOnly: true + + nodeSelector: {} + affinity: {} + tolerations: [] + +notificationController: + create: true + image: ghcr.io/fluxcd/notification-controller + tag: v1.4.0 + resources: + limits: {} + # cpu: 1000m + # memory: 1Gi + requests: + cpu: 100m + memory: 64Mi + priorityClassName: "" + annotations: + prometheus.io/port: "8080" + prometheus.io/scrape: "true" + labels: {} + container: + additionalArgs: [] + extraEnv: [] + serviceAccount: + create: true + automount: true + annotations: {} + imagePullPolicy: "" + service: + labels: {} + annotations: {} + webhookReceiver: + service: + labels: {} + annotations: {} + ingress: + create: false + # ingressClassName: nginx + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + hosts: + - host: flux-webhook.example.com + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: flux-webhook-tls + # hosts: + # - flux-webhook.example.com + + + nodeSelector: {} + affinity: {} + tolerations: [] + +sourceController: + create: true + image: ghcr.io/fluxcd/source-controller + tag: v1.4.1 + resources: + limits: {} + # cpu: 1000m + # memory: 1Gi + requests: + cpu: 100m + memory: 64Mi + priorityClassName: "" + annotations: + prometheus.io/port: "8080" + prometheus.io/scrape: "true" + labels: {} + container: + additionalArgs: [] + serviceAccount: + create: true + automount: true + annotations: {} + imagePullPolicy: "" + service: + labels: {} + annotations: {} + nodeSelector: {} + affinity: {} + tolerations: [] + extraEnv: [] + +policies: + create: true + +rbac: + create: true + # -- Grant the Kubernetes view, edit and admin roles access to Flux custom resources + createAggregation: true + # -- Add annotations to all RBAC resources, e.g. 
"helm.sh/resource-policy": keep + annotations: {} + roleRef: + name: cluster-admin + +logLevel: info +watchAllNamespaces: true + +# -- contents of pod imagePullSecret in form 'name=[secretName]'; applied to all controllers +imagePullSecrets: [] + +# -- Array of extra K8s manifests to deploy +extraObjects: [] +# Example usage from https://fluxcd.io/docs/components/source/buckets/#static-authentication +# - apiVersion: source.toolkit.fluxcd.io/v1beta2 +# kind: Bucket +# metadata: +# name: podinfo +# namespace: default +# spec: +# interval: 1m +# provider: generic +# bucketName: podinfo +# endpoint: minio.minio.svc.cluster.local:9000 +# insecure: true +# secretRef: +# name: minio-credentials +# - apiVersion: v1 +# kind: Secret +# metadata: +# name: minio-credentials +# namespace: default +# type: Opaque +# data: +# accesskey: +# secretkey: + +# Enables podMonitor creation for the Prometheus Operator +prometheus: + podMonitor: + # -- Enables podMonitor endpoint + create: false + podMetricsEndpoints: + - port: http-prom + relabelings: + # https://github.com/prometheus-operator/prometheus-operator/issues/4816 + - sourceLabels: [__meta_kubernetes_pod_phase] + action: keep + regex: Running diff --git a/modules/flux2/versions.tf b/modules/flux2/versions.tf new file mode 100644 index 00000000..00cbb0b3 --- /dev/null +++ b/modules/flux2/versions.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.0" + } + helm = { + source = "hashicorp/helm" + version = "~> 2.0" + } + } +} From 3f0325cf0b2eb5b3e64a98f645e62f0e7bf7a2d0 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:13:36 -0700 Subject: [PATCH 007/135] Deploy flux to stack --- deployments/stacks/dpe-k8s-deployments/main.tf | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 5db44e36..17fb2b8e 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -26,6 +26,13 @@ module "argo-cd" { source = "../../../modules/argo-cd" } +module "flux-cd" { + depends_on = [module.sage-aws-eks-autoscaler] + # source = "spacelift.io/sagebionetworks/argo-cd/aws" + # version = "0.3.1" + source = "../../../modules/flux-cd" +} + module "victoria-metrics" { depends_on = [module.argo-cd] source = "spacelift.io/sagebionetworks/victoria-metrics/aws" From 7ea14bc7485544da015b6311f3c5fe03163d619a Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:18:26 -0700 Subject: [PATCH 008/135] correct directories --- modules/{flux2 => flux-cd}/main.tf | 0 modules/{flux2 => flux-cd}/templates/values.yaml | 0 modules/{flux2 => flux-cd}/versions.tf | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename modules/{flux2 => flux-cd}/main.tf (100%) rename modules/{flux2 => flux-cd}/templates/values.yaml (100%) rename modules/{flux2 => flux-cd}/versions.tf (100%) diff --git a/modules/flux2/main.tf b/modules/flux-cd/main.tf similarity index 100% rename from modules/flux2/main.tf rename to modules/flux-cd/main.tf diff --git a/modules/flux2/templates/values.yaml b/modules/flux-cd/templates/values.yaml similarity index 100% rename from modules/flux2/templates/values.yaml rename to modules/flux-cd/templates/values.yaml diff --git a/modules/flux2/versions.tf 
b/modules/flux-cd/versions.tf similarity index 100% rename from modules/flux2/versions.tf rename to modules/flux-cd/versions.tf From cc17c334a1866735d79178cdc490157c1418059c Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:20:14 -0700 Subject: [PATCH 009/135] correct versions --- modules/flux-cd/versions.tf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/flux-cd/versions.tf b/modules/flux-cd/versions.tf index 00cbb0b3..31cbf926 100644 --- a/modules/flux-cd/versions.tf +++ b/modules/flux-cd/versions.tf @@ -12,5 +12,9 @@ terraform { source = "hashicorp/helm" version = "~> 2.0" } + kubectl = { + source = "gavinbunney/kubectl" + version = "1.14.0" + } } } From 82e10cc39eb78f30a440fb28fb73868fee536649 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:28:40 -0700 Subject: [PATCH 010/135] authenticate to github oci --- deployments/stacks/dpe-k8s-deployments/provider.tf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/deployments/stacks/dpe-k8s-deployments/provider.tf b/deployments/stacks/dpe-k8s-deployments/provider.tf index 32049e25..55be84ec 100644 --- a/deployments/stacks/dpe-k8s-deployments/provider.tf +++ b/deployments/stacks/dpe-k8s-deployments/provider.tf @@ -13,6 +13,14 @@ provider "helm" { kubernetes { config_path = var.kube_config_path } + + registry { + url = "oci://ghcr.io" + # TODO: Is there a service account we can use instead of my personal account? + username = "BryanFauble" + # Requires that a secret be created in spacelift TF_VAR_github_container_repository_token + password = var.github_container_repository_token + } } provider "spotinst" { From 7b2de5961909c913f0d880918592cf1fc1a9e281 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:31:07 -0700 Subject: [PATCH 011/135] Update variables filefor ghcr token --- deployments/stacks/dpe-k8s-deployments/variables.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/deployments/stacks/dpe-k8s-deployments/variables.tf b/deployments/stacks/dpe-k8s-deployments/variables.tf index 2b9be26a..3a60bba9 100644 --- a/deployments/stacks/dpe-k8s-deployments/variables.tf +++ b/deployments/stacks/dpe-k8s-deployments/variables.tf @@ -103,3 +103,8 @@ variable "smtp_from" { type = string default = "" } + +variable "github_container_repository_token" { + description = "The GitHub container repository token. Used to authenticate with the container registry for OCI based helm charts." 
+ type = string +} From 59f9532ad083344dca2e1404010f12e9596b0dea Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:36:07 -0700 Subject: [PATCH 012/135] Correct helm chart version --- modules/flux-cd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flux-cd/main.tf b/modules/flux-cd/main.tf index 224d3c75..7eb83104 100644 --- a/modules/flux-cd/main.tf +++ b/modules/flux-cd/main.tf @@ -9,7 +9,7 @@ resource "helm_release" "fluxcd" { repository = "oci://ghcr.io/fluxcd-community/charts/flux2" chart = "flux2" namespace = "flux-system" - version = "2.4.0" + version = "2.14.0" depends_on = [kubernetes_namespace.flux-system] values = [templatefile("${path.module}/templates/values.yaml", {})] From 37c8e43128f2b46d21a79f5be04b1ab014d3c986 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:51:22 -0700 Subject: [PATCH 013/135] Swap away from OCI for flux --- deployments/stacks/dpe-k8s-deployments/provider.tf | 8 -------- deployments/stacks/dpe-k8s-deployments/variables.tf | 5 ----- modules/flux-cd/main.tf | 4 ++-- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/provider.tf b/deployments/stacks/dpe-k8s-deployments/provider.tf index 55be84ec..32049e25 100644 --- a/deployments/stacks/dpe-k8s-deployments/provider.tf +++ b/deployments/stacks/dpe-k8s-deployments/provider.tf @@ -13,14 +13,6 @@ provider "helm" { kubernetes { config_path = var.kube_config_path } - - registry { - url = "oci://ghcr.io" - # TODO: Is there a service account we can use instead of my personal account? - username = "BryanFauble" - # Requires that a secret be created in spacelift TF_VAR_github_container_repository_token - password = var.github_container_repository_token - } } provider "spotinst" { diff --git a/deployments/stacks/dpe-k8s-deployments/variables.tf b/deployments/stacks/dpe-k8s-deployments/variables.tf index 3a60bba9..2b9be26a 100644 --- a/deployments/stacks/dpe-k8s-deployments/variables.tf +++ b/deployments/stacks/dpe-k8s-deployments/variables.tf @@ -103,8 +103,3 @@ variable "smtp_from" { type = string default = "" } - -variable "github_container_repository_token" { - description = "The GitHub container repository token. Used to authenticate with the container registry for OCI based helm charts." 
- type = string -} diff --git a/modules/flux-cd/main.tf b/modules/flux-cd/main.tf index 7eb83104..a37d3618 100644 --- a/modules/flux-cd/main.tf +++ b/modules/flux-cd/main.tf @@ -5,8 +5,8 @@ resource "kubernetes_namespace" "flux-system" { } resource "helm_release" "fluxcd" { - name = "argo-cd" - repository = "oci://ghcr.io/fluxcd-community/charts/flux2" + name = "flux2" + repository = "https://fluxcd-community.github.io/helm-charts" chart = "flux2" namespace = "flux-system" version = "2.14.0" From 7fb70f1c159e088e47e5dd6a214ee7b229076c3a Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:07:59 -0700 Subject: [PATCH 014/135] Deploy kustomization resource separately --- modules/flux-cd/main.tf | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/modules/flux-cd/main.tf b/modules/flux-cd/main.tf index a37d3618..d718f995 100644 --- a/modules/flux-cd/main.tf +++ b/modules/flux-cd/main.tf @@ -19,17 +19,29 @@ resource "kubectl_manifest" "capacitor" { depends_on = [helm_release.fluxcd] yaml_body = <=0.1.0" ---- + targetNamespace: flux-system + interval: 1h + retryInterval: 2m + timeout: 5m + wait: true + prune: true + path: "./" + sourceRef: + kind: OCIRepository + name: capacitor +YAML +} + +resource "kubectl_manifest" "capacitor-kustomization" { + depends_on = [helm_release.fluxcd] + + yaml_body = < Date: Thu, 7 Nov 2024 15:15:10 -0700 Subject: [PATCH 015/135] Correct my flipped logic --- modules/flux-cd/main.tf | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/modules/flux-cd/main.tf b/modules/flux-cd/main.tf index d718f995..a6b0b5c3 100644 --- a/modules/flux-cd/main.tf +++ b/modules/flux-cd/main.tf @@ -19,22 +19,16 @@ resource "kubectl_manifest" "capacitor" { depends_on = [helm_release.fluxcd] yaml_body = <=0.1.0" YAML } From 84541137628e5c409868c5e6cdf4cdf70e3bd3c8 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:44:25 -0700 Subject: [PATCH 016/135] Attempt deployment of signoz through fluxcd --- .../stacks/dpe-k8s-deployments/main.tf | 19 + modules/signoz-fluxcd/main.tf | 171 ++ modules/signoz-fluxcd/templates/values.yaml | 2437 +++++++++++++++++ modules/signoz-fluxcd/variables.tf | 67 + modules/signoz-fluxcd/versions.tf | 17 + 5 files changed, 2711 insertions(+) create mode 100644 modules/signoz-fluxcd/main.tf create mode 100644 modules/signoz-fluxcd/templates/values.yaml create mode 100644 modules/signoz-fluxcd/variables.tf create mode 100644 modules/signoz-fluxcd/versions.tf diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 17fb2b8e..8cf41dfb 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -101,6 +101,25 @@ module "signoz" { smtp_from = var.smtp_from } +module "signoz-flux-deployment" { + depends_on = [module.argo-cd] + # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" + # version = "0.5.0" + source = "../../../modules/signoz-fluxcd" + auto_deploy = var.auto_deploy + auto_prune = var.auto_prune + git_revision = var.git_revision + namespace = "signoz-fluxcd" + argo_deployment_name = "signoz-fluxcd" + enable_otel_ingress = var.enable_otel_ingress && var.enable_cluster_ingress + gateway_namespace = "envoy-gateway" + cluster_name = var.cluster_name + auth0_jwks_uri = var.auth0_jwks_uri + smtp_password = var.smtp_password + 
smtp_user = var.smtp_user + smtp_from = var.smtp_from +} + module "envoy-gateway" { count = var.enable_cluster_ingress ? 1 : 0 depends_on = [module.argo-cd] diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf new file mode 100644 index 00000000..c4b32fe9 --- /dev/null +++ b/modules/signoz-fluxcd/main.tf @@ -0,0 +1,171 @@ +locals { + alertmanager_enabled = var.smtp_from != "" && var.smtp_user != "" && var.smtp_password != "" +} + +resource "kubernetes_namespace" "signoz" { + metadata { + name = var.namespace + } +} + +resource "kubectl_manifest" "signoz-helm-repo" { + depends_on = [kubernetes_namespace.signoz] + + yaml_body = < + + # -- Clickhouse image + image: + # -- Clickhouse image registry to use. + registry: docker.io + # -- Clickhouse image repository to use. + repository: clickhouse/clickhouse-server + # -- Clickhouse image tag to use (example: `21.8`). + # SigNoz is not always tested with latest version of ClickHouse. + # Only if you know what you are doing, proceed with overriding. + tag: 24.1.2-alpine + # -- Clickhouse image pull policy. + pullPolicy: IfNotPresent + + # -- Image Registry Secret Names for ClickHouse. + # If global.imagePullSecrets is set as well, it will merged. + imagePullSecrets: [] + # - "clickhouse-pull-secret" + + # -- ClickHouse instance annotations. + annotations: {} + + # ClickHouse Service Account + serviceAccount: + # -- Specifies whether a service account should be created + create: true + # -- Annotations to add to the service account + annotations: {} + # -- The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + + # Clickhouse service + service: + # -- Annotations to use by service associated to Clickhouse instance + annotations: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + # -- Clickhouse HTTP port + httpPort: 8123 + # -- Clickhouse TCP port + tcpPort: 9000 + + # -- Whether to use TLS connection connecting to ClickHouse + secure: false + # -- Whether to verify TLS certificate on connection to ClickHouse + verify: false + # -- URL for zookeeper. + externalZookeeper: {} + # servers: + # - host: signoz-signoz-zookeeper + # port: 2181 + + # -- Node selector for settings for clickhouse pod + nodeSelector: { + spotinst.io/node-lifecycle: "od" + } + # -- Toleration labels for clickhouse pod assignment + tolerations: [] + # -- Affinity settings for clickhouse pod + affinity: {} + + # -- Configure resource requests and limits. Update according to your own use + # case as these values might not be suitable for your workload. + # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 200Mi + # limits: + # cpu: 2000m + # memory: 4Gi + + # -- Security context for Clickhouse node + securityContext: + enabled: true + runAsUser: 101 + runAsGroup: 101 + fsGroup: 101 + fsGroupChangePolicy: OnRootMismatch + + # -- An allowlist of IP addresses or network masks the ClickHouse user is + # allowed to access from. By default anything within a private network will be + # allowed. This should suffice for most use case although to expose to other + # networks you will need to update this setting. 
+ # + # Refs: + # - https://clickhouse.com/docs/en/operations/settings/settings-users/#user-namenetworks + # - https://en.wikipedia.org/wiki/Reserved_IP_addresses#IPv4 + allowedNetworkIps: + - "10.0.0.0/8" + - "100.64.0.0/10" + - "172.16.0.0/12" + - "192.0.0.0/24" + - "198.18.0.0/15" + - "192.168.0.0/16" + + persistence: + # -- Enable data persistence using PVC for ClickHouseDB data. + enabled: true + + # -- Use a manually managed Persistent Volume and Claim. + # If defined, PVC must be created manually before volume will be bound. + # (only when deploying a single replica). + # + existingClaim: "" + + # -- Persistent Volume Storage Class to use. + # If defined, `storageClassName: `. + # If set to "-", `storageClassName: ""`, which disables dynamic provisioning + # If undefined (the default) or set to `null`, no storageClassName spec is + # set, choosing the default provisioner. + # + storageClass: null + + # -- Access Modes for persistent volume + accessModes: + - ReadWriteOnce + + # -- Persistent Volume size + size: 20Gi + + # -- Clickhouse user profile configuration. + # You can use this to override profile settings, for example + # `default/max_memory_usage: 40000000000` or `default/max_concurrent_queries: 200` + # + # For the full list of settings, see: + # - https://clickhouse.com/docs/en/operations/settings/settings-profiles/ + # - https://clickhouse.com/docs/en/operations/settings/settings/ + # + profiles: {} + + # -- Default user profile configuration for Clickhouse. !!! Please DO NOT override this !!! + defaultProfiles: + default/allow_experimental_window_functions: "1" + default/allow_nondeterministic_mutations: "1" + + # -- Clickhouse init container to copy histogramQuantile UDF + # @default -- See `values.yaml` for defaults + initContainers: + enabled: true + udf: + enabled: true + image: + registry: docker.io + repository: alpine + tag: 3.18.2 + pullPolicy: IfNotPresent + command: + - sh + - -c + - | + set -x + wget -O /tmp/histogramQuantile https://github.com/SigNoz/signoz/raw/develop/deploy/docker/clickhouse-setup/user_scripts/histogramQuantile + mv /tmp/histogramQuantile /var/lib/clickhouse/user_scripts/histogramQuantile + chmod +x /var/lib/clickhouse/user_scripts/histogramQuantile + init: + enabled: false + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + - /bin/sh + - -c + - | + set -e + until curl -s -o /dev/null http://signoz-clickhouse:8123/ + do sleep 1 + done + + # -- Clickhouse cluster layout. (Experimental, use at own risk) + # For a full list of options, see https://github.com/Altinity/clickhouse-operator/blob/master/docs/custom_resource_explained.md + # section on clusters and layouts. + # + layout: + shardsCount: 1 + replicasCount: 2 + + # -- ClickHouse settings configuration. + # You can use this to override settings, for example `prometheus/port: 9363` + # For the full list of settings, see: + # - https://clickhouse.com/docs/en/operations/settings/settings/ + # + settings: + # Uncomment those lines if you want to enable the built-in Prometheus HTTP endpoint in ClickHouse. + prometheus/endpoint: /metrics + prometheus/port: 9363 + # prometheus/metrics: true + # prometheus/events: true + # prometheus/asynchronous_metrics: true + + # -- Default settings configuration for ClickHouse. !!! Please DO NOT override this !!! 
+ defaultSettings: + format_schema_path: /etc/clickhouse-server/config.d/ + user_scripts_path: /var/lib/clickhouse/user_scripts/ + user_defined_executable_functions_config: '/etc/clickhouse-server/functions/custom-functions.xml' + + # -- ClickHouse pod(s) annotation. + podAnnotations: + signoz.io/scrape: 'true' + signoz.io/port: '9363' + signoz.io/path: /metrics + + # -- Topologies on how to distribute the ClickHouse pod. + # Possible values can be found here: + # - https://github.com/Altinity/clickhouse-operator/blob/1414503921da3ae475eb6f9a296d3475a6993768/docs/chi-examples/99-clickhouseinstallation-max.yaml#L428-L481 + podDistribution: [] + # - type: ShardAntiAffinity + # topologyKey: kubernetes.io/hostname + # - type: ReplicaAntiAffinity + # topologyKey: kubernetes.io/hostname + # - type: MaxNumberPerNode + # number: 2 + # topologyKey: kubernetes.io/hostname + + # TODO: Enable cold storage: https://sagebionetworks.jira.com/browse/IBCDPE-1094 + # Cold storage configuration + coldStorage: + # -- Whether to enable S3 cold storage + enabled: false + # -- Reserve free space on default disk (in bytes) + # Default value is 10MiB + defaultKeepFreeSpaceBytes: "10485760" + # -- Type of cold storage: s3 or gcs + type: s3 + # -- Endpoint for S3 or GCS + # For S3, if region is us-east-1, endpoint can be https://s3.amazonaws.com + # if region is not us-east-1, endpoint should be https://s3-.amazonaws.com + # For GCS, endpoint should be https://storage.googleapis.com//data/ + endpoint: https://.s3-.amazonaws.com/data/ + # -- Access Key for S3 or GCS + accessKey: + # -- Secret Access Key for S3 or GCS + secretAccess: + # AWS role configuration - to use environment variables instead of passing access and secret keys + role: + # -- Whether to enable AWS IAM ARN role. + enabled: false + # -- Annotations to use by service account associated to Clickhouse instance + annotations: + # aws role arn + eks.amazonaws.com/role-arn: arn:aws:iam::******:role/***** + + # -- Clickhouse configuration files. + # + # Refs: + # - https://clickhouse.com/docs/en/operations/configuration-files/ + # - https://github.com/Altinity/clickhouse-operator/blob/master/docs/chi-examples/05-settings-05-files-nested.yaml + files: {} + # config.d/log_rotation.xml: | + # + # + # trace + # true + # /var/log/clickhouse-server/clickhouse-server.err.log + # /var/log/clickhouse-server/clickhouse-server.log + # 100M + # 10 + # + # + # test.xml: | + # + # some-value + # + + ### + ### + ### ---- MISC ---- + ### + ### + + # -- When the `installCustomStorageClass` is enabled with `cloud` set as `gcp` or `aws`, + # it creates custom storage class with volume expansion permission. + installCustomStorageClass: false + + ### + ### + ### ---- CLICKHOUSE OPERATOR ---- + ### + ### + clickhouseOperator: + # -- name of the component + name: operator + + # -- Version of the operator + version: 0.21.2 + + # -- Clickhouse Operator image + image: + # -- Clickhouse Operator image registry to use. + registry: docker.io + # -- Clickhouse Operator image repository to use. + repository: altinity/clickhouse-operator + # -- Clickhouse Operator image tag. + tag: 0.21.2 + # -- Clickhouse Operator image pull policy. + pullPolicy: IfNotPresent + + # -- Image Registry Secret Names for Clickhouse Operator. + # If global.imagePullSecrets is set as well, it will merged. 
+ imagePullSecrets: [] + # - "clickhouseOperator-pull-secret" + + # ClickHouse Operator Service Account + serviceAccount: + # -- Specifies whether a service account should be created + create: true + # -- Annotations to add to the service account + annotations: {} + # -- The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + + # Clickhouse logging config + logger: + # -- Logging level. Acceptable values: trace, debug, information, warning, error. + level: information + # -- Size of the file. Applies to log and errorlog. Once the file reaches size, + # ClickHouse archives and renames it, and creates a new log file in its place. + size: 1000M + # -- The number of archived log files that ClickHouse stores. + count: 10 + # -- Whether to send log and errorlog to the console instead of file. To enable, set to 1 or true. + console: 1 + + # Query Log table configuration + queryLog: + # -- The number of days to keep the data in the query_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the query_log table. + flushInterval: 7500 + # Part Log table configuration + partLog: + # -- The number of days to keep the data in the part_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the part_log table. + flushInterval: 7500 + # Trace Log table configuration + traceLog: + # -- The number of days to keep the data in the trace_log table. + ttl: 7 + # -- Time interval in milliseconds between flushes of the trace_log table. + flushInterval: 7500 + + asynchronousInsertLog: + # -- The number of days to keep the data in the asynchronous_insert_log table. + ttl: 7 + # -- Time interval in milliseconds between flushes of the asynchronous_insert_log table. + flushInterval: 7500 + asynchronousMetricLog: + # -- The number of days to keep the data in the asynchronous_metric_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the asynchronous_metric_log table. + flushInterval: 7500 + backupLog: + # -- The number of days to keep the data in the backup_log table. + ttl: 7 + # -- Time interval in milliseconds between flushes of the backup_log table. + flushInterval: 7500 + blobStorageLog: + # -- The number of days to keep the data in the blob_storage_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the blob_storage_log table. + flushInterval: 7500 + crashLog: + # -- The number of days to keep the data in the crash_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the crash_log table. + flushInterval: 7500 + metricLog: + # -- The number of days to keep the data in the metric_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the metric_log table. + flushInterval: 7500 + queryThreadLog: + # -- The number of days to keep the data in the query_thread_log table. + ttl: 7 + # -- Time interval in milliseconds between flushes of the query_thread_log table. + flushInterval: 7500 + queryViewsLog: + # -- The number of days to keep the data in the query_views_log table. + ttl: 15 + # -- Time interval in milliseconds between flushes of the query_views_log table. + flushInterval: 7500 + sessionLog: + # -- The number of days to keep the data in the session_log table. + ttl: 30 + # -- Time interval in milliseconds between flushes of the session_log table. + flushInterval: 7500 + zookeeperLog: + # -- The number of days to keep the data in the zookeeper_log table. 
+ ttl: 30 + # -- Time interval in milliseconds between flushes of the zookeeper_log table. + flushInterval: 7500 + processorsProfileLog: + # -- The number of days to keep the data in the processors_profile_log table. + ttl: 7 + # -- Time interval in milliseconds between flushes of the processors_profile_log table. + flushInterval: 7500 + + # -- Clickhouse Operator pod(s) annotation. + podAnnotations: + signoz.io/port: '8888' + signoz.io/scrape: 'true' + + # -- Clickhouse Operator node selector + nodeSelector: {} + + # -- Metrics Exporter config. + metricsExporter: + # -- name of the component + name: metrics-exporter + + # -- Metrics Exporter service + service: + # -- Annotations to use by service associated to Metrics Exporter + annotations: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + # -- Metrics Exporter port + port: 8888 + + # -- Metrics Exporter image + image: + # -- Metrics Exporter image registry to use. + registry: docker.io + # -- Metrics Exporter image repository to use. + repository: altinity/metrics-exporter + # -- Metrics Exporter image tag. + tag: 0.21.2 + # -- Metrics Exporter image pull policy. + pullPolicy: IfNotPresent + + +## External clickhouse configuration +## This is required when clickhouse.enabled is false +externalClickhouse: + # -- Host of the external cluster. + host: + # -- Name of the external cluster to run DDL queries on. + cluster: cluster + # -- Database name for the external cluster + database: signoz_metrics + # -- Clickhouse trace database (SigNoz Traces) + traceDatabase: signoz_traces + # -- Clickhouse log database (SigNoz Logs) + logDatabase: signoz_logs + # -- User name for the external cluster to connect to the external cluster as + user: "" + # -- Password for the cluster. Ignored if externalClickhouse.existingSecret is set + password: "" + # -- Name of an existing Kubernetes secret object containing the password + existingSecret: + # -- Name of the key pointing to the password in your Kubernetes secret + existingSecretPasswordKey: + # -- Whether to use TLS connection connecting to ClickHouse + secure: false + # -- Whether to verify TLS connection connecting to ClickHouse + verify: false + # -- HTTP port of Clickhouse + httpPort: 8123 + # -- TCP port of Clickhouse + tcpPort: 9000 + +# Default values for query-service +queryService: + name: "query-service" + replicaCount: 1 + image: + registry: docker.io + repository: signoz/query-service + tag: 0.57.0 + pullPolicy: IfNotPresent + + # -- Image Registry Secret Names for Query-Service + # If set, this has higher precedence than the root level or global value of imagePullSecrets. + imagePullSecrets: [] + + # Query-Service Service Account + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: + + # Query-Service service + service: + # -- Annotations to use by service associated to Query-Service + annotations: {} + # -- Labels to use by service associated to Query-Service + labels: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + # -- Query-Service HTTP port + port: 8080 + # -- Query-Service Internal port + internalPort: 8085 + # -- Query-Service OpAMP Internal port + opampPort: 4320 + # -- Set this to you want to force a specific nodePort for http. + # Must be use with service.type=NodePort + nodePort: null + # -- Set this to you want to force a specific nodePort for internal. + # Must be use with service.type=NodePort + internalNodePort: null + + # -- Query-Service annotations + annotations: {} + + # -- Query-Service additional arguments for command line + additionalArgs: + - --use-logs-new-schema=true + + # -- Additional environments to set for queryService + additionalEnvs: {} + # env_key: env_value + + initContainers: + init: + enabled: true + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /ping + waitMessage: "waiting for clickhouseDB" + doneMessage: "clickhouse ready, starting query service now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + migration: + enabled: false + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + args: [] + command: [] + # - sh + # - -c + # - | + # echo "Running migration" + # sleep 10 # Replace with actual migration command + # echo "Migration completed" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + + configVars: + storage: clickhouse + # ClickHouse URL is set and applied internally. + # Don't override unless you know what you are doing. + # clickHouseUrl: tcp://clickhouse_operator:clickhouse_operator_password@my-release-clickhouse:9000/signoz_traces + goDebug: netdns=go + telemetryEnabled: true + deploymentType: kubernetes-helm + + # Query-Service cache options + cache: + # -- Whether to enable cache for Query-Service + enabled: true + # -- Cache flux interval for Query-Service + fluxInterval: 30m + # -- Cache configurations for Query-Service + config: + name: cache + provider: inmemory + inmemory: + ttl: 168h + + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + + # -- Configure liveness and readiness probes. 
+ # ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes + livenessProbe: + enabled: true + port: http + path: /api/v1/health + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + readinessProbe: + enabled: true + port: http + path: /api/v1/health?live=1 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + + # -- Custom liveness probe + customLivenessProbe: {} + # -- Custom readiness probe + customReadinessProbe: {} + + ingress: + # -- Enable ingress for Query-Service + enabled: false + # -- Ingress Class Name to be used to identify ingress controllers + className: "" + # -- Annotations to Query-Service Ingress + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # cert-manager.io/cluster-issuer: letsencrypt-prod + # -- Query-Service Ingress Host names with their path details + hosts: + - host: query-service.domain.com + paths: + - path: / + pathType: ImplementationSpecific + port: 8080 + # -- Query-Service Ingress TLS + tls: [] + # - secretName: chart-example-tls + # hosts: + # - query-service.domain.com + + # -- Configure resource requests and limits. Update according to your own use + # case as these values might not be suitable for your workload. + # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 100Mi + # limits: + # cpu: 750m + # memory: 1000Mi + + # -- QueryService priority class name + priorityClassName: "" + # -- Node selector for settings for QueryService pod + nodeSelector: {} + # -- Toleration labels for QueryService pod assignment + tolerations: [] + # -- Affinity settings for QueryService pod + affinity: {} + # -- TopologySpreadConstraints describes how QueryService pods ought to spread + topologySpreadConstraints: [] + + persistence: + # -- Enable data persistence using PVC for SQLiteDB data. + enabled: true + + # -- Name of an existing PVC to use (only when deploying a single replica) + existingClaim: "" + + # -- Persistent Volume Storage Class to use. + # If defined, `storageClassName: `. + # If set to "-", `storageClassName: ""`, which disables dynamic provisioning + # If undefined (the default) or set to `null`, no storageClassName spec is + # set, choosing the default provisioner. + # + storageClass: null + + # -- Access Modes for persistent volume + accessModes: + - ReadWriteOnce + + # -- Persistent Volume size + size: 1Gi + + +# Default values for frontend +frontend: + name: "frontend" + replicaCount: 1 + + image: + registry: docker.io + repository: signoz/frontend + tag: 0.57.0 + pullPolicy: IfNotPresent + + # -- Image Registry Secret Names for Frontend + # If set, this has higher precedence than the root level or global value of imagePullSecrets. + imagePullSecrets: [] + + # Frontend Service Account + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: + + # Frontend service + service: + # -- Annotations to use by service associated to Frontend + annotations: {} + # -- Labels to use by service associated to Frontend + labels: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + # -- Frontend HTTP port + port: 3301 + + initContainers: + init: + enabled: true + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /api/v1/health?live=1 + waitMessage: "waiting for query-service" + doneMessage: "query-service ready, starting frontend now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + + autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 11 + targetCPUUtilizationPercentage: 50 + targetMemoryUtilizationPercentage: 50 + behavior: {} + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 180 + # scaleUp: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 2 + # periodSeconds: 60 + + autoscalingTemplate: [] + keda: + enabled: false + pollingInterval: "30" # check 30sec periodically for metrics data + cooldownPeriod: "300" # once the load decreased, it will wait for 5 min and downscale + minReplicaCount: "1" # should be >= replicaCount specified in values.yaml + maxReplicaCount: "5" + triggers: + - type: memory + metadata: + type: Utilization + value: "80" # hpa make sure average Utilization <=80 by adding new pods + - type: cpu + metadata: + type: Utilization + value: "80" # hpa make sure average Utlization <=80 by adding new pods + + configVars: {} + + # -- Frontend deployment annotations + annotations: {} + + # -- Frontend pod security context + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + + ingress: + # -- Enable ingress for Frontend + enabled: false + # -- Ingress Class Name to be used to identify ingress controllers + className: "" + # -- Annotations to Frontend Ingress + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # cert-manager.io/cluster-issuer: letsencrypt-prod + # -- Frontend Ingress Host names with their path details + hosts: + - host: frontend.domain.com + paths: + - path: / + pathType: ImplementationSpecific + port: 3301 + # -- Frontend Ingress TLS + tls: [] + # - secretName: chart-example-tls + # hosts: + # - frontend.domain.com + + # -- Frontend Nginx extra configurations + nginxExtraConfig: | + client_max_body_size 24M; + large_client_header_buffers 8 16k; + + # -- Configure resource requests and limits. Update according to your own use + # case as these values might not be suitable for your workload. 
+ # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 100Mi + # limits: + # cpu: 200m + # memory: 200Mi + + # -- Frontend priority class name + priorityClassName: "" + # -- Node selector for settings for Frontend pod + nodeSelector: {} + # -- Toleration labels for Frontend pod assignment + tolerations: [] + # -- Affinity settings for Frontend pod + affinity: {} + # -- TopologySpreadConstraints describes how Frontend pods ought to spread + topologySpreadConstraints: [] + +# Default values for Alertmanager +alertmanager: + enabled: + name: "alertmanager" + replicaCount: 1 + + image: + registry: docker.io + repository: signoz/alertmanager + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: 0.23.7 + + # -- Image Registry Secret Names for Alertmanager + # If set, this has higher precedence than the root level or global value of imagePullSecrets. + imagePullSecrets: [] + + # -- Alertmanager custom command override + command: [] + # -- Alertmanager extra Arguments + extraArgs: {} + + # Alertmanager Service Account + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + + # Alertmanager service + service: + # -- Annotations to use by service associated to Alertmanager + annotations: {} + # -- Labels to use by service associated to Alertmanager + labels: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + # -- Alertmanager HTTP port + port: 9093 + # -- Alertmanager cluster port + clusterPort: 9094 + # -- Set this to you want to force a specific nodePort. 
Must be use with service.type=NodePort + nodePort: null + + # -- Additional environments to set for Alertmanager + additionalEnvs: + ALERTMANAGER_SMTP_FROM: + ALERTMANAGER_SMTP_HOST: email-smtp.us-east-1.amazonaws.com + # 587 is the STARTTLS port for SMTP + # https://docs.aws.amazon.com/ses/latest/dg/smtp-connect.html#smtp-connect-starttls + ALERTMANAGER_SMTP_PORT: "587" + ALERTMANAGER_SMTP_AUTH_USERNAME: + ALERTMANAGER_SMTP_AUTH_PASSWORD: + + initContainers: + init: + enabled: true + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /api/v1/health?live=1 + waitMessage: "waiting for query-service" + doneMessage: "query-service ready, starting alertmanager now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + + podSecurityContext: + fsGroup: 65534 + dnsConfig: {} + # nameservers: + # - 1.2.3.4 + # searches: + # - ns1.svc.cluster-domain.example + # - my.dns.search.suffix + # options: + # - name: ndots + # value: "2" + # - name: edns0 + securityContext: + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + runAsUser: 65534 + runAsNonRoot: true + runAsGroup: 65534 + + additionalPeers: [] + + livenessProbe: + httpGet: + path: / + port: http + + readinessProbe: + httpGet: + path: / + port: http + + ingress: + # -- Enable ingress for Alertmanager + enabled: false + # -- Ingress Class Name to be used to identify ingress controllers + className: "" + # -- Annotations to Alertmanager Ingress + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # cert-manager.io/cluster-issuer: letsencrypt-prod + # -- Alertmanager Ingress Host names with their path details + hosts: + - host: alertmanager.domain.com + paths: + - path: / + pathType: ImplementationSpecific + port: 9093 + # -- Alertmanager Ingress TLS + tls: [] + # - secretName: chart-example-tls + # hosts: + # - alertmanager.domain.com + + # -- Configure resource requests and limits. Update according to your own use + # case as these values might not be suitable for your workload. + # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 100Mi + # limits: + # cpu: 200m + # memory: 200Mi + + # -- Alertmanager priority class name + priorityClassName: "" + # -- Node selector for settings for Alertmanager pod + nodeSelector: { + spotinst.io/node-lifecycle: "od" + } + # -- Toleration labels for Alertmanager pod assignment + tolerations: [] + # -- Affinity settings for Alertmanager pod + affinity: {} + # -- TopologySpreadConstraints describes how Alertmanager pods ought to spread + topologySpreadConstraints: [] + + statefulSet: + annotations: {} + + podAnnotations: {} + podLabels: {} + + # Ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/ + podDisruptionBudget: {} + # maxUnavailable: 1 + # minAvailable: 1 + + persistence: + # -- Enable data persistence using PVC for Alertmanager data. + enabled: true + + # -- Name of an existing PVC to use (only when deploying a single replica) + existingClaim: "" + + # -- Persistent Volume Storage Class to use. + # If defined, `storageClassName: `. + # If set to "-", `storageClassName: ""`, which disables dynamic provisioning + # If undefined (the default) or set to `null`, no storageClassName spec is + # set, choosing the default provisioner. 
+ # + storageClass: null + + # -- Access Modes for persistent volume + accessModes: + - ReadWriteOnce + + # -- Persistent Volume size + size: 100Mi + + ## Using the config, alertmanager.yml file is created. + ## We no longer need the config file as query services + ## delivers the required config. + # config: + # global: + # resolve_timeout: 1m + # slack_api_url: 'https://hooks.slack.com/services/xxx' + + # templates: + # - '/etc/alertmanager/*.tmpl' + + # receivers: + # - name: 'slack-notifications' + # slack_configs: + # - channel: '#alerts' + # send_resolved: true + # icon_url: https://avatars3.githubusercontent.com/u/3380462 + # title: '{{ template "slack.title" . }}' + # text: '{{ template "slack.text" . }}' + + # route: + # receiver: 'slack-notifications' + + ## Templates are no longer needed as they are included + ## from frontend placeholder while creating alert channels. + # templates: + # title.tmpl: |- + # {{ define "slack.title" }} + # [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} + # {{- if gt (len .CommonLabels) (len .GroupLabels) -}} + # {{" "}}( + # {{- with .CommonLabels.Remove .GroupLabels.Names }} + # {{- range $index, $label := .SortedPairs -}} + # {{ if $index }}, {{ end }} + # {{- $label.Name }}="{{ $label.Value -}}" + # {{- end }} + # {{- end -}} + # ) + # {{- end }} + # {{ end }} + # text.tmpl: |- + # {{ define "slack.text" }} + # {{ range .Alerts -}} + # *Alert:* {{ .Labels.alertname }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} + + # *Summary:* {{ .Annotations.summary }} + # *Description:* {{ .Annotations.description }} + + # *Details:* + # {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + # {{ end }} + # {{ end }} + # {{ end }} + + ## Monitors ConfigMap changes and POSTs to a URL + ## Ref: https://github.com/jimmidyson/configmap-reload + ## + configmapReload: + ## If false, the configmap-reload container will not be deployed + ## + enabled: false + + ## configmap-reload container name + ## + name: configmap-reload + + ## configmap-reload container image + ## + image: + repository: jimmidyson/configmap-reload + tag: v0.5.0 + pullPolicy: IfNotPresent + + # containerPort: 9533 + + # -- Configure resource requests and limits. Update as per your need. + # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 100Mi + # limits: + # cpu: 200m + # memory: 200Mi + +# Default values for schemaMigrator +schemaMigrator: + enabled: true + name: "schema-migrator" + + image: + registry: docker.io + repository: signoz/signoz-schema-migrator + tag: 0.111.5 + pullPolicy: IfNotPresent + + args: + - "--up=" + # For usual Helm installs, we don't need any additional annotations. + # As well as for Helm upgrade (with upgradeHelmHooks to true), we automatically include the required pre-upgrade helm hooks. + # For ArgoCD, since every upgrade is an install, we need to automatically include the relevant ArgoCD hooks using upgradeHelmHooks. + annotations: {} + # In Helm, this is needed to apply helm hooks for pre-upgrade, delete policy and hook weight. + # For ArgoCD, this is needed to apply the sync wave - ArgoCD equivalent of hook weight. 
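+  # For reference, the hook annotations this controls look roughly like the
+  # following (illustrative only; the chart injects them itself when
+  # upgradeHelmHooks is true, so they normally do not need to be set here):
+  # annotations:
+  #   helm.sh/hook: pre-install,pre-upgrade
+  #   helm.sh/hook-weight: "-1"
+  #   helm.sh/hook-delete-policy: before-hook-creation
+  #   argocd.argoproj.io/sync-wave: "-1"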
+ upgradeHelmHooks: true + + # -- Whether to enable replication for schemaMigrator + enableReplication: true + + # -- Node selector for settings for schemaMigrator + nodeSelector: {} + # -- Toleration labels for schemaMigrator assignment + tolerations: [] + # -- Affinity settings for schemaMigrator + affinity: {} + # -- TopologySpreadConstraints describes how schemaMigrator pods ought to spread + topologySpreadConstraints: [] + + initContainers: + init: + enabled: true + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /ping + waitMessage: "waiting for clickhouseDB" + doneMessage: "clickhouse ready, starting schema migrator now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + chReady: + enabled: true + image: + registry: docker.io + repository: clickhouse/clickhouse-server + tag: 24.1.2-alpine + pullPolicy: IfNotPresent + command: + - "sh" + - "-c" + - | + echo "Running clickhouse ready check" + while true + do + version="$(CLICKHOUSE_VERSION)" + shards="$(CLICKHOUSE_SHARDS)" + replicas="$(CLICKHOUSE_REPLICAS)" + current_version="$(clickhouse client --host ${CLICKHOUSE_HOST} --port ${CLICKHOUSE_PORT} --user "${CLICKHOUSE_USER}" --password "${CLICKHOUSE_PASSWORD}" -q "SELECT version()")" + if [ -z "$current_version" ]; then + echo "waiting for clickhouse to be ready" + sleep 5 + continue + fi + if [ -z "$(echo "$current_version" | grep "$version")" ]; then + echo "expected version: $version, current version: $current_version" + echo "waiting for clickhouse with correct version" + sleep 5 + continue + fi + current_shards="$(clickhouse client --host ${CLICKHOUSE_HOST} --port ${CLICKHOUSE_PORT} --user "${CLICKHOUSE_USER}" --password "${CLICKHOUSE_PASSWORD}" -q "SELECT count(DISTINCT(shard_num)) FROM system.clusters WHERE cluster = '${CLICKHOUSE_CLUSTER}'")" + if [ -z "$current_shards" ]; then + echo "waiting for clickhouse to be ready" + sleep 5 + continue + fi + if [ "$current_shards" -ne "$shards" ]; then + echo "expected shard count: $shards, current shard count: $current_shards" + echo "waiting for clickhouse with correct shard count" + sleep 5 + continue + fi + current_replicas="$(clickhouse client --host ${CLICKHOUSE_HOST} --port ${CLICKHOUSE_PORT} --user "${CLICKHOUSE_USER}" --password "${CLICKHOUSE_PASSWORD}" -q "SELECT count(DISTINCT(replica_num)) FROM system.clusters WHERE cluster = '${CLICKHOUSE_CLUSTER}'")" + if [ -z "$current_replicas" ]; then + echo "waiting for clickhouse to be ready" + sleep 5 + continue + fi + if [ "$current_replicas" -ne "$replicas" ]; then + echo "expected replica count: $replicas, current replica count: $current_replicas" + echo "waiting for clickhouse with correct replica count" + sleep 5 + continue + fi + break + done + echo "clickhouse ready, starting schema migrator now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + wait: + enabled: true + image: + registry: docker.io + repository: groundnuty/k8s-wait-for + tag: v2.0 + pullPolicy: IfNotPresent + env: [] + + # SchemaMigrator Service Account + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: + + # SchemaMigrator RBAC config + role: + # -- Specifies whether a clusterRole should be created + create: true + # -- Annotations to add to the clusterRole + annotations: {} + # -- The name of the clusterRole to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + # -- A set of rules as documented here. + # ref: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ + # @default -- See `values.yaml` for defaults + rules: + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch"] + + # SchemaMigrator clusterRoleBinding + roleBinding: + # Annotations to add to the clusterRoleBinding + annotations: {} + # The name of the clusterRoleBinding to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +# Default values for OtelCollector +otelCollector: + name: "otel-collector" + image: + registry: docker.io + repository: signoz/signoz-otel-collector + tag: 0.111.5 + pullPolicy: IfNotPresent + + # -- Image Registry Secret Names for OtelCollector + # If set, this has higher precedence than the root level or global value of imagePullSecrets. + imagePullSecrets: [] + + initContainers: + init: + enabled: false + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /ping + waitMessage: "waiting for clickhouseDB" + doneMessage: "clickhouse ready, starting otel collector now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + + # OpenTelemetry Collector executable + command: + # -- OtelCollector command name + name: /signoz-collector + # -- OtelCollector command extra arguments + extraArgs: + - --feature-gates=-pkg.translator.prometheus.NormalizeName + + configMap: + # -- Specifies whether a configMap should be created (true by default) + create: true + + # OtelCollector Service Account + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + + # OtelCollector service + service: + # -- Annotations to use by service associated to OtelCollector + annotations: {} + # -- Labels to use by service associated to OtelCollector + labels: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + + # -- OtelCollector Deployment annotation. + annotations: {} + # -- OtelCollector pod(s) annotation. + podAnnotations: + signoz.io/scrape: 'true' + signoz.io/port: '8888' + + # -- OtelCollector pod(s) labels. + podLabels: {} + + # -- Additional environments to set for OtelCollector + additionalEnvs: {} + # env_key: env_value + + # -- Whether to enable grouping of exceptions with same name and different stack trace. + # This is useful when you have a lot of exceptions with same name but different stack trace. + # This is a tradeoff between cardinality and accuracy of exception grouping. 
+ lowCardinalityExceptionGrouping: false + + minReadySeconds: 5 + progressDeadlineSeconds: 600 + replicaCount: 2 + + # OtelCollector RBAC config + clusterRole: + # -- Specifies whether a clusterRole should be created + create: true + # -- Annotations to add to the clusterRole + annotations: {} + # -- The name of the clusterRole to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + # -- A set of rules as documented here. + # ref: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ + # @default -- See `values.yaml` for defaults + rules: + # k8sattributes processor requires these permissions + - apiGroups: [""] + resources: ["pods", "namespaces", "nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["replicasets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions"] + resources: ["replicasets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch"] + + # OtelCollector clusterRoleBinding + clusterRoleBinding: + # Annotations to add to the clusterRoleBinding + annotations: {} + # The name of the clusterRoleBinding to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + + # Configuration for ports + ports: + otlp: + # -- Whether to enable service port for OTLP gRPC + enabled: true + # -- Container port for OTLP gRPC + containerPort: 4317 + # -- Service port for OTLP gRPC + servicePort: 4317 + # -- Node port for OTLP gRPC + nodePort: "" + # -- Protocol to use for OTLP gRPC + protocol: TCP + otlp-http: + # -- Whether to enable service port for OTLP HTTP + enabled: true + # -- Container port for OTLP HTTP + containerPort: 4318 + # -- Service port for OTLP HTTP + servicePort: 4318 + # -- Node port for OTLP HTTP + nodePort: "" + # -- Protocol to use for OTLP HTTP + protocol: TCP + jaeger-compact: + # -- Whether to enable service port for Jaeger Compact + enabled: false + # -- Container port for Jaeger Compact + containerPort: 6831 + # -- Service port for Jaeger Compact + servicePort: 6831 + # -- Node port for Jaeger Compact + nodePort: "" + # -- Protocol to use for Jaeger Compact + protocol: UDP + jaeger-thrift: + # -- Whether to enable service port for Jaeger Thrift HTTP + enabled: false + # -- Container port for Jaeger Thrift + containerPort: 14268 + # -- Service port for Jaeger Thrift + servicePort: 14268 + # -- Node port for Jaeger Thrift + nodePort: "" + # -- Protocol to use for Jaeger Thrift + protocol: TCP + jaeger-grpc: + # -- Whether to enable service port for Jaeger gRPC + enabled: false + # -- Container port for Jaeger gRPC + containerPort: 14250 + # -- Service port for Jaeger gRPC + servicePort: 14250 + # -- Node port for Jaeger gRPC + nodePort: "" + # -- Protocol to use for Jaeger gRPC + protocol: TCP + zipkin: + # -- Whether to enable service port for Zipkin + enabled: false + # -- Container port for Zipkin + containerPort: 9411 + # -- Service port for Zipkin + servicePort: 9411 + # -- Node port for Zipkin + nodePort: "" + # -- Protocol to use for Zipkin + protocol: TCP + prometheus: + # -- Whether to enable service port for SigNoz exported prometheus metrics + enabled: false + # -- Container port for SigNoz exported prometheus metrics + containerPort: 8889 + # -- Service port for SigNoz exported prometheus metrics + servicePort: 8889 + # -- Node port for SigNoz exported prometheus metrics + nodePort: "" + # -- Protocol to use for SigNoz exported prometheus metrics + protocol: TCP + 
metrics: + # -- Whether to enable service port for internal metrics + enabled: true + # -- Container port for internal metrics + containerPort: 8888 + # -- Service port for internal metrics + servicePort: 8888 + # -- Node port for internal metrics + nodePort: "" + # -- Protocol to use for internal metrics + protocol: TCP + zpages: + # -- Whether to enable service port for ZPages + enabled: false + # -- Container port for Zpages + containerPort: 55679 + # -- Service port for Zpages + servicePort: 55679 + # -- Node port for Zpages + nodePort: "" + # -- Protocol to use for Zpages + protocol: TCP + pprof: + # -- Whether to enable service port for pprof + enabled: false + # -- Container port for pprof + containerPort: 1777 + # -- Service port for pprof + servicePort: 1777 + # -- Node port for pprof + nodePort: "" + # -- Protocol to use for pprof + protocol: TCP + logsheroku: + # -- Whether to enable service port for logsheroku + enabled: false + # -- Container port for logsheroku + containerPort: 8081 + # -- Service port for logsheroku + servicePort: 8081 + # -- Node port for logsheroku + nodePort: "" + # -- Protocol to use for logsheroku + protocol: TCP + logsjson: + # -- Whether to enable service port for logsjson + enabled: false + # -- Container port for logsjson + containerPort: 8082 + # -- Service port for logsjson + servicePort: 8082 + # -- Node port for logsjson + nodePort: "" + # -- Protocol to use for logsjson + protocol: TCP + + # -- Configure liveness and readiness probes. + # ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes + livenessProbe: + enabled: true + port: 13133 + path: / + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + readinessProbe: + enabled: true + port: 13133 + path: / + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + + # -- Custom liveness probe + customLivenessProbe: {} + # -- Custom readiness probe + customReadinessProbe: {} + + # -- Extra volumes mount for OtelCollector pod + extraVolumeMounts: [] + # -- Extra volumes for OtelCollector pod + extraVolumes: [] + + ingress: + # -- Enable ingress for OtelCollector + enabled: false + # -- Ingress Class Name to be used to identify ingress controllers + className: "" + # -- Annotations to OtelCollector Ingress + annotations: {} + # cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/ssl-redirect: "true" + # nginx.ingress.kubernetes.io/backend-protocol: "GRPC" + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # -- OtelCollector Ingress Host names with their path details + hosts: + - host: otelcollector.domain.com + paths: + - path: / + pathType: ImplementationSpecific + port: 4318 + # -- OtelCollector Ingress TLS + tls: [] + # - secretName: chart-example-tls + # hosts: + # - otelcollector.domain.com + + # -- Configure resource requests and limits. Update according to your own use + # case as these values might not be suitable for your workload. 
+ # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 200Mi + # limits: + # cpu: "1" + # memory: 2Gi + + # -- OtelCollector priority class name + priorityClassName: "" + # -- Node selector for settings for OtelCollector pod + nodeSelector: { + spotinst.io/node-lifecycle: "od" + } + # -- Toleration labels for OtelCollector pod assignment + tolerations: [] + # -- Affinity settings for OtelCollector pod + affinity: {} + # -- TopologySpreadConstraints describes how OtelCollector pods ought to spread + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + app.kubernetes.io/component: otel-collector + + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + + autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 11 + targetCPUUtilizationPercentage: 50 + targetMemoryUtilizationPercentage: 50 + behavior: {} + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 180 + # scaleUp: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 2 + # periodSeconds: 60 + + autoscalingTemplate: [] + keda: + annotations: + enabled: false + pollingInterval: "30" # check 30sec periodically for metrics data + cooldownPeriod: "300" # once the load decreased, it will wait for 5 min and downscale + minReplicaCount: "1" # should be >= replicaCount specified in values.yaml + maxReplicaCount: "5" + triggers: [] + # - type: memory + # metadata: + # type: Utilization + # value: "80" # hpa make sure average Utilization <=80 by adding new pods + # - type: cpu + # metadata: + # type: Utilization + # value: "80" # hpa make sure average Utlization <=80 by adding new pods + + # -- Configurations for OtelCollector + # @default -- See `values.yaml` for defaults + config: + receivers: + otlp/spanmetrics: + protocols: + grpc: + endpoint: localhost:12345 + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + max_recv_msg_size_mib: 16 + http: + endpoint: 0.0.0.0:4318 + jaeger: + protocols: + grpc: + endpoint: 0.0.0.0:14250 + thrift_http: + endpoint: 0.0.0.0:14268 + # Uncomment to enable thift_company receiver. 
+ # You will also have set set enable it in `otelCollector.ports + # thrift_compact: + # endpoint: 0.0.0.0:6831 + hostmetrics: + collection_interval: 30s + scrapers: + cpu: {} + load: {} + memory: {} + disk: {} + filesystem: {} + network: {} + httplogreceiver/heroku: + # endpoint specifies the network interface and port which will receive data + endpoint: 0.0.0.0:8081 + source: heroku + httplogreceiver/json: + # endpoint specifies the network interface and port which will receive data + endpoint: 0.0.0.0:8082 + source: json + processors: + # default parsing of logs + # logstransform/internal: + # operators: + # - type: regex_parser + # id: traceid + # # https://regex101.com/r/yFW5UC/1 + # regex: '(?i)(^trace|(("| )+trace))((-|_||)id("|=| |-|:)*)(?P[A-Fa-f0-9]+)' + # parse_from: body + # parse_to: attributes.temp_trace + # if: 'body matches "(?i)(^trace|((\"| )+trace))((-|_||)id(\"|=| |-|:)*)(?P[A-Fa-f0-9]+)"' + # output: spanid + # - type: regex_parser + # id: spanid + # # https://regex101.com/r/DZ2gng/1 + # regex: '(?i)(^span|(("| )+span))((-|_||)id("|=| |-|:)*)(?P[A-Fa-f0-9]+)' + # parse_from: body + # parse_to: attributes.temp_trace + # if: 'body matches "(?i)(^span|((\"| )+span))((-|_||)id(\"|=| |-|:)*)(?P[A-Fa-f0-9]+)"' + # output: trace_parser + # - type: trace_parser + # id: trace_parser + # trace_id: + # parse_from: attributes.temp_trace.trace_id + # span_id: + # parse_from: attributes.temp_trace.span_id + # output: remove_temp + # - type: remove + # id: remove_temp + # field: attributes.temp_trace + # if: '"temp_trace" in attributes' + # Batch processor config. + # ref: https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md + batch: + send_batch_size: 50000 + timeout: 1s + # Resource detection processor config. + # ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/resourcedetectionprocessor/README.md + resourcedetection: + # detectors: include ec2/eks for AWS, gcp for GCP and azure/aks for Azure + # env detector included below adds custom labels using OTEL_RESOURCE_ATTRIBUTES envvar + detectors: + - env + # - elastic_beanstalk + # - eks + # - ecs + # - ec2 + # - gcp + # - azure + # - heroku + - system + timeout: 2s + system: + hostname_sources: [dns, os] + # Memory Limiter processor. + # If not set, will be overridden with values based on k8s resource limits. + # ref: https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/memorylimiterprocessor/README.md + # memory_limiter: null + signozspanmetrics/cumulative: + metrics_exporter: clickhousemetricswrite + latency_histogram_buckets: + [ + 100us, + 1ms, + 2ms, + 6ms, + 10ms, + 50ms, + 100ms, + 250ms, + 500ms, + 1000ms, + 1400ms, + 2000ms, + 5s, + 10s, + 20s, + 40s, + 60s, + ] + dimensions_cache_size: 100000 + dimensions: + - name: service.namespace + default: default + - name: deployment.environment + default: default + - name: signoz.collector.id + signozspanmetrics/delta: + metrics_exporter: clickhousemetricswrite + latency_histogram_buckets: + [ + 100us, + 1ms, + 2ms, + 6ms, + 10ms, + 50ms, + 100ms, + 250ms, + 500ms, + 1000ms, + 1400ms, + 2000ms, + 5s, + 10s, + 20s, + 40s, + 60s, + ] + dimensions_cache_size: 100000 + dimensions: + - name: service.namespace + default: default + - name: deployment.environment + default: default + - name: signoz.collector.id + aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA + # K8s Attribute processor config. 
+ # ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/k8sattributesprocessor/README.md + k8sattributes: + # -- Whether to detect the IP address of agents and add it as an attribute to all telemetry resources. + # If set to true, Agents will not make any k8s API calls, do any discovery of pods or extract any metadata. + passthrough: false + # -- Filters can be used to limit each OpenTelemetry agent to query pods based on specific + # selector to only dramatically reducing resource requirements for very large clusters. + filter: + # -- Restrict each OpenTelemetry agent to query pods running on the same node + node_from_env_var: K8S_NODE_NAME + pod_association: + - sources: + - from: resource_attribute + name: k8s.pod.ip + - sources: + - from: resource_attribute + name: k8s.pod.uid + - sources: + - from: connection + extract: + metadata: + - k8s.namespace.name + - k8s.pod.name + - k8s.pod.uid + - k8s.pod.start_time + - k8s.deployment.name + - k8s.node.name + extensions: + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: localhost:55679 + pprof: + endpoint: localhost:1777 + exporters: + clickhousetraces: + datasource: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@${env:CLICKHOUSE_HOST}:${env:CLICKHOUSE_PORT}/${env:CLICKHOUSE_TRACE_DATABASE} + low_cardinal_exception_grouping: ${env:LOW_CARDINAL_EXCEPTION_GROUPING} + clickhousemetricswrite: + endpoint: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@${env:CLICKHOUSE_HOST}:${env:CLICKHOUSE_PORT}/${env:CLICKHOUSE_DATABASE} + timeout: 15s + resource_to_telemetry_conversion: + enabled: true + clickhouselogsexporter: + dsn: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@${env:CLICKHOUSE_HOST}:${env:CLICKHOUSE_PORT}/${env:CLICKHOUSE_LOG_DATABASE} + timeout: 10s + use_new_schema: true + prometheus: + endpoint: 0.0.0.0:8889 + service: + telemetry: + logs: + encoding: json + metrics: + address: 0.0.0.0:8888 + extensions: [health_check, zpages, pprof] + pipelines: + traces: + receivers: [otlp, jaeger] + processors: [signozspanmetrics/cumulative, signozspanmetrics/delta, batch] + exporters: [clickhousetraces] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [clickhousemetricswrite] + metrics/internal: + receivers: [hostmetrics] + processors: [resourcedetection, k8sattributes, batch] + exporters: [clickhousemetricswrite] + logs: + receivers: [otlp, httplogreceiver/heroku, httplogreceiver/json] + processors: [batch] + exporters: [clickhouselogsexporter] + +# Default values for OtelCollectorMetrics +otelCollectorMetrics: + enabled: false + name: "otel-collector-metrics" + image: + registry: docker.io + repository: signoz/signoz-otel-collector + tag: 0.111.5 + pullPolicy: IfNotPresent + + # -- Image Registry Secret Names for OtelCollector + # If set, this has higher precedence than the root level or global value of imagePullSecrets. + imagePullSecrets: [] + + # OpenTelemetry Collector executable + command: + # -- OtelCollectorMetrics command name + name: /signoz-collector + # -- OtelCollectorMetrics command extra arguments + extraArgs: + - --feature-gates=-pkg.translator.prometheus.NormalizeName + + configMap: + # -- Specifies whether a configMap should be created (true by default) + create: true + + # OtelCollectorMetrics Service Account + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: + + # OtelCollectorMetrics service + service: + # -- Annotations to use by service associated to OtelCollectorMetrics + annotations: {} + # -- Labels to use by service associated to OtelCollectorMetrics + labels: {} + # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) + type: ClusterIP + + # -- OtelCollectorMetrics Deployment annotation. + annotations: {} + # -- OtelCollectorMetrics pod(s) annotation. + podAnnotations: + signoz.io/scrape: 'true' + signoz.io/port: '8888' + + # -- Additional environments to set for OtelCollectorMetrics + additionalEnvs: {} + # env_key: env_value + + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + + minReadySeconds: 5 + progressDeadlineSeconds: 600 + replicaCount: 1 + + initContainers: + init: + enabled: false + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + delay: 5 + endpoint: /ping + waitMessage: "waiting for clickhouseDB" + doneMessage: "clickhouse ready, starting otel collector metrics now" + resources: {} + # requests: + # cpu: 100m + # memory: 100Mi + # limits: + # cpu: 100m + # memory: 100Mi + + # Configuration for ports + ports: + metrics: + # -- Whether to enable service port for internal metrics + enabled: false + # -- Container port for internal metrics + containerPort: 8888 + # -- Service port for internal metrics + servicePort: 8888 + # -- Protocol to use for internal metrics + protocol: TCP + zpages: + # -- Whether to enable service port for ZPages + enabled: false + # -- Container port for Zpages + containerPort: 55679 + # -- Service port for Zpages + servicePort: 55679 + # -- Protocol to use for Zpages + protocol: TCP + health-check: + # -- Whether to enable service port for health check + enabled: true + # -- Container port for health check + containerPort: 13133 + # -- Service port for health check + servicePort: 13133 + # -- Protocol to use for health check + protocol: TCP + pprof: + # -- Whether to enable service port for pprof + enabled: false + # -- Container port for pprof + containerPort: 1777 + # -- Service port for pprof + servicePort: 1777 + # -- Protocol to use for pprof + protocol: TCP + + + ## Configure liveness and readiness probes. 
+ ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes + ## + livenessProbe: + enabled: true + port: 13133 + path: / + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + readinessProbe: + enabled: true + port: 13133 + path: / + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 6 + successThreshold: 1 + + ## Custom liveness and readiness probes + customLivenessProbe: {} + customReadinessProbe: {} + + # -- Extra volumes mount for OtelCollectorMetrics pod + extraVolumeMounts: [] + # -- Extra volumes for OtelCollectorMetrics pod + extraVolumes: [] + + ingress: + # -- Enable ingress for OtelCollectorMetrics + enabled: false + # -- Ingress Class Name to be used to identify ingress controllers + className: "" + # -- Annotations to OtelCollectorMetrics Ingress + annotations: {} + # cert-manager.io/cluster-issuer: letsencrypt-prod + # nginx.ingress.kubernetes.io/ssl-redirect: "true" + # nginx.ingress.kubernetes.io/backend-protocol: "GRPC" + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + # -- OtelCollectorMetrics Ingress Host names with their path details + hosts: + - host: otelcollector-metrics.domain.com + paths: + - path: / + pathType: ImplementationSpecific + port: 13133 + # -- OtelCollectorMetrics Ingress TLS + tls: [] + # - secretName: chart-example-tls + # hosts: + # - otelcollector-metrics.domain.com + + # -- Configure resource requests and limits. Update according to your own use + # case as these values might not be suitable for your workload. + # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ + # + # @default -- See `values.yaml` for defaults + resources: + requests: + cpu: 100m + memory: 100Mi + # limits: + # cpu: "1" + # memory: 2Gi + + # -- OtelCollectorMetrics priority class name + priorityClassName: "" + # -- Node selector for settings for OtelCollectorMetrics pod + nodeSelector: {} + # -- Toleration labels for OtelCollectorMetrics pod assignment + tolerations: [] + # -- Affinity settings for OtelCollectorMetrics pod + affinity: {} + # -- TopologySpreadConstraints describes how OtelCollectorMetrics pods ought to spread + topologySpreadConstraints: [] + + # OtelCollectorMetrics RBAC config + clusterRole: + # -- Specifies whether a clusterRole should be created + create: true + # -- Annotations to add to the clusterRole + annotations: {} + # -- The name of the clusterRole to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + # -- A set of rules as documented here. 
+ # ref: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ + # @default -- See `values.yaml` for defaults + rules: + # k8sattributes processor requires these permissions + - apiGroups: [""] + resources: ["pods", "namespaces", "nodes"] + verbs: ["get", "watch", "list"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["replicasets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions"] + resources: ["replicasets"] + verbs: ["get", "list", "watch"] + # other processors and receivers require these permissions + - apiGroups: [""] + resources: ["nodes", "nodes/proxy", "services", "endpoints"] + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions"] + resources: ["ingresses"] + verbs: ["get", "list", "watch"] + - nonResourceURLs: ["/metrics"] + verbs: ["get"] + + # OtelCollectorMetrics clusterRoleBinding + clusterRoleBinding: + # -- Annotations to add to the clusterRoleBinding + annotations: {} + # -- The name of the clusterRoleBinding to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + + # -- Configurations for OtelCollectorMetrics + # @default -- See `values.yaml` for defaults + config: + receivers: + # prometheus scrape config + prometheus: + config: + scrape_configs: + # generic prometheus metrics scraper (scrapped when signoz.io pod annotations are set) + - job_name: "generic-collector" + scrape_interval: 60s + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: + [__meta_kubernetes_pod_annotation_signoz_io_scrape] + action: keep + regex: true + - source_labels: + [__meta_kubernetes_pod_annotation_signoz_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: + [ + __meta_kubernetes_pod_ip, + __meta_kubernetes_pod_annotation_signoz_io_port, + ] + action: replace + separator: ":" + target_label: __address__ + - target_label: job_name + replacement: generic-collector + # Uncomment line below to include all labels of the pod + # - action: labelmap + # regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] + action: replace + target_label: signoz_k8s_name + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance] + action: replace + target_label: signoz_k8s_instance + - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component] + action: replace + target_label: signoz_k8s_component + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: k8s_namespace_name + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: k8s_pod_name + - source_labels: [__meta_kubernetes_pod_uid] + action: replace + target_label: k8s_pod_uid + - source_labels: [__meta_kubernetes_pod_container_name] + action: replace + target_label: k8s_container_name + - source_labels: [__meta_kubernetes_pod_container_name] + regex: (.+)-init + action: drop + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: k8s_node_name + - source_labels: [__meta_kubernetes_pod_ready] + action: replace + target_label: k8s_pod_ready + - source_labels: [__meta_kubernetes_pod_phase] + action: replace + target_label: k8s_pod_phase + processors: + # Batch processor config. + # ref: https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md + batch: + send_batch_size: 10000 + timeout: 1s + # Resource detection processor config. 
+ # ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/resourcedetectionprocessor/README.md + resourcedetection: + # detectors: include ec2/eks for AWS, gcp for GCP and azure/aks for Azure + # env detector included below adds custom labels using OTEL_RESOURCE_ATTRIBUTES envvar + detectors: + - env + # - elastic_beanstalk + # - eks + # - ecs + # - ec2 + # - gcp + # - azure + # - heroku + - system + timeout: 2s + system: + hostname_sources: [dns, os] + extensions: + health_check: + endpoint: 0.0.0.0:13133 + zpages: + endpoint: localhost:55679 + pprof: + endpoint: localhost:1777 + exporters: + clickhousemetricswrite: + timeout: 15s + endpoint: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@${env:CLICKHOUSE_HOST}:${env:CLICKHOUSE_PORT}/${env:CLICKHOUSE_DATABASE} + clickhousemetricswrite/hostmetrics: + endpoint: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@${env:CLICKHOUSE_HOST}:${env:CLICKHOUSE_PORT}/${env:CLICKHOUSE_DATABASE} + resource_to_telemetry_conversion: + enabled: true + service: + telemetry: + logs: + encoding: json + metrics: + address: 0.0.0.0:8888 + extensions: [health_check, zpages, pprof] + pipelines: + metrics: + receivers: [prometheus] + processors: [batch] + exporters: [clickhousemetricswrite] + +signoz-otel-gateway: + enabled: false diff --git a/modules/signoz-fluxcd/variables.tf b/modules/signoz-fluxcd/variables.tf new file mode 100644 index 00000000..2a917ff1 --- /dev/null +++ b/modules/signoz-fluxcd/variables.tf @@ -0,0 +1,67 @@ +variable "auto_deploy" { + description = "Auto deploy through ArgoCD" + type = bool + default = false +} + +variable "auto_prune" { + description = "Auto prune through ArgoCD" + type = bool + default = false +} + +variable "git_revision" { + description = "The git revision to deploy" + type = string + default = "main" +} + +variable "argo_deployment_name" { + description = "The name of the ArgoCD deployment, must be globally unique" + type = string +} + +variable "namespace" { + description = "The namespace to deploy into" + type = string +} + + +variable "enable_otel_ingress" { + description = "Enable OpenTelemetry ingress" + type = bool + default = false +} + +variable "gateway_namespace" { + description = "The namespace of the gateway" + type = string +} + +variable "cluster_name" { + description = "EKS cluster name" + type = string +} + +variable "auth0_jwks_uri" { + description = "The JWKS URI for Auth0" + type = string +} + +variable "smtp_user" { + description = "The SMTP user. Required if smtp_user, smtp_password, and smtp_from are set" + type = string + default = "" +} + +variable "smtp_password" { + description = "The SMTP password. Required if smtp_user, smtp_password, and smtp_from are set" + type = string + default = "" +} + +variable "smtp_from" { + description = "The SMTP from address. 
Required if smtp_user, smtp_password, and smtp_from are set" + type = string + default = "" +} diff --git a/modules/signoz-fluxcd/versions.tf b/modules/signoz-fluxcd/versions.tf new file mode 100644 index 00000000..ce834c32 --- /dev/null +++ b/modules/signoz-fluxcd/versions.tf @@ -0,0 +1,17 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.0" + } + kubectl = { + source = "gavinbunney/kubectl" + version = "1.14.0" + } + } +} + From 4644aa4d8d987638b4e3dfc298dd827a77c9c954 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:46:14 -0700 Subject: [PATCH 017/135] relative file reference --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index c4b32fe9..2826850b 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -29,7 +29,7 @@ resource "kubernetes_config_map" "signoz-values" { } data = { - "signoz_values.yaml" = "${file("modules/signoz-fluxcd/templates/values.yaml")}" + "signoz_values.yaml" = "${file("./templates/values.yaml")}" } } From ea309276a62244b4972a64c0cffbbddefdd1bae9 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:50:20 -0700 Subject: [PATCH 018/135] TRY ANOTHER FILE FORMAT --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 2826850b..2a4fe282 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -29,7 +29,7 @@ resource "kubernetes_config_map" "signoz-values" { } data = { - "signoz_values.yaml" = "${file("./templates/values.yaml")}" + "signoz_values.yaml" = "${file("${path.module}/templates/values.yaml")}" } } From 981c8630ee766abe1fb08920fd10e6ec5704a2e5 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:10:23 -0700 Subject: [PATCH 019/135] Deploy weave and correct signoz values --- .../stacks/dpe-k8s-deployments/main.tf | 10 +- modules/signoz-fluxcd/main.tf | 3 + modules/weave-gitops/README.md | 2 + modules/weave-gitops/main.tf | 63 ++++++ modules/weave-gitops/templates/values.yaml | 207 ++++++++++++++++++ modules/weave-gitops/variables.tf | 4 + modules/weave-gitops/versions.tf | 17 ++ 7 files changed, 303 insertions(+), 3 deletions(-) create mode 100644 modules/weave-gitops/README.md create mode 100644 modules/weave-gitops/main.tf create mode 100644 modules/weave-gitops/templates/values.yaml create mode 100644 modules/weave-gitops/variables.tf create mode 100644 modules/weave-gitops/versions.tf diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 8cf41dfb..34fbb222 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -102,9 +102,7 @@ module "signoz" { } module "signoz-flux-deployment" { - depends_on = [module.argo-cd] - # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" - # version = "0.5.0" + depends_on = [module.flux-cd] source = "../../../modules/signoz-fluxcd" auto_deploy = var.auto_deploy auto_prune = var.auto_prune @@ -120,6 +118,12 @@ module "signoz-flux-deployment" { smtp_from = var.smtp_from } +module "weave-gitops" { + depends_on = 
[module.flux-cd] + source = "../../../modules/weave-gitops" + namespace = "weave-gitops" +} + module "envoy-gateway" { count = var.enable_cluster_ingress ? 1 : 0 depends_on = [module.argo-cd] diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 2a4fe282..7107a15e 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -58,6 +58,9 @@ spec: alertmanager: enabled: false valuesFrom: + - kind: ConfigMap + name: signoz-values + valuesKey: signoz_values.yaml - kind: Secret name: clickhouse-admin-password valuesKey: password diff --git a/modules/weave-gitops/README.md b/modules/weave-gitops/README.md new file mode 100644 index 00000000..8f4fe05a --- /dev/null +++ b/modules/weave-gitops/README.md @@ -0,0 +1,2 @@ +# Purpose +Deploy the gitops server: \ No newline at end of file diff --git a/modules/weave-gitops/main.tf b/modules/weave-gitops/main.tf new file mode 100644 index 00000000..22cd1f0d --- /dev/null +++ b/modules/weave-gitops/main.tf @@ -0,0 +1,63 @@ +resource "kubernetes_namespace" "weave" { + metadata { + name = var.namespace + } +} + +resource "kubectl_manifest" "weave-git-repo" { + depends_on = [kubernetes_namespace.weave] + + yaml_body = < Date: Thu, 7 Nov 2024 16:13:37 -0700 Subject: [PATCH 020/135] deploy configmaps to namespace --- modules/signoz-fluxcd/main.tf | 3 ++- modules/weave-gitops/main.tf | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 7107a15e..a79c5e8e 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -25,7 +25,8 @@ YAML resource "kubernetes_config_map" "signoz-values" { metadata { - name = "signoz-values" + name = "signoz-values" + namespace = var.namespace } data = { diff --git a/modules/weave-gitops/main.tf b/modules/weave-gitops/main.tf index 22cd1f0d..01193541 100644 --- a/modules/weave-gitops/main.tf +++ b/modules/weave-gitops/main.tf @@ -28,7 +28,8 @@ YAML resource "kubernetes_config_map" "weave-values" { metadata { - name = "weave-values" + name = "weave-values" + namespace = var.namespace } data = { From 33c4255e68a557a8c4425a1248fd062b960dda8d Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:27:49 -0700 Subject: [PATCH 021/135] Create temp password hash --- modules/weave-gitops/templates/values.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/weave-gitops/templates/values.yaml b/modules/weave-gitops/templates/values.yaml index d52bf888..b66b020f 100644 --- a/modules/weave-gitops/templates/values.yaml +++ b/modules/weave-gitops/templates/values.yaml @@ -65,7 +65,7 @@ rbac: impersonationResources: ["users", "groups"] # -- If non-empty, this limits the secrets that can be accessed by # the service account to the specified ones, e.g. `['weave-gitops-enterprise-credentials']` - viewSecretsResourceNames: ["cluster-user-auth", "oidc-auth"] + viewSecretsResourceNames: ["cluster-user-auth"] # -- If non-empty, these additional rules will be appended to the RBAC role and the cluster role. # for example, # additionalRules: @@ -76,7 +76,7 @@ rbac: adminUser: # -- Whether the local admin user should be created. # If you use this make sure you add it to `rbac.impersonationResourceNames`. - create: false + create: true # -- Specifies whether the clusterRole & binding to the admin user should be created. # Will be created only if `adminUser.create` is enabled. 
Without this, # the adminUser will only be able to see resources in the target namespace. @@ -93,7 +93,7 @@ adminUser: # -- (string) Set the password for local admin user. Requires `adminUser.create` and `adminUser.createSecret` # This needs to have been hashed using bcrypt. # You can do this via our CLI with `gitops get bcrypt-hash`. - passwordHash: + passwordHash: $2b$12$ICeAVneTc.7fr3c1B7uyxO78UcKLcS4qon8blsPChE0BRPf1R92YO podAnnotations: {} podLabels: {} # aadpodidbinding: identity From 54d45dee3834014ba1c7e6525106e765f5889f9c Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 8 Nov 2024 13:15:29 -0500 Subject: [PATCH 022/135] adds clickhouse backup w/ s3 bucket --- modules/signoz-fluxcd/main.tf | 143 +++++++++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index a79c5e8e..f644aed2 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -35,9 +35,100 @@ resource "kubernetes_config_map" "signoz-values" { } + +resource "aws_iam_user" "backup" { + name = "clickhouse-backup-${var.namespace}" +} + +resource "aws_iam_access_key" "backup" { + user = aws_iam_user.backup.name +} + +// Create the S3 bucket +resource "aws_s3_bucket" "clickhouse_backup" { + bucket = "signoz-clickhouse-backup-${var.cluster_name}" +} + +// Enable versioning +resource "aws_s3_bucket_versioning" "clickhouse_backup" { + bucket = aws_s3_bucket.clickhouse_backup.id + versioning_configuration { + status = "Enabled" + } +} + +// Configure lifecycle rules for backup management +resource "aws_s3_bucket_lifecycle_configuration" "clickhouse_backup" { + bucket = aws_s3_bucket.clickhouse_backup.id + + rule { + id = "cleanup_old_backups" + status = "Enabled" + + expiration { + days = 30 // Adjust retention period as needed + } + + noncurrent_version_expiration { + noncurrent_days = 7 + } + } +} + +resource "aws_iam_user_policy" "backup" { + name = "clickhouse-backup-policy" + user = aws_iam_user.backup.name + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "s3:PutObject", + "s3:GetObject", + "s3:DeleteObject", + "s3:ListBucket" + ] + Resource = [ + "${aws_s3_bucket.clickhouse_backup.arn}/*", + aws_s3_bucket.clickhouse_backup.arn + ] + } + ] + }) +} + +resource "kubernetes_config_map" "clickhouse-backup-config" { + metadata { + name = "clickhouse-backup-config" + namespace = var.namespace + } + + data = { + "config.yml" = <<-EOT + general: + remote_storage: s3 + upload_concurrency: 4 + download_concurrency: 4 + disable_progress_bar: false + clickhouse: + host: localhost + port: 9000 + username: admin + password_from_env: CLICKHOUSE_PASSWORD + s3: + bucket: ${aws_s3_bucket.clickhouse_backup.id} + endpoint: s3.amazonaws.com + region: ${data.aws_region.current.name} + access_key: ${aws_iam_access_key.backup.id} + secret_key: ${aws_iam_access_key.backup.secret} + EOT + } +} + resource "kubectl_manifest" "signoz-helm-release" { depends_on = [kubernetes_namespace.signoz] - yaml_body = < Date: Fri, 8 Nov 2024 13:27:58 -0500 Subject: [PATCH 023/135] specify region --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index f644aed2..4261045f 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -120,7 +120,7 @@ resource "kubernetes_config_map" "clickhouse-backup-config" { s3: bucket: ${aws_s3_bucket.clickhouse_backup.id} 
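+        # Optional: clickhouse-backup also accepts an s3 "path" key to store
+        # backups under a prefix inside the bucket (left commented out here as
+        # an illustration, not part of the original config):
+        # path: clickhouse-backup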
endpoint: s3.amazonaws.com - region: ${data.aws_region.current.name} + region: us-east-1 access_key: ${aws_iam_access_key.backup.id} secret_key: ${aws_iam_access_key.backup.secret} EOT From 4a32bf5f71a48ac0cb9d0e6bb2c053db1b3acecd Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 8 Nov 2024 14:04:05 -0500 Subject: [PATCH 024/135] assign smtp_ vars --- modules/signoz-fluxcd/main.tf | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 4261045f..75944ce6 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -120,7 +120,7 @@ resource "kubernetes_config_map" "clickhouse-backup-config" { s3: bucket: ${aws_s3_bucket.clickhouse_backup.id} endpoint: s3.amazonaws.com - region: us-east-1 + region: ${var.region} access_key: ${aws_iam_access_key.backup.id} secret_key: ${aws_iam_access_key.backup.secret} EOT @@ -129,12 +129,17 @@ resource "kubernetes_config_map" "clickhouse-backup-config" { resource "kubectl_manifest" "signoz-helm-release" { depends_on = [kubernetes_namespace.signoz] + + smtp_user = var.smtp_user + smtp_password = var.smtp_password + smtp_from = var.smtp_from + yaml_body = < Date: Fri, 8 Nov 2024 14:27:06 -0500 Subject: [PATCH 025/135] sets variables in yaml --- modules/signoz-fluxcd/main.tf | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 75944ce6..0bad5634 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -120,7 +120,7 @@ resource "kubernetes_config_map" "clickhouse-backup-config" { s3: bucket: ${aws_s3_bucket.clickhouse_backup.id} endpoint: s3.amazonaws.com - region: ${var.region} + region: us-east-1 access_key: ${aws_iam_access_key.backup.id} secret_key: ${aws_iam_access_key.backup.secret} EOT @@ -130,10 +130,6 @@ resource "kubernetes_config_map" "clickhouse-backup-config" { resource "kubectl_manifest" "signoz-helm-release" { depends_on = [kubernetes_namespace.signoz] - smtp_user = var.smtp_user - smtp_password = var.smtp_password - smtp_from = var.smtp_from - yaml_body = < Date: Mon, 11 Nov 2024 10:11:49 -0500 Subject: [PATCH 026/135] revert first attempt - start fresh --- modules/signoz-fluxcd/main.tf | 163 +--------------------------------- 1 file changed, 1 insertion(+), 162 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 0bad5634..a79c5e8e 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -35,98 +35,6 @@ resource "kubernetes_config_map" "signoz-values" { } - -resource "aws_iam_user" "backup" { - name = "clickhouse-backup-${var.namespace}" -} - -resource "aws_iam_access_key" "backup" { - user = aws_iam_user.backup.name -} - -// Create the S3 bucket -resource "aws_s3_bucket" "clickhouse_backup" { - bucket = "signoz-clickhouse-backup-${var.cluster_name}" -} - -// Enable versioning -resource "aws_s3_bucket_versioning" "clickhouse_backup" { - bucket = aws_s3_bucket.clickhouse_backup.id - versioning_configuration { - status = "Enabled" - } -} - -// Configure lifecycle rules for backup management -resource "aws_s3_bucket_lifecycle_configuration" "clickhouse_backup" { - bucket = aws_s3_bucket.clickhouse_backup.id - - rule { - id = "cleanup_old_backups" - status = "Enabled" - - expiration { - days = 30 // Adjust retention period as needed - } - - noncurrent_version_expiration { - noncurrent_days = 7 - } - } -} - -resource "aws_iam_user_policy" "backup" { 
- name = "clickhouse-backup-policy" - user = aws_iam_user.backup.name - - policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Effect = "Allow" - Action = [ - "s3:PutObject", - "s3:GetObject", - "s3:DeleteObject", - "s3:ListBucket" - ] - Resource = [ - "${aws_s3_bucket.clickhouse_backup.arn}/*", - aws_s3_bucket.clickhouse_backup.arn - ] - } - ] - }) -} - -resource "kubernetes_config_map" "clickhouse-backup-config" { - metadata { - name = "clickhouse-backup-config" - namespace = var.namespace - } - - data = { - "config.yml" = <<-EOT - general: - remote_storage: s3 - upload_concurrency: 4 - download_concurrency: 4 - disable_progress_bar: false - clickhouse: - host: localhost - port: 9000 - username: admin - password_from_env: CLICKHOUSE_PASSWORD - s3: - bucket: ${aws_s3_bucket.clickhouse_backup.id} - endpoint: s3.amazonaws.com - region: us-east-1 - access_key: ${aws_iam_access_key.backup.id} - secret_key: ${aws_iam_access_key.backup.secret} - EOT - } -} - resource "kubectl_manifest" "signoz-helm-release" { depends_on = [kubernetes_namespace.signoz] @@ -135,7 +43,7 @@ apiVersion: helm.toolkit.fluxcd.io/v2 kind: HelmRelease metadata: name: signoz - namespace: ${var.namespace} + namespace: ${var.namespace} spec: interval: 10m chart: @@ -147,25 +55,6 @@ spec: name: signoz namespace: ${var.namespace} interval: 10m - helm: - releaseName: signoz-fluxcd - # Extra parameters to set (same as setting through values.yaml, but these take precedence) - parameters: - - name: "clickhouse.password" - value: ${random_password.clickhouse-admin-password.result} - %{if local.alertmanager_enabled} - - name: "alertmanager.enabled" - value: "true" - - name: "alertmanager.additionalEnvs.ALERTMANAGER_SMTP_FROM" - value: ${var.smtp_from} - - name: "alertmanager.additionalEnvs.ALERTMANAGER_SMTP_AUTH_USERNAME" - value: ${var.smtp_user} - - name: "alertmanager.additionalEnvs.ALERTMANAGER_SMTP_AUTH_PASSWORD" - value: ${var.smtp_password} - %{else} - - name: "alertmanager.enabled" - value: "false" - %{endif} values: alertmanager: enabled: false @@ -177,56 +66,6 @@ spec: name: clickhouse-admin-password valuesKey: password targetPath: clickhouse.password - postRenderers: - - kustomize: - patches: - - target: - kind: StatefulSet - name: signoz-clickhouse - patch: | - - op: add - path: /spec/template/spec/containers/- - value: - name: backup - image: altinity/clickhouse-backup:2.4.4 - imagePullPolicy: IfNotPresent - securityContext: - runAsUser: 101 - runAsGroup: 101 - env: - - name: CLICKHOUSE_HOST - value: "localhost" - - name: CLICKHOUSE_PORT - value: "9000" - - name: CLICKHOUSE_USER - value: "admin" - - name: CLICKHOUSE_PASSWORD - valueFrom: - secretKeyRef: - name: clickhouse-admin-password - key: password - volumeMounts: - - name: data - mountPath: /var/lib/clickhouse - - name: backup - mountPath: /var/lib/clickhouse/backup - - name: config - mountPath: /etc/clickhouse-backup - - target: - kind: StatefulSet - name: signoz-clickhouse - patch: | - - op: add - path: /spec/template/spec/volumes/- - value: - name: backup - emptyDir: {} - - op: add - path: /spec/template/spec/volumes/- - value: - name: config - configMap: - name: clickhouse-backup-config YAML } From 0997a7518ea2b9500c0df39ae732d354f7c96869 Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 11 Nov 2024 14:21:03 -0500 Subject: [PATCH 027/135] adds s3-bucket stack --- .../clickhouse-backup-bucket/main.tf | 5 +++++ modules/s3-bucket/main.tf | 17 +++++++++++++++++ modules/s3-bucket/outputs.tf | 9 +++++++++ modules/s3-bucket/variables.tf | 16 
++++++++++++++++ 4 files changed, 47 insertions(+) create mode 100644 deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf create mode 100644 modules/s3-bucket/main.tf create mode 100644 modules/s3-bucket/outputs.tf create mode 100644 modules/s3-bucket/variables.tf diff --git a/deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf b/deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf new file mode 100644 index 00000000..44ebc273 --- /dev/null +++ b/deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf @@ -0,0 +1,5 @@ +module "clickhouse_backup_bucket" { + source = "./modules/s3-bucket" + + bucket_name = "clickhouse-backup" +} diff --git a/modules/s3-bucket/main.tf b/modules/s3-bucket/main.tf new file mode 100644 index 00000000..fcd2c7c1 --- /dev/null +++ b/modules/s3-bucket/main.tf @@ -0,0 +1,17 @@ +resource "aws_s3_bucket" "bucket" { + bucket = var.bucket_name + + tags = merge( + var.tags, + { + Name = var.bucket_name + } + ) +} + +resource "aws_s3_bucket_versioning" "versioning" { + bucket = aws_s3_bucket.bucket.id + versioning_configuration { + status = var.enable_versioning ? "Enabled" : "Suspended" + } +} diff --git a/modules/s3-bucket/outputs.tf b/modules/s3-bucket/outputs.tf new file mode 100644 index 00000000..c8a58a33 --- /dev/null +++ b/modules/s3-bucket/outputs.tf @@ -0,0 +1,9 @@ +output "bucket_name" { + description = "Name of the created S3 bucket" + value = aws_s3_bucket.bucket.id +} + +output "bucket_arn" { + description = "ARN of the created S3 bucket" + value = aws_s3_bucket.bucket.arn +} diff --git a/modules/s3-bucket/variables.tf b/modules/s3-bucket/variables.tf new file mode 100644 index 00000000..b8e0a882 --- /dev/null +++ b/modules/s3-bucket/variables.tf @@ -0,0 +1,16 @@ +variable "bucket_name" { + description = "Name of the S3 bucket to create" + type = string +} + +variable "tags" { + description = "Tags to apply to the S3 bucket" + type = map(string) + default = {} +} + +variable "enable_versioning" { + description = "Enable versioning on the bucket" + type = bool + default = true +} From c541d395019300e13679736b209dc98a2f8facce Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 11 Nov 2024 14:29:35 -0500 Subject: [PATCH 028/135] fixes module path --- .../stacks/aws-resources/clickhouse-backup-bucket/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf b/deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf index 44ebc273..dd3f295e 100644 --- a/deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf +++ b/deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf @@ -1,5 +1,5 @@ module "clickhouse_backup_bucket" { - source = "./modules/s3-bucket" + source = "../../../../modules/s3-bucket" bucket_name = "clickhouse-backup" } From 70770cefbff2eb255d4b89ce2d3f074540417c38 Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 11 Nov 2024 14:45:25 -0500 Subject: [PATCH 029/135] move clickhouse backup to k8s deployments --- .../stacks/aws-resources/clickhouse-backup-bucket/main.tf | 5 ----- deployments/stacks/dpe-k8s-deployments/main.tf | 6 ++++++ 2 files changed, 6 insertions(+), 5 deletions(-) delete mode 100644 deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf diff --git a/deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf b/deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf deleted file mode 100644 index dd3f295e..00000000 --- 
a/deployments/stacks/aws-resources/clickhouse-backup-bucket/main.tf +++ /dev/null @@ -1,5 +0,0 @@ -module "clickhouse_backup_bucket" { - source = "../../../../modules/s3-bucket" - - bucket_name = "clickhouse-backup" -} diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 34fbb222..ea6b67ab 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -151,3 +151,9 @@ module "cert-manager" { namespace = "cert-manager" argo_deployment_name = "cert-manager" } + +module "clickhouse_backup_bucket" { + source = "./modules/s3-bucket" + + bucket_name = "clickhouse-backup" +} From 8b0791a96d462e7d95da54b45d2b9260d2192c83 Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 11 Nov 2024 14:50:18 -0500 Subject: [PATCH 030/135] fixes module path --- deployments/stacks/dpe-k8s-deployments/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index ea6b67ab..7bedc3b1 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -153,7 +153,7 @@ module "cert-manager" { } module "clickhouse_backup_bucket" { - source = "./modules/s3-bucket" + source = "../../../modules/s3-bucket" bucket_name = "clickhouse-backup" } From be39eb1d7e933cf503ab599378915fc313419acf Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 11 Nov 2024 15:15:04 -0500 Subject: [PATCH 031/135] give bucket unique name --- deployments/stacks/dpe-k8s-deployments/main.tf | 3 +-- modules/s3-bucket/main.tf | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 7bedc3b1..1945b4ea 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -154,6 +154,5 @@ module "cert-manager" { module "clickhouse_backup_bucket" { source = "../../../modules/s3-bucket" - - bucket_name = "clickhouse-backup" + bucket_name = "clickhouse-backup-${var.aws_account_id}" } diff --git a/modules/s3-bucket/main.tf b/modules/s3-bucket/main.tf index fcd2c7c1..967a0b36 100644 --- a/modules/s3-bucket/main.tf +++ b/modules/s3-bucket/main.tf @@ -1,6 +1,5 @@ resource "aws_s3_bucket" "bucket" { bucket = var.bucket_name - tags = merge( var.tags, { From 9c50599a2a9132d71e8df9b89af260cbcec06d19 Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 11 Nov 2024 15:27:44 -0500 Subject: [PATCH 032/135] adds IAM role for s3 access --- .../stacks/dpe-k8s-deployments/main.tf | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 1945b4ea..71b57083 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -156,3 +156,47 @@ module "clickhouse_backup_bucket" { source = "../../../modules/s3-bucket" bucket_name = "clickhouse-backup-${var.aws_account_id}" } + +resource "aws_iam_role" "clickhouse_backup_access" { + name = "clickhouse-backup-access-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = ["ec2.amazonaws.com", "eks.amazonaws.com"] + } + } + ] + }) +} + +resource "aws_iam_policy" "clickhouse_backup_policy" { + name = "clickhouse-backup-access-policy" + policy = jsonencode({ 
+ Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "s3:ListBucket", + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject" + ] + Resource = [ + module.clickhouse_backup_bucket.bucket_arn, + "${module.clickhouse_backup_bucket.bucket_arn}/*" + ] + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "clickhouse_backup_policy_attachment" { + role = aws_iam_role.clickhouse_backup_access.name + policy_arn = aws_iam_policy.clickhouse_backup_policy.arn +} From 87519d60a9e48fd56430e9f3520f1cd8e5f3dfb1 Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 11 Nov 2024 15:45:20 -0500 Subject: [PATCH 033/135] adds clickhouse backup job --- modules/signoz-fluxcd/main.tf | 39 +++++++++++++++++++ .../templates/clickhouse-backup-patch.yaml | 30 ++++++++++++++ modules/signoz-fluxcd/templates/values.yaml | 5 +++ 3 files changed, 74 insertions(+) create mode 100644 modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index a79c5e8e..15a768dd 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -35,6 +35,16 @@ resource "kubernetes_config_map" "signoz-values" { } +resource "kubernetes_service_account" "clickhouse-backup-service-account" { + metadata { + name = "clickhouse-backup" + namespace = var.namespace + annotations = { + "eks.amazonaws.com/role-arn" = "arn:aws:iam::${var.aws_account_id}:role/clickhouse-backup-access-role" + } + } +} + resource "kubectl_manifest" "signoz-helm-release" { depends_on = [kubernetes_namespace.signoz] @@ -66,6 +76,23 @@ spec: name: clickhouse-admin-password valuesKey: password targetPath: clickhouse.password + - kind: Secret + name: aws-credentials + valuesKey: aws_access_key_id + targetPath: clickhouse.s3.accessKey + - kind: Secret + name: aws-credentials + valuesKey: aws_secret_access_key + targetPath: clickhouse.s3.secretKey + postRenderers: + - kustomize: + patches: + - target: + version: v1 + kind: Deployment + name: clickhouse-backup + patch: | + ${file("${path.module}/templates/clickhouse-backup-patch.yaml")} YAML } @@ -173,3 +200,15 @@ resource "kubernetes_secret" "clickhouse-admin-password" { depends_on = [kubernetes_namespace.signoz] } + +resource "kubernetes_secret" "aws_credentials" { + metadata { + name = "aws-credentials" + namespace = var.namespace + } + + data = { + aws_access_key_id = var.aws_access_key_id + aws_secret_access_key = var.aws_secret_access_key + } +} diff --git a/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml b/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml new file mode 100644 index 00000000..01f74a46 --- /dev/null +++ b/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml @@ -0,0 +1,30 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: clickhouse-backup +spec: + schedule: "0 0 * * *" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + jobTemplate: + spec: + template: + spec: + containers: + - name: clickhouse-backup + image: aws/aws-cli:2.8.12 + command: + - /bin/sh + - -c + - | + aws s3 mb s3://clickhouse-backup-${aws_account_id} + aws s3 sync /data/clickhouse-backup s3://clickhouse-backup-${aws_account_id} + volumeMounts: + - name: backup-data + mountPath: /data/clickhouse-backup + volumes: + - name: backup-data + emptyDir: {} + serviceAccountName: clickhouse-backup-service-account + restartPolicy: OnFailure diff --git a/modules/signoz-fluxcd/templates/values.yaml b/modules/signoz-fluxcd/templates/values.yaml index 
fa93d5fe..f93b6374 100644 --- a/modules/signoz-fluxcd/templates/values.yaml +++ b/modules/signoz-fluxcd/templates/values.yaml @@ -41,6 +41,11 @@ imagePullSecrets: [] clickhouse: # -- Whether to install clickhouse. If false, `clickhouse.host` must be set enabled: true + storage: + type: s3 + endpoint: s3.amazonaws.com + bucket: clickhouse-backup-${aws_account_id} + region: us-east-1 # Zookeeper default values # Ref: https://github.com/bitnami/charts/blob/main/bitnami/zookeeper/values.yaml From 63b50b4beb2cd3c197c7ce6b64159cfeec0b9ae7 Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 11 Nov 2024 15:58:44 -0500 Subject: [PATCH 034/135] removes redundant aws secrets --- modules/signoz-fluxcd/main.tf | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 15a768dd..18cec81e 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -76,14 +76,6 @@ spec: name: clickhouse-admin-password valuesKey: password targetPath: clickhouse.password - - kind: Secret - name: aws-credentials - valuesKey: aws_access_key_id - targetPath: clickhouse.s3.accessKey - - kind: Secret - name: aws-credentials - valuesKey: aws_secret_access_key - targetPath: clickhouse.s3.secretKey postRenderers: - kustomize: patches: @@ -200,15 +192,3 @@ resource "kubernetes_secret" "clickhouse-admin-password" { depends_on = [kubernetes_namespace.signoz] } - -resource "kubernetes_secret" "aws_credentials" { - metadata { - name = "aws-credentials" - namespace = var.namespace - } - - data = { - aws_access_key_id = var.aws_access_key_id - aws_secret_access_key = var.aws_secret_access_key - } -} From 82b39567558b9f8f63c610e0c5f5708aa60f43e0 Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 11 Nov 2024 16:12:46 -0500 Subject: [PATCH 035/135] passes through aws account id --- deployments/stacks/dpe-k8s-deployments/main.tf | 1 + modules/signoz-fluxcd/main.tf | 1 + 2 files changed, 2 insertions(+) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 71b57083..a93cd6d5 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -116,6 +116,7 @@ module "signoz-flux-deployment" { smtp_password = var.smtp_password smtp_user = var.smtp_user smtp_from = var.smtp_from + aws_account_id = var.aws_account_id } module "weave-gitops" { diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 18cec81e..e06a8725 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -35,6 +35,7 @@ resource "kubernetes_config_map" "signoz-values" { } + resource "kubernetes_service_account" "clickhouse-backup-service-account" { metadata { name = "clickhouse-backup" From 8aae4dda83b7167337fcbb95420ae13c5f9f3ae4 Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 11 Nov 2024 16:16:41 -0500 Subject: [PATCH 036/135] adds aws account id var to signoz-fluxcd module --- modules/signoz-fluxcd/variables.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/signoz-fluxcd/variables.tf b/modules/signoz-fluxcd/variables.tf index 2a917ff1..c00e78d8 100644 --- a/modules/signoz-fluxcd/variables.tf +++ b/modules/signoz-fluxcd/variables.tf @@ -65,3 +65,8 @@ variable "smtp_from" { type = string default = "" } + +variable "aws_account_id" { + description = "The AWS account ID" + type = string +} From 7d8d4b6e7d143ae05a17799597062bd03033e9e0 Mon Sep 17 00:00:00 2001 From: bwmac Date: Tue, 12 Nov 2024 11:46:59 -0500 Subject: 
[PATCH 037/135] updates cronjob config --- .../templates/clickhouse-backup-patch.yaml | 144 +++++++++++++++--- 1 file changed, 125 insertions(+), 19 deletions(-) diff --git a/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml b/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml index 01f74a46..45b1e449 100644 --- a/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml +++ b/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml @@ -1,30 +1,136 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: clickhouse-backup-config + namespace: signoz-fluxcd +data: + backup_disk.xml: | + + + + + s3 + https://s3.us-east-1.amazonaws.com + 1 + us-east-1 + + + + + + disk + s3 + clickhouse-backup-{$aws_account_id} + + + +--- apiVersion: batch/v1 kind: CronJob metadata: - name: clickhouse-backup + name: clickhouse-backup-cron + namespace: signoz-fluxcd spec: schedule: "0 0 * * *" - concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 3 - failedJobsHistoryLimit: 1 + concurrencyPolicy: "Forbid" jobTemplate: spec: + backoffLimit: 1 template: + metadata: + labels: + app: clickhouse-backup-cron spec: + serviceAccountName: clickhouse-backup-service-account + restartPolicy: Never containers: - - name: clickhouse-backup - image: aws/aws-cli:2.8.12 - command: - - /bin/sh - - -c - - | - aws s3 mb s3://clickhouse-backup-${aws_account_id} - aws s3 sync /data/clickhouse-backup s3://clickhouse-backup-${aws_account_id} - volumeMounts: - - name: backup-data - mountPath: /data/clickhouse-backup + - name: run-backup-cron + image: clickhouse/clickhouse-client:latest + imagePullPolicy: IfNotPresent + env: + - name: CLICKHOUSE_SERVICES + value: "signoz-clickhouse" + - name: CLICKHOUSE_PORT + value: "9000" + - name: BACKUP_USER + valueFrom: + secretKeyRef: + name: clickhouse-admin-password + key: username + - name: BACKUP_PASSWORD + valueFrom: + secretKeyRef: + name: clickhouse-admin-password + key: password + - name: MAKE_INCREMENT_BACKUP + value: "1" + - name: FULL_BACKUP_WEEKDAY + value: "1" + command: + - bash + - -ec + - | + CLICKHOUSE_SERVICES=$(echo $CLICKHOUSE_SERVICES | tr "," " "); + BACKUP_DATE=$(date +%Y-%m-%d-%H-%M-%S); + declare -A BACKUP_NAMES; + declare -A DIFF_FROM; + if [[ "" != "$BACKUP_PASSWORD" ]]; then + BACKUP_PASSWORD="--password=$BACKUP_PASSWORD"; + fi; + for SERVER in $CLICKHOUSE_SERVICES; do + if [[ "1" == "$MAKE_INCREMENT_BACKUP" ]]; then + LAST_FULL_BACKUP=$(clickhouse-client -q "SELECT name FROM system.backup_list WHERE location='remote' AND name LIKE '%${SERVER}%' AND name LIKE '%full%' AND desc NOT LIKE 'broken%' ORDER BY created DESC LIMIT 1 FORMAT TabSeparatedRaw" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD); + TODAY_FULL_BACKUP=$(clickhouse-client -q "SELECT name FROM system.backup_list WHERE location='remote' AND name LIKE '%${SERVER}%' AND name LIKE '%full%' AND desc NOT LIKE 'broken%' AND toDate(created) = today() ORDER BY created DESC LIMIT 1 FORMAT TabSeparatedRaw" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD) + PREV_BACKUP_NAME=$(clickhouse-client -q "SELECT name FROM system.backup_list WHERE location='remote' AND desc NOT LIKE 'broken%' ORDER BY created DESC LIMIT 1 FORMAT TabSeparatedRaw" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD); + DIFF_FROM[$SERVER]=""; + if [[ ("$FULL_BACKUP_WEEKDAY" == "$(date +%u)" && "" == "$TODAY_FULL_BACKUP") || "" == "$PREV_BACKUP_NAME" || "" == "$LAST_FULL_BACKUP" ]]; then + 
BACKUP_NAMES[$SERVER]="full-$BACKUP_DATE"; + else + BACKUP_NAMES[$SERVER]="increment-$BACKUP_DATE"; + DIFF_FROM[$SERVER]="--diff-from-remote=$PREV_BACKUP_NAME"; + fi + else + BACKUP_NAMES[$SERVER]="full-$BACKUP_DATE"; + fi; + echo "set backup name on $SERVER = ${BACKUP_NAMES[$SERVER]}"; + done; + for SERVER in $CLICKHOUSE_SERVICES; do + echo "create ${BACKUP_NAMES[$SERVER]} on $SERVER"; + clickhouse-client --echo -mn -q "INSERT INTO system.backup_actions(command) VALUES('create ${SERVER}-${BACKUP_NAMES[$SERVER]}')" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD; + done; + for SERVER in $CLICKHOUSE_SERVICES; do + while [[ "in progress" == $(clickhouse-client -mn -q "SELECT status FROM system.backup_actions WHERE command='create ${SERVER}-${BACKUP_NAMES[$SERVER]}' FORMAT TabSeparatedRaw" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD) ]]; do + echo "still in progress ${BACKUP_NAMES[$SERVER]} on $SERVER"; + sleep 1; + done; + if [[ "success" != $(clickhouse-client -mn -q "SELECT status FROM system.backup_actions WHERE command='create ${SERVER}-${BACKUP_NAMES[$SERVER]}' FORMAT TabSeparatedRaw" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD) ]]; then + echo "error create ${BACKUP_NAMES[$SERVER]} on $SERVER"; + clickhouse-client -mn --echo -q "SELECT status,error FROM system.backup_actions WHERE command='create ${SERVER}-${BACKUP_NAMES[$SERVER]}'" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD; + exit 1; + fi; + done; + for SERVER in $CLICKHOUSE_SERVICES; do + echo "upload ${DIFF_FROM[$SERVER]} ${BACKUP_NAMES[$SERVER]} on $SERVER"; + clickhouse-client --echo -mn -q "INSERT INTO system.backup_actions(command) VALUES('upload ${DIFF_FROM[$SERVER]} ${SERVER}-${BACKUP_NAMES[$SERVER]}')" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD; + done; + for SERVER in $CLICKHOUSE_SERVICES; do + while [[ "in progress" == $(clickhouse-client -mn -q "SELECT status FROM system.backup_actions WHERE command='upload ${DIFF_FROM[$SERVER]} ${SERVER}-${BACKUP_NAMES[$SERVER]}'" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD) ]]; do + echo "upload still in progress ${BACKUP_NAMES[$SERVER]} on $SERVER"; + sleep 5; + done; + if [[ "success" != $(clickhouse-client -mn -q "SELECT status FROM system.backup_actions WHERE command='upload ${DIFF_FROM[$SERVER]} ${SERVER}-${BACKUP_NAMES[$SERVER]}'" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD) ]]; then + echo "error ${BACKUP_NAMES[$SERVER]} on $SERVER"; + clickhouse-client -mn --echo -q "SELECT status,error FROM system.backup_actions WHERE command='upload ${DIFF_FROM[$SERVER]} ${SERVER}-${BACKUP_NAMES[$SERVER]}'" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD; + exit 1; + fi; + clickhouse-client --echo -mn -q "INSERT INTO system.backup_actions(command) VALUES('delete local ${SERVER}-${BACKUP_NAMES[$SERVER]}')" --host="$SERVER" --port="$CLICKHOUSE_PORT" --user="$BACKUP_USER" $BACKUP_PASSWORD; + done; + echo "BACKUP CREATED" + volumeMounts: + - name: backup-config + mountPath: /etc/clickhouse-server/config.d/backup_disk.xml + subPath: backup_disk.xml volumes: - - name: backup-data - emptyDir: {} - serviceAccountName: clickhouse-backup-service-account - restartPolicy: OnFailure + - name: backup-config + configMap: + name: clickhouse-backup-config From 61c5855c49d0bb02cb58c883ad209a5808aa53e5 Mon Sep 17 
00:00:00 2001 From: bwmac Date: Tue, 12 Nov 2024 13:50:38 -0500 Subject: [PATCH 038/135] try adding default --- modules/signoz-fluxcd/variables.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/signoz-fluxcd/variables.tf b/modules/signoz-fluxcd/variables.tf index c00e78d8..1a070655 100644 --- a/modules/signoz-fluxcd/variables.tf +++ b/modules/signoz-fluxcd/variables.tf @@ -69,4 +69,5 @@ variable "smtp_from" { variable "aws_account_id" { description = "The AWS account ID" type = string + default = "" } From d294250ed1631fa01834e1b54244e16a8d5ce270 Mon Sep 17 00:00:00 2001 From: bwmac Date: Tue, 12 Nov 2024 14:10:21 -0500 Subject: [PATCH 039/135] create clickhouse config --- modules/signoz-fluxcd/main.tf | 30 +++++++++++++++++++ .../templates/clickhouse-backup-patch.yaml | 29 ++---------------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index e06a8725..6bf5907b 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -35,6 +35,36 @@ resource "kubernetes_config_map" "signoz-values" { } +resource "kubernetes_config_map" "clickhouse-backup-config" { + metadata { + name = "clickhouse-backup-config" + namespace = var.namespace + } + + data = { + "backup_disk.xml" = <<-EOT + + + + + s3 + https://s3.us-east-1.amazonaws.com + 1 + us-east-1 + + + + + + disk + s3 + clickhouse-backup-${var.aws_account_id} + + + + EOT + } +} resource "kubernetes_service_account" "clickhouse-backup-service-account" { metadata { diff --git a/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml b/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml index 45b1e449..9c27e347 100644 --- a/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml +++ b/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml @@ -1,32 +1,7 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: clickhouse-backup-config - namespace: signoz-fluxcd -data: - backup_disk.xml: | - - - - - s3 - https://s3.us-east-1.amazonaws.com - 1 - us-east-1 - - - - - - disk - s3 - clickhouse-backup-{$aws_account_id} - - - ---- apiVersion: batch/v1 kind: CronJob +dependsOn: + - clickhouse-backup-config metadata: name: clickhouse-backup-cron namespace: signoz-fluxcd From ae851cafad741b0040d1614d59c84f67ca7bc0e2 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 09:19:41 -0500 Subject: [PATCH 040/135] try combining config maps --- modules/signoz-fluxcd/main.tf | 12 +----------- .../templates/clickhouse-backup-patch.yaml | 2 +- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 6bf5907b..20f0a28c 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -31,17 +31,6 @@ resource "kubernetes_config_map" "signoz-values" { data = { "signoz_values.yaml" = "${file("${path.module}/templates/values.yaml")}" - } - -} - -resource "kubernetes_config_map" "clickhouse-backup-config" { - metadata { - name = "clickhouse-backup-config" - namespace = var.namespace - } - - data = { "backup_disk.xml" = <<-EOT @@ -64,6 +53,7 @@ resource "kubernetes_config_map" "clickhouse-backup-config" { EOT } + } resource "kubernetes_service_account" "clickhouse-backup-service-account" { diff --git a/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml b/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml index 9c27e347..0010c9d5 100644 --- a/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml +++ 
b/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml @@ -108,4 +108,4 @@ spec: volumes: - name: backup-config configMap: - name: clickhouse-backup-config + name: signoz-values From d0ba693efd081a59a3874890530f8497f8cbc14b Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 09:57:22 -0500 Subject: [PATCH 041/135] creates test pod for S3 connection --- modules/signoz-fluxcd/main.tf | 72 ++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 9 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 20f0a28c..9adae3ff 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -97,17 +97,71 @@ spec: name: clickhouse-admin-password valuesKey: password targetPath: clickhouse.password - postRenderers: - - kustomize: - patches: - - target: - version: v1 - kind: Deployment - name: clickhouse-backup - patch: | - ${file("${path.module}/templates/clickhouse-backup-patch.yaml")} YAML } + # postRenderers: + # - kustomize: + # patches: + # - target: + # version: v1 + # kind: Deployment + # name: clickhouse-backup + # patch: | + # apiVersion: apps/v1 + # kind: Deployment + # metadata: + # name: nginx + # namespace: demo-s3 + # spec: + # selector: + # matchLabels: + # app: nginx + # template: + # metadata: + # labels: + # app: nginx + # spec: + # serviceAccountName: default + # initContainers: + # - name: demo-aws-cli + # image: amazon/aws-cli + # command: ['aws', 's3', 'cp', 's3://demo-bucket/test.txt, '-' + # containers: + # - name: my-app + # image: nginx + # ${file("${path.module}/templates/clickhouse-backup-patch.yaml")} + +resource "kubectl_manifest" "s3_test_pod" { + yaml_body = < Date: Wed, 13 Nov 2024 10:15:21 -0500 Subject: [PATCH 042/135] fixes service account name --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 9adae3ff..2bab360c 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -58,7 +58,7 @@ resource "kubernetes_config_map" "signoz-values" { resource "kubernetes_service_account" "clickhouse-backup-service-account" { metadata { - name = "clickhouse-backup" + name = "clickhouse-backup-service-account" namespace = var.namespace annotations = { "eks.amazonaws.com/role-arn" = "arn:aws:iam::${var.aws_account_id}:role/clickhouse-backup-access-role" From 7121fdc169dfd0d7c4e0abf2c783ef8a14d897ea Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 10:32:18 -0500 Subject: [PATCH 043/135] updates iam role access --- .../stacks/dpe-k8s-deployments/main.tf | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index a93cd6d5..ba4c0049 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -158,23 +158,6 @@ module "clickhouse_backup_bucket" { bucket_name = "clickhouse-backup-${var.aws_account_id}" } -resource "aws_iam_role" "clickhouse_backup_access" { - name = "clickhouse-backup-access-role" - - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Action = "sts:AssumeRole" - Effect = "Allow" - Principal = { - Service = ["ec2.amazonaws.com", "eks.amazonaws.com"] - } - } - ] - }) -} - resource "aws_iam_policy" "clickhouse_backup_policy" { name = "clickhouse-backup-access-policy" policy = jsonencode({ @@ -197,6 +180,24 @@ resource "aws_iam_policy" 
"clickhouse_backup_policy" { }) } +resource "aws_iam_role" "clickhouse_backup_access" { + name = "clickhouse-backup-access-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRoleWithWebIdentity" + Effect = "Allow" + Principal = { + Service = ["ec2.amazonaws.com", "eks.amazonaws.com"] + } + } + ] + }) +} + + resource "aws_iam_role_policy_attachment" "clickhouse_backup_policy_attachment" { role = aws_iam_role.clickhouse_backup_access.name policy_arn = aws_iam_policy.clickhouse_backup_policy.arn From ac90e5b8e27786f5ae2107962edb6b856beff1b8 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 10:48:56 -0500 Subject: [PATCH 044/135] updates IAM role access --- deployments/stacks/dpe-k8s-deployments/main.tf | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index ba4c0049..728ed1c3 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -160,6 +160,7 @@ module "clickhouse_backup_bucket" { resource "aws_iam_policy" "clickhouse_backup_policy" { name = "clickhouse-backup-access-policy" + description = "Policy to access the clickhouse backup bucket" policy = jsonencode({ Version = "2012-10-17" Statement = [ @@ -180,8 +181,9 @@ resource "aws_iam_policy" "clickhouse_backup_policy" { }) } -resource "aws_iam_role" "clickhouse_backup_access" { +resource "aws_iam_role" "clickhouse_backup_access_role" { name = "clickhouse-backup-access-role" + description = "Assumed role to access the clickhouse backup policy" assume_role_policy = jsonencode({ Version = "2012-10-17" @@ -190,7 +192,12 @@ resource "aws_iam_role" "clickhouse_backup_access" { Action = "sts:AssumeRoleWithWebIdentity" Effect = "Allow" Principal = { - Service = ["ec2.amazonaws.com", "eks.amazonaws.com"] + Federated = "arn:aws:iam::${var.aws_account_id}:oidc-provider/oidc.eks.${var.region}.amazonaws.com/id/${var.cluster_name}" + } + Condition = { + StringEquals = { + "oidc.eks.${var.region}.amazonaws.com/id/${var.cluster_name}:aud" = "sts.amazonaws.com" + } } } ] @@ -199,6 +206,6 @@ resource "aws_iam_role" "clickhouse_backup_access" { resource "aws_iam_role_policy_attachment" "clickhouse_backup_policy_attachment" { - role = aws_iam_role.clickhouse_backup_access.name + role = aws_iam_role.clickhouse_backup_access_role.name policy_arn = aws_iam_policy.clickhouse_backup_policy.arn } From 575d9f6698c340a39b9b43de9e6039a80fbf8612 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 11:00:25 -0500 Subject: [PATCH 045/135] revert role resource name change --- deployments/stacks/dpe-k8s-deployments/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 728ed1c3..c55c1bef 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -181,7 +181,7 @@ resource "aws_iam_policy" "clickhouse_backup_policy" { }) } -resource "aws_iam_role" "clickhouse_backup_access_role" { +resource "aws_iam_role" "clickhouse_backup_access" { name = "clickhouse-backup-access-role" description = "Assumed role to access the clickhouse backup policy" @@ -206,6 +206,6 @@ resource "aws_iam_role" "clickhouse_backup_access_role" { resource "aws_iam_role_policy_attachment" "clickhouse_backup_policy_attachment" { - role = 
aws_iam_role.clickhouse_backup_access_role.name + role = aws_iam_role.clickhouse_backup_access.name policy_arn = aws_iam_policy.clickhouse_backup_policy.arn } From d0f57cfc27cc402a3a1864d7935fc08476e93539 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 11:11:01 -0500 Subject: [PATCH 046/135] adds OIDC provider --- deployments/stacks/dpe-k8s-deployments/main.tf | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index c55c1bef..b8d609ab 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -158,6 +158,20 @@ module "clickhouse_backup_bucket" { bucket_name = "clickhouse-backup-${var.aws_account_id}" } +data "tls_certificate" "eks" { + url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer +} + +resource "aws_iam_openid_connect_provider" "eks" { + client_id_list = ["sts.amazonaws.com"] + thumbprint_list = [data.tls_certificate.eks.certificates[0].sha1_fingerprint] + url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer + + tags = { + Name = "${var.cluster_name}-eks-irsa" + } +} + resource "aws_iam_policy" "clickhouse_backup_policy" { name = "clickhouse-backup-access-policy" description = "Policy to access the clickhouse backup bucket" @@ -192,11 +206,11 @@ resource "aws_iam_role" "clickhouse_backup_access" { Action = "sts:AssumeRoleWithWebIdentity" Effect = "Allow" Principal = { - Federated = "arn:aws:iam::${var.aws_account_id}:oidc-provider/oidc.eks.${var.region}.amazonaws.com/id/${var.cluster_name}" + Federated = aws_iam_openid_connect_provider.eks.arn } Condition = { StringEquals = { - "oidc.eks.${var.region}.amazonaws.com/id/${var.cluster_name}:aud" = "sts.amazonaws.com" + "${aws_iam_openid_connect_provider.eks.url}:aud" = "sts.amazonaws.com" } } } From 0ec3d469d5087d338007d305b74f9abe79c80908 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 11:21:10 -0500 Subject: [PATCH 047/135] uses exustung OIDC provider --- deployments/stacks/dpe-k8s-deployments/main.tf | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index b8d609ab..3074cedf 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -158,20 +158,10 @@ module "clickhouse_backup_bucket" { bucket_name = "clickhouse-backup-${var.aws_account_id}" } -data "tls_certificate" "eks" { +data "aws_iam_openid_connect_provider" "eks" { url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer } -resource "aws_iam_openid_connect_provider" "eks" { - client_id_list = ["sts.amazonaws.com"] - thumbprint_list = [data.tls_certificate.eks.certificates[0].sha1_fingerprint] - url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer - - tags = { - Name = "${var.cluster_name}-eks-irsa" - } -} - resource "aws_iam_policy" "clickhouse_backup_policy" { name = "clickhouse-backup-access-policy" description = "Policy to access the clickhouse backup bucket" @@ -206,11 +196,11 @@ resource "aws_iam_role" "clickhouse_backup_access" { Action = "sts:AssumeRoleWithWebIdentity" Effect = "Allow" Principal = { - Federated = aws_iam_openid_connect_provider.eks.arn + Federated = data.aws_iam_openid_connect_provider.eks.arn } Condition = { StringEquals = { - "${aws_iam_openid_connect_provider.eks.url}:aud" = "sts.amazonaws.com" + 
"${data.aws_iam_openid_connect_provider.eks.url}:aud" = "sts.amazonaws.com" } } } From 9de97dae53fa305e08d9bf739c65dcfa5296bc68 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 12:26:48 -0500 Subject: [PATCH 048/135] creates oidc provider --- deployments/stacks/dpe-k8s-deployments/main.tf | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 3074cedf..b8d609ab 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -158,10 +158,20 @@ module "clickhouse_backup_bucket" { bucket_name = "clickhouse-backup-${var.aws_account_id}" } -data "aws_iam_openid_connect_provider" "eks" { +data "tls_certificate" "eks" { url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer } +resource "aws_iam_openid_connect_provider" "eks" { + client_id_list = ["sts.amazonaws.com"] + thumbprint_list = [data.tls_certificate.eks.certificates[0].sha1_fingerprint] + url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer + + tags = { + Name = "${var.cluster_name}-eks-irsa" + } +} + resource "aws_iam_policy" "clickhouse_backup_policy" { name = "clickhouse-backup-access-policy" description = "Policy to access the clickhouse backup bucket" @@ -196,11 +206,11 @@ resource "aws_iam_role" "clickhouse_backup_access" { Action = "sts:AssumeRoleWithWebIdentity" Effect = "Allow" Principal = { - Federated = data.aws_iam_openid_connect_provider.eks.arn + Federated = aws_iam_openid_connect_provider.eks.arn } Condition = { StringEquals = { - "${data.aws_iam_openid_connect_provider.eks.url}:aud" = "sts.amazonaws.com" + "${aws_iam_openid_connect_provider.eks.url}:aud" = "sts.amazonaws.com" } } } From 76e0d7e19d097c2d197cd6df59b998bf640475fb Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 13:23:21 -0500 Subject: [PATCH 049/135] hard-code OIDC url for testing --- .../stacks/dpe-k8s-deployments/main.tf | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index b8d609ab..09963866 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -158,19 +158,19 @@ module "clickhouse_backup_bucket" { bucket_name = "clickhouse-backup-${var.aws_account_id}" } -data "tls_certificate" "eks" { - url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer -} +# data "tls_certificate" "eks" { +# url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer +# } -resource "aws_iam_openid_connect_provider" "eks" { - client_id_list = ["sts.amazonaws.com"] - thumbprint_list = [data.tls_certificate.eks.certificates[0].sha1_fingerprint] - url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer +# resource "aws_iam_openid_connect_provider" "eks" { +# client_id_list = ["sts.amazonaws.com"] +# thumbprint_list = [data.tls_certificate.eks.certificates[0].sha1_fingerprint] +# url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer - tags = { - Name = "${var.cluster_name}-eks-irsa" - } -} +# tags = { +# Name = "${var.cluster_name}-eks-irsa" +# } +# } resource "aws_iam_policy" "clickhouse_backup_policy" { name = "clickhouse-backup-access-policy" @@ -206,13 +206,13 @@ resource "aws_iam_role" "clickhouse_backup_access" { Action = "sts:AssumeRoleWithWebIdentity" Effect = "Allow" Principal = { - Federated = aws_iam_openid_connect_provider.eks.arn - } - 
Condition = { - StringEquals = { - "${aws_iam_openid_connect_provider.eks.url}:aud" = "sts.amazonaws.com" - } + Federated = "https://oidc.eks.us-east-1.amazonaws.com/id/DA1DF11424BEFC68B1726FDB70DA037E" } + # Condition = { + # StringEquals = { + # "${data.aws_iam_openid_connect_provider.eks.url}:aud" = "sts.amazonaws.com" + # } + # } } ] }) From c45595bdc1ce502f3286cfb90a227e676c164bdb Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 13:51:13 -0500 Subject: [PATCH 050/135] try using eks module output --- .../stacks/dpe-k8s-deployments/main.tf | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 09963866..ba86fa19 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -158,19 +158,19 @@ module "clickhouse_backup_bucket" { bucket_name = "clickhouse-backup-${var.aws_account_id}" } -# data "tls_certificate" "eks" { -# url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer -# } +data "tls_certificate" "eks" { + url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer +} -# resource "aws_iam_openid_connect_provider" "eks" { -# client_id_list = ["sts.amazonaws.com"] -# thumbprint_list = [data.tls_certificate.eks.certificates[0].sha1_fingerprint] -# url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer +resource "aws_iam_openid_connect_provider" "eks" { + client_id_list = ["sts.amazonaws.com"] + thumbprint_list = [data.tls_certificate.eks.certificates[0].sha1_fingerprint] + url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer -# tags = { -# Name = "${var.cluster_name}-eks-irsa" -# } -# } + tags = { + Name = "${var.cluster_name}-eks-irsa" + } +} resource "aws_iam_policy" "clickhouse_backup_policy" { name = "clickhouse-backup-access-policy" @@ -206,7 +206,7 @@ resource "aws_iam_role" "clickhouse_backup_access" { Action = "sts:AssumeRoleWithWebIdentity" Effect = "Allow" Principal = { - Federated = "https://oidc.eks.us-east-1.amazonaws.com/id/DA1DF11424BEFC68B1726FDB70DA037E" + Federated = "arn:aws:iam::${var.aws_account_id}:oidc-provider/oidc.eks.us-east-1.amazonaws.com/id/${module.eks.cluster_id}" } # Condition = { # StringEquals = { From 85aa4855d4689ee47db24d77081d233f5ef62b5a Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 14:00:26 -0500 Subject: [PATCH 051/135] try sage-aws-eks module --- deployments/stacks/dpe-k8s-deployments/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index ba86fa19..ed690d0c 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -206,7 +206,7 @@ resource "aws_iam_role" "clickhouse_backup_access" { Action = "sts:AssumeRoleWithWebIdentity" Effect = "Allow" Principal = { - Federated = "arn:aws:iam::${var.aws_account_id}:oidc-provider/oidc.eks.us-east-1.amazonaws.com/id/${module.eks.cluster_id}" + Federated = "arn:aws:iam::${var.aws_account_id}:oidc-provider/oidc.eks.us-east-1.amazonaws.com/id/${module.sage-aws-eks.eks.cluster_id}" } # Condition = { # StringEquals = { From 936319c030e45b8eb8c8ccf65839ee715e1299bc Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 14:22:16 -0500 Subject: [PATCH 052/135] adds cluster_id to deployment for OIDC provider --- deployments/spacelift/dpe-k8s/main.tf | 1 + deployments/stacks/dpe-k8s-deployments/main.tf 
| 3 ++- deployments/stacks/dpe-k8s-deployments/variables.tf | 5 +++++ deployments/stacks/dpe-k8s/outputs.tf | 4 ++++ modules/sage-aws-eks/ouputs.tf | 4 ++++ 5 files changed, 16 insertions(+), 1 deletion(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index 73146ec8..345ae08b 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -43,6 +43,7 @@ locals { pod_to_node_dns_sg_id = "TF_VAR_pod_to_node_dns_sg_id" smtp_user = "TF_VAR_smtp_user" smtp_password = "TF_VAR_smtp_password" + cluster_id = "TF_VAR_cluster_id" } } diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index ed690d0c..5304d705 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -206,7 +206,8 @@ resource "aws_iam_role" "clickhouse_backup_access" { Action = "sts:AssumeRoleWithWebIdentity" Effect = "Allow" Principal = { - Federated = "arn:aws:iam::${var.aws_account_id}:oidc-provider/oidc.eks.us-east-1.amazonaws.com/id/${module.sage-aws-eks.eks.cluster_id}" + # https://oidc.eks.us-east-1.amazonaws.com/id/DA1DF11424BEFC68B1726FDB70DA037E + Federated = "arn:aws:iam::${var.aws_account_id}:oidc-provider/oidc.eks.us-east-1.amazonaws.com/id/${var.cluster_id}" } # Condition = { # StringEquals = { diff --git a/deployments/stacks/dpe-k8s-deployments/variables.tf b/deployments/stacks/dpe-k8s-deployments/variables.tf index 2b9be26a..d974b342 100644 --- a/deployments/stacks/dpe-k8s-deployments/variables.tf +++ b/deployments/stacks/dpe-k8s-deployments/variables.tf @@ -40,6 +40,11 @@ variable "cluster_name" { type = string } +variable "cluster_id" { + description = "EKS cluster ID" + type = string +} + variable "spotinst_account" { description = "Spot.io account" type = string diff --git a/deployments/stacks/dpe-k8s/outputs.tf b/deployments/stacks/dpe-k8s/outputs.tf index 7a0e2ceb..488f0c70 100644 --- a/deployments/stacks/dpe-k8s/outputs.tf +++ b/deployments/stacks/dpe-k8s/outputs.tf @@ -38,6 +38,10 @@ output "cluster_name" { value = module.sage-aws-eks.cluster_name } +output "cluster_id" { + value = module.sage-aws-eks.cluster_id +} + output "smtp_user" { value = length(module.sage-aws-ses) > 0 ? 
module.sage-aws-ses[0].smtp_user : "" } diff --git a/modules/sage-aws-eks/ouputs.tf b/modules/sage-aws-eks/ouputs.tf index 59692964..3ed962e7 100644 --- a/modules/sage-aws-eks/ouputs.tf +++ b/modules/sage-aws-eks/ouputs.tf @@ -13,3 +13,7 @@ output "node_security_group_id" { output "pod_to_node_dns_sg_id" { value = aws_security_group.pod-dns-egress.id } + +output "cluster_id" { + value = module.eks.cluster_id +} From 9da27e3a29052b32b757ee38bc42ad9f848fed95 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 14:39:49 -0500 Subject: [PATCH 053/135] changes to using oidc provider arn directly (infra) --- deployments/spacelift/dpe-k8s/main.tf | 2 +- deployments/stacks/dpe-k8s/outputs.tf | 4 ++-- modules/sage-aws-eks/ouputs.tf | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index 345ae08b..41804c3e 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -43,7 +43,7 @@ locals { pod_to_node_dns_sg_id = "TF_VAR_pod_to_node_dns_sg_id" smtp_user = "TF_VAR_smtp_user" smtp_password = "TF_VAR_smtp_password" - cluster_id = "TF_VAR_cluster_id" + cluster_oidc_provider_arn = "TF_VAR_cluster_oidc_provider_arn" } } diff --git a/deployments/stacks/dpe-k8s/outputs.tf b/deployments/stacks/dpe-k8s/outputs.tf index 488f0c70..1a920dad 100644 --- a/deployments/stacks/dpe-k8s/outputs.tf +++ b/deployments/stacks/dpe-k8s/outputs.tf @@ -38,8 +38,8 @@ output "cluster_name" { value = module.sage-aws-eks.cluster_name } -output "cluster_id" { - value = module.sage-aws-eks.cluster_id +output "cluster_oidc_provider_arn" { + value = module.sage-aws-eks.cluster_oidc_provider_arn } output "smtp_user" { diff --git a/modules/sage-aws-eks/ouputs.tf b/modules/sage-aws-eks/ouputs.tf index 3ed962e7..8114420b 100644 --- a/modules/sage-aws-eks/ouputs.tf +++ b/modules/sage-aws-eks/ouputs.tf @@ -14,6 +14,6 @@ output "pod_to_node_dns_sg_id" { value = aws_security_group.pod-dns-egress.id } -output "cluster_id" { - value = module.eks.cluster_id +output "cluster_oidc_provider_arn" { + value = module.eks.oidc_provider_arn } From d92364562100a4932c5f0947530ece77055afb97 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 14:40:20 -0500 Subject: [PATCH 054/135] changes to using OIDC arn directly (deployments) --- deployments/stacks/dpe-k8s-deployments/main.tf | 3 ++- deployments/stacks/dpe-k8s-deployments/variables.tf | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 5304d705..32a7978d 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -155,6 +155,7 @@ module "cert-manager" { module "clickhouse_backup_bucket" { source = "../../../modules/s3-bucket" + # bucket_name = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" bucket_name = "clickhouse-backup-${var.aws_account_id}" } @@ -207,7 +208,7 @@ resource "aws_iam_role" "clickhouse_backup_access" { Effect = "Allow" Principal = { # https://oidc.eks.us-east-1.amazonaws.com/id/DA1DF11424BEFC68B1726FDB70DA037E - Federated = "arn:aws:iam::${var.aws_account_id}:oidc-provider/oidc.eks.us-east-1.amazonaws.com/id/${var.cluster_id}" + Federated = "${var.cluster_oidc_provider_arn}" } # Condition = { # StringEquals = { diff --git a/deployments/stacks/dpe-k8s-deployments/variables.tf b/deployments/stacks/dpe-k8s-deployments/variables.tf index 
d974b342..cc95e545 100644 --- a/deployments/stacks/dpe-k8s-deployments/variables.tf +++ b/deployments/stacks/dpe-k8s-deployments/variables.tf @@ -40,8 +40,8 @@ variable "cluster_name" { type = string } -variable "cluster_id" { - description = "EKS cluster ID" +variable "cluster_oidc_provider_arn" { + description = "EKS cluster ARN for the oidc provider" type = string } From 2f45ef4acbf0029964ca1a6b8ab8947b917a832f Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 13 Nov 2024 14:48:07 -0500 Subject: [PATCH 055/135] remove oidc creation --- .../stacks/dpe-k8s-deployments/main.tf | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 32a7978d..a17fb475 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -159,19 +159,19 @@ module "clickhouse_backup_bucket" { bucket_name = "clickhouse-backup-${var.aws_account_id}" } -data "tls_certificate" "eks" { - url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer -} - -resource "aws_iam_openid_connect_provider" "eks" { - client_id_list = ["sts.amazonaws.com"] - thumbprint_list = [data.tls_certificate.eks.certificates[0].sha1_fingerprint] - url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer - - tags = { - Name = "${var.cluster_name}-eks-irsa" - } -} +# data "tls_certificate" "eks" { +# url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer +# } + +# resource "aws_iam_openid_connect_provider" "eks" { +# client_id_list = ["sts.amazonaws.com"] +# thumbprint_list = [data.tls_certificate.eks.certificates[0].sha1_fingerprint] +# url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer + +# tags = { +# Name = "${var.cluster_name}-eks-irsa" +# } +# } resource "aws_iam_policy" "clickhouse_backup_policy" { name = "clickhouse-backup-access-policy" From 0e6208444ca3d0bc21574a0adc913acabdbf2c8d Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 10:20:06 -0500 Subject: [PATCH 056/135] cleans up code --- .../stacks/dpe-k8s-deployments/main.tf | 27 +---- modules/signoz-fluxcd/main.tf | 54 +-------- .../templates/clickhouse-backup-patch.yaml | 111 ------------------ 3 files changed, 4 insertions(+), 188 deletions(-) delete mode 100644 modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index a17fb475..7eb70644 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -155,26 +155,11 @@ module "cert-manager" { module "clickhouse_backup_bucket" { source = "../../../modules/s3-bucket" - # bucket_name = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - bucket_name = "clickhouse-backup-${var.aws_account_id}" + bucket_name = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" } -# data "tls_certificate" "eks" { -# url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer -# } - -# resource "aws_iam_openid_connect_provider" "eks" { -# client_id_list = ["sts.amazonaws.com"] -# thumbprint_list = [data.tls_certificate.eks.certificates[0].sha1_fingerprint] -# url = data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer - -# tags = { -# Name = "${var.cluster_name}-eks-irsa" -# } -# } - resource "aws_iam_policy" "clickhouse_backup_policy" { - name = "clickhouse-backup-access-policy" + name = 
"clickhouse-backup-access-policy-${var.aws_account_id}-${var.cluster_name}" description = "Policy to access the clickhouse backup bucket" policy = jsonencode({ Version = "2012-10-17" @@ -197,7 +182,7 @@ resource "aws_iam_policy" "clickhouse_backup_policy" { } resource "aws_iam_role" "clickhouse_backup_access" { - name = "clickhouse-backup-access-role" + name = "clickhouse-backup-access-role-${var.aws_account_id}-${var.cluster_name}" description = "Assumed role to access the clickhouse backup policy" assume_role_policy = jsonencode({ @@ -207,14 +192,8 @@ resource "aws_iam_role" "clickhouse_backup_access" { Action = "sts:AssumeRoleWithWebIdentity" Effect = "Allow" Principal = { - # https://oidc.eks.us-east-1.amazonaws.com/id/DA1DF11424BEFC68B1726FDB70DA037E Federated = "${var.cluster_oidc_provider_arn}" } - # Condition = { - # StringEquals = { - # "${data.aws_iam_openid_connect_provider.eks.url}:aud" = "sts.amazonaws.com" - # } - # } } ] }) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 2bab360c..4bf967a2 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -31,27 +31,6 @@ resource "kubernetes_config_map" "signoz-values" { data = { "signoz_values.yaml" = "${file("${path.module}/templates/values.yaml")}" - "backup_disk.xml" = <<-EOT - - - - - s3 - https://s3.us-east-1.amazonaws.com - 1 - us-east-1 - - - - - - disk - s3 - clickhouse-backup-${var.aws_account_id} - - - - EOT } } @@ -61,7 +40,7 @@ resource "kubernetes_service_account" "clickhouse-backup-service-account" { name = "clickhouse-backup-service-account" namespace = var.namespace annotations = { - "eks.amazonaws.com/role-arn" = "arn:aws:iam::${var.aws_account_id}:role/clickhouse-backup-access-role" + "eks.amazonaws.com/role-arn" = "arn:aws:iam::${var.aws_account_id}:role/clickhouse-backup-access-role-${var.aws_account_id}-${var.cluster_name}" } } } @@ -99,37 +78,6 @@ spec: targetPath: clickhouse.password YAML } - # postRenderers: - # - kustomize: - # patches: - # - target: - # version: v1 - # kind: Deployment - # name: clickhouse-backup - # patch: | - # apiVersion: apps/v1 - # kind: Deployment - # metadata: - # name: nginx - # namespace: demo-s3 - # spec: - # selector: - # matchLabels: - # app: nginx - # template: - # metadata: - # labels: - # app: nginx - # spec: - # serviceAccountName: default - # initContainers: - # - name: demo-aws-cli - # image: amazon/aws-cli - # command: ['aws', 's3', 'cp', 's3://demo-bucket/test.txt, '-' - # containers: - # - name: my-app - # image: nginx - # ${file("${path.module}/templates/clickhouse-backup-patch.yaml")} resource "kubectl_manifest" "s3_test_pod" { yaml_body = < Date: Thu, 14 Nov 2024 11:00:38 -0500 Subject: [PATCH 057/135] revert bucket name --- deployments/stacks/dpe-k8s-deployments/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 7eb70644..dcba409c 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -155,7 +155,7 @@ module "cert-manager" { module "clickhouse_backup_bucket" { source = "../../../modules/s3-bucket" - bucket_name = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + bucket_name = "clickhouse-backup-${var.aws_account_id}" } resource "aws_iam_policy" "clickhouse_backup_policy" { From 08a11c28ddc628c6709d19ff5eea686587be8607 Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 11:04:49 -0500 Subject: [PATCH 
058/135] reapply new bucket name --- deployments/stacks/dpe-k8s-deployments/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index dcba409c..7eb70644 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -155,7 +155,7 @@ module "cert-manager" { module "clickhouse_backup_bucket" { source = "../../../modules/s3-bucket" - bucket_name = "clickhouse-backup-${var.aws_account_id}" + bucket_name = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" } resource "aws_iam_policy" "clickhouse_backup_policy" { From 015bb44456077c9a4fbc12e104999344bbe84801 Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 11:31:03 -0500 Subject: [PATCH 059/135] test clickhouse backup deploy --- modules/signoz-fluxcd/main.tf | 8 +++++ .../templates/clickhouse-backup-patch.yaml | 36 +++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 4bf967a2..51193d93 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -76,6 +76,14 @@ spec: name: clickhouse-admin-password valuesKey: password targetPath: clickhouse.password + postRenderers: + - kustomize: + patches: + - target: + kind: Deployment + name: clickhouse-backup + patch: | + ${file("${path.module}/templates/clickhouse-backup-patch.yaml")} YAML } diff --git a/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml b/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml new file mode 100644 index 00000000..ad81cdb3 --- /dev/null +++ b/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml @@ -0,0 +1,36 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: clickhouse-backup +spec: + template: + spec: + serviceAccountName: clickhouse-backup-service-account + containers: + - name: clickhouse-backup + image: altinity/clickhouse-backup:1.5.0 # or your preferred version + resources: + requests: + cpu: "100m" + memory: "128Mi" + storage: "10Gi" + volumeMounts: + - name: clickhouse-data + mountPath: /var/lib/clickhouse + env: + - name: REMOTE_STORAGE + value: "s3" + - name: BACKUPS_TO_KEEP_REMOTE + value: "0" # 0 means keep all backups remote + - name: FULL_INTERVAL + value: "24h" + - name: LOG_LEVEL # TODO: remove this before merging + value: "debug" + - name: BACKUP_NAME + value: "my-backup" + - name: S3_BUCKET + value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + volumes: + - name: clickhouse-data + persistentVolumeClaim: + claimName: data-clickhouse-0 From 63f836d489b0a90eaf5979cc4b91849273d97725 Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 12:02:42 -0500 Subject: [PATCH 060/135] adds resource patch --- modules/signoz-fluxcd/main.tf | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 51193d93..b8f72894 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -80,10 +80,35 @@ spec: - kustomize: patches: - target: - kind: Deployment - name: clickhouse-backup - patch: | - ${file("${path.module}/templates/clickhouse-backup-patch.yaml")} + kind: HelmRelease + name: signoz + patch: | + - op: add + path: /spec/template/spec/containers/- + value: + name: clickhouse-backup + image: 
altinity/clickhouse-backup:2.6.3 + resources: + requests: + cpu: "100m" + memory: "128Mi" + storage: "10Gi" + volumeMounts: + - name: clickhouse-data + mountPath: /var/lib/clickhouse + env: + - name: REMOTE_STORAGE + value: "s3" + - name: BACKUPS_TO_KEEP_REMOTE + value: "0" # 0 means keep all backups remote + - name: FULL_INTERVAL + value: "24h" + - name: LOG_LEVEL # TODO: remove this before merging + value: "debug" + - name: BACKUP_NAME + value: "my-backup" + - name: S3_BUCKET + value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" YAML } From 3bb64cdf4b2234582fabee947af1b6cc6335cc30 Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 12:09:59 -0500 Subject: [PATCH 061/135] adds missing - indicator --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index b8f72894..ff427270 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -82,7 +82,7 @@ spec: - target: kind: HelmRelease name: signoz - patch: | + patch: |- - op: add path: /spec/template/spec/containers/- value: From eb52f79f2d734085cc4f9345e3e047a9dc3f26bd Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 12:54:13 -0500 Subject: [PATCH 062/135] fix patch indenting --- modules/signoz-fluxcd/main.tf | 54 +++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index ff427270..37269078 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -82,33 +82,33 @@ spec: - target: kind: HelmRelease name: signoz - patch: |- - - op: add - path: /spec/template/spec/containers/- - value: - name: clickhouse-backup - image: altinity/clickhouse-backup:2.6.3 - resources: - requests: - cpu: "100m" - memory: "128Mi" - storage: "10Gi" - volumeMounts: - - name: clickhouse-data - mountPath: /var/lib/clickhouse - env: - - name: REMOTE_STORAGE - value: "s3" - - name: BACKUPS_TO_KEEP_REMOTE - value: "0" # 0 means keep all backups remote - - name: FULL_INTERVAL - value: "24h" - - name: LOG_LEVEL # TODO: remove this before merging - value: "debug" - - name: BACKUP_NAME - value: "my-backup" - - name: S3_BUCKET - value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + patch: | + - op: add + path: /spec/template/spec/containers/- + value: + name: clickhouse-backup + image: altinity/clickhouse-backup:2.6.3 + resources: + requests: + cpu: "100m" + memory: "128Mi" + storage: "10Gi" + volumeMounts: + - name: clickhouse-data + mountPath: /var/lib/clickhouse + env: + - name: REMOTE_STORAGE + value: "s3" + - name: BACKUPS_TO_KEEP_REMOTE + value: "0" # 0 means keep all backups remote + - name: FULL_INTERVAL + value: "24h" + - name: LOG_LEVEL # TODO: remove this before merging + value: "debug" + - name: BACKUP_NAME + value: "my-backup" + - name: S3_BUCKET + value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" YAML } From 02fd04328cbd1980db2466eddc4292dc736814ba Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 13:33:27 -0500 Subject: [PATCH 063/135] updates clickhouse bucket --- modules/signoz-fluxcd/templates/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/templates/values.yaml b/modules/signoz-fluxcd/templates/values.yaml index f93b6374..31932341 100644 --- a/modules/signoz-fluxcd/templates/values.yaml +++ b/modules/signoz-fluxcd/templates/values.yaml @@ -44,7 +44,7 @@ clickhouse: 
storage: type: s3 endpoint: s3.amazonaws.com - bucket: clickhouse-backup-${aws_account_id} + bucket: clickhouse-backup-${aws_account_id}-${cluster_name} region: us-east-1 # Zookeeper default values From c31de326033a37bd9f1ae73097403ad24ee0440a Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 13:33:38 -0500 Subject: [PATCH 064/135] reverts clickhouse backup - try to get back to stable --- modules/signoz-fluxcd/main.tf | 66 +++++++++++++++++------------------ 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 37269078..797e6610 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -76,41 +76,41 @@ spec: name: clickhouse-admin-password valuesKey: password targetPath: clickhouse.password - postRenderers: - - kustomize: - patches: - - target: - kind: HelmRelease - name: signoz - patch: | - - op: add - path: /spec/template/spec/containers/- - value: - name: clickhouse-backup - image: altinity/clickhouse-backup:2.6.3 - resources: - requests: - cpu: "100m" - memory: "128Mi" - storage: "10Gi" - volumeMounts: - - name: clickhouse-data - mountPath: /var/lib/clickhouse - env: - - name: REMOTE_STORAGE - value: "s3" - - name: BACKUPS_TO_KEEP_REMOTE - value: "0" # 0 means keep all backups remote - - name: FULL_INTERVAL - value: "24h" - - name: LOG_LEVEL # TODO: remove this before merging - value: "debug" - - name: BACKUP_NAME - value: "my-backup" - - name: S3_BUCKET - value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" YAML } + # postRenderers: + # - kustomize: + # patches: + # - target: + # kind: HelmRelease + # name: signoz + # patch: | + # - op: add + # path: /spec/template/spec/containers/- + # value: + # name: clickhouse-backup + # image: altinity/clickhouse-backup:2.6.3 + # resources: + # requests: + # cpu: "100m" + # memory: "128Mi" + # storage: "10Gi" + # volumeMounts: + # - name: clickhouse-data + # mountPath: /var/lib/clickhouse + # env: + # - name: REMOTE_STORAGE + # value: "s3" + # - name: BACKUPS_TO_KEEP_REMOTE + # value: "0" # 0 means keep all backups remote + # - name: FULL_INTERVAL + # value: "24h" + # - name: LOG_LEVEL # TODO: remove this before merging + # value: "debug" + # - name: BACKUP_NAME + # value: "my-backup" + # - name: S3_BUCKET + # value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" resource "kubectl_manifest" "s3_test_pod" { yaml_body = < Date: Thu, 14 Nov 2024 15:07:39 -0500 Subject: [PATCH 065/135] updates test command --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 797e6610..10fa3c33 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -128,7 +128,7 @@ resource "kubectl_manifest" "s3_test_pod" { - /bin/sh - -c - | - aws s3 ls s3://clickhouse-backup-${var.aws_account_id} + aws s3 ls s3://clickhouse-backup-${var.aws_account_id}-${var.cluster_name} echo "S3 list completed with exit code $?" 
# Keep pod running for inspection tail -f /dev/null From ac745b9271ed91a49a1cbbaa32484c0f22108dc3 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:15:24 -0700 Subject: [PATCH 066/135] Try deploying out lovely-plugin as argocd side car --- modules/argo-cd/templates/values.yaml | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/modules/argo-cd/templates/values.yaml b/modules/argo-cd/templates/values.yaml index cc4daf35..c027d87d 100644 --- a/modules/argo-cd/templates/values.yaml +++ b/modules/argo-cd/templates/values.yaml @@ -2453,7 +2453,27 @@ repoServer: # -- Additional containers to be added to the repo server pod ## Ref: https://argo-cd.readthedocs.io/en/stable/user-guide/config-management-plugins/ ## Note: Supports use of custom Helm templates - extraContainers: [] + extraContainers: + - name: lovely-plugin + # Choose your image here - see https://github.com/crumbhole/argocd-lovely-plugin/blob/main/doc/variations.md + image: ghcr.io/crumbhole/lovely:1.1.1 + securityContext: + runAsNonRoot: true + runAsUser: 999 + volumeMounts: + # Import the repo-server's plugin binary + - mountPath: /var/run/argocd + name: var-files + - mountPath: /home/argocd/cmp-server/plugins + name: plugins + # Starting with v2.4, do NOT mount the same tmp volume as the repo-server container. The filesystem separation helps + # mitigate path traversal attacks. + - mountPath: /tmp + name: lovely-tmp + volumes: + # A temporary directory for the tool to work in. + - emptyDir: {} + name: lovely-tmp # - name: cmp-my-plugin # command: # - "/var/run/argocd/argocd-cmp-server" From 62e1c1cec43342ad848e069e1344710b52cd9a9d Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:16:24 -0700 Subject: [PATCH 067/135] Revert "Try deploying out lovely-plugin as argocd side car" This reverts commit ac745b9271ed91a49a1cbbaa32484c0f22108dc3. --- modules/argo-cd/templates/values.yaml | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/modules/argo-cd/templates/values.yaml b/modules/argo-cd/templates/values.yaml index c027d87d..cc4daf35 100644 --- a/modules/argo-cd/templates/values.yaml +++ b/modules/argo-cd/templates/values.yaml @@ -2453,27 +2453,7 @@ repoServer: # -- Additional containers to be added to the repo server pod ## Ref: https://argo-cd.readthedocs.io/en/stable/user-guide/config-management-plugins/ ## Note: Supports use of custom Helm templates - extraContainers: - - name: lovely-plugin - # Choose your image here - see https://github.com/crumbhole/argocd-lovely-plugin/blob/main/doc/variations.md - image: ghcr.io/crumbhole/lovely:1.1.1 - securityContext: - runAsNonRoot: true - runAsUser: 999 - volumeMounts: - # Import the repo-server's plugin binary - - mountPath: /var/run/argocd - name: var-files - - mountPath: /home/argocd/cmp-server/plugins - name: plugins - # Starting with v2.4, do NOT mount the same tmp volume as the repo-server container. The filesystem separation helps - # mitigate path traversal attacks. - - mountPath: /tmp - name: lovely-tmp - volumes: - # A temporary directory for the tool to work in. 
- - emptyDir: {} - name: lovely-tmp + extraContainers: [] # - name: cmp-my-plugin # command: # - "/var/run/argocd/argocd-cmp-server" From 82caef2492c04163721efbdcc3bcf9fbb38df32e Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 15:26:17 -0500 Subject: [PATCH 068/135] revert aws command change --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 10fa3c33..797e6610 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -128,7 +128,7 @@ resource "kubectl_manifest" "s3_test_pod" { - /bin/sh - -c - | - aws s3 ls s3://clickhouse-backup-${var.aws_account_id}-${var.cluster_name} + aws s3 ls s3://clickhouse-backup-${var.aws_account_id} echo "S3 list completed with exit code $?" # Keep pod running for inspection tail -f /dev/null From e26e8271b2cd6932550711708da3b99ba9f81504 Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 15:49:57 -0500 Subject: [PATCH 069/135] test deployment update --- deployments/stacks/dpe-k8s-deployments/main.tf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 7eb70644..727b4ef3 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -161,6 +161,7 @@ module "clickhouse_backup_bucket" { resource "aws_iam_policy" "clickhouse_backup_policy" { name = "clickhouse-backup-access-policy-${var.aws_account_id}-${var.cluster_name}" description = "Policy to access the clickhouse backup bucket" + policy = jsonencode({ Version = "2012-10-17" Statement = [ @@ -192,14 +193,14 @@ resource "aws_iam_role" "clickhouse_backup_access" { Action = "sts:AssumeRoleWithWebIdentity" Effect = "Allow" Principal = { - Federated = "${var.cluster_oidc_provider_arn}" + Federated = "${var.cluster_oidc_provider_arn}", + Service = "eks.amazonaws.com" } } ] }) } - resource "aws_iam_role_policy_attachment" "clickhouse_backup_policy_attachment" { role = aws_iam_role.clickhouse_backup_access.name policy_arn = aws_iam_policy.clickhouse_backup_policy.arn From a97c5fc2c0c00d085ada7ab054f6b1f5571a3387 Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 16:24:19 -0500 Subject: [PATCH 070/135] revert service change --- deployments/stacks/dpe-k8s-deployments/main.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 727b4ef3..a112c4fb 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -194,7 +194,6 @@ resource "aws_iam_role" "clickhouse_backup_access" { Effect = "Allow" Principal = { Federated = "${var.cluster_oidc_provider_arn}", - Service = "eks.amazonaws.com" } } ] From ed0a9d155271f3553e024175fa9401817412906c Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 16:38:48 -0500 Subject: [PATCH 071/135] try adding sidecar with postrenders --- modules/signoz-fluxcd/main.tf | 71 +++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 797e6610..5c29aeb2 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -76,41 +76,46 @@ spec: name: clickhouse-admin-password valuesKey: password targetPath: clickhouse.password + postRenderers: + - kustomize: + patchesStrategicMerge: + - 
kind: Deployment + apiVersion: apps/v1 + metadata: + name: clickhouse-backup-sidecar + spec: + template: + spec: + serviceAccountName: clickhouse-backup-service-account + containers: + - name: clickhouse-backup-sidecar + image: altinity/clickhouse-backup:2.6.3 + resources: + requests: + cpu: "100m" + memory: "128Mi" + volumeMounts: + - name: clickhouse-data + mountPath: /var/lib/clickhouse + env: + - name: REMOTE_STORAGE + value: "s3" + - name: BACKUPS_TO_KEEP_REMOTE + value: "0" # 0 means keep all backups remote + - name: FULL_INTERVAL + value: "24h" + - name: LOG_LEVEL # TODO: remove this before merging + value: "debug" + - name: BACKUP_NAME + value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + - name: S3_BUCKET + value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + volumes: + - name: clickhouse-data + persistentVolumeClaim: + claimName: data-clickhouse-0 YAML } - # postRenderers: - # - kustomize: - # patches: - # - target: - # kind: HelmRelease - # name: signoz - # patch: | - # - op: add - # path: /spec/template/spec/containers/- - # value: - # name: clickhouse-backup - # image: altinity/clickhouse-backup:2.6.3 - # resources: - # requests: - # cpu: "100m" - # memory: "128Mi" - # storage: "10Gi" - # volumeMounts: - # - name: clickhouse-data - # mountPath: /var/lib/clickhouse - # env: - # - name: REMOTE_STORAGE - # value: "s3" - # - name: BACKUPS_TO_KEEP_REMOTE - # value: "0" # 0 means keep all backups remote - # - name: FULL_INTERVAL - # value: "24h" - # - name: LOG_LEVEL # TODO: remove this before merging - # value: "debug" - # - name: BACKUP_NAME - # value: "my-backup" - # - name: S3_BUCKET - # value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" resource "kubectl_manifest" "s3_test_pod" { yaml_body = < Date: Thu, 14 Nov 2024 17:10:06 -0500 Subject: [PATCH 072/135] removes comments --- modules/signoz-fluxcd/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 5c29aeb2..7c225e47 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -101,10 +101,10 @@ spec: - name: REMOTE_STORAGE value: "s3" - name: BACKUPS_TO_KEEP_REMOTE - value: "0" # 0 means keep all backups remote + value: "0" - name: FULL_INTERVAL value: "24h" - - name: LOG_LEVEL # TODO: remove this before merging + - name: LOG_LEVEL value: "debug" - name: BACKUP_NAME value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" From eca6bc399dbfa37e00597ab83a255b9818497d3b Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 17:32:10 -0500 Subject: [PATCH 073/135] try add deploy --- modules/signoz-fluxcd/main.tf | 67 ++++++++++++++++------------------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 7c225e47..b8f72894 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -78,42 +78,37 @@ spec: targetPath: clickhouse.password postRenderers: - kustomize: - patchesStrategicMerge: - - kind: Deployment - apiVersion: apps/v1 - metadata: - name: clickhouse-backup-sidecar - spec: - template: - spec: - serviceAccountName: clickhouse-backup-service-account - containers: - - name: clickhouse-backup-sidecar - image: altinity/clickhouse-backup:2.6.3 - resources: - requests: - cpu: "100m" - memory: "128Mi" - volumeMounts: - - name: clickhouse-data - mountPath: /var/lib/clickhouse - env: - - name: REMOTE_STORAGE - value: "s3" - - name: 
BACKUPS_TO_KEEP_REMOTE - value: "0" - - name: FULL_INTERVAL - value: "24h" - - name: LOG_LEVEL - value: "debug" - - name: BACKUP_NAME - value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - - name: S3_BUCKET - value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - volumes: - - name: clickhouse-data - persistentVolumeClaim: - claimName: data-clickhouse-0 + patches: + - target: + kind: HelmRelease + name: signoz + patch: | + - op: add + path: /spec/template/spec/containers/- + value: + name: clickhouse-backup + image: altinity/clickhouse-backup:2.6.3 + resources: + requests: + cpu: "100m" + memory: "128Mi" + storage: "10Gi" + volumeMounts: + - name: clickhouse-data + mountPath: /var/lib/clickhouse + env: + - name: REMOTE_STORAGE + value: "s3" + - name: BACKUPS_TO_KEEP_REMOTE + value: "0" # 0 means keep all backups remote + - name: FULL_INTERVAL + value: "24h" + - name: LOG_LEVEL # TODO: remove this before merging + value: "debug" + - name: BACKUP_NAME + value: "my-backup" + - name: S3_BUCKET + value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" YAML } From 96e87042402fd56705be020ae4bf6e90d063bb2e Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 17:38:00 -0500 Subject: [PATCH 074/135] fixes indent --- modules/signoz-fluxcd/main.tf | 54 +++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index b8f72894..37269078 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -82,33 +82,33 @@ spec: - target: kind: HelmRelease name: signoz - patch: | - - op: add - path: /spec/template/spec/containers/- - value: - name: clickhouse-backup - image: altinity/clickhouse-backup:2.6.3 - resources: - requests: - cpu: "100m" - memory: "128Mi" - storage: "10Gi" - volumeMounts: - - name: clickhouse-data - mountPath: /var/lib/clickhouse - env: - - name: REMOTE_STORAGE - value: "s3" - - name: BACKUPS_TO_KEEP_REMOTE - value: "0" # 0 means keep all backups remote - - name: FULL_INTERVAL - value: "24h" - - name: LOG_LEVEL # TODO: remove this before merging - value: "debug" - - name: BACKUP_NAME - value: "my-backup" - - name: S3_BUCKET - value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + patch: | + - op: add + path: /spec/template/spec/containers/- + value: + name: clickhouse-backup + image: altinity/clickhouse-backup:2.6.3 + resources: + requests: + cpu: "100m" + memory: "128Mi" + storage: "10Gi" + volumeMounts: + - name: clickhouse-data + mountPath: /var/lib/clickhouse + env: + - name: REMOTE_STORAGE + value: "s3" + - name: BACKUPS_TO_KEEP_REMOTE + value: "0" # 0 means keep all backups remote + - name: FULL_INTERVAL + value: "24h" + - name: LOG_LEVEL # TODO: remove this before merging + value: "debug" + - name: BACKUP_NAME + value: "my-backup" + - name: S3_BUCKET + value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" YAML } From 615e05686ada43f68ac761ceae9e96e06c6fab48 Mon Sep 17 00:00:00 2001 From: bwmac Date: Thu, 14 Nov 2024 17:54:20 -0500 Subject: [PATCH 075/135] updates target to statefulset --- modules/signoz-fluxcd/main.tf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 37269078..936fabcb 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -80,13 +80,13 @@ spec: - kustomize: patches: - target: - kind: HelmRelease - name: signoz + kind: StatefulSet + name: 
chi-signoz-clickhouse-cluster-0-0 patch: | - op: add path: /spec/template/spec/containers/- value: - name: clickhouse-backup + name: clickhouse-backup-sidecar image: altinity/clickhouse-backup:2.6.3 resources: requests: @@ -100,13 +100,13 @@ spec: - name: REMOTE_STORAGE value: "s3" - name: BACKUPS_TO_KEEP_REMOTE - value: "0" # 0 means keep all backups remote + value: "0" - name: FULL_INTERVAL value: "24h" - - name: LOG_LEVEL # TODO: remove this before merging + - name: LOG_LEVEL value: "debug" - name: BACKUP_NAME - value: "my-backup" + value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - name: S3_BUCKET value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" YAML From cc6eb7c196d2c89f1329cb9567f77c0832dd84dc Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 09:43:29 -0500 Subject: [PATCH 076/135] targets ClickHouseInstallation --- modules/signoz-fluxcd/main.tf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 936fabcb..ecedeeb2 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -80,11 +80,10 @@ spec: - kustomize: patches: - target: - kind: StatefulSet - name: chi-signoz-clickhouse-cluster-0-0 + kind: ClickHouseInstallation patch: | - op: add - path: /spec/template/spec/containers/- + path: /spec/templates/podTemplates/spec/containers/- value: name: clickhouse-backup-sidecar image: altinity/clickhouse-backup:2.6.3 From 72ecbb73b6d87947ecb53295b1e0663326f2ffd0 Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 10:19:20 -0500 Subject: [PATCH 077/135] try adding in service account patch --- modules/signoz-fluxcd/main.tf | 42 +++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index ecedeeb2..8d7760a3 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -79,21 +79,48 @@ spec: postRenderers: - kustomize: patches: + # Set the service account + - target: + kind: ClickHouseInstallation + patch: | + - op: replace + path: /spec/templates/podTemplates/0/spec/serviceAccountName + value: clickhouse-backup-service-account + # Add the backup volume to the volumes list + - target: + kind: ClickHouseInstallation + patch: | + - op: add + path: /spec/templates/podTemplates/0/spec/volumes/- + value: + name: clickhouse-backup + persistentVolumeClaim: + claimName: data-volumeclaim-template + # Add the sidecar container - target: kind: ClickHouseInstallation patch: | - op: add - path: /spec/templates/podTemplates/spec/containers/- + path: /spec/templates/podTemplates/0/spec/containers/- value: name: clickhouse-backup-sidecar image: altinity/clickhouse-backup:2.6.3 + imagePullPolicy: IfNotPresent + command: + - /bin/sh + - -c + - | + echo "Clickhouse backup sidecar started!!!" 
+ /usr/local/bin/clickhouse-backup server resources: requests: cpu: "100m" memory: "128Mi" - storage: "10Gi" + limits: + cpu: "500m" + memory: "256Mi" volumeMounts: - - name: clickhouse-data + - name: data-volumeclaim-template mountPath: /var/lib/clickhouse env: - name: REMOTE_STORAGE @@ -108,6 +135,13 @@ spec: value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - name: S3_BUCKET value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + - name: API_CREATE_INTEGRATION_TABLES + value: "true" + - name: CLICKHOUSE_PASSWORD + valueFrom: + secretKeyRef: + name: clickhouse-admin-password + key: password YAML } @@ -127,7 +161,7 @@ resource "kubectl_manifest" "s3_test_pod" { - /bin/sh - -c - | - aws s3 ls s3://clickhouse-backup-${var.aws_account_id} + aws s3 ls s3://clickhouse-backup-${var.aws_account_id}-${var.cluster_name} echo "S3 list completed with exit code $?" # Keep pod running for inspection tail -f /dev/null From a1c621468993362ba0d9d1b1bc9089d96b22de48 Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 10:32:13 -0500 Subject: [PATCH 078/135] removes s3-test pod --- modules/signoz-fluxcd/main.tf | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 8d7760a3..67336b22 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -145,38 +145,6 @@ spec: YAML } -resource "kubectl_manifest" "s3_test_pod" { - yaml_body = < Date: Fri, 15 Nov 2024 10:42:17 -0500 Subject: [PATCH 079/135] fixes pvc labelling --- modules/signoz-fluxcd/main.tf | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 67336b22..73f16e7b 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -95,7 +95,7 @@ spec: value: name: clickhouse-backup persistentVolumeClaim: - claimName: data-volumeclaim-template + claimName: data-volumeclaim-clickhouse-backup # Add the sidecar container - target: kind: ClickHouseInstallation @@ -116,11 +116,8 @@ spec: requests: cpu: "100m" memory: "128Mi" - limits: - cpu: "500m" - memory: "256Mi" volumeMounts: - - name: data-volumeclaim-template + - name: data-volumeclaim-clickhouse-backup mountPath: /var/lib/clickhouse env: - name: REMOTE_STORAGE From 4e15386da3b4b213613e93b282dd4b5d5ab17b53 Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 10:51:39 -0500 Subject: [PATCH 080/135] removes service account for now --- modules/signoz-fluxcd/main.tf | 7 ------- 1 file changed, 7 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 73f16e7b..8bd729b8 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -79,13 +79,6 @@ spec: postRenderers: - kustomize: patches: - # Set the service account - - target: - kind: ClickHouseInstallation - patch: | - - op: replace - path: /spec/templates/podTemplates/0/spec/serviceAccountName - value: clickhouse-backup-service-account # Add the backup volume to the volumes list - target: kind: ClickHouseInstallation From 6fc9812eaa43175c0dda52880078d3898f696e8c Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 11:20:52 -0500 Subject: [PATCH 081/135] reverts to last successful HelmRelease --- modules/signoz-fluxcd/main.tf | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 8bd729b8..5ada087a 100644 --- 
a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -79,6 +79,13 @@ spec: postRenderers: - kustomize: patches: + # Set the service account + - target: + kind: ClickHouseInstallation + patch: | + - op: replace + path: /spec/templates/podTemplates/0/spec/serviceAccountName + value: clickhouse-backup-service-account # Add the backup volume to the volumes list - target: kind: ClickHouseInstallation @@ -88,7 +95,7 @@ spec: value: name: clickhouse-backup persistentVolumeClaim: - claimName: data-volumeclaim-clickhouse-backup + claimName: data-volumeclaim-template # Add the sidecar container - target: kind: ClickHouseInstallation @@ -109,8 +116,11 @@ spec: requests: cpu: "100m" memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" volumeMounts: - - name: data-volumeclaim-clickhouse-backup + - name: data-volumeclaim-template mountPath: /var/lib/clickhouse env: - name: REMOTE_STORAGE @@ -135,6 +145,7 @@ spec: YAML } + # resource "kubectl_manifest" "signoz-deployment" { # depends_on = [kubernetes_namespace.signoz] From 7961869fba8fa2c2fbe643b709b2b31db1665524 Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 11:36:35 -0500 Subject: [PATCH 082/135] change pvc name --- modules/signoz-fluxcd/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 5ada087a..0460db4f 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -95,7 +95,7 @@ spec: value: name: clickhouse-backup persistentVolumeClaim: - claimName: data-volumeclaim-template + claimName: clickhouse-backup-data-volumeclaim # Add the sidecar container - target: kind: ClickHouseInstallation @@ -120,7 +120,7 @@ spec: cpu: "500m" memory: "256Mi" volumeMounts: - - name: data-volumeclaim-template + - name: clickhouse-backup-data-volumeclaim mountPath: /var/lib/clickhouse env: - name: REMOTE_STORAGE From 81991638bbba42944e0855ec2c7faaa2a6c5da3a Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 11:54:44 -0500 Subject: [PATCH 083/135] frees up ports --- modules/signoz-fluxcd/main.tf | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 0460db4f..df09e8f9 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -79,14 +79,7 @@ spec: postRenderers: - kustomize: patches: - # Set the service account - - target: - kind: ClickHouseInstallation - patch: | - - op: replace - path: /spec/templates/podTemplates/0/spec/serviceAccountName - value: clickhouse-backup-service-account - # Add the backup volume to the volumes list + # Add the backup volume to the volumes list if it doesn't exist - target: kind: ClickHouseInstallation patch: | @@ -95,7 +88,7 @@ spec: value: name: clickhouse-backup persistentVolumeClaim: - claimName: clickhouse-backup-data-volumeclaim + claimName: data-volumeclaim-template # Add the sidecar container - target: kind: ClickHouseInstallation @@ -112,6 +105,10 @@ spec: - | echo "Clickhouse backup sidecar started!!!" 
/usr/local/bin/clickhouse-backup server + ports: + - name: backup-api + containerPort: 7171 + protocol: TCP resources: requests: cpu: "100m" @@ -120,8 +117,11 @@ spec: cpu: "500m" memory: "256Mi" volumeMounts: - - name: clickhouse-backup-data-volumeclaim + - name: data-volumeclaim-template mountPath: /var/lib/clickhouse + securityContext: + runAsUser: 101 + runAsGroup: 101 env: - name: REMOTE_STORAGE value: "s3" @@ -135,8 +135,16 @@ spec: value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - name: S3_BUCKET value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + - name: API_LISTEN + value: "0.0.0.0:7171" - name: API_CREATE_INTEGRATION_TABLES value: "true" + - name: CLICKHOUSE_HOST + value: "localhost" + - name: CLICKHOUSE_PORT + value: "9000" + - name: CLICKHOUSE_USER + value: "admin" - name: CLICKHOUSE_PASSWORD valueFrom: secretKeyRef: From 387efd4b85c988f0df555f4ab3f8e73b3ebff8b3 Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 12:19:30 -0500 Subject: [PATCH 084/135] removes additional volume patch --- modules/signoz-fluxcd/main.tf | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index df09e8f9..412f279d 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -79,16 +79,6 @@ spec: postRenderers: - kustomize: patches: - # Add the backup volume to the volumes list if it doesn't exist - - target: - kind: ClickHouseInstallation - patch: | - - op: add - path: /spec/templates/podTemplates/0/spec/volumes/- - value: - name: clickhouse-backup - persistentVolumeClaim: - claimName: data-volumeclaim-template # Add the sidecar container - target: kind: ClickHouseInstallation From 5c6ef4a3fe7968fdae0a62a00188951f0d992794 Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 12:25:27 -0500 Subject: [PATCH 085/135] simplifies patch --- modules/signoz-fluxcd/main.tf | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 412f279d..47f03826 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -95,10 +95,6 @@ spec: - | echo "Clickhouse backup sidecar started!!!" 
/usr/local/bin/clickhouse-backup server - ports: - - name: backup-api - containerPort: 7171 - protocol: TCP resources: requests: cpu: "100m" @@ -106,12 +102,6 @@ spec: limits: cpu: "500m" memory: "256Mi" - volumeMounts: - - name: data-volumeclaim-template - mountPath: /var/lib/clickhouse - securityContext: - runAsUser: 101 - runAsGroup: 101 env: - name: REMOTE_STORAGE value: "s3" @@ -125,16 +115,6 @@ spec: value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - name: S3_BUCKET value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - - name: API_LISTEN - value: "0.0.0.0:7171" - - name: API_CREATE_INTEGRATION_TABLES - value: "true" - - name: CLICKHOUSE_HOST - value: "localhost" - - name: CLICKHOUSE_PORT - value: "9000" - - name: CLICKHOUSE_USER - value: "admin" - name: CLICKHOUSE_PASSWORD valueFrom: secretKeyRef: From d4eadc189161b224ce4d42ac4006c82afa597836 Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 12:33:36 -0500 Subject: [PATCH 086/135] removes password overwrite --- modules/signoz-fluxcd/main.tf | 5 ----- 1 file changed, 5 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 47f03826..91c047bf 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -115,11 +115,6 @@ spec: value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - name: S3_BUCKET value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - - name: CLICKHOUSE_PASSWORD - valueFrom: - secretKeyRef: - name: clickhouse-admin-password - key: password YAML } From eb9808ac19ef965894d32716189c50e00980615f Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 13:06:03 -0500 Subject: [PATCH 087/135] remove incorrect command --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 91c047bf..d4102af2 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -94,7 +94,7 @@ spec: - -c - | echo "Clickhouse backup sidecar started!!!" - /usr/local/bin/clickhouse-backup server + tail -f /dev/null resources: requests: cpu: "100m" From 4e786982d15eec9e4234f40790a42859de6ccc3d Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 13:42:04 -0500 Subject: [PATCH 088/135] updates backup config --- modules/signoz-fluxcd/main.tf | 26 +++++++++++++-------- modules/signoz-fluxcd/templates/values.yaml | 4 +++- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index d4102af2..5526237b 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -89,12 +89,7 @@ spec: name: clickhouse-backup-sidecar image: altinity/clickhouse-backup:2.6.3 imagePullPolicy: IfNotPresent - command: - - /bin/sh - - -c - - | - echo "Clickhouse backup sidecar started!!!" 
- tail -f /dev/null + args: ["server"] resources: requests: cpu: "100m" @@ -103,18 +98,29 @@ spec: cpu: "500m" memory: "256Mi" env: + - name: LOG_LEVEL + value: "debug" + - name: ALLOW_EMPTY_BACKUPS + value: "true" + - name: API_LISTEN + value: "0.0.0.0:7171" + - name: API_CREATE_INTEGRATION_TABLES + value: "true" + - name: BACKUPS_TO_KEEP_REMOTE + value: "3" - name: REMOTE_STORAGE value: "s3" - - name: BACKUPS_TO_KEEP_REMOTE - value: "0" - name: FULL_INTERVAL value: "24h" - - name: LOG_LEVEL - value: "debug" - name: BACKUP_NAME value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - name: S3_BUCKET value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + - name: S3_PATH + value: "backup/shard-{shard}" + ports: + - name: backup-rest + containerPort: 7171 YAML } diff --git a/modules/signoz-fluxcd/templates/values.yaml b/modules/signoz-fluxcd/templates/values.yaml index 31932341..3070c362 100644 --- a/modules/signoz-fluxcd/templates/values.yaml +++ b/modules/signoz-fluxcd/templates/values.yaml @@ -160,7 +160,9 @@ clickhouse: # -- Specifies whether a service account should be created create: true # -- Annotations to add to the service account - annotations: {} + annotations: { + "eks.amazonaws.com/role-arn": "arn:aws:iam::${var.aws_account_id}:role/clickhouse-backup-access-role-${var.aws_account_id}-${var.cluster_name}" + } # -- The name of the service account to use. # If not set and create is true, a name is generated using the fullname template name: From 7abaf9d656f6365fb0d86d81c782c2c4907ad47c Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 13:53:49 -0500 Subject: [PATCH 089/135] moves annotation to helmrelease block --- modules/signoz-fluxcd/main.tf | 4 ++++ modules/signoz-fluxcd/templates/values.yaml | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 5526237b..872c8f98 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -68,6 +68,10 @@ spec: values: alertmanager: enabled: false + clickhouse: + serviceAccount: + annotations: + eks.amazonaws.com/role-arn: "arn:aws:iam::${var.aws_account_id}:role/clickhouse-backup-access-role-${var.aws_account_id}-${var.cluster_name}" valuesFrom: - kind: ConfigMap name: signoz-values diff --git a/modules/signoz-fluxcd/templates/values.yaml b/modules/signoz-fluxcd/templates/values.yaml index 3070c362..31932341 100644 --- a/modules/signoz-fluxcd/templates/values.yaml +++ b/modules/signoz-fluxcd/templates/values.yaml @@ -160,9 +160,7 @@ clickhouse: # -- Specifies whether a service account should be created create: true # -- Annotations to add to the service account - annotations: { - "eks.amazonaws.com/role-arn": "arn:aws:iam::${var.aws_account_id}:role/clickhouse-backup-access-role-${var.aws_account_id}-${var.cluster_name}" - } + annotations: {} # -- The name of the service account to use. 
# If not set and create is true, a name is generated using the fullname template name: From 3bc493484df2026ba6d56058dd6cca07ebfec1d7 Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 15:11:52 -0500 Subject: [PATCH 090/135] sets WATCH_INTERVAL = 8h --- modules/signoz-fluxcd/main.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 872c8f98..8d1e79ab 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -114,6 +114,8 @@ spec: value: "3" - name: REMOTE_STORAGE value: "s3" + - name: WATCH_INTERVAL + value: "8h" - name: FULL_INTERVAL value: "24h" - name: BACKUP_NAME From 8b5d11beed54c513fc7b632b054ffef9c86bb6a6 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 15 Nov 2024 13:42:41 -0700 Subject: [PATCH 091/135] Remove weave gitops in favor of a simpler deployment of capacitor --- .../stacks/dpe-k8s-deployments/main.tf | 12 +- modules/weave-gitops/README.md | 2 - modules/weave-gitops/main.tf | 64 ------ modules/weave-gitops/templates/values.yaml | 207 ------------------ modules/weave-gitops/variables.tf | 4 - modules/weave-gitops/versions.tf | 17 -- 6 files changed, 3 insertions(+), 303 deletions(-) delete mode 100644 modules/weave-gitops/README.md delete mode 100644 modules/weave-gitops/main.tf delete mode 100644 modules/weave-gitops/templates/values.yaml delete mode 100644 modules/weave-gitops/variables.tf delete mode 100644 modules/weave-gitops/versions.tf diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index a112c4fb..f44e52c6 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -119,12 +119,6 @@ module "signoz-flux-deployment" { aws_account_id = var.aws_account_id } -module "weave-gitops" { - depends_on = [module.flux-cd] - source = "../../../modules/weave-gitops" - namespace = "weave-gitops" -} - module "envoy-gateway" { count = var.enable_cluster_ingress ? 
1 : 0 depends_on = [module.argo-cd] @@ -154,12 +148,12 @@ module "cert-manager" { } module "clickhouse_backup_bucket" { - source = "../../../modules/s3-bucket" + source = "../../../modules/s3-bucket" bucket_name = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" } resource "aws_iam_policy" "clickhouse_backup_policy" { - name = "clickhouse-backup-access-policy-${var.aws_account_id}-${var.cluster_name}" + name = "clickhouse-backup-access-policy-${var.aws_account_id}-${var.cluster_name}" description = "Policy to access the clickhouse backup bucket" policy = jsonencode({ @@ -183,7 +177,7 @@ resource "aws_iam_policy" "clickhouse_backup_policy" { } resource "aws_iam_role" "clickhouse_backup_access" { - name = "clickhouse-backup-access-role-${var.aws_account_id}-${var.cluster_name}" + name = "clickhouse-backup-access-role-${var.aws_account_id}-${var.cluster_name}" description = "Assumed role to access the clickhouse backup policy" assume_role_policy = jsonencode({ diff --git a/modules/weave-gitops/README.md b/modules/weave-gitops/README.md deleted file mode 100644 index 8f4fe05a..00000000 --- a/modules/weave-gitops/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# Purpose -Deploy the gitops server: \ No newline at end of file diff --git a/modules/weave-gitops/main.tf b/modules/weave-gitops/main.tf deleted file mode 100644 index 01193541..00000000 --- a/modules/weave-gitops/main.tf +++ /dev/null @@ -1,64 +0,0 @@ -resource "kubernetes_namespace" "weave" { - metadata { - name = var.namespace - } -} - -resource "kubectl_manifest" "weave-git-repo" { - depends_on = [kubernetes_namespace.weave] - - yaml_body = < Date: Fri, 15 Nov 2024 16:02:13 -0500 Subject: [PATCH 092/135] configures cold storage --- modules/signoz-fluxcd/main.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 8d1e79ab..dcd631aa 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -72,6 +72,11 @@ spec: serviceAccount: annotations: eks.amazonaws.com/role-arn: "arn:aws:iam::${var.aws_account_id}:role/clickhouse-backup-access-role-${var.aws_account_id}-${var.cluster_name}" + coldStorage: + enabled: true + defaultKeepFreeSpaceBytes: "10485760" # 10MiB + type: s3 + endpoint: https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.s3.amazonaws.com/data/ valuesFrom: - kind: ConfigMap name: signoz-values From 42bbf6c2a3e4c7081e4d151181d178bc50a7a339 Mon Sep 17 00:00:00 2001 From: bwmac Date: Fri, 15 Nov 2024 17:21:37 -0500 Subject: [PATCH 093/135] enable role for coldStorage --- modules/signoz-fluxcd/main.tf | 4 ++++ modules/signoz-fluxcd/templates/values.yaml | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index dcd631aa..5ac15a78 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -77,6 +77,10 @@ spec: defaultKeepFreeSpaceBytes: "10485760" # 10MiB type: s3 endpoint: https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.s3.amazonaws.com/data/ + role: + enabled: true + annotations: + eks.amazonaws.com/role-arn: "arn:aws:iam::${var.aws_account_id}:role/clickhouse-backup-access-role-${var.aws_account_id}-${var.cluster_name}" valuesFrom: - kind: ConfigMap name: signoz-values diff --git a/modules/signoz-fluxcd/templates/values.yaml b/modules/signoz-fluxcd/templates/values.yaml index 31932341..88ca40e7 100644 --- a/modules/signoz-fluxcd/templates/values.yaml +++ 
b/modules/signoz-fluxcd/templates/values.yaml @@ -368,9 +368,9 @@ clickhouse: # For GCS, endpoint should be https://storage.googleapis.com//data/ endpoint: https://.s3-.amazonaws.com/data/ # -- Access Key for S3 or GCS - accessKey: + # accessKey: # -- Secret Access Key for S3 or GCS - secretAccess: + # secretAccess: # AWS role configuration - to use environment variables instead of passing access and secret keys role: # -- Whether to enable AWS IAM ARN role. From 992820459e44b2769d6a66a760d1bbdb993ebd4e Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 15 Nov 2024 15:37:54 -0700 Subject: [PATCH 094/135] Swap to revision strategy --- modules/signoz-fluxcd/main.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 5ac15a78..e39a9347 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -65,6 +65,7 @@ spec: name: signoz namespace: ${var.namespace} interval: 10m + reconcileStrategy: Revision values: alertmanager: enabled: false From ed4f3a4aa8b14a45810a24e8553675ea12cc8951 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 15 Nov 2024 16:24:30 -0700 Subject: [PATCH 095/135] Try flipped off helm hooks --- modules/signoz-fluxcd/templates/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/templates/values.yaml b/modules/signoz-fluxcd/templates/values.yaml index 88ca40e7..03503707 100644 --- a/modules/signoz-fluxcd/templates/values.yaml +++ b/modules/signoz-fluxcd/templates/values.yaml @@ -1291,7 +1291,7 @@ schemaMigrator: annotations: {} # In Helm, this is needed to apply helm hooks for pre-upgrade, delete policy and hook weight. # For ArgoCD, this is needed to apply the sync wave - ArgoCD equivalent of hook weight. 
- upgradeHelmHooks: true + upgradeHelmHooks: false # -- Whether to enable replication for schemaMigrator enableReplication: true From 291643a5b12f8815e1d7c3c8fc9ec1adab570c8b Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 18 Nov 2024 11:17:57 -0500 Subject: [PATCH 096/135] adds --watch flag --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index e39a9347..9b540b0b 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -103,7 +103,7 @@ spec: name: clickhouse-backup-sidecar image: altinity/clickhouse-backup:2.6.3 imagePullPolicy: IfNotPresent - args: ["server"] + args: ["server", "--watch"] resources: requests: cpu: "100m" From 3cd86357806e7c9db97667460921e1a599831151 Mon Sep 17 00:00:00 2001 From: bwmac Date: Mon, 18 Nov 2024 14:37:17 -0500 Subject: [PATCH 097/135] adds CopyObject permissions to IAM role --- .../stacks/dpe-k8s-deployments/main.tf | 3 +- .../templates/clickhouse-backup-patch.yaml | 36 ------------------- 2 files changed, 2 insertions(+), 37 deletions(-) delete mode 100644 modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index f44e52c6..461070ab 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -165,7 +165,8 @@ resource "aws_iam_policy" "clickhouse_backup_policy" { "s3:ListBucket", "s3:GetObject", "s3:PutObject", - "s3:DeleteObject" + "s3:DeleteObject", + "s3:CopyObject" ] Resource = [ module.clickhouse_backup_bucket.bucket_arn, diff --git a/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml b/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml deleted file mode 100644 index ad81cdb3..00000000 --- a/modules/signoz-fluxcd/templates/clickhouse-backup-patch.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: clickhouse-backup -spec: - template: - spec: - serviceAccountName: clickhouse-backup-service-account - containers: - - name: clickhouse-backup - image: altinity/clickhouse-backup:1.5.0 # or your preferred version - resources: - requests: - cpu: "100m" - memory: "128Mi" - storage: "10Gi" - volumeMounts: - - name: clickhouse-data - mountPath: /var/lib/clickhouse - env: - - name: REMOTE_STORAGE - value: "s3" - - name: BACKUPS_TO_KEEP_REMOTE - value: "0" # 0 means keep all backups remote - - name: FULL_INTERVAL - value: "24h" - - name: LOG_LEVEL # TODO: remove this before merging - value: "debug" - - name: BACKUP_NAME - value: "my-backup" - - name: S3_BUCKET - value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - volumes: - - name: clickhouse-data - persistentVolumeClaim: - claimName: data-clickhouse-0 From 6337f7c6641f50194dfca0fde6942fe5619051e7 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:39:14 -0700 Subject: [PATCH 098/135] Add victoria metrics service scrape on clickhouse operator --- .../stacks/dpe-k8s-deployments/main.tf | 5 ++- modules/signoz-fluxcd/main.tf | 39 +++++++++++++++++++ .../resources/kustomization.yaml | 4 ++ modules/signoz-fluxcd/resources/scrape.yaml | 10 +++++ .../victoria-metrics/templates/values.yaml | 4 ++ 5 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 modules/signoz-fluxcd/resources/kustomization.yaml create mode 100644 modules/signoz-fluxcd/resources/scrape.yaml 
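Note on verifying the sidecar: once the clickhouse-backup container configured in the patches above is running with the --watch flag, the simplest check is to list backups from inside the sidecar and confirm that objects are landing under the bucket prefix it writes to. A hedged sketch, assuming the pod, container, bucket, and S3_PATH naming used in these patches; the namespace placeholder must be filled in for the target cluster:

  # List local and remote backups known to the sidecar
  kubectl -n <signoz-namespace> exec -it chi-signoz-clickhouse-cluster-0-0-0 \
    -c clickhouse-backup-sidecar -- clickhouse-backup list

  # Confirm backup objects under the configured S3_PATH prefix
  aws s3 ls s3://clickhouse-backup-<aws-account-id>-<cluster-name>/backup/ --recursive
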
diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 461070ab..5e666d7e 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -35,8 +35,9 @@ module "flux-cd" { module "victoria-metrics" { depends_on = [module.argo-cd] - source = "spacelift.io/sagebionetworks/victoria-metrics/aws" - version = "0.4.8" + # source = "spacelift.io/sagebionetworks/victoria-metrics/aws" + # version = "0.4.8" + source = "../../../modules/victoria-metrics" auto_deploy = var.auto_deploy auto_prune = var.auto_prune git_revision = var.git_revision diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 9b540b0b..681c58da 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -227,6 +227,45 @@ YAML # YAML # } +resource "kubectl_manifest" "signoz-git-repo" { + depends_on = [helm_release.fluxcd] + + yaml_body = < Date: Mon, 18 Nov 2024 12:44:16 -0700 Subject: [PATCH 099/135] Correct ref --- modules/signoz-fluxcd/main.tf | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 681c58da..346c219d 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -228,7 +228,7 @@ YAML # } resource "kubectl_manifest" "signoz-git-repo" { - depends_on = [helm_release.fluxcd] + depends_on = [kubectl_manifest.signoz-helm-release] yaml_body = < Date: Mon, 18 Nov 2024 12:46:07 -0700 Subject: [PATCH 100/135] Covert over related SMTP config to FluxCD deployment --- modules/signoz-fluxcd/main.tf | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 346c219d..b3765045 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -67,8 +67,6 @@ spec: interval: 10m reconcileStrategy: Revision values: - alertmanager: - enabled: false clickhouse: serviceAccount: annotations: @@ -90,6 +88,9 @@ spec: name: clickhouse-admin-password valuesKey: password targetPath: clickhouse.password + - kind: Secret + name: signoz-smtp-config + valuesKey: smtp_config.yaml postRenderers: - kustomize: patches: @@ -292,9 +293,14 @@ resource "kubernetes_secret" "signoz-smtp-config" { } data = { - "smtp-config.yaml" = < Date: Mon, 18 Nov 2024 14:54:54 -0500 Subject: [PATCH 101/135] revert permission update --- deployments/stacks/dpe-k8s-deployments/main.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 5e666d7e..48fec230 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -167,7 +167,6 @@ resource "aws_iam_policy" "clickhouse_backup_policy" { "s3:GetObject", "s3:PutObject", "s3:DeleteObject", - "s3:CopyObject" ] Resource = [ module.clickhouse_backup_bucket.bucket_arn, From 1b41132cbf81548ea35b54530f4e2ffa2e03c431 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Mon, 18 Nov 2024 20:16:44 -0700 Subject: [PATCH 102/135] Point to use east --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index b3765045..aa6601fc 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -75,7 +75,7 @@ spec: enabled: true 
defaultKeepFreeSpaceBytes: "10485760" # 10MiB type: s3 - endpoint: https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.s3.amazonaws.com/data/ + endpoint: https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.us-east-1.s3.amazonaws.com/data/ role: enabled: true annotations: From 30f42d5f06bc60d7aa0a4fec822e8c1fd0aecec2 Mon Sep 17 00:00:00 2001 From: bwmac Date: Tue, 19 Nov 2024 10:48:46 -0500 Subject: [PATCH 103/135] adds region to storage config --- modules/signoz-fluxcd/main.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index aa6601fc..6a59042e 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -76,6 +76,7 @@ spec: defaultKeepFreeSpaceBytes: "10485760" # 10MiB type: s3 endpoint: https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.us-east-1.s3.amazonaws.com/data/ + region: us-east-1 role: enabled: true annotations: From 9e8d682ac0f5778108546470133eca60cec0b61d Mon Sep 17 00:00:00 2001 From: bwmac Date: Tue, 19 Nov 2024 10:59:07 -0500 Subject: [PATCH 104/135] patches xml file --- modules/signoz-fluxcd/main.tf | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 6a59042e..b3219cc1 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -76,7 +76,6 @@ spec: defaultKeepFreeSpaceBytes: "10485760" # 10MiB type: s3 endpoint: https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.us-east-1.s3.amazonaws.com/data/ - region: us-east-1 role: enabled: true annotations: @@ -139,6 +138,12 @@ spec: ports: - name: backup-rest containerPort: 7171 + - target: + kind: ClickHouseInstallation + patch: | + - op: add + path: /spec/configuration/files/storage.xml/clickhouse/storage_configuration/disks/s3/region + value: "us-east-1" YAML } From 9ae15288f9e911b53e7c0652fa702548acf4b903 Mon Sep 17 00:00:00 2001 From: bwmac Date: Tue, 19 Nov 2024 11:07:01 -0500 Subject: [PATCH 105/135] updates path --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index b3219cc1..e085dae3 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -142,7 +142,7 @@ spec: kind: ClickHouseInstallation patch: | - op: add - path: /spec/configuration/files/storage.xml/clickhouse/storage_configuration/disks/s3/region + path: /spec/configuration/files/config.d/storage.xml/clickhouse/storage_configuration/disks/s3/region value: "us-east-1" YAML } From 11b2d63d21943bbb5236b73125d64bfa45bfe468 Mon Sep 17 00:00:00 2001 From: bwmac Date: Tue, 19 Nov 2024 11:16:27 -0500 Subject: [PATCH 106/135] try replacing storage.xml --- modules/signoz-fluxcd/main.tf | 42 ++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index e085dae3..2cfb0bb7 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -138,12 +138,42 @@ spec: ports: - name: backup-rest containerPort: 7171 - - target: - kind: ClickHouseInstallation - patch: | - - op: add - path: /spec/configuration/files/config.d/storage.xml/clickhouse/storage_configuration/disks/s3/region - value: "us-east-1" + - target: + kind: ClickHouseInstallation + patch: | + - op: replace + path: /spec/configuration/files/config.d/storage.xml + value: | + + + + + 10485760 + + + s3 + 
https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.us-east-1.s3.amazonaws.com/data/ + true + us-east-1 + + + + + + + default + + + s3 + 0 + 1 + + + 0 + + + + YAML } From 98aa032445f22dd1e67db6d2981caec160474076 Mon Sep 17 00:00:00 2001 From: bwmac Date: Tue, 19 Nov 2024 11:35:54 -0500 Subject: [PATCH 107/135] remove region from endpoint --- modules/signoz-fluxcd/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 2cfb0bb7..90fb5527 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -75,7 +75,7 @@ spec: enabled: true defaultKeepFreeSpaceBytes: "10485760" # 10MiB type: s3 - endpoint: https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.us-east-1.s3.amazonaws.com/data/ + endpoint: https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.s3.amazonaws.com/data/ role: enabled: true annotations: @@ -152,7 +152,7 @@ spec: s3 - https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.us-east-1.s3.amazonaws.com/data/ + https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.s3.amazonaws.com/data/ true us-east-1 From b6fca8b2726f3e37716e3b6a93491a06d7671773 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 19 Nov 2024 09:50:16 -0700 Subject: [PATCH 108/135] Patch kustomize path --- modules/signoz-fluxcd/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 90fb5527..11cb32d6 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -142,7 +142,7 @@ spec: kind: ClickHouseInstallation patch: | - op: replace - path: /spec/configuration/files/config.d/storage.xml + path: /spec/configuration/files/config.d~1storage.xml value: | From 7c6208a96cd6d4b5d38f69727b7692e2ca64f7fe Mon Sep 17 00:00:00 2001 From: bwmac Date: Tue, 19 Nov 2024 12:43:36 -0500 Subject: [PATCH 109/135] updates coldstorage folder structure, adds S3_OBJECT_DISK_PATH env --- modules/signoz-fluxcd/main.tf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 11cb32d6..35958464 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -75,7 +75,7 @@ spec: enabled: true defaultKeepFreeSpaceBytes: "10485760" # 10MiB type: s3 - endpoint: https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.s3.amazonaws.com/data/ + endpoint: https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.s3.amazonaws.com/coldstorage/ role: enabled: true annotations: @@ -135,6 +135,8 @@ spec: value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" - name: S3_PATH value: "backup/shard-{shard}" + - name: S3_OBJECT_DISK_PATH + value: "backup-object-disks/shard-{shard}" ports: - name: backup-rest containerPort: 7171 @@ -152,7 +154,7 @@ spec: s3 - https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.s3.amazonaws.com/data/ + https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.s3.amazonaws.com/coldstorage/ true us-east-1 From 831f6df222e714a55f74f092766da8135e83520d Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 20 Nov 2024 11:02:07 -0500 Subject: [PATCH 110/135] sets BACKUPS_TO_KEEP_LOCAL to 1 --- modules/signoz-fluxcd/main.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf index 
35958464..192f83b0 100644 --- a/modules/signoz-fluxcd/main.tf +++ b/modules/signoz-fluxcd/main.tf @@ -121,6 +121,8 @@ spec: value: "0.0.0.0:7171" - name: API_CREATE_INTEGRATION_TABLES value: "true" + - name: BACKUPS_TO_KEEP_LOCAL + value: "1" - name: BACKUPS_TO_KEEP_REMOTE value: "3" - name: REMOTE_STORAGE From 6ac478083826463077f57463c9d11eaef1753968 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 20 Nov 2024 11:23:35 -0500 Subject: [PATCH 111/135] updates signoz readme with backup info --- modules/signoz/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modules/signoz/README.md b/modules/signoz/README.md index 4cdcfa66..5f5d0deb 100644 --- a/modules/signoz/README.md +++ b/modules/signoz/README.md @@ -107,3 +107,12 @@ Once you're connected via a port-forward session the next item is to make sure t application you're sending data from is instrumented with open-telemetry. This is going to be application specific so instructions will need to live within the application you are using. + +### Clickhouse Backups and Restores +This module uses the `clickhouse-backup` tool to automatically back up the clickhouse database and store the data in an S3 bucket to ensure continuity of the data regardless of the state of the cluster.`clickhouse-backup` is deployed as a sidecar container to the `signoz` helm release. It will perform incremental backups of the database every 8 hours and full backups every 24 hours. + +To restore the database from an S3 backup, you can use the following steps: +1. Scale the replica cluster (`chi-signoz-clickhouse-cluster-0-1`) `StatefulSet` to 0 replicas. +1. Identify the backup that you would like to restore from. You can get the full list of backups by shelling into the `clickhouse-backup-sidecar` container within the `chi-signoz-clickhouse-cluster-0-0-0` pod and running `clickhouse-backup list`. +1. Restore the database from your backup by running `clickhouse-backup restore_remote --rm --schema ` (assuming the backup from remote storage). +1. Scale the replica cluster `StatefulSet` back to 1 replica. Once the `chi-signoz-clickhouse-cluster-0-1-0` has fully come back up, you should see the restored data showing in the `signoz` UI. From 64b854e1091b565da1b268cb3a8e8d57f3301670 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:10:19 -0700 Subject: [PATCH 112/135] Remove unused moved blocks --- deployments/spacelift/dpe-k8s/main.tf | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index 41804c3e..2040e91f 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -217,16 +217,6 @@ resource "spacelift_aws_integration_attachment" "k8s-deployments-aws-integration write = true } -moved { - from = spacelift_stack.auth0 - to = spacelift_stack.auth0[0] -} - -moved { - from = spacelift_stack_destructor.auth0-stack-destructor - to = spacelift_stack_destructor.auth0-stack-destructor[0] -} - resource "spacelift_stack" "auth0" { count = var.deploy_auth0 ? 
1 : 0 github_enterprise { From 46dce57869d0a7a08a7a8f8c2064da75e717ef14 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Tue, 19 Nov 2024 09:53:24 -0700 Subject: [PATCH 113/135] [IBCDPE-1095] Use scope based authorization on telemetry upload route (#48) * Use scope based authroization on telemetry upload route --- deployments/main.tf | 5 +++ deployments/spacelift/dpe-k8s/main.tf | 8 ++-- deployments/spacelift/dpe-k8s/variables.tf | 6 +++ deployments/stacks/dpe-auth0/main.tf | 14 +++---- deployments/stacks/dpe-auth0/variables.tf | 6 +++ .../stacks/dpe-k8s-deployments/main.tf | 3 +- .../stacks/dpe-k8s-deployments/variables.tf | 5 +++ modules/envoy-gateway/main.tf | 2 +- modules/envoy-gateway/templates/values.yaml | 40 +++++++++++++++++-- modules/signoz/main.tf | 14 ++++++- .../security-policy.yaml | 1 + modules/signoz/variables.tf | 5 +++ 12 files changed, 92 insertions(+), 17 deletions(-) diff --git a/deployments/main.tf b/deployments/main.tf index aef63ffd..ea53ff7b 100644 --- a/deployments/main.tf +++ b/deployments/main.tf @@ -39,18 +39,22 @@ module "dpe-sandbox-spacelift-development" { name = "bfauble - automation" description = "App for testing signoz" app_type = "non_interactive" + scopes = ["write:telemetry"] }, { name = "schematic - Github Actions" description = "Client for Github Actions to export telemetry data" app_type = "non_interactive" + scopes = ["write:telemetry"] }, { name = "schematic - Dev" description = "Client for schematic deployed to AWS DEV to export telemetry data" app_type = "non_interactive" + scopes = ["write:telemetry"] }, ] + auth0_identifier = "https://dev.sagedpe.org" aws_account_id = "631692904429" region = "us-east-1" @@ -100,6 +104,7 @@ module "dpe-sandbox-spacelift-production" { auth0_stack_project_root = "deployments/stacks/dpe-auth0" auth0_domain = "" auth0_clients = [] + auth0_identifier = "" aws_account_id = "766808016710" region = "us-east-1" diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index 2040e91f..15f3b590 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -27,12 +27,14 @@ locals { ssl_hostname = var.ssl_hostname auth0_jwks_uri = var.auth0_jwks_uri smtp_from = var.smtp_from + auth0_identifier = var.auth0_identifier } auth0_stack_variables = { - cluster_name = var.cluster_name - auth0_domain = var.auth0_domain - auth0_clients = var.auth0_clients + cluster_name = var.cluster_name + auth0_domain = var.auth0_domain + auth0_clients = var.auth0_clients + auth0_identifier = var.auth0_identifier } # Variables to be passed from the k8s stack to the deployments stack diff --git a/deployments/spacelift/dpe-k8s/variables.tf b/deployments/spacelift/dpe-k8s/variables.tf index 5c7d31b0..599801f5 100644 --- a/deployments/spacelift/dpe-k8s/variables.tf +++ b/deployments/spacelift/dpe-k8s/variables.tf @@ -185,9 +185,15 @@ variable "auth0_clients" { name = string description = string app_type = string + scopes = list(string) })) } +variable "auth0_identifier" { + description = "Auth0 identifier for the created API." 
+ type = string +} + variable "ses_email_identities" { type = list(string) description = "List of email identities to be added to SES" diff --git a/deployments/stacks/dpe-auth0/main.tf b/deployments/stacks/dpe-auth0/main.tf index 780ea89d..31d2a3b8 100644 --- a/deployments/stacks/dpe-auth0/main.tf +++ b/deployments/stacks/dpe-auth0/main.tf @@ -1,7 +1,7 @@ # Used to create the Auth0 resources for the DPE stack -resource "auth0_resource_server" "k8s-cluster-telemetry" { - name = "${var.cluster_name}-telemetry" - identifier = "${var.cluster_name}-telemetry" +resource "auth0_resource_server" "k8s-cluster-api" { + name = "${var.cluster_name}-api" + identifier = var.auth0_identifier signing_alg = "RS256" allow_offline_access = false @@ -31,8 +31,8 @@ resource "auth0_client" "oauth2_clients" { } resource "auth0_resource_server_scopes" "k8s-cluster-scopes" { - resource_server_identifier = auth0_resource_server.k8s-cluster-telemetry.identifier - # This scope is not yet used, however, kept for future use to grant authorization based on scopes + resource_server_identifier = auth0_resource_server.k8s-cluster-api.identifier + scopes { name = "write:telemetry" description = "Grants write access to telemetry data" @@ -52,6 +52,6 @@ resource "auth0_client_grant" "access_to_k8s_cluster" { for_each = { for client in var.auth0_clients : client.name => client } client_id = auth0_client.oauth2_clients[each.key].id - audience = auth0_resource_server.k8s-cluster-telemetry.identifier - scopes = [] + audience = auth0_resource_server.k8s-cluster-api.identifier + scopes = each.value.scopes } diff --git a/deployments/stacks/dpe-auth0/variables.tf b/deployments/stacks/dpe-auth0/variables.tf index a348f001..5176a7b0 100644 --- a/deployments/stacks/dpe-auth0/variables.tf +++ b/deployments/stacks/dpe-auth0/variables.tf @@ -24,5 +24,11 @@ variable "auth0_clients" { name = string description = string app_type = string + scopes = list(string) })) } + +variable "auth0_identifier" { + description = "Auth0 identifier for the created API." + type = string +} diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 48fec230..48c40b56 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -100,6 +100,7 @@ module "signoz" { smtp_password = var.smtp_password smtp_user = var.smtp_user smtp_from = var.smtp_from + auth0_identifier = var.auth0_identifier } module "signoz-flux-deployment" { @@ -122,7 +123,7 @@ module "signoz-flux-deployment" { module "envoy-gateway" { count = var.enable_cluster_ingress ? 1 : 0 - depends_on = [module.argo-cd] + depends_on = [module.argo-cd, module.cert-manager] # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws" # version = "0.5.0" source = "../../../modules/envoy-gateway" diff --git a/deployments/stacks/dpe-k8s-deployments/variables.tf b/deployments/stacks/dpe-k8s-deployments/variables.tf index cc95e545..21b40836 100644 --- a/deployments/stacks/dpe-k8s-deployments/variables.tf +++ b/deployments/stacks/dpe-k8s-deployments/variables.tf @@ -91,6 +91,11 @@ variable "auth0_jwks_uri" { type = string } +variable "auth0_identifier" { + description = "Auth0 identifier for the API. Used to verify the audience in the JWT." + type = string +} + variable "smtp_user" { description = "The SMTP user. 
Required if smtp_user, smtp_password, and smtp_from are set" type = string diff --git a/modules/envoy-gateway/main.tf b/modules/envoy-gateway/main.tf index 25b2c7ff..47bca383 100644 --- a/modules/envoy-gateway/main.tf +++ b/modules/envoy-gateway/main.tf @@ -23,7 +23,7 @@ spec: sources: - repoURL: registry-1.docker.io chart: envoyproxy/gateway-helm - targetRevision: v1.1.2 + targetRevision: v1.2.1 helm: releaseName: gateway-helm valueFiles: diff --git a/modules/envoy-gateway/templates/values.yaml b/modules/envoy-gateway/templates/values.yaml index 1edd623c..56cf3083 100644 --- a/modules/envoy-gateway/templates/values.yaml +++ b/modules/envoy-gateway/templates/values.yaml @@ -4,7 +4,7 @@ global: images: envoyGateway: # This is the full image name including the hub, repo, and tag. - image: docker.io/envoyproxy/gateway:v1.1.2 + image: docker.io/envoyproxy/gateway:v1.2.1 # Specify image pull policy if default behavior isn't desired. # Default behavior: latest images will be Always else IfNotPresent. pullPolicy: IfNotPresent @@ -12,7 +12,7 @@ global: pullSecrets: [] ratelimit: # This is the full image name including the hub, repo, and tag. - image: "docker.io/envoyproxy/ratelimit:26f28d78" + image: "docker.io/envoyproxy/ratelimit:master" # Specify image pull policy if default behavior isn't desired. # Default behavior: latest images will be Always else IfNotPresent. pullPolicy: IfNotPresent @@ -20,6 +20,8 @@ global: pullSecrets: [] podDisruptionBudget: minAvailable: 0 + # maxUnavailable: 1 + deployment: envoyGateway: image: @@ -29,11 +31,21 @@ deployment: imagePullSecrets: [] resources: limits: - cpu: 500m memory: 1024Mi requests: cpu: 100m memory: 256Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + runAsNonRoot: true + runAsGroup: 65532 + runAsUser: 65532 + seccompProfile: + type: RuntimeDefault ports: - name: grpc port: 18000 @@ -47,6 +59,7 @@ deployment: - name: metrics port: 19001 targetPort: 19001 + priorityClassName: null replicas: 1 pod: affinity: {} @@ -56,6 +69,10 @@ deployment: labels: {} topologySpreadConstraints: [] tolerations: [] + nodeSelector: {} + +service: + annotations: {} config: envoyGateway: @@ -76,7 +93,22 @@ certgen: job: annotations: {} resources: {} + affinity: {} + tolerations: [] + nodeSelector: {} ttlSecondsAfterFinished: 30 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsGroup: 65534 + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault rbac: annotations: {} - labels: {} \ No newline at end of file + labels: {} diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index e1c40079..4a366f9f 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -86,7 +86,19 @@ spec: remoteJWKS: uri: ${var.auth0_jwks_uri} audiences: - - ${var.cluster_name}-telemetry + - ${var.auth0_identifier} + - op: replace + path: /spec/authorization + value: + defaultAction: Deny + rules: + - name: allow + action: Allow + principal: + jwt: + provider: auth0 + scopes: + - write:telemetry %{endif} destination: server: 'https://kubernetes.default.svc' diff --git a/modules/signoz/resources-otel-ingress/security-policy.yaml b/modules/signoz/resources-otel-ingress/security-policy.yaml index 34bd58da..3d45d127 100644 --- a/modules/signoz/resources-otel-ingress/security-policy.yaml +++ b/modules/signoz/resources-otel-ingress/security-policy.yaml @@ -10,3 +10,4 @@ spec: name: signoz-otel-collector-route jwt: 
providers: + authorization: diff --git a/modules/signoz/variables.tf b/modules/signoz/variables.tf index 2a917ff1..344c8f60 100644 --- a/modules/signoz/variables.tf +++ b/modules/signoz/variables.tf @@ -48,6 +48,11 @@ variable "auth0_jwks_uri" { type = string } +variable "auth0_identifier" { + description = "Auth0 identifier for the API. Used to verify the audience in the JWT." + type = string +} + variable "smtp_user" { description = "The SMTP user. Required if smtp_user, smtp_password, and smtp_from are set" type = string
From a2240c1832ca8e35d0fd438f13e0e92bf5120998 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:16:01 -0700 Subject: [PATCH 114/135] Default to empty string --- deployments/spacelift/dpe-k8s/variables.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/deployments/spacelift/dpe-k8s/variables.tf b/deployments/spacelift/dpe-k8s/variables.tf index 599801f5..5a365f56 100644 --- a/deployments/spacelift/dpe-k8s/variables.tf +++ b/deployments/spacelift/dpe-k8s/variables.tf @@ -192,6 +192,7 @@ variable "auth0_clients" { variable "auth0_identifier" { description = "Auth0 identifier for the created API." type = string + default = "" } variable "ses_email_identities" {
From 29bd314e3488563fb6df2e9f83a27dc229b7f094 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 20 Nov 2024 13:21:09 -0500 Subject: [PATCH 115/135] updates documentation for fluxcd --- README.md | 2 +- modules/flux-cd/README.md | 56 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 modules/flux-cd/README.md diff --git a/README.md b/README.md index 3d456eff..cbedb844 100644 --- a/README.md +++ b/README.md @@ -172,7 +172,7 @@ allow us to review for any security advisories. ### Deploying an application to the kubernetes cluster Deployment of applications to the kubernetes cluster is handled through the combination -of terraform (.tf) scripts, spacelift (CICD tool), and ArgoCd (Declarative definitions +of terraform (.tf) scripts, spacelift (CICD tool), and ArgoCd or Flux CD (Declarative definitions for applications). To start of the deployment journey the first step is to create a new terraform module
diff --git a/modules/flux-cd/README.md b/modules/flux-cd/README.md new file mode 100644 index 00000000..01e5a5e9 --- /dev/null +++ b/modules/flux-cd/README.md @@ -0,0 +1,56 @@
+# Purpose
+This module is used to deploy the `Flux CD` [helm chart](https://fluxcd-community.github.io/helm-charts) to the cluster. [`Flux CD`](https://fluxcd.io/) is a GitOps tool used to manage the application lifecycle on a Kubernetes cluster. It was originally deployed because, unlike `Argo CD`, it supports the use of `postRenderers`, which are used to apply additional changes to an application's rendered manifests, and which were needed to deploy the `clickhouse-backup` sidecar container to the `signoz` helm release. We do not plan to move all existing applications to `fluxcd` at this time, but it is available and preferred for any new applications added to the cluster.
+
+## What resources are being deployed through this module
+In addition to a `helm_release` which deploys the `fluxcd` helm chart, this module also creates a `capacitor` resource which is used as the frontend for `fluxcd`.
+
+## Accessing the Flux CD UI
+To access the `Flux CD` UI, you only need to port-forward the `capacitor` pod and access it from your browser.
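+For example, a port-forward session might look like the following (a minimal sketch, assuming the release runs in the `flux-system` namespace and exposes a `capacitor` service on port 9000; adjust the namespace, service name, and port to match your deployment):
+
+```
+kubectl -n flux-system port-forward svc/capacitor 9000:9000
+# then open http://localhost:9000 in your browser
+```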
+
+# Deploying an application with Flux CD
+To deploy an application with `Flux CD`, you will need to create a `HelmRepository` resource which points to the helm chart you want to deploy. In that resource definition, you will set the `apiVersion` to `source.toolkit.fluxcd.io/v1` and the `kind` to `HelmRepository`. For example (code from the `signoz` module):
+
+```
+resource "kubectl_manifest" "signoz-helm-repo" {
+  depends_on = [kubernetes_namespace.signoz]
+
+  yaml_body = < Date: Wed, 20 Nov 2024 11:26:53 -0700 Subject: [PATCH 116/135] Remove stack destructors as they're broken on the free tier --- deployments/spacelift/dpe-k8s/main.tf | 52 +++++++++++++-------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index 15f3b590..06bd0bc0 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -179,30 +179,30 @@ resource "spacelift_stack_dependency_reference" "cluster-name" { # stack_id = spacelift_stack.k8s-stack.id # } -resource "spacelift_stack_destructor" "k8s-stack-deployments-destructor" { - depends_on = [ - spacelift_stack.k8s-stack, - spacelift_aws_integration_attachment.k8s-deployments-aws-integration-attachment, - spacelift_context_attachment.k8s-kubeconfig-hooks, - spacelift_stack_dependency_reference.cluster-name, - spacelift_stack_dependency_reference.region-name, - spacelift_environment_variable.k8s-stack-deployments-environment-variables - ] +# resource "spacelift_stack_destructor" "k8s-stack-deployments-destructor" { +# depends_on = [ +# spacelift_stack.k8s-stack, +# spacelift_aws_integration_attachment.k8s-deployments-aws-integration-attachment, +# spacelift_context_attachment.k8s-kubeconfig-hooks, +# spacelift_stack_dependency_reference.cluster-name, +# spacelift_stack_dependency_reference.region-name, +# spacelift_environment_variable.k8s-stack-deployments-environment-variables +# ] + +# stack_id = spacelift_stack.k8s-stack-deployments.id +# } - stack_id = spacelift_stack.k8s-stack-deployments.id -} +# resource "spacelift_stack_destructor" "k8s-stack-destructor" { +# depends_on = [ +# spacelift_aws_integration_attachment.k8s-aws-integration-attachment, +# spacelift_context_attachment.k8s-kubeconfig-hooks, +# spacelift_stack_dependency_reference.cluster-name, +# spacelift_stack_dependency_reference.region-name, +# spacelift_environment_variable.k8s-stack-environment-variables +# ] -resource "spacelift_stack_destructor" "k8s-stack-destructor" { - depends_on = [ - spacelift_aws_integration_attachment.k8s-aws-integration-attachment, - spacelift_context_attachment.k8s-kubeconfig-hooks, - spacelift_stack_dependency_reference.cluster-name, - spacelift_stack_dependency_reference.region-name, - spacelift_environment_variable.k8s-stack-environment-variables - ] - - stack_id = spacelift_stack.k8s-stack.id -} +# stack_id = spacelift_stack.k8s-stack.id +# } resource "spacelift_aws_integration_attachment" "k8s-aws-integration-attachment" { integration_id = var.aws_integration_id @@ -245,10 +245,10 @@ resource "spacelift_stack" "auth0" { ] } -resource "spacelift_stack_destructor" "auth0-stack-destructor" { - count = var.deploy_auth0 ? 1 : 0 - stack_id = spacelift_stack.auth0[0].id -} +# resource "spacelift_stack_destructor" "auth0-stack-destructor" { +# count = var.deploy_auth0 ?
1 : 0 +# stack_id = spacelift_stack.auth0[0].id +# } resource "spacelift_environment_variable" "auth0-stack-environment-variables" { From 4959fd27c61df56e8389926caa68a0c17b32ffd2 Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 20 Nov 2024 13:33:27 -0500 Subject: [PATCH 117/135] adds s3-bucket readme --- modules/s3-bucket/README.md | 83 +++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 modules/s3-bucket/README.md diff --git a/modules/s3-bucket/README.md b/modules/s3-bucket/README.md new file mode 100644 index 00000000..c97150e5 --- /dev/null +++ b/modules/s3-bucket/README.md @@ -0,0 +1,83 @@ +# Purpose +This is a simple module that can be used within applications to deploy an S3 bucket. + +## WARNING +If you are tearing down a stack with a deployed S3 Bucket, you will likely encounter an error similar to the following: +``` +deleting S3 Bucket (my-beautiful-bucket): operation error S3: DeleteBucket, https response error StatusCode: 409, RequestID: 123, HostID: 123456789+g=, api error BucketNotEmpty: The bucket you tried to delete is not empty. You must delete all versions in the bucket. +``` +We have intentionally not handled this behavior as a safeguard against accidental deletion of a bucket that contains important data. + +# Usage +Using this module is as simple as calling it in your terraform code: +``` +module "my_beautiful_bucket" { + source = "../../../modules/s3-bucket" + bucket_name = "my-beautiful-bucket" +} +``` + +You will need to configure access to the bucket. This will involve the definition of `aws_iam_policy`, `aws_iam_role`, and `aws_iam_role_policy_attachment` resources with the necessary permissions for your use case. For example (from the `dpe-k8s-deployments` stack): +``` +resource "aws_iam_policy" "my_beautiful_bucket_policy" { + name = "my-beautiful-bucket-access-policy" + description = "Policy to access the my beautiful bucket" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "s3:ListBucket", + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + ] + Resource = [ + module.my_beautiful_bucket.bucket_arn, + "${module.my_beautiful_bucket.bucket_arn}/*" + ] + } + ] + }) +} + +resource "aws_iam_role" "my_beautiful_bucket_access" { + name = "my-beautiful-bucket-access-role" + description = "Assumed role to access the my beautiful bucket" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRoleWithWebIdentity" + Effect = "Allow" + Principal = { + Federated = "${var.cluster_oidc_provider_arn}", + } + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "my_beautiful_bucket_policy_attachment" { + role = aws_iam_role.my_beautiful_bucket_access.name + policy_arn = aws_iam_policy.my_beautiful_bucket_policy.arn +} +``` + +After confirming that the policy and role are configured correctly, you will then need to configure a kubernetes service account bound to the IAM role. This can be done in your application/module code like so: +``` +resource "kubernetes_service_account" "my_beautiful_bucket_service_account" { + metadata { + name = "my-beautiful-bucket-service-account" + namespace = var.namespace + annotations = { + "eks.amazonaws.com/role-arn" = "arn:aws:iam::${var.aws_account_id}:role/my-beautiful-bucket-access-role" + } + } +} +``` + +Finally, you can leverage the newly created service account in your application code by setting `serviceAccountName` to the name of the service account you just created. 
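+A minimal sketch of that last step (the pod below, its `aws-cli` image, and the `my-beautiful-bucket-*` names are illustrative only, reusing the hypothetical example above):
+
+```
+resource "kubernetes_pod" "my_beautiful_bucket_consumer" {
+  metadata {
+    name      = "my-beautiful-bucket-consumer"
+    namespace = var.namespace
+  }
+
+  spec {
+    # Run the pod as the service account that is bound to the bucket access role
+    service_account_name = "my-beautiful-bucket-service-account"
+
+    container {
+      name  = "app"
+      image = "amazon/aws-cli"
+      args  = ["s3", "ls", "s3://my-beautiful-bucket"]
+    }
+  }
+}
+```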
From 2ab1a753e23190fc12d34edb7070fd3b978a6929 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:41:57 -0700 Subject: [PATCH 118/135] Delete temp stack, make s3 module generic, mirgrate signoz module --- deployments/brad-sandbox/main.tf | 88 - deployments/brad-sandbox/provider.tf | 6 - deployments/brad-sandbox/versions.tf | 8 - .../stacks/dpe-k8s-deployments/main.tf | 79 +- modules/s3-bucket/main.tf | 48 + modules/s3-bucket/outputs.tf | 5 + modules/s3-bucket/variables.tf | 15 + modules/s3-bucket/versions.tf | 8 + modules/signoz-fluxcd/main.tf | 347 --- modules/signoz-fluxcd/templates/values.yaml | 2442 ----------------- modules/signoz-fluxcd/variables.tf | 73 - modules/signoz-fluxcd/versions.tf | 17 - modules/signoz/main.tf | 365 ++- .../kustomization.yaml | 0 .../resources-service-scrape}/scrape.yaml | 0 modules/signoz/templates/values.yaml | 11 +- modules/signoz/variables.tf | 10 + 17 files changed, 380 insertions(+), 3142 deletions(-) delete mode 100644 deployments/brad-sandbox/main.tf delete mode 100644 deployments/brad-sandbox/provider.tf delete mode 100644 deployments/brad-sandbox/versions.tf create mode 100644 modules/s3-bucket/versions.tf delete mode 100644 modules/signoz-fluxcd/main.tf delete mode 100644 modules/signoz-fluxcd/templates/values.yaml delete mode 100644 modules/signoz-fluxcd/variables.tf delete mode 100644 modules/signoz-fluxcd/versions.tf rename modules/{signoz-fluxcd/resources => signoz/resources-service-scrape}/kustomization.yaml (100%) rename modules/{signoz-fluxcd/resources => signoz/resources-service-scrape}/scrape.yaml (100%) diff --git a/deployments/brad-sandbox/main.tf b/deployments/brad-sandbox/main.tf deleted file mode 100644 index 6d2f75a6..00000000 --- a/deployments/brad-sandbox/main.tf +++ /dev/null @@ -1,88 +0,0 @@ - -locals { - git_branch = "schematic-138-cold-storage-and-backups" -} - -import { - # The initial administrative stack is created manually in the Spacelift UI, and imported - # See https://docs.spacelift.io/vendors/terraform/terraform-provider.html#proposed-workflow - # "We suggest to first manually create a single administrative stack, and then use it - # to programmatically define other stacks as necessary." 
- to = spacelift_stack.brad-sandbox - id = "brad-sandbox-administrative-stack" -} - -resource "spacelift_stack" "brad-sandbox" { - github_enterprise { - namespace = "Sage-Bionetworks-Workflows" - id = "sage-bionetworks-workflows-gh" - } - - administrative = true - autodeploy = false - branch = local.git_branch - description = "Manages other spacelift resources" - name = "Brad Sandbox Administrative Stack" - project_root = "deployments/brad-sandbox" - terraform_version = "1.8.5" - terraform_workflow_tool = "OPEN_TOFU" - repository = "eks-stack" - space_id = "root" - additional_project_globs = [ - "modules/*", - "modules/**/*", - ] -} - -import { - to = module.brad-sandbox-spacelift.spacelift_space.dpe-space - id = "brad-sandbox-01JC3NVVPWXMP68X90QYCMH7A3" -} - -module "brad-sandbox-spacelift" { - source = "../spacelift/dpe-k8s" - parent_space_id = "development-01J49XEN88DQ8K9MCPPTTEXSKE" - admin_stack_id = spacelift_stack.brad-sandbox.id - spotinst_account = "act-45de6f47" - - aws_integration_id = "01J3R9GX6DC09QV7NV872DDYR3" - auto_deploy = false - auto_prune = true - git_branch = "schematic-138-cold-storage-and-backups" - - space_name = "brad-sandbox" - - k8s_stack_name = "Brad sandbox Kubernetes Infrastructure" - k8s_stack_project_root = "deployments/stacks/dpe-k8s" - - k8s_stack_deployments_name = "Brad sandbox Kubernetes Deployments" - k8s_stack_deployments_project_root = "deployments/stacks/dpe-k8s-deployments" - - auth0_stack_name = "" - auth0_stack_project_root = "" - auth0_domain = "" - auth0_clients = [] - - aws_account_id = "631692904429" - region = "us-east-1" - - cluster_name = "brad-k8-sandbox" - vpc_name = "brad-sandbox" - - vpc_cidr_block = "10.52.32.0/20" - # A public subnet is required for each AZ in which the worker nodes are deployed - public_subnet_cidrs = ["10.52.32.0/24", "10.52.33.0/24", "10.52.35.0/24"] - private_subnet_cidrs_eks_control_plane = ["10.52.34.0/28", "10.52.34.16/28"] - azs_eks_control_plane = ["us-east-1a", "us-east-1b"] - - private_subnet_cidrs_eks_worker_nodes = ["10.52.44.0/22", "10.52.40.0/22", "10.52.36.0/22"] - azs_eks_worker_nodes = ["us-east-1c", "us-east-1b", "us-east-1a"] - - enable_cluster_ingress = false - enable_otel_ingress = false - ssl_hostname = "" - auth0_jwks_uri = "" - ses_email_identities = [] - # Defines the email address that will be used as the sender of the email alerts - smtp_from = "" -} diff --git a/deployments/brad-sandbox/provider.tf b/deployments/brad-sandbox/provider.tf deleted file mode 100644 index 83f5c0c4..00000000 --- a/deployments/brad-sandbox/provider.tf +++ /dev/null @@ -1,6 +0,0 @@ -provider "spacelift" { - # Running from within spacelift does not require these to be set - # api_key_endpoint = "https://sagebionetworks.app.spacelift.io" - # api_key_id = "" - # api_key_secret = "" -} diff --git a/deployments/brad-sandbox/versions.tf b/deployments/brad-sandbox/versions.tf deleted file mode 100644 index aed5ef97..00000000 --- a/deployments/brad-sandbox/versions.tf +++ /dev/null @@ -1,8 +0,0 @@ -terraform { - required_providers { - spacelift = { - source = "spacelift-io/spacelift" - version = "1.13.0" - } - } -} diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 48c40b56..a6e61e48 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -82,6 +82,14 @@ module "postgres-cloud-native-database" { argo_deployment_name = "airflow-postgres-cloud-native" } +module "clickhouse-backup-bucket" { + source = 
"../../../modules/s3-bucket" + bucket_name = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + enable_versioning = false + aws_account_id = var.aws_account_id + cluster_name = var.cluster_name + cluster_oidc_provider_arn = var.cluster_oidc_provider_arn +} module "signoz" { depends_on = [module.argo-cd] @@ -101,24 +109,9 @@ module "signoz" { smtp_user = var.smtp_user smtp_from = var.smtp_from auth0_identifier = var.auth0_identifier -} - -module "signoz-flux-deployment" { - depends_on = [module.flux-cd] - source = "../../../modules/signoz-fluxcd" - auto_deploy = var.auto_deploy - auto_prune = var.auto_prune - git_revision = var.git_revision - namespace = "signoz-fluxcd" - argo_deployment_name = "signoz-fluxcd" - enable_otel_ingress = var.enable_otel_ingress && var.enable_cluster_ingress - gateway_namespace = "envoy-gateway" - cluster_name = var.cluster_name - auth0_jwks_uri = var.auth0_jwks_uri - smtp_password = var.smtp_password - smtp_user = var.smtp_user - smtp_from = var.smtp_from aws_account_id = var.aws_account_id + s3_backup_bucket_name = module.clickhouse-backup-bucket.bucket_name + s3_access_role_arn = module.clickhouse-backup-bucket.access_role_arn } module "envoy-gateway" { @@ -148,55 +141,3 @@ module "cert-manager" { namespace = "cert-manager" argo_deployment_name = "cert-manager" } - -module "clickhouse_backup_bucket" { - source = "../../../modules/s3-bucket" - bucket_name = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" -} - -resource "aws_iam_policy" "clickhouse_backup_policy" { - name = "clickhouse-backup-access-policy-${var.aws_account_id}-${var.cluster_name}" - description = "Policy to access the clickhouse backup bucket" - - policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Effect = "Allow" - Action = [ - "s3:ListBucket", - "s3:GetObject", - "s3:PutObject", - "s3:DeleteObject", - ] - Resource = [ - module.clickhouse_backup_bucket.bucket_arn, - "${module.clickhouse_backup_bucket.bucket_arn}/*" - ] - } - ] - }) -} - -resource "aws_iam_role" "clickhouse_backup_access" { - name = "clickhouse-backup-access-role-${var.aws_account_id}-${var.cluster_name}" - description = "Assumed role to access the clickhouse backup policy" - - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Action = "sts:AssumeRoleWithWebIdentity" - Effect = "Allow" - Principal = { - Federated = "${var.cluster_oidc_provider_arn}", - } - } - ] - }) -} - -resource "aws_iam_role_policy_attachment" "clickhouse_backup_policy_attachment" { - role = aws_iam_role.clickhouse_backup_access.name - policy_arn = aws_iam_policy.clickhouse_backup_policy.arn -} diff --git a/modules/s3-bucket/main.tf b/modules/s3-bucket/main.tf index 967a0b36..068b4920 100644 --- a/modules/s3-bucket/main.tf +++ b/modules/s3-bucket/main.tf @@ -14,3 +14,51 @@ resource "aws_s3_bucket_versioning" "versioning" { status = var.enable_versioning ? 
"Enabled" : "Suspended" } } + + +resource "aws_iam_policy" "s3-access-policy" { + name = "clickhouse-backup-access-policy-${var.aws_account_id}-${var.cluster_name}-${var.bucket_name}" + description = "Policy to access the s3 bucket" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "s3:ListBucket", + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + ] + Resource = [ + module.clickhouse_backup_bucket.bucket_arn, + "${module.clickhouse_backup_bucket.bucket_arn}/*" + ] + } + ] + }) +} + +resource "aws_iam_role" "s3-access-iam-role" { + name = "s3-access-role-${var.aws_account_id}-${var.cluster_name}-${var.bucket_name}" + description = "Assumed role to access the s3 bucket with the given permissions." + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRoleWithWebIdentity" + Effect = "Allow" + Principal = { + Federated = "${var.cluster_oidc_provider_arn}", + } + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "s3-access-policy-attachment" { + role = aws_iam_role.s3-access-iam-role.name + policy_arn = aws_iam_policy.s3-access-policy.arn +} diff --git a/modules/s3-bucket/outputs.tf b/modules/s3-bucket/outputs.tf index c8a58a33..25983295 100644 --- a/modules/s3-bucket/outputs.tf +++ b/modules/s3-bucket/outputs.tf @@ -7,3 +7,8 @@ output "bucket_arn" { description = "ARN of the created S3 bucket" value = aws_s3_bucket.bucket.arn } + +output "access_role_arn" { + description = "ARN of the role to access the S3 bucket" + value = aws_iam_role.s3-access-iam-role.arn +} \ No newline at end of file diff --git a/modules/s3-bucket/variables.tf b/modules/s3-bucket/variables.tf index b8e0a882..fb9e9193 100644 --- a/modules/s3-bucket/variables.tf +++ b/modules/s3-bucket/variables.tf @@ -14,3 +14,18 @@ variable "enable_versioning" { type = bool default = true } + +variable "aws_account_id" { + description = "AWS account ID" + type = string +} + +variable "cluster_name" { + description = "EKS cluster name" + type = string +} + +variable "cluster_oidc_provider_arn" { + description = "EKS cluster ARN for the oidc provider" + type = string +} diff --git a/modules/s3-bucket/versions.tf b/modules/s3-bucket/versions.tf new file mode 100644 index 00000000..cba4c144 --- /dev/null +++ b/modules/s3-bucket/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} diff --git a/modules/signoz-fluxcd/main.tf b/modules/signoz-fluxcd/main.tf deleted file mode 100644 index 192f83b0..00000000 --- a/modules/signoz-fluxcd/main.tf +++ /dev/null @@ -1,347 +0,0 @@ -locals { - alertmanager_enabled = var.smtp_from != "" && var.smtp_user != "" && var.smtp_password != "" -} - -resource "kubernetes_namespace" "signoz" { - metadata { - name = var.namespace - } -} - -resource "kubectl_manifest" "signoz-helm-repo" { - depends_on = [kubernetes_namespace.signoz] - - yaml_body = < - - - - 10485760 - - - s3 - https://clickhouse-backup-${var.aws_account_id}-${var.cluster_name}.s3.amazonaws.com/coldstorage/ - true - us-east-1 - - - - - - - default - - - s3 - 0 - 1 - - - 0 - - - - -YAML -} - - -# resource "kubectl_manifest" "signoz-deployment" { -# depends_on = [kubernetes_namespace.signoz] - -# yaml_body = < - - # -- Clickhouse image - image: - # -- Clickhouse image registry to use. - registry: docker.io - # -- Clickhouse image repository to use. - repository: clickhouse/clickhouse-server - # -- Clickhouse image tag to use (example: `21.8`). 
- # SigNoz is not always tested with latest version of ClickHouse. - # Only if you know what you are doing, proceed with overriding. - tag: 24.1.2-alpine - # -- Clickhouse image pull policy. - pullPolicy: IfNotPresent - - # -- Image Registry Secret Names for ClickHouse. - # If global.imagePullSecrets is set as well, it will merged. - imagePullSecrets: [] - # - "clickhouse-pull-secret" - - # -- ClickHouse instance annotations. - annotations: {} - - # ClickHouse Service Account - serviceAccount: - # -- Specifies whether a service account should be created - create: true - # -- Annotations to add to the service account - annotations: {} - # -- The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: - - # Clickhouse service - service: - # -- Annotations to use by service associated to Clickhouse instance - annotations: {} - # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) - type: ClusterIP - # -- Clickhouse HTTP port - httpPort: 8123 - # -- Clickhouse TCP port - tcpPort: 9000 - - # -- Whether to use TLS connection connecting to ClickHouse - secure: false - # -- Whether to verify TLS certificate on connection to ClickHouse - verify: false - # -- URL for zookeeper. - externalZookeeper: {} - # servers: - # - host: signoz-signoz-zookeeper - # port: 2181 - - # -- Node selector for settings for clickhouse pod - nodeSelector: { - spotinst.io/node-lifecycle: "od" - } - # -- Toleration labels for clickhouse pod assignment - tolerations: [] - # -- Affinity settings for clickhouse pod - affinity: {} - - # -- Configure resource requests and limits. Update according to your own use - # case as these values might not be suitable for your workload. - # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ - # - # @default -- See `values.yaml` for defaults - resources: - requests: - cpu: 100m - memory: 200Mi - # limits: - # cpu: 2000m - # memory: 4Gi - - # -- Security context for Clickhouse node - securityContext: - enabled: true - runAsUser: 101 - runAsGroup: 101 - fsGroup: 101 - fsGroupChangePolicy: OnRootMismatch - - # -- An allowlist of IP addresses or network masks the ClickHouse user is - # allowed to access from. By default anything within a private network will be - # allowed. This should suffice for most use case although to expose to other - # networks you will need to update this setting. - # - # Refs: - # - https://clickhouse.com/docs/en/operations/settings/settings-users/#user-namenetworks - # - https://en.wikipedia.org/wiki/Reserved_IP_addresses#IPv4 - allowedNetworkIps: - - "10.0.0.0/8" - - "100.64.0.0/10" - - "172.16.0.0/12" - - "192.0.0.0/24" - - "198.18.0.0/15" - - "192.168.0.0/16" - - persistence: - # -- Enable data persistence using PVC for ClickHouseDB data. - enabled: true - - # -- Use a manually managed Persistent Volume and Claim. - # If defined, PVC must be created manually before volume will be bound. - # (only when deploying a single replica). - # - existingClaim: "" - - # -- Persistent Volume Storage Class to use. - # If defined, `storageClassName: `. - # If set to "-", `storageClassName: ""`, which disables dynamic provisioning - # If undefined (the default) or set to `null`, no storageClassName spec is - # set, choosing the default provisioner. - # - storageClass: null - - # -- Access Modes for persistent volume - accessModes: - - ReadWriteOnce - - # -- Persistent Volume size - size: 20Gi - - # -- Clickhouse user profile configuration. 
- # You can use this to override profile settings, for example - # `default/max_memory_usage: 40000000000` or `default/max_concurrent_queries: 200` - # - # For the full list of settings, see: - # - https://clickhouse.com/docs/en/operations/settings/settings-profiles/ - # - https://clickhouse.com/docs/en/operations/settings/settings/ - # - profiles: {} - - # -- Default user profile configuration for Clickhouse. !!! Please DO NOT override this !!! - defaultProfiles: - default/allow_experimental_window_functions: "1" - default/allow_nondeterministic_mutations: "1" - - # -- Clickhouse init container to copy histogramQuantile UDF - # @default -- See `values.yaml` for defaults - initContainers: - enabled: true - udf: - enabled: true - image: - registry: docker.io - repository: alpine - tag: 3.18.2 - pullPolicy: IfNotPresent - command: - - sh - - -c - - | - set -x - wget -O /tmp/histogramQuantile https://github.com/SigNoz/signoz/raw/develop/deploy/docker/clickhouse-setup/user_scripts/histogramQuantile - mv /tmp/histogramQuantile /var/lib/clickhouse/user_scripts/histogramQuantile - chmod +x /var/lib/clickhouse/user_scripts/histogramQuantile - init: - enabled: false - image: - registry: docker.io - repository: busybox - tag: 1.35 - pullPolicy: IfNotPresent - command: - - /bin/sh - - -c - - | - set -e - until curl -s -o /dev/null http://signoz-clickhouse:8123/ - do sleep 1 - done - - # -- Clickhouse cluster layout. (Experimental, use at own risk) - # For a full list of options, see https://github.com/Altinity/clickhouse-operator/blob/master/docs/custom_resource_explained.md - # section on clusters and layouts. - # - layout: - shardsCount: 1 - replicasCount: 2 - - # -- ClickHouse settings configuration. - # You can use this to override settings, for example `prometheus/port: 9363` - # For the full list of settings, see: - # - https://clickhouse.com/docs/en/operations/settings/settings/ - # - settings: - # Uncomment those lines if you want to enable the built-in Prometheus HTTP endpoint in ClickHouse. - prometheus/endpoint: /metrics - prometheus/port: 9363 - # prometheus/metrics: true - # prometheus/events: true - # prometheus/asynchronous_metrics: true - - # -- Default settings configuration for ClickHouse. !!! Please DO NOT override this !!! - defaultSettings: - format_schema_path: /etc/clickhouse-server/config.d/ - user_scripts_path: /var/lib/clickhouse/user_scripts/ - user_defined_executable_functions_config: '/etc/clickhouse-server/functions/custom-functions.xml' - - # -- ClickHouse pod(s) annotation. - podAnnotations: - signoz.io/scrape: 'true' - signoz.io/port: '9363' - signoz.io/path: /metrics - - # -- Topologies on how to distribute the ClickHouse pod. 
- # Possible values can be found here: - # - https://github.com/Altinity/clickhouse-operator/blob/1414503921da3ae475eb6f9a296d3475a6993768/docs/chi-examples/99-clickhouseinstallation-max.yaml#L428-L481 - podDistribution: [] - # - type: ShardAntiAffinity - # topologyKey: kubernetes.io/hostname - # - type: ReplicaAntiAffinity - # topologyKey: kubernetes.io/hostname - # - type: MaxNumberPerNode - # number: 2 - # topologyKey: kubernetes.io/hostname - - # TODO: Enable cold storage: https://sagebionetworks.jira.com/browse/IBCDPE-1094 - # Cold storage configuration - coldStorage: - # -- Whether to enable S3 cold storage - enabled: false - # -- Reserve free space on default disk (in bytes) - # Default value is 10MiB - defaultKeepFreeSpaceBytes: "10485760" - # -- Type of cold storage: s3 or gcs - type: s3 - # -- Endpoint for S3 or GCS - # For S3, if region is us-east-1, endpoint can be https://s3.amazonaws.com - # if region is not us-east-1, endpoint should be https://s3-.amazonaws.com - # For GCS, endpoint should be https://storage.googleapis.com//data/ - endpoint: https://.s3-.amazonaws.com/data/ - # -- Access Key for S3 or GCS - # accessKey: - # -- Secret Access Key for S3 or GCS - # secretAccess: - # AWS role configuration - to use environment variables instead of passing access and secret keys - role: - # -- Whether to enable AWS IAM ARN role. - enabled: false - # -- Annotations to use by service account associated to Clickhouse instance - annotations: - # aws role arn - eks.amazonaws.com/role-arn: arn:aws:iam::******:role/***** - - # -- Clickhouse configuration files. - # - # Refs: - # - https://clickhouse.com/docs/en/operations/configuration-files/ - # - https://github.com/Altinity/clickhouse-operator/blob/master/docs/chi-examples/05-settings-05-files-nested.yaml - files: {} - # config.d/log_rotation.xml: | - # - # - # trace - # true - # /var/log/clickhouse-server/clickhouse-server.err.log - # /var/log/clickhouse-server/clickhouse-server.log - # 100M - # 10 - # - # - # test.xml: | - # - # some-value - # - - ### - ### - ### ---- MISC ---- - ### - ### - - # -- When the `installCustomStorageClass` is enabled with `cloud` set as `gcp` or `aws`, - # it creates custom storage class with volume expansion permission. - installCustomStorageClass: false - - ### - ### - ### ---- CLICKHOUSE OPERATOR ---- - ### - ### - clickhouseOperator: - # -- name of the component - name: operator - - # -- Version of the operator - version: 0.21.2 - - # -- Clickhouse Operator image - image: - # -- Clickhouse Operator image registry to use. - registry: docker.io - # -- Clickhouse Operator image repository to use. - repository: altinity/clickhouse-operator - # -- Clickhouse Operator image tag. - tag: 0.21.2 - # -- Clickhouse Operator image pull policy. - pullPolicy: IfNotPresent - - # -- Image Registry Secret Names for Clickhouse Operator. - # If global.imagePullSecrets is set as well, it will merged. - imagePullSecrets: [] - # - "clickhouseOperator-pull-secret" - - # ClickHouse Operator Service Account - serviceAccount: - # -- Specifies whether a service account should be created - create: true - # -- Annotations to add to the service account - annotations: {} - # -- The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: - - # Clickhouse logging config - logger: - # -- Logging level. Acceptable values: trace, debug, information, warning, error. - level: information - # -- Size of the file. Applies to log and errorlog. 
Once the file reaches size, - # ClickHouse archives and renames it, and creates a new log file in its place. - size: 1000M - # -- The number of archived log files that ClickHouse stores. - count: 10 - # -- Whether to send log and errorlog to the console instead of file. To enable, set to 1 or true. - console: 1 - - # Query Log table configuration - queryLog: - # -- The number of days to keep the data in the query_log table. - ttl: 30 - # -- Time interval in milliseconds between flushes of the query_log table. - flushInterval: 7500 - # Part Log table configuration - partLog: - # -- The number of days to keep the data in the part_log table. - ttl: 30 - # -- Time interval in milliseconds between flushes of the part_log table. - flushInterval: 7500 - # Trace Log table configuration - traceLog: - # -- The number of days to keep the data in the trace_log table. - ttl: 7 - # -- Time interval in milliseconds between flushes of the trace_log table. - flushInterval: 7500 - - asynchronousInsertLog: - # -- The number of days to keep the data in the asynchronous_insert_log table. - ttl: 7 - # -- Time interval in milliseconds between flushes of the asynchronous_insert_log table. - flushInterval: 7500 - asynchronousMetricLog: - # -- The number of days to keep the data in the asynchronous_metric_log table. - ttl: 30 - # -- Time interval in milliseconds between flushes of the asynchronous_metric_log table. - flushInterval: 7500 - backupLog: - # -- The number of days to keep the data in the backup_log table. - ttl: 7 - # -- Time interval in milliseconds between flushes of the backup_log table. - flushInterval: 7500 - blobStorageLog: - # -- The number of days to keep the data in the blob_storage_log table. - ttl: 30 - # -- Time interval in milliseconds between flushes of the blob_storage_log table. - flushInterval: 7500 - crashLog: - # -- The number of days to keep the data in the crash_log table. - ttl: 30 - # -- Time interval in milliseconds between flushes of the crash_log table. - flushInterval: 7500 - metricLog: - # -- The number of days to keep the data in the metric_log table. - ttl: 30 - # -- Time interval in milliseconds between flushes of the metric_log table. - flushInterval: 7500 - queryThreadLog: - # -- The number of days to keep the data in the query_thread_log table. - ttl: 7 - # -- Time interval in milliseconds between flushes of the query_thread_log table. - flushInterval: 7500 - queryViewsLog: - # -- The number of days to keep the data in the query_views_log table. - ttl: 15 - # -- Time interval in milliseconds between flushes of the query_views_log table. - flushInterval: 7500 - sessionLog: - # -- The number of days to keep the data in the session_log table. - ttl: 30 - # -- Time interval in milliseconds between flushes of the session_log table. - flushInterval: 7500 - zookeeperLog: - # -- The number of days to keep the data in the zookeeper_log table. - ttl: 30 - # -- Time interval in milliseconds between flushes of the zookeeper_log table. - flushInterval: 7500 - processorsProfileLog: - # -- The number of days to keep the data in the processors_profile_log table. - ttl: 7 - # -- Time interval in milliseconds between flushes of the processors_profile_log table. - flushInterval: 7500 - - # -- Clickhouse Operator pod(s) annotation. - podAnnotations: - signoz.io/port: '8888' - signoz.io/scrape: 'true' - - # -- Clickhouse Operator node selector - nodeSelector: {} - - # -- Metrics Exporter config. 
- metricsExporter: - # -- name of the component - name: metrics-exporter - - # -- Metrics Exporter service - service: - # -- Annotations to use by service associated to Metrics Exporter - annotations: {} - # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) - type: ClusterIP - # -- Metrics Exporter port - port: 8888 - - # -- Metrics Exporter image - image: - # -- Metrics Exporter image registry to use. - registry: docker.io - # -- Metrics Exporter image repository to use. - repository: altinity/metrics-exporter - # -- Metrics Exporter image tag. - tag: 0.21.2 - # -- Metrics Exporter image pull policy. - pullPolicy: IfNotPresent - - -## External clickhouse configuration -## This is required when clickhouse.enabled is false -externalClickhouse: - # -- Host of the external cluster. - host: - # -- Name of the external cluster to run DDL queries on. - cluster: cluster - # -- Database name for the external cluster - database: signoz_metrics - # -- Clickhouse trace database (SigNoz Traces) - traceDatabase: signoz_traces - # -- Clickhouse log database (SigNoz Logs) - logDatabase: signoz_logs - # -- User name for the external cluster to connect to the external cluster as - user: "" - # -- Password for the cluster. Ignored if externalClickhouse.existingSecret is set - password: "" - # -- Name of an existing Kubernetes secret object containing the password - existingSecret: - # -- Name of the key pointing to the password in your Kubernetes secret - existingSecretPasswordKey: - # -- Whether to use TLS connection connecting to ClickHouse - secure: false - # -- Whether to verify TLS connection connecting to ClickHouse - verify: false - # -- HTTP port of Clickhouse - httpPort: 8123 - # -- TCP port of Clickhouse - tcpPort: 9000 - -# Default values for query-service -queryService: - name: "query-service" - replicaCount: 1 - image: - registry: docker.io - repository: signoz/query-service - tag: 0.57.0 - pullPolicy: IfNotPresent - - # -- Image Registry Secret Names for Query-Service - # If set, this has higher precedence than the root level or global value of imagePullSecrets. - imagePullSecrets: [] - - # Query-Service Service Account - serviceAccount: - # Specifies whether a service account should be created - create: true - # Annotations to add to the service account - annotations: {} - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: - - # Query-Service service - service: - # -- Annotations to use by service associated to Query-Service - annotations: {} - # -- Labels to use by service associated to Query-Service - labels: {} - # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) - type: ClusterIP - # -- Query-Service HTTP port - port: 8080 - # -- Query-Service Internal port - internalPort: 8085 - # -- Query-Service OpAMP Internal port - opampPort: 4320 - # -- Set this to you want to force a specific nodePort for http. - # Must be use with service.type=NodePort - nodePort: null - # -- Set this to you want to force a specific nodePort for internal. 
- # Must be use with service.type=NodePort - internalNodePort: null - - # -- Query-Service annotations - annotations: {} - - # -- Query-Service additional arguments for command line - additionalArgs: - - --use-logs-new-schema=true - - # -- Additional environments to set for queryService - additionalEnvs: {} - # env_key: env_value - - initContainers: - init: - enabled: true - image: - registry: docker.io - repository: busybox - tag: 1.35 - pullPolicy: IfNotPresent - command: - delay: 5 - endpoint: /ping - waitMessage: "waiting for clickhouseDB" - doneMessage: "clickhouse ready, starting query service now" - resources: {} - # requests: - # cpu: 100m - # memory: 100Mi - # limits: - # cpu: 100m - # memory: 100Mi - migration: - enabled: false - image: - registry: docker.io - repository: busybox - tag: 1.35 - pullPolicy: IfNotPresent - args: [] - command: [] - # - sh - # - -c - # - | - # echo "Running migration" - # sleep 10 # Replace with actual migration command - # echo "Migration completed" - resources: {} - # requests: - # cpu: 100m - # memory: 100Mi - # limits: - # cpu: 100m - # memory: 100Mi - - configVars: - storage: clickhouse - # ClickHouse URL is set and applied internally. - # Don't override unless you know what you are doing. - # clickHouseUrl: tcp://clickhouse_operator:clickhouse_operator_password@my-release-clickhouse:9000/signoz_traces - goDebug: netdns=go - telemetryEnabled: true - deploymentType: kubernetes-helm - - # Query-Service cache options - cache: - # -- Whether to enable cache for Query-Service - enabled: true - # -- Cache flux interval for Query-Service - fluxInterval: 30m - # -- Cache configurations for Query-Service - config: - name: cache - provider: inmemory - inmemory: - ttl: 168h - - podSecurityContext: {} - # fsGroup: 2000 - - securityContext: {} - # capabilities: - # drop: - # - ALL - # readOnlyRootFilesystem: true - # runAsNonRoot: true - # runAsUser: 1000 - - # -- Configure liveness and readiness probes. - # ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes - livenessProbe: - enabled: true - port: http - path: /api/v1/health - initialDelaySeconds: 5 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 6 - successThreshold: 1 - readinessProbe: - enabled: true - port: http - path: /api/v1/health?live=1 - initialDelaySeconds: 5 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 6 - successThreshold: 1 - - # -- Custom liveness probe - customLivenessProbe: {} - # -- Custom readiness probe - customReadinessProbe: {} - - ingress: - # -- Enable ingress for Query-Service - enabled: false - # -- Ingress Class Name to be used to identify ingress controllers - className: "" - # -- Annotations to Query-Service Ingress - annotations: {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - # cert-manager.io/cluster-issuer: letsencrypt-prod - # -- Query-Service Ingress Host names with their path details - hosts: - - host: query-service.domain.com - paths: - - path: / - pathType: ImplementationSpecific - port: 8080 - # -- Query-Service Ingress TLS - tls: [] - # - secretName: chart-example-tls - # hosts: - # - query-service.domain.com - - # -- Configure resource requests and limits. Update according to your own use - # case as these values might not be suitable for your workload. 
- # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ - # - # @default -- See `values.yaml` for defaults - resources: - requests: - cpu: 100m - memory: 100Mi - # limits: - # cpu: 750m - # memory: 1000Mi - - # -- QueryService priority class name - priorityClassName: "" - # -- Node selector for settings for QueryService pod - nodeSelector: {} - # -- Toleration labels for QueryService pod assignment - tolerations: [] - # -- Affinity settings for QueryService pod - affinity: {} - # -- TopologySpreadConstraints describes how QueryService pods ought to spread - topologySpreadConstraints: [] - - persistence: - # -- Enable data persistence using PVC for SQLiteDB data. - enabled: true - - # -- Name of an existing PVC to use (only when deploying a single replica) - existingClaim: "" - - # -- Persistent Volume Storage Class to use. - # If defined, `storageClassName: `. - # If set to "-", `storageClassName: ""`, which disables dynamic provisioning - # If undefined (the default) or set to `null`, no storageClassName spec is - # set, choosing the default provisioner. - # - storageClass: null - - # -- Access Modes for persistent volume - accessModes: - - ReadWriteOnce - - # -- Persistent Volume size - size: 1Gi - - -# Default values for frontend -frontend: - name: "frontend" - replicaCount: 1 - - image: - registry: docker.io - repository: signoz/frontend - tag: 0.57.0 - pullPolicy: IfNotPresent - - # -- Image Registry Secret Names for Frontend - # If set, this has higher precedence than the root level or global value of imagePullSecrets. - imagePullSecrets: [] - - # Frontend Service Account - serviceAccount: - # Specifies whether a service account should be created - create: true - # Annotations to add to the service account - annotations: {} - # The name of the service account to use. 
- # If not set and create is true, a name is generated using the fullname template - name: - - # Frontend service - service: - # -- Annotations to use by service associated to Frontend - annotations: {} - # -- Labels to use by service associated to Frontend - labels: {} - # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) - type: ClusterIP - # -- Frontend HTTP port - port: 3301 - - initContainers: - init: - enabled: true - image: - registry: docker.io - repository: busybox - tag: 1.35 - pullPolicy: IfNotPresent - command: - delay: 5 - endpoint: /api/v1/health?live=1 - waitMessage: "waiting for query-service" - doneMessage: "query-service ready, starting frontend now" - resources: {} - # requests: - # cpu: 100m - # memory: 100Mi - # limits: - # cpu: 100m - # memory: 100Mi - - autoscaling: - enabled: false - minReplicas: 1 - maxReplicas: 11 - targetCPUUtilizationPercentage: 50 - targetMemoryUtilizationPercentage: 50 - behavior: {} - # scaleDown: - # stabilizationWindowSeconds: 300 - # policies: - # - type: Pods - # value: 1 - # periodSeconds: 180 - # scaleUp: - # stabilizationWindowSeconds: 300 - # policies: - # - type: Pods - # value: 2 - # periodSeconds: 60 - - autoscalingTemplate: [] - keda: - enabled: false - pollingInterval: "30" # check 30sec periodically for metrics data - cooldownPeriod: "300" # once the load decreased, it will wait for 5 min and downscale - minReplicaCount: "1" # should be >= replicaCount specified in values.yaml - maxReplicaCount: "5" - triggers: - - type: memory - metadata: - type: Utilization - value: "80" # hpa make sure average Utilization <=80 by adding new pods - - type: cpu - metadata: - type: Utilization - value: "80" # hpa make sure average Utlization <=80 by adding new pods - - configVars: {} - - # -- Frontend deployment annotations - annotations: {} - - # -- Frontend pod security context - podSecurityContext: {} - # fsGroup: 2000 - - securityContext: {} - # capabilities: - # drop: - # - ALL - # readOnlyRootFilesystem: true - # runAsNonRoot: true - # runAsUser: 1000 - - ingress: - # -- Enable ingress for Frontend - enabled: false - # -- Ingress Class Name to be used to identify ingress controllers - className: "" - # -- Annotations to Frontend Ingress - annotations: {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - # cert-manager.io/cluster-issuer: letsencrypt-prod - # -- Frontend Ingress Host names with their path details - hosts: - - host: frontend.domain.com - paths: - - path: / - pathType: ImplementationSpecific - port: 3301 - # -- Frontend Ingress TLS - tls: [] - # - secretName: chart-example-tls - # hosts: - # - frontend.domain.com - - # -- Frontend Nginx extra configurations - nginxExtraConfig: | - client_max_body_size 24M; - large_client_header_buffers 8 16k; - - # -- Configure resource requests and limits. Update according to your own use - # case as these values might not be suitable for your workload. 
- # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ - # - # @default -- See `values.yaml` for defaults - resources: - requests: - cpu: 100m - memory: 100Mi - # limits: - # cpu: 200m - # memory: 200Mi - - # -- Frontend priority class name - priorityClassName: "" - # -- Node selector for settings for Frontend pod - nodeSelector: {} - # -- Toleration labels for Frontend pod assignment - tolerations: [] - # -- Affinity settings for Frontend pod - affinity: {} - # -- TopologySpreadConstraints describes how Frontend pods ought to spread - topologySpreadConstraints: [] - -# Default values for Alertmanager -alertmanager: - enabled: - name: "alertmanager" - replicaCount: 1 - - image: - registry: docker.io - repository: signoz/alertmanager - pullPolicy: IfNotPresent - # Overrides the image tag whose default is the chart appVersion. - tag: 0.23.7 - - # -- Image Registry Secret Names for Alertmanager - # If set, this has higher precedence than the root level or global value of imagePullSecrets. - imagePullSecrets: [] - - # -- Alertmanager custom command override - command: [] - # -- Alertmanager extra Arguments - extraArgs: {} - - # Alertmanager Service Account - serviceAccount: - # Specifies whether a service account should be created - create: true - # Annotations to add to the service account - annotations: {} - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: - - # Alertmanager service - service: - # -- Annotations to use by service associated to Alertmanager - annotations: {} - # -- Labels to use by service associated to Alertmanager - labels: {} - # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) - type: ClusterIP - # -- Alertmanager HTTP port - port: 9093 - # -- Alertmanager cluster port - clusterPort: 9094 - # -- Set this to you want to force a specific nodePort. 
Must be use with service.type=NodePort - nodePort: null - - # -- Additional environments to set for Alertmanager - additionalEnvs: - ALERTMANAGER_SMTP_FROM: - ALERTMANAGER_SMTP_HOST: email-smtp.us-east-1.amazonaws.com - # 587 is the STARTTLS port for SMTP - # https://docs.aws.amazon.com/ses/latest/dg/smtp-connect.html#smtp-connect-starttls - ALERTMANAGER_SMTP_PORT: "587" - ALERTMANAGER_SMTP_AUTH_USERNAME: - ALERTMANAGER_SMTP_AUTH_PASSWORD: - - initContainers: - init: - enabled: true - image: - registry: docker.io - repository: busybox - tag: 1.35 - pullPolicy: IfNotPresent - command: - delay: 5 - endpoint: /api/v1/health?live=1 - waitMessage: "waiting for query-service" - doneMessage: "query-service ready, starting alertmanager now" - resources: {} - # requests: - # cpu: 100m - # memory: 100Mi - # limits: - # cpu: 100m - # memory: 100Mi - - podSecurityContext: - fsGroup: 65534 - dnsConfig: {} - # nameservers: - # - 1.2.3.4 - # searches: - # - ns1.svc.cluster-domain.example - # - my.dns.search.suffix - # options: - # - name: ndots - # value: "2" - # - name: edns0 - securityContext: - # capabilities: - # drop: - # - ALL - # readOnlyRootFilesystem: true - runAsUser: 65534 - runAsNonRoot: true - runAsGroup: 65534 - - additionalPeers: [] - - livenessProbe: - httpGet: - path: / - port: http - - readinessProbe: - httpGet: - path: / - port: http - - ingress: - # -- Enable ingress for Alertmanager - enabled: false - # -- Ingress Class Name to be used to identify ingress controllers - className: "" - # -- Annotations to Alertmanager Ingress - annotations: {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - # cert-manager.io/cluster-issuer: letsencrypt-prod - # -- Alertmanager Ingress Host names with their path details - hosts: - - host: alertmanager.domain.com - paths: - - path: / - pathType: ImplementationSpecific - port: 9093 - # -- Alertmanager Ingress TLS - tls: [] - # - secretName: chart-example-tls - # hosts: - # - alertmanager.domain.com - - # -- Configure resource requests and limits. Update according to your own use - # case as these values might not be suitable for your workload. - # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ - # - # @default -- See `values.yaml` for defaults - resources: - requests: - cpu: 100m - memory: 100Mi - # limits: - # cpu: 200m - # memory: 200Mi - - # -- Alertmanager priority class name - priorityClassName: "" - # -- Node selector for settings for Alertmanager pod - nodeSelector: { - spotinst.io/node-lifecycle: "od" - } - # -- Toleration labels for Alertmanager pod assignment - tolerations: [] - # -- Affinity settings for Alertmanager pod - affinity: {} - # -- TopologySpreadConstraints describes how Alertmanager pods ought to spread - topologySpreadConstraints: [] - - statefulSet: - annotations: {} - - podAnnotations: {} - podLabels: {} - - # Ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/ - podDisruptionBudget: {} - # maxUnavailable: 1 - # minAvailable: 1 - - persistence: - # -- Enable data persistence using PVC for Alertmanager data. - enabled: true - - # -- Name of an existing PVC to use (only when deploying a single replica) - existingClaim: "" - - # -- Persistent Volume Storage Class to use. - # If defined, `storageClassName: `. - # If set to "-", `storageClassName: ""`, which disables dynamic provisioning - # If undefined (the default) or set to `null`, no storageClassName spec is - # set, choosing the default provisioner. 
- # - storageClass: null - - # -- Access Modes for persistent volume - accessModes: - - ReadWriteOnce - - # -- Persistent Volume size - size: 100Mi - - ## Using the config, alertmanager.yml file is created. - ## We no longer need the config file as query services - ## delivers the required config. - # config: - # global: - # resolve_timeout: 1m - # slack_api_url: 'https://hooks.slack.com/services/xxx' - - # templates: - # - '/etc/alertmanager/*.tmpl' - - # receivers: - # - name: 'slack-notifications' - # slack_configs: - # - channel: '#alerts' - # send_resolved: true - # icon_url: https://avatars3.githubusercontent.com/u/3380462 - # title: '{{ template "slack.title" . }}' - # text: '{{ template "slack.text" . }}' - - # route: - # receiver: 'slack-notifications' - - ## Templates are no longer needed as they are included - ## from frontend placeholder while creating alert channels. - # templates: - # title.tmpl: |- - # {{ define "slack.title" }} - # [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} - # {{- if gt (len .CommonLabels) (len .GroupLabels) -}} - # {{" "}}( - # {{- with .CommonLabels.Remove .GroupLabels.Names }} - # {{- range $index, $label := .SortedPairs -}} - # {{ if $index }}, {{ end }} - # {{- $label.Name }}="{{ $label.Value -}}" - # {{- end }} - # {{- end -}} - # ) - # {{- end }} - # {{ end }} - # text.tmpl: |- - # {{ define "slack.text" }} - # {{ range .Alerts -}} - # *Alert:* {{ .Labels.alertname }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} - - # *Summary:* {{ .Annotations.summary }} - # *Description:* {{ .Annotations.description }} - - # *Details:* - # {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` - # {{ end }} - # {{ end }} - # {{ end }} - - ## Monitors ConfigMap changes and POSTs to a URL - ## Ref: https://github.com/jimmidyson/configmap-reload - ## - configmapReload: - ## If false, the configmap-reload container will not be deployed - ## - enabled: false - - ## configmap-reload container name - ## - name: configmap-reload - - ## configmap-reload container image - ## - image: - repository: jimmidyson/configmap-reload - tag: v0.5.0 - pullPolicy: IfNotPresent - - # containerPort: 9533 - - # -- Configure resource requests and limits. Update as per your need. - # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ - # - # @default -- See `values.yaml` for defaults - resources: - requests: - cpu: 100m - memory: 100Mi - # limits: - # cpu: 200m - # memory: 200Mi - -# Default values for schemaMigrator -schemaMigrator: - enabled: true - name: "schema-migrator" - - image: - registry: docker.io - repository: signoz/signoz-schema-migrator - tag: 0.111.5 - pullPolicy: IfNotPresent - - args: - - "--up=" - # For usual Helm installs, we don't need any additional annotations. - # As well as for Helm upgrade (with upgradeHelmHooks to true), we automatically include the required pre-upgrade helm hooks. - # For ArgoCD, since every upgrade is an install, we need to automatically include the relevant ArgoCD hooks using upgradeHelmHooks. - annotations: {} - # In Helm, this is needed to apply helm hooks for pre-upgrade, delete policy and hook weight. - # For ArgoCD, this is needed to apply the sync wave - ArgoCD equivalent of hook weight. 
- upgradeHelmHooks: false - - # -- Whether to enable replication for schemaMigrator - enableReplication: true - - # -- Node selector for settings for schemaMigrator - nodeSelector: {} - # -- Toleration labels for schemaMigrator assignment - tolerations: [] - # -- Affinity settings for schemaMigrator - affinity: {} - # -- TopologySpreadConstraints describes how schemaMigrator pods ought to spread - topologySpreadConstraints: [] - - initContainers: - init: - enabled: true - image: - registry: docker.io - repository: busybox - tag: 1.35 - pullPolicy: IfNotPresent - command: - delay: 5 - endpoint: /ping - waitMessage: "waiting for clickhouseDB" - doneMessage: "clickhouse ready, starting schema migrator now" - resources: {} - # requests: - # cpu: 100m - # memory: 100Mi - # limits: - # cpu: 100m - # memory: 100Mi - chReady: - enabled: true - image: - registry: docker.io - repository: clickhouse/clickhouse-server - tag: 24.1.2-alpine - pullPolicy: IfNotPresent - command: - - "sh" - - "-c" - - | - echo "Running clickhouse ready check" - while true - do - version="$(CLICKHOUSE_VERSION)" - shards="$(CLICKHOUSE_SHARDS)" - replicas="$(CLICKHOUSE_REPLICAS)" - current_version="$(clickhouse client --host ${CLICKHOUSE_HOST} --port ${CLICKHOUSE_PORT} --user "${CLICKHOUSE_USER}" --password "${CLICKHOUSE_PASSWORD}" -q "SELECT version()")" - if [ -z "$current_version" ]; then - echo "waiting for clickhouse to be ready" - sleep 5 - continue - fi - if [ -z "$(echo "$current_version" | grep "$version")" ]; then - echo "expected version: $version, current version: $current_version" - echo "waiting for clickhouse with correct version" - sleep 5 - continue - fi - current_shards="$(clickhouse client --host ${CLICKHOUSE_HOST} --port ${CLICKHOUSE_PORT} --user "${CLICKHOUSE_USER}" --password "${CLICKHOUSE_PASSWORD}" -q "SELECT count(DISTINCT(shard_num)) FROM system.clusters WHERE cluster = '${CLICKHOUSE_CLUSTER}'")" - if [ -z "$current_shards" ]; then - echo "waiting for clickhouse to be ready" - sleep 5 - continue - fi - if [ "$current_shards" -ne "$shards" ]; then - echo "expected shard count: $shards, current shard count: $current_shards" - echo "waiting for clickhouse with correct shard count" - sleep 5 - continue - fi - current_replicas="$(clickhouse client --host ${CLICKHOUSE_HOST} --port ${CLICKHOUSE_PORT} --user "${CLICKHOUSE_USER}" --password "${CLICKHOUSE_PASSWORD}" -q "SELECT count(DISTINCT(replica_num)) FROM system.clusters WHERE cluster = '${CLICKHOUSE_CLUSTER}'")" - if [ -z "$current_replicas" ]; then - echo "waiting for clickhouse to be ready" - sleep 5 - continue - fi - if [ "$current_replicas" -ne "$replicas" ]; then - echo "expected replica count: $replicas, current replica count: $current_replicas" - echo "waiting for clickhouse with correct replica count" - sleep 5 - continue - fi - break - done - echo "clickhouse ready, starting schema migrator now" - resources: {} - # requests: - # cpu: 100m - # memory: 100Mi - # limits: - # cpu: 100m - # memory: 100Mi - wait: - enabled: true - image: - registry: docker.io - repository: groundnuty/k8s-wait-for - tag: v2.0 - pullPolicy: IfNotPresent - env: [] - - # SchemaMigrator Service Account - serviceAccount: - # Specifies whether a service account should be created - create: true - # Annotations to add to the service account - annotations: {} - # The name of the service account to use. 
- # If not set and create is true, a name is generated using the fullname template - name: - - # SchemaMigrator RBAC config - role: - # -- Specifies whether a clusterRole should be created - create: true - # -- Annotations to add to the clusterRole - annotations: {} - # -- The name of the clusterRole to use. - # If not set and create is true, a name is generated using the fullname template - name: "" - # -- A set of rules as documented here. - # ref: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ - # @default -- See `values.yaml` for defaults - rules: - - apiGroups: ["batch"] - resources: ["jobs"] - verbs: ["get", "list", "watch"] - - # SchemaMigrator clusterRoleBinding - roleBinding: - # Annotations to add to the clusterRoleBinding - annotations: {} - # The name of the clusterRoleBinding to use. - # If not set and create is true, a name is generated using the fullname template - name: "" - -# Default values for OtelCollector -otelCollector: - name: "otel-collector" - image: - registry: docker.io - repository: signoz/signoz-otel-collector - tag: 0.111.5 - pullPolicy: IfNotPresent - - # -- Image Registry Secret Names for OtelCollector - # If set, this has higher precedence than the root level or global value of imagePullSecrets. - imagePullSecrets: [] - - initContainers: - init: - enabled: false - image: - registry: docker.io - repository: busybox - tag: 1.35 - pullPolicy: IfNotPresent - command: - delay: 5 - endpoint: /ping - waitMessage: "waiting for clickhouseDB" - doneMessage: "clickhouse ready, starting otel collector now" - resources: {} - # requests: - # cpu: 100m - # memory: 100Mi - # limits: - # cpu: 100m - # memory: 100Mi - - # OpenTelemetry Collector executable - command: - # -- OtelCollector command name - name: /signoz-collector - # -- OtelCollector command extra arguments - extraArgs: - - --feature-gates=-pkg.translator.prometheus.NormalizeName - - configMap: - # -- Specifies whether a configMap should be created (true by default) - create: true - - # OtelCollector Service Account - serviceAccount: - # Specifies whether a service account should be created - create: true - # Annotations to add to the service account - annotations: {} - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: - - # OtelCollector service - service: - # -- Annotations to use by service associated to OtelCollector - annotations: {} - # -- Labels to use by service associated to OtelCollector - labels: {} - # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) - type: ClusterIP - - # -- OtelCollector Deployment annotation. - annotations: {} - # -- OtelCollector pod(s) annotation. - podAnnotations: - signoz.io/scrape: 'true' - signoz.io/port: '8888' - - # -- OtelCollector pod(s) labels. - podLabels: {} - - # -- Additional environments to set for OtelCollector - additionalEnvs: {} - # env_key: env_value - - # -- Whether to enable grouping of exceptions with same name and different stack trace. - # This is useful when you have a lot of exceptions with same name but different stack trace. - # This is a tradeoff between cardinality and accuracy of exception grouping. 
- lowCardinalityExceptionGrouping: false - - minReadySeconds: 5 - progressDeadlineSeconds: 600 - replicaCount: 2 - - # OtelCollector RBAC config - clusterRole: - # -- Specifies whether a clusterRole should be created - create: true - # -- Annotations to add to the clusterRole - annotations: {} - # -- The name of the clusterRole to use. - # If not set and create is true, a name is generated using the fullname template - name: "" - # -- A set of rules as documented here. - # ref: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ - # @default -- See `values.yaml` for defaults - rules: - # k8sattributes processor requires these permissions - - apiGroups: [""] - resources: ["pods", "namespaces", "nodes"] - verbs: ["get", "list", "watch"] - - apiGroups: ["apps"] - resources: ["replicasets"] - verbs: ["get", "list", "watch"] - - apiGroups: ["extensions"] - resources: ["replicasets"] - verbs: ["get", "list", "watch"] - - apiGroups: ["batch"] - resources: ["jobs"] - verbs: ["get", "list", "watch"] - - # OtelCollector clusterRoleBinding - clusterRoleBinding: - # Annotations to add to the clusterRoleBinding - annotations: {} - # The name of the clusterRoleBinding to use. - # If not set and create is true, a name is generated using the fullname template - name: "" - - # Configuration for ports - ports: - otlp: - # -- Whether to enable service port for OTLP gRPC - enabled: true - # -- Container port for OTLP gRPC - containerPort: 4317 - # -- Service port for OTLP gRPC - servicePort: 4317 - # -- Node port for OTLP gRPC - nodePort: "" - # -- Protocol to use for OTLP gRPC - protocol: TCP - otlp-http: - # -- Whether to enable service port for OTLP HTTP - enabled: true - # -- Container port for OTLP HTTP - containerPort: 4318 - # -- Service port for OTLP HTTP - servicePort: 4318 - # -- Node port for OTLP HTTP - nodePort: "" - # -- Protocol to use for OTLP HTTP - protocol: TCP - jaeger-compact: - # -- Whether to enable service port for Jaeger Compact - enabled: false - # -- Container port for Jaeger Compact - containerPort: 6831 - # -- Service port for Jaeger Compact - servicePort: 6831 - # -- Node port for Jaeger Compact - nodePort: "" - # -- Protocol to use for Jaeger Compact - protocol: UDP - jaeger-thrift: - # -- Whether to enable service port for Jaeger Thrift HTTP - enabled: false - # -- Container port for Jaeger Thrift - containerPort: 14268 - # -- Service port for Jaeger Thrift - servicePort: 14268 - # -- Node port for Jaeger Thrift - nodePort: "" - # -- Protocol to use for Jaeger Thrift - protocol: TCP - jaeger-grpc: - # -- Whether to enable service port for Jaeger gRPC - enabled: false - # -- Container port for Jaeger gRPC - containerPort: 14250 - # -- Service port for Jaeger gRPC - servicePort: 14250 - # -- Node port for Jaeger gRPC - nodePort: "" - # -- Protocol to use for Jaeger gRPC - protocol: TCP - zipkin: - # -- Whether to enable service port for Zipkin - enabled: false - # -- Container port for Zipkin - containerPort: 9411 - # -- Service port for Zipkin - servicePort: 9411 - # -- Node port for Zipkin - nodePort: "" - # -- Protocol to use for Zipkin - protocol: TCP - prometheus: - # -- Whether to enable service port for SigNoz exported prometheus metrics - enabled: false - # -- Container port for SigNoz exported prometheus metrics - containerPort: 8889 - # -- Service port for SigNoz exported prometheus metrics - servicePort: 8889 - # -- Node port for SigNoz exported prometheus metrics - nodePort: "" - # -- Protocol to use for SigNoz exported prometheus metrics - protocol: TCP - 
metrics: - # -- Whether to enable service port for internal metrics - enabled: true - # -- Container port for internal metrics - containerPort: 8888 - # -- Service port for internal metrics - servicePort: 8888 - # -- Node port for internal metrics - nodePort: "" - # -- Protocol to use for internal metrics - protocol: TCP - zpages: - # -- Whether to enable service port for ZPages - enabled: false - # -- Container port for Zpages - containerPort: 55679 - # -- Service port for Zpages - servicePort: 55679 - # -- Node port for Zpages - nodePort: "" - # -- Protocol to use for Zpages - protocol: TCP - pprof: - # -- Whether to enable service port for pprof - enabled: false - # -- Container port for pprof - containerPort: 1777 - # -- Service port for pprof - servicePort: 1777 - # -- Node port for pprof - nodePort: "" - # -- Protocol to use for pprof - protocol: TCP - logsheroku: - # -- Whether to enable service port for logsheroku - enabled: false - # -- Container port for logsheroku - containerPort: 8081 - # -- Service port for logsheroku - servicePort: 8081 - # -- Node port for logsheroku - nodePort: "" - # -- Protocol to use for logsheroku - protocol: TCP - logsjson: - # -- Whether to enable service port for logsjson - enabled: false - # -- Container port for logsjson - containerPort: 8082 - # -- Service port for logsjson - servicePort: 8082 - # -- Node port for logsjson - nodePort: "" - # -- Protocol to use for logsjson - protocol: TCP - - # -- Configure liveness and readiness probes. - # ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes - livenessProbe: - enabled: true - port: 13133 - path: / - initialDelaySeconds: 5 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 6 - successThreshold: 1 - readinessProbe: - enabled: true - port: 13133 - path: / - initialDelaySeconds: 5 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 6 - successThreshold: 1 - - # -- Custom liveness probe - customLivenessProbe: {} - # -- Custom readiness probe - customReadinessProbe: {} - - # -- Extra volumes mount for OtelCollector pod - extraVolumeMounts: [] - # -- Extra volumes for OtelCollector pod - extraVolumes: [] - - ingress: - # -- Enable ingress for OtelCollector - enabled: false - # -- Ingress Class Name to be used to identify ingress controllers - className: "" - # -- Annotations to OtelCollector Ingress - annotations: {} - # cert-manager.io/cluster-issuer: letsencrypt-prod - # nginx.ingress.kubernetes.io/ssl-redirect: "true" - # nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - # -- OtelCollector Ingress Host names with their path details - hosts: - - host: otelcollector.domain.com - paths: - - path: / - pathType: ImplementationSpecific - port: 4318 - # -- OtelCollector Ingress TLS - tls: [] - # - secretName: chart-example-tls - # hosts: - # - otelcollector.domain.com - - # -- Configure resource requests and limits. Update according to your own use - # case as these values might not be suitable for your workload. 
- # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ - # - # @default -- See `values.yaml` for defaults - resources: - requests: - cpu: 100m - memory: 200Mi - # limits: - # cpu: "1" - # memory: 2Gi - - # -- OtelCollector priority class name - priorityClassName: "" - # -- Node selector for settings for OtelCollector pod - nodeSelector: { - spotinst.io/node-lifecycle: "od" - } - # -- Toleration labels for OtelCollector pod assignment - tolerations: [] - # -- Affinity settings for OtelCollector pod - affinity: {} - # -- TopologySpreadConstraints describes how OtelCollector pods ought to spread - topologySpreadConstraints: - - maxSkew: 1 - topologyKey: kubernetes.io/hostname - whenUnsatisfiable: ScheduleAnyway - labelSelector: - matchLabels: - app.kubernetes.io/component: otel-collector - - podSecurityContext: {} - # fsGroup: 2000 - - securityContext: {} - # capabilities: - # drop: - # - ALL - # readOnlyRootFilesystem: true - # runAsNonRoot: true - # runAsUser: 1000 - - autoscaling: - enabled: false - minReplicas: 1 - maxReplicas: 11 - targetCPUUtilizationPercentage: 50 - targetMemoryUtilizationPercentage: 50 - behavior: {} - # scaleDown: - # stabilizationWindowSeconds: 300 - # policies: - # - type: Pods - # value: 1 - # periodSeconds: 180 - # scaleUp: - # stabilizationWindowSeconds: 300 - # policies: - # - type: Pods - # value: 2 - # periodSeconds: 60 - - autoscalingTemplate: [] - keda: - annotations: - enabled: false - pollingInterval: "30" # check 30sec periodically for metrics data - cooldownPeriod: "300" # once the load decreased, it will wait for 5 min and downscale - minReplicaCount: "1" # should be >= replicaCount specified in values.yaml - maxReplicaCount: "5" - triggers: [] - # - type: memory - # metadata: - # type: Utilization - # value: "80" # hpa make sure average Utilization <=80 by adding new pods - # - type: cpu - # metadata: - # type: Utilization - # value: "80" # hpa make sure average Utlization <=80 by adding new pods - - # -- Configurations for OtelCollector - # @default -- See `values.yaml` for defaults - config: - receivers: - otlp/spanmetrics: - protocols: - grpc: - endpoint: localhost:12345 - otlp: - protocols: - grpc: - endpoint: 0.0.0.0:4317 - max_recv_msg_size_mib: 16 - http: - endpoint: 0.0.0.0:4318 - jaeger: - protocols: - grpc: - endpoint: 0.0.0.0:14250 - thrift_http: - endpoint: 0.0.0.0:14268 - # Uncomment to enable thift_company receiver. 
- # You will also have set set enable it in `otelCollector.ports - # thrift_compact: - # endpoint: 0.0.0.0:6831 - hostmetrics: - collection_interval: 30s - scrapers: - cpu: {} - load: {} - memory: {} - disk: {} - filesystem: {} - network: {} - httplogreceiver/heroku: - # endpoint specifies the network interface and port which will receive data - endpoint: 0.0.0.0:8081 - source: heroku - httplogreceiver/json: - # endpoint specifies the network interface and port which will receive data - endpoint: 0.0.0.0:8082 - source: json - processors: - # default parsing of logs - # logstransform/internal: - # operators: - # - type: regex_parser - # id: traceid - # # https://regex101.com/r/yFW5UC/1 - # regex: '(?i)(^trace|(("| )+trace))((-|_||)id("|=| |-|:)*)(?P[A-Fa-f0-9]+)' - # parse_from: body - # parse_to: attributes.temp_trace - # if: 'body matches "(?i)(^trace|((\"| )+trace))((-|_||)id(\"|=| |-|:)*)(?P[A-Fa-f0-9]+)"' - # output: spanid - # - type: regex_parser - # id: spanid - # # https://regex101.com/r/DZ2gng/1 - # regex: '(?i)(^span|(("| )+span))((-|_||)id("|=| |-|:)*)(?P[A-Fa-f0-9]+)' - # parse_from: body - # parse_to: attributes.temp_trace - # if: 'body matches "(?i)(^span|((\"| )+span))((-|_||)id(\"|=| |-|:)*)(?P[A-Fa-f0-9]+)"' - # output: trace_parser - # - type: trace_parser - # id: trace_parser - # trace_id: - # parse_from: attributes.temp_trace.trace_id - # span_id: - # parse_from: attributes.temp_trace.span_id - # output: remove_temp - # - type: remove - # id: remove_temp - # field: attributes.temp_trace - # if: '"temp_trace" in attributes' - # Batch processor config. - # ref: https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md - batch: - send_batch_size: 50000 - timeout: 1s - # Resource detection processor config. - # ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/resourcedetectionprocessor/README.md - resourcedetection: - # detectors: include ec2/eks for AWS, gcp for GCP and azure/aks for Azure - # env detector included below adds custom labels using OTEL_RESOURCE_ATTRIBUTES envvar - detectors: - - env - # - elastic_beanstalk - # - eks - # - ecs - # - ec2 - # - gcp - # - azure - # - heroku - - system - timeout: 2s - system: - hostname_sources: [dns, os] - # Memory Limiter processor. - # If not set, will be overridden with values based on k8s resource limits. - # ref: https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/memorylimiterprocessor/README.md - # memory_limiter: null - signozspanmetrics/cumulative: - metrics_exporter: clickhousemetricswrite - latency_histogram_buckets: - [ - 100us, - 1ms, - 2ms, - 6ms, - 10ms, - 50ms, - 100ms, - 250ms, - 500ms, - 1000ms, - 1400ms, - 2000ms, - 5s, - 10s, - 20s, - 40s, - 60s, - ] - dimensions_cache_size: 100000 - dimensions: - - name: service.namespace - default: default - - name: deployment.environment - default: default - - name: signoz.collector.id - signozspanmetrics/delta: - metrics_exporter: clickhousemetricswrite - latency_histogram_buckets: - [ - 100us, - 1ms, - 2ms, - 6ms, - 10ms, - 50ms, - 100ms, - 250ms, - 500ms, - 1000ms, - 1400ms, - 2000ms, - 5s, - 10s, - 20s, - 40s, - 60s, - ] - dimensions_cache_size: 100000 - dimensions: - - name: service.namespace - default: default - - name: deployment.environment - default: default - - name: signoz.collector.id - aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA - # K8s Attribute processor config. 
- # ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/k8sattributesprocessor/README.md - k8sattributes: - # -- Whether to detect the IP address of agents and add it as an attribute to all telemetry resources. - # If set to true, Agents will not make any k8s API calls, do any discovery of pods or extract any metadata. - passthrough: false - # -- Filters can be used to limit each OpenTelemetry agent to query pods based on specific - # selector to only dramatically reducing resource requirements for very large clusters. - filter: - # -- Restrict each OpenTelemetry agent to query pods running on the same node - node_from_env_var: K8S_NODE_NAME - pod_association: - - sources: - - from: resource_attribute - name: k8s.pod.ip - - sources: - - from: resource_attribute - name: k8s.pod.uid - - sources: - - from: connection - extract: - metadata: - - k8s.namespace.name - - k8s.pod.name - - k8s.pod.uid - - k8s.pod.start_time - - k8s.deployment.name - - k8s.node.name - extensions: - health_check: - endpoint: 0.0.0.0:13133 - zpages: - endpoint: localhost:55679 - pprof: - endpoint: localhost:1777 - exporters: - clickhousetraces: - datasource: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@${env:CLICKHOUSE_HOST}:${env:CLICKHOUSE_PORT}/${env:CLICKHOUSE_TRACE_DATABASE} - low_cardinal_exception_grouping: ${env:LOW_CARDINAL_EXCEPTION_GROUPING} - clickhousemetricswrite: - endpoint: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@${env:CLICKHOUSE_HOST}:${env:CLICKHOUSE_PORT}/${env:CLICKHOUSE_DATABASE} - timeout: 15s - resource_to_telemetry_conversion: - enabled: true - clickhouselogsexporter: - dsn: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@${env:CLICKHOUSE_HOST}:${env:CLICKHOUSE_PORT}/${env:CLICKHOUSE_LOG_DATABASE} - timeout: 10s - use_new_schema: true - prometheus: - endpoint: 0.0.0.0:8889 - service: - telemetry: - logs: - encoding: json - metrics: - address: 0.0.0.0:8888 - extensions: [health_check, zpages, pprof] - pipelines: - traces: - receivers: [otlp, jaeger] - processors: [signozspanmetrics/cumulative, signozspanmetrics/delta, batch] - exporters: [clickhousetraces] - metrics: - receivers: [otlp] - processors: [batch] - exporters: [clickhousemetricswrite] - metrics/internal: - receivers: [hostmetrics] - processors: [resourcedetection, k8sattributes, batch] - exporters: [clickhousemetricswrite] - logs: - receivers: [otlp, httplogreceiver/heroku, httplogreceiver/json] - processors: [batch] - exporters: [clickhouselogsexporter] - -# Default values for OtelCollectorMetrics -otelCollectorMetrics: - enabled: false - name: "otel-collector-metrics" - image: - registry: docker.io - repository: signoz/signoz-otel-collector - tag: 0.111.5 - pullPolicy: IfNotPresent - - # -- Image Registry Secret Names for OtelCollector - # If set, this has higher precedence than the root level or global value of imagePullSecrets. - imagePullSecrets: [] - - # OpenTelemetry Collector executable - command: - # -- OtelCollectorMetrics command name - name: /signoz-collector - # -- OtelCollectorMetrics command extra arguments - extraArgs: - - --feature-gates=-pkg.translator.prometheus.NormalizeName - - configMap: - # -- Specifies whether a configMap should be created (true by default) - create: true - - # OtelCollectorMetrics Service Account - serviceAccount: - # Specifies whether a service account should be created - create: true - # Annotations to add to the service account - annotations: {} - # The name of the service account to use. 
- # If not set and create is true, a name is generated using the fullname template - name: - - # OtelCollectorMetrics service - service: - # -- Annotations to use by service associated to OtelCollectorMetrics - annotations: {} - # -- Labels to use by service associated to OtelCollectorMetrics - labels: {} - # -- Service Type: LoadBalancer (allows external access) or NodePort (more secure, no extra cost) - type: ClusterIP - - # -- OtelCollectorMetrics Deployment annotation. - annotations: {} - # -- OtelCollectorMetrics pod(s) annotation. - podAnnotations: - signoz.io/scrape: 'true' - signoz.io/port: '8888' - - # -- Additional environments to set for OtelCollectorMetrics - additionalEnvs: {} - # env_key: env_value - - podSecurityContext: {} - # fsGroup: 2000 - - securityContext: {} - # capabilities: - # drop: - # - ALL - # readOnlyRootFilesystem: true - # runAsNonRoot: true - # runAsUser: 1000 - - minReadySeconds: 5 - progressDeadlineSeconds: 600 - replicaCount: 1 - - initContainers: - init: - enabled: false - image: - registry: docker.io - repository: busybox - tag: 1.35 - pullPolicy: IfNotPresent - command: - delay: 5 - endpoint: /ping - waitMessage: "waiting for clickhouseDB" - doneMessage: "clickhouse ready, starting otel collector metrics now" - resources: {} - # requests: - # cpu: 100m - # memory: 100Mi - # limits: - # cpu: 100m - # memory: 100Mi - - # Configuration for ports - ports: - metrics: - # -- Whether to enable service port for internal metrics - enabled: false - # -- Container port for internal metrics - containerPort: 8888 - # -- Service port for internal metrics - servicePort: 8888 - # -- Protocol to use for internal metrics - protocol: TCP - zpages: - # -- Whether to enable service port for ZPages - enabled: false - # -- Container port for Zpages - containerPort: 55679 - # -- Service port for Zpages - servicePort: 55679 - # -- Protocol to use for Zpages - protocol: TCP - health-check: - # -- Whether to enable service port for health check - enabled: true - # -- Container port for health check - containerPort: 13133 - # -- Service port for health check - servicePort: 13133 - # -- Protocol to use for health check - protocol: TCP - pprof: - # -- Whether to enable service port for pprof - enabled: false - # -- Container port for pprof - containerPort: 1777 - # -- Service port for pprof - servicePort: 1777 - # -- Protocol to use for pprof - protocol: TCP - - - ## Configure liveness and readiness probes. 
- ## ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/#configure-probes - ## - livenessProbe: - enabled: true - port: 13133 - path: / - initialDelaySeconds: 5 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 6 - successThreshold: 1 - readinessProbe: - enabled: true - port: 13133 - path: / - initialDelaySeconds: 5 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 6 - successThreshold: 1 - - ## Custom liveness and readiness probes - customLivenessProbe: {} - customReadinessProbe: {} - - # -- Extra volumes mount for OtelCollectorMetrics pod - extraVolumeMounts: [] - # -- Extra volumes for OtelCollectorMetrics pod - extraVolumes: [] - - ingress: - # -- Enable ingress for OtelCollectorMetrics - enabled: false - # -- Ingress Class Name to be used to identify ingress controllers - className: "" - # -- Annotations to OtelCollectorMetrics Ingress - annotations: {} - # cert-manager.io/cluster-issuer: letsencrypt-prod - # nginx.ingress.kubernetes.io/ssl-redirect: "true" - # nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - # -- OtelCollectorMetrics Ingress Host names with their path details - hosts: - - host: otelcollector-metrics.domain.com - paths: - - path: / - pathType: ImplementationSpecific - port: 13133 - # -- OtelCollectorMetrics Ingress TLS - tls: [] - # - secretName: chart-example-tls - # hosts: - # - otelcollector-metrics.domain.com - - # -- Configure resource requests and limits. Update according to your own use - # case as these values might not be suitable for your workload. - # Ref: http://kubernetes.io/docs/user-guide/compute-resources/ - # - # @default -- See `values.yaml` for defaults - resources: - requests: - cpu: 100m - memory: 100Mi - # limits: - # cpu: "1" - # memory: 2Gi - - # -- OtelCollectorMetrics priority class name - priorityClassName: "" - # -- Node selector for settings for OtelCollectorMetrics pod - nodeSelector: {} - # -- Toleration labels for OtelCollectorMetrics pod assignment - tolerations: [] - # -- Affinity settings for OtelCollectorMetrics pod - affinity: {} - # -- TopologySpreadConstraints describes how OtelCollectorMetrics pods ought to spread - topologySpreadConstraints: [] - - # OtelCollectorMetrics RBAC config - clusterRole: - # -- Specifies whether a clusterRole should be created - create: true - # -- Annotations to add to the clusterRole - annotations: {} - # -- The name of the clusterRole to use. - # If not set and create is true, a name is generated using the fullname template - name: "" - # -- A set of rules as documented here. 
- # ref: https://kubernetes.io/docs/reference/access-authn-authz/rbac/ - # @default -- See `values.yaml` for defaults - rules: - # k8sattributes processor requires these permissions - - apiGroups: [""] - resources: ["pods", "namespaces", "nodes"] - verbs: ["get", "watch", "list"] - - apiGroups: ["batch"] - resources: ["jobs"] - verbs: ["get", "list", "watch"] - - apiGroups: ["apps"] - resources: ["replicasets"] - verbs: ["get", "list", "watch"] - - apiGroups: ["extensions"] - resources: ["replicasets"] - verbs: ["get", "list", "watch"] - # other processors and receivers require these permissions - - apiGroups: [""] - resources: ["nodes", "nodes/proxy", "services", "endpoints"] - verbs: ["get", "list", "watch"] - - apiGroups: ["extensions"] - resources: ["ingresses"] - verbs: ["get", "list", "watch"] - - nonResourceURLs: ["/metrics"] - verbs: ["get"] - - # OtelCollectorMetrics clusterRoleBinding - clusterRoleBinding: - # -- Annotations to add to the clusterRoleBinding - annotations: {} - # -- The name of the clusterRoleBinding to use. - # If not set and create is true, a name is generated using the fullname template - name: "" - - # -- Configurations for OtelCollectorMetrics - # @default -- See `values.yaml` for defaults - config: - receivers: - # prometheus scrape config - prometheus: - config: - scrape_configs: - # generic prometheus metrics scraper (scrapped when signoz.io pod annotations are set) - - job_name: "generic-collector" - scrape_interval: 60s - kubernetes_sd_configs: - - role: pod - relabel_configs: - - source_labels: - [__meta_kubernetes_pod_annotation_signoz_io_scrape] - action: keep - regex: true - - source_labels: - [__meta_kubernetes_pod_annotation_signoz_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: - [ - __meta_kubernetes_pod_ip, - __meta_kubernetes_pod_annotation_signoz_io_port, - ] - action: replace - separator: ":" - target_label: __address__ - - target_label: job_name - replacement: generic-collector - # Uncomment line below to include all labels of the pod - # - action: labelmap - # regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] - action: replace - target_label: signoz_k8s_name - - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance] - action: replace - target_label: signoz_k8s_instance - - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component] - action: replace - target_label: signoz_k8s_component - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: k8s_namespace_name - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: k8s_pod_name - - source_labels: [__meta_kubernetes_pod_uid] - action: replace - target_label: k8s_pod_uid - - source_labels: [__meta_kubernetes_pod_container_name] - action: replace - target_label: k8s_container_name - - source_labels: [__meta_kubernetes_pod_container_name] - regex: (.+)-init - action: drop - - source_labels: [__meta_kubernetes_pod_node_name] - action: replace - target_label: k8s_node_name - - source_labels: [__meta_kubernetes_pod_ready] - action: replace - target_label: k8s_pod_ready - - source_labels: [__meta_kubernetes_pod_phase] - action: replace - target_label: k8s_pod_phase - processors: - # Batch processor config. - # ref: https://github.com/open-telemetry/opentelemetry-collector/blob/main/processor/batchprocessor/README.md - batch: - send_batch_size: 10000 - timeout: 1s - # Resource detection processor config. 
- # ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/resourcedetectionprocessor/README.md - resourcedetection: - # detectors: include ec2/eks for AWS, gcp for GCP and azure/aks for Azure - # env detector included below adds custom labels using OTEL_RESOURCE_ATTRIBUTES envvar - detectors: - - env - # - elastic_beanstalk - # - eks - # - ecs - # - ec2 - # - gcp - # - azure - # - heroku - - system - timeout: 2s - system: - hostname_sources: [dns, os] - extensions: - health_check: - endpoint: 0.0.0.0:13133 - zpages: - endpoint: localhost:55679 - pprof: - endpoint: localhost:1777 - exporters: - clickhousemetricswrite: - timeout: 15s - endpoint: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@${env:CLICKHOUSE_HOST}:${env:CLICKHOUSE_PORT}/${env:CLICKHOUSE_DATABASE} - clickhousemetricswrite/hostmetrics: - endpoint: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@${env:CLICKHOUSE_HOST}:${env:CLICKHOUSE_PORT}/${env:CLICKHOUSE_DATABASE} - resource_to_telemetry_conversion: - enabled: true - service: - telemetry: - logs: - encoding: json - metrics: - address: 0.0.0.0:8888 - extensions: [health_check, zpages, pprof] - pipelines: - metrics: - receivers: [prometheus] - processors: [batch] - exporters: [clickhousemetricswrite] - -signoz-otel-gateway: - enabled: false diff --git a/modules/signoz-fluxcd/variables.tf b/modules/signoz-fluxcd/variables.tf deleted file mode 100644 index 1a070655..00000000 --- a/modules/signoz-fluxcd/variables.tf +++ /dev/null @@ -1,73 +0,0 @@ -variable "auto_deploy" { - description = "Auto deploy through ArgoCD" - type = bool - default = false -} - -variable "auto_prune" { - description = "Auto prune through ArgoCD" - type = bool - default = false -} - -variable "git_revision" { - description = "The git revision to deploy" - type = string - default = "main" -} - -variable "argo_deployment_name" { - description = "The name of the ArgoCD deployment, must be globally unique" - type = string -} - -variable "namespace" { - description = "The namespace to deploy into" - type = string -} - - -variable "enable_otel_ingress" { - description = "Enable OpenTelemetry ingress" - type = bool - default = false -} - -variable "gateway_namespace" { - description = "The namespace of the gateway" - type = string -} - -variable "cluster_name" { - description = "EKS cluster name" - type = string -} - -variable "auth0_jwks_uri" { - description = "The JWKS URI for Auth0" - type = string -} - -variable "smtp_user" { - description = "The SMTP user. Required if smtp_user, smtp_password, and smtp_from are set" - type = string - default = "" -} - -variable "smtp_password" { - description = "The SMTP password. Required if smtp_user, smtp_password, and smtp_from are set" - type = string - default = "" -} - -variable "smtp_from" { - description = "The SMTP from address. 
Required if smtp_user, smtp_password, and smtp_from are set" - type = string - default = "" -} - -variable "aws_account_id" { - description = "The AWS account ID" - type = string - default = "" -} diff --git a/modules/signoz-fluxcd/versions.tf b/modules/signoz-fluxcd/versions.tf deleted file mode 100644 index ce834c32..00000000 --- a/modules/signoz-fluxcd/versions.tf +++ /dev/null @@ -1,17 +0,0 @@ -terraform { - required_providers { - aws = { - source = "hashicorp/aws" - version = "~> 5.0" - } - kubernetes = { - source = "hashicorp/kubernetes" - version = "~> 2.0" - } - kubectl = { - source = "gavinbunney/kubectl" - version = "1.14.0" - } - } -} - diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index 4a366f9f..3feca3fe 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -8,104 +8,271 @@ resource "kubernetes_namespace" "signoz" { } } -resource "kubectl_manifest" "signoz-deployment" { +resource "kubectl_manifest" "signoz-helm-repo" { depends_on = [kubernetes_namespace.signoz] yaml_body = < + + + + 10485760 + + + s3 + https://${var.s3_backup_bucket_name}.s3.amazonaws.com/coldstorage/ + true + us-east-1 + + + + + + + default + + + s3 + 0 + 1 + + + 0 + + + + +YAML +} + +resource "kubectl_manifest" "signoz-git-repo" { + depends_on = [kubectl_manifest.signoz-helm-release] + + yaml_body = </data/ endpoint: https://.s3-.amazonaws.com/data/ # -- Access Key for S3 or GCS - accessKey: + # accessKey: # -- Secret Access Key for S3 or GCS - secretAccess: + # secretAccess: # AWS role configuration - to use environment variables instead of passing access and secret keys role: # -- Whether to enable AWS IAM ARN role. @@ -1286,7 +1291,7 @@ schemaMigrator: annotations: {} # In Helm, this is needed to apply helm hooks for pre-upgrade, delete policy and hook weight. # For ArgoCD, this is needed to apply the sync wave - ArgoCD equivalent of hook weight. - upgradeHelmHooks: true + upgradeHelmHooks: false # -- Whether to enable replication for schemaMigrator enableReplication: true diff --git a/modules/signoz/variables.tf b/modules/signoz/variables.tf index 344c8f60..2370bdc6 100644 --- a/modules/signoz/variables.tf +++ b/modules/signoz/variables.tf @@ -70,3 +70,13 @@ variable "smtp_from" { type = string default = "" } + +variable "s3_backup_bucket_name" { + description = "The name of the S3 bucket to use for backups" + type = string +} + +variable "s3_access_role_arn" { + description = "The ARN of the role to use for accessing the S3 bucket" + type = string +} From 530186a1dd5860ad30f1c2a671058cc385ae2bea Mon Sep 17 00:00:00 2001 From: bwmac Date: Wed, 20 Nov 2024 13:48:51 -0500 Subject: [PATCH 119/135] updates s3 readme --- modules/s3-bucket/README.md | 60 +++++-------------------------------- 1 file changed, 7 insertions(+), 53 deletions(-) diff --git a/modules/s3-bucket/README.md b/modules/s3-bucket/README.md index c97150e5..f8770361 100644 --- a/modules/s3-bucket/README.md +++ b/modules/s3-bucket/README.md @@ -14,70 +14,24 @@ Using this module is as simple as calling it in your terraform code: module "my_beautiful_bucket" { source = "../../../modules/s3-bucket" bucket_name = "my-beautiful-bucket" + enable_versioning = false + aws_account_id = var.aws_account_id + cluster_name = var.cluster_name + cluster_oidc_provider_arn = var.cluster_oidc_provider_arn } ``` -You will need to configure access to the bucket. This will involve the definition of `aws_iam_policy`, `aws_iam_role`, and `aws_iam_role_policy_attachment` resources with the necessary permissions for your use case. 
For example (from the `dpe-k8s-deployments` stack): -``` -resource "aws_iam_policy" "my_beautiful_bucket_policy" { - name = "my-beautiful-bucket-access-policy" - description = "Policy to access the my beautiful bucket" - - policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Effect = "Allow" - Action = [ - "s3:ListBucket", - "s3:GetObject", - "s3:PutObject", - "s3:DeleteObject", - ] - Resource = [ - module.my_beautiful_bucket.bucket_arn, - "${module.my_beautiful_bucket.bucket_arn}/*" - ] - } - ] - }) -} - -resource "aws_iam_role" "my_beautiful_bucket_access" { - name = "my-beautiful-bucket-access-role" - description = "Assumed role to access the my beautiful bucket" - - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Action = "sts:AssumeRoleWithWebIdentity" - Effect = "Allow" - Principal = { - Federated = "${var.cluster_oidc_provider_arn}", - } - } - ] - }) -} +The module handles creating the necessary IAM policy, role, and role policy attachment for accessing the bucket and provides the role ARN as an output. -resource "aws_iam_role_policy_attachment" "my_beautiful_bucket_policy_attachment" { - role = aws_iam_role.my_beautiful_bucket_access.name - policy_arn = aws_iam_policy.my_beautiful_bucket_policy.arn -} -``` - -After confirming that the policy and role are configured correctly, you will then need to configure a kubernetes service account bound to the IAM role. This can be done in your application/module code like so: +After confirming that the policy and role are configured correctly, you can either use the ARN directly in your application code or configure a kubernetes service account bound to the IAM role. The latter can be done like so: ``` resource "kubernetes_service_account" "my_beautiful_bucket_service_account" { metadata { name = "my-beautiful-bucket-service-account" namespace = var.namespace annotations = { - "eks.amazonaws.com/role-arn" = "arn:aws:iam::${var.aws_account_id}:role/my-beautiful-bucket-access-role" + "eks.amazonaws.com/role-arn" = "${module.my_beautiful_bucket.iam_role_arn}" } } } ``` - -Finally, you can leverage the newly created service account in your application code by setting `serviceAccountName` to the name of the service account you just created. 
From e5f78691562dab3a54cc479568319650a35ed5cd Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:14:26 -0700 Subject: [PATCH 120/135] Correct resource references --- deployments/spacelift/dpe-k8s/main.tf | 31 ------------------- .../stacks/dpe-k8s-deployments/main.tf | 1 - modules/signoz/main.tf | 4 +-- 3 files changed, 2 insertions(+), 34 deletions(-) diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf index 06bd0bc0..c6ad7e5d 100644 --- a/deployments/spacelift/dpe-k8s/main.tf +++ b/deployments/spacelift/dpe-k8s/main.tf @@ -179,31 +179,6 @@ resource "spacelift_stack_dependency_reference" "cluster-name" { # stack_id = spacelift_stack.k8s-stack.id # } -# resource "spacelift_stack_destructor" "k8s-stack-deployments-destructor" { -# depends_on = [ -# spacelift_stack.k8s-stack, -# spacelift_aws_integration_attachment.k8s-deployments-aws-integration-attachment, -# spacelift_context_attachment.k8s-kubeconfig-hooks, -# spacelift_stack_dependency_reference.cluster-name, -# spacelift_stack_dependency_reference.region-name, -# spacelift_environment_variable.k8s-stack-deployments-environment-variables -# ] - -# stack_id = spacelift_stack.k8s-stack-deployments.id -# } - -# resource "spacelift_stack_destructor" "k8s-stack-destructor" { -# depends_on = [ -# spacelift_aws_integration_attachment.k8s-aws-integration-attachment, -# spacelift_context_attachment.k8s-kubeconfig-hooks, -# spacelift_stack_dependency_reference.cluster-name, -# spacelift_stack_dependency_reference.region-name, -# spacelift_environment_variable.k8s-stack-environment-variables -# ] - -# stack_id = spacelift_stack.k8s-stack.id -# } - resource "spacelift_aws_integration_attachment" "k8s-aws-integration-attachment" { integration_id = var.aws_integration_id stack_id = spacelift_stack.k8s-stack.id @@ -245,12 +220,6 @@ resource "spacelift_stack" "auth0" { ] } -# resource "spacelift_stack_destructor" "auth0-stack-destructor" { -# count = var.deploy_auth0 ? 
1 : 0 -# stack_id = spacelift_stack.auth0[0].id -# } - - resource "spacelift_environment_variable" "auth0-stack-environment-variables" { depends_on = [ spacelift_stack.auth0 diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index a6e61e48..5c75507d 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -109,7 +109,6 @@ module "signoz" { smtp_user = var.smtp_user smtp_from = var.smtp_from auth0_identifier = var.auth0_identifier - aws_account_id = var.aws_account_id s3_backup_bucket_name = module.clickhouse-backup-bucket.bucket_name s3_access_role_arn = module.clickhouse-backup-bucket.access_role_arn } diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index 3feca3fe..ac8ef016 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -132,9 +132,9 @@ spec: - name: FULL_INTERVAL value: "24h" - name: BACKUP_NAME - value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + value: "${var.s3_backup_bucket_name}" - name: S3_BUCKET - value: "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}" + value: "${var.s3_backup_bucket_name}" - name: S3_PATH value: "backup/shard-{shard}" - name: S3_OBJECT_DISK_PATH From 1495779940c188e400d9eade2717292e2791f4c3 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:16:57 -0700 Subject: [PATCH 121/135] Correct arn reference --- modules/s3-bucket/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/s3-bucket/main.tf b/modules/s3-bucket/main.tf index 068b4920..d9cb1768 100644 --- a/modules/s3-bucket/main.tf +++ b/modules/s3-bucket/main.tf @@ -32,8 +32,8 @@ resource "aws_iam_policy" "s3-access-policy" { "s3:DeleteObject", ] Resource = [ - module.clickhouse_backup_bucket.bucket_arn, - "${module.clickhouse_backup_bucket.bucket_arn}/*" + aws_s3_bucket.bucket.arn, + "${aws_s3_bucket.bucket.arn}/*" ] } ] From bcc95d9476b72bda524172696b892826efb5a26f Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:19:39 -0700 Subject: [PATCH 122/135] Shorten iam role name --- modules/s3-bucket/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/s3-bucket/main.tf b/modules/s3-bucket/main.tf index d9cb1768..97994221 100644 --- a/modules/s3-bucket/main.tf +++ b/modules/s3-bucket/main.tf @@ -41,7 +41,7 @@ resource "aws_iam_policy" "s3-access-policy" { } resource "aws_iam_role" "s3-access-iam-role" { - name = "s3-access-role-${var.aws_account_id}-${var.cluster_name}-${var.bucket_name}" + name = "s3-${var.cluster_name}-${var.bucket_name}" description = "Assumed role to access the s3 bucket with the given permissions." 
assume_role_policy = jsonencode({ From a676d851695c0bed2ae1c5d4f92bc12df06efb3a Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:25:44 -0700 Subject: [PATCH 123/135] Correct ingress patches --- modules/signoz/main.tf | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf index ac8ef016..f9bc92b7 100644 --- a/modules/signoz/main.tf +++ b/modules/signoz/main.tf @@ -270,7 +270,19 @@ spec: remoteJWKS: uri: ${var.auth0_jwks_uri} audiences: - - ${var.cluster_name}-telemetry + - ${var.auth0_identifier} + - op: replace + path: /spec/authorization + value: + defaultAction: Deny + rules: + - name: allow + action: Allow + principal: + jwt: + provider: auth0 + scopes: + - write:telemetry YAML } From 6a0bc34da699e9b25cc02a696e1cf7aa0e922e40 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:35:29 -0700 Subject: [PATCH 124/135] Point all to local branch --- .../stacks/dpe-k8s-deployments/main.tf | 20 +++++++++++-------- modules/signoz/README.md | 11 ++++------ modules/signoz/main.tf | 2 ++ 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf index 5c75507d..e4e5a0b9 100644 --- a/deployments/stacks/dpe-k8s-deployments/main.tf +++ b/deployments/stacks/dpe-k8s-deployments/main.tf @@ -1,3 +1,7 @@ +locals { + # git_revision = var.git_revision + git_revision = "schematic-138-cold-storage-and-backups" +} module "sage-aws-eks-autoscaler" { source = "spacelift.io/sagebionetworks/sage-aws-eks-autoscaler/aws" version = "0.9.0" @@ -40,7 +44,7 @@ module "victoria-metrics" { source = "../../../modules/victoria-metrics" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision } module "trivy-operator" { @@ -49,7 +53,7 @@ module "trivy-operator" { version = "0.3.2" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision } module "airflow" { @@ -58,7 +62,7 @@ module "airflow" { version = "0.4.0" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision namespace = "airflow" } @@ -68,7 +72,7 @@ module "postgres-cloud-native-operator" { version = "0.4.0" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision } module "postgres-cloud-native-database" { @@ -77,7 +81,7 @@ module "postgres-cloud-native-database" { version = "0.5.0" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision namespace = "airflow" argo_deployment_name = "airflow-postgres-cloud-native" } @@ -98,7 +102,7 @@ module "signoz" { source = "../../../modules/signoz" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision namespace = "signoz" argo_deployment_name = "signoz" enable_otel_ingress = var.enable_otel_ingress && var.enable_cluster_ingress @@ -121,7 +125,7 @@ module "envoy-gateway" { source = "../../../modules/envoy-gateway" auto_deploy = var.auto_deploy auto_prune = var.auto_prune - git_revision = var.git_revision + git_revision = local.git_revision namespace = "envoy-gateway" argo_deployment_name = "envoy-gateway" 
   cluster_issuer_name = "lets-encrypt-prod"
@@ -136,7 +140,7 @@ module "cert-manager" {
   source = "../../../modules/cert-manager"
   auto_deploy = var.auto_deploy
   auto_prune = var.auto_prune
-  git_revision = var.git_revision
+  git_revision = local.git_revision
   namespace = "cert-manager"
   argo_deployment_name = "cert-manager"
 }
diff --git a/modules/signoz/README.md b/modules/signoz/README.md
index 5f5d0deb..f4bd3e2b 100644
--- a/modules/signoz/README.md
+++ b/modules/signoz/README.md
@@ -5,13 +5,10 @@ SigNoz is an open-source APM. It helps developers monitor their applications &
 troubleshoot problems, an open-source alternative to DataDog, NewRelic, etc.
 
 Open source Application Performance Monitoring (APM) & Observability tool.
-
-## This module is a work in progress (To be completed before production, or determine if not needed)
-A number of items are needed:
-
-- Setting up backups and data retention: https://sagebionetworks.jira.com/browse/IBCDPE-1094
-- Set up accounts and access to the service declaratively
-
+## Initial setup
+After this module is deployed to the kubernetes cluster the accounts in the service need
+to be manually set up. This is only available with an enterprise license of this
+service. Additionally, any dashboards need to be created within the cluster.
 
 ## Setting up SMTP for alertmanager
 Alertmanager is an additional tool that is deployed to the kubernetes cluster that
diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf
index f9bc92b7..0ba191ae 100644
--- a/modules/signoz/main.tf
+++ b/modules/signoz/main.tf
@@ -36,6 +36,8 @@ resource "kubernetes_config_map" "signoz-values" {
 }
 
 resource "kubernetes_service_account" "clickhouse-backup-service-account" {
+  depends_on = [kubernetes_namespace.signoz]
+
   metadata {
     name = "clickhouse-backup-service-account"
     namespace = var.namespace
From c57736281d450bbc48f5113ea7abe86a1962726e Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 20 Nov 2024 12:41:29 -0700
Subject: [PATCH 125/135] Remove extra service account that is not needed

---
 modules/signoz/main.tf | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf
index 0ba191ae..7e3408d9 100644
--- a/modules/signoz/main.tf
+++ b/modules/signoz/main.tf
@@ -35,18 +35,6 @@ resource "kubernetes_config_map" "signoz-values" {
 
 }
 
-resource "kubernetes_service_account" "clickhouse-backup-service-account" {
-  depends_on = [kubernetes_namespace.signoz]
-
-  metadata {
-    name = "clickhouse-backup-service-account"
-    namespace = var.namespace
-    annotations = {
-      "eks.amazonaws.com/role-arn" = var.s3_access_role_arn
-    }
-  }
-}
-
 resource "kubectl_manifest" "signoz-helm-release" {
   depends_on = [kubernetes_namespace.signoz]
From 50ad21fc3d11783dbe6731cb8497d017d8986222 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 20 Nov 2024 12:44:46 -0700
Subject: [PATCH 126/135] Support disabled bucket versioning

---
 modules/s3-bucket/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/s3-bucket/main.tf b/modules/s3-bucket/main.tf
index 97994221..7c2df002 100644
--- a/modules/s3-bucket/main.tf
+++ b/modules/s3-bucket/main.tf
@@ -11,7 +11,7 @@ resource "aws_s3_bucket" "bucket" {
 resource "aws_s3_bucket_versioning" "versioning" {
   bucket = aws_s3_bucket.bucket.id
   versioning_configuration {
-    status = var.enable_versioning ? "Enabled" : "Suspended"
+    status = var.enable_versioning ? "Enabled" : "Disabled"
   }
 }
From 142b78f827a89a44321c09dcff0789ca6b76195a Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 20 Nov 2024 13:19:30 -0700
Subject: [PATCH 127/135] Notes for setup

---
 modules/signoz/README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/modules/signoz/README.md b/modules/signoz/README.md
index f4bd3e2b..82f5cb67 100644
--- a/modules/signoz/README.md
+++ b/modules/signoz/README.md
@@ -6,9 +6,11 @@ SigNoz is an open-source APM. It helps developers monitor their applications
 source Application Performance Monitoring (APM) & Observability tool.
 
 ## Initial setup
-After this module is deployed to the kubernetes cluster the accounts in the service need
-to be manually set up. This is only available with an enterprise license of this
-service. Additionally, any dashboards need to be created within the cluster.
+
+- Accounts in SigNoz need to be manually set up (SSO is only available in the enterprise version)
+- 120 months for "Total Retention Period" and 1 month for "Move to S3" settings should be set
+- Any dashboards need to be copied or set up
+- Alert channels (Email/Slack) need to be set
 
 ## Setting up SMTP for alertmanager
 Alertmanager is an additional tool that is deployed to the kubernetes cluster that
From 4f9206396da3975e966550c7c58b408856ab54f5 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 20 Nov 2024 13:36:43 -0700
Subject: [PATCH 128/135] Point back to main

---
 .../stacks/dpe-k8s-deployments/main.tf | 45 +++++++++----------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf
index e4e5a0b9..931c37b8 100644
--- a/deployments/stacks/dpe-k8s-deployments/main.tf
+++ b/deployments/stacks/dpe-k8s-deployments/main.tf
@@ -1,6 +1,5 @@
 locals {
-  # git_revision = var.git_revision
-  git_revision = "schematic-138-cold-storage-and-backups"
+  git_revision = var.git_revision
 }
 module "sage-aws-eks-autoscaler" {
   source  = "spacelift.io/sagebionetworks/sage-aws-eks-autoscaler/aws"
   version = "0.9.0"
@@ -38,10 +37,10 @@ module "flux-cd" {
 }
 
 module "victoria-metrics" {
-  depends_on = [module.argo-cd]
+  depends_on  = [module.argo-cd]
   # source = "spacelift.io/sagebionetworks/victoria-metrics/aws"
   # version = "0.4.8"
-  source = "../../../modules/victoria-metrics"
+  source      = "../../../modules/victoria-metrics"
   auto_deploy = var.auto_deploy
   auto_prune = var.auto_prune
   git_revision = local.git_revision
 }
 
 module "clickhouse-backup-bucket" {
-  source = "../../../modules/s3-bucket"
-  bucket_name = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}"
-  enable_versioning = false
-  aws_account_id = var.aws_account_id
-  cluster_name = var.cluster_name
+  source            = "../../../modules/s3-bucket"
+  bucket_name       = "clickhouse-backup-${var.aws_account_id}-${var.cluster_name}"
+  enable_versioning = false
+  aws_account_id    = var.aws_account_id
+  cluster_name      = var.cluster_name
   cluster_oidc_provider_arn = var.cluster_oidc_provider_arn
 }
 
 module "signoz" {
   depends_on = [module.argo-cd]
   # source = "spacelift.io/sagebionetworks/postgres-cloud-native-database/aws"
   # version = "0.5.0"
-  source = "../../../modules/signoz"
-  auto_deploy = var.auto_deploy
-  auto_prune = var.auto_prune
-  git_revision = local.git_revision
-  namespace = "signoz"
-  argo_deployment_name = "signoz"
-  enable_otel_ingress = var.enable_otel_ingress && var.enable_cluster_ingress
-  gateway_namespace = "envoy-gateway"
-  cluster_name = var.cluster_name
-  auth0_jwks_uri = var.auth0_jwks_uri
-  smtp_password = var.smtp_password
-  smtp_user = var.smtp_user
-  smtp_from = var.smtp_from
-  auth0_identifier = var.auth0_identifier
+  source                = "../../../modules/signoz"
+  auto_deploy           = var.auto_deploy
+  auto_prune            = var.auto_prune
+  git_revision          = local.git_revision
+  namespace             = "signoz"
+  argo_deployment_name  = "signoz"
+  enable_otel_ingress   = var.enable_otel_ingress && var.enable_cluster_ingress
+  gateway_namespace     = "envoy-gateway"
+  cluster_name          = var.cluster_name
+  auth0_jwks_uri        = var.auth0_jwks_uri
+  smtp_password         = var.smtp_password
+  smtp_user             = var.smtp_user
+  smtp_from             = var.smtp_from
+  auth0_identifier      = var.auth0_identifier
   s3_backup_bucket_name = module.clickhouse-backup-bucket.bucket_name
   s3_access_role_arn = module.clickhouse-backup-bucket.access_role_arn
 }
From 92031b94ee9764fe392b038bc948dfb52eb6a0d8 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 20 Nov 2024 13:44:25 -0700
Subject: [PATCH 129/135] Update comment

---
 deployments/stacks/dpe-k8s-deployments/main.tf | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf
index 931c37b8..1b78a28b 100644
--- a/deployments/stacks/dpe-k8s-deployments/main.tf
+++ b/deployments/stacks/dpe-k8s-deployments/main.tf
@@ -31,9 +31,7 @@ module "argo-cd" {
 
 module "flux-cd" {
   depends_on = [module.sage-aws-eks-autoscaler]
-  # source = "spacelift.io/sagebionetworks/argo-cd/aws"
-  # version = "0.3.1"
-  source = "../../../modules/flux-cd"
+  source     = "../../../modules/flux-cd"
 }
From 499719815a5e3ec213f279709e1dedc9fe58c6ae Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Tue, 19 Nov 2024 09:53:24 -0700
Subject: [PATCH 130/135] [IBCDPE-1095] Use scope based authorization on telemetry upload route (#48)

* Use scope based authorization on telemetry upload route
---
 deployments/spacelift/dpe-k8s/variables.tf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deployments/spacelift/dpe-k8s/variables.tf b/deployments/spacelift/dpe-k8s/variables.tf
index 5a365f56..599801f5 100644
--- a/deployments/spacelift/dpe-k8s/variables.tf
+++ b/deployments/spacelift/dpe-k8s/variables.tf
@@ -192,7 +192,6 @@ variable "auth0_clients" {
 
 variable "auth0_identifier" {
   description = "Auth0 identifier for the created API."
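  # With the empty-string default removed below, auth0_identifier becomes a required input:
  # Terraform will refuse to plan a consuming stack that omits it, rather than silently
  # rendering an empty audience in the telemetry ingress JWT configuration that is
  # populated from var.auth0_identifier.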
   type        = string
-  default     = ""
 }
 
 variable "ses_email_identities" {
From 27097fa5aca94a05c885a5c5a5a802026a3c1cbd Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 20 Nov 2024 13:51:13 -0700
Subject: [PATCH 131/135] Generic policy name

---
 modules/s3-bucket/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/s3-bucket/main.tf b/modules/s3-bucket/main.tf
index 7c2df002..a9d13f22 100644
--- a/modules/s3-bucket/main.tf
+++ b/modules/s3-bucket/main.tf
@@ -17,7 +17,7 @@ resource "aws_iam_policy" "s3-access-policy" {
-  name = "clickhouse-backup-access-policy-${var.aws_account_id}-${var.cluster_name}-${var.bucket_name}"
+  name = "access-policy-${var.aws_account_id}-${var.cluster_name}-${var.bucket_name}"
   description = "Policy to access the s3 bucket"
 
   policy = jsonencode({
From 8b725abab0730ee57a1534ff863fbf2d7f4e994c Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 20 Nov 2024 13:51:44 -0700
Subject: [PATCH 132/135] Default bucket tags

---
 modules/s3-bucket/variables.tf | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/modules/s3-bucket/variables.tf b/modules/s3-bucket/variables.tf
index fb9e9193..25877694 100644
--- a/modules/s3-bucket/variables.tf
+++ b/modules/s3-bucket/variables.tf
@@ -6,7 +6,9 @@ variable "bucket_name" {
 variable "tags" {
   description = "Tags to apply to the S3 bucket"
   type = map(string)
-  default = {}
+  default = {
+    "CostCenter" = "No Program / 000000"
+  }
 }
 
 variable "enable_versioning" {
From 99c45ff0ede1c211ebea50af77777f72322e7cf2 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 20 Nov 2024 13:54:33 -0700
Subject: [PATCH 133/135] Correct path

---
 modules/signoz/main.tf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/signoz/main.tf b/modules/signoz/main.tf
index 7e3408d9..ed1f8f2d 100644
--- a/modules/signoz/main.tf
+++ b/modules/signoz/main.tf
@@ -204,7 +204,7 @@ spec:
       timeout: 5m
       wait: true
       prune: true
-      path: "./modules/signoz-fluxcd/resources-service-scrape"
+      path: "./modules/signoz/resources-service-scrape"
       sourceRef:
         kind: GitRepository
         name: signoz-git-repo
@@ -227,7 +227,7 @@ spec:
       timeout: 5m
       wait: true
      prune: true
-      path: "./modules/signoz-fluxcd/resources-otel-ingress"
+      path: "./modules/signoz/resources-otel-ingress"
       sourceRef:
         kind: GitRepository
         name: signoz-git-repo
From 6204892557f9df732ffe28586e2f7d770b5e396b Mon Sep 17 00:00:00 2001
From: bwmac
Date: Thu, 21 Nov 2024 11:15:37 -0500
Subject: [PATCH 134/135] fixes minor typos

---
 modules/flux-cd/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/flux-cd/README.md b/modules/flux-cd/README.md
index 01e5a5e9..df006cd6 100644
--- a/modules/flux-cd/README.md
+++ b/modules/flux-cd/README.md
@@ -1,14 +1,14 @@
 # Purpose
-This module is used to deploy the `Flux CD` [helm chart](https://fluxcd-community.github.io/helm-charts) to the cluster. [`Flux CD`](https://fluxcd.io/) is a GitOps tool used to manage the application lifecycle on a Kubernetes cluster. It was originally deployed because unlike `Argo CD`, it supports the use of `postRenderers` which are used to apply any additional changes to the application after it has been deployed, and which were needed to be used to deploy the `clickhouse-backup` sidecar container to the `signoz` helm release. We do not plan to move all existing applications to `fluxcd` at this time, but it is available and preferred to be used for any new applications added to the cluster.
+This module is used to deploy the `Flux CD` [helm chart](https://fluxcd-community.github.io/helm-charts) to the cluster. [`Flux CD`](https://fluxcd.io/) is a GitOps tool used to manage the application lifecycle on a Kubernetes cluster. It was originally deployed because, unlike `Argo CD`, it supports the use of `postRenderers`, which are used to apply any additional changes to the application after it has been deployed and were needed to deploy the `clickhouse-backup` sidecar container to the `signoz` helm release. We do not plan to move all existing applications to using `Flux CD` at this time, but it is available and preferred to be used for any new applications added to the cluster.
 
 ## What resources are being deployed through this module
-In addition to a `helm_release` which deploys the `fluxcd` helm chart, this module also creates a `capacitor` resource which is used as the frontend for `fluxcd`.
+In addition to a `helm_release` which deploys the `Flux CD` helm chart, this module also creates a `capacitor` resource which is used as the frontend for `Flux CD`.
 
 ## Accessing the Flux CD UI
-To access the `Flux CD` UI, you only need to port-forward the `capacitor` pod and access it from your browser.
+To access the `Flux CD` UI, you only need to port-forward the `capacitor` pod and access it in your browser.
 
 # Deploying an application with Flux CD
-To deploy an application with `Flux CD`, will need to create a `HelmRepository` resource which points to the helm chart you want to deploy. In that resource definition, you will set the `apiVersion` to `source.toolkit.fluxcd.io/v1` and the `kind` to `HelmRepository`. For example (code from the `signoz` module):
+To deploy an application with `Flux CD`, you will need to create a `HelmRepository` resource which points to the helm chart you want to deploy. In that resource definition, you will set the `apiVersion` to `source.toolkit.fluxcd.io/v1` and the `kind` to `HelmRepository`. For example (code from the `signoz` module):
 
 ```
 resource "kubectl_manifest" "signoz-helm-repo" {
   yaml_body = <<YAML
 apiVersion: source.toolkit.fluxcd.io/v1
 kind: HelmRepository
 metadata:
   name: signoz
   namespace: ${var.namespace}
 spec:
   interval: 5m
   url: https://charts.signoz.io
 YAML
 }
 ```
 
-In the `Deployment` or `HelmRelease` resource, you will need to add a similar configuration, for example (again from the `signoz` module):
+In your `Deployment` or `HelmRelease` resource, you will need to add a similar configuration, for example (again from the `signoz` module):
 ```
 resource "kubectl_manifest" "signoz-helm-release" {
   depends_on = [kubernetes_namespace.signoz]
From ab12fc9d87602ae9f9106f638c38621cad8c938c Mon Sep 17 00:00:00 2001
From: bwmac
Date: Thu, 21 Nov 2024 11:20:07 -0500
Subject: [PATCH 135/135] updates s3 bucket docs

---
 modules/s3-bucket/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/s3-bucket/README.md b/modules/s3-bucket/README.md
index f8770361..44aa0594 100644
--- a/modules/s3-bucket/README.md
+++ b/modules/s3-bucket/README.md
@@ -1,15 +1,15 @@
 # Purpose
-This is a simple module that can be used within applications to deploy an S3 bucket.
+This is a simple module that can be used within applications to create an S3 bucket.
 
 ## WARNING
 If you are tearing down a stack with a deployed S3 Bucket, you will likely encounter an error similar to the following:
 ```
 deleting S3 Bucket (my-beautiful-bucket): operation error S3: DeleteBucket, https response error StatusCode: 409, RequestID: 123, HostID: 123456789+g=, api error BucketNotEmpty: The bucket you tried to delete is not empty. You must delete all versions in the bucket.
 ```
-We have intentionally not handled this behavior as a safeguard against accidental deletion of a bucket that contains important data.
+We have intentionally not handled this behavior as a safeguard against accidental deletion of a bucket that contains important data. If you need to delete the bucket, you will need to manually delete all objects within it. If versioning is enabled for the bucket, you will also need to delete all versions of the objects.
 
 # Usage
-Using this module is as simple as calling it in your terraform code:
+Using this module only requires calling it in your terraform code:
 ```
 module "my_beautiful_bucket" {
   source = "../../../modules/s3-bucket"
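  # A rough sketch of the remaining arguments, mirroring how the clickhouse-backup bucket
  # is created in deployments/stacks/dpe-k8s-deployments: callers pass bucket_name,
  # aws_account_id, cluster_name, and cluster_oidc_provider_arn, with enable_versioning
  # controlling the aws_s3_bucket_versioning status ("Enabled" vs "Disabled").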