diff --git a/README.md b/README.md index 93c6ef3f..fe30a568 100644 --- a/README.md +++ b/README.md @@ -201,3 +201,4 @@ This document describes the abbreviated process below: } ``` - Add a new `spacelift_aws_integration` resources to the `common-resources/aws-integrations` directory. + diff --git a/dev/main.tf b/dev/main.tf index 0b2d4aac..db35d921 100644 --- a/dev/main.tf +++ b/dev/main.tf @@ -8,4 +8,5 @@ resource "spacelift_space" "development" { module "dpe-sandbox-spacelift" { source = "./spacelift/dpe-sandbox" parent_space_id = spacelift_space.development.id + admin_stack_id = var.admin_stack_id } diff --git a/dev/spacelift/dpe-sandbox/main.tf b/dev/spacelift/dpe-sandbox/main.tf index cf38a3d1..55cc7155 100644 --- a/dev/spacelift/dpe-sandbox/main.tf +++ b/dev/spacelift/dpe-sandbox/main.tf @@ -41,6 +41,16 @@ resource "spacelift_stack" "k8s-stack-deployments" { space_id = spacelift_space.dpe-sandbox.id } +# resource "spacelift_stack_dependency" "dependency-on-admin-stack" { +# for_each = { +# k8s-stack = spacelift_stack.k8s-stack, +# k8s-stack-deployments = spacelift_stack.k8s-stack-deployments +# } + +# stack_id = each.value.id +# depends_on_stack_id = var.admin_stack_id +# } + resource "spacelift_context_attachment" "k8s-kubeconfig-hooks" { context_id = "kubernetes-deployments-kubeconfig" stack_id = spacelift_stack.k8s-stack-deployments.id @@ -118,7 +128,7 @@ resource "spacelift_stack_destructor" "k8s-stack-destructor" { resource "spacelift_aws_integration_attachment" "k8s-aws-integration-attachment" { # org-sagebase-dnt-dev-aws-integration - integration_id = "01J3DNYVM4AWWSDY3QEVRMQ076" + integration_id = "01J3R9GX6DC09QV7NV872DDYR3" stack_id = spacelift_stack.k8s-stack.id read = true write = true @@ -126,7 +136,7 @@ resource "spacelift_aws_integration_attachment" "k8s-aws-integration-attachment" resource "spacelift_aws_integration_attachment" "k8s-deployments-aws-integration-attachment" { # org-sagebase-dnt-dev-aws-integration - integration_id = 
"01J3DNYVM4AWWSDY3QEVRMQ076" + integration_id = "01J3R9GX6DC09QV7NV872DDYR3" stack_id = spacelift_stack.k8s-stack-deployments.id read = true write = true diff --git a/dev/spacelift/dpe-sandbox/variables.tf b/dev/spacelift/dpe-sandbox/variables.tf index b6b4a9cf..48f5cf97 100644 --- a/dev/spacelift/dpe-sandbox/variables.tf +++ b/dev/spacelift/dpe-sandbox/variables.tf @@ -10,3 +10,8 @@ variable "tags" { "CostCenter" = "No Program / 000000" } } + +variable "admin_stack_id" { + description = "ID of the admin stack" + type = string +} diff --git a/dev/stacks/dpe-sandbox-k8s-deployments/main.tf b/dev/stacks/dpe-sandbox-k8s-deployments/main.tf index ace5e2f5..5baf50b3 100644 --- a/dev/stacks/dpe-sandbox-k8s-deployments/main.tf +++ b/dev/stacks/dpe-sandbox-k8s-deployments/main.tf @@ -1,10 +1,28 @@ module "sage-aws-eks-autoscaler" { source = "spacelift.io/sagebionetworks/sage-aws-eks-autoscaler/aws" - version = "0.3.2" + version = "0.4.2" + cluster_name = var.cluster_name cluster_name = var.cluster_name private_vpc_subnet_ids = var.private_subnet_ids vpc_id = var.vpc_id node_security_group_id = var.node_security_group_id spotinst_account = var.spotinst_account + # desired_capacity = 2 +} + +module "victoria-metrics" { + source = "spacelift.io/sagebionetworks/victoria-metrics/aws" + version = "0.0.7" +} + +module "trivy-operator" { + source = "spacelift.io/sagebionetworks/trivy-operator/aws" + version = "0.0.12" +} + +module "airflow" { + source = "spacelift.io/sagebionetworks/airflow/aws" + version = "0.0.1" + cluster_name = var.cluster_name } diff --git a/dev/stacks/dpe-sandbox-k8s/main.tf b/dev/stacks/dpe-sandbox-k8s/main.tf index 65d62069..92d35f9d 100644 --- a/dev/stacks/dpe-sandbox-k8s/main.tf +++ b/dev/stacks/dpe-sandbox-k8s/main.tf @@ -1,6 +1,6 @@ module "sage-aws-vpc" { source = "spacelift.io/sagebionetworks/sage-aws-vpc/aws" - version = "0.3.3" + version = "0.3.4" vpc_name = "dpe-sandbox" capture_flow_logs = true flow_log_retention = 1 @@ -8,7 +8,7 @@ module 
"sage-aws-vpc" { module "sage-aws-eks" { source = "spacelift.io/sagebionetworks/sage-aws-eks/aws" - version = "0.3.9" + version = "0.4.0" cluster_name = "dpe-k8-sandbox" private_vpc_subnet_ids = module.sage-aws-vpc.private_subnet_ids @@ -20,4 +20,14 @@ module "sage-aws-eks" { pod_security_group_enforcing_mode = "standard" aws_account_id = "631692904429" private_subnet_cidrs = module.sage-aws-vpc.vpc_private_subnet_cidrs + cluster_name = "dpe-k8-sandbox" + private_vpc_subnet_ids = module.sage-aws-vpc.private_subnet_ids + vpc_id = module.sage-aws-vpc.vpc_id + vpc_security_group_id = module.sage-aws-vpc.vpc_security_group_id + enable_policy_event_logs = true + capture_cloudwatch_logs = true + cloudwatch_retention = 1 + pod_security_group_enforcing_mode = "standard" + aws_account_id = "631692904429" + private_subnet_cidrs = module.sage-aws-vpc.vpc_private_subnet_cidrs } diff --git a/dev/variables.tf b/dev/variables.tf index ae2a3de0..ca21c16b 100644 --- a/dev/variables.tf +++ b/dev/variables.tf @@ -2,3 +2,8 @@ variable "parent_space_id" { description = "ID of the parent spacelift space" type = string } + +variable "admin_stack_id" { + description = "ID of the admin stack" + type = string +} diff --git a/main.tf b/main.tf index b7269a01..5c4f8d61 100644 --- a/main.tf +++ b/main.tf @@ -17,7 +17,7 @@ resource "spacelift_stack" "root_administrative_stack" { administrative = true autodeploy = true - branch = "main" + branch = "ibcdpe-1007-monitoring" description = "Manages other spacelift resources" name = "Root Spacelift Administrative Stack" project_root = "" @@ -56,4 +56,5 @@ module "dev-resources" { module.terraform-registry, ] parent_space_id = spacelift_space.environment.id + admin_stack_id = spacelift_stack.root_administrative_stack.id } diff --git a/modules/apache-airflow/data.tf b/modules/apache-airflow/data.tf deleted file mode 100644 index 765d5620..00000000 --- a/modules/apache-airflow/data.tf +++ /dev/null @@ -1,7 +0,0 @@ -data "aws_eks_cluster" "cluster" { - 
name = var.cluster_name -} - -data "aws_eks_cluster_auth" "cluster" { - name = var.cluster_name -} \ No newline at end of file diff --git a/modules/apache-airflow/variables.tf b/modules/apache-airflow/variables.tf deleted file mode 100644 index 93adc5a2..00000000 --- a/modules/apache-airflow/variables.tf +++ /dev/null @@ -1,18 +0,0 @@ -variable "cluster_name" { - description = "Name of K8 cluster" - type = string - default = "dpe-k8" -} - -variable "kube_config_path" { - description = "Kube config path" - type = string - default = "~/.kube/config" -} - -variable "region" { - description = "AWS region" - type = string - default = "us-east-1" -} - diff --git a/modules/internal-k8-infra/data.tf b/modules/internal-k8-infra/data.tf deleted file mode 100644 index be8854a8..00000000 --- a/modules/internal-k8-infra/data.tf +++ /dev/null @@ -1,55 +0,0 @@ -data "aws_eks_cluster" "cluster" { - name = var.cluster_name -} - -data "aws_eks_cluster_auth" "cluster" { - name = var.cluster_name -} - -data "aws_secretsmanager_secret" "spotinst_token" { - name = "spotinst_token" -} - -data "aws_secretsmanager_secret_version" "secret_credentials" { - secret_id = data.aws_secretsmanager_secret.spotinst_token.id -} - -data "aws_vpc" "selected" { - filter { - name = "tag:Name" - values = ["spacelift-created-vpc"] - } -} - -data "aws_subnets" "node_subnets" { - filter { - name = "vpc-id" - values = [data.aws_vpc.selected.id] - } - - filter { - name = "tag:Name" - values = ["private"] - } -} - -data "aws_iam_roles" "all_roles" {} - -data "aws_eks_node_groups" "node_groups" { - cluster_name = var.cluster_name -} - -data "aws_eks_node_group" "node_group" { - cluster_name = var.cluster_name - node_group_name = tolist(data.aws_eks_node_groups.node_groups.names)[0] -} - -data "aws_iam_instance_profiles" "profile" { - role_name = split("/", data.aws_eks_node_group.node_group.node_role_arn)[1] -} - -data "aws_security_group" "eks_cluster_security_group" { - tags = { - Name = 
"${var.cluster_name}-node" - } -} diff --git a/modules/internal-k8-infra/main.tf b/modules/internal-k8-infra/main.tf deleted file mode 100644 index 5955c301..00000000 --- a/modules/internal-k8-infra/main.tf +++ /dev/null @@ -1,128 +0,0 @@ -module "ocean-controller" { - source = "spotinst/ocean-controller/spotinst" - version = "0.54.0" - - # Credentials. - spotinst_token = data.aws_secretsmanager_secret_version.secret_credentials.secret_string - spotinst_account = var.spotinst_account - - # Configuration. - cluster_identifier = var.cluster_name -} - -module "ocean-aws-k8s" { - source = "spotinst/ocean-aws-k8s/spotinst" - version = "1.2.0" - - # Configuration - cluster_name = var.cluster_name - region = var.region - subnet_ids = data.aws_subnets.node_subnets.ids - worker_instance_profile_arn = tolist(data.aws_iam_instance_profiles.profile.arns)[0] - security_groups = [data.aws_security_group.eks_cluster_security_group.id] - is_aggressive_scale_down_enabled = true - max_scale_down_percentage = 33 - tags = var.tags -} - -resource "kubernetes_namespace" "airflow" { - metadata { - name = "airflow" - } -} - -resource "random_password" "airflow" { - length = 16 - special = true - override_special = "!#$%&*()-_=+[]{}<>:?" -} - -resource "kubernetes_secret" "airflow_webserver_secret" { - metadata { - name = "airflow-webserver-secret" - namespace = "airflow" - } - - data = { - "webserver-secret-key" = random_password.airflow.result - } - - depends_on = [kubernetes_namespace.airflow] -} - -# TODO: Should a long-term deployment use a managed RDS instance? 
-# https://github.com/apache/airflow/blob/main/chart/values.yaml#L2321-L2329 -resource "helm_release" "airflow" { - name = "apache-airflow" - repository = "https://airflow.apache.org" - chart = "airflow" - namespace = "airflow" - version = "1.11.0" - depends_on = [kubernetes_namespace.airflow, module.ocean-controller, module.ocean-aws-k8s] - - # https://github.com/hashicorp/terraform-provider-helm/issues/683#issuecomment-830872443 - wait = false - - set { - name = "config.webserver.expose_config" - value = "true" - } - - set { - name = "config.secrets.backend" - value = "airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend" - } - - set { - name = "webserver.service.type" - value = "LoadBalancer" - } - - set { - name = "webserverSecretKeySecretName" - value = "airflow-webserver-secret" - } - - set { - name = "airflowVersion" - value = "2.7.1" - } - - set { - name = "defaultAirflowRepository" - value = "bfaublesage/airflow" - } - - set { - name = "defaultAirflowTag" - value = "2.7.1-python-3.10" - } - - set { - name = "dags.persistence.enabled" - value = "false" - } - - set { - name = "dags.gitSync.enabled" - value = "true" - } - - set { - name = "dags.gitSync.repo" - value = "https://github.com/Sage-Bionetworks-Workflows/orca-recipes" - } - - set { - name = "dags.gitSync.subPath" - value = "dags" - } - - set { - name = "dags.gitSync.branch" - value = "main" - } - - - values = [templatefile("${path.module}/templates/airflow-values.yaml", {})] -} diff --git a/modules/internal-k8-infra/provider.tf b/modules/internal-k8-infra/provider.tf deleted file mode 100644 index 451c9b98..00000000 --- a/modules/internal-k8-infra/provider.tf +++ /dev/null @@ -1,21 +0,0 @@ -provider "aws" { - region = var.region -} - -provider "spotinst" { - account = var.spotinst_account - token = data.aws_secretsmanager_secret_version.secret_credentials.secret_string -} - -provider "kubernetes" { - config_path = var.kube_config_path - host = 
data.aws_eks_cluster.cluster.endpoint - cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority[0].data) - token = data.aws_eks_cluster_auth.cluster.token -} - -provider "helm" { - kubernetes { - config_path = var.kube_config_path - } -} diff --git a/modules/internal-k8-infra/templates/airflow-values.yaml b/modules/internal-k8-infra/templates/airflow-values.yaml deleted file mode 100644 index c89e8e99..00000000 --- a/modules/internal-k8-infra/templates/airflow-values.yaml +++ /dev/null @@ -1,20 +0,0 @@ -config: - secrets: - backend_kwargs: '{"connections_prefix": "airflow/connections", "variables_prefix": "airflow/variables", "region_name": "us-east-1"}' - # webserver: - # authenticate: true - # auth_backend: airflow.contrib.auth.backends.google_auth - # web_server_ssl_cert = - # web_server_ssl_key = - # web_server_port = 443 - # base_url = http://:443 - # celery: - # ssl_active = True - # ssl_key = - # ssl_cert = - # ssl_cacert = - -# service: -# type: LoadBalancer # or another type as needed -# annotations: -# alb.ingress.kubernetes.io/scheme: "internal" diff --git a/modules/internal-k8-infra/variables.tf b/modules/internal-k8-infra/variables.tf deleted file mode 100644 index 6751b0a8..00000000 --- a/modules/internal-k8-infra/variables.tf +++ /dev/null @@ -1,36 +0,0 @@ -variable "cluster_name" { - description = "Name of K8 cluster" - type = string - default = "dpe-k8" -} - -variable "node_group_name" { - description = "Node group name for the cluster" - type = string - default = "airflow-node-group" -} - -variable "kube_config_path" { - description = "Kube config path" - type = string - default = "~/.kube/config" -} - -variable "region" { - description = "AWS region" - type = string - default = "us-east-1" -} - -variable "spotinst_account" { - description = "Spot.io account" - type = string -} - -variable "tags" { - description = "AWS Resource Tags" - type = map(string) - default = { - "CostCenter" = "No Program / 000000" - } -} 
diff --git a/modules/main.tf b/modules/main.tf index 00a4de85..e5f027b8 100644 --- a/modules/main.tf +++ b/modules/main.tf @@ -1,104 +1,150 @@ -resource "spacelift_module" "sage-aws-vpc" { - github_enterprise { - namespace = "Sage-Bionetworks-Workflows" - id = "sage-bionetworks-workflows-gh" - } +locals { + spacelift_modules = { - name = "sage-aws-vpc" - terraform_provider = "aws" - administrative = false - branch = "main" - description = "Terraform module for creating a VPC in AWS" - repository = "eks-stack" - project_root = "modules/sage-aws-vpc" - space_id = "root" -} + eks = { + github_enterprise = { + namespace = "Sage-Bionetworks-Workflows" + id = "sage-bionetworks-workflows-gh" + } + repository = "eks-stack" -resource "spacelift_version" "sage-aws-vpc-version" { - module_id = spacelift_module.sage-aws-vpc.id - version_number = "0.3.3" -} + name = "sage-aws-eks" + terraform_provider = "aws" + administrative = false + branch = var.git_branch + description = "Terraform module for creating an EKS cluster in AWS" + project_root = "modules/sage-aws-eks" + space_id = "root" + version_number = "0.4.0" + } -resource "spacelift_module" "sage-aws-eks" { - github_enterprise { - namespace = "Sage-Bionetworks-Workflows" - id = "sage-bionetworks-workflows-gh" - } + vpc = { + github_enterprise = { + namespace = "Sage-Bionetworks-Workflows" + id = "sage-bionetworks-workflows-gh" + } + repository = "eks-stack" - name = "sage-aws-eks" - terraform_provider = "aws" - administrative = false - branch = "main" - description = "Terraform module for creating an EKS cluster in AWS" - repository = "eks-stack" - project_root = "modules/sage-aws-eks" - space_id = "root" -} + name = "sage-aws-vpc" + terraform_provider = "aws" + administrative = false + branch = var.git_branch + description = "Terraform module for creating a VPC in AWS" + project_root = "modules/sage-aws-vpc" + space_id = "root" + version_number = "0.3.4" + } -resource "spacelift_version" "sage-aws-eks-version" { - 
module_id = spacelift_module.sage-aws-eks.id - version_number = "0.3.9" -} + eks-autoscaler = { + github_enterprise = { + namespace = "Sage-Bionetworks-Workflows" + id = "sage-bionetworks-workflows-gh" + } + repository = "eks-stack" -resource "spacelift_module" "sage-aws-eks-autoscaler" { - github_enterprise { - namespace = "Sage-Bionetworks-Workflows" - id = "sage-bionetworks-workflows-gh" - } + name = "sage-aws-eks-autoscaler" + terraform_provider = "aws" + administrative = false + branch = var.git_branch + description = "Terraform module for creating an EKS cluster autoscaler in AWS" + project_root = "modules/sage-aws-k8s-node-autoscaler" + space_id = "root" + version_number = "0.4.2" + } - name = "sage-aws-eks-autoscaler" - terraform_provider = "aws" - administrative = false - branch = "main" - description = "Terraform module for creating an EKS cluster autoscaler in AWS" - repository = "eks-stack" - project_root = "modules/sage-aws-k8s-node-autoscaler" - space_id = "root" -} + victoria-metrics = { + github_enterprise = { + namespace = "Sage-Bionetworks-Workflows" + id = "sage-bionetworks-workflows-gh" + } + repository = "eks-stack" -resource "spacelift_version" "sage-aws-eks-autoscaler-version" { - module_id = spacelift_module.sage-aws-eks-autoscaler.id - version_number = "0.3.2" -} + name = "victoria-metrics" + terraform_provider = "aws" + administrative = false + branch = var.git_branch + description = "Helm chart deployment for a single node Victoria Metrics instance" + project_root = "modules/victoria-metrics" + space_id = "root" + version_number = "0.0.7" + } -resource "spacelift_module" "spacelift-private-workerpool" { - github_enterprise { - namespace = "Sage-Bionetworks-Workflows" - id = "sage-bionetworks-workflows-gh" - } + trivy-operator = { + github_enterprise = { + namespace = "Sage-Bionetworks-Workflows" + id = "sage-bionetworks-workflows-gh" + } + repository = "eks-stack" - name = "spacelift-private-workerpool" - terraform_provider = "aws" - 
administrative = false - branch = "main" - description = "Module for the spacelift private workerpool helm chart which deploys the K8s operator" - repository = "eks-stack" - project_root = "modules/spacelift-private-worker" - space_id = "root" -} + name = "trivy-operator" + terraform_provider = "aws" + administrative = false + branch = var.git_branch + description = "Helm chart deployment for trivy-operator which handles security and vulnerability scanning." + project_root = "modules/trivy-operator" + space_id = "root" + version_number = "0.0.12" + } + + airflow = { + github_enterprise = { + namespace = "Sage-Bionetworks-Workflows" + id = "sage-bionetworks-workflows-gh" + } + repository = "eks-stack" -resource "spacelift_version" "spacelift-private-workerpool-version" { - module_id = spacelift_module.spacelift-private-workerpool.id - version_number = "0.2.0" + name = "airflow" + terraform_provider = "aws" + administrative = false + branch = var.git_branch + description = "Helm chart deployment for apache airflow." 
+ project_root = "modules/apache-airflow" + space_id = "root" + version_number = "0.0.1" + } + + private-workerpool = { + github_enterprise = { + namespace = "Sage-Bionetworks-Workflows" + id = "sage-bionetworks-workflows-gh" + } + repository = "eks-stack" + + name = "spacelift-private-workerpool" + terraform_provider = "aws" + administrative = false + branch = var.git_branch + description = "Module for the spacelift private workerpool helm chart which deploys the K8s operator" + project_root = "modules/spacelift-private-worker" + space_id = "root" + version_number = "0.2.1" + } + } } -resource "spacelift_module" "spacelift-private-workerpool" { +resource "spacelift_module" "spacelift_modules" { + for_each = local.spacelift_modules + github_enterprise { - namespace = "Sage-Bionetworks-Workflows" - id = "sage-bionetworks-workflows-gh" + namespace = each.value.github_enterprise.namespace + id = each.value.github_enterprise.id } - name = "spacelift-private-workerpool" - terraform_provider = "aws" - administrative = false - branch = "ibcdpe-935-vpc-updates" - description = "Module for the spacelift private workerpool helm chart which deploys the K8s operator" - repository = "eks-stack" - project_root = "modules/spacelift-private-worker" - space_id = "root" + name = each.value.name + terraform_provider = each.value.terraform_provider + administrative = each.value.administrative + branch = each.value.branch + description = each.value.description + repository = each.value.repository + project_root = each.value.project_root + space_id = each.value.space_id } -resource "spacelift_version" "spacelift-private-workerpool-version" { - module_id = spacelift_module.spacelift-private-workerpool.id - version_number = "0.1.3" +resource "spacelift_version" "spacelift_versions" { + for_each = local.spacelift_modules + module_id = spacelift_module.spacelift_modules[each.key].id + version_number = each.value.version_number + keepers = { + "version" = each.value.version_number + } } diff 
--git a/modules/sage-aws-eks/main.tf b/modules/sage-aws-eks/main.tf index 35c3b580..e8c7c7ca 100644 --- a/modules/sage-aws-eks/main.tf +++ b/modules/sage-aws-eks/main.tf @@ -95,6 +95,7 @@ module "eks" { env = { ENABLE_POD_ENI = "true", POD_SECURITY_GROUP_ENFORCING_MODE = var.pod_security_group_enforcing_mode, + ENABLE_PREFIX_DELEGATION = "true", } }) } } diff --git a/modules/sage-aws-k8s-node-autoscaler/main.tf b/modules/sage-aws-k8s-node-autoscaler/main.tf index 8147dfbc..10980c99 100644 --- a/modules/sage-aws-k8s-node-autoscaler/main.tf +++ b/modules/sage-aws-k8s-node-autoscaler/main.tf @@ -91,7 +91,7 @@ module "ocean-controller" { module "ocean-aws-k8s" { source = "spotinst/ocean-aws-k8s/spotinst" - version = "1.2.0" + version = "1.4.0" # Configuration cluster_name = var.cluster_name @@ -102,6 +102,35 @@ module "ocean-aws-k8s" { is_aggressive_scale_down_enabled = true max_scale_down_percentage = 33 tags = var.tags + # TODO: Fix this it does not seem to work + # `desired_capacity` does not seem to force the number of nodes to increase. Look + # through the documentation to determine how we might manually scale up the number + # of nodes if we wanted to. 
+ desired_capacity = var.desired_capacity + + + filters = { + exclude_metal = true + hypervisor = ["nitro"] + + architectures = null + categories = null + disk_types = null + exclude_families = null + include_families = null + is_ena_supported = null + max_gpu = null + max_memory_gib = null + max_network_performance = null + max_vcpu = null + min_enis = null + min_gpu = null + min_memory_gib = null + min_network_performance = null + min_vcpu = null + root_device_types = null + virtualization_types = null + } } resource "aws_eks_addon" "coredns" { diff --git a/modules/sage-aws-k8s-node-autoscaler/variables.tf b/modules/sage-aws-k8s-node-autoscaler/variables.tf index 70877106..26efd91e 100644 --- a/modules/sage-aws-k8s-node-autoscaler/variables.tf +++ b/modules/sage-aws-k8s-node-autoscaler/variables.tf @@ -43,3 +43,9 @@ variable "tags" { "CostCenter" = "No Program / 000000" } } + +variable "desired_capacity" { + description = "Desired capacity of the node group" + type = number + default = 1 +} diff --git a/modules/trivy-operator/README.md b/modules/trivy-operator/README.md new file mode 100644 index 00000000..0b5f7d7d --- /dev/null +++ b/modules/trivy-operator/README.md @@ -0,0 +1,39 @@ +# Purpose + +This module is used to deploy the trivy operator k8s helm chart. + +The Trivy Operator leverages Trivy to continuously scan your Kubernetes cluster for +security issues. The scans are summarised in security reports as Kubernetes Custom +Resource Definitions, which become accessible through the Kubernetes API. The Operator +does this by watching Kubernetes for state changes and automatically triggering +security scans in response. For example, a vulnerability scan is initiated when a new +Pod is created. This way, users can find and view the risks that relate to different +resources in a Kubernetes-native way. 
+ + +This module is responsible for installing 3 charts: + +- Scanner that regularly scans for vulnerabilities: `trivy-operator` from +- Convert trivy CRDs into policy-reporter +- A UI to easily view the scan results + +## Getting an overview of trivy results + +Results are provided in a grafana dashboard that is scraped from the operator `/metrics` +endpoint. The dashboard looks like: + +![trivy operator dashboard](./trivy-operator-dashboard.png) + + +## Viewing more detailed information about the vulnerabilities +Data from trivy is converted over into resources that a tool called `policy-reporter` +can understand. It has a UI built on top of those resources which allows you to view +what the scans are telling you. In order to get access to the UI create a port forward +session to the UI in the `trivy-system` namespace. + +![policy-reporter UI example](./policy-reporter-ui.png) + +### Viewing the most detailed information +Trivy creates CRDs (Custom resource definitions) that store all of the information that +it collected. You are able to view this information as a kubernetes resource through +`kubectl` commands or `k9s`. Read more on these CRDs here . 
diff --git a/modules/trivy-operator/main.tf b/modules/trivy-operator/main.tf new file mode 100644 index 00000000..9ca673cb --- /dev/null +++ b/modules/trivy-operator/main.tf @@ -0,0 +1,70 @@ +resource "kubernetes_namespace" "trivy-system" { + metadata { + name = "trivy-system" + } +} + +resource "helm_release" "trivy-operator" { + name = "trivy-operator" + repository = "https://aquasecurity.github.io/helm-charts/" + chart = "trivy-operator" + namespace = "trivy-system" + version = "0.24.1" + depends_on = [ + kubernetes_namespace.trivy-system + ] + + values = [templatefile("${path.module}/templates/values-trivy-operator.yaml", {})] +} + +resource "kubernetes_manifest" "vmservicescrape" { + manifest = { + apiVersion = "operator.victoriametrics.com/v1beta1" + kind = "VMServiceScrape" + metadata = { + name = "trivy-vmservicescrape" + namespace = kubernetes_namespace.trivy-system.metadata[0].name + } + spec = { + endpoints = [ + { + port = "metrics" + } + ] + selector = { + matchLabels = { + "app.kubernetes.io/name" = "trivy-operator" + } + } + } + } +} + +# converts the trivy-operator metrics to policy reporter format +resource "helm_release" "trivy-operator-polr-adapter" { + name = "trivy-operator-polr-adapter" + repository = "https://fjogeleit.github.io/trivy-operator-polr-adapter" + chart = "trivy-operator-polr-adapter" + namespace = "trivy-system" + version = "0.8.0" + depends_on = [ + kubernetes_namespace.trivy-system + ] + + values = [templatefile("${path.module}/templates/values-trivy-operator-polr-adapter.yaml", {})] +} + +# UI for viewing Policy Reports +resource "helm_release" "policy-reporter" { + name = "policy-reporter" + repository = "https://kyverno.github.io/policy-reporter" + chart = "policy-reporter" + namespace = "trivy-system" + version = "2.24.1" + depends_on = [ + kubernetes_namespace.trivy-system, + helm_release.trivy-operator-polr-adapter + ] + + values = [templatefile("${path.module}/templates/values-policy-reporter.yaml", {})] +} diff --git 
a/modules/trivy-operator/policy-reporter-ui.png b/modules/trivy-operator/policy-reporter-ui.png new file mode 100644 index 00000000..a08d5430 Binary files /dev/null and b/modules/trivy-operator/policy-reporter-ui.png differ diff --git a/modules/trivy-operator/templates/values-policy-reporter.yaml b/modules/trivy-operator/templates/values-policy-reporter.yaml new file mode 100644 index 00000000..6a235def --- /dev/null +++ b/modules/trivy-operator/templates/values-policy-reporter.yaml @@ -0,0 +1,774 @@ +# Override the chart name used for all resources +nameOverride: "" + +image: + registry: ghcr.io + repository: kyverno/policy-reporter + pullPolicy: IfNotPresent + tag: 2.20.1 + +imagePullSecrets: [] + +priorityClassName: "" + +replicaCount: 1 + +revisionHistoryLimit: 10 + +deploymentStrategy: {} + # rollingUpdate: + # maxSurge: 25% + # maxUnavailable: 25% + # type: RollingUpdate + +# When using a custom port together with the PolicyReporter UI +# the port has also to be changed in the UI subchart as well because it can't access the parent values. +# You can change the port under `ui.policyReporter.port` +port: + name: http + number: 8080 + +# Key/value pairs that are attached to all resources. +annotations: {} + +# Create cluster role policies +rbac: + enabled: true + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +service: + enabled: true + ## configuration of service + # key/value + annotations: {} + # key/value + labels: {} + type: ClusterIP + # integer number. 
This is port for service + port: 8080 + +podSecurityContext: + fsGroup: 1234 + +securityContext: + runAsUser: 1234 + runAsNonRoot: true + privileged: false + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + +# Key/value pairs that are attached to pods. +podAnnotations: {} + +# Key/value pairs that are attached to pods. +podLabels: {} + +# Allow additional env variables to be added +envVars: [] + +resources: + limits: + memory: 100Mi + cpu: 10m + requests: + memory: 75Mi + cpu: 5m + +# Enable a NetworkPolicy for this chart. Useful on clusters where Network Policies are +# used and configured in a default-deny fashion. +networkPolicy: + enabled: false + # Kubernetes API Server + egress: + - to: + ports: + - protocol: TCP + port: 6443 + ingress: [] + +## Set to true to enable ingress record generation +# ref to: https://kubernetes.io/docs/concepts/services-networking/ingress/ +ingress: + enabled: false + className: "" + # key/value + labels: {} + # key/value + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: [] + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +logging: + encoding: console # possible encodings are console and json + logLevel: 0 # default info + development: false # more human readable structure, enables stacktraces and removes log sampling + +api: + logging: false # enable debug API access logging, sets logLevel to debug + +# REST API +rest: + enabled: false + +# Prometheus Metrics API +metrics: + enabled: false + mode: detailed # available modes are detailed, simple and custom + customLabels: [] # only used for custom mode. 
Supported fields are: ["namespace", "rule", "policy", "report" // PolicyReport name, "kind" // resource kind, "name" // resource name, "status", "severity", "category", "source"] +# filter: +# sources: +# exclude: ["Trivy CIS Kube Bench"] +# status: +# exclude: ["pass", "skip"] + +profiling: + enabled: false + +# amount of queue workers for PolicyReport resource processing +worker: 5 + +# Filter PolicyReport resources to process +reportFilter: + namespaces: + # Process only PolicyReport resources from an included namespace, wildcards are supported + include: [] + # Ignore all PolicyReport resources from a excluded namespace, wildcards are supported + # exclude will be ignored if an include filter exists + exclude: [] + clusterReports: + # Disable the processing of ClusterPolicyReports + disabled: false + +# customize source specific logic like result ID generation +sourceConfig: {} +# sourcename: +# customID: +# enabled: true +# fields: ["resource", "policy", "rule", "category", "result", "message"] + +# Settings for the Policy Reporter UI subchart (see subchart's values.yaml) +ui: + enabled: true + displayMode: dark + logSize: 500 + # Refresh interval in milliseconds + refreshInterval: 100000 + views: + dashboard: + policyReports: true + clusterPolicyReports: true + logs: true + policyReports: true + clusterPolicyReports: true + kyvernoPolicies: true + kyvernoVerifyImages: true + +kyvernoPlugin: + enabled: false + +# Settings for the monitoring subchart +monitoring: + enabled: false + +database: + # Database Type, supported: mysql, postgres, mariadb + type: "" + database: "" # Database Name + username: "" + password: "" + host: "" + enableSSL: false + # instead of configure the individual values you can also provide an DSN string + # example postgres: postgres://postgres:password@localhost:5432/postgres?sslmode=disable + # example mysql: root:password@tcp(localhost:3306)/test?tls=false + dsn: "" + # configure an existing secret as source for your values + # 
supported fields: username, password, host, dsn, database + secretRef: "" + # use an mounted secret as source for your values, required the information in JSON format + # supported fields: username, password, host, dsn, database + mountedSecret: "" + +global: + # available plugins + plugins: + # enable kyverno for Policy Reporter UI and monitoring + kyverno: false + # The name of service policy-report. Defaults to ReleaseName. + backend: "" + # overwrite the fullname of all resources including subcharts + fullnameOverride: "" + # configure the namespace of all resources including subcharts + namespace: "" + # additional labels added on each resource + labels: {} + # basicAuth for APIs and metrics + basicAuth: + # HTTP BasicAuth username + username: "" + # HTTP BasicAuth password + password: "" + # read credentials from secret + secretRef: "" + +emailReports: + clusterName: "" # (optional) - displayed in the email report if configured + titlePrefix: "Report" # title prefix in the email subject + smtp: + secret: "" # (optional) secret name to provide the complete or partial SMTP configuration + host: "" + port: 465 + username: "" + password: "" + from: "" # displayed from email address + encryption: "" # default is none, supports ssl/tls and starttls + skipTLS: false + certificate: "" + + # basic summary report + summary: + enabled: false + schedule: "0 8 * * *" # CronJob schedule defines when the report will be send + activeDeadlineSeconds: 300 # timeout in seconds + backoffLimit: 3 # retry counter + ttlSecondsAfterFinished: 0 + restartPolicy: Never # pod restart policy + + to: [] # list of receiver email addresses + filter: {} # optional filters + # disableClusterReports: false # remove ClusterPolicyResults from Reports + # namespaces: + # include: [] + # exclude: [] + # sources: + # include: [] + # exclude: [] + channels: [] # (optional) channels can be used to to send only a subset of namespaces / sources to dedicated email addresses channels: [] # (optional) 
channels can be used to to send only a subset of namespaces / sources to dedicated email addresses + # - to: ['team-a@company.org'] + # filter: + # disableClusterReports: true + # namespaces: + # include: ['team-a-*'] + # sources: + # include: ['Kyverno'] + # violation summary report + violations: + enabled: false + schedule: "0 8 * * *" # CronJob schedule defines when the report will be send + activeDeadlineSeconds: 300 # timeout in seconds + backoffLimit: 3 # retry counter + ttlSecondsAfterFinished: 0 + restartPolicy: Never # pod restart policy + + to: [] # list of receiver email addresses + filter: {} # optional filters + # disableClusterReports: false # remove ClusterPolicyResults from Reports + # namespaces: + # include: [] + # exclude: [] + # sources: + # include: [] + # exclude: [] + channels: [] # (optional) channels can be used to to send only a subset of namespaces / sources to dedicated email addresses channels: [] # (optional) channels can be used to to send only a subset of namespaces / sources to dedicated email addresses + # - to: ['team-a@company.org'] + # filter: + # disableClusterReports: true + # namespaces: + # include: ['team-a-*'] + # sources: + # include: ['Kyverno'] + resources: {} + # limits: + # memory: 100Mi + # cpu: 10m + # requests: + # memory: 75Mi + # cpu: 5m + +# Reference a configuration which already exists instead of creating one +existingTargetConfig: + enabled: false + # Name of the secret with the config + name: "" + # subPath within the secret (defaults to config.yaml) + subPath: "" + +# Supported targets for new PolicyReport Results +target: + loki: + # loki host address + host: "" + # path to your custom certificate + # can be added under extraVolumes + certificate: "" + # skip TLS verification if necessary + skipTLS: false + # receive the host from an existing secret instead + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # loki api path, defaults to 
"/api/prom/push" (deprecated) + path: "" + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to loki + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # Added as additional labels to each Loki event + customLabels: {} + # Additional custom HTTP Headers + headers: {} + # HTTP BasicAuth credentials for Loki + username: "" + password: "" + # Filter Results which should send to this target by report labels, namespaces, priorities or policies + # Wildcars for namespaces and policies are supported, you can either define exclude or include values + # Filters are available for all targets except the UI + filter: {} +# namespaces: +# include: ["develop"] +# priorities: +# exclude: ["debug", "info", "error"] +# labels: +# include: ["app", "owner:team-a", "monitoring:*"] + channels: [] +# - host: "http://loki.loki-stack:3100" +# sources: [] +# customLabels: {} +# filter: +# namespaces: +# include: ["develop"] +# priorities: +# exclude: ["debug", "info", "error"] +# reportLabels: +# . 
include: ["app", "owner:team-b"] + + elasticsearch: + # elasticsearch host address + host: "" + # path to your custom certificate + # can be added under extraVolumes + certificate: "" + # skip TLS verification if necessary + skipTLS: false + # elasticsearch index (default: policy-reporter) + index: "" + # elasticsearch username for HTTP Basic Auth + username: "" + # elasticsearch password for HTTP Basic Auth + password: "" + # elasticsearch apiKey for apiKey authentication + apiKey: "" + # receive the host, username and/or password,apiKey from an existing secret instead + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # elasticsearch index rotation and index suffix + # possible values: daily, monthly, annually, none (default: daily) + rotation: "" + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to elasticsearch + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # https://www.elastic.co/blog/moving-from-types-to-typeless-apis-in-elasticsearch-7-0 keeping as false for backward compatibility. 
+ typelessApi: false + # Added as additional properties to each elasticsearch event + customFields: {} + # filter results send by namespaces, policies and priorities + filter: {} + # add additional elasticsearch channels with different configurations and filters + channels: [] + + slack: + # slack app webhook address + webhook: "" + # slack channel + channel: "" + # receive the webhook from an existing secret instead + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to slack + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # Added as additional fields to each Slack event + customFields: {} + # filter results send by namespaces, policies and priorities + filter: {} + # add additional slack channels with different configurations and filters + channels: [] +# - webhook: "https://slack.webhook1" +# channel: "" +# filter: +# namespaces: +# include: ["develop"] +# priorities: +# exclude: ["debug", "info", "error"] +# policies: +# include: ["require-run-as-nonroot"] +# reportLabels: +# . 
include: ["app", "owner:team-b"] +# - webhook: "https://slack.webhook2" +# minimumPriority: "warning" +# filter: +# namespaces: +# include: ["team-a-*"] + + discord: + # discord app webhook address + webhook: "" + # receive the webhook from an existing secret instead + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to discord + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # filter results send by namespaces, policies and priorities + filter: {} + # add additional discord channels with different configurations and filters + channels: [] + + teams: + # teams webhook address + webhook: "" + # receive the webhook from an existing secret instead + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # path to your custom certificate + # can be added under extraVolumes + certificate: "" + # skip TLS verification if necessary + skipTLS: false + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to teams + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # filter results send by namespaces, policies and priorities + filter: {} + # add additional teams channels with different configurations and filters + channels: [] + + ui: + # ui host address + host: "" + # path to your custom certificate + # can be added under extraVolumes + certificate: "" + # skip TLS verification if necessary + skipTLS: false + # minimum priority "" < info < warning < critical < error + minimumPriority: "warning" + # list of sources which should send to the UI Log + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + + webhook: + # 
webhook host address + host: "" + # path to your custom certificate + # can be added under extraVolumes + certificate: "" + # skip TLS verification if necessary + skipTLS: false + # receive the host and/or token from an existing secret, the token is added as Authorization header + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # additional http headers + headers: {} + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to the UI Log + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # Added as additional properties to each webhook event + customFields: {} + # filter results send by namespaces, policies and priorities + filter: {} + # add additional webhook channels with different configurations and filters + channels: [] + + telegram: + # telegram bot token + token: "" + # telegram chat id + chatID: "" + # optional telegram proxy host + host: "" + # path to your custom certificate + # can be added under extraVolumes + certificate: "" + # skip TLS verification if necessary + skipTLS: false + # receive the host and/or token from an existing secret, the token is added as Authorization header + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # additional http headers + headers: {} + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to telegram + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # Added as additional properties to each notification + customFields: {} + # filter results send by namespaces, policies and priorities + filter: {} + # add additional telegram channels with different configurations and filters + channels: [] + + googleChat: + # GoogleChat webhook + webhook: "" + # path to your 
custom certificate + # can be added under extraVolumes + certificate: "" + # skip TLS verification if necessary + skipTLS: false + # receive the host and/or token from an existing secret, the token is added as Authorization header + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # additional http headers + headers: {} + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to googleChat + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # Added as additional properties to each notification + customFields: {} + # filter results send by namespaces, policies and priorities + filter: {} + # add additional googleChat channels with different configurations and filters + channels: [] + + s3: + # S3 access key + accessKeyID: "" + # S3 secret access key + secretAccessKey: "" + # receive the accessKeyID and/or secretAccessKey from an existing secret instead + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # S3 storage region + region: "" + # S3 storage endpoint + endpoint: "" + # S3 storage, bucket name + bucket: "" + # S3 storage to use an S3 Bucket Key for object encryption with SSE-KMS + bucketKeyEnabled: false + # S3 storage KMS Key ID for object encryption with SSE-KMS + kmsKeyId: "" + # S3 storage server-side encryption algorithm used when storing this object in Amazon S3, AES256, aws:kms + serverSideEncryption: "" + # S3 storage, force path style configuration + pathStyle: false + # name of prefix, keys will have format: s3:////YYYY-MM-DD/YYYY-MM-DDTHH:mm:ss.s+01:00.json + prefix: "" + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to S3 + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # Added as 
additional properties to each s3 event + customFields: {} + # filter results send by namespaces, policies and priorities + filter: {} + # add additional s3 channels with different configurations and filters + channels: [] + + kinesis: + # AWS access key + accessKeyID: "" + # AWS secret access key + secretAccessKey: "" + # receive the accessKeyID and/or secretAccessKey from an existing secret instead + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # AWS region + region: "" + # AWS Kinesis endpoint + endpoint: "" + # AWS Kinesis stream name + streamName: "" + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to Kinesis + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # Added as additional properties to each kinesis event + customFields: {} + # filter results send by namespaces, policies and priorities + filter: {} + # add additional kinesis channels with different configurations and filters + channels: [] + + securityHub: + # AWS access key + accessKeyID: "" + # AWS secret access key + secretAccessKey: "" + # receive the accessKeyID and/or secretAccessKey from an existing secret instead + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # AWS region + region: "" + # AWS SecurityHub endpoint (optional) + endpoint: "" + # AWS accountID + accountID: "" + # Used product name, defaults to "Policy Reporter" + productName: "" + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to SecurityHub + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # Enable cleanup listener for SecurityHub + cleanup: false + # Delay between AWS GetFindings API calls, to avoid hitting the API RequestLimit + delayInSeconds: 2 + # Added as 
additional properties to each securityHub event + customFields: {} + # filter results send by namespaces, policies and priorities + filter: {} + # add additional securityHub channels with different configurations and filters + channels: [] + + gcs: + # GCS (Google Cloud Storage) Service Account Credentials + credentials: "" + # receive the credentials from an existing secret instead + secretRef: "" + # Mounted secret path by Secrets Controller, secret should be in json format + mountedSecret: "" + # GCS Bucket + bucket: "" + # minimum priority "" < info < warning < critical < error + minimumPriority: "" + # list of sources which should send to GCS + sources: [] + # Skip already existing PolicyReportResults on startup + skipExistingOnStartup: true + # Added as additional properties to each gcs event + customFields: {} + # filter results send by namespaces, policies and priorities + filter: {} + # add additional gcs channels with different configurations and filters + channels: [] + +# required when policy-reporter runs in HA mode and you have targets configured +# if no targets are configured, leaderElection is disabled automatically +# will be enabled when replicaCount > 1 +leaderElection: + enabled: false + releaseOnCancel: true + leaseDuration: 15 + renewDeadline: 10 + retryPeriod: 2 + +# use redis as external result cache instead of the in memory cache +redis: + enabled: false + address: "" + database: 0 + prefix: "policy-reporter" + username: "" + password: "" + +# enabled if replicaCount > 1 +podDisruptionBudget: + # -- Configures the minimum available pods for policy-reporter disruptions. + # Cannot be used if `maxUnavailable` is set. + minAvailable: 1 + # -- Configures the maximum unavailable pods for policy-reporter disruptions. + # Cannot be used if `minAvailable` is set. 
+ maxUnavailable: + +# Node labels for pod assignment +# ref: https://kubernetes.io/docs/user-guide/node-selection/ +nodeSelector: {} + +# Tolerations for pod assignment +# ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ +tolerations: [] + +# Anti-affinity to disallow deploying client and master nodes on the same worker node +affinity: {} + +# Topology Spread Constraints to better spread pods +topologySpreadConstraints: [] + +# livenessProbe for policy-reporter +livenessProbe: + httpGet: + path: /healthz + port: http + +# readinessProbe for policy-reporter +readinessProbe: + httpGet: + path: /ready + port: http + +extraVolumes: + volumeMounts: [] + + volumes: [] + +# If set the volume for sqlite is freely configurable below "- name: sqlite". If no value is set an emptyDir is used. +sqliteVolume: {} + # emptyDir: + # sizeLimit: 10Mi + +# If set the volume for /tmp is freely configurable below "- name: tmp". If no value is set an emptyDir is used. +tmpVolume: {} + # emptyDir: + # sizeLimit: 10Mi + diff --git a/modules/trivy-operator/templates/values-trivy-operator-polr-adapter.yaml b/modules/trivy-operator/templates/values-trivy-operator-polr-adapter.yaml new file mode 100644 index 00000000..60dc8848 --- /dev/null +++ b/modules/trivy-operator/templates/values-trivy-operator-polr-adapter.yaml @@ -0,0 +1,108 @@ +# Sets values for https://github.com/fjogeleit/trivy-operator-polr-adapter/tree/main/charts/trivy-operator-polr-adapter/templates +replicaCount: 1 + +image: + registry: ghcr.io + repository: fjogeleit/trivy-operator-polr-adapter + pullPolicy: IfNotPresent + tag: 0.8.0 + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +crds: + install: true + +port: + name: http + number: 8080 + +livenessProbe: + httpGet: + path: /ready + port: http + +readinessProbe: + httpGet: + path: /healthz + port: http + +adapters: + vulnerabilityReports: + enabled: true + timeout: 2 + # apply labels from the source report + applyLabels: [] + 
configAuditReports: + enabled: true + timeout: 2 + applyLabels: [] + cisKubeBenchReports: + enabled: false + timeout: 2 + applyLabels: [] + complianceReports: + enabled: true + timeout: 2 + applyLabels: [] + rbacAssessmentReports: + enabled: true + timeout: 2 + applyLabels: [] + exposedSecretReports: + enabled: true + timeout: 2 + applyLabels: [] + infraAssessmentReports: + enabled: true + timeout: 2 + applyLabels: [] + clusterInfraAssessmentReports: + enabled: true + timeout: 2 + applyLabels: [] + clusterVulnerabilityReports: + enabled: true + timeout: 2 + applyLabels: [] + +rbac: + enabled: true + +podAnnotations: {} + +podLabels: {} + +podSecurityContext: + fsGroup: 1234 + +securityContext: + runAsUser: 1234 + runAsNonRoot: true + privileged: false + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + +resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +serviceAccount: + create: true + diff --git a/modules/trivy-operator/templates/values-trivy-operator.yaml b/modules/trivy-operator/templates/values-trivy-operator.yaml new file mode 100644 index 00000000..0de8324e --- /dev/null +++ b/modules/trivy-operator/templates/values-trivy-operator.yaml @@ -0,0 +1,779 @@ +# Default values for the trivy-operator Helm chart, these are used to render +# the templates into valid k8s Resources. + +# -- global values provide a centralized configuration for 'image.registry', reducing the potential for errors. +# If left blank, the chart will default to the individually set 'image.registry' values +global: + image: + registry: "" + +# -- managedBy is similar to .Release.Service but allows to overwrite the value +managedBy: Helm + +# -- targetNamespace defines where you want trivy-operator to operate. 
By +# default, it's a blank string to select all namespaces, but you can specify +# another namespace, or a comma separated list of namespaces. +targetNamespaces: "" + +# -- excludeNamespaces is a comma separated list of namespaces (or glob patterns) +# to be excluded from scanning. Only applicable in the all namespaces install +# mode, i.e. when the targetNamespaces values is a blank string. +excludeNamespaces: "" + +# -- targetWorkloads is a comma seperated list of Kubernetes workload resources +# to be included in the vulnerability and config-audit scans +# if left blank, all workload resources will be scanned +targetWorkloads: "pod,replicaset,replicationcontroller,statefulset,daemonset,cronjob,job" + +# -- nameOverride override operator name +nameOverride: "" + +# -- fullnameOverride override operator full name +fullnameOverride: "" + +operator: + # -- namespace to install the operator, defaults to the .Release.Namespace + namespace: "" + # -- replicas the number of replicas of the operator's pod + replicas: 1 + + # -- number of old history to retain to allow rollback (if not set, default Kubernetes value is set to 10) + revisionHistoryLimit: ~ + + # -- additional annotations for the operator deployment + annotations: {} + + # -- additional labels for the operator deployment + labels: {} + + # -- additional labels for the operator pod + podLabels: {} + + # -- leaderElectionId determines the name of the resource that leader election + # will use for holding the leader lock. 
+ leaderElectionId: "trivyoperator-lock" + + # -- logDevMode the flag to enable development mode (more human-readable output, extra stack traces and logging information, etc) + logDevMode: false + + # -- scanJobTTL the set automatic cleanup time after the job is completed + scanJobTTL: "" + + # -- scanSecretTTL set an automatic cleanup for scan job secrets + scanSecretTTL: "" + + # -- scanJobTimeout the length of time to wait before giving up on a scan job + scanJobTimeout: 5m + + # -- scanJobsConcurrentLimit the maximum number of scan jobs create by the operator + scanJobsConcurrentLimit: 10 + + # -- scanNodeCollectorLimit the maximum number of node collector jobs create by the operator + scanNodeCollectorLimit: 1 + + # -- scanJobsRetryDelay the duration to wait before retrying a failed scan job + scanJobsRetryDelay: 30s + + # -- the flag to enable vulnerability scanner + vulnerabilityScannerEnabled: true + # -- the flag to enable sbom generation, required for enabling ClusterVulnerabilityReports + sbomGenerationEnabled: true + # -- the flag to enable cluster sbom cache generation + clusterSbomCacheEnabled: false + # -- scannerReportTTL the flag to set how long a report should exist. "" means that the ScannerReportTTL feature is disabled + scannerReportTTL: "24h" + # -- cacheReportTTL the flag to set how long a cluster sbom report should exist. 
"" means that the cacheReportTTL feature is disabled + cacheReportTTL: "120h" + # -- configAuditScannerEnabled the flag to enable configuration audit scanner + configAuditScannerEnabled: true + # -- rbacAssessmentScannerEnabled the flag to enable rbac assessment scanner + rbacAssessmentScannerEnabled: true + # -- infraAssessmentScannerEnabled the flag to enable infra assessment scanner + infraAssessmentScannerEnabled: true + # -- clusterComplianceEnabled the flag to enable cluster compliance scanner + clusterComplianceEnabled: true + # -- batchDeleteLimit the maximum number of config audit reports deleted by the operator when the plugin's config has changed. + batchDeleteLimit: 10 + # -- vulnerabilityScannerScanOnlyCurrentRevisions the flag to only create vulnerability scans on the current revision of a deployment. + vulnerabilityScannerScanOnlyCurrentRevisions: true + # -- configAuditScannerScanOnlyCurrentRevisions the flag to only create config audit scans on the current revision of a deployment. + configAuditScannerScanOnlyCurrentRevisions: true + # -- batchDeleteDelay the duration to wait before deleting another batch of config audit reports. + batchDeleteDelay: 10s + # -- accessGlobalSecretsAndServiceAccount The flag to enable access to global secrets/service accounts to allow `vulnerability scan job` to pull images from private registries + accessGlobalSecretsAndServiceAccount: false + # -- builtInTrivyServer The flag enables the usage of built-in trivy server in cluster. It also overrides the following trivy params with built-in values + # trivy.mode = ClientServer and serverURL = http://.:4975 + builtInTrivyServer: false + # -- builtInServerRegistryInsecure is the flag to enable insecure connection from the built-in Trivy server to the registry. + builtInServerRegistryInsecure: false + # -- controllerCacheSyncTimeout the duration to wait for controller resources cache sync (default: 5m). 
+ controllerCacheSyncTimeout: "5m" + + # -- trivyServerHealthCheckCacheExpiration The flag to set the interval for trivy server health cache before it invalidate + trivyServerHealthCheckCacheExpiration: 10h + + # -- metricsFindingsEnabled the flag to enable metrics for findings + metricsFindingsEnabled: true + + # -- metricsVulnIdEnabled the flag to enable metrics about cve vulns id + # be aware of metrics cardinality is significantly increased with this feature enabled. + metricsVulnIdEnabled: false + + # -- exposedSecretScannerEnabled the flag to enable exposed secret scanner + exposedSecretScannerEnabled: true + + # -- MetricsExposedSecretInfo the flag to enable metrics about exposed secrets + # be aware of metrics cardinality is significantly increased with this feature enabled. + metricsExposedSecretInfo: false + + # -- MetricsConfigAuditInfo the flag to enable metrics about configuration audits + # be aware of metrics cardinality is significantly increased with this feature enabled. + metricsConfigAuditInfo: false + + # -- MetricsRbacAssessmentInfo the flag to enable metrics about Rbac Assessment + # be aware of metrics cardinality is significantly increased with this feature enabled. + metricsRbacAssessmentInfo: false + + # -- MetricsInfraAssessmentInfo the flag to enable metrics about Infra Assessment + # be aware of metrics cardinality is significantly increased with this feature enabled. + metricsInfraAssessmentInfo: false + + # -- MetricsImageInfo the flag to enable metrics about Image Information of scanned images + # This information has image os information including os family, name/version, and if end of service life has been reached + # be aware of metrics cardinality is significantly increased with this feature enabled. + metricsImageInfo: false + + # -- MetricsClusterComplianceInfo the flag to enable metrics about Cluster Compliance + # be aware of metrics cardinality is significantly increased with this feature enabled. 
 + metricsClusterComplianceInfo: false + + # -- serverAdditionalAnnotations the flag to set additional annotations for the trivy server pod + serverAdditionalAnnotations: {} + + # -- webhookBroadcastURL the flag to set reports should be sent to a webhook endpoint. "" means that the webhookBroadcastURL feature is disabled + webhookBroadcastURL: "" + + # -- webhookBroadcastTimeout the flag to set timeout for webhook requests if webhookBroadcastURL is enabled + webhookBroadcastTimeout: 30s + + # -- webhookBroadcastCustomHeaders the flag to set webhook endpoint sent with custom defined headers if webhookBroadcastURL is enabled + webhookBroadcastCustomHeaders: "" + + # -- webhookSendDeletedReports the flag to enable sending deleted reports if webhookBroadcastURL is enabled + webhookSendDeletedReports: false + + # -- privateRegistryScanSecretsNames is a map of namespace:secrets, secrets are comma separated and can be used to authenticate in private registries in case there are no imagePullSecrets provided, example: {"mynamespace":"mySecrets,anotherSecret"} + privateRegistryScanSecretsNames: {} + + # -- mergeRbacFindingWithConfigAudit the flag to enable merging rbac finding with config-audit report + mergeRbacFindingWithConfigAudit: false + + # -- httpProxy is the HTTP proxy used by Trivy operator to download the default policies from GitHub. + httpProxy: ~ + + # -- httpsProxy is the HTTPS proxy used by Trivy operator to download the default policies from GitHub. + httpsProxy: ~ + + # -- noProxy is a comma separated list of IPs and domain names that are not subject to proxy settings. + noProxy: ~ + + # -- valuesFromConfigMap name of a ConfigMap to apply OPERATOR_* environment variables. Will override Helm values. + valuesFromConfigMap: "" + + # -- valuesFromSecret name of a Secret to apply OPERATOR_* environment variables. Will override Helm AND ConfigMap values. 
+ valuesFromSecret: "" + +image: + registry: "ghcr.io" + repository: "aquasecurity/trivy-operator" + # -- tag is an override of the image tag, which is by default set by the + # appVersion field in Chart.yaml. + tag: "" + # -- pullPolicy set the operator pullPolicy + pullPolicy: IfNotPresent + # -- pullSecrets set the operator pullSecrets + pullSecrets: [] + +# -- service only expose a metrics endpoint for prometheus to scrape, +# trivy-operator does not have a user interface. +service: + # -- if true, the Service doesn't allocate any IP + headless: true + # -- port exposed by the Service + metricsPort: 80 + # -- annotations added to the operator's service + annotations: {} + # -- appProtocol of the monitoring service + metricsAppProtocol: TCP + # -- the Service type + type: ClusterIP + # -- the nodeport to use when service type is LoadBalancer or NodePort. If not set, Kubernetes automatically select one. + nodePort: + +# -- Prometheus ServiceMonitor configuration -- to install the trivy operator with the ServiceMonitor +# you must have Prometheus already installed and running. If you do not have Prometheus installed, enabling this will +# have no effect. +serviceMonitor: + # -- enabled determines whether a serviceMonitor should be deployed + enabled: false + # -- The namespace where Prometheus expects to find service monitors + namespace: ~ + # -- Interval at which metrics should be scraped. If not specified Prometheus’ global scrape interval is used. + interval: ~ + # -- Additional annotations for the serviceMonitor + annotations: {} + # -- Additional labels for the serviceMonitor + labels: {} + # -- HonorLabels chooses the metric’s labels on collisions with target labels + honorLabels: true + # -- EndpointAdditionalProperties allows setting additional properties on the endpoint such as relabelings, metricRelabelings etc. 
+ endpointAdditionalProperties: {} + +trivyOperator: + # -- vulnerabilityReportsPlugin the name of the plugin that generates vulnerability reports `Trivy` + vulnerabilityReportsPlugin: "Trivy" + # -- configAuditReportsPlugin the name of the plugin that generates config audit reports. + configAuditReportsPlugin: "Trivy" + # -- scanJobCompressLogs control whether scanjob output should be compressed or plain + scanJobCompressLogs: true + # -- scanJobAffinity affinity to be applied to the scanner pods and node-collector + scanJobAffinity: [] + # -- scanJobTolerations tolerations to be applied to the scanner pods so that they can run on nodes with matching taints + scanJobTolerations: [] + # -- If you do want to specify tolerations, uncomment the following lines, adjust them as necessary, and remove the + # square brackets after 'scanJobTolerations:'. + # - key: "key1" + # operator: "Equal" + # value: "value1" + # effect: "NoSchedule" + # -- scanJobNodeSelector nodeSelector to be applied to the scanner pods so that they can run on nodes with matching labels + scanJobNodeSelector: {} + # -- If you do want to specify nodeSelector, uncomment the following lines, adjust them as necessary, and remove the + # square brackets after 'scanJobNodeSelector:'. 
+ # nodeType: worker + # cpu: sandylake + # teamOwner: operators + + # -- scanJobCustomVolumesMount add custom volumes mount to the scan job + scanJobCustomVolumesMount: [] + # - name: var-lib-etcd + # mountPath: /var/lib/etcd + # readOnly: true + + # -- scanJobCustomVolumes add custom volumes to the scan job + scanJobCustomVolumes: [] + # - name: var-lib-etcd + # hostPath: + # path: /var/lib/etcd + + # -- useGCRServiceAccount the flag to enable the usage of GCR service account for scanning images in GCR + useGCRServiceAccount: true + # -- scanJobAutomountServiceAccountToken the flag to enable automount for service account token on scan job + scanJobAutomountServiceAccountToken: false + + # -- scanJobAnnotations comma-separated representation of the annotations which the user wants the scanner jobs and pods to be + # annotated with. Example: `foo=bar,env=stage` will annotate the scanner jobs and pods with the annotations `foo: bar` and `env: stage` + scanJobAnnotations: "" + + # -- scanJobPodTemplateLabels comma-separated representation of the labels which the user wants the scanner pods to be + # labeled with. Example: `foo=bar,env=stage` will labeled the scanner pods with the labels `foo: bar` and `env: stage` + scanJobPodTemplateLabels: "" + + # -- skipInitContainers when this flag is set to true, the initContainers will be skipped for the scanner and node collector pods + skipInitContainers: false + + # -- scanJobPodTemplatePodSecurityContext podSecurityContext the user wants the scanner and node collector pods to be amended with. + # Example: + # RunAsUser: 10000 + # RunAsGroup: 10000 + # RunAsNonRoot: true + scanJobPodTemplatePodSecurityContext: {} + + # -- scanJobPodTemplateContainerSecurityContext SecurityContext the user wants the scanner and node collector containers (and their + # initContainers) to be amended with. 
+ scanJobPodTemplateContainerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + # -- For filesystem scanning, Trivy needs to run as the root user + # runAsUser: 0 + + # -- scanJobPodPriorityClassName Priority class name to be set on the pods created by trivy operator jobs. This accepts a string value + scanJobPodPriorityClassName: "" + + # -- reportResourceLabels comma-separated scanned resource labels which the user wants to include in the Prometheus + # metrics report. Example: `owner,app` + reportResourceLabels: "" + + # -- reportRecordFailedChecksOnly flag is to record only failed checks on misconfiguration reports (config-audit and rbac assessment) + reportRecordFailedChecksOnly: true + + # -- skipResourceByLabels comma-separated labels keys which trivy-operator will skip scanning on resources with matching labels + skipResourceByLabels: "" + + # -- metricsResourceLabelsPrefix Prefix that will be prepended to the labels names indicated in `reportResourceLabels` + # when including them in the Prometheus metrics + metricsResourceLabelsPrefix: "k8s_label_" + + # -- additionalReportLabels comma-separated representation of the labels which the user wants the scanner pods to be + # labeled with. Example: `foo=bar,env=stage` will labeled the reports with the labels `foo: bar` and `env: stage` + additionalReportLabels: "" + + # -- policiesConfig Custom Rego Policies to be used by the config audit scanner + # See https://github.com/aquasecurity/trivy-operator/blob/main/docs/tutorials/writing-custom-configuration-audit-policies.md for more details. + policiesConfig: "" + + # -- excludeImages is comma separated glob patterns for excluding images from scanning. + # Example: pattern: `k8s.gcr.io/*/*` will exclude image: `k8s.gcr.io/coredns/coredns:v1.8.0`. 
+ excludeImages: "amazon-k8s-cni:*,amazon-k8s-cni-init:*,amazon/aws-network-policy-agent:*,eks/aws-ebs-csi-driver:*,eks/coredns:*,eks/csi-attacher:*,eks/csi-node-driver-registrar:*,eks/csi-provisioner:*,eks/csi-resizer:*,eks/csi-snapshotter:*,eks/kube-proxy:*,eks/livenessprobe:*" + +trivy: + # -- createConfig indicates whether to create config objects + createConfig: true + image: + # -- registry of the Trivy image + registry: ghcr.io + # -- repository of the Trivy image + repository: aquasecurity/trivy + # -- tag version of the Trivy image + tag: 0.53.0 + # -- imagePullSecret is the secret name to be used when pulling trivy image from private registries example : reg-secret + # It is the user responsibility to create the secret for the private registry in `trivy-operator` namespace + imagePullSecret: ~ + + # -- pullPolicy is the imge pull policy used for trivy image , valid values are (Always, Never, IfNotPresent) + pullPolicy: IfNotPresent + + # -- mode is the Trivy client mode. Either Standalone or ClientServer. Depending + # on the active mode other settings might be applicable or required. + mode: Standalone + + # -- sbomSources trivy will try to retrieve SBOM from the specified sources (oci,rekor) + sbomSources: "" + + # -- includeDevDeps include development dependencies in the report (supported: npm, yarn) (default: false) + # note: this flag is only applicable when trivy.command is set to filesystem + includeDevDeps: false + + # -- whether to use a storage class for trivy server or emptydir (one mey want to use ephemeral storage) + storageClassEnabled: true + + # -- storageClassName is the name of the storage class to be used for trivy server PVC. 
If empty, tries to find default storage class + storageClassName: "" + + # -- storageSize is the size of the trivy server PVC + storageSize: "5Gi" + + # -- labels is the extra labels to be used for trivy server statefulset + labels: {} + + # -- podLabels is the extra pod labels to be used for trivy server + podLabels: {} + + # -- priorityClassName is the name of the priority class used for trivy server + priorityClassName: "" + + # -- additionalVulnerabilityReportFields is a comma separated list of additional fields which + # can be added to the VulnerabilityReport. Supported parameters: Description, Links, CVSS, Target, Class, PackagePath and PackageType + additionalVulnerabilityReportFields: "" + + # -- httpProxy is the HTTP proxy used by Trivy to download the vulnerabilities database from GitHub. + httpProxy: ~ + + # -- httpsProxy is the HTTPS proxy used by Trivy to download the vulnerabilities database from GitHub. + httpsProxy: ~ + + # -- noProxy is a comma separated list of IPs and domain names that are not subject to proxy settings. + noProxy: ~ + + # -- Registries without SSL. There can be multiple registries with different keys. + nonSslRegistries: {} + # pocRegistry: poc.myregistry.harbor.com.pl + # qaRegistry: qa.registry.aquasec.com + # internalRegistry: registry.registry.svc:5000 + + # -- sslCertDir can be used to override the system default locations for SSL certificate files directory, example: /ssl/certs + sslCertDir: ~ + + # -- The registry to which insecure connections are allowed. There can be multiple registries with different keys. + insecureRegistries: {} + # pocRegistry: poc.myregistry.harbor.com.pl + # qaRegistry: qa.registry.aquasec.com + # internalRegistry: registry.registry.svc:5000 + + # -- Mirrored registries. There can be multiple registries with different keys. 
+ # Make sure to quote registries containing dots + registry: + mirror: {} + # "docker.io": docker-mirror.example.com + + # -- severity is a comma separated list of severity levels reported by Trivy. + severity: UNKNOWN,LOW,MEDIUM,HIGH,CRITICAL + + # -- slow this flag is to use less CPU/memory for scanning though it takes more time than normal scanning. It fits small-footprint + slow: true + # -- ignoreUnfixed is the flag to show only fixed vulnerabilities in + # vulnerabilities reported by Trivy. Set to true to enable it. + # + ignoreUnfixed: false + # -- a comma separated list of file paths for Trivy to skip + skipFiles: + # -- a comma separated list of directories for Trivy to skip + skipDirs: + + # -- offlineScan is the flag to enable the offline scan functionality in Trivy + # This will prevent outgoing HTTP requests, e.g. to search.maven.org + offlineScan: false + + # -- timeout is the duration to wait for scan completion. + timeout: "5m0s" + + # -- ignoreFile can be used to tell Trivy to ignore vulnerabilities by ID (one per line) + ignoreFile: ~ + # ignoreFile: + # - CVE-1970-0001 + # - CVE-1970-0002 + + # -- ignorePolicy can be used to tell Trivy to ignore vulnerabilities by a policy + # If multiple policies would match, then the most specific one has precedence over the others. + # See https://aquasecurity.github.io/trivy/latest/docs/configuration/filtering/#by-open-policy-agent for more details. + # See https://github.com/aquasecurity/trivy/blob/v0.19.2/contrib/example_policy/basic.rego for more details on ignorePolicy filtering. 
+ # + # ignorePolicy.application.my-app-.: | + # package trivy + + # import data.lib.trivy + + # default ignore = true + # applies to all workloads in namespace "application" with the name pattern "my-app-*" + # ignorePolicy.kube-system: | + # package trivy + + # import data.lib.trivy + + # default ignore = true + # applies to all workloads in namespace "kube-system" + # ignorePolicy: | + # package trivy + + # import data.lib.trivy + + # default ignore = true + # applies to all other workloads + + # -- vulnType can be used to tell Trivy to filter vulnerabilities by a pkg-type (library, os) + vulnType: ~ + + # -- resources resource requests and limits for scan job containers + resources: + requests: + cpu: 100m + memory: 100M + # ephemeralStorage: "2Gi" + limits: + cpu: 500m + memory: 500M + # ephemeralStorage: "2Gi" + + # -- githubToken is the GitHub access token used by Trivy to download the vulnerabilities + # database from GitHub. Only applicable in Standalone mode. + githubToken: ~ + + # -- serverURL is the endpoint URL of the Trivy server. Required in ClientServer mode. + # + # serverURL: "https://trivy.trivy:4975" + + # -- clientServerSkipUpdate is the flag to enable skip databases update for Trivy client. + # Only applicable in ClientServer mode. + clientServerSkipUpdate: false + + # -- skipJavaDBUpdate is the flag to enable skip Java index databases update for Trivy client. + skipJavaDBUpdate: false + + # -- serverInsecure is the flag to enable insecure connection to the Trivy server. + serverInsecure: false + + # -- serverToken is the token to authenticate Trivy client with Trivy server. Only + # applicable in ClientServer mode. + serverToken: ~ + + # -- existingSecret if a secret containing gitHubToken, serverToken or serverCustomHeaders has been created outside the chart (e.g external-secrets, sops, etc...). 
+ # Keys must be at least one of the following: trivy.githubToken, trivy.serverToken, trivy.serverCustomHeaders + # Overrides trivy.gitHubToken, trivy.serverToken, trivy.serverCustomHeaders values. + # Note: The secret has to be named "trivy-operator-trivy-config". + # existingSecret: true + + # -- serverTokenHeader is the name of the HTTP header used to send the authentication + # token to Trivy server. Only application in ClientServer mode when + # trivy.serverToken is specified. + serverTokenHeader: "Trivy-Token" + + # -- serverCustomHeaders is a comma separated list of custom HTTP headers sent by + # Trivy client to Trivy server. Only applicable in ClientServer mode. + serverCustomHeaders: ~ + # serverCustomHeaders: "foo=bar" + + dbRegistry: "ghcr.io" + dbRepository: "aquasecurity/trivy-db" + + # -- The username for dbRepository authentication + # + dbRepositoryUsername: ~ + + # -- The password for dbRepository authentication + # + dbRepositoryPassword: ~ + + # -- javaDbRegistry is the registry for the Java vulnerability database. + javaDbRegistry: "ghcr.io" + javaDbRepository: "aquasecurity/trivy-java-db" + + # -- The Flag to enable insecure connection for downloading trivy-db via proxy (air-gaped env) + # + dbRepositoryInsecure: "false" + + # -- The Flag to enable the usage of builtin rego policies by default, these policies are downloaded by default from ghcr.io/aquasecurity/trivy-checks + # + useBuiltinRegoPolicies: "true" + # -- The Flag to enable the usage of external rego policies config-map, this should be used when the user wants to use their own rego policies + # + externalRegoPoliciesEnabled: false + # -- To enable the usage of embedded rego policies, set the flag useEmbeddedRegoPolicies. This should serve as a fallback for air-gapped environments. + # When useEmbeddedRegoPolicies is set to true, useBuiltinRegoPolicies should be set to false. 
+ useEmbeddedRegoPolicies: "false" + + # -- The Flag is the list of supported kinds separated by comma delimiter to be scanned by the config audit scanner + # + supportedConfigAuditKinds: "Workload,Service,Role,ClusterRole,NetworkPolicy,Ingress,LimitRange,ResourceQuota" + + # -- command. One of `image`, `filesystem` or `rootfs` scanning, depending on the target type required for the scan. + # For 'filesystem' and `rootfs` scanning, ensure that the `trivyOperator.scanJobPodTemplateContainerSecurityContext` is configured + # to run as the root user (runAsUser = 0). + command: image + # -- imageScanCacheDir the flag to set custom path for trivy image scan `cache-dir` parameter. + # Only applicable in image scan mode. + imageScanCacheDir: "/tmp/trivy/.cache" + # -- filesystemScanCacheDir the flag to set custom path for trivy filesystem scan `cache-dir` parameter. + # Only applicable in filesystem scan mode. + filesystemScanCacheDir: "/var/trivyoperator/trivy-db" + # -- serverUser this param is the server user to be used to download db from private registry + serverUser: "" + # -- serverPassword this param is the server user to be used to download db from private registry + serverPassword: "" + # -- serverServiceName this param is the server service name to be used in cluster + serverServiceName: "trivy-service" + # -- debug One of `true` or `false`. Enables debug mode. 
+ debug: false + + server: + # -- resources set trivy-server resource + resources: + requests: + cpu: 200m + memory: 512Mi + # ephemeral-storage: "2Gi" + limits: + cpu: 1 + memory: 1Gi + # ephemeral-storage: "2Gi" + + # -- podSecurityContext set trivy-server podSecurityContext + podSecurityContext: + runAsUser: 65534 + runAsNonRoot: true + fsGroup: 65534 + + # -- securityContext set trivy-server securityContext + securityContext: + privileged: false + readOnlyRootFilesystem: true + + # -- the number of replicas of the trivy-server + replicas: 1 + + # -- vaulesFromConfigMap name of a ConfigMap to apply TRIVY_* environment variables. Will override Helm values. + valuesFromConfigMap: "" + + # -- valuesFromSecret name of a Secret to apply TRIVY_* environment variables. Will override Helm AND ConfigMap values. + valuesFromSecret: "" + +compliance: + # -- failEntriesLimit the flag to limit the number of fail entries per control check in the cluster compliance detail report + # this limit is for preventing the report from being too large per control checks + failEntriesLimit: 10 + # -- reportType this flag control the type of report generated (summary or all) + reportType: summary + # -- cron this flag control the cron interval for compliance report generation + # At minute 0 past every 6th hour. + cron: 0 */6 * * * + # -- specs is a list of compliance specs to be used by the cluster compliance scanner + # - k8s-cis-1.23 + # - k8s-nsa-1.0 + # - k8s-pss-baseline-0.1 + # - k8s-pss-restricted-0.1 + # - eks-cis-1.4 + # - rke2-cis-1.24 + specs: + - k8s-cis-1.23 + - k8s-nsa-1.0 + - k8s-pss-baseline-0.1 + - k8s-pss-restricted-0.1 + +rbac: + create: true +serviceAccount: + # -- Specifies whether a service account should be created. + create: true + annotations: {} + # -- name specifies the name of the k8s Service Account. If not set and create is + # true, a name is generated using the fullname template. 
+ name: "" + +# -- podAnnotations annotations added to the operator's pod +podAnnotations: {} + +podSecurityContext: {} + # fsGroup: 2000 + +# -- securityContext security context +securityContext: + privileged: false + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + +volumeMounts: + # do not remove , required for policies bundle + - mountPath: /tmp + name: cache-policies + readOnly: false + +volumes: + # do not remove , required for policies bundle + - name: cache-policies + emptyDir: {} + +resources: + limits: + cpu: 1 + memory: 1Gi + requests: + cpu: 100m + memory: 128Mi +# -- nodeSelector set the operator nodeSelector +nodeSelector: {} + +# -- tolerations set the operator tolerations +tolerations: [] + +# -- affinity set the operator affinity +affinity: {} + +# -- priorityClassName set the operator priorityClassName +priorityClassName: "" + + # -- automountServiceAccountToken the flag to enable automount for service account token +automountServiceAccountToken: true + +policiesBundle: + # -- registry of the policies bundle + registry: ghcr.io + # -- repository of the policies bundle + repository: aquasecurity/trivy-checks + # -- tag version of the policies bundle + tag: 0 + # -- registryUser is the user for the registry + registryUser: ~ + # -- registryPassword is the password for the registry + registryPassword: ~ + # -- existingSecret if a secret containing registry credentials that have been created outside the chart (e.g external-secrets, sops, etc...). + # Keys must be at least one of the following: policies.bundle.oci.user, policies.bundle.oci.password + # Overrides policiesBundle.registryUser, policiesBundle.registryPassword values. + # Note: The secret has to be named "trivy-operator". 
+ existingSecret: false + # -- insecure is the flag to enable insecure connection to the policy bundle registry + insecure: false + + +nodeCollector: + # -- useNodeSelector determine if to use nodeSelector (by auto detecting node name) with node-collector scan job + useNodeSelector: true + # -- registry of the node-collector image + registry: ghcr.io + # -- repository of the node-collector image + repository: aquasecurity/node-collector + # -- tag version of the node-collector image + tag: 0.3.1 + # -- imagePullSecret is the secret name to be used when pulling node-collector image from private registries example : reg-secret + # It is the user responsibility to create the secret for the private registry in `trivy-operator` namespace + imagePullSecret: ~ + # -- excludeNodes comma-separated node labels that the node-collector job should exclude from scanning (example kubernetes.io/arch=arm64,team=dev) + excludeNodes: + # -- tolerations to be applied to the node-collector so that they can run on nodes with matching taints + tolerations: [] + # -- If you do want to specify tolerations, uncomment the following lines, adjust them as necessary, and remove the + # square brackets after 'scanJobTolerations:'. 
+ # - key: "key1" + # operator: "Equal" + # value: "value1" + # effect: "NoSchedule" + # -- node-collector pod volume mounts definition for collecting config files information + volumeMounts: + - name: var-lib-etcd + mountPath: /var/lib/etcd + readOnly: true + - name: var-lib-kubelet + mountPath: /var/lib/kubelet + readOnly: true + - name: var-lib-kube-scheduler + mountPath: /var/lib/kube-scheduler + readOnly: true + - name: var-lib-kube-controller-manager + mountPath: /var/lib/kube-controller-manager + readOnly: true + - name: etc-systemd + mountPath: /etc/systemd + readOnly: true + - name: lib-systemd + mountPath: /lib/systemd/ + readOnly: true + - name: etc-kubernetes + mountPath: /etc/kubernetes + readOnly: true + - name: etc-cni-netd + mountPath: /etc/cni/net.d/ + readOnly: true + + # -- node-collector pod volumes definition for collecting config files information + volumes: + - name: var-lib-etcd + hostPath: + path: /var/lib/etcd + - name: var-lib-kubelet + hostPath: + path: /var/lib/kubelet + - name: var-lib-kube-scheduler + hostPath: + path: /var/lib/kube-scheduler + - name: var-lib-kube-controller-manager + hostPath: + path: /var/lib/kube-controller-manager + - name: etc-systemd + hostPath: + path: /etc/systemd + - name: lib-systemd + hostPath: + path: /lib/systemd + - name: etc-kubernetes + hostPath: + path: /etc/kubernetes + - name: etc-cni-netd + hostPath: + path: /etc/cni/net.d/ + diff --git a/modules/trivy-operator/trivy-operator-dashboard.png b/modules/trivy-operator/trivy-operator-dashboard.png new file mode 100644 index 00000000..e9733a0f Binary files /dev/null and b/modules/trivy-operator/trivy-operator-dashboard.png differ diff --git a/modules/internal-k8-infra/versions.tf b/modules/trivy-operator/versions.tf similarity index 66% rename from modules/internal-k8-infra/versions.tf rename to modules/trivy-operator/versions.tf index aae1e3f6..00cbb0b3 100644 --- a/modules/internal-k8-infra/versions.tf +++ b/modules/trivy-operator/versions.tf @@ -8,10 
+8,6 @@ terraform { source = "hashicorp/kubernetes" version = "~> 2.0" } - spotinst = { - source = "spotinst/spotinst" - version = "1.172.0" # Specify the version you wish to use - } helm = { source = "hashicorp/helm" version = "~> 2.0" diff --git a/modules/variables.tf b/modules/variables.tf new file mode 100644 index 00000000..86d56d73 --- /dev/null +++ b/modules/variables.tf @@ -0,0 +1,6 @@ +variable "git_branch" { + description = "Branch to deploy" + type = string + # TODO: Migrate to using "main" here + default = "ibcdpe-1007-monitoring" +} diff --git a/modules/victoria-metrics/README.md b/modules/victoria-metrics/README.md new file mode 100644 index 00000000..c68c010c --- /dev/null +++ b/modules/victoria-metrics/README.md @@ -0,0 +1,94 @@ +# Purpose +This module is used to deploy the victoria metrics k8s helm chart. + + +This module is an All-in-one solution to start monitoring a kubernetes cluster. + + +It installs multiple dependency charts like grafana, node-exporter, kube-state-metrics +and victoria-metrics-operator. Also it installs Custom Resources like VMSingle, +VMCluster, VMAgent, VMAlert. + + +When installed this module will add a time series database (Victoria Metrics) along with +the required agents to support scraping for prometheus metrics from a number of data +sources. + +Resources: + +- + +## Description +This deployment is running in single-server mode. It also may be updated to run in +cluster mode, however, there are many additional considerations for cluster mode. That +includes the need to run multiple nodes, a different data storage medium (Preventing +easily swapping from single -> cluster) + +## Screenshots of the tool + +Default grafana dashboards: + +![Default grafana dashboards](./grafana-dashboards.png) + +
+ +Cluster resource utilization: + +![Cluster resource utilization](./cluster-resource-utilization.png) + +
+ +Victoria metrics UI to explore prometheus metrics: + +![Victoria metrics UI to explore prometheus metrics](./victoria-metrics-ui.png) + +
+
+## Creating a custom service scrape
+When adding more services to the k8s cluster you will want to determine if the service
+supports prometheus metric collection. If it does, you will want to add a scrape config
+that instructs VM to collect metric data from it. For example, this terraform config is
+used within the trivy operator to export metric data:
+
+```
+resource "kubernetes_manifest" "vmservicescrape" {
+  manifest = {
+    apiVersion = "operator.victoriametrics.com/v1beta1"
+    kind       = "VMServiceScrape"
+    metadata = {
+      name      = "trivy-vmservicescrape"
+      namespace = kubernetes_namespace.trivy-system.metadata[0].name
+    }
+    spec = {
+      endpoints = [
+        {
+          port = "metrics"
+        }
+      ]
+      selector = {
+        matchLabels = {
+          "app.kubernetes.io/name" = "trivy-operator"
+        }
+      }
+    }
+  }
+}
+```
+
+## Adding more grafana dashboards from grafana.com
+Within the `values.yaml` defined within the templates file you may add the IDs of
+additional dashboards you'd like to install with the grafana deployment. Add them to
+`grafana.dashboards.default`.
+
+## Accessing the grafana dashboards
+Access to the dashboards is only currently supported by setting up a port-forward
+through kubectl commands. Find the `grafana` pod running in the `victoria-metrics`
+namespace and start a port-forward session. The default admin password is stored as a
+secret named `victoria-metrics-k8s-stack-grafana`.
+
+### Future work
+- Update the module to allow passing in additional dashboards as a variable.
+- Update the module and `values.yaml` to allow creating dashboards via JSON configuration.
+- Implement a backup mechanism or move to a managed instance of the time series database.
+- User accounts for access to resources +- Integration with a secret storage backend (Like vault) to handle rotating secrets \ No newline at end of file diff --git a/modules/victoria-metrics/cluster-resource-utilization.png b/modules/victoria-metrics/cluster-resource-utilization.png new file mode 100644 index 00000000..a5b156b4 Binary files /dev/null and b/modules/victoria-metrics/cluster-resource-utilization.png differ diff --git a/modules/victoria-metrics/grafana-dashboards.png b/modules/victoria-metrics/grafana-dashboards.png new file mode 100644 index 00000000..d5545979 Binary files /dev/null and b/modules/victoria-metrics/grafana-dashboards.png differ diff --git a/modules/victoria-metrics/main.tf b/modules/victoria-metrics/main.tf new file mode 100644 index 00000000..a0297653 --- /dev/null +++ b/modules/victoria-metrics/main.tf @@ -0,0 +1,18 @@ +resource "kubernetes_namespace" "victoria-metrics" { + metadata { + name = "victoria-metrics" + } +} + +resource "helm_release" "victoria-metrics" { + name = "victoria-metrics-k8s-stack" + repository = "https://victoriametrics.github.io/helm-charts/" + chart = "victoria-metrics-k8s-stack" + namespace = "victoria-metrics" + version = "0.24.3" + depends_on = [ + kubernetes_namespace.victoria-metrics + ] + + values = [templatefile("${path.module}/templates/values.yaml", {})] +} diff --git a/modules/victoria-metrics/templates/values.yaml b/modules/victoria-metrics/templates/values.yaml new file mode 100644 index 00000000..3c22de72 --- /dev/null +++ b/modules/victoria-metrics/templates/values.yaml @@ -0,0 +1,1150 @@ +nameOverride: "" +fullnameOverride: "" +tenant: "0" +# -- If this chart is used in "Argocd" with "releaseName" field then +# -- VMServiceScrapes couldn't select the proper services. +# -- For correct working need set value 'argocdReleaseOverride=$ARGOCD_APP_NAME' +argocdReleaseOverride: "" + +# -- victoria-metrics-operator dependency chart configuration. 
+# -- For possible values refer to https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-operator#parameters +# -- also checkout here possible ENV variables to configure operator behaviour https://docs.victoriametrics.com/operator/vars.html +victoria-metrics-operator: + enabled: true + # -- Tells helm to clean up vm cr resources when uninstalling + cleanupCRD: true + cleanupImage: + repository: bitnami/kubectl + # use image tag that matches k8s API version by default + # tag: 1.29.6 + pullPolicy: IfNotPresent + + createCRD: false # we disable crd creation by operator chart as we create them in this chart + operator: + # -- By default, operator converts prometheus-operator objects. + disable_prometheus_converter: false + +serviceAccount: + # -- Specifies whether a service account should be created + create: true + # -- Annotations to add to the service account + annotations: {} + # -- The name of the service account to use. + # -- If not set and create is true, a name is generated using the fullname template + name: "" + +## -- Create default rules for monitoring the cluster +defaultRules: + create: true + + # -- Common properties for VMRule groups + group: + spec: + # -- Optional HTTP URL parameters added to each rule request + params: {} + + # -- Common properties for VMRules + rule: + spec: + # -- Additional labels for VMRule alerts + labels: {} + # -- Additional annotations for VMRule alerts + annotations: {} + + # -- Per rule properties + rules: {} + # CPUThrottlingHigh: + # create: true + # spec: + # for: 15m + # labels: + # severity: critical + groups: + etcd: + create: true + # -- Common properties for all rules in a group + rules: {} + # spec: + # annotations: + # dashboard: https://example.com/dashboard/1 + general: + create: true + rules: {} + k8sContainerMemoryRss: + create: true + rules: {} + k8sContainerMemoryCache: + create: true + rules: {} + k8sContainerCpuUsageSecondsTotal: + create: true + rules: {} + k8sPodOwner: + 
create: true + rules: {} + k8sContainerResource: + create: true + rules: {} + k8sContainerMemoryWorkingSetBytes: + create: true + rules: {} + k8sContainerMemorySwap: + create: true + rules: {} + kubeApiserver: + create: true + rules: {} + kubeApiserverAvailability: + create: true + rules: {} + kubeApiserverBurnrate: + create: true + rules: {} + kubeApiserverHistogram: + create: true + rules: {} + kubeApiserverSlos: + create: true + rules: {} + kubelet: + create: true + rules: {} + kubePrometheusGeneral: + create: true + rules: {} + kubePrometheusNodeRecording: + create: true + rules: {} + kubernetesApps: + create: true + rules: {} + targetNamespace: ".*" + kubernetesResources: + create: true + rules: {} + kubernetesStorage: + create: true + rules: {} + targetNamespace: ".*" + kubernetesSystem: + create: true + rules: {} + kubernetesSystemKubelet: + create: true + rules: {} + kubernetesSystemApiserver: + create: true + rules: {} + kubernetesSystemControllerManager: + create: true + rules: {} + kubeScheduler: + create: true + rules: {} + kubernetesSystemScheduler: + create: true + rules: {} + kubeStateMetrics: + create: true + rules: {} + nodeNetwork: + create: true + rules: {} + node: + create: true + rules: {} + vmagent: + create: true + rules: {} + vmsingle: + create: true + rules: {} + vmcluster: + create: true + rules: {} + vmHealth: + create: true + rules: {} + alertmanager: + create: false + rules: {} + + # -- Runbook url prefix for default rules + runbookUrl: https://runbooks.prometheus-operator.dev/runbooks + + # -- Labels for default rules + labels: {} + # -- Annotations for default rules + annotations: {} + +## -- Create default dashboards +defaultDashboardsEnabled: true + +## -- Create experimental dashboards +experimentalDashboardsEnabled: true + +## -- Create dashboards as CRDs (reuqires grafana-operator to be installed) +grafanaOperatorDashboardsFormat: + enabled: false + instanceSelector: + matchLabels: + dashboards: "grafana" + 
allowCrossNamespaceImport: false + +# Provide custom recording or alerting rules to be deployed into the cluster. +additionalVictoriaMetricsMap: +# rule-name: +# groups: +# - name: my_group +# rules: +# - record: my_record +# expr: 100 * my_record + +externalVM: + read: + url: "" + # bearerTokenSecret: + # name: dbaas-read-access-token + # key: bearerToken + write: + url: "" + # bearerTokenSecret: + # name: dbaas-read-access-token + # key: bearerToken + +############## + +# -- Configures vmsingle params +vmsingle: + annotations: {} + enabled: true + # spec for VMSingle crd + # https://docs.victoriametrics.com/operator/api.html#vmsinglespec + spec: + image: + tag: v1.102.0 + # -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention) + retentionPeriod: "1" + replicaCount: 1 + extraArgs: {} + storage: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vmsingle.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmsingle-ingress-tls + # hosts: + # - vmsingle.domain.com + +vmcluster: + enabled: false + annotations: {} + # spec for VMCluster crd + # https://docs.victoriametrics.com/operator/api.html#vmclusterspec + spec: + # -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention) + retentionPeriod: "1" + replicationFactor: 2 + vmstorage: + image: + tag: v1.102.0-cluster + replicaCount: 2 + storageDataPath: "/vm-data" + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 10Gi + resources: + {} + # limits: + # cpu: "1" + # memory: 1500Mi + vmselect: + image: + tag: v1.102.0-cluster + replicaCount: 2 + cacheMountPath: "/select-cache" + extraArgs: {} + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 2Gi + resources: + {} + # limits: + # cpu: "1" + # memory: "1000Mi" + # requests: + # cpu: "0.5" + # memory: "500Mi" + vminsert: + image: + tag: v1.102.0-cluster + replicaCount: 2 + extraArgs: {} + resources: + {} + # limits: + # cpu: "1" + # memory: 1000Mi + # requests: + # cpu: "0.5" + # memory: "500Mi" + + ingress: + storage: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for 
k8s > 1.19 + pathType: Prefix + + hosts: + - vmstorage.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmstorage-ingress-tls + # hosts: + # - vmstorage.domain.com + select: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vmselect.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmselect-ingress-tls + # hosts: + # - vmselect.domain.com + insert: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vminsert.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vminsert-ingress-tls + # hosts: + # - vminsert.domain.com + +alertmanager: + enabled: false + annotations: {} + # spec for VMAlertmanager crd + # https://docs.victoriametrics.com/operator/api.html#vmalertmanagerspec + spec: + selectAllByDefault: true + image: + tag: v0.25.0 + externalURL: "" + routePrefix: / + + # if this one defined, it will be used for alertmanager configuration and config parameter will be ignored + # configSecret: "alertmanager-config" + + config: + templates: + - "/etc/vm/configs/**/*.tmpl" + route: + # group_by: ["alertgroup", "job"] + # group_wait: 30s + # group_interval: 5m + # repeat_interval: 12h + receiver: "blackhole" + ## routes: + ################################################### + ## Duplicate code_owner routes to 
teams + ## These will send alerts to team channels but continue + ## processing through the rest of the tree to handled by on-call + # - matchers: + # - code_owner_channel!="" + # - severity=~"info|warning|critical" + # group_by: ["code_owner_channel", "alertgroup", "job"] + # receiver: slack-code-owners + # ################################################### + # ## Standard on-call routes + # - matchers: + # - severity=~"info|warning|critical" + # receiver: slack-monitoring + # continue: true + + # inhibit_rules: + # - target_matchers: + # - severity=~"warning|info" + # source_matchers: + # - severity=critical + # equal: + # - cluster + # - namespace + # - alertname + # - target_matchers: + # - severity=info + # source_matchers: + # - severity=warning + # equal: + # - cluster + # - namespace + # - alertname + # - target_matchers: + # - severity=info + # source_matchers: + # - alertname=InfoInhibitor + # equal: + # - cluster + # - namespace + + receivers: + - name: blackhole + # - name: "slack-monitoring" + # slack_configs: + # - channel: "#channel" + # send_resolved: true + # title: '{{ template "slack.monzo.title" . }}' + # icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' + # color: '{{ template "slack.monzo.color" . }}' + # text: '{{ template "slack.monzo.text" . }}' + # actions: + # - type: button + # text: "Runbook :green_book:" + # url: "{{ (index .Alerts 0).Annotations.runbook_url }}" + # - type: button + # text: "Query :mag:" + # url: "{{ (index .Alerts 0).GeneratorURL }}" + # - type: button + # text: "Dashboard :grafana:" + # url: "{{ (index .Alerts 0).Annotations.dashboard }}" + # - type: button + # text: "Silence :no_bell:" + # url: '{{ template "__alert_silence_link" . }}' + # - type: button + # text: '{{ template "slack.monzo.link_button_text" . 
}}' + # url: "{{ .CommonAnnotations.link_url }}" + # - name: slack-code-owners + # slack_configs: + # - channel: "#{{ .CommonLabels.code_owner_channel }}" + # send_resolved: true + # title: '{{ template "slack.monzo.title" . }}' + # icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' + # color: '{{ template "slack.monzo.color" . }}' + # text: '{{ template "slack.monzo.text" . }}' + # actions: + # - type: button + # text: "Runbook :green_book:" + # url: "{{ (index .Alerts 0).Annotations.runbook }}" + # - type: button + # text: "Query :mag:" + # url: "{{ (index .Alerts 0).GeneratorURL }}" + # - type: button + # text: "Dashboard :grafana:" + # url: "{{ (index .Alerts 0).Annotations.dashboard }}" + # - type: button + # text: "Silence :no_bell:" + # url: '{{ template "__alert_silence_link" . }}' + # - type: button + # text: '{{ template "slack.monzo.link_button_text" . }}' + # url: "{{ .CommonAnnotations.link_url }}" + # + # better alert templates for slack + # source https://gist.github.com/milesbxf/e2744fc90e9c41b47aa47925f8ff6512 + monzoTemplate: + enabled: true + + # extra alert templates + templateFiles: + {} + # template_1.tmpl: |- + # {{ define "hello" -}} + # hello, Victoria! + # {{- end }} + # template_2.tmpl: "" + + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - alertmanager.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: alertmanager-ingress-tls + # hosts: + # - alertmanager.domain.com + +vmalert: + annotations: {} + enabled: false + + # Controls whether VMAlert should use VMAgent or VMInsert as a target for remotewrite + remoteWriteVMAgent: false + # spec for VMAlert crd + # https://docs.victoriametrics.com/operator/api.html#vmalertspec + spec: + selectAllByDefault: true + image: + tag: v1.102.0 + evaluationInterval: 15s + + # External labels to add to all generated recording rules and alerts + externalLabels: {} + + # extra vmalert annotation templates + templateFiles: + {} + # template_1.tmpl: |- + # {{ define "hello" -}} + # hello, Victoria! + # {{- end }} + # template_2.tmpl: "" + + ## additionalNotifierConfigs allows to configure static notifiers, discover notifiers via Consul and DNS, + ## see specification in https://docs.victoriametrics.com/vmalert/#notifier-configuration-file. + ## This configuration will be created as separate secret and mounted to vmalert pod. + additionalNotifierConfigs: {} + # dns_sd_configs: + # - names: + # - my.domain.com + # type: 'A' + # port: 9093 + + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vmalert.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmalert-ingress-tls + # hosts: + # - vmalert.domain.com + +vmagent: + enabled: true + annotations: {} + # https://docs.victoriametrics.com/operator/api.html#vmagentremotewritespec + # defined spec will be added to the remoteWrite configuration of VMAgent + additionalRemoteWrites: + [] + #- url: http://some-remote-write/api/v1/write + # spec for VMAgent crd + # https://docs.victoriametrics.com/operator/api.html#vmagentspec + spec: + selectAllByDefault: true + image: + tag: v1.102.0 + scrapeInterval: 30s + externalLabels: {} + # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source. + # For example: + # cluster: cluster-name + extraArgs: + promscrape.streamParse: "true" + # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent + # but makes vmagent debugging UI less informative. See: https://docs.victoriametrics.com/vmagent/#relabel-debug + promscrape.dropOriginalLabels: "true" + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vmagent.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmagent-ingress-tls + # hosts: + # - vmagent.domain.com + +################################################# +### dependencies ##### +################################################# +# Grafana dependency chart configuration. For possible values refer to https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration +grafana: + enabled: true + ## all values for grafana helm chart can be specified here + sidecar: + datasources: + enabled: true + initDatasources: true + createVMReplicasDatasources: false + # JSON options for VM datasources + # See https://grafana.com/docs/grafana/latest/administration/provisioning/#json-data + jsonData: {} + # timeInterval: "1m" + dashboards: + additionalDashboardLabels: {} + additionalDashboardAnnotations: {} + enabled: true + multicluster: false + + ## ForceDeployDatasource Create datasource configmap even if grafana deployment has been disabled + forceDeployDatasource: false + + ## Configure additional grafana datasources (passed through tpl) + ## ref: http://docs.grafana.org/administration/provisioning/#datasources + additionalDataSources: [] + # - name: prometheus-sample + # access: proxy + # basicAuth: true + # basicAuthPassword: pass + # basicAuthUser: daco + # editable: false + # jsonData: + # tlsSkipVerify: true + # orgId: 1 + # type: prometheus + # url: https://{{ printf "%s-prometheus.svc" .Release.Name }}:9090 + # version: 1 + + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: "default" + orgId: 1 + folder: "" + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/default + + dashboards: + default: + nodeexporter: + gnetId: 1860 + revision: 22 + datasource: 
VictoriaMetrics + trivyoperator: + gnetId: 17813 + revision: 2 + datasource: VictoriaMetrics + + defaultDashboardsTimezone: utc + + # Enabling VictoriaMetrics Datasource in Grafana. See more details here: https://github.com/VictoriaMetrics/grafana-datasource/blob/main/README.md#victoriametrics-datasource-for-grafana + # Note that Grafana will need internet access to install the datasource plugin. + # Uncomment the block below, if you want to enable VictoriaMetrics Datasource in Grafana: + #plugins: + # - "https://github.com/VictoriaMetrics/grafana-datasource/releases/download/v0.5.0/victoriametrics-datasource-v0.5.0.zip;victoriametrics-datasource" + #grafana.ini: + # plugins: + # # Why VictoriaMetrics datasource is unsigned: https://github.com/VictoriaMetrics/grafana-datasource/blob/main/README.md#why-victoriametrics-datasource-is-unsigned + # allow_loading_unsigned_plugins: victoriametrics-datasource + + # Change datasource type in dashboards from Prometheus to VictoriaMetrics. + # you can use `victoriametrics-datasource` instead of `prometheus` if enabled VictoriaMetrics Datasource above + defaultDatasourceType: "prometheus" + + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - grafana.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: grafana-ingress-tls + # hosts: + # - grafana.domain.com + + vmServiceScrape: + # whether we should create a service scrape resource for grafana + enabled: true + + # spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + spec: {} + +# prometheus-node-exporter dependency chart configuration. For possible values refer to https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml +prometheus-node-exporter: + enabled: true + + ## all values for prometheus-node-exporter helm chart can be specified here + podLabels: + ## Add the 'node-exporter' label to be used by serviceMonitor to match standard common usage in rules and grafana dashboards + ## + jobLabel: node-exporter + extraArgs: + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) + - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$ + + vmServiceScrape: + # whether we should create a service scrape resource for node-exporter + enabled: true + + # spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + spec: + jobLabel: jobLabel + endpoints: + - port: metrics + metricRelabelConfigs: + - action: drop + source_labels: [mountpoint] + regex: "/var/lib/kubelet/pods.+" +# kube-state-metrics dependency chart configuration. 
For possible values refer to https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-state-metrics/values.yaml +kube-state-metrics: + enabled: true + ## all values for kube-state-metrics helm chart can be specified here + + # spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + vmServiceScrape: + spec: {} + + #TODO: selector override for kube-state-metrics deployed separately + +################################################# +### Service Monitors ##### +################################################# +## Component scraping the kubelets +kubelet: + enabled: true + + # -- Enable scraping /metrics/cadvisor from kubelet's service + cadvisor: true + # -- Enable scraping /metrics/probes from kubelet's service + probes: true + # spec for VMNodeScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmnodescrapespec + spec: + scheme: "https" + honorLabels: true + interval: "30s" + scrapeTimeout: "5s" + tlsConfig: + insecureSkipVerify: true + caFile: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token" + # drop high cardinality label and useless metrics for cadvisor and kubelet + metricRelabelConfigs: + - action: labeldrop + regex: (uid) + - action: labeldrop + regex: (id|name) + - action: drop + source_labels: [__name__] + regex: (rest_client_request_duration_seconds_bucket|rest_client_request_duration_seconds_sum|rest_client_request_duration_seconds_count) + relabelConfigs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - sourceLabels: [__metrics_path__] + targetLabel: metrics_path + - targetLabel: "job" + replacement: "kubelet" + # ignore timestamps of cadvisor's metrics by default + # more info here https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1656540535 + honorTimestamps: false +# -- Component scraping the kube api server +kubeApiServer: + enabled: true + # 
spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: https + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + jobLabel: component + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + component: apiserver + provider: kubernetes + +# -- Component scraping the kube controller manager +kubeControllerManager: + enabled: true + + ## If your kube controller manager is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + ## If using kubeControllerManager.endpoints only the port and targetPort are used + ## + service: + enabled: true + port: 10257 + targetPort: 10257 + # selector: + # component: kube-controller-manager + + # spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + spec: + jobLabel: jobLabel + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + +# -- Component scraping kubeDns. Use either this or coreDns +kubeDns: + enabled: false + service: + enabled: false + dnsmasq: + port: 10054 + targetPort: 10054 + skydns: + port: 10055 + targetPort: 10055 + selector: + k8s-app: kube-dns + # spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + spec: + endpoints: + - port: http-metrics-dnsmasq + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + - port: http-metrics-skydns + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + +# -- Component scraping coreDns. 
Use either this or kubeDns +coreDns: + enabled: true + service: + enabled: true + port: 9153 + targetPort: 9153 + selector: + k8s-app: kube-dns + + # spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + spec: + jobLabel: jobLabel + endpoints: + - port: http-metrics + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + +## Component scraping etcd +## +kubeEtcd: + enabled: true + + ## If your etcd is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + ## Etcd service. If using kubeEtcd.endpoints only the port and targetPort are used + ## + service: + enabled: true + port: 2379 + targetPort: 2379 + # selector: + # component: etcd + + # spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + spec: + jobLabel: jobLabel + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +## Component scraping kube scheduler +## +kubeScheduler: + enabled: true + + ## If your kube scheduler is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + ## If using kubeScheduler.endpoints only the port and targetPort are used + ## + service: + enabled: true + port: 10259 + targetPort: 10259 + # selector: + # component: kube-scheduler + + # spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + spec: + jobLabel: jobLabel + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +## Component scraping kube proxy +## 
+kubeProxy: + enabled: false + + ## If your kube proxy is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + service: + enabled: true + port: 10249 + targetPort: 10249 + # selector: + # k8s-app: kube-proxy + + # spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + spec: + jobLabel: jobLabel + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +## install vm operator crds +crds: + enabled: true + +## install prometheus operator crds +prometheus-operator-crds: + enabled: false + +# -- Add extra objects dynamically to this chart +extraObjects: [] + diff --git a/modules/victoria-metrics/versions.tf b/modules/victoria-metrics/versions.tf new file mode 100644 index 00000000..00cbb0b3 --- /dev/null +++ b/modules/victoria-metrics/versions.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.0" + } + helm = { + source = "hashicorp/helm" + version = "~> 2.0" + } + } +} diff --git a/modules/victoria-metrics/victoria-metrics-ui.png b/modules/victoria-metrics/victoria-metrics-ui.png new file mode 100644 index 00000000..a18e41ef Binary files /dev/null and b/modules/victoria-metrics/victoria-metrics-ui.png differ