[PLINT-543] Add support for environment metrics and tagging by environment, deployment, and release (#19192)

* Add support for environment metrics

* Add config for autodiscovery

* Add support for environments via discovery

* Add environment, release, and deployment tags

* Add caching for environments, releases, and deployments

* add to caddyfile

* remove env fallback

* change environments cache to dictionary

* use tuple for deployment

* Add pagination for environments

* update caddyfile

* update environments config

* sync models

* Filter by environment

* only collect deploy tasks

* update caddyfile
sarah-witt authored Dec 20, 2024
1 parent e0312a0 commit 02785fd
Showing 42 changed files with 1,876 additions and 182 deletions.
51 changes: 51 additions & 0 deletions octopus_deploy/assets/configuration/spec.yaml
@@ -187,6 +187,57 @@ files:
- name: interval
type: integer
example: {}
- name: environments
display_priority: 6
description: |
Optional configuration to select which environments are processed. If not configured,
all environments will be processed.
The 'include' key lists the regular expressions of the environments for which metrics are
to be reported.
Environments are processed in the order the 'include' patterns are listed: once an environment
matches an 'include' pattern, it is handled there and is not reprocessed by any later 'include'
it might also match.
The 'exclude' key lists the regular expressions of the environments for which metrics
are not to be reported.
Excludes take priority over includes: an environment that matches an exclude is not
processed even if it also matches an include. The 'exclude' key can only be used together with 'include'.
The 'limit' key caps the number of environments processed, to avoid a combinatorial explosion of
tags associated with a metric.
The 'interval' key sets how long the most recently fetched list of environments remains valid.
If 'interval' is not set, the list of environments is fetched on every check run and is not cached.
In the following example, only the environment named 'prod' will be collected.
environments:
include:
- 'prod'
value:
type: object
properties:
- name: limit
description: |
Maximum number of environments to be processed.
type: integer
- name: include
type: array
items:
anyOf:
- type: string
- type: object
- name: exclude
type: array
items:
type: string
- name: interval
type: integer
example: {}
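
To make these options concrete, a hypothetical instance configuration combining them might look like the sketch below (the environment names and values are invented for illustration; 'prod' is matched by the first pattern, and anything matching 'staging-legacy' is dropped even though 'staging-.*' would include it):

    environments:
      limit: 10
      include:
        - 'prod'
        - 'staging-.*'
      exclude:
        - 'staging-legacy'
      interval: 600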
- name: paginated_limit
description: |
Sets the number of items API calls should return at a time. Default is 30.
141 changes: 118 additions & 23 deletions octopus_deploy/datadog_checks/octopus_deploy/check.py
@@ -5,6 +5,7 @@
import datetime
from collections.abc import Iterable

from cachetools import TTLCache
from requests.exceptions import ConnectionError, HTTPError, InvalidURL, Timeout

from datadog_checks.base import AgentCheck
@@ -15,6 +16,9 @@

from .config_models import ConfigMixin

TTL_CACHE_MAXSIZE = 50
TTL_CACHE_TTL = 3600

EVENT_TO_ALERT_TYPE = {
'MachineHealthy': 'success',
'MachineUnhealthy': 'warning',
@@ -41,6 +45,10 @@ def __init__(self, name, init_config, instances):
self._project_groups_discovery = {}
self._default_projects_discovery = {}
self._projects_discovery = {}
self._environments_discovery = {}
self._environments_cache = {}
self._deployments_cache = TTLCache(maxsize=TTL_CACHE_MAXSIZE, ttl=TTL_CACHE_TTL)
self._releases_cache = TTLCache(maxsize=TTL_CACHE_MAXSIZE, ttl=TTL_CACHE_TTL)
self._base_tags = self.instance.get("tags", [])
self.collect_events = self.instance.get("collect_events", False)
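
For context, cachetools.TTLCache behaves like a size- and time-bounded dictionary: entries are evicted once the cache holds more than maxsize items or once they are older than ttl seconds. A minimal standalone sketch of that behavior (not part of the check itself):

    import time

    from cachetools import TTLCache

    cache = TTLCache(maxsize=50, ttl=2)  # entries expire 2 seconds after insertion
    cache["Releases-1"] = "1.0.3"
    assert cache.get("Releases-1") == "1.0.3"  # fresh entry: cache hit
    time.sleep(2.1)
    assert cache.get("Releases-1") is None  # expired entry: forces a refetch

Note that _environments_cache is a plain dict populated by _process_environments on each run, while releases and deployments, which do not change once created, get the TTL caches.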

@@ -185,6 +193,7 @@ def _process_spaces(self):
tags = self._base_tags + [f'space_id:{space_id}', f'space_name:{space_name}']
self.gauge("space.count", 1, tags=tags)
self.log.debug("Processing space %s", space_name)
self._process_environments(space_id, space_name)
self._process_project_groups(
space_id, space_name, space_config.get("project_groups") if space_config else None
)
@@ -239,7 +248,7 @@ def _process_projects(self, space_id, space_name, project_group_id, project_grou
f"api/{space_id}/projectgroups/{project_group_id}/projects"
).get('Items', [])
]
self.log.debug("Monitoring %s Projects", len(projects))
self.log.debug("Monitoring %s Projects for %s in %s", len(projects), project_group_name, space_name)
for _, _, project, _ in projects:
project_id = project.get("Id")
project_name = project.get("Name")
@@ -253,15 +262,58 @@ def _process_projects(self, space_id, space_name, project_group_id, project_grou
self._process_queued_and_running_tasks(space_id, space_name, project_id, project_name)
self._process_completed_tasks(space_id, space_name, project_id, project_name)

def _process_environments(self, space_id, space_name):
if self.config.environments:
self._init_environments_discovery(space_id)
environments = list(self._environments_discovery[space_id].get_items())
else:
environments = [
(None, environment.get("Name"), environment, None)
for environment in self._process_paginated_endpoint(f"api/{space_id}/environments").get('Items', [])
]

self.log.debug("Collecting %s environments for %s", len(environments), space_name)

for _, _, environment, _ in environments:
environment_name = environment.get("Name")
environment_slug = environment.get("Slug")
environment_id = environment.get("Id")
self._environments_cache[environment_id] = environment_name
use_guided_failure = int(environment.get("UseGuidedFailure", False))
allow_dynamic_infrastructure = int(environment.get("AllowDynamicInfrastructure", False))

tags = self._base_tags + [
f"space_name:{space_name}",
f"environment_name:{environment_name}",
f"environment_id:{environment_id}",
f"environment_slug:{environment_slug}",
]
self.gauge("environment.count", 1, tags=tags)
self.gauge("environment.use_guided_failure", use_guided_failure, tags=tags)
self.gauge("environment.allow_dynamic_infrastructure", allow_dynamic_infrastructure, tags=tags)

def _init_environments_discovery(self, space_id):
self.log.info("Default Environments discovery: %s", self.config.environments)
if space_id not in self._environments_discovery:
self._environments_discovery[space_id] = Discovery(
lambda: self._process_paginated_endpoint(f"api/{space_id}/environments").get('Items', []),
limit=self.config.environments.limit,
include=normalize_discover_config_include(self.config.environments),
exclude=self.config.environments.exclude,
interval=self.config.environments.interval,
key=lambda environment: environment.get("Name"),
)
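
Discovery here comes from datadog_checks.base; its internals are not shown in this diff, but the include-order and exclude-priority semantics described in the config spec can be sketched in plain Python (a simplified illustration of the documented rules, not the actual class):

    import re

    def filter_names(names, include=None, exclude=None, limit=None):
        # No include list: everything is selected.
        if include is None:
            selected = list(names)
        else:
            selected, seen = [], set()
            for pattern in include:  # earlier patterns win; a name is matched at most once
                for name in names:
                    if name not in seen and re.search(pattern, name):
                        seen.add(name)
                        selected.append(name)
        if exclude:  # excludes take priority over includes
            selected = [n for n in selected if not any(re.search(p, n) for p in exclude)]
        return selected[:limit] if limit else selected

    # Keeps 'prod' and 'staging-1'; drops 'staging-legacy' despite the 'staging-.*' include.
    print(filter_names(["prod", "staging-1", "staging-legacy"],
                       include=['prod', 'staging-.*'], exclude=['staging-legacy']))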

def _process_queued_and_running_tasks(self, space_id, space_name, project_id, project_name):
self.log.debug("Collecting running and queued tasks for project %s", project_name)
-        params = {'project': project_id, 'states': ["Queued", "Executing"]}
+        params = {'name': 'Deploy', 'project': project_id, 'states': ["Queued", "Executing"]}
response_json = self._process_paginated_endpoint(f"api/{space_id}/tasks", params)
self._process_tasks(space_id, space_name, project_name, response_json.get('Items', []))
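
Because 'states' carries a list, requests encodes it as a repeated query parameter. A quick sketch of the resulting URL (host and IDs invented):

    from requests import Request

    params = {'name': 'Deploy', 'project': 'Projects-1', 'states': ["Queued", "Executing"]}
    url = Request('GET', 'http://localhost:80/api/Spaces-1/tasks', params=params).prepare().url
    print(url)  # ...tasks?name=Deploy&project=Projects-1&states=Queued&states=Executing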

def _process_completed_tasks(self, space_id, space_name, project_id, project_name):
self.log.debug("Collecting completed tasks for project %s", project_name)
params = {
'name': 'Deploy',
'project': project_id,
'fromCompletedDate': self._from_completed_time,
'toCompletedDate': self._to_completed_time,
@@ -303,27 +355,70 @@ def _process_tasks(self, space_id, space_name, project_name, tasks_json):
task_name = task.get("Name")
server_node = task.get("ServerNode")
task_state = task.get("State")
-            tags = self._base_tags + [
-                f'space_name:{space_name}',
-                f'project_name:{project_name}',
-                f'task_id:{task_id}',
-                f'task_name:{task_name}',
-                f'task_state:{task_state}',
-                f'server_node:{server_node}',
-            ]
-            self.log.debug("Processing task id %s for project %s", task_id, project_name)
-            queued_time, executing_time, completed_time = self._calculate_task_times(task)
-            self.gauge("deployment.count", 1, tags=tags)
-            self.gauge("deployment.queued_time", queued_time, tags=tags)
-            if executing_time != -1:
-                self.gauge("deployment.executing_time", executing_time, tags=tags)
-
-            if completed_time != -1:
-                self.gauge("deployment.completed_time", completed_time, tags=tags)
-
-            if self.logs_enabled:
-                self.log.debug("Collecting logs for task %s, id: %s", task_name, task_id)
-                self._collect_deployment_logs(space_id, task_id, tags)
+            deployment_id = task.get("Arguments", {}).get("DeploymentId")
+            environment_name, deployment_tags = self._get_deployment_tags(space_id, deployment_id)
+            if environment_name in self._environments_cache.values():
+                tags = (
+                    self._base_tags
+                    + deployment_tags
+                    + [
+                        f'space_name:{space_name}',
+                        f'project_name:{project_name}',
+                        f'task_state:{task_state}',
+                        f'server_node:{server_node}',
+                    ]
+                )
+                self.log.debug("Processing task id %s for project %s", task_id, project_name)
+                queued_time, executing_time, completed_time = self._calculate_task_times(task)
+                self.gauge("deployment.count", 1, tags=tags)
+                self.gauge("deployment.queued_time", queued_time, tags=tags)
+                if executing_time != -1:
+                    self.gauge("deployment.executing_time", executing_time, tags=tags)
+
+                if completed_time != -1:
+                    self.gauge("deployment.completed_time", completed_time, tags=tags)
+
+                if self.logs_enabled:
+                    self.log.debug("Collecting logs for task %s, id: %s", task_name, task_id)
+                    self._collect_deployment_logs(space_id, task_id, tags)
+            else:
+                self.log.debug(
+                    "Skipping task id: %s for project %s in skipped environment: %s",
+                    task_id,
+                    project_name,
+                    environment_name,
+                )

def _get_deployment_tags(self, space_id, deployment_id):
self.log.debug("Getting deployment tags for deployment id: %s", deployment_id)
cached_deployment = self._deployments_cache.get(deployment_id)

if cached_deployment is not None:
release_version = cached_deployment[0]
environment_name = cached_deployment[1]
else:
self.log.debug("Cached deployment not found for deployment id: %s", deployment_id)
deployment = self._process_endpoint(f"api/{space_id}/deployments/{deployment_id}")
release_id = deployment.get("ReleaseId")
environment_id = deployment.get("EnvironmentId")
environment_name = self._environments_cache.get(environment_id)
release_version = self._releases_cache.get(release_id)
if release_version is None:
self.log.debug(
"Cached release not found for deployment id: %s, release id: %s", deployment_id, release_id
)
release = self._process_endpoint(f"api/{space_id}/releases/{release_id}")
release_version = release.get("Version")
self._releases_cache[release_id] = release_version

self._deployments_cache[deployment_id] = (release_version, environment_name)

tags = [
f'deployment_id:{deployment_id}',
f'release_version:{release_version}',
f'environment_name:{environment_name}',
]
return environment_name, tags
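
The effect of the two caches is that each deployment costs at most one extra API call, and each release is fetched once per TTL window no matter how many deployments reference it. A rough standalone sketch of the same memoization pattern (stub endpoint and invented IDs, not the check's real methods):

    from cachetools import TTLCache

    calls = {"n": 0}

    def fetch(path):  # stand-in for _process_endpoint
        calls["n"] += 1
        return {"ReleaseId": "Releases-1", "EnvironmentId": "Environments-1", "Version": "1.0.3"}

    deployments = TTLCache(maxsize=50, ttl=3600)
    releases = TTLCache(maxsize=50, ttl=3600)

    def deployment_tags(deployment_id):
        if deployment_id not in deployments:
            dep = fetch(f"api/Spaces-1/deployments/{deployment_id}")
            release_id = dep["ReleaseId"]
            if release_id not in releases:
                releases[release_id] = fetch(f"api/Spaces-1/releases/{release_id}")["Version"]
            deployments[deployment_id] = (releases[release_id], dep["EnvironmentId"])
        return deployments[deployment_id]

    deployment_tags("Deployments-1")  # 2 fetches: deployment + release
    deployment_tags("Deployments-2")  # 1 fetch: release already cached
    deployment_tags("Deployments-1")  # 0 fetches: deployment cached
    print(calls["n"])  # 3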

def _collect_server_nodes_metrics(self):
self.log.debug("Collecting server node metrics.")
@@ -29,6 +29,17 @@ class AuthToken(BaseModel):
writer: Optional[MappingProxyType[str, Any]] = None


class Environments(BaseModel):
model_config = ConfigDict(
arbitrary_types_allowed=True,
frozen=True,
)
exclude: Optional[tuple[str, ...]] = None
include: Optional[tuple[Union[str, MappingProxyType[str, Any]], ...]] = None
interval: Optional[int] = None
limit: Optional[int] = Field(None, description='Maximum number of environments to be processed.\n')
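
Because the generated model is frozen, a validated config object rejects later mutation. A small sketch using the Environments model above (assuming pydantic v2, which the ConfigDict/frozen style implies):

    from pydantic import ValidationError

    env = Environments(include=('prod', 'staging-.*'), limit=10)
    print(env.limit)  # 10
    try:
        env.limit = 5  # frozen=True makes instances immutable
    except ValidationError as e:
        print("mutation rejected:", e.errors()[0]["type"])  # frozen_instance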


class MetricPatterns(BaseModel):
model_config = ConfigDict(
arbitrary_types_allowed=True,
@@ -96,6 +107,7 @@ class InstanceConfig(BaseModel):
connect_timeout: Optional[float] = None
disable_generic_tags: Optional[bool] = None
empty_default_hostname: Optional[bool] = None
environments: Optional[Environments] = None
extra_headers: Optional[MappingProxyType[str, Any]] = None
headers: Optional[MappingProxyType[str, Any]] = None
kerberos_auth: Optional[str] = None
@@ -51,6 +51,37 @@ instances:
#
- octopus_endpoint: http://localhost:80/api

## @param environments - mapping - optional
## Optional configuration to select which environments are processed. If not configured,
## all environments will be processed.
##
## The 'include' key lists the regular expressions of the environments for which metrics are
## to be reported.
##
## Environments are processed in the order the 'include' patterns are listed: once an environment
## matches an 'include' pattern, it is handled there and is not reprocessed by any later 'include'
## it might also match.
##
## The 'exclude' key lists the regular expressions of the environments for which metrics
## are not to be reported.
## Excludes take priority over includes: an environment that matches an exclude is not
## processed even if it also matches an include. The 'exclude' key can only be used together with 'include'.
##
## The 'limit' key caps the number of environments processed, to avoid a combinatorial explosion of
## tags associated with a metric.
##
## The 'interval' key sets how long the most recently fetched list of environments remains valid.
## If 'interval' is not set, the list of environments is fetched on every check run and is not cached.
##
## In the following example, only the environment named 'prod' will be collected.
##
## environments:
## include:
## - 'prod'
#
# environments: {}

## @param headers - mapping - optional
## Headers to use for every request. An Authorization header including the Octopus Deploy API key token is required
## for authentication for the REST API.
3 changes: 3 additions & 0 deletions octopus_deploy/metadata.csv
@@ -4,6 +4,9 @@ octopus_deploy.deployment.completed_time,gauge,,second,,Duration of deployment.,
octopus_deploy.deployment.count,gauge,,,,Number of deployments monitored.,-1,octopus_deploy,octopus_deploy deploy count,,
octopus_deploy.deployment.executing_time,gauge,,second,,How long the deployment has been executing.,-1,octopus_deploy,octopus_deploy deploy dur,,
octopus_deploy.deployment.queued_time,gauge,,second,,Time deployment was in queue.,-1,octopus_deploy,octopus_deploy deploy queue,,
octopus_deploy.environment.allow_dynamic_infrastructure,gauge,,,,Whether or not the environment allows dynamic infrastructure.,-1,octopus_deploy,octopus_deploy env infra,,
octopus_deploy.environment.count,gauge,,,,Number of environments discovered.,-1,octopus_deploy,octopus_deploy env count,,
octopus_deploy.environment.use_guided_failure,gauge,,,,Whether or not the environment is in guided failure mode.,-1,octopus_deploy,octopus_deploy env guided failure,,
octopus_deploy.project.count,gauge,,,,Number of projects discovered.,-1,octopus_deploy,octopus_deploy projects count,,
octopus_deploy.project_group.count,gauge,,,,Number of project groups discovered.,-1,octopus_deploy,octopus_deploy project group count,,
octopus_deploy.server_node.count,gauge,,,,Number of Octopus server nodes discovered.,-1,octopus_deploy,octopus_deploy server count,,
