[PLINT-543] Add support for environment metrics and tagging by environment, deployment, and release (#19192)

* Add support for environment metrics

* Add config for autodiscovery

* Add support for environments via discovery

* Add environment, release, and deployment tags

* Add caching for environments, releases, and deployments

* add to caddyfile

* remove env fallback

* change environments cache to dictionary

* use tuple for deployment

* Add pagination for environments

* update caddyfile

* update environments config

* sync models

* Filter by environment

* only collect deploy tasks

* update caddyfile
sarah-witt authored Dec 20, 2024
1 parent e0312a0 commit 02785fd
Showing 42 changed files with 1,876 additions and 182 deletions.
51 changes: 51 additions & 0 deletions octopus_deploy/assets/configuration/spec.yaml
@@ -187,6 +187,57 @@ files:
- name: interval
type: integer
example: {}
- name: environments
display_priority: 6
description: |
Optional configuration to select which environments are processed. If not configured,
all environments will be processed.
The 'include' key lists the regular expressions of the environments for which metrics are
to be reported.
Environments are processed in the order the 'include' patterns are listed: once an environment
matches an 'include' pattern, it is handled there and is not reprocessed by any later 'include'
it might also match.
The 'exclude' key lists the regular expressions of the environments for which metrics
are not to be reported.
Excludes take priority over includes: an environment that matches an exclude is not
processed even if it also matches an include. The 'exclude' key can only be used together with 'include'.
The 'limit' key caps the number of environments processed, to avoid a combinatorial explosion of
tags associated with a metric.
The 'interval' key sets how long the most recently fetched list of environments remains valid.
If 'interval' is not set, the list of environments is fetched on every check run and is not cached.
In the following example, only the environment named 'prod' will be collected.
environments:
include:
- 'prod'
value:
type: object
properties:
- name: limit
description: |
Maximum number of environments to be processed.
type: integer
- name: include
type: array
items:
anyOf:
- type: string
- type: object
- name: exclude
type: array
items:
type: string
- name: interval
type: integer
example: {}
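
To make these options concrete, a hypothetical instance configuration combining them might look like the sketch below (the environment names and values are invented for illustration; 'prod' is matched by the first pattern, and anything matching 'staging-legacy' is dropped even though 'staging-.*' would include it):

    environments:
      limit: 10
      include:
        - 'prod'
        - 'staging-.*'
      exclude:
        - 'staging-legacy'
      interval: 600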
- name: paginated_limit
description: |
Sets the number of items API calls should return at a time. Default is 30.
141 changes: 118 additions & 23 deletions octopus_deploy/datadog_checks/octopus_deploy/check.py
@@ -5,6 +5,7 @@
import datetime
from collections.abc import Iterable

from cachetools import TTLCache
from requests.exceptions import ConnectionError, HTTPError, InvalidURL, Timeout

from datadog_checks.base import AgentCheck
@@ -15,6 +16,9 @@

from .config_models import ConfigMixin

TTL_CACHE_MAXSIZE = 50
TTL_CACHE_TTL = 3600

EVENT_TO_ALERT_TYPE = {
'MachineHealthy': 'success',
'MachineUnhealthy': 'warning',
@@ -41,6 +45,10 @@ def __init__(self, name, init_config, instances):
self._project_groups_discovery = {}
self._default_projects_discovery = {}
self._projects_discovery = {}
self._environments_discovery = {}
self._environments_cache = {}
self._deployments_cache = TTLCache(maxsize=TTL_CACHE_MAXSIZE, ttl=TTL_CACHE_TTL)
self._releases_cache = TTLCache(maxsize=TTL_CACHE_MAXSIZE, ttl=TTL_CACHE_TTL)
self._base_tags = self.instance.get("tags", [])
self.collect_events = self.instance.get("collect_events", False)
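
For context, cachetools.TTLCache behaves like a size- and time-bounded dictionary: entries are evicted once the cache holds more than maxsize items or once they are older than ttl seconds. A minimal standalone sketch of that behavior (not part of the check itself):

    import time

    from cachetools import TTLCache

    cache = TTLCache(maxsize=50, ttl=2)  # entries expire 2 seconds after insertion
    cache["Releases-1"] = "1.0.3"
    assert cache.get("Releases-1") == "1.0.3"  # fresh entry: cache hit
    time.sleep(2.1)
    assert cache.get("Releases-1") is None  # expired entry: forces a refetch

Note that _environments_cache is a plain dict populated by _process_environments on each run, while releases and deployments, which do not change once created, get the TTL caches.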

@@ -185,6 +193,7 @@ def _process_spaces(self):
tags = self._base_tags + [f'space_id:{space_id}', f'space_name:{space_name}']
self.gauge("space.count", 1, tags=tags)
self.log.debug("Processing space %s", space_name)
self._process_environments(space_id, space_name)
self._process_project_groups(
space_id, space_name, space_config.get("project_groups") if space_config else None
)
@@ -239,7 +248,7 @@ def _process_projects(self, space_id, space_name, project_group_id, project_grou
f"api/{space_id}/projectgroups/{project_group_id}/projects"
).get('Items', [])
]
self.log.debug("Monitoring %s Projects", len(projects))
self.log.debug("Monitoring %s Projects for %s in %s", len(projects), project_group_name, space_name)
for _, _, project, _ in projects:
project_id = project.get("Id")
project_name = project.get("Name")
@@ -253,15 +262,58 @@ def _process_projects(self, space_id, space_name, project_group_id, project_grou
self._process_queued_and_running_tasks(space_id, space_name, project_id, project_name)
self._process_completed_tasks(space_id, space_name, project_id, project_name)

def _process_environments(self, space_id, space_name):
if self.config.environments:
self._init_environments_discovery(space_id)
environments = list(self._environments_discovery[space_id].get_items())
else:
environments = [
(None, environment.get("Name"), environment, None)
for environment in self._process_paginated_endpoint(f"api/{space_id}/environments").get('Items', [])
]

self.log.debug("Collecting %s environments for %s", len(environments), space_name)

for _, _, environment, _ in environments:
environment_name = environment.get("Name")
environment_slug = environment.get("Slug")
environment_id = environment.get("Id")
self._environments_cache[environment_id] = environment_name
use_guided_failure = int(environment.get("UseGuidedFailure", False))
allow_dynamic_infrastructure = int(environment.get("AllowDynamicInfrastructure", False))

tags = self._base_tags + [
f"space_name:{space_name}",
f"environment_name:{environment_name}",
f"environment_id:{environment_id}",
f"environment_slug:{environment_slug}",
]
self.gauge("environment.count", 1, tags=tags)
self.gauge("environment.use_guided_failure", use_guided_failure, tags=tags)
self.gauge("environment.allow_dynamic_infrastructure", allow_dynamic_infrastructure, tags=tags)

def _init_environments_discovery(self, space_id):
self.log.info("Default Environments discovery: %s", self.config.environments)
if space_id not in self._environments_discovery:
self._environments_discovery[space_id] = Discovery(
lambda: self._process_paginated_endpoint(f"api/{space_id}/environments").get('Items', []),
limit=self.config.environments.limit,
include=normalize_discover_config_include(self.config.environments),
exclude=self.config.environments.exclude,
interval=self.config.environments.interval,
key=lambda environment: environment.get("Name"),
)
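
Discovery here comes from datadog_checks.base; its internals are not shown in this diff, but the include-order and exclude-priority semantics described in the config spec can be sketched in plain Python (a simplified illustration of the documented rules, not the actual class):

    import re

    def filter_names(names, include=None, exclude=None, limit=None):
        # No include list: everything is selected.
        if include is None:
            selected = list(names)
        else:
            selected, seen = [], set()
            for pattern in include:  # earlier patterns win; a name is matched at most once
                for name in names:
                    if name not in seen and re.search(pattern, name):
                        seen.add(name)
                        selected.append(name)
        if exclude:  # excludes take priority over includes
            selected = [n for n in selected if not any(re.search(p, n) for p in exclude)]
        return selected[:limit] if limit else selected

    # Keeps 'prod' and 'staging-1'; drops 'staging-legacy' despite the 'staging-.*' include.
    print(filter_names(["prod", "staging-1", "staging-legacy"],
                       include=['prod', 'staging-.*'], exclude=['staging-legacy']))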

def _process_queued_and_running_tasks(self, space_id, space_name, project_id, project_name):
self.log.debug("Collecting running and queued tasks for project %s", project_name)
-        params = {'project': project_id, 'states': ["Queued", "Executing"]}
+        params = {'name': 'Deploy', 'project': project_id, 'states': ["Queued", "Executing"]}
response_json = self._process_paginated_endpoint(f"api/{space_id}/tasks", params)
self._process_tasks(space_id, space_name, project_name, response_json.get('Items', []))
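
Because 'states' carries a list, requests encodes it as a repeated query parameter. A quick sketch of the resulting URL (host and IDs invented):

    from requests import Request

    params = {'name': 'Deploy', 'project': 'Projects-1', 'states': ["Queued", "Executing"]}
    url = Request('GET', 'http://localhost:80/api/Spaces-1/tasks', params=params).prepare().url
    print(url)  # ...tasks?name=Deploy&project=Projects-1&states=Queued&states=Executing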

def _process_completed_tasks(self, space_id, space_name, project_id, project_name):
self.log.debug("Collecting completed tasks for project %s", project_name)
params = {
'name': 'Deploy',
'project': project_id,
'fromCompletedDate': self._from_completed_time,
'toCompletedDate': self._to_completed_time,
@@ -303,27 +355,70 @@ def _process_tasks(self, space_id, space_name, project_name, tasks_json):
task_name = task.get("Name")
server_node = task.get("ServerNode")
task_state = task.get("State")
-            tags = self._base_tags + [
-                f'space_name:{space_name}',
-                f'project_name:{project_name}',
-                f'task_id:{task_id}',
-                f'task_name:{task_name}',
-                f'task_state:{task_state}',
-                f'server_node:{server_node}',
-            ]
-            self.log.debug("Processing task id %s for project %s", task_id, project_name)
-            queued_time, executing_time, completed_time = self._calculate_task_times(task)
-            self.gauge("deployment.count", 1, tags=tags)
-            self.gauge("deployment.queued_time", queued_time, tags=tags)
-            if executing_time != -1:
-                self.gauge("deployment.executing_time", executing_time, tags=tags)
-
-            if completed_time != -1:
-                self.gauge("deployment.completed_time", completed_time, tags=tags)
-
-            if self.logs_enabled:
-                self.log.debug("Collecting logs for task %s, id: %s", task_name, task_id)
-                self._collect_deployment_logs(space_id, task_id, tags)
+            deployment_id = task.get("Arguments", {}).get("DeploymentId")
+            environment_name, deployment_tags = self._get_deployment_tags(space_id, deployment_id)
+            if environment_name in self._environments_cache.values():
+                tags = (
+                    self._base_tags
+                    + deployment_tags
+                    + [
+                        f'space_name:{space_name}',
+                        f'project_name:{project_name}',
+                        f'task_state:{task_state}',
+                        f'server_node:{server_node}',
+                    ]
+                )
+                self.log.debug("Processing task id %s for project %s", task_id, project_name)
+                queued_time, executing_time, completed_time = self._calculate_task_times(task)
+                self.gauge("deployment.count", 1, tags=tags)
+                self.gauge("deployment.queued_time", queued_time, tags=tags)
+                if executing_time != -1:
+                    self.gauge("deployment.executing_time", executing_time, tags=tags)
+
+                if completed_time != -1:
+                    self.gauge("deployment.completed_time", completed_time, tags=tags)
+
+                if self.logs_enabled:
+                    self.log.debug("Collecting logs for task %s, id: %s", task_name, task_id)
+                    self._collect_deployment_logs(space_id, task_id, tags)
+            else:
+                self.log.debug(
+                    "Skipping task id: %s for project %s in skipped environment: %s",
+                    task_id,
+                    project_name,
+                    environment_name,
+                )

def _get_deployment_tags(self, space_id, deployment_id):
self.log.debug("Getting deployment tags for deployment id: %s", deployment_id)
cached_deployment = self._deployments_cache.get(deployment_id)

if cached_deployment is not None:
release_version = cached_deployment[0]
environment_name = cached_deployment[1]
else:
self.log.debug("Cached deployment not found for deployment id: %s", deployment_id)
deployment = self._process_endpoint(f"api/{space_id}/deployments/{deployment_id}")
release_id = deployment.get("ReleaseId")
environment_id = deployment.get("EnvironmentId")
environment_name = self._environments_cache.get(environment_id)
release_version = self._releases_cache.get(release_id)
if release_version is None:
self.log.debug(
"Cached release not found for deployment id: %s, release id: %s", deployment_id, release_id
)
release = self._process_endpoint(f"api/{space_id}/releases/{release_id}")
release_version = release.get("Version")
self._releases_cache[release_id] = release_version

self._deployments_cache[deployment_id] = (release_version, environment_name)

tags = [
f'deployment_id:{deployment_id}',
f'release_version:{release_version}',
f'environment_name:{environment_name}',
]
return environment_name, tags
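
The effect of the two caches is that each deployment costs at most one extra API call, and each release is fetched once per TTL window no matter how many deployments reference it. A rough standalone sketch of the same memoization pattern (stub endpoint and invented IDs, not the check's real methods):

    from cachetools import TTLCache

    calls = {"n": 0}

    def fetch(path):  # stand-in for _process_endpoint
        calls["n"] += 1
        return {"ReleaseId": "Releases-1", "EnvironmentId": "Environments-1", "Version": "1.0.3"}

    deployments = TTLCache(maxsize=50, ttl=3600)
    releases = TTLCache(maxsize=50, ttl=3600)

    def deployment_tags(deployment_id):
        if deployment_id not in deployments:
            dep = fetch(f"api/Spaces-1/deployments/{deployment_id}")
            release_id = dep["ReleaseId"]
            if release_id not in releases:
                releases[release_id] = fetch(f"api/Spaces-1/releases/{release_id}")["Version"]
            deployments[deployment_id] = (releases[release_id], dep["EnvironmentId"])
        return deployments[deployment_id]

    deployment_tags("Deployments-1")  # 2 fetches: deployment + release
    deployment_tags("Deployments-2")  # 1 fetch: release already cached
    deployment_tags("Deployments-1")  # 0 fetches: deployment cached
    print(calls["n"])  # 3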

def _collect_server_nodes_metrics(self):
self.log.debug("Collecting server node metrics.")
@@ -29,6 +29,17 @@ class AuthToken(BaseModel):
writer: Optional[MappingProxyType[str, Any]] = None


class Environments(BaseModel):
model_config = ConfigDict(
arbitrary_types_allowed=True,
frozen=True,
)
exclude: Optional[tuple[str, ...]] = None
include: Optional[tuple[Union[str, MappingProxyType[str, Any]], ...]] = None
interval: Optional[int] = None
limit: Optional[int] = Field(None, description='Maximum number of environments to be processed.\n')
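
Because the generated model is frozen, a validated config object rejects later mutation. A small sketch using the Environments model above (assuming pydantic v2, which the ConfigDict/frozen style implies):

    from pydantic import ValidationError

    env = Environments(include=('prod', 'staging-.*'), limit=10)
    print(env.limit)  # 10
    try:
        env.limit = 5  # frozen=True makes instances immutable
    except ValidationError as e:
        print("mutation rejected:", e.errors()[0]["type"])  # frozen_instance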


class MetricPatterns(BaseModel):
model_config = ConfigDict(
arbitrary_types_allowed=True,
@@ -96,6 +107,7 @@ class InstanceConfig(BaseModel):
connect_timeout: Optional[float] = None
disable_generic_tags: Optional[bool] = None
empty_default_hostname: Optional[bool] = None
environments: Optional[Environments] = None
extra_headers: Optional[MappingProxyType[str, Any]] = None
headers: Optional[MappingProxyType[str, Any]] = None
kerberos_auth: Optional[str] = None
@@ -51,6 +51,37 @@ instances:
#
- octopus_endpoint: http://localhost:80/api

## @param environments - mapping - optional
## Optional configuration to select which environments are processed. If not configured,
## all environments will be processed.
##
## The 'include' key lists the regular expressions of the environments for which metrics are
## to be reported.
##
## Environments are processed in the order the 'include' patterns are listed: once an environment
## matches an 'include' pattern, it is handled there and is not reprocessed by any later 'include'
## it might also match.
##
## The 'exclude' key lists the regular expressions of the environments for which metrics
## are not to be reported.
## Excludes take priority over includes: an environment that matches an exclude is not
## processed even if it also matches an include. The 'exclude' key can only be used together with 'include'.
##
## The 'limit' key caps the number of environments processed, to avoid a combinatorial explosion of
## tags associated with a metric.
##
## The 'interval' key sets how long the most recently fetched list of environments remains valid.
## If 'interval' is not set, the list of environments is fetched on every check run and is not cached.
##
## In the following example, only the environment named 'prod' will be collected.
##
## environments:
## include:
## - 'prod'
#
# environments: {}

## @param headers - mapping - optional
## Headers to use for every request. An Authorization header including the Octopus Deploy API key token is required
## for authentication for the REST API.
3 changes: 3 additions & 0 deletions octopus_deploy/metadata.csv
@@ -4,6 +4,9 @@ octopus_deploy.deployment.completed_time,gauge,,second,,Duration of deployment.,
octopus_deploy.deployment.count,gauge,,,,Number of deployments monitored.,-1,octopus_deploy,octopus_deploy deploy count,,
octopus_deploy.deployment.executing_time,gauge,,second,,How long the deployment has been executing.,-1,octopus_deploy,octopus_deploy deploy dur,,
octopus_deploy.deployment.queued_time,gauge,,second,,Time deployment was in queue.,-1,octopus_deploy,octopus_deploy deploy queue,,
octopus_deploy.environment.allow_dynamic_infrastructure,gauge,,,,Whether or not the environment allows dynamic infrastructure.,-1,octopus_deploy,octopus_deploy env infra,,
octopus_deploy.environment.count,gauge,,,,Number of environments discovered.,-1,octopus_deploy,octopus_deploy env count,,
octopus_deploy.environment.use_guided_failure,gauge,,,,Whether or not the environment is in guided failure mode.,-1,octopus_deploy,octopus_deploy env guided failure,,
octopus_deploy.project.count,gauge,,,,Number of projects discovered.,-1,octopus_deploy,octopus_deploy projects count,,
octopus_deploy.project_group.count,gauge,,,,Number of project groups discovered.,-1,octopus_deploy,octopus_deploy project group count,,
octopus_deploy.server_node.count,gauge,,,,Number of Octopus server nodes discovered.,-1,octopus_deploy,octopus_deploy server count,,
