Skip to content

Commit

Permalink
Rearchitect GoCD pipelines (#3131)
Browse files Browse the repository at this point in the history
In this PR, I've restructured Relay's GoCD pipelines to separate
Processing and PoPs into distinct pipelines. As we previously discussed,
this change aims to improve the efficiency and robustness of Relay's
deployment process.

Key enhancements include:
* Addition of canary deployments for both Processing and PoPs, providing
an early warning system for issues in our deployment process
* Introduction of a soak-time stage with Sentry and DataDog checks,
allowing us to detect and address issues early on and stop problematic
deploys from rolling out further.
* Inclusion of dedicated rollback pipelines for both Processing and
PoPs, increasing the speed at which we can mitigate issues.

#skip-changelog
  • Loading branch information
IanWoodard authored Feb 28, 2024
1 parent d52ae94 commit 41abecb
Show file tree
Hide file tree
Showing 19 changed files with 577 additions and 4 deletions.
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ gocd: ## Build GoCD pipelines
# Format and lint every Jsonnet source, then render each pipeline entry point
# exactly once into ../generated-pipelines and convert the JSON output to YAML.
# NOTE(review): ./relay.jsonnet used to be rendered twice; the duplicate was
# dropped. If a separate entry point (e.g. for the processing pipeline) was
# meant to be rendered here, add it as its own line.
@ find . -type f \( -name '*.libsonnet' -o -name '*.jsonnet' \) -print0 | xargs -n 1 -0 jsonnetfmt -i
@ find . -type f \( -name '*.libsonnet' -o -name '*.jsonnet' \) -print0 | xargs -n 1 -0 jsonnet-lint -J ./gocd/templates/vendor
@ cd ./gocd/templates && jsonnet --ext-code output-files=true -J vendor -m ../generated-pipelines ./relay.jsonnet
@ cd ./gocd/templates && jsonnet --ext-code output-files=true -J vendor -m ../generated-pipelines ./pops.jsonnet
@ cd ./gocd/generated-pipelines && find . -type f \( -name '*.yaml' \) -print0 | xargs -n 1 -0 yq -p json -o yaml -i
.PHONY: gocd

Expand Down
5 changes: 5 additions & 0 deletions gocd/templates/bash/check-datadog-status.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash

# Runs the devinfra Datadog monitor status check for the configured monitors.
# DATADOG_MONITOR_IDS is expanded unquoted on purpose: the pipeline passes a
# space-separated list, and word splitting turns it into one argument per ID.
# NOTE(review): --skip-check is hard-coded to false here, whereas the Sentry
# check scripts read it from SKIP_CANARY_CHECKS - confirm this is intended.
/devinfra/scripts/checks/datadog/monitor_status.py \
${DATADOG_MONITOR_IDS} \
--skip-check=false
25 changes: 25 additions & 0 deletions gocd/templates/bash/check-sentry-errors.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash

# Checks the error-event volume of the deployed relay release in every
# configured Sentry project.
#
# SENTRY_PROJECT_IDS and SENTRY_PROJECTS are space-separated lists that are
# paired up by position, so both must contain the same number of entries.

# shellcheck disable=SC2206
project_ids=(${SENTRY_PROJECT_IDS})
# shellcheck disable=SC2206
project_slugs=(${SENTRY_PROJECTS})


# NOTE(review): the doubled hash in the length expansions is kept as-is;
# presumably it is the GoCD escape for a literal hash character - confirm
# before changing it.
if [ ${##project_ids[@]} -ne ${##project_slugs[@]} ]; then
  echo "Error: SENTRY_PROJECT_IDS and SENTRY_PROJECTS must have the same number of elements"
  exit 1
fi

for i in "${!project_ids[@]}"; do
  # The pipeline's git material is named relay_repo, so its revision is
  # exposed as GO_REVISION_RELAY_REPO (the variable the deploy scripts use).
  # SKIP_CANARY_CHECKS falls back to false so the check still runs when the
  # pipeline does not define the variable.
  /devinfra/scripts/checks/sentry/release_error_events.py \
    --project-id="${project_ids[i]}" \
    --project-slug="${project_slugs[i]}" \
    --release="relay@${GO_REVISION_RELAY_REPO}" \
    --duration=5 \
    --error-events-limit="${ERROR_LIMIT}" \
    --dry-run="${DRY_RUN}" \
    --single-tenant="${SENTRY_SINGLE_TENANT}" \
    --skip-check="${SKIP_CANARY_CHECKS:-false}" \
    --sentry-base="${SENTRY_BASE}"
done
23 changes: 23 additions & 0 deletions gocd/templates/bash/check-sentry-new-errors.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash

# Checks every configured Sentry project for new issues introduced by the
# deployed relay release (the limit passed to the check is zero new issues).
#
# SENTRY_PROJECT_IDS and SENTRY_PROJECTS are space-separated lists that are
# paired up by position, so both must contain the same number of entries.

# shellcheck disable=SC2206
project_ids=(${SENTRY_PROJECT_IDS})
# shellcheck disable=SC2206
project_slugs=(${SENTRY_PROJECTS})

# NOTE(review): the doubled hash in the length expansions is kept as-is;
# presumably it is the GoCD escape for a literal hash character - confirm
# before changing it.
if [ ${##project_ids[@]} -ne ${##project_slugs[@]} ]; then
  echo "Error: SENTRY_PROJECT_IDS and SENTRY_PROJECTS must have the same number of elements"
  exit 1
fi

for i in "${!project_ids[@]}"; do
  # The pipeline's git material is named relay_repo, so its revision is
  # exposed as GO_REVISION_RELAY_REPO (the variable the deploy scripts use).
  # SKIP_CANARY_CHECKS falls back to false so the check still runs when the
  # pipeline does not define the variable.
  /devinfra/scripts/checks/sentry/release_new_issues.py \
    --project-id="${project_ids[i]}" \
    --project-slug="${project_slugs[i]}" \
    --release="relay@${GO_REVISION_RELAY_REPO}" \
    --new-issues-limit=0 \
    --dry-run="${DRY_RUN}" \
    --single-tenant="${SENTRY_SINGLE_TENANT}" \
    --skip-check="${SKIP_CANARY_CHECKS:-false}" \
    --sentry-base="${SENTRY_BASE}"
done
10 changes: 10 additions & 0 deletions gocd/templates/bash/deploy-pop-canary.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# Deploys the relay-pop image for the current relay revision to the workloads
# labelled service=relay-pop,env=canary.

# Evaluate the output of project_env_vars.py for SENTRY_REGION in this shell
# (presumably it exports region-specific settings - confirm).
eval $(/devinfra/scripts/regions/project_env_vars.py --region="${SENTRY_REGION}")

# NOTE(review): presumably sets up access to the target Kubernetes cluster.
/devinfra/scripts/k8s/k8stunnel

# Set the "relay" container of the selected workloads to the image tagged with
# GO_REVISION_RELAY_REPO.
/devinfra/scripts/k8s/k8s-deploy.py \
--label-selector="service=relay-pop,env=canary" \
--image="us.gcr.io/sentryio/relay-pop:${GO_REVISION_RELAY_REPO}" \
--container-name="relay"
10 changes: 10 additions & 0 deletions gocd/templates/bash/deploy-processing-canary.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# Deploys the relay image for the current relay revision to the workloads
# labelled service=relay,env=canary.

# Evaluate the output of project_env_vars.py for SENTRY_REGION in this shell
# (presumably it exports region-specific settings - confirm).
eval $(/devinfra/scripts/regions/project_env_vars.py --region="${SENTRY_REGION}")

# NOTE(review): presumably sets up access to the target Kubernetes cluster.
/devinfra/scripts/k8s/k8stunnel

# Set the "relay" container of the selected workloads to the image tagged with
# GO_REVISION_RELAY_REPO.
/devinfra/scripts/k8s/k8s-deploy.py \
--label-selector="service=relay,env=canary" \
--image="us.gcr.io/sentryio/relay:${GO_REVISION_RELAY_REPO}" \
--container-name="relay"
10 changes: 10 additions & 0 deletions gocd/templates/bash/deploy-processing.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash

# Deploys the relay image for the current relay revision to the workloads
# labelled service=relay.

# Evaluate the output of project_env_vars.py for SENTRY_REGION in this shell
# (presumably it exports region-specific settings - confirm).
eval $(/devinfra/scripts/regions/project_env_vars.py --region="${SENTRY_REGION}")

# NOTE(review): presumably sets up access to the target Kubernetes cluster.
/devinfra/scripts/k8s/k8stunnel

# Set the "relay" container of the selected workloads to the image tagged with
# GO_REVISION_RELAY_REPO.
/devinfra/scripts/k8s/k8s-deploy.py \
--label-selector="service=relay" \
--image="us.gcr.io/sentryio/relay:${GO_REVISION_RELAY_REPO}" \
--container-name="relay"
4 changes: 4 additions & 0 deletions gocd/templates/bash/pause-current-pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# Invokes gocd-pause-current-pipeline, passing PAUSE_MESSAGE as the pause
# message. The pipelines attach this script as a task that runs only when an
# earlier task has failed.
gocd-pause-current-pipeline \
--pause-message="${PAUSE_MESSAGE}"
4 changes: 4 additions & 0 deletions gocd/templates/bash/wait-canary.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# Delay between the canary deploy task and the check tasks that follow it in
# the canary job.
# Wait for 5 minutes
sleep 300
4 changes: 4 additions & 0 deletions gocd/templates/bash/wait-soak.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash

# Delay at the start of the soak job, before its check tasks run.
# Wait for 5 minutes
sleep 300
File renamed without changes.
33 changes: 33 additions & 0 deletions gocd/templates/libs/utils.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
local gocdtasks = import 'github.com/getsentry/gocd-jsonnet/libs/gocd-tasks.libsonnet';

// The single job of the 'checks' stage: runs github-check-runs.sh on the
// 'relay' elastic profile with the GitHub token injected as a secret.
local checks_job = {
  elastic_profile_id: 'relay',
  timeout: 1800,
  environment_variables: {
    GITHUB_TOKEN: '{{SECRET:[devinfra-github][token]}}',
  },
  tasks: [
    gocdtasks.script(importstr '../bash/github-check-runs.sh'),
  ],
};

// Identifies the task plugin used to run the pause script.
local script_executor = {
  id: 'script-executor',
  version: 1,
};

{
  // Task that runs pause-current-pipeline.sh, but only when an earlier task
  // in the same job has failed (run_if: 'failed').
  pause_on_failure(): {
    plugin: {
      configuration: script_executor,
      run_if: 'failed',
      options: gocdtasks.script(importstr '../bash/pause-current-pipeline.sh'),
    },
  },

  // One-element stage list containing the 'checks' stage.
  github_checks(): [
    {
      checks: {
        fetch_materials: true,
        jobs: { checks: checks_job },
      },
    },
  ],
}
232 changes: 232 additions & 0 deletions gocd/templates/pipelines/pops.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
local utils = import '../libs/utils.libsonnet';
local gocdtasks = import 'github.com/getsentry/gocd-jsonnet/libs/gocd-tasks.libsonnet';

// Extra canary deploy targets per region. The canary stage creates one job
// for the region itself plus one for every name listed here.
local canary_region_pops = {
  // TODO: Check that these are right
  us: [
    'us-pop-1',
    'us-pop-regional-1',
  ],
  de: [],
};

// Names '<prefix>-1' .. '<prefix>-<count>'.
local pop_names(prefix, count) = [
  '%s-%d' % [prefix, n]
  for n in std.range(1, count)
];

// Extra primary deploy targets per region. The primary stage creates one job
// for the region itself plus one for every name listed here.
local region_pops = {
  de: pop_names('de-pop', 2),
  us: pop_names('us-pop', 4) + pop_names('us-pop-regional', 4),
};

// Builds the optional 'soak-time' stage, which lets the deployment soak and
// then looks for problems it may have introduced: the job waits, runs the
// Sentry and Datadog checks, and pauses the pipeline when a task has failed.
// Only the 's4s' and 'us' regions get this stage; other regions yield [].
// NOTE(review): SKIP_CANARY_CHECKS, which the Sentry check scripts read, is
// not set here - confirm it is provided elsewhere.
local soak_time(region) =
  local is_s4s = region == 's4s';
  local soak_job = {
    elastic_profile_id: 'relay-pop',
    environment_variables: {
      SENTRY_REGION: region,
      GOCD_ACCESS_TOKEN: '{{SECRET:[devinfra][gocd_access_token]}}',
      SENTRY_AUTH_TOKEN: '{{SECRET:[devinfra-sentryio][token]}}',
      DATADOG_API_KEY: '{{SECRET:[devinfra][sentry_datadog_api_key]}}',
      DATADOG_APP_KEY: '{{SECRET:[devinfra][sentry_datadog_app_key]}}',
      // Datadog monitor IDs for the soak time
      DATADOG_MONITOR_IDS: '137575470 22592147 27804625 22634395 22635255',
      // Project slugs and IDs are paired by position.
      SENTRY_PROJECTS: if is_s4s then 'sentry-for-sentry' else 'pop-relay relay',
      SENTRY_PROJECT_IDS: if is_s4s then '1513938' else '9 4',
      SENTRY_SINGLE_TENANT: if is_s4s then 'true' else 'false',
      SENTRY_BASE: if is_s4s then 'https://sentry.io/api/0' else 'https://sentry.my.sentry.io/api/0',
      // TODO: Set a proper error limit
      ERROR_LIMIT: 500,
      PAUSE_MESSAGE: 'Detecting issues in the deployment. Pausing pipeline.',
      // TODO: Switch dry run to false once we're confident in the soak time
      DRY_RUN: 'true',
    },
    tasks: [
      gocdtasks.script(importstr '../bash/wait-soak.sh'),
      gocdtasks.script(importstr '../bash/check-sentry-errors.sh'),
      gocdtasks.script(importstr '../bash/check-sentry-new-errors.sh'),
      gocdtasks.script(importstr '../bash/check-datadog-status.sh'),
      utils.pause_on_failure(),
    ],
  };
  if !(is_s4s || region == 'us') then
    []
  else
    [
      { 'soak-time': { jobs: { soak: soak_job } } },
    ];

// Job template for one canary target: run deploy-pop-canary.sh, wait, run the
// Sentry and Datadog checks, and pause the pipeline when a task has failed.
// `region` is only used as the job's SENTRY_REGION value.
// NOTE(review): SKIP_CANARY_CHECKS, which the Sentry check scripts read, is
// not set here - confirm it is provided elsewhere.
local deploy_pop_canary_job(region) =
  local secrets = {
    GOCD_ACCESS_TOKEN: '{{SECRET:[devinfra][gocd_access_token]}}',
    SENTRY_AUTH_TOKEN: '{{SECRET:[devinfra-sentryio][token]}}',
    DATADOG_API_KEY: '{{SECRET:[devinfra][sentry_datadog_api_key]}}',
    DATADOG_APP_KEY: '{{SECRET:[devinfra][sentry_datadog_app_key]}}',
  };
  local check_settings = {
    // Datadog monitor IDs for the canary deployment
    DATADOG_MONITOR_IDS: '137575470 22592147 27804625 22634395 22635255',
    // Project slugs and IDs are paired by position.
    SENTRY_PROJECTS: 'pop-relay relay',
    SENTRY_PROJECT_IDS: '9 4',
    SENTRY_SINGLE_TENANT: 'false',
    SENTRY_BASE: 'https://sentry.my.sentry.io/api/0',
    // TODO: Set a proper error limit
    ERROR_LIMIT: 500,
    PAUSE_MESSAGE: 'Pausing pipeline due to canary failure.',
    // TODO: Switch dry run to false once we're confident in the soak time
    DRY_RUN: 'true',
  };
  {
    elastic_profile_id: 'relay-pop',
    timeout: 1200,
    environment_variables: { SENTRY_REGION: region } + secrets + check_settings,
    tasks: [
      gocdtasks.script(importstr '../bash/deploy-pop-canary.sh'),
      gocdtasks.script(importstr '../bash/wait-canary.sh'),
      gocdtasks.script(importstr '../bash/check-sentry-errors.sh'),
      gocdtasks.script(importstr '../bash/check-sentry-new-errors.sh'),
      gocdtasks.script(importstr '../bash/check-datadog-status.sh'),
      utils.pause_on_failure(),
    ],
  };

// Job template for one primary target: runs deploy-pop.sh with SENTRY_REGION
// set to `region`.
local deploy_pop_job(region) =
  local deploy_task = gocdtasks.script(importstr '../bash/deploy-pop.sh');
  {
    elastic_profile_id: 'relay-pop',
    timeout: 1200,
    environment_variables: { SENTRY_REGION: region },
    tasks: [deploy_task],
  };

// Maps every entry of `regions` to a job built by `deploy_job(entry)`.
// Job names are 'deploy-primary' + partition + entry; `partition` defaults
// to '-'.
local deploy_jobs(regions, deploy_job, partition='-') =
  {
    ['deploy-primary%s%s' % [partition, target]]: deploy_job(target)
    for target in regions
  };

// Builds the 'deploy-canary' stage for a region: one job that creates the
// Sentry release, plus one canary deploy job for the region itself and for
// each of its canary PoPs. Each canary job waits a few minutes and then runs
// its checks.
local deploy_canary_pops_stage(region) =
  local canary_targets = [region] + canary_region_pops[region];
  // NOTE(review): this uses create-sentry-relay-release.sh for the pop-relay
  // project, while the primary stage uses create-sentry-relay-pop-release.sh
  // for the same project - confirm which script is intended here.
  local release_job = {
    timeout: 1200,
    elastic_profile_id: 'relay',
    environment_variables: {
      SENTRY_ORG: 'sentry',
      SENTRY_PROJECT: 'pop-relay',
      SENTRY_URL: 'https://sentry.my.sentry.io/',
      // Temporary; self-service encrypted secrets aren't implemented yet.
      // This should really be rotated to an internal integration token.
      SENTRY_AUTH_TOKEN: '{{SECRET:[devinfra-temp][relay_sentry_auth_token]}}',
      SENTRY_ENVIRONMENT: 'canary',
    },
    tasks: [
      gocdtasks.script(importstr '../bash/create-sentry-relay-release.sh'),
    ],
  };
  {
    'deploy-canary': {
      fetch_materials: true,
      jobs: { create_sentry_release: release_job }
            + deploy_jobs(canary_targets, deploy_pop_canary_job, '-canary-'),
    },
  };

// Builds the 'deploy-primary' stage for a region: one job that creates the
// Sentry release, plus one deploy job for the region itself and for each of
// its PoPs.
local deploy_pops_stage(region) =
  local is_s4s = region == 's4s';
  local targets = [region] + region_pops[region];
  // PoPs have their own Sentry project, which requires separate symbol upload via
  // create-sentry-release. They could be moved into the same project with a different
  // environment to avoid this.
  local release_job = {
    timeout: 1200,
    elastic_profile_id: 'relay',
    environment_variables: {
      SENTRY_ORG: if is_s4s then 'sentry-st' else 'sentry',
      SENTRY_PROJECT: if is_s4s then 'sentry-for-sentry' else 'pop-relay',
      SENTRY_URL: if is_s4s then 'https://sentry-st.sentry.io/' else 'https://sentry.my.sentry.io/',
      // Temporary; self-service encrypted secrets aren't implemented yet.
      // This should really be rotated to an internal integration token.
      SENTRY_AUTH_TOKEN: if is_s4s then '{{SECRET:[devinfra-temp][relay_sentry_st_auth_token]}}' else '{{SECRET:[devinfra-temp][relay_sentry_auth_token]}}',
    },
    tasks: [
      gocdtasks.script(importstr '../bash/create-sentry-relay-pop-release.sh'),
    ],
  };
  {
    'deploy-primary': {
      fetch_materials: true,
      jobs: { create_sentry_release: release_job }
            + deploy_jobs(targets, deploy_pop_job),
    },
  };

// Builds a 'deploy-primary' stage with a single deploy job, named
// 'deploy-primary-<region>', for regions that have no extra PoPs.
local deploy_generic_pops_stage(region) =
  {
    'deploy-primary': {
      fetch_materials: true,
      jobs: deploy_jobs([region], deploy_pop_job),
    },
  };

// Selects the deployment stages for a region. The 'us' and 'de' regions get a
// canary stage followed by the multi-PoP primary stage (which also creates a
// Sentry release); every other region gets a single-job primary stage.
local deployment_stages(region) =
  if region != 'us' && region != 'de' then
    [deploy_generic_pops_stage(region)]
  else
    // The canary stage is only for the US and DE regions
    [
      deploy_canary_pops_stage(region),
      deploy_pops_stage(region),
    ];


// Pipeline definition for one region: the GitHub checks stage, then the
// region's deployment stages, then the soak-time stage where one applies.
function(region)
  // The relay repository is the pipeline's only material.
  local relay_material = {
    git: 'git@github.com:getsentry/relay.git',
    branch: 'master',
    destination: 'relay',
    shallow_clone: true,
  };
  local stage_list =
    utils.github_checks()
    + deployment_stages(region)
    + soak_time(region);
  {
    group: 'relay-pops-next',
    lock_behavior: 'unlockWhenFinished',
    environment_variables: { SENTRY_REGION: region },
    materials: { relay_repo: relay_material },
    stages: stage_list,
  }
Loading

0 comments on commit 41abecb

Please sign in to comment.