From 270ca65efa9242fbfaa53962c9706982a1ed38d6 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Thu, 25 Jan 2024 19:44:28 +0100 Subject: [PATCH 01/26] Query for labeled services as well --- cmd/backup/main.go | 2 +- cmd/backup/script.go | 47 ++++++++++++++-------- test/services/docker-compose.yml | 69 ++++++++++++++++++++++++++++++++ test/services/run.sh | 29 ++++++++++++++ 4 files changed, 130 insertions(+), 17 deletions(-) create mode 100644 test/services/docker-compose.yml create mode 100755 test/services/run.sh diff --git a/cmd/backup/main.go b/cmd/backup/main.go index 89c9890b..64c6f21a 100644 --- a/cmd/backup/main.go +++ b/cmd/backup/main.go @@ -47,7 +47,7 @@ func main() { }() s.must(s.withLabeledCommands(lifecyclePhaseArchive, func() error { - restartContainers, err := s.stopContainers() + restartContainers, err := s.stopContainersAndServices() // The mechanism for restarting containers is not using hooks as it // should happen as soon as possible (i.e. before uploading backups or // similar). diff --git a/cmd/backup/script.go b/cmd/backup/script.go index 1bed6a7d..3d66aacd 100644 --- a/cmd/backup/script.go +++ b/cmd/backup/script.go @@ -318,44 +318,59 @@ func newScript() (*script, error) { return s, nil } -// stopContainers stops all Docker containers that are marked as to being +// stopContainersAndServices stops all Docker containers that are marked as to being // stopped during the backup and returns a function that can be called to // restart everything that has been stopped. -func (s *script) stopContainers() (func() error, error) { +func (s *script) stopContainersAndServices() (func() error, error) { if s.cli == nil { return noop, nil } + matchLabel := fmt.Sprintf( + "docker-volume-backup.stop-during-backup=%s", + s.c.BackupStopContainerLabel, + ) + allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{}) if err != nil { return noop, fmt.Errorf("stopContainers: error querying for containers: %w", err) } - - containerLabel := fmt.Sprintf( - "docker-volume-backup.stop-during-backup=%s", - s.c.BackupStopContainerLabel, - ) containersToStop, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{ Filters: filters.NewArgs(filters.KeyValuePair{ Key: "label", - Value: containerLabel, + Value: matchLabel, }), }) - if err != nil { return noop, fmt.Errorf("stopContainers: error querying for containers to stop: %w", err) } - if len(containersToStop) == 0 { + allServices, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{}) + if err != nil { + return noop, fmt.Errorf("stopContainers: error querying for services: %w", err) + } + servicesToScaleDown, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{ + Filters: filters.NewArgs(filters.KeyValuePair{ + Key: "label", + Value: matchLabel, + }), + }) + if err != nil { + return noop, fmt.Errorf("stopContainers: error querying for services to scale down: %w", err) + } + + if len(containersToStop) == 0 && len(servicesToScaleDown) == 0 { return noop, nil } s.logger.Info( fmt.Sprintf( - "Stopping %d container(s) labeled `%s` out of %d running container(s).", + "Stopping %d container(s) out of %d running container(s) and scaling down %d service(s) out of %d, as they were labeled %s.", len(containersToStop), - containerLabel, len(allContainers), + len(servicesToScaleDown), + len(allServices), + matchLabel, ), ) @@ -385,12 +400,12 @@ func (s *script) stopContainers() (func() error, error) { } return func() error { - servicesRequiringUpdate := 
map[string]struct{}{} + servicesRequiringForceUpdate := map[string]struct{}{} var restartErrors []error for _, container := range stoppedContainers { if swarmServiceName, ok := container.Labels["com.docker.swarm.service.name"]; ok { - servicesRequiringUpdate[swarmServiceName] = struct{}{} + servicesRequiringForceUpdate[swarmServiceName] = struct{}{} continue } if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil { @@ -398,9 +413,9 @@ func (s *script) stopContainers() (func() error, error) { } } - if len(servicesRequiringUpdate) != 0 { + if len(servicesRequiringForceUpdate) != 0 { services, _ := s.cli.ServiceList(context.Background(), types.ServiceListOptions{}) - for serviceName := range servicesRequiringUpdate { + for serviceName := range servicesRequiringForceUpdate { var serviceMatch swarm.Service for _, service := range services { if service.Spec.Name == serviceName { diff --git a/test/services/docker-compose.yml b/test/services/docker-compose.yml new file mode 100644 index 00000000..dbbf8b9a --- /dev/null +++ b/test/services/docker-compose.yml @@ -0,0 +1,69 @@ +# Copyright 2020-2021 - Offen Authors +# SPDX-License-Identifier: Unlicense + +version: '3.8' + +services: + minio: + image: minio/minio:RELEASE.2020-08-04T23-10-51Z + deploy: + restart_policy: + condition: on-failure + environment: + MINIO_ROOT_USER: test + MINIO_ROOT_PASSWORD: test + MINIO_ACCESS_KEY: test + MINIO_SECRET_KEY: GMusLtUmILge2by+z890kQ + entrypoint: /bin/ash -c 'mkdir -p /data/backup && minio server /data' + volumes: + - backup_data:/data + + backup: + image: offen/docker-volume-backup:${TEST_VERSION:-canary} + depends_on: + - minio + deploy: + restart_policy: + condition: on-failure + environment: + AWS_ACCESS_KEY_ID: test + AWS_SECRET_ACCESS_KEY: GMusLtUmILge2by+z890kQ + AWS_ENDPOINT: minio:9000 + AWS_ENDPOINT_PROTO: http + AWS_S3_BUCKET_NAME: backup + BACKUP_FILENAME: test.tar.gz + BACKUP_CRON_EXPRESSION: 0 0 5 31 2 ? + BACKUP_RETENTION_DAYS: 7 + BACKUP_PRUNING_LEEWAY: 5s + volumes: + - pg_data:/backup/pg_data:ro + - /var/run/docker.sock:/var/run/docker.sock + + offen: + image: offen/offen:latest + healthcheck: + disable: true + deploy: + labels: + - docker-volume-backup.stop-during-backup=true + replicas: 2 + restart_policy: + condition: on-failure + + pg: + image: postgres:14-alpine + environment: + POSTGRES_PASSWORD: example + labels: + - docker-volume-backup.stop-during-backup=true + volumes: + - pg_data:/var/lib/postgresql/data + deploy: + restart_policy: + condition: on-failure + +volumes: + backup_data: + name: backup_data + pg_data: + name: pg_data diff --git a/test/services/run.sh b/test/services/run.sh new file mode 100755 index 00000000..ad1f8c08 --- /dev/null +++ b/test/services/run.sh @@ -0,0 +1,29 @@ +#!/bin/sh + +set -e + +cd $(dirname $0) +. ../util.sh +current_test=$(basename $(pwd)) + +docker swarm init + +docker stack deploy --compose-file=docker-compose.yml test_stack + +while [ -z $(docker ps -q -f name=backup) ]; do + info "Backup container not ready yet. Retrying." + sleep 1 +done + +sleep 20 + +docker exec $(docker ps -q -f name=backup) backup + +docker run --rm \ + -v backup_data:/data alpine \ + ash -c 'tar -xf /data/backup/test.tar.gz && test -f /backup/pg_data/PG_VERSION' + +pass "Found relevant files in untared backup." 
+ +sleep 5 +expect_running_containers "5" From 8ef7fa0d5d8f896217040c22b6c267013732db2e Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Thu, 25 Jan 2024 19:56:49 +0100 Subject: [PATCH 02/26] Try scaling down services --- cmd/backup/script.go | 63 +++++++++++++++++++++++++++++++++----------- cmd/backup/stats.go | 10 +++++++ 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/cmd/backup/script.go b/cmd/backup/script.go index 3d66aacd..a5535e16 100644 --- a/cmd/backup/script.go +++ b/cmd/backup/script.go @@ -326,6 +326,12 @@ func (s *script) stopContainersAndServices() (func() error, error) { return noop, nil } + dockerInfo, err := s.cli.Info(context.Background()) + if err != nil { + return noop, fmt.Errorf("stopContainers: error getting docker info: %w", err) + } + isDockerSwarm := dockerInfo.Swarm.LocalNodeState != "inactive" + matchLabel := fmt.Sprintf( "docker-volume-backup.stop-during-backup=%s", s.c.BackupStopContainerLabel, @@ -345,18 +351,22 @@ func (s *script) stopContainersAndServices() (func() error, error) { return noop, fmt.Errorf("stopContainers: error querying for containers to stop: %w", err) } - allServices, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{}) - if err != nil { - return noop, fmt.Errorf("stopContainers: error querying for services: %w", err) - } - servicesToScaleDown, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{ - Filters: filters.NewArgs(filters.KeyValuePair{ - Key: "label", - Value: matchLabel, - }), - }) - if err != nil { - return noop, fmt.Errorf("stopContainers: error querying for services to scale down: %w", err) + var allServices []swarm.Service + var servicesToScaleDown []swarm.Service + if isDockerSwarm { + allServices, err = s.cli.ServiceList(context.Background(), types.ServiceListOptions{}) + if err != nil { + return noop, fmt.Errorf("stopContainers: error querying for services: %w", err) + } + servicesToScaleDown, err = s.cli.ServiceList(context.Background(), types.ServiceListOptions{ + Filters: filters.NewArgs(filters.KeyValuePair{ + Key: "label", + Value: matchLabel, + }), + }) + if err != nil { + return noop, fmt.Errorf("stopContainers: error querying for services to scale down: %w", err) + } } if len(containersToStop) == 0 && len(servicesToScaleDown) == 0 { @@ -393,10 +403,33 @@ func (s *script) stopContainersAndServices() (func() error, error) { ) } + var scaledDownServices []swarm.Service + var scaleDownErrors []error + if isDockerSwarm { + for _, service := range servicesToScaleDown { + var zero uint64 + service.Spec.Mode.Replicated.Replicas = &zero + service.Spec.TaskTemplate.ForceUpdate += 1 + if _, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}); err != nil { + scaleDownErrors = append(scaleDownErrors, err) + } else { + scaledDownServices = append(scaledDownServices, service) + } + } + } + s.stats.Containers = ContainersStats{ - All: uint(len(allContainers)), - ToStop: uint(len(containersToStop)), - Stopped: uint(len(stoppedContainers)), + All: uint(len(allContainers)), + ToStop: uint(len(containersToStop)), + Stopped: uint(len(stoppedContainers)), + StopErrors: uint(len(stopErrors)), + } + + s.stats.Services = ServicesStats{ + All: uint(len(allServices)), + ToScaleDown: uint(len(servicesToScaleDown)), + ScaledDown: uint(len(scaledDownServices)), + ScaleDownErrors: uint(len(scaleDownErrors)), } return func() error { diff --git a/cmd/backup/stats.go b/cmd/backup/stats.go index 4eed0d96..9728e943 100644 
--- a/cmd/backup/stats.go +++ b/cmd/backup/stats.go @@ -17,6 +17,15 @@ type ContainersStats struct { StopErrors uint } +// ServicesStats contains info about Swarm services that have been +// operated upon +type ServicesStats struct { + All uint + ToScaleDown uint + ScaledDown uint + ScaleDownErrors uint +} + // BackupFileStats stats about the created backup file type BackupFileStats struct { Name string @@ -40,6 +49,7 @@ type Stats struct { LockedTime time.Duration LogOutput *bytes.Buffer Containers ContainersStats + Services ServicesStats BackupFile BackupFileStats Storages map[string]StorageStats } From 511b79bd436812330eeb8759145d53a4a0ad2892 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Thu, 25 Jan 2024 20:14:15 +0100 Subject: [PATCH 03/26] Scale services back up --- cmd/backup/script.go | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/cmd/backup/script.go b/cmd/backup/script.go index a5535e16..2902f7a1 100644 --- a/cmd/backup/script.go +++ b/cmd/backup/script.go @@ -394,15 +394,6 @@ func (s *script) stopContainersAndServices() (func() error, error) { } } - var stopError error - if len(stopErrors) != 0 { - stopError = fmt.Errorf( - "stopContainers: %d error(s) stopping containers: %w", - len(stopErrors), - errors.Join(stopErrors...), - ) - } - var scaledDownServices []swarm.Service var scaleDownErrors []error if isDockerSwarm { @@ -432,6 +423,16 @@ func (s *script) stopContainersAndServices() (func() error, error) { ScaleDownErrors: uint(len(scaleDownErrors)), } + var initialErr error + allErrors := append(stopErrors, scaleDownErrors...) + if len(allErrors) != 0 { + initialErr = fmt.Errorf( + "stopContainers: %d error(s) stopping containers: %w", + len(allErrors), + errors.Join(allErrors...), + ) + } + return func() error { servicesRequiringForceUpdate := map[string]struct{}{} @@ -469,21 +470,34 @@ func (s *script) stopContainersAndServices() (func() error, error) { } } - if len(restartErrors) != 0 { + var scaleUpErrors []error + if isDockerSwarm { + for _, service := range servicesToScaleDown { + service.Spec.Mode.Replicated.Replicas = service.PreviousSpec.Mode.Replicated.Replicas + service.Spec.TaskTemplate.ForceUpdate += 1 + if _, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}); err != nil { + scaleUpErrors = append(scaleUpErrors, err) + } + } + } + + allErrors := append(restartErrors, scaleUpErrors...) 
+ if len(allErrors) != 0 { return fmt.Errorf( "stopContainers: %d error(s) restarting containers and services: %w", - len(restartErrors), - errors.Join(restartErrors...), + len(allErrors), + errors.Join(allErrors...), ) } s.logger.Info( fmt.Sprintf( - "Restarted %d container(s) and the matching service(s).", + "Restarted %d container(s) and %d service(s).", len(stoppedContainers), + len(scaledDownServices), ), ) return nil - }, stopError + }, initialErr } // createArchive creates a tar archive of the configured backup location and From 978e9003089fa5a3d51f879d37a09d8b6b2d2075 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Fri, 26 Jan 2024 15:59:30 +0100 Subject: [PATCH 04/26] Use progress tool from Docker CLI --- cmd/backup/script.go | 39 ++++++++++++++++++++++++++------ go.mod | 3 +++ go.sum | 3 +++ test/services/docker-compose.yml | 2 -- 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/cmd/backup/script.go b/cmd/backup/script.go index 2902f7a1..0d49fe61 100644 --- a/cmd/backup/script.go +++ b/cmd/backup/script.go @@ -30,6 +30,7 @@ import ( openpgp "github.com/ProtonMail/go-crypto/openpgp/v2" "github.com/containrrr/shoutrrr" "github.com/containrrr/shoutrrr/pkg/router" + "github.com/docker/cli/cli/command/service/progress" "github.com/docker/docker/api/types" ctr "github.com/docker/docker/api/types/container" "github.com/docker/docker/api/types/filters" @@ -318,6 +319,14 @@ func newScript() (*script, error) { return s, nil } +type noopWriteCloser struct { + io.Writer +} + +func (noopWriteCloser) Close() error { + return nil +} + // stopContainersAndServices stops all Docker containers that are marked as to being // stopped during the backup and returns a function that can be called to // restart everything that has been stopped. @@ -363,6 +372,7 @@ func (s *script) stopContainersAndServices() (func() error, error) { Key: "label", Value: matchLabel, }), + Status: true, }) if err != nil { return noop, fmt.Errorf("stopContainers: error querying for services to scale down: %w", err) @@ -398,10 +408,22 @@ func (s *script) stopContainersAndServices() (func() error, error) { var scaleDownErrors []error if isDockerSwarm { for _, service := range servicesToScaleDown { - var zero uint64 - service.Spec.Mode.Replicated.Replicas = &zero - service.Spec.TaskTemplate.ForceUpdate += 1 - if _, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}); err != nil { + var zero uint64 = 0 + serviceMode := &service.Spec.Mode + switch { + case serviceMode.Replicated != nil: + serviceMode.Replicated.Replicas = &zero + default: + scaleDownErrors = append(scaleDownErrors, errors.New("Labeled service has to be in replicated mode")) + continue + } + + _, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}) + if err != nil { + scaleDownErrors = append(scaleDownErrors, err) + } + + if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, &noopWriteCloser{io.Discard}); err != nil { scaleDownErrors = append(scaleDownErrors, err) } else { scaledDownServices = append(scaledDownServices, service) @@ -473,9 +495,12 @@ func (s *script) stopContainersAndServices() (func() error, error) { var scaleUpErrors []error if isDockerSwarm { for _, service := range servicesToScaleDown { - service.Spec.Mode.Replicated.Replicas = service.PreviousSpec.Mode.Replicated.Replicas - service.Spec.TaskTemplate.ForceUpdate += 1 - if _, err := 
s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}); err != nil { + updatedService, _, _ := s.cli.ServiceInspectWithRaw(context.Background(), service.ID, types.ServiceInspectOptions{}) + updatedService.Spec.Mode.Replicated.Replicas = updatedService.PreviousSpec.Mode.Replicated.Replicas + if _, err := s.cli.ServiceUpdate(context.Background(), updatedService.ID, updatedService.Version, updatedService.Spec, types.ServiceUpdateOptions{}); err != nil { + scaleUpErrors = append(scaleUpErrors, err) + } + if err := progress.ServiceProgress(context.Background(), s.cli, updatedService.ID, &noopWriteCloser{io.Discard}); err != nil { scaleUpErrors = append(scaleUpErrors, err) } } diff --git a/go.mod b/go.mod index 3536e134..6047793a 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.2.1 github.com/containrrr/shoutrrr v0.7.1 github.com/cosiner/argv v0.1.0 + github.com/docker/cli v25.0.1+incompatible github.com/docker/docker v24.0.7+incompatible github.com/gofrs/flock v0.8.1 github.com/klauspost/compress v1.17.4 @@ -22,9 +23,11 @@ require ( ) require ( + github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78 // indirect github.com/cloudflare/circl v1.3.7 // indirect github.com/golang-jwt/jwt/v5 v5.2.0 // indirect github.com/golang/protobuf v1.5.3 // indirect + golang.org/x/time v0.0.0-20220609170525-579cf78fd858 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/protobuf v1.31.0 // indirect ) diff --git a/go.sum b/go.sum index dbd4a229..679ab93b 100644 --- a/go.sum +++ b/go.sum @@ -253,6 +253,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dnaeon/go-vcr v1.2.0 h1:zHCHvJYTMh1N7xnV7zf1m1GPBF9Ad0Jk/whtQ1663qI= github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ= +github.com/docker/cli v25.0.1+incompatible h1:mFpqnrS6Hsm3v1k7Wa/BO23oz0k121MTbTO1lpcGSkU= +github.com/docker/cli v25.0.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= github.com/docker/docker v24.0.7+incompatible h1:Wo6l37AuwP3JaMnZa226lzVXGA3F9Ig1seQen0cKYlM= @@ -1241,6 +1243,7 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools v2.2.0+incompatible h1:VsBPFP1AI068pPrMxtb/S8Zkgf9xEmTLJjfM+P5UIEo= gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= gotest.tools/v3 v3.0.3 h1:4AuOwCGf4lLR9u3YOe2awrHygurzhO/HeQ6laiA6Sx0= diff --git a/test/services/docker-compose.yml b/test/services/docker-compose.yml index dbbf8b9a..69e083f9 100644 --- a/test/services/docker-compose.yml +++ b/test/services/docker-compose.yml @@ -47,8 +47,6 @@ services: labels: - docker-volume-backup.stop-during-backup=true replicas: 2 - restart_policy: - condition: on-failure pg: image: postgres:14-alpine From 
f14b796aab0add36e655b8d8c4235fedd9cdee5f Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Fri, 26 Jan 2024 16:18:48 +0100 Subject: [PATCH 05/26] In test, label both services --- test/services/docker-compose.yml | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/test/services/docker-compose.yml b/test/services/docker-compose.yml index 69e083f9..494676cc 100644 --- a/test/services/docker-compose.yml +++ b/test/services/docker-compose.yml @@ -6,9 +6,6 @@ version: '3.8' services: minio: image: minio/minio:RELEASE.2020-08-04T23-10-51Z - deploy: - restart_policy: - condition: on-failure environment: MINIO_ROOT_USER: test MINIO_ROOT_PASSWORD: test @@ -22,9 +19,6 @@ services: image: offen/docker-volume-backup:${TEST_VERSION:-canary} depends_on: - minio - deploy: - restart_policy: - condition: on-failure environment: AWS_ACCESS_KEY_ID: test AWS_SECRET_ACCESS_KEY: GMusLtUmILge2by+z890kQ @@ -52,13 +46,11 @@ services: image: postgres:14-alpine environment: POSTGRES_PASSWORD: example - labels: - - docker-volume-backup.stop-during-backup=true volumes: - pg_data:/var/lib/postgresql/data deploy: - restart_policy: - condition: on-failure + labels: + - docker-volume-backup.stop-during-backup=true volumes: backup_data: From b7855605d4b734677cf1cc067a4b8442acec6948 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Fri, 26 Jan 2024 20:01:45 +0100 Subject: [PATCH 06/26] Clean up error and log messages --- cmd/backup/script.go | 60 +++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/cmd/backup/script.go b/cmd/backup/script.go index 0d49fe61..6211ba41 100644 --- a/cmd/backup/script.go +++ b/cmd/backup/script.go @@ -337,27 +337,28 @@ func (s *script) stopContainersAndServices() (func() error, error) { dockerInfo, err := s.cli.Info(context.Background()) if err != nil { - return noop, fmt.Errorf("stopContainers: error getting docker info: %w", err) + return noop, fmt.Errorf("(*script).stopContainersAndServices: error getting docker info: %w", err) } isDockerSwarm := dockerInfo.Swarm.LocalNodeState != "inactive" + discardWriter := &noopWriteCloser{io.Discard} - matchLabel := fmt.Sprintf( + filterMatchLabel := fmt.Sprintf( "docker-volume-backup.stop-during-backup=%s", s.c.BackupStopContainerLabel, ) allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{}) if err != nil { - return noop, fmt.Errorf("stopContainers: error querying for containers: %w", err) + return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers: %w", err) } containersToStop, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{ Filters: filters.NewArgs(filters.KeyValuePair{ Key: "label", - Value: matchLabel, + Value: filterMatchLabel, }), }) if err != nil { - return noop, fmt.Errorf("stopContainers: error querying for containers to stop: %w", err) + return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers to stop: %w", err) } var allServices []swarm.Service @@ -365,17 +366,17 @@ func (s *script) stopContainersAndServices() (func() error, error) { if isDockerSwarm { allServices, err = s.cli.ServiceList(context.Background(), types.ServiceListOptions{}) if err != nil { - return noop, fmt.Errorf("stopContainers: error querying for services: %w", err) + return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services: %w", err) } servicesToScaleDown, err = s.cli.ServiceList(context.Background(), 
types.ServiceListOptions{ Filters: filters.NewArgs(filters.KeyValuePair{ Key: "label", - Value: matchLabel, + Value: filterMatchLabel, }), Status: true, }) if err != nil { - return noop, fmt.Errorf("stopContainers: error querying for services to scale down: %w", err) + return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services to scale down: %w", err) } } @@ -385,12 +386,12 @@ func (s *script) stopContainersAndServices() (func() error, error) { s.logger.Info( fmt.Sprintf( - "Stopping %d container(s) out of %d running container(s) and scaling down %d service(s) out of %d, as they were labeled %s.", + "Stopping %d out of %d running container(s) and scaling down %d out of %d active service(s) as they were labeled %s.", len(containersToStop), len(allContainers), len(servicesToScaleDown), len(allServices), - matchLabel, + filterMatchLabel, ), ) @@ -414,7 +415,10 @@ func (s *script) stopContainersAndServices() (func() error, error) { case serviceMode.Replicated != nil: serviceMode.Replicated.Replicas = &zero default: - scaleDownErrors = append(scaleDownErrors, errors.New("Labeled service has to be in replicated mode")) + scaleDownErrors = append( + scaleDownErrors, + fmt.Errorf("(*script).stopContainersAndServices: labeled service %s has to be in replicated mode", service.Spec.Name), + ) continue } @@ -423,7 +427,7 @@ func (s *script) stopContainersAndServices() (func() error, error) { scaleDownErrors = append(scaleDownErrors, err) } - if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, &noopWriteCloser{io.Discard}); err != nil { + if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { scaleDownErrors = append(scaleDownErrors, err) } else { scaledDownServices = append(scaledDownServices, service) @@ -449,7 +453,7 @@ func (s *script) stopContainersAndServices() (func() error, error) { allErrors := append(stopErrors, scaleDownErrors...) 
if len(allErrors) != 0 { initialErr = fmt.Errorf( - "stopContainers: %d error(s) stopping containers: %w", + "(*script).stopContainersAndServices: %d error(s) stopping containers: %w", len(allErrors), errors.Join(allErrors...), ) @@ -480,7 +484,11 @@ func (s *script) stopContainersAndServices() (func() error, error) { } } if serviceMatch.ID == "" { - return fmt.Errorf("stopContainers: couldn't find service with name %s", serviceName) + restartErrors = append( + restartErrors, + fmt.Errorf("(*script).stopContainersAndServices: couldn't find service with name %s", serviceName), + ) + continue } serviceMatch.Spec.TaskTemplate.ForceUpdate += 1 if _, err := s.cli.ServiceUpdate( @@ -495,12 +503,30 @@ func (s *script) stopContainersAndServices() (func() error, error) { var scaleUpErrors []error if isDockerSwarm { for _, service := range servicesToScaleDown { - updatedService, _, _ := s.cli.ServiceInspectWithRaw(context.Background(), service.ID, types.ServiceInspectOptions{}) + updatedService, _, err := s.cli.ServiceInspectWithRaw(context.Background(), service.ID, types.ServiceInspectOptions{}) + if err != nil { + scaleUpErrors = append(scaleUpErrors, err) + continue + } + + if updatedService.PreviousSpec == nil { + scaleUpErrors = append( + scaleUpErrors, + errors.New("(*script).stopContainersAndServices: service does not have PreviousSpec, cannot scale back up."), + ) + continue + } + updatedService.Spec.Mode.Replicated.Replicas = updatedService.PreviousSpec.Mode.Replicated.Replicas - if _, err := s.cli.ServiceUpdate(context.Background(), updatedService.ID, updatedService.Version, updatedService.Spec, types.ServiceUpdateOptions{}); err != nil { + if _, err := s.cli.ServiceUpdate( + context.Background(), + updatedService.ID, + updatedService.Version, updatedService.Spec, + types.ServiceUpdateOptions{}, + ); err != nil { scaleUpErrors = append(scaleUpErrors, err) } - if err := progress.ServiceProgress(context.Background(), s.cli, updatedService.ID, &noopWriteCloser{io.Discard}); err != nil { + if err := progress.ServiceProgress(context.Background(), s.cli, updatedService.ID, discardWriter); err != nil { scaleUpErrors = append(scaleUpErrors, err) } } From fee8cb234cf12d30b4e6fc89c79b0bd29d3e2761 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Fri, 26 Jan 2024 20:25:59 +0100 Subject: [PATCH 07/26] Document scale-up/down approach in docs --- docs/how-tos/use-with-docker-swarm.md | 56 +++++++++++++++++++++++++-- test/services/docker-compose.yml | 2 - 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/docs/how-tos/use-with-docker-swarm.md b/docs/how-tos/use-with-docker-swarm.md index 539b09a7..507b8b28 100644 --- a/docs/how-tos/use-with-docker-swarm.md +++ b/docs/how-tos/use-with-docker-swarm.md @@ -7,12 +7,62 @@ nav_order: 13 # Use with Docker Swarm -By default, Docker Swarm will restart stopped containers automatically, even when manually stopped. -If you plan to have your containers / services stopped during backup, this means you need to apply the `on-failure` restart policy to your service's definitions. -A restart policy of `always` is not compatible with this tool. +{: .note } +The mechanisms described in this page __do only apply when Docker is running in [Swarm mode][swarm]__. + +[swarm]: https://docs.docker.com/engine/swarm/ + +## Stopping containers during backup + +Stopping and restarting containers during backup creation when running Docker in Swarm mode is supported in two ways. 
+ +### Scaling services down to zero before scaling back up + +When labeling a service in the `deploy` section, the following strategy for stopping and restarting will be used: + +- The service is scaled down to zero replicas +- The backup is created +- The service is scaled back up to the previous number of replicas + +{: .note } +This approach will only work for services that are deployed in __replicated mode__. + +Such a service definition could look like: + +```yml +services: + app: + image: myorg/myimage:latest + deploy: + labels: + - docker-volume-backup.stop-during-backup=true + replicas: 2 +``` + +### Stopping the containers + +This approach bypasses the services and stops containers directly, creates the backup and restarts the containers again. +As Docker Swarm would usually try to instantly restart containers that are manually stopped, this approach only works when using the `on-failure` restart policy. +A restart policy of `always` is not compatible with this approach. + +Such a service definition could look like: + +```yml +services: + app: + image: myapp/myimage:latest + labels: + - docker-volume-backup.stop-during-backup=true + deploy: + replicas: 2 + restart_policy: + condition: on-failure +``` --- +## Memory limit considerations + When running in Swarm mode, it's also advised to set a hard memory limit on your service (~25MB should be enough in most cases, but if you backup large files above half a gigabyte or similar, you might have to raise this in case the backup exits with `Killed`): ```yml diff --git a/test/services/docker-compose.yml b/test/services/docker-compose.yml index 494676cc..70b3d0e2 100644 --- a/test/services/docker-compose.yml +++ b/test/services/docker-compose.yml @@ -35,8 +35,6 @@ services: offen: image: offen/offen:latest - healthcheck: - disable: true deploy: labels: - docker-volume-backup.stop-during-backup=true From 95e9e9945d2550ade31223d93c136de0fef22812 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Fri, 26 Jan 2024 20:55:17 +0100 Subject: [PATCH 08/26] Downgrade Docker CLI to match client --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 6047793a..a1653476 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.2.1 github.com/containrrr/shoutrrr v0.7.1 github.com/cosiner/argv v0.1.0 - github.com/docker/cli v25.0.1+incompatible + github.com/docker/cli v24.0.1+incompatible github.com/docker/docker v24.0.7+incompatible github.com/gofrs/flock v0.8.1 github.com/klauspost/compress v1.17.4 diff --git a/go.sum b/go.sum index 679ab93b..26fe0c1d 100644 --- a/go.sum +++ b/go.sum @@ -253,8 +253,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dnaeon/go-vcr v1.2.0 h1:zHCHvJYTMh1N7xnV7zf1m1GPBF9Ad0Jk/whtQ1663qI= github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ= -github.com/docker/cli v25.0.1+incompatible h1:mFpqnrS6Hsm3v1k7Wa/BO23oz0k121MTbTO1lpcGSkU= -github.com/docker/cli v25.0.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= +github.com/docker/cli v24.0.1+incompatible h1:uVl5Xv/39kZJpDo9VaktTOYBc702sdYYF33FqwUG/dM= +github.com/docker/cli v24.0.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= github.com/docker/distribution v2.8.2+incompatible h1:T3de5rq0dB1j30rp0sA2rER+m322EBzniBPB6ZIzuh8= 
 github.com/docker/distribution v2.8.2+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w=
 github.com/docker/docker v24.0.7+incompatible h1:Wo6l37AuwP3JaMnZa226lzVXGA3F9Ig1seQen0cKYlM=

From f4497177b51ae7b432a8b2cc3f37b3f43ff8c095 Mon Sep 17 00:00:00 2001
From: Frederik Ring
Date: Fri, 26 Jan 2024 21:23:22 +0100
Subject: [PATCH 09/26] Document services stats

---
 docs/how-tos/set-up-notifications.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/how-tos/set-up-notifications.md b/docs/how-tos/set-up-notifications.md
index 4a3e9793..23d2aaf6 100644
--- a/docs/how-tos/set-up-notifications.md
+++ b/docs/how-tos/set-up-notifications.md
@@ -89,6 +89,11 @@ Here is a list of all data passed to the template:
     * `ToStop`: number of containers matched by the stop rule
     * `Stopped`: number of containers successfully stopped
     * `StopErrors`: number of containers that were unable to be stopped (equal to `ToStop - Stopped`)
+  * `Services`: object containing stats about the docker services (only populated when Docker is running in Swarm mode)
+    * `All`: total number of services
+    * `ToScaleDown`: number of services matched by the scale down rule
+    * `ScaledDown`: number of services successfully scaled down
+    * `ScaleDownErrors`: number of services that were unable to be scaled down (equal to `ToScaleDown - ScaledDown`)
   * `BackupFile`: object containing information about the backup file
     * `Name`: name of the backup file (e.g. `backup-2022-02-11T01-00-00.tar.gz`)
     * `FullPath`: full path of the backup file (e.g. `/archive/backup-2022-02-11T01-00-00.tar.gz`)

From 94aa33369f76c3b26d0d9fa6f515d46ec35a7d86 Mon Sep 17 00:00:00 2001
From: Frederik Ring
Date: Sat, 27 Jan 2024 12:23:09 +0100
Subject: [PATCH 10/26] Do not rely on PreviousSpec for storing desired replica count

---
 cmd/backup/script.go | 47 +++++++++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 18 deletions(-)

diff --git a/cmd/backup/script.go b/cmd/backup/script.go
index 6211ba41..1759eb53 100644
--- a/cmd/backup/script.go
+++ b/cmd/backup/script.go
@@ -327,6 +327,11 @@ func (noopWriteCloser) Close() error {
 	return nil
 }
 
+type handledSwarmService struct {
+	serviceID           string
+	initialReplicaCount uint64
+}
+
 // stopContainersAndServices stops all Docker containers that are marked as to being
 // stopped during the backup and returns a function that can be called to
 // restart everything that has been stopped. 
@@ -362,19 +367,25 @@ func (s *script) stopContainersAndServices() (func() error, error) { } var allServices []swarm.Service - var servicesToScaleDown []swarm.Service + var servicesToScaleDown []handledSwarmService if isDockerSwarm { allServices, err = s.cli.ServiceList(context.Background(), types.ServiceListOptions{}) if err != nil { return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services: %w", err) } - servicesToScaleDown, err = s.cli.ServiceList(context.Background(), types.ServiceListOptions{ + matchingServices, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{ Filters: filters.NewArgs(filters.KeyValuePair{ Key: "label", Value: filterMatchLabel, }), Status: true, }) + for _, s := range matchingServices { + servicesToScaleDown = append(servicesToScaleDown, handledSwarmService{ + serviceID: s.ID, + initialReplicaCount: *s.Spec.Mode.Replicated.Replicas, + }) + } if err != nil { return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services to scale down: %w", err) } @@ -408,7 +419,15 @@ func (s *script) stopContainersAndServices() (func() error, error) { var scaledDownServices []swarm.Service var scaleDownErrors []error if isDockerSwarm { - for _, service := range servicesToScaleDown { + for _, svc := range servicesToScaleDown { + service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) + if err != nil { + scaleDownErrors = append( + scaleDownErrors, + fmt.Errorf("(*script).stopContainersAndServices: error inspecting service %s: %w", svc.serviceID, err), + ) + continue + } var zero uint64 = 0 serviceMode := &service.Spec.Mode switch { @@ -422,7 +441,7 @@ func (s *script) stopContainersAndServices() (func() error, error) { continue } - _, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}) + _, err = s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}) if err != nil { scaleDownErrors = append(scaleDownErrors, err) } @@ -502,31 +521,23 @@ func (s *script) stopContainersAndServices() (func() error, error) { var scaleUpErrors []error if isDockerSwarm { - for _, service := range servicesToScaleDown { - updatedService, _, err := s.cli.ServiceInspectWithRaw(context.Background(), service.ID, types.ServiceInspectOptions{}) + for _, svc := range servicesToScaleDown { + service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) if err != nil { scaleUpErrors = append(scaleUpErrors, err) continue } - if updatedService.PreviousSpec == nil { - scaleUpErrors = append( - scaleUpErrors, - errors.New("(*script).stopContainersAndServices: service does not have PreviousSpec, cannot scale back up."), - ) - continue - } - - updatedService.Spec.Mode.Replicated.Replicas = updatedService.PreviousSpec.Mode.Replicated.Replicas + service.Spec.Mode.Replicated.Replicas = &svc.initialReplicaCount if _, err := s.cli.ServiceUpdate( context.Background(), - updatedService.ID, - updatedService.Version, updatedService.Spec, + service.ID, + service.Version, service.Spec, types.ServiceUpdateOptions{}, ); err != nil { scaleUpErrors = append(scaleUpErrors, err) } - if err := progress.ServiceProgress(context.Background(), s.cli, updatedService.ID, discardWriter); err != nil { + if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { scaleUpErrors = 
append(scaleUpErrors, err) } } From 78a89c1a93a2d5db70d473ce54c1d65f681ac90c Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sat, 27 Jan 2024 12:32:06 +0100 Subject: [PATCH 11/26] Log warnings from Docker when updating services --- cmd/backup/script.go | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/cmd/backup/script.go b/cmd/backup/script.go index 1759eb53..287eaaf5 100644 --- a/cmd/backup/script.go +++ b/cmd/backup/script.go @@ -441,9 +441,16 @@ func (s *script) stopContainersAndServices() (func() error, error) { continue } - _, err = s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}) + response, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}) if err != nil { scaleDownErrors = append(scaleDownErrors, err) + continue + } + + for _, warning := range response.Warnings { + s.logger.Warn( + fmt.Sprintf("The Docker API returned a warning when scaling down service %s: %s", service.Spec.Name, warning), + ) } if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { @@ -529,13 +536,20 @@ func (s *script) stopContainersAndServices() (func() error, error) { } service.Spec.Mode.Replicated.Replicas = &svc.initialReplicaCount - if _, err := s.cli.ServiceUpdate( + response, err := s.cli.ServiceUpdate( context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}, - ); err != nil { + ) + if err != nil { scaleUpErrors = append(scaleUpErrors, err) + continue + } + for _, warning := range response.Warnings { + s.logger.Warn( + fmt.Sprintf("The Docker API returned a warning when scaling up service %s: %s", service.Spec.Name, warning), + ) } if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { scaleUpErrors = append(scaleUpErrors, err) From 538a069a70ecddb722718e590bfe4e43f3580f60 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sat, 27 Jan 2024 13:08:06 +0100 Subject: [PATCH 12/26] Check whether container and service labels collide --- cmd/backup/script.go | 20 ++++++++++++++++++ test/collision/docker-compose.yml | 28 +++++++++++++++++++++++++ test/collision/run.sh | 34 +++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 test/collision/docker-compose.yml create mode 100755 test/collision/run.sh diff --git a/cmd/backup/script.go b/cmd/backup/script.go index 287eaaf5..a61f1d53 100644 --- a/cmd/backup/script.go +++ b/cmd/backup/script.go @@ -395,6 +395,26 @@ func (s *script) stopContainersAndServices() (func() error, error) { return noop, nil } + if isDockerSwarm { + for _, container := range containersToStop { + if swarmServiceID, ok := container.Labels["com.docker.swarm.service.id"]; ok { + parentService, _, err := s.cli.ServiceInspectWithRaw(context.Background(), swarmServiceID, types.ServiceInspectOptions{}) + if err != nil { + return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for parent service with ID %s: %w", swarmServiceID, err) + } + for label := range parentService.Spec.Labels { + if label == "docker-volume-backup.stop-during-backup" { + return noop, fmt.Errorf( + "(*script).stopContainersAndServices: container %s is labeled to stop but has parent service %s which is also labeled, cannot continue", + container.Names[0], + parentService.Spec.Name, + ) + } + } + } + } + } + s.logger.Info( fmt.Sprintf( "Stopping %d out of %d running 
container(s) and scaling down %d out of %d active service(s) as they were labeled %s.", diff --git a/test/collision/docker-compose.yml b/test/collision/docker-compose.yml new file mode 100644 index 00000000..cd47a150 --- /dev/null +++ b/test/collision/docker-compose.yml @@ -0,0 +1,28 @@ +# Copyright 2020-2021 - Offen Authors +# SPDX-License-Identifier: Unlicense + +version: '3.8' + +services: + backup: + image: offen/docker-volume-backup:${TEST_VERSION:-canary} + environment: + BACKUP_FILENAME: test.tar.gz + volumes: + - offen_data:/backup/offen_data:ro + - ${LOCAL_DIR:-./local}:/archive + - /var/run/docker.sock:/var/run/docker.sock + + offen: + image: offen/offen:latest + labels: + - docker-volume-backup.stop-during-backup=true + deploy: + labels: + - docker-volume-backup.stop-during-backup=true + replicas: 2 + volumes: + - offen_data:/var/opt/offen + +volumes: + offen_data: diff --git a/test/collision/run.sh b/test/collision/run.sh new file mode 100755 index 00000000..8948febc --- /dev/null +++ b/test/collision/run.sh @@ -0,0 +1,34 @@ +#!/bin/sh + +set -e + +cd $(dirname $0) +. ../util.sh +current_test=$(basename $(pwd)) + +export LOCAL_DIR=$(mktemp -d) + +docker swarm init + +docker stack deploy --compose-file=docker-compose.yml test_stack + +while [ -z $(docker ps -q -f name=backup) ]; do + info "Backup container not ready yet. Retrying." + sleep 1 +done + +sleep 20 + +set +e +docker exec $(docker ps -q -f name=backup) backup +if [ $? = "0" ]; then + fail "Expected script to exit with error code." +fi + +if [ -f "${LOCAL_DIR}/test.tar.gz" ]; then + fail "Found backup file that should not have been created." +fi + +expect_running_containers "3" + +pass "Script did not perform backup as there was a label collision." From bf1d13b78c6407507aea366539c2eae054b8ca7b Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sat, 27 Jan 2024 13:50:56 +0100 Subject: [PATCH 13/26] Document script behavior on label collision --- docs/how-tos/stop-containers-during-backup.md | 3 +++ docs/how-tos/use-with-docker-swarm.md | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/docs/how-tos/stop-containers-during-backup.md b/docs/how-tos/stop-containers-during-backup.md index 941ebdab..f87f7540 100644 --- a/docs/how-tos/stop-containers-during-backup.md +++ b/docs/how-tos/stop-containers-during-backup.md @@ -7,6 +7,9 @@ nav_order: 1 # Stop containers during backup +{: .note } +In case you are running Docker in Swarm mode, [dedicated documentation](./use-with-docker-swarm.html) on service and container restart applies. + In many cases, it will be desirable to stop the services that are consuming the volume you want to backup in order to ensure data integrity. This image can automatically stop and restart containers and services. By default, any container that is labeled `docker-volume-backup.stop-during-backup=true` will be stopped before the backup is being taken and restarted once it has finished. diff --git a/docs/how-tos/use-with-docker-swarm.md b/docs/how-tos/use-with-docker-swarm.md index 507b8b28..29178551 100644 --- a/docs/how-tos/use-with-docker-swarm.md +++ b/docs/how-tos/use-with-docker-swarm.md @@ -16,6 +16,10 @@ The mechanisms described in this page __do only apply when Docker is running in Stopping and restarting containers during backup creation when running Docker in Swarm mode is supported in two ways. +{: .important } +Make sure you label your services and containers using only one of the describe approaches. 
+In case the script encounters a container that is labeled and has a parent service that is also labeled, it will exit early. + ### Scaling services down to zero before scaling back up When labeling a service in the `deploy` section, the following strategy for stopping and restarting will be used: From bb37b8b1d8ead9a017bf7c46c762ebd19fa0661e Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sat, 27 Jan 2024 15:37:31 +0100 Subject: [PATCH 14/26] Add additional check if all containers have been removed --- cmd/backup/script.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cmd/backup/script.go b/cmd/backup/script.go index a61f1d53..50d3d70e 100644 --- a/cmd/backup/script.go +++ b/cmd/backup/script.go @@ -478,6 +478,25 @@ func (s *script) stopContainersAndServices() (func() error, error) { } else { scaledDownServices = append(scaledDownServices, service) } + + // progress.ServiceProgress returns too early, so we need to manually check + // whether all containers belonging to the service have actually been removed + for { + containers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{ + Filters: filters.NewArgs(filters.KeyValuePair{ + Key: "label", + Value: fmt.Sprintf("com.docker.swarm.service.id=%s", service.ID), + }), + }) + if err != nil { + scaleDownErrors = append(scaleDownErrors, err) + break + } + if len(containers) == 0 { + break + } + time.Sleep(time.Second) + } } } From 7ad6fc935561ba3e86b33904b2f1876d46153e50 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sat, 27 Jan 2024 17:00:43 +0100 Subject: [PATCH 15/26] Scale services concurrently --- cmd/backup/script.go | 184 +++++++++++++++++++++---------------------- cmd/backup/util.go | 29 +++++++ 2 files changed, 120 insertions(+), 93 deletions(-) diff --git a/cmd/backup/script.go b/cmd/backup/script.go index 50d3d70e..71eadb8c 100644 --- a/cmd/backup/script.go +++ b/cmd/backup/script.go @@ -16,6 +16,7 @@ import ( "path/filepath" "slices" "strings" + "sync" "text/template" "time" @@ -319,19 +320,6 @@ func newScript() (*script, error) { return s, nil } -type noopWriteCloser struct { - io.Writer -} - -func (noopWriteCloser) Close() error { - return nil -} - -type handledSwarmService struct { - serviceID string - initialReplicaCount uint64 -} - // stopContainersAndServices stops all Docker containers that are marked as to being // stopped during the backup and returns a function that can be called to // restart everything that has been stopped. 
@@ -437,67 +425,71 @@ func (s *script) stopContainersAndServices() (func() error, error) { } var scaledDownServices []swarm.Service - var scaleDownErrors []error + var scaleDownErrors concurrentSlice[error] if isDockerSwarm { + wg := sync.WaitGroup{} for _, svc := range servicesToScaleDown { - service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) - if err != nil { - scaleDownErrors = append( - scaleDownErrors, - fmt.Errorf("(*script).stopContainersAndServices: error inspecting service %s: %w", svc.serviceID, err), - ) - continue - } - var zero uint64 = 0 - serviceMode := &service.Spec.Mode - switch { - case serviceMode.Replicated != nil: - serviceMode.Replicated.Replicas = &zero - default: - scaleDownErrors = append( - scaleDownErrors, - fmt.Errorf("(*script).stopContainersAndServices: labeled service %s has to be in replicated mode", service.Spec.Name), - ) - continue - } - - response, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}) - if err != nil { - scaleDownErrors = append(scaleDownErrors, err) - continue - } + wg.Add(1) + go func(svc handledSwarmService) { + defer wg.Done() + service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) + if err != nil { + scaleDownErrors.append( + fmt.Errorf("(*script).stopContainersAndServices: error inspecting service %s: %w", svc.serviceID, err), + ) + return + } + var zero uint64 = 0 + serviceMode := &service.Spec.Mode + switch { + case serviceMode.Replicated != nil: + serviceMode.Replicated.Replicas = &zero + default: + scaleDownErrors.append( + fmt.Errorf("(*script).stopContainersAndServices: labeled service %s has to be in replicated mode", service.Spec.Name), + ) + return + } - for _, warning := range response.Warnings { - s.logger.Warn( - fmt.Sprintf("The Docker API returned a warning when scaling down service %s: %s", service.Spec.Name, warning), - ) - } + response, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}) + if err != nil { + scaleDownErrors.append(err) + return + } - if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { - scaleDownErrors = append(scaleDownErrors, err) - } else { - scaledDownServices = append(scaledDownServices, service) - } + for _, warning := range response.Warnings { + s.logger.Warn( + fmt.Sprintf("The Docker API returned a warning when scaling down service %s: %s", service.Spec.Name, warning), + ) + } - // progress.ServiceProgress returns too early, so we need to manually check - // whether all containers belonging to the service have actually been removed - for { - containers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{ - Filters: filters.NewArgs(filters.KeyValuePair{ - Key: "label", - Value: fmt.Sprintf("com.docker.swarm.service.id=%s", service.ID), - }), - }) - if err != nil { - scaleDownErrors = append(scaleDownErrors, err) - break + if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { + scaleDownErrors.append(err) + } else { + scaledDownServices = append(scaledDownServices, service) } - if len(containers) == 0 { - break + + // progress.ServiceProgress returns too early, so we need to manually check + // whether all containers belonging to the service have actually been removed + for { + containers, err := 
s.cli.ContainerList(context.Background(), types.ContainerListOptions{ + Filters: filters.NewArgs(filters.KeyValuePair{ + Key: "label", + Value: fmt.Sprintf("com.docker.swarm.service.id=%s", service.ID), + }), + }) + if err != nil { + scaleDownErrors.append(err) + break + } + if len(containers) == 0 { + break + } + time.Sleep(time.Second) } - time.Sleep(time.Second) - } + }(svc) } + wg.Wait() } s.stats.Containers = ContainersStats{ @@ -511,11 +503,11 @@ func (s *script) stopContainersAndServices() (func() error, error) { All: uint(len(allServices)), ToScaleDown: uint(len(servicesToScaleDown)), ScaledDown: uint(len(scaledDownServices)), - ScaleDownErrors: uint(len(scaleDownErrors)), + ScaleDownErrors: uint(len(scaleDownErrors.value())), } var initialErr error - allErrors := append(stopErrors, scaleDownErrors...) + allErrors := append(stopErrors, scaleDownErrors.value()...) if len(allErrors) != 0 { initialErr = fmt.Errorf( "(*script).stopContainersAndServices: %d error(s) stopping containers: %w", @@ -565,38 +557,44 @@ func (s *script) stopContainersAndServices() (func() error, error) { } } - var scaleUpErrors []error + var scaleUpErrors concurrentSlice[error] if isDockerSwarm { + wg := &sync.WaitGroup{} for _, svc := range servicesToScaleDown { - service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) - if err != nil { - scaleUpErrors = append(scaleUpErrors, err) - continue - } + wg.Add(1) + go func(svc handledSwarmService) { + defer wg.Done() + service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) + if err != nil { + scaleUpErrors.append(err) + return + } - service.Spec.Mode.Replicated.Replicas = &svc.initialReplicaCount - response, err := s.cli.ServiceUpdate( - context.Background(), - service.ID, - service.Version, service.Spec, - types.ServiceUpdateOptions{}, - ) - if err != nil { - scaleUpErrors = append(scaleUpErrors, err) - continue - } - for _, warning := range response.Warnings { - s.logger.Warn( - fmt.Sprintf("The Docker API returned a warning when scaling up service %s: %s", service.Spec.Name, warning), + service.Spec.Mode.Replicated.Replicas = &svc.initialReplicaCount + response, err := s.cli.ServiceUpdate( + context.Background(), + service.ID, + service.Version, service.Spec, + types.ServiceUpdateOptions{}, ) - } - if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { - scaleUpErrors = append(scaleUpErrors, err) - } + if err != nil { + scaleUpErrors.append(err) + return + } + for _, warning := range response.Warnings { + s.logger.Warn( + fmt.Sprintf("The Docker API returned a warning when scaling up service %s: %s", service.Spec.Name, warning), + ) + } + if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { + scaleUpErrors.append(err) + } + }(svc) } + wg.Wait() } - allErrors := append(restartErrors, scaleUpErrors...) + allErrors := append(restartErrors, scaleUpErrors.value()...) 
if len(allErrors) != 0 { return fmt.Errorf( "stopContainers: %d error(s) restarting containers and services: %w", diff --git a/cmd/backup/util.go b/cmd/backup/util.go index c349e7b7..e13da867 100644 --- a/cmd/backup/util.go +++ b/cmd/backup/util.go @@ -8,6 +8,7 @@ import ( "fmt" "io" "os" + "sync" ) var noop = func() error { return nil } @@ -50,3 +51,31 @@ func (b *bufferingWriter) Write(p []byte) (n int, err error) { } return b.writer.Write(p) } + +type noopWriteCloser struct { + io.Writer +} + +func (noopWriteCloser) Close() error { + return nil +} + +type handledSwarmService struct { + serviceID string + initialReplicaCount uint64 +} + +type concurrentSlice[T any] struct { + val []T + sync.Mutex +} + +func (c *concurrentSlice[T]) append(v T) { + c.Lock() + defer c.Unlock() + c.val = append(c.val, v) +} + +func (c *concurrentSlice[T]) value() []T { + return c.val +} From 09cc1f5c6097ae2664deaf14db089fbb3864cf8e Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sat, 27 Jan 2024 18:41:03 +0100 Subject: [PATCH 16/26] Move docker interaction code into own file --- cmd/backup/docker.go | 309 +++++++++++++++++++++++++++++++++++++++++++ cmd/backup/script.go | 301 ----------------------------------------- 2 files changed, 309 insertions(+), 301 deletions(-) create mode 100644 cmd/backup/docker.go diff --git a/cmd/backup/docker.go b/cmd/backup/docker.go new file mode 100644 index 00000000..33f37216 --- /dev/null +++ b/cmd/backup/docker.go @@ -0,0 +1,309 @@ +package main + +import ( + "context" + "errors" + "fmt" + "io" + "sync" + "time" + + "github.com/docker/cli/cli/command/service/progress" + "github.com/docker/docker/api/types" + ctr "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/filters" + "github.com/docker/docker/api/types/swarm" +) + +// stopContainersAndServices stops all Docker containers that are marked as to being +// stopped during the backup and returns a function that can be called to +// restart everything that has been stopped. 
+func (s *script) stopContainersAndServices() (func() error, error) { + if s.cli == nil { + return noop, nil + } + + dockerInfo, err := s.cli.Info(context.Background()) + if err != nil { + return noop, fmt.Errorf("(*script).stopContainersAndServices: error getting docker info: %w", err) + } + isDockerSwarm := dockerInfo.Swarm.LocalNodeState != "inactive" + discardWriter := &noopWriteCloser{io.Discard} + + filterMatchLabel := fmt.Sprintf( + "docker-volume-backup.stop-during-backup=%s", + s.c.BackupStopContainerLabel, + ) + + allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{}) + if err != nil { + return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers: %w", err) + } + containersToStop, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{ + Filters: filters.NewArgs(filters.KeyValuePair{ + Key: "label", + Value: filterMatchLabel, + }), + }) + if err != nil { + return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers to stop: %w", err) + } + + var allServices []swarm.Service + var servicesToScaleDown []handledSwarmService + if isDockerSwarm { + allServices, err = s.cli.ServiceList(context.Background(), types.ServiceListOptions{}) + if err != nil { + return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services: %w", err) + } + matchingServices, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{ + Filters: filters.NewArgs(filters.KeyValuePair{ + Key: "label", + Value: filterMatchLabel, + }), + Status: true, + }) + for _, s := range matchingServices { + servicesToScaleDown = append(servicesToScaleDown, handledSwarmService{ + serviceID: s.ID, + initialReplicaCount: *s.Spec.Mode.Replicated.Replicas, + }) + } + if err != nil { + return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services to scale down: %w", err) + } + } + + if len(containersToStop) == 0 && len(servicesToScaleDown) == 0 { + return noop, nil + } + + if isDockerSwarm { + for _, container := range containersToStop { + if swarmServiceID, ok := container.Labels["com.docker.swarm.service.id"]; ok { + parentService, _, err := s.cli.ServiceInspectWithRaw(context.Background(), swarmServiceID, types.ServiceInspectOptions{}) + if err != nil { + return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for parent service with ID %s: %w", swarmServiceID, err) + } + for label := range parentService.Spec.Labels { + if label == "docker-volume-backup.stop-during-backup" { + return noop, fmt.Errorf( + "(*script).stopContainersAndServices: container %s is labeled to stop but has parent service %s which is also labeled, cannot continue", + container.Names[0], + parentService.Spec.Name, + ) + } + } + } + } + } + + s.logger.Info( + fmt.Sprintf( + "Stopping %d out of %d running container(s) and scaling down %d out of %d active service(s) as they were labeled %s.", + len(containersToStop), + len(allContainers), + len(servicesToScaleDown), + len(allServices), + filterMatchLabel, + ), + ) + + var stoppedContainers []types.Container + var stopErrors []error + for _, container := range containersToStop { + if err := s.cli.ContainerStop(context.Background(), container.ID, ctr.StopOptions{}); err != nil { + stopErrors = append(stopErrors, err) + } else { + stoppedContainers = append(stoppedContainers, container) + } + } + + var scaledDownServices []swarm.Service + var scaleDownErrors concurrentSlice[error] + if isDockerSwarm { + wg 
:= sync.WaitGroup{} + for _, svc := range servicesToScaleDown { + wg.Add(1) + go func(svc handledSwarmService) { + defer wg.Done() + service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) + if err != nil { + scaleDownErrors.append( + fmt.Errorf("(*script).stopContainersAndServices: error inspecting service %s: %w", svc.serviceID, err), + ) + return + } + var zero uint64 = 0 + serviceMode := &service.Spec.Mode + switch { + case serviceMode.Replicated != nil: + serviceMode.Replicated.Replicas = &zero + default: + scaleDownErrors.append( + fmt.Errorf("(*script).stopContainersAndServices: labeled service %s has to be in replicated mode", service.Spec.Name), + ) + return + } + + response, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}) + if err != nil { + scaleDownErrors.append(err) + return + } + + for _, warning := range response.Warnings { + s.logger.Warn( + fmt.Sprintf("The Docker API returned a warning when scaling down service %s: %s", service.Spec.Name, warning), + ) + } + + if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { + scaleDownErrors.append(err) + } else { + scaledDownServices = append(scaledDownServices, service) + } + + // progress.ServiceProgress returns too early, so we need to manually check + // whether all containers belonging to the service have actually been removed + for { + containers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{ + Filters: filters.NewArgs(filters.KeyValuePair{ + Key: "label", + Value: fmt.Sprintf("com.docker.swarm.service.id=%s", service.ID), + }), + }) + if err != nil { + scaleDownErrors.append(err) + break + } + if len(containers) == 0 { + break + } + time.Sleep(time.Second) + } + }(svc) + } + wg.Wait() + } + + s.stats.Containers = ContainersStats{ + All: uint(len(allContainers)), + ToStop: uint(len(containersToStop)), + Stopped: uint(len(stoppedContainers)), + StopErrors: uint(len(stopErrors)), + } + + s.stats.Services = ServicesStats{ + All: uint(len(allServices)), + ToScaleDown: uint(len(servicesToScaleDown)), + ScaledDown: uint(len(scaledDownServices)), + ScaleDownErrors: uint(len(scaleDownErrors.value())), + } + + var initialErr error + allErrors := append(stopErrors, scaleDownErrors.value()...) 
+ if len(allErrors) != 0 { + initialErr = fmt.Errorf( + "(*script).stopContainersAndServices: %d error(s) stopping containers: %w", + len(allErrors), + errors.Join(allErrors...), + ) + } + + return func() error { + servicesRequiringForceUpdate := map[string]struct{}{} + + var restartErrors []error + for _, container := range stoppedContainers { + if swarmServiceName, ok := container.Labels["com.docker.swarm.service.name"]; ok { + servicesRequiringForceUpdate[swarmServiceName] = struct{}{} + continue + } + if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil { + restartErrors = append(restartErrors, err) + } + } + + if len(servicesRequiringForceUpdate) != 0 { + services, _ := s.cli.ServiceList(context.Background(), types.ServiceListOptions{}) + for serviceName := range servicesRequiringForceUpdate { + var serviceMatch swarm.Service + for _, service := range services { + if service.Spec.Name == serviceName { + serviceMatch = service + break + } + } + if serviceMatch.ID == "" { + restartErrors = append( + restartErrors, + fmt.Errorf("(*script).stopContainersAndServices: couldn't find service with name %s", serviceName), + ) + continue + } + serviceMatch.Spec.TaskTemplate.ForceUpdate += 1 + if _, err := s.cli.ServiceUpdate( + context.Background(), serviceMatch.ID, + serviceMatch.Version, serviceMatch.Spec, types.ServiceUpdateOptions{}, + ); err != nil { + restartErrors = append(restartErrors, err) + } + } + } + + var scaleUpErrors concurrentSlice[error] + if isDockerSwarm { + wg := &sync.WaitGroup{} + for _, svc := range servicesToScaleDown { + wg.Add(1) + go func(svc handledSwarmService) { + defer wg.Done() + service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) + if err != nil { + scaleUpErrors.append(err) + return + } + + service.Spec.Mode.Replicated.Replicas = &svc.initialReplicaCount + response, err := s.cli.ServiceUpdate( + context.Background(), + service.ID, + service.Version, service.Spec, + types.ServiceUpdateOptions{}, + ) + if err != nil { + scaleUpErrors.append(err) + return + } + for _, warning := range response.Warnings { + s.logger.Warn( + fmt.Sprintf("The Docker API returned a warning when scaling up service %s: %s", service.Spec.Name, warning), + ) + } + if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { + scaleUpErrors.append(err) + } + }(svc) + } + wg.Wait() + } + + allErrors := append(restartErrors, scaleUpErrors.value()...) 
+ if len(allErrors) != 0 { + return fmt.Errorf( + "stopContainers: %d error(s) restarting containers and services: %w", + len(allErrors), + errors.Join(allErrors...), + ) + } + s.logger.Info( + fmt.Sprintf( + "Restarted %d container(s) and %d service(s).", + len(stoppedContainers), + len(scaledDownServices), + ), + ) + return nil + }, initialErr +} diff --git a/cmd/backup/script.go b/cmd/backup/script.go index 71eadb8c..65b8009d 100644 --- a/cmd/backup/script.go +++ b/cmd/backup/script.go @@ -5,8 +5,6 @@ package main import ( "bytes" - "context" - "errors" "fmt" "io" "io/fs" @@ -16,7 +14,6 @@ import ( "path/filepath" "slices" "strings" - "sync" "text/template" "time" @@ -31,11 +28,6 @@ import ( openpgp "github.com/ProtonMail/go-crypto/openpgp/v2" "github.com/containrrr/shoutrrr" "github.com/containrrr/shoutrrr/pkg/router" - "github.com/docker/cli/cli/command/service/progress" - "github.com/docker/docker/api/types" - ctr "github.com/docker/docker/api/types/container" - "github.com/docker/docker/api/types/filters" - "github.com/docker/docker/api/types/swarm" "github.com/docker/docker/client" "github.com/leekchan/timeutil" "github.com/offen/envconfig" @@ -320,299 +312,6 @@ func newScript() (*script, error) { return s, nil } -// stopContainersAndServices stops all Docker containers that are marked as to being -// stopped during the backup and returns a function that can be called to -// restart everything that has been stopped. -func (s *script) stopContainersAndServices() (func() error, error) { - if s.cli == nil { - return noop, nil - } - - dockerInfo, err := s.cli.Info(context.Background()) - if err != nil { - return noop, fmt.Errorf("(*script).stopContainersAndServices: error getting docker info: %w", err) - } - isDockerSwarm := dockerInfo.Swarm.LocalNodeState != "inactive" - discardWriter := &noopWriteCloser{io.Discard} - - filterMatchLabel := fmt.Sprintf( - "docker-volume-backup.stop-during-backup=%s", - s.c.BackupStopContainerLabel, - ) - - allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{}) - if err != nil { - return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers: %w", err) - } - containersToStop, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{ - Filters: filters.NewArgs(filters.KeyValuePair{ - Key: "label", - Value: filterMatchLabel, - }), - }) - if err != nil { - return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for containers to stop: %w", err) - } - - var allServices []swarm.Service - var servicesToScaleDown []handledSwarmService - if isDockerSwarm { - allServices, err = s.cli.ServiceList(context.Background(), types.ServiceListOptions{}) - if err != nil { - return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services: %w", err) - } - matchingServices, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{ - Filters: filters.NewArgs(filters.KeyValuePair{ - Key: "label", - Value: filterMatchLabel, - }), - Status: true, - }) - for _, s := range matchingServices { - servicesToScaleDown = append(servicesToScaleDown, handledSwarmService{ - serviceID: s.ID, - initialReplicaCount: *s.Spec.Mode.Replicated.Replicas, - }) - } - if err != nil { - return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for services to scale down: %w", err) - } - } - - if len(containersToStop) == 0 && len(servicesToScaleDown) == 0 { - return noop, nil - } - - if isDockerSwarm { - for _, container := range 
containersToStop { - if swarmServiceID, ok := container.Labels["com.docker.swarm.service.id"]; ok { - parentService, _, err := s.cli.ServiceInspectWithRaw(context.Background(), swarmServiceID, types.ServiceInspectOptions{}) - if err != nil { - return noop, fmt.Errorf("(*script).stopContainersAndServices: error querying for parent service with ID %s: %w", swarmServiceID, err) - } - for label := range parentService.Spec.Labels { - if label == "docker-volume-backup.stop-during-backup" { - return noop, fmt.Errorf( - "(*script).stopContainersAndServices: container %s is labeled to stop but has parent service %s which is also labeled, cannot continue", - container.Names[0], - parentService.Spec.Name, - ) - } - } - } - } - } - - s.logger.Info( - fmt.Sprintf( - "Stopping %d out of %d running container(s) and scaling down %d out of %d active service(s) as they were labeled %s.", - len(containersToStop), - len(allContainers), - len(servicesToScaleDown), - len(allServices), - filterMatchLabel, - ), - ) - - var stoppedContainers []types.Container - var stopErrors []error - for _, container := range containersToStop { - if err := s.cli.ContainerStop(context.Background(), container.ID, ctr.StopOptions{}); err != nil { - stopErrors = append(stopErrors, err) - } else { - stoppedContainers = append(stoppedContainers, container) - } - } - - var scaledDownServices []swarm.Service - var scaleDownErrors concurrentSlice[error] - if isDockerSwarm { - wg := sync.WaitGroup{} - for _, svc := range servicesToScaleDown { - wg.Add(1) - go func(svc handledSwarmService) { - defer wg.Done() - service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) - if err != nil { - scaleDownErrors.append( - fmt.Errorf("(*script).stopContainersAndServices: error inspecting service %s: %w", svc.serviceID, err), - ) - return - } - var zero uint64 = 0 - serviceMode := &service.Spec.Mode - switch { - case serviceMode.Replicated != nil: - serviceMode.Replicated.Replicas = &zero - default: - scaleDownErrors.append( - fmt.Errorf("(*script).stopContainersAndServices: labeled service %s has to be in replicated mode", service.Spec.Name), - ) - return - } - - response, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}) - if err != nil { - scaleDownErrors.append(err) - return - } - - for _, warning := range response.Warnings { - s.logger.Warn( - fmt.Sprintf("The Docker API returned a warning when scaling down service %s: %s", service.Spec.Name, warning), - ) - } - - if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { - scaleDownErrors.append(err) - } else { - scaledDownServices = append(scaledDownServices, service) - } - - // progress.ServiceProgress returns too early, so we need to manually check - // whether all containers belonging to the service have actually been removed - for { - containers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{ - Filters: filters.NewArgs(filters.KeyValuePair{ - Key: "label", - Value: fmt.Sprintf("com.docker.swarm.service.id=%s", service.ID), - }), - }) - if err != nil { - scaleDownErrors.append(err) - break - } - if len(containers) == 0 { - break - } - time.Sleep(time.Second) - } - }(svc) - } - wg.Wait() - } - - s.stats.Containers = ContainersStats{ - All: uint(len(allContainers)), - ToStop: uint(len(containersToStop)), - Stopped: uint(len(stoppedContainers)), - StopErrors: uint(len(stopErrors)), - } - - 
s.stats.Services = ServicesStats{ - All: uint(len(allServices)), - ToScaleDown: uint(len(servicesToScaleDown)), - ScaledDown: uint(len(scaledDownServices)), - ScaleDownErrors: uint(len(scaleDownErrors.value())), - } - - var initialErr error - allErrors := append(stopErrors, scaleDownErrors.value()...) - if len(allErrors) != 0 { - initialErr = fmt.Errorf( - "(*script).stopContainersAndServices: %d error(s) stopping containers: %w", - len(allErrors), - errors.Join(allErrors...), - ) - } - - return func() error { - servicesRequiringForceUpdate := map[string]struct{}{} - - var restartErrors []error - for _, container := range stoppedContainers { - if swarmServiceName, ok := container.Labels["com.docker.swarm.service.name"]; ok { - servicesRequiringForceUpdate[swarmServiceName] = struct{}{} - continue - } - if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil { - restartErrors = append(restartErrors, err) - } - } - - if len(servicesRequiringForceUpdate) != 0 { - services, _ := s.cli.ServiceList(context.Background(), types.ServiceListOptions{}) - for serviceName := range servicesRequiringForceUpdate { - var serviceMatch swarm.Service - for _, service := range services { - if service.Spec.Name == serviceName { - serviceMatch = service - break - } - } - if serviceMatch.ID == "" { - restartErrors = append( - restartErrors, - fmt.Errorf("(*script).stopContainersAndServices: couldn't find service with name %s", serviceName), - ) - continue - } - serviceMatch.Spec.TaskTemplate.ForceUpdate += 1 - if _, err := s.cli.ServiceUpdate( - context.Background(), serviceMatch.ID, - serviceMatch.Version, serviceMatch.Spec, types.ServiceUpdateOptions{}, - ); err != nil { - restartErrors = append(restartErrors, err) - } - } - } - - var scaleUpErrors concurrentSlice[error] - if isDockerSwarm { - wg := &sync.WaitGroup{} - for _, svc := range servicesToScaleDown { - wg.Add(1) - go func(svc handledSwarmService) { - defer wg.Done() - service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) - if err != nil { - scaleUpErrors.append(err) - return - } - - service.Spec.Mode.Replicated.Replicas = &svc.initialReplicaCount - response, err := s.cli.ServiceUpdate( - context.Background(), - service.ID, - service.Version, service.Spec, - types.ServiceUpdateOptions{}, - ) - if err != nil { - scaleUpErrors.append(err) - return - } - for _, warning := range response.Warnings { - s.logger.Warn( - fmt.Sprintf("The Docker API returned a warning when scaling up service %s: %s", service.Spec.Name, warning), - ) - } - if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { - scaleUpErrors.append(err) - } - }(svc) - } - wg.Wait() - } - - allErrors := append(restartErrors, scaleUpErrors.value()...) - if len(allErrors) != 0 { - return fmt.Errorf( - "stopContainers: %d error(s) restarting containers and services: %w", - len(allErrors), - errors.Join(allErrors...), - ) - } - s.logger.Info( - fmt.Sprintf( - "Restarted %d container(s) and %d service(s).", - len(stoppedContainers), - len(scaledDownServices), - ), - ) - return nil - }, initialErr -} - // createArchive creates a tar archive of the configured backup location and // saves it to disk. 
func (s *script) createArchive() error { From 26bbc66cd56f65d8882ad95b49a360e8627b9bf2 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sat, 27 Jan 2024 19:26:39 +0100 Subject: [PATCH 17/26] Factor out code for service updating --- cmd/backup/docker.go | 127 ++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 67 deletions(-) diff --git a/cmd/backup/docker.go b/cmd/backup/docker.go index 33f37216..ed60170c 100644 --- a/cmd/backup/docker.go +++ b/cmd/backup/docker.go @@ -13,8 +13,53 @@ import ( ctr "github.com/docker/docker/api/types/container" "github.com/docker/docker/api/types/filters" "github.com/docker/docker/api/types/swarm" + "github.com/docker/docker/client" ) +func scaleService(cli *client.Client, serviceID string, replicas uint64) ([]string, error) { + service, _, err := cli.ServiceInspectWithRaw(context.Background(), serviceID, types.ServiceInspectOptions{}) + if err != nil { + return nil, fmt.Errorf("scaleService: error inspecting service %s: %w", serviceID, err) + } + serviceMode := &service.Spec.Mode + switch { + case serviceMode.Replicated != nil: + serviceMode.Replicated.Replicas = &replicas + default: + return nil, fmt.Errorf("scaleService: service to be scaled %s has to be in replicated mode", service.Spec.Name) + } + + response, err := cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}) + if err != nil { + return nil, fmt.Errorf("scaleService: error updating service: %w", err) + } + + discardWriter := &noopWriteCloser{io.Discard} + if err := progress.ServiceProgress(context.Background(), cli, service.ID, discardWriter); err != nil { + return nil, err + } + return response.Warnings, nil +} + +func awaitContainerCountForService(cli *client.Client, serviceID string, count int) error { + for { + containers, err := cli.ContainerList(context.Background(), types.ContainerListOptions{ + Filters: filters.NewArgs(filters.KeyValuePair{ + Key: "label", + Value: fmt.Sprintf("com.docker.swarm.service.id=%s", serviceID), + }), + }) + if err != nil { + return fmt.Errorf("awaitContainerCount: error listing containers: %w", err) + } + if len(containers) == count { + break + } + time.Sleep(time.Second) + } + return nil +} + // stopContainersAndServices stops all Docker containers that are marked as to being // stopped during the backup and returns a function that can be called to // restart everything that has been stopped. 
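The two helpers added above, `scaleService` and `awaitContainerCountForService`, are wired into the calling code in the hunks that follow. Below is a minimal, hedged sketch of the intended call pattern: scale a labeled service to zero replicas, wait for its containers to drain, run the backup, then restore the recorded replica count. The function name `pauseServiceForBackup` and the `previousReplicas` parameter are illustrative only; the helper signatures are the ones introduced in this patch (the configurable timeout parameter is only added later in the series).

```go
package main

import (
	"fmt"
	"log"

	"github.com/docker/docker/client"
)

// pauseServiceForBackup sketches how the factored-out helpers are meant to be
// combined: scale a labeled swarm service down to zero, wait until all of its
// containers are gone, take the backup, then scale back to the previous count.
func pauseServiceForBackup(cli *client.Client, serviceID string, previousReplicas uint64) error {
	warnings, err := scaleService(cli, serviceID, 0)
	if err != nil {
		return fmt.Errorf("pauseServiceForBackup: error scaling down %s: %w", serviceID, err)
	}
	for _, w := range warnings {
		log.Printf("warning while scaling down %s: %s", serviceID, w)
	}
	// progress.ServiceProgress can return before all tasks are removed, so poll
	// until no containers belonging to the service remain.
	if err := awaitContainerCountForService(cli, serviceID, 0); err != nil {
		return err
	}

	// ... create the archive here ...

	// Scale back up to the replica count observed before the backup.
	_, err = scaleService(cli, serviceID, previousReplicas)
	return err
}
```

In the script itself, the pre-backup replica count is captured per service as `handledSwarmService.initialReplicaCount` before anything is scaled down, and that value is what the restore closure later passes back into `scaleService`.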
@@ -28,7 +73,6 @@ func (s *script) stopContainersAndServices() (func() error, error) { return noop, fmt.Errorf("(*script).stopContainersAndServices: error getting docker info: %w", err) } isDockerSwarm := dockerInfo.Swarm.LocalNodeState != "inactive" - discardWriter := &noopWriteCloser{io.Discard} filterMatchLabel := fmt.Sprintf( "docker-volume-backup.stop-during-backup=%s", @@ -119,7 +163,7 @@ func (s *script) stopContainersAndServices() (func() error, error) { } } - var scaledDownServices []swarm.Service + var scaledDownServices []handledSwarmService var scaleDownErrors concurrentSlice[error] if isDockerSwarm { wg := sync.WaitGroup{} @@ -127,60 +171,21 @@ func (s *script) stopContainersAndServices() (func() error, error) { wg.Add(1) go func(svc handledSwarmService) { defer wg.Done() - service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) - if err != nil { - scaleDownErrors.append( - fmt.Errorf("(*script).stopContainersAndServices: error inspecting service %s: %w", svc.serviceID, err), - ) - return - } - var zero uint64 = 0 - serviceMode := &service.Spec.Mode - switch { - case serviceMode.Replicated != nil: - serviceMode.Replicated.Replicas = &zero - default: - scaleDownErrors.append( - fmt.Errorf("(*script).stopContainersAndServices: labeled service %s has to be in replicated mode", service.Spec.Name), - ) - return - } - - response, err := s.cli.ServiceUpdate(context.Background(), service.ID, service.Version, service.Spec, types.ServiceUpdateOptions{}) + warnings, err := scaleService(s.cli, svc.serviceID, 0) if err != nil { scaleDownErrors.append(err) - return + } else { + scaledDownServices = append(scaledDownServices, svc) } - - for _, warning := range response.Warnings { + for _, warning := range warnings { s.logger.Warn( - fmt.Sprintf("The Docker API returned a warning when scaling down service %s: %s", service.Spec.Name, warning), + fmt.Sprintf("The Docker API returned a warning when scaling down service %s: %s", svc.serviceID, warning), ) } - - if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { - scaleDownErrors.append(err) - } else { - scaledDownServices = append(scaledDownServices, service) - } - // progress.ServiceProgress returns too early, so we need to manually check // whether all containers belonging to the service have actually been removed - for { - containers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{ - Filters: filters.NewArgs(filters.KeyValuePair{ - Key: "label", - Value: fmt.Sprintf("com.docker.swarm.service.id=%s", service.ID), - }), - }) - if err != nil { - scaleDownErrors.append(err) - break - } - if len(containers) == 0 { - break - } - time.Sleep(time.Second) + if err := awaitContainerCountForService(s.cli, svc.serviceID, 0); err != nil { + scaleDownErrors.append(err) } }(svc) } @@ -216,6 +221,9 @@ func (s *script) stopContainersAndServices() (func() error, error) { var restartErrors []error for _, container := range stoppedContainers { + // in case a container was part of a swarm service, teh service requires to + // be force updated instead of restarting the container as it would otherwise + // remain in a "completed" state if swarmServiceName, ok := container.Labels["com.docker.swarm.service.name"]; ok { servicesRequiringForceUpdate[swarmServiceName] = struct{}{} continue @@ -259,31 +267,16 @@ func (s *script) stopContainersAndServices() (func() error, error) { wg.Add(1) go func(svc handledSwarmService) { defer 
wg.Done() - service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), svc.serviceID, types.ServiceInspectOptions{}) - if err != nil { - scaleUpErrors.append(err) - return - } - - service.Spec.Mode.Replicated.Replicas = &svc.initialReplicaCount - response, err := s.cli.ServiceUpdate( - context.Background(), - service.ID, - service.Version, service.Spec, - types.ServiceUpdateOptions{}, - ) + warnings, err := scaleService(s.cli, svc.serviceID, svc.initialReplicaCount) if err != nil { - scaleUpErrors.append(err) + scaleDownErrors.append(err) return } - for _, warning := range response.Warnings { + for _, warning := range warnings { s.logger.Warn( - fmt.Sprintf("The Docker API returned a warning when scaling up service %s: %s", service.Spec.Name, warning), + fmt.Sprintf("The Docker API returned a warning when scaling up service %s: %s", svc.serviceID, warning), ) } - if err := progress.ServiceProgress(context.Background(), s.cli, service.ID, discardWriter); err != nil { - scaleUpErrors.append(err) - } }(svc) } wg.Wait() From 2bc94d8a5b9ac9fbde6c808bf4c6f48be821d40d Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sat, 27 Jan 2024 19:42:21 +0100 Subject: [PATCH 18/26] Time out after five minutes of not reaching desired container count --- cmd/backup/docker.go | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/cmd/backup/docker.go b/cmd/backup/docker.go index ed60170c..9caaad45 100644 --- a/cmd/backup/docker.go +++ b/cmd/backup/docker.go @@ -42,22 +42,34 @@ func scaleService(cli *client.Client, serviceID string, replicas uint64) ([]stri } func awaitContainerCountForService(cli *client.Client, serviceID string, count int) error { + poll := time.NewTicker(time.Second) + timeout := time.NewTicker(5 * time.Minute) + defer timeout.Stop() + defer poll.Stop() + for { - containers, err := cli.ContainerList(context.Background(), types.ContainerListOptions{ - Filters: filters.NewArgs(filters.KeyValuePair{ - Key: "label", - Value: fmt.Sprintf("com.docker.swarm.service.id=%s", serviceID), - }), - }) - if err != nil { - return fmt.Errorf("awaitContainerCount: error listing containers: %w", err) - } - if len(containers) == count { - break + select { + case <-timeout.C: + return fmt.Errorf( + "awaitContainerCount: timed out after waiting 5 minutes for service %s to reach desired container count of %d", + serviceID, + count, + ) + case <-poll.C: + containers, err := cli.ContainerList(context.Background(), types.ContainerListOptions{ + Filters: filters.NewArgs(filters.KeyValuePair{ + Key: "label", + Value: fmt.Sprintf("com.docker.swarm.service.id=%s", serviceID), + }), + }) + if err != nil { + return fmt.Errorf("awaitContainerCount: error listing containers: %w", err) + } + if len(containers) == count { + return nil + } } - time.Sleep(time.Second) } - return nil } // stopContainersAndServices stops all Docker containers that are marked as to being From 542d1fa69fca6c9418015c831949b60d765f56a4 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sat, 27 Jan 2024 19:48:50 +0100 Subject: [PATCH 19/26] Inline handling of in-swarm container level restart --- cmd/backup/docker.go | 49 ++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/cmd/backup/docker.go b/cmd/backup/docker.go index 9caaad45..87de18dd 100644 --- a/cmd/backup/docker.go +++ b/cmd/backup/docker.go @@ -229,46 +229,37 @@ func (s *script) stopContainersAndServices() (func() error, error) { } return func() error { - 
servicesRequiringForceUpdate := map[string]struct{}{} - var restartErrors []error + matchedServices := map[string]bool{} for _, container := range stoppedContainers { - // in case a container was part of a swarm service, teh service requires to - // be force updated instead of restarting the container as it would otherwise - // remain in a "completed" state - if swarmServiceName, ok := container.Labels["com.docker.swarm.service.name"]; ok { - servicesRequiringForceUpdate[swarmServiceName] = struct{}{} - continue - } - if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil { - restartErrors = append(restartErrors, err) - } - } - - if len(servicesRequiringForceUpdate) != 0 { - services, _ := s.cli.ServiceList(context.Background(), types.ServiceListOptions{}) - for serviceName := range servicesRequiringForceUpdate { - var serviceMatch swarm.Service - for _, service := range services { - if service.Spec.Name == serviceName { - serviceMatch = service - break - } + if swarmServiceID, ok := container.Labels["com.docker.swarm.service.id"]; ok && isDockerSwarm { + if _, ok := matchedServices[swarmServiceID]; ok { + continue } - if serviceMatch.ID == "" { + matchedServices[swarmServiceID] = true + // in case a container was part of a swarm service, the service requires to + // be force updated instead of restarting the container as it would otherwise + // remain in a "completed" state + service, _, err := s.cli.ServiceInspectWithRaw(context.Background(), swarmServiceID, types.ServiceInspectOptions{}) + if err != nil { restartErrors = append( restartErrors, - fmt.Errorf("(*script).stopContainersAndServices: couldn't find service with name %s", serviceName), + fmt.Errorf("(*script).stopContainersAndServices: error looking up parent service: %w", err), ) continue } - serviceMatch.Spec.TaskTemplate.ForceUpdate += 1 + service.Spec.TaskTemplate.ForceUpdate += 1 if _, err := s.cli.ServiceUpdate( - context.Background(), serviceMatch.ID, - serviceMatch.Version, serviceMatch.Spec, types.ServiceUpdateOptions{}, + context.Background(), service.ID, + service.Version, service.Spec, types.ServiceUpdateOptions{}, ); err != nil { restartErrors = append(restartErrors, err) } + continue + } + + if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil { + restartErrors = append(restartErrors, err) } } @@ -297,7 +288,7 @@ func (s *script) stopContainersAndServices() (func() error, error) { allErrors := append(restartErrors, scaleUpErrors.value()...) 
if len(allErrors) != 0 { return fmt.Errorf( - "stopContainers: %d error(s) restarting containers and services: %w", + "(*script).stopContainersAndServices: %d error(s) restarting containers and services: %w", len(allErrors), errors.Join(allErrors...), ) From 409496af240c0589cde2afccb36768d21e47d185 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sat, 27 Jan 2024 21:07:04 +0100 Subject: [PATCH 20/26] Timer is more suitable for timeout race --- cmd/backup/docker.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/backup/docker.go b/cmd/backup/docker.go index 87de18dd..340e14f2 100644 --- a/cmd/backup/docker.go +++ b/cmd/backup/docker.go @@ -43,7 +43,7 @@ func scaleService(cli *client.Client, serviceID string, replicas uint64) ([]stri func awaitContainerCountForService(cli *client.Client, serviceID string, count int) error { poll := time.NewTicker(time.Second) - timeout := time.NewTicker(5 * time.Minute) + timeout := time.NewTimer(5 * time.Minute) defer timeout.Stop() defer poll.Stop() From 9acd6dc8abb81716cbd59d56186b7479aabcba8d Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sun, 28 Jan 2024 14:35:02 +0100 Subject: [PATCH 21/26] Timeout when scaling down services should be configurable --- cmd/backup/config.go | 1 + cmd/backup/docker.go | 9 +++++---- docs/reference/index.md | 8 ++++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/cmd/backup/config.go b/cmd/backup/config.go index 0a5122e6..3afbe251 100644 --- a/cmd/backup/config.go +++ b/cmd/backup/config.go @@ -38,6 +38,7 @@ type Config struct { BackupPruningLeeway time.Duration `split_words:"true" default:"1m"` BackupPruningPrefix string `split_words:"true"` BackupStopContainerLabel string `split_words:"true" default:"true"` + BackupStopServiceTimeout time.Duration `split_words:"true" default:"5m"` BackupFromSnapshot bool `split_words:"true"` BackupExcludeRegexp RegexpDecoder `split_words:"true"` BackupSkipBackendsFromPrune []string `split_words:"true"` diff --git a/cmd/backup/docker.go b/cmd/backup/docker.go index 340e14f2..ca3265ba 100644 --- a/cmd/backup/docker.go +++ b/cmd/backup/docker.go @@ -41,9 +41,9 @@ func scaleService(cli *client.Client, serviceID string, replicas uint64) ([]stri return response.Warnings, nil } -func awaitContainerCountForService(cli *client.Client, serviceID string, count int) error { +func awaitContainerCountForService(cli *client.Client, serviceID string, count int, timeoutAfter time.Duration) error { poll := time.NewTicker(time.Second) - timeout := time.NewTimer(5 * time.Minute) + timeout := time.NewTimer(timeoutAfter) defer timeout.Stop() defer poll.Stop() @@ -51,7 +51,8 @@ func awaitContainerCountForService(cli *client.Client, serviceID string, count i select { case <-timeout.C: return fmt.Errorf( - "awaitContainerCount: timed out after waiting 5 minutes for service %s to reach desired container count of %d", + "awaitContainerCount: timed out after waiting %s for service %s to reach desired container count of %d", + timeoutAfter, serviceID, count, ) @@ -196,7 +197,7 @@ func (s *script) stopContainersAndServices() (func() error, error) { } // progress.ServiceProgress returns too early, so we need to manually check // whether all containers belonging to the service have actually been removed - if err := awaitContainerCountForService(s.cli, svc.serviceID, 0); err != nil { + if err := awaitContainerCountForService(s.cli, svc.serviceID, 0, s.c.BackupStopServiceTimeout); err != nil { scaleDownErrors.append(err) } }(svc) diff --git a/docs/reference/index.md 
b/docs/reference/index.md index cdbe3dc2..9eb157de 100644 --- a/docs/reference/index.md +++ b/docs/reference/index.md @@ -326,6 +326,14 @@ You can populate below template according to your requirements and use it as you # BACKUP_STOP_CONTAINER_LABEL="service1" +# When trying to scale down Docker Swarm services, give up after +# the specified amount of time in case the service has not converged yet. +# In case you need to adjust this timeout, supply a duration +# value as per https://pkg.go.dev/time#ParseDuration to `BACKUP_STOP_SERVICE_TIMEOUT`. +# Defaults to 5 minutes. + +# BACKUP_STOP_SERVICE_TIMEOUT="5m" + ########### EXECUTING COMMANDS IN CONTAINERS PRE/POST BACKUP # It is possible to define commands to be run in any container before and after From 4639b21f3bacc7db44f43ba6cfc52cb6bdce93be Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sun, 28 Jan 2024 18:06:06 +0100 Subject: [PATCH 22/26] Choose better filename --- cmd/backup/{docker.go => stop_restart.go} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cmd/backup/{docker.go => stop_restart.go} (100%) diff --git a/cmd/backup/docker.go b/cmd/backup/stop_restart.go similarity index 100% rename from cmd/backup/docker.go rename to cmd/backup/stop_restart.go From 57e7f2af9eca3319a3f6f5f1a625972a812923d1 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Sun, 28 Jan 2024 19:31:09 +0100 Subject: [PATCH 23/26] Reflect changes in naming --- cmd/backup/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/backup/main.go b/cmd/backup/main.go index 64c6f21a..b61c157c 100644 --- a/cmd/backup/main.go +++ b/cmd/backup/main.go @@ -47,12 +47,12 @@ func main() { }() s.must(s.withLabeledCommands(lifecyclePhaseArchive, func() error { - restartContainers, err := s.stopContainersAndServices() + restartContainersAndServices, err := s.stopContainersAndServices() // The mechanism for restarting containers is not using hooks as it // should happen as soon as possible (i.e. before uploading backups or // similar). 
defer func() { - s.must(restartContainers()) + s.must(restartContainersAndServices()) }() if err != nil { return err From 7d489a95e3dc79c04cf197530ee34c9ec04e7708 Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Mon, 29 Jan 2024 15:15:29 +0100 Subject: [PATCH 24/26] Rename and deprecate BACKUP_STOP_CONTAINER_LABEL --- cmd/backup/config.go | 3 ++- cmd/backup/script.go | 2 +- cmd/backup/stop_restart.go | 17 ++++++++++++++++- ...-deprecated-backup-stop-container-label.md | 19 +++++++++++++++++++ docs/how-tos/set-up-notifications.md | 2 +- docs/how-tos/stop-containers-during-backup.md | 4 ++-- docs/recipes/index.md | 4 ++-- docs/reference/index.md | 17 ++++++++--------- 8 files changed, 51 insertions(+), 17 deletions(-) create mode 100644 docs/how-tos/replace-deprecated-backup-stop-container-label.md diff --git a/cmd/backup/config.go b/cmd/backup/config.go index 3afbe251..db39acac 100644 --- a/cmd/backup/config.go +++ b/cmd/backup/config.go @@ -37,7 +37,8 @@ type Config struct { BackupRetentionDays int32 `split_words:"true" default:"-1"` BackupPruningLeeway time.Duration `split_words:"true" default:"1m"` BackupPruningPrefix string `split_words:"true"` - BackupStopContainerLabel string `split_words:"true" default:"true"` + BackupStopContainerLabel string `split_words:"true"` + BackupStopDuringBackupLabel string `split_words:"true" default:"true"` BackupStopServiceTimeout time.Duration `split_words:"true" default:"5m"` BackupFromSnapshot bool `split_words:"true"` BackupExcludeRegexp RegexpDecoder `split_words:"true"` diff --git a/cmd/backup/script.go b/cmd/backup/script.go index 65b8009d..747a4ddc 100644 --- a/cmd/backup/script.go +++ b/cmd/backup/script.go @@ -322,7 +322,7 @@ func (s *script) createArchive() error { "Using BACKUP_FROM_SNAPSHOT has been deprecated and will be removed in the next major version.", ) s.logger.Warn( - "Please use `archive-pre` and `archive-post` commands to prepare your backup sources. Refer to the README for an upgrade guide.", + "Please use `archive-pre` and `archive-post` commands to prepare your backup sources. Refer to the documentation for an upgrade guide.", ) backupSources = filepath.Join("/tmp", s.c.BackupSources) // copy before compressing guard against a situation where backup folder's content are still growing. diff --git a/cmd/backup/stop_restart.go b/cmd/backup/stop_restart.go index ca3265ba..0dba1381 100644 --- a/cmd/backup/stop_restart.go +++ b/cmd/backup/stop_restart.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "os" "sync" "time" @@ -87,9 +88,23 @@ func (s *script) stopContainersAndServices() (func() error, error) { } isDockerSwarm := dockerInfo.Swarm.LocalNodeState != "inactive" + labelValue := s.c.BackupStopDuringBackupLabel + if s.c.BackupStopContainerLabel != "" { + s.logger.Warn( + "Using BACKUP_STOP_CONTAINER_LABEL has been deprecated and will be removed in the next major version.", + ) + s.logger.Warn( + "Please use BACKUP_STOP_DURING_BACKUP_LABEL instead. 
Refer to the docs for an upgrade guide.", + ) + if _, ok := os.LookupEnv("BACKUP_STOP_DURING_BACKUP_LABEL"); ok { + return noop, errors.New("(*script).stopContainersAndServices: both BACKUP_STOP_DURING_BACKUP_LABEL and BACKUP_STOP_CONTAINER_LABEL have been set, cannot continue") + } + labelValue = s.c.BackupStopContainerLabel + } + filterMatchLabel := fmt.Sprintf( "docker-volume-backup.stop-during-backup=%s", - s.c.BackupStopContainerLabel, + labelValue, ) allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{}) diff --git a/docs/how-tos/replace-deprecated-backup-stop-container-label.md b/docs/how-tos/replace-deprecated-backup-stop-container-label.md new file mode 100644 index 00000000..f8841cb3 --- /dev/null +++ b/docs/how-tos/replace-deprecated-backup-stop-container-label.md @@ -0,0 +1,19 @@ +--- +title: Replace deprecated BACKUP_STOP_CONTAINER_LABEL setting +layout: default +parent: How Tos +nav_order: 19 +--- + +# Replace deprecated `BACKUP_STOP_CONTAINER_LABEL` setting + +Version `v2.36.0` deprecated the `BACKUP_STOP_CONTAINER_LABEL` setting and renamed it `BACKUP_STOP_DURING_BACKUP_LABEL` which is supposed to signal that this will stop both containers _and_ services. +Migrating is done by renaming the key for your custom value: + +```diff + env: +- BACKUP_STOP_CONTAINER_LABEL: database ++ BACKUP_STOP_DURING_BACKUP_LABEL: database +``` + +The old key will stay supported until the next major version, but logs a warning each time a backup is taken. diff --git a/docs/how-tos/set-up-notifications.md b/docs/how-tos/set-up-notifications.md index 23d2aaf6..3c0b4a59 100644 --- a/docs/how-tos/set-up-notifications.md +++ b/docs/how-tos/set-up-notifications.md @@ -76,7 +76,7 @@ Configuration, data about the backup run and helper functions will be passed to Here is a list of all data passed to the template: -* `Config`: this object holds the configuration that has been passed to the script. The field names are the name of the recognized environment variables converted in PascalCase. (e.g. `BACKUP_STOP_CONTAINER_LABEL` becomes `BackupStopContainerLabel`) +* `Config`: this object holds the configuration that has been passed to the script. The field names are the name of the recognized environment variables converted in PascalCase. (e.g. `BACKUP_STOP_DURING_BACKUP_LABEL` becomes `BackupStopDuringBackupLabel`) * `Error`: the error that made the backup fail. Only available in the `title_failure` and `body_failure` templates * `Stats`: objects that holds stats regarding script execution. In case of an unsuccessful run, some information may not be available. * `StartTime`: time when the script started execution diff --git a/docs/how-tos/stop-containers-during-backup.md b/docs/how-tos/stop-containers-during-backup.md index f87f7540..bc913458 100644 --- a/docs/how-tos/stop-containers-during-backup.md +++ b/docs/how-tos/stop-containers-during-backup.md @@ -14,7 +14,7 @@ In many cases, it will be desirable to stop the services that are consuming the This image can automatically stop and restart containers and services. By default, any container that is labeled `docker-volume-backup.stop-during-backup=true` will be stopped before the backup is being taken and restarted once it has finished. -In case you need more fine grained control about which containers should be stopped (e.g. 
when backing up multiple volumes on different schedules), you can set the `BACKUP_STOP_CONTAINER_LABEL` environment variable and then use the same value for labeling: +In case you need more fine grained control about which containers should be stopped (e.g. when backing up multiple volumes on different schedules), you can set the `BACKUP_STOP_DURING_BACKUP_LABEL` environment variable and then use the same value for labeling: ```yml version: '3' @@ -28,7 +28,7 @@ services: backup: image: offen/docker-volume-backup:v2 environment: - BACKUP_STOP_CONTAINER_LABEL: service1 + BACKUP_STOP_DURING_BACKUP_LABEL: service1 volumes: - data:/backup/my-app-backup:ro - /var/run/docker.sock:/var/run/docker.sock:ro diff --git a/docs/recipes/index.md b/docs/recipes/index.md index cd483f0b..5a2220c0 100644 --- a/docs/recipes/index.md +++ b/docs/recipes/index.md @@ -352,7 +352,7 @@ services: AWS_ACCESS_KEY_ID: AKIAIOSFODNN7EXAMPLE AWS_SECRET_ACCESS_KEY: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY # Label the container using the `data_1` volume as `docker-volume-backup.stop-during-backup=service1` - BACKUP_STOP_CONTAINER_LABEL: service1 + BACKUP_STOP_DURING_BACKUP_LABEL: service1 volumes: - data_1:/backup/data-1-backup:ro - /var/run/docker.sock:/var/run/docker.sock:ro @@ -362,7 +362,7 @@ services: <<: *backup_environment # Label the container using the `data_2` volume as `docker-volume-backup.stop-during-backup=service2` BACKUP_CRON_EXPRESSION: "0 3 * * *" - BACKUP_STOP_CONTAINER_LABEL: service2 + BACKUP_STOP_DURING_BACKUP_LABEL: service2 volumes: - data_2:/backup/data-2-backup:ro - /var/run/docker.sock:/var/run/docker.sock:ro diff --git a/docs/reference/index.md b/docs/reference/index.md index 9eb157de..8caf7755 100644 --- a/docs/reference/index.md +++ b/docs/reference/index.md @@ -316,15 +316,14 @@ You can populate below template according to your requirements and use it as you # GPG_PASSPHRASE="" -########### STOPPING CONTAINERS DURING BACKUP - -# Containers can be stopped by applying a -# `docker-volume-backup.stop-during-backup` label. By default, all containers -# that are labeled with `true` will be stopped. If you need more fine grained -# control (e.g. when running multiple containers based on this image), you can -# override this default by specifying a different value here. - -# BACKUP_STOP_CONTAINER_LABEL="service1" +########### STOPPING CONTAINERS AND SERVICES DURING BACKUP + +# Containers or services can be stopped by applying a +# `docker-volume-backup.stop-during-backup` label. By default, all containers and +# services that are labeled with `true` will be stopped. If you need more fine +# grained control (e.g. when running multiple containers based on this image), +# you can override this default by specifying a different value here. +# BACKUP_STOP_DURING_BACKUP_LABEL="service1" # When trying to scale down Docker Swarm services, give up after # the specified amount of time in case the service has not converged yet. 
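Both settings touched in this part of the reference documentation, the renamed stop label and the service scale-down timeout, are read through envconfig like the rest of the configuration. The sketch below shows how the `split_words` and `default` struct tags map environment variables such as `BACKUP_STOP_SERVICE_TIMEOUT` onto the corresponding fields; `exampleConfig` is a trimmed stand-in for the real `Config` struct and assumes the upstream `envconfig.Process` API.

```go
package main

import (
	"fmt"
	"time"

	"github.com/offen/envconfig"
)

// exampleConfig is a trimmed stand-in for the real Config struct, keeping only
// the two fields discussed here.
type exampleConfig struct {
	BackupStopDuringBackupLabel string        `split_words:"true" default:"true"`
	BackupStopServiceTimeout    time.Duration `split_words:"true" default:"5m"`
}

func main() {
	// split_words makes envconfig read BACKUP_STOP_DURING_BACKUP_LABEL and
	// BACKUP_STOP_SERVICE_TIMEOUT; the default tags apply when they are unset,
	// and duration values follow time.ParseDuration (e.g. "90s", "5m", "1h30m").
	var c exampleConfig
	if err := envconfig.Process("", &c); err != nil {
		panic(err)
	}
	fmt.Printf("stop label value: %q, service timeout: %v\n",
		c.BackupStopDuringBackupLabel, c.BackupStopServiceTimeout)
}
```

Because `BackupStopContainerLabel` no longer carries a default value after this patch, the deprecation shim in `stop_restart.go` can detect whether the old variable was set explicitly and only then fall back to it, erroring out if both old and new variables are present.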
From 87ea8d0930dd6ee36f24abc7e21971bb1804a3ad Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Mon, 29 Jan 2024 16:20:50 +0100 Subject: [PATCH 25/26] Improve logging --- cmd/backup/stop_restart.go | 55 ++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/cmd/backup/stop_restart.go b/cmd/backup/stop_restart.go index 0dba1381..8c5d8396 100644 --- a/cmd/backup/stop_restart.go +++ b/cmd/backup/stop_restart.go @@ -170,16 +170,27 @@ func (s *script) stopContainersAndServices() (func() error, error) { } } - s.logger.Info( - fmt.Sprintf( - "Stopping %d out of %d running container(s) and scaling down %d out of %d active service(s) as they were labeled %s.", - len(containersToStop), - len(allContainers), - len(servicesToScaleDown), - len(allServices), - filterMatchLabel, - ), - ) + if isDockerSwarm { + s.logger.Info( + fmt.Sprintf( + "Stopping %d out of %d running container(s) and scaling down %d out of %d active service(s) as they were labeled %s.", + len(containersToStop), + len(allContainers), + len(servicesToScaleDown), + len(allServices), + filterMatchLabel, + ), + ) + } else { + s.logger.Info( + fmt.Sprintf( + "Stopping %d out of %d running container(s) as they were labeled %s.", + len(containersToStop), + len(allContainers), + filterMatchLabel, + ), + ) + } var stoppedContainers []types.Container var stopErrors []error @@ -309,13 +320,23 @@ func (s *script) stopContainersAndServices() (func() error, error) { errors.Join(allErrors...), ) } - s.logger.Info( - fmt.Sprintf( - "Restarted %d container(s) and %d service(s).", - len(stoppedContainers), - len(scaledDownServices), - ), - ) + if isDockerSwarm { + s.logger.Info( + fmt.Sprintf( + "Restarted %d container(s) and %d service(s).", + len(stoppedContainers), + len(scaledDownServices), + ), + ) + } else { + s.logger.Info( + fmt.Sprintf( + "Restarted %d container(s).", + len(stoppedContainers), + ), + ) + } + return nil }, initialErr } From b13f189ecac8a4293c461c9117096e3968fd9d0f Mon Sep 17 00:00:00 2001 From: Frederik Ring Date: Mon, 29 Jan 2024 20:45:54 +0100 Subject: [PATCH 26/26] Further simplify logging --- cmd/backup/stop_restart.go | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/cmd/backup/stop_restart.go b/cmd/backup/stop_restart.go index 8c5d8396..fa3219e5 100644 --- a/cmd/backup/stop_restart.go +++ b/cmd/backup/stop_restart.go @@ -170,26 +170,23 @@ func (s *script) stopContainersAndServices() (func() error, error) { } } + s.logger.Info( + fmt.Sprintf( + "Stopping %d out of %d running container(s) as they were labeled %s.", + len(containersToStop), + len(allContainers), + filterMatchLabel, + ), + ) if isDockerSwarm { s.logger.Info( fmt.Sprintf( - "Stopping %d out of %d running container(s) and scaling down %d out of %d active service(s) as they were labeled %s.", - len(containersToStop), - len(allContainers), + "Scaling down %d out of %d active service(s) as they were labeled %s.", len(servicesToScaleDown), len(allServices), filterMatchLabel, ), ) - } else { - s.logger.Info( - fmt.Sprintf( - "Stopping %d out of %d running container(s) as they were labeled %s.", - len(containersToStop), - len(allContainers), - filterMatchLabel, - ), - ) } var stoppedContainers []types.Container @@ -320,21 +317,20 @@ func (s *script) stopContainersAndServices() (func() error, error) { errors.Join(allErrors...), ) } + + s.logger.Info( + fmt.Sprintf( + "Restarted %d container(s).", + len(stoppedContainers), + ), + ) if isDockerSwarm { s.logger.Info( 
fmt.Sprintf( - "Restarted %d container(s) and %d service(s).", - len(stoppedContainers), + "Scaled %d service(s) back up.", len(scaledDownServices), ), ) - } else { - s.logger.Info( - fmt.Sprintf( - "Restarted %d container(s).", - len(stoppedContainers), - ), - ) } return nil