Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Swarm support #333

Merged
merged 26 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
270ca65
Query for labeled services as well
m90 Jan 25, 2024
8ef7fa0
Try scaling down services
m90 Jan 25, 2024
511b79b
Scale services back up
m90 Jan 25, 2024
978e900
Use progress tool from Docker CLI
m90 Jan 26, 2024
f14b796
In test, label both services
m90 Jan 26, 2024
b785560
Clean up error and log messages
m90 Jan 26, 2024
fee8cb2
Document scale-up/down approach in docs
m90 Jan 26, 2024
95e9e99
Downgrade Docker CLI to match client
m90 Jan 26, 2024
f449717
Document services stats
m90 Jan 26, 2024
94aa333
Do not rely on PreviousSpec for storing desired replica count
m90 Jan 27, 2024
78a89c1
Log warnings from Docker when updating services
m90 Jan 27, 2024
538a069
Check whether container and service labels collide
m90 Jan 27, 2024
bf1d13b
Document script behavior on label collision
m90 Jan 27, 2024
bb37b8b
Add additional check if all containers have been removed
m90 Jan 27, 2024
7ad6fc9
Scale services concurrently
m90 Jan 27, 2024
09cc1f5
Move docker interaction code into own file
m90 Jan 27, 2024
26bbc66
Factor out code for service updating
m90 Jan 27, 2024
2bc94d8
Time out after five minutes of not reaching desired container count
m90 Jan 27, 2024
542d1fa
Inline handling of in-swarm container level restart
m90 Jan 27, 2024
409496a
Timer is more suitable for timeout race
m90 Jan 27, 2024
9acd6dc
Timeout when scaling down services should be configurable
m90 Jan 28, 2024
4639b21
Choose better filename
m90 Jan 28, 2024
57e7f2a
Reflect changes in naming
m90 Jan 28, 2024
7d489a9
Rename and deprecate BACKUP_STOP_CONTAINER_LABEL
m90 Jan 29, 2024
87ea8d0
Improve logging
m90 Jan 29, 2024
b13f189
Further simplify logging
m90 Jan 29, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmd/backup/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ type Config struct {
BackupRetentionDays int32 `split_words:"true" default:"-1"`
BackupPruningLeeway time.Duration `split_words:"true" default:"1m"`
BackupPruningPrefix string `split_words:"true"`
BackupStopContainerLabel string `split_words:"true" default:"true"`
BackupStopContainerLabel string `split_words:"true"`
BackupStopDuringBackupLabel string `split_words:"true" default:"true"`
BackupStopServiceTimeout time.Duration `split_words:"true" default:"5m"`
BackupFromSnapshot bool `split_words:"true"`
BackupExcludeRegexp RegexpDecoder `split_words:"true"`
BackupSkipBackendsFromPrune []string `split_words:"true"`
Expand Down
4 changes: 2 additions & 2 deletions cmd/backup/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ func main() {
}()

s.must(s.withLabeledCommands(lifecyclePhaseArchive, func() error {
restartContainers, err := s.stopContainers()
restartContainersAndServices, err := s.stopContainersAndServices()
// The mechanism for restarting containers is not using hooks as it
// should happen as soon as possible (i.e. before uploading backups or
// similar).
defer func() {
s.must(restartContainers())
s.must(restartContainersAndServices())
}()
if err != nil {
return err
Expand Down
128 changes: 1 addition & 127 deletions cmd/backup/script.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ package main

import (
"bytes"
"context"
"errors"
"fmt"
"io"
"io/fs"
Expand All @@ -30,10 +28,6 @@ import (
openpgp "github.com/ProtonMail/go-crypto/openpgp/v2"
"github.com/containrrr/shoutrrr"
"github.com/containrrr/shoutrrr/pkg/router"
"github.com/docker/docker/api/types"
ctr "github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/filters"
"github.com/docker/docker/api/types/swarm"
"github.com/docker/docker/client"
"github.com/leekchan/timeutil"
"github.com/offen/envconfig"
Expand Down Expand Up @@ -318,126 +312,6 @@ func newScript() (*script, error) {
return s, nil
}

// stopContainers stops all Docker containers that carry the
// `docker-volume-backup.stop-during-backup` label with the configured value
// and returns a function that can be called to restart everything that has
// been stopped.
//
// When no Docker client is configured, or no container matches the label, the
// returned function is a noop. The returned error reports containers that
// could not be stopped; the restart function is still valid in that case and
// only covers the containers that were actually stopped.
func (s *script) stopContainers() (func() error, error) {
	if s.cli == nil {
		return noop, nil
	}

	allContainers, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{})
	if err != nil {
		return noop, fmt.Errorf("stopContainers: error querying for containers: %w", err)
	}

	containerLabel := fmt.Sprintf(
		"docker-volume-backup.stop-during-backup=%s",
		s.c.BackupStopContainerLabel,
	)
	containersToStop, err := s.cli.ContainerList(context.Background(), types.ContainerListOptions{
		Filters: filters.NewArgs(filters.KeyValuePair{
			Key:   "label",
			Value: containerLabel,
		}),
	})
	if err != nil {
		return noop, fmt.Errorf("stopContainers: error querying for containers to stop: %w", err)
	}

	if len(containersToStop) == 0 {
		return noop, nil
	}

	s.logger.Info(
		fmt.Sprintf(
			"Stopping %d container(s) labeled `%s` out of %d running container(s).",
			len(containersToStop),
			containerLabel,
			len(allContainers),
		),
	)

	// Keep going when a single container fails to stop so that as many
	// containers as possible are stopped; failures are reported collectively.
	var stoppedContainers []types.Container
	var stopErrors []error
	for _, container := range containersToStop {
		if err := s.cli.ContainerStop(context.Background(), container.ID, ctr.StopOptions{}); err != nil {
			stopErrors = append(stopErrors, err)
			continue
		}
		stoppedContainers = append(stoppedContainers, container)
	}

	var stopError error
	if len(stopErrors) != 0 {
		stopError = fmt.Errorf(
			"stopContainers: %d error(s) stopping containers: %w",
			len(stopErrors),
			errors.Join(stopErrors...),
		)
	}

	s.stats.Containers = ContainersStats{
		All:        uint(len(allContainers)),
		ToStop:     uint(len(containersToStop)),
		Stopped:    uint(len(stoppedContainers)),
		StopErrors: uint(len(stopErrors)),
	}

	return func() error {
		// Containers that belong to a Swarm service are not started directly.
		// Instead, the owning service is force-updated once.
		servicesRequiringUpdate := map[string]struct{}{}

		var restartErrors []error
		for _, container := range stoppedContainers {
			if swarmServiceName, ok := container.Labels["com.docker.swarm.service.name"]; ok {
				servicesRequiringUpdate[swarmServiceName] = struct{}{}
				continue
			}
			if err := s.cli.ContainerStart(context.Background(), container.ID, types.ContainerStartOptions{}); err != nil {
				restartErrors = append(restartErrors, err)
			}
		}

		if len(servicesRequiringUpdate) != 0 {
			services, err := s.cli.ServiceList(context.Background(), types.ServiceListOptions{})
			if err != nil {
				// Without the service list no service can be matched, so
				// report the query failure instead of a misleading
				// "couldn't find service" error.
				restartErrors = append(
					restartErrors,
					fmt.Errorf("stopContainers: error querying for services: %w", err),
				)
			} else {
				for serviceName := range servicesRequiringUpdate {
					var serviceMatch swarm.Service
					for _, service := range services {
						if service.Spec.Name == serviceName {
							serviceMatch = service
							break
						}
					}
					if serviceMatch.ID == "" {
						// Record the failure and continue so the remaining
						// services are still updated and previously collected
						// errors are not lost.
						restartErrors = append(
							restartErrors,
							fmt.Errorf("stopContainers: couldn't find service with name %s", serviceName),
						)
						continue
					}
					// Bumping ForceUpdate makes Docker redeploy the service's
					// tasks even though the spec is otherwise unchanged.
					serviceMatch.Spec.TaskTemplate.ForceUpdate += 1
					if _, err := s.cli.ServiceUpdate(
						context.Background(), serviceMatch.ID,
						serviceMatch.Version, serviceMatch.Spec, types.ServiceUpdateOptions{},
					); err != nil {
						restartErrors = append(restartErrors, err)
					}
				}
			}
		}

		if len(restartErrors) != 0 {
			return fmt.Errorf(
				"stopContainers: %d error(s) restarting containers and services: %w",
				len(restartErrors),
				errors.Join(restartErrors...),
			)
		}
		s.logger.Info(
			fmt.Sprintf(
				"Restarted %d container(s) and the matching service(s).",
				len(stoppedContainers),
			),
		)
		return nil
	}, stopError
}

// createArchive creates a tar archive of the configured backup location and
// saves it to disk.
func (s *script) createArchive() error {
Expand All @@ -448,7 +322,7 @@ func (s *script) createArchive() error {
"Using BACKUP_FROM_SNAPSHOT has been deprecated and will be removed in the next major version.",
)
s.logger.Warn(
"Please use `archive-pre` and `archive-post` commands to prepare your backup sources. Refer to the README for an upgrade guide.",
"Please use `archive-pre` and `archive-post` commands to prepare your backup sources. Refer to the documentation for an upgrade guide.",
)
backupSources = filepath.Join("/tmp", s.c.BackupSources)
// copy before compressing to guard against a situation where the backup folder's contents are still growing.
Expand Down
10 changes: 10 additions & 0 deletions cmd/backup/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@ type ContainersStats struct {
StopErrors uint
}

// ServicesStats holds counters about the Swarm services that have been
// operated upon. All four fields are unsigned counts; judging by their names
// they track the total number of services, the services selected for scaling
// down, the services actually scaled down and the errors encountered while
// doing so (NOTE(review): confirm against the code that populates them).
type ServicesStats struct {
	All, ToScaleDown, ScaledDown, ScaleDownErrors uint
}

// BackupFileStats stats about the created backup file
type BackupFileStats struct {
Name string
Expand All @@ -40,6 +49,7 @@ type Stats struct {
LockedTime time.Duration
LogOutput *bytes.Buffer
Containers ContainersStats
Services ServicesStats
BackupFile BackupFileStats
Storages map[string]StorageStats
}
Loading
Loading