Skip to content

Commit

Permalink
Zero downtime deployment with cord file
Browse files Browse the repository at this point in the history
When replacing a container currently we:
1. Boot the new container
2. Wait for it to become healthy
3. Stop the old container

Traefik will send requests to the old container until it notices that it
is unhealthy. But the container may have stopped serving requests before
that point, which can result in errors.

To get around that, the new boot process is:

1. Create a directory with a single file on the host
2. Boot the new container, mounting the cord file into /tmp and
including a check for the file in the docker healthcheck
3. Wait for it to become healthy
4. Delete the healthcheck file ("cut the cord") for the old container
5. Wait for it to become unhealthy and give Traefik a couple of seconds
to notice
6. Stop the old container

The extra steps ensure that Traefik stops sending requests before the
old container is shut down.
  • Loading branch information
djmb committed Aug 31, 2023
1 parent 989d09e commit 73943ae
Show file tree
Hide file tree
Showing 17 changed files with 233 additions and 64 deletions.
21 changes: 18 additions & 3 deletions lib/kamal/cli/app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ def boot
roles.each do |role|
app = KAMAL.app(role: role)
auditor = KAMAL.auditor(role: role)
role_config = KAMAL.config.role(role)

if capture_with_info(*app.container_id_for_version(version, only_running: true), raise_on_non_zero_exit: false).present?
if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present?
tmp_version = "#{version}_replaced_#{SecureRandom.hex(8)}"
info "Renaming container #{version} to #{tmp_version} as already deployed on #{host}"
execute *auditor.record("Renaming container #{version} to #{tmp_version}"), verbosity: :debug
Expand All @@ -29,11 +30,25 @@ def boot
execute *auditor.record("Booted app version #{version}"), verbosity: :debug

old_version = capture_with_info(*app.current_running_version, raise_on_non_zero_exit: false).strip
execute *app.start_or_run(hostname: "#{host}-#{SecureRandom.hex(6)}")

if role_config.uses_cord?
execute *app.tie_cord(role_config.cord_host_file)
end

execute *app.run(hostname: "#{host}-#{SecureRandom.hex(6)}")

Kamal::Utils::HealthcheckPoller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }

execute *app.stop(version: old_version), raise_on_non_zero_exit: false if old_version.present?
if old_version.present?
if role_config.uses_cord?
cord = capture_with_info(*app.cord(version: old_version), raise_on_non_zero_exit: false).strip
if cord.present?
execute *app.cut_cord(cord)
Kamal::Utils::HealthcheckPoller.wait_for_unhealthy(pause_after_ready: true) { capture_with_info(*app.status(version: old_version)) }
end
end
execute *app.stop(version: old_version), raise_on_non_zero_exit: false
end
end
end
end
Expand Down
47 changes: 27 additions & 20 deletions lib/kamal/commands/app.rb
Original file line number Diff line number Diff line change
@@ -1,34 +1,29 @@
class Kamal::Commands::App < Kamal::Commands::Base
ACTIVE_DOCKER_STATUSES = [ :running, :restarting ]

attr_reader :role
attr_reader :role, :role_config

def initialize(config, role: nil)
super(config)
@role = role
end

def start_or_run(hostname: nil)
combine start, run(hostname: hostname), by: "||"
@role_config = config.role(self.role)
end

def run(hostname: nil)
role = config.role(self.role)

docker :run,
"--detach",
"--restart unless-stopped",
"--name", container_name,
*(["--hostname", hostname] if hostname),
"-e", "KAMAL_CONTAINER_NAME=\"#{container_name}\"",
*role.env_args,
*role.health_check_args,
*role_config.env_args,
*role_config.health_check_args,
*config.logging_args,
*config.volume_args,
*role.label_args,
*role.option_args,
*role_config.label_args,
*role_config.option_args,
config.absolute_image,
role.cmd
role_config.cmd
end

def start
Expand Down Expand Up @@ -76,14 +71,12 @@ def execute_in_existing_container(*command, interactive: false)
end

def execute_in_new_container(*command, interactive: false)
role = config.role(self.role)

docker :run,
("-it" if interactive),
"--rm",
*role&.env_args,
*role_config&.env_args,
*config.volume_args,
*role&.option_args,
*role_config&.option_args,
config.absolute_image,
*command
end
Expand Down Expand Up @@ -112,7 +105,7 @@ def current_running_version
def list_versions(*docker_args, statuses: nil)
pipe \
docker(:ps, *filter_args(statuses: statuses), *docker_args, "--format", '"{{.Names}}"'),
%(while read line; do echo ${line##{service_role_dest}-}; done) # Extract SHA from "service-role-dest-SHA"
%(while read line; do echo ${line##{role_config.full_name}-}; done) # Extract SHA from "service-role-dest-SHA"
end

def list_containers
Expand Down Expand Up @@ -150,16 +143,30 @@ def tag_current_as_latest
end

def make_env_directory
make_directory config.role(role).host_env_directory
make_directory role_config.host_env_directory
end

def remove_env_file
[:rm, "-f", config.role(role).host_env_file_path]
[:rm, "-f", role_config.host_env_file_path]
end

# Builds the shell pipeline that looks up the host path mounted at the role's
# cord container directory for the container of the given +version+.
#
# The first stage runs `docker inspect` with a template that prints each
# mount as "<source> <destination> "; the second stage is an awk filter that
# prints the source whose destination equals role_config.cord_container_directory.
#
# NOTE(review): the template does not emit a newline between mounts, so all
# mounts appear to land on a single line and awk's $2 would only ever be the
# destination of the first mount listed. If the container has other volumes
# the cord may not be found -- confirm, and consider printing one mount per line.
def cord(version:)
pipe \
docker(:inspect, "-f '{{ range .Mounts }}{{ .Source }} {{ .Destination }} {{ end }}'", container_name(version)),
[:awk, "'$2 == \"#{role_config.cord_container_directory}\" {print $1}'"]
end

# "Ties the cord": returns the command that creates an empty file at the
# host path +cord+ by delegating to create_empty_file.
def tie_cord(cord)
  create_empty_file cord
end

# "Cuts the cord": returns the command that removes the host path +cord+
# by delegating to remove_directory.
def cut_cord(cord)
  remove_directory cord
end

private
def container_name(version = nil)
[ config.service, role, config.destination, version || config.version ].compact.join("-")
[ role_config.full_name, version || config.version ].compact.join("-")
end

def filter_args(statuses: nil)
Expand Down
10 changes: 10 additions & 0 deletions lib/kamal/commands/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ def make_directory(path)
[ :mkdir, "-p", path ]
end

# Returns the command array for recursively removing +path+ (`rm -r <path>`).
def remove_directory(path)
  [ :rm, "-r" ] + [ path ]
end

private
def combine(*commands, by: "&&")
commands
Expand Down Expand Up @@ -69,5 +73,11 @@ def docker(*args)
def tags(**details)
Kamal::Tags.from_config(config, **details)
end

# Returns a chained command that first runs whatever make_directory_for(file)
# produces and then touches +file+.
def create_empty_file(file)
  prepare_directory = make_directory_for(file)
  touch_file = [ :touch, file ]

  chain(prepare_directory, touch_file)
end
end
end
2 changes: 1 addition & 1 deletion lib/kamal/commands/healthcheck.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def run
"--label", "service=#{container_name}",
"-e", "KAMAL_CONTAINER_NAME=\"#{container_name}\"",
*web.env_args,
*web.health_check_args,
*web.health_check_args(cord: false),
*config.volume_args,
*web.option_args,
config.absolute_image,
Expand Down
14 changes: 13 additions & 1 deletion lib/kamal/configuration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,14 @@ def run_directory
raw_config.run_directory || ".kamal"
end

# Returns run_directory unchanged when it is an absolute path; otherwise
# returns it joined onto the literal string "$(pwd)", leaving the expansion
# to whatever shell eventually evaluates the value.
def run_directory_as_docker_volume
  return run_directory if Pathname.new(run_directory).absolute?

  File.join("$(pwd)", run_directory)
end


def roles
@roles ||= role_names.collect { |role_name| Role.new(role_name, config: self) }
Expand Down Expand Up @@ -141,7 +149,7 @@ def sshkit


def healthcheck
{ "path" => "/up", "port" => 3000, "max_attempts" => 7 }.merge(raw_config.healthcheck || {})
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "cord" => "/tmp/kamal-cord" }.merge(raw_config.healthcheck || {})
end

def readiness_delay
Expand Down Expand Up @@ -199,6 +207,10 @@ def host_env_directory
"#{run_directory}/env"
end

# Returns a random 32-character hex identifier. It is generated on first use
# and memoized in @run_id, so repeated calls on the same object agree.
def run_id
  @run_id || (@run_id = SecureRandom.hex(16))
end

private
# Will raise ArgumentError if any required config keys are missing
def ensure_required_keys_present
Expand Down
53 changes: 45 additions & 8 deletions lib/kamal/configuration/role.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
class Kamal::Configuration::Role
CORD_FILE = "cord"
delegate :argumentize, :env_file_with_secrets, :optionize, to: Kamal::Utils

attr_accessor :name
Expand Down Expand Up @@ -47,28 +48,52 @@ def env_args
argumentize "--env-file", host_env_file_path
end

def health_check_args
def health_check_args(cord: true)
if health_check_cmd.present?
optionize({ "health-cmd" => health_check_cmd, "health-interval" => health_check_interval })
if cord && uses_cord?
optionize({ "health-cmd" => health_check_cmd_with_cord, "health-interval" => health_check_interval })
.concat(["--volume", "#{cord_host_directory}:#{cord_container_directory}"])
else
optionize({ "health-cmd" => health_check_cmd, "health-interval" => health_check_interval })
end
else
[]
end
end

def health_check_cmd
options = specializations["healthcheck"] || {}
options = config.healthcheck.merge(options) if running_traefik?
health_check_options["cmd"] || http_health_check(port: health_check_options["port"], path: health_check_options["path"])
end

options["cmd"] || http_health_check(port: options["port"], path: options["path"])
# Returns a shell health check that passes only when the regular health
# check command succeeds AND the cord file can be stat'ed in the container.
def health_check_cmd_with_cord
  app_check = "(#{health_check_cmd})"
  cord_check = "(stat #{cord_container_file} > /dev/null || exit 1)"

  [ app_check, cord_check ].join(" && ")
end

def health_check_interval
options = specializations["healthcheck"] || {}
options = config.healthcheck.merge(options) if running_traefik?
health_check_options["interval"] || "1s"
end

# Whether this role uses a cord file: it must be running Traefik, have a
# cord container directory configured, and have a health check command.
# When running_traefik? is falsy its value is returned as-is.
def uses_cord?
  traefik = running_traefik?
  return traefik unless traefik

  cord_container_directory.present? && health_check_cmd.present?
end

# Host-side directory for this role's cord:
# <run directory as docker volume>/cords/<full_name>-<run_id>.
def cord_host_directory
  cords_root = File.join(config.run_directory_as_docker_volume, "cords")

  File.join(cords_root, "#{full_name}-#{config.run_id}")
end

# Path of the cord file on the host: CORD_FILE inside cord_host_directory.
def cord_host_file
  File.join(cord_host_directory, CORD_FILE)
end

# Returns the "cord" entry of the health check options, or nil when the key
# is absent (any default configured on the hash itself is ignored).
def cord_container_directory
  health_check_options.fetch("cord") { nil }
end

options["interval"] || "1s"
# Path of the cord file as seen inside the container: CORD_FILE inside
# cord_container_directory.
def cord_container_file
  File.join(cord_container_directory, CORD_FILE)
end


def cmd
specializations["cmd"]
end
Expand All @@ -85,6 +110,10 @@ def running_traefik?
name.web? || specializations["traefik"]
end

# Joins the service name, the role name and the destination with "-",
# leaving out any part that is nil.
def full_name
  parts = [ config.service, name, config.destination ]

  parts.reject(&:nil?).join("-")
end

private
attr_accessor :config

Expand Down Expand Up @@ -164,4 +193,12 @@ def merged_env_with_secrets
# Builds a curl-based health check against http://localhost:<port><path>.
# Returns nil when neither +path+ nor +port+ is present.
def http_health_check(port:, path:)
  return unless path.present? || port.present?

  url = URI.join("http://localhost:#{port}", path)
  "curl -f #{url} || exit 1"
end

# Memoized health check options for this role. Starts from the role's own
# "healthcheck" specialization (or an empty hash); when the role is running
# Traefik, those values are merged over config.healthcheck.
def health_check_options
  @health_check_options ||= begin
    role_overrides = specializations["healthcheck"] || {}
    running_traefik? ? config.healthcheck.merge(role_overrides) : role_overrides
  end
end
end
29 changes: 27 additions & 2 deletions lib/kamal/utils/healthcheck_poller.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
class Kamal::Utils::HealthcheckPoller
TRAEFIK_HEALTHY_DELAY = 2
TRAEFIK_UPDATE_DELAY = 2

class HealthcheckError < StandardError; end

Expand All @@ -11,7 +11,7 @@ def wait_for_healthy(pause_after_ready: false, &block)
begin
case status = block.call
when "healthy"
sleep TRAEFIK_HEALTHY_DELAY if pause_after_ready
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
when "running" # No health check configured
sleep KAMAL.config.readiness_delay if pause_after_ready
else
Expand All @@ -31,6 +31,31 @@ def wait_for_healthy(pause_after_ready: false, &block)
info "Container is healthy!"
end

# Polls the given block until it reports the status "unhealthy".
#
# Any other status raises HealthcheckError internally, which is retried after
# sleeping +attempt+ seconds (1s, 2s, ...). The retry budget comes from
# KAMAL.config.healthcheck["max_attempts"]; once it is exhausted the last
# HealthcheckError is re-raised to the caller.
#
# pause_after_ready: when true, sleeps TRAEFIK_UPDATE_DELAY seconds after the
# "unhealthy" status has been observed.
def wait_for_unhealthy(pause_after_ready: false, &block)
  attempt = 1
  max_attempts = KAMAL.config.healthcheck["max_attempts"]

  begin
    status = block.call
    raise HealthcheckError, "container not unhealthy (#{status})" unless status == "unhealthy"

    sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
  rescue HealthcheckError => e
    # Out of retries: let the most recent failure propagate.
    raise if attempt > max_attempts

    info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
    sleep attempt
    attempt += 1
    retry
  end

  info "Container is unhealthy!"
end

private
def info(message)
SSHKit.config.output.info(message)
Expand Down
17 changes: 16 additions & 1 deletion test/cli/app_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ class CliAppTest < CliTestCase
end

test "boot will rename if same version is already running" do
Object.any_instance.stubs(:sleep)
run_command("details") # Preheat Kamal const

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--filter", "name=^app-web-latest$", "--quiet", raise_on_non_zero_exit: false)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", raise_on_non_zero_exit: false)
.returns("12345678") # running version

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
Expand All @@ -25,6 +26,14 @@ class CliAppTest < CliTestCase
.with(:docker, :ps, "--filter", "label=service=app", "--filter", "label=role=web", "--filter", "status=running", "--filter", "status=restarting", "--latest", "--format", "\"{{.Names}}\"", "|", "while read line; do echo ${line#app-web-}; done", raise_on_non_zero_exit: false)
.returns("123") # old version

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :inspect, "-f '{{ range .Mounts }}{{ .Source }} {{ .Destination }} {{ end }}'", "app-web-123", "|", :awk, "'$2 == \"/tmp/kamal-cord\" {print $1}'", :raise_on_non_zero_exit => false)
.returns("cordfile") # old version

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy") # old version unhealthy

run_command("boot").tap do |output|
assert_match /Renaming container .* to .* as already deployed on 1.1.1.1/, output # Rename
assert_match /docker rename app-web-latest app-web-latest_replaced_[0-9a-f]{16}/, output
Expand Down Expand Up @@ -180,10 +189,16 @@ def run_command(*command, config: :with_accessories)
end

# Test helper: stubs the remote calls for a boot that replaces an old version.
# The old version is reported as "123", the status check for the new container
# (app-web-latest) returns "running", and the status check for the old
# container (app-web-123) returns "unhealthy".
def stub_running
# Stub sleep on every object so no time is spent sleeping during the test.
Object.any_instance.stubs(:sleep)

# Fallback for any capture_with_info call not matched by an expectation below.
SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version

# Status lookup for the newly booted container.
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("running") # health check

# Status lookup for the old container, reported as unhealthy.
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy") # health check
end
end
1 change: 1 addition & 0 deletions test/cli/healthcheck_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ class CliHealthcheckTest < CliTestCase
Thread.report_on_exception = false

Kamal::Utils::HealthcheckPoller.stubs(:sleep) # No sleeping when retrying
Kamal::Configuration.any_instance.stubs(:run_id).returns("12345678901234567890123456789012")

SSHKit::Backend::Abstract.any_instance.stubs(:execute)
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :stop, raise_on_non_zero_exit: false)
Expand Down
Loading

0 comments on commit 73943ae

Please sign in to comment.