Skip to content

Commit

Permalink
Merge pull request #439 from basecamp/zero-downtime-deploy-file
Browse files Browse the repository at this point in the history
Zero downtime deployment with cord file
  • Loading branch information
djmb authored Sep 7, 2023
2 parents 6263bf9 + 8a41d15 commit aa99998
Show file tree
Hide file tree
Showing 17 changed files with 233 additions and 64 deletions.
21 changes: 18 additions & 3 deletions lib/kamal/cli/app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ def boot
roles.each do |role|
app = KAMAL.app(role: role)
auditor = KAMAL.auditor(role: role)
role_config = KAMAL.config.role(role)

if capture_with_info(*app.container_id_for_version(version, only_running: true), raise_on_non_zero_exit: false).present?
if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present?
tmp_version = "#{version}_replaced_#{SecureRandom.hex(8)}"
info "Renaming container #{version} to #{tmp_version} as already deployed on #{host}"
execute *auditor.record("Renaming container #{version} to #{tmp_version}"), verbosity: :debug
Expand All @@ -29,11 +30,25 @@ def boot
execute *auditor.record("Booted app version #{version}"), verbosity: :debug

old_version = capture_with_info(*app.current_running_version, raise_on_non_zero_exit: false).strip
execute *app.start_or_run(hostname: "#{host}-#{SecureRandom.hex(6)}")

if role_config.uses_cord?
execute *app.tie_cord(role_config.cord_host_file)
end

execute *app.run(hostname: "#{host}-#{SecureRandom.hex(6)}")

Kamal::Utils::HealthcheckPoller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }

execute *app.stop(version: old_version), raise_on_non_zero_exit: false if old_version.present?
if old_version.present?
if role_config.uses_cord?
cord = capture_with_info(*app.cord(version: old_version), raise_on_non_zero_exit: false).strip
if cord.present?
execute *app.cut_cord(cord)
Kamal::Utils::HealthcheckPoller.wait_for_unhealthy(pause_after_ready: true) { capture_with_info(*app.status(version: old_version)) }
end
end
execute *app.stop(version: old_version), raise_on_non_zero_exit: false
end
end
end
end
Expand Down
47 changes: 27 additions & 20 deletions lib/kamal/commands/app.rb
Original file line number Diff line number Diff line change
@@ -1,34 +1,29 @@
class Kamal::Commands::App < Kamal::Commands::Base
ACTIVE_DOCKER_STATUSES = [ :running, :restarting ]

attr_reader :role
attr_reader :role, :role_config

def initialize(config, role: nil)
super(config)
@role = role
end

def start_or_run(hostname: nil)
combine start, run(hostname: hostname), by: "||"
@role_config = config.role(self.role)
end

def run(hostname: nil)
role = config.role(self.role)

docker :run,
"--detach",
"--restart unless-stopped",
"--name", container_name,
*(["--hostname", hostname] if hostname),
"-e", "KAMAL_CONTAINER_NAME=\"#{container_name}\"",
*role.env_args,
*role.health_check_args,
*role_config.env_args,
*role_config.health_check_args,
*config.logging_args,
*config.volume_args,
*role.label_args,
*role.option_args,
*role_config.label_args,
*role_config.option_args,
config.absolute_image,
role.cmd
role_config.cmd
end

def start
Expand Down Expand Up @@ -76,14 +71,12 @@ def execute_in_existing_container(*command, interactive: false)
end

def execute_in_new_container(*command, interactive: false)
role = config.role(self.role)

docker :run,
("-it" if interactive),
"--rm",
*role&.env_args,
*role_config&.env_args,
*config.volume_args,
*role&.option_args,
*role_config&.option_args,
config.absolute_image,
*command
end
Expand Down Expand Up @@ -112,7 +105,7 @@ def current_running_version
def list_versions(*docker_args, statuses: nil)
pipe \
docker(:ps, *filter_args(statuses: statuses), *docker_args, "--format", '"{{.Names}}"'),
%(while read line; do echo ${line##{service_role_dest}-}; done) # Extract SHA from "service-role-dest-SHA"
%(while read line; do echo ${line##{role_config.full_name}-}; done) # Extract SHA from "service-role-dest-SHA"
end

def list_containers
Expand Down Expand Up @@ -150,16 +143,30 @@ def tag_current_as_latest
end

def make_env_directory
make_directory config.role(role).host_env_directory
make_directory role_config.host_env_directory
end

def remove_env_file
[:rm, "-f", config.role(role).host_env_file_path]
[:rm, "-f", role_config.host_env_file_path]
end

# Builds a piped command that looks up the host-side source of the cord mount
# for the container of the given version: `docker inspect` lists each mount as
# "Source Destination", and awk prints the source whose destination equals the
# role's cord container directory.
#
# NOTE(review): the inspect template emits every mount on a single line, so
# awk's $1/$2 only ever refer to the first Source/Destination pair. Confirm
# the cord mount is always listed first, or emit one mount per line.
def cord(version:)
  inspect_mounts = docker(:inspect, "-f '{{ range .Mounts }}{{ .Source }} {{ .Destination }} {{ end }}'", container_name(version))
  select_cord_source = [:awk, "'$2 == \"#{role_config.cord_container_directory}\" {print $1}'"]

  pipe inspect_mounts, select_cord_source
end

# Command that creates the (empty) cord file at the given host path,
# delegating to create_empty_file.
def tie_cord(cord)
  create_empty_file cord
end

# Command that removes the given cord path recursively, delegating to
# remove_directory.
def cut_cord(cord)
  remove_directory cord
end

private
def container_name(version = nil)
[ config.service, role, config.destination, version || config.version ].compact.join("-")
[ role_config.full_name, version || config.version ].compact.join("-")
end

def filter_args(statuses: nil)
Expand Down
10 changes: 10 additions & 0 deletions lib/kamal/commands/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ def make_directory(path)
[ :mkdir, "-p", path ]
end

# Command array for recursively removing +path+ (`rm -r path`).
def remove_directory(path)
  [ :rm, "-r" ] << path
end

private
def combine(*commands, by: "&&")
commands
Expand Down Expand Up @@ -69,5 +73,11 @@ def docker(*args)
def tags(**details)
Kamal::Tags.from_config(config, **details)
end

# Chains two commands: first whatever make_directory_for builds for +file+,
# then a `touch` of the file itself.
def create_empty_file(file)
  ensure_directory = make_directory_for(file)
  touch_file = [:touch, file]

  chain ensure_directory, touch_file
end
end
end
2 changes: 1 addition & 1 deletion lib/kamal/commands/healthcheck.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def run
"--label", "service=#{container_name}",
"-e", "KAMAL_CONTAINER_NAME=\"#{container_name}\"",
*web.env_args,
*web.health_check_args,
*web.health_check_args(cord: false),
*config.volume_args,
*web.option_args,
config.absolute_image,
Expand Down
14 changes: 13 additions & 1 deletion lib/kamal/configuration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,14 @@ def run_directory
raw_config.run_directory || ".kamal"
end

# The run directory in a form usable as the host side of a docker volume:
# absolute paths are returned as-is, relative ones are prefixed with the
# literal shell expression "$(pwd)".
def run_directory_as_docker_volume
  return run_directory if Pathname.new(run_directory).absolute?

  File.join "$(pwd)", run_directory
end


def roles
@roles ||= role_names.collect { |role_name| Role.new(role_name, config: self) }
Expand Down Expand Up @@ -141,7 +149,7 @@ def sshkit


def healthcheck
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999 }.merge(raw_config.healthcheck || {})
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord" => "/tmp/kamal-cord" }.merge(raw_config.healthcheck || {})
end

def readiness_delay
Expand Down Expand Up @@ -199,6 +207,10 @@ def host_env_directory
"#{run_directory}/env"
end

# Random 32-character hex identifier, generated on first use and memoized
# on this object afterwards.
def run_id
  return @run_id if @run_id

  @run_id = SecureRandom.hex(16)
end

private
# Will raise ArgumentError if any required config keys are missing
def ensure_required_keys_present
Expand Down
53 changes: 45 additions & 8 deletions lib/kamal/configuration/role.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
class Kamal::Configuration::Role
CORD_FILE = "cord"
delegate :argumentize, :env_file_with_secrets, :optionize, to: Kamal::Utils

attr_accessor :name
Expand Down Expand Up @@ -47,28 +48,52 @@ def env_args
argumentize "--env-file", host_env_file_path
end

def health_check_args
def health_check_args(cord: true)
if health_check_cmd.present?
optionize({ "health-cmd" => health_check_cmd, "health-interval" => health_check_interval })
if cord && uses_cord?
optionize({ "health-cmd" => health_check_cmd_with_cord, "health-interval" => health_check_interval })
.concat(["--volume", "#{cord_host_directory}:#{cord_container_directory}"])
else
optionize({ "health-cmd" => health_check_cmd, "health-interval" => health_check_interval })
end
else
[]
end
end

def health_check_cmd
options = specializations["healthcheck"] || {}
options = config.healthcheck.merge(options) if running_traefik?
health_check_options["cmd"] || http_health_check(port: health_check_options["port"], path: health_check_options["path"])
end

options["cmd"] || http_health_check(port: options["port"], path: options["path"])
# Health command that only succeeds when the regular health check passes
# AND the cord file can be stat'ed inside the container.
def health_check_cmd_with_cord
  app_check = "(#{health_check_cmd})"
  cord_check = "(stat #{cord_container_file} > /dev/null || exit 1)"

  "#{app_check} && #{cord_check}"
end

def health_check_interval
options = specializations["healthcheck"] || {}
options = config.healthcheck.merge(options) if running_traefik?
health_check_options["interval"] || "1s"
end

# Truthy only when the role runs Traefik, a cord container directory is
# configured, and a health check command is present. Checks short-circuit
# in that order.
def uses_cord?
  running_traefik? &&
    cord_container_directory.present? &&
    health_check_cmd.present?
end

# Host directory holding this role's cord for the current run:
# <run directory as docker volume>/cords/<full_name>-<run_id>.
def cord_host_directory
  volume_root = config.run_directory_as_docker_volume
  cord_name = [full_name, config.run_id].join("-")

  File.join volume_root, "cords", cord_name
end

# Path of the cord file on the host: CORD_FILE inside cord_host_directory.
def cord_host_file
  File.join(cord_host_directory, CORD_FILE)
end

# The "cord" entry of the health check options, or nil when the key is absent.
def cord_container_directory
  health_check_options.fetch("cord") { nil }
end

options["interval"] || "1s"
# Path of the cord file inside the container: CORD_FILE inside
# cord_container_directory.
def cord_container_file
  File.join(cord_container_directory, CORD_FILE)
end


def cmd
specializations["cmd"]
end
Expand All @@ -85,6 +110,10 @@ def running_traefik?
name.web? || specializations["traefik"]
end

# Service, role name and destination joined with "-"; nil parts are dropped.
def full_name
  parts = [ config.service, name, config.destination ]
  parts.compact.join("-")
end

private
attr_accessor :config

Expand Down Expand Up @@ -164,4 +193,12 @@ def merged_env_with_secrets
# Builds a curl-based health command against localhost for the given port
# and path. Returns nil when neither port nor path is present.
def http_health_check(port:, path:)
  return unless path.present? || port.present?

  endpoint = URI.join("http://localhost:#{port}", path)
  "curl -f #{endpoint} || exit 1"
end

# Memoized health check options for this role: the role's own "healthcheck"
# specializations (or an empty hash), layered over the global healthcheck
# config when the role runs Traefik.
def health_check_options
  return @health_check_options if @health_check_options

  overrides = specializations["healthcheck"] || {}
  @health_check_options = running_traefik? ? config.healthcheck.merge(overrides) : overrides
end
end
29 changes: 27 additions & 2 deletions lib/kamal/utils/healthcheck_poller.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
class Kamal::Utils::HealthcheckPoller
TRAEFIK_HEALTHY_DELAY = 2
TRAEFIK_UPDATE_DELAY = 2

class HealthcheckError < StandardError; end

Expand All @@ -11,7 +11,7 @@ def wait_for_healthy(pause_after_ready: false, &block)
begin
case status = block.call
when "healthy"
sleep TRAEFIK_HEALTHY_DELAY if pause_after_ready
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
when "running" # No health check configured
sleep KAMAL.config.readiness_delay if pause_after_ready
else
Expand All @@ -31,6 +31,31 @@ def wait_for_healthy(pause_after_ready: false, &block)
info "Container is healthy!"
end

# Calls the block until it reports "unhealthy".
#
# Any other status raises HealthcheckError, which is retried with a growing
# pause (1s, 2s, ...) for as long as the attempt counter is within the
# configured healthcheck "max_attempts". After that the error propagates.
# The block is therefore invoked at most max_attempts + 1 times.
#
# pause_after_ready: when true, sleeps TRAEFIK_UPDATE_DELAY once the
# unhealthy status has been seen.
def wait_for_unhealthy(pause_after_ready: false, &block)
  max_attempts = KAMAL.config.healthcheck["max_attempts"]
  attempt = 1

  begin
    status = block.call
    raise HealthcheckError, "container not unhealthy (#{status})" unless status == "unhealthy"

    sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
  rescue HealthcheckError => e
    # Out of retries: re-raise the error currently being handled.
    raise if attempt > max_attempts

    info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
    sleep attempt
    attempt += 1
    retry
  end

  info "Container is unhealthy!"
end

private
def info(message)
SSHKit.config.output.info(message)
Expand Down
17 changes: 16 additions & 1 deletion test/cli/app_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ class CliAppTest < CliTestCase
end

test "boot will rename if same version is already running" do
Object.any_instance.stubs(:sleep)
run_command("details") # Preheat Kamal const

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--filter", "name=^app-web-latest$", "--quiet", raise_on_non_zero_exit: false)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", raise_on_non_zero_exit: false)
.returns("12345678") # running version

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
Expand All @@ -25,6 +26,14 @@ class CliAppTest < CliTestCase
.with(:docker, :ps, "--filter", "label=service=app", "--filter", "label=role=web", "--filter", "status=running", "--filter", "status=restarting", "--latest", "--format", "\"{{.Names}}\"", "|", "while read line; do echo ${line#app-web-}; done", raise_on_non_zero_exit: false)
.returns("123") # old version

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :inspect, "-f '{{ range .Mounts }}{{ .Source }} {{ .Destination }} {{ end }}'", "app-web-123", "|", :awk, "'$2 == \"/tmp/kamal-cord\" {print $1}'", :raise_on_non_zero_exit => false)
.returns("cordfile") # old version

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy") # old version unhealthy

run_command("boot").tap do |output|
assert_match /Renaming container .* to .* as already deployed on 1.1.1.1/, output # Rename
assert_match /docker rename app-web-latest app-web-latest_replaced_[0-9a-f]{16}/, output
Expand Down Expand Up @@ -180,10 +189,16 @@ def run_command(*command, config: :with_accessories)
end

def stub_running
Object.any_instance.stubs(:sleep)

SSHKit::Backend::Abstract.any_instance.stubs(:capture_with_info).returns("123") # old version

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("running") # health check

SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-123$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
.returns("unhealthy") # health check
end
end
1 change: 1 addition & 0 deletions test/cli/healthcheck_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ class CliHealthcheckTest < CliTestCase
Thread.report_on_exception = false

Kamal::Utils::HealthcheckPoller.stubs(:sleep) # No sleeping when retrying
Kamal::Configuration.any_instance.stubs(:run_id).returns("12345678901234567890123456789012")

SSHKit::Backend::Abstract.any_instance.stubs(:execute)
.with(:docker, :container, :ls, "--all", "--filter", "name=^healthcheck-app-999$", "--quiet", "|", :xargs, :docker, :stop, raise_on_non_zero_exit: false)
Expand Down
Loading

0 comments on commit aa99998

Please sign in to comment.